From 9273ba533ed2775ee5aa89611849ba81a3a60445 Mon Sep 17 00:00:00 2001 From: ericm Date: Sun, 26 Feb 2006 23:06:34 +0000 Subject: [PATCH] branch: b1_5 update from b1_4. --- .../patches/ext3-ea-in-inode-2.6-rhel4.patch | 6 +- .../patches/ext3-extents-2.6.12.patch | 2924 ++++++++ .../patches/ext3-extents-2.6.5.patch | 26 +- .../patches/ext3-extents-2.6.9-rhel4.patch | 28 +- .../patches/ext3-external-journal-2.6.12.patch | 148 + .../patches/ext3-include-fixes-2.6-rhel4.patch | 2 +- .../patches/ext3-include-fixes-2.6-suse.patch | 2 +- .../patches/ext3-mballoc2-2.6-suse.patch | 2173 +++--- .../patches/ext3-mballoc2-2.6.12.patch | 2774 +++++++ .../patches/ext3-mballoc2-2.6.9-rhel4.patch | 2217 +++--- .../kernel_patches/patches/ext3-nlinks-2.6.9.patch | 163 + .../ext3-remove-cond_resched-calls-2.6.12.patch | 29 + .../kernel_patches/patches/iopen-2.6-rhel4.patch | 27 +- .../kernel_patches/patches/iopen-2.6-suse.patch | 92 +- .../kernel_patches/patches/iopen-2.6.12.patch | 270 +- .../kernel_patches/series/ldiskfs-2.6-rhel4.series | 3 +- .../series/ldiskfs-2.6.12-vanilla.series | 13 + ldiskfs/ldiskfs/Makefile.in | 2 +- ldiskfs/ldiskfs/autoMakefile.am | 3 +- lustre/ChangeLog | 396 +- lustre/autoconf/lustre-core.m4 | 28 +- lustre/autoconf/lustre-version.ac | 31 +- lustre/doc/Makefile.am | 4 +- lustre/doc/lctl.8 | 2 +- lustre/doc/lctl.lyx | 2 +- lustre/doc/lmc.1 | 2 +- lustre/doc/lmc.lyx | 2 +- lustre/doc/lwizard.1 | 113 - lustre/include/liblustre.h | 7 +- lustre/include/linux/.cvsignore | 1 + lustre/include/linux/lustre_compat25.h | 25 +- lustre/include/linux/lustre_disk.h | 55 + lustre/include/linux/lustre_fsfilt.h | 54 +- lustre/include/linux/lustre_ver.h.in | 23 + lustre/include/lustre/liblustreapi.h | 3 + .../kernel-2.4.20-hp_pnnl-2.4-ia64-smp.config | 5 + .../kernel-2.4.20-hp_pnnl-2.4-ia64.config | 5 + .../kernel-2.4.20-rh-2.4-i686-smp.config | 2 +- .../kernel-2.4.20-rh-2.4-i686.config | 2 +- .../kernel-2.4.21-rhel-2.4-i686-smp.config | 6 +- 
.../kernel-2.4.21-rhel-2.4-i686.config | 6 +- .../kernel-2.4.21-rhel-2.4-ia64-smp.config | 4 + .../kernel-2.4.21-rhel-2.4-ia64.config | 4 + .../kernel-2.4.21-rhel-2.4-x86_64-smp.config | 6 +- .../kernel-2.4.21-rhel-2.4-x86_64.config | 7 +- .../kernel-2.4.21-sles-2.4-i686-smp.config | 2 +- .../kernel-2.4.21-sles-2.4-i686.config | 2 +- .../kernel-2.4.21-suse-2.4.21-2-x86_64.config | 3 +- .../kernel-2.6.5-2.6-suse-i686-bigsmp.config | 28 +- .../kernel-2.6.5-2.6-suse-i686-smp.config | 28 +- .../kernel-2.6.5-2.6-suse-i686.config | 28 +- .../kernel-2.6.5-2.6-suse-ia64-smp.config | 26 +- .../kernel-2.6.5-2.6-suse-ia64.config | 26 +- .../kernel-2.6.5-2.6-suse-ppc-pseries64.config | 4 +- .../kernel-2.6.5-2.6-suse-ppc.config | 4 +- .../kernel-2.6.5-2.6-suse-x86_64-smp.config | 35 +- .../kernel-2.6.5-2.6-suse-x86_64.config | 35 +- .../kernel-2.6.9-2.6-rhel4-i686-smp.config | 81 +- .../kernel-2.6.9-2.6-rhel4-i686.config | 80 +- .../kernel-2.6.9-2.6-rhel4-ia64-smp.config | 70 +- .../kernel-2.6.9-2.6-rhel4-ia64.config | 70 +- .../kernel-2.6.9-2.6-rhel4-x86_64-smp.config | 76 +- .../kernel-2.6.9-2.6-rhel4-x86_64.config | 76 +- lustre/kernel_patches/patches/2.4.19-ext3.patch | 7892 -------------------- lustre/kernel_patches/patches/2.4.19-jbd.patch | 6524 ---------------- .../kernel_patches/patches/2.6-rhel4-kgdb-ga.patch | 6371 ++++++++++++++++ lustre/kernel_patches/patches/8kstack-2.6.12.patch | 13 + .../patches/add_page_private-2.4.19-bgl.patch | 15 - .../patches/compile-fixes-2.6.9-rhel4-22.patch | 76 + .../patches/dcache-qstr-api-fix-2.6-suse.patch | 148 + .../patches/export-show_task-2.4-bgl.patch | 32 - .../patches/export-truncate-bgl.patch | 37 - .../patches/export_symbols-2.6-rhel4.patch | 18 + .../patches/export_symbols-2.6-suse.patch | 22 + .../patches/export_symbols-2.6.12.patch | 105 + .../patches/exports_2.4.19-bgl.patch | 42 - .../patches/ext-2.4-patch-1-2.4.19-suse.patch | 2560 ------- .../patches/ext3-delete_thread-2.4.19-suse.patch | 481 -- 
.../patches/ext3-delete_thread-2.4.20.patch | 541 -- .../patches/ext3-ea-in-inode-2.6-rhel4.patch | 6 +- .../patches/ext3-extents-2.4.21-chaos.patch | 8 +- .../patches/ext3-extents-2.4.21-suse2.patch | 8 +- .../patches/ext3-extents-2.4.24.patch | 8 +- .../patches/ext3-extents-2.4.29.patch | 8 +- .../patches/ext3-extents-2.6.12.patch | 2924 ++++++++ .../patches/ext3-extents-2.6.5.patch | 26 +- .../patches/ext3-extents-2.6.9-rhel4.patch | 28 +- .../ext3-extents-asyncdel-2.4.21-chaos.patch | 4 +- .../patches/ext3-extents-asyncdel-2.4.24.patch | 4 +- .../patches/ext3-external-journal-2.6.12.patch | 148 + .../patches/ext3-htree-2.4.19-bgl.patch | 2584 ------- .../patches/ext3-htree-path-ops.patch | 894 +++ .../patches/ext3-include-fixes-2.6-rhel4.patch | 2 +- .../patches/ext3-include-fixes-2.6-suse.patch | 2 +- .../patches/ext3-mballoc2-2.4.24.patch | 1766 ----- .../patches/ext3-mballoc2-2.6-suse.patch | 2173 +++--- .../patches/ext3-mballoc2-2.6.12.patch | 2774 +++++++ .../patches/ext3-mballoc2-2.6.9-rhel4.patch | 2217 +++--- .../kernel_patches/patches/ext3-nlinks-2.6.9.patch | 163 + .../patches/ext3-no-write-super.patch | 22 - .../patches/ext3-orphan_lock-2.4.19-suse.patch | 85 - .../ext3-remove-cond_resched-calls-2.6.12.patch | 29 + .../patches/ext3-statfs-2.6.12.patch | 177 + .../kernel_patches/patches/ext3-unmount_sync.patch | 21 - .../patches/ext3-use-after-free-2.4.19-pre1.patch | 53 - .../patches/ext3-use-after-free-suse.patch | 53 - .../patches/extN-wantedi-2.4.19-suse.patch | 226 - .../patches/fc3_to_rhel4_updates.patch | 12 + .../patches/invalidate_show-2.4.19-bgl.patch | 121 - .../patches/iod-stock-24-exports-2.4.19-bgl.patch | 52 - .../patches/iod-stock-24-exports-2.4.19-suse.patch | 52 - lustre/kernel_patches/patches/iopen-2.4.20.patch | 4 +- .../patches/iopen-2.4.21-chaos.patch | 4 +- .../patches/iopen-2.4.21-sles8sp3.patch | 497 -- .../kernel_patches/patches/iopen-2.6-rhel4.patch | 27 +- lustre/kernel_patches/patches/iopen-2.6-suse.patch | 92 +- 
.../{iopen-2.4.19-bgl.patch => iopen-2.6.12.patch} | 271 +- .../kernel_patches/patches/iopen-misc-2.6.12.patch | 82 + .../kernel_patches/patches/jbd-2.4.18-jcberr.patch | 274 - .../patches/jbd-2.4.19-pre1-jcberr.patch | 274 - .../patches/jbd-commit-tricks-rhel3.patch | 132 + .../patches/jbd-flushtime-2.4.19-suse.patch | 35 - .../patches/jbd-stats-2.6.13.4.patch | 735 ++ .../kernel_patches/patches/kallsyms-2.4-bgl.patch | 685 -- .../kernel_patches/patches/kksymoops-2.4-bgl.patch | 678 -- .../patches/linux-2.4.18-netdump.patch | 1842 ----- .../patches/linux-2.4.19-bgl-xattr-0.8.54.patch | 5242 ------------- .../linux-2.4.19-suse-xattr-0.8.54-hp.patch | 346 - .../patches/linux-2.4.19-xattr-0.8.54-suse.patch | 47 - .../patches/linux-2.4.21-xattr-0.8.54-chaos.patch | 27 +- .../linux-2.4.24-jbd-handle-EIO-rhel3.patch | 23 + .../patches/linux-2.6-binutils-2.16.patch | 102 + .../linux-2.6.9-ext3-sub-second-timestamp.patch | 631 ++ .../patches/listman-2.4.19-bgl.patch | 72 - lustre/kernel_patches/patches/mcore-2.4.20-8.patch | 2738 ------- .../patches/mkdep-revert-rh-2.4.patch | 50 - .../patches/nfs-cifs-intent-2.6.12.patch | 110 + .../patches/nfs_export_kernel-2.4.19-bgl.patch | 741 -- .../patches/nfs_export_kernel-2.4.20-hp.patch | 3 +- .../patches/nfs_export_kernel-2.4.21-chaos.patch | 3 +- .../patches/nfs_export_kernel-2.4.21-suse2.patch | 3 +- .../patches/nfs_export_kernel-2.4.22.patch | 3 +- .../patches/nfs_export_kernel-2.4.29.patch | 3 +- lustre/kernel_patches/patches/qsnet-rhel-2.4.patch | 16 +- .../kernel_patches/patches/qsnet-rhel4-2.6.patch | 10 +- .../patches/removepage-2.4.19-suse.patch | 30 - .../patches/resched-2.4.19-pre1.patch | 16 - .../patches/small_scatterlist-2.4.21-rhel.patch | 4 +- .../patches/socket-exports-2.4.19-bgl.patch | 46 - .../patches/tcp-zero-copy-2.4.19-pre1.patch | 461 -- .../patches/uml-exprt-clearuser-2.6.12.patch | 11 + .../patches/vfs_intent-2.4.19-bgl.patch | 1849 ----- .../patches/vfs_intent-2.4.19-suse.patch | 1858 ----- 
.../patches/vfs_intent-2.6-rhel4.patch | 198 +- .../patches/vfs_intent-2.6-suse.patch | 24 +- .../kernel_patches/patches/vfs_intent-2.6.12.patch | 819 ++ .../patches/vfs_nointent-2.6-rhel4.patch | 124 +- .../patches/vfs_nointent-2.6.12.patch | 490 ++ .../kernel_patches/patches/vfs_races-2.6.12.patch | 61 + .../kernel_patches/patches/vm-tunables-rhel4.patch | 73 + .../patches/vmalloc_to_page-2.4.19-bgl.patch | 12 - lustre/kernel_patches/series/2.6-fc3.series | 2 + lustre/kernel_patches/series/2.6-rhel4.series | 4 +- lustre/kernel_patches/series/2.6-suse-newer.series | 1 + lustre/kernel_patches/series/2.6.12-vanilla.series | 19 + lustre/kernel_patches/series/bgl-2.4.19 | 47 - .../kernel_patches/series/ldiskfs-2.6-rhel4.series | 3 +- .../series/ldiskfs-2.6.12-vanilla.series | 13 + lustre/kernel_patches/series/rhel-2.4.21 | 4 +- lustre/kernel_patches/series/suse-2.4.21-cray | 2 +- lustre/kernel_patches/series/suse-2.4.21-jvn | 2 +- lustre/kernel_patches/targets/2.6-rhel4.target.in | 2 +- lustre/kernel_patches/targets/2.6-suse.target.in | 2 +- lustre/kernel_patches/targets/rhel-2.4.target.in | 2 +- lustre/kernel_patches/which_patch | 7 +- lustre/ldiskfs/Makefile.in | 2 +- lustre/ldiskfs/autoMakefile.am | 3 +- lustre/ldiskfs/lustre_quota_fmt.c | 69 +- lustre/ldiskfs/quotafmt_test.c | 8 +- lustre/ldlm/ldlm_internal.h | 2 +- lustre/ldlm/ldlm_lock.c | 81 +- lustre/ldlm/ldlm_lockd.c | 165 +- lustre/ldlm/ldlm_resource.c | 4 +- lustre/liblustre/Makefile.am | 2 +- lustre/liblustre/dir.c | 10 +- lustre/liblustre/file.c | 26 +- lustre/liblustre/genlib.sh | 3 +- lustre/liblustre/llite_lib.c | 39 +- lustre/liblustre/llite_lib.h | 38 +- lustre/liblustre/lutil.c | 56 +- lustre/liblustre/rw.c | 85 +- lustre/liblustre/super.c | 94 +- lustre/liblustre/tests/echo_test.c | 2 +- lustre/liblustre/tests/sanity.c | 82 +- lustre/llite/dcache.c | 35 +- lustre/llite/dir.c | 82 +- lustre/llite/file.c | 258 +- lustre/llite/llite_mmap.c | 5 +- lustre/llite/llite_nfs.c | 7 +- lustre/llite/namei.c | 
45 +- lustre/llite/rw.c | 29 +- lustre/llite/rw24.c | 12 +- lustre/llite/super.c | 6 +- lustre/llite/super25.c | 5 +- lustre/llite/xattr.c | 2 +- lustre/lov/Makefile.in | 2 +- lustre/lov/autoMakefile.am | 2 +- lustre/lov/lov_ea.c | 546 ++ lustre/lov/lov_internal.h | 17 +- lustre/lov/lov_log.c | 4 +- lustre/lov/lov_merge.c | 66 +- lustre/lov/lov_offset.c | 36 +- lustre/lov/lov_pack.c | 152 +- lustre/lvfs/fsfilt_ext3.c | 136 +- lustre/lvfs/fsfilt_reiserfs.c | 5 +- lustre/lvfs/lvfs_linux.c | 26 +- lustre/mdc/lproc_mdc.c | 1 + lustre/mdc/mdc_lib.c | 42 +- lustre/mdc/mdc_locks.c | 138 +- lustre/mdc/mdc_reint.c | 48 +- lustre/mds/Makefile.in | 2 +- lustre/mds/lproc_mds.c | 4 +- lustre/mds/mds_fs.c | 86 +- lustre/mds/mds_join.c | 503 ++ lustre/mds/mds_lib.c | 15 +- lustre/mds/mds_log.c | 21 +- lustre/mds/mds_open.c | 136 +- lustre/mds/mds_reint.c | 164 +- lustre/mds/mds_unlink_open.c | 44 +- lustre/mds/mds_xattr.c | 11 +- lustre/obdclass/autoMakefile.am | 3 +- lustre/obdclass/debug.c | 8 +- lustre/obdclass/llog.c | 89 + lustre/obdclass/llog_cat.c | 64 + lustre/obdclass/llog_lvfs.c | 85 +- lustre/obdclass/llog_test.c | 19 +- lustre/obdclass/lprocfs_status.c | 48 +- lustre/obdclass/lustre_peer.c | 3 +- lustre/obdclass/obd_config.c | 49 +- lustre/obdclass/prng.c | 69 + lustre/obdecho/echo.c | 3 +- lustre/obdfilter/filter_io.c | 142 +- lustre/obdfilter/filter_io_24.c | 81 +- lustre/obdfilter/filter_io_26.c | 272 +- lustre/obdfilter/filter_log.c | 19 +- lustre/obdfilter/filter_lvb.c | 28 +- lustre/osc/lproc_osc.c | 82 +- lustre/ost/ost_handler.c | 279 +- lustre/ost/ost_internal.h | 12 +- lustre/ptlrpc/client.c | 31 +- lustre/ptlrpc/events.c | 4 +- lustre/ptlrpc/llog_client.c | 89 +- lustre/ptlrpc/llog_net.c | 3 +- lustre/ptlrpc/llog_server.c | 142 + lustre/ptlrpc/lproc_ptlrpc.c | 9 +- lustre/ptlrpc/niobuf.c | 148 +- lustre/ptlrpc/pers.c | 22 + lustre/ptlrpc/ptlrpc_internal.h | 11 +- lustre/ptlrpc/ptlrpc_module.c | 41 +- lustre/ptlrpc/recov_thread.c | 3 +- 
lustre/ptlrpc/recover.c | 6 +- lustre/quota/Makefile.in | 10 - lustre/quota/quota_check.c | 17 +- lustre/quota/quota_context.c | 25 +- lustre/quota/quota_ctl.c | 14 +- lustre/quota/quota_master.c | 42 +- lustre/scripts/lustre | 25 +- lustre/tests/.cvsignore | 1 + lustre/tests/Makefile.am | 8 +- lustre/tests/acl/README | 5 +- lustre/tests/acl/inheritance.test | 10 +- lustre/tests/conf-sanity.sh | 30 +- lustre/tests/echo.sh | 5 +- lustre/tests/fsx.c | 5 +- lustre/tests/insanity.sh | 4 +- lustre/tests/llmount.sh | 7 +- lustre/tests/llog-test.sh | 6 +- lustre/tests/llrmount.sh | 10 +- lustre/tests/local.sh | 6 +- lustre/tests/lockorder.sh | 3 +- lustre/tests/lov.sh | 6 +- lustre/tests/memhog.c | 32 +- lustre/tests/oos.sh | 19 +- lustre/tests/oos2.sh | 2 +- lustre/tests/random-reads.c | 13 +- lustre/tests/recovery-small.sh | 14 +- lustre/tests/replay-dual.sh | 18 +- lustre/tests/replay-ost-single.sh | 6 +- lustre/tests/replay-single.sh | 14 +- lustre/tests/run-llog.sh | 6 +- lustre/tests/rundbench | 5 +- lustre/tests/runregression-mds.sh | 2 +- lustre/tests/sanityN.sh | 150 +- lustre/tests/tbox.sh | 116 - lustre/tests/test-framework.sh | 26 +- lustre/tests/test-lwizard.sh | 45 - lustre/tests/uml.sh | 4 +- lustre/utils/Makefile.am | 2 +- lustre/utils/lctl.c | 2 + lustre/utils/lfs.c | 329 +- lustre/utils/liblustreapi.c | 192 +- lustre/utils/llmount.c | 62 +- lustre/utils/llog_reader.c | 46 +- lustre/utils/lmc | 6 +- lustre/utils/lustre_cfg.c | 7 +- lustre/utils/lwizard | 530 -- lustre/utils/wirehdr.c | 1 + 307 files changed, 35037 insertions(+), 48065 deletions(-) create mode 100644 ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-external-journal-2.6.12.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch rename 
lustre/kernel_patches/patches/iopen-2.4.19-suse.patch => ldiskfs/kernel_patches/patches/iopen-2.6.12.patch (62%) create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series delete mode 100644 lustre/doc/lwizard.1 create mode 100644 lustre/include/linux/lustre_disk.h create mode 100644 lustre/include/linux/lustre_ver.h.in delete mode 100644 lustre/kernel_patches/patches/2.4.19-ext3.patch delete mode 100644 lustre/kernel_patches/patches/2.4.19-jbd.patch create mode 100644 lustre/kernel_patches/patches/2.6-rhel4-kgdb-ga.patch create mode 100644 lustre/kernel_patches/patches/8kstack-2.6.12.patch delete mode 100644 lustre/kernel_patches/patches/add_page_private-2.4.19-bgl.patch create mode 100644 lustre/kernel_patches/patches/compile-fixes-2.6.9-rhel4-22.patch create mode 100644 lustre/kernel_patches/patches/dcache-qstr-api-fix-2.6-suse.patch delete mode 100644 lustre/kernel_patches/patches/export-show_task-2.4-bgl.patch delete mode 100644 lustre/kernel_patches/patches/export-truncate-bgl.patch create mode 100644 lustre/kernel_patches/patches/export_symbols-2.6.12.patch delete mode 100644 lustre/kernel_patches/patches/exports_2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/ext-2.4-patch-1-2.4.19-suse.patch delete mode 100644 lustre/kernel_patches/patches/ext3-delete_thread-2.4.19-suse.patch delete mode 100644 lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/ext3-external-journal-2.6.12.patch delete mode 100644 lustre/kernel_patches/patches/ext3-htree-2.4.19-bgl.patch create mode 100644 lustre/kernel_patches/patches/ext3-htree-path-ops.patch delete mode 100644 lustre/kernel_patches/patches/ext3-mballoc2-2.4.24.patch create mode 100644 lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/ext3-nlinks-2.6.9.patch delete mode 100644 
lustre/kernel_patches/patches/ext3-no-write-super.patch delete mode 100644 lustre/kernel_patches/patches/ext3-orphan_lock-2.4.19-suse.patch create mode 100644 lustre/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/ext3-statfs-2.6.12.patch delete mode 100644 lustre/kernel_patches/patches/ext3-unmount_sync.patch delete mode 100644 lustre/kernel_patches/patches/ext3-use-after-free-2.4.19-pre1.patch delete mode 100644 lustre/kernel_patches/patches/ext3-use-after-free-suse.patch delete mode 100644 lustre/kernel_patches/patches/extN-wantedi-2.4.19-suse.patch create mode 100644 lustre/kernel_patches/patches/fc3_to_rhel4_updates.patch delete mode 100644 lustre/kernel_patches/patches/invalidate_show-2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-suse.patch delete mode 100644 lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch rename lustre/kernel_patches/patches/{iopen-2.4.19-bgl.patch => iopen-2.6.12.patch} (61%) create mode 100644 lustre/kernel_patches/patches/iopen-misc-2.6.12.patch delete mode 100644 lustre/kernel_patches/patches/jbd-2.4.18-jcberr.patch delete mode 100644 lustre/kernel_patches/patches/jbd-2.4.19-pre1-jcberr.patch create mode 100644 lustre/kernel_patches/patches/jbd-commit-tricks-rhel3.patch delete mode 100644 lustre/kernel_patches/patches/jbd-flushtime-2.4.19-suse.patch create mode 100644 lustre/kernel_patches/patches/jbd-stats-2.6.13.4.patch delete mode 100644 lustre/kernel_patches/patches/kallsyms-2.4-bgl.patch delete mode 100644 lustre/kernel_patches/patches/kksymoops-2.4-bgl.patch delete mode 100644 lustre/kernel_patches/patches/linux-2.4.18-netdump.patch delete mode 100644 lustre/kernel_patches/patches/linux-2.4.19-bgl-xattr-0.8.54.patch delete mode 100644 lustre/kernel_patches/patches/linux-2.4.19-suse-xattr-0.8.54-hp.patch delete mode 100644 
lustre/kernel_patches/patches/linux-2.4.19-xattr-0.8.54-suse.patch create mode 100644 lustre/kernel_patches/patches/linux-2.4.24-jbd-handle-EIO-rhel3.patch create mode 100644 lustre/kernel_patches/patches/linux-2.6-binutils-2.16.patch create mode 100644 lustre/kernel_patches/patches/linux-2.6.9-ext3-sub-second-timestamp.patch delete mode 100644 lustre/kernel_patches/patches/listman-2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/mcore-2.4.20-8.patch delete mode 100644 lustre/kernel_patches/patches/mkdep-revert-rh-2.4.patch create mode 100644 lustre/kernel_patches/patches/nfs-cifs-intent-2.6.12.patch delete mode 100644 lustre/kernel_patches/patches/nfs_export_kernel-2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/removepage-2.4.19-suse.patch delete mode 100644 lustre/kernel_patches/patches/resched-2.4.19-pre1.patch delete mode 100644 lustre/kernel_patches/patches/socket-exports-2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/tcp-zero-copy-2.4.19-pre1.patch create mode 100644 lustre/kernel_patches/patches/uml-exprt-clearuser-2.6.12.patch delete mode 100644 lustre/kernel_patches/patches/vfs_intent-2.4.19-bgl.patch delete mode 100644 lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/vfs_nointent-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/vfs_races-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/vm-tunables-rhel4.patch delete mode 100644 lustre/kernel_patches/patches/vmalloc_to_page-2.4.19-bgl.patch create mode 100644 lustre/kernel_patches/series/2.6.12-vanilla.series delete mode 100644 lustre/kernel_patches/series/bgl-2.4.19 create mode 100644 lustre/kernel_patches/series/ldiskfs-2.6.12-vanilla.series create mode 100755 lustre/lov/lov_ea.c create mode 100644 lustre/mds/mds_join.c create mode 100644 lustre/obdclass/prng.c delete mode 100644 
lustre/quota/Makefile.in delete mode 100644 lustre/tests/tbox.sh delete mode 100755 lustre/tests/test-lwizard.sh delete mode 100755 lustre/utils/lwizard diff --git a/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch index 507b044..3f5687b 100644 --- a/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch +++ b/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch @@ -27,7 +27,7 @@ Index: linux-stage/fs/ext3/inode.c struct ext3_iloc *iloc, int in_mem) { unsigned long block; -@@ -2484,6 +2484,11 @@ +@@ -2484,6 +2484,11 @@ void ext3_read_inode(struct inode * inod ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); @@ -39,7 +39,7 @@ Index: linux-stage/fs/ext3/inode.c if (S_ISREG(inode->i_mode)) { inode->i_op = &ext3_file_inode_operations; inode->i_fop = &ext3_file_operations; -@@ -2619,6 +2624,9 @@ +@@ -2619,6 +2624,9 @@ static int ext3_do_update_inode(handle_t } else for (block = 0; block < EXT3_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; @@ -49,7 +49,7 @@ Index: linux-stage/fs/ext3/inode.c BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); rc = ext3_journal_dirty_metadata(handle, bh); if (!err) -@@ -2849,7 +2857,8 @@ +@@ -2849,7 +2857,8 @@ ext3_reserve_inode_write(handle_t *handl { int err = 0; if (handle) { diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch new file mode 100644 index 0000000..657ecf4 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch @@ -0,0 +1,2924 @@ +Index: linux-2.6.12-rc6/fs/ext3/extents.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/extents.c 2005-06-14 16:31:25.756503133 +0200 ++++ linux-2.6.12-rc6/fs/ext3/extents.c 2005-06-14 16:31:25.836581257 +0200 +@@ -0,0 +1,2347 @@ ++/* ++ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, 
Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++/* ++ * Extents support for EXT3 ++ * ++ * TODO: ++ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() ++ * - ext3_ext_calc_credits() could take 'mergable' into account ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - smart tree reduction ++ * - arch-independence ++ * common on-disk format for big/little-endian arch ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++static inline int ext3_ext_check_header(struct ext3_extent_header *eh) ++{ ++ if (eh->eh_magic != EXT3_EXT_MAGIC) { ++ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", ++ (unsigned)eh->eh_magic); ++ return -EIO; ++ } ++ if (eh->eh_max == 0) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n", ++ (unsigned)eh->eh_max); ++ return -EIO; ++ } ++ if (eh->eh_entries > eh->eh_max) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", ++ (unsigned)eh->eh_entries); ++ return -EIO; ++ } ++ return 0; ++} ++ ++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return 
handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} ++ ++static int inline ++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->get_write_access) ++ return tree->ops->get_write_access(h,tree->buffer); ++ else ++ return 0; ++} ++ ++static int inline ++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->mark_buffer_dirty) ++ return tree->ops->mark_buffer_dirty(h,tree->buffer); ++ else ++ return 0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ ++ if (path->p_bh) { ++ /* path points to block */ ++ err = ext3_journal_get_write_access(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_get_access_for_root(handle, tree); ++ } ++ return err; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ if (path->p_bh) { ++ /* path points to block */ ++ err =ext3_journal_dirty_metadata(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_mark_root_dirty(handle, tree); ++ } ++ return err; ++} ++ ++static int inline ++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, struct ext3_extent *ex, ++ int *err) ++{ ++ int goal, depth, newblock; ++ struct inode *inode; ++ ++ EXT_ASSERT(tree); ++ if (tree->ops->new_block) ++ return tree->ops->new_block(handle, tree, path, ex, err); ++ ++ inode = tree->inode; ++ depth = EXT_DEPTH(tree); ++ if (path && depth > 0) { ++ goal = path[depth-1].p_block; ++ } else { ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ ++ bg_start = (ei->i_block_group * 
++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ goal = bg_start + colour; ++ } ++ ++ newblock = ext3_new_block(handle, inode, goal, err); ++ return newblock; ++} ++ ++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *neh; ++ neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation++; ++} ++ ++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 5; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int k, l = path->p_depth; ++ ++ ext_debug(tree, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(tree, " %d->%d", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ } else if (path->p_ext) { ++ ext_debug(tree, " %d:%d:%d", ++ path->p_ext->ee_block, ++ path->p_ext->ee_len, 
++ path->p_ext->ee_start); ++ } else ++ ext_debug(tree, " []"); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; ++ int i; ++ ++ if (!path) ++ return; ++ ++ eh = path[depth].p_hdr; ++ ex = EXT_FIRST_EXTENT(eh); ++ ++ for (i = 0; i < eh->eh_entries; i++, ex++) { ++ ext_debug(tree, "%d:%d:%d ", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_drop_refs(struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) { ++ if (path->p_bh) { ++ brelse(path->p_bh); ++ path->p_bh = NULL; ++ } ++ } ++} ++ ++/* ++ * binary search for closest index by given block ++ */ ++static inline void ++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent_idx *ix; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_entries > 0); ++ ++ ext_debug(tree, "binsearch for %d(idx): ", block); ++ ++ path->p_idx = ix = EXT_FIRST_INDEX(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ix[l + k].ei_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ix += l; ++ path->p_idx = ix; ++ ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf); ++ ++ while (l++ < r) { ++ if (block < ix->ei_block) ++ break; ++ path->p_idx = ix++; ++ } ++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent_idx *chix; ++ ++ chix = ix = EXT_FIRST_INDEX(eh); ++ for (k = 0; k < eh->eh_entries; k++, ix++) { ++ if (k != 0 && ix->ei_block 
<= ix[-1].ei_block) { ++ printk("k=%d, ix=0x%p, first=0x%p\n", k, ++ ix, EXT_FIRST_INDEX(eh)); ++ printk("%u <= %u\n", ++ ix->ei_block,ix[-1].ei_block); ++ } ++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); ++ if (block < ix->ei_block) ++ break; ++ chix = ix; ++ } ++ EXT_ASSERT(chix == path->p_idx); ++ } ++#endif ++} ++ ++/* ++ * binary search for closest extent by given block ++ */ ++static inline void ++ext3_ext_binsearch(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent *ex; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ ++ if (eh->eh_entries == 0) { ++ /* ++ * this leaf is empty yet: ++ * we get such a leaf in split/add case ++ */ ++ return; ++ } ++ ++ ext_debug(tree, "binsearch for %d: ", block); ++ ++ path->p_ext = ex = EXT_FIRST_EXTENT(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ex[l + k].ee_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ex += l; ++ path->p_ext = ex; ++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++ while (l++ < r) { ++ if (block < ex->ee_block) ++ break; ++ path->p_ext = ex++; ++ } ++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent *chex; ++ ++ chex = ex = EXT_FIRST_EXTENT(eh); ++ for (k = 0; k < eh->eh_entries; k++, ex++) { ++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); ++ if (block < ex->ee_block) ++ break; ++ chex = ex; ++ } ++ EXT_ASSERT(chex == path->p_ext); ++ } ++#endif ++} ++ ++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *eh; ++ ++ BUG_ON(tree->buffer_len == 0); ++ ext3_ext_get_access_for_root(handle, tree); ++ eh = 
EXT_ROOT_HDR(tree); ++ eh->eh_depth = 0; ++ eh->eh_entries = 0; ++ eh->eh_magic = EXT3_EXT_MAGIC; ++ eh->eh_max = ext3_ext_space_root(tree); ++ ext3_ext_mark_root_dirty(handle, tree); ++ ext3_ext_invalidate_cache(tree); ++ return 0; ++} ++ ++struct ext3_ext_path * ++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ struct buffer_head *bh; ++ int depth, i, ppos = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ eh = EXT_ROOT_HDR(tree); ++ EXT_ASSERT(eh); ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ ++ i = depth = EXT_DEPTH(tree); ++ EXT_ASSERT(eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* account possible depth increase */ ++ if (!path) { ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), ++ GFP_NOFS); ++ if (!path) ++ return ERR_PTR(-ENOMEM); ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[0].p_hdr = eh; ++ ++ /* walk through the tree */ ++ while (i) { ++ ext_debug(tree, "depth %d: num %d, max %d\n", ++ ppos, eh->eh_entries, eh->eh_max); ++ ext3_ext_binsearch_idx(tree, path + ppos, block); ++ path[ppos].p_block = path[ppos].p_idx->ei_leaf; ++ path[ppos].p_depth = i; ++ path[ppos].p_ext = NULL; ++ ++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); ++ if (!bh) ++ goto err; ++ ++ eh = EXT_BLOCK_HDR(bh); ++ ppos++; ++ EXT_ASSERT(ppos <= depth); ++ path[ppos].p_bh = bh; ++ path[ppos].p_hdr = eh; ++ i--; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ } ++ ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ path[ppos].p_idx = NULL; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ ++ /* find extent */ ++ ext3_ext_binsearch(tree, path + ppos, block); ++ ++ ext3_ext_show_path(tree, path); ++ ++ return path; ++ ++err: ++ printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ return 
ERR_PTR(-EIO); ++} ++ ++/* ++ * insert new index [logical;ptr] into the block at cupr ++ * it check where to insert: before curp or after curp ++ */ ++static int ext3_ext_insert_index(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *curp, ++ int logical, int ptr) ++{ ++ struct ext3_extent_idx *ix; ++ int len, err; ++ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ return err; ++ ++ EXT_ASSERT(logical != curp->p_idx->ei_block); ++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; ++ if (logical > curp->p_idx->ei_block) { ++ /* insert after */ ++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d after: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ } ++ ix = curp->p_idx + 1; ++ } else { ++ /* insert before */ ++ len = len * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d before: %d. 
" ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->ei_block = logical; ++ ix->ei_leaf = ptr; ++ curp->p_hdr->eh_entries++; ++ ++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); ++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); ++ ++ err = ext3_ext_dirty(handle, tree, curp); ++ ext3_std_error(tree->inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at: ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extens and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ int i = at, k, m, a; ++ unsigned long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? */ ++ /* FIXME: now desicion is simplest: at current extent */ ++ ++ /* if current leaf will be splitted, then we should use ++ * border from split point */ ++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].ee_block; ++ ext_debug(tree, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->ee_block; ++ ext_debug(tree, "leaf will be added." ++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. 
so, index won't ++ * be inserted and tree will be in consistent ++ * state. next mount will repair buffers too ++ */ ++ ++ /* ++ * get array to track all allocated blocks ++ * we need this to handle errors and free blocks ++ * upon them ++ */ ++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); ++ if (!ablocks) ++ return -ENOMEM; ++ memset(ablocks, 0, sizeof(unsigned long) * depth); ++ ++ /* allocate all needed blocks */ ++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); ++ for (a = 0; a < depth - at; a++) { ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ goto cleanup; ++ ablocks[a] = newblock; ++ } ++ ++ /* initialize new leaf */ ++ newblock = ablocks[--a]; ++ EXT_ASSERT(newblock); ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 0; ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_depth = 0; ++ ex = EXT_FIRST_EXTENT(neh); ++ ++ /* move remain of path[depth] to the new leaf */ ++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); ++ /* start copy from next extent */ ++ /* TODO: we could do it by single memmove */ ++ m = 0; ++ path[depth].p_ext++; ++ while (path[depth].p_ext <= ++ EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", ++ path[depth].p_ext->ee_block, ++ path[depth].p_ext->ee_start, ++ path[depth].p_ext->ee_len, ++ newblock); ++ memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent)); ++ neh->eh_entries++; ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old leaf */ ++ if (m) { ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) 
++ goto cleanup; ++ path[depth].p_hdr->eh_entries -= m; ++ if ((err = ext3_ext_dirty(handle, tree, path + depth))) ++ goto cleanup; ++ ++ } ++ ++ /* create intermediate indexes */ ++ k = depth - at - 1; ++ EXT_ASSERT(k >= 0); ++ if (k) ++ ext_debug(tree, "create %d intermediate indices\n", k); ++ /* insert new index into current index block */ ++ /* current depth stored in i var */ ++ i = depth - 1; ++ while (k--) { ++ oldblock = newblock; ++ newblock = ablocks[--a]; ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 1; ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ neh->eh_depth = depth - i; ++ fidx = EXT_FIRST_INDEX(neh); ++ fidx->ei_block = border; ++ fidx->ei_leaf = oldblock; ++ ++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", ++ i, newblock, border, oldblock); ++ /* copy indexes */ ++ m = 0; ++ path[i].p_idx++; ++ ++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); ++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == ++ EXT_LAST_INDEX(path[i].p_hdr)); ++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ++ ext_debug(tree, "%d: move %d:%d in new index %lu\n", ++ i, path[i].p_idx->ei_block, ++ path[i].p_idx->ei_leaf, newblock); ++ memmove(++fidx, path[i].p_idx++, ++ sizeof(struct ext3_extent_idx)); ++ neh->eh_entries++; ++ EXT_ASSERT(neh->eh_entries <= neh->eh_max); ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old index */ ++ if (m) { ++ err = ext3_ext_get_access(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ path[i].p_hdr->eh_entries -= m; ++ err = ext3_ext_dirty(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ } ++ 
++ i--; ++ } ++ ++ /* insert new index */ ++ if (!err) ++ err = ext3_ext_insert_index(handle, tree, path + at, ++ border, newblock); ++ ++cleanup: ++ if (bh) { ++ if (buffer_locked(bh)) ++ unlock_buffer(bh); ++ brelse(bh); ++ } ++ ++ if (err) { ++ /* free all allocated blocks in error case */ ++ for (i = 0; i < depth; i++) { ++ if (!ablocks[i]) ++ continue; ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ } ++ } ++ kfree(ablocks); ++ ++ return err; ++} ++ ++/* ++ * routine implements tree growing procedure: ++ * - allocates new block ++ * - moves top-level data (index block or leaf) into the new block ++ * - initialize new top-level, creating index that points to the ++ * just created block ++ */ ++static int ext3_ext_grow_indepth(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp = path; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct buffer_head *bh; ++ unsigned long newblock; ++ int err = 0; ++ ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ return err; ++ ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ ext3_std_error(tree->inode->i_sb, err); ++ return err; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) { ++ unlock_buffer(bh); ++ goto out; ++ } ++ ++ /* move top-level index/leaf into new block */ ++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); ++ ++ /* set size of new block */ ++ neh = EXT_BLOCK_HDR(bh); ++ /* old root could have indexes or leaves ++ * so calculate eh_max right way */ ++ if (EXT_DEPTH(tree)) ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ else ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto out; ++ ++ /* create index in new top-level 
index: num,max,pointer */ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ goto out; ++ ++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; ++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); ++ curp->p_hdr->eh_entries = 1; ++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); ++ /* FIXME: it works, but actually path[0] can be index */ ++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; ++ curp->p_idx->ei_leaf = newblock; ++ ++ neh = EXT_ROOT_HDR(tree); ++ fidx = EXT_FIRST_INDEX(neh); ++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", ++ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); ++ ++ neh->eh_depth = path->p_depth + 1; ++ err = ext3_ext_dirty(handle, tree, curp); ++out: ++ brelse(bh); ++ ++ return err; ++} ++ ++/* ++ * routine finds empty index and adds new leaf. if no free index found ++ * then it requests in-depth growing ++ */ ++static int ext3_ext_create_new_leaf(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp; ++ int depth, i, err = 0; ++ ++repeat: ++ i = depth = EXT_DEPTH(tree); ++ ++ /* walk up to the tree and look for free index entry */ ++ curp = path + depth; ++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { ++ i--; ++ curp--; ++ } ++ ++ /* we use already allocated block for index block ++ * so, subsequent data blocks should be contigoues */ ++ if (EXT_HAS_FREE_INDEX(curp)) { ++ /* if we found index with free entry, then use that ++ * entry: create all needed subtree and add new leaf */ ++ err = ext3_ext_split(handle, tree, path, newext, i); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ } else { ++ /* tree is full, time to grow in depth */ ++ err = ext3_ext_grow_indepth(handle, tree, path, newext); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, 
newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ ++ /* ++ * only first (depth 0 -> 1) produces free space ++ * in all other cases we have to split growed tree ++ */ ++ depth = EXT_DEPTH(tree); ++ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { ++ /* now we need split */ ++ goto repeat; ++ } ++ } ++ ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++/* ++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK ++ * NOTE: it consider block number from index entry as ++ * allocated block. thus, index entries have to be consistent ++ * with leafs ++ */ ++static unsigned long ++ext3_ext_next_allocated_block(struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ if (depth == 0 && path->p_ext == NULL) ++ return EXT_MAX_BLOCK; ++ ++ /* FIXME: what if index isn't full ?! */ ++ while (depth >= 0) { ++ if (depth == path->p_depth) { ++ /* leaf */ ++ if (path[depth].p_ext != ++ EXT_LAST_EXTENT(path[depth].p_hdr)) ++ return path[depth].p_ext[1].ee_block; ++ } else { ++ /* index */ ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ } ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * returns first allocated block from next leaf or EXT_MAX_BLOCK ++ */ ++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ /* zero-tree has no leaf blocks at all */ ++ if (depth == 0) ++ return EXT_MAX_BLOCK; ++ ++ /* go to index block */ ++ depth--; ++ ++ while (depth >= 0) { ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * if leaf gets modified and modified extent is first in the leaf ++ * then we have to correct all indexes above ++ * TODO: do we need to correct tree in all 
cases? ++ */ ++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent *ex; ++ unsigned long border; ++ int k, err = 0; ++ ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(eh); ++ ++ if (depth == 0) { ++ /* there is no tree at all */ ++ return 0; ++ } ++ ++ if (ex != EXT_FIRST_EXTENT(eh)) { ++ /* we correct tree if first leaf got modified only */ ++ return 0; ++ } ++ ++ /* ++ * TODO: we need correction if border is smaller then current one ++ */ ++ k = depth - 1; ++ border = path[depth].p_ext->ee_block; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ return err; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ return err; ++ ++ while (k--) { ++ /* change all left-side indexes */ ++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) ++ break; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ break; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ break; ++ } ++ ++ return err; ++} ++ ++static int inline ++ext3_can_extents_be_merged(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block) ++ return 0; ++ ++#ifdef AGRESSIVE_TEST ++ if (ex1->ee_len >= 4) ++ return 0; ++#endif ++ ++ if (!tree->ops->mergable) ++ return 1; ++ ++ return tree->ops->mergable(ex1, ex2); ++} ++ ++/* ++ * this routine tries to merge requsted extent into the existing ++ * extent or inserts requested extent as new one into the tree, ++ * creating new leaf in no-space case ++ */ ++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_extent_header * eh; ++ struct ext3_extent *ex, *fex; ++ struct 
ext3_extent *nearex; /* nearest extent */ ++ struct ext3_ext_path *npath = NULL; ++ int depth, len, err, next; ++ ++ EXT_ASSERT(newext->ee_len > 0); ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(path[depth].p_hdr); ++ ++ /* try to insert block into found extent and return */ ++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { ++ ext_debug(tree, "append %d block to %d:%d (from %d)\n", ++ newext->ee_len, ex->ee_block, ex->ee_len, ++ ex->ee_start); ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ return err; ++ ex->ee_len += newext->ee_len; ++ eh = path[depth].p_hdr; ++ nearex = ex; ++ goto merge; ++ } ++ ++repeat: ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) ++ goto has_space; ++ ++ /* probably next leaf has space for us? */ ++ fex = EXT_LAST_EXTENT(eh); ++ next = ext3_ext_next_leaf_block(tree, path); ++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { ++ ext_debug(tree, "next leaf block - %d\n", next); ++ EXT_ASSERT(!npath); ++ npath = ext3_ext_find_extent(tree, next, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ EXT_ASSERT(npath->p_depth == path->p_depth); ++ eh = npath[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) { ++ ext_debug(tree, "next leaf isnt full(%d)\n", ++ eh->eh_entries); ++ path = npath; ++ goto repeat; ++ } ++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", ++ eh->eh_entries, eh->eh_max); ++ } ++ ++ /* ++ * there is no free space in found leaf ++ * we're gonna add new leaf in the tree ++ */ ++ err = ext3_ext_create_new_leaf(handle, tree, path, newext); ++ if (err) ++ goto cleanup; ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ ++has_space: ++ nearex = path[depth].p_ext; ++ ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ ++ if (!nearex) { ++ /* there is no extent in this leaf, create first one */ ++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", ++ 
newext->ee_block, newext->ee_start, ++ newext->ee_len); ++ path[depth].p_ext = EXT_FIRST_EXTENT(eh); ++ } else if (newext->ee_block > nearex->ee_block) { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ if (nearex != EXT_LAST_EXTENT(eh)) { ++ len = EXT_MAX_EXTENT(eh) - nearex; ++ len = (len - 1) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 2, nearex + 1, len); ++ } ++ path[depth].p_ext = nearex + 1; ++ } else { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 1, nearex, len); ++ path[depth].p_ext = nearex; ++ } ++ ++ eh->eh_entries++; ++ nearex = path[depth].p_ext; ++ nearex->ee_block = newext->ee_block; ++ nearex->ee_start = newext->ee_start; ++ nearex->ee_len = newext->ee_len; ++ /* FIXME: support for large fs */ ++ nearex->ee_start_hi = 0; ++ ++merge: ++ /* try to merge extents to the right */ ++ while (nearex < EXT_LAST_EXTENT(eh)) { ++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) ++ break; ++ /* merge with next extent! 
*/ ++ nearex->ee_len += nearex[1].ee_len; ++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { ++ len = (EXT_LAST_EXTENT(eh) - nearex - 1) * ++ sizeof(struct ext3_extent); ++ memmove(nearex + 1, nearex + 2, len); ++ } ++ eh->eh_entries--; ++ EXT_ASSERT(eh->eh_entries > 0); ++ } ++ ++ /* try to merge extents to the left */ ++ ++ /* time to correct all indexes above */ ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ if (err) ++ goto cleanup; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ ++cleanup: ++ if (npath) { ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ } ++ ext3_ext_tree_changed(tree); ++ ext3_ext_invalidate_cache(tree); ++ return err; ++} ++ ++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, ++ unsigned long num, ext_prepare_callback func) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_ext_cache cbex; ++ struct ext3_extent *ex; ++ unsigned long next, start = 0, end = 0; ++ unsigned long last = block + num; ++ int depth, exists, err = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(func); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ while (block < last && block != EXT_MAX_BLOCK) { ++ num = last - block; ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(tree, block, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ break; ++ } ++ ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(path[depth].p_hdr); ++ ex = path[depth].p_ext; ++ next = ext3_ext_next_allocated_block(path); ++ ++ exists = 0; ++ if (!ex) { ++ /* there is no extent yet, so try to allocate ++ * all requested space */ ++ start = block; ++ end = block + num; ++ } else if (ex->ee_block > block) { ++ /* need to allocate space before found extent */ ++ start = block; ++ end = ex->ee_block; ++ if (block + num < end) ++ end = block + num; ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ /* need to allocate space after found extent */ ++ start = block; ++ end = block + num; ++ if (end >= next) ++ end = next; 
++ } else if (block >= ex->ee_block) { ++ /* ++ * some part of requested space is covered ++ * by found extent ++ */ ++ start = block; ++ end = ex->ee_block + ex->ee_len; ++ if (block + num < end) ++ end = block + num; ++ exists = 1; ++ } else { ++ BUG(); ++ } ++ EXT_ASSERT(end > start); ++ ++ if (!exists) { ++ cbex.ec_block = start; ++ cbex.ec_len = end - start; ++ cbex.ec_start = 0; ++ cbex.ec_type = EXT3_EXT_CACHE_GAP; ++ } else { ++ cbex.ec_block = ex->ee_block; ++ cbex.ec_len = ex->ee_len; ++ cbex.ec_start = ex->ee_start; ++ cbex.ec_type = EXT3_EXT_CACHE_EXTENT; ++ } ++ ++ EXT_ASSERT(cbex.ec_len > 0); ++ EXT_ASSERT(path[depth].p_hdr); ++ err = func(tree, path, &cbex); ++ ext3_ext_drop_refs(path); ++ ++ if (err < 0) ++ break; ++ if (err == EXT_REPEAT) ++ continue; ++ else if (err == EXT_BREAK) { ++ err = 0; ++ break; ++ } ++ ++ if (EXT_DEPTH(tree) != depth) { ++ /* depth was changed. we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ block = cbex.ec_block + cbex.ec_len; ++ } ++ ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ return err; ++} ++ ++static inline void ++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, ++ __u32 len, __u32 start, int type) ++{ ++ EXT_ASSERT(len > 0); ++ if (tree->cex) { ++ tree->cex->ec_type = type; ++ tree->cex->ec_block = block; ++ tree->cex->ec_len = len; ++ tree->cex->ec_start = start; ++ } ++} ++ ++/* ++ * this routine calculate boundaries of the gap requested block fits into ++ * and cache this gap ++ */ ++static inline void ++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ unsigned long block) ++{ ++ int depth = EXT_DEPTH(tree); ++ unsigned long lblock, len; ++ struct ext3_extent *ex; ++ ++ if (!tree->cex) ++ return; ++ ++ ex = path[depth].p_ext; ++ if (ex == NULL) { ++ /* there is no extent yet, so gap is [0;-] */ ++ lblock = 0; ++ len = EXT_MAX_BLOCK; ++ ext_debug(tree, "cache gap(whole file):"); ++ } else if (block < 
ex->ee_block) { ++ lblock = block; ++ len = ex->ee_block - block; ++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len); ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ lblock = ex->ee_block + ex->ee_len; ++ len = ext3_ext_next_allocated_block(path); ++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) block); ++ EXT_ASSERT(len > lblock); ++ len = len - lblock; ++ } else { ++ lblock = len = 0; ++ BUG(); ++ } ++ ++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len); ++ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP); ++} ++ ++static inline int ++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, ++ struct ext3_extent *ex) ++{ ++ struct ext3_ext_cache *cex = tree->cex; ++ ++ /* is there cache storage at all? */ ++ if (!cex) ++ return EXT3_EXT_CACHE_NO; ++ ++ /* has cache valid data? */ ++ if (cex->ec_type == EXT3_EXT_CACHE_NO) ++ return EXT3_EXT_CACHE_NO; ++ ++ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || ++ cex->ec_type == EXT3_EXT_CACHE_EXTENT); ++ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { ++ ex->ee_block = cex->ec_block; ++ ex->ee_start = cex->ec_start; ++ ex->ee_len = cex->ec_len; ++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) ex->ee_start); ++ return cex->ec_type; ++ } ++ ++ /* not in cache */ ++ return EXT3_EXT_CACHE_NO; ++} ++ ++/* ++ * routine removes index from the index block ++ * it's used in truncate case only. 
thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->eh_entries); ++ if ((err = ext3_ext_get_access(handle, tree, path))) ++ return err; ++ path->p_hdr->eh_entries--; ++ if ((err = ext3_ext_dirty(handle, tree, path))) ++ return err; ++ ext_debug(tree, "index is empty, remove it, free block %d\n", ++ path->p_idx->ei_leaf); ++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ return err; ++} ++ ++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth = EXT_DEPTH(tree); ++ int needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) ++ return 1; ++ } ++ ++ /* ++ * the worste case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ depth = depth + 1; ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. 
we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ return needed; ++} ++ ++static int ++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, tex; ++ struct ext3_ext_path *npath; ++ int depth, creds, err; ++ ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); ++ EXT_ASSERT(ex->ee_block < start); ++ ++ /* calculate tail extent */ ++ tex.ee_block = end + 1; ++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); ++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; ++ ++ creds = ext3_ext_calc_credits_for_insert(tree, path); ++ handle = ext3_ext_journal_restart(handle, creds); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ /* calculate head extent. use primary extent */ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ return err; ++ ex->ee_len = start - ex->ee_block; ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ return err; ++ ++ /* FIXME: some callback to free underlying resource ++ * and correct ee_start? 
*/ ++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", ++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); ++ ++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); ++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); ++ ++ err = ext3_ext_insert_extent(handle, tree, npath, &tex); ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ ++ return err; ++} ++ ++static int ++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, *fu = NULL, *lu, *le; ++ int err = 0, correct_index = 0; ++ int depth = EXT_DEPTH(tree), credits; ++ struct ext3_extent_header *eh; ++ unsigned a, b, block, num; ++ ++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); ++ if (!path[depth].p_hdr) ++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); ++ eh = path[depth].p_hdr; ++ EXT_ASSERT(eh); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* find where to start removing */ ++ le = ex = EXT_LAST_EXTENT(eh); ++ while (ex != EXT_FIRST_EXTENT(eh)) { ++ if (ex->ee_block <= end) ++ break; ++ ex--; ++ } ++ ++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { ++ /* removal of internal part of the extent requested ++ * tail and head must be placed in different extent ++ * so, we have to insert one more extent */ ++ path[depth].p_ext = ex; ++ return ext3_ext_split_for_rm(handle, tree, path, start, end); ++ } ++ ++ lu = ex; ++ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) { ++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); ++ path[depth].p_ext = ex; ++ ++ a = ex->ee_block > start ? ex->ee_block : start; ++ b = ex->ee_block + ex->ee_len - 1 < end ? 
++ ex->ee_block + ex->ee_len - 1 : end; ++ ++ ext_debug(tree, " border %u:%u\n", a, b); ++ ++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { ++ block = 0; ++ num = 0; ++ BUG(); ++ } else if (a != ex->ee_block) { ++ /* remove tail of the extent */ ++ block = ex->ee_block; ++ num = a - block; ++ } else if (b != ex->ee_block + ex->ee_len - 1) { ++ /* remove head of the extent */ ++ block = a; ++ num = b - a; ++ } else { ++ /* remove whole extent: excelent! */ ++ block = ex->ee_block; ++ num = 0; ++ EXT_ASSERT(a == ex->ee_block && ++ b == ex->ee_block + ex->ee_len - 1); ++ } ++ ++ if (ex == EXT_FIRST_EXTENT(eh)) ++ correct_index = 1; ++ ++ credits = 1; ++ if (correct_index) ++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; ++ if (tree->ops->remove_extent_credits) ++ credits+=tree->ops->remove_extent_credits(tree,ex,a,b); ++ ++ handle = ext3_ext_journal_restart(handle, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ goto out; ++ } ++ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ if (tree->ops->remove_extent) ++ err = tree->ops->remove_extent(tree, ex, a, b); ++ if (err) ++ goto out; ++ ++ if (num == 0) { ++ /* this extent is removed entirely mark slot unused */ ++ ex->ee_start = 0; ++ eh->eh_entries--; ++ fu = ex; ++ } ++ ++ ex->ee_block = block; ++ ex->ee_len = num; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ ext_debug(tree, "new extent: %u:%u:%u\n", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ ex--; ++ } ++ ++ if (fu) { ++ /* reuse unused slots */ ++ while (lu < le) { ++ if (lu->ee_start) { ++ *fu = *lu; ++ lu->ee_start = 0; ++ fu++; ++ } ++ lu++; ++ } ++ } ++ ++ if (correct_index && eh->eh_entries) ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ ++ /* if this leaf is free, then we should ++ * remove it from index block above */ ++ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) ++ err = ext3_ext_rm_idx(handle, tree, 
path + depth); ++ ++out: ++ return err; ++} ++ ++ ++static struct ext3_extent_idx * ++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) ++{ ++ struct ext3_extent_idx *ix; ++ ++ ix = EXT_LAST_INDEX(hdr); ++ while (ix != EXT_FIRST_INDEX(hdr)) { ++ if (ix->ei_block <= block) ++ break; ++ ix--; ++ } ++ return ix; ++} ++ ++/* ++ * returns 1 if current index have to be freed (even partial) ++ */ ++static int inline ++ext3_ext_more_to_rm(struct ext3_ext_path *path) ++{ ++ EXT_ASSERT(path->p_idx); ++ ++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) ++ return 0; ++ ++ /* ++ * if truncate on deeper level happened it it wasn't partial ++ * so we have to consider current index for truncation ++ */ ++ if (path->p_hdr->eh_entries == path->p_block) ++ return 0; ++ return 1; ++} ++ ++int ext3_ext_remove_space(struct ext3_extents_tree *tree, ++ unsigned long start, unsigned long end) ++{ ++ struct inode *inode = tree->inode; ++ struct super_block *sb = inode->i_sb; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_ext_path *path; ++ handle_t *handle; ++ int i = 0, err = 0; ++ ++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); ++ ++ /* probably first extent we're gonna free will be last in block */ ++ handle = ext3_journal_start(inode, depth + 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ ext3_ext_invalidate_cache(tree); ++ ++ /* ++ * we start scanning from right side freeing all the blocks ++ * after i_size and walking into the deep ++ */ ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); ++ if (IS_ERR(path)) { ++ ext3_error(sb, __FUNCTION__, "Can't allocate path array"); ++ ext3_journal_stop(handle); ++ return -ENOMEM; ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[i].p_hdr = EXT_ROOT_HDR(tree); ++ ++ while (i >= 0 && err == 0) { ++ if (i == depth) { ++ /* this is leaf block */ ++ err = ext3_ext_rm_leaf(handle, tree, path, start, end); ++ /* root level have p_bh == NULL, 
brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ continue; ++ } ++ ++ /* this is index block */ ++ if (!path[i].p_hdr) { ++ ext_debug(tree, "initialize header\n"); ++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); ++ } ++ ++ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); ++ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); ++ ++ if (!path[i].p_idx) { ++ /* this level hasn't touched yet */ ++ path[i].p_idx = ++ ext3_ext_last_covered(path[i].p_hdr, end); ++ path[i].p_block = path[i].p_hdr->eh_entries + 1; ++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", ++ path[i].p_hdr, path[i].p_hdr->eh_entries); ++ } else { ++ /* we've already was here, see at next index */ ++ path[i].p_idx--; ++ } ++ ++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", ++ i, EXT_FIRST_INDEX(path[i].p_hdr), ++ path[i].p_idx); ++ if (ext3_ext_more_to_rm(path + i)) { ++ /* go to the next level */ ++ ext_debug(tree, "move to level %d (block %d)\n", ++ i + 1, path[i].p_idx->ei_leaf); ++ memset(path + i + 1, 0, sizeof(*path)); ++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); ++ if (!path[i+1].p_bh) { ++ /* should we reset i_size? 
*/ ++ err = -EIO; ++ break; ++ } ++ /* put actual number of indexes to know is this ++ * number got changed at the next iteration */ ++ path[i].p_block = path[i].p_hdr->eh_entries; ++ i++; ++ } else { ++ /* we finish processing this index, go up */ ++ if (path[i].p_hdr->eh_entries == 0 && i > 0) { ++ /* index is empty, remove it ++ * handle must be already prepared by the ++ * truncatei_leaf() */ ++ err = ext3_ext_rm_idx(handle, tree, path + i); ++ } ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ ext_debug(tree, "return to level %d\n", i); ++ } ++ } ++ ++ /* TODO: flexible tree reduction should be here */ ++ if (path->p_hdr->eh_entries == 0) { ++ /* ++ * truncate to zero freed all the tree ++ * so, we need to correct eh_depth ++ */ ++ err = ext3_ext_get_access(handle, tree, path); ++ if (err == 0) { ++ EXT_ROOT_HDR(tree)->eh_depth = 0; ++ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree); ++ err = ext3_ext_dirty(handle, tree, path); ++ } ++ } ++ ext3_ext_tree_changed(tree); ++ ++ kfree(path); ++ ext3_journal_stop(handle); ++ ++ return err; ++} ++ ++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks) ++{ ++ int lcap, icap, rcap, leafs, idxs, num; ++ ++ rcap = ext3_ext_space_root(tree); ++ if (blocks <= rcap) { ++ /* all extents fit to the root */ ++ return 0; ++ } ++ ++ rcap = ext3_ext_space_root_idx(tree); ++ lcap = ext3_ext_space_block(tree); ++ icap = ext3_ext_space_block_idx(tree); ++ ++ num = leafs = (blocks + lcap - 1) / lcap; ++ if (leafs <= rcap) { ++ /* all pointers to leafs fit to the root */ ++ return leafs; ++ } ++ ++ /* ok. 
we need separate index block(s) to link all leaf blocks */ ++ idxs = (leafs + icap - 1) / icap; ++ do { ++ num += idxs; ++ idxs = (idxs + icap - 1) / icap; ++ } while (idxs > rcap); ++ ++ return num; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) { ++ printk("EXT3-fs: file extents enabled"); ++#ifdef AGRESSIVE_TEST ++ printk(", agressive tests"); ++#endif ++#ifdef CHECK_BINSEARCH ++ printk(", check binsearch"); ++#endif ++ printk("\n"); ++ } ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++} ++ ++/************************************************************************ ++ * VFS related routines ++ ************************************************************************/ ++ ++static int ext3_get_inode_write_access(handle_t *handle, void *buffer) ++{ ++ /* we use in-core data, not bh */ ++ return 0; ++} ++ ++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct inode *inode = buffer; ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static int ext3_ext_mergable(struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ /* FIXME: support for large fs */ ++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) ++ return 1; ++ return 0; ++} ++ ++static int ++ext3_remove_blocks_credits(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed; ++ ++ /* at present, extent can't cross block group */; ++ needed = 4; /* bitmap + group desc + sb + inode */ ++ ++#ifdef CONFIG_QUOTA ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ return needed; ++} ++ ++static int ++ext3_remove_blocks(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed = ext3_remove_blocks_credits(tree, ex, from, to); ++ handle_t *handle = 
ext3_journal_start(tree->inode, needed); ++ struct buffer_head *bh; ++ int i; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { ++ /* tail removal */ ++ unsigned long num, start; ++ num = ex->ee_block + ex->ee_len - from; ++ start = ex->ee_start + ex->ee_len - num; ++ ext_debug(tree, "free last %lu blocks starting %lu\n", ++ num, start); ++ for (i = 0; i < num; i++) { ++ bh = sb_find_get_block(tree->inode->i_sb, start + i); ++ ext3_forget(handle, 0, tree->inode, bh, start + i); ++ } ++ ext3_free_blocks(handle, tree->inode, start, num); ++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { ++ printk("strange request: removal %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } else { ++ printk("strange request: removal(2) %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } ++ ext3_journal_stop(handle); ++ return 0; ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, ++ struct ext3_ext_path *path, unsigned long block) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ struct ext3_extent *ex; ++ depth = path->p_depth; ++ ++ /* try to predict block placement */ ++ if ((ex = path[depth].p_ext)) ++ return ex->ee_start + (block - ex->ee_block); ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. 
use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour + block; ++} ++ ++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int *err) ++{ ++ struct inode *inode = tree->inode; ++ int newblock, goal; ++ ++ EXT_ASSERT(path); ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->ee_start); ++ EXT_ASSERT(ex->ee_len); ++ ++ /* reuse block from the extent to order data/metadata */ ++ newblock = ex->ee_start++; ++ ex->ee_len--; ++ if (ex->ee_len == 0) { ++ ex->ee_len = 1; ++ /* allocate new block for the extent */ ++ goal = ext3_ext_find_goal(inode, path, ex->ee_block); ++ ex->ee_start = ext3_new_block(handle, inode, goal, err); ++ if (ex->ee_start == 0) { ++ /* error occured: restore old extent */ ++ ex->ee_start = newblock; ++ return 0; ++ } ++ } ++ return newblock; ++} ++ ++static struct ext3_extents_helpers ext3_blockmap_helpers = { ++ .get_write_access = ext3_get_inode_write_access, ++ .mark_buffer_dirty = ext3_mark_buffer_dirty, ++ .mergable = ext3_ext_mergable, ++ .new_block = ext3_new_block_cb, ++ .remove_extent = ext3_remove_blocks, ++ .remove_extent_credits = ext3_remove_blocks_credits, ++}; ++ ++void ext3_init_tree_desc(struct ext3_extents_tree *tree, ++ struct inode *inode) ++{ ++ tree->inode = inode; ++ tree->root = (void *) EXT3_I(inode)->i_data; ++ tree->buffer = (void *) inode; ++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); ++ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; ++ tree->ops = &ext3_blockmap_helpers; ++} ++ ++int ext3_ext_get_block(handle_t *handle, struct inode *inode, ++ long iblock, struct buffer_head *bh_result, ++ int create, int extend_disksize) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent newex; ++ struct 
ext3_extent *ex; ++ int goal, newblock, err = 0, depth; ++ struct ext3_extents_tree tree; ++ ++ clear_buffer_new(bh_result); ++ ext3_init_tree_desc(&tree, inode); ++ ext_debug(&tree, "block %d requested for inode %u\n", ++ (int) iblock, (unsigned) inode->i_ino); ++ down(&EXT3_I(inode)->truncate_sem); ++ ++ /* check in cache */ ++ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { ++ if (goal == EXT3_EXT_CACHE_GAP) { ++ if (!create) { ++ /* block isn't allocated yet and ++ * user don't want to allocate it */ ++ goto out2; ++ } ++ /* we should allocate requested block */ ++ } else if (goal == EXT3_EXT_CACHE_EXTENT) { ++ /* block is already allocated */ ++ newblock = iblock - newex.ee_block + newex.ee_start; ++ goto out; ++ } else { ++ EXT_ASSERT(0); ++ } ++ } ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(&tree, iblock, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ goto out2; ++ } ++ ++ depth = EXT_DEPTH(&tree); ++ ++ /* ++ * consistent leaf must not be empty ++ * this situations is possible, though, _during_ tree modification ++ * this is why assert can't be put in ext3_ext_find_extent() ++ */ ++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); ++ ++ if ((ex = path[depth].p_ext)) { ++ /* if found exent covers block, simple return it */ ++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { ++ newblock = iblock - ex->ee_block + ex->ee_start; ++ ext_debug(&tree, "%d fit into %d:%d -> %d\n", ++ (int) iblock, ex->ee_block, ex->ee_len, ++ newblock); ++ ext3_ext_put_in_cache(&tree, ex->ee_block, ++ ex->ee_len, ex->ee_start, ++ EXT3_EXT_CACHE_EXTENT); ++ goto out; ++ } ++ } ++ ++ /* ++ * requested block isn't allocated yet ++ * we couldn't try to create block if create flag is zero ++ */ ++ if (!create) { ++ /* put just found gap into cache to speedup subsequest reqs */ ++ ext3_ext_put_gap_in_cache(&tree, path, iblock); ++ goto out2; ++ } ++ ++ /* allocate new block */ ++ goal = 
ext3_ext_find_goal(inode, path, iblock); ++ newblock = ext3_new_block(handle, inode, goal, &err); ++ if (!newblock) ++ goto out2; ++ ext_debug(&tree, "allocate new block: goal %d, found %d\n", ++ goal, newblock); ++ ++ /* try to insert new extent into found leaf and return */ ++ newex.ee_block = iblock; ++ newex.ee_start = newblock; ++ newex.ee_len = 1; ++ err = ext3_ext_insert_extent(handle, &tree, path, &newex); ++ if (err) ++ goto out2; ++ ++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ++ /* previous routine could use block we allocated */ ++ newblock = newex.ee_start; ++ set_buffer_new(bh_result); ++ ++ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, ++ newex.ee_start, EXT3_EXT_CACHE_EXTENT); ++out: ++ ext3_ext_show_leaf(&tree, path); ++ map_bh(bh_result, inode->i_sb, newblock); ++out2: ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ up(&EXT3_I(inode)->truncate_sem); ++ ++ return err; ++} ++ ++void ext3_ext_truncate(struct inode * inode, struct page *page) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct super_block *sb = inode->i_sb; ++ struct ext3_extents_tree tree; ++ unsigned long last_block; ++ handle_t *handle; ++ int err = 0; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ err = ext3_writepage_trans_blocks(inode) + 3; ++ handle = ext3_journal_start(inode, err); ++ if (IS_ERR(handle)) { ++ if (page) { ++ clear_highpage(page); ++ flush_dcache_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ return; ++ } ++ ++ if (page) ++ ext3_block_truncate_page(handle, page, mapping, inode->i_size); ++ ++ down(&EXT3_I(inode)->truncate_sem); ++ ext3_ext_invalidate_cache(&tree); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need not scaning at all, ++ * because page truncation is enough ++ */ ++ if (ext3_orphan_add(handle, inode)) ++ 
goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ last_block = (inode->i_size + sb->s_blocksize - 1) >> ++ EXT3_BLOCK_SIZE_BITS(sb); ++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. ++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ up(&EXT3_I(inode)->truncate_sem); ++ ext3_journal_stop(handle); ++} ++ ++/* ++ * this routine calculate max number of blocks we could modify ++ * in order to allocate new block for an inode ++ */ ++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) ++{ ++ struct ext3_extents_tree tree; ++ int needed; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); ++ ++ /* caller want to allocate num blocks */ ++ needed *= num; ++ ++#ifdef CONFIG_QUOTA ++ /* ++ * FIXME: real calculation should be here ++ * it depends on blockmap format of qouta file ++ */ ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return needed; ++} ++ ++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ext3_extent_tree_init(handle, &tree); ++} ++ ++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ return ext3_ext_calc_metadata_amount(&tree, blocks); ++} ++ ++static int 
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *newex) ++{ ++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; ++ ++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ if (buf->err < 0) ++ return EXT_BREAK; ++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) ++ return EXT_BREAK; ++ ++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { ++ buf->err++; ++ buf->cur += sizeof(*newex); ++ } else { ++ buf->err = -EFAULT; ++ return EXT_BREAK; ++ } ++ return EXT_CONTINUE; ++} ++ ++static int ++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *ex) ++{ ++ struct ext3_extent_tree_stats *buf = ++ (struct ext3_extent_tree_stats *) tree->private; ++ int depth; ++ ++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ depth = EXT_DEPTH(tree); ++ buf->extents_num++; ++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) ++ buf->leaf_num++; ++ return EXT_CONTINUE; ++} ++ ++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err = 0; ++ ++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) ++ return -EINVAL; ++ ++ if (cmd == EXT3_IOC_GET_EXTENTS) { ++ struct ext3_extent_buf buf; ++ struct ext3_extents_tree tree; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ ++ ext3_init_tree_desc(&tree, inode); ++ buf.cur = buf.buffer; ++ buf.err = 0; ++ tree.private = &buf; ++ down(&EXT3_I(inode)->truncate_sem); ++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, ++ ext3_ext_store_extent_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (err == 0) ++ err = buf.err; ++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { ++ struct ext3_extent_tree_stats buf; ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ down(&EXT3_I(inode)->truncate_sem); ++ buf.depth = 
EXT_DEPTH(&tree); ++ buf.extents_num = 0; ++ buf.leaf_num = 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, ++ ext3_ext_collect_stats_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (!err && copy_to_user((void *) arg, &buf, sizeof(buf))) ++ err = -EFAULT; /* copy_to_user() returns bytes left, not an errno */ ++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { ++ struct ext3_extents_tree tree; ++ ext3_init_tree_desc(&tree, inode); ++ down(&EXT3_I(inode)->truncate_sem); ++ err = EXT_DEPTH(&tree); ++ up(&EXT3_I(inode)->truncate_sem); ++ } ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(ext3_init_tree_desc); ++EXPORT_SYMBOL(ext3_mark_inode_dirty); ++EXPORT_SYMBOL(ext3_ext_invalidate_cache); ++EXPORT_SYMBOL(ext3_ext_insert_extent); ++EXPORT_SYMBOL(ext3_ext_walk_space); ++EXPORT_SYMBOL(ext3_ext_find_goal); ++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); +Index: linux-2.6.12-rc6/fs/ext3/ialloc.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/ialloc.c 2005-06-14 16:31:08.634433030 +0200 ++++ linux-2.6.12-rc6/fs/ext3/ialloc.c 2005-06-14 16:31:25.846346882 +0200 +@@ -598,7 +598,7 @@ + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + +- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; ++ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); + if (S_ISLNK(mode)) + ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); + /* dirsync only applies to directories */ +@@ -639,6 +639,18 @@ + DQUOT_FREE_INODE(inode); + goto fail2; + } ++ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++ if (err) goto fail; ++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 
++ } ++ } ++ + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); +Index: linux-2.6.12-rc6/fs/ext3/inode.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/inode.c 2005-06-14 16:31:09.701815830 +0200 ++++ linux-2.6.12-rc6/fs/ext3/inode.c 2005-06-14 16:31:25.861971882 +0200 +@@ -40,7 +40,7 @@ + #include "iopen.h" + #include "acl.h" + +-static int ext3_writepage_trans_blocks(struct inode *inode); ++int ext3_writepage_trans_blocks(struct inode *inode); + + /* + * Test whether an inode is a fast symlink. +@@ -784,6 +784,17 @@ + return err; + } + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create, int extend_disksize) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create, ++ extend_disksize); ++ return ext3_get_block_handle(handle, inode, block, bh, create, ++ extend_disksize); ++} ++ + static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) + { +@@ -794,8 +805,8 @@ + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, +- bh_result, create, 1); ++ ret = ext3_get_block_wrap(handle, inode, iblock, ++ bh_result, create, 1); + return ret; + } + +@@ -839,7 +850,7 @@ + + get_block: + if (ret == 0) +- ret = ext3_get_block_handle(handle, inode, iblock, ++ ret = ext3_get_block_wrap(handle, inode, iblock, + bh_result, create, 0); + bh_result->b_size = (1 << inode->i_blkbits); + return ret; +@@ -859,7 +870,7 @@ + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); ++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = 
sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1593,7 +1604,7 @@ + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, struct page *page, ++int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) + { + unsigned long index = from >> PAGE_CACHE_SHIFT; +@@ -2104,6 +2115,9 @@ + return; + } + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode, page); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { +@@ -2850,12 +2864,15 @@ + * block and work out the exact number of indirects which are touched. Pah. + */ + +-static int ext3_writepage_trans_blocks(struct inode *inode) ++int ext3_writepage_trans_blocks(struct inode *inode) + { + int bpp = ext3_journal_blocks_per_page(inode); + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +Index: linux-2.6.12-rc6/fs/ext3/Makefile +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/Makefile 2005-06-14 16:31:09.179354899 +0200 ++++ linux-2.6.12-rc6/fs/ext3/Makefile 2005-06-14 16:31:25.872714069 +0200 +@@ -5,7 +5,8 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ +- ioctl.o namei.o super.o symlink.o hash.o resize.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o \ ++ extents.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-2.6.12-rc6/fs/ext3/super.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/super.c 
2005-06-14 16:31:09.950839264 +0200 ++++ linux-2.6.12-rc6/fs/ext3/super.c 2005-06-14 16:31:25.886385944 +0200 +@@ -387,6 +387,7 @@ + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -451,6 +452,8 @@ + #endif + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; ++ ++ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); + return &ei->vfs_inode; + } + +@@ -593,6 +596,7 @@ + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, ++ Opt_extents, Opt_extdebug, + }; + + static match_table_t tokens = { +@@ -644,6 +647,8 @@ + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, ++ {Opt_extents, "extents"}, ++ {Opt_extdebug, "extdebug"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -953,6 +958,12 @@ + case Opt_nobh: + set_opt(sbi->s_mount_opt, NOBH); + break; ++ case Opt_extents: ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_extdebug: ++ set_opt (sbi->s_mount_opt, EXTDEBUG); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1668,6 +1681,7 @@ + percpu_counter_mod(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + ++ ext3_ext_init(sb); + lock_kernel(); + return 0; + +Index: linux-2.6.12-rc6/fs/ext3/ioctl.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/ioctl.c 2005-06-14 16:31:08.646151780 +0200 ++++ linux-2.6.12-rc6/fs/ext3/ioctl.c 2005-06-14 16:31:25.897128131 +0200 +@@ -124,6 +124,10 @@ + err = ext3_change_inode_journal_flag(inode, jflag); + return err; + } ++ case EXT3_IOC_GET_EXTENTS: ++ case EXT3_IOC_GET_TREE_STATS: ++ case EXT3_IOC_GET_TREE_DEPTH: ++ return ext3_ext_ioctl(inode, filp, cmd, arg); + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return 
put_user(inode->i_generation, (int __user *) arg); +Index: linux-2.6.12-rc6/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_fs.h 2005-06-14 16:31:10.185214261 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_fs.h 2005-06-14 16:31:52.859041864 +0200 +@@ -186,8 +186,9 @@ + #define EXT3_NOTAIL_FL 0x00008000 /* don't merge file tail */ + #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ + #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +-#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ ++#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + +@@ -237,6 +238,9 @@ + #endif + #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) + #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) ++#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) ++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) ++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) + + /* + * Structure of an inode on the disk +@@ -360,6 +364,8 @@ + #define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */ + #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -548,11 +554,13 @@ + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 ++#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ + + #define EXT3_FEATURE_COMPAT_SUPP 
EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ +- EXT3_FEATURE_INCOMPAT_META_BG) ++ EXT3_FEATURE_INCOMPAT_META_BG| \ ++ EXT3_FEATURE_INCOMPAT_EXTENTS) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) +@@ -759,6 +767,7 @@ + + + /* inode.c */ ++extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); + extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +@@ -828,6 +837,16 @@ + extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int, int); ++extern void ext3_ext_truncate(struct inode *, struct page *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); ++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); ++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); + + #endif /* __KERNEL__ */ + +Index: linux-2.6.12-rc6/include/linux/ext3_extents.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_extents.h 2005-06-14 16:31:25.780917195 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_extents.h 2005-06-14 16:31:25.932284381 +0200 +@@ -0,0 +1,264 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can 
redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#ifndef _LINUX_EXT3_EXTENTS ++#define _LINUX_EXT3_EXTENTS ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if CHECK_BINSEARCH defined, then results of binary search ++ * will be checked by linear search ++ */ ++#define CHECK_BINSEARCH_ ++ ++/* ++ * if EXT_DEBUG is defined you can use the 'extdebug' mount option ++ * to get lots of info about what's going on ++ */ ++#define EXT_DEBUG_ ++#ifdef EXT_DEBUG ++#define ext_debug(tree,fmt,a...) \ ++do { \ ++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0) ++#else ++#define ext_debug(tree,fmt,a...) ++#endif ++ ++/* ++ * if EXT_STATS is defined then stats numbers are collected ++ * these numbers will be displayed at umount time ++ */ ++#define EXT_STATS_ ++ ++ ++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 means there is no tree yet. 
all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 ee_block; /* first logical block extent covers */ ++ __u16 ee_len; /* number of blocks covered by extent */ ++ __u16 ee_start_hi; /* high 16 bits of physical block */ ++ __u32 ee_start; /* low 32 bigs of physical block */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 ei_block; /* index covers logical blocks from 'block' */ ++ __u32 ei_leaf; /* pointer to the physical block of the next * ++ * level. leaf or next index could bet here */ ++ __u16 ei_leaf_hi; /* high 16 bits of physical block */ ++ __u16 ei_unused; ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 eh_magic; /* probably will support different formats */ ++ __u16 eh_entries; /* number of valid entries */ ++ __u16 eh_max; /* capacity of store in entries */ ++ __u16 eh_depth; /* has tree real underlaying blocks? 
*/ ++ __u32 eh_generation; /* generation of the tree */ ++}; ++ ++#define EXT3_EXT_MAGIC 0xf30a ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++/* ++ * structure for external API ++ */ ++ ++/* ++ * storage for cached extent ++ */ ++struct ext3_ext_cache { ++ __u32 ec_start; ++ __u32 ec_block; ++ __u32 ec_len; ++ __u32 ec_type; ++}; ++ ++#define EXT3_EXT_CACHE_NO 0 ++#define EXT3_EXT_CACHE_GAP 1 ++#define EXT3_EXT_CACHE_EXTENT 2 ++ ++/* ++ * ext3_extents_tree is used to pass initial information ++ * to top-level extents API ++ */ ++struct ext3_extents_helpers; ++struct ext3_extents_tree { ++ struct inode *inode; /* inode which tree belongs to */ ++ void *root; /* ptr to data top of tree resides at */ ++ void *buffer; /* will be passed as arg to ^^ routines */ ++ int buffer_len; ++ void *private; ++ struct ext3_ext_cache *cex;/* last found extent */ ++ struct ext3_extents_helpers *ops; ++}; ++ ++struct ext3_extents_helpers { ++ int (*get_write_access)(handle_t *h, void *buffer); ++ int (*mark_buffer_dirty)(handle_t *h, void *buffer); ++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); ++ int (*remove_extent_credits)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*remove_extent)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*new_block)(handle_t *, struct ext3_extents_tree *, ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); ++}; ++ ++/* ++ * to be called by ext3_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext3_ext_walk_space(), see below ++ * callback must return valid 
extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, ++ struct ext3_ext_path *, ++ struct ext3_ext_cache *); ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++ ++#define EXT_MAX_BLOCK 0xffffffff ++ ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++ ++#define EXT_ROOT_HDR(tree) \ ++ ((struct ext3_extent_header *) (tree)->root) ++#define EXT_BLOCK_HDR(bh) \ ++ ((struct ext3_extent_header *) (bh)->b_data) ++#define EXT_DEPTH(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) ++#define EXT_GENERATION(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++ ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++#define EXT_CHECK_PATH(tree,path) \ ++{ \ ++ int depth = EXT_DEPTH(tree); \ ++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_idx < \ ++ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_ext < \ ++ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ ++ && depth != 0); \ ++ BUG_ON((path)[0].p_depth != depth); \ ++} ++ ++ ++/* ++ * this structure is used to 
gather extents from the tree via ioctl ++ */ ++struct ext3_extent_buf { ++ unsigned long start; ++ int buflen; ++ void *buffer; ++ void *cur; ++ int err; ++}; ++ ++/* ++ * this structure is used to collect stats info about the tree ++ */ ++struct ext3_extent_tree_stats { ++ int depth; ++ int extents_num; ++ int leaf_num; ++}; ++ ++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); ++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); ++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); ++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); ++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); ++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); ++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); ++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); ++ ++static inline void ++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) ++{ ++ if (tree->cex) ++ tree->cex->ec_type = EXT3_EXT_CACHE_NO; ++} ++ ++ ++#endif /* _LINUX_EXT3_EXTENTS */ +Index: linux-2.6.12-rc6/include/linux/ext3_fs_i.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_fs_i.h 2005-06-06 17:22:29.000000000 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_fs_i.h 2005-06-14 16:31:25.941073443 +0200 +@@ -133,6 +133,8 @@ + */ + struct semaphore truncate_sem; + struct inode vfs_inode; ++ ++ __u32 i_cached_extent[4]; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch index f69e16c..0ee8d28 100644 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch +++ 
b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch @@ -2471,12 +2471,13 @@ Index: linux-2.6.5-sles9/fs/ext3/Makefile =================================================================== --- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2005-02-23 01:01:46.501172896 +0300 +++ linux-2.6.5-sles9/fs/ext3/Makefile 2005-02-23 01:02:37.405434272 +0300 -@@ -5,7 +5,7 @@ +@@ -5,7 +5,8 @@ obj-$(CONFIG_EXT3_FS) += ext3.o ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o -+ ioctl.o namei.o super.o symlink.o hash.o extents.o ++ ioctl.o namei.o super.o symlink.o hash.o \ ++ extents.o ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o @@ -2501,12 +2502,11 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c return &ei->vfs_inode; } -@@ -537,7 +540,7 @@ - Opt_commit, Opt_journal_update, Opt_journal_inum, - Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -- Opt_err, -+ Opt_err, Opt_extents, Opt_extdebug +@@ -537,6 +540,7 @@ + Opt_ignore, Opt_barrier, + Opt_err, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, ++ Opt_extents, Opt_extdebug, }; static match_table_t tokens = { @@ -2516,9 +2516,9 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, + {Opt_barrier, "barrier=%u"}, {Opt_err, NULL} }; - @@ -797,6 +802,12 @@ break; case Opt_ignore: @@ -2583,10 +2583,10 @@ Index: linux-2.6.5-sles9/include/linux/ext3_fs.h * Structure of an inode on the disk @@ -333,6 +337,8 @@ #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ - #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ -+#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ -+#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ + #define EXT3_MOUNT_IOPEN 
0x80000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef clear_opt diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch index 3b873c2..56fe653 100644 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch @@ -2466,12 +2466,13 @@ Index: linux-stage/fs/ext3/Makefile =================================================================== --- linux-stage.orig/fs/ext3/Makefile 2005-02-25 14:49:42.168561008 +0200 +++ linux-stage/fs/ext3/Makefile 2005-02-25 15:39:28.384587168 +0200 -@@ -5,7 +5,7 @@ +@@ -5,7 +5,8 @@ obj-$(CONFIG_EXT3_FS) += ext3.o - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\ + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o -+ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o \ ++ extents.o ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o @@ -2496,19 +2497,18 @@ Index: linux-stage/fs/ext3/super.c return &ei->vfs_inode; } -@@ -589,7 +594,7 @@ - Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, +@@ -589,6 +594,7 @@ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -- Opt_ignore, Opt_barrier, Opt_err, Opt_resize, -+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_extdebug, }; static match_table_t tokens = { @@ -639,6 +644,8 @@ - {Opt_iopen, "iopen"}, - {Opt_noiopen, "noiopen"}, - 
{Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, {Opt_barrier, "barrier=%u"}, @@ -2578,10 +2578,10 @@ Index: linux-stage/include/linux/ext3_fs.h * Structure of an inode on the disk @@ -359,6 +363,8 @@ #define EXT3_MOUNT_RESERVATION 0x20000 /* Preallocation */ - #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ -+#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ -+#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ + #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git a/ldiskfs/kernel_patches/patches/ext3-external-journal-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-external-journal-2.6.12.patch new file mode 100644 index 0000000..bcfdae2 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-external-journal-2.6.12.patch @@ -0,0 +1,148 @@ +Signed-off-by: Johann Lombardi + +--- linux-2.6.12.orig/fs/ext3/super.c 2005-06-17 21:48:29.000000000 +0200 ++++ linux-2.6.12/fs/ext3/super.c 2005-11-07 13:37:30.000000000 +0100 +@@ -39,7 +39,8 @@ + #include "xattr.h" + #include "acl.h" + +-static int ext3_load_journal(struct super_block *, struct ext3_super_block *); ++static int ext3_load_journal(struct super_block *, struct ext3_super_block *, ++ unsigned long journal_devnum); + static int ext3_create_journal(struct super_block *, struct ext3_super_block *, + int); + static void ext3_commit_super (struct super_block * sb, +@@ -586,7 +587,7 @@ enum { + Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + 
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, +- Opt_commit, Opt_journal_update, Opt_journal_inum, ++ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, +@@ -624,6 +625,7 @@ static match_table_t tokens = { + {Opt_commit, "commit=%u"}, + {Opt_journal_update, "journal=update"}, + {Opt_journal_inum, "journal=%u"}, ++ {Opt_journal_dev, "journal_dev=%u"}, + {Opt_abort, "abort"}, + {Opt_data_journal, "data=journal"}, + {Opt_data_ordered, "data=ordered"}, +@@ -663,8 +665,9 @@ static unsigned long get_sb_block(void * + return sb_block; + } + +-static int parse_options (char * options, struct super_block *sb, +- unsigned long * inum, unsigned long *n_blocks_count, int is_remount) ++static int parse_options (char *options, struct super_block *sb, ++ unsigned long *inum, unsigned long *journal_devnum, ++ unsigned long *n_blocks_count, int is_remount) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); + char * p; +@@ -805,6 +808,16 @@ static int parse_options (char * options + return 0; + *inum = option; + break; ++ case Opt_journal_dev: ++ if (is_remount) { ++ printk(KERN_ERR "EXT3-fs: cannot specify " ++ "journal on remount\n"); ++ return 0; ++ } ++ if (match_int(&args[0], &option)) ++ return 0; ++ *journal_devnum = option; ++ break; + case Opt_noload: + set_opt (sbi->s_mount_opt, NOLOAD); + break; +@@ -1250,6 +1263,7 @@ static int ext3_fill_super (struct super + unsigned long logic_sb_block; + unsigned long offset = 0; + unsigned long journal_inum = 0; ++ unsigned long journal_devnum = 0; + unsigned long def_mount_opts; + struct inode *root; + int blocksize; +@@ -1330,7 +1344,8 @@ static int ext3_fill_super (struct super + + set_opt(sbi->s_mount_opt, RESERVATION); + +- if (!parse_options ((char *) data, sb, &journal_inum, 
NULL, 0)) ++ if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, ++ NULL, 0)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | +@@ -1541,7 +1556,7 @@ static int ext3_fill_super (struct super + */ + if (!test_opt(sb, NOLOAD) && + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { +- if (ext3_load_journal(sb, es)) ++ if (ext3_load_journal(sb, es, journal_devnum)) + goto failed_mount2; + } else if (journal_inum) { + if (ext3_create_journal(sb, es, journal_inum)) +@@ -1821,15 +1836,24 @@ out_bdev: + return NULL; + } + +-static int ext3_load_journal(struct super_block * sb, +- struct ext3_super_block * es) ++static int ext3_load_journal(struct super_block *sb, ++ struct ext3_super_block *es, ++ unsigned long journal_devnum) + { + journal_t *journal; + int journal_inum = le32_to_cpu(es->s_journal_inum); +- dev_t journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); ++ dev_t journal_dev; + int err = 0; + int really_read_only; + ++ if (journal_devnum && ++ journal_devnum != le32_to_cpu(es->s_journal_dev)) { ++ printk(KERN_INFO "EXT3-fs: external journal device major/minor " ++ "numbers have changed\n"); ++ journal_dev = new_decode_dev(journal_devnum); ++ } else ++ journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); ++ + really_read_only = bdev_read_only(sb->s_bdev); + + /* +@@ -1888,6 +1912,16 @@ static int ext3_load_journal(struct supe + + EXT3_SB(sb)->s_journal = journal; + ext3_clear_journal_err(sb, es); ++ ++ if (journal_devnum && ++ journal_devnum != le32_to_cpu(es->s_journal_dev)) { ++ es->s_journal_dev = cpu_to_le32(journal_devnum); ++ sb->s_dirt = 1; ++ ++ /* Make sure we flush the recovery flag to disk. 
*/ ++ ext3_commit_super(sb, es, 1); ++ } ++ + return 0; + } + +@@ -2093,13 +2127,13 @@ static int ext3_remount (struct super_bl + { + struct ext3_super_block * es; + struct ext3_sb_info *sbi = EXT3_SB(sb); +- unsigned long tmp; ++ unsigned long tmp1, tmp2; + unsigned long n_blocks_count = 0; + + /* + * Allow the "check" option to be passed as a remount option. + */ +- if (!parse_options(data, sb, &tmp, &n_blocks_count, 1)) ++ if (!parse_options(data, sb, &tmp1, &tmp2, &n_blocks_count, 1)) + return -EINVAL; + + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) diff --git a/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch index 49528cf..52e5521 100644 --- a/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch +++ b/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch @@ -3,7 +3,7 @@ Index: linux-stage/include/linux/ext3_fs.h --- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 14:53:56.424908168 +0200 +++ linux-stage/include/linux/ext3_fs.h 2005-02-25 14:53:59.376459464 +0200 @@ -361,12 +361,13 @@ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ -#ifndef _LINUX_EXT2_FS_H diff --git a/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-suse.patch index acf97dd..1ac944b 100644 --- a/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-suse.patch @@ -3,7 +3,7 @@ Index: linux-stage/include/linux/ext3_fs.h --- linux-stage.orig/include/linux/ext3_fs.h 2004-04-02 16:43:37.000000000 -0500 +++ linux-stage/include/linux/ext3_fs.h 2004-04-02 16:43:37.000000000 -0500 @@ -331,12 +331,13 @@ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ + 
#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ -#ifndef _LINUX_EXT2_FS_H diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch index 31e7e38..bb9928a 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch @@ -1,71 +1,8 @@ -Index: linux-2.6.5-7.201/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.6.5-7.201.orig/include/linux/ext3_fs_sb.h 2005-10-14 08:59:35.000000000 +0400 -+++ linux-2.6.5-7.201/include/linux/ext3_fs_sb.h 2005-10-14 08:59:39.000000000 +0400 -@@ -23,10 +23,30 @@ - #define EXT_INCLUDE - #include - #include -+#include - #endif - #endif - #include - -+#define EXT3_BB_MAX_BLOCKS 30 -+struct ext3_free_metadata { -+ unsigned short group; -+ unsigned short num; -+ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; -+ struct list_head list; -+}; -+ -+struct ext3_buddy_group_blocks { -+ __u32 bb_bitmap; -+ __u32 bb_buddy; -+ spinlock_t bb_lock; -+ unsigned long bb_tid; -+ struct ext3_free_metadata *bb_md_cur; -+ unsigned short bb_first_free; -+ unsigned short bb_free; -+ unsigned bb_counters[]; -+}; -+ - /* - * third extended-fs super-block data in memory - */ -@@ -78,6 +98,27 @@ - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ - #endif -+ -+ /* for buddy allocator */ -+ struct ext3_buddy_group_blocks **s_buddy_blocks; -+ struct inode *s_buddy; -+ long s_blocks_reserved; -+ spinlock_t s_reserve_lock; -+ struct list_head s_active_transaction; -+ struct list_head s_closed_transaction; -+ struct list_head s_committed_transaction; -+ spinlock_t s_md_lock; -+ tid_t s_last_transaction; -+ int s_mb_factor; -+ -+ /* stats for buddy 
allocator */ -+ spinlock_t s_bal_lock; -+ unsigned long s_bal_reqs; /* number of reqs with len > 1 */ -+ unsigned long s_bal_success; /* we found long enough chunks */ -+ unsigned long s_bal_allocated; /* in blocks */ -+ unsigned long s_bal_ex_scanned; /* total extents scanned */ -+ unsigned long s_bal_goals; /* goal hits */ -+ unsigned long s_bal_breaks; /* too long searches */ - }; - - #endif /* _LINUX_EXT3_FS_SB */ Index: linux-2.6.5-7.201/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.5-7.201.orig/include/linux/ext3_fs.h 2005-10-14 08:59:38.000000000 +0400 -+++ linux-2.6.5-7.201/include/linux/ext3_fs.h 2005-10-14 09:02:36.000000000 +0400 -@@ -57,6 +57,14 @@ +--- linux-2.6.5-7.201.orig/include/linux/ext3_fs.h 2005-12-17 02:53:30.000000000 +0300 ++++ linux-2.6.5-7.201/include/linux/ext3_fs.h 2005-12-17 03:13:38.000000000 +0300 +@@ -57,6 +57,14 @@ struct statfs; #define ext3_debug(f, a...) do {} while (0) #endif @@ -80,29 +17,30 @@ Index: linux-2.6.5-7.201/include/linux/ext3_fs.h /* * Special inodes numbers */ -@@ -339,6 +347,7 @@ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */ +@@ -339,6 +347,7 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef clear_opt -@@ -700,7 +709,7 @@ +@@ -700,7 +709,9 @@ extern int ext3_bg_has_super(struct supe extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); 
extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, - unsigned long); + unsigned long, int); ++extern void ext3_free_blocks_old (handle_t *, struct inode *, unsigned long, ++ unsigned long); extern unsigned long ext3_count_free_blocks (struct super_block *); extern void ext3_check_blocks_bitmap (struct super_block *); extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, -@@ -822,6 +831,44 @@ +@@ -822,6 +833,17 @@ extern void ext3_extents_initialize_bloc extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); +/* mballoc.c */ -+extern long ext3_mb_aggressive; +extern long ext3_mb_stats; +extern long ext3_mb_max_to_scan; +extern int ext3_mb_init(struct super_block *, int); @@ -110,74 +48,146 @@ Index: linux-2.6.5-7.201/include/linux/ext3_fs.h +extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); +extern int ext3_mb_reserve_blocks(struct super_block *, int); +extern void ext3_mb_release_blocks(struct super_block *, int); -+ -+/* writeback.c */ -+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); -+extern int ext3_wb_prepare_write(struct file *file, struct page *page, -+ unsigned from, unsigned to); -+extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); -+extern int ext3_wb_writepage(struct page *, struct writeback_control *); -+extern int ext3_wb_invalidatepage(struct page *, unsigned long); -+extern int ext3_wb_releasepage(struct page *, int); -+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); -+extern void ext3_wb_init(struct super_block *); -+extern void ext3_wb_release(struct super_block *); -+ -+/* writeback.c */ -+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); -+extern int ext3_wb_prepare_write(struct file *file, struct page *page, -+ unsigned from, unsigned to); -+extern int 
ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); -+extern int ext3_wb_writepage(struct page *, struct writeback_control *); -+extern int ext3_wb_invalidatepage(struct page *, unsigned long); -+extern int ext3_wb_releasepage(struct page *, int); -+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); -+extern void ext3_wb_init(struct super_block *); -+extern void ext3_wb_release(struct super_block *); -+ -+/* proc.c */ -+extern int init_ext3_proc(void); -+extern void exit_ext3_proc(void); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); + #endif /* __KERNEL__ */ #define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) -Index: linux-2.6.5-7.201/fs/ext3/balloc.c +Index: linux-2.6.5-7.201/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/balloc.c 2005-10-11 00:12:45.000000000 +0400 -+++ linux-2.6.5-7.201/fs/ext3/balloc.c 2005-10-14 08:59:39.000000000 +0400 -@@ -78,7 +78,7 @@ - * - * Return buffer_head on success or NULL in case of failure. 
- */ --static struct buffer_head * -+struct buffer_head * - read_block_bitmap(struct super_block *sb, unsigned int block_group) +--- linux-2.6.5-7.201.orig/include/linux/ext3_fs_sb.h 2005-12-17 02:53:25.000000000 +0300 ++++ linux-2.6.5-7.201/include/linux/ext3_fs_sb.h 2005-12-17 03:10:23.000000000 +0300 +@@ -23,9 +23,15 @@ + #define EXT_INCLUDE + #include + #include ++#include + #endif + #endif + #include ++#include ++ ++struct ext3_buddy_group_blocks; ++struct ext3_mb_history; ++#define EXT3_BB_MAX_BLOCKS + + /* + * third extended-fs super-block data in memory +@@ -78,6 +84,38 @@ struct ext3_sb_info { + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_group_info **s_group_info; ++ struct inode *s_buddy_cache; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ unsigned short *s_mb_offsets, *s_mb_maxs; ++ ++ /* history to debug policy */ ++ struct ext3_mb_history *s_mb_history; ++ int s_mb_history_cur; ++ int s_mb_history_max; ++ struct proc_dir_entry *s_mb_proc; ++ spinlock_t s_mb_history_lock; ++ ++ /* stats for buddy allocator */ ++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ++ atomic_t s_bal_success; /* we found long enough chunks */ ++ atomic_t s_bal_allocated; /* in blocks */ ++ atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_goals; /* goal hits */ ++ atomic_t s_bal_breaks; /* too long searches */ ++ atomic_t s_bal_2orders; /* 2^order hits */ ++ spinlock_t s_bal_lock; ++ unsigned long s_mb_buddies_generated; ++ unsigned long long s_mb_generation_time; + }; + + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-2.6.5-7.201/fs/ext3/super.c 
+=================================================================== +--- linux-2.6.5-7.201.orig/fs/ext3/super.c 2005-12-17 02:53:30.000000000 +0300 ++++ linux-2.6.5-7.201/fs/ext3/super.c 2005-12-17 03:10:23.000000000 +0300 +@@ -389,6 +389,7 @@ void ext3_put_super (struct super_block + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -543,7 +544,7 @@ enum { + Opt_ignore, Opt_barrier, + Opt_err, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, +- Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_extdebug, Opt_mballoc, + }; + + static match_table_t tokens = { +@@ -590,6 +591,7 @@ static match_table_t tokens = { + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL} + }; +@@ -811,6 +813,9 @@ static int parse_options (char * options + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt (sbi->s_mount_opt, MBALLOC); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1464,6 +1469,7 @@ static int ext3_fill_super (struct super + ext3_count_dirs(sb)); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + + return 0; + +@@ -2112,7 +2118,13 @@ static struct file_system_type ext3_fs_t + + static int __init init_ext3_fs(void) { - struct ext3_group_desc * desc; -@@ -274,7 +274,7 @@ +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2141,6 +2153,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); } - /* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks(handle_t *handle, struct inode *inode, -+void 
ext3_free_blocks_old(handle_t *handle, struct inode *inode, - unsigned long block, unsigned long count) - { - struct buffer_head *bitmap_bh = NULL; -@@ -1142,7 +1142,7 @@ - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. - */ --int ext3_new_block(handle_t *handle, struct inode *inode, -+int ext3_new_block_old(handle_t *handle, struct inode *inode, - unsigned long goal, int *errp) - { - struct buffer_head *bitmap_bh = NULL; + int ext3_prep_san_write(struct inode *inode, long *blocks, Index: linux-2.6.5-7.201/fs/ext3/extents.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/extents.c 2005-10-14 08:59:38.000000000 +0400 -+++ linux-2.6.5-7.201/fs/ext3/extents.c 2005-10-14 08:59:39.000000000 +0400 -@@ -771,7 +771,7 @@ +--- linux-2.6.5-7.201.orig/fs/ext3/extents.c 2005-12-17 02:53:29.000000000 +0300 ++++ linux-2.6.5-7.201/fs/ext3/extents.c 2005-12-17 03:10:23.000000000 +0300 +@@ -771,7 +771,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -186,7 +196,7 @@ Index: linux-2.6.5-7.201/fs/ext3/extents.c } } kfree(ablocks); -@@ -1428,7 +1428,7 @@ +@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); @@ -195,7 +205,7 @@ Index: linux-2.6.5-7.201/fs/ext3/extents.c return err; } -@@ -1913,10 +1913,12 @@ +@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; @@ -204,12 +214,12 @@ Index: linux-2.6.5-7.201/fs/ext3/extents.c if (IS_ERR(handle)) return PTR_ERR(handle); -+ if (S_ISDIR(tree->inode->i_mode)) ++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) + metadata = 1; if (from >= ex->ee_block && to == ex->ee_block + 
ex->ee_len - 1) { /* tail removal */ unsigned long num, start; -@@ -1928,7 +1930,7 @@ +@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t bh = sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } @@ -218,24 +228,82 @@ Index: linux-2.6.5-7.201/fs/ext3/extents.c } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.5-7.201/fs/ext3/namei.c +Index: linux-2.6.5-7.201/fs/ext3/inode.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/namei.c 2005-10-14 08:59:35.000000000 +0400 -+++ linux-2.6.5-7.201/fs/ext3/namei.c 2005-10-14 08:59:39.000000000 +0400 -@@ -1640,7 +1640,7 @@ - * If the create succeeds, we fill in the inode information - * with d_instantiate(). +--- linux-2.6.5-7.201.orig/fs/ext3/inode.c 2005-12-17 02:53:30.000000000 +0300 ++++ linux-2.6.5-7.201/fs/ext3/inode.c 2005-12-17 03:10:23.000000000 +0300 +@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -673,7 +673,7 @@ err_out: + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, +- le32_to_cpu(where[i].key), 1); ++ le32_to_cpu(where[i].key), 1, 1); + return err; + } + +@@ -1835,7 +1835,7 @@ ext3_clear_blocks(handle_t *handle, stru + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2006,7 +2006,7 @@ static void ext3_free_branches(handle_t + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* 
+Index: linux-2.6.5-7.201/fs/ext3/balloc.c +=================================================================== +--- linux-2.6.5-7.201.orig/fs/ext3/balloc.c 2005-10-11 00:12:45.000000000 +0400 ++++ linux-2.6.5-7.201/fs/ext3/balloc.c 2005-12-17 03:10:23.000000000 +0300 +@@ -78,7 +78,7 @@ struct ext3_group_desc * ext3_get_group_ + * + * Return buffer_head on success or NULL in case of failure. */ --static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, -+int ext3_create (struct inode * dir, struct dentry * dentry, int mode, - struct nameidata *nd) +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) { - handle_t *handle; + struct ext3_group_desc * desc; +@@ -274,7 +274,7 @@ void ext3_discard_reservation(struct ino + } + + /* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, ++void ext3_free_blocks_old(handle_t *handle, struct inode *inode, + unsigned long block, unsigned long count) + { + struct buffer_head *bitmap_bh = NULL; +@@ -1142,7 +1142,7 @@ int ext3_should_retry_alloc(struct super + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. 
+ */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; Index: linux-2.6.5-7.201/fs/ext3/xattr.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/xattr.c 2005-10-14 08:59:36.000000000 +0400 -+++ linux-2.6.5-7.201/fs/ext3/xattr.c 2005-10-14 08:59:39.000000000 +0400 -@@ -1371,7 +1371,7 @@ +--- linux-2.6.5-7.201.orig/fs/ext3/xattr.c 2005-12-17 02:53:26.000000000 +0300 ++++ linux-2.6.5-7.201/fs/ext3/xattr.c 2005-12-17 03:10:41.000000000 +0300 +@@ -1371,7 +1371,7 @@ ext3_xattr_set_handle2(handle_t *handle, new_bh = sb_getblk(sb, block); if (!new_bh) { getblk_failed: @@ -244,7 +312,7 @@ Index: linux-2.6.5-7.201/fs/ext3/xattr.c error = -EIO; goto cleanup; } -@@ -1411,7 +1411,7 @@ +@@ -1411,7 +1411,7 @@ getblk_failed: if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { /* Free the old block. */ ea_bdebug(old_bh, "freeing"); @@ -253,7 +321,7 @@ Index: linux-2.6.5-7.201/fs/ext3/xattr.c /* ext3_forget() calls bforget() for us, but we let our caller release old_bh, so we need to -@@ -1519,7 +1519,7 @@ +@@ -1519,7 +1519,7 @@ ext3_xattr_delete_inode(handle_t *handle mb_cache_entry_free(ce); ce = NULL; } @@ -262,26 +330,13 @@ Index: linux-2.6.5-7.201/fs/ext3/xattr.c get_bh(bh); ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); } else { -Index: linux-2.6.5-7.201/fs/ext3/Makefile -=================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/Makefile 2005-10-14 08:59:38.000000000 +0400 -+++ linux-2.6.5-7.201/fs/ext3/Makefile 2005-10-14 08:59:39.000000000 +0400 -@@ -5,7 +5,7 @@ - obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ -- ioctl.o namei.o super.o symlink.o hash.o extents.o -+ ioctl.o namei.o super.o symlink.o hash.o extents.o mballoc.o - - 
ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o Index: linux-2.6.5-7.201/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/mballoc.c 2005-10-13 19:40:57.851699336 +0400 -+++ linux-2.6.5-7.201/fs/ext3/mballoc.c 2005-10-14 09:02:36.000000000 +0400 -@@ -0,0 +1,1868 @@ +--- linux-2.6.5-7.201.orig/fs/ext3/mballoc.c 2005-12-09 13:08:53.191437750 +0300 ++++ linux-2.6.5-7.201/fs/ext3/mballoc.c 2005-12-17 03:15:04.000000000 +0300 +@@ -0,0 +1,2430 @@ +/* -+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify @@ -313,12 +368,15 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +#include +#include +#include ++#include ++#include ++#include ++#include + +/* + * TODO: -+ * - bitmap/buddy read-ahead (proposed by Oleg Drokin aka green) ++ * - bitmap read-ahead (proposed by Oleg Drokin aka green) + * - track min/max extents in each group for better group selection -+ * - is it worthwhile to use buddies directly if req is 2^N blocks? + * - mb_mark_used() may allocate chunk right after splitting buddy + * - special flag to advice allocator to look for requested + N blocks + * this may improve interaction between extents and mballoc @@ -328,17 +386,10 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + */ + +/* -+ * with 'ext3_mb_aggressive' set the allocator runs consistency checks over ++ * with AGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ -+long ext3_mb_aggressive = 0; -+ -+ -+/* -+ * with 'ext3_mb_stats' allocator will collect stats that will be -+ * shown at umount. The collecting costs though! 
-+ */ -+long ext3_mb_stats = 1; ++#define AGGRESSIVE_CHECK__ + +/* + */ @@ -350,33 +401,56 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +#endif + +/* -+ * where to save buddies structures beetween umount/mount (clean case only) ++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory ++ * and you can monitor it in /proc/fs/ext3//mb_history + */ -+#define EXT3_BUDDY_FILE ".buddy" ++#define EXT3_MB_HISTORY + +/* + * How long mballoc can look for a best extent (in found extents) + */ -+long ext3_mb_max_to_scan = 100; ++long ext3_mb_max_to_scan = 500; + +/* -+ * This structure is on-disk description of a group for mballoc ++ * How long mballoc must look for a best extent + */ -+struct ext3_mb_group_descr { -+ __u16 mgd_first_free; /* first free block in the group */ -+ __u16 mgd_free; /* number of free blocks in the group */ -+ __u16 mgd_counters[16]; /* number of free blocks by order */ -+}; ++long ext3_mb_min_to_scan = 30; + +/* -+ * This structure is header of mballoc's file ++ * with 'ext3_mb_stats' allocator will collect stats that will be ++ * shown at umount. The collecting costs though! 
+ */ -+struct ext3_mb_grp_header { -+ __u32 mh_magic; ++ ++long ext3_mb_stats = 1; ++ ++#ifdef EXT3_BB_MAX_BLOCKS ++#undef EXT3_BB_MAX_BLOCKS ++#endif ++#define EXT3_BB_MAX_BLOCKS 30 ++ ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++struct ext3_group_info { ++ unsigned long bb_state; ++ unsigned long bb_tid; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned short bb_fragments; ++ unsigned short bb_counters[]; +}; + -+#define EXT3_MB_MAGIC_V1 0xbabd16fd + ++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 ++#define EXT3_GROUP_INFO_LOCKED_BIT 1 ++ ++#define EXT3_MB_GRP_NEED_INIT(grp) \ ++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) + +struct ext3_free_extent { + __u16 fe_start; @@ -397,28 +471,55 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; + __u16 ac_found; ++ __u16 ac_tail; ++ __u16 ac_buddy; + __u8 ac_status; + __u8 ac_flags; /* allocation hints */ ++ __u8 ac_criteria; + __u8 ac_repeats; ++ __u8 ac_2order; /* if request is to allocate 2^N blocks and ++ * N > 0, the field stores N, otherwise 0 */ +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 +#define AC_STATUS_BREAK 3 + ++struct ext3_mb_history { ++ struct ext3_free_extent goal; /* goal allocation */ ++ struct ext3_free_extent result; /* result allocation */ ++ __u16 found; /* how many extents have been found */ ++ __u16 groups; /* how many groups have been scanned */ ++ __u16 tail; /* what tail broke some buddy */ ++ __u16 buddy; /* buddy the tail ^^^ broke */ ++ __u8 cr; /* which phase the result extent was found at */ ++ __u8 merged; ++}; ++ +struct ext3_buddy { -+ struct buffer_head *bd_bh; -+ struct buffer_head *bd_bh2; -+ struct ext3_buddy_group_blocks *bd_bd; ++ struct page *bd_buddy_page; ++ void *bd_buddy; ++ struct page *bd_bitmap_page; ++ void 
*bd_bitmap; ++ struct ext3_group_info *bd_info; + struct super_block *bd_sb; + __u16 bd_blkbits; + __u16 bd_group; +}; -+#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data) -+#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data) ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) ++ ++#ifndef EXT3_MB_HISTORY ++#define ext3_mb_store_history(sb,ac) ++#else ++static void ext3_mb_store_history(struct super_block *, ++ struct ext3_allocation_context *ac); ++#endif + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + ++static struct proc_dir_entry *proc_root_ext3; ++ +int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); +struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); +int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); @@ -473,9 +574,25 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + ext2_clear_bit_atomic(NULL, bit, addr); +} + ++static inline int mb_find_next_zero_bit(void *addr, int max, int start) ++{ ++ int fix; ++#if BITS_PER_LONG == 64 ++ fix = ((unsigned long) addr & 7UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~7UL); ++#elif BITS_PER_LONG == 32 ++ fix = ((unsigned long) addr & 3UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~3UL); ++#else ++#error "how many bits you are?!" 
++#endif ++ max += fix; ++ start += fix; ++ return ext2_find_next_zero_bit(addr, max, start) - fix; ++} ++ +static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) +{ -+ int i = 1; + char *bb; + + J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); @@ -491,89 +608,30 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + if (order == 0) + return EXT3_MB_BITMAP(e3b); + -+ bb = EXT3_MB_BUDDY(e3b); -+ *max = *max >> 1; -+ while (i < order) { -+ bb += 1 << (e3b->bd_blkbits - i); -+ i++; -+ *max = *max >> 1; -+ } -+ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) < -+ e3b->bd_sb->s_blocksize); -+ return bb; -+} -+ -+static int ext3_mb_load_buddy(struct super_block *sb, int group, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap); -+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy); -+ -+ /* load bitmap */ -+ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap); -+ if (e3b->bd_bh == NULL) { -+ ext3_error(sb, "ext3_mb_load_buddy", -+ "can't get block for buddy bitmap\n"); -+ goto out; -+ } -+ /* load buddy */ -+ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy); -+ if (e3b->bd_bh2 == NULL) { -+ ext3_error(sb, "ext3_mb_load_buddy", -+ "can't get block for buddy bitmap\n"); -+ goto out; -+ } -+ -+ if (!buffer_uptodate(e3b->bd_bh)) -+ ll_rw_block(READ, 1, &e3b->bd_bh); -+ if (!buffer_uptodate(e3b->bd_bh2)) -+ ll_rw_block(READ, 1, &e3b->bd_bh2); -+ -+ wait_on_buffer(e3b->bd_bh); -+ J_ASSERT(buffer_uptodate(e3b->bd_bh)); -+ wait_on_buffer(e3b->bd_bh2); -+ J_ASSERT(buffer_uptodate(e3b->bd_bh2)); -+ -+ e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_bd = sbi->s_buddy_blocks[group]; -+ e3b->bd_sb = sb; -+ e3b->bd_group = group; -+ -+ return 0; -+out: -+ brelse(e3b->bd_bh); -+ brelse(e3b->bd_bh2); -+ e3b->bd_bh = NULL; -+ e3b->bd_bh2 = NULL; -+ return -EIO; -+} ++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; ++ *max = 
EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; + -+static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b) -+{ -+ mark_buffer_dirty(e3b->bd_bh); -+ mark_buffer_dirty(e3b->bd_bh2); ++ return bb; +} + -+static void ext3_mb_release_desc(struct ext3_buddy *e3b) -+{ -+ brelse(e3b->bd_bh); -+ brelse(e3b->bd_bh2); -+} ++#ifdef AGGRESSIVE_CHECK + +static void mb_check_buddy(struct ext3_buddy *e3b) +{ + int order = e3b->bd_blkbits + 1; + int max, max2, i, j, k, count; ++ int fragments = 0, fstart; + void *buddy, *buddy2; + -+ if (likely(!ext3_mb_aggressive)) -+ return; -+ + if (!test_opt(e3b->bd_sb, MBALLOC)) + return; + ++ { ++ static int mb_check_counter = 0; ++ if (mb_check_counter++ % 300 != 0) ++ return; ++ } ++ + while (order > 1) { + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); @@ -604,14 +662,22 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + } + count++; + } -+ J_ASSERT(e3b->bd_bd->bb_counters[order] == count); ++ J_ASSERT(e3b->bd_info->bb_counters[order] == count); + order--; + } + ++ fstart = -1; + buddy = mb_find_buddy(e3b, 0, &max); + for (i = 0; i < max; i++) { -+ if (!mb_test_bit(i, buddy)) ++ if (!mb_test_bit(i, buddy)) { ++ J_ASSERT(i >= e3b->bd_info->bb_first_free); ++ if (fstart == -1) { ++ fragments++; ++ fstart = i; ++ } + continue; ++ } ++ fstart = -1; + /* check used bits only */ + for (j = 0; j < e3b->bd_blkbits + 1; j++) { + buddy2 = mb_find_buddy(e3b, j, &max2); @@ -620,18 +686,325 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + J_ASSERT(mb_test_bit(k, buddy2)); + } + } ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); ++ J_ASSERT(e3b->bd_info->bb_fragments == fragments); ++} ++ ++#else ++#define mb_check_buddy(e3b) ++#endif ++ ++/* find most significant bit */ ++static int inline fmsb(unsigned short word) ++{ ++ int order; ++ ++ if (word > 255) { ++ order = 7; ++ word >>= 8; ++ } else { ++ order = -1; ++ } ++ ++ do { ++ order++; ++ word >>= 1; ++ } while (word != 0); ++ ++ return order; ++} ++ ++static void inline 
++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, ++ int len, struct ext3_group_info *grp) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned short min, max, chunk, border; ++ ++ mb_debug("mark %u/%u free\n", first, len); ++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ border = 2 << sb->s_blocksize_bits; ++ ++ while (len > 0) { ++ /* find how many blocks can be covered since this position */ ++ max = ffs(first | border) - 1; ++ ++ /* find how many blocks of power 2 we need to mark */ ++ min = fmsb(len); ++ ++ mb_debug(" %u/%u -> max %u, min %u\n", ++ first & ((2 << sb->s_blocksize_bits) - 1), ++ len, max, min); ++ ++ if (max < min) ++ min = max; ++ chunk = 1 << min; ++ ++ /* mark multiblock chunks only */ ++ grp->bb_counters[min]++; ++ if (min > 0) { ++ mb_debug(" set %u at %u \n", first >> min, ++ sbi->s_mb_offsets[min]); ++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); ++ } ++ ++ len -= chunk; ++ first += chunk; ++ } ++} ++ ++static void ++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ++ struct ext3_group_info *grp) ++{ ++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); ++ unsigned short i = 0, first, len; ++ unsigned free = 0, fragments = 0; ++ unsigned long long period = get_cycles(); ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ grp->bb_first_free = i; ++ while (i < max) { ++ fragments++; ++ first = i; ++ i = find_next_bit(bitmap, max, i); ++ len = i - first; ++ free += len; ++ if (len > 1) ++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); ++ else ++ grp->bb_counters[0]++; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ grp->bb_fragments = fragments; ++ ++ /* bb_state shouldn't being modified because all ++ * others waits for init completion on page lock */ ++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); ++ if (free != grp->bb_free) { ++ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n", ++ free, grp->bb_free); ++ 
grp->bb_free = free; ++ } ++ ++ period = get_cycles() - period; ++ spin_lock(&EXT3_SB(sb)->s_bal_lock); ++ EXT3_SB(sb)->s_mb_buddies_generated++; ++ EXT3_SB(sb)->s_mb_generation_time += period; ++ spin_unlock(&EXT3_SB(sb)->s_bal_lock); ++} ++ ++static int ext3_mb_init_cache(struct page *page) ++{ ++ int blocksize, blocks_per_page, groups_per_page; ++ int err = 0, i, first_group, first_block; ++ struct super_block *sb; ++ struct buffer_head *bhs; ++ struct buffer_head **bh; ++ struct inode *inode; ++ char *data, *bitmap; ++ ++ mb_debug("init page %lu\n", page->index); ++ ++ inode = page->mapping->host; ++ sb = inode->i_sb; ++ blocksize = 1 << inode->i_blkbits; ++ blocks_per_page = PAGE_CACHE_SIZE / blocksize; ++ ++ groups_per_page = blocks_per_page >> 1; ++ if (groups_per_page == 0) ++ groups_per_page = 1; ++ ++ /* allocate buffer_heads to read bitmaps */ ++ if (groups_per_page > 1) { ++ err = -ENOMEM; ++ i = sizeof(struct buffer_head *) * groups_per_page; ++ bh = kmalloc(i, GFP_NOFS); ++ if (bh == NULL) ++ goto out; ++ memset(bh, 0, i); ++ } else ++ bh = &bhs; ++ ++ first_group = page->index * blocks_per_page / 2; ++ ++ /* read all groups the page covers into the cache */ ++ for (i = 0; i < groups_per_page; i++) { ++ struct ext3_group_desc * desc; ++ ++ if (first_group + i >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ err = -EIO; ++ desc = ext3_get_group_desc(sb, first_group + i, NULL); ++ if (desc == NULL) ++ goto out; ++ ++ err = -ENOMEM; ++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); ++ if (bh[i] == NULL) ++ goto out; ++ ++ if (buffer_uptodate(bh[i])) ++ continue; ++ ++ lock_buffer(bh[i]); ++ if (buffer_uptodate(bh[i])) { ++ unlock_buffer(bh[i]); ++ continue; ++ } ++ ++ get_bh(bh[i]); ++ bh[i]->b_end_io = end_buffer_read_sync; ++ submit_bh(READ, bh[i]); ++ mb_debug("read bitmap for group %u\n", first_group + i); ++ } ++ ++ /* wait for I/O completion */ ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ wait_on_buffer(bh[i]); ++ ++ /* XXX: I/O 
error handling here */ ++ ++ first_block = page->index * blocks_per_page; ++ for (i = 0; i < blocks_per_page; i++) { ++ int group; ++ ++ group = (first_block + i) >> 1; ++ if (group >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ data = page_address(page) + (i * blocksize); ++ bitmap = bh[group - first_group]->b_data; ++ ++ if ((first_block + i) & 1) { ++ /* this is block of buddy */ ++ mb_debug("put buddy for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memset(data, 0xff, blocksize); ++ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0; ++ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0, ++ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); ++ ext3_mb_generate_buddy(sb, data, bitmap, ++ EXT3_SB(sb)->s_group_info[group]); ++ } else { ++ /* this is block of bitmap */ ++ mb_debug("put bitmap for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memcpy(data, bitmap, blocksize); ++ } ++ } ++ SetPageUptodate(page); ++ ++out: ++ if (bh) { ++ for (i = 0; bh && i < groups_per_page && bh[i]; i++) ++ brelse(bh[i]); ++ if (bh != &bhs) ++ kfree(bh); ++ } ++ return err; ++} ++ ++static int ext3_mb_load_buddy(struct super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *inode = sbi->s_buddy_cache; ++ int blocks_per_page, block, pnum, poff; ++ struct page *page; ++ ++ mb_debug("load group %u\n", group); ++ ++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_info = sbi->s_group_info[group]; ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ e3b->bd_buddy_page = NULL; ++ e3b->bd_bitmap_page = NULL; ++ ++ block = group * 2; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { 
++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_bitmap_page = page; ++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ block++; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_buddy_page = page; ++ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ return 0; ++ ++err: ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++ e3b->bd_buddy = NULL; ++ e3b->bd_bitmap = NULL; ++ return -EIO; ++} ++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); +} + ++ +static inline void +ext3_lock_group(struct super_block *sb, int group) +{ -+ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_SB(sb)->s_group_info[group]->bb_state); +} + +static inline void +ext3_unlock_group(struct super_block *sb, int group) +{ -+ spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_SB(sb)->s_group_info[group]->bb_state); +} + +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) @@ -693,22 +1066,33 @@ Index: 
linux-2.6.5-7.201/fs/ext3/mballoc.c + +static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) +{ -+ int block, max, order; ++ int block = 0, max = 0, order; + void *buddy, *buddy2; + + mb_check_buddy(e3b); + -+ e3b->bd_bd->bb_free += count; -+ if (first < e3b->bd_bd->bb_first_free) -+ e3b->bd_bd->bb_first_free = first; -+ ++ e3b->bd_info->bb_free += count; ++ if (first < e3b->bd_info->bb_first_free) ++ e3b->bd_info->bb_first_free = first; ++ ++ /* let's maintain fragments counter */ ++ if (first != 0) ++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); ++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); ++ if (block && max) ++ e3b->bd_info->bb_fragments--; ++ else if (!block && !max) ++ e3b->bd_info->bb_fragments++; ++ ++ /* let's maintain buddy itself */ + while (count-- > 0) { + block = first++; + order = 0; + + J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); + mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); -+ e3b->bd_bd->bb_counters[order]++; ++ e3b->bd_info->bb_counters[order]++; + + /* start of the buddy */ + buddy = mb_find_buddy(e3b, order, &max); @@ -731,12 +1115,12 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + mb_set_bit(block, buddy); + mb_set_bit(block + 1, buddy); + } -+ e3b->bd_bd->bb_counters[order]--; -+ e3b->bd_bd->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; + + block = block >> 1; + order++; -+ e3b->bd_bd->bb_counters[order]++; ++ e3b->bd_info->bb_counters[order]++; + + mb_clear_bit(block, buddy2); + buddy = buddy2; @@ -748,7 +1132,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +} + +static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, -+ int needed, struct ext3_free_extent *ex) ++ int needed, struct ext3_free_extent *ex) +{ + int next, max, ord; + void *buddy; @@ -765,7 +1149,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + return 0; + } + -+ if (order == 0) { ++ if (likely(order == 0)) 
{ + /* find actual order */ + order = mb_find_order_for_block(e3b, block); + block = block >> order; @@ -775,7 +1159,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + ex->fe_start = block << order; + ex->fe_group = e3b->bd_group; + -+ while ((buddy = mb_find_buddy(e3b, order, &max))) { ++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { + + if (block + 1 >= max) + break; @@ -797,16 +1181,30 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + +static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) +{ ++ int ord, mlen = 0, max = 0, cur; + int start = ex->fe_start; + int len = ex->fe_len; -+ int ord, mlen, max, cur; ++ unsigned ret = 0; + int len0 = len; + void *buddy; + -+ e3b->bd_bd->bb_free -= len; -+ if (e3b->bd_bd->bb_first_free == start) -+ e3b->bd_bd->bb_first_free += len; ++ mb_check_buddy(e3b); + ++ e3b->bd_info->bb_free -= len; ++ if (e3b->bd_info->bb_first_free == start) ++ e3b->bd_info->bb_first_free += len; ++ ++ /* let's maintain fragments counter */ ++ if (start != 0) ++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); ++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); ++ if (mlen && max) ++ e3b->bd_info->bb_fragments++; ++ else if (!mlen && !max) ++ e3b->bd_info->bb_fragments--; ++ ++ /* let's maintain buddy itself */ + while (len) { + ord = mb_find_order_for_block(e3b, start); + @@ -816,26 +1214,30 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + buddy = mb_find_buddy(e3b, ord, &max); + J_ASSERT((start >> ord) < max); + mb_set_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; ++ e3b->bd_info->bb_counters[ord]--; + start += mlen; + len -= mlen; + J_ASSERT(len >= 0); + continue; + } + ++ /* store for history */ ++ if (ret == 0) ++ ret = len | (ord << 16); ++ + /* we have to split large buddy */ + J_ASSERT(ord > 0); + buddy = mb_find_buddy(e3b, ord, &max); + mb_set_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; ++ 
e3b->bd_info->bb_counters[ord]--; + + ord--; + cur = (start >> ord) & ~1U; + buddy = mb_find_buddy(e3b, ord, &max); + mb_clear_bit(cur, buddy); + mb_clear_bit(cur + 1, buddy); -+ e3b->bd_bd->bb_counters[ord]++; -+ e3b->bd_bd->bb_counters[ord]++; ++ e3b->bd_info->bb_counters[ord]++; ++ e3b->bd_info->bb_counters[ord]++; + } + + /* now drop all the bits in bitmap */ @@ -843,7 +1245,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + + mb_check_buddy(e3b); + -+ return 0; ++ return ret; +} + +/* @@ -852,9 +1254,14 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, + struct ext3_buddy *e3b) +{ ++ unsigned long ret; ++ + ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); -+ mb_mark_used(e3b, &ac->ac_b_ex); ++ ret = mb_mark_used(e3b, &ac->ac_b_ex); ++ + ac->ac_status = AC_STATUS_FOUND; ++ ac->ac_tail = ret & 0xffff; ++ ac->ac_buddy = ret >> 16; +} + +/* @@ -871,9 +1278,8 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + struct ext3_free_extent *ex, + struct ext3_buddy *e3b) +{ -+ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor; + struct ext3_free_extent *bex = &ac->ac_b_ex; -+ int diff = ac->ac_g_ex.fe_len - ex->fe_len; ++ struct ext3_free_extent *gex = &ac->ac_g_ex; + + J_ASSERT(ex->fe_len > 0); + J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); @@ -884,7 +1290,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + /* + * The special case - take what you catch first + */ -+ if (ac->ac_flags & EXT3_MB_HINT_FIRST) { ++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { + *bex = *ex; + ext3_mb_use_best_found(ac, e3b); + return; @@ -893,26 +1299,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + /* + * Let's check whether the chuck is good enough + */ -+ if (ex->fe_len >= ac->ac_g_ex.fe_len) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * If the request is vey large, then it makes sense to use large -+ * chunks for it. Even if they don't satisfy whole request. 
-+ */ -+ if (ex->fe_len > 1000) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * Sometimes it's worty to take close chunk -+ */ -+ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) { ++ if (ex->fe_len == gex->fe_len) { + *bex = *ex; + ext3_mb_use_best_found(ac, e3b); + return; @@ -928,13 +1315,26 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + + /* + * If new found extent is better, store it in the context -+ * FIXME: possible the policy should be more complex? + */ -+ if (ex->fe_len > bex->fe_len) { ++ if (bex->fe_len < gex->fe_len) { ++ /* if the request isn't satisfied, any found extent ++ * larger than previous best one is better */ ++ if (ex->fe_len > bex->fe_len) ++ *bex = *ex; ++ } else if (ex->fe_len > gex->fe_len) { ++ /* if the request is satisfied, then we try to find ++ * an extent that still satisfy the request, but is ++ * smaller than previous one */ + *bex = *ex; + } + + /* ++ * Let's scan at least few extents and don't pick up a first one ++ */ ++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++ ++ /* + * We don't want to scan for a whole year + */ + if (ac->ac_found > ext3_mb_max_to_scan) @@ -955,13 +1355,13 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); + -+ if (max > 0) ++ if (max > 0) { ++ ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); ++ } + + ext3_unlock_group(ac->ac_sb, group); + -+ if (ac->ac_status == AC_STATUS_FOUND) -+ ext3_mb_dirty_buddy(e3b); + ext3_mb_release_desc(e3b); + + return 0; @@ -985,37 +1385,79 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; + ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); + } + ext3_unlock_group(ac->ac_sb, group); + -+ if (ac->ac_status == 
AC_STATUS_FOUND) -+ ext3_mb_dirty_buddy(e3b); + ext3_mb_release_desc(e3b); + + return 0; +} ++ ++/* ++ * The routine scans buddy structures (not bitmap!) from given order ++ * to max order and tries to find big enough chunk to satisfy the req ++ */ ++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_group_info *grp = e3b->bd_info; ++ void *buddy; ++ int i, k, max; ++ ++ J_ASSERT(ac->ac_2order > 0); ++ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) { ++ if (grp->bb_counters[i] == 0) ++ continue; ++ ++ buddy = mb_find_buddy(e3b, i, &max); ++ if (buddy == NULL) { ++ printk(KERN_ALERT "looking for wrong order?\n"); ++ break; ++ } ++ ++ k = mb_find_next_zero_bit(buddy, max, 0); ++ J_ASSERT(k < max); ++ ++ ac->ac_found++; ++ ++ ac->ac_b_ex.fe_len = 1 << i; ++ ac->ac_b_ex.fe_start = k << i; ++ ac->ac_b_ex.fe_group = e3b->bd_group; ++ ++ ext3_mb_use_best_found(ac, e3b); ++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); ++ ++ if (unlikely(ext3_mb_stats)) ++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); ++ ++ break; ++ } ++} ++ +/* + * The routine scans the group and measures all found extents. + * In order to optimize scanning, caller must pass number of -+ * free blocks in the group, so the routine can upper limit. ++ * free blocks in the group, so the routine can know upper limit. 
+ */ -+static void ext3_mb_scan_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) ++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) +{ + struct super_block *sb = ac->ac_sb; + void *bitmap = EXT3_MB_BITMAP(e3b); + struct ext3_free_extent ex; + int i, free; + -+ free = e3b->bd_bd->bb_free; ++ free = e3b->bd_info->bb_free; + J_ASSERT(free > 0); + -+ i = e3b->bd_bd->bb_first_free; ++ i = e3b->bd_info->bb_first_free; + -+ while (free && ac->ac_status != AC_STATUS_FOUND) { -+ i = ext2_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ while (free && ac->ac_status == AC_STATUS_CONTINUE) { ++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); + if (i >= sb->s_blocksize * 8) { + J_ASSERT(free == 0); + break; @@ -1035,23 +1477,39 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +static int ext3_mb_good_group(struct ext3_allocation_context *ac, + int group, int cr) +{ -+ int free; ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_group_info *grp = sbi->s_group_info[group]; ++ unsigned free, fragments, i, bits; + -+ J_ASSERT(cr >= 0 && cr < 3); ++ J_ASSERT(cr >= 0 && cr < 4); ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); + -+ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free; ++ free = grp->bb_free; ++ fragments = grp->bb_fragments; + if (free == 0) + return 0; ++ if (fragments == 0) ++ return 0; + -+ if (cr == 0) { -+ if (free >= ac->ac_g_ex.fe_len >> 1) -+ return 1; -+ } else if (cr == 1) { -+ if (free >= ac->ac_g_ex.fe_len >> 2) ++ switch (cr) { ++ case 0: ++ J_ASSERT(ac->ac_2order != 0); ++ bits = ac->ac_sb->s_blocksize_bits + 1; ++ for (i = ac->ac_2order; i < bits; i++) ++ if (grp->bb_counters[i] > 0) ++ return 1; ++ case 1: ++ if ((free / fragments) >= ac->ac_g_ex.fe_len) ++ return 1; ++ case 2: ++ if (free >= ac->ac_g_ex.fe_len) ++ return 1; ++ case 3: + return 1; -+ } else if (cr == 2) { -+ return 1; ++ default: ++ BUG(); + } ++ + return 0; +} + @@ -1143,11 +1601,19 @@ 
Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + ac.ac_g_ex.fe_start = block; + ac.ac_g_ex.fe_len = *len; + ac.ac_flags = flags; ++ ac.ac_2order = 0; ++ ac.ac_criteria = 0; + -+ /* -+ * Sometimes, caller may want to merge even small number -+ * of blocks to an existing extent -+ */ ++ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ ++ i = ffs(*len); ++ if (i >= 8) { ++ i--; ++ if ((*len & (~(1 << i))) == 0) ++ ac.ac_2order = i; ++ } ++ ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ + if (ac.ac_flags & EXT3_MB_HINT_MERGE) { + err = ext3_mb_find_by_goal(&ac, &e3b); + if (err) @@ -1156,23 +1622,24 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + goto found; + } + -+ /* -+ * FIXME -+ * If requested chunk is power of 2 length, we can try -+ * to exploit buddy nature to speed allocation up -+ */ -+ -+ -+ /* -+ * Let's just scan groups to find more-less suitable blocks -+ */ -+ cr = 0; ++ /* Let's just scan groups to find more-less suitable blocks */ ++ cr = ac.ac_2order ? 
0 : 1; +repeat: -+ for (; cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ ac.ac_criteria = cr; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + ++ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) { ++ /* we need full data about the group ++ * to make a good selection */ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ext3_mb_release_desc(&e3b); ++ } ++ + /* check is group good for our criteries */ + if (!ext3_mb_good_group(&ac, group, cr)) + continue; @@ -1189,29 +1656,32 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + continue; + } + -+ ext3_mb_scan_group(&ac, &e3b); ++ ac.ac_groups_scanned++; ++ if (cr == 0) ++ ext3_mb_simple_scan_group(&ac, &e3b); ++ else ++ ext3_mb_complex_scan_group(&ac, &e3b); ++ + ext3_unlock_group(sb, group); + -+ if (ac.ac_status == AC_STATUS_FOUND) -+ ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + -+ if (err) -+ goto out_err; + if (ac.ac_status != AC_STATUS_CONTINUE) + break; + } + } + -+ if (ac.ac_status == AC_STATUS_BREAK && ++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && + !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { + /* + * We've been searching too long. 
Let's try to allocate + * the best chunk we've found so far + */ -+ ext3_warning(inode->i_sb, __FUNCTION__, -+ "too long searching: got %d want %d\n", -+ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); ++ ++ /*if (ac.ac_found > ext3_mb_max_to_scan) ++ printk(KERN_ERR "EXT3-fs: too long searching at " ++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, ++ ac.ac_g_ex.fe_len);*/ + ext3_mb_try_best_found(&ac, &e3b); + if (ac.ac_status != AC_STATUS_FOUND) { + /* @@ -1225,7 +1695,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + ac.ac_b_ex.fe_len = 0; + ac.ac_status = AC_STATUS_CONTINUE; + ac.ac_flags |= EXT3_MB_HINT_FIRST; -+ cr = 2; ++ cr = 3; + goto repeat; + } + } @@ -1248,7 +1718,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + printk("EXT3-fs: groups: "); + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) + printk("%d: %d ", i, -+ sbi->s_buddy_blocks[i]->bb_free); ++ sbi->s_group_info[i]->bb_free); + printk("\n"); +#endif + goto out; @@ -1302,12 +1772,10 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + ext3_error(sb, "ext3_new_block", + "Allocating block in system zone - " + "block = %u", block); -+ if (unlikely(ext3_mb_aggressive)) { -+ for (i = 0; i < ac.ac_b_ex.fe_len; i++) -+ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, -+ bitmap_bh->b_data)); -+ } -+ ++#ifdef AGGRESSIVE_CHECK ++ for (i = 0; i < ac.ac_b_ex.fe_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); ++#endif + mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); + + spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); @@ -1358,368 +1826,358 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + ext3_mb_release_blocks(sb, 1); + } + -+ if ((ext3_mb_stats) && (ac.ac_g_ex.fe_len > 1)) { -+ spin_lock(&sbi->s_bal_lock); -+ sbi->s_bal_reqs++; -+ sbi->s_bal_allocated += *len; ++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { ++ atomic_inc(&sbi->s_bal_reqs); ++ atomic_add(*len, &sbi->s_bal_allocated); + if (*len >= ac.ac_g_ex.fe_len) -+ sbi->s_bal_success++; -+ 
sbi->s_bal_ex_scanned += ac.ac_found; ++ atomic_inc(&sbi->s_bal_success); ++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); + if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && + ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) -+ sbi->s_bal_goals++; ++ atomic_inc(&sbi->s_bal_goals); + if (ac.ac_found > ext3_mb_max_to_scan) -+ sbi->s_bal_breaks++; -+ spin_unlock(&sbi->s_bal_lock); ++ atomic_inc(&sbi->s_bal_breaks); + } + ++ ext3_mb_store_history(sb, &ac); ++ + return block; +} ++EXPORT_SYMBOL(ext3_mb_new_blocks); + -+int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh, -+ struct ext3_mb_group_descr **grp) -+{ -+ struct super_block *sb = e3b->bd_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int descr_per_block, err, offset; -+ struct ext3_mb_grp_header *hdr; -+ unsigned long block; -+ -+ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) -+ / sizeof(struct ext3_mb_group_descr); -+ block = e3b->bd_group / descr_per_block; -+ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err); -+ if (*bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n", -+ e3b->bd_group, err); -+ return err; -+ } -+ -+ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data; -+ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { -+ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n", -+ e3b->bd_group); -+ brelse(*bh); -+ *bh = NULL; -+ return -EIO; -+ } ++#ifdef EXT3_MB_HISTORY ++struct ext3_mb_proc_session { ++ struct ext3_mb_history *history; ++ struct super_block *sb; ++ int start; ++ int max; ++}; + -+ offset = e3b->bd_group % descr_per_block -+ * sizeof(struct ext3_mb_group_descr) -+ + sizeof(struct ext3_mb_grp_header); -+ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset); ++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, ++ struct ext3_mb_history *hs, ++ int first) ++{ ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (!first && hs == s->history + s->start) ++ return NULL; ++ while 
(hs->goal.fe_len == 0) { ++ hs++; ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (hs == s->history + s->start) ++ return NULL; ++ } ++ return hs; ++} + -+ return 0; ++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs; ++ int l = *pos; ++ ++ if (l == 0) ++ return SEQ_START_TOKEN; ++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ if (!hs) ++ return NULL; ++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); ++ return hs; +} + -+int ext3_mb_load_descr(struct ext3_buddy *e3b) ++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) +{ -+ struct ext3_mb_group_descr *grp; -+ struct ext3_group_desc *gdp; -+ struct buffer_head *bh; -+ int err, i; ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs = v; ++ ++ ++*pos; ++ if (v == SEQ_START_TOKEN) ++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ else ++ return ext3_mb_history_skip_empty(s, ++hs, 0); ++} + -+ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); -+ if (err) -+ return err; -+ -+ e3b->bd_bd->bb_first_free = grp->mgd_first_free; -+ e3b->bd_bd->bb_free = grp->mgd_free; -+ for (i = 0; i <= e3b->bd_blkbits + 1; i++) { -+ J_ASSERT(i < 16); -+ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i]; -+ } -+ brelse(bh); ++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) ++{ ++ struct ext3_mb_history *hs = v; ++ char buf[20], buf2[20]; + -+ /* additional checks against old group descriptor */ -+ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); -+ if (!gdp) -+ return -EIO; -+ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { -+ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", -+ e3b->bd_group, e3b->bd_bd->bb_free, -+ le16_to_cpu(gdp->bg_free_blocks_count)); -+ return -ENODATA; ++ if (v == SEQ_START_TOKEN) { ++ seq_printf(seq, "%-17s %-17s %-5s 
%-5s %-2s %-5s %-5s %-6s\n", ++ "goal", "result", "found", "grps", "cr", "merge", ++ "tail", "broken"); ++ return 0; + } + ++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, ++ hs->goal.fe_start, hs->goal.fe_len); ++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, ++ hs->result.fe_start, hs->result.fe_len); ++ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, ++ buf2, hs->found, hs->groups, hs->cr, ++ hs->merged ? "M" : "", hs->tail, ++ hs->buddy ? 1 << hs->buddy : 0); + return 0; +} + ++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_history_ops = { ++ .start = ext3_mb_seq_history_start, ++ .next = ext3_mb_seq_history_next, ++ .stop = ext3_mb_seq_history_stop, ++ .show = ext3_mb_seq_history_show, ++}; + -+int ext3_mb_update_descr(struct ext3_buddy *e3b) ++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) +{ -+ struct ext3_mb_group_descr *grp; -+ struct ext3_group_desc *gdp; -+ struct buffer_head *bh; -+ handle_t *handle; -+ int err, i; ++ struct super_block *sb = PDE(inode)->data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_proc_session *s; ++ int rc, size; + -+ /* additional checks against old group descriptor */ -+ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); -+ if (!gdp) ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (s == NULL) + return -EIO; -+ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { -+ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", -+ e3b->bd_group, e3b->bd_bd->bb_free, -+ le16_to_cpu(gdp->bg_free_blocks_count)); -+ return -ENODATA; -+ } -+ -+ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); -+ if (err) -+ return err; ++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; ++ s->history = kmalloc(size, GFP_KERNEL); ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(s->history, sbi->s_mb_history, size); ++ s->max = sbi->s_mb_history_max; ++ s->start = 
sbi->s_mb_history_cur % s->max; ++ spin_unlock(&sbi->s_mb_history_lock); + -+ handle = ext3_journal_start(EXT3_SB(e3b->bd_sb)->s_buddy, 1); -+ if (IS_ERR(handle)) { -+ err = PTR_ERR(handle); -+ handle = NULL; -+ goto out; ++ rc = seq_open(file, &ext3_mb_seq_history_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = s; ++ } else { ++ kfree(s->history); ++ kfree(s); + } ++ return rc; + -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ goto out; -+ grp->mgd_first_free = e3b->bd_bd->bb_first_free; -+ grp->mgd_free = e3b->bd_bd->bb_free; -+ for (i = 0; i <= e3b->bd_blkbits + 1; i++) { -+ J_ASSERT(i < 16); -+ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i]; -+ } -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) -+ goto out; -+ err = 0; -+out: -+ brelse(bh); -+ if (handle) -+ ext3_journal_stop(handle); -+ return err; +} + -+int ext3_mb_generate_buddy(struct ext3_buddy *e3b) ++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) ++{ ++ struct seq_file *seq = (struct seq_file *)file->private_data; ++ struct ext3_mb_proc_session *s = seq->private; ++ kfree(s->history); ++ kfree(s); ++ return seq_release(inode, file); ++} ++ ++static struct file_operations ext3_mb_seq_history_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, ++}; ++ ++static void ext3_mb_history_release(struct super_block *sb) +{ -+ struct super_block *sb = e3b->bd_sb; -+ struct buffer_head *bh; -+ int i, count = 0; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; + -+ mb_debug("generate buddy for group %d\n", e3b->bd_group); -+ memset(e3b->bd_bh->b_data, 0xff, sb->s_blocksize); -+ memset(e3b->bd_bh2->b_data, 0xff, sb->s_blocksize); ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_history", sbi->s_mb_proc); ++ remove_proc_entry(name, 
proc_root_ext3); + -+ bh = read_block_bitmap(sb, e3b->bd_group); -+ if (bh == NULL) -+ return -EIO; ++ if (sbi->s_mb_history) ++ kfree(sbi->s_mb_history); ++} + -+ /* mb_free_blocks will set real free */ -+ e3b->bd_bd->bb_free = 0; -+ e3b->bd_bd->bb_first_free = 1 << 15; -+ /* -+ * if change bb_counters size, don't forget about -+ * ext3_mb_init_backend() -bzzz -+ */ -+ memset(e3b->bd_bd->bb_counters, 0, -+ sizeof(unsigned) * (sb->s_blocksize_bits + 2)); ++static void ext3_mb_history_init(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ int i; + -+ /* loop over the blocks, and create buddies for free ones */ -+ for (i = 0; i < sb->s_blocksize * 8; i++) { -+ if (!mb_test_bit(i, (void *) bh->b_data)) { -+ mb_free_blocks(e3b, i, 1); -+ count++; ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); ++ if (sbi->s_mb_proc != NULL) { ++ struct proc_dir_entry *p; ++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_history_fops; ++ p->data = sb; + } + } -+ brelse(bh); -+ mb_check_buddy(e3b); -+ ext3_mb_dirty_buddy(e3b); + -+ return 0; ++ sbi->s_mb_history_max = 1000; ++ sbi->s_mb_history_cur = 0; ++ spin_lock_init(&sbi->s_mb_history_lock); ++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); ++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); ++ memset(sbi->s_mb_history, 0, i); ++ /* if we can't allocate history, then we simple won't use it */ +} + -+EXPORT_SYMBOL(ext3_mb_new_blocks); ++static void ++ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_history h; ++ ++ if (likely(sbi->s_mb_history == NULL)) ++ return; ++ ++ h.goal = ac->ac_g_ex; ++ h.result = ac->ac_b_ex; ++ h.found = ac->ac_found; ++ h.cr = ac->ac_criteria; ++ h.groups = ac->ac_groups_scanned; ++ h.tail = ac->ac_tail; ++ h.buddy = 
ac->ac_buddy; ++ h.merged = 0; ++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) ++ h.merged = 1; ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); ++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) ++ sbi->s_mb_history_cur = 0; ++ spin_unlock(&sbi->s_mb_history_lock); ++} + -+#define MB_CREDITS \ -+ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \ -+ 2 * EXT3_SINGLEDATA_TRANS_BLOCKS) ++#else ++#define ext3_mb_history_release(sb) ++#define ext3_mb_history_init(sb) ++#endif + -+int ext3_mb_init_backend(struct super_block *sb, int *created) ++int ext3_mb_init_backend(struct super_block *sb) +{ -+ int err, i, len, descr_per_block, buddy_offset, size; -+ struct inode *root = sb->s_root->d_inode; + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_mb_grp_header *hdr; -+ struct buffer_head *bh = NULL; -+ unsigned long block; -+ struct dentry *db; -+ handle_t *handle; -+ tid_t target; -+ -+ *created = 0; ++ int i, len; ++ + len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; -+ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL); -+ if (sbi->s_buddy_blocks == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ sbi->s_group_info = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n"); + return -ENOMEM; + } -+ memset(sbi->s_buddy_blocks, 0, len); -+ sbi->s_buddy = NULL; -+ -+ down(&root->i_sem); -+ len = strlen(EXT3_BUDDY_FILE); -+ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len); -+ if (IS_ERR(db)) { -+ err = PTR_ERR(db); -+ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err); -+ up(&root->i_sem); -+ goto out; -+ } ++ memset(sbi->s_group_info, 0, len); + -+ if (db->d_inode == NULL) { -+ err = ext3_create(root, db, S_IFREG, NULL); -+ if (err) { -+ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err); -+ 
up(&root->i_sem); -+ goto out; -+ } -+ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME; -+ *created = 1; -+ mb_debug("no buddy file, regenerate\n"); -+ } -+ up(&root->i_sem); -+ sbi->s_buddy = igrab(db->d_inode); -+ -+ /* calculate needed size */ -+ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) -+ / sizeof(struct ext3_mb_group_descr); -+ buddy_offset = (sbi->s_groups_count + descr_per_block - 1) -+ / descr_per_block; -+ len = sbi->s_groups_count * sb->s_blocksize * 2 + -+ buddy_offset * sb->s_blocksize; -+ if (len != i_size_read(sbi->s_buddy)) { -+ if (*created == 0) -+ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n", -+ (unsigned) len, -+ (unsigned) i_size_read(sbi->s_buddy)); -+ *created = 1; -+ } -+ -+ /* read/create mb group descriptors */ -+ for (i = 0; i < buddy_offset; i++) { -+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); -+ if (IS_ERR(handle)) { -+ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); -+ err = PTR_ERR(handle); -+ goto err_out; -+ } -+ -+ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err); -+ if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err); -+ goto err_out; -+ } -+ hdr = (struct ext3_mb_grp_header *) bh->b_data; -+ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ goto err_out; -+ if (*created == 0) -+ printk(KERN_ERR -+ "EXT3-fs: invalid header 0x%x in %d," -+ "regenerate\n", hdr->mh_magic, i); -+ *created = 1; -+ hdr->mh_magic = EXT3_MB_MAGIC_V1; -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) -+ goto err_out; -+ } -+ brelse(bh); -+ ext3_journal_stop(handle); ++ sbi->s_buddy_cache = new_inode(sb); ++ if (sbi->s_buddy_cache == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); ++ kfree(sbi->s_group_info); ++ return -ENOMEM; + } + + /* -+ * if change bb_counters size, don't forget about ext3_mb_generate_buddy() ++ * calculate needed size. 
if change bb_counters size, ++ * don't forget about ext3_mb_generate_buddy() + */ -+ len = sizeof(struct ext3_buddy_group_blocks); -+ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2); ++ len = sizeof(struct ext3_group_info); ++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); + for (i = 0; i < sbi->s_groups_count; i++) { ++ struct ext3_group_desc * desc; + -+ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL); -+ if (sbi->s_buddy_blocks[i] == NULL) { ++ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_group_info[i] == NULL) { + printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); -+ err = -ENOMEM; -+ goto out2; -+ } -+ memset(sbi->s_buddy_blocks[i], 0, len); -+ -+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); -+ if (IS_ERR(handle)) { -+ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); -+ err = PTR_ERR(handle); -+ goto out2; -+ } -+ -+ /* allocate block for bitmap */ -+ block = buddy_offset + i * 2; -+ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); -+ if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err); -+ goto out2; -+ } -+ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; -+ brelse(bh); -+ -+ /* allocate block for buddy */ -+ block = buddy_offset + i * 2 + 1; -+ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); -+ if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err); -+ goto out2; ++ goto err_out; + } -+ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; -+ brelse(bh); -+ -+ size = (block + 1) << sbi->s_buddy->i_blkbits; -+ if (size > sbi->s_buddy->i_size) { -+ *created = 1; -+ EXT3_I(sbi->s_buddy)->i_disksize = size; -+ i_size_write(sbi->s_buddy, size); -+ mark_inode_dirty(sbi->s_buddy); ++ desc = ext3_get_group_desc(sb, i, NULL); ++ if (desc == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant read descriptor %u\n", i); ++ goto err_out; + } -+ ext3_journal_stop(handle); -+ -+ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock); -+ 
sbi->s_buddy_blocks[i]->bb_md_cur = NULL; -+ sbi->s_buddy_blocks[i]->bb_tid = 0; ++ memset(sbi->s_group_info[i], 0, len); ++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, ++ &sbi->s_group_info[i]->bb_state); ++ sbi->s_group_info[i]->bb_free = ++ le16_to_cpu(desc->bg_free_blocks_count); + } + -+ if (journal_start_commit(sbi->s_journal, &target)) -+ log_wait_commit(sbi->s_journal, target); -+ -+out2: -+ dput(db); -+out: -+ return err; ++ return 0; + +err_out: -+ return err; ++ while (--i >= 0) ++ kfree(sbi->s_group_info[i]); ++ iput(sbi->s_buddy_cache); ++ ++ return -ENOMEM; +} + -+int ext3_mb_write_descriptors(struct super_block *sb) ++int ext3_mb_init(struct super_block *sb, int needs_recovery) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_buddy e3b; -+ int ret = 0, i, err; ++ struct inode *root = sb->s_root->d_inode; ++ unsigned i, offset, max; ++ struct dentry *dentry; + -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ if (sbi->s_buddy_blocks[i] == NULL) -+ continue; ++ if (!test_opt(sb, MBALLOC)) ++ return 0; + -+ err = ext3_mb_load_buddy(sb, i, &e3b); -+ if (err == 0) { -+ ext3_mb_update_descr(&e3b); -+ ext3_mb_release_desc(&e3b); -+ } else -+ ret = err; ++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); ++ ++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_offsets == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ return -ENOMEM; + } -+ return ret; ++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_maxs == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ ++ /* order 0 is regular bitmap */ ++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; ++ sbi->s_mb_offsets[0] = 0; ++ ++ i = 1; ++ offset = 0; ++ max = sb->s_blocksize << 2; ++ do { ++ sbi->s_mb_offsets[i] = offset; ++ sbi->s_mb_maxs[i] = max; ++ offset += 1 << (sb->s_blocksize_bits - i); ++ max = max >> 1; ++ i++; ++ } while (i <= sb->s_blocksize_bits + 1); ++ ++ ++ /* init file for buddy data */ ++ if ((i = 
ext3_mb_init_backend(sb))) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return i; ++ } ++ ++ spin_lock_init(&sbi->s_reserve_lock); ++ spin_lock_init(&sbi->s_md_lock); ++ INIT_LIST_HEAD(&sbi->s_active_transaction); ++ INIT_LIST_HEAD(&sbi->s_closed_transaction); ++ INIT_LIST_HEAD(&sbi->s_committed_transaction); ++ spin_lock_init(&sbi->s_bal_lock); ++ ++ /* remove old on-disk buddy file */ ++ down(&root->i_sem); ++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy")); ++ if (dentry->d_inode != NULL) { ++ i = vfs_unlink(root, dentry); ++ if (i != 0) ++ printk("EXT3-fs: can't remove .buddy file: %d\n", i); ++ } ++ dput(dentry); ++ up(&root->i_sem); ++ ++ ext3_mb_history_init(sb); ++ ++ printk("EXT3-fs: mballoc enabled\n"); ++ return 0; +} + +int ext3_mb_release(struct super_block *sb) @@ -1739,78 +2197,40 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + spin_unlock(&sbi->s_md_lock); + ext3_mb_free_committed_blocks(sb); + -+ if (sbi->s_buddy_blocks) { -+ ext3_mb_write_descriptors(sb); ++ if (sbi->s_group_info) { + for (i = 0; i < sbi->s_groups_count; i++) { -+ if (sbi->s_buddy_blocks[i] == NULL) ++ if (sbi->s_group_info[i] == NULL) + continue; -+ kfree(sbi->s_buddy_blocks[i]); ++ kfree(sbi->s_group_info[i]); + } -+ kfree(sbi->s_buddy_blocks); -+ } -+ if (sbi->s_buddy) -+ iput(sbi->s_buddy); ++ kfree(sbi->s_group_info); ++ } ++ if (sbi->s_mb_offsets) ++ kfree(sbi->s_mb_offsets); ++ if (sbi->s_mb_maxs) ++ kfree(sbi->s_mb_maxs); ++ if (sbi->s_buddy_cache) ++ iput(sbi->s_buddy_cache); + if (sbi->s_blocks_reserved) + printk("ext3-fs: %ld blocks being reserved at umount!\n", + sbi->s_blocks_reserved); + if (ext3_mb_stats) { -+ printk("EXT3-fs: mballoc: %lu blocks %lu reqs " -+ "(%lu success)\n", sbi->s_bal_allocated, -+ sbi->s_bal_reqs, sbi->s_bal_success); -+ printk("EXT3-fs: mballoc: %lu extents scanned, " -+ "%lu goal hits, %lu breaks\n", sbi->s_bal_ex_scanned, -+ sbi->s_bal_goals, sbi->s_bal_breaks); -+ } -+ 
-+ return 0; -+} -+ -+int ext3_mb_init(struct super_block *sb, int needs_recovery) -+{ -+ struct ext3_buddy e3b; -+ int i, err, created; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ /* init file for buddy data */ -+ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ if ((err = ext3_mb_init_backend(sb, &created))) -+ return err; -+ -+repeat: -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { -+ err = ext3_mb_load_buddy(sb, i, &e3b); -+ if (err) { -+ /* FIXME: release backend */ -+ return err; -+ } -+ if (created || needs_recovery) -+ ext3_mb_generate_buddy(&e3b); -+ else -+ err = ext3_mb_load_descr(&e3b); -+ ext3_mb_release_desc(&e3b); -+ if (err == -ENODATA) { -+ created = 1; -+ goto repeat; -+ } -+ } -+ if (created || needs_recovery) -+ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n", -+ EXT3_SB(sb)->s_groups_count); -+ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); -+ spin_lock_init(&EXT3_SB(sb)->s_md_lock); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); -+ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ -+ spin_lock_init(&EXT3_SB(sb)->s_bal_lock); -+ if (ext3_mb_stats) { -+ printk("EXT3-fs: mballoc enabled (stats)\n"); -+ } else { -+ printk("EXT3-fs: mballoc enabled\n"); -+ } ++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", ++ atomic_read(&sbi->s_bal_allocated), ++ atomic_read(&sbi->s_bal_reqs), ++ atomic_read(&sbi->s_bal_success)); ++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " ++ "%u 2^N hits, %u breaks\n", ++ atomic_read(&sbi->s_bal_ex_scanned), ++ atomic_read(&sbi->s_bal_goals), ++ atomic_read(&sbi->s_bal_2orders), ++ atomic_read(&sbi->s_bal_breaks)); ++ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", ++ sbi->s_mb_buddies_generated++, ++ sbi->s_mb_generation_time); ++ } ++ ++ ext3_mb_history_release(sb); + + return 0; +} @@ -1857,8 +2277,11 @@ Index: 
linux-2.6.5-7.201/fs/ext3/mballoc.c + mb_debug("\n"); + ext3_unlock_group(sb, md->group); + ++ /* balance refcounts from ext3_mb_free_metadata() */ ++ page_cache_release(e3b.bd_buddy_page); ++ page_cache_release(e3b.bd_bitmap_page); ++ + kfree(md); -+ ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + + } while (md); @@ -1875,7 +2298,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + /* new transaction! time to close last one and free blocks for + * committed transaction. we know that only transaction can be + * active, so previos transaction can be being logged and we -+ * know that transaction before previous is known to be alreade ++ * know that transaction before previous is known to be already + * logged. this means that now we may free blocks freed in all + * transactions before previous one. hope I'm clear enough ... */ + @@ -1898,12 +2321,15 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, + int group, int block, int count) +{ -+ struct ext3_buddy_group_blocks *db = e3b->bd_bd; ++ struct ext3_group_info *db = e3b->bd_info; + struct super_block *sb = e3b->bd_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_free_metadata *md; + int i; + ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ + ext3_lock_group(sb, group); + for (i = 0; i < count; i++) { + md = db->bb_md_cur; @@ -1925,6 +2351,12 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + spin_lock(&sbi->s_md_lock); + list_add(&md->list, &sbi->s_active_transaction); + spin_unlock(&sbi->s_md_lock); ++ /* protect buddy cache from being freed, ++ * otherwise we'll refresh it from ++ * on-disk bitmap and lose not-yet-available ++ * blocks */ ++ page_cache_get(e3b->bd_buddy_page); ++ page_cache_get(e3b->bd_bitmap_page); + db->bb_md_cur = md; + db->bb_tid = handle->h_transaction->t_tid; + mb_debug("new md 0x%p for group %u\n", @@ -2036,12 +2468,13 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + if 
(err) + goto error_return; + -+ if (unlikely(ext3_mb_aggressive)) { ++#ifdef AGGRESSIVE_CHECK ++ { + int i; + for (i = 0; i < count; i++) + J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); + } -+ ++#endif + mb_clear_bits(bitmap_bh->b_data, bit, count); + + /* We dirtied the bitmap block */ @@ -2064,7 +2497,6 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); + -+ ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + + *freed = count; @@ -2131,52 +2563,30 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +} + + -+extern void ext3_free_blocks_old(handle_t *handle, struct inode *inode, -+ unsigned long block, unsigned long count); -+void ext3_free_blocks(handle_t *handle, struct inode *inode, -+ unsigned long block, unsigned long count, int metadata) ++void ext3_free_blocks(handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count, int metadata) +{ ++ struct super_block *sb; + int freed; + -+ if (!test_opt(inode->i_sb, MBALLOC) || -+ EXT3_SB(inode->i_sb)->s_buddy_blocks == NULL) ++ sb = inode->i_sb; ++ if (!test_opt(sb, MBALLOC)) + ext3_free_blocks_old(handle, inode, block, count); + else { -+ ext3_mb_free_blocks(handle, inode, block,count,metadata,&freed); ++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); + if (freed) + DQUOT_FREE_BLOCK(inode, freed); + } + return; +} -Index: linux-2.6.5-7.201/fs/ext3/proc.c -=================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/proc.c 2005-10-13 19:40:57.851699336 +0400 -+++ linux-2.6.5-7.201/fs/ext3/proc.c 2005-10-14 09:02:36.000000000 +0400 -@@ -0,0 +1,195 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ + +#define EXT3_ROOT "ext3" -+#define EXT3_MB_AGGRESSIVE_NAME "mb_aggressive" +#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME 
"mb_max_to_scan" ++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" + -+ -+static struct proc_dir_entry *proc_root_ext3; -+ -+ -+static int ext3_mb_aggressive_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) ++static int ext3_mb_stats_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) +{ + int len; + @@ -2184,19 +2594,19 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + if (off != 0) + return 0; + -+ len = sprintf(page, "%ld\n", ext3_mb_aggressive); ++ len = sprintf(page, "%ld\n", ext3_mb_stats); + *start = page; + return len; +} + -+static int ext3_mb_aggressive_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) ++static int ext3_mb_stats_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) +{ + char str[32]; + + if (count >= sizeof(str)) { + printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", -+ EXT3_MB_AGGRESSIVE_NAME, sizeof(str)); ++ EXT3_MB_STATS_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } + @@ -2204,12 +2614,12 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + return -EFAULT; + + /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ ext3_mb_aggressive = (simple_strtol(str, NULL, 0) != 0); ++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); + return count; +} + -+static int ext3_mb_stats_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) ++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) +{ + int len; + @@ -2217,19 +2627,20 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + if (off != 0) + return 0; + -+ len = sprintf(page, "%ld\n", ext3_mb_stats); ++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); + *start = page; + return len; +} + -+static int ext3_mb_stats_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) ++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void 
*data) +{ + char str[32]; ++ long value; + + if (count >= sizeof(str)) { + printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", -+ EXT3_MB_STATS_NAME, sizeof(str)); ++ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } + @@ -2237,12 +2648,17 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + return -EFAULT; + + /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_max_to_scan = value; ++ + return count; +} + -+static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) ++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) +{ + int len; + @@ -2250,20 +2666,20 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + if (off != 0) + return 0; + -+ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); ++ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan); + *start = page; + return len; +} + -+static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) ++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) +{ + char str[32]; + long value; + + if (count >= sizeof(str)) { + printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", -+ EXT3_MB_MAX_TO_SCAN_NAME, sizeof(str)); ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } + @@ -2272,47 +2688,32 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + + /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ + value = simple_strtol(str, NULL, 0); -+ if (value <= 0) ++ if (value <= 0) + return -ERANGE; + -+ ext3_mb_max_to_scan = value; ++ ext3_mb_min_to_scan = value; + + return count; +} + +int __init init_ext3_proc(void) +{ -+ struct proc_dir_entry *proc_ext3_mb_aggressive; + struct proc_dir_entry *proc_ext3_mb_stats; + struct proc_dir_entry 
*proc_ext3_mb_max_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_min_to_scan; + + proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); + if (proc_root_ext3 == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT); -+ return -EIO; -+ } -+ -+ /* Initialize EXT3_MB_AGGRESSIVE_NAME */ -+ proc_ext3_mb_aggressive = create_proc_entry(EXT3_MB_AGGRESSIVE_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_aggressive == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_AGGRESSIVE_NAME); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT); + return -EIO; + } + -+ proc_ext3_mb_aggressive->data = NULL; -+ proc_ext3_mb_aggressive->read_proc = ext3_mb_aggressive_read; -+ proc_ext3_mb_aggressive->write_proc = ext3_mb_aggressive_write; -+ + /* Initialize EXT3_MB_STATS_NAME */ + proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_stats == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_STATS_NAME); -+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_STATS_NAME); + remove_proc_entry(EXT3_ROOT, proc_root_fs); + return -EIO; + } @@ -2323,13 +2724,12 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + + /* Initialize EXT3_MAX_TO_SCAN_NAME */ + proc_ext3_mb_max_to_scan = create_proc_entry( -+ EXT3_MB_MAX_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ EXT3_MB_MAX_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_max_to_scan == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_STATS_NAME); ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_MAX_TO_SCAN_NAME); + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); + 
remove_proc_entry(EXT3_ROOT, proc_root_fs); + return -EIO; + } @@ -2338,130 +2738,43 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; + proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; + ++ /* Initialize EXT3_MIN_TO_SCAN_NAME */ ++ proc_ext3_mb_min_to_scan = create_proc_entry( ++ EXT3_MB_MIN_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_min_to_scan == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_MIN_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_min_to_scan->data = NULL; ++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; ++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; ++ + return 0; +} + +void exit_ext3_proc(void) +{ -+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); +} -Index: linux-2.6.5-7.201/fs/ext3/inode.c -=================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/inode.c 2005-10-14 08:59:38.000000000 +0400 -+++ linux-2.6.5-7.201/fs/ext3/inode.c 2005-10-14 08:59:39.000000000 +0400 -@@ -572,7 +572,7 @@ - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < keys; i++) -- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); - return err; - } - -@@ -673,7 +673,7 @@ - if (err == -EAGAIN) - for (i = 0; i < num; i++) - ext3_free_blocks(handle, inode, -- le32_to_cpu(where[i].key), 1); -+ le32_to_cpu(where[i].key), 1, 1); - return err; - } - -@@ 
-1835,7 +1835,7 @@ - } - } - -- ext3_free_blocks(handle, inode, block_to_free, count); -+ ext3_free_blocks(handle, inode, block_to_free, count, 1); - } - - /** -@@ -2006,7 +2006,7 @@ - ext3_journal_test_restart(handle, inode); - } - -- ext3_free_blocks(handle, inode, nr, 1); -+ ext3_free_blocks(handle, inode, nr, 1, 1); - - if (parent_bh) { - /* -Index: linux-2.6.5-7.201/fs/ext3/super.c +Index: linux-2.6.5-7.201/fs/ext3/Makefile =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/super.c 2005-10-14 08:59:38.000000000 +0400 -+++ linux-2.6.5-7.201/fs/ext3/super.c 2005-10-14 09:02:36.000000000 +0400 -@@ -389,6 +389,7 @@ - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_mb_release(sb); - ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); -@@ -543,6 +544,7 @@ - Opt_commit, Opt_journal_update, Opt_journal_inum, - Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -+ Opt_mballoc, Opt_mbfactor, - Opt_err, Opt_extents, Opt_extdebug - }; - -@@ -590,6 +592,8 @@ - {Opt_iopen_nopriv, "iopen_nopriv"}, - {Opt_extents, "extents"}, - {Opt_extdebug, "extdebug"}, -+ {Opt_mballoc, "mballoc"}, -+ {Opt_mballoc, "mbfactor=%u"}, - {Opt_err, NULL} - }; - -@@ -811,6 +815,16 @@ - case Opt_extdebug: - set_opt (sbi->s_mount_opt, EXTDEBUG); - break; -+ case Opt_mballoc: -+ set_opt (sbi->s_mount_opt, MBALLOC); -+ break; -+ case Opt_mbfactor: -+ if (match_int(&args[0], &option)) -+ return 0; -+ if (option < 0) -+ return 0; -+ sbi->s_mb_factor = option; -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1464,6 +1478,7 @@ - ext3_count_dirs(sb)); - - ext3_ext_init(sb); -+ ext3_mb_init(sb, needs_recovery); - - return 0; - -@@ -2112,7 +2127,13 @@ +--- linux-2.6.5-7.201.orig/fs/ext3/Makefile 2005-12-17 02:53:30.000000000 +0300 ++++ linux-2.6.5-7.201/fs/ext3/Makefile 2005-12-17 
03:10:23.000000000 +0300 +@@ -6,7 +6,7 @@ - static int __init init_ext3_fs(void) - { -- int err = init_ext3_xattr(); -+ int err; -+ -+ err = init_ext3_proc(); -+ if (err) -+ return err; -+ -+ err = init_ext3_xattr(); - if (err) - return err; - err = init_inodecache(); -@@ -2141,6 +2162,7 @@ - unregister_filesystem(&ext3_fs_type); - destroy_inodecache(); - exit_ext3_xattr(); -+ exit_ext3_proc(); - } + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o \ +- extents.o ++ extents.o mballoc.o - int ext3_prep_san_write(struct inode *inode, long *blocks, + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch new file mode 100644 index 0000000..a2b9caf --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch @@ -0,0 +1,2774 @@ +Index: linux-2.6.12.6/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.12.6.orig/include/linux/ext3_fs.h 2005-12-17 02:17:16.000000000 +0300 ++++ linux-2.6.12.6/include/linux/ext3_fs.h 2005-12-17 02:21:21.000000000 +0300 +@@ -57,6 +57,14 @@ struct statfs; + #define ext3_debug(f, a...) 
do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -366,6 +374,7 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -727,7 +736,7 @@ extern int ext3_bg_has_super(struct supe + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, +- unsigned long); ++ unsigned long, int); + extern void ext3_free_blocks_sb (handle_t *, struct super_block *, + unsigned long, unsigned long, int *); + extern unsigned long ext3_count_free_blocks (struct super_block *); +@@ -848,6 +857,17 @@ extern void ext3_extents_initialize_bloc + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern long ext3_mb_stats; ++extern long ext3_mb_max_to_scan; ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct super_block *, int); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); ++ + #endif /* __KERNEL__ */ + + /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ +Index: linux-2.6.12.6/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.6.12.6.orig/include/linux/ext3_fs_sb.h 2005-08-29 20:55:27.000000000 +0400 ++++ linux-2.6.12.6/include/linux/ext3_fs_sb.h 2005-12-17 02:21:21.000000000 +0300 +@@ -21,8 +21,14 @@ + #include + #include + #include ++#include + #endif + #include ++#include ++ ++struct ext3_buddy_group_blocks; ++struct ext3_mb_history; ++#define EXT3_BB_MAX_BLOCKS + + /* + * third extended-fs super-block data in memory +@@ -78,6 +84,38 @@ struct ext3_sb_info { + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_group_info **s_group_info; ++ struct inode *s_buddy_cache; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ unsigned short *s_mb_offsets, *s_mb_maxs; ++ ++ /* history to debug policy */ ++ struct ext3_mb_history *s_mb_history; ++ int s_mb_history_cur; ++ int s_mb_history_max; ++ struct proc_dir_entry *s_mb_proc; ++ spinlock_t s_mb_history_lock; ++ ++ /* stats for buddy allocator */ ++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ++ atomic_t s_bal_success; /* we found long enough chunks */ ++ atomic_t s_bal_allocated; /* in blocks */ ++ atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_goals; /* goal hits */ ++ atomic_t s_bal_breaks; /* too long searches */ ++ atomic_t s_bal_2orders; /* 2^order hits */ ++ spinlock_t s_bal_lock; ++ unsigned long s_mb_buddies_generated; ++ unsigned long long s_mb_generation_time; + }; + + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-2.6.12.6/fs/ext3/super.c +=================================================================== +--- 
linux-2.6.12.6.orig/fs/ext3/super.c 2005-12-17 02:17:16.000000000 +0300 ++++ linux-2.6.12.6/fs/ext3/super.c 2005-12-17 02:21:21.000000000 +0300 +@@ -387,6 +387,7 @@ static void ext3_put_super (struct super + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -597,7 +598,7 @@ enum { + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, +- Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_extdebug, Opt_mballoc, + }; + + static match_table_t tokens = { +@@ -649,6 +651,7 @@ static match_table_t tokens = { + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -964,6 +967,9 @@ clear_qf_name: + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt (sbi->s_mount_opt, MBALLOC); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1669,6 +1675,7 @@ static int ext3_fill_super (struct super + ext3_count_dirs(sb)); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + lock_kernel(); + return 0; + +@@ -2548,7 +2555,13 @@ static struct file_system_type ext3_fs_t + + static int __init init_ext3_fs(void) + { +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2570,6 +2583,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); + } + + int ext3_prep_san_write(struct inode *inode, long *blocks, +Index: linux-2.6.12.6/fs/ext3/extents.c +=================================================================== +--- 
linux-2.6.12.6.orig/fs/ext3/extents.c 2005-12-17 02:17:16.000000000 +0300 ++++ linux-2.6.12.6/fs/ext3/extents.c 2005-12-17 02:21:21.000000000 +0300 +@@ -771,7 +771,7 @@ cleanup: + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st + path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + +@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t + bh = sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-2.6.12.6/fs/ext3/inode.c +=================================================================== +--- linux-2.6.12.6.orig/fs/ext3/inode.c 2005-12-17 02:17:16.000000000 +0300 ++++ linux-2.6.12.6/fs/ext3/inode.c 2005-12-17 02:21:21.000000000 +0300 +@@ 
-564,7 +564,7 @@ static int ext3_alloc_branch(handle_t *h + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -1850,7 +1850,7 @@ ext3_clear_blocks(handle_t *handle, stru + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2023,7 +2023,7 @@ static void ext3_free_branches(handle_t + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-2.6.12.6/fs/ext3/balloc.c +=================================================================== +--- linux-2.6.12.6.orig/fs/ext3/balloc.c 2005-08-29 20:55:27.000000000 +0400 ++++ linux-2.6.12.6/fs/ext3/balloc.c 2005-12-17 02:21:21.000000000 +0300 +@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ + * + * Return buffer_head on success or NULL in case of failure. + */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -490,24 +490,6 @@ error_return: + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- unsigned long block, unsigned long count) +-{ +- struct super_block * sb; +- int dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. 
This +@@ -1162,7 +1144,7 @@ int ext3_should_retry_alloc(struct super + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. + */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-2.6.12.6/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.12.6.orig/fs/ext3/xattr.c 2005-08-29 20:55:27.000000000 +0400 ++++ linux-2.6.12.6/fs/ext3/xattr.c 2005-12-17 02:21:33.000000000 +0300 +@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); +- ext3_free_blocks(handle, inode, bh->b_blocknr, 1); ++ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { +@@ -804,7 +804,7 @@ inserted: + new_bh = sb_getblk(sb, block); + if (!new_bh) { + getblk_failed: +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + error = -EIO; + goto cleanup; + } +Index: linux-2.6.12.6/fs/ext3/mballoc.c +=================================================================== +--- linux-2.6.12.6.orig/fs/ext3/mballoc.c 2005-12-09 13:08:53.191437750 +0300 ++++ linux-2.6.12.6/fs/ext3/mballoc.c 2005-12-17 02:21:21.000000000 +0300 +@@ -0,0 +1,2429 @@ ++/* ++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++ ++/* ++ * mballoc.c contains the multiblocks allocation routines ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * TODO: ++ * - bitmap read-ahead (proposed by Oleg Drokin aka green) ++ * - track min/max extents in each group for better group selection ++ * - mb_mark_used() may allocate chunk right after splitting buddy ++ * - special flag to advice allocator to look for requested + N blocks ++ * this may improve interaction between extents and mballoc ++ * - tree of groups sorted by number of free blocks ++ * - percpu reservation code (hotpath) ++ * - error handling ++ */ ++ ++/* ++ * with AGRESSIVE_CHECK allocator runs consistency checks over ++ * structures. these checks slow things down a lot ++ */ ++#define AGGRESSIVE_CHECK__ ++ ++/* ++ */ ++#define MB_DEBUG__ ++#ifdef MB_DEBUG ++#define mb_debug(fmt,a...) printk(fmt, ##a) ++#else ++#define mb_debug(fmt,a...) ++#endif ++ ++/* ++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory ++ * and you can monitor it in /proc/fs/ext3//mb_history ++ */ ++#define EXT3_MB_HISTORY ++ ++/* ++ * How long mballoc can look for a best extent (in found extents) ++ */ ++long ext3_mb_max_to_scan = 500; ++ ++/* ++ * How long mballoc must look for a best extent ++ */ ++long ext3_mb_min_to_scan = 30; ++ ++/* ++ * with 'ext3_mb_stats' allocator will collect stats that will be ++ * shown at umount. The collecting costs though! 
++ */ ++ ++long ext3_mb_stats = 1; ++ ++#ifdef EXT3_BB_MAX_BLOCKS ++#undef EXT3_BB_MAX_BLOCKS ++#endif ++#define EXT3_BB_MAX_BLOCKS 30 ++ ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++struct ext3_group_info { ++ unsigned long bb_state; ++ unsigned long bb_tid; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned short bb_fragments; ++ unsigned short bb_counters[]; ++}; ++ ++ ++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 ++#define EXT3_GROUP_INFO_LOCKED_BIT 1 ++ ++#define EXT3_MB_GRP_NEED_INIT(grp) \ ++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) ++ ++struct ext3_free_extent { ++ __u16 fe_start; ++ __u16 fe_len; ++ __u16 fe_group; ++}; ++ ++struct ext3_allocation_context { ++ struct super_block *ac_sb; ++ ++ /* search goals */ ++ struct ext3_free_extent ac_g_ex; ++ ++ /* the best found extent */ ++ struct ext3_free_extent ac_b_ex; ++ ++ /* number of iterations done. 
we have to track to limit searching */ ++ unsigned long ac_ex_scanned; ++ __u16 ac_groups_scanned; ++ __u16 ac_found; ++ __u16 ac_tail; ++ __u16 ac_buddy; ++ __u8 ac_status; ++ __u8 ac_flags; /* allocation hints */ ++ __u8 ac_criteria; ++ __u8 ac_repeats; ++ __u8 ac_2order; /* if request is to allocate 2^N blocks and ++ * N > 0, the field stores N, otherwise 0 */ ++}; ++ ++#define AC_STATUS_CONTINUE 1 ++#define AC_STATUS_FOUND 2 ++#define AC_STATUS_BREAK 3 ++ ++struct ext3_mb_history { ++ struct ext3_free_extent goal; /* goal allocation */ ++ struct ext3_free_extent result; /* result allocation */ ++ __u16 found; /* how many extents have been found */ ++ __u16 groups; /* how many groups have been scanned */ ++ __u16 tail; /* what tail broke some buddy */ ++ __u16 buddy; /* buddy the tail ^^^ broke */ ++ __u8 cr; /* which phase the result extent was found at */ ++ __u8 merged; ++}; ++ ++struct ext3_buddy { ++ struct page *bd_buddy_page; ++ void *bd_buddy; ++ struct page *bd_bitmap_page; ++ void *bd_bitmap; ++ struct ext3_group_info *bd_info; ++ struct super_block *bd_sb; ++ __u16 bd_blkbits; ++ __u16 bd_group; ++}; ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) ++ ++#ifndef EXT3_MB_HISTORY ++#define ext3_mb_store_history(sb,ac) ++#else ++static void ext3_mb_store_history(struct super_block *, ++ struct ext3_allocation_context *ac); ++#endif ++ ++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) ++ ++static struct proc_dir_entry *proc_root_ext3; ++ ++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); ++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); ++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); ++int ext3_mb_reserve_blocks(struct super_block *, int); ++void ext3_mb_release_blocks(struct super_block *, int); ++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); ++void 
ext3_mb_free_committed_blocks(struct super_block *); ++ ++#if BITS_PER_LONG == 64 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 7UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~7UL); \ ++} ++#elif BITS_PER_LONG == 32 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 3UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~3UL); \ ++} ++#else ++#error "how many bits you are?!" ++#endif ++ ++static inline int mb_test_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ return ext2_test_bit(bit, addr); ++} ++ ++static inline void mb_set_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit(bit, addr); ++} ++ ++static inline void mb_set_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit_atomic(NULL, bit, addr); ++} ++ ++static inline void mb_clear_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit(bit, addr); ++} ++ ++static inline void mb_clear_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit_atomic(NULL, bit, addr); ++} ++ ++static inline int mb_find_next_zero_bit(void *addr, int max, int start) ++{ ++ int fix; ++#if BITS_PER_LONG == 64 ++ fix = ((unsigned long) addr & 7UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~7UL); ++#elif BITS_PER_LONG == 32 ++ fix = ((unsigned long) addr & 3UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~3UL); ++#else ++#error "how many bits you are?!" 
++#endif ++ max += fix; ++ start += fix; ++ return ext2_find_next_zero_bit(addr, max, start) - fix; ++} ++ ++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) ++{ ++ char *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(max != NULL); ++ ++ if (order > e3b->bd_blkbits + 1) { ++ *max = 0; ++ return NULL; ++ } ++ ++ /* at order 0 we see each particular block */ ++ *max = 1 << (e3b->bd_blkbits + 3); ++ if (order == 0) ++ return EXT3_MB_BITMAP(e3b); ++ ++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; ++ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; ++ ++ return bb; ++} ++ ++#ifdef AGGRESSIVE_CHECK ++ ++static void mb_check_buddy(struct ext3_buddy *e3b) ++{ ++ int order = e3b->bd_blkbits + 1; ++ int max, max2, i, j, k, count; ++ int fragments = 0, fstart; ++ void *buddy, *buddy2; ++ ++ if (!test_opt(e3b->bd_sb, MBALLOC)) ++ return; ++ ++ { ++ static int mb_check_counter = 0; ++ if (mb_check_counter++ % 300 != 0) ++ return; ++ } ++ ++ while (order > 1) { ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ buddy2 = mb_find_buddy(e3b, order - 1, &max2); ++ J_ASSERT(buddy2); ++ J_ASSERT(buddy != buddy2); ++ J_ASSERT(max * 2 == max2); ++ ++ count = 0; ++ for (i = 0; i < max; i++) { ++ ++ if (mb_test_bit(i, buddy)) { ++ /* only single bit in buddy2 may be 1 */ ++ if (!mb_test_bit(i << 1, buddy2)) ++ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); ++ else if (!mb_test_bit((i << 1) + 1, buddy2)) ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ continue; ++ } ++ ++ /* both bits in buddy2 must be 0 */ ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); ++ ++ for (j = 0; j < (1 << order); j++) { ++ k = (i * (1 << order)) + j; ++ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); ++ } ++ count++; ++ } ++ J_ASSERT(e3b->bd_info->bb_counters[order] == count); ++ order--; ++ } ++ ++ fstart = -1; ++ buddy = mb_find_buddy(e3b, 0, &max); ++ for (i = 0; i < max; i++) { ++ if 
(!mb_test_bit(i, buddy)) { ++ J_ASSERT(i >= e3b->bd_info->bb_first_free); ++ if (fstart == -1) { ++ fragments++; ++ fstart = i; ++ } ++ continue; ++ } ++ fstart = -1; ++ /* check used bits only */ ++ for (j = 0; j < e3b->bd_blkbits + 1; j++) { ++ buddy2 = mb_find_buddy(e3b, j, &max2); ++ k = i >> j; ++ J_ASSERT(k < max2); ++ J_ASSERT(mb_test_bit(k, buddy2)); ++ } ++ } ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); ++ J_ASSERT(e3b->bd_info->bb_fragments == fragments); ++} ++ ++#else ++#define mb_check_buddy(e3b) ++#endif ++ ++/* find most significant bit */ ++static int inline fmsb(unsigned short word) ++{ ++ int order; ++ ++ if (word > 255) { ++ order = 7; ++ word >>= 8; ++ } else { ++ order = -1; ++ } ++ ++ do { ++ order++; ++ word >>= 1; ++ } while (word != 0); ++ ++ return order; ++} ++ ++static void inline ++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, ++ int len, struct ext3_group_info *grp) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned short min, max, chunk, border; ++ ++ mb_debug("mark %u/%u free\n", first, len); ++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ border = 2 << sb->s_blocksize_bits; ++ ++ while (len > 0) { ++ /* find how many blocks can be covered since this position */ ++ max = ffs(first | border) - 1; ++ ++ /* find how many blocks of power 2 we need to mark */ ++ min = fmsb(len); ++ ++ mb_debug(" %u/%u -> max %u, min %u\n", ++ first & ((2 << sb->s_blocksize_bits) - 1), ++ len, max, min); ++ ++ if (max < min) ++ min = max; ++ chunk = 1 << min; ++ ++ /* mark multiblock chunks only */ ++ grp->bb_counters[min]++; ++ if (min > 0) { ++ mb_debug(" set %u at %u \n", first >> min, ++ sbi->s_mb_offsets[min]); ++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); ++ } ++ ++ len -= chunk; ++ first += chunk; ++ } ++} ++ ++static void ++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ++ struct ext3_group_info *grp) ++{ ++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); 
++ unsigned short i = 0, first, len; ++ unsigned free = 0, fragments = 0; ++ unsigned long long period = get_cycles(); ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ grp->bb_first_free = i; ++ while (i < max) { ++ fragments++; ++ first = i; ++ i = find_next_bit(bitmap, max, i); ++ len = i - first; ++ free += len; ++ if (len > 1) ++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); ++ else ++ grp->bb_counters[0]++; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ grp->bb_fragments = fragments; ++ ++ /* bb_state shouldn't being modified because all ++ * others waits for init completion on page lock */ ++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); ++ if (free != grp->bb_free) { ++ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n", ++ free, grp->bb_free); ++ grp->bb_free = free; ++ } ++ ++ period = get_cycles() - period; ++ spin_lock(&EXT3_SB(sb)->s_bal_lock); ++ EXT3_SB(sb)->s_mb_buddies_generated++; ++ EXT3_SB(sb)->s_mb_generation_time += period; ++ spin_unlock(&EXT3_SB(sb)->s_bal_lock); ++} ++ ++static int ext3_mb_init_cache(struct page *page) ++{ ++ int blocksize, blocks_per_page, groups_per_page; ++ int err = 0, i, first_group, first_block; ++ struct super_block *sb; ++ struct buffer_head *bhs; ++ struct buffer_head **bh; ++ struct inode *inode; ++ char *data, *bitmap; ++ ++ mb_debug("init page %lu\n", page->index); ++ ++ inode = page->mapping->host; ++ sb = inode->i_sb; ++ blocksize = 1 << inode->i_blkbits; ++ blocks_per_page = PAGE_CACHE_SIZE / blocksize; ++ ++ groups_per_page = blocks_per_page >> 1; ++ if (groups_per_page == 0) ++ groups_per_page = 1; ++ ++ /* allocate buffer_heads to read bitmaps */ ++ if (groups_per_page > 1) { ++ err = -ENOMEM; ++ i = sizeof(struct buffer_head *) * groups_per_page; ++ bh = kmalloc(i, GFP_NOFS); ++ if (bh == NULL) ++ goto out; ++ memset(bh, 0, i); ++ } else ++ bh = &bhs; ++ ++ first_group = page->index * blocks_per_page / 2; ++ ++ /* read all groups the page covers 
into the cache */ ++ for (i = 0; i < groups_per_page; i++) { ++ struct ext3_group_desc * desc; ++ ++ if (first_group + i >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ err = -EIO; ++ desc = ext3_get_group_desc(sb, first_group + i, NULL); ++ if (desc == NULL) ++ goto out; ++ ++ err = -ENOMEM; ++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); ++ if (bh[i] == NULL) ++ goto out; ++ ++ if (buffer_uptodate(bh[i])) ++ continue; ++ ++ lock_buffer(bh[i]); ++ if (buffer_uptodate(bh[i])) { ++ unlock_buffer(bh[i]); ++ continue; ++ } ++ ++ get_bh(bh[i]); ++ bh[i]->b_end_io = end_buffer_read_sync; ++ submit_bh(READ, bh[i]); ++ mb_debug("read bitmap for group %u\n", first_group + i); ++ } ++ ++ /* wait for I/O completion */ ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ wait_on_buffer(bh[i]); ++ ++ /* XXX: I/O error handling here */ ++ ++ first_block = page->index * blocks_per_page; ++ for (i = 0; i < blocks_per_page; i++) { ++ int group; ++ ++ group = (first_block + i) >> 1; ++ if (group >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ data = page_address(page) + (i * blocksize); ++ bitmap = bh[group - first_group]->b_data; ++ ++ if ((first_block + i) & 1) { ++ /* this is block of buddy */ ++ mb_debug("put buddy for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memset(data, 0xff, blocksize); ++ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0; ++ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0, ++ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); ++ ext3_mb_generate_buddy(sb, data, bitmap, ++ EXT3_SB(sb)->s_group_info[group]); ++ } else { ++ /* this is block of bitmap */ ++ mb_debug("put bitmap for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memcpy(data, bitmap, blocksize); ++ } ++ } ++ SetPageUptodate(page); ++ ++out: ++ if (bh) { ++ for (i = 0; bh && i < groups_per_page && bh[i]; i++) ++ brelse(bh[i]); ++ if (bh != &bhs) ++ kfree(bh); ++ } ++ return err; ++} ++ ++static int ext3_mb_load_buddy(struct 
super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *inode = sbi->s_buddy_cache; ++ int blocks_per_page, block, pnum, poff; ++ struct page *page; ++ ++ mb_debug("load group %u\n", group); ++ ++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_info = sbi->s_group_info[group]; ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ e3b->bd_buddy_page = NULL; ++ e3b->bd_bitmap_page = NULL; ++ ++ block = group * 2; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_bitmap_page = page; ++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ block++; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_buddy_page = page; ++ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ return 0; ++ ++err: ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++ e3b->bd_buddy = NULL; ++ e3b->bd_bitmap = NULL; ++ return -EIO; ++} 
++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++} ++ ++ ++static inline void ++ext3_lock_group(struct super_block *sb, int group) ++{ ++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++} ++ ++static inline void ++ext3_unlock_group(struct super_block *sb, int group) ++{ ++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++} ++ ++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) ++{ ++ int order = 1; ++ void *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ while (order <= e3b->bd_blkbits + 1) { ++ block = block >> 1; ++ if (!mb_test_bit(block, bb)) { ++ /* this block is part of buddy of order 'order' */ ++ return order; ++ } ++ bb += 1 << (e3b->bd_blkbits - order); ++ order++; ++ } ++ return 0; ++} ++ ++static inline void mb_clear_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0; ++ cur += 32; ++ continue; ++ } ++ mb_clear_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static inline void mb_set_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: set whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0xffffffff; ++ cur += 32; ++ continue; ++ } ++ mb_set_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) ++{ ++ int block = 0, max = 0, order; ++ void *buddy, *buddy2; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_info->bb_free += count; ++ if (first
< e3b->bd_info->bb_first_free) ++ e3b->bd_info->bb_first_free = first; ++ ++ /* let's maintain fragments counter */ ++ if (first != 0) ++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); ++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); ++ if (block && max) ++ e3b->bd_info->bb_fragments--; ++ else if (!block && !max) ++ e3b->bd_info->bb_fragments++; ++ ++ /* let's maintain buddy itself */ ++ while (count-- > 0) { ++ block = first++; ++ order = 0; ++ ++ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); ++ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); ++ e3b->bd_info->bb_counters[order]++; ++ ++ /* start of the buddy */ ++ buddy = mb_find_buddy(e3b, order, &max); ++ ++ do { ++ block &= ~1UL; ++ if (mb_test_bit(block, buddy) || ++ mb_test_bit(block + 1, buddy)) ++ break; ++ ++ /* both the buddies are free, try to coalesce them */ ++ buddy2 = mb_find_buddy(e3b, order + 1, &max); ++ ++ if (!buddy2) ++ break; ++ ++ if (order > 0) { ++ /* for special purposes, we don't set ++ * free bits in bitmap */ ++ mb_set_bit(block, buddy); ++ mb_set_bit(block + 1, buddy); ++ } ++ e3b->bd_info->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; ++ ++ block = block >> 1; ++ order++; ++ e3b->bd_info->bb_counters[order]++; ++ ++ mb_clear_bit(block, buddy2); ++ buddy = buddy2; ++ } while (1); ++ } ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, ++ int needed, struct ext3_free_extent *ex) ++{ ++ int next, max, ord; ++ void *buddy; ++ ++ J_ASSERT(ex != NULL); ++ ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ J_ASSERT(block < max); ++ if (mb_test_bit(block, buddy)) { ++ ex->fe_len = 0; ++ ex->fe_start = 0; ++ ex->fe_group = 0; ++ return 0; ++ } ++ ++ if (likely(order == 0)) { ++ /* find actual order */ ++ order = mb_find_order_for_block(e3b, block); ++ block = block >> order; ++ } ++ ++ ex->fe_len = 1 << order; ++ 
ex->fe_start = block << order; ++ ex->fe_group = e3b->bd_group; ++ ++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { ++ ++ if (block + 1 >= max) ++ break; ++ ++ next = (block + 1) * (1 << order); ++ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) ++ break; ++ ++ ord = mb_find_order_for_block(e3b, next); ++ ++ order = ord; ++ block = next >> order; ++ ex->fe_len += 1 << order; ++ } ++ ++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); ++ return ex->fe_len; ++} ++ ++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) ++{ ++ int ord, mlen = 0, max = 0, cur; ++ int start = ex->fe_start; ++ int len = ex->fe_len; ++ unsigned ret = 0; ++ int len0 = len; ++ void *buddy; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_info->bb_free -= len; ++ if (e3b->bd_info->bb_first_free == start) ++ e3b->bd_info->bb_first_free += len; ++ ++ /* let's maintain fragments counter */ ++ if (start != 0) ++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); ++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); ++ if (mlen && max) ++ e3b->bd_info->bb_fragments++; ++ else if (!mlen && !max) ++ e3b->bd_info->bb_fragments--; ++ ++ /* let's maintain buddy itself */ ++ while (len) { ++ ord = mb_find_order_for_block(e3b, start); ++ ++ if (((start >> ord) << ord) == start && len >= (1 << ord)) { ++ /* the whole chunk may be allocated at once! 
*/ ++ mlen = 1 << ord; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ J_ASSERT((start >> ord) < max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_info->bb_counters[ord]--; ++ start += mlen; ++ len -= mlen; ++ J_ASSERT(len >= 0); ++ continue; ++ } ++ ++ /* store for history */ ++ if (ret == 0) ++ ret = len | (ord << 16); ++ ++ /* we have to split large buddy */ ++ J_ASSERT(ord > 0); ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_info->bb_counters[ord]--; ++ ++ ord--; ++ cur = (start >> ord) & ~1U; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_clear_bit(cur, buddy); ++ mb_clear_bit(cur + 1, buddy); ++ e3b->bd_info->bb_counters[ord]++; ++ e3b->bd_info->bb_counters[ord]++; ++ } ++ ++ /* now drop all the bits in bitmap */ ++ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); ++ ++ mb_check_buddy(e3b); ++ ++ return ret; ++} ++ ++/* ++ * Must be called under group lock! ++ */ ++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ unsigned long ret; ++ ++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ++ ret = mb_mark_used(e3b, &ac->ac_b_ex); ++ ++ ac->ac_status = AC_STATUS_FOUND; ++ ac->ac_tail = ret & 0xffff; ++ ac->ac_buddy = ret >> 16; ++} ++ ++/* ++ * The routine checks whether found extent is good enough. If it is, ++ * then the extent gets marked used and flag is set to the context ++ * to stop scanning. Otherwise, the extent is compared with the ++ * previous found extent and if new one is better, then it's stored ++ * in the context. Later, the best found extent will be used, if ++ * mballoc can't find good enough extent. ++ * ++ * FIXME: real allocation policy is to be designed yet! 
++ */ ++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, ++ struct ext3_free_extent *ex, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent *bex = &ac->ac_b_ex; ++ struct ext3_free_extent *gex = &ac->ac_g_ex; ++ ++ J_ASSERT(ex->fe_len > 0); ++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ ++ ac->ac_found++; ++ ++ /* ++ * The special case - take what you catch first ++ */ ++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Let's check whether the chuck is good enough ++ */ ++ if (ex->fe_len == gex->fe_len) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If this is first found extent, just store it in the context ++ */ ++ if (bex->fe_len == 0) { ++ *bex = *ex; ++ return; ++ } ++ ++ /* ++ * If new found extent is better, store it in the context ++ */ ++ if (bex->fe_len < gex->fe_len) { ++ /* if the request isn't satisfied, any found extent ++ * larger than previous best one is better */ ++ if (ex->fe_len > bex->fe_len) ++ *bex = *ex; ++ } else if (ex->fe_len > gex->fe_len) { ++ /* if the request is satisfied, then we try to find ++ * an extent that still satisfy the request, but is ++ * smaller than previous one */ ++ *bex = *ex; ++ } ++ ++ /* ++ * Let's scan at least few extents and don't pick up a first one ++ */ ++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++ ++ /* ++ * We don't want to scan for a whole year ++ */ ++ if (ac->ac_found > ext3_mb_max_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++} ++ ++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent ex = ac->ac_b_ex; ++ int group = ex.fe_group, max, err; ++ ++ J_ASSERT(ex.fe_len > 0); ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if 
(err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); ++ ++ if (max > 0) { ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_free_extent ex; ++ ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max > 0) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++/* ++ * The routine scans buddy structures (not bitmap!) 
from given order ++ * to max order and tries to find big enough chunk to satisfy the req ++ */ ++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_group_info *grp = e3b->bd_info; ++ void *buddy; ++ int i, k, max; ++ ++ J_ASSERT(ac->ac_2order > 0); ++ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) { ++ if (grp->bb_counters[i] == 0) ++ continue; ++ ++ buddy = mb_find_buddy(e3b, i, &max); ++ if (buddy == NULL) { ++ printk(KERN_ALERT "looking for wrong order?\n"); ++ break; ++ } ++ ++ k = mb_find_next_zero_bit(buddy, max, 0); ++ J_ASSERT(k < max); ++ ++ ac->ac_found++; ++ ++ ac->ac_b_ex.fe_len = 1 << i; ++ ac->ac_b_ex.fe_start = k << i; ++ ac->ac_b_ex.fe_group = e3b->bd_group; ++ ++ ext3_mb_use_best_found(ac, e3b); ++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); ++ ++ if (unlikely(ext3_mb_stats)) ++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); ++ ++ break; ++ } ++} ++ ++/* ++ * The routine scans the group and measures all found extents. ++ * In order to optimize scanning, caller must pass number of ++ * free blocks in the group, so the routine can know upper limit. 
++ */ ++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ int i, free; ++ ++ free = e3b->bd_info->bb_free; ++ J_ASSERT(free > 0); ++ ++ i = e3b->bd_info->bb_first_free; ++ ++ while (free && ac->ac_status == AC_STATUS_CONTINUE) { ++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ if (i >= sb->s_blocksize * 8) { ++ J_ASSERT(free == 0); ++ break; ++ } ++ ++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(free >= ex.fe_len); ++ ++ ext3_mb_measure_extent(ac, &ex, e3b); ++ ++ i += ex.fe_len; ++ free -= ex.fe_len; ++ } ++} ++ ++static int ext3_mb_good_group(struct ext3_allocation_context *ac, ++ int group, int cr) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_group_info *grp = sbi->s_group_info[group]; ++ unsigned free, fragments, i, bits; ++ ++ J_ASSERT(cr >= 0 && cr < 4); ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); ++ ++ free = grp->bb_free; ++ fragments = grp->bb_fragments; ++ if (free == 0) ++ return 0; ++ if (fragments == 0) ++ return 0; ++ ++ switch (cr) { ++ case 0: ++ J_ASSERT(ac->ac_2order != 0); ++ bits = ac->ac_sb->s_blocksize_bits + 1; ++ for (i = ac->ac_2order; i < bits; i++) ++ if (grp->bb_counters[i] > 0) ++ return 1; ++ case 1: ++ if ((free / fragments) >= ac->ac_g_ex.fe_len) ++ return 1; ++ case 2: ++ if (free >= ac->ac_g_ex.fe_len) ++ return 1; ++ case 3: ++ return 1; ++ default: ++ BUG(); ++ } ++ ++ return 0; ++} ++ ++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *len, int flags, int *errp) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_allocation_context ac; ++ int i, group, block, cr, err = 0; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ struct buffer_head *gdp_bh; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct 
ext3_buddy e3b; ++ ++ J_ASSERT(len != NULL); ++ J_ASSERT(*len > 0); ++ ++ sb = inode->i_sb; ++ if (!sb) { ++ printk("ext3_mb_new_nblocks: nonexistent device"); ++ return 0; ++ } ++ ++ if (!test_opt(sb, MBALLOC)) { ++ static int ext3_mballoc_warning = 0; ++ if (ext3_mballoc_warning == 0) { ++ printk(KERN_ERR "EXT3-fs: multiblock request with " ++ "mballoc disabled!\n"); ++ ext3_mballoc_warning++; ++ } ++ *len = 1; ++ err = ext3_new_block_old(handle, inode, goal, errp); ++ return err; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ ++ /* ++ * We can't allocate > group size ++ */ ++ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) ++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* someone asks for non-reserved blocks */ ++ BUG_ON(*len > 1); ++ err = ext3_mb_reserve_blocks(sb, 1); ++ if (err) { ++ *errp = err; ++ return 0; ++ } ++ } ++ ++ /* ++ * Check quota for allocation of this blocks. ++ */ ++ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) ++ *len -= 1; ++ if (*len == 0) { ++ *errp = -EDQUOT; ++ block = 0; ++ goto out; ++ } ++ ++ /* start searching from the goal */ ++ if (goal < le32_to_cpu(es->s_first_data_block) || ++ goal >= le32_to_cpu(es->s_blocks_count)) ++ goal = le32_to_cpu(es->s_first_data_block); ++ group = (goal - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ block = ((goal - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ /* set up allocation goals */ ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_groups_scanned = 0; ++ ac.ac_ex_scanned = 0; ++ ac.ac_found = 0; ++ ac.ac_sb = inode->i_sb; ++ ac.ac_g_ex.fe_group = group; ++ ac.ac_g_ex.fe_start = block; ++ ac.ac_g_ex.fe_len = *len; ++ ac.ac_flags = flags; ++ ac.ac_2order = 0; ++ ac.ac_criteria = 0; ++ ++ /* probably, the request is for 2^8+ blocks (1/2/3/... 
MB) */ ++ i = ffs(*len); ++ if (i >= 8) { ++ i--; ++ if ((*len & (~(1 << i))) == 0) ++ ac.ac_2order = i; ++ } ++ ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ ++ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; ++ } ++ ++ /* Let's just scan groups to find more-less suitable blocks */ ++ cr = ac.ac_2order ? 0 : 1; ++repeat: ++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ ac.ac_criteria = cr; ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { ++ if (group == EXT3_SB(sb)->s_groups_count) ++ group = 0; ++ ++ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) { ++ /* we need full data about the group ++ * to make a good selection */ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ext3_mb_release_desc(&e3b); ++ } ++ ++ /* check is group good for our criteries */ ++ if (!ext3_mb_good_group(&ac, group, cr)) ++ continue; ++ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ++ ext3_lock_group(sb, group); ++ if (!ext3_mb_good_group(&ac, group, cr)) { ++ /* someone did allocation from this group */ ++ ext3_unlock_group(sb, group); ++ ext3_mb_release_desc(&e3b); ++ continue; ++ } ++ ++ ac.ac_groups_scanned++; ++ if (cr == 0) ++ ext3_mb_simple_scan_group(&ac, &e3b); ++ else ++ ext3_mb_complex_scan_group(&ac, &e3b); ++ ++ ext3_unlock_group(sb, group); ++ ++ ext3_mb_release_desc(&e3b); ++ ++ if (ac.ac_status != AC_STATUS_CONTINUE) ++ break; ++ } ++ } ++ ++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ /* ++ * We've been searching too long. 
Let's try to allocate ++ * the best chunk we've found so far ++ */ ++ ++ /*if (ac.ac_found > ext3_mb_max_to_scan) ++ printk(KERN_ERR "EXT3-fs: too long searching at " ++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, ++ ac.ac_g_ex.fe_len);*/ ++ ext3_mb_try_best_found(&ac, &e3b); ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * Someone more lucky has already allocated it. ++ * The only thing we can do is just take first ++ * found block(s) ++ */ ++ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n"); ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_flags |= EXT3_MB_HINT_FIRST; ++ cr = 3; ++ goto repeat; ++ } ++ } ++ ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * We aren't lucky definitely ++ */ ++ DQUOT_FREE_BLOCK(inode, *len); ++ *errp = -ENOSPC; ++ block = 0; ++#if 1 ++ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n", ++ ac.ac_status, ac.ac_flags); ++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n", ++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, ++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); ++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", ++ sbi->s_blocks_reserved, ac.ac_found); ++ printk("EXT3-fs: groups: "); ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) ++ printk("%d: %d ", i, ++ sbi->s_group_info[i]->bb_free); ++ printk("\n"); ++#endif ++ goto out; ++ } ++ ++found: ++ J_ASSERT(ac.ac_b_ex.fe_len > 0); ++ ++ /* good news - free block(s) have been found. 
now it's time ++ * to mark block(s) in good old journaled bitmap */ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ /* we made a desicion, now mark found blocks in good old ++ * bitmap to be journaled */ ++ ++ ext3_debug("using block group %d(%d)\n", ++ ac.ac_b_group.group, gdp->bg_free_blocks_count); ++ ++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); ++ if (!bitmap_bh) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) { ++ *errp = err; ++ goto out_err; ++ } ++ ++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); ++ if (!gdp) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ if (block == le32_to_cpu(gdp->bg_block_bitmap) || ++ block == le32_to_cpu(gdp->bg_inode_bitmap) || ++ in_range(block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error(sb, "ext3_new_block", ++ "Allocating block in system zone - " ++ "block = %u", block); ++#ifdef AGGRESSIVE_CHECK ++ for (i = 0; i < ac.ac_b_ex.fe_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); ++#endif ++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); ++ ++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) ++ - ac.ac_b_ex.fe_len); ++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); ++ ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ if (err) ++ goto out_err; ++ err = ext3_journal_dirty_metadata(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ sb->s_dirt = 1; ++ *errp = 0; ++ 
brelse(bitmap_bh); ++ ++ /* drop non-allocated, but dquote'd blocks */ ++ J_ASSERT(*len >= ac.ac_b_ex.fe_len); ++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); ++ ++ *len = ac.ac_b_ex.fe_len; ++ J_ASSERT(*len > 0); ++ J_ASSERT(block != 0); ++ goto out; ++ ++out_err: ++ /* if we've already allocated something, roll it back */ ++ if (ac.ac_status == AC_STATUS_FOUND) { ++ /* FIXME: free blocks here */ ++ } ++ ++ DQUOT_FREE_BLOCK(inode, *len); ++ brelse(bitmap_bh); ++ *errp = err; ++ block = 0; ++out: ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* block wasn't reserved before and we reserved it ++ * at the beginning of allocation. it doesn't matter ++ * whether we allocated anything or we failed: time ++ * to release reservation. NOTE: because I expect ++ * any multiblock request from delayed allocation ++ * path only, here is single block always */ ++ ext3_mb_release_blocks(sb, 1); ++ } ++ ++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { ++ atomic_inc(&sbi->s_bal_reqs); ++ atomic_add(*len, &sbi->s_bal_allocated); ++ if (*len >= ac.ac_g_ex.fe_len) ++ atomic_inc(&sbi->s_bal_success); ++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); ++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && ++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) ++ atomic_inc(&sbi->s_bal_goals); ++ if (ac.ac_found > ext3_mb_max_to_scan) ++ atomic_inc(&sbi->s_bal_breaks); ++ } ++ ++ ext3_mb_store_history(sb, &ac); ++ ++ return block; ++} ++EXPORT_SYMBOL(ext3_mb_new_blocks); ++ ++#ifdef EXT3_MB_HISTORY ++struct ext3_mb_proc_session { ++ struct ext3_mb_history *history; ++ struct super_block *sb; ++ int start; ++ int max; ++}; ++ ++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, ++ struct ext3_mb_history *hs, ++ int first) ++{ ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (!first && hs == s->history + s->start) ++ return NULL; ++ while (hs->goal.fe_len == 0) { ++ hs++; ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (hs == s->history + 
s->start) ++ return NULL; ++ } ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs; ++ int l = *pos; ++ ++ if (l == 0) ++ return SEQ_START_TOKEN; ++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ if (!hs) ++ return NULL; ++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs = v; ++ ++ ++*pos; ++ if (v == SEQ_START_TOKEN) ++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ else ++ return ext3_mb_history_skip_empty(s, ++hs, 0); ++} ++ ++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) ++{ ++ struct ext3_mb_history *hs = v; ++ char buf[20], buf2[20]; ++ ++ if (v == SEQ_START_TOKEN) { ++ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "goal", "result", "found", "grps", "cr", "merge", ++ "tail", "broken"); ++ return 0; ++ } ++ ++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, ++ hs->goal.fe_start, hs->goal.fe_len); ++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, ++ hs->result.fe_start, hs->result.fe_len); ++ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, ++ buf2, hs->found, hs->groups, hs->cr, ++ hs->merged ? "M" : "", hs->tail, ++ hs->buddy ? 
1 << hs->buddy : 0); ++ return 0; ++} ++ ++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_history_ops = { ++ .start = ext3_mb_seq_history_start, ++ .next = ext3_mb_seq_history_next, ++ .stop = ext3_mb_seq_history_stop, ++ .show = ext3_mb_seq_history_show, ++}; ++ ++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_proc_session *s; ++ int rc, size; ++ ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (s == NULL) ++ return -EIO; ++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; ++ s->history = kmalloc(size, GFP_KERNEL); ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(s->history, sbi->s_mb_history, size); ++ s->max = sbi->s_mb_history_max; ++ s->start = sbi->s_mb_history_cur % s->max; ++ spin_unlock(&sbi->s_mb_history_lock); ++ ++ rc = seq_open(file, &ext3_mb_seq_history_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = s; ++ } else { ++ kfree(s->history); ++ kfree(s); ++ } ++ return rc; ++ ++} ++ ++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) ++{ ++ struct seq_file *seq = (struct seq_file *)file->private_data; ++ struct ext3_mb_proc_session *s = seq->private; ++ kfree(s->history); ++ kfree(s); ++ return seq_release(inode, file); ++} ++ ++static struct file_operations ext3_mb_seq_history_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, ++}; ++ ++static void ext3_mb_history_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_history", sbi->s_mb_proc); ++ remove_proc_entry(name, proc_root_ext3); ++ ++ if 
(sbi->s_mb_history) ++ kfree(sbi->s_mb_history); ++} ++ ++static void ext3_mb_history_init(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ int i; ++ ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); ++ if (sbi->s_mb_proc != NULL) { ++ struct proc_dir_entry *p; ++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_history_fops; ++ p->data = sb; ++ } ++ } ++ ++ sbi->s_mb_history_max = 1000; ++ sbi->s_mb_history_cur = 0; ++ spin_lock_init(&sbi->s_mb_history_lock); ++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); ++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); ++ memset(sbi->s_mb_history, 0, i); ++ /* if we can't allocate history, then we simple won't use it */ ++} ++ ++static void ++ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_history h; ++ ++ if (likely(sbi->s_mb_history == NULL)) ++ return; ++ ++ h.goal = ac->ac_g_ex; ++ h.result = ac->ac_b_ex; ++ h.found = ac->ac_found; ++ h.cr = ac->ac_criteria; ++ h.groups = ac->ac_groups_scanned; ++ h.tail = ac->ac_tail; ++ h.buddy = ac->ac_buddy; ++ h.merged = 0; ++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) ++ h.merged = 1; ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); ++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) ++ sbi->s_mb_history_cur = 0; ++ spin_unlock(&sbi->s_mb_history_lock); ++} ++ ++#else ++#define ext3_mb_history_release(sb) ++#define ext3_mb_history_init(sb) ++#endif ++ ++int ext3_mb_init_backend(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i, len; ++ ++ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; ++ sbi->s_group_info = kmalloc(len, 
GFP_KERNEL); ++ if (sbi->s_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n"); ++ return -ENOMEM; ++ } ++ memset(sbi->s_group_info, 0, len); ++ ++ sbi->s_buddy_cache = new_inode(sb); ++ if (sbi->s_buddy_cache == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); ++ kfree(sbi->s_group_info); ++ return -ENOMEM; ++ } ++ ++ /* ++ * calculate needed size. if change bb_counters size, ++ * don't forget about ext3_mb_generate_buddy() ++ */ ++ len = sizeof(struct ext3_group_info); ++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ struct ext3_group_desc * desc; ++ ++ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_group_info[i] == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ goto err_out; ++ } ++ desc = ext3_get_group_desc(sb, i, NULL); ++ if (desc == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant read descriptor %u\n", i); ++ goto err_out; ++ } ++ memset(sbi->s_group_info[i], 0, len); ++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, ++ &sbi->s_group_info[i]->bb_state); ++ sbi->s_group_info[i]->bb_free = ++ le16_to_cpu(desc->bg_free_blocks_count); ++ } ++ ++ return 0; ++ ++err_out: ++ while (i >= 0) /* slot i too: it is allocated when the descriptor read failed */ ++ kfree(sbi->s_group_info[i--]); ++ iput(sbi->s_buddy_cache); ++ ++ return -ENOMEM; ++} ++ ++int ext3_mb_init(struct super_block *sb, int needs_recovery) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *root = sb->s_root->d_inode; ++ unsigned i, offset, max; ++ struct dentry *dentry; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); ++ ++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_offsets == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ return -ENOMEM; ++ } ++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_maxs == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); /* s_mb_maxs is NULL here; release the array that was allocated */ ++ return -ENOMEM; ++ } ++ ++ /* order 0 is
regular bitmap */ ++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; ++ sbi->s_mb_offsets[0] = 0; ++ ++ i = 1; ++ offset = 0; ++ max = sb->s_blocksize << 2; ++ do { ++ sbi->s_mb_offsets[i] = offset; ++ sbi->s_mb_maxs[i] = max; ++ offset += 1 << (sb->s_blocksize_bits - i); ++ max = max >> 1; ++ i++; ++ } while (i <= sb->s_blocksize_bits + 1); ++ ++ ++ /* init file for buddy data */ ++ if ((i = ext3_mb_init_backend(sb))) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return i; ++ } ++ ++ spin_lock_init(&sbi->s_reserve_lock); ++ spin_lock_init(&sbi->s_md_lock); ++ INIT_LIST_HEAD(&sbi->s_active_transaction); ++ INIT_LIST_HEAD(&sbi->s_closed_transaction); ++ INIT_LIST_HEAD(&sbi->s_committed_transaction); ++ spin_lock_init(&sbi->s_bal_lock); ++ ++ /* remove old on-disk buddy file */ ++ down(&root->i_sem); ++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy")); ++ if (dentry->d_inode != NULL) { ++ i = vfs_unlink(root, dentry); ++ if (i != 0) ++ printk("EXT3-fs: can't remove .buddy file: %d\n", i); ++ } ++ dput(dentry); ++ up(&root->i_sem); ++ ++ ext3_mb_history_init(sb); ++ ++ printk("EXT3-fs: mballoc enabled\n"); ++ return 0; ++} ++ ++int ext3_mb_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* release freed, non-committed blocks */ ++ spin_lock(&sbi->s_md_lock); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_committed_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ ext3_mb_free_committed_blocks(sb); ++ ++ if (sbi->s_group_info) { ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_group_info[i] == NULL) ++ continue; ++ kfree(sbi->s_group_info[i]); ++ } ++ kfree(sbi->s_group_info); ++ } ++ if (sbi->s_mb_offsets) ++ kfree(sbi->s_mb_offsets); ++ if (sbi->s_mb_maxs) ++ kfree(sbi->s_mb_maxs); ++ if 
(sbi->s_buddy_cache)
++		iput(sbi->s_buddy_cache);
++	if (sbi->s_blocks_reserved)
++		printk("ext3-fs: %ld blocks being reserved at umount!\n",
++			sbi->s_blocks_reserved);
++	if (ext3_mb_stats) {
++		printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n",
++			atomic_read(&sbi->s_bal_allocated),
++			atomic_read(&sbi->s_bal_reqs),
++			atomic_read(&sbi->s_bal_success));
++		printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, "
++			"%u 2^N hits, %u breaks\n",
++			atomic_read(&sbi->s_bal_ex_scanned),
++			atomic_read(&sbi->s_bal_goals),
++			atomic_read(&sbi->s_bal_2orders),
++			atomic_read(&sbi->s_bal_breaks));
++		/* report the counter as-is; printing must not modify it */
++		printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n",
++			sbi->s_mb_buddies_generated,
++			sbi->s_mb_generation_time);
++	}
++
++	ext3_mb_history_release(sb);
++
++	return 0;
++}
++
++/*
++ * ext3_mb_free_committed_blocks - return committed metadata blocks to
++ * the buddy data.
++ *
++ * Takes ext3_free_metadata containers off s_committed_transaction one at
++ * a time (under s_md_lock), loads the buddy of the container's group and
++ * frees every recorded block with mb_free_blocks() under the group lock.
++ * Failure to load the buddy is treated as fatal (BUG_ON).
++ */
++void ext3_mb_free_committed_blocks(struct super_block *sb)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	int err, i, count = 0, count2 = 0;
++	struct ext3_free_metadata *md;
++	struct ext3_buddy e3b;
++
++	if (list_empty(&sbi->s_committed_transaction))
++		return;
++
++	/* there are committed blocks still to be freed */
++	do {
++		/* get next array of blocks */
++		md = NULL;
++		spin_lock(&sbi->s_md_lock);
++		if (!list_empty(&sbi->s_committed_transaction)) {
++			md = list_entry(sbi->s_committed_transaction.next,
++					struct ext3_free_metadata, list);
++			list_del(&md->list);
++		}
++		spin_unlock(&sbi->s_md_lock);
++
++		if (md == NULL)
++			break;
++
++		mb_debug("gonna free %u blocks in group %u (0x%p):",
++			md->num, md->group, md);
++
++		err = ext3_mb_load_buddy(sb, md->group, &e3b);
++		BUG_ON(err != 0);
++
++		/* there are blocks to put in buddy to make them really free */
++		count += md->num;
++		count2++;
++		ext3_lock_group(sb, md->group);
++		for (i = 0; i < md->num; i++) {
++			mb_debug(" %u", md->blocks[i]);
++			mb_free_blocks(&e3b, md->blocks[i], 1);
++		}
++		mb_debug("\n");
++		ext3_unlock_group(sb, md->group);
++
++		/* balance refcounts from ext3_mb_free_metadata() */
++
page_cache_release(e3b.bd_buddy_page);
++		page_cache_release(e3b.bd_bitmap_page);
++
++		kfree(md);
++		ext3_mb_release_desc(&e3b);
++
++	/* md is only compared here, never dereferenced after kfree() */
++	} while (md);
++	mb_debug("freed %u blocks in %u structures\n", count, count2);
++}
++
++/*
++ * ext3_mb_poll_new_transaction - notice a change of running transaction.
++ *
++ * When @handle belongs to a transaction other than s_last_transaction,
++ * the closed list is moved to the committed list, the active list
++ * becomes the closed list, and s_last_transaction is updated; all under
++ * s_md_lock.  ext3_mb_free_committed_blocks() is then called.
++ */
++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++	/* unlocked fast path; re-checked under s_md_lock below */
++	if (sbi->s_last_transaction == handle->h_transaction->t_tid)
++		return;
++
++	/* new transaction! time to close the last one and free blocks for
++	 * the committed transaction. we know that only one transaction can
++	 * be active, so the previous transaction may still be being logged
++	 * and the transaction before the previous one is known to be
++	 * already logged. this means that now we may free blocks freed in
++	 * all transactions before the previous one. */
++
++	spin_lock(&sbi->s_md_lock);
++	if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
++		mb_debug("new transaction %lu, old %lu\n",
++				(unsigned long) handle->h_transaction->t_tid,
++				(unsigned long) sbi->s_last_transaction);
++		list_splice_init(&sbi->s_closed_transaction,
++					&sbi->s_committed_transaction);
++		list_splice_init(&sbi->s_active_transaction,
++					&sbi->s_closed_transaction);
++		sbi->s_last_transaction = handle->h_transaction->t_tid;
++	}
++	spin_unlock(&sbi->s_md_lock);
++
++	ext3_mb_free_committed_blocks(sb);
++}
++
++/*
++ * ext3_mb_free_metadata - defer freeing of @count blocks starting at
++ * @block in @group.
++ *
++ * Each block number is appended to the group's current
++ * ext3_free_metadata container (bb_md_cur).  A new container is
++ * allocated and linked on s_active_transaction when there is none, when
++ * the current one belongs to another transaction, or after the previous
++ * one filled up (EXT3_BB_MAX_BLOCKS entries).
++ *
++ * Returns 0, or -ENOMEM if a container cannot be allocated; in that
++ * case blocks already recorded by this call stay recorded.
++ */
++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b,
++				int group, int block, int count)
++{
++	struct ext3_group_info *db = e3b->bd_info;
++	struct super_block *sb = e3b->bd_sb;
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	struct ext3_free_metadata *md;
++	int i;
++
++	J_ASSERT(e3b->bd_bitmap_page != NULL);
++	J_ASSERT(e3b->bd_buddy_page != NULL);
++
++	ext3_lock_group(sb, group);
++	for (i = 0; i < count; i++) {
++		md = db->bb_md_cur;
++		/* container left over from a different transaction: detach it */
++		if (md && db->bb_tid != handle->h_transaction->t_tid) {
++			db->bb_md_cur = NULL;
++			md = NULL;
++		}
++
++		if (md == NULL) {
++			/* drop the group lock around the GFP_KERNEL allocation */
++			ext3_unlock_group(sb, group);
++			md = kmalloc(sizeof(*md), GFP_KERNEL);
++			if (md == NULL)
++				return -ENOMEM;
++			md->num = 0;
++			md->group = group;
++
++			ext3_lock_group(sb, group);
++			/* re-check: a container may have been installed meanwhile */
++			if (db->bb_md_cur == NULL) {
++				spin_lock(&sbi->s_md_lock);
++				list_add(&md->list, &sbi->s_active_transaction);
++				spin_unlock(&sbi->s_md_lock);
++				/* protect buddy cache from being freed,
++				 * otherwise we'll refresh it from
++				 * on-disk bitmap and lose not-yet-available
++				 * blocks */
++				page_cache_get(e3b->bd_buddy_page);
++				page_cache_get(e3b->bd_bitmap_page);
++				db->bb_md_cur = md;
++				db->bb_tid = handle->h_transaction->t_tid;
++				mb_debug("new md 0x%p for group %u\n",
++					md, md->group);
++			} else {
++				kfree(md);
++				md = db->bb_md_cur;
++			}
++		}
++
++		BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS);
++		md->blocks[md->num] = block + i;
++		md->num++;
++		if (md->num == EXT3_BB_MAX_BLOCKS) {
++			/* no more space; the full container stays on the sb's
++			 * list and the next block starts a new one */
++			db->bb_md_cur = NULL;
++		}
++	}
++	ext3_unlock_group(sb, group);
++	return 0;
++}
++
++/*
++ * ext3_mb_free_blocks - free @count blocks starting at @block.
++ *
++ * Validates the range against the superblock limits, then handles one
++ * block group per pass (looping via do_more when the range crosses a
++ * group boundary).  When @metadata is non-zero the blocks are handed to
++ * ext3_mb_free_metadata() instead of being freed in the buddy directly.
++ * The number of blocks freed is reported through @freed; errors are
++ * passed to ext3_std_error().
++ */
++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode,
++			unsigned long block, unsigned long count,
++			int metadata, int *freed)
++{
++	struct buffer_head *bitmap_bh = NULL;
++	struct ext3_group_desc *gdp;
++	struct ext3_super_block *es;
++	unsigned long bit, overflow;
++	struct buffer_head *gd_bh;
++	unsigned long block_group;
++	struct ext3_sb_info *sbi;
++	struct super_block *sb;
++	struct ext3_buddy e3b;
++	int err = 0, ret;
++
++	*freed = 0;
++	sb = inode->i_sb;
++	if (!sb) {
++		printk ("ext3_free_blocks: nonexistent device");
++		return;
++	}
++
++	ext3_mb_poll_new_transaction(sb, handle);
++
++	sbi = EXT3_SB(sb);
++	es = EXT3_SB(sb)->s_es;
++	/* reject ranges below the first data block, wrapping, or past the end */
++	if (block < le32_to_cpu(es->s_first_data_block) ||
++	    block + count < block ||
++	    block + count > le32_to_cpu(es->s_blocks_count)) {
++		ext3_error (sb, "ext3_free_blocks",
++			    "Freeing blocks not in datazone - "
++			    "block = %lu, count = %lu", block, count);
++		goto error_return;
++	}
++
++	ext3_debug("freeing block %lu\n", block);
++
++do_more:
++	/* one pass per block group; "overflow" is what spills into the next */
++	overflow = 0;
++	block_group = (block - le32_to_cpu(es->s_first_data_block)) /
++		      EXT3_BLOCKS_PER_GROUP(sb);
++	bit = (block - le32_to_cpu(es->s_first_data_block)) %
++		      EXT3_BLOCKS_PER_GROUP(sb);
++	/*
++	 * Check to see if we are freeing blocks across a group
++	 * boundary.
++	 */
++	if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
++		overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
++		count -= overflow;
++	}
++	/* drops the previous group's bitmap on later passes (NULL on the first) */
++	brelse(bitmap_bh);
++	bitmap_bh = read_block_bitmap(sb, block_group);
++	if (!bitmap_bh)
++		goto error_return;
++	gdp = ext3_get_group_desc (sb, block_group, &gd_bh);
++	if (!gdp)
++		goto error_return;
++
++	/* reported via ext3_error() only; the free still proceeds */
++	if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
++	    in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
++	    in_range (block, le32_to_cpu(gdp->bg_inode_table),
++		      EXT3_SB(sb)->s_itb_per_group) ||
++	    in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
++		      EXT3_SB(sb)->s_itb_per_group))
++		ext3_error (sb, "ext3_free_blocks",
++			    "Freeing blocks in system zones - "
++			    "Block = %lu, count = %lu",
++			    block, count);
++
++	BUFFER_TRACE(bitmap_bh, "getting write access");
++	err = ext3_journal_get_write_access(handle, bitmap_bh);
++	if (err)
++		goto error_return;
++
++	/*
++	 * We are about to modify some metadata.  Call the journal APIs
++	 * to unshare ->b_data if a currently-committing transaction is
++	 * using it
++	 */
++	BUFFER_TRACE(gd_bh, "get_write_access");
++	err = ext3_journal_get_write_access(handle, gd_bh);
++	if (err)
++		goto error_return;
++
++	err = ext3_mb_load_buddy(sb, block_group, &e3b);
++	if (err)
++		goto error_return;
++
++#ifdef AGGRESSIVE_CHECK
++	{
++		int i;
++		for (i = 0; i < count; i++)
++			J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data));
++	}
++#endif
++	mb_clear_bits(bitmap_bh->b_data, bit, count);
++
++	/* We dirtied the bitmap block */
++	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
++	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++
++	if (metadata) {
++		/* blocks being freed are metadata. these blocks shouldn't
++		 * be used until this transaction is committed */
++		/* NOTE(review): the -ENOMEM return is ignored here, so the
++		 * blocks not recorded are never freed in the buddy - confirm
++		 * whether this should be propagated */
++		ext3_mb_free_metadata(handle, &e3b, block_group, bit, count);
++	} else {
++		ext3_lock_group(sb, block_group);
++		mb_free_blocks(&e3b, bit, count);
++		ext3_unlock_group(sb, block_group);
++	}
++
++	spin_lock(sb_bgl_lock(sbi, block_group));
++	gdp->bg_free_blocks_count =
++		cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
++	spin_unlock(sb_bgl_lock(sbi, block_group));
++	percpu_counter_mod(&sbi->s_freeblocks_counter, count);
++
++	ext3_mb_release_desc(&e3b);
++
++	/* accumulate: a range crossing a group boundary takes several passes,
++	 * and the caller releases quota for the total */
++	*freed += count;
++
++	/* And the group descriptor block */
++	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
++	ret = ext3_journal_dirty_metadata(handle, gd_bh);
++	if (!err) err = ret;
++
++	if (overflow && !err) {
++		block += count;
++		count = overflow;
++		goto do_more;
++	}
++	sb->s_dirt = 1;
++error_return:
++	brelse(bitmap_bh);
++	ext3_std_error(sb, err);
++	return;
++}
++
++/*
++ * ext3_mb_reserve_blocks - reserve @blocks blocks against the free
++ * block counter.  Returns 0 when the reservation fits into the free
++ * count minus what is already reserved, -ENOSPC otherwise.
++ */
++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	int free, ret = -ENOSPC;
++
++	BUG_ON(blocks < 0);
++	spin_lock(&sbi->s_reserve_lock);
++	free = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
++	if (blocks <= free - sbi->s_blocks_reserved) {
++		sbi->s_blocks_reserved += blocks;
++		ret = 0;
++	}
++	spin_unlock(&sbi->s_reserve_lock);
++	return ret;
++}
++
++/*
++ * ext3_mb_release_blocks - give back @blocks previously reserved blocks.
++ * Warns and clamps to zero if the reservation count would go negative.
++ */
++void ext3_mb_release_blocks(struct super_block *sb, int blocks)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++	BUG_ON(blocks < 0);
++	spin_lock(&sbi->s_reserve_lock);
++	sbi->s_blocks_reserved -= blocks;
++	WARN_ON(sbi->s_blocks_reserved < 0);
++	if (sbi->s_blocks_reserved < 0)
++		sbi->s_blocks_reserved = 0;
++	spin_unlock(&sbi->s_reserve_lock);
++}
++
++/*
++ * ext3_new_block - allocate a single block near @goal.
++ * Uses ext3_new_block_old() unless the MBALLOC mount option is set, in
++ * which case ext3_mb_new_blocks() is asked for one block.  The error
++ * code is returned through @errp.
++ */
++int ext3_new_block(handle_t *handle, struct inode *inode,
++			unsigned long goal, int *errp)
++{
++	int ret, len;
++
++	if (!test_opt(inode->i_sb, MBALLOC)) {
++		ret = ext3_new_block_old(handle, inode, goal, errp);
++		goto out;
++	}
++	len = 1;
++	ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp);
++out:
++	return ret;
++}
++
++
++/*
++ * ext3_free_blocks - free @count blocks starting at @block and release
++ * the quota for however many were actually freed.  @metadata is passed
++ * on to ext3_mb_free_blocks() and is unused on the non-MBALLOC path.
++ */
++void ext3_free_blocks(handle_t *handle, struct inode * inode,
++			unsigned long block, unsigned long count, int metadata)
++{
++	struct super_block *sb;
++	int freed;
++
++	sb = inode->i_sb;
++	if (!test_opt(sb, MBALLOC))
++		ext3_free_blocks_sb(handle, sb, block, count, &freed);
++	else
++		ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
++	if (freed)
++		DQUOT_FREE_BLOCK(inode, freed);
++	return;
++}
++
++/* names of the proc directory and of the tunables created under it */
++#define EXT3_ROOT "ext3"
++#define EXT3_MB_STATS_NAME "mb_stats"
++#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan"
++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan"
++
++/* read_proc handler: prints ext3_mb_stats; only offset 0 yields data */
++static int ext3_mb_stats_read(char *page, char **start, off_t off,
++		int count, int *eof, void *data)
++{
++	int len;
++
++	*eof = 1;
++	if (off != 0)
++		return 0;
++
++	len = sprintf(page, "%ld\n", ext3_mb_stats);
++	*start = page;
++	return len;
++}
++
++/*
++ * write_proc handler: sets ext3_mb_stats to 0 or 1 from the number
++ * written.  Returns @count, -EOVERFLOW when the input does not fit in
++ * the local buffer, or -EFAULT when the user copy fails.
++ */
++static int ext3_mb_stats_write(struct file *file, const char *buffer,
++		unsigned long count, void *data)
++{
++	char str[32];
++
++	if (count >= sizeof(str)) {
++		printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
++		       EXT3_MB_STATS_NAME, (int)sizeof(str));
++		return -EOVERFLOW;
++	}
++
++	if (copy_from_user(str, buffer, count))
++		return -EFAULT;
++	/* count < sizeof(str) was checked above; terminate before parsing */
++	str[count] = '\0';
++
++	/* Only set to 0 or 1 respectively; zero->0; non-zero->1 */
++	ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0);
++	return count;
++}
++
++/* read_proc handler: prints ext3_mb_max_to_scan; only offset 0 yields data */
++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off,
++		int count, int *eof, void *data)
++{
++	int len;
++
++	*eof = 1;
++	if (off != 0)
++		return 0;
++
++	len = sprintf(page, "%ld\n", ext3_mb_max_to_scan);
++	*start = page;
++	return len;
++}
++
++/*
++ * write_proc handler: sets ext3_mb_max_to_scan.  Returns @count,
++ * -EOVERFLOW when the input does not fit in the local buffer, -EFAULT
++ * when the user copy fails, or -ERANGE when the value is not positive.
++ */
++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer,
++		unsigned long count, void *data)
++{
++	char str[32];
++	long value;
++
++	if (count >= sizeof(str)) {
++		printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
++		       EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str));
++		return -EOVERFLOW;
++	}
++
++	if (copy_from_user(str, buffer, count))
++		return -EFAULT;
++	/* count < sizeof(str) was checked above; terminate before parsing */
++	str[count] = '\0';
++
++	/* only strictly positive values are accepted */
++	value = simple_strtol(str, NULL, 0);
++	if (value <= 0)
++		return -ERANGE;
++
++	ext3_mb_max_to_scan = value;
++
++	return count;
++}
++
++/* read_proc handler: prints ext3_mb_min_to_scan; only offset 0 yields data */
++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off,
++		int count, int *eof, void *data)
++{
++	int len;
++
++	*eof = 1;
++	if (off != 0)
++		return 0;
++
++	len = sprintf(page, "%ld\n", ext3_mb_min_to_scan);
++	*start = page;
++	return len;
++}
++
++/*
++ * write_proc handler: sets ext3_mb_min_to_scan.  Returns @count,
++ * -EOVERFLOW when the input does not fit in the local buffer, -EFAULT
++ * when the user copy fails, or -ERANGE when the value is not positive.
++ */
++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer,
++		unsigned long count, void *data)
++{
++	char str[32];
++	long value;
++
++	if (count >= sizeof(str)) {
++		printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
++		       EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str));
++		return -EOVERFLOW;
++	}
++
++	if (copy_from_user(str, buffer, count))
++		return -EFAULT;
++	/* count < sizeof(str) was checked above; terminate before parsing */
++	str[count] = '\0';
++
++	/* only strictly positive values are accepted */
++	value = simple_strtol(str, NULL, 0);
++	if (value <= 0)
++		return -ERANGE;
++
++	ext3_mb_min_to_scan = value;
++
++	return count;
++}
++
++/*
++ * init_ext3_proc - create the EXT3_ROOT proc directory and the three
++ * mballoc tunables under it.  Returns 0, or -EIO after removing
++ * whatever had been created so far.
++ */
++int __init init_ext3_proc(void)
++{
++	struct proc_dir_entry
*proc_ext3_mb_stats;
++	struct proc_dir_entry *proc_ext3_mb_max_to_scan;
++	struct proc_dir_entry *proc_ext3_mb_min_to_scan;
++
++	/* directory that holds the tunables, created under proc_root_fs */
++	proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs);
++	if (proc_root_ext3 == NULL) {
++		printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT);
++		return -EIO;
++	}
++
++	/* Initialize EXT3_MB_STATS_NAME */
++	proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME,
++			S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++	if (proc_ext3_mb_stats == NULL) {
++		printk(KERN_ERR "EXT3: Unable to create %s\n",
++			EXT3_MB_STATS_NAME);
++		remove_proc_entry(EXT3_ROOT, proc_root_fs);
++		return -EIO;
++	}
++
++	proc_ext3_mb_stats->data = NULL;
++	proc_ext3_mb_stats->read_proc = ext3_mb_stats_read;
++	proc_ext3_mb_stats->write_proc = ext3_mb_stats_write;
++
++	/* Initialize EXT3_MB_MAX_TO_SCAN_NAME */
++	proc_ext3_mb_max_to_scan = create_proc_entry(
++			EXT3_MB_MAX_TO_SCAN_NAME,
++			S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++	if (proc_ext3_mb_max_to_scan == NULL) {
++		printk(KERN_ERR "EXT3: Unable to create %s\n",
++			EXT3_MB_MAX_TO_SCAN_NAME);
++		/* undo in reverse order of creation */
++		remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++		remove_proc_entry(EXT3_ROOT, proc_root_fs);
++		return -EIO;
++	}
++
++	proc_ext3_mb_max_to_scan->data = NULL;
++	proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read;
++	proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write;
++
++	/* Initialize EXT3_MB_MIN_TO_SCAN_NAME */
++	proc_ext3_mb_min_to_scan = create_proc_entry(
++			EXT3_MB_MIN_TO_SCAN_NAME,
++			S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++	if (proc_ext3_mb_min_to_scan == NULL) {
++		printk(KERN_ERR "EXT3: Unable to create %s\n",
++			EXT3_MB_MIN_TO_SCAN_NAME);
++		/* undo in reverse order of creation */
++		remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++		remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++		remove_proc_entry(EXT3_ROOT, proc_root_fs);
++		return -EIO;
++	}
++
++	proc_ext3_mb_min_to_scan->data = NULL;
++	proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read;
++	proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write;
++
++	return 0;
++}
++
++/*
++ * exit_ext3_proc - remove the three mballoc tunables and then the
++ * EXT3_ROOT proc directory.
++ */
++void exit_ext3_proc(void)
++{
++	remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++	remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++	remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3);
++	remove_proc_entry(EXT3_ROOT, proc_root_fs);
++}
+Index: linux-2.6.12.6/fs/ext3/Makefile
+===================================================================
+--- linux-2.6.12.6.orig/fs/ext3/Makefile 2005-12-17 02:17:16.000000000 +0300
++++ linux-2.6.12.6/fs/ext3/Makefile 2005-12-17 02:21:21.000000000 +0300
+@@ -6,7 +6,7 @@
+ 
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+ ioctl.o namei.o super.o symlink.o hash.o resize.o \
+- extents.o
++ extents.o mballoc.o
+ 
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch
index b3d9f73..d12c678 100644
--- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch
+++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch
@@ -1,71 +1,8 @@
-Index: linux-2.6.9/include/linux/ext3_fs_sb.h
+Index: linux-2.6.9-full/include/linux/ext3_fs.h
 ===================================================================
---- linux-2.6.9.orig/include/linux/ext3_fs_sb.h 2005-10-14 09:10:05.000000000 +0400
-+++ linux-2.6.9/include/linux/ext3_fs_sb.h 2005-10-14 09:10:13.000000000 +0400
-@@ -23,10 +23,30 @@
- #define EXT_INCLUDE
- #include
- #include
-+#include
- #endif
- #endif
- #include
- 
-+#define EXT3_BB_MAX_BLOCKS 30
-+struct ext3_free_metadata {
-+	unsigned short group;
-+	unsigned short num;
-+	unsigned short blocks[EXT3_BB_MAX_BLOCKS];
-+	struct list_head list;
-+};
-+
-+struct ext3_buddy_group_blocks {
-+	__u32 bb_bitmap;
-+	__u32 bb_buddy;
-+	spinlock_t bb_lock;
-+	unsigned long bb_tid;
-+
struct ext3_free_metadata *bb_md_cur; -+ unsigned short bb_first_free; -+ unsigned short bb_free; -+ unsigned bb_counters[]; -+}; -+ - /* - * third extended-fs super-block data in memory - */ -@@ -81,6 +101,27 @@ - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ - #endif -+ -+ /* for buddy allocator */ -+ struct ext3_buddy_group_blocks **s_buddy_blocks; -+ struct inode *s_buddy; -+ long s_blocks_reserved; -+ spinlock_t s_reserve_lock; -+ struct list_head s_active_transaction; -+ struct list_head s_closed_transaction; -+ struct list_head s_committed_transaction; -+ spinlock_t s_md_lock; -+ tid_t s_last_transaction; -+ int s_mb_factor; -+ -+ /* stats for buddy allocator */ -+ spinlock_t s_bal_lock; -+ unsigned long s_bal_reqs; /* number of reqs with len > 1 */ -+ unsigned long s_bal_success; /* we found long enough chunks */ -+ unsigned long s_bal_allocated; /* in blocks */ -+ unsigned long s_bal_ex_scanned; /* total extents scanned */ -+ unsigned long s_bal_goals; /* goal hits */ -+ unsigned long s_bal_breaks; /* too long searches */ - }; - - #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.6.9/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.9.orig/include/linux/ext3_fs.h 2005-10-14 09:10:12.000000000 +0400 -+++ linux-2.6.9/include/linux/ext3_fs.h 2005-10-14 09:10:31.000000000 +0400 -@@ -57,6 +57,14 @@ +--- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2005-12-16 23:16:41.000000000 +0300 ++++ linux-2.6.9-full/include/linux/ext3_fs.h 2005-12-16 23:16:42.000000000 +0300 +@@ -57,6 +57,14 @@ struct statfs; #define ext3_debug(f, a...) 
do {} while (0) #endif @@ -80,15 +17,15 @@ Index: linux-2.6.9/include/linux/ext3_fs.h /* * Special inodes numbers */ -@@ -365,6 +373,7 @@ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */ +@@ -365,6 +373,7 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef clear_opt -@@ -726,7 +735,7 @@ +@@ -726,7 +735,7 @@ extern int ext3_bg_has_super(struct supe extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, @@ -97,12 +34,11 @@ Index: linux-2.6.9/include/linux/ext3_fs.h extern void ext3_free_blocks_sb (handle_t *, struct super_block *, unsigned long, unsigned long, int *); extern unsigned long ext3_count_free_blocks (struct super_block *); -@@ -857,6 +866,44 @@ +@@ -857,6 +866,17 @@ extern void ext3_extents_initialize_bloc extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); +/* mballoc.c */ -+extern long ext3_mb_aggressive; +extern long ext3_mb_stats; +extern long ext3_mb_max_to_scan; +extern int ext3_mb_init(struct super_block *, int); @@ -110,90 +46,146 @@ Index: linux-2.6.9/include/linux/ext3_fs.h +extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); +extern int ext3_mb_reserve_blocks(struct super_block *, int); +extern void ext3_mb_release_blocks(struct super_block *, int); -+ -+/* writeback.c */ 
-+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); -+extern int ext3_wb_prepare_write(struct file *file, struct page *page, -+ unsigned from, unsigned to); -+extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); -+extern int ext3_wb_writepage(struct page *, struct writeback_control *); -+extern int ext3_wb_invalidatepage(struct page *, unsigned long); -+extern int ext3_wb_releasepage(struct page *, int); -+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); -+extern void ext3_wb_init(struct super_block *); -+extern void ext3_wb_release(struct super_block *); -+ -+/* writeback.c */ -+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); -+extern int ext3_wb_prepare_write(struct file *file, struct page *page, -+ unsigned from, unsigned to); -+extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); -+extern int ext3_wb_writepage(struct page *, struct writeback_control *); -+extern int ext3_wb_invalidatepage(struct page *, unsigned long); -+extern int ext3_wb_releasepage(struct page *, int); -+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); -+extern void ext3_wb_init(struct super_block *); -+extern void ext3_wb_release(struct super_block *); -+ -+/* proc.c */ -+extern int init_ext3_proc(void); -+extern void exit_ext3_proc(void); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); + #endif /* __KERNEL__ */ /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ -Index: linux-2.6.9/fs/ext3/balloc.c +Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.9.orig/fs/ext3/balloc.c 2005-05-13 21:39:03.000000000 +0400 -+++ linux-2.6.9/fs/ext3/balloc.c 2005-10-14 09:10:13.000000000 +0400 -@@ -79,7 +79,7 @@ - * - * Return buffer_head on success or NULL in case of failure. - */ --static struct buffer_head * -+struct buffer_head * - read_block_bitmap(struct super_block *sb, unsigned int block_group) - { - struct ext3_group_desc * desc; -@@ -450,24 +450,6 @@ - return; - } +--- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2005-12-16 23:16:39.000000000 +0300 ++++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2005-12-16 23:16:42.000000000 +0300 +@@ -23,9 +23,15 @@ + #define EXT_INCLUDE + #include + #include ++#include + #endif + #endif + #include ++#include ++ ++struct ext3_buddy_group_blocks; ++struct ext3_mb_history; ++#define EXT3_BB_MAX_BLOCKS --/* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks(handle_t *handle, struct inode *inode, -- unsigned long block, unsigned long count) --{ -- struct super_block * sb; -- int dquot_freed_blocks; -- -- sb = inode->i_sb; -- if (!sb) { -- printk ("ext3_free_blocks: nonexistent device"); -- return; -- } -- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); -- if (dquot_freed_blocks) -- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); -- return; --} -- /* - * For ext3 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This -@@ -1140,7 +1122,7 @@ - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. 
- */ --int ext3_new_block(handle_t *handle, struct inode *inode, -+int ext3_new_block_old(handle_t *handle, struct inode *inode, - unsigned long goal, int *errp) + * third extended-fs super-block data in memory +@@ -81,6 +87,38 @@ struct ext3_sb_info { + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_group_info **s_group_info; ++ struct inode *s_buddy_cache; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ unsigned short *s_mb_offsets, *s_mb_maxs; ++ ++ /* history to debug policy */ ++ struct ext3_mb_history *s_mb_history; ++ int s_mb_history_cur; ++ int s_mb_history_max; ++ struct proc_dir_entry *s_mb_proc; ++ spinlock_t s_mb_history_lock; ++ ++ /* stats for buddy allocator */ ++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ++ atomic_t s_bal_success; /* we found long enough chunks */ ++ atomic_t s_bal_allocated; /* in blocks */ ++ atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_goals; /* goal hits */ ++ atomic_t s_bal_breaks; /* too long searches */ ++ atomic_t s_bal_2orders; /* 2^order hits */ ++ spinlock_t s_bal_lock; ++ unsigned long s_mb_buddies_generated; ++ unsigned long long s_mb_generation_time; + }; + + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-2.6.9-full/fs/ext3/super.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/super.c 2005-12-16 23:16:41.000000000 +0300 ++++ linux-2.6.9-full/fs/ext3/super.c 2005-12-16 23:16:42.000000000 +0300 +@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + 
ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -596,7 +597,7 @@ enum { + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, +- Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_extdebug, Opt_mballoc, + }; + + static match_table_t tokens = { +@@ -647,6 +649,7 @@ static match_table_t tokens = { + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -957,6 +960,9 @@ clear_qf_name: + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt (sbi->s_mount_opt, MBALLOC); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1646,6 +1652,7 @@ static int ext3_fill_super (struct super + ext3_count_dirs(sb)); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + + return 0; + +@@ -2428,7 +2435,13 @@ static struct file_system_type ext3_fs_t + + static int __init init_ext3_fs(void) { - struct buffer_head *bitmap_bh = NULL; -Index: linux-2.6.9/fs/ext3/extents.c +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2450,6 +2463,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); + } + + int ext3_prep_san_write(struct inode *inode, long *blocks, +Index: linux-2.6.9-full/fs/ext3/extents.c =================================================================== ---- linux-2.6.9.orig/fs/ext3/extents.c 2005-10-14 09:10:12.000000000 +0400 -+++ linux-2.6.9/fs/ext3/extents.c 2005-10-14 09:10:13.000000000 +0400 -@@ -771,7 +771,7 @@ +--- linux-2.6.9-full.orig/fs/ext3/extents.c 2005-12-16 23:16:41.000000000 +0300 ++++ 
linux-2.6.9-full/fs/ext3/extents.c 2005-12-16 23:16:42.000000000 +0300 +@@ -771,7 +771,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -202,7 +194,7 @@ Index: linux-2.6.9/fs/ext3/extents.c } } kfree(ablocks); -@@ -1428,7 +1428,7 @@ +@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); @@ -211,7 +203,7 @@ Index: linux-2.6.9/fs/ext3/extents.c return err; } -@@ -1913,10 +1913,12 @@ +@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; @@ -220,12 +212,12 @@ Index: linux-2.6.9/fs/ext3/extents.c if (IS_ERR(handle)) return PTR_ERR(handle); -+ if (S_ISDIR(tree->inode->i_mode)) ++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) + metadata = 1; if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail removal */ unsigned long num, start; -@@ -1928,7 +1930,7 @@ +@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t bh = sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } @@ -234,24 +226,98 @@ Index: linux-2.6.9/fs/ext3/extents.c } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.9/fs/ext3/namei.c +Index: linux-2.6.9-full/fs/ext3/inode.c =================================================================== ---- linux-2.6.9.orig/fs/ext3/namei.c 2005-10-14 09:10:04.000000000 +0400 -+++ linux-2.6.9/fs/ext3/namei.c 2005-10-14 09:10:13.000000000 +0400 -@@ -1639,7 +1639,7 @@ - * If the create succeeds, we fill in the inode information - * with d_instantiate(). 
+--- linux-2.6.9-full.orig/fs/ext3/inode.c 2005-12-16 23:16:41.000000000 +0300 ++++ linux-2.6.9-full/fs/ext3/inode.c 2005-12-16 23:16:42.000000000 +0300 +@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -673,7 +673,7 @@ err_out: + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, +- le32_to_cpu(where[i].key), 1); ++ le32_to_cpu(where[i].key), 1, 1); + return err; + } + +@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-2.6.9-full/fs/ext3/balloc.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/balloc.c 2005-10-27 21:44:24.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/balloc.c 2005-12-16 23:16:42.000000000 +0300 +@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ + * + * Return buffer_head on success or NULL in case of failure. 
*/ --static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, -+int ext3_create (struct inode * dir, struct dentry * dentry, int mode, - struct nameidata *nd) +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) { - handle_t *handle; -Index: linux-2.6.9/fs/ext3/xattr.c + struct ext3_group_desc * desc; +@@ -450,24 +450,6 @@ error_return: + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- unsigned long block, unsigned long count) +-{ +- struct super_block * sb; +- int dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1140,7 +1122,7 @@ int ext3_should_retry_alloc(struct super + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. 
+ */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-2.6.9-full/fs/ext3/xattr.c =================================================================== ---- linux-2.6.9.orig/fs/ext3/xattr.c 2005-10-14 09:10:08.000000000 +0400 -+++ linux-2.6.9/fs/ext3/xattr.c 2005-10-14 09:10:13.000000000 +0400 -@@ -1281,7 +1281,7 @@ +--- linux-2.6.9-full.orig/fs/ext3/xattr.c 2005-12-16 23:16:40.000000000 +0300 ++++ linux-2.6.9-full/fs/ext3/xattr.c 2005-12-16 23:16:42.000000000 +0300 +@@ -1281,7 +1281,7 @@ ext3_xattr_set_handle2(handle_t *handle, new_bh = sb_getblk(sb, block); if (!new_bh) { getblk_failed: @@ -260,7 +326,7 @@ Index: linux-2.6.9/fs/ext3/xattr.c error = -EIO; goto cleanup; } -@@ -1328,7 +1328,7 @@ +@@ -1328,7 +1328,7 @@ getblk_failed: if (ce) mb_cache_entry_free(ce); ea_bdebug(old_bh, "freeing"); @@ -269,7 +335,7 @@ Index: linux-2.6.9/fs/ext3/xattr.c /* ext3_forget() calls bforget() for us, but we let our caller release old_bh, so we need to -@@ -1427,7 +1427,7 @@ +@@ -1427,7 +1427,7 @@ ext3_xattr_delete_inode(handle_t *handle if (HDR(bh)->h_refcount == cpu_to_le32(1)) { if (ce) mb_cache_entry_free(ce); @@ -278,27 +344,13 @@ Index: linux-2.6.9/fs/ext3/xattr.c get_bh(bh); ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); } else { -Index: linux-2.6.9/fs/ext3/Makefile +Index: linux-2.6.9-full/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.9.orig/fs/ext3/Makefile 2005-10-14 09:10:12.000000000 +0400 -+++ linux-2.6.9/fs/ext3/Makefile 2005-10-14 09:10:13.000000000 +0400 -@@ -5,7 +5,8 @@ - obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\ -- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o -+ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ -+ mballoc.o - - 
ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-2.6.9/fs/ext3/mballoc.c -=================================================================== ---- linux-2.6.9.orig/fs/ext3/mballoc.c 2005-10-13 19:40:57.851699336 +0400 -+++ linux-2.6.9/fs/ext3/mballoc.c 2005-10-14 09:10:31.000000000 +0400 -@@ -0,0 +1,1865 @@ +--- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2005-12-16 17:46:19.148560250 +0300 ++++ linux-2.6.9-full/fs/ext3/mballoc.c 2005-12-17 00:10:15.000000000 +0300 +@@ -0,0 +1,2429 @@ +/* -+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify @@ -330,12 +382,15 @@ Index: linux-2.6.9/fs/ext3/mballoc.c +#include +#include +#include ++#include ++#include ++#include ++#include + +/* + * TODO: -+ * - bitmap/buddy read-ahead (proposed by Oleg Drokin aka green) ++ * - bitmap read-ahead (proposed by Oleg Drokin aka green) + * - track min/max extents in each group for better group selection -+ * - is it worthwhile to use buddies directly if req is 2^N blocks? + * - mb_mark_used() may allocate chunk right after splitting buddy + * - special flag to advice allocator to look for requested + N blocks + * this may improve interaction between extents and mballoc @@ -345,17 +400,10 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + */ + +/* -+ * with 'ext3_mb_aggressive' set the allocator runs consistency checks over ++ * with AGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ -+long ext3_mb_aggressive = 0; -+ -+ -+/* -+ * with 'ext3_mb_stats' allocator will collect stats that will be -+ * shown at umount. The collecting costs though! 
-+ */ -+long ext3_mb_stats = 1; ++#define AGGRESSIVE_CHECK__ + +/* + */ @@ -367,33 +415,56 @@ Index: linux-2.6.9/fs/ext3/mballoc.c +#endif + +/* -+ * where to save buddies structures beetween umount/mount (clean case only) ++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory ++ * and you can monitor it in /proc/fs/ext3//mb_history + */ -+#define EXT3_BUDDY_FILE ".buddy" ++#define EXT3_MB_HISTORY + +/* + * How long mballoc can look for a best extent (in found extents) + */ -+long ext3_mb_max_to_scan = 100; ++long ext3_mb_max_to_scan = 500; + +/* -+ * This structure is on-disk description of a group for mballoc ++ * How long mballoc must look for a best extent + */ -+struct ext3_mb_group_descr { -+ __u16 mgd_first_free; /* first free block in the group */ -+ __u16 mgd_free; /* number of free blocks in the group */ -+ __u16 mgd_counters[16]; /* number of free blocks by order */ -+}; ++long ext3_mb_min_to_scan = 30; + +/* -+ * This structure is header of mballoc's file ++ * with 'ext3_mb_stats' allocator will collect stats that will be ++ * shown at umount. The collecting costs though! 
+ */ -+struct ext3_mb_grp_header { -+ __u32 mh_magic; ++ ++long ext3_mb_stats = 1; ++ ++#ifdef EXT3_BB_MAX_BLOCKS ++#undef EXT3_BB_MAX_BLOCKS ++#endif ++#define EXT3_BB_MAX_BLOCKS 30 ++ ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; +}; + -+#define EXT3_MB_MAGIC_V1 0xbabd16fd ++struct ext3_group_info { ++ unsigned long bb_state; ++ unsigned long bb_tid; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned short bb_fragments; ++ unsigned short bb_counters[]; ++}; ++ ++ ++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 ++#define EXT3_GROUP_INFO_LOCKED_BIT 1 + ++#define EXT3_MB_GRP_NEED_INIT(grp) \ ++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) + +struct ext3_free_extent { + __u16 fe_start; @@ -414,28 +485,55 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; + __u16 ac_found; ++ __u16 ac_tail; ++ __u16 ac_buddy; + __u8 ac_status; + __u8 ac_flags; /* allocation hints */ ++ __u8 ac_criteria; + __u8 ac_repeats; ++ __u8 ac_2order; /* if request is to allocate 2^N blocks and ++ * N > 0, the field stores N, otherwise 0 */ +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 +#define AC_STATUS_BREAK 3 + ++struct ext3_mb_history { ++ struct ext3_free_extent goal; /* goal allocation */ ++ struct ext3_free_extent result; /* result allocation */ ++ __u16 found; /* how many extents have been found */ ++ __u16 groups; /* how many groups have been scanned */ ++ __u16 tail; /* what tail broke some buddy */ ++ __u16 buddy; /* buddy the tail ^^^ broke */ ++ __u8 cr; /* which phase the result extent was found at */ ++ __u8 merged; ++}; ++ +struct ext3_buddy { -+ struct buffer_head *bd_bh; -+ struct buffer_head *bd_bh2; -+ struct ext3_buddy_group_blocks *bd_bd; ++ struct page *bd_buddy_page; ++ void *bd_buddy; ++ struct page *bd_bitmap_page; ++ void *bd_bitmap; 
++ struct ext3_group_info *bd_info; + struct super_block *bd_sb; + __u16 bd_blkbits; + __u16 bd_group; +}; -+#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data) -+#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data) ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) ++ ++#ifndef EXT3_MB_HISTORY ++#define ext3_mb_store_history(sb,ac) ++#else ++static void ext3_mb_store_history(struct super_block *, ++ struct ext3_allocation_context *ac); ++#endif + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + ++static struct proc_dir_entry *proc_root_ext3; ++ +int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); +struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); +int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); @@ -490,9 +588,25 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + ext2_clear_bit_atomic(NULL, bit, addr); +} + ++static inline int mb_find_next_zero_bit(void *addr, int max, int start) ++{ ++ int fix; ++#if BITS_PER_LONG == 64 ++ fix = ((unsigned long) addr & 7UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~7UL); ++#elif BITS_PER_LONG == 32 ++ fix = ((unsigned long) addr & 3UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~3UL); ++#else ++#error "how many bits you are?!" 
++#endif ++ max += fix; ++ start += fix; ++ return ext2_find_next_zero_bit(addr, max, start) - fix; ++} ++ +static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) +{ -+ int i = 1; + char *bb; + + J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); @@ -508,89 +622,30 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + if (order == 0) + return EXT3_MB_BITMAP(e3b); + -+ bb = EXT3_MB_BUDDY(e3b); -+ *max = *max >> 1; -+ while (i < order) { -+ bb += 1 << (e3b->bd_blkbits - i); -+ i++; -+ *max = *max >> 1; -+ } -+ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) < -+ e3b->bd_sb->s_blocksize); -+ return bb; -+} -+ -+static int ext3_mb_load_buddy(struct super_block *sb, int group, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap); -+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy); -+ -+ /* load bitmap */ -+ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap); -+ if (e3b->bd_bh == NULL) { -+ ext3_error(sb, "ext3_mb_load_buddy", -+ "can't get block for buddy bitmap\n"); -+ goto out; -+ } -+ /* load buddy */ -+ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy); -+ if (e3b->bd_bh2 == NULL) { -+ ext3_error(sb, "ext3_mb_load_buddy", -+ "can't get block for buddy bitmap\n"); -+ goto out; -+ } -+ -+ if (!buffer_uptodate(e3b->bd_bh)) -+ ll_rw_block(READ, 1, &e3b->bd_bh); -+ if (!buffer_uptodate(e3b->bd_bh2)) -+ ll_rw_block(READ, 1, &e3b->bd_bh2); -+ -+ wait_on_buffer(e3b->bd_bh); -+ J_ASSERT(buffer_uptodate(e3b->bd_bh)); -+ wait_on_buffer(e3b->bd_bh2); -+ J_ASSERT(buffer_uptodate(e3b->bd_bh2)); -+ -+ e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_bd = sbi->s_buddy_blocks[group]; -+ e3b->bd_sb = sb; -+ e3b->bd_group = group; -+ -+ return 0; -+out: -+ brelse(e3b->bd_bh); -+ brelse(e3b->bd_bh2); -+ e3b->bd_bh = NULL; -+ e3b->bd_bh2 = NULL; -+ return -EIO; -+} ++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; ++ *max = 
EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; + -+static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b) -+{ -+ mark_buffer_dirty(e3b->bd_bh); -+ mark_buffer_dirty(e3b->bd_bh2); ++ return bb; +} + -+static void ext3_mb_release_desc(struct ext3_buddy *e3b) -+{ -+ brelse(e3b->bd_bh); -+ brelse(e3b->bd_bh2); -+} ++#ifdef AGGRESSIVE_CHECK + +static void mb_check_buddy(struct ext3_buddy *e3b) +{ + int order = e3b->bd_blkbits + 1; + int max, max2, i, j, k, count; ++ int fragments = 0, fstart; + void *buddy, *buddy2; + -+ if (likely(!ext3_mb_aggressive)) -+ return; -+ + if (!test_opt(e3b->bd_sb, MBALLOC)) + return; + ++ { ++ static int mb_check_counter = 0; ++ if (mb_check_counter++ % 300 != 0) ++ return; ++ } ++ + while (order > 1) { + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); @@ -621,14 +676,22 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + } + count++; + } -+ J_ASSERT(e3b->bd_bd->bb_counters[order] == count); ++ J_ASSERT(e3b->bd_info->bb_counters[order] == count); + order--; + } + ++ fstart = -1; + buddy = mb_find_buddy(e3b, 0, &max); + for (i = 0; i < max; i++) { -+ if (!mb_test_bit(i, buddy)) ++ if (!mb_test_bit(i, buddy)) { ++ J_ASSERT(i >= e3b->bd_info->bb_first_free); ++ if (fstart == -1) { ++ fragments++; ++ fstart = i; ++ } + continue; ++ } ++ fstart = -1; + /* check used bits only */ + for (j = 0; j < e3b->bd_blkbits + 1; j++) { + buddy2 = mb_find_buddy(e3b, j, &max2); @@ -637,18 +700,325 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + J_ASSERT(mb_test_bit(k, buddy2)); + } + } ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); ++ J_ASSERT(e3b->bd_info->bb_fragments == fragments); ++} ++ ++#else ++#define mb_check_buddy(e3b) ++#endif ++ ++/* find most significant bit */ ++static int inline fmsb(unsigned short word) ++{ ++ int order; ++ ++ if (word > 255) { ++ order = 7; ++ word >>= 8; ++ } else { ++ order = -1; ++ } ++ ++ do { ++ order++; ++ word >>= 1; ++ } while (word != 0); ++ ++ return order; ++} ++ ++static void inline ++ext3_mb_mark_free_simple(struct 
super_block *sb, void *buddy, unsigned first, ++ int len, struct ext3_group_info *grp) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned short min, max, chunk, border; ++ ++ mb_debug("mark %u/%u free\n", first, len); ++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ border = 2 << sb->s_blocksize_bits; ++ ++ while (len > 0) { ++ /* find how many blocks can be covered since this position */ ++ max = ffs(first | border) - 1; ++ ++ /* find how many blocks of power 2 we need to mark */ ++ min = fmsb(len); ++ ++ mb_debug(" %u/%u -> max %u, min %u\n", ++ first & ((2 << sb->s_blocksize_bits) - 1), ++ len, max, min); ++ ++ if (max < min) ++ min = max; ++ chunk = 1 << min; ++ ++ /* mark multiblock chunks only */ ++ grp->bb_counters[min]++; ++ if (min > 0) { ++ mb_debug(" set %u at %u \n", first >> min, ++ sbi->s_mb_offsets[min]); ++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); ++ } ++ ++ len -= chunk; ++ first += chunk; ++ } ++} ++ ++static void ++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ++ struct ext3_group_info *grp) ++{ ++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); ++ unsigned short i = 0, first, len; ++ unsigned free = 0, fragments = 0; ++ unsigned long long period = get_cycles(); ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ grp->bb_first_free = i; ++ while (i < max) { ++ fragments++; ++ first = i; ++ i = find_next_bit(bitmap, max, i); ++ len = i - first; ++ free += len; ++ if (len > 1) ++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); ++ else ++ grp->bb_counters[0]++; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ grp->bb_fragments = fragments; ++ ++ /* bb_state shouldn't being modified because all ++ * others waits for init completion on page lock */ ++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); ++ if (free != grp->bb_free) { ++ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n", ++ free, grp->bb_free); ++ grp->bb_free = free; ++ } ++ ++ 
period = get_cycles() - period; ++ spin_lock(&EXT3_SB(sb)->s_bal_lock); ++ EXT3_SB(sb)->s_mb_buddies_generated++; ++ EXT3_SB(sb)->s_mb_generation_time += period; ++ spin_unlock(&EXT3_SB(sb)->s_bal_lock); ++} ++ ++static int ext3_mb_init_cache(struct page *page) ++{ ++ int blocksize, blocks_per_page, groups_per_page; ++ int err = 0, i, first_group, first_block; ++ struct super_block *sb; ++ struct buffer_head *bhs; ++ struct buffer_head **bh; ++ struct inode *inode; ++ char *data, *bitmap; ++ ++ mb_debug("init page %lu\n", page->index); ++ ++ inode = page->mapping->host; ++ sb = inode->i_sb; ++ blocksize = 1 << inode->i_blkbits; ++ blocks_per_page = PAGE_CACHE_SIZE / blocksize; ++ ++ groups_per_page = blocks_per_page >> 1; ++ if (groups_per_page == 0) ++ groups_per_page = 1; ++ ++ /* allocate buffer_heads to read bitmaps */ ++ if (groups_per_page > 1) { ++ err = -ENOMEM; ++ i = sizeof(struct buffer_head *) * groups_per_page; ++ bh = kmalloc(i, GFP_NOFS); ++ if (bh == NULL) ++ goto out; ++ memset(bh, 0, i); ++ } else ++ bh = &bhs; ++ ++ first_group = page->index * blocks_per_page / 2; ++ ++ /* read all groups the page covers into the cache */ ++ for (i = 0; i < groups_per_page; i++) { ++ struct ext3_group_desc * desc; ++ ++ if (first_group + i >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ err = -EIO; ++ desc = ext3_get_group_desc(sb, first_group + i, NULL); ++ if (desc == NULL) ++ goto out; ++ ++ err = -ENOMEM; ++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); ++ if (bh[i] == NULL) ++ goto out; ++ ++ if (buffer_uptodate(bh[i])) ++ continue; ++ ++ lock_buffer(bh[i]); ++ if (buffer_uptodate(bh[i])) { ++ unlock_buffer(bh[i]); ++ continue; ++ } ++ ++ get_bh(bh[i]); ++ bh[i]->b_end_io = end_buffer_read_sync; ++ submit_bh(READ, bh[i]); ++ mb_debug("read bitmap for group %u\n", first_group + i); ++ } ++ ++ /* wait for I/O completion */ ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ wait_on_buffer(bh[i]); ++ ++ /* XXX: I/O error handling here */ ++ ++ 
first_block = page->index * blocks_per_page; ++ for (i = 0; i < blocks_per_page; i++) { ++ int group; ++ ++ group = (first_block + i) >> 1; ++ if (group >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ data = page_address(page) + (i * blocksize); ++ bitmap = bh[group - first_group]->b_data; ++ ++ if ((first_block + i) & 1) { ++ /* this is block of buddy */ ++ mb_debug("put buddy for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memset(data, 0xff, blocksize); ++ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0; ++ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0, ++ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); ++ ext3_mb_generate_buddy(sb, data, bitmap, ++ EXT3_SB(sb)->s_group_info[group]); ++ } else { ++ /* this is block of bitmap */ ++ mb_debug("put bitmap for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memcpy(data, bitmap, blocksize); ++ } ++ } ++ SetPageUptodate(page); ++ ++out: ++ if (bh) { ++ for (i = 0; bh && i < groups_per_page && bh[i]; i++) ++ brelse(bh[i]); ++ if (bh != &bhs) ++ kfree(bh); ++ } ++ return err; ++} ++ ++static int ext3_mb_load_buddy(struct super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *inode = sbi->s_buddy_cache; ++ int blocks_per_page, block, pnum, poff; ++ struct page *page; ++ ++ mb_debug("load group %u\n", group); ++ ++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_info = sbi->s_group_info[group]; ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ e3b->bd_buddy_page = NULL; ++ e3b->bd_bitmap_page = NULL; ++ ++ block = group * 2; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ if (!PageUptodate(page)) ++ 
ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_bitmap_page = page; ++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ block++; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_buddy_page = page; ++ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ return 0; ++ ++err: ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++ e3b->bd_buddy = NULL; ++ e3b->bd_bitmap = NULL; ++ return -EIO; ++} ++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); +} + ++ +static inline void +ext3_lock_group(struct super_block *sb, int group) +{ -+ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_SB(sb)->s_group_info[group]->bb_state); +} + +static inline void +ext3_unlock_group(struct super_block *sb, int group) +{ -+ spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_SB(sb)->s_group_info[group]->bb_state); +} + +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) @@ -710,22 +1080,33 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + +static int mb_free_blocks(struct 
ext3_buddy *e3b, int first, int count) +{ -+ int block, max, order; ++ int block = 0, max = 0, order; + void *buddy, *buddy2; + + mb_check_buddy(e3b); + -+ e3b->bd_bd->bb_free += count; -+ if (first < e3b->bd_bd->bb_first_free) -+ e3b->bd_bd->bb_first_free = first; -+ ++ e3b->bd_info->bb_free += count; ++ if (first < e3b->bd_info->bb_first_free) ++ e3b->bd_info->bb_first_free = first; ++ ++ /* let's maintain fragments counter */ ++ if (first != 0) ++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); ++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); ++ if (block && max) ++ e3b->bd_info->bb_fragments--; ++ else if (!block && !max) ++ e3b->bd_info->bb_fragments++; ++ ++ /* let's maintain buddy itself */ + while (count-- > 0) { + block = first++; + order = 0; + + J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); + mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); -+ e3b->bd_bd->bb_counters[order]++; ++ e3b->bd_info->bb_counters[order]++; + + /* start of the buddy */ + buddy = mb_find_buddy(e3b, order, &max); @@ -748,12 +1129,12 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + mb_set_bit(block, buddy); + mb_set_bit(block + 1, buddy); + } -+ e3b->bd_bd->bb_counters[order]--; -+ e3b->bd_bd->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; + + block = block >> 1; + order++; -+ e3b->bd_bd->bb_counters[order]++; ++ e3b->bd_info->bb_counters[order]++; + + mb_clear_bit(block, buddy2); + buddy = buddy2; @@ -765,7 +1146,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c +} + +static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, -+ int needed, struct ext3_free_extent *ex) ++ int needed, struct ext3_free_extent *ex) +{ + int next, max, ord; + void *buddy; @@ -782,7 +1163,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + return 0; + } + -+ if (order == 0) { ++ if (likely(order == 0)) { + /* find actual order */ + order = mb_find_order_for_block(e3b, block); + block = 
block >> order; @@ -792,7 +1173,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + ex->fe_start = block << order; + ex->fe_group = e3b->bd_group; + -+ while ((buddy = mb_find_buddy(e3b, order, &max))) { ++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { + + if (block + 1 >= max) + break; @@ -814,16 +1195,30 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + +static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) +{ ++ int ord, mlen = 0, max = 0, cur; + int start = ex->fe_start; + int len = ex->fe_len; -+ int ord, mlen, max, cur; ++ unsigned ret = 0; + int len0 = len; + void *buddy; + -+ e3b->bd_bd->bb_free -= len; -+ if (e3b->bd_bd->bb_first_free == start) -+ e3b->bd_bd->bb_first_free += len; ++ mb_check_buddy(e3b); + ++ e3b->bd_info->bb_free -= len; ++ if (e3b->bd_info->bb_first_free == start) ++ e3b->bd_info->bb_first_free += len; ++ ++ /* let's maintain fragments counter */ ++ if (start != 0) ++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); ++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); ++ if (mlen && max) ++ e3b->bd_info->bb_fragments++; ++ else if (!mlen && !max) ++ e3b->bd_info->bb_fragments--; ++ ++ /* let's maintain buddy itself */ + while (len) { + ord = mb_find_order_for_block(e3b, start); + @@ -833,26 +1228,30 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + buddy = mb_find_buddy(e3b, ord, &max); + J_ASSERT((start >> ord) < max); + mb_set_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; ++ e3b->bd_info->bb_counters[ord]--; + start += mlen; + len -= mlen; + J_ASSERT(len >= 0); + continue; + } + ++ /* store for history */ ++ if (ret == 0) ++ ret = len | (ord << 16); ++ + /* we have to split large buddy */ + J_ASSERT(ord > 0); + buddy = mb_find_buddy(e3b, ord, &max); + mb_set_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; ++ e3b->bd_info->bb_counters[ord]--; + + ord--; + cur = (start >> ord) & ~1U; + buddy = mb_find_buddy(e3b, ord, 
&max); + mb_clear_bit(cur, buddy); + mb_clear_bit(cur + 1, buddy); -+ e3b->bd_bd->bb_counters[ord]++; -+ e3b->bd_bd->bb_counters[ord]++; ++ e3b->bd_info->bb_counters[ord]++; ++ e3b->bd_info->bb_counters[ord]++; + } + + /* now drop all the bits in bitmap */ @@ -860,7 +1259,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + + mb_check_buddy(e3b); + -+ return 0; ++ return ret; +} + +/* @@ -869,9 +1268,14 @@ Index: linux-2.6.9/fs/ext3/mballoc.c +static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, + struct ext3_buddy *e3b) +{ ++ unsigned long ret; ++ + ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); -+ mb_mark_used(e3b, &ac->ac_b_ex); ++ ret = mb_mark_used(e3b, &ac->ac_b_ex); ++ + ac->ac_status = AC_STATUS_FOUND; ++ ac->ac_tail = ret & 0xffff; ++ ac->ac_buddy = ret >> 16; +} + +/* @@ -888,9 +1292,8 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + struct ext3_free_extent *ex, + struct ext3_buddy *e3b) +{ -+ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor; + struct ext3_free_extent *bex = &ac->ac_b_ex; -+ int diff = ac->ac_g_ex.fe_len - ex->fe_len; ++ struct ext3_free_extent *gex = &ac->ac_g_ex; + + J_ASSERT(ex->fe_len > 0); + J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); @@ -901,7 +1304,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + /* + * The special case - take what you catch first + */ -+ if (ac->ac_flags & EXT3_MB_HINT_FIRST) { ++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { + *bex = *ex; + ext3_mb_use_best_found(ac, e3b); + return; @@ -910,26 +1313,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + /* + * Let's check whether the chuck is good enough + */ -+ if (ex->fe_len >= ac->ac_g_ex.fe_len) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * If the request is vey large, then it makes sense to use large -+ * chunks for it. Even if they don't satisfy whole request. 
-+ */ -+ if (ex->fe_len > 1000) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * Sometimes it's worty to take close chunk -+ */ -+ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) { ++ if (ex->fe_len == gex->fe_len) { + *bex = *ex; + ext3_mb_use_best_found(ac, e3b); + return; @@ -945,13 +1329,26 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + + /* + * If new found extent is better, store it in the context -+ * FIXME: possible the policy should be more complex? + */ -+ if (ex->fe_len > bex->fe_len) { ++ if (bex->fe_len < gex->fe_len) { ++ /* if the request isn't satisfied, any found extent ++ * larger than previous best one is better */ ++ if (ex->fe_len > bex->fe_len) ++ *bex = *ex; ++ } else if (ex->fe_len > gex->fe_len) { ++ /* if the request is satisfied, then we try to find ++ * an extent that still satisfy the request, but is ++ * smaller than previous one */ + *bex = *ex; + } + + /* ++ * Let's scan at least few extents and don't pick up a first one ++ */ ++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++ ++ /* + * We don't want to scan for a whole year + */ + if (ac->ac_found > ext3_mb_max_to_scan) @@ -972,13 +1369,13 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); + -+ if (max > 0) ++ if (max > 0) { ++ ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); ++ } + + ext3_unlock_group(ac->ac_sb, group); + -+ if (ac->ac_status == AC_STATUS_FOUND) -+ ext3_mb_dirty_buddy(e3b); + ext3_mb_release_desc(e3b); + + return 0; @@ -1002,37 +1399,79 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; + ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); + } + ext3_unlock_group(ac->ac_sb, group); + -+ if (ac->ac_status == AC_STATUS_FOUND) -+ 
ext3_mb_dirty_buddy(e3b); + ext3_mb_release_desc(e3b); + + return 0; +} ++ ++/* ++ * The routine scans buddy structures (not bitmap!) from given order ++ * to max order and tries to find big enough chunk to satisfy the req ++ */ ++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_group_info *grp = e3b->bd_info; ++ void *buddy; ++ int i, k, max; ++ ++ J_ASSERT(ac->ac_2order > 0); ++ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) { ++ if (grp->bb_counters[i] == 0) ++ continue; ++ ++ buddy = mb_find_buddy(e3b, i, &max); ++ if (buddy == NULL) { ++ printk(KERN_ALERT "looking for wrong order?\n"); ++ break; ++ } ++ ++ k = mb_find_next_zero_bit(buddy, max, 0); ++ J_ASSERT(k < max); ++ ++ ac->ac_found++; ++ ++ ac->ac_b_ex.fe_len = 1 << i; ++ ac->ac_b_ex.fe_start = k << i; ++ ac->ac_b_ex.fe_group = e3b->bd_group; ++ ++ ext3_mb_use_best_found(ac, e3b); ++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); ++ ++ if (unlikely(ext3_mb_stats)) ++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); ++ ++ break; ++ } ++} ++ +/* + * The routine scans the group and measures all found extents. + * In order to optimize scanning, caller must pass number of -+ * free blocks in the group, so the routine can upper limit. ++ * free blocks in the group, so the routine can know upper limit. 
+ */ -+static void ext3_mb_scan_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) ++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) +{ + struct super_block *sb = ac->ac_sb; + void *bitmap = EXT3_MB_BITMAP(e3b); + struct ext3_free_extent ex; + int i, free; + -+ free = e3b->bd_bd->bb_free; ++ free = e3b->bd_info->bb_free; + J_ASSERT(free > 0); + -+ i = e3b->bd_bd->bb_first_free; ++ i = e3b->bd_info->bb_first_free; + -+ while (free && ac->ac_status != AC_STATUS_FOUND) { -+ i = ext2_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ while (free && ac->ac_status == AC_STATUS_CONTINUE) { ++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); + if (i >= sb->s_blocksize * 8) { + J_ASSERT(free == 0); + break; @@ -1052,23 +1491,39 @@ Index: linux-2.6.9/fs/ext3/mballoc.c +static int ext3_mb_good_group(struct ext3_allocation_context *ac, + int group, int cr) +{ -+ int free; ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_group_info *grp = sbi->s_group_info[group]; ++ unsigned free, fragments, i, bits; + -+ J_ASSERT(cr >= 0 && cr < 3); ++ J_ASSERT(cr >= 0 && cr < 4); ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); + -+ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free; ++ free = grp->bb_free; ++ fragments = grp->bb_fragments; + if (free == 0) + return 0; ++ if (fragments == 0) ++ return 0; + -+ if (cr == 0) { -+ if (free >= ac->ac_g_ex.fe_len >> 1) ++ switch (cr) { ++ case 0: ++ J_ASSERT(ac->ac_2order != 0); ++ bits = ac->ac_sb->s_blocksize_bits + 1; ++ for (i = ac->ac_2order; i < bits; i++) ++ if (grp->bb_counters[i] > 0) ++ return 1; ++ case 1: ++ if ((free / fragments) >= ac->ac_g_ex.fe_len) ++ return 1; ++ case 2: ++ if (free >= ac->ac_g_ex.fe_len) ++ return 1; ++ case 3: + return 1; -+ } else if (cr == 1) { -+ if (free >= ac->ac_g_ex.fe_len >> 2) -+ return 1; -+ } else if (cr == 2) { -+ return 1; ++ default: ++ BUG(); + } ++ + return 0; +} + @@ -1160,11 +1615,19 @@ Index: 
linux-2.6.9/fs/ext3/mballoc.c + ac.ac_g_ex.fe_start = block; + ac.ac_g_ex.fe_len = *len; + ac.ac_flags = flags; ++ ac.ac_2order = 0; ++ ac.ac_criteria = 0; + -+ /* -+ * Sometimes, caller may want to merge even small number -+ * of blocks to an existing extent -+ */ ++ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ ++ i = ffs(*len); ++ if (i >= 8) { ++ i--; ++ if ((*len & (~(1 << i))) == 0) ++ ac.ac_2order = i; ++ } ++ ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ + if (ac.ac_flags & EXT3_MB_HINT_MERGE) { + err = ext3_mb_find_by_goal(&ac, &e3b); + if (err) @@ -1173,23 +1636,24 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + goto found; + } + -+ /* -+ * FIXME -+ * If requested chunk is power of 2 length, we can try -+ * to exploit buddy nature to speed allocation up -+ */ -+ -+ -+ /* -+ * Let's just scan groups to find more-less suitable blocks -+ */ -+ cr = 0; ++ /* Let's just scan groups to find more-less suitable blocks */ ++ cr = ac.ac_2order ? 
0 : 1; +repeat: -+ for (; cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ ac.ac_criteria = cr; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + ++ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) { ++ /* we need full data about the group ++ * to make a good selection */ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ext3_mb_release_desc(&e3b); ++ } ++ + /* check is group good for our criteries */ + if (!ext3_mb_good_group(&ac, group, cr)) + continue; @@ -1206,29 +1670,32 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + continue; + } + -+ ext3_mb_scan_group(&ac, &e3b); ++ ac.ac_groups_scanned++; ++ if (cr == 0) ++ ext3_mb_simple_scan_group(&ac, &e3b); ++ else ++ ext3_mb_complex_scan_group(&ac, &e3b); ++ + ext3_unlock_group(sb, group); + -+ if (ac.ac_status == AC_STATUS_FOUND) -+ ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + -+ if (err) -+ goto out_err; + if (ac.ac_status != AC_STATUS_CONTINUE) + break; + } + } + -+ if (ac.ac_status == AC_STATUS_BREAK && ++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && + !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { + /* + * We've been searching too long. 
Let's try to allocate + * the best chunk we've found so far + */ -+ ext3_warning(inode->i_sb, __FUNCTION__, -+ "too long searching: got %d want %d\n", -+ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); ++ ++ /*if (ac.ac_found > ext3_mb_max_to_scan) ++ printk(KERN_ERR "EXT3-fs: too long searching at " ++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, ++ ac.ac_g_ex.fe_len);*/ + ext3_mb_try_best_found(&ac, &e3b); + if (ac.ac_status != AC_STATUS_FOUND) { + /* @@ -1242,7 +1709,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + ac.ac_b_ex.fe_len = 0; + ac.ac_status = AC_STATUS_CONTINUE; + ac.ac_flags |= EXT3_MB_HINT_FIRST; -+ cr = 2; ++ cr = 3; + goto repeat; + } + } @@ -1265,7 +1732,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + printk("EXT3-fs: groups: "); + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) + printk("%d: %d ", i, -+ sbi->s_buddy_blocks[i]->bb_free); ++ sbi->s_group_info[i]->bb_free); + printk("\n"); +#endif + goto out; @@ -1319,12 +1786,10 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + ext3_error(sb, "ext3_new_block", + "Allocating block in system zone - " + "block = %u", block); -+ if (unlikely(ext3_mb_aggressive)) { -+ for (i = 0; i < ac.ac_b_ex.fe_len; i++) -+ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, -+ bitmap_bh->b_data)); -+ } -+ ++#ifdef AGGRESSIVE_CHECK ++ for (i = 0; i < ac.ac_b_ex.fe_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); ++#endif + mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); + + spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); @@ -1374,369 +1839,359 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + * path only, here is single block always */ + ext3_mb_release_blocks(sb, 1); + } -+ -+ if ((ext3_mb_stats) && (ac.ac_g_ex.fe_len > 1)) { -+ spin_lock(&sbi->s_bal_lock); -+ sbi->s_bal_reqs++; -+ sbi->s_bal_allocated += *len; ++ ++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { ++ atomic_inc(&sbi->s_bal_reqs); ++ atomic_add(*len, &sbi->s_bal_allocated); + if (*len >= ac.ac_g_ex.fe_len) -+ 
sbi->s_bal_success++; -+ sbi->s_bal_ex_scanned += ac.ac_found; ++ atomic_inc(&sbi->s_bal_success); ++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); + if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && + ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) -+ sbi->s_bal_goals++; ++ atomic_inc(&sbi->s_bal_goals); + if (ac.ac_found > ext3_mb_max_to_scan) -+ sbi->s_bal_breaks++; -+ spin_unlock(&sbi->s_bal_lock); ++ atomic_inc(&sbi->s_bal_breaks); + } + ++ ext3_mb_store_history(sb, &ac); ++ + return block; +} ++EXPORT_SYMBOL(ext3_mb_new_blocks); ++ ++#ifdef EXT3_MB_HISTORY ++struct ext3_mb_proc_session { ++ struct ext3_mb_history *history; ++ struct super_block *sb; ++ int start; ++ int max; ++}; + -+int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh, -+ struct ext3_mb_group_descr **grp) ++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, ++ struct ext3_mb_history *hs, ++ int first) +{ -+ struct super_block *sb = e3b->bd_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int descr_per_block, err, offset; -+ struct ext3_mb_grp_header *hdr; -+ unsigned long block; -+ -+ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) -+ / sizeof(struct ext3_mb_group_descr); -+ block = e3b->bd_group / descr_per_block; -+ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err); -+ if (*bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n", -+ e3b->bd_group, err); -+ return err; -+ } ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (!first && hs == s->history + s->start) ++ return NULL; ++ while (hs->goal.fe_len == 0) { ++ hs++; ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (hs == s->history + s->start) ++ return NULL; ++ } ++ return hs; ++} + -+ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data; -+ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { -+ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n", -+ e3b->bd_group); -+ brelse(*bh); -+ *bh = NULL; -+ return -EIO; -+ } ++static void 
*ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs; ++ int l = *pos; ++ ++ if (l == 0) ++ return SEQ_START_TOKEN; ++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ if (!hs) ++ return NULL; ++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); ++ return hs; ++} + -+ offset = e3b->bd_group % descr_per_block -+ * sizeof(struct ext3_mb_group_descr) -+ + sizeof(struct ext3_mb_grp_header); -+ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset); ++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs = v; + -+ return 0; ++ ++*pos; ++ if (v == SEQ_START_TOKEN) ++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ else ++ return ext3_mb_history_skip_empty(s, ++hs, 0); +} + -+int ext3_mb_load_descr(struct ext3_buddy *e3b) ++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) +{ -+ struct ext3_mb_group_descr *grp; -+ struct ext3_group_desc *gdp; -+ struct buffer_head *bh; -+ int err, i; -+ -+ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); -+ if (err) -+ return err; -+ -+ e3b->bd_bd->bb_first_free = grp->mgd_first_free; -+ e3b->bd_bd->bb_free = grp->mgd_free; -+ for (i = 0; i <= e3b->bd_blkbits + 1; i++) { -+ J_ASSERT(i < 16); -+ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i]; -+ } -+ brelse(bh); ++ struct ext3_mb_history *hs = v; ++ char buf[20], buf2[20]; + -+ /* additional checks against old group descriptor */ -+ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); -+ if (!gdp) -+ return -EIO; -+ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { -+ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", -+ e3b->bd_group, e3b->bd_bd->bb_free, -+ le16_to_cpu(gdp->bg_free_blocks_count)); -+ return -ENODATA; ++ if (v == SEQ_START_TOKEN) { ++ seq_printf(seq, "%-17s 
%-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "goal", "result", "found", "grps", "cr", "merge", ++ "tail", "broken"); ++ return 0; + } + ++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, ++ hs->goal.fe_start, hs->goal.fe_len); ++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, ++ hs->result.fe_start, hs->result.fe_len); ++ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, ++ buf2, hs->found, hs->groups, hs->cr, ++ hs->merged ? "M" : "", hs->tail, ++ hs->buddy ? 1 << hs->buddy : 0); + return 0; +} + ++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_history_ops = { ++ .start = ext3_mb_seq_history_start, ++ .next = ext3_mb_seq_history_next, ++ .stop = ext3_mb_seq_history_stop, ++ .show = ext3_mb_seq_history_show, ++}; + -+int ext3_mb_update_descr(struct ext3_buddy *e3b) ++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) +{ -+ struct ext3_mb_group_descr *grp; -+ struct ext3_group_desc *gdp; -+ struct buffer_head *bh; -+ handle_t *handle; -+ int err, i; ++ struct super_block *sb = PDE(inode)->data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_proc_session *s; ++ int rc, size; + -+ /* additional checks against old group descriptor */ -+ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); -+ if (!gdp) ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (s == NULL) + return -EIO; -+ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { -+ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", -+ e3b->bd_group, e3b->bd_bd->bb_free, -+ le16_to_cpu(gdp->bg_free_blocks_count)); -+ return -ENODATA; -+ } -+ -+ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); -+ if (err) -+ return err; ++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; ++ s->history = kmalloc(size, GFP_KERNEL); ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(s->history, sbi->s_mb_history, size); ++ s->max = sbi->s_mb_history_max; ++ s->start = 
sbi->s_mb_history_cur % s->max; ++ spin_unlock(&sbi->s_mb_history_lock); + -+ handle = ext3_journal_start_sb(e3b->bd_sb, 1); -+ if (IS_ERR(handle)) { -+ err = PTR_ERR(handle); -+ handle = NULL; -+ goto out; -+ } -+ -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ goto out; -+ grp->mgd_first_free = e3b->bd_bd->bb_first_free; -+ grp->mgd_free = e3b->bd_bd->bb_free; -+ for (i = 0; i <= e3b->bd_blkbits + 1; i++) { -+ J_ASSERT(i < 16); -+ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i]; ++ rc = seq_open(file, &ext3_mb_seq_history_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = s; ++ } else { ++ kfree(s->history); ++ kfree(s); + } -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) -+ goto out; -+ err = 0; -+out: -+ brelse(bh); -+ if (handle) -+ ext3_journal_stop(handle); -+ return err; ++ return rc; ++ +} + -+int ext3_mb_generate_buddy(struct ext3_buddy *e3b) ++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) +{ -+ struct super_block *sb = e3b->bd_sb; -+ struct buffer_head *bh; -+ int i, count = 0; ++ struct seq_file *seq = (struct seq_file *)file->private_data; ++ struct ext3_mb_proc_session *s = seq->private; ++ kfree(s->history); ++ kfree(s); ++ return seq_release(inode, file); ++} + -+ mb_debug("generate buddy for group %d\n", e3b->bd_group); -+ memset(e3b->bd_bh->b_data, 0xff, sb->s_blocksize); -+ memset(e3b->bd_bh2->b_data, 0xff, sb->s_blocksize); ++static struct file_operations ext3_mb_seq_history_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, ++}; + -+ bh = read_block_bitmap(sb, e3b->bd_group); -+ if (bh == NULL) -+ return -EIO; ++static void ext3_mb_history_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; + -+ /* mb_free_blocks will set real free */ -+ e3b->bd_bd->bb_free = 0; -+ 
e3b->bd_bd->bb_first_free = 1 << 15; -+ /* -+ * if change bb_counters size, don't forget about -+ * ext3_mb_init_backend() -bzzz -+ */ -+ memset(e3b->bd_bd->bb_counters, 0, -+ sizeof(unsigned) * (sb->s_blocksize_bits + 2)); ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_history", sbi->s_mb_proc); ++ remove_proc_entry(name, proc_root_ext3); + -+ /* loop over the blocks, and create buddies for free ones */ -+ for (i = 0; i < sb->s_blocksize * 8; i++) { -+ if (!mb_test_bit(i, (void *) bh->b_data)) { -+ mb_free_blocks(e3b, i, 1); -+ count++; ++ if (sbi->s_mb_history) ++ kfree(sbi->s_mb_history); ++} ++ ++static void ext3_mb_history_init(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ int i; ++ ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); ++ if (sbi->s_mb_proc != NULL) { ++ struct proc_dir_entry *p; ++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_history_fops; ++ p->data = sb; + } + } -+ brelse(bh); -+ mb_check_buddy(e3b); -+ ext3_mb_dirty_buddy(e3b); + -+ return 0; ++ sbi->s_mb_history_max = 1000; ++ sbi->s_mb_history_cur = 0; ++ spin_lock_init(&sbi->s_mb_history_lock); ++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); ++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); ++ memset(sbi->s_mb_history, 0, i); ++ /* if we can't allocate history, then we simple won't use it */ +} + -+EXPORT_SYMBOL(ext3_mb_new_blocks); ++static void ++ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_history h; ++ ++ if (likely(sbi->s_mb_history == NULL)) ++ return; ++ ++ h.goal = ac->ac_g_ex; ++ h.result = ac->ac_b_ex; ++ h.found = ac->ac_found; ++ h.cr = ac->ac_criteria; ++ h.groups = ac->ac_groups_scanned; ++ h.tail = ac->ac_tail; ++ h.buddy = ac->ac_buddy; 
++ h.merged = 0; ++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) ++ h.merged = 1; ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); ++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) ++ sbi->s_mb_history_cur = 0; ++ spin_unlock(&sbi->s_mb_history_lock); ++} + -+#define MB_CREDITS \ -+ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \ -+ 2 * EXT3_SINGLEDATA_TRANS_BLOCKS) ++#else ++#define ext3_mb_history_release(sb) ++#define ext3_mb_history_init(sb) ++#endif + -+int ext3_mb_init_backend(struct super_block *sb, int *created) ++int ext3_mb_init_backend(struct super_block *sb) +{ -+ int err, i, len, descr_per_block, buddy_offset, size; -+ struct inode *root = sb->s_root->d_inode; + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_mb_grp_header *hdr; -+ struct buffer_head *bh = NULL; -+ unsigned long block; -+ struct dentry *db; -+ handle_t *handle; -+ tid_t target; -+ -+ *created = 0; ++ int i, len; ++ + len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; -+ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL); -+ if (sbi->s_buddy_blocks == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ sbi->s_group_info = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n"); + return -ENOMEM; + } -+ memset(sbi->s_buddy_blocks, 0, len); -+ sbi->s_buddy = NULL; -+ -+ down(&root->i_sem); -+ len = strlen(EXT3_BUDDY_FILE); -+ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len); -+ if (IS_ERR(db)) { -+ err = PTR_ERR(db); -+ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err); -+ up(&root->i_sem); -+ goto out; -+ } ++ memset(sbi->s_group_info, 0, len); + -+ if (db->d_inode == NULL) { -+ err = ext3_create(root, db, S_IFREG, NULL); -+ if (err) { -+ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err); -+ up(&root->i_sem); -+ goto out; 
-+ } -+ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME; -+ *created = 1; -+ mb_debug("no buddy file, regenerate\n"); -+ } -+ up(&root->i_sem); -+ sbi->s_buddy = igrab(db->d_inode); -+ -+ /* calculate needed size */ -+ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) -+ / sizeof(struct ext3_mb_group_descr); -+ buddy_offset = (sbi->s_groups_count + descr_per_block - 1) -+ / descr_per_block; -+ len = sbi->s_groups_count * sb->s_blocksize * 2 + -+ buddy_offset * sb->s_blocksize; -+ if (len != i_size_read(sbi->s_buddy)) { -+ if (*created == 0) -+ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n", -+ (unsigned) len, -+ (unsigned) i_size_read(sbi->s_buddy)); -+ *created = 1; -+ } -+ -+ /* read/create mb group descriptors */ -+ for (i = 0; i < buddy_offset; i++) { -+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); -+ if (IS_ERR(handle)) { -+ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); -+ err = PTR_ERR(handle); -+ goto err_out; -+ } -+ -+ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err); -+ if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err); -+ goto err_out; -+ } -+ hdr = (struct ext3_mb_grp_header *) bh->b_data; -+ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ goto err_out; -+ if (*created == 0) -+ printk(KERN_ERR -+ "EXT3-fs: invalid header 0x%x in %d," -+ "regenerate\n", hdr->mh_magic, i); -+ *created = 1; -+ hdr->mh_magic = EXT3_MB_MAGIC_V1; -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) -+ goto err_out; -+ } -+ brelse(bh); -+ ext3_journal_stop(handle); ++ sbi->s_buddy_cache = new_inode(sb); ++ if (sbi->s_buddy_cache == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); ++ kfree(sbi->s_group_info); ++ return -ENOMEM; + } + + /* -+ * if change bb_counters size, don't forget about ext3_mb_generate_buddy() ++ * calculate needed size. 
if change bb_counters size, ++ * don't forget about ext3_mb_generate_buddy() + */ -+ len = sizeof(struct ext3_buddy_group_blocks); -+ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2); ++ len = sizeof(struct ext3_group_info); ++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); + for (i = 0; i < sbi->s_groups_count; i++) { ++ struct ext3_group_desc * desc; + -+ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL); -+ if (sbi->s_buddy_blocks[i] == NULL) { ++ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_group_info[i] == NULL) { + printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); -+ err = -ENOMEM; -+ goto out2; -+ } -+ memset(sbi->s_buddy_blocks[i], 0, len); -+ -+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); -+ if (IS_ERR(handle)) { -+ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); -+ err = PTR_ERR(handle); -+ goto out2; -+ } -+ -+ /* allocate block for bitmap */ -+ block = buddy_offset + i * 2; -+ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); -+ if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err); -+ goto out2; -+ } -+ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; -+ brelse(bh); -+ -+ /* allocate block for buddy */ -+ block = buddy_offset + i * 2 + 1; -+ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); -+ if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err); -+ goto out2; ++ goto err_out; + } -+ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; -+ brelse(bh); -+ -+ size = (block + 1) << sbi->s_buddy->i_blkbits; -+ if (size > sbi->s_buddy->i_size) { -+ *created = 1; -+ EXT3_I(sbi->s_buddy)->i_disksize = size; -+ i_size_write(sbi->s_buddy, size); -+ mark_inode_dirty(sbi->s_buddy); ++ desc = ext3_get_group_desc(sb, i, NULL); ++ if (desc == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant read descriptor %u\n", i); ++ goto err_out; + } -+ ext3_journal_stop(handle); -+ -+ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock); -+ 
sbi->s_buddy_blocks[i]->bb_md_cur = NULL; -+ sbi->s_buddy_blocks[i]->bb_tid = 0; ++ memset(sbi->s_group_info[i], 0, len); ++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, ++ &sbi->s_group_info[i]->bb_state); ++ sbi->s_group_info[i]->bb_free = ++ le16_to_cpu(desc->bg_free_blocks_count); + } + -+ if (journal_start_commit(sbi->s_journal, &target)) -+ log_wait_commit(sbi->s_journal, target); -+ -+out2: -+ dput(db); -+out: -+ return err; ++ return 0; + +err_out: -+ return err; ++ while (--i >= 0) ++ kfree(sbi->s_group_info[i]); ++ iput(sbi->s_buddy_cache); ++ ++ return -ENOMEM; +} + -+int ext3_mb_write_descriptors(struct super_block *sb) ++int ext3_mb_init(struct super_block *sb, int needs_recovery) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_buddy e3b; -+ int ret = 0, i, err; ++ struct inode *root = sb->s_root->d_inode; ++ unsigned i, offset, max; ++ struct dentry *dentry; + -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ if (sbi->s_buddy_blocks[i] == NULL) -+ continue; ++ if (!test_opt(sb, MBALLOC)) ++ return 0; + -+ err = ext3_mb_load_buddy(sb, i, &e3b); -+ if (err == 0) { -+ ext3_mb_update_descr(&e3b); -+ ext3_mb_release_desc(&e3b); -+ } else -+ ret = err; ++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); ++ ++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_offsets == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ return -ENOMEM; + } -+ return ret; ++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_maxs == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ ++ /* order 0 is regular bitmap */ ++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; ++ sbi->s_mb_offsets[0] = 0; ++ ++ i = 1; ++ offset = 0; ++ max = sb->s_blocksize << 2; ++ do { ++ sbi->s_mb_offsets[i] = offset; ++ sbi->s_mb_maxs[i] = max; ++ offset += 1 << (sb->s_blocksize_bits - i); ++ max = max >> 1; ++ i++; ++ } while (i <= sb->s_blocksize_bits + 1); ++ ++ ++ /* init file for buddy data */ ++ if ((i = 
ext3_mb_init_backend(sb))) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return i; ++ } ++ ++ spin_lock_init(&sbi->s_reserve_lock); ++ spin_lock_init(&sbi->s_md_lock); ++ INIT_LIST_HEAD(&sbi->s_active_transaction); ++ INIT_LIST_HEAD(&sbi->s_closed_transaction); ++ INIT_LIST_HEAD(&sbi->s_committed_transaction); ++ spin_lock_init(&sbi->s_bal_lock); ++ ++ /* remove old on-disk buddy file */ ++ down(&root->i_sem); ++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy")); ++ if (dentry->d_inode != NULL) { ++ i = vfs_unlink(root, dentry); ++ if (i != 0) ++ printk("EXT3-fs: can't remove .buddy file: %d\n", i); ++ } ++ dput(dentry); ++ up(&root->i_sem); ++ ++ ext3_mb_history_init(sb); ++ ++ printk("EXT3-fs: mballoc enabled\n"); ++ return 0; +} + +int ext3_mb_release(struct super_block *sb) @@ -1756,78 +2211,40 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + spin_unlock(&sbi->s_md_lock); + ext3_mb_free_committed_blocks(sb); + -+ if (sbi->s_buddy_blocks) { -+ ext3_mb_write_descriptors(sb); ++ if (sbi->s_group_info) { + for (i = 0; i < sbi->s_groups_count; i++) { -+ if (sbi->s_buddy_blocks[i] == NULL) ++ if (sbi->s_group_info[i] == NULL) + continue; -+ kfree(sbi->s_buddy_blocks[i]); ++ kfree(sbi->s_group_info[i]); + } -+ kfree(sbi->s_buddy_blocks); -+ } -+ if (sbi->s_buddy) -+ iput(sbi->s_buddy); ++ kfree(sbi->s_group_info); ++ } ++ if (sbi->s_mb_offsets) ++ kfree(sbi->s_mb_offsets); ++ if (sbi->s_mb_maxs) ++ kfree(sbi->s_mb_maxs); ++ if (sbi->s_buddy_cache) ++ iput(sbi->s_buddy_cache); + if (sbi->s_blocks_reserved) + printk("ext3-fs: %ld blocks being reserved at umount!\n", + sbi->s_blocks_reserved); + if (ext3_mb_stats) { -+ printk("EXT3-fs: mballoc: %lu blocks %lu reqs " -+ "(%lu success)\n", sbi->s_bal_allocated, -+ sbi->s_bal_reqs, sbi->s_bal_success); -+ printk("EXT3-fs: mballoc: %lu extents scanned, " -+ "%lu goal hits, %lu breaks\n", sbi->s_bal_ex_scanned, -+ sbi->s_bal_goals, sbi->s_bal_breaks); -+ } -+ -+ 
return 0; -+} -+ -+int ext3_mb_init(struct super_block *sb, int needs_recovery) -+{ -+ struct ext3_buddy e3b; -+ int i, err, created; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ /* init file for buddy data */ -+ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ if ((err = ext3_mb_init_backend(sb, &created))) -+ return err; -+ -+repeat: -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { -+ err = ext3_mb_load_buddy(sb, i, &e3b); -+ if (err) { -+ /* FIXME: release backend */ -+ return err; -+ } -+ if (created || needs_recovery) -+ ext3_mb_generate_buddy(&e3b); -+ else -+ err = ext3_mb_load_descr(&e3b); -+ ext3_mb_release_desc(&e3b); -+ if (err == -ENODATA) { -+ created = 1; -+ goto repeat; -+ } -+ } -+ if (created || needs_recovery) -+ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n", -+ EXT3_SB(sb)->s_groups_count); -+ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); -+ spin_lock_init(&EXT3_SB(sb)->s_md_lock); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); -+ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ -+ spin_lock_init(&EXT3_SB(sb)->s_bal_lock); -+ if (ext3_mb_stats) { -+ printk("EXT3-fs: mballoc enabled (stats)\n"); -+ } else { -+ printk("EXT3-fs: mballoc enabled\n"); -+ } ++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", ++ atomic_read(&sbi->s_bal_allocated), ++ atomic_read(&sbi->s_bal_reqs), ++ atomic_read(&sbi->s_bal_success)); ++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " ++ "%u 2^N hits, %u breaks\n", ++ atomic_read(&sbi->s_bal_ex_scanned), ++ atomic_read(&sbi->s_bal_goals), ++ atomic_read(&sbi->s_bal_2orders), ++ atomic_read(&sbi->s_bal_breaks)); ++ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", ++ sbi->s_mb_buddies_generated++, ++ sbi->s_mb_generation_time); ++ } ++ ++ ext3_mb_history_release(sb); + + return 0; +} @@ -1874,8 +2291,11 @@ Index: 
linux-2.6.9/fs/ext3/mballoc.c + mb_debug("\n"); + ext3_unlock_group(sb, md->group); + ++ /* balance refcounts from ext3_mb_free_metadata() */ ++ page_cache_release(e3b.bd_buddy_page); ++ page_cache_release(e3b.bd_bitmap_page); ++ + kfree(md); -+ ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + + } while (md); @@ -1892,7 +2312,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + /* new transaction! time to close last one and free blocks for + * committed transaction. we know that only transaction can be + * active, so previos transaction can be being logged and we -+ * know that transaction before previous is known to be alreade ++ * know that transaction before previous is known to be already + * logged. this means that now we may free blocks freed in all + * transactions before previous one. hope I'm clear enough ... */ + @@ -1915,12 +2335,15 @@ Index: linux-2.6.9/fs/ext3/mballoc.c +int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, + int group, int block, int count) +{ -+ struct ext3_buddy_group_blocks *db = e3b->bd_bd; ++ struct ext3_group_info *db = e3b->bd_info; + struct super_block *sb = e3b->bd_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_free_metadata *md; + int i; + ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ + ext3_lock_group(sb, group); + for (i = 0; i < count; i++) { + md = db->bb_md_cur; @@ -1942,6 +2365,12 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + spin_lock(&sbi->s_md_lock); + list_add(&md->list, &sbi->s_active_transaction); + spin_unlock(&sbi->s_md_lock); ++ /* protect buddy cache from being freed, ++ * otherwise we'll refresh it from ++ * on-disk bitmap and lose not-yet-available ++ * blocks */ ++ page_cache_get(e3b->bd_buddy_page); ++ page_cache_get(e3b->bd_bitmap_page); + db->bb_md_cur = md; + db->bb_tid = handle->h_transaction->t_tid; + mb_debug("new md 0x%p for group %u\n", @@ -2053,12 +2482,13 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + if (err) + goto error_return; + -+ 
if (unlikely(ext3_mb_aggressive)) { ++#ifdef AGGRESSIVE_CHECK ++ { + int i; + for (i = 0; i < count; i++) + J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); + } -+ ++#endif + mb_clear_bits(bitmap_bh->b_data, bit, count); + + /* We dirtied the bitmap block */ @@ -2081,7 +2511,6 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); + -+ ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + + *freed = count; @@ -2147,50 +2576,30 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + return ret; +} + -+void ext3_free_blocks(handle_t *handle, struct inode *inode, -+ unsigned long block, unsigned long count, int metadata) ++ ++void ext3_free_blocks(handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count, int metadata) +{ ++ struct super_block *sb; + int freed; + -+ if (!test_opt(inode->i_sb, MBALLOC) || -+ EXT3_SB(inode->i_sb)->s_buddy_blocks == NULL) -+ ext3_free_blocks_sb(handle, inode->i_sb, block, count, &freed); ++ sb = inode->i_sb; ++ if (!test_opt(sb, MBALLOC)) ++ ext3_free_blocks_sb(handle, sb, block, count, &freed); + else -+ ext3_mb_free_blocks(handle, inode, block,count,metadata,&freed); -+ ++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); + if (freed) + DQUOT_FREE_BLOCK(inode, freed); + return; +} -Index: linux-2.6.5-7.201/fs/ext3/proc.c -=================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/proc.c 2005-10-13 19:40:57.851699336 +0400 -+++ linux-2.6.5-7.201/fs/ext3/proc.c 2005-10-14 09:02:36.000000000 +0400 -@@ -0,0 +1,195 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ + +#define EXT3_ROOT "ext3" -+#define EXT3_MB_AGGRESSIVE_NAME "mb_aggressive" +#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" ++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" + -+ -+static 
struct proc_dir_entry *proc_root_ext3; -+ -+ -+static int ext3_mb_aggressive_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) ++static int ext3_mb_stats_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) +{ + int len; + @@ -2198,19 +2607,19 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + if (off != 0) + return 0; + -+ len = sprintf(page, "%ld\n", ext3_mb_aggressive); ++ len = sprintf(page, "%ld\n", ext3_mb_stats); + *start = page; + return len; +} + -+static int ext3_mb_aggressive_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) ++static int ext3_mb_stats_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) +{ + char str[32]; + + if (count >= sizeof(str)) { + printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", -+ EXT3_MB_AGGRESSIVE_NAME, sizeof(str)); ++ EXT3_MB_STATS_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } + @@ -2218,12 +2627,12 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + return -EFAULT; + + /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ ext3_mb_aggressive = (simple_strtol(str, NULL, 0) != 0); ++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); + return count; +} + -+static int ext3_mb_stats_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) ++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) +{ + int len; + @@ -2231,19 +2640,20 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + if (off != 0) + return 0; + -+ len = sprintf(page, "%ld\n", ext3_mb_stats); ++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); + *start = page; + return len; +} + -+static int ext3_mb_stats_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) ++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) +{ + char str[32]; ++ long value; + + if (count >= sizeof(str)) { + 
printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", -+ EXT3_MB_STATS_NAME, sizeof(str)); ++ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } + @@ -2251,12 +2661,17 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + return -EFAULT; + + /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_max_to_scan = value; ++ + return count; +} + -+static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) ++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) +{ + int len; + @@ -2264,20 +2679,20 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + if (off != 0) + return 0; + -+ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); ++ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan); + *start = page; + return len; +} + -+static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) ++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) +{ + char str[32]; + long value; + + if (count >= sizeof(str)) { + printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", -+ EXT3_MB_MAX_TO_SCAN_NAME, sizeof(str)); ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } + @@ -2286,47 +2701,32 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + + /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ + value = simple_strtol(str, NULL, 0); -+ if (value <= 0) ++ if (value <= 0) + return -ERANGE; + -+ ext3_mb_max_to_scan = value; ++ ext3_mb_min_to_scan = value; + + return count; +} + +int __init init_ext3_proc(void) +{ -+ struct proc_dir_entry *proc_ext3_mb_aggressive; + struct proc_dir_entry *proc_ext3_mb_stats; + struct proc_dir_entry *proc_ext3_mb_max_to_scan; ++ struct proc_dir_entry 
*proc_ext3_mb_min_to_scan; + + proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); + if (proc_root_ext3 == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT); -+ return -EIO; -+ } -+ -+ /* Initialize EXT3_MB_AGGRESSIVE_NAME */ -+ proc_ext3_mb_aggressive = create_proc_entry(EXT3_MB_AGGRESSIVE_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_aggressive == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_AGGRESSIVE_NAME); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT); + return -EIO; + } + -+ proc_ext3_mb_aggressive->data = NULL; -+ proc_ext3_mb_aggressive->read_proc = ext3_mb_aggressive_read; -+ proc_ext3_mb_aggressive->write_proc = ext3_mb_aggressive_write; -+ + /* Initialize EXT3_MB_STATS_NAME */ + proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_stats == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_STATS_NAME); -+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_STATS_NAME); + remove_proc_entry(EXT3_ROOT, proc_root_fs); + return -EIO; + } @@ -2337,13 +2737,12 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + + /* Initialize EXT3_MAX_TO_SCAN_NAME */ + proc_ext3_mb_max_to_scan = create_proc_entry( -+ EXT3_MB_MAX_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ EXT3_MB_MAX_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_max_to_scan == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_STATS_NAME); ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_MAX_TO_SCAN_NAME); + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); + return -EIO; + } @@ 
-2352,131 +2751,43 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; + proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; + ++ /* Initialize EXT3_MIN_TO_SCAN_NAME */ ++ proc_ext3_mb_min_to_scan = create_proc_entry( ++ EXT3_MB_MIN_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_min_to_scan == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_MIN_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_min_to_scan->data = NULL; ++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; ++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; ++ + return 0; +} + +void exit_ext3_proc(void) +{ -+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); +} -Index: linux-2.6.9/fs/ext3/inode.c -=================================================================== ---- linux-2.6.9.orig/fs/ext3/inode.c 2005-10-14 09:10:12.000000000 +0400 -+++ linux-2.6.9/fs/ext3/inode.c 2005-10-14 09:10:13.000000000 +0400 -@@ -572,7 +572,7 @@ - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < keys; i++) -- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); - return err; - } - -@@ -673,7 +673,7 @@ - if (err == -EAGAIN) - for (i = 0; i < num; i++) - ext3_free_blocks(handle, inode, -- le32_to_cpu(where[i].key), 1); -+ le32_to_cpu(where[i].key), 1, 1); - return err; - } - -@@ -1831,7 +1831,7 @@ - } - } - -- ext3_free_blocks(handle, inode, block_to_free, count); 
-+ ext3_free_blocks(handle, inode, block_to_free, count, 1); - } - - /** -@@ -2004,7 +2004,7 @@ - ext3_journal_test_restart(handle, inode); - } - -- ext3_free_blocks(handle, inode, nr, 1); -+ ext3_free_blocks(handle, inode, nr, 1, 1); - - if (parent_bh) { - /* -Index: linux-2.6.9/fs/ext3/super.c +Index: linux-2.6.9-full/fs/ext3/Makefile =================================================================== ---- linux-2.6.9.orig/fs/ext3/super.c 2005-10-14 09:10:12.000000000 +0400 -+++ linux-2.6.9/fs/ext3/super.c 2005-10-14 09:10:31.000000000 +0400 -@@ -394,6 +394,7 @@ - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_mb_release(sb); - ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); -@@ -590,7 +591,7 @@ - Opt_commit, Opt_journal_update, Opt_journal_inum, - Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, -- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, -+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_mballoc, Opt_mbfactor, - Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug, - }; -@@ -644,6 +645,8 @@ - {Opt_iopen_nopriv, "iopen_nopriv"}, - {Opt_extents, "extents"}, - {Opt_extdebug, "extdebug"}, -+ {Opt_mballoc, "mballoc"}, -+ {Opt_mballoc, "mbfactor=%u"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL}, - {Opt_resize, "resize"}, -@@ -954,6 +957,16 @@ - case Opt_extdebug: - set_opt (sbi->s_mount_opt, EXTDEBUG); - break; -+ case Opt_mballoc: -+ set_opt (sbi->s_mount_opt, MBALLOC); -+ break; -+ case Opt_mbfactor: -+ if (match_int(&args[0], &option)) -+ return 0; -+ if (option < 0) -+ return 0; -+ sbi->s_mb_factor = option; -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1637,6 +1650,7 @@ - ext3_count_dirs(sb)); - - ext3_ext_init(sb); -+ ext3_mb_init(sb, needs_recovery); +--- linux-2.6.9-full.orig/fs/ext3/Makefile 2005-12-16 23:16:41.000000000 +0300 
++++ linux-2.6.9-full/fs/ext3/Makefile 2005-12-16 23:16:42.000000000 +0300 +@@ -6,7 +6,7 @@ - return 0; - -@@ -2419,7 +2433,13 @@ - - static int __init init_ext3_fs(void) - { -- int err = init_ext3_xattr(); -+ int err; -+ -+ err = init_ext3_proc(); -+ if (err) -+ return err; -+ -+ err = init_ext3_xattr(); - if (err) - return err; - err = init_inodecache(); -@@ -2441,6 +2461,7 @@ - unregister_filesystem(&ext3_fs_type); - destroy_inodecache(); - exit_ext3_xattr(); -+ exit_ext3_proc(); - } + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o ++ extents.o mballoc.o - int ext3_prep_san_write(struct inode *inode, long *blocks, + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff --git a/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch b/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch new file mode 100644 index 0000000..62bf156 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch @@ -0,0 +1,163 @@ +diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c +--- orig/fs/ext3/namei.c 2005-10-12 13:58:19.000000000 -0700 ++++ patch/fs/ext3/namei.c 2005-10-12 14:00:33.000000000 -0700 +@@ -1603,11 +1603,17 @@ + static inline void ext3_inc_count(handle_t *handle, struct inode *inode) + { + inode->i_nlink++; ++ if (is_dx(inode) && inode->i_nlink > 1) { ++ /* limit is 16-bit i_links_count */ ++ if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2) ++ inode->i_nlink = 1; ++ } + } + + static inline void ext3_dec_count(handle_t *handle, struct inode *inode) + { +- inode->i_nlink--; ++ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) ++ inode->i_nlink--; + } + + static int ext3_add_nondir(handle_t *handle, +@@ -1706,7 +1712,7 @@ + struct ext3_dir_entry_2 * de; + int err, retries = 0; + +- if (dir->i_nlink >= EXT3_LINK_MAX) ++ if (EXT3_DIR_LINK_MAXED(dir)) + return -EMLINK; + + retry: +@@ 
-1729,7 +1735,7 @@ + inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { +- inode->i_nlink--; /* is this nlink == 0? */ ++ ext3_dec_count(handle, inode); /* is this nlink == 0? */ + ext3_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; +@@ -1761,7 +1767,7 @@ + iput (inode); + goto out_stop; + } +- dir->i_nlink++; ++ ext3_inc_count(handle, dir); + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + d_instantiate(dentry, inode); +@@ -2026,10 +2032,10 @@ + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_rmdir; +- if (inode->i_nlink != 2) +- ext3_warning (inode->i_sb, "ext3_rmdir", +- "empty directory has nlink!=2 (%d)", +- inode->i_nlink); ++ if (!EXT3_DIR_LINK_EMPTY(inode)) ++ ext3_warning(inode->i_sb, "ext3_rmdir", ++ "empty directory has too many links (%d)", ++ inode->i_nlink); + inode->i_version++; + inode->i_nlink = 0; + /* There's no need to set i_disksize: the fact that i_nlink is +@@ -2039,7 +2045,7 @@ + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); +- dir->i_nlink--; ++ ext3_dec_count(handle, dir); + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + +@@ -2090,7 +2096,7 @@ + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); +- inode->i_nlink--; ++ ext3_dec_count(handle, inode); + if (!inode->i_nlink) + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime; +@@ -2165,7 +2171,7 @@ + struct inode *inode = old_dentry->d_inode; + int err, retries = 0; + +- if (inode->i_nlink >= EXT3_LINK_MAX) ++ if (EXT3_DIR_LINK_MAXED(inode)) + return -EMLINK; + + retry: +@@ -2252,8 +2258,8 @@ + if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) + goto end_rename; + retval = -EMLINK; +- if (!new_inode && new_dir!=old_dir && +- 
new_dir->i_nlink >= EXT3_LINK_MAX) ++ if (!new_inode && new_dir != old_dir && ++ EXT3_DIR_LINK_MAXED(new_dir)) + goto end_rename; + } + if (!new_bh) { +@@ -2310,7 +2316,7 @@ + } + + if (new_inode) { +- new_inode->i_nlink--; ++ ext3_dec_count(handle, new_inode); + new_inode->i_ctime = CURRENT_TIME_SEC; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; +@@ -2321,11 +2327,13 @@ + PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); + BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, dir_bh); +- old_dir->i_nlink--; ++ ext3_dec_count(handle, old_dir); + if (new_inode) { +- new_inode->i_nlink--; ++ /* checked empty_dir above, can't have another parent, ++ * ext3_dec_count() won't work for many-linked dirs */ ++ new_inode->i_nlink = 0; + } else { +- new_dir->i_nlink++; ++ ext3_inc_count(handle, new_dir); + ext3_update_dx_flag(new_dir); + ext3_mark_inode_dirty(handle, new_dir); + } + +Index: linux-2.6.7/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.7.orig/include/linux/ext3_fs.h 2004-06-15 23:19:36.000000000 -0600 ++++ linux-2.6.7/include/linux/ext3_fs.h 2004-08-20 17:41:27.000000000 -0600 +@@ -79,7 +81,7 @@ + /* + * Maximal count of links to a file + */ +-#define EXT3_LINK_MAX 32000 ++#define EXT3_LINK_MAX 65000 + + /* + * Macro-instructions used to manage several block sizes +@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 { + */ + + #ifdef CONFIG_EXT3_INDEX +- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ +- EXT3_FEATURE_COMPAT_DIR_INDEX) && \ ++#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ ++ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ + (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) +-#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) +-#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) ++#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX) ++#define 
EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \ ++ (is_dx(dir) && (dir)->i_nlink == 1)) + #else + #define is_dx(dir) 0 +-#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) + #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) + #endif + diff --git a/ldiskfs/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch new file mode 100644 index 0000000..57898d5 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch @@ -0,0 +1,29 @@ +Index: linux-stage/fs/ext3/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/ialloc.c 2005-06-26 10:59:43.048185981 +0200 ++++ linux-stage/fs/ext3/ialloc.c 2005-06-26 11:01:21.317716027 +0200 +@@ -775,7 +775,6 @@ + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); +- cond_resched(); + } + return desc_count; + #endif +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2005-06-26 10:59:43.205412542 +0200 ++++ linux-stage/fs/ext3/super.c 2005-06-26 11:02:29.599941754 +0200 +@@ -2236,11 +2232,9 @@ + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. 
+ */ +- for (i = 0; i < ngroups; i++) { ++ for (i = 0; i < ngroups; i++) + overhead += ext3_bg_has_super(sb, i) + + ext3_bg_num_gdb(sb, i); +- cond_resched(); +- } + + /* + * Every block group has an inode bitmap, a block diff --git a/ldiskfs/kernel_patches/patches/iopen-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/iopen-2.6-rhel4.patch index cbb024a..98dbca4 100644 --- a/ldiskfs/kernel_patches/patches/iopen-2.6-rhel4.patch +++ b/ldiskfs/kernel_patches/patches/iopen-2.6-rhel4.patch @@ -7,7 +7,7 @@ Index: linux-stage/fs/ext3/Makefile obj-$(CONFIG_EXT3_FS) += ext3.o -ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -+ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\ ++ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o @@ -36,7 +36,7 @@ Index: linux-stage/fs/ext3/iopen.c =================================================================== --- linux-stage.orig/fs/ext3/iopen.c 2005-02-25 14:41:01.017787968 +0200 +++ linux-stage/fs/ext3/iopen.c 2005-02-25 14:41:01.045783712 +0200 -@@ -0,0 +1,277 @@ +@@ -0,0 +1,278 @@ +/* + * linux/fs/ext3/iopen.c + * @@ -124,7 +124,7 @@ Index: linux-stage/fs/ext3/iopen.c + } + + assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(d_unhashed(dentry)); /* d_rehash */ ++ assert(d_unhashed(dentry)); /* d_rehash */ + + /* preferrably return a connected dentry */ + spin_lock(&dcache_lock); @@ -188,7 +188,7 @@ Index: linux-stage/fs/ext3/iopen.c + assert(dentry->d_inode == NULL); + assert(list_empty(&dentry->d_alias)); /* d_instantiate */ + if (rehash) -+ assert(d_unhashed(dentry)); /* d_rehash */ ++ assert(d_unhashed(dentry)); /* d_rehash */ + assert(list_empty(&dentry->d_subdirs)); + + spin_lock(&dcache_lock); @@ -214,8 +214,9 @@ Index: linux-stage/fs/ext3/iopen.c + goto do_instantiate; + + /* Move the goal to the de hash queue */ -+ 
goal->d_flags &= ~ DCACHE_DISCONNECTED; ++ goal->d_flags &= ~DCACHE_DISCONNECTED; + security_d_instantiate(goal, inode); ++ __d_drop(dentry); + __d_rehash(dentry, 0); + __d_move(goal, dentry); + spin_unlock(&dcache_lock); @@ -410,7 +411,7 @@ Index: linux-stage/fs/ext3/namei.c - err = ext3_add_nondir(handle, dentry, inode); + err = ext3_add_link(handle, dentry, inode); -+ ext3_orphan_del(handle,inode); ++ ext3_orphan_del(handle, inode); ext3_journal_stop(handle); if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; @@ -419,20 +420,20 @@ Index: linux-stage/fs/ext3/super.c --- linux-stage.orig/fs/ext3/super.c 2005-02-25 14:37:30.987717392 +0200 +++ linux-stage/fs/ext3/super.c 2005-02-25 14:44:50.495901992 +0200 @@ -586,6 +586,7 @@ - Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, -+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, }; + static match_table_t tokens = { @@ -633,6 +634,9 @@ {Opt_ignore, "noquota"}, {Opt_ignore, "quota"}, {Opt_ignore, "usrquota"}, -+ {Opt_iopen, "iopen"}, -+ {Opt_noiopen, "noiopen"}, -+ {Opt_iopen_nopriv, "iopen_nopriv"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, @@ -463,8 +464,8 @@ Index: linux-stage/include/linux/ext3_fs.h #define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT3_MOUNT_BARRIER 0x10000 /* Use block barriers */ #define EXT3_MOUNT_RESERVATION 0x20000 /* Preallocation */ -+#define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ ++#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ /* 
Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git a/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch b/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch index 9aba4f6..1c5e900 100644 --- a/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch @@ -1,15 +1,7 @@ - fs/ext3/inode.c | 3 - fs/ext3/iopen.c | 239 +++++++++++++++++++++++++++++++++++++ - fs/ext3/iopen.h | 15 ++ - fs/ext3/namei.c | 13 ++ - fs/ext3/super.c | 17 ++ - include/linux/ext3_fs.h | 2 - 7 files changed, 304 insertions(+), 1 deletion(-) - -Index: linux-2.6.5-sles9/fs/ext3/Makefile +Index: linux-stage/fs/ext3/Makefile =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2004-04-04 07:36:18.000000000 +0400 -+++ linux-2.6.5-sles9/fs/ext3/Makefile 2004-11-09 02:18:27.604914376 +0300 +--- linux-stage.orig/fs/ext3/Makefile 2005-02-25 14:31:53.151076368 +0200 ++++ linux-stage/fs/ext3/Makefile 2005-02-25 14:41:51.259150120 +0200 @@ -4,7 +4,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o @@ -19,10 +11,10 @@ Index: linux-2.6.5-sles9/fs/ext3/Makefile ioctl.o namei.o super.o symlink.o hash.o ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o -Index: linux-2.6.5-sles9/fs/ext3/inode.c +Index: linux-stage/fs/ext3/inode.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2004-11-09 02:15:44.739673656 +0300 -+++ linux-2.6.5-sles9/fs/ext3/inode.c 2004-11-09 02:18:27.608913768 +0300 +--- linux-stage.orig/fs/ext3/inode.c 2005-02-25 14:37:30.983718000 +0200 ++++ linux-stage/fs/ext3/inode.c 2005-02-25 14:47:42.069818792 +0200 @@ -37,6 +37,7 @@ #include #include @@ -31,7 +23,7 @@ Index: linux-2.6.5-sles9/fs/ext3/inode.c #include "acl.h" /* -@@ -2402,6 +2403,9 @@ +@@ -2408,6 +2409,9 @@ #endif ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; @@ -41,11 +33,11 @@ Index: 
linux-2.6.5-sles9/fs/ext3/inode.c if (ext3_get_inode_loc(inode, &iloc, 0)) goto bad_inode; bh = iloc.bh; -Index: linux-2.6.5-sles9/fs/ext3/iopen.c +Index: linux-stage/fs/ext3/iopen.c =================================================================== --- linux-2.6.5-sles9.orig/fs/ext3/iopen.c 2003-01-30 13:24:37.000000000 +0300 +++ linux-2.6.5-sles9/fs/ext3/iopen.c 2004-11-09 02:18:27.611913312 +0300 -@@ -0,0 +1,275 @@ +@@ -0,0 +1,278 @@ +/* + * linux/fs/ext3/iopen.c + * @@ -133,7 +125,7 @@ Index: linux-2.6.5-sles9/fs/ext3/iopen.c + } + + assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(d_unhashed(dentry)); /* d_rehash */ ++ assert(d_unhashed(dentry)); /* d_rehash */ + + /* preferrably return a connected dentry */ + spin_lock(&dcache_lock); @@ -146,7 +138,9 @@ Index: linux-2.6.5-sles9/fs/ext3/iopen.c + alternate = list_entry(inode->i_dentry.next, + struct dentry, d_alias); + dget_locked(alternate); ++ spin_lock(&alternate->d_lock); + alternate->d_vfs_flags |= DCACHE_REFERENCED; ++ spin_unlock(&alternate->d_lock); + iput(inode); + spin_unlock(&dcache_lock); + return alternate; @@ -221,8 +215,9 @@ Index: linux-2.6.5-sles9/fs/ext3/iopen.c + goto do_instantiate; + + /* Move the goal to the de hash queue */ -+ goal->d_flags &= ~ DCACHE_DISCONNECTED; ++ goal->d_flags &= ~DCACHE_DISCONNECTED; + security_d_instantiate(goal, inode); ++ __d_drop(dentry); + __d_rehash(dentry, 0); + __d_move(goal, dentry); + spin_unlock(&dcache_lock); @@ -321,10 +316,10 @@ Index: linux-2.6.5-sles9/fs/ext3/iopen.c + + return 1; +} -Index: linux-2.6.5-sles9/fs/ext3/iopen.h +Index: linux-stage/fs/ext3/iopen.h =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/iopen.h 2003-01-30 13:24:37.000000000 +0300 -+++ linux-2.6.5-sles9/fs/ext3/iopen.h 2004-11-09 02:18:27.613913008 +0300 +--- linux-stage.orig/fs/ext3/iopen.h 2005-02-25 14:41:01.017787968 +0200 ++++ linux-stage/fs/ext3/iopen.h 2005-02-25 14:41:01.045783712 +0200 @@ -0,0 
+1,15 @@ +/* + * iopen.h @@ -341,10 +336,10 @@ Index: linux-2.6.5-sles9/fs/ext3/iopen.h +extern int ext3_iopen_get_inode(struct inode *inode); +extern struct dentry *iopen_connect_dentry(struct dentry *dentry, + struct inode *inode, int rehash); -Index: linux-2.6.5-sles9/fs/ext3/namei.c +Index: linux-stage/fs/ext3/namei.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/namei.c 2004-11-09 02:15:44.614692656 +0300 -+++ linux-2.6.5-sles9/fs/ext3/namei.c 2004-11-09 02:18:27.616912552 +0300 +--- linux-stage.orig/fs/ext3/namei.c 2005-02-25 14:37:28.975023368 +0200 ++++ linux-stage/fs/ext3/namei.c 2005-02-25 14:46:43.090784968 +0200 @@ -37,6 +37,7 @@ #include #include @@ -353,7 +348,7 @@ Index: linux-2.6.5-sles9/fs/ext3/namei.c #include "acl.h" /* -@@ -979,6 +980,9 @@ +@@ -980,6 +981,9 @@ if (dentry->d_name.len > EXT3_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -363,7 +358,7 @@ Index: linux-2.6.5-sles9/fs/ext3/namei.c bh = ext3_find_entry(dentry, &de); inode = NULL; if (bh) { -@@ -989,10 +993,8 @@ +@@ -990,10 +994,8 @@ if (!inode) return ERR_PTR(-EACCES); } @@ -376,7 +371,7 @@ Index: linux-2.6.5-sles9/fs/ext3/namei.c } -@@ -2029,10 +2031,6 @@ +@@ -2037,10 +2039,6 @@ inode->i_nlink); inode->i_version++; inode->i_nlink = 0; @@ -387,7 +382,7 @@ Index: linux-2.6.5-sles9/fs/ext3/namei.c ext3_orphan_add(handle, inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); -@@ -2152,6 +2150,23 @@ +@@ -2163,6 +2161,23 @@ return err; } @@ -411,40 +406,39 @@ Index: linux-2.6.5-sles9/fs/ext3/namei.c static int ext3_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { -@@ -2175,7 +2190,8 @@ +@@ -2186,7 +2201,8 @@ ext3_inc_count(handle, inode); atomic_inc(&inode->i_count); - err = ext3_add_nondir(handle, dentry, inode); + err = ext3_add_link(handle, dentry, inode); -+ ext3_orphan_del(handle,inode); ++ ext3_orphan_del(handle, inode); ext3_journal_stop(handle); 
if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; -Index: linux-2.6.5-sles9/fs/ext3/super.c +Index: linux-stage/fs/ext3/super.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/super.c 2004-11-09 02:15:44.743673048 +0300 -+++ linux-2.6.5-sles9/fs/ext3/super.c 2004-11-09 02:18:27.620911944 +0300 -@@ -534,7 +534,7 @@ - Opt_reservation, Opt_noreservation, Opt_noload, - Opt_commit, Opt_journal_update, Opt_journal_inum, +--- linux-stage.orig/fs/ext3/super.c 2005-02-25 14:37:30.987717392 +0200 ++++ linux-stage/fs/ext3/super.c 2005-02-25 14:44:50.495901992 +0200 +@@ -586,6 +586,7 @@ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, -- Opt_ignore, Opt_barrier, -+ Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_ignore, Opt_barrier, Opt_err, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, }; -@@ -577,6 +577,9 @@ + static match_table_t tokens = { +@@ -633,6 +634,9 @@ + {Opt_ignore, "noquota"}, {Opt_ignore, "quota"}, {Opt_ignore, "usrquota"}, - {Opt_barrier, "barrier=%u"}, + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_barrier, "barrier=%u"}, {Opt_err, NULL} }; - -@@ -778,6 +781,18 @@ +@@ -914,6 +918,18 @@ else clear_opt(sbi->s_mount_opt, BARRIER); break; @@ -463,16 +457,16 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c case Opt_ignore: break; default: -Index: linux-2.6.5-sles9/include/linux/ext3_fs.h +Index: linux-stage/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:15:44.616692352 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:18:27.622911640 +0300 -@@ -329,6 +329,8 @@ +--- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 14:37:28.977023064 +0200 ++++ linux-stage/include/linux/ext3_fs.h 2005-02-25 14:49:00.569884968 +0200 +@@ -355,6 +355,8 @@ #define EXT3_MOUNT_POSIX_ACL 
0x08000 /* POSIX Access Control Lists */ #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ -+#define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ ++#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/iopen-2.4.19-suse.patch b/ldiskfs/kernel_patches/patches/iopen-2.6.12.patch similarity index 62% rename from lustre/kernel_patches/patches/iopen-2.4.19-suse.patch rename to ldiskfs/kernel_patches/patches/iopen-2.6.12.patch index 3c10d3d..8d456ac 100644 --- a/lustre/kernel_patches/patches/iopen-2.4.19-suse.patch +++ b/ldiskfs/kernel_patches/patches/iopen-2.6.12.patch @@ -1,80 +1,42 @@ - Documentation/filesystems/ext2.txt | 16 ++ - fs/ext3/Makefile | 2 - fs/ext3/inode.c | 4 - fs/ext3/iopen.c | 259 +++++++++++++++++++++++++++++++++++++ - fs/ext3/iopen.h | 13 + - fs/ext3/namei.c | 13 + - fs/ext3/super.c | 11 + - include/linux/ext3_fs.h | 2 - 8 files changed, 318 insertions(+), 2 deletions(-) - -Index: linux-2.4.19.SuSE/Documentation/filesystems/ext2.txt +Index: linux-2.6.12-rc6/fs/ext3/Makefile =================================================================== ---- linux-2.4.19.SuSE.orig/Documentation/filesystems/ext2.txt Wed Jul 11 15:44:45 2001 -+++ linux-2.4.19.SuSE/Documentation/filesystems/ext2.txt Sun Nov 16 01:27:31 2003 -@@ -35,6 +35,22 @@ +--- linux-2.6.12-rc6.orig/fs/ext3/Makefile 2005-06-14 16:00:45.206720992 +0200 ++++ linux-2.6.12-rc6/fs/ext3/Makefile 2005-06-14 16:14:33.595382720 +0200 +@@ -4,7 +4,7 @@ - sb=n Use alternate superblock at this location. 
+ obj-$(CONFIG_EXT3_FS) += ext3.o -+iopen Makes an invisible pseudo-directory called -+ __iopen__ available in the root directory -+ of the filesystem. Allows open-by-inode- -+ number. i.e., inode 3145 can be accessed -+ via /mntpt/__iopen__/3145 -+ -+iopen_nopriv This option makes the iopen directory be -+ world-readable. This may be safer since it -+ allows daemons to run as an unprivileged user, -+ however it significantly changes the security -+ model of a Unix filesystem, since previously -+ all files under a mode 700 directory were not -+ generally avilable even if the -+ permissions on the file itself is -+ world-readable. -+ - grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. +-ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o - -Index: linux-2.4.19.SuSE/fs/ext3/Makefile -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/Makefile Sun Nov 16 00:40:59 2003 -+++ linux-2.4.19.SuSE/fs/ext3/Makefile Sun Nov 16 01:27:31 2003 -@@ -11,7 +11,7 @@ - - export-objs := ext3-exports.o - --obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -+obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o - obj-m := $(O_TARGET) - -Index: linux-2.4.19.SuSE/fs/ext3/inode.c + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +Index: linux-2.6.12-rc6/fs/ext3/inode.c =================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/inode.c Sun Nov 16 01:26:04 2003 -+++ linux-2.4.19.SuSE/fs/ext3/inode.c Sun Nov 16 01:27:31 2003 -@@ -34,6 +34,7 @@ - #include - #include - #include +--- linux-2.6.12-rc6.orig/fs/ext3/inode.c 2005-06-14 16:01:16.272150299 +0200 ++++ linux-2.6.12-rc6/fs/ext3/inode.c 2005-06-14 
16:24:55.686195412 +0200 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" +#include "iopen.h" + #include "acl.h" - /* - * SEARCH_FROM_ZERO forces each block allocation to search from the start -@@ -2350,6 +2351,9 @@ - struct buffer_head *bh; - int block; - -+ if (ext3_iopen_get_inode(inode)) -+ return; -+ - if(ext3_get_inode_loc(inode, &iloc)) + static int ext3_writepage_trans_blocks(struct inode *inode); +@@ -2437,6 +2438,8 @@ + ei->i_default_acl = EXT3_ACL_NOT_CACHED; + #endif + ei->i_block_alloc_info = NULL; ++ if (ext3_iopen_get_inode(inode)) ++ return; + + if (__ext3_get_inode_loc(inode, &iloc, 0)) goto bad_inode; - bh = iloc.bh; -Index: lum/fs/ext3/iopen.c +Index: linux-2.6.12-rc6/fs/ext3/iopen.c =================================================================== ---- lum.orig/fs/ext3/iopen.c 2004-03-09 16:46:37.000000000 -0700 -+++ lum/fs/ext3/iopen.c 2004-03-09 16:48:03.000000000 -0700 -@@ -0,0 +1,285 @@ +--- linux-2.6.12-rc6.orig/fs/ext3/iopen.c 2005-06-14 16:14:33.530929595 +0200 ++++ linux-2.6.12-rc6/fs/ext3/iopen.c 2005-06-14 16:14:33.626632719 +0200 +@@ -0,0 +1,278 @@ +/* + * linux/fs/ext3/iopen.c + * @@ -107,11 +69,12 @@ Index: lum/fs/ext3/iopen.c + +#include +#include -+#include +#include +#include +#include +#include ++#include ++#include +#include "iopen.h" + +#ifndef assert @@ -123,7 +86,8 @@ Index: lum/fs/ext3/iopen.c +/* + * This implements looking up an inode by number. 
+ */ -+static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry) ++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, ++ struct nameidata *nd) +{ + struct inode *inode; + unsigned long ino; @@ -148,7 +112,7 @@ Index: lum/fs/ext3/iopen.c + //ino != EXT3_ACL_IDX_INO && + //ino != EXT3_ACL_DATA_INO && + ino < EXT3_FIRST_INO(dir->i_sb)) || -+ ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) ++ ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) + return ERR_PTR(-ENOENT); + + inode = iget(dir->i_sb, ino); @@ -160,31 +124,33 @@ Index: lum/fs/ext3/iopen.c + } + + assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(list_empty(&dentry->d_hash)); /* d_rehash */ ++ assert(d_unhashed(dentry)); /* d_rehash */ + + /* preferrably return a connected dentry */ + spin_lock(&dcache_lock); + list_for_each(lp, &inode->i_dentry) { + alternate = list_entry(lp, struct dentry, d_alias); -+ assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED)); ++ assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); + } + + if (!list_empty(&inode->i_dentry)) { + alternate = list_entry(inode->i_dentry.next, + struct dentry, d_alias); + dget_locked(alternate); -+ alternate->d_vfs_flags |= DCACHE_REFERENCED; ++ spin_lock(&alternate->d_lock); ++ alternate->d_flags |= DCACHE_REFERENCED; ++ spin_unlock(&alternate->d_lock); + iput(inode); + spin_unlock(&dcache_lock); + return alternate; + } -+ dentry->d_flags |= DCACHE_NFSD_DISCONNECTED; ++ dentry->d_flags |= DCACHE_DISCONNECTED; + + /* d_add(), but don't drop dcache_lock before adding dentry to inode */ + list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ + dentry->d_inode = inode; + -+ __d_rehash(dentry, 0); /* d_rehash */ ++ d_rehash_cond(dentry, 0); /* d_rehash */ + spin_unlock(&dcache_lock); + + return NULL; @@ -198,7 +164,7 @@ Index: lum/fs/ext3/iopen.c +{ + const unsigned char *old_name, *new_name; + -+ memcpy(dentry->d_iname, target->d_iname, 
DNAME_INLINE_LEN); ++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN_MIN); + old_name = target->d_name.name; + new_name = dentry->d_name.name; + if (old_name == target->d_iname) @@ -222,7 +188,7 @@ Index: lum/fs/ext3/iopen.c + assert(dentry->d_inode == NULL); + assert(list_empty(&dentry->d_alias)); /* d_instantiate */ + if (rehash) -+ assert(list_empty(&dentry->d_hash)); /* d_rehash */ ++ assert(d_unhashed(dentry)); /* d_rehash */ + assert(list_empty(&dentry->d_subdirs)); + + spin_lock(&dcache_lock); @@ -235,7 +201,7 @@ Index: lum/fs/ext3/iopen.c + /* preferrably return a connected dentry */ + list_for_each(lp, &inode->i_dentry) { + tmp = list_entry(lp, struct dentry, d_alias); -+ if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) { ++ if (tmp->d_flags & DCACHE_DISCONNECTED) { + assert(tmp->d_alias.next == &inode->i_dentry); + assert(tmp->d_alias.prev == &inode->i_dentry); + goal = tmp; @@ -247,23 +213,12 @@ Index: lum/fs/ext3/iopen.c + if (!goal) + goto do_instantiate; + -+ /* Move the goal to the de hash queue - like d_move() */ -+ goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED; -+ list_del_init(&goal->d_hash); -+ -+ list_del(&goal->d_child); -+ list_del(&dentry->d_child); -+ -+ /* Switch the parents and the names.. 
*/ -+ switch_names(goal, dentry); -+ do_switch(goal->d_parent, dentry->d_parent); -+ do_switch(goal->d_name.len, dentry->d_name.len); -+ do_switch(goal->d_name.hash, dentry->d_name.hash); -+ -+ /* And add them back to the (new) parent lists */ -+ list_add(&goal->d_child, &goal->d_parent->d_subdirs); -+ list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); -+ __d_rehash(goal, 0); ++ /* Move the goal to the de hash queue */ ++ goal->d_flags &= ~DCACHE_DISCONNECTED; ++ security_d_instantiate(goal, inode); ++ __d_drop(dentry); ++ d_rehash_cond(dentry, 0); ++ __d_move(goal, dentry); + spin_unlock(&dcache_lock); + iput(inode); + @@ -275,7 +230,7 @@ Index: lum/fs/ext3/iopen.c + dentry->d_inode = inode; +do_rehash: + if (rehash) -+ __d_rehash(dentry, 0); /* d_rehash */ ++ d_rehash_cond(dentry, 0); /* d_rehash */ + spin_unlock(&dcache_lock); + + return NULL; @@ -346,7 +301,7 @@ Index: lum/fs/ext3/iopen.c + inode->i_atime = CURRENT_TIME; + inode->i_ctime = CURRENT_TIME; + inode->i_mtime = CURRENT_TIME; -+ inode->u.ext3_i.i_dtime = 0; ++ EXT3_I(inode)->i_dtime = 0; + inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size + * (for stat), not the fs block + * size */ @@ -360,10 +315,10 @@ Index: lum/fs/ext3/iopen.c + + return 1; +} -Index: lum/fs/ext3/iopen.h +Index: linux-2.6.12-rc6/fs/ext3/iopen.h =================================================================== ---- lum.orig/fs/ext3/iopen.h 2004-03-09 16:46:37.000000000 -0700 -+++ lum/fs/ext3/iopen.h 2004-03-09 16:48:03.000000000 -0700 +--- linux-2.6.12-rc6.orig/fs/ext3/iopen.h 2005-06-14 16:14:33.534835845 +0200 ++++ linux-2.6.12-rc6/fs/ext3/iopen.h 2005-06-14 16:14:33.633468657 +0200 @@ -0,0 +1,15 @@ +/* + * iopen.h @@ -380,20 +335,19 @@ Index: lum/fs/ext3/iopen.h +extern int ext3_iopen_get_inode(struct inode *inode); +extern struct dentry *iopen_connect_dentry(struct dentry *dentry, + struct inode *inode, int rehash); -Index: linux-2.4.19.SuSE/fs/ext3/namei.c +Index: linux-2.6.12-rc6/fs/ext3/namei.c 
=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/namei.c Sun Nov 16 01:23:20 2003 -+++ linux-2.4.19.SuSE/fs/ext3/namei.c Sun Nov 16 01:27:31 2003 -@@ -36,7 +36,7 @@ - #include - #include - #include -- +--- linux-2.6.12-rc6.orig/fs/ext3/namei.c 2005-06-14 16:01:14.701837819 +0200 ++++ linux-2.6.12-rc6/fs/ext3/namei.c 2005-06-14 16:14:33.644210844 +0200 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" +#include "iopen.h" + #include "acl.h" /* - * define how far ahead to read directories while searching them. -@@ -926,6 +927,9 @@ +@@ -985,6 +986,9 @@ if (dentry->d_name.len > EXT3_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -403,29 +357,31 @@ Index: linux-2.4.19.SuSE/fs/ext3/namei.c bh = ext3_find_entry(dentry, &de); inode = NULL; if (bh) { -@@ -943,8 +948,8 @@ +@@ -995,10 +999,8 @@ + if (!inode) return ERR_PTR(-EACCES); - } } +- if (inode) +- return d_splice_alias(inode, dentry); - d_add(dentry, inode); - return NULL; + + return iopen_connect_dentry(dentry, inode, 1); } - #define S_SHIFT 12 -@@ -1932,10 +1935,6 @@ + +@@ -2042,10 +2044,6 @@ inode->i_nlink); - inode->i_version = ++event; + inode->i_version++; inode->i_nlink = 0; - /* There's no need to set i_disksize: the fact that i_nlink is - * zero will ensure that the right thing happens during any - * recovery. 
*/ - inode->i_size = 0; ext3_orphan_add(handle, inode); - dir->i_nlink--; - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; -@@ -2086,6 +2085,23 @@ + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); +@@ -2168,6 +2166,23 @@ return err; } @@ -449,49 +405,67 @@ Index: linux-2.4.19.SuSE/fs/ext3/namei.c static int ext3_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { -@@ -2113,7 +2129,8 @@ +@@ -2191,7 +2206,8 @@ ext3_inc_count(handle, inode); atomic_inc(&inode->i_count); - err = ext3_add_nondir(handle, dentry, inode); + err = ext3_add_link(handle, dentry, inode); + ext3_orphan_del(handle, inode); - ext3_journal_stop(handle, dir); - return err; - } -Index: linux-2.4.19.SuSE/fs/ext3/super.c + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; +Index: linux-2.6.12-rc6/fs/ext3/super.c =================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/super.c Sun Nov 16 01:19:22 2003 -+++ linux-2.4.19.SuSE/fs/ext3/super.c Sun Nov 16 01:27:31 2003 -@@ -864,6 +864,18 @@ - || !strcmp (this_char, "quota") - || !strcmp (this_char, "usrquota")) - /* Don't do anything ;-) */ ; -+ else if (!strcmp (this_char, "iopen")) { +--- linux-2.6.12-rc6.orig/fs/ext3/super.c 2005-06-14 16:01:16.287775299 +0200 ++++ linux-2.6.12-rc6/fs/ext3/super.c 2005-06-14 16:14:33.656906156 +0200 +@@ -590,6 +590,7 @@ + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + }; + + static match_table_t tokens = { +@@ -638,6 +639,9 @@ + {Opt_ignore, "noquota"}, + {Opt_ignore, "quota"}, + {Opt_ignore, "usrquota"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, 
+@@ -921,6 +925,18 @@ + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; ++ case Opt_iopen: + set_opt (sbi->s_mount_opt, IOPEN); + clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ } -+ else if (!strcmp (this_char, "noiopen")) { ++ break; ++ case Opt_noiopen: + clear_opt (sbi->s_mount_opt, IOPEN); + clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ } -+ else if (!strcmp (this_char, "iopen_nopriv")) { ++ break; ++ case Opt_iopen_nopriv: + set_opt (sbi->s_mount_opt, IOPEN); + set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ } - else if (!strcmp (this_char, "journal")) { - /* @@@ FIXME */ - /* Eventually we will want to be able to create -Index: linux-2.4.19.SuSE/include/linux/ext3_fs.h ++ break; + case Opt_ignore: + break; + case Opt_resize: +Index: linux-2.6.12-rc6/include/linux/ext3_fs.h =================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs.h Sun Nov 16 01:25:42 2003 -+++ linux-2.4.19.SuSE/include/linux/ext3_fs.h Sun Nov 16 01:30:05 2003 -@@ -324,6 +324,8 @@ - #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ - #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ - #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ -+#define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ +--- linux-2.6.12-rc6.orig/include/linux/ext3_fs.h 2005-06-14 16:01:14.709650318 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_fs.h 2005-06-14 16:28:38.452794245 +0200 +@@ -358,6 +358,8 @@ + #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ + #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ + #define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */ ++#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git 
a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series index 8e76197..bab81b9 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series @@ -8,6 +8,5 @@ export-ext3-2.6-rhel4.patch ext3-include-fixes-2.6-rhel4.patch ext3-extents-2.6.9-rhel4.patch ext3-mballoc2-2.6.9-rhel4.patch -ext3-nlinks-2.6.7.patch -ext3-htree-dot-2.6.patch +ext3-nlinks-2.6.9.patch ext3-ialloc-2.6.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series new file mode 100644 index 0000000..7d0a383 --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series @@ -0,0 +1,13 @@ +ext3-wantedi-2.6-rhel4.patch +ext3-san-jdike-2.6-suse.patch +iopen-2.6.12.patch +ext3-map_inode_page-2.6-suse.patch +export-ext3-2.6-rhel4.patch +ext3-include-fixes-2.6-rhel4.patch +ext3-extents-2.6.12.patch +ext3-mballoc2-2.6.12.patch +ext3-nlinks-2.6.9.patch +ext3-ialloc-2.6.patch +ext3-remove-cond_resched-calls-2.6.12.patch +ext3-htree-dot-2.6.patch +ext3-external-journal-2.6.12.patch diff --git a/ldiskfs/ldiskfs/Makefile.in b/ldiskfs/ldiskfs/Makefile.in index 92d9b6b..e52e62f 100644 --- a/ldiskfs/ldiskfs/Makefile.in +++ b/ldiskfs/ldiskfs/Makefile.in @@ -11,7 +11,7 @@ ext3_headers := $(wildcard @LINUX@/fs/ext3/*.h) linux_headers := $(wildcard @LINUX@/include/linux/ext3*.h) ext3_sources := $(filter-out %.mod.c,$(wildcard @LINUX@/fs/ext3/*.c)) -new_sources := iopen.c iopen.h extents.c mballoc.c proc.c +new_sources := iopen.c iopen.h extents.c mballoc.c new_headers := ext3_extents.h ldiskfs_patched_sources := $(notdir $(ext3_sources) $(ext3_headers)) $(new_sources) $(new_headers) ldiskfs_sources := $(ldiskfs_patched_sources) diff --git a/ldiskfs/ldiskfs/autoMakefile.am b/ldiskfs/ldiskfs/autoMakefile.am index 0eff073..7e378c2 100644 --- a/ldiskfs/ldiskfs/autoMakefile.am +++ 
b/ldiskfs/ldiskfs/autoMakefile.am @@ -38,7 +38,8 @@ sources: $(ext3_sources) $(ext3_headers) $(linux_headers) $(series) cp $(ext3_sources) $(ext3_headers) $(ext3_extra) linux-stage/fs/ext3 cp $(linux_headers) linux-stage/include/linux if USE_QUILT - cd linux-stage && quilt setup -d ../$(patches) ../$(series) + ln -s ../$(patches) linux-stage/patches + ln -s ../$(series) linux-stage/series cd linux-stage && quilt push -a -q else @echo -n "Applying ext3 patches:" diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 62e376d..441cb5c 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -1,18 +1,15 @@ -tbd Cluster File Systems, Inc. +02-14-2006 Cluster File Systems, Inc. * version 1.4.6 * WIRE PROTOCOL CHANGE. This version of Lustre networking WILL NOT INTEROPERATE with older versions automatically. Please read the user documentation before upgrading any part of a live system. + * WARNING: Lustre networking configuration changes are required with + this release. See https://bugzilla.clusterfs.com/show_bug.cgi?id=10052 + for details. * bug fixes + * Support for newer kernels: 2.6.9-22.0.2.EL (RHEL 4), + 2.6.5-7.244 (SLES 9) - same as 1.4.5.2. -Severity : enhancement -Bugzilla : 8888 -Description: Introduced CReate On Write (CROW) -Details : CROW is improved create approach, which defers OST objects - creates to the time when they realy needed. This is when client - wants to perform first write to file for instance. Or when object - changes some of its attributes stored on OST. This should improve - create rate. Severity : enhancement Bugzilla : 7981/8208 @@ -26,7 +23,7 @@ Details : LNET is new networking infrastructure for Lustre, it includes Severity : enhancement Bugzilla : 7982 -Description: Configuration change for the XT3 +Description: Configuration change for the XT3 The PTLLND is now used to run Lustre over Portals on the XT3 The configure option(s) --with-cray-portals are no longer used. 
Rather --with-portals= is used to @@ -97,6 +94,13 @@ Details : sending a glimpse AST to a liblustre client waits for every AST to time out, as liblustre clients will not respond. Since they cannot cache data we refresh the OST lock LVB from disk instead. +Severity : enhancement +Bugzilla : 7198 +Description: doing an ls at the same time as file IO can be slow +Details : enqueue and other "small" requests can be blocked behind many + large IO requests. Create a new OST IO portal for non-IO + requests so they can be processed faster. + Severity : minor Frequency : rare (only HPUX clients mounting unsupported re-exported NFS vol) Bugzilla : 5781 @@ -212,7 +216,7 @@ Details : Having an LWI_INTR() wait event (interruptible, but no timeout) request was interrupted, and we also didn't break out of the event loop if there was no timeout -Severity : minor +Severity : major Frequency : rare Bugzilla : 5047 Description: data loss during non-page-aligned writes to a single file from @@ -258,8 +262,8 @@ Description: do not expand extent locks acquired on OST-side Details : Modify ldlm_extent_policy() to not expand local locks, acquired by server: they are not cached anyway. -Severity : medium -Frequency : seldom, when mmap is used/files executed from lustre +Severity : major +Frequency : when mmap is used/binaries executed from Lustre Bugzilla : 9482 Description: Unmmap pages before throwing them away from read cache. Details : llap_shrink cache now attempts to unmap pages before discarding @@ -267,6 +271,12 @@ Details : llap_shrink cache now attempts to unmap pages before discarding extra checks that trigger if this unmapping is not done first. Severity : minor +Frequency : rare +Bugzilla : 6034 +Description: lconf didn't resolve symlinks before checking to see whether a + given mountpoint was already in use + +Severity : minor Frequency : when migrating failover services Bugzilla : 6395, 9514 Description: When migrating a subset of services from a node (e.g. 
failback @@ -277,6 +287,312 @@ Details : lconf --force (implied by --failover) sets the global obd_timeout other RPCs to time out too quickly. Do not change the global obd_timeout for force cleanup, only set it for DISCONNECT RPCs. +Severity : enhancement +Frequency : if MDS is started with down OST +Bugzilla : 9439,5706 +Description: Allow startup/shutdown of an MDS without depending on the + availability of the OSTs. +Details : Asynchronously call mds_lov_synchronize during MDS startup. + Add appropriate locking and lov-osc refcounts for safe + cleaning. Add osc abort_inflight calls in case the + synchronize never started. + +Severity : minor +Frequency : occasional (Cray XT3 only) +Bugzilla : 7305 +Description: root not authorized to access files in CRAY_PORTALS environment +Details : The client process capabilities were not honoured on the MDS in + a CRAY_PORTALS/CRAY_XT3 environment. If the file had previously + been accessed by an authorized user then root was able to access + the file on the local client also. The root user capabilities + are now allowed on the MDS, as this environment has secure UID. + +Severity : minor +Frequency : occasional +Bugzilla : 6449 +Description: ldiskfs "too long searching" message happens too often +Details : A debugging message (otherwise harmless) prints too often on + the OST console. This has been reduced to only happen when + there are fragmentation problems on the filesystem. + +Severity : minor +Frequency : rare +Bugzilla : 9598 +Description: Division by zero in statfs when all OSCs are inactive +Details : lov_get_stripecnt() returns zero due to incorrect order of checks, + lov_statfs divides by value returned by lov_get_stripecnt(). + +Severity : minor +Frequency : common +Bugzilla : 9489, 3273 +Description: First write from each client to each OST was only 4kB in size, + to initialize client writeback cache, which caused sub-optimal + RPCs and poor layout on disk for the first writen file. 
+Details : Clients now request an initial cache grant at (re)connect time + and so that they can start streaming writes to the cache right + away and always do full-sized RPCs if there is enough data. + If the OST is rebooted the client also re-establishes its grant + so that client cached writes will be honoured under the grant. + +Severity : minor +Frequency : common +Bugzilla : 7198 +Description: Slow ls (and stat(2) syscall) on files residing on IO-loaded OSTs +Details : Now I/O RPCs go to different portal number and (presumably) fast + lock requests (and glimpses) and other RPCs get their own service + threads pool that should be able to service those RPCs + immediately. + +Severity : enhancement +Bugzilla : 7417 +Description: Ability to exchange lustre version between client and servers and + issue warnings at client side if client is too old. Also for + liblustre clients there is ability to refuse connection of too old + clients. +Details : New 'version' field is added to connect data structure that is + filled with version info. That info is later checked by server and + by client. + +Severity : minor +Frequency : rare, liblustre only. +Bugzilla : 9296, 9581 +Description: Two simultaneous writes from liblustre at offset within same page + might proceed at the same time overwriting each other with stale + data. +Details : I/O lock within llu_file_prwv was released too early, before data + actually was hitting the wire. Extended lock-holding time until + server acknowledges receiving data. + +Severity : minor +Frequency : extremely rare. Never observed in practice. +Bugzilla : 9652 +Description: avoid generating lustre_handle cookie of 0. +Details : class_handle_hash() generates handle cookies by incrementing + global counter, and can hit 0 occasionally (this is unlikely, but + not impossible, because initial value of cookie counter is + selected randomly). Value of 0 is used as a sentinel meaning + "unassigned handle" --- avoid it. Also coalesce two critical
Also coalesce two critical + sections in this function into one. + +Severity : enhancement +Bugzilla : 9528 +Description: allow liblustre clients to delegate truncate locking to OST +Details : To avoid overhead of locking, liblustre client instructs OST to + take extent lock in ost_punch() on client's behalf. New connection + flag is added to handle backward compatibility. + +Severity : enhancement +Bugzilla : 4928, 7341, 9758 +Description: allow number of OST service threads to be specified +Details : a module parameter allows the number of OST service threads + to be specified via "options ost ost_num_threads=X" in + /etc/modules.conf or /etc/modutils.conf. + +Severity : major +Frequency : rare +Bugzilla : 6146, 9635, 9895 +Description: servers crash with bad pointer in target_handle_connect() +Details : In rare cases when a client is reconnecting it was possible that + the connection request was the last reference for that export. + We would temporarily drop the export reference and get a new + one, but this may have been the last reference and the export + was just destroyed. Get new reference before dropping old one. + +Severity : enhancement +Frequency : if client is started with failover MDS +Bugzilla : 9818 +Description: Allow multiple MDS hostnames in the mount command +Details : Try to read the configuration from all specified MDS + hostnames during a client mount in case the "primary" + MDS is down. + +Severity : enhancement +Bugzilla : 9297 +Description: Stop sending data to evicted clients as soon as possible. +Details : Check if the client we are about to send or are sending data to + was evicted already. (Check is done every second of waiting, + for which l_wait_event interface was extended to allow checking + of exit condition at specified intervals). 
+ +Severity : minor +Frequency : rare, normally only when NFS exporting is done from client +Bugzilla : 9301 +Description: 'bad disk LOV MAGIC: 0x00000000' error when chown'ing files + without objects +Details : Make mds_get_md() recognise empty md case and set lmm size to 0. + +Severity : minor +Frequency : always, if srand() is called before liblustre initialization +Bugzilla : 9794 +Description: Liblustre uses system PRNG disturbing its usage by user application +Details : Introduce internal to lustre fast and high-quality PRNG for + lustre usage and make liblustre and some other places in generic + lustre code to use it. + +Severity : enhancement +Bugzilla : 9477, 9557, 9870 +Description: Verify that the MDS configuration logs are updated when xml is +Details : Check if the .xml configuration logs are newer than the config + logs stored on the MDS and report an error if this is the case. + Request --write-conf, or allow starting with --old_conf. + +Severity : enhancement +Bugzilla : 6034 +Description: Handle symlinks in the path when checking if Lustre is mounted. +Details : Resolve intermediate symlinks when checking if a client has + mounted a filesystem to avoid duplicate client mounts. + +Severity : minor +Frequency : rare +Bugzilla : 9309 +Description: lconf can hit an error exception but still return success. +Details : The lconf command catches the Command error exception at the top + level script context and will exit with the associated exit + status, but doesn't ensure that this exit status is non-zero. + +Severity : minor +Frequency : rare +Bugzilla : 9493 +Description: failure of ptlrpc thread startup can cause oops +Details : Starting a ptlrpc service thread can fail if there are a large + number of threads or the server memory is very fragmented. + Handle this without oopsing. 
+Severity : minor +Frequency : always, only if liblustre and non-default acceptor port was used +Bugzilla : 9933 +Description: liblustre cannot connect to servers with non-default acceptor port +Details : tcpnal_set_default_params() was not called and was therefore + ignoring the environment variable TCPNAL_PORT, as well as other + TCPNAL_ environment variables + +Severity : minor +Frequency : rare +Bugzilla : 9923 +Description: two objects could be created on the same OST for a single file +Details : If an OST is down, in some cases it was possible to create two + objects on a single OST for a single file. No problems other + than potential performance impact and spurious error messages. + +Severity : minor +Frequency : rare +Bugzilla : 5681, 9562 +Description: Client may oops in ll_unhash_aliases +Details : Client dcache may become inconsistent in race condition. + In some cases "getcwd" can fail if the current directory is + modified. + +Severity : minor +Frequency : always +Bugzilla : 9942 +Description: Inode refcounting problems in NFS export code +Details : link_raw functions used to call d_instantiate without obtaining + extra inode reference first. + +Severity : minor +Frequency : rare +Bugzilla : 9942, 9903 +Description: Referencing freed requests leading to crash, memleaks with NFS. +Details : We used to require that call to ll_revalidate_it was always + followed by ll_lookup_it. Also with revalidate_special() it is + possible to call ll_revalidate_it() twice for the same dentry + even if first occurrence returned success. This fix changes semantic + between DISP_ENQ_COMPLETE disposition flag to mean there is extra + reference on a request referred from the intent. + ll_intent_release() then releases such a request.
+ +Severity : minor +Frequency : rare, normally benchmark loads only +Bugzilla : 1443 +Description: unlinked inodes were kept in memory on the client +Details : If a client is repeatedly creating and unlinking files it + can accumulate a lot of stale inodes in the inode slab cache. + If there is no other client load running this can cause the + client node to run out of memory. Instead flush old inodes + from client cache that have the same inode number as a new inode. + +Severity : major +Frequency : rare, unless heavy write-truncate concurrency is continuous +Bugzilla : 4180, 6984, 7171, 9963, 9331 +Description: OST becomes very slow and/or deadlocked during object unlink +Details : filter_destroy() was holding onto the parent directory lock + while truncating+unlinking objects. For very large objects this + may block other threads for a long time and slow overall OST + responsiveness. It may also be possible to get a lock ordering + deadlock in this case, or run out of journal credits because of + the combined truncate+unlink. Solution is to do object truncate + first in one transaction without parent lock, and then do the + final unlink in a new transaction with the parent lock. This + reduces the lock hold time dramatically. + +Severity : major +Frequency : rare, 2.4 kernels only +Bugzilla : 9967 +Description: MDS or OST cleanup may trip kernel BUG when dropping kernel lock +Details : mds_cleanup() and filter_cleanup() need to drop the kernel lock + before unmounting their filesystem in order to avoid deadlock. + The kernel_locked() function in 2.4 kernels only checks whether + the kernel lock is held, not whether it is this process that is + holding it as 2.6 kernels do. + +Severity : major +Frequency : rare +Bugzilla : 9635 +Description: MDS or OST may oops/LBUG if a client is connecting multiple times +Details : The client ptlrpc code may be trying to reconnect to a down + server before a previous connection attempt has timed out. 
+ Increase the reconnect interval to be longer than the connection + timeout interval to avoid sending duplicate connections to + servers. + +Severity : minor +Frequency : echo_client brw_test command +Bugzilla : 9919 +Description: fix echo_client to work with OST preallocated code +Details : OST preallocation code (5137) didn't take echo_client IO path + into account: echo_client calls filter methods outside of any + OST thread and, hence, there is no per-thread preallocated + pages and buffers to use. Solution: hijack pga pages for IO. As + a byproduct, this avoids unnecessary data copying. + +Severity : minor +Frequency : rare +Bugzilla : 3555, 5962, 6025, 6155, 6296, 9574 +Description: Client can oops in mdc_commit_close() after open replay +Details : It was possible for the MDS to return an open request with no + transaction number in mds_finish_transno() if the client was + evicted, but without actually returning an error. Clients + would later try to replay that open and may trip an assertion + Simplify the client close codepath, and always return an error + from the MDS in case the open is not successful. + +Severity : major +Frequency : rare, 2.6 OSTs only +Bugzilla : 10076 +Description: OST may deadlock under high load on fragmented files +Details : If there was a heavy load and highly-fragmented OST filesystems + it was possible to have all the OST threads deadlock waiting on + allocation of biovecs, because the biovecs were not released + until the entire RPC IO was completed. Instead, release biovecs + as soon as they are complete to ensure forward IO progress. + +Severity : enhancement +Bugzilla : 9578 +Description: Support for specifying external journal device at mount +Details : If an OST or MDS device is formatted with an external journal + device, this device major/minor is stored in the ext3 superblock + and may not be valid for failover. Allow detecting and + specifying the external journal at mount time. 
+ +Severity : major +Frequency : rare +Bugzilla : 10235 +Description: Mounting an MDS with pending unlinked files may cause oops +Details : target_finish_recovery() calls mds_postrecov() which returned + the number of orphans unlinked. mds_lov_connect->mds_postsetup() + considers this an error and immediately begins cleaning up the + lov, just after starting the mds_lov process + ------------------------------------------------------------------------------ 08-26-2005 Cluster File Systems, Inc. @@ -391,19 +707,19 @@ Details : lconf was attempting to abort recovery on the MDT device and not * bug fixes Severity : major -Frequency : rare (only unsupported configurations with a node running as an +Frequency : rare (only unsupported configurations with a node running as an OST and a client) Bugzilla : 6514, 5137 Description: Mounting a Lustre file system on a node running as an OST could lead to deadlocks -Details : OSTs now allocate memory needed to write out data at - startup, instead of when needed, to avoid having to - allocate memory in possibly low memory situations. - Specifically, if the file system is mounted on on OST, - memory pressure could force it to try to write out data, - which it needed to allocate memory to do. Due to the low - memory, it would be unable to do so and the node would - become unresponsive. +Details : OSTs now preallocates memory needed to write out data at + startup, instead of when needed, to avoid having to + allocate memory in possibly low memory situations. + Specifically, if the file system is mounted on on OST, + memory pressure could force it to try to write out data, + which it needed to allocate memory to do. Due to the low + memory, it would be unable to do so and the node would + become unresponsive. Severity : enhancement Bugzilla : 7015 @@ -420,7 +736,7 @@ Details : By default, OSTs will now run in failover mode. 
To return to Severity : enhancement Bugzilla : 1693 Description: Health checks are now provided for MDS and OSTs -Details : Additional detailed health check information on MSD and OSTs +Details : Additional detailed health check information on MSD and OSTs is now provided through the procfs health_check value. Severity : minor @@ -443,6 +759,16 @@ Details : The config llog parsing code may overwrite the error return of an error. Severity : minor +Bugzilla : 6422 +Frequency : rare +Description: MDS can fail to allocate large reply buffers +Details : After long uptimes the MDS can fail to allocate large reply + buffers (e.g. zconf client mount config records) due to memory + fragmentation or consumption by the buffer cache. Preallocate + some large reply buffers so that these replies can be sent even + under memory pressure. + +Severity : minor Bugzilla : 6266 Frequency : rare (liblustre) Description: fsx running with liblustre complained that using truncate() to @@ -470,7 +796,7 @@ Details : It was possible under high-load situations to have an extent Severity : minor Bugzilla : 7241 -Frequency : filesystems with default stripe_count larger than 77 +Frequency : filesystems with default stripe_count larger than 77 Description: lconf+mke2fs fail when formatting filesystem with > 77 stripes Details : lconf specifies an inode size of 4096 bytes when the default stripe_count is larger than 77. This conflicts with the default @@ -537,7 +863,7 @@ Severity: : enhancement Bugzilla : 3262, 6359 Description: Attempts to reconnect to servers are now more aggressive. Details : This builds on the enhanced upcall-less recovery that was added - in 1.4.2. When trying to reconnect to servers, clients will + in 1.4.2. When trying to reconnect to servers, clients will now try each server in the failover group every 10 seconds. By default, clients would previously try one server every 25 seconds. 
@@ -547,13 +873,13 @@ Bugzilla : 6371 Description: After recovery, certain operations trigger a failed assertion on a client. Details : Failing over an mds, using lconf -d --failover, while a - client was doing a readdir() call would cause the client to + client was doing a readdir() call would cause the client to LBUG after recovery completed and the readdir() was resent. Severity : enhancement Bugzilla : 6296 Description: Default groups are now added by lconf -Details : You can now run lconf --group without having to +Details : You can now run lconf --group without having to manually add groups with lmc. Severity : major @@ -612,7 +938,7 @@ Details : Creating a new file via mkdir or mknod (starting a transaction Severity : minor Frequency : occasional -Description: While starting a server, the fsfilt_ext3 module could not be +Description: While starting a server, the fsfilt_ext3 module could not be loaded. Details : CFS's improved ext3 filesystem is named ldiskfs for 2.6 kernels. Previously, lconf would still use the ext3 name @@ -718,11 +1044,11 @@ Description: Changes the "SCSI I/O Stats" kernel patch to default to "enabled" - lconf should create multiple TCP connections from a client (5201) - init scripts are now turned off by default; run chkconfig --on lustre and chkconfig --on lustrefs to use them - - upcalls are no longer needed for clients to recover to failover + - upcalls are no longer needed for clients to recover to failover servers (3262) - add --abort-recovery option to lconf to abort recovery on device startup (6017) - - add support for an arbitrary number of OSTs (3026) + - add support for an arbitrary number of OSTs (3026) - Quota support protocol changes. - forward compatibility changes to wire structs (6007) - rmmod NALs that might be loaded because of /etc/modules.conf (6133) @@ -1500,9 +1826,9 @@ tbd Cluster File Systems, Inc. 
- fix dbench 2, extN refcount problem (170, 258, 356, 418) - fix double-O_EXCL intent crash (424) - avoid sending multiple lock CANCELs (352) - * Features + * Features - MDS can do multi-client recovery (modulo bugs in new code) - * Documentation + * Documentation - many updates, edits, cleanups 2002-11-18 Phil Schwan @@ -1686,8 +2012,8 @@ tbd Cluster File Systems, Inc. * small changes in the DLM wire protocol 2002-07-25 Peter J. Braam - * version 0_5_1 with some initial stability, - * locking on MD and file I/O. + * version 0_5_1 with some initial stability, + * locking on MD and file I/O. * documentation updates * several bug fixes since 0.5.0 * small changes in wire protocol @@ -1721,4 +2047,4 @@ tbd Cluster File Systems, Inc. * move forward to latest Lustre kernel 2002-06-25 Peter Braam - * release version v0_4_1. Hopefully stable on single node use. + * release version v0_4_1. Hopefully stable on single node use. diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index b656b49..eae19cb 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -320,8 +320,19 @@ AC_MSG_RESULT([$enable_ldiskfs]) if test x$enable_ldiskfs = xyes ; then BACKINGFS="ldiskfs" + AC_MSG_CHECKING([whether to enable quilt for making ldiskfs]) + AC_ARG_ENABLE([quilt], + AC_HELP_STRING([--disable-quilt],[disable use of quilt for ldiskfs]), + [],[enable_quilt='yes']) + AC_MSG_RESULT([$enable_quilt]) + AC_PATH_PROG(PATCH, patch, [no]) - AC_PATH_PROG(QUILT, quilt, [no]) + + if test x$enable_quilt = xno ; then + QUILT="no" + else + AC_PATH_PROG(QUILT, quilt, [no]) + fi if test x$enable_ldiskfs$PATCH$QUILT = xyesnono ; then AC_MSG_ERROR([Quilt or patch are needed to build the ldiskfs module (for Linux 2.6)]) @@ -348,11 +359,15 @@ case $BACKINGFS in ]) ;; ldiskfs) - LC_FSHOOKS([ - LDISKFS_SERIES="2.6-suse.series" - ],[ - LDISKFS_SERIES="2.6-rhel4.series" - ]) + AC_MSG_CHECKING([which ldiskfs series to use]) + case $LINUXRELEASE in + 2.6.5*) 
LDISKFS_SERIES="2.6-suse.series" ;; + 2.6.9*) LDISKFS_SERIES="2.6-rhel4.series" ;; + 2.6.10*) LDISKFS_SERIES="2.6-rhel4.series" ;; + 2.6.12*) LDISKFS_SERIES="2.6.12-vanilla.series" ;; + *) AC_MSG_WARN([Unknown kernel version $LINUXRELEASE, fix lustre/autoconf/lustre-core.m4]) + esac + AC_MSG_RESULT([$LDISKFS_SERIES]) AC_SUBST(LDISKFS_SERIES) ;; esac # $BACKINGFS @@ -601,6 +616,7 @@ lustre/conf/Makefile lustre/doc/Makefile lustre/include/Makefile lustre/include/linux/Makefile +lustre/include/linux/lustre_ver.h lustre/include/lustre/Makefile lustre/kernel_patches/targets/2.6-suse.target lustre/kernel_patches/targets/2.6-vanilla.target diff --git a/lustre/autoconf/lustre-version.ac b/lustre/autoconf/lustre-version.ac index 0d76e65..bc74354 100644 --- a/lustre/autoconf/lustre-version.ac +++ b/lustre/autoconf/lustre-version.ac @@ -1 +1,30 @@ -m4_define([LUSTRE_VERSION],[1.4.5.93]) +m4_define([LUSTRE_MAJOR],[1]) +m4_define([LUSTRE_MINOR],[4]) +m4_define([LUSTRE_PATCH],[6]) +m4_define([LUSTRE_FIX],[0]) + +dnl # 288 stands for 0.0.1.32 , next version with fixes is ok, but next after +dnl # next release candidate/beta would spill this warning already. +m4_define([LUSTRE_VER_ALLOWED_OFFSET],[288]) +m4_define([LUSTRE_VER_OFFSET_WARN],[288]) + +dnl # User editable part ends here. 
----------------------------------------- + +m4_pattern_allow(AC_LUSTRE) +m4_define([LUSTRE_VERSION],m4_if(LUSTRE_FIX,[0],LUSTRE_MAJOR.LUSTRE_MINOR.LUSTRE_PATCH,LUSTRE_MAJOR.LUSTRE_MINOR.LUSTRE_PATCH.LUSTRE_FIX)) + +[AC_LUSTRE_MAJOR]=LUSTRE_MAJOR +[AC_LUSTRE_MINOR]=LUSTRE_MINOR +[AC_LUSTRE_PATCH]=LUSTRE_PATCH +[AC_LUSTRE_FIX]=LUSTRE_FIX +[AC_LUSTRE_VERSION_STRING]=LUSTRE_VERSION +[AC_LUSTRE_VER_ALLOWED_OFFSET]=LUSTRE_VER_ALLOWED_OFFSET +[AC_LUSTRE_VER_OFFSET_WARN]=LUSTRE_VER_OFFSET_WARN + +AC_SUBST([AC_LUSTRE_MAJOR]) +AC_SUBST([AC_LUSTRE_MINOR]) +AC_SUBST([AC_LUSTRE_PATCH]) +AC_SUBST([AC_LUSTRE_FIX]) +AC_SUBST([AC_LUSTRE_VERSION_STRING]) +AC_SUBST([AC_LUSTRE_VER_ALLOWED_OFFSET]) +AC_SUBST([AC_LUSTRE_VER_OFFSET_WARN]) diff --git a/lustre/doc/Makefile.am b/lustre/doc/Makefile.am index efe70d2..2b214f7 100644 --- a/lustre/doc/Makefile.am +++ b/lustre/doc/Makefile.am @@ -15,7 +15,7 @@ TEXEXPAND = texexpand SUFFIXES = .lin .lyx .pdf .ps .sgml .html .txt .tex .fig .eps .dvi if UTILS -man_MANS = lfs.1 lmc.1 lwizard.1 lconf.8 lctl.8 +man_MANS = lfs.1 lmc.1 lconf.8 lctl.8 endif LYXFILES= $(filter-out $(patsubst %.lin,%.lyx,$(wildcard *.lin)),\ @@ -24,7 +24,7 @@ LYXFILES= $(filter-out $(patsubst %.lin,%.lyx,$(wildcard *.lin)),\ CLEANFILES = *.aux *.tex *.log *.pdf EXTRA_DIST = tex2pdf $(man_MANS) \ - $(LYXFILES) lfs.1 lmc.1 lwizard.1 lconf.8 lctl.8 + $(LYXFILES) lfs.1 lmc.1 lconf.8 lctl.8 all: diff --git a/lustre/doc/lctl.8 b/lustre/doc/lctl.8 index 58e5a80..69c6ece 100644 --- a/lustre/doc/lctl.8 +++ b/lustre/doc/lctl.8 @@ -374,7 +374,7 @@ Finished (success) .B setup -lctl > setup /dev/loop0 extN +lctl > setup /dev/loop0 ldiskfs .br lctl > quit diff --git a/lustre/doc/lctl.lyx b/lustre/doc/lctl.lyx index 087a2cd..c3a769f 100644 --- a/lustre/doc/lctl.lyx +++ b/lustre/doc/lctl.lyx @@ -910,7 +910,7 @@ setup \size small -lctl > setup /dev/loop0 extN +lctl > setup /dev/loop0 ldiskfs \newline lctl > quit \size default diff --git a/lustre/doc/lmc.1 b/lustre/doc/lmc.1 index 
0377c33..d755de8 100644 --- a/lustre/doc/lmc.1 +++ b/lustre/doc/lmc.1 @@ -192,7 +192,7 @@ Optional arguement. Name of LOV to which this OSC will be attached. Specify the UUID of the OST device. .TP --fstype -extN|ext3 Optional arguement used to specify the file system type. Default is ext3. +ldiskfs|ext3 Optional argument used to specify the file system type. Default is ext3. .TP --inode_size Specify new inode size for underlying ext3 file system. diff --git a/lustre/doc/lmc.lyx b/lustre/doc/lmc.lyx index 1c27a15..e42e64b 100644 --- a/lustre/doc/lmc.lyx +++ b/lustre/doc/lmc.lyx @@ -454,7 +454,7 @@ UUID Specify the UUID of the OST device. \layout Description --fstype\SpecialChar ~ -extN|ext3 Optional arguement used to specify the file system type. +ldiskfs|ext3 Optional argument used to specify the file system type. Default is ext3. \layout Description diff --git a/lustre/doc/lwizard.1 b/lustre/doc/lwizard.1 deleted file mode 100644 index 31339d2..0000000 --- a/lustre/doc/lwizard.1 +++ /dev/null @@ -1,113 +0,0 @@ -.TH lwizard 1 "2003 Oct 29" Lustre "Configuration utilities" -.SH NAME -lwizard \- Lustre configuration wizard -.SH SYNOPSIS -.br -.B lwizard -.br -.B lwizard [--help] -.br -.BR lwizard [-o|--file=CONFIG_FILE][--stripe_size=SIZE][--stripe_cnt=COUNT] -.SH DESCRIPTION -The configuration files for Lustre installation are generally created through a series of lmc commands, this generates an XML file which describes the complete cluster. The lwizard eliminates the need to learn lmc to generate configuration files, instead it achieves the same through asking some simple questions. The -XML configuration file generated using lwizard will still have to be made accessible to all the cluster nodes either by storing it on an LDAP server, NFS or by copying it over to all the involved nodes and then running lconf on all nodes to start up the various Lustre services, device setups or mounting the filesystem. 
-So, once invoked, lwizard asks a series of questions about the various pieces of the cluster : -.TP -.B MDS hostname -If `hostname' has more than one network interfaces (not including lo) and you dicide to use as many interfaces as possible, you need to specify the interfaces' IP addresses separated by blank space. See below example for how to enter interfaces. -.TP -.B MDS device information -.TP -.B MDS failover information -Failover is optional. if failover is enabled, failover hostname and device name are needed. The failover device MUST be the shared device of MDS device. -.TP -.B OST hostname -This will be asked for every new OST added. You can also specify multiple network interfaces as mentioned above for MDS hostname. -.TP -.B OST device information -This will be asked for every new OST added -.TP -.B OST failover information -Failover is optional. if failover is enabled, failover hostname and device name are needed. The failover device MUST be the shared device of OST device. -.TP -.B Lustre mount-point -This is the Lustre mount-point on the client (default - /mnt/lustre). -.TP -.B Lustre client -By default, Lustre can be mounted on any node. However, by default, Lustre will use only one network interface to communicate with servers. If you want to mount Lustre filesystem on a multi-host node and use many netowork interfaces to communicate, you need to configure it specifically. This will tell Lustre client which interfaces it can use to communicate with servers. See example below for details. - -The wizard saves the XML file to the filename specified using the -o or --file option or the default file config.xml. It will also save the lmc commands used to create the XML file in a script config.sh or .sh. - -The lwizard tool currently assumes the following defaults: - -.TP -.B Network type -tcp -.TP -.B Filesystem type -ext3 -.TP -.B LMC path -.I /usr/sbin/lmc - -.SH EXAMPLES -The example below shows a sample session using lwizard. 
-.PP -[username@meghna utils]$ ./lwizard --stripe_size=64 --stripe_cnt=2 -.br -This script will help you create a Lustre configuration file. -.br -Creating mds "mds1"... -.br -Please enter the HOSTNAME for mds1: meghna -.br -If meghna has more than one network INTERFACE, enter here, separating them -by blank space. See lwizard man page for help. -.br -(hit enter if only one): 192.168.1.29/24 10.0.0.29/24 -.br -Please enter the device name or loop file name for meghna: /dev/sda1 -.br -Please enter the device SIZE or 0 to use entire device (in KB): -.br -Do you want to configure FAILOVER mds1? y -.br -Please enter the HOSTNAME for failover mds1: lester2 -.br -Please enter the device for failover mds1 on lester2: /dev/sdb2 -.br -Creating ost "ost1"... -.br -Please enter the HOSTNAME for ost1: meghna -.br -If meghna has more than one network INTERFACE, enter here, separating them -by blank space. See lwizard man page for help. -.br -(hit enter if only one): -.br -Please enter the device name or loop file name for meghna: /tmp/ost1 -.br -Please enter the device SIZE or 0 to use entire device (in KB): 10000 -.br -Do you want to configure FAILOVER ost1? -.br -Creating ost "ost2"... -.br -Please enter the HOSTNAME for ost2, or just hit enter to finish: -.br -Please enter the clients' mountpoint (/mnt/lustre): -.br -Do you want to configure another client with multiple network interfaces? y -.br -Please enter the HOSTNAME: client2 -.br -Please enter network interface address (separated by space): 192.168.1.30/24 10.0.0.30/24 -.br -Do you want to configure another client with multiple network interfaces? -.br - mds1 lov1 ost1 client client -.br -The Lustre configuration has been written to lwizard.xml. -.br -.SH BUGS -None are known. 
diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h index 6187482..7b4df89 100644 --- a/lustre/include/liblustre.h +++ b/lustre/include/liblustre.h @@ -139,7 +139,7 @@ static inline void *kmalloc(int size, int prot) #define GFP_HIGHUSER 1 #define GFP_ATOMIC 1 #define GFP_NOFS 1 -#define IS_ERR(a) (((a) && abs((long)(a)) < 500) ? 1 : 0) +#define IS_ERR(a) ((unsigned long)(a) < 1000) #define PTR_ERR(a) ((long)(a)) #define ERR_PTR(a) ((void*)((long)(a))) @@ -293,6 +293,7 @@ typedef __u64 kdev_t; #define SPIN_LOCK_UNLOCKED (spinlock_t) { } #define LASSERT_SPIN_LOCKED(lock) do {} while(0) +#define LASSERT_SEM_LOCKED(sem) do {} while(0) static inline void spin_lock(spinlock_t *l) {return;} static inline void spin_unlock(spinlock_t *l) {return;} @@ -330,8 +331,8 @@ void get_random_bytes(void *ptr, int size); /* memory */ -/* FIXME */ -#define num_physpages (16 * 1024) +/* memory size: used for some client tunables */ +#define num_physpages (256 * 1024) /* 1GB */ static inline int copy_from_user(void *a,void *b, int c) { diff --git a/lustre/include/linux/.cvsignore b/lustre/include/linux/.cvsignore index b731c89..ee57167 100644 --- a/lustre/include/linux/.cvsignore +++ b/lustre/include/linux/.cvsignore @@ -13,3 +13,4 @@ extN_jbd.h extN_xattr.h xattr.h lustre_build_version.h +lustre_ver.h diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index 8c28176..51b8389 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -56,8 +56,12 @@ void groups_free(struct group_info *ginfo); #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) +#define lock_dentry(___dentry) spin_lock(&(___dentry)->d_lock) +#define unlock_dentry(___dentry) spin_unlock(&(___dentry)->d_lock) + #define lock_24kernel() do {} while (0) #define unlock_24kernel() do {} while (0) +#define ll_kernel_locked() kernel_locked() /* * OBD need working random driver, thus all our @@ -147,8 +151,12 @@ static inline int 
cleanup_group_info(void) #else /* 2.4.. */ +#define lock_dentry(___dentry) +#define unlock_dentry(___dentry) + #define lock_24kernel() lock_kernel() #define unlock_24kernel() unlock_kernel() +#define ll_kernel_locked() (current->lock_depth >= 0) #ifdef HAVE_MM_INLINE #include @@ -172,10 +180,16 @@ static inline int cleanup_group_info(void) #define hlist_node list_head #define HLIST_HEAD LIST_HEAD #define INIT_HLIST_HEAD INIT_LIST_HEAD -#define INIT_HLIST_NODE(p) (p) #define hlist_del_init list_del_init #define hlist_add_head list_add +#endif +#ifndef INIT_HLIST_NODE +#define INIT_HLIST_NODE(p) ((p)->next = NULL, (p)->prev = NULL) +#endif +#ifndef hlist_for_each #define hlist_for_each list_for_each +#endif +#ifndef hlist_for_each_safe #define hlist_for_each_safe list_for_each_safe #endif #define KDEVT_INIT(val) (val) @@ -300,6 +314,15 @@ static inline int page_mapped(struct page *page) } #endif /* !HAVE_PAGE_MAPPED */ +static inline void file_accessed(struct file *file) +{ +#ifdef O_NOATIME + if (file->f_flags & O_NOATIME) + return; +#endif + update_atime(file->f_dentry->d_inode); +} + #endif /* end of 2.4 compat macros */ #ifdef HAVE_PAGE_LIST diff --git a/lustre/include/linux/lustre_disk.h b/lustre/include/linux/lustre_disk.h new file mode 100644 index 0000000..43cfba2 --- /dev/null +++ b/lustre/include/linux/lustre_disk.h @@ -0,0 +1,55 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * This file is part of Lustre, http://www.lustre.org + * + * Lustre disk format definitions. 
+ */ +#ifndef _LUSTRE_DISK_H_ +#define _LUSTRE_DISK_H_ + +#include + +#include + +/****************** last_rcvd file *********************/ + +#define LAST_RCVD "last_rcvd" +#define LOV_OBJID "lov_objid" + +#define LR_SERVER_SIZE 512 +#define LR_CLIENT_START 8192 +#define LR_CLIENT_SIZE 128 +#if LR_CLIENT_START < LR_SERVER_SIZE +#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE" +#endif +/* This limit is arbitrary (32k clients on x86), but it is convenient to use + * 2^n * PAGE_SIZE * 8 for the number of bits that fit an order-n allocation. */ +#define LR_MAX_CLIENTS (PAGE_SIZE * 8) + +#define OBD_COMPAT_OST 0x00000002 /* this is an OST (temporary) */ +#define OBD_COMPAT_MDT 0x00000004 /* this is an MDT (temporary) */ + +#define OBD_ROCOMPAT_LOVOBJID 0x00000001 /* MDS handles LOV_OBJID file */ +#define OBD_ROCOMPAT_CROW 0x00000002 /* OST will CROW create objects */ + +#define OBD_INCOMPAT_GROUPS 0x00000001 /* OST handles group subdirs */ +#define OBD_INCOMPAT_OST 0x00000002 /* this is an OST (permanent) */ +#define OBD_INCOMPAT_MDT 0x00000004 /* this is an MDT (permanent) */ + +/* Data stored per client in the last_rcvd file. In le32 order. */ +struct lsd_client_data { + __u8 lcd_uuid[40]; /* client UUID */ + __u64 lcd_last_transno; /* last completed transaction ID */ + __u64 lcd_last_xid; /* xid for the last transaction */ + __u32 lcd_last_result; /* result from last RPC */ + __u32 lcd_last_data; /* per-op data (disposition for open &c.) 
*/ + /* for MDS_CLOSE requests */ + __u64 lcd_last_close_transno; /* last completed transaction ID */ + __u64 lcd_last_close_xid; /* xid for the last transaction */ + __u32 lcd_last_close_result; /* result from last RPC */ + __u32 lcd_last_close_data; /* per-op data */ + __u8 lcd_padding[LR_CLIENT_SIZE - 88]; +}; + +#endif /* _LUSTRE_DISK_H_ */ diff --git a/lustre/include/linux/lustre_fsfilt.h b/lustre/include/linux/lustre_fsfilt.h index 2136bc7..5358084 100644 --- a/lustre/include/linux/lustre_fsfilt.h +++ b/lustre/include/linux/lustre_fsfilt.h @@ -44,10 +44,13 @@ struct fsfilt_objinfo { #define XATTR_LUSTRE_MDS_LOV_EA "lov" +struct lustre_dquot; struct fsfilt_operations { struct list_head fs_list; struct module *fs_owner; char *fs_type; + char *(* fs_label)(struct super_block *sb); + char *(* fs_uuid)(struct super_block *sb); void *(* fs_start)(struct inode *inode, int op, void *desc_private, int logs); void *(* fs_brw_start)(int objcount, struct fsfilt_objinfo *fso, @@ -62,8 +65,9 @@ struct fsfilt_operations { int (* fs_iocontrol)(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); int (* fs_set_md)(struct inode *inode, void *handle, void *md, - int size); - int (* fs_get_md)(struct inode *inode, void *md, int size); + int size, const char *name); + int (* fs_get_md)(struct inode *inode, void *md, int size, + const char *name); /* * this method is needed to make IO operation fsfilt nature depend. 
* @@ -98,7 +102,9 @@ struct fsfilt_operations { int (* fs_quotactl)(struct super_block *sb, struct obd_quotactl *oqctl); int (* fs_quotainfo)(struct lustre_quota_info *lqi, int type, - int cmd, struct list_head *list); + int cmd); + int (* fs_qids)(struct file *file, struct inode *inode, int type, + struct list_head *list); int (* fs_dquot)(struct lustre_dquot *dquot, int cmd); }; @@ -107,6 +113,24 @@ extern void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops); extern struct fsfilt_operations *fsfilt_get_ops(const char *type); extern void fsfilt_put_ops(struct fsfilt_operations *fs_ops); +static inline char *fsfilt_label(struct obd_device *obd, struct super_block *sb) +{ + if (obd->obd_fsops->fs_label == NULL) + return NULL; + if (obd->obd_fsops->fs_label(sb)[0] == '\0') + return NULL; + + return obd->obd_fsops->fs_label(sb); +} + +static inline __u8 *fsfilt_uuid(struct obd_device *obd, struct super_block *sb) +{ + if (obd->obd_fsops->fs_uuid == NULL) + return NULL; + + return obd->obd_fsops->fs_uuid(sb); +} + #define FSFILT_OP_UNLINK 1 #define FSFILT_OP_RMDIR 2 #define FSFILT_OP_RENAME 3 @@ -117,6 +141,8 @@ extern void fsfilt_put_ops(struct fsfilt_operations *fs_ops); #define FSFILT_OP_SETATTR 8 #define FSFILT_OP_LINK 9 #define FSFILT_OP_CANCEL_UNLINK 10 +#define FSFILT_OP_JOIN 11 +#define FSFILT_OP_NOOP 15 #define fsfilt_check_slow(start, timeout, msg) \ do { \ @@ -252,15 +278,16 @@ static inline int fsfilt_iocontrol(struct obd_device *obd, struct inode *inode, } static inline int fsfilt_set_md(struct obd_device *obd, struct inode *inode, - void *handle, void *md, int size) + void *handle, void *md, int size, + const char *name) { - return obd->obd_fsops->fs_set_md(inode, handle, md, size); + return obd->obd_fsops->fs_set_md(inode, handle, md, size, name); } static inline int fsfilt_get_md(struct obd_device *obd, struct inode *inode, - void *md, int size) + void *md, int size, const char *name) { - return obd->obd_fsops->fs_get_md(inode, md, size); + 
return obd->obd_fsops->fs_get_md(inode, md, size, name); } static inline int fsfilt_send_bio(int rw, struct obd_device *obd, @@ -331,10 +358,19 @@ static inline int fsfilt_quotactl(struct obd_device *obd, static inline int fsfilt_quotainfo(struct obd_device *obd, struct lustre_quota_info *lqi, - int type, int cmd, struct list_head *list) + int type, int cmd) { if (obd->obd_fsops->fs_quotainfo) - return obd->obd_fsops->fs_quotainfo(lqi, type, cmd, list); + return obd->obd_fsops->fs_quotainfo(lqi, type, cmd); + return -ENOTSUPP; +} + +static inline int fsfilt_qids(struct obd_device *obd, struct file *file, + struct inode *inode, int type, + struct list_head *list) +{ + if (obd->obd_fsops->fs_qids) + return obd->obd_fsops->fs_qids(file, inode, type, list); return -ENOTSUPP; } diff --git a/lustre/include/linux/lustre_ver.h.in b/lustre/include/linux/lustre_ver.h.in new file mode 100644 index 0000000..4abf818 --- /dev/null +++ b/lustre/include/linux/lustre_ver.h.in @@ -0,0 +1,23 @@ +#ifndef _LUSTRE_VER_H_ +#define _LUSTRE_VER_H_ + +#include + +#define LUSTRE_MAJOR @AC_LUSTRE_MAJOR@ +#define LUSTRE_MINOR @AC_LUSTRE_MINOR@ +#define LUSTRE_PATCH @AC_LUSTRE_PATCH@ +#define LUSTRE_FIX @AC_LUSTRE_FIX@ +#define LUSTRE_VERSION_STRING "@AC_LUSTRE_VERSION_STRING@" + +// liblustre clients are only allowed to connect if their LUSTRE_FIX mismatches +// by this amount (set in lustre/autoconf/lustre-version.ac) +#define LUSTRE_VERSION_ALLOWED_OFFSET @AC_LUSTRE_VER_ALLOWED_OFFSET@ + +// if lustre version of client and servers it connects to differs by more than +// this amount, client would issue a warning +// (set in lustre/autoconf/lustre-version.ac) +#define LUSTRE_VERSION_OFFSET_WARN @AC_LUSTRE_VER_OFFSET_WARN@ + +#define LUSTRE_VERSION_CODE OBD_OCD_VERSION(LUSTRE_MAJOR,LUSTRE_MINOR,LUSTRE_PATCH,LUSTRE_FIX) + +#endif diff --git a/lustre/include/lustre/liblustreapi.h b/lustre/include/lustre/liblustreapi.h index 35da48d..557c3ab 100644 --- a/lustre/include/lustre/liblustreapi.h +++ 
b/lustre/include/lustre/liblustreapi.h @@ -16,6 +16,9 @@ extern int llapi_file_create(char *name, long stripe_size, int stripe_offset, extern int llapi_file_get_stripe(char *path, struct lov_user_md *lum); extern int llapi_find(char *path, struct obd_uuid *obduuid, int recursive, int verbose, int quiet); +extern int llapi_obd_statfs(char *path, __u32 type, __u32 index, + struct obd_statfs *stat_buf, + struct obd_uuid *uuid_buf); extern int llapi_ping(char *obd_type, char *obd_name); extern int llapi_target_check(int num_types, char **obd_types, char *dir); extern int llapi_catinfo(char *dir, char *keyword, char *node_name); diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.4.20-hp_pnnl-2.4-ia64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.4.20-hp_pnnl-2.4-ia64-smp.config index 1c65518..4446f20 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.4.20-hp_pnnl-2.4-ia64-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.4.20-hp_pnnl-2.4-ia64-smp.config @@ -1040,3 +1040,8 @@ CONFIG_IA64_EARLY_PRINTK_UART_BASE=0 # CONFIG_IA64_DEBUG_CMPXCHG is not set # CONFIG_IA64_DEBUG_IRQ is not set CONFIG_KALLSYMS=y +CONFIG_IEEE1394=m +CONFIG_IEEE1394_PCILYNX=m +CONFIG_IEEE1394_OHCI1394=m +CONFIG_IEEE1394_SBP2=m +CONFIG_IEEE1394_SBP2_PHYS_DMA=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.4.20-hp_pnnl-2.4-ia64.config b/lustre/kernel_patches/kernel_configs/kernel-2.4.20-hp_pnnl-2.4-ia64.config index 38669e8..8ba58df 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.4.20-hp_pnnl-2.4-ia64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.4.20-hp_pnnl-2.4-ia64.config @@ -1040,3 +1040,8 @@ CONFIG_IA64_EARLY_PRINTK_UART_BASE=0 # CONFIG_IA64_DEBUG_CMPXCHG is not set # CONFIG_IA64_DEBUG_IRQ is not set CONFIG_KALLSYMS=y +CONFIG_IEEE1394=m +CONFIG_IEEE1394_PCILYNX=m +CONFIG_IEEE1394_OHCI1394=m +CONFIG_IEEE1394_SBP2=m +CONFIG_IEEE1394_SBP2_PHYS_DMA=y diff --git 
a/lustre/kernel_patches/kernel_configs/kernel-2.4.20-rh-2.4-i686-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.4.20-rh-2.4-i686-smp.config index 94f370e..1103250 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.4.20-rh-2.4-i686-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.4.20-rh-2.4-i686-smp.config @@ -631,7 +631,7 @@ CONFIG_NET_FC=y # IEEE 1394 (FireWire) support (EXPERIMENTAL) # CONFIG_IEEE1394=m -# CONFIG_IEEE1394_PCILYNX is not set +CONFIG_IEEE1394_PCILYNX=m CONFIG_IEEE1394_OHCI1394=m CONFIG_IEEE1394_VIDEO1394=m CONFIG_IEEE1394_SBP2=m diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.4.20-rh-2.4-i686.config b/lustre/kernel_patches/kernel_configs/kernel-2.4.20-rh-2.4-i686.config index 94f370e..1103250 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.4.20-rh-2.4-i686.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.4.20-rh-2.4-i686.config @@ -631,7 +631,7 @@ CONFIG_NET_FC=y # IEEE 1394 (FireWire) support (EXPERIMENTAL) # CONFIG_IEEE1394=m -# CONFIG_IEEE1394_PCILYNX is not set +CONFIG_IEEE1394_PCILYNX=m CONFIG_IEEE1394_OHCI1394=m CONFIG_IEEE1394_VIDEO1394=m CONFIG_IEEE1394_SBP2=m diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-i686-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-i686-smp.config index e8ba99d..6d1a75b 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-i686-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-i686-smp.config @@ -232,6 +232,7 @@ CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_BLK_DEV_INITRD=y CONFIG_BLK_STATS=y CONFIG_DISKDUMP=m +CONFIG_BLOCKDUMP=m # # Multi-device support (RAID and LVM) @@ -726,7 +727,7 @@ CONFIG_IEEE1394=m # # Device Drivers # -# CONFIG_IEEE1394_PCILYNX is not set +CONFIG_IEEE1394_PCILYNX=m CONFIG_IEEE1394_OHCI1394=m # @@ -1419,9 +1420,12 @@ CONFIG_INPUT_TURBOGRAFX=m # CONFIG_QIC02_TAPE is not set CONFIG_IPMI_HANDLER=m # CONFIG_IPMI_PANIC_EVENT is not set +# 
CONFIG_IPMI_PANIC_STRING is not set CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m CONFIG_IPMI_KCS=m CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m # # Watchdog Cards diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-i686.config b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-i686.config index e8ba99d..6d1a75b 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-i686.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-i686.config @@ -232,6 +232,7 @@ CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_BLK_DEV_INITRD=y CONFIG_BLK_STATS=y CONFIG_DISKDUMP=m +CONFIG_BLOCKDUMP=m # # Multi-device support (RAID and LVM) @@ -726,7 +727,7 @@ CONFIG_IEEE1394=m # # Device Drivers # -# CONFIG_IEEE1394_PCILYNX is not set +CONFIG_IEEE1394_PCILYNX=m CONFIG_IEEE1394_OHCI1394=m # @@ -1419,9 +1420,12 @@ CONFIG_INPUT_TURBOGRAFX=m # CONFIG_QIC02_TAPE is not set CONFIG_IPMI_HANDLER=m # CONFIG_IPMI_PANIC_EVENT is not set +# CONFIG_IPMI_PANIC_STRING is not set CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m CONFIG_IPMI_KCS=m CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m # # Watchdog Cards diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-ia64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-ia64-smp.config index 94fca767..4fb3337 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-ia64-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-ia64-smp.config @@ -364,6 +364,7 @@ CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_BLK_DEV_INITRD=y CONFIG_BLK_STATS=y CONFIG_DISKDUMP=m +CONFIG_BLOCKDUMP=m # # IEEE 1394 (FireWire) support (EXPERIMENTAL) @@ -867,9 +868,12 @@ CONFIG_INPUT_STINGER=m # CONFIG_QIC02_TAPE is not set CONFIG_IPMI_HANDLER=m # CONFIG_IPMI_PANIC_EVENT is not set +# CONFIG_IPMI_PANIC_STRING is not set CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m CONFIG_IPMI_KCS=m CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m # # Watchdog Cards diff 
--git a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-ia64.config b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-ia64.config index 94fca767..4fb3337 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-ia64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-ia64.config @@ -364,6 +364,7 @@ CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_BLK_DEV_INITRD=y CONFIG_BLK_STATS=y CONFIG_DISKDUMP=m +CONFIG_BLOCKDUMP=m # # IEEE 1394 (FireWire) support (EXPERIMENTAL) @@ -867,9 +868,12 @@ CONFIG_INPUT_STINGER=m # CONFIG_QIC02_TAPE is not set CONFIG_IPMI_HANDLER=m # CONFIG_IPMI_PANIC_EVENT is not set +# CONFIG_IPMI_PANIC_STRING is not set CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m CONFIG_IPMI_KCS=m CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m # # Watchdog Cards diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64-smp.config index 594b821..5295a33 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64-smp.config @@ -45,7 +45,7 @@ CONFIG_MTRR=y CONFIG_SMP=y CONFIG_HPET_TIMER=y CONFIG_GART_IOMMU=y -# CONFIG_SWIOTLB is not set +CONFIG_SWIOTLB=y CONFIG_NR_SIBLINGS_0=y # CONFIG_NR_SIBLINGS_2 is not set CONFIG_HAVE_DEC_LOCK=y @@ -189,6 +189,7 @@ CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_BLK_DEV_INITRD=y CONFIG_BLK_STATS=y CONFIG_DISKDUMP=m +CONFIG_BLOCKDUMP=m # # Multi-device support (RAID and LVM) @@ -1132,9 +1133,12 @@ CONFIG_INPUT_TURBOGRAFX=m # CONFIG_QIC02_TAPE is not set CONFIG_IPMI_HANDLER=m # CONFIG_IPMI_PANIC_EVENT is not set +# CONFIG_IPMI_PANIC_STRING is not set CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m CONFIG_IPMI_KCS=m CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m # # Watchdog Cards diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64.config 
b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64.config index 594b821..527d397 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64.config @@ -45,7 +45,7 @@ CONFIG_MTRR=y CONFIG_SMP=y CONFIG_HPET_TIMER=y CONFIG_GART_IOMMU=y -# CONFIG_SWIOTLB is not set +CONFIG_SWIOTLB=y CONFIG_NR_SIBLINGS_0=y # CONFIG_NR_SIBLINGS_2 is not set CONFIG_HAVE_DEC_LOCK=y @@ -189,6 +189,7 @@ CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_BLK_DEV_INITRD=y CONFIG_BLK_STATS=y CONFIG_DISKDUMP=m +CONFIG_BLOCKDUMP=m # # Multi-device support (RAID and LVM) @@ -1132,9 +1133,12 @@ CONFIG_INPUT_TURBOGRAFX=m # CONFIG_QIC02_TAPE is not set CONFIG_IPMI_HANDLER=m # CONFIG_IPMI_PANIC_EVENT is not set +# CONFIG_IPMI_PANIC_STRING is not set CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m CONFIG_IPMI_KCS=m CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m # # Watchdog Cards @@ -1773,3 +1777,4 @@ CONFIG_CRC32=m CONFIG_ZLIB_INFLATE=y CONFIG_ZLIB_DEFLATE=y CONFIG_QSORT=y +CONFIG_IEEE1394_PCILYNX=m diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-sles-2.4-i686-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-sles-2.4-i686-smp.config index 19d6c5f..c369622 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-sles-2.4-i686-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-sles-2.4-i686-smp.config @@ -917,7 +917,7 @@ CONFIG_IEEE1394_PCILYNX=m CONFIG_IEEE1394_OHCI1394=m CONFIG_IEEE1394_VIDEO1394=m CONFIG_IEEE1394_SBP2=m -# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set +CONFIG_IEEE1394_SBP2_PHYS_DMA=y CONFIG_IEEE1394_ETH1394=m CONFIG_IEEE1394_DV1394=m CONFIG_IEEE1394_RAWIO=m diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-sles-2.4-i686.config b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-sles-2.4-i686.config index 19d6c5f..c369622 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-sles-2.4-i686.config +++ 
b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-sles-2.4-i686.config @@ -917,7 +917,7 @@ CONFIG_IEEE1394_PCILYNX=m CONFIG_IEEE1394_OHCI1394=m CONFIG_IEEE1394_VIDEO1394=m CONFIG_IEEE1394_SBP2=m -# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set +CONFIG_IEEE1394_SBP2_PHYS_DMA=y CONFIG_IEEE1394_ETH1394=m CONFIG_IEEE1394_DV1394=m CONFIG_IEEE1394_RAWIO=m diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-suse-2.4.21-2-x86_64.config b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-suse-2.4.21-2-x86_64.config index 2daf682..a041ccf 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-suse-2.4.21-2-x86_64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-suse-2.4.21-2-x86_64.config @@ -782,7 +782,7 @@ CONFIG_IEEE1394_PCILYNX=m CONFIG_IEEE1394_OHCI1394=m CONFIG_IEEE1394_VIDEO1394=m CONFIG_IEEE1394_SBP2=m -# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set +CONFIG_IEEE1394_SBP2_PHYS_DMA=y CONFIG_IEEE1394_ETH1394=m CONFIG_IEEE1394_DV1394=m CONFIG_IEEE1394_RAWIO=m @@ -2039,3 +2039,4 @@ CONFIG_FW_LOADER=m CONFIG_SUSE_KERNEL=y CONFIG_CFGNAME="default" CONFIG_RELEASE=171 +CONFIG_SWIOTLB=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-bigsmp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-bigsmp.config index a84f323..9971cfa 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-bigsmp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-bigsmp.config @@ -67,7 +67,6 @@ CONFIG_STOP_MACHINE=y # # Processor type and features # -CONFIG_MEM_MIRROR=y # CONFIG_X86_PC is not set # CONFIG_X86_ELAN is not set # CONFIG_X86_VOYAGER is not set @@ -131,6 +130,8 @@ CONFIG_X86_CPUID=m # Firmware Drivers # CONFIG_EDD=m +CONFIG_DELL_RBU=m +CONFIG_DCDBAS=m # CONFIG_NOHIGHMEM is not set # CONFIG_HIGHMEM4G is not set CONFIG_HIGHMEM64G=y @@ -610,6 +611,7 @@ CONFIG_MEGARAID_NEWGEN=y CONFIG_MEGARAID_MM=m CONFIG_MEGARAID_MAILBOX=m CONFIG_MEGARAID_LEGACY=m 
+CONFIG_MEGARAID_SAS=m CONFIG_SCSI_SATA=y CONFIG_SCSI_SATA_AHCI=m CONFIG_SCSI_SATA_SVW=m @@ -667,6 +669,7 @@ CONFIG_SCSI_QLA21XX=m CONFIG_SCSI_QLA22XX=m CONFIG_SCSI_QLA2300=m CONFIG_SCSI_QLA2322=m +CONFIG_SCSI_QLA24XX=m CONFIG_SCSI_QLA6312=m CONFIG_SCSI_QLA2XXX_FAILOVER=y CONFIG_SCSI_QLA4XXX=m @@ -729,7 +732,10 @@ CONFIG_BLK_DEV_DM_BBR=m # # Fusion MPT device support # -CONFIG_FUSION=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m CONFIG_FUSION_MAX_SGE=40 CONFIG_FUSION_CTL=m CONFIG_FUSION_LAN=m @@ -758,7 +764,7 @@ CONFIG_IEEE1394_OHCI1394=m # CONFIG_IEEE1394_VIDEO1394=m CONFIG_IEEE1394_SBP2=m -# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set +CONFIG_IEEE1394_SBP2_PHYS_DMA=y CONFIG_IEEE1394_ETH1394=m CONFIG_IEEE1394_DV1394=m CONFIG_IEEE1394_RAWIO=m @@ -932,6 +938,9 @@ CONFIG_IP_NF_CONNTRACK_MARK=y CONFIG_IP_NF_TARGET_CONNMARK=m CONFIG_IP_NF_MATCH_CONNMARK=m CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_MATCH_ADDRTYPE=m +CONFIG_IP_NF_MATCH_HASHLIMIT=m +# CONFIG_IP_NF_MATCH_IPV4OPTIONS is not set # # IPv6: Netfilter Configuration @@ -1194,6 +1203,7 @@ CONFIG_SK98LIN=m CONFIG_TIGON3=m CONFIG_NET_BROADCOM=m CONFIG_NET_BCM44=m +CONFIG_BNX2=m # # Ethernet (10000 Mbit) @@ -1202,6 +1212,7 @@ CONFIG_IXGB=m CONFIG_IXGB_NAPI=y CONFIG_S2IO=m CONFIG_S2IO_NAPI=y +# CONFIG_2BUFF_MODE is not set CONFIG_FDDI=y # CONFIG_DEFXX is not set CONFIG_SKFP=m @@ -1804,8 +1815,9 @@ CONFIG_IPMI_HANDLER=m CONFIG_IPMI_PANIC_EVENT=y CONFIG_IPMI_PANIC_STRING=y CONFIG_IPMI_DEVICE_INTERFACE=m -CONFIG_IPMI_KCS=m +CONFIG_IPMI_SI=m CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m # # Watchdog Cards @@ -1865,6 +1877,13 @@ CONFIG_SONYPI=m # # Ftape, the floppy tape device driver # + +# +# TPM devices +# +CONFIG_TCG_TPM=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m CONFIG_AGP=m CONFIG_AGP_ALI=m CONFIG_AGP_ATI=m @@ -1966,6 +1985,7 @@ CONFIG_SENSORS_VT1211=m CONFIG_SENSORS_W83781D=m CONFIG_SENSORS_W83L785TS=m CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_PCF8574=m # # Other I2C Chip support 
diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config index a84f323..9971cfa 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config @@ -67,7 +67,6 @@ CONFIG_STOP_MACHINE=y # # Processor type and features # -CONFIG_MEM_MIRROR=y # CONFIG_X86_PC is not set # CONFIG_X86_ELAN is not set # CONFIG_X86_VOYAGER is not set @@ -131,6 +130,8 @@ CONFIG_X86_CPUID=m # Firmware Drivers # CONFIG_EDD=m +CONFIG_DELL_RBU=m +CONFIG_DCDBAS=m # CONFIG_NOHIGHMEM is not set # CONFIG_HIGHMEM4G is not set CONFIG_HIGHMEM64G=y @@ -610,6 +611,7 @@ CONFIG_MEGARAID_NEWGEN=y CONFIG_MEGARAID_MM=m CONFIG_MEGARAID_MAILBOX=m CONFIG_MEGARAID_LEGACY=m +CONFIG_MEGARAID_SAS=m CONFIG_SCSI_SATA=y CONFIG_SCSI_SATA_AHCI=m CONFIG_SCSI_SATA_SVW=m @@ -667,6 +669,7 @@ CONFIG_SCSI_QLA21XX=m CONFIG_SCSI_QLA22XX=m CONFIG_SCSI_QLA2300=m CONFIG_SCSI_QLA2322=m +CONFIG_SCSI_QLA24XX=m CONFIG_SCSI_QLA6312=m CONFIG_SCSI_QLA2XXX_FAILOVER=y CONFIG_SCSI_QLA4XXX=m @@ -729,7 +732,10 @@ CONFIG_BLK_DEV_DM_BBR=m # # Fusion MPT device support # -CONFIG_FUSION=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m CONFIG_FUSION_MAX_SGE=40 CONFIG_FUSION_CTL=m CONFIG_FUSION_LAN=m @@ -758,7 +764,7 @@ CONFIG_IEEE1394_OHCI1394=m # CONFIG_IEEE1394_VIDEO1394=m CONFIG_IEEE1394_SBP2=m -# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set +CONFIG_IEEE1394_SBP2_PHYS_DMA=y CONFIG_IEEE1394_ETH1394=m CONFIG_IEEE1394_DV1394=m CONFIG_IEEE1394_RAWIO=m @@ -932,6 +938,9 @@ CONFIG_IP_NF_CONNTRACK_MARK=y CONFIG_IP_NF_TARGET_CONNMARK=m CONFIG_IP_NF_MATCH_CONNMARK=m CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_MATCH_ADDRTYPE=m +CONFIG_IP_NF_MATCH_HASHLIMIT=m +# CONFIG_IP_NF_MATCH_IPV4OPTIONS is not set # # IPv6: Netfilter Configuration @@ -1194,6 +1203,7 @@ CONFIG_SK98LIN=m CONFIG_TIGON3=m CONFIG_NET_BROADCOM=m 
CONFIG_NET_BCM44=m +CONFIG_BNX2=m # # Ethernet (10000 Mbit) @@ -1202,6 +1212,7 @@ CONFIG_IXGB=m CONFIG_IXGB_NAPI=y CONFIG_S2IO=m CONFIG_S2IO_NAPI=y +# CONFIG_2BUFF_MODE is not set CONFIG_FDDI=y # CONFIG_DEFXX is not set CONFIG_SKFP=m @@ -1804,8 +1815,9 @@ CONFIG_IPMI_HANDLER=m CONFIG_IPMI_PANIC_EVENT=y CONFIG_IPMI_PANIC_STRING=y CONFIG_IPMI_DEVICE_INTERFACE=m -CONFIG_IPMI_KCS=m +CONFIG_IPMI_SI=m CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m # # Watchdog Cards @@ -1865,6 +1877,13 @@ CONFIG_SONYPI=m # # Ftape, the floppy tape device driver # + +# +# TPM devices +# +CONFIG_TCG_TPM=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m CONFIG_AGP=m CONFIG_AGP_ALI=m CONFIG_AGP_ATI=m @@ -1966,6 +1985,7 @@ CONFIG_SENSORS_VT1211=m CONFIG_SENSORS_W83781D=m CONFIG_SENSORS_W83L785TS=m CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_PCF8574=m # # Other I2C Chip support diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686.config index a84f323..9971cfa 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686.config @@ -67,7 +67,6 @@ CONFIG_STOP_MACHINE=y # # Processor type and features # -CONFIG_MEM_MIRROR=y # CONFIG_X86_PC is not set # CONFIG_X86_ELAN is not set # CONFIG_X86_VOYAGER is not set @@ -131,6 +130,8 @@ CONFIG_X86_CPUID=m # Firmware Drivers # CONFIG_EDD=m +CONFIG_DELL_RBU=m +CONFIG_DCDBAS=m # CONFIG_NOHIGHMEM is not set # CONFIG_HIGHMEM4G is not set CONFIG_HIGHMEM64G=y @@ -610,6 +611,7 @@ CONFIG_MEGARAID_NEWGEN=y CONFIG_MEGARAID_MM=m CONFIG_MEGARAID_MAILBOX=m CONFIG_MEGARAID_LEGACY=m +CONFIG_MEGARAID_SAS=m CONFIG_SCSI_SATA=y CONFIG_SCSI_SATA_AHCI=m CONFIG_SCSI_SATA_SVW=m @@ -667,6 +669,7 @@ CONFIG_SCSI_QLA21XX=m CONFIG_SCSI_QLA22XX=m CONFIG_SCSI_QLA2300=m CONFIG_SCSI_QLA2322=m +CONFIG_SCSI_QLA24XX=m CONFIG_SCSI_QLA6312=m CONFIG_SCSI_QLA2XXX_FAILOVER=y CONFIG_SCSI_QLA4XXX=m @@ -729,7 +732,10 @@ 
CONFIG_BLK_DEV_DM_BBR=m # # Fusion MPT device support # -CONFIG_FUSION=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m CONFIG_FUSION_MAX_SGE=40 CONFIG_FUSION_CTL=m CONFIG_FUSION_LAN=m @@ -758,7 +764,7 @@ CONFIG_IEEE1394_OHCI1394=m # CONFIG_IEEE1394_VIDEO1394=m CONFIG_IEEE1394_SBP2=m -# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set +CONFIG_IEEE1394_SBP2_PHYS_DMA=y CONFIG_IEEE1394_ETH1394=m CONFIG_IEEE1394_DV1394=m CONFIG_IEEE1394_RAWIO=m @@ -932,6 +938,9 @@ CONFIG_IP_NF_CONNTRACK_MARK=y CONFIG_IP_NF_TARGET_CONNMARK=m CONFIG_IP_NF_MATCH_CONNMARK=m CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_MATCH_ADDRTYPE=m +CONFIG_IP_NF_MATCH_HASHLIMIT=m +# CONFIG_IP_NF_MATCH_IPV4OPTIONS is not set # # IPv6: Netfilter Configuration @@ -1194,6 +1203,7 @@ CONFIG_SK98LIN=m CONFIG_TIGON3=m CONFIG_NET_BROADCOM=m CONFIG_NET_BCM44=m +CONFIG_BNX2=m # # Ethernet (10000 Mbit) @@ -1202,6 +1212,7 @@ CONFIG_IXGB=m CONFIG_IXGB_NAPI=y CONFIG_S2IO=m CONFIG_S2IO_NAPI=y +# CONFIG_2BUFF_MODE is not set CONFIG_FDDI=y # CONFIG_DEFXX is not set CONFIG_SKFP=m @@ -1804,8 +1815,9 @@ CONFIG_IPMI_HANDLER=m CONFIG_IPMI_PANIC_EVENT=y CONFIG_IPMI_PANIC_STRING=y CONFIG_IPMI_DEVICE_INTERFACE=m -CONFIG_IPMI_KCS=m +CONFIG_IPMI_SI=m CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m # # Watchdog Cards @@ -1865,6 +1877,13 @@ CONFIG_SONYPI=m # # Ftape, the floppy tape device driver # + +# +# TPM devices +# +CONFIG_TCG_TPM=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m CONFIG_AGP=m CONFIG_AGP_ALI=m CONFIG_AGP_ATI=m @@ -1966,6 +1985,7 @@ CONFIG_SENSORS_VT1211=m CONFIG_SENSORS_W83781D=m CONFIG_SENSORS_W83L785TS=m CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_PCF8574=m # # Other I2C Chip support diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64-smp.config index b27e735..c205dc4 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64-smp.config +++ 
b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64-smp.config @@ -93,6 +93,7 @@ CONFIG_IA64_SGI_SN_XPC=m CONFIG_FORCE_MAX_ZONEORDER=18 CONFIG_SMP=y CONFIG_NR_CPUS=128 +CONFIG_SCHED_SMT=y # CONFIG_PREEMPT is not set CONFIG_HAVE_DEC_LOCK=y CONFIG_IA32_SUPPORT=y @@ -439,6 +440,7 @@ CONFIG_MEGARAID_NEWGEN=y CONFIG_MEGARAID_MM=m CONFIG_MEGARAID_MAILBOX=m CONFIG_MEGARAID_LEGACY=m +CONFIG_MEGARAID_SAS=m CONFIG_SCSI_SATA=y CONFIG_SCSI_SATA_AHCI=m CONFIG_SCSI_SATA_SVW=m @@ -482,6 +484,7 @@ CONFIG_SCSI_QLA21XX=m CONFIG_SCSI_QLA22XX=m CONFIG_SCSI_QLA2300=m CONFIG_SCSI_QLA2322=m +CONFIG_SCSI_QLA24XX=m CONFIG_SCSI_QLA6312=m CONFIG_SCSI_QLA2XXX_FAILOVER=y CONFIG_SCSI_QLA4XXX=m @@ -520,7 +523,10 @@ CONFIG_BLK_DEV_DM_BBR=m # # Fusion MPT device support # -CONFIG_FUSION=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m CONFIG_FUSION_MAX_SGE=40 CONFIG_FUSION_CTL=m CONFIG_FUSION_LAN=m @@ -549,7 +555,7 @@ CONFIG_IEEE1394_OHCI1394=m # CONFIG_IEEE1394_VIDEO1394=m CONFIG_IEEE1394_SBP2=m -# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set +CONFIG_IEEE1394_SBP2_PHYS_DMA=y CONFIG_IEEE1394_ETH1394=m CONFIG_IEEE1394_DV1394=m CONFIG_IEEE1394_RAWIO=m @@ -723,6 +729,9 @@ CONFIG_IP_NF_CONNTRACK_MARK=y CONFIG_IP_NF_TARGET_CONNMARK=m CONFIG_IP_NF_MATCH_CONNMARK=m CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_MATCH_ADDRTYPE=m +CONFIG_IP_NF_MATCH_HASHLIMIT=m +# CONFIG_IP_NF_MATCH_IPV4OPTIONS is not set # # IPv6: Netfilter Configuration @@ -942,6 +951,7 @@ CONFIG_SK98LIN=m CONFIG_TIGON3=m CONFIG_NET_BROADCOM=m CONFIG_NET_BCM44=m +CONFIG_BNX2=m # # Ethernet (10000 Mbit) @@ -950,6 +960,7 @@ CONFIG_IXGB=m CONFIG_IXGB_NAPI=y CONFIG_S2IO=m CONFIG_S2IO_NAPI=y +# CONFIG_2BUFF_MODE is not set CONFIG_FDDI=y # CONFIG_DEFXX is not set CONFIG_SKFP=m @@ -1489,8 +1500,9 @@ CONFIG_IPMI_HANDLER=m CONFIG_IPMI_PANIC_EVENT=y CONFIG_IPMI_PANIC_STRING=y CONFIG_IPMI_DEVICE_INTERFACE=m -CONFIG_IPMI_KCS=m +CONFIG_IPMI_SI=m CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m # # Watchdog Cards 
@@ -1525,6 +1537,13 @@ CONFIG_APPLICOM=m # # Ftape, the floppy tape device driver # + +# +# TPM devices +# +CONFIG_TCG_TPM=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m CONFIG_AGP=m CONFIG_AGP_I460=m CONFIG_AGP_HP_ZX1=m @@ -1612,6 +1631,7 @@ CONFIG_SENSORS_VT1211=m CONFIG_SENSORS_W83781D=m # CONFIG_SENSORS_W83L785TS is not set CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_PCF8574=m # # Other I2C Chip support diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64.config index b27e735..c205dc4 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64.config @@ -93,6 +93,7 @@ CONFIG_IA64_SGI_SN_XPC=m CONFIG_FORCE_MAX_ZONEORDER=18 CONFIG_SMP=y CONFIG_NR_CPUS=128 +CONFIG_SCHED_SMT=y # CONFIG_PREEMPT is not set CONFIG_HAVE_DEC_LOCK=y CONFIG_IA32_SUPPORT=y @@ -439,6 +440,7 @@ CONFIG_MEGARAID_NEWGEN=y CONFIG_MEGARAID_MM=m CONFIG_MEGARAID_MAILBOX=m CONFIG_MEGARAID_LEGACY=m +CONFIG_MEGARAID_SAS=m CONFIG_SCSI_SATA=y CONFIG_SCSI_SATA_AHCI=m CONFIG_SCSI_SATA_SVW=m @@ -482,6 +484,7 @@ CONFIG_SCSI_QLA21XX=m CONFIG_SCSI_QLA22XX=m CONFIG_SCSI_QLA2300=m CONFIG_SCSI_QLA2322=m +CONFIG_SCSI_QLA24XX=m CONFIG_SCSI_QLA6312=m CONFIG_SCSI_QLA2XXX_FAILOVER=y CONFIG_SCSI_QLA4XXX=m @@ -520,7 +523,10 @@ CONFIG_BLK_DEV_DM_BBR=m # # Fusion MPT device support # -CONFIG_FUSION=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m CONFIG_FUSION_MAX_SGE=40 CONFIG_FUSION_CTL=m CONFIG_FUSION_LAN=m @@ -549,7 +555,7 @@ CONFIG_IEEE1394_OHCI1394=m # CONFIG_IEEE1394_VIDEO1394=m CONFIG_IEEE1394_SBP2=m -# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set +CONFIG_IEEE1394_SBP2_PHYS_DMA=y CONFIG_IEEE1394_ETH1394=m CONFIG_IEEE1394_DV1394=m CONFIG_IEEE1394_RAWIO=m @@ -723,6 +729,9 @@ CONFIG_IP_NF_CONNTRACK_MARK=y CONFIG_IP_NF_TARGET_CONNMARK=m CONFIG_IP_NF_MATCH_CONNMARK=m CONFIG_IP_NF_TARGET_CLUSTERIP=m 
+CONFIG_IP_NF_MATCH_ADDRTYPE=m +CONFIG_IP_NF_MATCH_HASHLIMIT=m +# CONFIG_IP_NF_MATCH_IPV4OPTIONS is not set # # IPv6: Netfilter Configuration @@ -942,6 +951,7 @@ CONFIG_SK98LIN=m CONFIG_TIGON3=m CONFIG_NET_BROADCOM=m CONFIG_NET_BCM44=m +CONFIG_BNX2=m # # Ethernet (10000 Mbit) @@ -950,6 +960,7 @@ CONFIG_IXGB=m CONFIG_IXGB_NAPI=y CONFIG_S2IO=m CONFIG_S2IO_NAPI=y +# CONFIG_2BUFF_MODE is not set CONFIG_FDDI=y # CONFIG_DEFXX is not set CONFIG_SKFP=m @@ -1489,8 +1500,9 @@ CONFIG_IPMI_HANDLER=m CONFIG_IPMI_PANIC_EVENT=y CONFIG_IPMI_PANIC_STRING=y CONFIG_IPMI_DEVICE_INTERFACE=m -CONFIG_IPMI_KCS=m +CONFIG_IPMI_SI=m CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m # # Watchdog Cards @@ -1525,6 +1537,13 @@ CONFIG_APPLICOM=m # # Ftape, the floppy tape device driver # + +# +# TPM devices +# +CONFIG_TCG_TPM=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m CONFIG_AGP=m CONFIG_AGP_I460=m CONFIG_AGP_HP_ZX1=m @@ -1612,6 +1631,7 @@ CONFIG_SENSORS_VT1211=m CONFIG_SENSORS_W83781D=m # CONFIG_SENSORS_W83L785TS is not set CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_PCF8574=m # # Other I2C Chip support diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config index 0048533..76b4290 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config @@ -374,7 +374,7 @@ CONFIG_IEEE1394_OHCI1394=m # CONFIG_IEEE1394_VIDEO1394=m CONFIG_IEEE1394_SBP2=m -# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set +CONFIG_IEEE1394_SBP2_PHYS_DMA=y CONFIG_IEEE1394_ETH1394=m CONFIG_IEEE1394_DV1394=m CONFIG_IEEE1394_RAWIO=m @@ -1450,3 +1450,5 @@ CONFIG_ZLIB_DEFLATE=m CONFIG_SUSE_KERNEL=y CONFIG_CFGNAME="pseries64" CONFIG_RELEASE="7.141" +CONFIG_IEEE1394=m +CONFIG_IEEE1394_PCILYNX=m diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config 
b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config index 2e87a08..b5e692b 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config @@ -373,7 +373,7 @@ CONFIG_IEEE1394_OHCI1394=m # CONFIG_IEEE1394_VIDEO1394=m CONFIG_IEEE1394_SBP2=m -# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set +CONFIG_IEEE1394_SBP2_PHYS_DMA=y CONFIG_IEEE1394_ETH1394=m CONFIG_IEEE1394_DV1394=m CONFIG_IEEE1394_RAWIO=m @@ -1449,3 +1449,5 @@ CONFIG_ZLIB_DEFLATE=m CONFIG_SUSE_KERNEL=y CONFIG_CFGNAME="pseries64" CONFIG_RELEASE="SLES9_SP1_BRANCH_91" +CONFIG_IEEE1394=m +CONFIG_IEEE1394_PCILYNX=m diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-x86_64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-x86_64-smp.config index 31b93e4..81f3823 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-x86_64-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-x86_64-smp.config @@ -84,6 +84,13 @@ CONFIG_X86_GOOD_APIC=y CONFIG_MICROCODE=m CONFIG_X86_MSR=m CONFIG_X86_CPUID=m + +# +# Firmware Drivers +# +CONFIG_EDD=m +CONFIG_DELL_RBU=m +CONFIG_DCDBAS=m CONFIG_X86_HT=y CONFIG_X86_IO_APIC=y CONFIG_X86_LOCAL_APIC=y @@ -496,6 +503,7 @@ CONFIG_MEGARAID_NEWGEN=y CONFIG_MEGARAID_MM=m CONFIG_MEGARAID_MAILBOX=m CONFIG_MEGARAID_LEGACY=m +CONFIG_MEGARAID_SAS=m CONFIG_SCSI_SATA=y CONFIG_SCSI_SATA_AHCI=m CONFIG_SCSI_SATA_SVW=m @@ -543,6 +551,7 @@ CONFIG_SCSI_QLA21XX=m CONFIG_SCSI_QLA22XX=m CONFIG_SCSI_QLA2300=m CONFIG_SCSI_QLA2322=m +CONFIG_SCSI_QLA24XX=m CONFIG_SCSI_QLA6312=m CONFIG_SCSI_QLA2XXX_FAILOVER=y CONFIG_SCSI_QLA4XXX=m @@ -581,7 +590,10 @@ CONFIG_BLK_DEV_DM_BBR=m # # Fusion MPT device support # -CONFIG_FUSION=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m CONFIG_FUSION_MAX_SGE=40 CONFIG_FUSION_CTL=m CONFIG_FUSION_LAN=m @@ -610,7 +622,7 @@ CONFIG_IEEE1394_OHCI1394=m # 
CONFIG_IEEE1394_VIDEO1394=m CONFIG_IEEE1394_SBP2=m -# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set +CONFIG_IEEE1394_SBP2_PHYS_DMA=y CONFIG_IEEE1394_ETH1394=m CONFIG_IEEE1394_DV1394=m CONFIG_IEEE1394_RAWIO=m @@ -787,6 +799,9 @@ CONFIG_IP_NF_CONNTRACK_MARK=y CONFIG_IP_NF_TARGET_CONNMARK=m CONFIG_IP_NF_MATCH_CONNMARK=m CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_MATCH_ADDRTYPE=m +CONFIG_IP_NF_MATCH_HASHLIMIT=m +# CONFIG_IP_NF_MATCH_IPV4OPTIONS is not set # # IPv6: Netfilter Configuration @@ -999,6 +1014,7 @@ CONFIG_SK98LIN=m CONFIG_TIGON3=m CONFIG_NET_BROADCOM=m CONFIG_NET_BCM44=m +CONFIG_BNX2=m # # Ethernet (10000 Mbit) @@ -1007,6 +1023,7 @@ CONFIG_IXGB=m CONFIG_IXGB_NAPI=y CONFIG_S2IO=m CONFIG_S2IO_NAPI=y +# CONFIG_2BUFF_MODE is not set CONFIG_FDDI=y # CONFIG_DEFXX is not set CONFIG_SKFP=m @@ -1494,8 +1511,9 @@ CONFIG_IPMI_HANDLER=m CONFIG_IPMI_PANIC_EVENT=y # CONFIG_IPMI_PANIC_STRING is not set CONFIG_IPMI_DEVICE_INTERFACE=m -CONFIG_IPMI_KCS=m +CONFIG_IPMI_SI=m CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m # # Watchdog Cards @@ -1546,8 +1564,16 @@ CONFIG_APPLICOM=m # # Ftape, the floppy tape device driver # + +# +# TPM devices +# +CONFIG_TCG_TPM=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m CONFIG_AGP=y CONFIG_AGP_AMD64=y +CONFIG_AGP_INTEL=m CONFIG_AGP_INTEL_MCH=m # CONFIG_DRM is not set @@ -1633,6 +1659,7 @@ CONFIG_SENSORS_VT1211=m CONFIG_SENSORS_W83781D=m CONFIG_SENSORS_W83L785TS=m CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_PCF8574=m # # Other I2C Chip support @@ -1763,6 +1790,8 @@ CONFIG_FB_VESA=y CONFIG_VIDEO_SELECT=y CONFIG_FB_HGA=m CONFIG_FB_RIVA=m +CONFIG_FB_I810=m +CONFIG_FB_I810_GTF=y CONFIG_FB_MATROX=m CONFIG_FB_MATROX_MILLENIUM=y CONFIG_FB_MATROX_MYSTIQUE=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-x86_64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-x86_64.config index 31b93e4..81f3823 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-x86_64.config +++ 
b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-x86_64.config @@ -84,6 +84,13 @@ CONFIG_X86_GOOD_APIC=y CONFIG_MICROCODE=m CONFIG_X86_MSR=m CONFIG_X86_CPUID=m + +# +# Firmware Drivers +# +CONFIG_EDD=m +CONFIG_DELL_RBU=m +CONFIG_DCDBAS=m CONFIG_X86_HT=y CONFIG_X86_IO_APIC=y CONFIG_X86_LOCAL_APIC=y @@ -496,6 +503,7 @@ CONFIG_MEGARAID_NEWGEN=y CONFIG_MEGARAID_MM=m CONFIG_MEGARAID_MAILBOX=m CONFIG_MEGARAID_LEGACY=m +CONFIG_MEGARAID_SAS=m CONFIG_SCSI_SATA=y CONFIG_SCSI_SATA_AHCI=m CONFIG_SCSI_SATA_SVW=m @@ -543,6 +551,7 @@ CONFIG_SCSI_QLA21XX=m CONFIG_SCSI_QLA22XX=m CONFIG_SCSI_QLA2300=m CONFIG_SCSI_QLA2322=m +CONFIG_SCSI_QLA24XX=m CONFIG_SCSI_QLA6312=m CONFIG_SCSI_QLA2XXX_FAILOVER=y CONFIG_SCSI_QLA4XXX=m @@ -581,7 +590,10 @@ CONFIG_BLK_DEV_DM_BBR=m # # Fusion MPT device support # -CONFIG_FUSION=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m CONFIG_FUSION_MAX_SGE=40 CONFIG_FUSION_CTL=m CONFIG_FUSION_LAN=m @@ -610,7 +622,7 @@ CONFIG_IEEE1394_OHCI1394=m # CONFIG_IEEE1394_VIDEO1394=m CONFIG_IEEE1394_SBP2=m -# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set +CONFIG_IEEE1394_SBP2_PHYS_DMA=y CONFIG_IEEE1394_ETH1394=m CONFIG_IEEE1394_DV1394=m CONFIG_IEEE1394_RAWIO=m @@ -787,6 +799,9 @@ CONFIG_IP_NF_CONNTRACK_MARK=y CONFIG_IP_NF_TARGET_CONNMARK=m CONFIG_IP_NF_MATCH_CONNMARK=m CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_MATCH_ADDRTYPE=m +CONFIG_IP_NF_MATCH_HASHLIMIT=m +# CONFIG_IP_NF_MATCH_IPV4OPTIONS is not set # # IPv6: Netfilter Configuration @@ -999,6 +1014,7 @@ CONFIG_SK98LIN=m CONFIG_TIGON3=m CONFIG_NET_BROADCOM=m CONFIG_NET_BCM44=m +CONFIG_BNX2=m # # Ethernet (10000 Mbit) @@ -1007,6 +1023,7 @@ CONFIG_IXGB=m CONFIG_IXGB_NAPI=y CONFIG_S2IO=m CONFIG_S2IO_NAPI=y +# CONFIG_2BUFF_MODE is not set CONFIG_FDDI=y # CONFIG_DEFXX is not set CONFIG_SKFP=m @@ -1494,8 +1511,9 @@ CONFIG_IPMI_HANDLER=m CONFIG_IPMI_PANIC_EVENT=y # CONFIG_IPMI_PANIC_STRING is not set CONFIG_IPMI_DEVICE_INTERFACE=m -CONFIG_IPMI_KCS=m +CONFIG_IPMI_SI=m 
CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m # # Watchdog Cards @@ -1546,8 +1564,16 @@ CONFIG_APPLICOM=m # # Ftape, the floppy tape device driver # + +# +# TPM devices +# +CONFIG_TCG_TPM=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m CONFIG_AGP=y CONFIG_AGP_AMD64=y +CONFIG_AGP_INTEL=m CONFIG_AGP_INTEL_MCH=m # CONFIG_DRM is not set @@ -1633,6 +1659,7 @@ CONFIG_SENSORS_VT1211=m CONFIG_SENSORS_W83781D=m CONFIG_SENSORS_W83L785TS=m CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_PCF8574=m # # Other I2C Chip support @@ -1763,6 +1790,8 @@ CONFIG_FB_VESA=y CONFIG_VIDEO_SELECT=y CONFIG_FB_HGA=m CONFIG_FB_RIVA=m +CONFIG_FB_I810=m +CONFIG_FB_I810_GTF=y CONFIG_FB_MATROX=m CONFIG_FB_MATROX_MILLENIUM=y CONFIG_FB_MATROX_MYSTIQUE=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config index 11d5d85..2b18f96 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.9-prep -# Fri May 13 14:09:31 2005 +# Linux kernel version: 2.6.9-prep.qp2.2.5.11.3qsnet +# Thu Oct 27 17:02:11 2005 # CONFIG_X86=y CONFIG_MMU=y @@ -27,6 +27,7 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_SYSCTL=y CONFIG_AUDIT=y CONFIG_AUDITSYSCALL=y +# CONFIG_AUDITFILESYSTEM is not set CONFIG_LOG_BUF_SHIFT=17 CONFIG_HOTPLUG=y # CONFIG_IKCONFIG is not set @@ -60,6 +61,7 @@ CONFIG_STOP_MACHINE=y # # Processor type and features # +CONFIG_MEM_MIRROR=y # CONFIG_X86_PC is not set # CONFIG_X86_ELAN is not set # CONFIG_X86_VOYAGER is not set @@ -125,23 +127,6 @@ CONFIG_I8K=m CONFIG_MICROCODE=m CONFIG_X86_MSR=m CONFIG_X86_CPUID=m -CONFIG_IOPROC=y -CONFIG_PTRACK=y - -# -# Quadrics QsNet -# -CONFIG_QSNET=m -CONFIG_ELAN3=m -CONFIG_ELAN4=m -CONFIG_EP=m -CONFIG_EIP=m -CONFIG_RMS=m -CONFIG_JTAG=m -CONFIG_NET_FC=y -CONFIG_SHAPER=m 
-CONFIG_NETCONSOLE=m - # # Firmware Drivers @@ -160,7 +145,8 @@ CONFIG_MTRR=y # CONFIG_IRQBALANCE is not set CONFIG_HAVE_DEC_LOCK=y CONFIG_REGPARM=y -CONFIG_KEXEC=y +CONFIG_IOPROC=y +CONFIG_PTRACK=y # # Power management options (ACPI, APM) @@ -546,6 +532,7 @@ CONFIG_SCSI_LOGGING=y # CONFIG_SCSI_SPI_ATTRS=m CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_ISCSI_ATTRS=m # # SCSI low-level drivers @@ -600,6 +587,7 @@ CONFIG_SCSI_GDTH=m CONFIG_SCSI_IPS=m CONFIG_SCSI_INITIO=m # CONFIG_SCSI_INIA100 is not set +CONFIG_SCSI_ISCSI_SFNET=m CONFIG_SCSI_PPA=m CONFIG_SCSI_IMM=m # CONFIG_SCSI_IZIP_EPP16 is not set @@ -623,7 +611,7 @@ CONFIG_SCSI_QLA22XX=m CONFIG_SCSI_QLA2300=m CONFIG_SCSI_QLA2322=m CONFIG_SCSI_QLA6312=m -CONFIG_SCSI_QLA6322=m +CONFIG_SCSI_QLA24XX=m # CONFIG_SCSI_SYM53C416 is not set # CONFIG_SCSI_DC395x is not set # CONFIG_SCSI_DC390T is not set @@ -664,6 +652,8 @@ CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m CONFIG_DM_MIRROR=m CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_EMC=m # # Fusion MPT device support @@ -676,7 +666,33 @@ CONFIG_FUSION_LAN=m # # IEEE 1394 (FireWire) support # -# CONFIG_IEEE1394 is not set +CONFIG_IEEE1394=m + +# +# Subsystem Options +# +# CONFIG_IEEE1394_VERBOSEDEBUG is not set +# CONFIG_IEEE1394_OUI_DB is not set +CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y +CONFIG_IEEE1394_CONFIG_ROM_IP1394=y + +# +# Device Drivers +# +CONFIG_IEEE1394_PCILYNX=m +CONFIG_IEEE1394_OHCI1394=m + +# +# Protocol Drivers +# +CONFIG_IEEE1394_VIDEO1394=m +CONFIG_IEEE1394_SBP2=m +CONFIG_IEEE1394_SBP2_PHYS_DMA=y +CONFIG_IEEE1394_ETH1394=m +CONFIG_IEEE1394_DV1394=m +CONFIG_IEEE1394_RAWIO=m +CONFIG_IEEE1394_CMP=m +CONFIG_IEEE1394_AMDTP=m # # I2O device support @@ -1122,6 +1138,17 @@ CONFIG_ABYSS=m CONFIG_SMCTR=m # +# Quadrics QsNet +# +CONFIG_QSNET=m +CONFIG_ELAN3=m +CONFIG_ELAN4=m +CONFIG_EP=m +CONFIG_EIP=m +CONFIG_RMS=m +CONFIG_JTAG=m + +# # Wireless LAN (non-hamradio) # CONFIG_NET_RADIO=y @@ -1150,9 +1177,8 @@ CONFIG_IEEE80211_WPA=m CONFIG_IEEE80211_CRYPT_CCMP=m 
CONFIG_IEEE80211_CRYPT_TKIP=m CONFIG_IPW2100=m -# CONFIG_IPW_DEBUG is not set CONFIG_IPW2100_PROMISC=y -# CONFIG_IPW2100_LEGACY_FW_LOAD is not set +# CONFIG_IPW_DEBUG is not set CONFIG_IPW2200=m CONFIG_AIRO=m CONFIG_HERMES=m @@ -1846,6 +1872,7 @@ CONFIG_SND_AU8810=m CONFIG_SND_AU8820=m CONFIG_SND_AU8830=m CONFIG_SND_AZT3328=m +CONFIG_SND_AZX=m CONFIG_SND_BT87X=m CONFIG_SND_CS46XX=m CONFIG_SND_CS46XX_NEW_DSP=y @@ -2147,6 +2174,7 @@ CONFIG_TMPFS_SECURITY=y CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y CONFIG_RAMFS=y +CONFIG_RELAYFS_FS=y # # Miscellaneous filesystems @@ -2284,9 +2312,10 @@ CONFIG_DEBUG_HIGHMEM=y # CONFIG_FRAME_POINTER is not set CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_STACKOVERFLOW=y -# CONFIG_KPROBES is not set +CONFIG_KPROBES=y CONFIG_DEBUG_STACK_USAGE=y # CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_4KSTACKS is not set # CONFIG_SCHEDSTATS is not set CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y @@ -2294,6 +2323,8 @@ CONFIG_X86_MPPARSE=y # # Security options # +CONFIG_KEYS=y +CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_CAPABILITIES=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686.config index 11d5d85..25a3848 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.9-prep -# Fri May 13 14:09:31 2005 +# Linux kernel version: 2.6.9-prep.qp2.2.5.11.3qsnet +# Thu Oct 27 17:01:23 2005 # CONFIG_X86=y CONFIG_MMU=y @@ -27,6 +27,7 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_SYSCTL=y CONFIG_AUDIT=y CONFIG_AUDITSYSCALL=y +# CONFIG_AUDITFILESYSTEM is not set CONFIG_LOG_BUF_SHIFT=17 CONFIG_HOTPLUG=y # CONFIG_IKCONFIG is not set @@ -60,6 +61,7 @@ CONFIG_STOP_MACHINE=y # # Processor type and features # +CONFIG_MEM_MIRROR=y # 
CONFIG_X86_PC is not set # CONFIG_X86_ELAN is not set # CONFIG_X86_VOYAGER is not set @@ -125,23 +127,6 @@ CONFIG_I8K=m CONFIG_MICROCODE=m CONFIG_X86_MSR=m CONFIG_X86_CPUID=m -CONFIG_IOPROC=y -CONFIG_PTRACK=y - -# -# Quadrics QsNet -# -CONFIG_QSNET=m -CONFIG_ELAN3=m -CONFIG_ELAN4=m -CONFIG_EP=m -CONFIG_EIP=m -CONFIG_RMS=m -CONFIG_JTAG=m -CONFIG_NET_FC=y -CONFIG_SHAPER=m -CONFIG_NETCONSOLE=m - # # Firmware Drivers @@ -160,7 +145,8 @@ CONFIG_MTRR=y # CONFIG_IRQBALANCE is not set CONFIG_HAVE_DEC_LOCK=y CONFIG_REGPARM=y -CONFIG_KEXEC=y +CONFIG_IOPROC=y +CONFIG_PTRACK=y # # Power management options (ACPI, APM) @@ -546,6 +532,7 @@ CONFIG_SCSI_LOGGING=y # CONFIG_SCSI_SPI_ATTRS=m CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_ISCSI_ATTRS=m # # SCSI low-level drivers @@ -600,6 +587,7 @@ CONFIG_SCSI_GDTH=m CONFIG_SCSI_IPS=m CONFIG_SCSI_INITIO=m # CONFIG_SCSI_INIA100 is not set +CONFIG_SCSI_ISCSI_SFNET=m CONFIG_SCSI_PPA=m CONFIG_SCSI_IMM=m # CONFIG_SCSI_IZIP_EPP16 is not set @@ -623,7 +611,7 @@ CONFIG_SCSI_QLA22XX=m CONFIG_SCSI_QLA2300=m CONFIG_SCSI_QLA2322=m CONFIG_SCSI_QLA6312=m -CONFIG_SCSI_QLA6322=m +CONFIG_SCSI_QLA24XX=m # CONFIG_SCSI_SYM53C416 is not set # CONFIG_SCSI_DC395x is not set # CONFIG_SCSI_DC390T is not set @@ -664,6 +652,8 @@ CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m CONFIG_DM_MIRROR=m CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_EMC=m # # Fusion MPT device support @@ -676,7 +666,32 @@ CONFIG_FUSION_LAN=m # # IEEE 1394 (FireWire) support # -# CONFIG_IEEE1394 is not set +CONFIG_IEEE1394=m + +# +# Subsystem Options +# +# CONFIG_IEEE1394_VERBOSEDEBUG is not set +# CONFIG_IEEE1394_OUI_DB is not set +# CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set + +# +# Device Drivers +# +CONFIG_IEEE1394_PCILYNX=m +CONFIG_IEEE1394_OHCI1394=y + +# +# Protocol Drivers +# +CONFIG_IEEE1394_VIDEO1394=m +CONFIG_IEEE1394_SBP2=m +CONFIG_IEEE1394_SBP2_PHYS_DMA=y +CONFIG_IEEE1394_ETH1394=m +CONFIG_IEEE1394_DV1394=m +CONFIG_IEEE1394_RAWIO=m +CONFIG_IEEE1394_CMP=m +CONFIG_IEEE1394_AMDTP=m 
# # I2O device support @@ -1122,6 +1137,17 @@ CONFIG_ABYSS=m CONFIG_SMCTR=m # +# Quadrics QsNet +# +CONFIG_QSNET=m +CONFIG_ELAN3=m +CONFIG_ELAN4=m +CONFIG_EP=m +CONFIG_EIP=m +CONFIG_RMS=m +CONFIG_JTAG=m + +# # Wireless LAN (non-hamradio) # CONFIG_NET_RADIO=y @@ -1150,9 +1176,8 @@ CONFIG_IEEE80211_WPA=m CONFIG_IEEE80211_CRYPT_CCMP=m CONFIG_IEEE80211_CRYPT_TKIP=m CONFIG_IPW2100=m -# CONFIG_IPW_DEBUG is not set CONFIG_IPW2100_PROMISC=y -# CONFIG_IPW2100_LEGACY_FW_LOAD is not set +# CONFIG_IPW_DEBUG is not set CONFIG_IPW2200=m CONFIG_AIRO=m CONFIG_HERMES=m @@ -1846,6 +1871,7 @@ CONFIG_SND_AU8810=m CONFIG_SND_AU8820=m CONFIG_SND_AU8830=m CONFIG_SND_AZT3328=m +CONFIG_SND_AZX=m CONFIG_SND_BT87X=m CONFIG_SND_CS46XX=m CONFIG_SND_CS46XX_NEW_DSP=y @@ -2147,6 +2173,7 @@ CONFIG_TMPFS_SECURITY=y CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y CONFIG_RAMFS=y +CONFIG_RELAYFS_FS=y # # Miscellaneous filesystems @@ -2284,9 +2311,10 @@ CONFIG_DEBUG_HIGHMEM=y # CONFIG_FRAME_POINTER is not set CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_STACKOVERFLOW=y -# CONFIG_KPROBES is not set +CONFIG_KPROBES=y CONFIG_DEBUG_STACK_USAGE=y # CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_4KSTACKS is not set # CONFIG_SCHEDSTATS is not set CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y @@ -2294,6 +2322,8 @@ CONFIG_X86_MPPARSE=y # # Security options # +CONFIG_KEYS=y +CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_CAPABILITIES=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config index 45f9db3..46499d4 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.9-5.0.3.EL_lustre-b1_4_rhel4.200503031449smp -# Thu Mar 3 14:52:42 2005 +# Linux kernel version: 
2.6.9-prep.qp2.2.5.11.3qsnet +# Thu Oct 27 17:05:00 2005 # # @@ -22,6 +22,7 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_SYSCTL=y CONFIG_AUDIT=y CONFIG_AUDITSYSCALL=y +# CONFIG_AUDITFILESYSTEM is not set CONFIG_LOG_BUF_SHIFT=17 CONFIG_HOTPLUG=y # CONFIG_IKCONFIG is not set @@ -93,21 +94,6 @@ CONFIG_IOPROC=y CONFIG_PTRACK=y # -# Quadrics QsNet -# -CONFIG_QSNET=m -CONFIG_ELAN3=m -CONFIG_ELAN4=m -CONFIG_EP=m -CONFIG_EIP=m -CONFIG_RMS=m -CONFIG_JTAG=m -CONFIG_NET_FC=y -CONFIG_SHAPER=m -CONFIG_NETCONSOLE=m - - -# # Firmware Drivers # CONFIG_EFI_VARS=y @@ -320,6 +306,7 @@ CONFIG_SCSI_LOGGING=y # CONFIG_SCSI_SPI_ATTRS=m CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_ISCSI_ATTRS=m # # SCSI low-level drivers @@ -366,6 +353,7 @@ CONFIG_SCSI_GDTH=m CONFIG_SCSI_IPS=m CONFIG_SCSI_INITIO=m # CONFIG_SCSI_INIA100 is not set +CONFIG_SCSI_ISCSI_SFNET=m CONFIG_SCSI_PPA=m CONFIG_SCSI_IMM=m # CONFIG_SCSI_IZIP_EPP16 is not set @@ -385,7 +373,7 @@ CONFIG_SCSI_QLA22XX=m CONFIG_SCSI_QLA2300=m CONFIG_SCSI_QLA2322=m CONFIG_SCSI_QLA6312=m -CONFIG_SCSI_QLA6322=m +CONFIG_SCSI_QLA24XX=m # CONFIG_SCSI_DC395x is not set # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_DEBUG is not set @@ -414,6 +402,8 @@ CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m CONFIG_DM_MIRROR=m CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_EMC=m # # Fusion MPT device support @@ -426,7 +416,33 @@ CONFIG_FUSION_LAN=m # # IEEE 1394 (FireWire) support # -# CONFIG_IEEE1394 is not set +CONFIG_IEEE1394=m + +# +# Subsystem Options +# +# CONFIG_IEEE1394_VERBOSEDEBUG is not set +# CONFIG_IEEE1394_OUI_DB is not set +CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y +CONFIG_IEEE1394_CONFIG_ROM_IP1394=y + +# +# Device Drivers +# +CONFIG_IEEE1394_PCILYNX=m +CONFIG_IEEE1394_OHCI1394=m + +# +# Protocol Drivers +# +CONFIG_IEEE1394_VIDEO1394=m +CONFIG_IEEE1394_SBP2=m +CONFIG_IEEE1394_SBP2_PHYS_DMA=y +CONFIG_IEEE1394_ETH1394=m +CONFIG_IEEE1394_DV1394=m +CONFIG_IEEE1394_RAWIO=m +CONFIG_IEEE1394_CMP=m +CONFIG_IEEE1394_AMDTP=m # # I2O device support @@ -837,6 +853,17 @@ 
CONFIG_TMSPCI=m CONFIG_ABYSS=m # +# Quadrics QsNet +# +CONFIG_QSNET=m +CONFIG_ELAN3=m +CONFIG_ELAN4=m +CONFIG_EP=m +CONFIG_EIP=m +CONFIG_RMS=m +CONFIG_JTAG=m + +# # Wireless LAN (non-hamradio) # CONFIG_NET_RADIO=y @@ -1433,6 +1460,7 @@ CONFIG_SND_AU8810=m CONFIG_SND_AU8820=m CONFIG_SND_AU8830=m CONFIG_SND_AZT3328=m +CONFIG_SND_AZX=m CONFIG_SND_BT87X=m CONFIG_SND_CS46XX=m CONFIG_SND_CS46XX_NEW_DSP=y @@ -1731,6 +1759,7 @@ CONFIG_TMPFS_SECURITY=y CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y CONFIG_RAMFS=y +CONFIG_RELAYFS_FS=y # # Miscellaneous filesystems @@ -1872,6 +1901,7 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_SPINLOCK_SLEEP=y # CONFIG_DEBUG_INFO is not set +CONFIG_KPROBES=y CONFIG_IA64_GRANULE_16MB=y # CONFIG_IA64_GRANULE_64MB is not set # CONFIG_IA64_PRINT_HAZARDS is not set @@ -1882,6 +1912,8 @@ CONFIG_IA64_GRANULE_16MB=y # # Security options # +CONFIG_KEYS=y +CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_CAPABILITIES=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config index 45f9db3..92aa946 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.9-5.0.3.EL_lustre-b1_4_rhel4.200503031449smp -# Thu Mar 3 14:52:42 2005 +# Linux kernel version: 2.6.9-prep.qp2.2.5.11.3qsnet +# Thu Oct 27 17:04:10 2005 # # @@ -22,6 +22,7 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_SYSCTL=y CONFIG_AUDIT=y CONFIG_AUDITSYSCALL=y +# CONFIG_AUDITFILESYSTEM is not set CONFIG_LOG_BUF_SHIFT=17 CONFIG_HOTPLUG=y # CONFIG_IKCONFIG is not set @@ -93,21 +94,6 @@ CONFIG_IOPROC=y CONFIG_PTRACK=y # -# Quadrics QsNet -# -CONFIG_QSNET=m -CONFIG_ELAN3=m -CONFIG_ELAN4=m -CONFIG_EP=m -CONFIG_EIP=m -CONFIG_RMS=m -CONFIG_JTAG=m -CONFIG_NET_FC=y 
-CONFIG_SHAPER=m -CONFIG_NETCONSOLE=m - - -# # Firmware Drivers # CONFIG_EFI_VARS=y @@ -320,6 +306,7 @@ CONFIG_SCSI_LOGGING=y # CONFIG_SCSI_SPI_ATTRS=m CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_ISCSI_ATTRS=m # # SCSI low-level drivers @@ -366,6 +353,7 @@ CONFIG_SCSI_GDTH=m CONFIG_SCSI_IPS=m CONFIG_SCSI_INITIO=m # CONFIG_SCSI_INIA100 is not set +CONFIG_SCSI_ISCSI_SFNET=m CONFIG_SCSI_PPA=m CONFIG_SCSI_IMM=m # CONFIG_SCSI_IZIP_EPP16 is not set @@ -385,7 +373,7 @@ CONFIG_SCSI_QLA22XX=m CONFIG_SCSI_QLA2300=m CONFIG_SCSI_QLA2322=m CONFIG_SCSI_QLA6312=m -CONFIG_SCSI_QLA6322=m +CONFIG_SCSI_QLA24XX=m # CONFIG_SCSI_DC395x is not set # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_DEBUG is not set @@ -414,6 +402,8 @@ CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m CONFIG_DM_MIRROR=m CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_EMC=m # # Fusion MPT device support @@ -426,7 +416,33 @@ CONFIG_FUSION_LAN=m # # IEEE 1394 (FireWire) support # -# CONFIG_IEEE1394 is not set +CONFIG_IEEE1394=m + +# +# Subsystem Options +# +# CONFIG_IEEE1394_VERBOSEDEBUG is not set +# CONFIG_IEEE1394_OUI_DB is not set +CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y +CONFIG_IEEE1394_CONFIG_ROM_IP1394=y + +# +# Device Drivers +# +CONFIG_IEEE1394_PCILYNX=m +CONFIG_IEEE1394_OHCI1394=m + +# +# Protocol Drivers +# +CONFIG_IEEE1394_VIDEO1394=m +CONFIG_IEEE1394_SBP2=m +CONFIG_IEEE1394_SBP2_PHYS_DMA=y +CONFIG_IEEE1394_ETH1394=m +CONFIG_IEEE1394_DV1394=m +CONFIG_IEEE1394_RAWIO=m +CONFIG_IEEE1394_CMP=m +CONFIG_IEEE1394_AMDTP=m # # I2O device support @@ -837,6 +853,17 @@ CONFIG_TMSPCI=m CONFIG_ABYSS=m # +# Quadrics QsNet +# +CONFIG_QSNET=m +CONFIG_ELAN3=m +CONFIG_ELAN4=m +CONFIG_EP=m +CONFIG_EIP=m +CONFIG_RMS=m +CONFIG_JTAG=m + +# # Wireless LAN (non-hamradio) # CONFIG_NET_RADIO=y @@ -1433,6 +1460,7 @@ CONFIG_SND_AU8810=m CONFIG_SND_AU8820=m CONFIG_SND_AU8830=m CONFIG_SND_AZT3328=m +CONFIG_SND_AZX=m CONFIG_SND_BT87X=m CONFIG_SND_CS46XX=m CONFIG_SND_CS46XX_NEW_DSP=y @@ -1731,6 +1759,7 @@ CONFIG_TMPFS_SECURITY=y 
CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y CONFIG_RAMFS=y +CONFIG_RELAYFS_FS=y # # Miscellaneous filesystems @@ -1872,6 +1901,7 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_SPINLOCK_SLEEP=y # CONFIG_DEBUG_INFO is not set +CONFIG_KPROBES=y CONFIG_IA64_GRANULE_16MB=y # CONFIG_IA64_GRANULE_64MB is not set # CONFIG_IA64_PRINT_HAZARDS is not set @@ -1882,6 +1912,8 @@ CONFIG_IA64_GRANULE_16MB=y # # Security options # +CONFIG_KEYS=y +CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_CAPABILITIES=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config index 9b2bb13..f621ca1 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.9-prep -# Tue Aug 2 15:46:19 2005 +# Linux kernel version: 2.6.9-prep.qp2.2.5.11.3qsnet +# Thu Oct 27 17:06:20 2005 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -11,6 +11,7 @@ CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_X86_CMPXCHG=y CONFIG_EARLY_PRINTK=y CONFIG_HPET_TIMER=y +CONFIG_X86_PM_TIMER=y CONFIG_HPET_EMULATE_RTC=y CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_IOMAP=y @@ -33,6 +34,7 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_SYSCTL=y CONFIG_AUDIT=y CONFIG_AUDITSYSCALL=y +# CONFIG_AUDITFILESYSTEM is not set CONFIG_LOG_BUF_SHIFT=17 CONFIG_HOTPLUG=y # CONFIG_IKCONFIG is not set @@ -95,21 +97,6 @@ CONFIG_IOPROC=y CONFIG_PTRACK=y # -# Quadrics QsNet -# -CONFIG_QSNET=m -CONFIG_ELAN3=m -CONFIG_ELAN4=m -CONFIG_EP=m -CONFIG_EIP=m -CONFIG_RMS=m -CONFIG_JTAG=m -CONFIG_NET_FC=y -CONFIG_SHAPER=m -CONFIG_NETCONSOLE=m - - -# # Power management options # CONFIG_PM=y @@ -130,6 +117,7 @@ CONFIG_ACPI_BUTTON=m CONFIG_ACPI_FAN=y CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_THERMAL=y +CONFIG_ACPI_NUMA=y 
CONFIG_ACPI_ASUS=m CONFIG_ACPI_TOSHIBA=m CONFIG_ACPI_BLACKLIST_YEAR=2001 @@ -208,7 +196,6 @@ CONFIG_IA32_EMULATION=y CONFIG_COMPAT=y CONFIG_SYSVIPC_COMPAT=y CONFIG_UID16=y -CONFIG_KEXEC=y # # Device Drivers @@ -450,6 +437,7 @@ CONFIG_SCSI_LOGGING=y # CONFIG_SCSI_SPI_ATTRS=m CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_ISCSI_ATTRS=m # # SCSI low-level drivers @@ -496,6 +484,7 @@ CONFIG_SCSI_GDTH=m CONFIG_SCSI_IPS=m CONFIG_SCSI_INITIO=m # CONFIG_SCSI_INIA100 is not set +CONFIG_SCSI_ISCSI_SFNET=m CONFIG_SCSI_PPA=m CONFIG_SCSI_IMM=m # CONFIG_SCSI_IZIP_EPP16 is not set @@ -515,7 +504,7 @@ CONFIG_SCSI_QLA22XX=m CONFIG_SCSI_QLA2300=m CONFIG_SCSI_QLA2322=m CONFIG_SCSI_QLA6312=m -CONFIG_SCSI_QLA6322=m +CONFIG_SCSI_QLA24XX=m # CONFIG_SCSI_DC395x is not set # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_DEBUG is not set @@ -544,6 +533,8 @@ CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m CONFIG_DM_MIRROR=m CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_EMC=m # # Fusion MPT device support @@ -556,7 +547,33 @@ CONFIG_FUSION_LAN=m # # IEEE 1394 (FireWire) support # -# CONFIG_IEEE1394 is not set +CONFIG_IEEE1394=m + +# +# Subsystem Options +# +# CONFIG_IEEE1394_VERBOSEDEBUG is not set +# CONFIG_IEEE1394_OUI_DB is not set +CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y +CONFIG_IEEE1394_CONFIG_ROM_IP1394=y + +# +# Device Drivers +# +CONFIG_IEEE1394_PCILYNX=m +CONFIG_IEEE1394_OHCI1394=m + +# +# Protocol Drivers +# +CONFIG_IEEE1394_VIDEO1394=m +CONFIG_IEEE1394_SBP2=m +CONFIG_IEEE1394_SBP2_PHYS_DMA=y +CONFIG_IEEE1394_ETH1394=m +CONFIG_IEEE1394_DV1394=m +CONFIG_IEEE1394_RAWIO=m +CONFIG_IEEE1394_CMP=m +CONFIG_IEEE1394_AMDTP=m # # I2O device support @@ -971,6 +988,17 @@ CONFIG_TMSPCI=m CONFIG_ABYSS=m # +# Quadrics QsNet +# +CONFIG_QSNET=m +CONFIG_ELAN3=m +CONFIG_ELAN4=m +CONFIG_EP=m +CONFIG_EIP=m +CONFIG_RMS=m +CONFIG_JTAG=m + +# # Wireless LAN (non-hamradio) # CONFIG_NET_RADIO=y @@ -996,9 +1024,8 @@ CONFIG_IEEE80211_CRYPT=m CONFIG_IEEE80211_WPA=m CONFIG_IEEE80211_CRYPT_TKIP=m CONFIG_IPW2100=m -# 
CONFIG_IPW_DEBUG is not set CONFIG_IPW2100_PROMISC=y -# CONFIG_IPW2100_LEGACY_FW_LOAD is not set +# CONFIG_IPW_DEBUG is not set CONFIG_IPW2200=m CONFIG_HERMES=m CONFIG_PLX_HERMES=m @@ -1600,6 +1627,7 @@ CONFIG_SND_AU8810=m CONFIG_SND_AU8820=m CONFIG_SND_AU8830=m CONFIG_SND_AZT3328=m +CONFIG_SND_AZX=m CONFIG_SND_BT87X=m CONFIG_SND_CS46XX=m CONFIG_SND_CS46XX_NEW_DSP=y @@ -1903,6 +1931,7 @@ CONFIG_TMPFS_SECURITY=y CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y CONFIG_RAMFS=y +CONFIG_RELAYFS_FS=y # # Miscellaneous filesystems @@ -2039,10 +2068,13 @@ CONFIG_DEBUG_SPINLOCK_SLEEP=y CONFIG_INIT_DEBUG=y # CONFIG_SCHEDSTATS is not set # CONFIG_IOMMU_DEBUG is not set +CONFIG_KPROBES=y # # Security options # +CONFIG_KEYS=y +CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_CAPABILITIES=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64.config index 9b2bb13..8a1b02f 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.9-prep -# Tue Aug 2 15:46:19 2005 +# Linux kernel version: 2.6.9-prep.qp2.2.5.11.3qsnet +# Thu Oct 27 17:05:31 2005 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -11,6 +11,7 @@ CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_X86_CMPXCHG=y CONFIG_EARLY_PRINTK=y CONFIG_HPET_TIMER=y +CONFIG_X86_PM_TIMER=y CONFIG_HPET_EMULATE_RTC=y CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_IOMAP=y @@ -33,6 +34,7 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_SYSCTL=y CONFIG_AUDIT=y CONFIG_AUDITSYSCALL=y +# CONFIG_AUDITFILESYSTEM is not set CONFIG_LOG_BUF_SHIFT=17 CONFIG_HOTPLUG=y # CONFIG_IKCONFIG is not set @@ -95,21 +97,6 @@ CONFIG_IOPROC=y CONFIG_PTRACK=y # -# Quadrics QsNet -# -CONFIG_QSNET=m -CONFIG_ELAN3=m -CONFIG_ELAN4=m -CONFIG_EP=m -CONFIG_EIP=m -CONFIG_RMS=m 
-CONFIG_JTAG=m -CONFIG_NET_FC=y -CONFIG_SHAPER=m -CONFIG_NETCONSOLE=m - - -# # Power management options # CONFIG_PM=y @@ -130,6 +117,7 @@ CONFIG_ACPI_BUTTON=m CONFIG_ACPI_FAN=y CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_THERMAL=y +CONFIG_ACPI_NUMA=y CONFIG_ACPI_ASUS=m CONFIG_ACPI_TOSHIBA=m CONFIG_ACPI_BLACKLIST_YEAR=2001 @@ -208,7 +196,6 @@ CONFIG_IA32_EMULATION=y CONFIG_COMPAT=y CONFIG_SYSVIPC_COMPAT=y CONFIG_UID16=y -CONFIG_KEXEC=y # # Device Drivers @@ -450,6 +437,7 @@ CONFIG_SCSI_LOGGING=y # CONFIG_SCSI_SPI_ATTRS=m CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_ISCSI_ATTRS=m # # SCSI low-level drivers @@ -496,6 +484,7 @@ CONFIG_SCSI_GDTH=m CONFIG_SCSI_IPS=m CONFIG_SCSI_INITIO=m # CONFIG_SCSI_INIA100 is not set +CONFIG_SCSI_ISCSI_SFNET=m CONFIG_SCSI_PPA=m CONFIG_SCSI_IMM=m # CONFIG_SCSI_IZIP_EPP16 is not set @@ -515,7 +504,7 @@ CONFIG_SCSI_QLA22XX=m CONFIG_SCSI_QLA2300=m CONFIG_SCSI_QLA2322=m CONFIG_SCSI_QLA6312=m -CONFIG_SCSI_QLA6322=m +CONFIG_SCSI_QLA24XX=m # CONFIG_SCSI_DC395x is not set # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_DEBUG is not set @@ -544,6 +533,8 @@ CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m CONFIG_DM_MIRROR=m CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_EMC=m # # Fusion MPT device support @@ -556,7 +547,33 @@ CONFIG_FUSION_LAN=m # # IEEE 1394 (FireWire) support # -# CONFIG_IEEE1394 is not set +CONFIG_IEEE1394=m + +# +# Subsystem Options +# +# CONFIG_IEEE1394_VERBOSEDEBUG is not set +# CONFIG_IEEE1394_OUI_DB is not set +CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y +CONFIG_IEEE1394_CONFIG_ROM_IP1394=y + +# +# Device Drivers +# +CONFIG_IEEE1394_PCILYNX=m +CONFIG_IEEE1394_OHCI1394=y + +# +# Protocol Drivers +# +CONFIG_IEEE1394_VIDEO1394=m +CONFIG_IEEE1394_SBP2=m +CONFIG_IEEE1394_SBP2_PHYS_DMA=y +CONFIG_IEEE1394_ETH1394=m +CONFIG_IEEE1394_DV1394=m +CONFIG_IEEE1394_RAWIO=m +CONFIG_IEEE1394_CMP=m +CONFIG_IEEE1394_AMDTP=m # # I2O device support @@ -971,6 +988,17 @@ CONFIG_TMSPCI=m CONFIG_ABYSS=m # +# Quadrics QsNet +# +CONFIG_QSNET=m +CONFIG_ELAN3=m 
+CONFIG_ELAN4=m +CONFIG_EP=m +CONFIG_EIP=m +CONFIG_RMS=m +CONFIG_JTAG=m + +# # Wireless LAN (non-hamradio) # CONFIG_NET_RADIO=y @@ -996,9 +1024,8 @@ CONFIG_IEEE80211_CRYPT=m CONFIG_IEEE80211_WPA=m CONFIG_IEEE80211_CRYPT_TKIP=m CONFIG_IPW2100=m -# CONFIG_IPW_DEBUG is not set CONFIG_IPW2100_PROMISC=y -# CONFIG_IPW2100_LEGACY_FW_LOAD is not set +# CONFIG_IPW_DEBUG is not set CONFIG_IPW2200=m CONFIG_HERMES=m CONFIG_PLX_HERMES=m @@ -1600,6 +1627,7 @@ CONFIG_SND_AU8810=m CONFIG_SND_AU8820=m CONFIG_SND_AU8830=m CONFIG_SND_AZT3328=m +CONFIG_SND_AZX=m CONFIG_SND_BT87X=m CONFIG_SND_CS46XX=m CONFIG_SND_CS46XX_NEW_DSP=y @@ -1903,6 +1931,7 @@ CONFIG_TMPFS_SECURITY=y CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y CONFIG_RAMFS=y +CONFIG_RELAYFS_FS=y # # Miscellaneous filesystems @@ -2039,10 +2068,13 @@ CONFIG_DEBUG_SPINLOCK_SLEEP=y CONFIG_INIT_DEBUG=y # CONFIG_SCHEDSTATS is not set # CONFIG_IOMMU_DEBUG is not set +CONFIG_KPROBES=y # # Security options # +CONFIG_KEYS=y +CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_CAPABILITIES=y diff --git a/lustre/kernel_patches/patches/2.4.19-ext3.patch b/lustre/kernel_patches/patches/2.4.19-ext3.patch deleted file mode 100644 index a167c6a..0000000 --- a/lustre/kernel_patches/patches/2.4.19-ext3.patch +++ /dev/null @@ -1,7892 +0,0 @@ -diff -rup --new-file linux.mcp2/fs/ext3/Makefile linux_tmp/fs/ext3/Makefile ---- linux.mcp2/fs/ext3/Makefile 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/Makefile 2001-12-21 09:41:55.000000000 -0800 -@@ -0,0 +1,16 @@ -+# -+# Makefile for the linux ext2-filesystem routines. -+# -+# Note! Dependencies are done automagically by 'make dep', which also -+# removes any old dependencies. DON'T put your own dependencies here -+# unless it's something special (ie not a .c file). -+# -+# Note 2! The CFLAGS definitions are now in the main makefile... 
-+ -+O_TARGET := ext3.o -+ -+obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -+ ioctl.o namei.o super.o symlink.o -+obj-m := $(O_TARGET) -+ -+include $(TOPDIR)/Rules.make -diff -rup --new-file linux.mcp2/fs/ext3/balloc.c linux_tmp/fs/ext3/balloc.c ---- linux.mcp2/fs/ext3/balloc.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/balloc.c 2002-08-02 17:39:45.000000000 -0700 -@@ -0,0 +1,999 @@ -+/* -+ * linux/fs/ext3/balloc.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * balloc.c contains the blocks allocation and deallocation routines -+ */ -+ -+/* -+ * The free blocks are managed by bitmaps. A file system contains several -+ * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap -+ * block for inodes, N blocks for the inode table and data blocks. -+ * -+ * The file system contains group descriptors which are located after the -+ * super block. Each descriptor contains the number of the bitmap block and -+ * the free blocks count in the block. The descriptors are loaded in memory -+ * when a file system is mounted (see ext3_read_super). 
-+ */ -+ -+ -+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) -+ -+struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, -+ unsigned int block_group, -+ struct buffer_head ** bh) -+{ -+ unsigned long group_desc; -+ unsigned long desc; -+ struct ext3_group_desc * gdp; -+ -+ if (block_group >= sb->u.ext3_sb.s_groups_count) { -+ ext3_error (sb, "ext3_get_group_desc", -+ "block_group >= groups_count - " -+ "block_group = %d, groups_count = %lu", -+ block_group, sb->u.ext3_sb.s_groups_count); -+ -+ return NULL; -+ } -+ -+ group_desc = block_group / EXT3_DESC_PER_BLOCK(sb); -+ desc = block_group % EXT3_DESC_PER_BLOCK(sb); -+ if (!sb->u.ext3_sb.s_group_desc[group_desc]) { -+ ext3_error (sb, "ext3_get_group_desc", -+ "Group descriptor not loaded - " -+ "block_group = %d, group_desc = %lu, desc = %lu", -+ block_group, group_desc, desc); -+ return NULL; -+ } -+ -+ gdp = (struct ext3_group_desc *) -+ sb->u.ext3_sb.s_group_desc[group_desc]->b_data; -+ if (bh) -+ *bh = sb->u.ext3_sb.s_group_desc[group_desc]; -+ return gdp + desc; -+} -+ -+/* -+ * Read the bitmap for a given block_group, reading into the specified -+ * slot in the superblock's bitmap cache. -+ * -+ * Return >=0 on success or a -ve error code. -+ */ -+ -+static int read_block_bitmap (struct super_block * sb, -+ unsigned int block_group, -+ unsigned long bitmap_nr) -+{ -+ struct ext3_group_desc * gdp; -+ struct buffer_head * bh = NULL; -+ int retval = -EIO; -+ -+ gdp = ext3_get_group_desc (sb, block_group, NULL); -+ if (!gdp) -+ goto error_out; -+ retval = 0; -+ bh = sb_bread(sb, le32_to_cpu(gdp->bg_block_bitmap)); -+ if (!bh) { -+ ext3_error (sb, "read_block_bitmap", -+ "Cannot read block bitmap - " -+ "block_group = %d, block_bitmap = %lu", -+ block_group, (unsigned long) gdp->bg_block_bitmap); -+ retval = -EIO; -+ } -+ /* -+ * On IO error, just leave a zero in the superblock's block pointer for -+ * this group. The IO will be retried next time. 
-+ */ -+error_out: -+ sb->u.ext3_sb.s_block_bitmap_number[bitmap_nr] = block_group; -+ sb->u.ext3_sb.s_block_bitmap[bitmap_nr] = bh; -+ return retval; -+} -+ -+/* -+ * load_block_bitmap loads the block bitmap for a blocks group -+ * -+ * It maintains a cache for the last bitmaps loaded. This cache is managed -+ * with a LRU algorithm. -+ * -+ * Notes: -+ * 1/ There is one cache per mounted file system. -+ * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups, -+ * this function reads the bitmap without maintaining a LRU cache. -+ * -+ * Return the slot used to store the bitmap, or a -ve error code. -+ */ -+static int __load_block_bitmap (struct super_block * sb, -+ unsigned int block_group) -+{ -+ int i, j, retval = 0; -+ unsigned long block_bitmap_number; -+ struct buffer_head * block_bitmap; -+ -+ if (block_group >= sb->u.ext3_sb.s_groups_count) -+ ext3_panic (sb, "load_block_bitmap", -+ "block_group >= groups_count - " -+ "block_group = %d, groups_count = %lu", -+ block_group, sb->u.ext3_sb.s_groups_count); -+ -+ if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED) { -+ if (sb->u.ext3_sb.s_block_bitmap[block_group]) { -+ if (sb->u.ext3_sb.s_block_bitmap_number[block_group] == -+ block_group) -+ return block_group; -+ ext3_error (sb, "__load_block_bitmap", -+ "block_group != block_bitmap_number"); -+ } -+ retval = read_block_bitmap (sb, block_group, block_group); -+ if (retval < 0) -+ return retval; -+ return block_group; -+ } -+ -+ for (i = 0; i < sb->u.ext3_sb.s_loaded_block_bitmaps && -+ sb->u.ext3_sb.s_block_bitmap_number[i] != block_group; i++) -+ ; -+ if (i < sb->u.ext3_sb.s_loaded_block_bitmaps && -+ sb->u.ext3_sb.s_block_bitmap_number[i] == block_group) { -+ block_bitmap_number = sb->u.ext3_sb.s_block_bitmap_number[i]; -+ block_bitmap = sb->u.ext3_sb.s_block_bitmap[i]; -+ for (j = i; j > 0; j--) { -+ sb->u.ext3_sb.s_block_bitmap_number[j] = -+ sb->u.ext3_sb.s_block_bitmap_number[j - 1]; -+ sb->u.ext3_sb.s_block_bitmap[j] = -+ 
sb->u.ext3_sb.s_block_bitmap[j - 1]; -+ } -+ sb->u.ext3_sb.s_block_bitmap_number[0] = block_bitmap_number; -+ sb->u.ext3_sb.s_block_bitmap[0] = block_bitmap; -+ -+ /* -+ * There's still one special case here --- if block_bitmap == 0 -+ * then our last attempt to read the bitmap failed and we have -+ * just ended up caching that failure. Try again to read it. -+ */ -+ if (!block_bitmap) -+ retval = read_block_bitmap (sb, block_group, 0); -+ } else { -+ if (sb->u.ext3_sb.s_loaded_block_bitmapsu.ext3_sb.s_loaded_block_bitmaps++; -+ else -+ brelse (sb->u.ext3_sb.s_block_bitmap -+ [EXT3_MAX_GROUP_LOADED - 1]); -+ for (j = sb->u.ext3_sb.s_loaded_block_bitmaps - 1; -+ j > 0; j--) { -+ sb->u.ext3_sb.s_block_bitmap_number[j] = -+ sb->u.ext3_sb.s_block_bitmap_number[j - 1]; -+ sb->u.ext3_sb.s_block_bitmap[j] = -+ sb->u.ext3_sb.s_block_bitmap[j - 1]; -+ } -+ retval = read_block_bitmap (sb, block_group, 0); -+ } -+ return retval; -+} -+ -+/* -+ * Load the block bitmap for a given block group. First of all do a couple -+ * of fast lookups for common cases and then pass the request onto the guts -+ * of the bitmap loader. -+ * -+ * Return the slot number of the group in the superblock bitmap cache's on -+ * success, or a -ve error code. -+ * -+ * There is still one inconsistency here --- if the number of groups in this -+ * filesystems is <= EXT3_MAX_GROUP_LOADED, then we have no way of -+ * differentiating between a group for which we have never performed a bitmap -+ * IO request, and a group for which the last bitmap read request failed. -+ */ -+static inline int load_block_bitmap (struct super_block * sb, -+ unsigned int block_group) -+{ -+ int slot; -+ -+ /* -+ * Do the lookup for the slot. First of all, check if we're asking -+ * for the same slot as last time, and did we succeed that last time? 
-+ */ -+ if (sb->u.ext3_sb.s_loaded_block_bitmaps > 0 && -+ sb->u.ext3_sb.s_block_bitmap_number[0] == block_group && -+ sb->u.ext3_sb.s_block_bitmap[0]) { -+ return 0; -+ } -+ /* -+ * Or can we do a fast lookup based on a loaded group on a filesystem -+ * small enough to be mapped directly into the superblock? -+ */ -+ else if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED && -+ sb->u.ext3_sb.s_block_bitmap_number[block_group]==block_group -+ && sb->u.ext3_sb.s_block_bitmap[block_group]) { -+ slot = block_group; -+ } -+ /* -+ * If not, then do a full lookup for this block group. -+ */ -+ else { -+ slot = __load_block_bitmap (sb, block_group); -+ } -+ -+ /* -+ * <0 means we just got an error -+ */ -+ if (slot < 0) -+ return slot; -+ -+ /* -+ * If it's a valid slot, we may still have cached a previous IO error, -+ * in which case the bh in the superblock cache will be zero. -+ */ -+ if (!sb->u.ext3_sb.s_block_bitmap[slot]) -+ return -EIO; -+ -+ /* -+ * Must have been read in OK to get this far. 
-+ */ -+ return slot; -+} -+ -+/* Free given blocks, update quota and i_blocks field */ -+void ext3_free_blocks (handle_t *handle, struct inode * inode, -+ unsigned long block, unsigned long count) -+{ -+ struct buffer_head *bitmap_bh; -+ struct buffer_head *gd_bh; -+ unsigned long block_group; -+ unsigned long bit; -+ unsigned long i; -+ int bitmap_nr; -+ unsigned long overflow; -+ struct super_block * sb; -+ struct ext3_group_desc * gdp; -+ struct ext3_super_block * es; -+ int err = 0, ret; -+ int dquot_freed_blocks = 0; -+ -+ sb = inode->i_sb; -+ if (!sb) { -+ printk ("ext3_free_blocks: nonexistent device"); -+ return; -+ } -+ lock_super (sb); -+ es = sb->u.ext3_sb.s_es; -+ if (block < le32_to_cpu(es->s_first_data_block) || -+ (block + count) > le32_to_cpu(es->s_blocks_count)) { -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks not in datazone - " -+ "block = %lu, count = %lu", block, count); -+ goto error_return; -+ } -+ -+ ext3_debug ("freeing block %lu\n", block); -+ -+do_more: -+ overflow = 0; -+ block_group = (block - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ bit = (block - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb); -+ /* -+ * Check to see if we are freeing blocks across a group -+ * boundary. 
-+ */ -+ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { -+ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); -+ count -= overflow; -+ } -+ bitmap_nr = load_block_bitmap (sb, block_group); -+ if (bitmap_nr < 0) -+ goto error_return; -+ -+ bitmap_bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; -+ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); -+ if (!gdp) -+ goto error_return; -+ -+ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || -+ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || -+ in_range (block, le32_to_cpu(gdp->bg_inode_table), -+ sb->u.ext3_sb.s_itb_per_group) || -+ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), -+ sb->u.ext3_sb.s_itb_per_group)) -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks in system zones - " -+ "Block = %lu, count = %lu", -+ block, count); -+ -+ /* -+ * We are about to start releasing blocks in the bitmap, -+ * so we need undo access. -+ */ -+ /* @@@ check errors */ -+ BUFFER_TRACE(bitmap_bh, "getting undo access"); -+ err = ext3_journal_get_undo_access(handle, bitmap_bh); -+ if (err) -+ goto error_return; -+ -+ /* -+ * We are about to modify some metadata. Call the journal APIs -+ * to unshare ->b_data if a currently-committing transaction is -+ * using it -+ */ -+ BUFFER_TRACE(gd_bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, gd_bh); -+ if (err) -+ goto error_return; -+ -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); -+ if (err) -+ goto error_return; -+ -+ for (i = 0; i < count; i++) { -+ /* -+ * An HJ special. This is expensive... 
-+ */ -+#ifdef CONFIG_JBD_DEBUG -+ { -+ struct buffer_head *debug_bh; -+ debug_bh = sb_get_hash_table(sb, block + i); -+ if (debug_bh) { -+ BUFFER_TRACE(debug_bh, "Deleted!"); -+ if (!bh2jh(bitmap_bh)->b_committed_data) -+ BUFFER_TRACE(debug_bh, -+ "No commited data in bitmap"); -+ BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); -+ __brelse(debug_bh); -+ } -+ } -+#endif -+ BUFFER_TRACE(bitmap_bh, "clear bit"); -+ if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) { -+ ext3_error (sb, __FUNCTION__, -+ "bit already cleared for block %lu", -+ block + i); -+ BUFFER_TRACE(bitmap_bh, "bit already cleared"); -+ } else { -+ dquot_freed_blocks++; -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)+1); -+ es->s_free_blocks_count = -+ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count)+1); -+ } -+ /* @@@ This prevents newly-allocated data from being -+ * freed and then reallocated within the same -+ * transaction. -+ * -+ * Ideally we would want to allow that to happen, but to -+ * do so requires making journal_forget() capable of -+ * revoking the queued write of a data block, which -+ * implies blocking on the journal lock. *forget() -+ * cannot block due to truncate races. -+ * -+ * Eventually we can fix this by making journal_forget() -+ * return a status indicating whether or not it was able -+ * to revoke the buffer. On successful revoke, it is -+ * safe not to set the allocation bit in the committed -+ * bitmap, because we know that there is no outstanding -+ * activity on the buffer any more and so it is safe to -+ * reallocate it. 
-+ */ -+ BUFFER_TRACE(bitmap_bh, "clear in b_committed_data"); -+ J_ASSERT_BH(bitmap_bh, -+ bh2jh(bitmap_bh)->b_committed_data != NULL); -+ ext3_set_bit(bit + i, bh2jh(bitmap_bh)->b_committed_data); -+ } -+ -+ /* We dirtied the bitmap block */ -+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ -+ /* And the group descriptor block */ -+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); -+ ret = ext3_journal_dirty_metadata(handle, gd_bh); -+ if (!err) err = ret; -+ -+ /* And the superblock */ -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "dirtied superblock"); -+ ret = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); -+ if (!err) err = ret; -+ -+ if (overflow && !err) { -+ block += count; -+ count = overflow; -+ goto do_more; -+ } -+ sb->s_dirt = 1; -+error_return: -+ ext3_std_error(sb, err); -+ unlock_super(sb); -+ if (dquot_freed_blocks) -+ DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); -+ return; -+} -+ -+/* For ext3 allocations, we must not reuse any blocks which are -+ * allocated in the bitmap buffer's "last committed data" copy. This -+ * prevents deletes from freeing up the page for reuse until we have -+ * committed the delete transaction. -+ * -+ * If we didn't do this, then deleting something and reallocating it as -+ * data would allow the old block to be overwritten before the -+ * transaction committed (because we force data to disk before commit). -+ * This would lead to corruption if we crashed between overwriting the -+ * data and committing the delete. -+ * -+ * @@@ We may want to make this allocation behaviour conditional on -+ * data-writes at some point, and disable it for metadata allocations or -+ * sync-data inodes. 
-+ */ -+static int ext3_test_allocatable(int nr, struct buffer_head *bh) -+{ -+ if (ext3_test_bit(nr, bh->b_data)) -+ return 0; -+ if (!buffer_jbd(bh) || !bh2jh(bh)->b_committed_data) -+ return 1; -+ return !ext3_test_bit(nr, bh2jh(bh)->b_committed_data); -+} -+ -+/* -+ * Find an allocatable block in a bitmap. We honour both the bitmap and -+ * its last-committed copy (if that exists), and perform the "most -+ * appropriate allocation" algorithm of looking for a free block near -+ * the initial goal; then for a free byte somewhere in the bitmap; then -+ * for any free bit in the bitmap. -+ */ -+static int find_next_usable_block(int start, -+ struct buffer_head *bh, int maxblocks) -+{ -+ int here, next; -+ char *p, *r; -+ -+ if (start > 0) { -+ /* -+ * The goal was occupied; search forward for a free -+ * block within the next XX blocks. -+ * -+ * end_goal is more or less random, but it has to be -+ * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the -+ * next 64-bit boundary is simple.. -+ */ -+ int end_goal = (start + 63) & ~63; -+ here = ext3_find_next_zero_bit(bh->b_data, end_goal, start); -+ if (here < end_goal && ext3_test_allocatable(here, bh)) -+ return here; -+ -+ ext3_debug ("Bit not found near goal\n"); -+ -+ } -+ -+ here = start; -+ if (here < 0) -+ here = 0; -+ -+ /* -+ * There has been no free block found in the near vicinity of -+ * the goal: do a search forward through the block groups, -+ * searching in each group first for an entire free byte in the -+ * bitmap and then for any free bit. -+ * -+ * Search first in the remainder of the current group -+ */ -+ p = ((char *) bh->b_data) + (here >> 3); -+ r = memscan(p, 0, (maxblocks - here + 7) >> 3); -+ next = (r - ((char *) bh->b_data)) << 3; -+ -+ if (next < maxblocks && ext3_test_allocatable(next, bh)) -+ return next; -+ -+ /* The bitmap search --- search forward alternately -+ * through the actual bitmap and the last-committed copy -+ * until we find a bit free in both. 
*/ -+ -+ while (here < maxblocks) { -+ next = ext3_find_next_zero_bit ((unsigned long *) bh->b_data, -+ maxblocks, here); -+ if (next >= maxblocks) -+ return -1; -+ if (ext3_test_allocatable(next, bh)) -+ return next; -+ -+ J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data); -+ here = ext3_find_next_zero_bit -+ ((unsigned long *) bh2jh(bh)->b_committed_data, -+ maxblocks, next); -+ } -+ return -1; -+} -+ -+/* -+ * ext3_new_block uses a goal block to assist allocation. If the goal is -+ * free, or there is a free block within 32 blocks of the goal, that block -+ * is allocated. Otherwise a forward search is made for a free block; within -+ * each block group the search first looks for an entire free byte in the block -+ * bitmap, and then for any free bit if that fails. -+ * This function also updates quota and i_blocks field. -+ */ -+int ext3_new_block (handle_t *handle, struct inode * inode, -+ unsigned long goal, u32 * prealloc_count, -+ u32 * prealloc_block, int * errp) -+{ -+ struct buffer_head * bh, *bhtmp; -+ struct buffer_head * bh2; -+#if 0 -+ char * p, * r; -+#endif -+ int i, j, k, tmp, alloctmp; -+ int bitmap_nr; -+ int fatal = 0, err; -+ int performed_allocation = 0; -+ struct super_block * sb; -+ struct ext3_group_desc * gdp; -+ struct ext3_super_block * es; -+#ifdef EXT3FS_DEBUG -+ static int goal_hits = 0, goal_attempts = 0; -+#endif -+ *errp = -ENOSPC; -+ sb = inode->i_sb; -+ if (!sb) { -+ printk ("ext3_new_block: nonexistent device"); -+ return 0; -+ } -+ -+ /* -+ * Check quota for allocation of this block. 
-+ */ -+ if (DQUOT_ALLOC_BLOCK(inode, 1)) { -+ *errp = -EDQUOT; -+ return 0; -+ } -+ -+ lock_super (sb); -+ es = sb->u.ext3_sb.s_es; -+ if (le32_to_cpu(es->s_free_blocks_count) <= -+ le32_to_cpu(es->s_r_blocks_count) && -+ ((sb->u.ext3_sb.s_resuid != current->fsuid) && -+ (sb->u.ext3_sb.s_resgid == 0 || -+ !in_group_p (sb->u.ext3_sb.s_resgid)) && -+ !capable(CAP_SYS_RESOURCE))) -+ goto out; -+ -+ ext3_debug ("goal=%lu.\n", goal); -+ -+ /* -+ * First, test whether the goal block is free. -+ */ -+ if (goal < le32_to_cpu(es->s_first_data_block) || -+ goal >= le32_to_cpu(es->s_blocks_count)) -+ goal = le32_to_cpu(es->s_first_data_block); -+ i = (goal - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ gdp = ext3_get_group_desc (sb, i, &bh2); -+ if (!gdp) -+ goto io_error; -+ -+ if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) { -+ j = ((goal - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb)); -+#ifdef EXT3FS_DEBUG -+ if (j) -+ goal_attempts++; -+#endif -+ bitmap_nr = load_block_bitmap (sb, i); -+ if (bitmap_nr < 0) -+ goto io_error; -+ -+ bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; -+ -+ ext3_debug ("goal is at %d:%d.\n", i, j); -+ -+ if (ext3_test_allocatable(j, bh)) { -+#ifdef EXT3FS_DEBUG -+ goal_hits++; -+ ext3_debug ("goal bit allocated.\n"); -+#endif -+ goto got_block; -+ } -+ -+ j = find_next_usable_block(j, bh, EXT3_BLOCKS_PER_GROUP(sb)); -+ if (j >= 0) -+ goto search_back; -+ } -+ -+ ext3_debug ("Bit not found in block group %d.\n", i); -+ -+ /* -+ * Now search the rest of the groups. We assume that -+ * i and gdp correctly point to the last group visited. 
-+ */ -+ for (k = 0; k < sb->u.ext3_sb.s_groups_count; k++) { -+ i++; -+ if (i >= sb->u.ext3_sb.s_groups_count) -+ i = 0; -+ gdp = ext3_get_group_desc (sb, i, &bh2); -+ if (!gdp) { -+ *errp = -EIO; -+ goto out; -+ } -+ if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) { -+ bitmap_nr = load_block_bitmap (sb, i); -+ if (bitmap_nr < 0) -+ goto io_error; -+ -+ bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; -+ j = find_next_usable_block(-1, bh, -+ EXT3_BLOCKS_PER_GROUP(sb)); -+ if (j >= 0) -+ goto search_back; -+ } -+ } -+ -+ /* No space left on the device */ -+ goto out; -+ -+search_back: -+ /* -+ * We have succeeded in finding a free byte in the block -+ * bitmap. Now search backwards up to 7 bits to find the -+ * start of this group of free blocks. -+ */ -+ for ( k = 0; -+ k < 7 && j > 0 && ext3_test_allocatable(j - 1, bh); -+ k++, j--) -+ ; -+ -+got_block: -+ -+ ext3_debug ("using block group %d(%d)\n", i, gdp->bg_free_blocks_count); -+ -+ /* Make sure we use undo access for the bitmap, because it is -+ critical that we do the frozen_data COW on bitmap buffers in -+ all cases even if the buffer is in BJ_Forget state in the -+ committing transaction. 
*/ -+ BUFFER_TRACE(bh, "get undo access for marking new block"); -+ fatal = ext3_journal_get_undo_access(handle, bh); -+ if (fatal) goto out; -+ -+ BUFFER_TRACE(bh2, "get_write_access"); -+ fatal = ext3_journal_get_write_access(handle, bh2); -+ if (fatal) goto out; -+ -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); -+ fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); -+ if (fatal) goto out; -+ -+ tmp = j + i * EXT3_BLOCKS_PER_GROUP(sb) -+ + le32_to_cpu(es->s_first_data_block); -+ -+ if (tmp == le32_to_cpu(gdp->bg_block_bitmap) || -+ tmp == le32_to_cpu(gdp->bg_inode_bitmap) || -+ in_range (tmp, le32_to_cpu(gdp->bg_inode_table), -+ sb->u.ext3_sb.s_itb_per_group)) -+ ext3_error (sb, "ext3_new_block", -+ "Allocating block in system zone - " -+ "block = %u", tmp); -+ -+ /* The superblock lock should guard against anybody else beating -+ * us to this point! */ -+ J_ASSERT_BH(bh, !ext3_test_bit(j, bh->b_data)); -+ BUFFER_TRACE(bh, "setting bitmap bit"); -+ ext3_set_bit(j, bh->b_data); -+ performed_allocation = 1; -+ -+#ifdef CONFIG_JBD_DEBUG -+ { -+ struct buffer_head *debug_bh; -+ -+ /* Record bitmap buffer state in the newly allocated block */ -+ debug_bh = sb_get_hash_table(sb, tmp); -+ if (debug_bh) { -+ BUFFER_TRACE(debug_bh, "state when allocated"); -+ BUFFER_TRACE2(debug_bh, bh, "bitmap state"); -+ brelse(debug_bh); -+ } -+ } -+#endif -+ if (buffer_jbd(bh) && bh2jh(bh)->b_committed_data) -+ J_ASSERT_BH(bh, !ext3_test_bit(j, bh2jh(bh)->b_committed_data)); -+ bhtmp = bh; -+ alloctmp = j; -+ -+ ext3_debug ("found bit %d\n", j); -+ -+ /* -+ * Do block preallocation now if required. -+ */ -+#ifdef EXT3_PREALLOCATE -+ /* -+ * akpm: this is not enabled for ext3. Need to use -+ * ext3_test_allocatable() -+ */ -+ /* Writer: ->i_prealloc* */ -+ if (prealloc_count && !*prealloc_count) { -+ int prealloc_goal; -+ unsigned long next_block = tmp + 1; -+ -+ prealloc_goal = es->s_prealloc_blocks ? 
-+ es->s_prealloc_blocks : EXT3_DEFAULT_PREALLOC_BLOCKS; -+ -+ *prealloc_block = next_block; -+ /* Writer: end */ -+ for (k = 1; -+ k < prealloc_goal && (j + k) < EXT3_BLOCKS_PER_GROUP(sb); -+ k++, next_block++) { -+ if (DQUOT_PREALLOC_BLOCK(inode, 1)) -+ break; -+ /* Writer: ->i_prealloc* */ -+ if (*prealloc_block + *prealloc_count != next_block || -+ ext3_set_bit (j + k, bh->b_data)) { -+ /* Writer: end */ -+ DQUOT_FREE_BLOCK(inode, 1); -+ break; -+ } -+ (*prealloc_count)++; -+ /* Writer: end */ -+ } -+ /* -+ * As soon as we go for per-group spinlocks we'll need these -+ * done inside the loop above. -+ */ -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - -+ (k - 1)); -+ es->s_free_blocks_count = -+ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - -+ (k - 1)); -+ ext3_debug ("Preallocated a further %lu bits.\n", -+ (k - 1)); -+ } -+#endif -+ -+ j = tmp; -+ -+ BUFFER_TRACE(bh, "journal_dirty_metadata for bitmap block"); -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (!fatal) fatal = err; -+ -+ if (j >= le32_to_cpu(es->s_blocks_count)) { -+ ext3_error (sb, "ext3_new_block", -+ "block(%d) >= blocks count(%d) - " -+ "block_group = %d, es == %p ",j, -+ le32_to_cpu(es->s_blocks_count), i, es); -+ goto out; -+ } -+ -+ /* -+ * It is up to the caller to add the new buffer to a journal -+ * list of some description. We don't know in advance whether -+ * the caller wants to use it as metadata or data. -+ */ -+ -+ ext3_debug ("allocating block %d. 
" -+ "Goal hits %d of %d.\n", j, goal_hits, goal_attempts); -+ -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1); -+ es->s_free_blocks_count = -+ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - 1); -+ -+ BUFFER_TRACE(bh2, "journal_dirty_metadata for group descriptor"); -+ err = ext3_journal_dirty_metadata(handle, bh2); -+ if (!fatal) fatal = err; -+ -+ BUFFER_TRACE(bh, "journal_dirty_metadata for superblock"); -+ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); -+ if (!fatal) fatal = err; -+ -+ sb->s_dirt = 1; -+ if (fatal) -+ goto out; -+ -+ unlock_super (sb); -+ *errp = 0; -+ return j; -+ -+io_error: -+ *errp = -EIO; -+out: -+ if (fatal) { -+ *errp = fatal; -+ ext3_std_error(sb, fatal); -+ } -+ unlock_super (sb); -+ /* -+ * Undo the block allocation -+ */ -+ if (!performed_allocation) -+ DQUOT_FREE_BLOCK(inode, 1); -+ return 0; -+ -+} -+ -+unsigned long ext3_count_free_blocks (struct super_block * sb) -+{ -+#ifdef EXT3FS_DEBUG -+ struct ext3_super_block * es; -+ unsigned long desc_count, bitmap_count, x; -+ int bitmap_nr; -+ struct ext3_group_desc * gdp; -+ int i; -+ -+ lock_super (sb); -+ es = sb->u.ext3_sb.s_es; -+ desc_count = 0; -+ bitmap_count = 0; -+ gdp = NULL; -+ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { -+ gdp = ext3_get_group_desc (sb, i, NULL); -+ if (!gdp) -+ continue; -+ desc_count += le16_to_cpu(gdp->bg_free_blocks_count); -+ bitmap_nr = load_block_bitmap (sb, i); -+ if (bitmap_nr < 0) -+ continue; -+ -+ x = ext3_count_free (sb->u.ext3_sb.s_block_bitmap[bitmap_nr], -+ sb->s_blocksize); -+ printk ("group %d: stored = %d, counted = %lu\n", -+ i, le16_to_cpu(gdp->bg_free_blocks_count), x); -+ bitmap_count += x; -+ } -+ printk("ext3_count_free_blocks: stored = %lu, computed = %lu, %lu\n", -+ le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count); -+ unlock_super (sb); -+ return bitmap_count; -+#else -+ return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_blocks_count); -+#endif -+} 
-+ -+static inline int block_in_use (unsigned long block, -+ struct super_block * sb, -+ unsigned char * map) -+{ -+ return ext3_test_bit ((block - -+ le32_to_cpu(sb->u.ext3_sb.s_es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb), map); -+} -+ -+static inline int test_root(int a, int b) -+{ -+ if (a == 0) -+ return 1; -+ while (1) { -+ if (a == 1) -+ return 1; -+ if (a % b) -+ return 0; -+ a = a / b; -+ } -+} -+ -+int ext3_group_sparse(int group) -+{ -+ return (test_root(group, 3) || test_root(group, 5) || -+ test_root(group, 7)); -+} -+ -+/** -+ * ext3_bg_has_super - number of blocks used by the superblock in group -+ * @sb: superblock for filesystem -+ * @group: group number to check -+ * -+ * Return the number of blocks used by the superblock (primary or backup) -+ * in this group. Currently this will be only 0 or 1. -+ */ -+int ext3_bg_has_super(struct super_block *sb, int group) -+{ -+ if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&& -+ !ext3_group_sparse(group)) -+ return 0; -+ return 1; -+} -+ -+/** -+ * ext3_bg_num_gdb - number of blocks used by the group table in group -+ * @sb: superblock for filesystem -+ * @group: group number to check -+ * -+ * Return the number of blocks used by the group descriptor table -+ * (primary or backup) in this group. In the future there may be a -+ * different number of descriptor blocks in each group. 
-+ */ -+unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) -+{ -+ if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&& -+ !ext3_group_sparse(group)) -+ return 0; -+ return EXT3_SB(sb)->s_gdb_count; -+} -+ -+#ifdef CONFIG_EXT3_CHECK -+/* Called at mount-time, super-block is locked */ -+void ext3_check_blocks_bitmap (struct super_block * sb) -+{ -+ struct buffer_head * bh; -+ struct ext3_super_block * es; -+ unsigned long desc_count, bitmap_count, x, j; -+ unsigned long desc_blocks; -+ int bitmap_nr; -+ struct ext3_group_desc * gdp; -+ int i; -+ -+ es = sb->u.ext3_sb.s_es; -+ desc_count = 0; -+ bitmap_count = 0; -+ gdp = NULL; -+ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { -+ gdp = ext3_get_group_desc (sb, i, NULL); -+ if (!gdp) -+ continue; -+ desc_count += le16_to_cpu(gdp->bg_free_blocks_count); -+ bitmap_nr = load_block_bitmap (sb, i); -+ if (bitmap_nr < 0) -+ continue; -+ -+ bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr]; -+ -+ if (ext3_bg_has_super(sb, i) && !ext3_test_bit(0, bh->b_data)) -+ ext3_error(sb, __FUNCTION__, -+ "Superblock in group %d is marked free", i); -+ -+ desc_blocks = ext3_bg_num_gdb(sb, i); -+ for (j = 0; j < desc_blocks; j++) -+ if (!ext3_test_bit(j + 1, bh->b_data)) -+ ext3_error(sb, __FUNCTION__, -+ "Descriptor block #%ld in group " -+ "%d is marked free", j, i); -+ -+ if (!block_in_use (le32_to_cpu(gdp->bg_block_bitmap), -+ sb, bh->b_data)) -+ ext3_error (sb, "ext3_check_blocks_bitmap", -+ "Block bitmap for group %d is marked free", -+ i); -+ -+ if (!block_in_use (le32_to_cpu(gdp->bg_inode_bitmap), -+ sb, bh->b_data)) -+ ext3_error (sb, "ext3_check_blocks_bitmap", -+ "Inode bitmap for group %d is marked free", -+ i); -+ -+ for (j = 0; j < sb->u.ext3_sb.s_itb_per_group; j++) -+ if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j, -+ sb, bh->b_data)) -+ ext3_error (sb, "ext3_check_blocks_bitmap", -+ "Block #%d of the inode table in " -+ "group %d is marked free", j, i); -+ -+ x = ext3_count_free 
(bh, sb->s_blocksize); -+ if (le16_to_cpu(gdp->bg_free_blocks_count) != x) -+ ext3_error (sb, "ext3_check_blocks_bitmap", -+ "Wrong free blocks count for group %d, " -+ "stored = %d, counted = %lu", i, -+ le16_to_cpu(gdp->bg_free_blocks_count), x); -+ bitmap_count += x; -+ } -+ if (le32_to_cpu(es->s_free_blocks_count) != bitmap_count) -+ ext3_error (sb, "ext3_check_blocks_bitmap", -+ "Wrong free blocks count in super block, " -+ "stored = %lu, counted = %lu", -+ (unsigned long)le32_to_cpu(es->s_free_blocks_count), -+ bitmap_count); -+} -+#endif -diff -rup --new-file linux.mcp2/fs/ext3/bitmap.c linux_tmp/fs/ext3/bitmap.c ---- linux.mcp2/fs/ext3/bitmap.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/bitmap.c 2001-11-09 14:25:04.000000000 -0800 -@@ -0,0 +1,26 @@ -+/* -+ * linux/fs/ext3/bitmap.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ */ -+ -+#include -+ -+ -+static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; -+ -+unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) -+{ -+ unsigned int i; -+ unsigned long sum = 0; -+ -+ if (!map) -+ return (0); -+ for (i = 0; i < numchars; i++) -+ sum += nibblemap[map->b_data[i] & 0xf] + -+ nibblemap[(map->b_data[i] >> 4) & 0xf]; -+ return (sum); -+} -diff -rup --new-file linux.mcp2/fs/ext3/dir.c linux_tmp/fs/ext3/dir.c ---- linux.mcp2/fs/ext3/dir.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/dir.c 2001-11-09 14:25:04.000000000 -0800 -@@ -0,0 +1,190 @@ -+/* -+ * linux/fs/ext3/dir.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/dir.c -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * ext3 directory handling functions -+ * -+ * Big-endian to 
little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ */ -+ -+#include -+#include -+#include -+ -+static unsigned char ext3_filetype_table[] = { -+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -+}; -+ -+static int ext3_readdir(struct file *, void *, filldir_t); -+ -+struct file_operations ext3_dir_operations = { -+ read: generic_read_dir, -+ readdir: ext3_readdir, /* BKL held */ -+ ioctl: ext3_ioctl, /* BKL held */ -+ fsync: ext3_sync_file, /* BKL held */ -+}; -+ -+int ext3_check_dir_entry (const char * function, struct inode * dir, -+ struct ext3_dir_entry_2 * de, -+ struct buffer_head * bh, -+ unsigned long offset) -+{ -+ const char * error_msg = NULL; -+ const int rlen = le16_to_cpu(de->rec_len); -+ -+ if (rlen < EXT3_DIR_REC_LEN(1)) -+ error_msg = "rec_len is smaller than minimal"; -+ else if (rlen % 4 != 0) -+ error_msg = "rec_len % 4 != 0"; -+ else if (rlen < EXT3_DIR_REC_LEN(de->name_len)) -+ error_msg = "rec_len is too small for name_len"; -+ else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) -+ error_msg = "directory entry across blocks"; -+ else if (le32_to_cpu(de->inode) > -+ le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) -+ error_msg = "inode out of bounds"; -+ -+ if (error_msg != NULL) -+ ext3_error (dir->i_sb, function, -+ "bad entry in directory #%lu: %s - " -+ "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", -+ dir->i_ino, error_msg, offset, -+ (unsigned long) le32_to_cpu(de->inode), -+ rlen, de->name_len); -+ return error_msg == NULL ? 
1 : 0; -+} -+ -+static int ext3_readdir(struct file * filp, -+ void * dirent, filldir_t filldir) -+{ -+ int error = 0; -+ unsigned long offset, blk; -+ int i, num, stored; -+ struct buffer_head * bh, * tmp, * bha[16]; -+ struct ext3_dir_entry_2 * de; -+ struct super_block * sb; -+ int err; -+ struct inode *inode = filp->f_dentry->d_inode; -+ -+ sb = inode->i_sb; -+ -+ stored = 0; -+ bh = NULL; -+ offset = filp->f_pos & (sb->s_blocksize - 1); -+ -+ while (!error && !stored && filp->f_pos < inode->i_size) { -+ blk = (filp->f_pos) >> EXT3_BLOCK_SIZE_BITS(sb); -+ bh = ext3_bread (0, inode, blk, 0, &err); -+ if (!bh) { -+ ext3_error (sb, "ext3_readdir", -+ "directory #%lu contains a hole at offset %lu", -+ inode->i_ino, (unsigned long)filp->f_pos); -+ filp->f_pos += sb->s_blocksize - offset; -+ continue; -+ } -+ -+ /* -+ * Do the readahead -+ */ -+ if (!offset) { -+ for (i = 16 >> (EXT3_BLOCK_SIZE_BITS(sb) - 9), num = 0; -+ i > 0; i--) { -+ tmp = ext3_getblk (NULL, inode, ++blk, 0, &err); -+ if (tmp && !buffer_uptodate(tmp) && -+ !buffer_locked(tmp)) -+ bha[num++] = tmp; -+ else -+ brelse (tmp); -+ } -+ if (num) { -+ ll_rw_block (READA, num, bha); -+ for (i = 0; i < num; i++) -+ brelse (bha[i]); -+ } -+ } -+ -+revalidate: -+ /* If the dir block has changed since the last call to -+ * readdir(2), then we might be pointing to an invalid -+ * dirent right now. Scan from the start of the block -+ * to make sure. */ -+ if (filp->f_version != inode->i_version) { -+ for (i = 0; i < sb->s_blocksize && i < offset; ) { -+ de = (struct ext3_dir_entry_2 *) -+ (bh->b_data + i); -+ /* It's too expensive to do a full -+ * dirent test each time round this -+ * loop, but we do have to test at -+ * least that it is non-zero. A -+ * failure will be detected in the -+ * dirent test below. 
*/ -+ if (le16_to_cpu(de->rec_len) < -+ EXT3_DIR_REC_LEN(1)) -+ break; -+ i += le16_to_cpu(de->rec_len); -+ } -+ offset = i; -+ filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) -+ | offset; -+ filp->f_version = inode->i_version; -+ } -+ -+ while (!error && filp->f_pos < inode->i_size -+ && offset < sb->s_blocksize) { -+ de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); -+ if (!ext3_check_dir_entry ("ext3_readdir", inode, de, -+ bh, offset)) { -+ /* On error, skip the f_pos to the -+ next block. */ -+ filp->f_pos = (filp->f_pos | -+ (sb->s_blocksize - 1)) + 1; -+ brelse (bh); -+ return stored; -+ } -+ offset += le16_to_cpu(de->rec_len); -+ if (le32_to_cpu(de->inode)) { -+ /* We might block in the next section -+ * if the data destination is -+ * currently swapped out. So, use a -+ * version stamp to detect whether or -+ * not the directory has been modified -+ * during the copy operation. -+ */ -+ unsigned long version = filp->f_version; -+ unsigned char d_type = DT_UNKNOWN; -+ -+ if (EXT3_HAS_INCOMPAT_FEATURE(sb, -+ EXT3_FEATURE_INCOMPAT_FILETYPE) -+ && de->file_type < EXT3_FT_MAX) -+ d_type = -+ ext3_filetype_table[de->file_type]; -+ error = filldir(dirent, de->name, -+ de->name_len, -+ filp->f_pos, -+ le32_to_cpu(de->inode), -+ d_type); -+ if (error) -+ break; -+ if (version != filp->f_version) -+ goto revalidate; -+ stored ++; -+ } -+ filp->f_pos += le16_to_cpu(de->rec_len); -+ } -+ offset = 0; -+ brelse (bh); -+ } -+ UPDATE_ATIME(inode); -+ return 0; -+} -diff -rup --new-file linux.mcp2/fs/ext3/file.c linux_tmp/fs/ext3/file.c ---- linux.mcp2/fs/ext3/file.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/file.c 2001-11-15 13:37:55.000000000 -0800 -@@ -0,0 +1,94 @@ -+/* -+ * linux/fs/ext3/file.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/file.c -+ * -+ * Copyright (C) 
1991, 1992 Linus Torvalds -+ * -+ * ext3 fs regular file handling primitives -+ * -+ * 64-bit file support on 64-bit platforms by Jakub Jelinek -+ * (jj@sunsite.ms.mff.cuni.cz) -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * Called when an inode is released. Note that this is different -+ * from ext3_file_open: open gets called at every open, but release -+ * gets called only when /all/ the files are closed. -+ */ -+static int ext3_release_file (struct inode * inode, struct file * filp) -+{ -+ if (filp->f_mode & FMODE_WRITE) -+ ext3_discard_prealloc (inode); -+ return 0; -+} -+ -+/* -+ * Called when an inode is about to be opened. -+ * We use this to disallow opening RW large files on 32bit systems if -+ * the caller didn't specify O_LARGEFILE. On 64bit systems we force -+ * on this flag in sys_open. -+ */ -+static int ext3_open_file (struct inode * inode, struct file * filp) -+{ -+ if (!(filp->f_flags & O_LARGEFILE) && -+ inode->i_size > 0x7FFFFFFFLL) -+ return -EFBIG; -+ return 0; -+} -+ -+/* -+ * ext3_file_write(). -+ * -+ * Most things are done in ext3_prepare_write() and ext3_commit_write(). -+ */ -+ -+static ssize_t -+ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) -+{ -+ struct inode *inode = file->f_dentry->d_inode; -+ -+ /* -+ * Nasty: if the file is subject to synchronous writes then we need -+ * to force generic_osync_inode() to call ext3_write_inode(). -+ * We do that by marking the inode dirty. This adds much more -+ * computational expense than we need, but we're going to sync -+ * anyway. -+ */ -+ if (IS_SYNC(inode) || (file->f_flags & O_SYNC)) -+ mark_inode_dirty(inode); -+ -+ return generic_file_write(file, buf, count, ppos); -+} -+ -+struct file_operations ext3_file_operations = { -+ llseek: generic_file_llseek, /* BKL held */ -+ read: generic_file_read, /* BKL not held. Don't need */ -+ write: ext3_file_write, /* BKL not held. 
Don't need */ -+ ioctl: ext3_ioctl, /* BKL held */ -+ mmap: generic_file_mmap, -+ open: ext3_open_file, /* BKL not held. Don't need */ -+ release: ext3_release_file, /* BKL not held. Don't need */ -+ fsync: ext3_sync_file, /* BKL held */ -+}; -+ -+struct inode_operations ext3_file_inode_operations = { -+ truncate: ext3_truncate, /* BKL held */ -+ setattr: ext3_setattr, /* BKL held */ -+}; -+ -diff -rup --new-file linux.mcp2/fs/ext3/fsync.c linux_tmp/fs/ext3/fsync.c ---- linux.mcp2/fs/ext3/fsync.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/fsync.c 2001-11-20 21:34:13.000000000 -0800 -@@ -0,0 +1,70 @@ -+/* -+ * linux/fs/ext3/fsync.c -+ * -+ * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) -+ * from -+ * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * from -+ * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * ext3fs fsync primitive -+ * -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ * -+ * Removed unnecessary code duplication for little endian machines -+ * and excessive __inline__s. -+ * Andi Kleen, 1997 -+ * -+ * Major simplications and cleanup - we only need to do the metadata, because -+ * we can depend on generic_block_fdatasync() to sync the data blocks. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * akpm: A new design for ext3_sync_file(). -+ * -+ * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). -+ * There cannot be a transaction open by this task. (AKPM: quotas?) -+ * Another task could have dirtied this inode. Its data can be in any -+ * state in the journalling system. -+ * -+ * What we do is just kick off a commit and wait on it. This will snapshot the -+ * inode to disk. 
-+ * -+ * Note that there is a serious optimisation we can make here: if the current -+ * inode is not part of j_running_transaction or j_committing_transaction -+ * then we have nothing to do. That would require implementation of t_ilist, -+ * which isn't too hard. -+ */ -+ -+int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) -+{ -+ struct inode *inode = dentry->d_inode; -+ int ret; -+ -+ J_ASSERT(ext3_journal_current_handle() == 0); -+ -+ /* -+ * fsync_inode_buffers() just walks i_dirty_buffers and waits -+ * on them. It's a no-op for full data journalling because -+ * i_dirty_buffers will be ampty. -+ * Really, we only need to start I/O on the dirty buffers - -+ * we'll end up waiting on them in commit. -+ */ -+ ret = fsync_inode_buffers(inode); -+ ret |= fsync_inode_data_buffers(inode); -+ -+ ext3_force_commit(inode->i_sb); -+ -+ return ret; -+} -diff -rup --new-file linux.mcp2/fs/ext3/ialloc.c linux_tmp/fs/ext3/ialloc.c ---- linux.mcp2/fs/ext3/ialloc.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/ialloc.c 2002-02-25 11:38:08.000000000 -0800 -@@ -0,0 +1,663 @@ -+/* -+ * linux/fs/ext3/ialloc.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * BSD ufs-inspired inode and directory allocation by -+ * Stephen Tweedie (sct@redhat.com), 1993 -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+/* -+ * ialloc.c contains the inodes allocation and deallocation routines -+ */ -+ -+/* -+ * The free inodes are managed by bitmaps. A file system contains several -+ * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap -+ * block for inodes, N blocks for the inode table and data blocks. 
-+ * -+ * The file system contains group descriptors which are located after the -+ * super block. Each descriptor contains the number of the bitmap block and -+ * the free blocks count in the block. The descriptors are loaded in memory -+ * when a file system is mounted (see ext3_read_super). -+ */ -+ -+ -+/* -+ * Read the inode allocation bitmap for a given block_group, reading -+ * into the specified slot in the superblock's bitmap cache. -+ * -+ * Return >=0 on success or a -ve error code. -+ */ -+static int read_inode_bitmap (struct super_block * sb, -+ unsigned long block_group, -+ unsigned int bitmap_nr) -+{ -+ struct ext3_group_desc * gdp; -+ struct buffer_head * bh = NULL; -+ int retval = 0; -+ -+ gdp = ext3_get_group_desc (sb, block_group, NULL); -+ if (!gdp) { -+ retval = -EIO; -+ goto error_out; -+ } -+ bh = sb_bread(sb, le32_to_cpu(gdp->bg_inode_bitmap)); -+ if (!bh) { -+ ext3_error (sb, "read_inode_bitmap", -+ "Cannot read inode bitmap - " -+ "block_group = %lu, inode_bitmap = %lu", -+ block_group, (unsigned long) gdp->bg_inode_bitmap); -+ retval = -EIO; -+ } -+ /* -+ * On IO error, just leave a zero in the superblock's block pointer for -+ * this group. The IO will be retried next time. -+ */ -+error_out: -+ sb->u.ext3_sb.s_inode_bitmap_number[bitmap_nr] = block_group; -+ sb->u.ext3_sb.s_inode_bitmap[bitmap_nr] = bh; -+ return retval; -+} -+ -+/* -+ * load_inode_bitmap loads the inode bitmap for a blocks group -+ * -+ * It maintains a cache for the last bitmaps loaded. This cache is managed -+ * with a LRU algorithm. -+ * -+ * Notes: -+ * 1/ There is one cache per mounted file system. -+ * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups, -+ * this function reads the bitmap without maintaining a LRU cache. -+ * -+ * Return the slot used to store the bitmap, or a -ve error code. 
-+ */ -+static int load_inode_bitmap (struct super_block * sb, -+ unsigned int block_group) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ unsigned long inode_bitmap_number; -+ struct buffer_head * inode_bitmap; -+ int i, j, retval = 0; -+ -+ if (block_group >= sbi->s_groups_count) -+ ext3_panic (sb, "load_inode_bitmap", -+ "block_group >= groups_count - " -+ "block_group = %d, groups_count = %lu", -+ block_group, sbi->s_groups_count); -+ if (sbi->s_loaded_inode_bitmaps > 0 && -+ sbi->s_inode_bitmap_number[0] == block_group && -+ sbi->s_inode_bitmap[0] != NULL) -+ return 0; -+ if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED) { -+ if (sbi->s_inode_bitmap[block_group]) { -+ if (sbi->s_inode_bitmap_number[block_group] != -+ block_group) -+ ext3_panic(sb, "load_inode_bitmap", -+ "block_group != inode_bitmap_number"); -+ return block_group; -+ } -+ retval = read_inode_bitmap(sb, block_group, block_group); -+ if (retval < 0) -+ return retval; -+ return block_group; -+ } -+ -+ for (i = 0; i < sbi->s_loaded_inode_bitmaps && -+ sbi->s_inode_bitmap_number[i] != block_group; i++) -+ /* do nothing */; -+ if (i < sbi->s_loaded_inode_bitmaps && -+ sbi->s_inode_bitmap_number[i] == block_group) { -+ inode_bitmap_number = sbi->s_inode_bitmap_number[i]; -+ inode_bitmap = sbi->s_inode_bitmap[i]; -+ for (j = i; j > 0; j--) { -+ sbi->s_inode_bitmap_number[j] = -+ sbi->s_inode_bitmap_number[j - 1]; -+ sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1]; -+ } -+ sbi->s_inode_bitmap_number[0] = inode_bitmap_number; -+ sbi->s_inode_bitmap[0] = inode_bitmap; -+ -+ /* -+ * There's still one special case here --- if inode_bitmap == 0 -+ * then our last attempt to read the bitmap failed and we have -+ * just ended up caching that failure. Try again to read it. 
-+ */ -+ if (!inode_bitmap) -+ retval = read_inode_bitmap (sb, block_group, 0); -+ } else { -+ if (sbi->s_loaded_inode_bitmaps < EXT3_MAX_GROUP_LOADED) -+ sbi->s_loaded_inode_bitmaps++; -+ else -+ brelse(sbi->s_inode_bitmap[EXT3_MAX_GROUP_LOADED - 1]); -+ for (j = sbi->s_loaded_inode_bitmaps - 1; j > 0; j--) { -+ sbi->s_inode_bitmap_number[j] = -+ sbi->s_inode_bitmap_number[j - 1]; -+ sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1]; -+ } -+ retval = read_inode_bitmap (sb, block_group, 0); -+ } -+ return retval; -+} -+ -+/* -+ * NOTE! When we get the inode, we're the only people -+ * that have access to it, and as such there are no -+ * race conditions we have to worry about. The inode -+ * is not on the hash-lists, and it cannot be reached -+ * through the filesystem because the directory entry -+ * has been deleted earlier. -+ * -+ * HOWEVER: we must make sure that we get no aliases, -+ * which means that we have to call "clear_inode()" -+ * _before_ we mark the inode not in use in the inode -+ * bitmaps. Otherwise a newly created file might use -+ * the same inode number (not actually the same pointer -+ * though), and then we'd have two inodes sharing the -+ * same inode number and space on the harddisk. 
-+ */ -+void ext3_free_inode (handle_t *handle, struct inode * inode) -+{ -+ struct super_block * sb = inode->i_sb; -+ int is_directory; -+ unsigned long ino; -+ struct buffer_head * bh; -+ struct buffer_head * bh2; -+ unsigned long block_group; -+ unsigned long bit; -+ int bitmap_nr; -+ struct ext3_group_desc * gdp; -+ struct ext3_super_block * es; -+ int fatal = 0, err; -+ -+ if (!inode->i_dev) { -+ printk ("ext3_free_inode: inode has no device\n"); -+ return; -+ } -+ if (atomic_read(&inode->i_count) > 1) { -+ printk ("ext3_free_inode: inode has count=%d\n", -+ atomic_read(&inode->i_count)); -+ return; -+ } -+ if (inode->i_nlink) { -+ printk ("ext3_free_inode: inode has nlink=%d\n", -+ inode->i_nlink); -+ return; -+ } -+ if (!sb) { -+ printk("ext3_free_inode: inode on nonexistent device\n"); -+ return; -+ } -+ -+ ino = inode->i_ino; -+ ext3_debug ("freeing inode %lu\n", ino); -+ -+ /* -+ * Note: we must free any quota before locking the superblock, -+ * as writing the quota to disk may need the lock as well. -+ */ -+ DQUOT_INIT(inode); -+ DQUOT_FREE_INODE(inode); -+ DQUOT_DROP(inode); -+ -+ is_directory = S_ISDIR(inode->i_mode); -+ -+ /* Do this BEFORE marking the inode not in use or returning an error */ -+ clear_inode (inode); -+ -+ lock_super (sb); -+ es = sb->u.ext3_sb.s_es; -+ if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { -+ ext3_error (sb, "ext3_free_inode", -+ "reserved or nonexistent inode %lu", ino); -+ goto error_return; -+ } -+ block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); -+ bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); -+ bitmap_nr = load_inode_bitmap (sb, block_group); -+ if (bitmap_nr < 0) -+ goto error_return; -+ -+ bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr]; -+ -+ BUFFER_TRACE(bh, "get_write_access"); -+ fatal = ext3_journal_get_write_access(handle, bh); -+ if (fatal) -+ goto error_return; -+ -+ /* Ok, now we can actually update the inode bitmaps.. 
*/ -+ if (!ext3_clear_bit (bit, bh->b_data)) -+ ext3_error (sb, "ext3_free_inode", -+ "bit already cleared for inode %lu", ino); -+ else { -+ gdp = ext3_get_group_desc (sb, block_group, &bh2); -+ -+ BUFFER_TRACE(bh2, "get_write_access"); -+ fatal = ext3_journal_get_write_access(handle, bh2); -+ if (fatal) goto error_return; -+ -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get write access"); -+ fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); -+ if (fatal) goto error_return; -+ -+ if (gdp) { -+ gdp->bg_free_inodes_count = cpu_to_le16( -+ le16_to_cpu(gdp->bg_free_inodes_count) + 1); -+ if (is_directory) -+ gdp->bg_used_dirs_count = cpu_to_le16( -+ le16_to_cpu(gdp->bg_used_dirs_count) - 1); -+ } -+ BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh2); -+ if (!fatal) fatal = err; -+ es->s_free_inodes_count = -+ cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1); -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, -+ "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); -+ if (!fatal) fatal = err; -+ } -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (!fatal) -+ fatal = err; -+ sb->s_dirt = 1; -+error_return: -+ ext3_std_error(sb, fatal); -+ unlock_super(sb); -+} -+ -+/* -+ * There are two policies for allocating an inode. If the new inode is -+ * a directory, then a forward search is made for a block group with both -+ * free space and a low directory-to-inode ratio; if that fails, then of -+ * the groups with above-average free space, that group with the fewest -+ * directories already is chosen. -+ * -+ * For other inodes, search forward from the parent directory's block -+ * group to find a free inode. 
-+ */ -+struct inode * ext3_new_inode (handle_t *handle, -+ const struct inode * dir, int mode) -+{ -+ struct super_block * sb; -+ struct buffer_head * bh; -+ struct buffer_head * bh2; -+ int i, j, avefreei; -+ struct inode * inode; -+ int bitmap_nr; -+ struct ext3_group_desc * gdp; -+ struct ext3_group_desc * tmp; -+ struct ext3_super_block * es; -+ int err = 0; -+ -+ /* Cannot create files in a deleted directory */ -+ if (!dir || !dir->i_nlink) -+ return ERR_PTR(-EPERM); -+ -+ sb = dir->i_sb; -+ inode = new_inode(sb); -+ if (!inode) -+ return ERR_PTR(-ENOMEM); -+ init_rwsem(&inode->u.ext3_i.truncate_sem); -+ -+ lock_super (sb); -+ es = sb->u.ext3_sb.s_es; -+repeat: -+ gdp = NULL; -+ i = 0; -+ -+ if (S_ISDIR(mode)) { -+ avefreei = le32_to_cpu(es->s_free_inodes_count) / -+ sb->u.ext3_sb.s_groups_count; -+ if (!gdp) { -+ for (j = 0; j < sb->u.ext3_sb.s_groups_count; j++) { -+ struct buffer_head *temp_buffer; -+ tmp = ext3_get_group_desc (sb, j, &temp_buffer); -+ if (tmp && -+ le16_to_cpu(tmp->bg_free_inodes_count) && -+ le16_to_cpu(tmp->bg_free_inodes_count) >= -+ avefreei) { -+ if (!gdp || (le16_to_cpu(tmp->bg_free_blocks_count) > -+ le16_to_cpu(gdp->bg_free_blocks_count))) { -+ i = j; -+ gdp = tmp; -+ bh2 = temp_buffer; -+ } -+ } -+ } -+ } -+ } else { -+ /* -+ * Try to place the inode in its parent directory -+ */ -+ i = dir->u.ext3_i.i_block_group; -+ tmp = ext3_get_group_desc (sb, i, &bh2); -+ if (tmp && le16_to_cpu(tmp->bg_free_inodes_count)) -+ gdp = tmp; -+ else -+ { -+ /* -+ * Use a quadratic hash to find a group with a -+ * free inode -+ */ -+ for (j = 1; j < sb->u.ext3_sb.s_groups_count; j <<= 1) { -+ i += j; -+ if (i >= sb->u.ext3_sb.s_groups_count) -+ i -= sb->u.ext3_sb.s_groups_count; -+ tmp = ext3_get_group_desc (sb, i, &bh2); -+ if (tmp && -+ le16_to_cpu(tmp->bg_free_inodes_count)) { -+ gdp = tmp; -+ break; -+ } -+ } -+ } -+ if (!gdp) { -+ /* -+ * That failed: try linear search for a free inode -+ */ -+ i = dir->u.ext3_i.i_block_group + 1; -+ for (j = 
2; j < sb->u.ext3_sb.s_groups_count; j++) { -+ if (++i >= sb->u.ext3_sb.s_groups_count) -+ i = 0; -+ tmp = ext3_get_group_desc (sb, i, &bh2); -+ if (tmp && -+ le16_to_cpu(tmp->bg_free_inodes_count)) { -+ gdp = tmp; -+ break; -+ } -+ } -+ } -+ } -+ -+ err = -ENOSPC; -+ if (!gdp) -+ goto fail; -+ -+ err = -EIO; -+ bitmap_nr = load_inode_bitmap (sb, i); -+ if (bitmap_nr < 0) -+ goto fail; -+ -+ bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr]; -+ -+ if ((j = ext3_find_first_zero_bit ((unsigned long *) bh->b_data, -+ EXT3_INODES_PER_GROUP(sb))) < -+ EXT3_INODES_PER_GROUP(sb)) { -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) goto fail; -+ -+ if (ext3_set_bit (j, bh->b_data)) { -+ ext3_error (sb, "ext3_new_inode", -+ "bit already set for inode %d", j); -+ goto repeat; -+ } -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) goto fail; -+ } else { -+ if (le16_to_cpu(gdp->bg_free_inodes_count) != 0) { -+ ext3_error (sb, "ext3_new_inode", -+ "Free inodes count corrupted in group %d", -+ i); -+ /* Is it really ENOSPC? 
*/ -+ err = -ENOSPC; -+ if (sb->s_flags & MS_RDONLY) -+ goto fail; -+ -+ BUFFER_TRACE(bh2, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh2); -+ if (err) goto fail; -+ gdp->bg_free_inodes_count = 0; -+ BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh2); -+ if (err) goto fail; -+ } -+ goto repeat; -+ } -+ j += i * EXT3_INODES_PER_GROUP(sb) + 1; -+ if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) { -+ ext3_error (sb, "ext3_new_inode", -+ "reserved inode or inode > inodes count - " -+ "block_group = %d,inode=%d", i, j); -+ err = -EIO; -+ goto fail; -+ } -+ -+ BUFFER_TRACE(bh2, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh2); -+ if (err) goto fail; -+ gdp->bg_free_inodes_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); -+ if (S_ISDIR(mode)) -+ gdp->bg_used_dirs_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); -+ BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh2); -+ if (err) goto fail; -+ -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); -+ if (err) goto fail; -+ es->s_free_inodes_count = -+ cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1); -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); -+ sb->s_dirt = 1; -+ if (err) goto fail; -+ -+ inode->i_uid = current->fsuid; -+ if (test_opt (sb, GRPID)) -+ inode->i_gid = dir->i_gid; -+ else if (dir->i_mode & S_ISGID) { -+ inode->i_gid = dir->i_gid; -+ if (S_ISDIR(mode)) -+ mode |= S_ISGID; -+ } else -+ inode->i_gid = current->fsgid; -+ inode->i_mode = mode; -+ -+ inode->i_ino = j; -+ /* This is the optimal IO size (for stat), not the fs block size */ -+ inode->i_blksize = PAGE_SIZE; -+ inode->i_blocks = 0; -+ inode->i_mtime = inode->i_atime = 
inode->i_ctime = CURRENT_TIME; -+ inode->u.ext3_i.i_flags = dir->u.ext3_i.i_flags & ~EXT3_INDEX_FL; -+ if (S_ISLNK(mode)) -+ inode->u.ext3_i.i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); -+#ifdef EXT3_FRAGMENTS -+ inode->u.ext3_i.i_faddr = 0; -+ inode->u.ext3_i.i_frag_no = 0; -+ inode->u.ext3_i.i_frag_size = 0; -+#endif -+ inode->u.ext3_i.i_file_acl = 0; -+ inode->u.ext3_i.i_dir_acl = 0; -+ inode->u.ext3_i.i_dtime = 0; -+ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); -+#ifdef EXT3_PREALLOCATE -+ inode->u.ext3_i.i_prealloc_count = 0; -+#endif -+ inode->u.ext3_i.i_block_group = i; -+ -+ if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) -+ inode->i_flags |= S_SYNC; -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ insert_inode_hash(inode); -+ inode->i_generation = sb->u.ext3_sb.s_next_generation++; -+ -+ inode->u.ext3_i.i_state = EXT3_STATE_NEW; -+ err = ext3_mark_inode_dirty(handle, inode); -+ if (err) goto fail; -+ -+ unlock_super (sb); -+ if(DQUOT_ALLOC_INODE(inode)) { -+ DQUOT_DROP(inode); -+ inode->i_flags |= S_NOQUOTA; -+ inode->i_nlink = 0; -+ iput(inode); -+ return ERR_PTR(-EDQUOT); -+ } -+ ext3_debug ("allocating inode %lu\n", inode->i_ino); -+ return inode; -+ -+fail: -+ unlock_super(sb); -+ iput(inode); -+ ext3_std_error(sb, err); -+ return ERR_PTR(err); -+} -+ -+/* Verify that we are loading a valid orphan from disk */ -+struct inode *ext3_orphan_get (struct super_block * sb, ino_t ino) -+{ -+ ino_t max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count); -+ unsigned long block_group; -+ int bit; -+ int bitmap_nr; -+ struct buffer_head *bh; -+ struct inode *inode = NULL; -+ -+ /* Error cases - e2fsck has already cleaned up for us */ -+ if (ino > max_ino) { -+ ext3_warning(sb, __FUNCTION__, -+ "bad orphan ino %ld! 
e2fsck was run?\n", ino); -+ return NULL; -+ } -+ -+ block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); -+ bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); -+ if ((bitmap_nr = load_inode_bitmap(sb, block_group)) < 0 || -+ !(bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr])) { -+ ext3_warning(sb, __FUNCTION__, -+ "inode bitmap error for orphan %ld\n", ino); -+ return NULL; -+ } -+ -+ /* Having the inode bit set should be a 100% indicator that this -+ * is a valid orphan (no e2fsck run on fs). Orphans also include -+ * inodes that were being truncated, so we can't check i_nlink==0. -+ */ -+ if (!ext3_test_bit(bit, bh->b_data) || !(inode = iget(sb, ino)) || -+ is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) { -+ ext3_warning(sb, __FUNCTION__, -+ "bad orphan inode %ld! e2fsck was run?\n", ino); -+ printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%ld) = %d\n", -+ bit, bh->b_blocknr, ext3_test_bit(bit, bh->b_data)); -+ printk(KERN_NOTICE "inode=%p\n", inode); -+ if (inode) { -+ printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", -+ is_bad_inode(inode)); -+ printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%d\n", -+ NEXT_ORPHAN(inode)); -+ printk(KERN_NOTICE "max_ino=%ld\n", max_ino); -+ } -+ /* Avoid freeing blocks if we got a bad deleted inode */ -+ if (inode && inode->i_nlink == 0) -+ inode->i_blocks = 0; -+ iput(inode); -+ return NULL; -+ } -+ -+ return inode; -+} -+ -+unsigned long ext3_count_free_inodes (struct super_block * sb) -+{ -+#ifdef EXT3FS_DEBUG -+ struct ext3_super_block * es; -+ unsigned long desc_count, bitmap_count, x; -+ int bitmap_nr; -+ struct ext3_group_desc * gdp; -+ int i; -+ -+ lock_super (sb); -+ es = sb->u.ext3_sb.s_es; -+ desc_count = 0; -+ bitmap_count = 0; -+ gdp = NULL; -+ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { -+ gdp = ext3_get_group_desc (sb, i, NULL); -+ if (!gdp) -+ continue; -+ desc_count += le16_to_cpu(gdp->bg_free_inodes_count); -+ bitmap_nr = load_inode_bitmap (sb, i); -+ if (bitmap_nr < 0) -+ continue; -+ -+ x = ext3_count_free 
(sb->u.ext3_sb.s_inode_bitmap[bitmap_nr], -+ EXT3_INODES_PER_GROUP(sb) / 8); -+ printk ("group %d: stored = %d, counted = %lu\n", -+ i, le16_to_cpu(gdp->bg_free_inodes_count), x); -+ bitmap_count += x; -+ } -+ printk("ext3_count_free_inodes: stored = %lu, computed = %lu, %lu\n", -+ le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); -+ unlock_super (sb); -+ return desc_count; -+#else -+ return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_inodes_count); -+#endif -+} -+ -+#ifdef CONFIG_EXT3_CHECK -+/* Called at mount-time, super-block is locked */ -+void ext3_check_inodes_bitmap (struct super_block * sb) -+{ -+ struct ext3_super_block * es; -+ unsigned long desc_count, bitmap_count, x; -+ int bitmap_nr; -+ struct ext3_group_desc * gdp; -+ int i; -+ -+ es = sb->u.ext3_sb.s_es; -+ desc_count = 0; -+ bitmap_count = 0; -+ gdp = NULL; -+ for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { -+ gdp = ext3_get_group_desc (sb, i, NULL); -+ if (!gdp) -+ continue; -+ desc_count += le16_to_cpu(gdp->bg_free_inodes_count); -+ bitmap_nr = load_inode_bitmap (sb, i); -+ if (bitmap_nr < 0) -+ continue; -+ -+ x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr], -+ EXT3_INODES_PER_GROUP(sb) / 8); -+ if (le16_to_cpu(gdp->bg_free_inodes_count) != x) -+ ext3_error (sb, "ext3_check_inodes_bitmap", -+ "Wrong free inodes count in group %d, " -+ "stored = %d, counted = %lu", i, -+ le16_to_cpu(gdp->bg_free_inodes_count), x); -+ bitmap_count += x; -+ } -+ if (le32_to_cpu(es->s_free_inodes_count) != bitmap_count) -+ ext3_error (sb, "ext3_check_inodes_bitmap", -+ "Wrong free inodes count in super block, " -+ "stored = %lu, counted = %lu", -+ (unsigned long)le32_to_cpu(es->s_free_inodes_count), -+ bitmap_count); -+} -+#endif -diff -rup --new-file linux.mcp2/fs/ext3/inode.c linux_tmp/fs/ext3/inode.c ---- linux.mcp2/fs/ext3/inode.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/inode.c 2002-08-02 17:39:45.000000000 -0700 -@@ -0,0 +1,2699 @@ -+/* -+ * linux/fs/ext3/inode.c 
-+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/inode.c -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * Goal-directed block allocation by Stephen Tweedie -+ * (sct@redhat.com), 1993, 1998 -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ * 64-bit file support on 64-bit platforms by Jakub Jelinek -+ * (jj@sunsite.ms.mff.cuni.cz) -+ * -+ * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * SEARCH_FROM_ZERO forces each block allocation to search from the start -+ * of the filesystem. This is to force rapid reallocation of recently-freed -+ * blocks. The file fragmentation is horrendous. -+ */ -+#undef SEARCH_FROM_ZERO -+ -+/* The ext3 forget function must perform a revoke if we are freeing data -+ * which has been journaled. Metadata (eg. indirect blocks) must be -+ * revoked in all cases. -+ * -+ * "bh" may be NULL: a metadata block may have been freed from memory -+ * but there may still be a record of it in the journal, and that record -+ * still needs to be revoked. -+ */ -+ -+static int ext3_forget(handle_t *handle, int is_metadata, -+ struct inode *inode, struct buffer_head *bh, -+ int blocknr) -+{ -+ int err; -+ -+ BUFFER_TRACE(bh, "enter"); -+ -+ jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " -+ "data mode %lx\n", -+ bh, is_metadata, inode->i_mode, -+ test_opt(inode->i_sb, DATA_FLAGS)); -+ -+ /* Never use the revoke function if we are doing full data -+ * journaling: there is no need to, and a V1 superblock won't -+ * support it. Otherwise, only skip the revoke on un-journaled -+ * data blocks. 
*/ -+ -+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA || -+ (!is_metadata && !ext3_should_journal_data(inode))) { -+ if (bh) { -+ BUFFER_TRACE(bh, "call journal_forget"); -+ ext3_journal_forget(handle, bh); -+ } -+ return 0; -+ } -+ -+ /* -+ * data!=journal && (is_metadata || should_journal_data(inode)) -+ */ -+ BUFFER_TRACE(bh, "call ext3_journal_revoke"); -+ err = ext3_journal_revoke(handle, blocknr, bh); -+ if (err) -+ ext3_abort(inode->i_sb, __FUNCTION__, -+ "error %d when attempting revoke", err); -+ BUFFER_TRACE(bh, "exit"); -+ return err; -+} -+ -+/* -+ * Truncate transactions can be complex and absolutely huge. So we need to -+ * be able to restart the transaction at a conventient checkpoint to make -+ * sure we don't overflow the journal. -+ * -+ * start_transaction gets us a new handle for a truncate transaction, -+ * and extend_transaction tries to extend the existing one a bit. If -+ * extend fails, we need to propagate the failure up and restart the -+ * transaction in the top-level truncate loop. --sct -+ */ -+ -+static handle_t *start_transaction(struct inode *inode) -+{ -+ long needed; -+ handle_t *result; -+ -+ needed = inode->i_blocks; -+ if (needed > EXT3_MAX_TRANS_DATA) -+ needed = EXT3_MAX_TRANS_DATA; -+ -+ result = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed); -+ if (!IS_ERR(result)) -+ return result; -+ -+ ext3_std_error(inode->i_sb, PTR_ERR(result)); -+ return result; -+} -+ -+/* -+ * Try to extend this transaction for the purposes of truncation. -+ * -+ * Returns 0 if we managed to create more room. If we can't create more -+ * room, and the transaction must be restarted we return 1. 
-+ */ -+static int try_to_extend_transaction(handle_t *handle, struct inode *inode) -+{ -+ long needed; -+ -+ if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS) -+ return 0; -+ needed = inode->i_blocks; -+ if (needed > EXT3_MAX_TRANS_DATA) -+ needed = EXT3_MAX_TRANS_DATA; -+ if (!ext3_journal_extend(handle, EXT3_RESERVE_TRANS_BLOCKS + needed)) -+ return 0; -+ return 1; -+} -+ -+/* -+ * Restart the transaction associated with *handle. This does a commit, -+ * so before we call here everything must be consistently dirtied against -+ * this transaction. -+ */ -+static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) -+{ -+ long needed = inode->i_blocks; -+ if (needed > EXT3_MAX_TRANS_DATA) -+ needed = EXT3_MAX_TRANS_DATA; -+ jbd_debug(2, "restarting handle %p\n", handle); -+ return ext3_journal_restart(handle, EXT3_DATA_TRANS_BLOCKS + needed); -+} -+ -+/* -+ * Called at each iput() -+ */ -+void ext3_put_inode (struct inode * inode) -+{ -+ ext3_discard_prealloc (inode); -+} -+ -+/* -+ * Called at the last iput() if i_nlink is zero. -+ */ -+void ext3_delete_inode (struct inode * inode) -+{ -+ handle_t *handle; -+ -+ if (is_bad_inode(inode) || -+ inode->i_ino == EXT3_ACL_IDX_INO || -+ inode->i_ino == EXT3_ACL_DATA_INO) -+ goto no_delete; -+ -+ lock_kernel(); -+ handle = start_transaction(inode); -+ if (IS_ERR(handle)) { -+ /* If we're going to skip the normal cleanup, we still -+ * need to make sure that the in-core orphan linked list -+ * is properly cleaned up. */ -+ ext3_orphan_del(NULL, inode); -+ -+ ext3_std_error(inode->i_sb, PTR_ERR(handle)); -+ unlock_kernel(); -+ goto no_delete; -+ } -+ -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ inode->i_size = 0; -+ if (inode->i_blocks) -+ ext3_truncate(inode); -+ /* -+ * Kill off the orphan record which ext3_truncate created. -+ * AKPM: I think this can be inside the above `if'. 
-+ * Note that ext3_orphan_del() has to be able to cope with the -+ * deletion of a non-existent orphan - this is because we don't -+ * know if ext3_truncate() actually created an orphan record. -+ * (Well, we could do this if we need to, but heck - it works) -+ */ -+ ext3_orphan_del(handle, inode); -+ inode->u.ext3_i.i_dtime = CURRENT_TIME; -+ -+ /* -+ * One subtle ordering requirement: if anything has gone wrong -+ * (transaction abort, IO errors, whatever), then we can still -+ * do these next steps (the fs will already have been marked as -+ * having errors), but we can't free the inode if the mark_dirty -+ * fails. -+ */ -+ if (ext3_mark_inode_dirty(handle, inode)) -+ /* If that failed, just do the required in-core inode clear. */ -+ clear_inode(inode); -+ else -+ ext3_free_inode(handle, inode); -+ ext3_journal_stop(handle, inode); -+ unlock_kernel(); -+ return; -+no_delete: -+ clear_inode(inode); /* We must guarantee clearing of inode... */ -+} -+ -+void ext3_discard_prealloc (struct inode * inode) -+{ -+#ifdef EXT3_PREALLOCATE -+ lock_kernel(); -+ /* Writer: ->i_prealloc* */ -+ if (inode->u.ext3_i.i_prealloc_count) { -+ unsigned short total = inode->u.ext3_i.i_prealloc_count; -+ unsigned long block = inode->u.ext3_i.i_prealloc_block; -+ inode->u.ext3_i.i_prealloc_count = 0; -+ inode->u.ext3_i.i_prealloc_block = 0; -+ /* Writer: end */ -+ ext3_free_blocks (inode, block, total); -+ } -+ unlock_kernel(); -+#endif -+} -+ -+static int ext3_alloc_block (handle_t *handle, -+ struct inode * inode, unsigned long goal, int *err) -+{ -+#ifdef EXT3FS_DEBUG -+ static unsigned long alloc_hits = 0, alloc_attempts = 0; -+#endif -+ unsigned long result; -+ -+#ifdef EXT3_PREALLOCATE -+ /* Writer: ->i_prealloc* */ -+ if (inode->u.ext3_i.i_prealloc_count && -+ (goal == inode->u.ext3_i.i_prealloc_block || -+ goal + 1 == inode->u.ext3_i.i_prealloc_block)) -+ { -+ result = inode->u.ext3_i.i_prealloc_block++; -+ inode->u.ext3_i.i_prealloc_count--; -+ /* Writer: end */ -+ ext3_debug 
("preallocation hit (%lu/%lu).\n", -+ ++alloc_hits, ++alloc_attempts); -+ } else { -+ ext3_discard_prealloc (inode); -+ ext3_debug ("preallocation miss (%lu/%lu).\n", -+ alloc_hits, ++alloc_attempts); -+ if (S_ISREG(inode->i_mode)) -+ result = ext3_new_block (inode, goal, -+ &inode->u.ext3_i.i_prealloc_count, -+ &inode->u.ext3_i.i_prealloc_block, err); -+ else -+ result = ext3_new_block (inode, goal, 0, 0, err); -+ /* -+ * AKPM: this is somewhat sticky. I'm not surprised it was -+ * disabled in 2.2's ext3. Need to integrate b_committed_data -+ * guarding with preallocation, if indeed preallocation is -+ * effective. -+ */ -+ } -+#else -+ result = ext3_new_block (handle, inode, goal, 0, 0, err); -+#endif -+ return result; -+} -+ -+ -+typedef struct { -+ u32 *p; -+ u32 key; -+ struct buffer_head *bh; -+} Indirect; -+ -+static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v) -+{ -+ p->key = *(p->p = v); -+ p->bh = bh; -+} -+ -+static inline int verify_chain(Indirect *from, Indirect *to) -+{ -+ while (from <= to && from->key == *from->p) -+ from++; -+ return (from > to); -+} -+ -+/** -+ * ext3_block_to_path - parse the block number into array of offsets -+ * @inode: inode in question (we are only interested in its superblock) -+ * @i_block: block number to be parsed -+ * @offsets: array to store the offsets in -+ * -+ * To store the locations of file's data ext3 uses a data structure common -+ * for UNIX filesystems - tree of pointers anchored in the inode, with -+ * data blocks at leaves and indirect blocks in intermediate nodes. -+ * This function translates the block number into path in that tree - -+ * return value is the path length and @offsets[n] is the offset of -+ * pointer to (n+1)th node in the nth one. If @block is out of range -+ * (negative or too large) warning is printed and zero returned. -+ * -+ * Note: function doesn't find node addresses, so no IO is needed. 
All -+ * we need to know is the capacity of indirect blocks (taken from the -+ * inode->i_sb). -+ */ -+ -+/* -+ * Portability note: the last comparison (check that we fit into triple -+ * indirect block) is spelled differently, because otherwise on an -+ * architecture with 32-bit longs and 8Kb pages we might get into trouble -+ * if our filesystem had 8Kb blocks. We might use long long, but that would -+ * kill us on x86. Oh, well, at least the sign propagation does not matter - -+ * i_block would have to be negative in the very beginning, so we would not -+ * get there at all. -+ */ -+ -+static int ext3_block_to_path(struct inode *inode, long i_block, int offsets[4]) -+{ -+ int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb); -+ int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb); -+ const long direct_blocks = EXT3_NDIR_BLOCKS, -+ indirect_blocks = ptrs, -+ double_blocks = (1 << (ptrs_bits * 2)); -+ int n = 0; -+ -+ if (i_block < 0) { -+ ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0"); -+ } else if (i_block < direct_blocks) { -+ offsets[n++] = i_block; -+ } else if ( (i_block -= direct_blocks) < indirect_blocks) { -+ offsets[n++] = EXT3_IND_BLOCK; -+ offsets[n++] = i_block; -+ } else if ((i_block -= indirect_blocks) < double_blocks) { -+ offsets[n++] = EXT3_DIND_BLOCK; -+ offsets[n++] = i_block >> ptrs_bits; -+ offsets[n++] = i_block & (ptrs - 1); -+ } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { -+ offsets[n++] = EXT3_TIND_BLOCK; -+ offsets[n++] = i_block >> (ptrs_bits * 2); -+ offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); -+ offsets[n++] = i_block & (ptrs - 1); -+ } else { -+ ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big"); -+ } -+ return n; -+} -+ -+/** -+ * ext3_get_branch - read the chain of indirect blocks leading to data -+ * @inode: inode in question -+ * @depth: depth of the chain (1 - direct pointer, etc.) 
-+ * @offsets: offsets of pointers in inode/indirect blocks -+ * @chain: place to store the result -+ * @err: here we store the error value -+ * -+ * Function fills the array of triples and returns %NULL -+ * if everything went OK or the pointer to the last filled triple -+ * (incomplete one) otherwise. Upon the return chain[i].key contains -+ * the number of (i+1)-th block in the chain (as it is stored in memory, -+ * i.e. little-endian 32-bit), chain[i].p contains the address of that -+ * number (it points into struct inode for i==0 and into the bh->b_data -+ * for i>0) and chain[i].bh points to the buffer_head of i-th indirect -+ * block for i>0 and NULL for i==0. In other words, it holds the block -+ * numbers of the chain, addresses they were taken from (and where we can -+ * verify that chain did not change) and buffer_heads hosting these -+ * numbers. -+ * -+ * Function stops when it stumbles upon zero pointer (absent block) -+ * (pointer to last triple returned, *@err == 0) -+ * or when it gets an IO error reading an indirect block -+ * (ditto, *@err == -EIO) -+ * or when it notices that chain had been changed while it was reading -+ * (ditto, *@err == -EAGAIN) -+ * or when it reads all @depth-1 indirect blocks successfully and finds -+ * the whole chain, all way to the data (returns %NULL, *err == 0). 
-+ */ -+static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets, -+ Indirect chain[4], int *err) -+{ -+ struct super_block *sb = inode->i_sb; -+ Indirect *p = chain; -+ struct buffer_head *bh; -+ -+ *err = 0; -+ /* i_data is not going away, no lock needed */ -+ add_chain (chain, NULL, inode->u.ext3_i.i_data + *offsets); -+ if (!p->key) -+ goto no_block; -+ while (--depth) { -+ bh = sb_bread(sb, le32_to_cpu(p->key)); -+ if (!bh) -+ goto failure; -+ /* Reader: pointers */ -+ if (!verify_chain(chain, p)) -+ goto changed; -+ add_chain(++p, bh, (u32*)bh->b_data + *++offsets); -+ /* Reader: end */ -+ if (!p->key) -+ goto no_block; -+ } -+ return NULL; -+ -+changed: -+ *err = -EAGAIN; -+ goto no_block; -+failure: -+ *err = -EIO; -+no_block: -+ return p; -+} -+ -+/** -+ * ext3_find_near - find a place for allocation with sufficient locality -+ * @inode: owner -+ * @ind: descriptor of indirect block. -+ * -+ * This function returns the prefered place for block allocation. -+ * It is used when heuristic for sequential allocation fails. -+ * Rules are: -+ * + if there is a block to the left of our position - allocate near it. -+ * + if pointer will live in indirect block - allocate near that block. -+ * + if pointer will live in inode - allocate in the same -+ * cylinder group. -+ * Caller must make sure that @ind is valid and will stay that way. -+ */ -+ -+static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind) -+{ -+ u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data; -+ u32 *p; -+ -+ /* Try to find previous block */ -+ for (p = ind->p - 1; p >= start; p--) -+ if (*p) -+ return le32_to_cpu(*p); -+ -+ /* No such thing, so let's try location of indirect block */ -+ if (ind->bh) -+ return ind->bh->b_blocknr; -+ -+ /* -+ * It is going to be refered from inode itself? OK, just put it into -+ * the same cylinder group then. 
-+ */ -+ return (inode->u.ext3_i.i_block_group * -+ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + -+ le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block); -+} -+ -+/** -+ * ext3_find_goal - find a prefered place for allocation. -+ * @inode: owner -+ * @block: block we want -+ * @chain: chain of indirect blocks -+ * @partial: pointer to the last triple within a chain -+ * @goal: place to store the result. -+ * -+ * Normally this function find the prefered place for block allocation, -+ * stores it in *@goal and returns zero. If the branch had been changed -+ * under us we return -EAGAIN. -+ */ -+ -+static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4], -+ Indirect *partial, unsigned long *goal) -+{ -+ /* Writer: ->i_next_alloc* */ -+ if (block == inode->u.ext3_i.i_next_alloc_block + 1) { -+ inode->u.ext3_i.i_next_alloc_block++; -+ inode->u.ext3_i.i_next_alloc_goal++; -+ } -+#ifdef SEARCH_FROM_ZERO -+ inode->u.ext3_i.i_next_alloc_block = 0; -+ inode->u.ext3_i.i_next_alloc_goal = 0; -+#endif -+ /* Writer: end */ -+ /* Reader: pointers, ->i_next_alloc* */ -+ if (verify_chain(chain, partial)) { -+ /* -+ * try the heuristic for sequential allocation, -+ * failing that at least try to get decent locality. -+ */ -+ if (block == inode->u.ext3_i.i_next_alloc_block) -+ *goal = inode->u.ext3_i.i_next_alloc_goal; -+ if (!*goal) -+ *goal = ext3_find_near(inode, partial); -+#ifdef SEARCH_FROM_ZERO -+ *goal = 0; -+#endif -+ return 0; -+ } -+ /* Reader: end */ -+ return -EAGAIN; -+} -+ -+/** -+ * ext3_alloc_branch - allocate and set up a chain of blocks. -+ * @inode: owner -+ * @num: depth of the chain (number of blocks to allocate) -+ * @offsets: offsets (in the blocks) to store the pointers to next. -+ * @branch: place to store the chain in. -+ * -+ * This function allocates @num blocks, zeroes out all but the last one, -+ * links them into chain and (if we are synchronous) writes them to disk. 
-+ * In other words, it prepares a branch that can be spliced onto the -+ * inode. It stores the information about that chain in the branch[], in -+ * the same format as ext3_get_branch() would do. We are calling it after -+ * we had read the existing part of chain and partial points to the last -+ * triple of that (one with zero ->key). Upon the exit we have the same -+ * picture as after the successful ext3_get_block(), excpet that in one -+ * place chain is disconnected - *branch->p is still zero (we did not -+ * set the last link), but branch->key contains the number that should -+ * be placed into *branch->p to fill that gap. -+ * -+ * If allocation fails we free all blocks we've allocated (and forget -+ * their buffer_heads) and return the error value the from failed -+ * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain -+ * as described above and return 0. -+ */ -+ -+static int ext3_alloc_branch(handle_t *handle, struct inode *inode, -+ int num, -+ unsigned long goal, -+ int *offsets, -+ Indirect *branch) -+{ -+ int blocksize = inode->i_sb->s_blocksize; -+ int n = 0, keys = 0; -+ int err = 0; -+ int i; -+ int parent = ext3_alloc_block(handle, inode, goal, &err); -+ -+ branch[0].key = cpu_to_le32(parent); -+ if (parent) { -+ for (n = 1; n < num; n++) { -+ struct buffer_head *bh; -+ /* Allocate the next block */ -+ int nr = ext3_alloc_block(handle, inode, parent, &err); -+ if (!nr) -+ break; -+ branch[n].key = cpu_to_le32(nr); -+ keys = n+1; -+ -+ /* -+ * Get buffer_head for parent block, zero it out -+ * and set the pointer to new one, then send -+ * parent to disk. 
-+ */ -+ bh = sb_getblk(inode->i_sb, parent); -+ branch[n].bh = bh; -+ lock_buffer(bh); -+ BUFFER_TRACE(bh, "call get_create_access"); -+ err = ext3_journal_get_create_access(handle, bh); -+ if (err) { -+ unlock_buffer(bh); -+ brelse(bh); -+ break; -+ } -+ -+ memset(bh->b_data, 0, blocksize); -+ branch[n].p = (u32*) bh->b_data + offsets[n]; -+ *branch[n].p = branch[n].key; -+ BUFFER_TRACE(bh, "marking uptodate"); -+ mark_buffer_uptodate(bh, 1); -+ unlock_buffer(bh); -+ -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) -+ break; -+ -+ parent = nr; -+ } -+ } -+ if (n == num) -+ return 0; -+ -+ /* Allocation failed, free what we already allocated */ -+ for (i = 1; i < keys; i++) { -+ BUFFER_TRACE(branch[i].bh, "call journal_forget"); -+ ext3_journal_forget(handle, branch[i].bh); -+ } -+ for (i = 0; i < keys; i++) -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); -+ return err; -+} -+ -+/** -+ * ext3_splice_branch - splice the allocated branch onto inode. -+ * @inode: owner -+ * @block: (logical) number of block we are adding -+ * @chain: chain of indirect blocks (with a missing link - see -+ * ext3_alloc_branch) -+ * @where: location of missing link -+ * @num: number of blocks we are adding -+ * -+ * This function verifies that chain (up to the missing link) had not -+ * changed, fills the missing link and does all housekeeping needed in -+ * inode (->i_blocks, etc.). In case of success we end up with the full -+ * chain to new block and return 0. Otherwise (== chain had been changed) -+ * we free the new blocks (forgetting their buffer_heads, indeed) and -+ * return -EAGAIN. 
-+ */ -+ -+static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block, -+ Indirect chain[4], Indirect *where, int num) -+{ -+ int i; -+ int err = 0; -+ -+ /* -+ * If we're splicing into a [td]indirect block (as opposed to the -+ * inode) then we need to get write access to the [td]indirect block -+ * before the splice. -+ */ -+ if (where->bh) { -+ BUFFER_TRACE(where->bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, where->bh); -+ if (err) -+ goto err_out; -+ } -+ /* Verify that place we are splicing to is still there and vacant */ -+ -+ /* Writer: pointers, ->i_next_alloc* */ -+ if (!verify_chain(chain, where-1) || *where->p) -+ /* Writer: end */ -+ goto changed; -+ -+ /* That's it */ -+ -+ *where->p = where->key; -+ inode->u.ext3_i.i_next_alloc_block = block; -+ inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key); -+#ifdef SEARCH_FROM_ZERO -+ inode->u.ext3_i.i_next_alloc_block = 0; -+ inode->u.ext3_i.i_next_alloc_goal = 0; -+#endif -+ /* Writer: end */ -+ -+ /* We are done with atomic stuff, now do the rest of housekeeping */ -+ -+ inode->i_ctime = CURRENT_TIME; -+ ext3_mark_inode_dirty(handle, inode); -+ -+ /* had we spliced it onto indirect block? */ -+ if (where->bh) { -+ /* -+ * akpm: If we spliced it onto an indirect block, we haven't -+ * altered the inode. Note however that if it is being spliced -+ * onto an indirect block at the very end of the file (the -+ * file is growing) then we *will* alter the inode to reflect -+ * the new i_size. But that is not done here - it is done in -+ * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode. -+ */ -+ jbd_debug(5, "splicing indirect only\n"); -+ BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, where->bh); -+ if (err) -+ goto err_out; -+ } else { -+ /* -+ * OK, we spliced it into the inode itself on a direct block. -+ * Inode was dirtied above. 
-+ */ -+ jbd_debug(5, "splicing direct\n"); -+ } -+ return err; -+ -+changed: -+ /* -+ * AKPM: if where[i].bh isn't part of the current updating -+ * transaction then we explode nastily. Test this code path. -+ */ -+ jbd_debug(1, "the chain changed: try again\n"); -+ err = -EAGAIN; -+ -+err_out: -+ for (i = 1; i < num; i++) { -+ BUFFER_TRACE(where[i].bh, "call journal_forget"); -+ ext3_journal_forget(handle, where[i].bh); -+ } -+ /* For the normal collision cleanup case, we free up the blocks. -+ * On genuine filesystem errors we don't even think about doing -+ * that. */ -+ if (err == -EAGAIN) -+ for (i = 0; i < num; i++) -+ ext3_free_blocks(handle, inode, -+ le32_to_cpu(where[i].key), 1); -+ return err; -+} -+ -+/* -+ * Allocation strategy is simple: if we have to allocate something, we will -+ * have to go the whole way to leaf. So let's do it before attaching anything -+ * to tree, set linkage between the newborn blocks, write them if sync is -+ * required, recheck the path, free and repeat if check fails, otherwise -+ * set the last missing link (that will protect us from any truncate-generated -+ * removals - all blocks on the path are immune now) and possibly force the -+ * write on the parent block. -+ * That has a nice additional property: no special recovery from the failed -+ * allocations is needed - we simply release blocks and do not touch anything -+ * reachable from inode. -+ * -+ * akpm: `handle' can be NULL if create == 0. -+ * -+ * The BKL may not be held on entry here. Be sure to take it early. 
-+ */ -+ -+static int ext3_get_block_handle(handle_t *handle, struct inode *inode, -+ long iblock, -+ struct buffer_head *bh_result, int create) -+{ -+ int err = -EIO; -+ int offsets[4]; -+ Indirect chain[4]; -+ Indirect *partial; -+ unsigned long goal; -+ int left; -+ int depth = ext3_block_to_path(inode, iblock, offsets); -+ loff_t new_size; -+ -+ J_ASSERT(handle != NULL || create == 0); -+ -+ if (depth == 0) -+ goto out; -+ -+ lock_kernel(); -+reread: -+ partial = ext3_get_branch(inode, depth, offsets, chain, &err); -+ -+ /* Simplest case - block found, no allocation needed */ -+ if (!partial) { -+ bh_result->b_state &= ~(1UL << BH_New); -+got_it: -+ bh_result->b_dev = inode->i_dev; -+ bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key); -+ bh_result->b_state |= (1UL << BH_Mapped); -+ /* Clean up and exit */ -+ partial = chain+depth-1; /* the whole chain */ -+ goto cleanup; -+ } -+ -+ /* Next simple case - plain lookup or failed read of indirect block */ -+ if (!create || err == -EIO) { -+cleanup: -+ while (partial > chain) { -+ BUFFER_TRACE(partial->bh, "call brelse"); -+ brelse(partial->bh); -+ partial--; -+ } -+ BUFFER_TRACE(bh_result, "returned"); -+ unlock_kernel(); -+out: -+ return err; -+ } -+ -+ /* -+ * Indirect block might be removed by truncate while we were -+ * reading it. Handling of that case (forget what we've got and -+ * reread) is taken out of the main path. 
-+ */ -+ if (err == -EAGAIN) -+ goto changed; -+ -+ if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0) -+ goto changed; -+ -+ left = (chain + depth) - partial; -+ -+ /* -+ * Block out ext3_truncate while we alter the tree -+ */ -+ down_read(&inode->u.ext3_i.truncate_sem); -+ err = ext3_alloc_branch(handle, inode, left, goal, -+ offsets+(partial-chain), partial); -+ -+ /* The ext3_splice_branch call will free and forget any buffers -+ * on the new chain if there is a failure, but that risks using -+ * up transaction credits, especially for bitmaps where the -+ * credits cannot be returned. Can we handle this somehow? We -+ * may need to return -EAGAIN upwards in the worst case. --sct */ -+ if (!err) -+ err = ext3_splice_branch(handle, inode, iblock, chain, -+ partial, left); -+ up_read(&inode->u.ext3_i.truncate_sem); -+ if (err == -EAGAIN) -+ goto changed; -+ if (err) -+ goto cleanup; -+ -+ new_size = inode->i_size; -+ /* -+ * This is not racy against ext3_truncate's modification of i_disksize -+ * because VM/VFS ensures that the file cannot be extended while -+ * truncate is in progress. It is racy between multiple parallel -+ * instances of get_block, but we have the BKL. -+ */ -+ if (new_size > inode->u.ext3_i.i_disksize) -+ inode->u.ext3_i.i_disksize = new_size; -+ -+ bh_result->b_state |= (1UL << BH_New); -+ goto got_it; -+ -+changed: -+ while (partial > chain) { -+ jbd_debug(1, "buffer chain changed, retrying\n"); -+ BUFFER_TRACE(partial->bh, "brelsing"); -+ brelse(partial->bh); -+ partial--; -+ } -+ goto reread; -+} -+ -+/* -+ * The BKL is not held on entry here. 
-+ */ -+static int ext3_get_block(struct inode *inode, long iblock, -+ struct buffer_head *bh_result, int create) -+{ -+ handle_t *handle = 0; -+ int ret; -+ -+ if (create) { -+ handle = ext3_journal_current_handle(); -+ J_ASSERT(handle != 0); -+ } -+ ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create); -+ return ret; -+} -+ -+/* -+ * `handle' can be NULL if create is zero -+ */ -+struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode, -+ long block, int create, int * errp) -+{ -+ struct buffer_head dummy; -+ int fatal = 0, err; -+ -+ J_ASSERT(handle != NULL || create == 0); -+ -+ dummy.b_state = 0; -+ dummy.b_blocknr = -1000; -+ buffer_trace_init(&dummy.b_history); -+ *errp = ext3_get_block_handle(handle, inode, block, &dummy, create); -+ if (!*errp && buffer_mapped(&dummy)) { -+ struct buffer_head *bh; -+ bh = sb_getblk(inode->i_sb, dummy.b_blocknr); -+ if (buffer_new(&dummy)) { -+ J_ASSERT(create != 0); -+ J_ASSERT(handle != 0); -+ -+ /* Now that we do not always journal data, we -+ should keep in mind whether this should -+ always journal the new buffer as metadata. -+ For now, regular file writes use -+ ext3_get_block instead, so it's not a -+ problem. 
*/ -+ lock_kernel(); -+ lock_buffer(bh); -+ BUFFER_TRACE(bh, "call get_create_access"); -+ fatal = ext3_journal_get_create_access(handle, bh); -+ if (!fatal) { -+ memset(bh->b_data, 0, -+ inode->i_sb->s_blocksize); -+ mark_buffer_uptodate(bh, 1); -+ } -+ unlock_buffer(bh); -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (!fatal) fatal = err; -+ unlock_kernel(); -+ } else { -+ BUFFER_TRACE(bh, "not a new buffer"); -+ } -+ if (fatal) { -+ *errp = fatal; -+ brelse(bh); -+ bh = NULL; -+ } -+ return bh; -+ } -+ return NULL; -+} -+ -+struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode, -+ int block, int create, int *err) -+{ -+ struct buffer_head * bh; -+ int prev_blocks; -+ -+ prev_blocks = inode->i_blocks; -+ -+ bh = ext3_getblk (handle, inode, block, create, err); -+ if (!bh) -+ return bh; -+#ifdef EXT3_PREALLOCATE -+ /* -+ * If the inode has grown, and this is a directory, then use a few -+ * more of the preallocated blocks to keep directory fragmentation -+ * down. The preallocated blocks are guaranteed to be contiguous. 
-+ */ -+ if (create && -+ S_ISDIR(inode->i_mode) && -+ inode->i_blocks > prev_blocks && -+ EXT3_HAS_COMPAT_FEATURE(inode->i_sb, -+ EXT3_FEATURE_COMPAT_DIR_PREALLOC)) { -+ int i; -+ struct buffer_head *tmp_bh; -+ -+ for (i = 1; -+ inode->u.ext3_i.i_prealloc_count && -+ i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks; -+ i++) { -+ /* -+ * ext3_getblk will zero out the contents of the -+ * directory for us -+ */ -+ tmp_bh = ext3_getblk(handle, inode, -+ block+i, create, err); -+ if (!tmp_bh) { -+ brelse (bh); -+ return 0; -+ } -+ brelse (tmp_bh); -+ } -+ } -+#endif -+ if (buffer_uptodate(bh)) -+ return bh; -+ ll_rw_block (READ, 1, &bh); -+ wait_on_buffer (bh); -+ if (buffer_uptodate(bh)) -+ return bh; -+ brelse (bh); -+ *err = -EIO; -+ return NULL; -+} -+ -+static int walk_page_buffers( handle_t *handle, -+ struct buffer_head *head, -+ unsigned from, -+ unsigned to, -+ int *partial, -+ int (*fn)( handle_t *handle, -+ struct buffer_head *bh)) -+{ -+ struct buffer_head *bh; -+ unsigned block_start, block_end; -+ unsigned blocksize = head->b_size; -+ int err, ret = 0; -+ -+ for ( bh = head, block_start = 0; -+ ret == 0 && (bh != head || !block_start); -+ block_start = block_end, bh = bh->b_this_page) -+ { -+ block_end = block_start + blocksize; -+ if (block_end <= from || block_start >= to) { -+ if (partial && !buffer_uptodate(bh)) -+ *partial = 1; -+ continue; -+ } -+ err = (*fn)(handle, bh); -+ if (!ret) -+ ret = err; -+ } -+ return ret; -+} -+ -+/* -+ * To preserve ordering, it is essential that the hole instantiation and -+ * the data write be encapsulated in a single transaction. We cannot -+ * close off a transaction and start a new one between the ext3_get_block() -+ * and the commit_write(). So doing the journal_start at the start of -+ * prepare_write() is the right place. -+ * -+ * Also, this function can nest inside ext3_writepage() -> -+ * block_write_full_page(). 
In that case, we *know* that ext3_writepage() -+ * has generated enough buffer credits to do the whole page. So we won't -+ * block on the journal in that case, which is good, because the caller may -+ * be PF_MEMALLOC. -+ * -+ * By accident, ext3 can be reentered when a transaction is open via -+ * quota file writes. If we were to commit the transaction while thus -+ * reentered, there can be a deadlock - we would be holding a quota -+ * lock, and the commit would never complete if another thread had a -+ * transaction open and was blocking on the quota lock - a ranking -+ * violation. -+ * -+ * So what we do is to rely on the fact that journal_stop/journal_start -+ * will _not_ run commit under these circumstances because handle->h_ref -+ * is elevated. We'll still have enough credits for the tiny quotafile -+ * write. -+ */ -+ -+static int do_journal_get_write_access(handle_t *handle, -+ struct buffer_head *bh) -+{ -+ return ext3_journal_get_write_access(handle, bh); -+} -+ -+static int ext3_prepare_write(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ struct inode *inode = page->mapping->host; -+ int ret, needed_blocks = ext3_writepage_trans_blocks(inode); -+ handle_t *handle; -+ -+ lock_kernel(); -+ handle = ext3_journal_start(inode, needed_blocks); -+ if (IS_ERR(handle)) { -+ ret = PTR_ERR(handle); -+ goto out; -+ } -+ unlock_kernel(); -+ ret = block_prepare_write(page, from, to, ext3_get_block); -+ lock_kernel(); -+ if (ret != 0) -+ goto prepare_write_failed; -+ -+ if (ext3_should_journal_data(inode)) { -+ ret = walk_page_buffers(handle, page->buffers, -+ from, to, NULL, do_journal_get_write_access); -+ if (ret) { -+ /* -+ * We're going to fail this prepare_write(), -+ * so commit_write() will not be called. -+ * We need to undo block_prepare_write()'s kmap(). -+ * AKPM: Do we need to clear PageUptodate? I don't -+ * think so. 
-+ */ -+ kunmap(page); -+ } -+ } -+prepare_write_failed: -+ if (ret) -+ ext3_journal_stop(handle, inode); -+out: -+ unlock_kernel(); -+ return ret; -+} -+ -+static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh) -+{ -+ return ext3_journal_dirty_data(handle, bh, 0); -+} -+ -+/* -+ * For ext3_writepage(). We also brelse() the buffer to account for -+ * the bget() which ext3_writepage() performs. -+ */ -+static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh) -+{ -+ int ret = ext3_journal_dirty_data(handle, bh, 1); -+ __brelse(bh); -+ return ret; -+} -+ -+/* For commit_write() in data=journal mode */ -+static int commit_write_fn(handle_t *handle, struct buffer_head *bh) -+{ -+ set_bit(BH_Uptodate, &bh->b_state); -+ return ext3_journal_dirty_metadata(handle, bh); -+} -+ -+/* -+ * We need to pick up the new inode size which generic_commit_write gave us -+ * `file' can be NULL - eg, when called from block_symlink(). -+ * -+ * ext3 inode->i_dirty_buffers policy: If we're journalling data we -+ * definitely don't want them to appear on the inode at all - instead -+ * we need to manage them at the JBD layer and we need to intercept -+ * the relevant sync operations and translate them into journal operations. -+ * -+ * If we're not journalling data then we can just leave the buffers -+ * on ->i_dirty_buffers. If someone writes them out for us then thanks. -+ * Otherwise we'll do it in commit, if we're using ordered data. 
-+ */ -+ -+static int ext3_commit_write(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ handle_t *handle = ext3_journal_current_handle(); -+ struct inode *inode = page->mapping->host; -+ int ret = 0, ret2; -+ -+ lock_kernel(); -+ if (ext3_should_journal_data(inode)) { -+ /* -+ * Here we duplicate the generic_commit_write() functionality -+ */ -+ int partial = 0; -+ loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; -+ -+ ret = walk_page_buffers(handle, page->buffers, -+ from, to, &partial, commit_write_fn); -+ if (!partial) -+ SetPageUptodate(page); -+ kunmap(page); -+ if (pos > inode->i_size) -+ inode->i_size = pos; -+ EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; -+ } else { -+ if (ext3_should_order_data(inode)) { -+ ret = walk_page_buffers(handle, page->buffers, -+ from, to, NULL, journal_dirty_sync_data); -+ } -+ /* Be careful here if generic_commit_write becomes a -+ * required invocation after block_prepare_write. */ -+ if (ret == 0) { -+ ret = generic_commit_write(file, page, from, to); -+ } else { -+ /* -+ * block_prepare_write() was called, but we're not -+ * going to call generic_commit_write(). So we -+ * need to perform generic_commit_write()'s kunmap -+ * by hand. -+ */ -+ kunmap(page); -+ } -+ } -+ if (inode->i_size > inode->u.ext3_i.i_disksize) { -+ inode->u.ext3_i.i_disksize = inode->i_size; -+ ret2 = ext3_mark_inode_dirty(handle, inode); -+ if (!ret) -+ ret = ret2; -+ } -+ ret2 = ext3_journal_stop(handle, inode); -+ unlock_kernel(); -+ if (!ret) -+ ret = ret2; -+ return ret; -+} -+ -+/* -+ * bmap() is special. It gets used by applications such as lilo and by -+ * the swapper to find the on-disk block of a specific piece of data. -+ * -+ * Naturally, this is dangerous if the block concerned is still in the -+ * journal. 
If somebody makes a swapfile on an ext3 data-journaling -+ * filesystem and enables swap, then they may get a nasty shock when the -+ * data getting swapped to that swapfile suddenly gets overwritten by -+ * the original zero's written out previously to the journal and -+ * awaiting writeback in the kernel's buffer cache. -+ * -+ * So, if we see any bmap calls here on a modified, data-journaled file, -+ * take extra steps to flush any blocks which might be in the cache. -+ */ -+static int ext3_bmap(struct address_space *mapping, long block) -+{ -+ struct inode *inode = mapping->host; -+ journal_t *journal; -+ int err; -+ -+ if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { -+ /* -+ * This is a REALLY heavyweight approach, but the use of -+ * bmap on dirty files is expected to be extremely rare: -+ * only if we run lilo or swapon on a freshly made file -+ * do we expect this to happen. -+ * -+ * (bmap requires CAP_SYS_RAWIO so this does not -+ * represent an unprivileged user DOS attack --- we'd be -+ * in trouble if mortal users could trigger this path at -+ * will.) -+ * -+ * NB. EXT3_STATE_JDATA is not set on files other than -+ * regular files. If somebody wants to bmap a directory -+ * or symlink and gets confused because the buffer -+ * hasn't yet been flushed to disk, they deserve -+ * everything they get. -+ */ -+ -+ EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA; -+ journal = EXT3_JOURNAL(inode); -+ journal_lock_updates(journal); -+ err = journal_flush(journal); -+ journal_unlock_updates(journal); -+ -+ if (err) -+ return 0; -+ } -+ -+ return generic_block_bmap(mapping,block,ext3_get_block); -+} -+ -+static int bget_one(handle_t *handle, struct buffer_head *bh) -+{ -+ atomic_inc(&bh->b_count); -+ return 0; -+} -+ -+/* -+ * Note that we always start a transaction even if we're not journalling -+ * data. 
This is to preserve ordering: any hole instantiation within -+ * __block_write_full_page -> ext3_get_block() should be journalled -+ * along with the data so we don't crash and then get metadata which -+ * refers to old data. -+ * -+ * In all journalling modes block_write_full_page() will start the I/O. -+ * -+ * Problem: -+ * -+ * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> -+ * ext3_writepage() -+ * -+ * Similar for: -+ * -+ * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... -+ * -+ * Same applies to ext3_get_block(). We will deadlock on various things like -+ * lock_journal and i_truncate_sem. -+ * -+ * Setting PF_MEMALLOC here doesn't work - too many internal memory -+ * allocations fail. -+ * -+ * 16May01: If we're reentered then journal_current_handle() will be -+ * non-zero. We simply *return*. -+ * -+ * 1 July 2001: @@@ FIXME: -+ * In journalled data mode, a data buffer may be metadata against the -+ * current transaction. But the same file is part of a shared mapping -+ * and someone does a writepage() on it. -+ * -+ * We will move the buffer onto the async_data list, but *after* it has -+ * been dirtied. So there's a small window where we have dirty data on -+ * BJ_Metadata. -+ * -+ * Note that this only applies to the last partial page in the file. The -+ * bit which block_write_full_page() uses prepare/commit for. (That's -+ * broken code anyway: it's wrong for msync()). -+ * -+ * It's a rare case: affects the final partial page, for journalled data -+ * where the file is subject to bith write() and writepage() in the same -+ * transction. To fix it we'll need a custom block_write_full_page(). -+ * We'll probably need that anyway for journalling writepage() output. -+ * -+ * We don't honour synchronous mounts for writepage(). That would be -+ * disastrous. Any write() or metadata operation will sync the fs for -+ * us. 
-+ */ -+static int ext3_writepage(struct page *page) -+{ -+ struct inode *inode = page->mapping->host; -+ struct buffer_head *page_buffers; -+ handle_t *handle = NULL; -+ int ret = 0, err; -+ int needed; -+ int order_data; -+ -+ J_ASSERT(PageLocked(page)); -+ -+ /* -+ * We give up here if we're reentered, because it might be -+ * for a different filesystem. One *could* look for a -+ * nested transaction opportunity. -+ */ -+ lock_kernel(); -+ if (ext3_journal_current_handle()) -+ goto out_fail; -+ -+ needed = ext3_writepage_trans_blocks(inode); -+ if (current->flags & PF_MEMALLOC) -+ handle = ext3_journal_try_start(inode, needed); -+ else -+ handle = ext3_journal_start(inode, needed); -+ -+ if (IS_ERR(handle)) { -+ ret = PTR_ERR(handle); -+ goto out_fail; -+ } -+ -+ order_data = ext3_should_order_data(inode) || -+ ext3_should_journal_data(inode); -+ -+ unlock_kernel(); -+ -+ page_buffers = NULL; /* Purely to prevent compiler warning */ -+ -+ /* bget() all the buffers */ -+ if (order_data) { -+ if (!page->buffers) -+ create_empty_buffers(page, -+ inode->i_dev, inode->i_sb->s_blocksize); -+ page_buffers = page->buffers; -+ walk_page_buffers(handle, page_buffers, 0, -+ PAGE_CACHE_SIZE, NULL, bget_one); -+ } -+ -+ ret = block_write_full_page(page, ext3_get_block); -+ -+ /* -+ * The page can become unlocked at any point now, and -+ * truncate can then come in and change things. So we -+ * can't touch *page from now on. But *page_buffers is -+ * safe due to elevated refcount. 
-+ */ -+ -+ handle = ext3_journal_current_handle(); -+ lock_kernel(); -+ -+ /* And attach them to the current transaction */ -+ if (order_data) { -+ err = walk_page_buffers(handle, page_buffers, -+ 0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data); -+ if (!ret) -+ ret = err; -+ } -+ -+ err = ext3_journal_stop(handle, inode); -+ if (!ret) -+ ret = err; -+ unlock_kernel(); -+ return ret; -+ -+out_fail: -+ -+ unlock_kernel(); -+ SetPageDirty(page); -+ UnlockPage(page); -+ return ret; -+} -+ -+static int ext3_readpage(struct file *file, struct page *page) -+{ -+ return block_read_full_page(page,ext3_get_block); -+} -+ -+ -+static int ext3_flushpage(struct page *page, unsigned long offset) -+{ -+ journal_t *journal = EXT3_JOURNAL(page->mapping->host); -+ return journal_flushpage(journal, page, offset); -+} -+ -+static int ext3_releasepage(struct page *page, int wait) -+{ -+ journal_t *journal = EXT3_JOURNAL(page->mapping->host); -+ return journal_try_to_free_buffers(journal, page, wait); -+} -+ -+ -+struct address_space_operations ext3_aops = { -+ readpage: ext3_readpage, /* BKL not held. Don't need */ -+ writepage: ext3_writepage, /* BKL not held. We take it */ -+ sync_page: block_sync_page, -+ prepare_write: ext3_prepare_write, /* BKL not held. We take it */ -+ commit_write: ext3_commit_write, /* BKL not held. We take it */ -+ bmap: ext3_bmap, /* BKL held */ -+ flushpage: ext3_flushpage, /* BKL not held. Don't need */ -+ releasepage: ext3_releasepage, /* BKL not held. Don't need */ -+}; -+ -+/* -+ * ext3_block_truncate_page() zeroes out a mapping from file offset `from' -+ * up to the end of the block which corresponds to `from'. -+ * This required during truncate. We need to physically zero the tail end -+ * of that block so it doesn't yield old data if the file is later grown. 
-+ */ -+static int ext3_block_truncate_page(handle_t *handle, -+ struct address_space *mapping, loff_t from) -+{ -+ unsigned long index = from >> PAGE_CACHE_SHIFT; -+ unsigned offset = from & (PAGE_CACHE_SIZE-1); -+ unsigned blocksize, iblock, length, pos; -+ struct inode *inode = mapping->host; -+ struct page *page; -+ struct buffer_head *bh; -+ int err; -+ -+ blocksize = inode->i_sb->s_blocksize; -+ length = offset & (blocksize - 1); -+ -+ /* Block boundary? Nothing to do */ -+ if (!length) -+ return 0; -+ -+ length = blocksize - length; -+ iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); -+ -+ page = grab_cache_page(mapping, index); -+ err = -ENOMEM; -+ if (!page) -+ goto out; -+ -+ if (!page->buffers) -+ create_empty_buffers(page, inode->i_dev, blocksize); -+ -+ /* Find the buffer that contains "offset" */ -+ bh = page->buffers; -+ pos = blocksize; -+ while (offset >= pos) { -+ bh = bh->b_this_page; -+ iblock++; -+ pos += blocksize; -+ } -+ -+ err = 0; -+ if (!buffer_mapped(bh)) { -+ /* Hole? Nothing to do */ -+ if (buffer_uptodate(bh)) -+ goto unlock; -+ ext3_get_block(inode, iblock, bh, 0); -+ /* Still unmapped? Nothing to do */ -+ if (!buffer_mapped(bh)) -+ goto unlock; -+ } -+ -+ /* Ok, it's mapped. Make sure it's up-to-date */ -+ if (Page_Uptodate(page)) -+ set_bit(BH_Uptodate, &bh->b_state); -+ -+ if (!buffer_uptodate(bh)) { -+ err = -EIO; -+ ll_rw_block(READ, 1, &bh); -+ wait_on_buffer(bh); -+ /* Uhhuh. Read error. Complain and punt. 
*/ -+ if (!buffer_uptodate(bh)) -+ goto unlock; -+ } -+ -+ if (ext3_should_journal_data(inode)) { -+ BUFFER_TRACE(bh, "get write access"); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ goto unlock; -+ } -+ -+ memset(kmap(page) + offset, 0, length); -+ flush_dcache_page(page); -+ kunmap(page); -+ -+ BUFFER_TRACE(bh, "zeroed end of block"); -+ -+ err = 0; -+ if (ext3_should_journal_data(inode)) { -+ err = ext3_journal_dirty_metadata(handle, bh); -+ } else { -+ if (ext3_should_order_data(inode)) -+ err = ext3_journal_dirty_data(handle, bh, 0); -+ __mark_buffer_dirty(bh); -+ } -+ -+unlock: -+ UnlockPage(page); -+ page_cache_release(page); -+out: -+ return err; -+} -+ -+/* -+ * Probably it should be a library function... search for first non-zero word -+ * or memcmp with zero_page, whatever is better for particular architecture. -+ * Linus? -+ */ -+static inline int all_zeroes(u32 *p, u32 *q) -+{ -+ while (p < q) -+ if (*p++) -+ return 0; -+ return 1; -+} -+ -+/** -+ * ext3_find_shared - find the indirect blocks for partial truncation. -+ * @inode: inode in question -+ * @depth: depth of the affected branch -+ * @offsets: offsets of pointers in that branch (see ext3_block_to_path) -+ * @chain: place to store the pointers to partial indirect blocks -+ * @top: place to the (detached) top of branch -+ * -+ * This is a helper function used by ext3_truncate(). -+ * -+ * When we do truncate() we may have to clean the ends of several -+ * indirect blocks but leave the blocks themselves alive. Block is -+ * partially truncated if some data below the new i_size is refered -+ * from it (and it is on the path to the first completely truncated -+ * data block, indeed). We have to free the top of that path along -+ * with everything to the right of the path. 
Since no allocation -+ * past the truncation point is possible until ext3_truncate() -+ * finishes, we may safely do the latter, but top of branch may -+ * require special attention - pageout below the truncation point -+ * might try to populate it. -+ * -+ * We atomically detach the top of branch from the tree, store the -+ * block number of its root in *@top, pointers to buffer_heads of -+ * partially truncated blocks - in @chain[].bh and pointers to -+ * their last elements that should not be removed - in -+ * @chain[].p. Return value is the pointer to last filled element -+ * of @chain. -+ * -+ * The work left to caller to do the actual freeing of subtrees: -+ * a) free the subtree starting from *@top -+ * b) free the subtrees whose roots are stored in -+ * (@chain[i].p+1 .. end of @chain[i].bh->b_data) -+ * c) free the subtrees growing from the inode past the @chain[0]. -+ * (no partially truncated stuff there). */ -+ -+static Indirect *ext3_find_shared(struct inode *inode, -+ int depth, -+ int offsets[4], -+ Indirect chain[4], -+ u32 *top) -+{ -+ Indirect *partial, *p; -+ int k, err; -+ -+ *top = 0; -+ /* Make k index the deepest non-null offest + 1 */ -+ for (k = depth; k > 1 && !offsets[k-1]; k--) -+ ; -+ partial = ext3_get_branch(inode, k, offsets, chain, &err); -+ /* Writer: pointers */ -+ if (!partial) -+ partial = chain + k-1; -+ /* -+ * If the branch acquired continuation since we've looked at it - -+ * fine, it should all survive and (new) top doesn't belong to us. -+ */ -+ if (!partial->key && *partial->p) -+ /* Writer: end */ -+ goto no_top; -+ for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--) -+ ; -+ /* -+ * OK, we've found the last block that must survive. The rest of our -+ * branch should be detached before unlocking. However, if that rest -+ * of branch is all ours and does not grow immediately from the inode -+ * it's easier to cheat and just decrement partial->p. 
-+ */ -+ if (p == chain + k - 1 && p > chain) { -+ p->p--; -+ } else { -+ *top = *p->p; -+ /* Nope, don't do this in ext3. Must leave the tree intact */ -+#if 0 -+ *p->p = 0; -+#endif -+ } -+ /* Writer: end */ -+ -+ while(partial > p) -+ { -+ brelse(partial->bh); -+ partial--; -+ } -+no_top: -+ return partial; -+} -+ -+/* -+ * Zero a number of block pointers in either an inode or an indirect block. -+ * If we restart the transaction we must again get write access to the -+ * indirect block for further modification. -+ * -+ * We release `count' blocks on disk, but (last - first) may be greater -+ * than `count' because there can be holes in there. -+ */ -+static void -+ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, -+ unsigned long block_to_free, unsigned long count, -+ u32 *first, u32 *last) -+{ -+ u32 *p; -+ if (try_to_extend_transaction(handle, inode)) { -+ if (bh) { -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ ext3_journal_dirty_metadata(handle, bh); -+ } -+ ext3_mark_inode_dirty(handle, inode); -+ ext3_journal_test_restart(handle, inode); -+ BUFFER_TRACE(bh, "get_write_access"); -+ ext3_journal_get_write_access(handle, bh); -+ } -+ -+ /* -+ * Any buffers which are on the journal will be in memory. We find -+ * them on the hash table so journal_revoke() will run journal_forget() -+ * on them. We've already detached each block from the file, so -+ * bforget() in journal_forget() should be safe. -+ * -+ * AKPM: turn on bforget in journal_forget()!!! 
-+ */ -+ for (p = first; p < last; p++) { -+ u32 nr = le32_to_cpu(*p); -+ if (nr) { -+ struct buffer_head *bh; -+ -+ *p = 0; -+ bh = sb_get_hash_table(inode->i_sb, nr); -+ ext3_forget(handle, 0, inode, bh, nr); -+ } -+ } -+ -+ ext3_free_blocks(handle, inode, block_to_free, count); -+} -+ -+/** -+ * ext3_free_data - free a list of data blocks -+ * @handle: handle for this transaction -+ * @inode: inode we are dealing with -+ * @this_bh: indirect buffer_head which contains *@first and *@last -+ * @first: array of block numbers -+ * @last: points immediately past the end of array -+ * -+ * We are freeing all blocks refered from that array (numbers are stored as -+ * little-endian 32-bit) and updating @inode->i_blocks appropriately. -+ * -+ * We accumulate contiguous runs of blocks to free. Conveniently, if these -+ * blocks are contiguous then releasing them at one time will only affect one -+ * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't -+ * actually use a lot of journal space. -+ * -+ * @this_bh will be %NULL if @first and @last point into the inode's direct -+ * block pointers. -+ */ -+static void ext3_free_data(handle_t *handle, struct inode *inode, -+ struct buffer_head *this_bh, u32 *first, u32 *last) -+{ -+ unsigned long block_to_free = 0; /* Starting block # of a run */ -+ unsigned long count = 0; /* Number of blocks in the run */ -+ u32 *block_to_free_p = NULL; /* Pointer into inode/ind -+ corresponding to -+ block_to_free */ -+ unsigned long nr; /* Current block # */ -+ u32 *p; /* Pointer into inode/ind -+ for current block */ -+ int err; -+ -+ if (this_bh) { /* For indirect block */ -+ BUFFER_TRACE(this_bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, this_bh); -+ /* Important: if we can't update the indirect pointers -+ * to the blocks, we can't free them. 
*/ -+ if (err) -+ return; -+ } -+ -+ for (p = first; p < last; p++) { -+ nr = le32_to_cpu(*p); -+ if (nr) { -+ /* accumulate blocks to free if they're contiguous */ -+ if (count == 0) { -+ block_to_free = nr; -+ block_to_free_p = p; -+ count = 1; -+ } else if (nr == block_to_free + count) { -+ count++; -+ } else { -+ ext3_clear_blocks(handle, inode, this_bh, -+ block_to_free, -+ count, block_to_free_p, p); -+ block_to_free = nr; -+ block_to_free_p = p; -+ count = 1; -+ } -+ } -+ } -+ -+ if (count > 0) -+ ext3_clear_blocks(handle, inode, this_bh, block_to_free, -+ count, block_to_free_p, p); -+ -+ if (this_bh) { -+ BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); -+ ext3_journal_dirty_metadata(handle, this_bh); -+ } -+} -+ -+/** -+ * ext3_free_branches - free an array of branches -+ * @handle: JBD handle for this transaction -+ * @inode: inode we are dealing with -+ * @parent_bh: the buffer_head which contains *@first and *@last -+ * @first: array of block numbers -+ * @last: pointer immediately past the end of array -+ * @depth: depth of the branches to free -+ * -+ * We are freeing all blocks refered from these branches (numbers are -+ * stored as little-endian 32-bit) and updating @inode->i_blocks -+ * appropriately. -+ */ -+static void ext3_free_branches(handle_t *handle, struct inode *inode, -+ struct buffer_head *parent_bh, -+ u32 *first, u32 *last, int depth) -+{ -+ unsigned long nr; -+ u32 *p; -+ -+ if (is_handle_aborted(handle)) -+ return; -+ -+ if (depth--) { -+ struct buffer_head *bh; -+ int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); -+ p = last; -+ while (--p >= first) { -+ nr = le32_to_cpu(*p); -+ if (!nr) -+ continue; /* A hole */ -+ -+ /* Go read the buffer for the next level down */ -+ bh = sb_bread(inode->i_sb, nr); -+ -+ /* -+ * A read failure? Report error and clear slot -+ * (should be rare). 
-+ */ -+ if (!bh) { -+ ext3_error(inode->i_sb, "ext3_free_branches", -+ "Read failure, inode=%ld, block=%ld", -+ inode->i_ino, nr); -+ continue; -+ } -+ -+ /* This zaps the entire block. Bottom up. */ -+ BUFFER_TRACE(bh, "free child branches"); -+ ext3_free_branches(handle, inode, bh, (u32*)bh->b_data, -+ (u32*)bh->b_data + addr_per_block, -+ depth); -+ -+ /* -+ * We've probably journalled the indirect block several -+ * times during the truncate. But it's no longer -+ * needed and we now drop it from the transaction via -+ * journal_revoke(). -+ * -+ * That's easy if it's exclusively part of this -+ * transaction. But if it's part of the committing -+ * transaction then journal_forget() will simply -+ * brelse() it. That means that if the underlying -+ * block is reallocated in ext3_get_block(), -+ * unmap_underlying_metadata() will find this block -+ * and will try to get rid of it. damn, damn. -+ * -+ * If this block has already been committed to the -+ * journal, a revoke record will be written. And -+ * revoke records must be emitted *before* clearing -+ * this block's bit in the bitmaps. -+ */ -+ ext3_forget(handle, 1, inode, bh, bh->b_blocknr); -+ -+ /* -+ * Everything below this this pointer has been -+ * released. Now let this top-of-subtree go. -+ * -+ * We want the freeing of this indirect block to be -+ * atomic in the journal with the updating of the -+ * bitmap block which owns it. So make some room in -+ * the journal. -+ * -+ * We zero the parent pointer *after* freeing its -+ * pointee in the bitmaps, so if extend_transaction() -+ * for some reason fails to put the bitmap changes and -+ * the release into the same transaction, recovery -+ * will merely complain about releasing a free block, -+ * rather than leaking blocks. 
-+ */ -+ if (is_handle_aborted(handle)) -+ return; -+ if (try_to_extend_transaction(handle, inode)) { -+ ext3_mark_inode_dirty(handle, inode); -+ ext3_journal_test_restart(handle, inode); -+ } -+ -+ ext3_free_blocks(handle, inode, nr, 1); -+ -+ if (parent_bh) { -+ /* -+ * The block which we have just freed is -+ * pointed to by an indirect block: journal it -+ */ -+ BUFFER_TRACE(parent_bh, "get_write_access"); -+ if (!ext3_journal_get_write_access(handle, -+ parent_bh)){ -+ *p = 0; -+ BUFFER_TRACE(parent_bh, -+ "call ext3_journal_dirty_metadata"); -+ ext3_journal_dirty_metadata(handle, -+ parent_bh); -+ } -+ } -+ } -+ } else { -+ /* We have reached the bottom of the tree. */ -+ BUFFER_TRACE(parent_bh, "free data blocks"); -+ ext3_free_data(handle, inode, parent_bh, first, last); -+ } -+} -+ -+/* -+ * ext3_truncate() -+ * -+ * We block out ext3_get_block() block instantiations across the entire -+ * transaction, and VFS/VM ensures that ext3_truncate() cannot run -+ * simultaneously on behalf of the same inode. -+ * -+ * As we work through the truncate and commmit bits of it to the journal there -+ * is one core, guiding principle: the file's tree must always be consistent on -+ * disk. We must be able to restart the truncate after a crash. -+ * -+ * The file's tree may be transiently inconsistent in memory (although it -+ * probably isn't), but whenever we close off and commit a journal transaction, -+ * the contents of (the filesystem + the journal) must be consistent and -+ * restartable. It's pretty simple, really: bottom up, right to left (although -+ * left-to-right works OK too). -+ * -+ * Note that at recovery time, journal replay occurs *before* the restart of -+ * truncate against the orphan inode list. -+ * -+ * The committed inode has the new, desired i_size (which is the same as -+ * i_disksize in this case). 
After a crash, ext3_orphan_cleanup() will see -+ * that this inode's truncate did not complete and it will again call -+ * ext3_truncate() to have another go. So there will be instantiated blocks -+ * to the right of the truncation point in a crashed ext3 filesystem. But -+ * that's fine - as long as they are linked from the inode, the post-crash -+ * ext3_truncate() run will find them and release them. -+ */ -+ -+void ext3_truncate(struct inode * inode) -+{ -+ handle_t *handle; -+ u32 *i_data = inode->u.ext3_i.i_data; -+ int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); -+ int offsets[4]; -+ Indirect chain[4]; -+ Indirect *partial; -+ int nr = 0; -+ int n; -+ long last_block; -+ unsigned blocksize; -+ -+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || -+ S_ISLNK(inode->i_mode))) -+ return; -+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) -+ return; -+ -+ ext3_discard_prealloc(inode); -+ -+ handle = start_transaction(inode); -+ if (IS_ERR(handle)) -+ return; /* AKPM: return what? */ -+ -+ blocksize = inode->i_sb->s_blocksize; -+ last_block = (inode->i_size + blocksize-1) -+ >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); -+ -+ ext3_block_truncate_page(handle, inode->i_mapping, inode->i_size); -+ -+ -+ n = ext3_block_to_path(inode, last_block, offsets); -+ if (n == 0) -+ goto out_stop; /* error */ -+ -+ /* -+ * OK. This truncate is going to happen. We add the inode to the -+ * orphan list, so that if this truncate spans multiple transactions, -+ * and we crash, we will resume the truncate when the filesystem -+ * recovers. It also marks the inode dirty, to catch the new size. -+ * -+ * Implication: the file must always be in a sane, consistent -+ * truncatable state while each transaction commits. 
-+ */ -+ if (ext3_orphan_add(handle, inode)) -+ goto out_stop; -+ -+ /* -+ * The orphan list entry will now protect us from any crash which -+ * occurs before the truncate completes, so it is now safe to propagate -+ * the new, shorter inode size (held for now in i_size) into the -+ * on-disk inode. We do this via i_disksize, which is the value which -+ * ext3 *really* writes onto the disk inode. -+ */ -+ inode->u.ext3_i.i_disksize = inode->i_size; -+ -+ /* -+ * From here we block out all ext3_get_block() callers who want to -+ * modify the block allocation tree. -+ */ -+ down_write(&inode->u.ext3_i.truncate_sem); -+ -+ if (n == 1) { /* direct blocks */ -+ ext3_free_data(handle, inode, NULL, i_data+offsets[0], -+ i_data + EXT3_NDIR_BLOCKS); -+ goto do_indirects; -+ } -+ -+ partial = ext3_find_shared(inode, n, offsets, chain, &nr); -+ /* Kill the top of shared branch (not detached) */ -+ if (nr) { -+ if (partial == chain) { -+ /* Shared branch grows from the inode */ -+ ext3_free_branches(handle, inode, NULL, -+ &nr, &nr+1, (chain+n-1) - partial); -+ *partial->p = 0; -+ /* -+ * We mark the inode dirty prior to restart, -+ * and prior to stop. No need for it here. 
-+ */ -+ } else { -+ /* Shared branch grows from an indirect block */ -+ BUFFER_TRACE(partial->bh, "get_write_access"); -+ ext3_free_branches(handle, inode, partial->bh, -+ partial->p, -+ partial->p+1, (chain+n-1) - partial); -+ } -+ } -+ /* Clear the ends of indirect blocks on the shared branch */ -+ while (partial > chain) { -+ ext3_free_branches(handle, inode, partial->bh, partial->p + 1, -+ (u32*)partial->bh->b_data + addr_per_block, -+ (chain+n-1) - partial); -+ BUFFER_TRACE(partial->bh, "call brelse"); -+ brelse (partial->bh); -+ partial--; -+ } -+do_indirects: -+ /* Kill the remaining (whole) subtrees */ -+ switch (offsets[0]) { -+ default: -+ nr = i_data[EXT3_IND_BLOCK]; -+ if (nr) { -+ ext3_free_branches(handle, inode, NULL, -+ &nr, &nr+1, 1); -+ i_data[EXT3_IND_BLOCK] = 0; -+ } -+ case EXT3_IND_BLOCK: -+ nr = i_data[EXT3_DIND_BLOCK]; -+ if (nr) { -+ ext3_free_branches(handle, inode, NULL, -+ &nr, &nr+1, 2); -+ i_data[EXT3_DIND_BLOCK] = 0; -+ } -+ case EXT3_DIND_BLOCK: -+ nr = i_data[EXT3_TIND_BLOCK]; -+ if (nr) { -+ ext3_free_branches(handle, inode, NULL, -+ &nr, &nr+1, 3); -+ i_data[EXT3_TIND_BLOCK] = 0; -+ } -+ case EXT3_TIND_BLOCK: -+ ; -+ } -+ up_write(&inode->u.ext3_i.truncate_sem); -+ inode->i_mtime = inode->i_ctime = CURRENT_TIME; -+ ext3_mark_inode_dirty(handle, inode); -+ -+ /* In a multi-transaction truncate, we only make the final -+ * transaction synchronous */ -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+out_stop: -+ /* -+ * If this was a simple ftruncate(), and the file will remain alive -+ * then we need to clear up the orphan record which we created above. -+ * However, if this was a real unlink then we were called by -+ * ext3_delete_inode(), and we allow that function to clean up the -+ * orphan info for us. -+ */ -+ if (inode->i_nlink) -+ ext3_orphan_del(handle, inode); -+ -+ ext3_journal_stop(handle, inode); -+} -+ -+/* -+ * ext3_get_inode_loc returns with an extra refcount against the -+ * inode's underlying buffer_head on success. 
-+ */ -+ -+int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc) -+{ -+ struct buffer_head *bh = 0; -+ unsigned long block; -+ unsigned long block_group; -+ unsigned long group_desc; -+ unsigned long desc; -+ unsigned long offset; -+ struct ext3_group_desc * gdp; -+ -+ if ((inode->i_ino != EXT3_ROOT_INO && -+ inode->i_ino != EXT3_ACL_IDX_INO && -+ inode->i_ino != EXT3_ACL_DATA_INO && -+ inode->i_ino != EXT3_JOURNAL_INO && -+ inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || -+ inode->i_ino > le32_to_cpu( -+ inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) { -+ ext3_error (inode->i_sb, "ext3_get_inode_loc", -+ "bad inode number: %lu", inode->i_ino); -+ goto bad_inode; -+ } -+ block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb); -+ if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) { -+ ext3_error (inode->i_sb, "ext3_get_inode_loc", -+ "group >= groups count"); -+ goto bad_inode; -+ } -+ group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb); -+ desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1); -+ bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc]; -+ if (!bh) { -+ ext3_error (inode->i_sb, "ext3_get_inode_loc", -+ "Descriptor not loaded"); -+ goto bad_inode; -+ } -+ -+ gdp = (struct ext3_group_desc *) bh->b_data; -+ /* -+ * Figure out the offset within the block group inode table -+ */ -+ offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) * -+ EXT3_INODE_SIZE(inode->i_sb); -+ block = le32_to_cpu(gdp[desc].bg_inode_table) + -+ (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb)); -+ if (!(bh = sb_bread(inode->i_sb, block))) { -+ ext3_error (inode->i_sb, "ext3_get_inode_loc", -+ "unable to read inode block - " -+ "inode=%lu, block=%lu", inode->i_ino, block); -+ goto bad_inode; -+ } -+ offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1); -+ -+ iloc->bh = bh; -+ iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset); -+ iloc->block_group = block_group; -+ -+ return 0; -+ -+ bad_inode: -+ return -EIO; -+} 
-+ -+void ext3_read_inode(struct inode * inode) -+{ -+ struct ext3_iloc iloc; -+ struct ext3_inode *raw_inode; -+ struct buffer_head *bh; -+ int block; -+ -+ if(ext3_get_inode_loc(inode, &iloc)) -+ goto bad_inode; -+ bh = iloc.bh; -+ raw_inode = iloc.raw_inode; -+ init_rwsem(&inode->u.ext3_i.truncate_sem); -+ inode->i_mode = le16_to_cpu(raw_inode->i_mode); -+ inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); -+ inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); -+ if(!(test_opt (inode->i_sb, NO_UID32))) { -+ inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; -+ inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; -+ } -+ inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); -+ inode->i_size = le32_to_cpu(raw_inode->i_size); -+ inode->i_atime = le32_to_cpu(raw_inode->i_atime); -+ inode->i_ctime = le32_to_cpu(raw_inode->i_ctime); -+ inode->i_mtime = le32_to_cpu(raw_inode->i_mtime); -+ inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime); -+ /* We now have enough fields to check if the inode was active or not. -+ * This is needed because nfsd might try to access dead inodes -+ * the test is that same one that e2fsck uses -+ * NeilBrown 1999oct15 -+ */ -+ if (inode->i_nlink == 0) { -+ if (inode->i_mode == 0 || -+ !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) { -+ /* this inode is deleted */ -+ brelse (bh); -+ goto bad_inode; -+ } -+ /* The only unlinked inodes we let through here have -+ * valid i_mode and are being read by the orphan -+ * recovery code: that's fine, we're about to complete -+ * the process of deleting those. 
*/ -+ } -+ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size -+ * (for stat), not the fs block -+ * size */ -+ inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); -+ inode->i_version = ++event; -+ inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags); -+#ifdef EXT3_FRAGMENTS -+ inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr); -+ inode->u.ext3_i.i_frag_no = raw_inode->i_frag; -+ inode->u.ext3_i.i_frag_size = raw_inode->i_fsize; -+#endif -+ inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl); -+ if (!S_ISREG(inode->i_mode)) { -+ inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); -+ } else { -+ inode->i_size |= -+ ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; -+ } -+ inode->u.ext3_i.i_disksize = inode->i_size; -+ inode->i_generation = le32_to_cpu(raw_inode->i_generation); -+#ifdef EXT3_PREALLOCATE -+ inode->u.ext3_i.i_prealloc_count = 0; -+#endif -+ inode->u.ext3_i.i_block_group = iloc.block_group; -+ -+ /* -+ * NOTE! The in-memory inode i_data array is in little-endian order -+ * even on big-endian machines: we do NOT byteswap the block numbers! 
-+ */ -+ for (block = 0; block < EXT3_N_BLOCKS; block++) -+ inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block]; -+ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); -+ -+ brelse (iloc.bh); -+ -+ if (inode->i_ino == EXT3_ACL_IDX_INO || -+ inode->i_ino == EXT3_ACL_DATA_INO) -+ /* Nothing to do */ ; -+ else if (S_ISREG(inode->i_mode)) { -+ inode->i_op = &ext3_file_inode_operations; -+ inode->i_fop = &ext3_file_operations; -+ inode->i_mapping->a_ops = &ext3_aops; -+ } else if (S_ISDIR(inode->i_mode)) { -+ inode->i_op = &ext3_dir_inode_operations; -+ inode->i_fop = &ext3_dir_operations; -+ } else if (S_ISLNK(inode->i_mode)) { -+ if (!inode->i_blocks) -+ inode->i_op = &ext3_fast_symlink_inode_operations; -+ else { -+ inode->i_op = &page_symlink_inode_operations; -+ inode->i_mapping->a_ops = &ext3_aops; -+ } -+ } else -+ init_special_inode(inode, inode->i_mode, -+ le32_to_cpu(iloc.raw_inode->i_block[0])); -+ /* inode->i_attr_flags = 0; unused */ -+ if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) { -+ /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */ -+ inode->i_flags |= S_SYNC; -+ } -+ if (inode->u.ext3_i.i_flags & EXT3_APPEND_FL) { -+ /* inode->i_attr_flags |= ATTR_FLAG_APPEND; unused */ -+ inode->i_flags |= S_APPEND; -+ } -+ if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_FL) { -+ /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE; unused */ -+ inode->i_flags |= S_IMMUTABLE; -+ } -+ if (inode->u.ext3_i.i_flags & EXT3_NOATIME_FL) { -+ /* inode->i_attr_flags |= ATTR_FLAG_NOATIME; unused */ -+ inode->i_flags |= S_NOATIME; -+ } -+ return; -+ -+bad_inode: -+ make_bad_inode(inode); -+ return; -+} -+ -+/* -+ * Post the struct inode info into an on-disk inode location in the -+ * buffer-cache. This gobbles the caller's reference to the -+ * buffer_head in the inode location struct. 
-+ */ -+ -+static int ext3_do_update_inode(handle_t *handle, -+ struct inode *inode, -+ struct ext3_iloc *iloc) -+{ -+ struct ext3_inode *raw_inode = iloc->raw_inode; -+ struct buffer_head *bh = iloc->bh; -+ int err = 0, rc, block; -+ -+ if (handle) { -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ goto out_brelse; -+ } -+ raw_inode->i_mode = cpu_to_le16(inode->i_mode); -+ if(!(test_opt(inode->i_sb, NO_UID32))) { -+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); -+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); -+/* -+ * Fix up interoperability with old kernels. Otherwise, old inodes get -+ * re-used with the upper 16 bits of the uid/gid intact -+ */ -+ if(!inode->u.ext3_i.i_dtime) { -+ raw_inode->i_uid_high = -+ cpu_to_le16(high_16_bits(inode->i_uid)); -+ raw_inode->i_gid_high = -+ cpu_to_le16(high_16_bits(inode->i_gid)); -+ } else { -+ raw_inode->i_uid_high = 0; -+ raw_inode->i_gid_high = 0; -+ } -+ } else { -+ raw_inode->i_uid_low = -+ cpu_to_le16(fs_high2lowuid(inode->i_uid)); -+ raw_inode->i_gid_low = -+ cpu_to_le16(fs_high2lowgid(inode->i_gid)); -+ raw_inode->i_uid_high = 0; -+ raw_inode->i_gid_high = 0; -+ } -+ raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); -+ raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize); -+ raw_inode->i_atime = cpu_to_le32(inode->i_atime); -+ raw_inode->i_ctime = cpu_to_le32(inode->i_ctime); -+ raw_inode->i_mtime = cpu_to_le32(inode->i_mtime); -+ raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); -+ raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime); -+ raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags); -+#ifdef EXT3_FRAGMENTS -+ raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr); -+ raw_inode->i_frag = inode->u.ext3_i.i_frag_no; -+ raw_inode->i_fsize = inode->u.ext3_i.i_frag_size; -+#else -+ /* If we are not tracking these fields in the in-memory inode, -+ * then preserve them on disk, but still 
initialise them to zero -+ * for new inodes. */ -+ if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) { -+ raw_inode->i_faddr = 0; -+ raw_inode->i_frag = 0; -+ raw_inode->i_fsize = 0; -+ } -+#endif -+ raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl); -+ if (!S_ISREG(inode->i_mode)) { -+ raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl); -+ } else { -+ raw_inode->i_size_high = -+ cpu_to_le32(inode->u.ext3_i.i_disksize >> 32); -+ if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) { -+ struct super_block *sb = inode->i_sb; -+ if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, -+ EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || -+ EXT3_SB(sb)->s_es->s_rev_level == -+ cpu_to_le32(EXT3_GOOD_OLD_REV)) { -+ /* If this is the first large file -+ * created, add a flag to the superblock. -+ */ -+ err = ext3_journal_get_write_access(handle, -+ sb->u.ext3_sb.s_sbh); -+ if (err) -+ goto out_brelse; -+ ext3_update_dynamic_rev(sb); -+ EXT3_SET_RO_COMPAT_FEATURE(sb, -+ EXT3_FEATURE_RO_COMPAT_LARGE_FILE); -+ sb->s_dirt = 1; -+ handle->h_sync = 1; -+ err = ext3_journal_dirty_metadata(handle, -+ sb->u.ext3_sb.s_sbh); -+ } -+ } -+ } -+ raw_inode->i_generation = le32_to_cpu(inode->i_generation); -+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) -+ raw_inode->i_block[0] = -+ cpu_to_le32(kdev_t_to_nr(inode->i_rdev)); -+ else for (block = 0; block < EXT3_N_BLOCKS; block++) -+ raw_inode->i_block[block] = inode->u.ext3_i.i_data[block]; -+ -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ rc = ext3_journal_dirty_metadata(handle, bh); -+ if (!err) -+ err = rc; -+ EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW; -+ -+out_brelse: -+ brelse (bh); -+ ext3_std_error(inode->i_sb, err); -+ return err; -+} -+ -+/* -+ * ext3_write_inode() -+ * -+ * We are called from a few places: -+ * -+ * - Within generic_file_write() for O_SYNC files. -+ * Here, there will be no transaction running. We wait for any running -+ * trasnaction to commit. -+ * -+ * - Within sys_sync(), kupdate and such. 
-+ * We wait on commit, if tol to. -+ * -+ * - Within prune_icache() (PF_MEMALLOC == true) -+ * Here we simply return. We can't afford to block kswapd on the -+ * journal commit. -+ * -+ * In all cases it is actually safe for us to return without doing anything, -+ * because the inode has been copied into a raw inode buffer in -+ * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for -+ * knfsd. -+ * -+ * Note that we are absolutely dependent upon all inode dirtiers doing the -+ * right thing: they *must* call mark_inode_dirty() after dirtying info in -+ * which we are interested. -+ * -+ * It would be a bug for them to not do this. The code: -+ * -+ * mark_inode_dirty(inode) -+ * stuff(); -+ * inode->i_size = expr; -+ * -+ * is in error because a kswapd-driven write_inode() could occur while -+ * `stuff()' is running, and the new i_size will be lost. Plus the inode -+ * will no longer be on the superblock's dirty inode list. -+ */ -+void ext3_write_inode(struct inode *inode, int wait) -+{ -+ if (current->flags & PF_MEMALLOC) -+ return; -+ -+ if (ext3_journal_current_handle()) { -+ jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n"); -+ return; -+ } -+ -+ if (!wait) -+ return; -+ -+ ext3_force_commit(inode->i_sb); -+} -+ -+/* -+ * ext3_setattr() -+ * -+ * Called from notify_change. -+ * -+ * We want to trap VFS attempts to truncate the file as soon as -+ * possible. In particular, we want to make sure that when the VFS -+ * shrinks i_size, we put the inode on the orphan list and modify -+ * i_disksize immediately, so that during the subsequent flushing of -+ * dirty pages and freeing of disk blocks, we can guarantee that any -+ * commit will leave the blocks being flushed in an unused state on -+ * disk. (On recovery, the inode will get truncated and the blocks will -+ * be freed, so we have a strong guarantee that no future commit will -+ * leave these blocks visible to the user.) -+ * -+ * This is only needed for regular files. 
rmdir() has its own path, and -+ * we can never truncate a direcory except on final unlink (at which -+ * point i_nlink is zero so recovery is easy.) -+ * -+ * Called with the BKL. -+ */ -+ -+int ext3_setattr(struct dentry *dentry, struct iattr *attr) -+{ -+ struct inode *inode = dentry->d_inode; -+ int error, rc = 0; -+ const unsigned int ia_valid = attr->ia_valid; -+ -+ error = inode_change_ok(inode, attr); -+ if (error) -+ return error; -+ -+ if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || -+ (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { -+ error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; -+ if (error) -+ return error; -+ } -+ -+ if (attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { -+ handle_t *handle; -+ -+ handle = ext3_journal_start(inode, 3); -+ if (IS_ERR(handle)) { -+ error = PTR_ERR(handle); -+ goto err_out; -+ } -+ -+ error = ext3_orphan_add(handle, inode); -+ inode->u.ext3_i.i_disksize = attr->ia_size; -+ rc = ext3_mark_inode_dirty(handle, inode); -+ if (!error) -+ error = rc; -+ ext3_journal_stop(handle, inode); -+ } -+ -+ rc = inode_setattr(inode, attr); -+ -+ /* If inode_setattr's call to ext3_truncate failed to get a -+ * transaction handle at all, we need to clean up the in-core -+ * orphan list manually. */ -+ if (inode->i_nlink) -+ ext3_orphan_del(NULL, inode); -+ -+err_out: -+ ext3_std_error(inode->i_sb, error); -+ if (!error) -+ error = rc; -+ return error; -+} -+ -+ -+/* -+ * akpm: how many blocks doth make a writepage()? -+ * -+ * With N blocks per page, it may be: -+ * N data blocks -+ * 2 indirect block -+ * 2 dindirect -+ * 1 tindirect -+ * N+5 bitmap blocks (from the above) -+ * N+5 group descriptor summary blocks -+ * 1 inode block -+ * 1 superblock. -+ * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files -+ * -+ * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS -+ * -+ * With ordered or writeback data it's the same, less the N data blocks. 
-+ * -+ * If the inode's direct blocks can hold an integral number of pages then a -+ * page cannot straddle two indirect blocks, and we can only touch one indirect -+ * and dindirect block, and the "5" above becomes "3". -+ * -+ * This still overestimates under most circumstances. If we were to pass the -+ * start and end offsets in here as well we could do block_to_path() on each -+ * block and work out the exact number of indirects which are touched. Pah. -+ */ -+ -+int ext3_writepage_trans_blocks(struct inode *inode) -+{ -+ int bpp = ext3_journal_blocks_per_page(inode); -+ int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; -+ int ret; -+ -+ if (ext3_should_journal_data(inode)) -+ ret = 3 * (bpp + indirects) + 2; -+ else -+ ret = 2 * (bpp + indirects) + 2; -+ -+#ifdef CONFIG_QUOTA -+ ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; -+#endif -+ -+ return ret; -+} -+ -+int -+ext3_mark_iloc_dirty(handle_t *handle, -+ struct inode *inode, -+ struct ext3_iloc *iloc) -+{ -+ int err = 0; -+ -+ if (handle) { -+ /* the do_update_inode consumes one bh->b_count */ -+ atomic_inc(&iloc->bh->b_count); -+ err = ext3_do_update_inode(handle, inode, iloc); -+ /* ext3_do_update_inode() does journal_dirty_metadata */ -+ brelse(iloc->bh); -+ } else { -+ printk(KERN_EMERG __FUNCTION__ ": called with no handle!\n"); -+ } -+ return err; -+} -+ -+/* -+ * On success, We end up with an outstanding reference count against -+ * iloc->bh. This _must_ be cleaned up later. 
-+ */ -+ -+int -+ext3_reserve_inode_write(handle_t *handle, struct inode *inode, -+ struct ext3_iloc *iloc) -+{ -+ int err = 0; -+ if (handle) { -+ err = ext3_get_inode_loc(inode, iloc); -+ if (!err) { -+ BUFFER_TRACE(iloc->bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, iloc->bh); -+ if (err) { -+ brelse(iloc->bh); -+ iloc->bh = NULL; -+ } -+ } -+ } -+ ext3_std_error(inode->i_sb, err); -+ return err; -+} -+ -+/* -+ * akpm: What we do here is to mark the in-core inode as clean -+ * with respect to inode dirtiness (it may still be data-dirty). -+ * This means that the in-core inode may be reaped by prune_icache -+ * without having to perform any I/O. This is a very good thing, -+ * because *any* task may call prune_icache - even ones which -+ * have a transaction open against a different journal. -+ * -+ * Is this cheating? Not really. Sure, we haven't written the -+ * inode out, but prune_icache isn't a user-visible syncing function. -+ * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) -+ * we start and wait on commits. -+ * -+ * Is this efficient/effective? Well, we're being nice to the system -+ * by cleaning up our inodes proactively so they can be reaped -+ * without I/O. But we are potentially leaving up to five seconds' -+ * worth of inodes floating about which prune_icache wants us to -+ * write out. One way to fix that would be to get prune_icache() -+ * to do a write_super() to free up some memory. It has the desired -+ * effect. -+ */ -+int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) -+{ -+ struct ext3_iloc iloc; -+ int err; -+ -+ err = ext3_reserve_inode_write(handle, inode, &iloc); -+ if (!err) -+ err = ext3_mark_iloc_dirty(handle, inode, &iloc); -+ return err; -+} -+ -+/* -+ * akpm: ext3_dirty_inode() is called from __mark_inode_dirty() -+ * -+ * We're really interested in the case where a file is being extended. 
-+ * i_size has been changed by generic_commit_write() and we thus need -+ * to include the updated inode in the current transaction. -+ * -+ * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks -+ * are allocated to the file. -+ * -+ * If the inode is marked synchronous, we don't honour that here - doing -+ * so would cause a commit on atime updates, which we don't bother doing. -+ * We handle synchronous inodes at the highest possible level. -+ */ -+void ext3_dirty_inode(struct inode *inode) -+{ -+ handle_t *current_handle = ext3_journal_current_handle(); -+ handle_t *handle; -+ -+ lock_kernel(); -+ handle = ext3_journal_start(inode, 1); -+ if (IS_ERR(handle)) -+ goto out; -+ if (current_handle && -+ current_handle->h_transaction != handle->h_transaction) { -+ /* This task has a transaction open against a different fs */ -+ printk(KERN_EMERG __FUNCTION__": transactions do not match!\n"); -+ } else { -+ jbd_debug(5, "marking dirty. outer handle=%p\n", -+ current_handle); -+ ext3_mark_inode_dirty(handle, inode); -+ } -+ ext3_journal_stop(handle, inode); -+out: -+ unlock_kernel(); -+} -+ -+#ifdef AKPM -+/* -+ * Bind an inode's backing buffer_head into this transaction, to prevent -+ * it from being flushed to disk early. Unlike -+ * ext3_reserve_inode_write, this leaves behind no bh reference and -+ * returns no iloc structure, so the caller needs to repeat the iloc -+ * lookup to mark the inode dirty later. 
-+ */ -+static inline int -+ext3_pin_inode(handle_t *handle, struct inode *inode) -+{ -+ struct ext3_iloc iloc; -+ -+ int err = 0; -+ if (handle) { -+ err = ext3_get_inode_loc(inode, &iloc); -+ if (!err) { -+ BUFFER_TRACE(iloc.bh, "get_write_access"); -+ err = journal_get_write_access(handle, iloc.bh); -+ if (!err) -+ err = ext3_journal_dirty_metadata(handle, -+ iloc.bh); -+ brelse(iloc.bh); -+ } -+ } -+ ext3_std_error(inode->i_sb, err); -+ return err; -+} -+#endif -+ -+int ext3_change_inode_journal_flag(struct inode *inode, int val) -+{ -+ journal_t *journal; -+ handle_t *handle; -+ int err; -+ -+ /* -+ * We have to be very careful here: changing a data block's -+ * journaling status dynamically is dangerous. If we write a -+ * data block to the journal, change the status and then delete -+ * that block, we risk forgetting to revoke the old log record -+ * from the journal and so a subsequent replay can corrupt data. -+ * So, first we make sure that the journal is empty and that -+ * nobody is changing anything. -+ */ -+ -+ journal = EXT3_JOURNAL(inode); -+ if (is_journal_aborted(journal) || IS_RDONLY(inode)) -+ return -EROFS; -+ -+ journal_lock_updates(journal); -+ journal_flush(journal); -+ -+ /* -+ * OK, there are no updates running now, and all cached data is -+ * synced to disk. We are now in a completely consistent state -+ * which doesn't have anything in the journal, and we know that -+ * no filesystem updates are running, so it is safe to modify -+ * the inode's in-core data-journaling state flag now. -+ */ -+ -+ if (val) -+ inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL; -+ else -+ inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL; -+ -+ journal_unlock_updates(journal); -+ -+ /* Finally we can mark the inode as dirty. 
*/ -+ -+ handle = ext3_journal_start(inode, 1); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ err = ext3_mark_inode_dirty(handle, inode); -+ handle->h_sync = 1; -+ ext3_journal_stop(handle, inode); -+ ext3_std_error(inode->i_sb, err); -+ -+ return err; -+} -+ -+ -+/* -+ * ext3_aops_journal_start(). -+ * -+ * -+ * -+ * We need to take the inode semaphore *outside* the -+ * journal_start/journal_stop. Otherwise, a different task could do a -+ * wait_for_commit() while holding ->i_sem, which deadlocks. The rule -+ * is: transaction open/closes are considered to be a locking operation -+ * and they nest *inside* ->i_sem. -+ * ---------------------------------------------------------------------------- -+ * Possible problem: -+ * ext3_file_write() -+ * -> generic_file_write() -+ * -> __alloc_pages() -+ * -> page_launder() -+ * -> ext3_writepage() -+ * -+ * And the writepage can be on a different fs while we have a -+ * transaction open against this one! Bad. -+ * -+ * I tried making the task PF_MEMALLOC here, but that simply results in -+ * 0-order allocation failures passed back to generic_file_write(). -+ * Instead, we rely on the reentrancy protection in ext3_writepage(). -+ * ---------------------------------------------------------------------------- -+ * When we do the journal_start() here we don't really need to reserve -+ * any blocks - we won't need any until we hit ext3_prepare_write(), -+ * which does all the needed journal extending. However! There is a -+ * problem with quotas: -+ * -+ * Thread 1: -+ * sys_sync -+ * ->sync_dquots -+ * ->commit_dquot -+ * ->lock_dquot -+ * ->write_dquot -+ * ->ext3_file_write -+ * ->journal_start -+ * ->ext3_prepare_write -+ * ->journal_extend -+ * ->journal_start -+ * Thread 2: -+ * ext3_create (for example) -+ * ->ext3_new_inode -+ * ->dquot_initialize -+ * ->lock_dquot -+ * -+ * Deadlock. Thread 1's journal_start blocks because thread 2 has a -+ * transaction open. 
Thread 2's transaction will never close because -+ * thread 2 is stuck waiting for the dquot lock. -+ * -+ * So. We must ensure that thread 1 *never* needs to extend the journal -+ * for quota writes. We do that by reserving enough journal blocks -+ * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we -+ * need to extend" test in ext3_prepare_write() succeeds. -+ */ -diff -rup --new-file linux.mcp2/fs/ext3/ioctl.c linux_tmp/fs/ext3/ioctl.c ---- linux.mcp2/fs/ext3/ioctl.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/ioctl.c 2001-11-09 14:25:04.000000000 -0800 -@@ -0,0 +1,170 @@ -+/* -+ * linux/fs/ext3/ioctl.c -+ * -+ * Copyright (C) 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+ -+int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, -+ unsigned long arg) -+{ -+ unsigned int flags; -+ -+ ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); -+ -+ switch (cmd) { -+ case EXT3_IOC_GETFLAGS: -+ flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE; -+ return put_user(flags, (int *) arg); -+ case EXT3_IOC_SETFLAGS: { -+ handle_t *handle = NULL; -+ int err; -+ struct ext3_iloc iloc; -+ unsigned int oldflags; -+ unsigned int jflag; -+ -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ -+ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) -+ return -EPERM; -+ -+ if (get_user(flags, (int *) arg)) -+ return -EFAULT; -+ -+ oldflags = inode->u.ext3_i.i_flags; -+ -+ /* The JOURNAL_DATA flag is modifiable only by root */ -+ jflag = flags & EXT3_JOURNAL_DATA_FL; -+ -+ /* -+ * The IMMUTABLE and APPEND_ONLY flags can only be changed by -+ * the relevant capability. -+ * -+ * This test looks nicer. 
Thanks to Pauline Middelink -+ */ -+ if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { -+ if (!capable(CAP_LINUX_IMMUTABLE)) -+ return -EPERM; -+ } -+ -+ /* -+ * The JOURNAL_DATA flag can only be changed by -+ * the relevant capability. -+ */ -+ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { -+ if (!capable(CAP_SYS_RESOURCE)) -+ return -EPERM; -+ } -+ -+ -+ handle = ext3_journal_start(inode, 1); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ err = ext3_reserve_inode_write(handle, inode, &iloc); -+ if (err) -+ goto flags_err; -+ -+ flags = flags & EXT3_FL_USER_MODIFIABLE; -+ flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; -+ inode->u.ext3_i.i_flags = flags; -+ -+ if (flags & EXT3_SYNC_FL) -+ inode->i_flags |= S_SYNC; -+ else -+ inode->i_flags &= ~S_SYNC; -+ if (flags & EXT3_APPEND_FL) -+ inode->i_flags |= S_APPEND; -+ else -+ inode->i_flags &= ~S_APPEND; -+ if (flags & EXT3_IMMUTABLE_FL) -+ inode->i_flags |= S_IMMUTABLE; -+ else -+ inode->i_flags &= ~S_IMMUTABLE; -+ if (flags & EXT3_NOATIME_FL) -+ inode->i_flags |= S_NOATIME; -+ else -+ inode->i_flags &= ~S_NOATIME; -+ inode->i_ctime = CURRENT_TIME; -+ -+ err = ext3_mark_iloc_dirty(handle, inode, &iloc); -+flags_err: -+ ext3_journal_stop(handle, inode); -+ if (err) -+ return err; -+ -+ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) -+ err = ext3_change_inode_journal_flag(inode, jflag); -+ return err; -+ } -+ case EXT3_IOC_GETVERSION: -+ case EXT3_IOC_GETVERSION_OLD: -+ return put_user(inode->i_generation, (int *) arg); -+ case EXT3_IOC_SETVERSION: -+ case EXT3_IOC_SETVERSION_OLD: { -+ handle_t *handle; -+ struct ext3_iloc iloc; -+ __u32 generation; -+ int err; -+ -+ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) -+ return -EPERM; -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ if (get_user(generation, (int *) arg)) -+ return -EFAULT; -+ -+ handle = ext3_journal_start(inode, 1); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ err 
= ext3_reserve_inode_write(handle, inode, &iloc); -+ if (err) -+ return err; -+ -+ inode->i_ctime = CURRENT_TIME; -+ inode->i_generation = generation; -+ -+ err = ext3_mark_iloc_dirty(handle, inode, &iloc); -+ ext3_journal_stop(handle, inode); -+ return err; -+ } -+#ifdef CONFIG_JBD_DEBUG -+ case EXT3_IOC_WAIT_FOR_READONLY: -+ /* -+ * This is racy - by the time we're woken up and running, -+ * the superblock could be released. And the module could -+ * have been unloaded. So sue me. -+ * -+ * Returns 1 if it slept, else zero. -+ */ -+ { -+ struct super_block *sb = inode->i_sb; -+ DECLARE_WAITQUEUE(wait, current); -+ int ret = 0; -+ -+ set_current_state(TASK_INTERRUPTIBLE); -+ add_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait); -+ if (timer_pending(&sb->u.ext3_sb.turn_ro_timer)) { -+ schedule(); -+ ret = 1; -+ } -+ remove_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait); -+ return ret; -+ } -+#endif -+ default: -+ return -ENOTTY; -+ } -+} -diff -rup --new-file linux.mcp2/fs/ext3/namei.c linux_tmp/fs/ext3/namei.c ---- linux.mcp2/fs/ext3/namei.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/namei.c 2001-11-09 14:25:04.000000000 -0800 -@@ -0,0 +1,1125 @@ -+/* -+ * linux/fs/ext3/namei.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/namei.c -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ * Directory entry file type support and forward compatibility hooks -+ * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+ -+/* -+ * define how far ahead to read directories while searching them. 
-+ */ -+#define NAMEI_RA_CHUNKS 2 -+#define NAMEI_RA_BLOCKS 4 -+#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -+#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) -+ -+/* -+ * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. -+ * -+ * `len <= EXT3_NAME_LEN' is guaranteed by caller. -+ * `de != NULL' is guaranteed by caller. -+ */ -+static inline int ext3_match (int len, const char * const name, -+ struct ext3_dir_entry_2 * de) -+{ -+ if (len != de->name_len) -+ return 0; -+ if (!de->inode) -+ return 0; -+ return !memcmp(name, de->name, len); -+} -+ -+/* -+ * Returns 0 if not found, -1 on failure, and 1 on success -+ */ -+static int inline search_dirblock(struct buffer_head * bh, -+ struct inode *dir, -+ struct dentry *dentry, -+ unsigned long offset, -+ struct ext3_dir_entry_2 ** res_dir) -+{ -+ struct ext3_dir_entry_2 * de; -+ char * dlimit; -+ int de_len; -+ const char *name = dentry->d_name.name; -+ int namelen = dentry->d_name.len; -+ -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ dlimit = bh->b_data + dir->i_sb->s_blocksize; -+ while ((char *) de < dlimit) { -+ /* this code is executed quadratically often */ -+ /* do minimal checking `by hand' */ -+ -+ if ((char *) de + namelen <= dlimit && -+ ext3_match (namelen, name, de)) { -+ /* found a match - just to be sure, do a full check */ -+ if (!ext3_check_dir_entry("ext3_find_entry", -+ dir, de, bh, offset)) -+ return -1; -+ *res_dir = de; -+ return 1; -+ } -+ /* prevent looping on a bad block */ -+ de_len = le16_to_cpu(de->rec_len); -+ if (de_len <= 0) -+ return -1; -+ offset += de_len; -+ de = (struct ext3_dir_entry_2 *) ((char *) de + de_len); -+ } -+ return 0; -+} -+ -+/* -+ * ext3_find_entry() -+ * -+ * finds an entry in the specified directory with the wanted name. It -+ * returns the cache buffer in which the entry was found, and the entry -+ * itself (as a parameter - res_dir). 
It does NOT read the inode of the -+ * entry - you'll have to do that yourself if you want to. -+ * -+ * The returned buffer_head has ->b_count elevated. The caller is expected -+ * to brelse() it when appropriate. -+ */ -+static struct buffer_head * ext3_find_entry (struct dentry *dentry, -+ struct ext3_dir_entry_2 ** res_dir) -+{ -+ struct super_block * sb; -+ struct buffer_head * bh_use[NAMEI_RA_SIZE]; -+ struct buffer_head * bh, *ret = NULL; -+ unsigned long start, block, b; -+ int ra_max = 0; /* Number of bh's in the readahead -+ buffer, bh_use[] */ -+ int ra_ptr = 0; /* Current index into readahead -+ buffer */ -+ int num = 0; -+ int nblocks, i, err; -+ struct inode *dir = dentry->d_parent->d_inode; -+ -+ *res_dir = NULL; -+ sb = dir->i_sb; -+ -+ nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); -+ start = dir->u.ext3_i.i_dir_start_lookup; -+ if (start >= nblocks) -+ start = 0; -+ block = start; -+restart: -+ do { -+ /* -+ * We deal with the read-ahead logic here. -+ */ -+ if (ra_ptr >= ra_max) { -+ /* Refill the readahead buffer */ -+ ra_ptr = 0; -+ b = block; -+ for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { -+ /* -+ * Terminate if we reach the end of the -+ * directory and must wrap, or if our -+ * search has finished at this block. 
-+ */ -+ if (b >= nblocks || (num && block == start)) { -+ bh_use[ra_max] = NULL; -+ break; -+ } -+ num++; -+ bh = ext3_getblk(NULL, dir, b++, 0, &err); -+ bh_use[ra_max] = bh; -+ if (bh) -+ ll_rw_block(READ, 1, &bh); -+ } -+ } -+ if ((bh = bh_use[ra_ptr++]) == NULL) -+ goto next; -+ wait_on_buffer(bh); -+ if (!buffer_uptodate(bh)) { -+ /* read error, skip block & hope for the best */ -+ brelse(bh); -+ goto next; -+ } -+ i = search_dirblock(bh, dir, dentry, -+ block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); -+ if (i == 1) { -+ dir->u.ext3_i.i_dir_start_lookup = block; -+ ret = bh; -+ goto cleanup_and_exit; -+ } else { -+ brelse(bh); -+ if (i < 0) -+ goto cleanup_and_exit; -+ } -+ next: -+ if (++block >= nblocks) -+ block = 0; -+ } while (block != start); -+ -+ /* -+ * If the directory has grown while we were searching, then -+ * search the last part of the directory before giving up. -+ */ -+ block = nblocks; -+ nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); -+ if (block < nblocks) { -+ start = 0; -+ goto restart; -+ } -+ -+cleanup_and_exit: -+ /* Clean up the read-ahead blocks */ -+ for (; ra_ptr < ra_max; ra_ptr++) -+ brelse (bh_use[ra_ptr]); -+ return ret; -+} -+ -+static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) -+{ -+ struct inode * inode; -+ struct ext3_dir_entry_2 * de; -+ struct buffer_head * bh; -+ -+ if (dentry->d_name.len > EXT3_NAME_LEN) -+ return ERR_PTR(-ENAMETOOLONG); -+ -+ bh = ext3_find_entry(dentry, &de); -+ inode = NULL; -+ if (bh) { -+ unsigned long ino = le32_to_cpu(de->inode); -+ brelse (bh); -+ inode = iget(dir->i_sb, ino); -+ -+ if (!inode) -+ return ERR_PTR(-EACCES); -+ } -+ d_add(dentry, inode); -+ return NULL; -+} -+ -+#define S_SHIFT 12 -+static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { -+ [S_IFREG >> S_SHIFT] EXT3_FT_REG_FILE, -+ [S_IFDIR >> S_SHIFT] EXT3_FT_DIR, -+ [S_IFCHR >> S_SHIFT] EXT3_FT_CHRDEV, -+ [S_IFBLK >> S_SHIFT] EXT3_FT_BLKDEV, -+ [S_IFIFO >> S_SHIFT] EXT3_FT_FIFO, -+ [S_IFSOCK >> 
S_SHIFT] EXT3_FT_SOCK, -+ [S_IFLNK >> S_SHIFT] EXT3_FT_SYMLINK, -+}; -+ -+static inline void ext3_set_de_type(struct super_block *sb, -+ struct ext3_dir_entry_2 *de, -+ umode_t mode) { -+ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE)) -+ de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; -+} -+ -+/* -+ * ext3_add_entry() -+ * -+ * adds a file entry to the specified directory, using the same -+ * semantics as ext3_find_entry(). It returns NULL if it failed. -+ * -+ * NOTE!! The inode part of 'de' is left at 0 - which means you -+ * may not sleep between calling this and putting something into -+ * the entry, as someone else might have used it while you slept. -+ */ -+ -+/* -+ * AKPM: the journalling code here looks wrong on the error paths -+ */ -+static int ext3_add_entry (handle_t *handle, struct dentry *dentry, -+ struct inode *inode) -+{ -+ struct inode *dir = dentry->d_parent->d_inode; -+ const char *name = dentry->d_name.name; -+ int namelen = dentry->d_name.len; -+ unsigned long offset; -+ unsigned short rec_len; -+ struct buffer_head * bh; -+ struct ext3_dir_entry_2 * de, * de1; -+ struct super_block * sb; -+ int retval; -+ -+ sb = dir->i_sb; -+ -+ if (!namelen) -+ return -EINVAL; -+ bh = ext3_bread (handle, dir, 0, 0, &retval); -+ if (!bh) -+ return retval; -+ rec_len = EXT3_DIR_REC_LEN(namelen); -+ offset = 0; -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ while (1) { -+ if ((char *)de >= sb->s_blocksize + bh->b_data) { -+ brelse (bh); -+ bh = NULL; -+ bh = ext3_bread (handle, dir, -+ offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval); -+ if (!bh) -+ return retval; -+ if (dir->i_size <= offset) { -+ if (dir->i_size == 0) { -+ brelse(bh); -+ return -ENOENT; -+ } -+ -+ ext3_debug ("creating next block\n"); -+ -+ BUFFER_TRACE(bh, "get_write_access"); -+ ext3_journal_get_write_access(handle, bh); -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ de->inode = 0; -+ de->rec_len = le16_to_cpu(sb->s_blocksize); -+ 
dir->u.ext3_i.i_disksize = -+ dir->i_size = offset + sb->s_blocksize; -+ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_mark_inode_dirty(handle, dir); -+ } else { -+ -+ ext3_debug ("skipping to next block\n"); -+ -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ } -+ } -+ if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh, -+ offset)) { -+ brelse (bh); -+ return -ENOENT; -+ } -+ if (ext3_match (namelen, name, de)) { -+ brelse (bh); -+ return -EEXIST; -+ } -+ if ((le32_to_cpu(de->inode) == 0 && -+ le16_to_cpu(de->rec_len) >= rec_len) || -+ (le16_to_cpu(de->rec_len) >= -+ EXT3_DIR_REC_LEN(de->name_len) + rec_len)) { -+ BUFFER_TRACE(bh, "get_write_access"); -+ ext3_journal_get_write_access(handle, bh); -+ /* By now the buffer is marked for journaling */ -+ offset += le16_to_cpu(de->rec_len); -+ if (le32_to_cpu(de->inode)) { -+ de1 = (struct ext3_dir_entry_2 *) ((char *) de + -+ EXT3_DIR_REC_LEN(de->name_len)); -+ de1->rec_len = -+ cpu_to_le16(le16_to_cpu(de->rec_len) - -+ EXT3_DIR_REC_LEN(de->name_len)); -+ de->rec_len = cpu_to_le16( -+ EXT3_DIR_REC_LEN(de->name_len)); -+ de = de1; -+ } -+ de->file_type = EXT3_FT_UNKNOWN; -+ if (inode) { -+ de->inode = cpu_to_le32(inode->i_ino); -+ ext3_set_de_type(dir->i_sb, de, inode->i_mode); -+ } else -+ de->inode = 0; -+ de->name_len = namelen; -+ memcpy (de->name, name, namelen); -+ /* -+ * XXX shouldn't update any times until successful -+ * completion of syscall, but too many callers depend -+ * on this. -+ * -+ * XXX similarly, too many callers depend on -+ * ext3_new_inode() setting the times, but error -+ * recovery deletes the inode, so the worst that can -+ * happen is that the times are slightly out of date -+ * and/or different from the directory change time. 
-+ */ -+ dir->i_mtime = dir->i_ctime = CURRENT_TIME; -+ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_mark_inode_dirty(handle, dir); -+ dir->i_version = ++event; -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ ext3_journal_dirty_metadata(handle, bh); -+ brelse(bh); -+ return 0; -+ } -+ offset += le16_to_cpu(de->rec_len); -+ de = (struct ext3_dir_entry_2 *) -+ ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ brelse (bh); -+ return -ENOSPC; -+} -+ -+/* -+ * ext3_delete_entry deletes a directory entry by merging it with the -+ * previous entry -+ */ -+static int ext3_delete_entry (handle_t *handle, -+ struct inode * dir, -+ struct ext3_dir_entry_2 * de_del, -+ struct buffer_head * bh) -+{ -+ struct ext3_dir_entry_2 * de, * pde; -+ int i; -+ -+ i = 0; -+ pde = NULL; -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ while (i < bh->b_size) { -+ if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) -+ return -EIO; -+ if (de == de_del) { -+ BUFFER_TRACE(bh, "get_write_access"); -+ ext3_journal_get_write_access(handle, bh); -+ if (pde) -+ pde->rec_len = -+ cpu_to_le16(le16_to_cpu(pde->rec_len) + -+ le16_to_cpu(de->rec_len)); -+ else -+ de->inode = 0; -+ dir->i_version = ++event; -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ ext3_journal_dirty_metadata(handle, bh); -+ return 0; -+ } -+ i += le16_to_cpu(de->rec_len); -+ pde = de; -+ de = (struct ext3_dir_entry_2 *) -+ ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ return -ENOENT; -+} -+ -+/* -+ * ext3_mark_inode_dirty is somewhat expensive, so unlike ext2 we -+ * do not perform it in these functions. We perform it at the call site, -+ * if it is needed. 
-+ */ -+static inline void ext3_inc_count(handle_t *handle, struct inode *inode) -+{ -+ inode->i_nlink++; -+} -+ -+static inline void ext3_dec_count(handle_t *handle, struct inode *inode) -+{ -+ inode->i_nlink--; -+} -+ -+static int ext3_add_nondir(handle_t *handle, -+ struct dentry *dentry, struct inode *inode) -+{ -+ int err = ext3_add_entry(handle, dentry, inode); -+ if (!err) { -+ d_instantiate(dentry, inode); -+ return 0; -+ } -+ ext3_dec_count(handle, inode); -+ iput(inode); -+ return err; -+} -+ -+/* -+ * By the time this is called, we already have created -+ * the directory cache entry for the new file, but it -+ * is so far negative - it has no inode. -+ * -+ * If the create succeeds, we fill in the inode information -+ * with d_instantiate(). -+ */ -+static int ext3_create (struct inode * dir, struct dentry * dentry, int mode) -+{ -+ handle_t *handle; -+ struct inode * inode; -+ int err; -+ -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_SYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode = ext3_new_inode (handle, dir, mode); -+ err = PTR_ERR(inode); -+ if (!IS_ERR(inode)) { -+ inode->i_op = &ext3_file_inode_operations; -+ inode->i_fop = &ext3_file_operations; -+ inode->i_mapping->a_ops = &ext3_aops; -+ ext3_mark_inode_dirty(handle, inode); -+ err = ext3_add_nondir(handle, dentry, inode); -+ } -+ ext3_journal_stop(handle, dir); -+ return err; -+} -+ -+static int ext3_mknod (struct inode * dir, struct dentry *dentry, -+ int mode, int rdev) -+{ -+ handle_t *handle; -+ struct inode *inode; -+ int err; -+ -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_SYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode = ext3_new_inode (handle, dir, mode); -+ err = PTR_ERR(inode); -+ if (!IS_ERR(inode)) { -+ init_special_inode(inode, mode, rdev); -+ ext3_mark_inode_dirty(handle, inode); -+ err = ext3_add_nondir(handle, dentry, 
inode); -+ } -+ ext3_journal_stop(handle, dir); -+ return err; -+} -+ -+static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) -+{ -+ handle_t *handle; -+ struct inode * inode; -+ struct buffer_head * dir_block; -+ struct ext3_dir_entry_2 * de; -+ int err; -+ -+ if (dir->i_nlink >= EXT3_LINK_MAX) -+ return -EMLINK; -+ -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_SYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode = ext3_new_inode (handle, dir, S_IFDIR); -+ err = PTR_ERR(inode); -+ if (IS_ERR(inode)) -+ goto out_stop; -+ -+ inode->i_op = &ext3_dir_inode_operations; -+ inode->i_fop = &ext3_dir_operations; -+ inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize; -+ inode->i_blocks = 0; -+ dir_block = ext3_bread (handle, inode, 0, 1, &err); -+ if (!dir_block) { -+ inode->i_nlink--; /* is this nlink == 0? */ -+ ext3_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+ } -+ BUFFER_TRACE(dir_block, "get_write_access"); -+ ext3_journal_get_write_access(handle, dir_block); -+ de = (struct ext3_dir_entry_2 *) dir_block->b_data; -+ de->inode = cpu_to_le32(inode->i_ino); -+ de->name_len = 1; -+ de->rec_len = cpu_to_le16(EXT3_DIR_REC_LEN(de->name_len)); -+ strcpy (de->name, "."); -+ ext3_set_de_type(dir->i_sb, de, S_IFDIR); -+ de = (struct ext3_dir_entry_2 *) -+ ((char *) de + le16_to_cpu(de->rec_len)); -+ de->inode = cpu_to_le32(dir->i_ino); -+ de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3_DIR_REC_LEN(1)); -+ de->name_len = 2; -+ strcpy (de->name, ".."); -+ ext3_set_de_type(dir->i_sb, de, S_IFDIR); -+ inode->i_nlink = 2; -+ BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); -+ ext3_journal_dirty_metadata(handle, dir_block); -+ brelse (dir_block); -+ inode->i_mode = S_IFDIR | mode; -+ if (dir->i_mode & S_ISGID) -+ inode->i_mode |= S_ISGID; -+ ext3_mark_inode_dirty(handle, inode); -+ err = ext3_add_entry (handle, dentry, inode); 
-+ if (err) -+ goto out_no_entry; -+ dir->i_nlink++; -+ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_mark_inode_dirty(handle, dir); -+ d_instantiate(dentry, inode); -+out_stop: -+ ext3_journal_stop(handle, dir); -+ return err; -+ -+out_no_entry: -+ inode->i_nlink = 0; -+ ext3_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+} -+ -+/* -+ * routine to check that the specified directory is empty (for rmdir) -+ */ -+static int empty_dir (struct inode * inode) -+{ -+ unsigned long offset; -+ struct buffer_head * bh; -+ struct ext3_dir_entry_2 * de, * de1; -+ struct super_block * sb; -+ int err; -+ -+ sb = inode->i_sb; -+ if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) || -+ !(bh = ext3_bread (NULL, inode, 0, 0, &err))) { -+ ext3_warning (inode->i_sb, "empty_dir", -+ "bad directory (dir #%lu) - no data block", -+ inode->i_ino); -+ return 1; -+ } -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ de1 = (struct ext3_dir_entry_2 *) -+ ((char *) de + le16_to_cpu(de->rec_len)); -+ if (le32_to_cpu(de->inode) != inode->i_ino || -+ !le32_to_cpu(de1->inode) || -+ strcmp (".", de->name) || -+ strcmp ("..", de1->name)) { -+ ext3_warning (inode->i_sb, "empty_dir", -+ "bad directory (dir #%lu) - no `.' 
or `..'", -+ inode->i_ino); -+ brelse (bh); -+ return 1; -+ } -+ offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); -+ de = (struct ext3_dir_entry_2 *) -+ ((char *) de1 + le16_to_cpu(de1->rec_len)); -+ while (offset < inode->i_size ) { -+ if (!bh || -+ (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { -+ brelse (bh); -+ bh = ext3_bread (NULL, inode, -+ offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err); -+ if (!bh) { -+#if 0 -+ ext3_error (sb, "empty_dir", -+ "directory #%lu contains a hole at offset %lu", -+ inode->i_ino, offset); -+#endif -+ offset += sb->s_blocksize; -+ continue; -+ } -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ } -+ if (!ext3_check_dir_entry ("empty_dir", inode, de, bh, -+ offset)) { -+ brelse (bh); -+ return 1; -+ } -+ if (le32_to_cpu(de->inode)) { -+ brelse (bh); -+ return 0; -+ } -+ offset += le16_to_cpu(de->rec_len); -+ de = (struct ext3_dir_entry_2 *) -+ ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ brelse (bh); -+ return 1; -+} -+ -+/* ext3_orphan_add() links an unlinked or truncated inode into a list of -+ * such inodes, starting at the superblock, in case we crash before the -+ * file is closed/deleted, or in case the inode truncate spans multiple -+ * transactions and the last transaction is not recovered after a crash. -+ * -+ * At filesystem recovery time, we walk this list deleting unlinked -+ * inodes and truncating linked inodes in ext3_orphan_cleanup(). -+ */ -+int ext3_orphan_add(handle_t *handle, struct inode *inode) -+{ -+ struct super_block *sb = inode->i_sb; -+ struct ext3_iloc iloc; -+ int err = 0, rc; -+ -+ lock_super(sb); -+ if (!list_empty(&inode->u.ext3_i.i_orphan)) -+ goto out_unlock; -+ -+ /* Orphan handling is only valid for files with data blocks -+ * being truncated, or files being unlinked. */ -+ -+ /* @@@ FIXME: Observation from aviro: -+ * I think I can trigger J_ASSERT in ext3_orphan_add(). We block -+ * here (on lock_super()), so race with ext3_link() which might bump -+ * ->i_nlink. 
For, say it, character device. Not a regular file, -+ * not a directory, not a symlink and ->i_nlink > 0. -+ */ -+ J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || -+ S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); -+ -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); -+ if (err) -+ goto out_unlock; -+ -+ err = ext3_reserve_inode_write(handle, inode, &iloc); -+ if (err) -+ goto out_unlock; -+ -+ /* Insert this inode at the head of the on-disk orphan list... */ -+ NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan); -+ EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); -+ err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); -+ rc = ext3_mark_iloc_dirty(handle, inode, &iloc); -+ if (!err) -+ err = rc; -+ -+ /* Only add to the head of the in-memory list if all the -+ * previous operations succeeded. If the orphan_add is going to -+ * fail (possibly taking the journal offline), we can't risk -+ * leaving the inode on the orphan list: stray orphan-list -+ * entries can cause panics at unmount time. -+ * -+ * This is safe: on error we're going to ignore the orphan list -+ * anyway on the next recovery. */ -+ if (!err) -+ list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); -+ -+ jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); -+ jbd_debug(4, "orphan inode %ld will point to %d\n", -+ inode->i_ino, NEXT_ORPHAN(inode)); -+out_unlock: -+ unlock_super(sb); -+ ext3_std_error(inode->i_sb, err); -+ return err; -+} -+ -+/* -+ * ext3_orphan_del() removes an unlinked or truncated inode from the list -+ * of such inodes stored on disk, because it is finally being cleaned up. 
-+ */ -+int ext3_orphan_del(handle_t *handle, struct inode *inode) -+{ -+ struct list_head *prev; -+ struct ext3_sb_info *sbi; -+ ino_t ino_next; -+ struct ext3_iloc iloc; -+ int err = 0; -+ -+ lock_super(inode->i_sb); -+ if (list_empty(&inode->u.ext3_i.i_orphan)) { -+ unlock_super(inode->i_sb); -+ return 0; -+ } -+ -+ ino_next = NEXT_ORPHAN(inode); -+ prev = inode->u.ext3_i.i_orphan.prev; -+ sbi = EXT3_SB(inode->i_sb); -+ -+ jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino); -+ -+ list_del(&inode->u.ext3_i.i_orphan); -+ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); -+ -+ /* If we're on an error path, we may not have a valid -+ * transaction handle with which to update the orphan list on -+ * disk, but we still need to remove the inode from the linked -+ * list in memory. */ -+ if (!handle) -+ goto out; -+ -+ err = ext3_reserve_inode_write(handle, inode, &iloc); -+ if (err) -+ goto out_err; -+ -+ if (prev == &sbi->s_orphan) { -+ jbd_debug(4, "superblock will point to %ld\n", ino_next); -+ BUFFER_TRACE(sbi->s_sbh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, sbi->s_sbh); -+ if (err) -+ goto out_brelse; -+ sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); -+ err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); -+ } else { -+ struct ext3_iloc iloc2; -+ struct inode *i_prev = -+ list_entry(prev, struct inode, u.ext3_i.i_orphan); -+ -+ jbd_debug(4, "orphan inode %ld will point to %ld\n", -+ i_prev->i_ino, ino_next); -+ err = ext3_reserve_inode_write(handle, i_prev, &iloc2); -+ if (err) -+ goto out_brelse; -+ NEXT_ORPHAN(i_prev) = ino_next; -+ err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2); -+ } -+ if (err) -+ goto out_brelse; -+ NEXT_ORPHAN(inode) = 0; -+ err = ext3_mark_iloc_dirty(handle, inode, &iloc); -+ if (err) -+ goto out_brelse; -+ -+out_err: -+ ext3_std_error(inode->i_sb, err); -+out: -+ unlock_super(inode->i_sb); -+ return err; -+ -+out_brelse: -+ brelse(iloc.bh); -+ goto out_err; -+} -+ -+static int ext3_rmdir (struct 
inode * dir, struct dentry *dentry) -+{ -+ int retval; -+ struct inode * inode; -+ struct buffer_head * bh; -+ struct ext3_dir_entry_2 * de; -+ handle_t *handle; -+ -+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ retval = -ENOENT; -+ bh = ext3_find_entry (dentry, &de); -+ if (!bh) -+ goto end_rmdir; -+ -+ if (IS_SYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode = dentry->d_inode; -+ DQUOT_INIT(inode); -+ -+ retval = -EIO; -+ if (le32_to_cpu(de->inode) != inode->i_ino) -+ goto end_rmdir; -+ -+ retval = -ENOTEMPTY; -+ if (!empty_dir (inode)) -+ goto end_rmdir; -+ -+ retval = ext3_delete_entry(handle, dir, de, bh); -+ if (retval) -+ goto end_rmdir; -+ if (inode->i_nlink != 2) -+ ext3_warning (inode->i_sb, "ext3_rmdir", -+ "empty directory has nlink!=2 (%d)", -+ inode->i_nlink); -+ inode->i_version = ++event; -+ inode->i_nlink = 0; -+ /* There's no need to set i_disksize: the fact that i_nlink is -+ * zero will ensure that the right thing happens during any -+ * recovery. 
*/ -+ inode->i_size = 0; -+ ext3_orphan_add(handle, inode); -+ ext3_mark_inode_dirty(handle, inode); -+ dir->i_nlink--; -+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; -+ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_mark_inode_dirty(handle, dir); -+ -+end_rmdir: -+ ext3_journal_stop(handle, dir); -+ brelse (bh); -+ return retval; -+} -+ -+static int ext3_unlink(struct inode * dir, struct dentry *dentry) -+{ -+ int retval; -+ struct inode * inode; -+ struct buffer_head * bh; -+ struct ext3_dir_entry_2 * de; -+ handle_t *handle; -+ -+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_SYNC(dir)) -+ handle->h_sync = 1; -+ -+ retval = -ENOENT; -+ bh = ext3_find_entry (dentry, &de); -+ if (!bh) -+ goto end_unlink; -+ -+ inode = dentry->d_inode; -+ DQUOT_INIT(inode); -+ -+ retval = -EIO; -+ if (le32_to_cpu(de->inode) != inode->i_ino) -+ goto end_unlink; -+ -+ if (!inode->i_nlink) { -+ ext3_warning (inode->i_sb, "ext3_unlink", -+ "Deleting nonexistent file (%lu), %d", -+ inode->i_ino, inode->i_nlink); -+ inode->i_nlink = 1; -+ } -+ retval = ext3_delete_entry(handle, dir, de, bh); -+ if (retval) -+ goto end_unlink; -+ dir->i_ctime = dir->i_mtime = CURRENT_TIME; -+ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_mark_inode_dirty(handle, dir); -+ inode->i_nlink--; -+ if (!inode->i_nlink) -+ ext3_orphan_add(handle, inode); -+ ext3_mark_inode_dirty(handle, inode); -+ inode->i_ctime = dir->i_ctime; -+ retval = 0; -+ -+end_unlink: -+ ext3_journal_stop(handle, dir); -+ brelse (bh); -+ return retval; -+} -+ -+static int ext3_symlink (struct inode * dir, -+ struct dentry *dentry, const char * symname) -+{ -+ handle_t *handle; -+ struct inode * inode; -+ int l, err; -+ -+ l = strlen(symname)+1; -+ if (l > dir->i_sb->s_blocksize) -+ return -ENAMETOOLONG; -+ -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if 
(IS_SYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); -+ err = PTR_ERR(inode); -+ if (IS_ERR(inode)) -+ goto out_stop; -+ -+ if (l > sizeof (inode->u.ext3_i.i_data)) { -+ inode->i_op = &page_symlink_inode_operations; -+ inode->i_mapping->a_ops = &ext3_aops; -+ /* -+ * block_symlink() calls back into ext3_prepare/commit_write. -+ * We have a transaction open. All is sweetness. It also sets -+ * i_size in generic_commit_write(). -+ */ -+ err = block_symlink(inode, symname, l); -+ if (err) -+ goto out_no_entry; -+ } else { -+ inode->i_op = &ext3_fast_symlink_inode_operations; -+ memcpy((char*)&inode->u.ext3_i.i_data,symname,l); -+ inode->i_size = l-1; -+ } -+ inode->u.ext3_i.i_disksize = inode->i_size; -+ ext3_mark_inode_dirty(handle, inode); -+ err = ext3_add_nondir(handle, dentry, inode); -+out_stop: -+ ext3_journal_stop(handle, dir); -+ return err; -+ -+out_no_entry: -+ ext3_dec_count(handle, inode); -+ ext3_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+} -+ -+static int ext3_link (struct dentry * old_dentry, -+ struct inode * dir, struct dentry *dentry) -+{ -+ handle_t *handle; -+ struct inode *inode = old_dentry->d_inode; -+ int err; -+ -+ if (S_ISDIR(inode->i_mode)) -+ return -EPERM; -+ -+ if (inode->i_nlink >= EXT3_LINK_MAX) -+ return -EMLINK; -+ -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_SYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode->i_ctime = CURRENT_TIME; -+ ext3_inc_count(handle, inode); -+ atomic_inc(&inode->i_count); -+ -+ ext3_mark_inode_dirty(handle, inode); -+ err = ext3_add_nondir(handle, dentry, inode); -+ ext3_journal_stop(handle, dir); -+ return err; -+} -+ -+#define PARENT_INO(buffer) \ -+ ((struct ext3_dir_entry_2 *) ((char *) buffer + \ -+ le16_to_cpu(((struct ext3_dir_entry_2 *) buffer)->rec_len)))->inode -+ -+/* -+ * Anybody can rename anything with this: the permission checks are left to the 
-+ * higher-level routines. -+ */ -+static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, -+ struct inode * new_dir,struct dentry *new_dentry) -+{ -+ handle_t *handle; -+ struct inode * old_inode, * new_inode; -+ struct buffer_head * old_bh, * new_bh, * dir_bh; -+ struct ext3_dir_entry_2 * old_de, * new_de; -+ int retval; -+ -+ old_bh = new_bh = dir_bh = NULL; -+ -+ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) -+ handle->h_sync = 1; -+ -+ old_bh = ext3_find_entry (old_dentry, &old_de); -+ /* -+ * Check for inode number is _not_ due to possible IO errors. -+ * We might rmdir the source, keep it as pwd of some process -+ * and merrily kill the link to whatever was created under the -+ * same name. Goodbye sticky bit ;-< -+ */ -+ old_inode = old_dentry->d_inode; -+ retval = -ENOENT; -+ if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) -+ goto end_rename; -+ -+ new_inode = new_dentry->d_inode; -+ new_bh = ext3_find_entry (new_dentry, &new_de); -+ if (new_bh) { -+ if (!new_inode) { -+ brelse (new_bh); -+ new_bh = NULL; -+ } else { -+ DQUOT_INIT(new_inode); -+ } -+ } -+ if (S_ISDIR(old_inode->i_mode)) { -+ if (new_inode) { -+ retval = -ENOTEMPTY; -+ if (!empty_dir (new_inode)) -+ goto end_rename; -+ } -+ retval = -EIO; -+ dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval); -+ if (!dir_bh) -+ goto end_rename; -+ if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) -+ goto end_rename; -+ retval = -EMLINK; -+ if (!new_inode && new_dir!=old_dir && -+ new_dir->i_nlink >= EXT3_LINK_MAX) -+ goto end_rename; -+ } -+ if (!new_bh) { -+ retval = ext3_add_entry (handle, new_dentry, old_inode); -+ if (retval) -+ goto end_rename; -+ } else { -+ BUFFER_TRACE(new_bh, "get write access"); -+ BUFFER_TRACE(new_bh, "get_write_access"); -+ ext3_journal_get_write_access(handle, new_bh); -+ new_de->inode = 
le32_to_cpu(old_inode->i_ino); -+ if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, -+ EXT3_FEATURE_INCOMPAT_FILETYPE)) -+ new_de->file_type = old_de->file_type; -+ new_dir->i_version = ++event; -+ BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); -+ ext3_journal_dirty_metadata(handle, new_bh); -+ brelse(new_bh); -+ new_bh = NULL; -+ } -+ -+ /* -+ * Like most other Unix systems, set the ctime for inodes on a -+ * rename. -+ */ -+ old_inode->i_ctime = CURRENT_TIME; -+ ext3_mark_inode_dirty(handle, old_inode); -+ -+ /* -+ * ok, that's it -+ */ -+ ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ -+ if (new_inode) { -+ new_inode->i_nlink--; -+ new_inode->i_ctime = CURRENT_TIME; -+ } -+ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; -+ old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ if (dir_bh) { -+ BUFFER_TRACE(dir_bh, "get_write_access"); -+ ext3_journal_get_write_access(handle, dir_bh); -+ PARENT_INO(dir_bh->b_data) = le32_to_cpu(new_dir->i_ino); -+ BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); -+ ext3_journal_dirty_metadata(handle, dir_bh); -+ old_dir->i_nlink--; -+ if (new_inode) { -+ new_inode->i_nlink--; -+ } else { -+ new_dir->i_nlink++; -+ new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_mark_inode_dirty(handle, new_dir); -+ } -+ } -+ ext3_mark_inode_dirty(handle, old_dir); -+ if (new_inode) { -+ ext3_mark_inode_dirty(handle, new_inode); -+ if (!new_inode->i_nlink) -+ ext3_orphan_add(handle, new_inode); -+ } -+ retval = 0; -+ -+end_rename: -+ brelse (dir_bh); -+ brelse (old_bh); -+ brelse (new_bh); -+ ext3_journal_stop(handle, old_dir); -+ return retval; -+} -+ -+/* -+ * directories can handle most operations... 
-+ */ -+struct inode_operations ext3_dir_inode_operations = { -+ create: ext3_create, /* BKL held */ -+ lookup: ext3_lookup, /* BKL held */ -+ link: ext3_link, /* BKL held */ -+ unlink: ext3_unlink, /* BKL held */ -+ symlink: ext3_symlink, /* BKL held */ -+ mkdir: ext3_mkdir, /* BKL held */ -+ rmdir: ext3_rmdir, /* BKL held */ -+ mknod: ext3_mknod, /* BKL held */ -+ rename: ext3_rename, /* BKL held */ -+}; -diff -rup --new-file linux.mcp2/fs/ext3/super.c linux_tmp/fs/ext3/super.c ---- linux.mcp2/fs/ext3/super.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/super.c 2002-02-25 11:38:08.000000000 -0800 -@@ -0,0 +1,1753 @@ -+/* -+ * linux/fs/ext3/super.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/inode.c -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef CONFIG_JBD_DEBUG -+static int ext3_ro_after; /* Make fs read-only after this many jiffies */ -+#endif -+ -+static int ext3_load_journal(struct super_block *, struct ext3_super_block *); -+static int ext3_create_journal(struct super_block *, struct ext3_super_block *, -+ int); -+static void ext3_commit_super (struct super_block * sb, -+ struct ext3_super_block * es, -+ int sync); -+static void ext3_mark_recovery_complete(struct super_block * sb, -+ struct ext3_super_block * es); -+static void ext3_clear_journal_err(struct super_block * sb, -+ struct ext3_super_block * es); -+ -+#ifdef CONFIG_JBD_DEBUG -+int journal_no_write[2]; -+ -+/* -+ * Debug code for turning filesystems "read-only" after a specified -+ * amount of time. 
This is for crash/recovery testing. -+ */ -+ -+static void make_rdonly(kdev_t dev, int *no_write) -+{ -+ if (dev) { -+ printk(KERN_WARNING "Turning device %s read-only\n", -+ bdevname(dev)); -+ *no_write = 0xdead0000 + dev; -+ } -+} -+ -+static void turn_fs_readonly(unsigned long arg) -+{ -+ struct super_block *sb = (struct super_block *)arg; -+ -+ make_rdonly(sb->s_dev, &journal_no_write[0]); -+ make_rdonly(EXT3_SB(sb)->s_journal->j_dev, &journal_no_write[1]); -+ wake_up(&EXT3_SB(sb)->ro_wait_queue); -+} -+ -+static void setup_ro_after(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ init_timer(&sbi->turn_ro_timer); -+ if (ext3_ro_after) { -+ printk(KERN_DEBUG "fs will go read-only in %d jiffies\n", -+ ext3_ro_after); -+ init_waitqueue_head(&sbi->ro_wait_queue); -+ journal_no_write[0] = 0; -+ journal_no_write[1] = 0; -+ sbi->turn_ro_timer.function = turn_fs_readonly; -+ sbi->turn_ro_timer.data = (unsigned long)sb; -+ sbi->turn_ro_timer.expires = jiffies + ext3_ro_after; -+ ext3_ro_after = 0; -+ add_timer(&sbi->turn_ro_timer); -+ } -+} -+ -+static void clear_ro_after(struct super_block *sb) -+{ -+ del_timer_sync(&EXT3_SB(sb)->turn_ro_timer); -+ journal_no_write[0] = 0; -+ journal_no_write[1] = 0; -+ ext3_ro_after = 0; -+} -+#else -+#define setup_ro_after(sb) do {} while (0) -+#define clear_ro_after(sb) do {} while (0) -+#endif -+ -+ -+static char error_buf[1024]; -+ -+/* Determine the appropriate response to ext3_error on a given filesystem */ -+ -+static int ext3_error_behaviour(struct super_block *sb) -+{ -+ /* First check for mount-time options */ -+ if (test_opt (sb, ERRORS_PANIC)) -+ return EXT3_ERRORS_PANIC; -+ if (test_opt (sb, ERRORS_RO)) -+ return EXT3_ERRORS_RO; -+ if (test_opt (sb, ERRORS_CONT)) -+ return EXT3_ERRORS_CONTINUE; -+ -+ /* If no overrides were specified on the mount, then fall back -+ * to the default behaviour set in the filesystem's superblock -+ * on disk. 
*/ -+ switch (le16_to_cpu(sb->u.ext3_sb.s_es->s_errors)) { -+ case EXT3_ERRORS_PANIC: -+ return EXT3_ERRORS_PANIC; -+ case EXT3_ERRORS_RO: -+ return EXT3_ERRORS_RO; -+ default: -+ break; -+ } -+ return EXT3_ERRORS_CONTINUE; -+} -+ -+/* Deal with the reporting of failure conditions on a filesystem such as -+ * inconsistencies detected or read IO failures. -+ * -+ * On ext2, we can store the error state of the filesystem in the -+ * superblock. That is not possible on ext3, because we may have other -+ * write ordering constraints on the superblock which prevent us from -+ * writing it out straight away; and given that the journal is about to -+ * be aborted, we can't rely on the current, or future, transactions to -+ * write out the superblock safely. -+ * -+ * We'll just use the journal_abort() error code to record an error in -+ * the journal instead. On recovery, the journal will compain about -+ * that error until we've noted it down and cleared it. -+ */ -+ -+static void ext3_handle_error(struct super_block *sb) -+{ -+ struct ext3_super_block *es = EXT3_SB(sb)->s_es; -+ -+ EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; -+ es->s_state |= cpu_to_le32(EXT3_ERROR_FS); -+ -+ if (sb->s_flags & MS_RDONLY) -+ return; -+ -+ if (ext3_error_behaviour(sb) != EXT3_ERRORS_CONTINUE) { -+ EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; -+ journal_abort(EXT3_SB(sb)->s_journal, -EIO); -+ } -+ -+ if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC) -+ panic ("EXT3-fs (device %s): panic forced after error\n", -+ bdevname(sb->s_dev)); -+ -+ if (ext3_error_behaviour(sb) == EXT3_ERRORS_RO) { -+ printk (KERN_CRIT "Remounting filesystem read-only\n"); -+ sb->s_flags |= MS_RDONLY; -+ } -+ -+ ext3_commit_super(sb, es, 1); -+} -+ -+void ext3_error (struct super_block * sb, const char * function, -+ const char * fmt, ...) 
-+{ -+ va_list args; -+ -+ va_start (args, fmt); -+ vsprintf (error_buf, fmt, args); -+ va_end (args); -+ -+ printk (KERN_CRIT "EXT3-fs error (device %s): %s: %s\n", -+ bdevname(sb->s_dev), function, error_buf); -+ -+ ext3_handle_error(sb); -+} -+ -+const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16]) -+{ -+ char *errstr = NULL; -+ -+ switch (errno) { -+ case -EIO: -+ errstr = "IO failure"; -+ break; -+ case -ENOMEM: -+ errstr = "Out of memory"; -+ break; -+ case -EROFS: -+ if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT) -+ errstr = "Journal has aborted"; -+ else -+ errstr = "Readonly filesystem"; -+ break; -+ default: -+ /* If the caller passed in an extra buffer for unknown -+ * errors, textualise them now. Else we just return -+ * NULL. */ -+ if (nbuf) { -+ /* Check for truncated error codes... */ -+ if (snprintf(nbuf, 16, "error %d", -errno) >= 0) -+ errstr = nbuf; -+ } -+ -+ break; -+ } -+ -+ return errstr; -+} -+ -+/* __ext3_std_error decodes expected errors from journaling functions -+ * automatically and invokes the appropriate error response. */ -+ -+void __ext3_std_error (struct super_block * sb, const char * function, -+ int errno) -+{ -+ char nbuf[16]; -+ const char *errstr = ext3_decode_error(sb, errno, nbuf); -+ -+ printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n", -+ bdevname(sb->s_dev), function, errstr); -+ -+ ext3_handle_error(sb); -+} -+ -+/* -+ * ext3_abort is a much stronger failure handler than ext3_error. The -+ * abort function may be used to deal with unrecoverable failures such -+ * as journal IO errors or ENOMEM at a critical moment in log management. -+ * -+ * We unconditionally force the filesystem into an ABORT|READONLY state, -+ * unless the error response on the fs has been set to panic in which -+ * case we take the easy way out and panic immediately. -+ */ -+ -+void ext3_abort (struct super_block * sb, const char * function, -+ const char * fmt, ...) 
-+{ -+ va_list args; -+ -+ printk (KERN_CRIT "ext3_abort called.\n"); -+ -+ va_start (args, fmt); -+ vsprintf (error_buf, fmt, args); -+ va_end (args); -+ -+ if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC) -+ panic ("EXT3-fs panic (device %s): %s: %s\n", -+ bdevname(sb->s_dev), function, error_buf); -+ -+ printk (KERN_CRIT "EXT3-fs abort (device %s): %s: %s\n", -+ bdevname(sb->s_dev), function, error_buf); -+ -+ if (sb->s_flags & MS_RDONLY) -+ return; -+ -+ printk (KERN_CRIT "Remounting filesystem read-only\n"); -+ sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS; -+ sb->s_flags |= MS_RDONLY; -+ sb->u.ext3_sb.s_mount_opt |= EXT3_MOUNT_ABORT; -+ journal_abort(EXT3_SB(sb)->s_journal, -EIO); -+} -+ -+/* Deal with the reporting of failure conditions while running, such as -+ * inconsistencies in operation or invalid system states. -+ * -+ * Use ext3_error() for cases of invalid filesystem states, as that will -+ * record an error on disk and force a filesystem check on the next boot. -+ */ -+NORET_TYPE void ext3_panic (struct super_block * sb, const char * function, -+ const char * fmt, ...) -+{ -+ va_list args; -+ -+ va_start (args, fmt); -+ vsprintf (error_buf, fmt, args); -+ va_end (args); -+ -+ /* this is to prevent panic from syncing this filesystem */ -+ /* AKPM: is this sufficient? */ -+ sb->s_flags |= MS_RDONLY; -+ panic ("EXT3-fs panic (device %s): %s: %s\n", -+ bdevname(sb->s_dev), function, error_buf); -+} -+ -+void ext3_warning (struct super_block * sb, const char * function, -+ const char * fmt, ...) 
-+{ -+ va_list args; -+ -+ va_start (args, fmt); -+ vsprintf (error_buf, fmt, args); -+ va_end (args); -+ printk (KERN_WARNING "EXT3-fs warning (device %s): %s: %s\n", -+ bdevname(sb->s_dev), function, error_buf); -+} -+ -+void ext3_update_dynamic_rev(struct super_block *sb) -+{ -+ struct ext3_super_block *es = EXT3_SB(sb)->s_es; -+ -+ if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV) -+ return; -+ -+ ext3_warning(sb, __FUNCTION__, -+ "updating to rev %d because of new feature flag, " -+ "running e2fsck is recommended", -+ EXT3_DYNAMIC_REV); -+ -+ es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO); -+ es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE); -+ es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV); -+ /* leave es->s_feature_*compat flags alone */ -+ /* es->s_uuid will be set by e2fsck if empty */ -+ -+ /* -+ * The rest of the superblock fields should be zero, and if not it -+ * means they are likely already in use, so leave them alone. We -+ * can leave it up to e2fsck to clean up any inconsistencies there. 
-+ */ -+} -+ -+/* -+ * Open the external journal device -+ */ -+static struct block_device *ext3_blkdev_get(kdev_t dev) -+{ -+ struct block_device *bdev; -+ int err = -ENODEV; -+ -+ bdev = bdget(kdev_t_to_nr(dev)); -+ if (bdev == NULL) -+ goto fail; -+ err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FS); -+ if (err < 0) -+ goto fail; -+ return bdev; -+ -+fail: -+ printk(KERN_ERR "EXT3: failed to open journal device %s: %d\n", -+ bdevname(dev), err); -+ return NULL; -+} -+ -+/* -+ * Release the journal device -+ */ -+static int ext3_blkdev_put(struct block_device *bdev) -+{ -+ return blkdev_put(bdev, BDEV_FS); -+} -+ -+static int ext3_blkdev_remove(struct ext3_sb_info *sbi) -+{ -+ struct block_device *bdev; -+ int ret = -ENODEV; -+ -+ bdev = sbi->journal_bdev; -+ if (bdev) { -+ ret = ext3_blkdev_put(bdev); -+ sbi->journal_bdev = 0; -+ } -+ return ret; -+} -+ -+#define orphan_list_entry(l) list_entry((l), struct inode, u.ext3_i.i_orphan) -+ -+static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) -+{ -+ struct list_head *l; -+ -+ printk(KERN_ERR "sb orphan head is %d\n", -+ le32_to_cpu(sbi->s_es->s_last_orphan)); -+ -+ printk(KERN_ERR "sb_info orphan list:\n"); -+ list_for_each(l, &sbi->s_orphan) { -+ struct inode *inode = orphan_list_entry(l); -+ printk(KERN_ERR " " -+ "inode 0x%04x:%ld at %p: mode %o, nlink %d, next %d\n", -+ inode->i_dev, inode->i_ino, inode, -+ inode->i_mode, inode->i_nlink, -+ le32_to_cpu(NEXT_ORPHAN(inode))); -+ } -+} -+ -+void ext3_put_super (struct super_block * sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_super_block *es = sbi->s_es; -+ kdev_t j_dev = sbi->s_journal->j_dev; -+ int i; -+ -+ journal_destroy(sbi->s_journal); -+ if (!(sb->s_flags & MS_RDONLY)) { -+ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -+ es->s_state = le16_to_cpu(sbi->s_mount_state); -+ BUFFER_TRACE(sbi->s_sbh, "marking dirty"); -+ mark_buffer_dirty(sbi->s_sbh); -+ ext3_commit_super(sb, es, 1); -+ } 
-+ -+ for (i = 0; i < sbi->s_gdb_count; i++) -+ brelse(sbi->s_group_desc[i]); -+ kfree(sbi->s_group_desc); -+ for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) -+ brelse(sbi->s_inode_bitmap[i]); -+ for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) -+ brelse(sbi->s_block_bitmap[i]); -+ brelse(sbi->s_sbh); -+ -+ /* Debugging code just in case the in-memory inode orphan list -+ * isn't empty. The on-disk one can be non-empty if we've -+ * detected an error and taken the fs readonly, but the -+ * in-memory list had better be clean by this point. */ -+ if (!list_empty(&sbi->s_orphan)) -+ dump_orphan_list(sb, sbi); -+ J_ASSERT(list_empty(&sbi->s_orphan)); -+ -+ invalidate_buffers(sb->s_dev); -+ if (j_dev != sb->s_dev) { -+ /* -+ * Invalidate the journal device's buffers. We don't want them -+ * floating about in memory - the physical journal device may -+ * hotswapped, and it breaks the `ro-after' testing code. -+ */ -+ fsync_no_super(j_dev); -+ invalidate_buffers(j_dev); -+ ext3_blkdev_remove(sbi); -+ } -+ clear_ro_after(sb); -+ -+ return; -+} -+ -+static struct super_operations ext3_sops = { -+ read_inode: ext3_read_inode, /* BKL held */ -+ write_inode: ext3_write_inode, /* BKL not held. Don't need */ -+ dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ -+ put_inode: ext3_put_inode, /* BKL not held. Don't need */ -+ delete_inode: ext3_delete_inode, /* BKL not held. We take it */ -+ put_super: ext3_put_super, /* BKL held */ -+ write_super: ext3_write_super, /* BKL held */ -+ write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */ -+ unlockfs: ext3_unlockfs, /* BKL not held. 
We take it */ -+ statfs: ext3_statfs, /* BKL held */ -+ remount_fs: ext3_remount, /* BKL held */ -+}; -+ -+static int want_value(char *value, char *option) -+{ -+ if (!value || !*value) { -+ printk(KERN_NOTICE "EXT3-fs: the %s option needs an argument\n", -+ option); -+ return -1; -+ } -+ return 0; -+} -+ -+static int want_null_value(char *value, char *option) -+{ -+ if (*value) { -+ printk(KERN_NOTICE "EXT3-fs: Invalid %s argument: %s\n", -+ option, value); -+ return -1; -+ } -+ return 0; -+} -+ -+static int want_numeric(char *value, char *option, unsigned long *number) -+{ -+ if (want_value(value, option)) -+ return -1; -+ *number = simple_strtoul(value, &value, 0); -+ if (want_null_value(value, option)) -+ return -1; -+ return 0; -+} -+ -+/* -+ * This function has been shamelessly adapted from the msdos fs -+ */ -+static int parse_options (char * options, unsigned long * sb_block, -+ struct ext3_sb_info *sbi, -+ unsigned long * inum, -+ int is_remount) -+{ -+ unsigned long *mount_options = &sbi->s_mount_opt; -+ uid_t *resuid = &sbi->s_resuid; -+ gid_t *resgid = &sbi->s_resgid; -+ char * this_char; -+ char * value; -+ -+ if (!options) -+ return 1; -+ for (this_char = strtok (options, ","); -+ this_char != NULL; -+ this_char = strtok (NULL, ",")) { -+ if ((value = strchr (this_char, '=')) != NULL) -+ *value++ = 0; -+ if (!strcmp (this_char, "bsddf")) -+ clear_opt (*mount_options, MINIX_DF); -+ else if (!strcmp (this_char, "nouid32")) { -+ set_opt (*mount_options, NO_UID32); -+ } -+ else if (!strcmp (this_char, "abort")) -+ set_opt (*mount_options, ABORT); -+ else if (!strcmp (this_char, "check")) { -+ if (!value || !*value || !strcmp (value, "none")) -+ clear_opt (*mount_options, CHECK); -+ else -+#ifdef CONFIG_EXT3_CHECK -+ set_opt (*mount_options, CHECK); -+#else -+ printk(KERN_ERR -+ "EXT3 Check option not supported\n"); -+#endif -+ } -+ else if (!strcmp (this_char, "debug")) -+ set_opt (*mount_options, DEBUG); -+ else if (!strcmp (this_char, "errors")) { -+ if 
(want_value(value, "errors")) -+ return 0; -+ if (!strcmp (value, "continue")) { -+ clear_opt (*mount_options, ERRORS_RO); -+ clear_opt (*mount_options, ERRORS_PANIC); -+ set_opt (*mount_options, ERRORS_CONT); -+ } -+ else if (!strcmp (value, "remount-ro")) { -+ clear_opt (*mount_options, ERRORS_CONT); -+ clear_opt (*mount_options, ERRORS_PANIC); -+ set_opt (*mount_options, ERRORS_RO); -+ } -+ else if (!strcmp (value, "panic")) { -+ clear_opt (*mount_options, ERRORS_CONT); -+ clear_opt (*mount_options, ERRORS_RO); -+ set_opt (*mount_options, ERRORS_PANIC); -+ } -+ else { -+ printk (KERN_ERR -+ "EXT3-fs: Invalid errors option: %s\n", -+ value); -+ return 0; -+ } -+ } -+ else if (!strcmp (this_char, "grpid") || -+ !strcmp (this_char, "bsdgroups")) -+ set_opt (*mount_options, GRPID); -+ else if (!strcmp (this_char, "minixdf")) -+ set_opt (*mount_options, MINIX_DF); -+ else if (!strcmp (this_char, "nocheck")) -+ clear_opt (*mount_options, CHECK); -+ else if (!strcmp (this_char, "nogrpid") || -+ !strcmp (this_char, "sysvgroups")) -+ clear_opt (*mount_options, GRPID); -+ else if (!strcmp (this_char, "resgid")) { -+ unsigned long v; -+ if (want_numeric(value, "resgid", &v)) -+ return 0; -+ *resgid = v; -+ } -+ else if (!strcmp (this_char, "resuid")) { -+ unsigned long v; -+ if (want_numeric(value, "resuid", &v)) -+ return 0; -+ *resuid = v; -+ } -+ else if (!strcmp (this_char, "sb")) { -+ if (want_numeric(value, "sb", sb_block)) -+ return 0; -+ } -+#ifdef CONFIG_JBD_DEBUG -+ else if (!strcmp (this_char, "ro-after")) { -+ unsigned long v; -+ if (want_numeric(value, "ro-after", &v)) -+ return 0; -+ ext3_ro_after = v; -+ } -+#endif -+ /* Silently ignore the quota options */ -+ else if (!strcmp (this_char, "grpquota") -+ || !strcmp (this_char, "noquota") -+ || !strcmp (this_char, "quota") -+ || !strcmp (this_char, "usrquota")) -+ /* Don't do anything ;-) */ ; -+ else if (!strcmp (this_char, "journal")) { -+ /* @@@ FIXME */ -+ /* Eventually we will want to be able to create -+ 
a journal file here. For now, only allow the -+ user to specify an existing inode to be the -+ journal file. */ -+ if (is_remount) { -+ printk(KERN_ERR "EXT3-fs: cannot specify " -+ "journal on remount\n"); -+ return 0; -+ } -+ -+ if (want_value(value, "journal")) -+ return 0; -+ if (!strcmp (value, "update")) -+ set_opt (*mount_options, UPDATE_JOURNAL); -+ else if (want_numeric(value, "journal", inum)) -+ return 0; -+ } -+ else if (!strcmp (this_char, "noload")) -+ set_opt (*mount_options, NOLOAD); -+ else if (!strcmp (this_char, "data")) { -+ int data_opt = 0; -+ -+ if (want_value(value, "data")) -+ return 0; -+ if (!strcmp (value, "journal")) -+ data_opt = EXT3_MOUNT_JOURNAL_DATA; -+ else if (!strcmp (value, "ordered")) -+ data_opt = EXT3_MOUNT_ORDERED_DATA; -+ else if (!strcmp (value, "writeback")) -+ data_opt = EXT3_MOUNT_WRITEBACK_DATA; -+ else { -+ printk (KERN_ERR -+ "EXT3-fs: Invalid data option: %s\n", -+ value); -+ return 0; -+ } -+ if (is_remount) { -+ if ((*mount_options & EXT3_MOUNT_DATA_FLAGS) != -+ data_opt) { -+ printk(KERN_ERR -+ "EXT3-fs: cannot change data " -+ "mode on remount\n"); -+ return 0; -+ } -+ } else { -+ *mount_options &= ~EXT3_MOUNT_DATA_FLAGS; -+ *mount_options |= data_opt; -+ } -+ } else { -+ printk (KERN_ERR -+ "EXT3-fs: Unrecognized mount option %s\n", -+ this_char); -+ return 0; -+ } -+ } -+ return 1; -+} -+ -+static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, -+ int read_only) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int res = 0; -+ -+ if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) { -+ printk (KERN_ERR "EXT3-fs warning: revision level too high, " -+ "forcing read-only mode\n"); -+ res = MS_RDONLY; -+ } -+ if (read_only) -+ return res; -+ if (!(sbi->s_mount_state & EXT3_VALID_FS)) -+ printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, " -+ "running e2fsck is recommended\n"); -+ else if ((sbi->s_mount_state & EXT3_ERROR_FS)) -+ printk (KERN_WARNING -+ "EXT3-fs warning: 
mounting fs with errors, " -+ "running e2fsck is recommended\n"); -+ else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && -+ le16_to_cpu(es->s_mnt_count) >= -+ (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) -+ printk (KERN_WARNING -+ "EXT3-fs warning: maximal mount count reached, " -+ "running e2fsck is recommended\n"); -+ else if (le32_to_cpu(es->s_checkinterval) && -+ (le32_to_cpu(es->s_lastcheck) + -+ le32_to_cpu(es->s_checkinterval) <= CURRENT_TIME)) -+ printk (KERN_WARNING -+ "EXT3-fs warning: checktime reached, " -+ "running e2fsck is recommended\n"); -+#if 0 -+ /* @@@ We _will_ want to clear the valid bit if we find -+ inconsistencies, to force a fsck at reboot. But for -+ a plain journaled filesystem we can keep it set as -+ valid forever! :) */ -+ es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3_VALID_FS); -+#endif -+ if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) -+ es->s_max_mnt_count = -+ (__s16) cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); -+ es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1); -+ es->s_mtime = cpu_to_le32(CURRENT_TIME); -+ ext3_update_dynamic_rev(sb); -+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -+ ext3_commit_super (sb, es, 1); -+ if (test_opt (sb, DEBUG)) -+ printk (KERN_INFO -+ "[EXT3 FS %s, %s, bs=%lu, gc=%lu, " -+ "bpg=%lu, ipg=%lu, mo=%04lx]\n", -+ EXT3FS_VERSION, EXT3FS_DATE, sb->s_blocksize, -+ sbi->s_groups_count, -+ EXT3_BLOCKS_PER_GROUP(sb), -+ EXT3_INODES_PER_GROUP(sb), -+ sbi->s_mount_opt); -+ printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ", -+ bdevname(sb->s_dev)); -+ if (EXT3_SB(sb)->s_journal->j_inode == NULL) { -+ printk("external journal on %s\n", -+ bdevname(EXT3_SB(sb)->s_journal->j_dev)); -+ } else { -+ printk("internal journal\n"); -+ } -+#ifdef CONFIG_EXT3_CHECK -+ if (test_opt (sb, CHECK)) { -+ ext3_check_blocks_bitmap (sb); -+ ext3_check_inodes_bitmap (sb); -+ } -+#endif -+ setup_ro_after(sb); -+ return res; -+} -+ -+static int 
ext3_check_descriptors (struct super_block * sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block); -+ struct ext3_group_desc * gdp = NULL; -+ int desc_block = 0; -+ int i; -+ -+ ext3_debug ("Checking group descriptors"); -+ -+ for (i = 0; i < sbi->s_groups_count; i++) -+ { -+ if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0) -+ gdp = (struct ext3_group_desc *) -+ sbi->s_group_desc[desc_block++]->b_data; -+ if (le32_to_cpu(gdp->bg_block_bitmap) < block || -+ le32_to_cpu(gdp->bg_block_bitmap) >= -+ block + EXT3_BLOCKS_PER_GROUP(sb)) -+ { -+ ext3_error (sb, "ext3_check_descriptors", -+ "Block bitmap for group %d" -+ " not in group (block %lu)!", -+ i, (unsigned long) -+ le32_to_cpu(gdp->bg_block_bitmap)); -+ return 0; -+ } -+ if (le32_to_cpu(gdp->bg_inode_bitmap) < block || -+ le32_to_cpu(gdp->bg_inode_bitmap) >= -+ block + EXT3_BLOCKS_PER_GROUP(sb)) -+ { -+ ext3_error (sb, "ext3_check_descriptors", -+ "Inode bitmap for group %d" -+ " not in group (block %lu)!", -+ i, (unsigned long) -+ le32_to_cpu(gdp->bg_inode_bitmap)); -+ return 0; -+ } -+ if (le32_to_cpu(gdp->bg_inode_table) < block || -+ le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >= -+ block + EXT3_BLOCKS_PER_GROUP(sb)) -+ { -+ ext3_error (sb, "ext3_check_descriptors", -+ "Inode table for group %d" -+ " not in group (block %lu)!", -+ i, (unsigned long) -+ le32_to_cpu(gdp->bg_inode_table)); -+ return 0; -+ } -+ block += EXT3_BLOCKS_PER_GROUP(sb); -+ gdp++; -+ } -+ return 1; -+} -+ -+ -+/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at -+ * the superblock) which were deleted from all directories, but held open by -+ * a process at the time of a crash. We walk the list and try to delete these -+ * inodes at recovery time (only with a read-write filesystem). 
-+ * -+ * In order to keep the orphan inode chain consistent during traversal (in -+ * case of crash during recovery), we link each inode into the superblock -+ * orphan list_head and handle it the same way as an inode deletion during -+ * normal operation (which journals the operations for us). -+ * -+ * We only do an iget() and an iput() on each inode, which is very safe if we -+ * accidentally point at an in-use or already deleted inode. The worst that -+ * can happen in this case is that we get a "bit already cleared" message from -+ * ext3_free_inode(). The only reason we would point at a wrong inode is if -+ * e2fsck was run on this filesystem, and it must have already done the orphan -+ * inode cleanup for us, so we can safely abort without any further action. -+ */ -+static void ext3_orphan_cleanup (struct super_block * sb, -+ struct ext3_super_block * es) -+{ -+ unsigned int s_flags = sb->s_flags; -+ int nr_orphans = 0, nr_truncates = 0; -+ if (!es->s_last_orphan) { -+ jbd_debug(4, "no orphan inodes to clean up\n"); -+ return; -+ } -+ -+ if (s_flags & MS_RDONLY) { -+ printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n", -+ bdevname(sb->s_dev)); -+ sb->s_flags &= ~MS_RDONLY; -+ } -+ -+ if (sb->u.ext3_sb.s_mount_state & EXT3_ERROR_FS) { -+ if (es->s_last_orphan) -+ jbd_debug(1, "Errors on filesystem, " -+ "clearing orphan list.\n"); -+ es->s_last_orphan = 0; -+ jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); -+ return; -+ } -+ -+ while (es->s_last_orphan) { -+ struct inode *inode; -+ -+ if (!(inode = -+ ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) { -+ es->s_last_orphan = 0; -+ break; -+ } -+ -+ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); -+ if (inode->i_nlink) { -+ printk(KERN_DEBUG __FUNCTION__ -+ ": truncating inode %ld to %Ld bytes\n", -+ inode->i_ino, inode->i_size); -+ jbd_debug(2, "truncating inode %ld to %Ld bytes\n", -+ inode->i_ino, inode->i_size); -+ ext3_truncate(inode); -+ nr_truncates++; -+ } 
else { -+ printk(KERN_DEBUG __FUNCTION__ -+ ": deleting unreferenced inode %ld\n", -+ inode->i_ino); -+ jbd_debug(2, "deleting unreferenced inode %ld\n", -+ inode->i_ino); -+ nr_orphans++; -+ } -+ iput(inode); /* The delete magic happens here! */ -+ } -+ -+#define PLURAL(x) (x), ((x)==1) ? "" : "s" -+ -+ if (nr_orphans) -+ printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n", -+ bdevname(sb->s_dev), PLURAL(nr_orphans)); -+ if (nr_truncates) -+ printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n", -+ bdevname(sb->s_dev), PLURAL(nr_truncates)); -+ sb->s_flags = s_flags; /* Restore MS_RDONLY status */ -+} -+ -+#define log2(n) ffz(~(n)) -+ -+/* -+ * Maximal file size. There is a direct, and {,double-,triple-}indirect -+ * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. -+ * We need to be 1 filesystem block less than the 2^32 sector limit. -+ */ -+static loff_t ext3_max_size(int bits) -+{ -+ loff_t res = EXT3_NDIR_BLOCKS; -+ res += 1LL << (bits-2); -+ res += 1LL << (2*(bits-2)); -+ res += 1LL << (3*(bits-2)); -+ res <<= bits; -+ if (res > (512LL << 32) - (1 << bits)) -+ res = (512LL << 32) - (1 << bits); -+ return res; -+} -+ -+struct super_block * ext3_read_super (struct super_block * sb, void * data, -+ int silent) -+{ -+ struct buffer_head * bh; -+ struct ext3_super_block *es = 0; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ unsigned long sb_block = 1; -+ unsigned long logic_sb_block = 1; -+ unsigned long offset = 0; -+ unsigned long journal_inum = 0; -+ kdev_t dev = sb->s_dev; -+ int blocksize; -+ int hblock; -+ int db_count; -+ int i; -+ int needs_recovery; -+ -+#ifdef CONFIG_JBD_DEBUG -+ ext3_ro_after = 0; -+#endif -+ /* -+ * See what the current blocksize for the device is, and -+ * use that as the blocksize. Otherwise (or if the blocksize -+ * is smaller than the default) use the default. -+ * This is important for devices that have a hardware -+ * sectorsize that is larger than the default. 
-+ */ -+ blocksize = EXT3_MIN_BLOCK_SIZE; -+ hblock = get_hardsect_size(dev); -+ if (blocksize < hblock) -+ blocksize = hblock; -+ -+ sbi->s_mount_opt = 0; -+ sbi->s_resuid = EXT3_DEF_RESUID; -+ sbi->s_resgid = EXT3_DEF_RESGID; -+ if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) { -+ sb->s_dev = 0; -+ goto out_fail; -+ } -+ -+ sb->s_blocksize = blocksize; -+ set_blocksize (dev, blocksize); -+ -+ /* -+ * The ext3 superblock will not be buffer aligned for other than 1kB -+ * block sizes. We need to calculate the offset from buffer start. -+ */ -+ if (blocksize != EXT3_MIN_BLOCK_SIZE) { -+ logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; -+ offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; -+ } -+ -+ if (!(bh = sb_bread(sb, logic_sb_block))) { -+ printk (KERN_ERR "EXT3-fs: unable to read superblock\n"); -+ goto out_fail; -+ } -+ /* -+ * Note: s_es must be initialized as soon as possible because -+ * some ext3 macro-instructions depend on its value -+ */ -+ es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); -+ sbi->s_es = es; -+ sb->s_magic = le16_to_cpu(es->s_magic); -+ if (sb->s_magic != EXT3_SUPER_MAGIC) { -+ if (!silent) -+ printk(KERN_ERR -+ "VFS: Can't find ext3 filesystem on dev %s.\n", -+ bdevname(dev)); -+ goto failed_mount; -+ } -+ if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && -+ (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || -+ EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || -+ EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) -+ printk(KERN_WARNING -+ "EXT3-fs warning: feature flags set on rev 0 fs, " -+ "running e2fsck is recommended\n"); -+ /* -+ * Check feature flags regardless of the revision level, since we -+ * previously didn't change the revision level when setting the flags, -+ * so there is a chance incompat flags are set on a rev 0 filesystem. 
-+ */ -+ if ((i = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))) { -+ printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of " -+ "unsupported optional features (%x).\n", -+ bdevname(dev), i); -+ goto failed_mount; -+ } -+ if (!(sb->s_flags & MS_RDONLY) && -+ (i = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))){ -+ printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of " -+ "unsupported optional features (%x).\n", -+ bdevname(dev), i); -+ goto failed_mount; -+ } -+ sb->s_blocksize_bits = le32_to_cpu(es->s_log_block_size) + 10; -+ sb->s_blocksize = 1 << sb->s_blocksize_bits; -+ -+ if (sb->s_blocksize < EXT3_MIN_BLOCK_SIZE || -+ sb->s_blocksize > EXT3_MAX_BLOCK_SIZE) { -+ printk(KERN_ERR -+ "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n", -+ blocksize, bdevname(dev)); -+ goto failed_mount; -+ } -+ -+ sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits); -+ -+ if (sb->s_blocksize != blocksize) { -+ blocksize = sb->s_blocksize; -+ -+ /* -+ * Make sure the blocksize for the filesystem is larger -+ * than the hardware sectorsize for the machine. 
-+ */ -+ if (sb->s_blocksize < hblock) { -+ printk(KERN_ERR "EXT3-fs: blocksize %d too small for " -+ "device blocksize %d.\n", blocksize, hblock); -+ goto failed_mount; -+ } -+ -+ brelse (bh); -+ set_blocksize (dev, sb->s_blocksize); -+ logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; -+ offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; -+ bh = sb_bread(sb, logic_sb_block); -+ if (!bh) { -+ printk(KERN_ERR -+ "EXT3-fs: Can't read superblock on 2nd try.\n"); -+ return NULL; -+ } -+ es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); -+ sbi->s_es = es; -+ if (es->s_magic != le16_to_cpu(EXT3_SUPER_MAGIC)) { -+ printk (KERN_ERR -+ "EXT3-fs: Magic mismatch, very weird !\n"); -+ goto failed_mount; -+ } -+ } -+ -+ if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) { -+ sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE; -+ sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO; -+ } else { -+ sbi->s_inode_size = le16_to_cpu(es->s_inode_size); -+ sbi->s_first_ino = le32_to_cpu(es->s_first_ino); -+ if (sbi->s_inode_size != EXT3_GOOD_OLD_INODE_SIZE) { -+ printk (KERN_ERR -+ "EXT3-fs: unsupported inode size: %d\n", -+ sbi->s_inode_size); -+ goto failed_mount; -+ } -+ } -+ sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << -+ le32_to_cpu(es->s_log_frag_size); -+ if (blocksize != sbi->s_frag_size) { -+ printk(KERN_ERR -+ "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n", -+ sbi->s_frag_size, blocksize); -+ goto failed_mount; -+ } -+ sbi->s_frags_per_block = 1; -+ sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); -+ sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); -+ sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); -+ sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb); -+ sbi->s_itb_per_group = sbi->s_inodes_per_group /sbi->s_inodes_per_block; -+ sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc); -+ sbi->s_sbh = bh; -+ if (sbi->s_resuid == EXT3_DEF_RESUID) -+ sbi->s_resuid = 
le16_to_cpu(es->s_def_resuid); -+ if (sbi->s_resgid == EXT3_DEF_RESGID) -+ sbi->s_resgid = le16_to_cpu(es->s_def_resgid); -+ sbi->s_mount_state = le16_to_cpu(es->s_state); -+ sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); -+ sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb)); -+ -+ if (sbi->s_blocks_per_group > blocksize * 8) { -+ printk (KERN_ERR -+ "EXT3-fs: #blocks per group too big: %lu\n", -+ sbi->s_blocks_per_group); -+ goto failed_mount; -+ } -+ if (sbi->s_frags_per_group > blocksize * 8) { -+ printk (KERN_ERR -+ "EXT3-fs: #fragments per group too big: %lu\n", -+ sbi->s_frags_per_group); -+ goto failed_mount; -+ } -+ if (sbi->s_inodes_per_group > blocksize * 8) { -+ printk (KERN_ERR -+ "EXT3-fs: #inodes per group too big: %lu\n", -+ sbi->s_inodes_per_group); -+ goto failed_mount; -+ } -+ -+ sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - -+ le32_to_cpu(es->s_first_data_block) + -+ EXT3_BLOCKS_PER_GROUP(sb) - 1) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) / -+ EXT3_DESC_PER_BLOCK(sb); -+ sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), -+ GFP_KERNEL); -+ if (sbi->s_group_desc == NULL) { -+ printk (KERN_ERR "EXT3-fs: not enough memory\n"); -+ goto failed_mount; -+ } -+ for (i = 0; i < db_count; i++) { -+ sbi->s_group_desc[i] = sb_bread(sb, logic_sb_block + i + 1); -+ if (!sbi->s_group_desc[i]) { -+ printk (KERN_ERR "EXT3-fs: " -+ "can't read group descriptor %d\n", i); -+ db_count = i; -+ goto failed_mount2; -+ } -+ } -+ if (!ext3_check_descriptors (sb)) { -+ printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n"); -+ goto failed_mount2; -+ } -+ for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) { -+ sbi->s_inode_bitmap_number[i] = 0; -+ sbi->s_inode_bitmap[i] = NULL; -+ sbi->s_block_bitmap_number[i] = 0; -+ sbi->s_block_bitmap[i] = NULL; -+ } -+ sbi->s_loaded_inode_bitmaps = 0; -+ sbi->s_loaded_block_bitmaps = 0; -+ sbi->s_gdb_count = db_count; -+ 
get_random_bytes(&sbi->s_next_generation, sizeof(u32)); -+ /* -+ * set up enough so that it can read an inode -+ */ -+ sb->s_op = &ext3_sops; -+ INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ -+ -+ sb->s_root = 0; -+ -+ needs_recovery = (es->s_last_orphan != 0 || -+ EXT3_HAS_INCOMPAT_FEATURE(sb, -+ EXT3_FEATURE_INCOMPAT_RECOVER)); -+ -+ /* -+ * The first inode we look at is the journal inode. Don't try -+ * root first: it may be modified in the journal! -+ */ -+ if (!test_opt(sb, NOLOAD) && -+ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { -+ if (ext3_load_journal(sb, es)) -+ goto failed_mount2; -+ } else if (journal_inum) { -+ if (ext3_create_journal(sb, es, journal_inum)) -+ goto failed_mount2; -+ } else { -+ if (!silent) -+ printk (KERN_ERR -+ "ext3: No journal on filesystem on %s\n", -+ bdevname(dev)); -+ goto failed_mount2; -+ } -+ -+ /* We have now updated the journal if required, so we can -+ * validate the data journaling mode. */ -+ switch (test_opt(sb, DATA_FLAGS)) { -+ case 0: -+ /* No mode set, assume a default based on the journal -+ capabilities: ORDERED_DATA if the journal can -+ cope, else JOURNAL_DATA */ -+ if (journal_check_available_features -+ (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) -+ set_opt(sbi->s_mount_opt, ORDERED_DATA); -+ else -+ set_opt(sbi->s_mount_opt, JOURNAL_DATA); -+ break; -+ -+ case EXT3_MOUNT_ORDERED_DATA: -+ case EXT3_MOUNT_WRITEBACK_DATA: -+ if (!journal_check_available_features -+ (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { -+ printk(KERN_ERR "EXT3-fs: Journal does not support " -+ "requested data journaling mode\n"); -+ goto failed_mount3; -+ } -+ default: -+ break; -+ } -+ -+ /* -+ * The journal_load will have done any necessary log recovery, -+ * so we can safely mount the rest of the filesystem now. 
-+ */ -+ -+ sb->s_root = d_alloc_root(iget(sb, EXT3_ROOT_INO)); -+ if (!sb->s_root || !S_ISDIR(sb->s_root->d_inode->i_mode) || -+ !sb->s_root->d_inode->i_blocks || !sb->s_root->d_inode->i_size) { -+ if (sb->s_root) { -+ dput(sb->s_root); -+ sb->s_root = NULL; -+ printk(KERN_ERR -+ "EXT3-fs: corrupt root inode, run e2fsck\n"); -+ } else -+ printk(KERN_ERR "EXT3-fs: get root inode failed\n"); -+ goto failed_mount3; -+ } -+ -+ ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); -+ /* -+ * akpm: core read_super() calls in here with the superblock locked. -+ * That deadlocks, because orphan cleanup needs to lock the superblock -+ * in numerous places. Here we just pop the lock - it's relatively -+ * harmless, because we are now ready to accept write_super() requests, -+ * and aviro says that's the only reason for hanging onto the -+ * superblock lock. -+ */ -+ EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; -+ unlock_super(sb); /* akpm: sigh */ -+ ext3_orphan_cleanup(sb, es); -+ lock_super(sb); -+ EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; -+ if (needs_recovery) -+ printk (KERN_INFO "EXT3-fs: recovery complete.\n"); -+ ext3_mark_recovery_complete(sb, es); -+ printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n", -+ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": -+ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": -+ "writeback"); -+ -+ return sb; -+ -+failed_mount3: -+ journal_destroy(sbi->s_journal); -+failed_mount2: -+ for (i = 0; i < db_count; i++) -+ brelse(sbi->s_group_desc[i]); -+ kfree(sbi->s_group_desc); -+failed_mount: -+ ext3_blkdev_remove(sbi); -+ brelse(bh); -+out_fail: -+ return NULL; -+} -+ -+static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum) -+{ -+ struct inode *journal_inode; -+ journal_t *journal; -+ -+ /* First, test for the existence of a valid inode on disk. Bad -+ * things happen if we iget() an unused inode, as the subsequent -+ * iput() will try to delete it. 
*/ -+ -+ journal_inode = iget(sb, journal_inum); -+ if (!journal_inode) { -+ printk(KERN_ERR "EXT3-fs: no journal found.\n"); -+ return NULL; -+ } -+ if (!journal_inode->i_nlink) { -+ make_bad_inode(journal_inode); -+ iput(journal_inode); -+ printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n"); -+ return NULL; -+ } -+ -+ jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", -+ journal_inode, journal_inode->i_size); -+ if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) { -+ printk(KERN_ERR "EXT3-fs: invalid journal inode.\n"); -+ iput(journal_inode); -+ return NULL; -+ } -+ -+ journal = journal_init_inode(journal_inode); -+ if (!journal) { -+ printk(KERN_ERR "EXT3-fs: Could not load journal inode\n"); -+ iput(journal_inode); -+ } -+ -+ return journal; -+} -+ -+static journal_t *ext3_get_dev_journal(struct super_block *sb, -+ int dev) -+{ -+ struct buffer_head * bh; -+ journal_t *journal; -+ int start; -+ int len; -+ int hblock, blocksize; -+ unsigned long sb_block; -+ unsigned long offset; -+ kdev_t journal_dev = to_kdev_t(dev); -+ struct ext3_super_block * es; -+ struct block_device *bdev; -+ -+ bdev = ext3_blkdev_get(journal_dev); -+ if (bdev == NULL) -+ return NULL; -+ -+ blocksize = sb->s_blocksize; -+ hblock = get_hardsect_size(journal_dev); -+ if (blocksize < hblock) { -+ printk(KERN_ERR -+ "EXT3-fs: blocksize too small for journal device.\n"); -+ goto out_bdev; -+ } -+ -+ sb_block = EXT3_MIN_BLOCK_SIZE / blocksize; -+ offset = EXT3_MIN_BLOCK_SIZE % blocksize; -+ set_blocksize(dev, blocksize); -+ if (!(bh = bread(dev, sb_block, blocksize))) { -+ printk(KERN_ERR "EXT3-fs: couldn't read superblock of " -+ "external journal\n"); -+ goto out_bdev; -+ } -+ -+ es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); -+ if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || -+ !(le32_to_cpu(es->s_feature_incompat) & -+ EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { -+ printk(KERN_ERR "EXT3-fs: external journal has " -+ "bad superblock\n"); -+ 
brelse(bh); -+ goto out_bdev; -+ } -+ -+ if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { -+ printk(KERN_ERR "EXT3-fs: journal UUID does not match\n"); -+ brelse(bh); -+ goto out_bdev; -+ } -+ -+ len = le32_to_cpu(es->s_blocks_count); -+ start = sb_block + 1; -+ brelse(bh); /* we're done with the superblock */ -+ -+ journal = journal_init_dev(journal_dev, sb->s_dev, -+ start, len, blocksize); -+ if (!journal) { -+ printk(KERN_ERR "EXT3-fs: failed to create device journal\n"); -+ goto out_bdev; -+ } -+ ll_rw_block(READ, 1, &journal->j_sb_buffer); -+ wait_on_buffer(journal->j_sb_buffer); -+ if (!buffer_uptodate(journal->j_sb_buffer)) { -+ printk(KERN_ERR "EXT3-fs: I/O error on journal device\n"); -+ goto out_journal; -+ } -+ if (ntohl(journal->j_superblock->s_nr_users) != 1) { -+ printk(KERN_ERR "EXT3-fs: External journal has more than one " -+ "user (unsupported) - %d\n", -+ ntohl(journal->j_superblock->s_nr_users)); -+ goto out_journal; -+ } -+ EXT3_SB(sb)->journal_bdev = bdev; -+ return journal; -+out_journal: -+ journal_destroy(journal); -+out_bdev: -+ ext3_blkdev_put(bdev); -+ return NULL; -+} -+ -+static int ext3_load_journal(struct super_block * sb, -+ struct ext3_super_block * es) -+{ -+ journal_t *journal; -+ int journal_inum = le32_to_cpu(es->s_journal_inum); -+ int journal_dev = le32_to_cpu(es->s_journal_dev); -+ int err = 0; -+ int really_read_only; -+ -+ really_read_only = is_read_only(sb->s_dev); -+ -+ /* -+ * Are we loading a blank journal or performing recovery after a -+ * crash? For recovery, we need to check in advance whether we -+ * can get read-write access to the device. 
-+ */ -+ -+ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { -+ if (sb->s_flags & MS_RDONLY) { -+ printk(KERN_INFO "EXT3-fs: INFO: recovery " -+ "required on readonly filesystem.\n"); -+ if (really_read_only) { -+ printk(KERN_ERR "EXT3-fs: write access " -+ "unavailable, cannot proceed.\n"); -+ return -EROFS; -+ } -+ printk (KERN_INFO "EXT3-fs: write access will " -+ "be enabled during recovery.\n"); -+ } -+ } -+ -+ if (journal_inum && journal_dev) { -+ printk(KERN_ERR "EXT3-fs: filesystem has both journal " -+ "and inode journals!\n"); -+ return -EINVAL; -+ } -+ -+ if (journal_inum) { -+ if (!(journal = ext3_get_journal(sb, journal_inum))) -+ return -EINVAL; -+ } else { -+ if (!(journal = ext3_get_dev_journal(sb, journal_dev))) -+ return -EINVAL; -+ } -+ -+ -+ if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { -+ err = journal_update_format(journal); -+ if (err) { -+ printk(KERN_ERR "EXT3-fs: error updating journal.\n"); -+ journal_destroy(journal); -+ return err; -+ } -+ } -+ -+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) -+ err = journal_wipe(journal, !really_read_only); -+ if (!err) -+ err = journal_load(journal); -+ -+ if (err) { -+ printk(KERN_ERR "EXT3-fs: error loading journal.\n"); -+ journal_destroy(journal); -+ return err; -+ } -+ -+ EXT3_SB(sb)->s_journal = journal; -+ ext3_clear_journal_err(sb, es); -+ return 0; -+} -+ -+static int ext3_create_journal(struct super_block * sb, -+ struct ext3_super_block * es, -+ int journal_inum) -+{ -+ journal_t *journal; -+ -+ if (sb->s_flags & MS_RDONLY) { -+ printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to " -+ "create journal.\n"); -+ return -EROFS; -+ } -+ -+ if (!(journal = ext3_get_journal(sb, journal_inum))) -+ return -EINVAL; -+ -+ printk(KERN_INFO "EXT3-fs: creating new journal on inode %d\n", -+ journal_inum); -+ -+ if (journal_create(journal)) { -+ printk(KERN_ERR "EXT3-fs: error creating journal.\n"); -+ journal_destroy(journal); -+ return -EIO; 
-+ } -+ -+ EXT3_SB(sb)->s_journal = journal; -+ -+ ext3_update_dynamic_rev(sb); -+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -+ EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); -+ -+ es->s_journal_inum = cpu_to_le32(journal_inum); -+ sb->s_dirt = 1; -+ -+ /* Make sure we flush the recovery flag to disk. */ -+ ext3_commit_super(sb, es, 1); -+ -+ return 0; -+} -+ -+static void ext3_commit_super (struct super_block * sb, -+ struct ext3_super_block * es, -+ int sync) -+{ -+ es->s_wtime = cpu_to_le32(CURRENT_TIME); -+ BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "marking dirty"); -+ mark_buffer_dirty(sb->u.ext3_sb.s_sbh); -+ if (sync) { -+ ll_rw_block(WRITE, 1, &sb->u.ext3_sb.s_sbh); -+ wait_on_buffer(sb->u.ext3_sb.s_sbh); -+ } -+} -+ -+ -+/* -+ * Have we just finished recovery? If so, and if we are mounting (or -+ * remounting) the filesystem readonly, then we will end up with a -+ * consistent fs on disk. Record that fact. -+ */ -+static void ext3_mark_recovery_complete(struct super_block * sb, -+ struct ext3_super_block * es) -+{ -+ journal_flush(EXT3_SB(sb)->s_journal); -+ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && -+ sb->s_flags & MS_RDONLY) { -+ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -+ sb->s_dirt = 0; -+ ext3_commit_super(sb, es, 1); -+ } -+} -+ -+/* -+ * If we are mounting (or read-write remounting) a filesystem whose journal -+ * has recorded an error from a previous lifetime, move that error to the -+ * main filesystem now. 
-+ */ -+static void ext3_clear_journal_err(struct super_block * sb, -+ struct ext3_super_block * es) -+{ -+ journal_t *journal; -+ int j_errno; -+ const char *errstr; -+ -+ journal = EXT3_SB(sb)->s_journal; -+ -+ /* -+ * Now check for any error status which may have been recorded in the -+ * journal by a prior ext3_error() or ext3_abort() -+ */ -+ -+ j_errno = journal_errno(journal); -+ if (j_errno) { -+ char nbuf[16]; -+ -+ errstr = ext3_decode_error(sb, j_errno, nbuf); -+ ext3_warning(sb, __FUNCTION__, "Filesystem error recorded " -+ "from previous mount: %s", errstr); -+ ext3_warning(sb, __FUNCTION__, "Marking fs in need of " -+ "filesystem check."); -+ -+ sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS; -+ es->s_state |= cpu_to_le16(EXT3_ERROR_FS); -+ ext3_commit_super (sb, es, 1); -+ -+ journal_clear_err(journal); -+ } -+} -+ -+/* -+ * Force the running and committing transactions to commit, -+ * and wait on the commit. -+ */ -+int ext3_force_commit(struct super_block *sb) -+{ -+ journal_t *journal; -+ int ret; -+ -+ if (sb->s_flags & MS_RDONLY) -+ return 0; -+ -+ journal = EXT3_SB(sb)->s_journal; -+ sb->s_dirt = 0; -+ lock_kernel(); /* important: lock down j_running_transaction */ -+ ret = ext3_journal_force_commit(journal); -+ unlock_kernel(); -+ return ret; -+} -+ -+/* -+ * Ext3 always journals updates to the superblock itself, so we don't -+ * have to propagate any other updates to the superblock on disk at this -+ * point. Just start an async writeback to get the buffers on their way -+ * to the disk. -+ * -+ * This implicitly triggers the writebehind on sync(). 
-+ */ -+ -+static int do_sync_supers = 0; -+MODULE_PARM(do_sync_supers, "i"); -+MODULE_PARM_DESC(do_sync_supers, "Write superblocks synchronously"); -+ -+void ext3_write_super (struct super_block * sb) -+{ -+ tid_t target; -+ -+ if (down_trylock(&sb->s_lock) == 0) -+ BUG(); /* aviro detector */ -+ sb->s_dirt = 0; -+ target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); -+ -+ if (do_sync_supers) { -+ unlock_super(sb); -+ log_wait_commit(EXT3_SB(sb)->s_journal, target); -+ lock_super(sb); -+ } -+} -+ -+/* -+ * LVM calls this function before a (read-only) snapshot is created. This -+ * gives us a chance to flush the journal completely and mark the fs clean. -+ */ -+void ext3_write_super_lockfs(struct super_block *sb) -+{ -+ sb->s_dirt = 0; -+ -+ lock_kernel(); /* 2.4.5 forgot to do this for us */ -+ if (!(sb->s_flags & MS_RDONLY)) { -+ journal_t *journal = EXT3_SB(sb)->s_journal; -+ -+ /* Now we set up the journal barrier. */ -+ journal_lock_updates(journal); -+ journal_flush(journal); -+ -+ /* Journal blocked and flushed, clear needs_recovery flag. */ -+ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -+ ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); -+ } -+ unlock_kernel(); -+} -+ -+/* -+ * Called by LVM after the snapshot is done. We need to reset the RECOVER -+ * flag here, even though the filesystem is not technically dirty yet. -+ */ -+void ext3_unlockfs(struct super_block *sb) -+{ -+ if (!(sb->s_flags & MS_RDONLY)) { -+ lock_kernel(); -+ lock_super(sb); -+ /* Reser the needs_recovery flag before the fs is unlocked. 
*/ -+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -+ ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); -+ unlock_super(sb); -+ journal_unlock_updates(EXT3_SB(sb)->s_journal); -+ unlock_kernel(); -+ } -+} -+ -+int ext3_remount (struct super_block * sb, int * flags, char * data) -+{ -+ struct ext3_super_block * es; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ unsigned long tmp; -+ -+ clear_ro_after(sb); -+ -+ /* -+ * Allow the "check" option to be passed as a remount option. -+ */ -+ if (!parse_options(data, &tmp, sbi, &tmp, 1)) -+ return -EINVAL; -+ -+ if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) -+ ext3_abort(sb, __FUNCTION__, "Abort forced by user"); -+ -+ es = sbi->s_es; -+ -+ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { -+ if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) -+ return -EROFS; -+ -+ if (*flags & MS_RDONLY) { -+ /* -+ * First of all, the unconditional stuff we have to do -+ * to disable replay of the journal when we next remount -+ */ -+ sb->s_flags |= MS_RDONLY; -+ -+ /* -+ * OK, test if we are remounting a valid rw partition -+ * readonly, and if so set the rdonly flag and then -+ * mark the partition as valid again. -+ */ -+ if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) && -+ (sbi->s_mount_state & EXT3_VALID_FS)) -+ es->s_state = cpu_to_le16(sbi->s_mount_state); -+ -+ ext3_mark_recovery_complete(sb, es); -+ } else { -+ int ret; -+ if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, -+ ~EXT3_FEATURE_RO_COMPAT_SUPP))) { -+ printk(KERN_WARNING "EXT3-fs: %s: couldn't " -+ "remount RDWR because of unsupported " -+ "optional features (%x).\n", -+ bdevname(sb->s_dev), ret); -+ return -EROFS; -+ } -+ /* -+ * Mounting a RDONLY partition read-write, so reread -+ * and store the current valid flag. (It may have -+ * been changed by e2fsck since we originally mounted -+ * the partition.) 
-+ */ -+ ext3_clear_journal_err(sb, es); -+ sbi->s_mount_state = le16_to_cpu(es->s_state); -+ if (!ext3_setup_super (sb, es, 0)) -+ sb->s_flags &= ~MS_RDONLY; -+ } -+ } -+ setup_ro_after(sb); -+ return 0; -+} -+ -+int ext3_statfs (struct super_block * sb, struct statfs * buf) -+{ -+ struct ext3_super_block *es = EXT3_SB(sb)->s_es; -+ unsigned long overhead; -+ int i; -+ -+ if (test_opt (sb, MINIX_DF)) -+ overhead = 0; -+ else { -+ /* -+ * Compute the overhead (FS structures) -+ */ -+ -+ /* -+ * All of the blocks before first_data_block are -+ * overhead -+ */ -+ overhead = le32_to_cpu(es->s_first_data_block); -+ -+ /* -+ * Add the overhead attributed to the superblock and -+ * block group descriptors. If the sparse superblocks -+ * feature is turned on, then not all groups have this. -+ */ -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ overhead += ext3_bg_has_super(sb, i) + -+ ext3_bg_num_gdb(sb, i); -+ -+ /* -+ * Every block group has an inode bitmap, a block -+ * bitmap, and an inode table. 
-+ */ -+ overhead += (EXT3_SB(sb)->s_groups_count * -+ (2 + EXT3_SB(sb)->s_itb_per_group)); -+ } -+ -+ buf->f_type = EXT3_SUPER_MAGIC; -+ buf->f_bsize = sb->s_blocksize; -+ buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; -+ buf->f_bfree = ext3_count_free_blocks (sb); -+ buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); -+ if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) -+ buf->f_bavail = 0; -+ buf->f_files = le32_to_cpu(es->s_inodes_count); -+ buf->f_ffree = ext3_count_free_inodes (sb); -+ buf->f_namelen = EXT3_NAME_LEN; -+ return 0; -+} -+ -+static DECLARE_FSTYPE_DEV(ext3_fs_type, "ext3", ext3_read_super); -+ -+static int __init init_ext3_fs(void) -+{ -+ return register_filesystem(&ext3_fs_type); -+} -+ -+static void __exit exit_ext3_fs(void) -+{ -+ unregister_filesystem(&ext3_fs_type); -+} -+ -+EXPORT_NO_SYMBOLS; -+ -+MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -+MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); -+MODULE_LICENSE("GPL"); -+module_init(init_ext3_fs) -+module_exit(exit_ext3_fs) -diff -rup --new-file linux.mcp2/fs/ext3/symlink.c linux_tmp/fs/ext3/symlink.c ---- linux.mcp2/fs/ext3/symlink.c 1969-12-31 16:00:00.000000000 -0800 -+++ linux_tmp/fs/ext3/symlink.c 2001-11-09 14:25:04.000000000 -0800 -@@ -0,0 +1,39 @@ -+/* -+ * linux/fs/ext3/symlink.c -+ * -+ * Only fast symlinks left here - the rest is done by generic code. 
AV, 1999 -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/symlink.c -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * ext3 symlink handling code -+ */ -+ -+#include -+#include -+#include -+ -+static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen) -+{ -+ char *s = (char *)dentry->d_inode->u.ext3_i.i_data; -+ return vfs_readlink(dentry, buffer, buflen, s); -+} -+ -+static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd) -+{ -+ char *s = (char *)dentry->d_inode->u.ext3_i.i_data; -+ return vfs_follow_link(nd, s); -+} -+ -+struct inode_operations ext3_fast_symlink_inode_operations = { -+ readlink: ext3_readlink, /* BKL not held. Don't need */ -+ follow_link: ext3_follow_link, /* BKL not held. Don't need */ -+}; diff --git a/lustre/kernel_patches/patches/2.4.19-jbd.patch b/lustre/kernel_patches/patches/2.4.19-jbd.patch deleted file mode 100644 index 4f4b38e..0000000 --- a/lustre/kernel_patches/patches/2.4.19-jbd.patch +++ /dev/null @@ -1,6524 +0,0 @@ -diff -ruP linux.mcp2/fs/jbd/Makefile linuxppc_2.4.19_final/fs/jbd/Makefile ---- linux.mcp2/fs/jbd/Makefile 1969-12-31 16:00:00.000000000 -0800 -+++ linuxppc_2.4.19_final/fs/jbd/Makefile 2004-05-17 13:56:17.000000000 -0700 -@@ -0,0 +1,15 @@ -+# -+# fs/jbd/Makefile -+# -+# Makefile for the linux journaling routines. 
-+# -+ -+export-objs := journal.o -+O_TARGET := jbd.o -+ -+obj-y := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o -+ -+obj-m := $(O_TARGET) -+ -+include $(TOPDIR)/Rules.make -+ -diff -ruP linux.mcp2/fs/jbd/checkpoint.c linuxppc_2.4.19_final/fs/jbd/checkpoint.c ---- linux.mcp2/fs/jbd/checkpoint.c 1969-12-31 16:00:00.000000000 -0800 -+++ linuxppc_2.4.19_final/fs/jbd/checkpoint.c 2004-05-17 13:56:17.000000000 -0700 -@@ -0,0 +1,605 @@ -+/* -+ * linux/fs/checkpoint.c -+ * -+ * Written by Stephen C. Tweedie , 1999 -+ * -+ * Copyright 1999 Red Hat Software --- All Rights Reserved -+ * -+ * This file is part of the Linux kernel and is made available under -+ * the terms of the GNU General Public License, version 2, or at your -+ * option, any later version, incorporated herein by reference. -+ * -+ * Checkpoint routines for the generic filesystem journaling code. -+ * Part of the ext2fs journaling system. -+ * -+ * Checkpointing is the process of ensuring that a section of the log is -+ * committed fully to disk, so that that portion of the log can be -+ * reused. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+extern spinlock_t journal_datalist_lock; -+ -+/* -+ * Unlink a buffer from a transaction. -+ * -+ * Called with journal_datalist_lock held. -+ */ -+ -+static inline void __buffer_unlink(struct journal_head *jh) -+{ -+ transaction_t *transaction; -+ -+ transaction = jh->b_cp_transaction; -+ jh->b_cp_transaction = NULL; -+ -+ jh->b_cpnext->b_cpprev = jh->b_cpprev; -+ jh->b_cpprev->b_cpnext = jh->b_cpnext; -+ if (transaction->t_checkpoint_list == jh) -+ transaction->t_checkpoint_list = jh->b_cpnext; -+ if (transaction->t_checkpoint_list == jh) -+ transaction->t_checkpoint_list = NULL; -+} -+ -+/* -+ * Try to release a checkpointed buffer from its transaction. -+ * Returns 1 if we released it. 
-+ * Requires journal_datalist_lock -+ */ -+static int __try_to_free_cp_buf(struct journal_head *jh) -+{ -+ int ret = 0; -+ struct buffer_head *bh = jh2bh(jh); -+ -+ if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { -+ JBUFFER_TRACE(jh, "remove from checkpoint list"); -+ __journal_remove_checkpoint(jh); -+ __journal_remove_journal_head(bh); -+ BUFFER_TRACE(bh, "release"); -+ /* BUF_LOCKED -> BUF_CLEAN (fwiw) */ -+ refile_buffer(bh); -+ __brelse(bh); -+ ret = 1; -+ } -+ return ret; -+} -+ -+/* -+ * log_wait_for_space: wait until there is space in the journal. -+ * -+ * Called with the journal already locked, but it will be unlocked if we have -+ * to wait for a checkpoint to free up some space in the log. -+ */ -+ -+void log_wait_for_space(journal_t *journal, int nblocks) -+{ -+ while (log_space_left(journal) < nblocks) { -+ if (journal->j_flags & JFS_ABORT) -+ return; -+ unlock_journal(journal); -+ down(&journal->j_checkpoint_sem); -+ lock_journal(journal); -+ -+ /* Test again, another process may have checkpointed -+ * while we were waiting for the checkpoint lock */ -+ if (log_space_left(journal) < nblocks) { -+ log_do_checkpoint(journal, nblocks); -+ } -+ up(&journal->j_checkpoint_sem); -+ } -+} -+ -+/* -+ * Clean up a transaction's checkpoint list. -+ * -+ * We wait for any pending IO to complete and make sure any clean -+ * buffers are removed from the transaction. -+ * -+ * Return 1 if we performed any actions which might have destroyed the -+ * checkpoint. (journal_remove_checkpoint() deletes the transaction when -+ * the last checkpoint buffer is cleansed) -+ * -+ * Called with the journal locked. -+ * Called with journal_datalist_lock held. 
-+ */ -+static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) -+{ -+ struct journal_head *jh, *next_jh, *last_jh; -+ struct buffer_head *bh; -+ int ret = 0; -+ -+ assert_spin_locked(&journal_datalist_lock); -+ jh = transaction->t_checkpoint_list; -+ if (!jh) -+ return 0; -+ -+ last_jh = jh->b_cpprev; -+ next_jh = jh; -+ do { -+ jh = next_jh; -+ bh = jh2bh(jh); -+ if (buffer_locked(bh)) { -+ atomic_inc(&bh->b_count); -+ spin_unlock(&journal_datalist_lock); -+ unlock_journal(journal); -+ wait_on_buffer(bh); -+ /* the journal_head may have gone by now */ -+ BUFFER_TRACE(bh, "brelse"); -+ __brelse(bh); -+ goto out_return_1; -+ } -+ -+ if (jh->b_transaction != NULL) { -+ transaction_t *transaction = jh->b_transaction; -+ tid_t tid = transaction->t_tid; -+ -+ spin_unlock(&journal_datalist_lock); -+ log_start_commit(journal, transaction); -+ unlock_journal(journal); -+ log_wait_commit(journal, tid); -+ goto out_return_1; -+ } -+ -+ /* -+ * We used to test for (jh->b_list != BUF_CLEAN) here. -+ * But unmap_underlying_metadata() can place buffer onto -+ * BUF_CLEAN. Since refile_buffer() no longer takes buffers -+ * off checkpoint lists, we cope with it here -+ */ -+ /* -+ * AKPM: I think the buffer_jdirty test is redundant - it -+ * shouldn't have NULL b_transaction? 
-+ */ -+ next_jh = jh->b_cpnext; -+ if (!buffer_dirty(bh) && !buffer_jdirty(bh)) { -+ BUFFER_TRACE(bh, "remove from checkpoint"); -+ __journal_remove_checkpoint(jh); -+ __journal_remove_journal_head(bh); -+ refile_buffer(bh); -+ __brelse(bh); -+ ret = 1; -+ } -+ -+ jh = next_jh; -+ } while (jh != last_jh); -+ -+ return ret; -+out_return_1: -+ lock_journal(journal); -+ spin_lock(&journal_datalist_lock); -+ return 1; -+} -+ -+#define NR_BATCH 64 -+ -+static void __flush_batch(struct buffer_head **bhs, int *batch_count) -+{ -+ int i; -+ -+ spin_unlock(&journal_datalist_lock); -+ ll_rw_block(WRITE, *batch_count, bhs); -+ run_task_queue(&tq_disk); -+ spin_lock(&journal_datalist_lock); -+ for (i = 0; i < *batch_count; i++) { -+ struct buffer_head *bh = bhs[i]; -+ clear_bit(BH_JWrite, &bh->b_state); -+ BUFFER_TRACE(bh, "brelse"); -+ __brelse(bh); -+ } -+ *batch_count = 0; -+} -+ -+/* -+ * Try to flush one buffer from the checkpoint list to disk. -+ * -+ * Return 1 if something happened which requires us to abort the current -+ * scan of the checkpoint list. -+ * -+ * Called with journal_datalist_lock held. -+ */ -+static int __flush_buffer(journal_t *journal, struct journal_head *jh, -+ struct buffer_head **bhs, int *batch_count, -+ int *drop_count) -+{ -+ struct buffer_head *bh = jh2bh(jh); -+ int ret = 0; -+ -+ if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { -+ J_ASSERT_JH(jh, jh->b_transaction == NULL); -+ -+ /* -+ * Important: we are about to write the buffer, and -+ * possibly block, while still holding the journal lock. -+ * We cannot afford to let the transaction logic start -+ * messing around with this buffer before we write it to -+ * disk, as that would break recoverability. 
-+ */ -+ BUFFER_TRACE(bh, "queue"); -+ atomic_inc(&bh->b_count); -+ J_ASSERT_BH(bh, !test_bit(BH_JWrite, &bh->b_state)); -+ set_bit(BH_JWrite, &bh->b_state); -+ bhs[*batch_count] = bh; -+ (*batch_count)++; -+ if (*batch_count == NR_BATCH) { -+ __flush_batch(bhs, batch_count); -+ ret = 1; -+ } -+ } else { -+ int last_buffer = 0; -+ if (jh->b_cpnext == jh) { -+ /* We may be about to drop the transaction. Tell the -+ * caller that the lists have changed. -+ */ -+ last_buffer = 1; -+ } -+ if (__try_to_free_cp_buf(jh)) { -+ (*drop_count)++; -+ ret = last_buffer; -+ } -+ } -+ return ret; -+} -+ -+ -+/* -+ * Perform an actual checkpoint. We don't write out only enough to -+ * satisfy the current blocked requests: rather we submit a reasonably -+ * sized chunk of the outstanding data to disk at once for -+ * efficiency. log_wait_for_space() will retry if we didn't free enough. -+ * -+ * However, we _do_ take into account the amount requested so that once -+ * the IO has been queued, we can return as soon as enough of it has -+ * completed to disk. -+ * -+ * The journal should be locked before calling this function. -+ */ -+ -+/* @@@ `nblocks' is unused. Should it be used? */ -+int log_do_checkpoint (journal_t *journal, int nblocks) -+{ -+ transaction_t *transaction, *last_transaction, *next_transaction; -+ int result; -+ int target; -+ int batch_count = 0; -+ struct buffer_head *bhs[NR_BATCH]; -+ -+ jbd_debug(1, "Start checkpoint\n"); -+ -+ /* -+ * First thing: if there are any transactions in the log which -+ * don't need checkpointing, just eliminate them from the -+ * journal straight away. -+ */ -+ result = cleanup_journal_tail(journal); -+ jbd_debug(1, "cleanup_journal_tail returned %d\n", result); -+ if (result <= 0) -+ return result; -+ -+ /* -+ * OK, we need to start writing disk blocks. Try to free up a -+ * quarter of the log in a single checkpoint if we can. -+ */ -+ /* -+ * AKPM: check this code. 
I had a feeling a while back that it -+ * degenerates into a busy loop at unmount time. -+ */ -+ target = (journal->j_last - journal->j_first) / 4; -+ -+ spin_lock(&journal_datalist_lock); -+repeat: -+ transaction = journal->j_checkpoint_transactions; -+ if (transaction == NULL) -+ goto done; -+ last_transaction = transaction->t_cpprev; -+ next_transaction = transaction; -+ -+ do { -+ struct journal_head *jh, *last_jh, *next_jh; -+ int drop_count = 0; -+ int cleanup_ret, retry = 0; -+ -+ transaction = next_transaction; -+ next_transaction = transaction->t_cpnext; -+ jh = transaction->t_checkpoint_list; -+ last_jh = jh->b_cpprev; -+ next_jh = jh; -+ do { -+ jh = next_jh; -+ next_jh = jh->b_cpnext; -+ retry = __flush_buffer(journal, jh, bhs, &batch_count, -+ &drop_count); -+ } while (jh != last_jh && !retry); -+ if (batch_count) { -+ __flush_batch(bhs, &batch_count); -+ goto repeat; -+ } -+ if (retry) -+ goto repeat; -+ /* -+ * We have walked the whole transaction list without -+ * finding anything to write to disk. We had better be -+ * able to make some progress or we are in trouble. -+ */ -+ cleanup_ret = __cleanup_transaction(journal, transaction); -+ J_ASSERT(drop_count != 0 || cleanup_ret != 0); -+ goto repeat; /* __cleanup may have dropped lock */ -+ } while (transaction != last_transaction); -+ -+done: -+ spin_unlock(&journal_datalist_lock); -+ result = cleanup_journal_tail(journal); -+ if (result < 0) -+ return result; -+ -+ return 0; -+} -+ -+/* -+ * Check the list of checkpoint transactions for the journal to see if -+ * we have already got rid of any since the last update of the log tail -+ * in the journal superblock. If so, we can instantly roll the -+ * superblock forward to remove those transactions from the log. -+ * -+ * Return <0 on error, 0 on success, 1 if there was nothing to clean up. -+ * -+ * Called with the journal lock held. -+ * -+ * This is the only part of the journaling code which really needs to be -+ * aware of transaction aborts. 
Checkpointing involves writing to the -+ * main filesystem area rather than to the journal, so it can proceed -+ * even in abort state, but we must not update the journal superblock if -+ * we have an abort error outstanding. -+ */ -+ -+int cleanup_journal_tail(journal_t *journal) -+{ -+ transaction_t * transaction; -+ tid_t first_tid; -+ unsigned long blocknr, freed; -+ -+ /* OK, work out the oldest transaction remaining in the log, and -+ * the log block it starts at. -+ * -+ * If the log is now empty, we need to work out which is the -+ * next transaction ID we will write, and where it will -+ * start. */ -+ -+ /* j_checkpoint_transactions needs locking */ -+ spin_lock(&journal_datalist_lock); -+ transaction = journal->j_checkpoint_transactions; -+ if (transaction) { -+ first_tid = transaction->t_tid; -+ blocknr = transaction->t_log_start; -+ } else if ((transaction = journal->j_committing_transaction) != NULL) { -+ first_tid = transaction->t_tid; -+ blocknr = transaction->t_log_start; -+ } else if ((transaction = journal->j_running_transaction) != NULL) { -+ first_tid = transaction->t_tid; -+ blocknr = journal->j_head; -+ } else { -+ first_tid = journal->j_transaction_sequence; -+ blocknr = journal->j_head; -+ } -+ spin_unlock(&journal_datalist_lock); -+ J_ASSERT (blocknr != 0); -+ -+ /* If the oldest pinned transaction is at the tail of the log -+ already then there's not much we can do right now. */ -+ if (journal->j_tail_sequence == first_tid) -+ return 1; -+ -+ /* OK, update the superblock to recover the freed space. -+ * Physical blocks come first: have we wrapped beyond the end of -+ * the log? 
*/ -+ freed = blocknr - journal->j_tail; -+ if (blocknr < journal->j_tail) -+ freed = freed + journal->j_last - journal->j_first; -+ -+ jbd_debug(1, -+ "Cleaning journal tail from %d to %d (offset %lu), " -+ "freeing %lu\n", -+ journal->j_tail_sequence, first_tid, blocknr, freed); -+ -+ journal->j_free += freed; -+ journal->j_tail_sequence = first_tid; -+ journal->j_tail = blocknr; -+ if (!(journal->j_flags & JFS_ABORT)) -+ journal_update_superblock(journal, 1); -+ return 0; -+} -+ -+ -+/* Checkpoint list management */ -+ -+/* -+ * journal_clean_checkpoint_list -+ * -+ * Find all the written-back checkpoint buffers in the journal and release them. -+ * -+ * Called with the journal locked. -+ * Called with journal_datalist_lock held. -+ * Returns number of bufers reaped (for debug) -+ */ -+ -+int __journal_clean_checkpoint_list(journal_t *journal) -+{ -+ transaction_t *transaction, *last_transaction, *next_transaction; -+ int ret = 0; -+ -+ transaction = journal->j_checkpoint_transactions; -+ if (transaction == 0) -+ goto out; -+ -+ last_transaction = transaction->t_cpprev; -+ next_transaction = transaction; -+ do { -+ struct journal_head *jh; -+ -+ transaction = next_transaction; -+ next_transaction = transaction->t_cpnext; -+ jh = transaction->t_checkpoint_list; -+ if (jh) { -+ struct journal_head *last_jh = jh->b_cpprev; -+ struct journal_head *next_jh = jh; -+ do { -+ jh = next_jh; -+ next_jh = jh->b_cpnext; -+ ret += __try_to_free_cp_buf(jh); -+ } while (jh != last_jh); -+ } -+ } while (transaction != last_transaction); -+out: -+ return ret; -+} -+ -+/* -+ * journal_remove_checkpoint: called after a buffer has been committed -+ * to disk (either by being write-back flushed to disk, or being -+ * committed to the log). -+ * -+ * We cannot safely clean a transaction out of the log until all of the -+ * buffer updates committed in that transaction have safely been stored -+ * elsewhere on disk. 
To achieve this, all of the buffers in a -+ * transaction need to be maintained on the transaction's checkpoint -+ * list until they have been rewritten, at which point this function is -+ * called to remove the buffer from the existing transaction's -+ * checkpoint list. -+ * -+ * This function is called with the journal locked. -+ * This function is called with journal_datalist_lock held. -+ */ -+ -+void __journal_remove_checkpoint(struct journal_head *jh) -+{ -+ transaction_t *transaction; -+ journal_t *journal; -+ -+ JBUFFER_TRACE(jh, "entry"); -+ -+ if ((transaction = jh->b_cp_transaction) == NULL) { -+ JBUFFER_TRACE(jh, "not on transaction"); -+ goto out; -+ } -+ -+ journal = transaction->t_journal; -+ -+ __buffer_unlink(jh); -+ -+ if (transaction->t_checkpoint_list != NULL) -+ goto out; -+ JBUFFER_TRACE(jh, "transaction has no more buffers"); -+ -+ /* There is one special case to worry about: if we have just -+ pulled the buffer off a committing transaction's forget list, -+ then even if the checkpoint list is empty, the transaction -+ obviously cannot be dropped! */ -+ -+ if (transaction == journal->j_committing_transaction) { -+ JBUFFER_TRACE(jh, "belongs to committing transaction"); -+ goto out; -+ } -+ -+ /* OK, that was the last buffer for the transaction: we can now -+ safely remove this transaction from the log */ -+ -+ __journal_drop_transaction(journal, transaction); -+ -+ /* Just in case anybody was waiting for more transactions to be -+ checkpointed... */ -+ wake_up(&journal->j_wait_logspace); -+out: -+ JBUFFER_TRACE(jh, "exit"); -+} -+ -+void journal_remove_checkpoint(struct journal_head *jh) -+{ -+ spin_lock(&journal_datalist_lock); -+ __journal_remove_checkpoint(jh); -+ spin_unlock(&journal_datalist_lock); -+} -+ -+/* -+ * journal_insert_checkpoint: put a committed buffer onto a checkpoint -+ * list so that we know when it is safe to clean the transaction out of -+ * the log. -+ * -+ * Called with the journal locked. 
-+ * Called with journal_datalist_lock held. -+ */ -+void __journal_insert_checkpoint(struct journal_head *jh, -+ transaction_t *transaction) -+{ -+ JBUFFER_TRACE(jh, "entry"); -+ J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jdirty(jh2bh(jh))); -+ J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); -+ -+ assert_spin_locked(&journal_datalist_lock); -+ jh->b_cp_transaction = transaction; -+ -+ if (!transaction->t_checkpoint_list) { -+ jh->b_cpnext = jh->b_cpprev = jh; -+ } else { -+ jh->b_cpnext = transaction->t_checkpoint_list; -+ jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; -+ jh->b_cpprev->b_cpnext = jh; -+ jh->b_cpnext->b_cpprev = jh; -+ } -+ transaction->t_checkpoint_list = jh; -+} -+ -+void journal_insert_checkpoint(struct journal_head *jh, -+ transaction_t *transaction) -+{ -+ spin_lock(&journal_datalist_lock); -+ __journal_insert_checkpoint(jh, transaction); -+ spin_unlock(&journal_datalist_lock); -+} -+ -+/* -+ * We've finished with this transaction structure: adios... -+ * -+ * The transaction must have no links except for the checkpoint by this -+ * point. -+ * -+ * Called with the journal locked. -+ * Called with journal_datalist_lock held. 
-+ */ -+ -+void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) -+{ -+ assert_spin_locked(&journal_datalist_lock); -+ if (transaction->t_cpnext) { -+ transaction->t_cpnext->t_cpprev = transaction->t_cpprev; -+ transaction->t_cpprev->t_cpnext = transaction->t_cpnext; -+ if (journal->j_checkpoint_transactions == transaction) -+ journal->j_checkpoint_transactions = -+ transaction->t_cpnext; -+ if (journal->j_checkpoint_transactions == transaction) -+ journal->j_checkpoint_transactions = NULL; -+ } -+ -+ J_ASSERT (transaction->t_ilist == NULL); -+ J_ASSERT (transaction->t_buffers == NULL); -+ J_ASSERT (transaction->t_sync_datalist == NULL); -+ J_ASSERT (transaction->t_async_datalist == NULL); -+ J_ASSERT (transaction->t_forget == NULL); -+ J_ASSERT (transaction->t_iobuf_list == NULL); -+ J_ASSERT (transaction->t_shadow_list == NULL); -+ J_ASSERT (transaction->t_log_list == NULL); -+ J_ASSERT (transaction->t_checkpoint_list == NULL); -+ J_ASSERT (transaction->t_updates == 0); -+ -+ J_ASSERT (transaction->t_journal->j_committing_transaction != -+ transaction); -+ -+ jbd_debug (1, "Dropping transaction %d, all done\n", -+ transaction->t_tid); -+ kfree (transaction); -+} -+ -diff -ruP linux.mcp2/fs/jbd/commit.c linuxppc_2.4.19_final/fs/jbd/commit.c ---- linux.mcp2/fs/jbd/commit.c 1969-12-31 16:00:00.000000000 -0800 -+++ linuxppc_2.4.19_final/fs/jbd/commit.c 2004-05-17 13:56:17.000000000 -0700 -@@ -0,0 +1,719 @@ -+/* -+ * linux/fs/commit.c -+ * -+ * Written by Stephen C. Tweedie , 1998 -+ * -+ * Copyright 1998 Red Hat corp --- All Rights Reserved -+ * -+ * This file is part of the Linux kernel and is made available under -+ * the terms of the GNU General Public License, version 2, or at your -+ * option, any later version, incorporated herein by reference. -+ * -+ * Journal commit routines for the generic filesystem journaling code; -+ * part of the ext2fs journaling system. 
-+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+extern spinlock_t journal_datalist_lock; -+ -+/* -+ * Default IO end handler for temporary BJ_IO buffer_heads. -+ */ -+void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) -+{ -+ BUFFER_TRACE(bh, ""); -+ mark_buffer_uptodate(bh, uptodate); -+ unlock_buffer(bh); -+} -+ -+/* -+ * journal_commit_transaction -+ * -+ * The primary function for committing a transaction to the log. This -+ * function is called by the journal thread to begin a complete commit. -+ */ -+void journal_commit_transaction(journal_t *journal) -+{ -+ transaction_t *commit_transaction; -+ struct journal_head *jh, *new_jh, *descriptor; -+ struct journal_head *next_jh, *last_jh; -+ struct buffer_head *wbuf[64]; -+ int bufs; -+ int flags; -+ int err; -+ unsigned long blocknr; -+ char *tagp = NULL; -+ journal_header_t *header; -+ journal_block_tag_t *tag = NULL; -+ int space_left = 0; -+ int first_tag = 0; -+ int tag_flag; -+ int i; -+ -+ /* -+ * First job: lock down the current transaction and wait for -+ * all outstanding updates to complete. 
-+ */ -+ -+ lock_journal(journal); /* Protect journal->j_running_transaction */ -+ -+#ifdef COMMIT_STATS -+ spin_lock(&journal_datalist_lock); -+ summarise_journal_usage(journal); -+ spin_unlock(&journal_datalist_lock); -+#endif -+ -+ lock_kernel(); -+ -+ J_ASSERT (journal->j_running_transaction != NULL); -+ J_ASSERT (journal->j_committing_transaction == NULL); -+ -+ commit_transaction = journal->j_running_transaction; -+ J_ASSERT (commit_transaction->t_state == T_RUNNING); -+ -+ jbd_debug (1, "JBD: starting commit of transaction %d\n", -+ commit_transaction->t_tid); -+ -+ commit_transaction->t_state = T_LOCKED; -+ while (commit_transaction->t_updates != 0) { -+ unlock_journal(journal); -+ sleep_on(&journal->j_wait_updates); -+ lock_journal(journal); -+ } -+ -+ J_ASSERT (commit_transaction->t_outstanding_credits <= -+ journal->j_max_transaction_buffers); -+ -+ /* Do we need to erase the effects of a prior journal_flush? */ -+ if (journal->j_flags & JFS_FLUSHED) { -+ jbd_debug(3, "super block updated\n"); -+ journal_update_superblock(journal, 1); -+ } else { -+ jbd_debug(3, "superblock not updated\n"); -+ } -+ -+ /* -+ * First thing we are allowed to do is to discard any remaining -+ * BJ_Reserved buffers. Note, it is _not_ permissible to assume -+ * that there are no such buffers: if a large filesystem -+ * operation like a truncate needs to split itself over multiple -+ * transactions, then it may try to do a journal_restart() while -+ * there are still BJ_Reserved buffers outstanding. These must -+ * be released cleanly from the current transaction. -+ * -+ * In this case, the filesystem must still reserve write access -+ * again before modifying the buffer in the new transaction, but -+ * we do not require it to remember exactly which old buffers it -+ * has reserved. This is consistent with the existing behaviour -+ * that multiple journal_get_write_access() calls to the same -+ * buffer are perfectly permissable. 
-+ */ -+ -+ while (commit_transaction->t_reserved_list) { -+ jh = commit_transaction->t_reserved_list; -+ JBUFFER_TRACE(jh, "reserved, unused: refile"); -+ journal_refile_buffer(jh); -+ } -+ -+ /* -+ * Now try to drop any written-back buffers from the journal's -+ * checkpoint lists. We do this *before* commit because it potentially -+ * frees some memory -+ */ -+ spin_lock(&journal_datalist_lock); -+ __journal_clean_checkpoint_list(journal); -+ spin_unlock(&journal_datalist_lock); -+ -+ /* First part of the commit: force the revoke list out to disk. -+ * The revoke code generates its own metadata blocks on disk for this. -+ * -+ * It is important that we do this while the transaction is -+ * still locked. Generating the revoke records should not -+ * generate any IO stalls, so this should be quick; and doing -+ * the work while we have the transaction locked means that we -+ * only ever have to maintain the revoke list for one -+ * transaction at a time. -+ */ -+ -+ jbd_debug (3, "JBD: commit phase 1\n"); -+ -+ journal_write_revoke_records(journal, commit_transaction); -+ -+ /* -+ * Now that we have built the revoke records, we can start -+ * reusing the revoke list for a new running transaction. We -+ * can now safely start committing the old transaction: time to -+ * get a new running transaction for incoming filesystem updates -+ */ -+ -+ commit_transaction->t_state = T_FLUSH; -+ -+ wake_up(&journal->j_wait_transaction_locked); -+ -+ journal->j_committing_transaction = commit_transaction; -+ journal->j_running_transaction = NULL; -+ -+ commit_transaction->t_log_start = journal->j_head; -+ -+ unlock_kernel(); -+ -+ jbd_debug (3, "JBD: commit phase 2\n"); -+ -+ /* -+ * Now start flushing things to disk, in the order they appear -+ * on the transaction lists. Data blocks go first. 
-+ */ -+ -+ /* -+ * Whenever we unlock the journal and sleep, things can get added -+ * onto ->t_datalist, so we have to keep looping back to write_out_data -+ * until we *know* that the list is empty. -+ */ -+write_out_data: -+ -+ /* -+ * Cleanup any flushed data buffers from the data list. Even in -+ * abort mode, we want to flush this out as soon as possible. -+ * -+ * We take journal_datalist_lock to protect the lists from -+ * journal_try_to_free_buffers(). -+ */ -+ spin_lock(&journal_datalist_lock); -+ -+write_out_data_locked: -+ bufs = 0; -+ next_jh = commit_transaction->t_sync_datalist; -+ if (next_jh == NULL) -+ goto sync_datalist_empty; -+ last_jh = next_jh->b_tprev; -+ -+ do { -+ struct buffer_head *bh; -+ -+ jh = next_jh; -+ next_jh = jh->b_tnext; -+ bh = jh2bh(jh); -+ if (!buffer_locked(bh)) { -+ if (buffer_dirty(bh)) { -+ BUFFER_TRACE(bh, "start journal writeout"); -+ atomic_inc(&bh->b_count); -+ wbuf[bufs++] = bh; -+ } else { -+ BUFFER_TRACE(bh, "writeout complete: unfile"); -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = NULL; -+ __journal_remove_journal_head(bh); -+ refile_buffer(bh); -+ __brelse(bh); -+ } -+ } -+ if (bufs == ARRAY_SIZE(wbuf)) { -+ /* -+ * Major speedup: start here on the next scan -+ */ -+ J_ASSERT(commit_transaction->t_sync_datalist != 0); -+ commit_transaction->t_sync_datalist = jh; -+ break; -+ } -+ } while (jh != last_jh); -+ -+ if (bufs || current->need_resched) { -+ jbd_debug(2, "submit %d writes\n", bufs); -+ spin_unlock(&journal_datalist_lock); -+ unlock_journal(journal); -+ if (bufs) -+ ll_rw_block(WRITE, bufs, wbuf); -+ if (current->need_resched) -+ schedule(); -+ journal_brelse_array(wbuf, bufs); -+ lock_journal(journal); -+ spin_lock(&journal_datalist_lock); -+ if (bufs) -+ goto write_out_data_locked; -+ } -+ -+ /* -+ * Wait for all previously submitted IO on the data list to complete. 
-+ */ -+ jh = commit_transaction->t_sync_datalist; -+ if (jh == NULL) -+ goto sync_datalist_empty; -+ -+ do { -+ struct buffer_head *bh; -+ jh = jh->b_tprev; /* Wait on the last written */ -+ bh = jh2bh(jh); -+ if (buffer_locked(bh)) { -+ spin_unlock(&journal_datalist_lock); -+ unlock_journal(journal); -+ wait_on_buffer(bh); -+ /* the journal_head may have been removed now */ -+ lock_journal(journal); -+ goto write_out_data; -+ } else if (buffer_dirty(bh)) { -+ goto write_out_data_locked; -+ } -+ } while (jh != commit_transaction->t_sync_datalist); -+ goto write_out_data_locked; -+ -+sync_datalist_empty: -+ /* -+ * Wait for all the async writepage data. As they become unlocked -+ * in end_buffer_io_async(), the only place where they can be -+ * reaped is in try_to_free_buffers(), and we're locked against -+ * that. -+ */ -+ while ((jh = commit_transaction->t_async_datalist)) { -+ struct buffer_head *bh = jh2bh(jh); -+ if (buffer_locked(bh)) { -+ spin_unlock(&journal_datalist_lock); -+ unlock_journal(journal); -+ wait_on_buffer(bh); -+ lock_journal(journal); -+ spin_lock(&journal_datalist_lock); -+ continue; /* List may have changed */ -+ } -+ if (jh->b_next_transaction) { -+ /* -+ * For writepage() buffers in journalled data mode: a -+ * later transaction may want the buffer for "metadata" -+ */ -+ __journal_refile_buffer(jh); -+ } else { -+ BUFFER_TRACE(bh, "finished async writeout: unfile"); -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = NULL; -+ __journal_remove_journal_head(bh); -+ BUFFER_TRACE(bh, "finished async writeout: refile"); -+ /* It can sometimes be on BUF_LOCKED due to migration -+ * from syncdata to asyncdata */ -+ if (bh->b_list != BUF_CLEAN) -+ refile_buffer(bh); -+ __brelse(bh); -+ } -+ } -+ spin_unlock(&journal_datalist_lock); -+ -+ /* -+ * If we found any dirty or locked buffers, then we should have -+ * looped back up to the write_out_data label. 
If there weren't -+ * any then journal_clean_data_list should have wiped the list -+ * clean by now, so check that it is in fact empty. -+ */ -+ J_ASSERT (commit_transaction->t_sync_datalist == NULL); -+ J_ASSERT (commit_transaction->t_async_datalist == NULL); -+ -+ jbd_debug (3, "JBD: commit phase 3\n"); -+ -+ /* -+ * Way to go: we have now written out all of the data for a -+ * transaction! Now comes the tricky part: we need to write out -+ * metadata. Loop over the transaction's entire buffer list: -+ */ -+ commit_transaction->t_state = T_COMMIT; -+ -+ descriptor = 0; -+ bufs = 0; -+ while (commit_transaction->t_buffers) { -+ -+ /* Find the next buffer to be journaled... */ -+ -+ jh = commit_transaction->t_buffers; -+ -+ /* If we're in abort mode, we just un-journal the buffer and -+ release it for background writing. */ -+ -+ if (is_journal_aborted(journal)) { -+ JBUFFER_TRACE(jh, "journal is aborting: refile"); -+ journal_refile_buffer(jh); -+ /* If that was the last one, we need to clean up -+ * any descriptor buffers which may have been -+ * already allocated, even if we are now -+ * aborting. */ -+ if (!commit_transaction->t_buffers) -+ goto start_journal_io; -+ continue; -+ } -+ -+ /* Make sure we have a descriptor block in which to -+ record the metadata buffer. 
*/ -+ -+ if (!descriptor) { -+ struct buffer_head *bh; -+ -+ J_ASSERT (bufs == 0); -+ -+ jbd_debug(4, "JBD: get descriptor\n"); -+ -+ descriptor = journal_get_descriptor_buffer(journal); -+ if (!descriptor) { -+ __journal_abort_hard(journal); -+ continue; -+ } -+ -+ bh = jh2bh(descriptor); -+ jbd_debug(4, "JBD: got buffer %ld (%p)\n", -+ bh->b_blocknr, bh->b_data); -+ header = (journal_header_t *)&bh->b_data[0]; -+ header->h_magic = htonl(JFS_MAGIC_NUMBER); -+ header->h_blocktype = htonl(JFS_DESCRIPTOR_BLOCK); -+ header->h_sequence = htonl(commit_transaction->t_tid); -+ -+ tagp = &bh->b_data[sizeof(journal_header_t)]; -+ space_left = bh->b_size - sizeof(journal_header_t); -+ first_tag = 1; -+ set_bit(BH_JWrite, &bh->b_state); -+ wbuf[bufs++] = bh; -+ -+ /* Record it so that we can wait for IO -+ completion later */ -+ BUFFER_TRACE(bh, "ph3: file as descriptor"); -+ journal_file_buffer(descriptor, commit_transaction, -+ BJ_LogCtl); -+ } -+ -+ /* Where is the buffer to be written? */ -+ -+ err = journal_next_log_block(journal, &blocknr); -+ /* If the block mapping failed, just abandon the buffer -+ and repeat this loop: we'll fall into the -+ refile-on-abort condition above. */ -+ if (err) { -+ __journal_abort_hard(journal); -+ continue; -+ } -+ -+ /* Bump b_count to prevent truncate from stumbling over -+ the shadowed buffer! @@@ This can go if we ever get -+ rid of the BJ_IO/BJ_Shadow pairing of buffers. */ -+ atomic_inc(&jh2bh(jh)->b_count); -+ -+ /* Make a temporary IO buffer with which to write it out -+ (this will requeue both the metadata buffer and the -+ temporary IO buffer). new_bh goes on BJ_IO*/ -+ -+ set_bit(BH_JWrite, &jh2bh(jh)->b_state); -+ /* -+ * akpm: journal_write_metadata_buffer() sets -+ * new_bh->b_transaction to commit_transaction. 
-+ * We need to clean this up before we release new_bh -+ * (which is of type BJ_IO) -+ */ -+ JBUFFER_TRACE(jh, "ph3: write metadata"); -+ flags = journal_write_metadata_buffer(commit_transaction, -+ jh, &new_jh, blocknr); -+ set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); -+ set_bit(BH_Lock, &jh2bh(new_jh)->b_state); -+ wbuf[bufs++] = jh2bh(new_jh); -+ -+ /* Record the new block's tag in the current descriptor -+ buffer */ -+ -+ tag_flag = 0; -+ if (flags & 1) -+ tag_flag |= JFS_FLAG_ESCAPE; -+ if (!first_tag) -+ tag_flag |= JFS_FLAG_SAME_UUID; -+ -+ tag = (journal_block_tag_t *) tagp; -+ tag->t_blocknr = htonl(jh2bh(jh)->b_blocknr); -+ tag->t_flags = htonl(tag_flag); -+ tagp += sizeof(journal_block_tag_t); -+ space_left -= sizeof(journal_block_tag_t); -+ -+ if (first_tag) { -+ memcpy (tagp, journal->j_uuid, 16); -+ tagp += 16; -+ space_left -= 16; -+ first_tag = 0; -+ } -+ -+ /* If there's no more to do, or if the descriptor is full, -+ let the IO rip! */ -+ -+ if (bufs == ARRAY_SIZE(wbuf) || -+ commit_transaction->t_buffers == NULL || -+ space_left < sizeof(journal_block_tag_t) + 16) { -+ -+ jbd_debug(4, "JBD: Submit %d IOs\n", bufs); -+ -+ /* Write an end-of-descriptor marker before -+ submitting the IOs. "tag" still points to -+ the last tag we set up. */ -+ -+ tag->t_flags |= htonl(JFS_FLAG_LAST_TAG); -+ -+start_journal_io: -+ unlock_journal(journal); -+ for (i=0; ib_state); -+ bh->b_end_io = journal_end_buffer_io_sync; -+ submit_bh(WRITE, bh); -+ } -+ if (current->need_resched) -+ schedule(); -+ lock_journal(journal); -+ -+ /* Force a new descriptor to be generated next -+ time round the loop. */ -+ descriptor = NULL; -+ bufs = 0; -+ } -+ } -+ -+ /* Lo and behold: we have just managed to send a transaction to -+ the log. Before we can commit it, wait for the IO so far to -+ complete. Control buffers being written are on the -+ transaction's t_log_list queue, and metadata buffers are on -+ the t_iobuf_list queue. -+ -+ Wait for the transactions in reverse order. 
That way we are -+ less likely to be woken up until all IOs have completed, and -+ so we incur less scheduling load. -+ */ -+ -+ jbd_debug(3, "JBD: commit phase 4\n"); -+ -+ /* akpm: these are BJ_IO, and journal_datalist_lock is not needed */ -+ wait_for_iobuf: -+ while (commit_transaction->t_iobuf_list != NULL) { -+ struct buffer_head *bh; -+ jh = commit_transaction->t_iobuf_list->b_tprev; -+ bh = jh2bh(jh); -+ if (buffer_locked(bh)) { -+ unlock_journal(journal); -+ wait_on_buffer(bh); -+ lock_journal(journal); -+ goto wait_for_iobuf; -+ } -+ -+ clear_bit(BH_JWrite, &jh2bh(jh)->b_state); -+ -+ JBUFFER_TRACE(jh, "ph4: unfile after journal write"); -+ journal_unfile_buffer(jh); -+ -+ /* -+ * akpm: don't put back a buffer_head with stale pointers -+ * dangling around. -+ */ -+ J_ASSERT_JH(jh, jh->b_transaction != NULL); -+ jh->b_transaction = NULL; -+ -+ /* -+ * ->t_iobuf_list should contain only dummy buffer_heads -+ * which were created by journal_write_metadata_buffer(). -+ */ -+ bh = jh2bh(jh); -+ BUFFER_TRACE(bh, "dumping temporary bh"); -+ journal_unlock_journal_head(jh); -+ __brelse(bh); -+ J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); -+ put_unused_buffer_head(bh); -+ -+ /* We also have to unlock and free the corresponding -+ shadowed buffer */ -+ jh = commit_transaction->t_shadow_list->b_tprev; -+ bh = jh2bh(jh); -+ clear_bit(BH_JWrite, &bh->b_state); -+ J_ASSERT_BH(bh, buffer_jdirty(bh)); -+ -+ /* The metadata is now released for reuse, but we need -+ to remember it against this transaction so that when -+ we finally commit, we can do any checkpointing -+ required. 
*/ -+ JBUFFER_TRACE(jh, "file as BJ_Forget"); -+ journal_file_buffer(jh, commit_transaction, BJ_Forget); -+ /* Wake up any transactions which were waiting for this -+ IO to complete */ -+ wake_up(&bh->b_wait); -+ JBUFFER_TRACE(jh, "brelse shadowed buffer"); -+ __brelse(bh); -+ } -+ -+ J_ASSERT (commit_transaction->t_shadow_list == NULL); -+ -+ jbd_debug(3, "JBD: commit phase 5\n"); -+ -+ /* Here we wait for the revoke record and descriptor record buffers */ -+ wait_for_ctlbuf: -+ while (commit_transaction->t_log_list != NULL) { -+ struct buffer_head *bh; -+ -+ jh = commit_transaction->t_log_list->b_tprev; -+ bh = jh2bh(jh); -+ if (buffer_locked(bh)) { -+ unlock_journal(journal); -+ wait_on_buffer(bh); -+ lock_journal(journal); -+ goto wait_for_ctlbuf; -+ } -+ -+ BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); -+ clear_bit(BH_JWrite, &bh->b_state); -+ journal_unfile_buffer(jh); -+ jh->b_transaction = NULL; -+ journal_unlock_journal_head(jh); -+ put_bh(bh); /* One for getblk */ -+ } -+ -+ jbd_debug(3, "JBD: commit phase 6\n"); -+ -+ if (is_journal_aborted(journal)) -+ goto skip_commit; -+ -+ /* Done it all: now write the commit record. We should have -+ * cleaned up our previous buffers by now, so if we are in abort -+ * mode we can now just skip the rest of the journal write -+ * entirely. */ -+ -+ descriptor = journal_get_descriptor_buffer(journal); -+ if (!descriptor) { -+ __journal_abort_hard(journal); -+ goto skip_commit; -+ } -+ -+ /* AKPM: buglet - add `i' to tmp! 
*/ -+ for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) { -+ journal_header_t *tmp = -+ (journal_header_t*)jh2bh(descriptor)->b_data; -+ tmp->h_magic = htonl(JFS_MAGIC_NUMBER); -+ tmp->h_blocktype = htonl(JFS_COMMIT_BLOCK); -+ tmp->h_sequence = htonl(commit_transaction->t_tid); -+ } -+ -+ unlock_journal(journal); -+ JBUFFER_TRACE(descriptor, "write commit block"); -+ { -+ struct buffer_head *bh = jh2bh(descriptor); -+ clear_bit(BH_Dirty, &bh->b_state); -+ bh->b_end_io = journal_end_buffer_io_sync; -+ submit_bh(WRITE, bh); -+ wait_on_buffer(bh); -+ put_bh(bh); /* One for getblk() */ -+ journal_unlock_journal_head(descriptor); -+ } -+ lock_journal(journal); -+ -+ /* End of a transaction! Finally, we can do checkpoint -+ processing: any buffers committed as a result of this -+ transaction can be removed from any checkpoint list it was on -+ before. */ -+ -+skip_commit: -+ -+ jbd_debug(3, "JBD: commit phase 7\n"); -+ -+ J_ASSERT(commit_transaction->t_sync_datalist == NULL); -+ J_ASSERT(commit_transaction->t_async_datalist == NULL); -+ J_ASSERT(commit_transaction->t_buffers == NULL); -+ J_ASSERT(commit_transaction->t_checkpoint_list == NULL); -+ J_ASSERT(commit_transaction->t_iobuf_list == NULL); -+ J_ASSERT(commit_transaction->t_shadow_list == NULL); -+ J_ASSERT(commit_transaction->t_log_list == NULL); -+ -+ while (commit_transaction->t_forget) { -+ transaction_t *cp_transaction; -+ struct buffer_head *bh; -+ -+ jh = commit_transaction->t_forget; -+ J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || -+ jh->b_transaction == journal->j_running_transaction); -+ -+ /* -+ * If there is undo-protected committed data against -+ * this buffer, then we can remove it now. If it is a -+ * buffer needing such protection, the old frozen_data -+ * field now points to a committed version of the -+ * buffer, so rotate that field to the new committed -+ * data. -+ * -+ * Otherwise, we can just throw away the frozen data now. 
-+ */ -+ if (jh->b_committed_data) { -+ kfree(jh->b_committed_data); -+ jh->b_committed_data = NULL; -+ if (jh->b_frozen_data) { -+ jh->b_committed_data = jh->b_frozen_data; -+ jh->b_frozen_data = NULL; -+ } -+ } else if (jh->b_frozen_data) { -+ kfree(jh->b_frozen_data); -+ jh->b_frozen_data = NULL; -+ } -+ -+ spin_lock(&journal_datalist_lock); -+ cp_transaction = jh->b_cp_transaction; -+ if (cp_transaction) { -+ JBUFFER_TRACE(jh, "remove from old cp transaction"); -+ J_ASSERT_JH(jh, commit_transaction != cp_transaction); -+ __journal_remove_checkpoint(jh); -+ } -+ -+ /* Only re-checkpoint the buffer_head if it is marked -+ * dirty. If the buffer was added to the BJ_Forget list -+ * by journal_forget, it may no longer be dirty and -+ * there's no point in keeping a checkpoint record for -+ * it. */ -+ bh = jh2bh(jh); -+ if (buffer_jdirty(bh)) { -+ JBUFFER_TRACE(jh, "add to new checkpointing trans"); -+ __journal_insert_checkpoint(jh, commit_transaction); -+ JBUFFER_TRACE(jh, "refile for checkpoint writeback"); -+ __journal_refile_buffer(jh); -+ } else { -+ J_ASSERT_BH(bh, !buffer_dirty(bh)); -+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL); -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = 0; -+ __journal_remove_journal_head(bh); -+ __brelse(bh); -+ } -+ spin_unlock(&journal_datalist_lock); -+ } -+ -+ /* Done with this transaction! 
*/ -+ -+ jbd_debug(3, "JBD: commit phase 8\n"); -+ -+ J_ASSERT (commit_transaction->t_state == T_COMMIT); -+ commit_transaction->t_state = T_FINISHED; -+ -+ J_ASSERT (commit_transaction == journal->j_committing_transaction); -+ journal->j_commit_sequence = commit_transaction->t_tid; -+ journal->j_committing_transaction = NULL; -+ -+ spin_lock(&journal_datalist_lock); -+ if (commit_transaction->t_checkpoint_list == NULL) { -+ __journal_drop_transaction(journal, commit_transaction); -+ } else { -+ if (journal->j_checkpoint_transactions == NULL) { -+ journal->j_checkpoint_transactions = commit_transaction; -+ commit_transaction->t_cpnext = commit_transaction; -+ commit_transaction->t_cpprev = commit_transaction; -+ } else { -+ commit_transaction->t_cpnext = -+ journal->j_checkpoint_transactions; -+ commit_transaction->t_cpprev = -+ commit_transaction->t_cpnext->t_cpprev; -+ commit_transaction->t_cpnext->t_cpprev = -+ commit_transaction; -+ commit_transaction->t_cpprev->t_cpnext = -+ commit_transaction; -+ } -+ } -+ spin_unlock(&journal_datalist_lock); -+ -+ jbd_debug(1, "JBD: commit %d complete, head %d\n", -+ journal->j_commit_sequence, journal->j_tail_sequence); -+ -+ unlock_journal(journal); -+ wake_up(&journal->j_wait_done_commit); -+} -diff -ruP linux.mcp2/fs/jbd/journal.c linuxppc_2.4.19_final/fs/jbd/journal.c ---- linux.mcp2/fs/jbd/journal.c 1969-12-31 16:00:00.000000000 -0800 -+++ linuxppc_2.4.19_final/fs/jbd/journal.c 2004-05-17 13:56:17.000000000 -0700 -@@ -0,0 +1,1877 @@ -+/* -+ * linux/fs/journal.c -+ * -+ * Written by Stephen C. Tweedie , 1998 -+ * -+ * Copyright 1998 Red Hat corp --- All Rights Reserved -+ * -+ * This file is part of the Linux kernel and is made available under -+ * the terms of the GNU General Public License, version 2, or at your -+ * option, any later version, incorporated herein by reference. -+ * -+ * Generic filesystem journal-writing code; part of the ext2fs -+ * journaling system. 
-+ * -+ * This file manages journals: areas of disk reserved for logging -+ * transactional updates. This includes the kernel journaling thread -+ * which is responsible for scheduling updates to the log. -+ * -+ * We do not actually manage the physical storage of the journal in this -+ * file: that is left to a per-journal policy function, which allows us -+ * to store the journal within a filesystem-specified area for ext2 -+ * journaling (ext2 can use a reserved inode for storing the log). -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+EXPORT_SYMBOL(journal_start); -+EXPORT_SYMBOL(journal_try_start); -+EXPORT_SYMBOL(journal_restart); -+EXPORT_SYMBOL(journal_extend); -+EXPORT_SYMBOL(journal_stop); -+EXPORT_SYMBOL(journal_lock_updates); -+EXPORT_SYMBOL(journal_unlock_updates); -+EXPORT_SYMBOL(journal_get_write_access); -+EXPORT_SYMBOL(journal_get_create_access); -+EXPORT_SYMBOL(journal_get_undo_access); -+EXPORT_SYMBOL(journal_dirty_data); -+EXPORT_SYMBOL(journal_dirty_metadata); -+#if 0 -+EXPORT_SYMBOL(journal_release_buffer); -+#endif -+EXPORT_SYMBOL(journal_forget); -+#if 0 -+EXPORT_SYMBOL(journal_sync_buffer); -+#endif -+EXPORT_SYMBOL(journal_flush); -+EXPORT_SYMBOL(journal_revoke); -+ -+EXPORT_SYMBOL(journal_init_dev); -+EXPORT_SYMBOL(journal_init_inode); -+EXPORT_SYMBOL(journal_update_format); -+EXPORT_SYMBOL(journal_check_used_features); -+EXPORT_SYMBOL(journal_check_available_features); -+EXPORT_SYMBOL(journal_set_features); -+EXPORT_SYMBOL(journal_create); -+EXPORT_SYMBOL(journal_load); -+EXPORT_SYMBOL(journal_destroy); -+EXPORT_SYMBOL(journal_recover); -+EXPORT_SYMBOL(journal_update_superblock); -+EXPORT_SYMBOL(journal_abort); -+EXPORT_SYMBOL(journal_errno); -+EXPORT_SYMBOL(journal_ack_err); -+EXPORT_SYMBOL(journal_clear_err); -+EXPORT_SYMBOL(log_wait_commit); -+EXPORT_SYMBOL(log_start_commit); -+EXPORT_SYMBOL(journal_wipe); 
-+EXPORT_SYMBOL(journal_blocks_per_page); -+EXPORT_SYMBOL(journal_flushpage); -+EXPORT_SYMBOL(journal_try_to_free_buffers); -+EXPORT_SYMBOL(journal_bmap); -+EXPORT_SYMBOL(journal_force_commit); -+ -+static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); -+ -+/* -+ * journal_datalist_lock is used to protect data buffers: -+ * -+ * bh->b_transaction -+ * bh->b_tprev -+ * bh->b_tnext -+ * -+ * journal_free_buffer() is called from journal_try_to_free_buffer(), and is -+ * async wrt everything else. -+ * -+ * It is also used for checkpoint data, also to protect against -+ * journal_try_to_free_buffer(): -+ * -+ * bh->b_cp_transaction -+ * bh->b_cpnext -+ * bh->b_cpprev -+ * transaction->t_checkpoint_list -+ * transaction->t_cpnext -+ * transaction->t_cpprev -+ * journal->j_checkpoint_transactions -+ * -+ * It is global at this time rather than per-journal because it's -+ * impossible for __journal_free_buffer to go from a buffer_head -+ * back to a journal_t unracily (well, not true. Fix later) -+ * -+ * -+ * The `datalist' and `checkpoint list' functions are quite -+ * separate and we could use two spinlocks here. -+ * -+ * lru_list_lock nests inside journal_datalist_lock. -+ */ -+spinlock_t journal_datalist_lock = SPIN_LOCK_UNLOCKED; -+ -+/* -+ * jh_splice_lock needs explantion. -+ * -+ * In a number of places we want to do things like: -+ * -+ * if (buffer_jbd(bh) && bh2jh(bh)->foo) -+ * -+ * This is racy on SMP, because another CPU could remove the journal_head -+ * in the middle of this expression. We need locking. -+ * -+ * But we can greatly optimise the locking cost by testing BH_JBD -+ * outside the lock. So, effectively: -+ * -+ * ret = 0; -+ * if (buffer_jbd(bh)) { -+ * spin_lock(&jh_splice_lock); -+ * if (buffer_jbd(bh)) { (* Still there? *) -+ * ret = bh2jh(bh)->foo; -+ * } -+ * spin_unlock(&jh_splice_lock); -+ * } -+ * return ret; -+ * -+ * Now, that protects us from races where another CPU can remove the -+ * journal_head. 
But it doesn't defend us from the situation where another -+ * CPU can *add* a journal_head. This is a correctness issue. But it's not -+ * a problem because a) the calling code was *already* racy and b) it often -+ * can't happen at the call site and c) the places where we add journal_heads -+ * tend to be under external locking. -+ */ -+spinlock_t jh_splice_lock = SPIN_LOCK_UNLOCKED; -+ -+/* -+ * List of all journals in the system. Protected by the BKL. -+ */ -+static LIST_HEAD(all_journals); -+ -+/* -+ * Helper function used to manage commit timeouts -+ */ -+ -+static void commit_timeout(unsigned long __data) -+{ -+ struct task_struct * p = (struct task_struct *) __data; -+ -+ wake_up_process(p); -+} -+ -+/* Static check for data structure consistency. There's no code -+ * invoked --- we'll just get a linker failure if things aren't right. -+ */ -+void __journal_internal_check(void) -+{ -+ extern void journal_bad_superblock_size(void); -+ if (sizeof(struct journal_superblock_s) != 1024) -+ journal_bad_superblock_size(); -+} -+ -+/* -+ * kjournald: The main thread function used to manage a logging device -+ * journal. -+ * -+ * This kernel thread is responsible for two things: -+ * -+ * 1) COMMIT: Every so often we need to commit the current state of the -+ * filesystem to disk. The journal thread is responsible for writing -+ * all of the metadata buffers to disk. -+ * -+ * 2) CHECKPOINT: We cannot reuse a used section of the log file until all -+ * of the data in that part of the log has been rewritten elsewhere on -+ * the disk. Flushing these old buffers to reclaim space in the log is -+ * known as checkpointing, and this thread is responsible for that job. 
-+ */ -+ -+journal_t *current_journal; // AKPM: debug -+ -+int kjournald(void *arg) -+{ -+ journal_t *journal = (journal_t *) arg; -+ transaction_t *transaction; -+ struct timer_list timer; -+ -+ current_journal = journal; -+ -+ lock_kernel(); -+ daemonize(); -+ reparent_to_init(); -+ spin_lock_irq(¤t->sigmask_lock); -+ sigfillset(¤t->blocked); -+ recalc_sigpending(current); -+ spin_unlock_irq(¤t->sigmask_lock); -+ -+ sprintf(current->comm, "kjournald"); -+ -+ /* Set up an interval timer which can be used to trigger a -+ commit wakeup after the commit interval expires */ -+ init_timer(&timer); -+ timer.data = (unsigned long) current; -+ timer.function = commit_timeout; -+ journal->j_commit_timer = &timer; -+ -+ /* Record that the journal thread is running */ -+ journal->j_task = current; -+ wake_up(&journal->j_wait_done_commit); -+ -+ printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n", -+ journal->j_commit_interval / HZ); -+ list_add(&journal->j_all_journals, &all_journals); -+ -+ /* And now, wait forever for commit wakeup events. */ -+ while (1) { -+ if (journal->j_flags & JFS_UNMOUNT) -+ break; -+ -+ jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", -+ journal->j_commit_sequence, journal->j_commit_request); -+ -+ if (journal->j_commit_sequence != journal->j_commit_request) { -+ jbd_debug(1, "OK, requests differ\n"); -+ if (journal->j_commit_timer_active) { -+ journal->j_commit_timer_active = 0; -+ del_timer(journal->j_commit_timer); -+ } -+ -+ journal_commit_transaction(journal); -+ continue; -+ } -+ -+ wake_up(&journal->j_wait_done_commit); -+ interruptible_sleep_on(&journal->j_wait_commit); -+ -+ jbd_debug(1, "kjournald wakes\n"); -+ -+ /* Were we woken up by a commit wakeup event? 
*/ -+ if ((transaction = journal->j_running_transaction) != NULL && -+ time_after_eq(jiffies, transaction->t_expires)) { -+ journal->j_commit_request = transaction->t_tid; -+ jbd_debug(1, "woke because of timeout\n"); -+ } -+ } -+ -+ if (journal->j_commit_timer_active) { -+ journal->j_commit_timer_active = 0; -+ del_timer_sync(journal->j_commit_timer); -+ } -+ -+ list_del(&journal->j_all_journals); -+ -+ journal->j_task = NULL; -+ wake_up(&journal->j_wait_done_commit); -+ unlock_kernel(); -+ jbd_debug(1, "Journal thread exiting.\n"); -+ return 0; -+} -+ -+static void journal_start_thread(journal_t *journal) -+{ -+ kernel_thread(kjournald, (void *) journal, -+ CLONE_VM | CLONE_FS | CLONE_FILES); -+ while (!journal->j_task) -+ sleep_on(&journal->j_wait_done_commit); -+} -+ -+static void journal_kill_thread(journal_t *journal) -+{ -+ journal->j_flags |= JFS_UNMOUNT; -+ -+ while (journal->j_task) { -+ wake_up(&journal->j_wait_commit); -+ sleep_on(&journal->j_wait_done_commit); -+ } -+} -+ -+#if 0 -+ -+This is no longer needed - we do it in commit quite efficiently. -+Note that if this function is resurrected, the loop needs to -+be reorganised into the next_jh/last_jh algorithm. -+ -+/* -+ * journal_clean_data_list: cleanup after data IO. -+ * -+ * Once the IO system has finished writing the buffers on the transaction's -+ * data list, we can remove those buffers from the list. This function -+ * scans the list for such buffers and removes them cleanly. -+ * -+ * We assume that the journal is already locked. -+ * We are called with journal_datalist_lock held. -+ * -+ * AKPM: This function looks inefficient. Approximately O(n^2) -+ * for potentially thousands of buffers. It no longer shows on profiles -+ * because these buffers are mainly dropped in journal_commit_transaction(). 
-+ */ -+ -+void __journal_clean_data_list(transaction_t *transaction) -+{ -+ struct journal_head *jh, *next; -+ -+ assert_spin_locked(&journal_datalist_lock); -+ -+restart: -+ jh = transaction->t_sync_datalist; -+ if (!jh) -+ goto out; -+ do { -+ next = jh->b_tnext; -+ if (!buffer_locked(jh2bh(jh)) && !buffer_dirty(jh2bh(jh))) { -+ struct buffer_head *bh = jh2bh(jh); -+ BUFFER_TRACE(bh, "data writeout complete: unfile"); -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = NULL; -+ __journal_remove_journal_head(bh); -+ refile_buffer(bh); -+ __brelse(bh); -+ goto restart; -+ } -+ jh = next; -+ } while (transaction->t_sync_datalist && -+ jh != transaction->t_sync_datalist); -+out: -+ return; -+} -+#endif -+ -+/* -+ * journal_write_metadata_buffer: write a metadata buffer to the journal. -+ * -+ * Writes a metadata buffer to a given disk block. The actual IO is not -+ * performed but a new buffer_head is constructed which labels the data -+ * to be written with the correct destination disk block. -+ * -+ * Any magic-number escaping which needs to be done will cause a -+ * copy-out here. If the buffer happens to start with the -+ * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the -+ * magic number is only written to the log for descripter blocks. In -+ * this case, we copy the data and replace the first word with 0, and we -+ * return a result code which indicates that this buffer needs to be -+ * marked as an escaped buffer in the corresponding log descriptor -+ * block. The missing word can then be restored when the block is read -+ * during recovery. -+ * -+ * If the source buffer has already been modified by a new transaction -+ * since we took the last commit snapshot, we use the frozen copy of -+ * that data for IO. If we end up using the existing buffer_head's data -+ * for the write, then we *have* to lock the buffer to prevent anyone -+ * else from using and possibly modifying it while the IO is in -+ * progress. 
-+ * -+ * The function returns a pointer to the buffer_heads to be used for IO. -+ * -+ * We assume that the journal has already been locked in this function. -+ * -+ * Return value: -+ * <0: Error -+ * >=0: Finished OK -+ * -+ * On success: -+ * Bit 0 set == escape performed on the data -+ * Bit 1 set == buffer copy-out performed (kfree the data after IO) -+ */ -+ -+static inline unsigned long virt_to_offset(void *p) -+{return ((unsigned long) p) & ~PAGE_MASK;} -+ -+int journal_write_metadata_buffer(transaction_t *transaction, -+ struct journal_head *jh_in, -+ struct journal_head **jh_out, -+ int blocknr) -+{ -+ int need_copy_out = 0; -+ int done_copy_out = 0; -+ int do_escape = 0; -+ char *mapped_data; -+ struct buffer_head *new_bh; -+ struct journal_head * new_jh; -+ struct page *new_page; -+ unsigned int new_offset; -+ -+ /* -+ * The buffer really shouldn't be locked: only the current committing -+ * transaction is allowed to write it, so nobody else is allowed -+ * to do any IO. -+ * -+ * akpm: except if we're journalling data, and write() output is -+ * also part of a shared mapping, and another thread has -+ * decided to launch a writepage() against this buffer. -+ */ -+ J_ASSERT_JH(jh_in, buffer_jdirty(jh2bh(jh_in))); -+ -+ /* -+ * If a new transaction has already done a buffer copy-out, then -+ * we use that version of the data for the commit. -+ */ -+ -+ if (jh_in->b_frozen_data) { -+ done_copy_out = 1; -+ new_page = virt_to_page(jh_in->b_frozen_data); -+ new_offset = virt_to_offset(jh_in->b_frozen_data); -+ } else { -+ new_page = jh2bh(jh_in)->b_page; -+ new_offset = virt_to_offset(jh2bh(jh_in)->b_data); -+ } -+ -+ mapped_data = ((char *) kmap(new_page)) + new_offset; -+ -+ /* -+ * Check for escaping -+ */ -+ if (* ((unsigned int *) mapped_data) == htonl(JFS_MAGIC_NUMBER)) { -+ need_copy_out = 1; -+ do_escape = 1; -+ } -+ -+ /* -+ * Do we need to do a data copy? 
-+ */ -+ -+ if (need_copy_out && !done_copy_out) { -+ char *tmp; -+ tmp = jbd_rep_kmalloc(jh2bh(jh_in)->b_size, GFP_NOFS); -+ -+ jh_in->b_frozen_data = tmp; -+ memcpy (tmp, mapped_data, jh2bh(jh_in)->b_size); -+ -+ /* If we get to this path, we'll always need the new -+ address kmapped so that we can clear the escaped -+ magic number below. */ -+ kunmap(new_page); -+ new_page = virt_to_page(tmp); -+ new_offset = virt_to_offset(tmp); -+ mapped_data = ((char *) kmap(new_page)) + new_offset; -+ -+ done_copy_out = 1; -+ } -+ -+ /* -+ * Right, time to make up the new buffer_head. -+ */ -+ do { -+ new_bh = get_unused_buffer_head(0); -+ if (!new_bh) { -+ printk (KERN_NOTICE __FUNCTION__ -+ ": ENOMEM at get_unused_buffer_head, " -+ "trying again.\n"); -+ current->policy |= SCHED_YIELD; -+ schedule(); -+ } -+ } while (!new_bh); -+ /* keep subsequent assertions sane */ -+ new_bh->b_prev_free = 0; -+ new_bh->b_next_free = 0; -+ new_bh->b_state = 0; -+ init_buffer(new_bh, NULL, NULL); -+ atomic_set(&new_bh->b_count, 1); -+ new_jh = journal_add_journal_head(new_bh); -+ -+ set_bh_page(new_bh, new_page, new_offset); -+ -+ new_jh->b_transaction = NULL; -+ new_bh->b_size = jh2bh(jh_in)->b_size; -+ new_bh->b_dev = transaction->t_journal->j_dev; -+ new_bh->b_blocknr = blocknr; -+ new_bh->b_state |= (1 << BH_Mapped) | (1 << BH_Dirty); -+ -+ *jh_out = new_jh; -+ -+ /* -+ * Did we need to do an escaping? Now we've done all the -+ * copying, we can finally do so. -+ */ -+ -+ if (do_escape) -+ * ((unsigned int *) mapped_data) = 0; -+ kunmap(new_page); -+ -+ /* -+ * The to-be-written buffer needs to get moved to the io queue, -+ * and the original buffer whose contents we are shadowing or -+ * copying is moved to the transaction's shadow queue. 
-+ */ -+ JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); -+ journal_file_buffer(jh_in, transaction, BJ_Shadow); -+ JBUFFER_TRACE(new_jh, "file as BJ_IO"); -+ journal_file_buffer(new_jh, transaction, BJ_IO); -+ -+ return do_escape | (done_copy_out << 1); -+} -+ -+/* -+ * Allocation code for the journal file. Manage the space left in the -+ * journal, so that we can begin checkpointing when appropriate. -+ */ -+ -+/* -+ * log_space_left: Return the number of free blocks left in the journal. -+ * -+ * Called with the journal already locked. -+ */ -+ -+int log_space_left (journal_t *journal) -+{ -+ int left = journal->j_free; -+ -+ /* Be pessimistic here about the number of those free blocks -+ * which might be required for log descriptor control blocks. */ -+ -+#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ -+ -+ left -= MIN_LOG_RESERVED_BLOCKS; -+ -+ if (left <= 0) -+ return 0; -+ left -= (left >> 3); -+ return left; -+} -+ -+/* -+ * This function must be non-allocating for PF_MEMALLOC tasks -+ */ -+tid_t log_start_commit (journal_t *journal, transaction_t *transaction) -+{ -+ tid_t target = journal->j_commit_request; -+ -+ lock_kernel(); /* Protect journal->j_running_transaction */ -+ -+ /* -+ * A NULL transaction asks us to commit the currently running -+ * transaction, if there is one. -+ */ -+ if (transaction) -+ target = transaction->t_tid; -+ else { -+ transaction = journal->j_running_transaction; -+ if (!transaction) -+ goto out; -+ target = transaction->t_tid; -+ } -+ -+ /* -+ * Are we already doing a recent enough commit? -+ */ -+ if (tid_geq(journal->j_commit_request, target)) -+ goto out; -+ -+ /* -+ * We want a new commit: OK, mark the request and wakup the -+ * commit thread. We do _not_ do the commit ourselves. 
-+ */ -+ -+ journal->j_commit_request = target; -+ jbd_debug(1, "JBD: requesting commit %d/%d\n", -+ journal->j_commit_request, -+ journal->j_commit_sequence); -+ wake_up(&journal->j_wait_commit); -+ -+out: -+ unlock_kernel(); -+ return target; -+} -+ -+/* -+ * Wait for a specified commit to complete. -+ * The caller may not hold the journal lock. -+ */ -+void log_wait_commit (journal_t *journal, tid_t tid) -+{ -+ lock_kernel(); -+#ifdef CONFIG_JBD_DEBUG -+ lock_journal(journal); -+ if (!tid_geq(journal->j_commit_request, tid)) { -+ printk(KERN_EMERG __FUNCTION__ -+ ": error: j_commit_request=%d, tid=%d\n", -+ journal->j_commit_request, tid); -+ } -+ unlock_journal(journal); -+#endif -+ while (tid_gt(tid, journal->j_commit_sequence)) { -+ jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", -+ tid, journal->j_commit_sequence); -+ wake_up(&journal->j_wait_commit); -+ sleep_on(&journal->j_wait_done_commit); -+ } -+ unlock_kernel(); -+} -+ -+/* -+ * Log buffer allocation routines: -+ */ -+ -+int journal_next_log_block(journal_t *journal, unsigned long *retp) -+{ -+ unsigned long blocknr; -+ -+ J_ASSERT(journal->j_free > 1); -+ -+ blocknr = journal->j_head; -+ journal->j_head++; -+ journal->j_free--; -+ if (journal->j_head == journal->j_last) -+ journal->j_head = journal->j_first; -+ return journal_bmap(journal, blocknr, retp); -+} -+ -+/* -+ * Conversion of logical to physical block numbers for the journal -+ * -+ * On external journals the journal blocks are identity-mapped, so -+ * this is a no-op. If needed, we can use j_blk_offset - everything is -+ * ready. 
-+ */ -+int journal_bmap(journal_t *journal, unsigned long blocknr, -+ unsigned long *retp) -+{ -+ int err = 0; -+ unsigned long ret; -+ -+ if (journal->j_inode) { -+ ret = bmap(journal->j_inode, blocknr); -+ if (ret) -+ *retp = ret; -+ else { -+ printk (KERN_ALERT __FUNCTION__ -+ ": journal block not found " -+ "at offset %lu on %s\n", -+ blocknr, bdevname(journal->j_dev)); -+ err = -EIO; -+ __journal_abort_soft(journal, err); -+ } -+ } else { -+ *retp = blocknr; /* +journal->j_blk_offset */ -+ } -+ return err; -+} -+ -+/* -+ * We play buffer_head aliasing tricks to write data/metadata blocks to -+ * the journal without copying their contents, but for journal -+ * descriptor blocks we do need to generate bona fide buffers. -+ * -+ * We return a jh whose bh is locked and ready to be populated. -+ */ -+ -+struct journal_head * journal_get_descriptor_buffer(journal_t *journal) -+{ -+ struct buffer_head *bh; -+ unsigned long blocknr; -+ int err; -+ -+ err = journal_next_log_block(journal, &blocknr); -+ -+ if (err) -+ return NULL; -+ -+ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); -+ lock_buffer(bh); -+ BUFFER_TRACE(bh, "return this buffer"); -+ return journal_add_journal_head(bh); -+} -+ -+/* -+ * Management for journal control blocks: functions to create and -+ * destroy journal_t structures, and to initialise and read existing -+ * journal blocks from disk. */ -+ -+/* First: create and setup a journal_t object in memory. We initialise -+ * very few fields yet: that has to wait until we have created the -+ * journal structures from from scratch, or loaded them from disk. 
*/ -+ -+static journal_t * journal_init_common (void) -+{ -+ journal_t *journal; -+ int err; -+ -+ MOD_INC_USE_COUNT; -+ -+ journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL); -+ if (!journal) -+ goto fail; -+ memset(journal, 0, sizeof(*journal)); -+ -+ init_waitqueue_head(&journal->j_wait_transaction_locked); -+ init_waitqueue_head(&journal->j_wait_logspace); -+ init_waitqueue_head(&journal->j_wait_done_commit); -+ init_waitqueue_head(&journal->j_wait_checkpoint); -+ init_waitqueue_head(&journal->j_wait_commit); -+ init_waitqueue_head(&journal->j_wait_updates); -+ init_MUTEX(&journal->j_barrier); -+ init_MUTEX(&journal->j_checkpoint_sem); -+ init_MUTEX(&journal->j_sem); -+ -+ journal->j_commit_interval = (HZ * 5); -+ -+ /* The journal is marked for error until we succeed with recovery! */ -+ journal->j_flags = JFS_ABORT; -+ -+ /* Set up a default-sized revoke table for the new mount. */ -+ err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); -+ if (err) { -+ kfree(journal); -+ goto fail; -+ } -+ return journal; -+fail: -+ MOD_DEC_USE_COUNT; -+ return NULL; -+} -+ -+/* journal_init_dev and journal_init_inode: -+ * -+ * Create a journal structure assigned some fixed set of disk blocks to -+ * the journal. We don't actually touch those disk blocks yet, but we -+ * need to set up all of the mapping information to tell the journaling -+ * system where the journal blocks are. -+ * -+ * journal_init_dev creates a journal which maps a fixed contiguous -+ * range of blocks on an arbitrary block device. -+ * -+ * journal_init_inode creates a journal which maps an on-disk inode as -+ * the journal. The inode must exist already, must support bmap() and -+ * must have all data blocks preallocated. 
-+ */ -+ -+journal_t * journal_init_dev(kdev_t dev, kdev_t fs_dev, -+ int start, int len, int blocksize) -+{ -+ journal_t *journal = journal_init_common(); -+ struct buffer_head *bh; -+ -+ if (!journal) -+ return NULL; -+ -+ journal->j_dev = dev; -+ journal->j_fs_dev = fs_dev; -+ journal->j_blk_offset = start; -+ journal->j_maxlen = len; -+ journal->j_blocksize = blocksize; -+ -+ bh = getblk(journal->j_dev, start, journal->j_blocksize); -+ J_ASSERT(bh != NULL); -+ journal->j_sb_buffer = bh; -+ journal->j_superblock = (journal_superblock_t *)bh->b_data; -+ -+ return journal; -+} -+ -+journal_t * journal_init_inode (struct inode *inode) -+{ -+ struct buffer_head *bh; -+ journal_t *journal = journal_init_common(); -+ int err; -+ unsigned long blocknr; -+ -+ if (!journal) -+ return NULL; -+ -+ journal->j_dev = inode->i_dev; -+ journal->j_fs_dev = inode->i_dev; -+ journal->j_inode = inode; -+ jbd_debug(1, -+ "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", -+ journal, bdevname(inode->i_dev), inode->i_ino, -+ (long long) inode->i_size, -+ inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); -+ -+ journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; -+ journal->j_blocksize = inode->i_sb->s_blocksize; -+ -+ err = journal_bmap(journal, 0, &blocknr); -+ /* If that failed, give up */ -+ if (err) { -+ printk(KERN_ERR __FUNCTION__ ": Cannnot locate journal " -+ "superblock\n"); -+ kfree(journal); -+ return NULL; -+ } -+ -+ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); -+ J_ASSERT(bh != NULL); -+ journal->j_sb_buffer = bh; -+ journal->j_superblock = (journal_superblock_t *)bh->b_data; -+ -+ return journal; -+} -+ -+/* -+ * If the journal init or create aborts, we need to mark the journal -+ * superblock as being NULL to prevent the journal destroy from writing -+ * back a bogus superblock. 
-+ */ -+static void journal_fail_superblock (journal_t *journal) -+{ -+ struct buffer_head *bh = journal->j_sb_buffer; -+ brelse(bh); -+ journal->j_sb_buffer = NULL; -+} -+ -+/* -+ * Given a journal_t structure, initialise the various fields for -+ * startup of a new journaling session. We use this both when creating -+ * a journal, and after recovering an old journal to reset it for -+ * subsequent use. -+ */ -+ -+static int journal_reset (journal_t *journal) -+{ -+ journal_superblock_t *sb = journal->j_superblock; -+ unsigned int first, last; -+ -+ first = ntohl(sb->s_first); -+ last = ntohl(sb->s_maxlen); -+ -+ journal->j_first = first; -+ journal->j_last = last; -+ -+ journal->j_head = first; -+ journal->j_tail = first; -+ journal->j_free = last - first; -+ -+ journal->j_tail_sequence = journal->j_transaction_sequence; -+ journal->j_commit_sequence = journal->j_transaction_sequence - 1; -+ journal->j_commit_request = journal->j_commit_sequence; -+ -+ journal->j_max_transaction_buffers = journal->j_maxlen / 4; -+ -+ /* Add the dynamic fields and write it to disk. */ -+ journal_update_superblock(journal, 1); -+ -+ lock_journal(journal); -+ journal_start_thread(journal); -+ unlock_journal(journal); -+ -+ return 0; -+} -+ -+/* -+ * Given a journal_t structure which tells us which disk blocks we can -+ * use, create a new journal superblock and initialise all of the -+ * journal fields from scratch. */ -+ -+int journal_create (journal_t *journal) -+{ -+ unsigned long blocknr; -+ struct buffer_head *bh; -+ journal_superblock_t *sb; -+ int i, err; -+ -+ if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) { -+ printk (KERN_ERR "Journal length (%d blocks) too short.\n", -+ journal->j_maxlen); -+ journal_fail_superblock(journal); -+ return -EINVAL; -+ } -+ -+ if (journal->j_inode == NULL) { -+ /* -+ * We don't know what block to start at! 
-+ */ -+ printk(KERN_EMERG __FUNCTION__ -+ ": creation of journal on external device!\n"); -+ BUG(); -+ } -+ -+ /* Zero out the entire journal on disk. We cannot afford to -+ have any blocks on disk beginning with JFS_MAGIC_NUMBER. */ -+ jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); -+ for (i = 0; i < journal->j_maxlen; i++) { -+ err = journal_bmap(journal, i, &blocknr); -+ if (err) -+ return err; -+ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); -+ wait_on_buffer(bh); -+ memset (bh->b_data, 0, journal->j_blocksize); -+ BUFFER_TRACE(bh, "marking dirty"); -+ mark_buffer_dirty(bh); -+ BUFFER_TRACE(bh, "marking uptodate"); -+ mark_buffer_uptodate(bh, 1); -+ __brelse(bh); -+ } -+ -+ sync_dev(journal->j_dev); -+ jbd_debug(1, "JBD: journal cleared.\n"); -+ -+ /* OK, fill in the initial static fields in the new superblock */ -+ sb = journal->j_superblock; -+ -+ sb->s_header.h_magic = htonl(JFS_MAGIC_NUMBER); -+ sb->s_header.h_blocktype = htonl(JFS_SUPERBLOCK_V2); -+ -+ sb->s_blocksize = htonl(journal->j_blocksize); -+ sb->s_maxlen = htonl(journal->j_maxlen); -+ sb->s_first = htonl(1); -+ -+ journal->j_transaction_sequence = 1; -+ -+ journal->j_flags &= ~JFS_ABORT; -+ journal->j_format_version = 2; -+ -+ return journal_reset(journal); -+} -+ -+/* -+ * Update a journal's dynamic superblock fields and write it to disk, -+ * optionally waiting for the IO to complete. 
-+*/ -+ -+void journal_update_superblock(journal_t *journal, int wait) -+{ -+ journal_superblock_t *sb = journal->j_superblock; -+ struct buffer_head *bh = journal->j_sb_buffer; -+ -+ jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", -+ journal->j_tail, journal->j_tail_sequence, journal->j_errno); -+ -+ sb->s_sequence = htonl(journal->j_tail_sequence); -+ sb->s_start = htonl(journal->j_tail); -+ sb->s_errno = htonl(journal->j_errno); -+ -+ BUFFER_TRACE(bh, "marking dirty"); -+ mark_buffer_dirty(bh); -+ ll_rw_block(WRITE, 1, &bh); -+ if (wait) -+ wait_on_buffer(bh); -+ -+ /* If we have just flushed the log (by marking s_start==0), then -+ * any future commit will have to be careful to update the -+ * superblock again to re-record the true start of the log. */ -+ -+ if (sb->s_start) -+ journal->j_flags &= ~JFS_FLUSHED; -+ else -+ journal->j_flags |= JFS_FLUSHED; -+} -+ -+ -+/* -+ * Read the superblock for a given journal, performing initial -+ * validation of the format. 
-+ */ -+ -+static int journal_get_superblock(journal_t *journal) -+{ -+ struct buffer_head *bh; -+ journal_superblock_t *sb; -+ int err = -EIO; -+ -+ bh = journal->j_sb_buffer; -+ -+ J_ASSERT(bh != NULL); -+ if (!buffer_uptodate(bh)) { -+ ll_rw_block(READ, 1, &bh); -+ wait_on_buffer(bh); -+ if (!buffer_uptodate(bh)) { -+ printk (KERN_ERR -+ "JBD: IO error reading journal superblock\n"); -+ goto out; -+ } -+ } -+ -+ sb = journal->j_superblock; -+ -+ err = -EINVAL; -+ -+ if (sb->s_header.h_magic != htonl(JFS_MAGIC_NUMBER) || -+ sb->s_blocksize != htonl(journal->j_blocksize)) { -+ printk(KERN_WARNING "JBD: no valid journal superblock found\n"); -+ goto out; -+ } -+ -+ switch(ntohl(sb->s_header.h_blocktype)) { -+ case JFS_SUPERBLOCK_V1: -+ journal->j_format_version = 1; -+ break; -+ case JFS_SUPERBLOCK_V2: -+ journal->j_format_version = 2; -+ break; -+ default: -+ printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); -+ goto out; -+ } -+ -+ if (ntohl(sb->s_maxlen) < journal->j_maxlen) -+ journal->j_maxlen = ntohl(sb->s_maxlen); -+ else if (ntohl(sb->s_maxlen) > journal->j_maxlen) { -+ printk (KERN_WARNING "JBD: journal file too short\n"); -+ goto out; -+ } -+ -+ return 0; -+ -+out: -+ journal_fail_superblock(journal); -+ return err; -+} -+ -+/* -+ * Load the on-disk journal superblock and read the key fields into the -+ * journal_t. -+ */ -+ -+static int load_superblock(journal_t *journal) -+{ -+ int err; -+ journal_superblock_t *sb; -+ -+ err = journal_get_superblock(journal); -+ if (err) -+ return err; -+ -+ sb = journal->j_superblock; -+ -+ journal->j_tail_sequence = ntohl(sb->s_sequence); -+ journal->j_tail = ntohl(sb->s_start); -+ journal->j_first = ntohl(sb->s_first); -+ journal->j_last = ntohl(sb->s_maxlen); -+ journal->j_errno = ntohl(sb->s_errno); -+ -+ return 0; -+} -+ -+ -+/* -+ * Given a journal_t structure which tells us which disk blocks contain -+ * a journal, read the journal from disk to initialise the in-memory -+ * structures. 
-+ */ -+ -+int journal_load(journal_t *journal) -+{ -+ int err; -+ -+ err = load_superblock(journal); -+ if (err) -+ return err; -+ -+ /* If this is a V2 superblock, then we have to check the -+ * features flags on it. */ -+ -+ if (journal->j_format_version >= 2) { -+ journal_superblock_t *sb = journal->j_superblock; -+ -+ if ((sb->s_feature_ro_compat & -+ ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || -+ (sb->s_feature_incompat & -+ ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) { -+ printk (KERN_WARNING -+ "JBD: Unrecognised features on journal\n"); -+ return -EINVAL; -+ } -+ } -+ -+ /* Let the recovery code check whether it needs to recover any -+ * data from the journal. */ -+ if (journal_recover(journal)) -+ goto recovery_error; -+ -+ /* OK, we've finished with the dynamic journal bits: -+ * reinitialise the dynamic contents of the superblock in memory -+ * and reset them on disk. */ -+ if (journal_reset(journal)) -+ goto recovery_error; -+ -+ journal->j_flags &= ~JFS_ABORT; -+ journal->j_flags |= JFS_LOADED; -+ return 0; -+ -+recovery_error: -+ printk (KERN_WARNING "JBD: recovery failed\n"); -+ return -EIO; -+} -+ -+/* -+ * Release a journal_t structure once it is no longer in use by the -+ * journaled object. -+ */ -+ -+void journal_destroy (journal_t *journal) -+{ -+ /* Wait for the commit thread to wake up and die. */ -+ journal_kill_thread(journal); -+ -+ /* Force a final log commit */ -+ if (journal->j_running_transaction) -+ journal_commit_transaction(journal); -+ -+ /* Force any old transactions to disk */ -+ lock_journal(journal); -+ while (journal->j_checkpoint_transactions != NULL) -+ log_do_checkpoint(journal, 1); -+ -+ J_ASSERT(journal->j_running_transaction == NULL); -+ J_ASSERT(journal->j_committing_transaction == NULL); -+ J_ASSERT(journal->j_checkpoint_transactions == NULL); -+ -+ /* We can now mark the journal as empty. 
*/ -+ journal->j_tail = 0; -+ journal->j_tail_sequence = ++journal->j_transaction_sequence; -+ if (journal->j_sb_buffer) { -+ journal_update_superblock(journal, 1); -+ brelse(journal->j_sb_buffer); -+ } -+ -+ if (journal->j_inode) -+ iput(journal->j_inode); -+ if (journal->j_revoke) -+ journal_destroy_revoke(journal); -+ -+ unlock_journal(journal); -+ kfree(journal); -+ MOD_DEC_USE_COUNT; -+} -+ -+ -+/* Published API: Check whether the journal uses all of a given set of -+ * features. Return true (non-zero) if it does. */ -+ -+int journal_check_used_features (journal_t *journal, unsigned long compat, -+ unsigned long ro, unsigned long incompat) -+{ -+ journal_superblock_t *sb; -+ -+ if (!compat && !ro && !incompat) -+ return 1; -+ if (journal->j_format_version == 1) -+ return 0; -+ -+ sb = journal->j_superblock; -+ -+ if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && -+ ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && -+ ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) -+ return 1; -+ -+ return 0; -+} -+ -+/* Published API: Check whether the journaling code supports the use of -+ * all of a given set of features on this journal. Return true -+ * (non-zero) if it can. */ -+ -+int journal_check_available_features (journal_t *journal, unsigned long compat, -+ unsigned long ro, unsigned long incompat) -+{ -+ journal_superblock_t *sb; -+ -+ if (!compat && !ro && !incompat) -+ return 1; -+ -+ sb = journal->j_superblock; -+ -+ /* We can support any known requested features iff the -+ * superblock is in version 2. Otherwise we fail to support any -+ * extended sb features. */ -+ -+ if (journal->j_format_version != 2) -+ return 0; -+ -+ if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat && -+ (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro && -+ (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat) -+ return 1; -+ -+ return 0; -+} -+ -+/* Published API: Mark a given journal feature as present on the -+ * superblock. 
Returns true if the requested features could be set. */ -+ -+int journal_set_features (journal_t *journal, unsigned long compat, -+ unsigned long ro, unsigned long incompat) -+{ -+ journal_superblock_t *sb; -+ -+ if (journal_check_used_features(journal, compat, ro, incompat)) -+ return 1; -+ -+ if (!journal_check_available_features(journal, compat, ro, incompat)) -+ return 0; -+ -+ jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", -+ compat, ro, incompat); -+ -+ sb = journal->j_superblock; -+ -+ sb->s_feature_compat |= cpu_to_be32(compat); -+ sb->s_feature_ro_compat |= cpu_to_be32(ro); -+ sb->s_feature_incompat |= cpu_to_be32(incompat); -+ -+ return 1; -+} -+ -+ -+/* -+ * Published API: -+ * Given an initialised but unloaded journal struct, poke about in the -+ * on-disk structure to update it to the most recent supported version. -+ */ -+ -+int journal_update_format (journal_t *journal) -+{ -+ journal_superblock_t *sb; -+ int err; -+ -+ err = journal_get_superblock(journal); -+ if (err) -+ return err; -+ -+ sb = journal->j_superblock; -+ -+ switch (ntohl(sb->s_header.h_blocktype)) { -+ case JFS_SUPERBLOCK_V2: -+ return 0; -+ case JFS_SUPERBLOCK_V1: -+ return journal_convert_superblock_v1(journal, sb); -+ default: -+ break; -+ } -+ return -EINVAL; -+} -+ -+static int journal_convert_superblock_v1(journal_t *journal, -+ journal_superblock_t *sb) -+{ -+ int offset, blocksize; -+ struct buffer_head *bh; -+ -+ printk(KERN_WARNING -+ "JBD: Converting superblock from version 1 to 2.\n"); -+ -+ /* Pre-initialise new fields to zero */ -+ offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); -+ blocksize = ntohl(sb->s_blocksize); -+ memset(&sb->s_feature_compat, 0, blocksize-offset); -+ -+ sb->s_nr_users = cpu_to_be32(1); -+ sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); -+ journal->j_format_version = 2; -+ -+ bh = journal->j_sb_buffer; -+ BUFFER_TRACE(bh, "marking dirty"); -+ mark_buffer_dirty(bh); -+ ll_rw_block(WRITE, 1, &bh); -+ 
wait_on_buffer(bh); -+ return 0; -+} -+ -+ -+/* -+ * Flush all data for a given journal to disk and empty the journal. -+ * Filesystems can use this when remounting readonly to ensure that -+ * recovery does not need to happen on remount. -+ */ -+ -+int journal_flush (journal_t *journal) -+{ -+ int err = 0; -+ transaction_t *transaction = NULL; -+ unsigned long old_tail; -+ -+ lock_kernel(); -+ -+ /* Force everything buffered to the log... */ -+ if (journal->j_running_transaction) { -+ transaction = journal->j_running_transaction; -+ log_start_commit(journal, transaction); -+ } else if (journal->j_committing_transaction) -+ transaction = journal->j_committing_transaction; -+ -+ /* Wait for the log commit to complete... */ -+ if (transaction) -+ log_wait_commit(journal, transaction->t_tid); -+ -+ /* ...and flush everything in the log out to disk. */ -+ lock_journal(journal); -+ while (!err && journal->j_checkpoint_transactions != NULL) -+ err = log_do_checkpoint(journal, journal->j_maxlen); -+ cleanup_journal_tail(journal); -+ -+ /* Finally, mark the journal as really needing no recovery. -+ * This sets s_start==0 in the underlying superblock, which is -+ * the magic code for a fully-recovered superblock. Any future -+ * commits of data to the journal will restore the current -+ * s_start value. */ -+ old_tail = journal->j_tail; -+ journal->j_tail = 0; -+ journal_update_superblock(journal, 1); -+ journal->j_tail = old_tail; -+ -+ unlock_journal(journal); -+ -+ J_ASSERT(!journal->j_running_transaction); -+ J_ASSERT(!journal->j_committing_transaction); -+ J_ASSERT(!journal->j_checkpoint_transactions); -+ J_ASSERT(journal->j_head == journal->j_tail); -+ J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); -+ -+ unlock_kernel(); -+ -+ return err; -+} -+ -+/* -+ * Wipe out all of the contents of a journal, safely. This will produce -+ * a warning if the journal contains any valid recovery information. 
-+ * Must be called between journal_init_*() and journal_load(). -+ * -+ * If (write) is non-zero, then we wipe out the journal on disk; otherwise -+ * we merely suppress recovery. -+ */ -+ -+int journal_wipe (journal_t *journal, int write) -+{ -+ journal_superblock_t *sb; -+ int err = 0; -+ -+ J_ASSERT (!(journal->j_flags & JFS_LOADED)); -+ -+ err = load_superblock(journal); -+ if (err) -+ return err; -+ -+ sb = journal->j_superblock; -+ -+ if (!journal->j_tail) -+ goto no_recovery; -+ -+ printk (KERN_WARNING "JBD: %s recovery information on journal\n", -+ write ? "Clearing" : "Ignoring"); -+ -+ err = journal_skip_recovery(journal); -+ if (write) -+ journal_update_superblock(journal, 1); -+ -+ no_recovery: -+ return err; -+} -+ -+/* -+ * journal_dev_name: format a character string to describe on what -+ * device this journal is present. -+ */ -+ -+const char * journal_dev_name(journal_t *journal) -+{ -+ kdev_t dev; -+ -+ if (journal->j_inode) -+ dev = journal->j_inode->i_dev; -+ else -+ dev = journal->j_dev; -+ -+ return bdevname(dev); -+} -+ -+/* -+ * journal_abort: perform a complete, immediate shutdown of the ENTIRE -+ * journal (not of a single transaction). This operation cannot be -+ * undone without closing and reopening the journal. -+ * -+ * The journal_abort function is intended to support higher level error -+ * recovery mechanisms such as the ext2/ext3 remount-readonly error -+ * mode. -+ * -+ * Journal abort has very specific semantics. Any existing dirty, -+ * unjournaled buffers in the main filesystem will still be written to -+ * disk by bdflush, but the journaling mechanism will be suspended -+ * immediately and no further transaction commits will be honoured. -+ * -+ * Any dirty, journaled buffers will be written back to disk without -+ * hitting the journal. Atomicity cannot be guaranteed on an aborted -+ * filesystem, but we _do_ attempt to leave as much data as possible -+ * behind for fsck to use for cleanup. 
-+ * -+ * Any attempt to get a new transaction handle on a journal which is in -+ * ABORT state will just result in an -EROFS error return. A -+ * journal_stop on an existing handle will return -EIO if we have -+ * entered abort state during the update. -+ * -+ * Recursive transactions are not disturbed by journal abort until the -+ * final journal_stop, which will receive the -EIO error. -+ * -+ * Finally, the journal_abort call allows the caller to supply an errno -+ * which will be recored (if possible) in the journal superblock. This -+ * allows a client to record failure conditions in the middle of a -+ * transaction without having to complete the transaction to record the -+ * failure to disk. ext3_error, for example, now uses this -+ * functionality. -+ * -+ * Errors which originate from within the journaling layer will NOT -+ * supply an errno; a null errno implies that absolutely no further -+ * writes are done to the journal (unless there are any already in -+ * progress). -+ */ -+ -+/* Quick version for internal journal use (doesn't lock the journal). -+ * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, -+ * and don't attempt to make any other journal updates. */ -+void __journal_abort_hard (journal_t *journal) -+{ -+ transaction_t *transaction; -+ -+ if (journal->j_flags & JFS_ABORT) -+ return; -+ -+ printk (KERN_ERR "Aborting journal on device %s.\n", -+ journal_dev_name(journal)); -+ -+ journal->j_flags |= JFS_ABORT; -+ transaction = journal->j_running_transaction; -+ if (transaction) -+ log_start_commit(journal, transaction); -+} -+ -+/* Soft abort: record the abort error status in the journal superblock, -+ * but don't do any other IO. 
*/ -+void __journal_abort_soft (journal_t *journal, int errno) -+{ -+ if (journal->j_flags & JFS_ABORT) -+ return; -+ -+ if (!journal->j_errno) -+ journal->j_errno = errno; -+ -+ __journal_abort_hard(journal); -+ -+ if (errno) -+ journal_update_superblock(journal, 1); -+} -+ -+/* Full version for external use */ -+void journal_abort (journal_t *journal, int errno) -+{ -+ lock_journal(journal); -+ __journal_abort_soft(journal, errno); -+ unlock_journal(journal); -+} -+ -+int journal_errno (journal_t *journal) -+{ -+ int err; -+ -+ lock_journal(journal); -+ if (journal->j_flags & JFS_ABORT) -+ err = -EROFS; -+ else -+ err = journal->j_errno; -+ unlock_journal(journal); -+ return err; -+} -+ -+int journal_clear_err (journal_t *journal) -+{ -+ int err = 0; -+ -+ lock_journal(journal); -+ if (journal->j_flags & JFS_ABORT) -+ err = -EROFS; -+ else -+ journal->j_errno = 0; -+ unlock_journal(journal); -+ return err; -+} -+ -+void journal_ack_err (journal_t *journal) -+{ -+ lock_journal(journal); -+ if (journal->j_errno) -+ journal->j_flags |= JFS_ACK_ERR; -+ unlock_journal(journal); -+} -+ -+int journal_blocks_per_page(struct inode *inode) -+{ -+ return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); -+} -+ -+/* -+ * shrink_journal_memory(). -+ * Called when we're under memory pressure. Free up all the written-back -+ * checkpointed metadata buffers. -+ */ -+void shrink_journal_memory(void) -+{ -+ struct list_head *list; -+ -+ lock_kernel(); -+ list_for_each(list, &all_journals) { -+ journal_t *journal = -+ list_entry(list, journal_t, j_all_journals); -+ spin_lock(&journal_datalist_lock); -+ __journal_clean_checkpoint_list(journal); -+ spin_unlock(&journal_datalist_lock); -+ } -+ unlock_kernel(); -+} -+ -+/* -+ * Simple support for retying memory allocations. Introduced to help to -+ * debug different VM deadlock avoidance strategies. -+ */ -+/* -+ * Simple support for retying memory allocations. 
Introduced to help to -+ * debug different VM deadlock avoidance strategies. -+ */ -+void * __jbd_kmalloc (char *where, size_t size, int flags, int retry) -+{ -+ void *p; -+ static unsigned long last_warning; -+ -+ while (1) { -+ p = kmalloc(size, flags); -+ if (p) -+ return p; -+ if (!retry) -+ return NULL; -+ /* Log every retry for debugging. Also log them to the -+ * syslog, but do rate-limiting on the non-debugging -+ * messages. */ -+ jbd_debug(1, "ENOMEM in %s, retrying.\n", where); -+ -+ if (time_after(jiffies, last_warning + 5*HZ)) { -+ printk(KERN_NOTICE -+ "ENOMEM in %s, retrying.\n", where); -+ last_warning = jiffies; -+ } -+ -+ current->policy |= SCHED_YIELD; -+ schedule(); -+ } -+} -+ -+/* -+ * Journal_head storage management -+ */ -+static kmem_cache_t *journal_head_cache; -+#ifdef CONFIG_JBD_DEBUG -+static atomic_t nr_journal_heads = ATOMIC_INIT(0); -+#endif -+ -+static int journal_init_journal_head_cache(void) -+{ -+ int retval; -+ -+ J_ASSERT(journal_head_cache == 0); -+ journal_head_cache = kmem_cache_create("journal_head", -+ sizeof(struct journal_head), -+ 0, /* offset */ -+ 0, /* flags */ -+ NULL, /* ctor */ -+ NULL); /* dtor */ -+ retval = 0; -+ if (journal_head_cache == 0) { -+ retval = -ENOMEM; -+ printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); -+ } -+ return retval; -+} -+ -+static void journal_destroy_journal_head_cache(void) -+{ -+ J_ASSERT(journal_head_cache != NULL); -+ kmem_cache_destroy(journal_head_cache); -+ journal_head_cache = 0; -+} -+ -+/* -+ * journal_head splicing and dicing -+ */ -+static struct journal_head *journal_alloc_journal_head(void) -+{ -+ struct journal_head *ret; -+ static unsigned long last_warning; -+ -+#ifdef CONFIG_JBD_DEBUG -+ atomic_inc(&nr_journal_heads); -+#endif -+ ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); -+ if (ret == 0) { -+ jbd_debug(1, "out of memory for journal_head\n"); -+ if (time_after(jiffies, last_warning + 5*HZ)) { -+ printk(KERN_NOTICE "ENOMEM in " __FUNCTION__ -+ ", 
retrying.\n"); -+ last_warning = jiffies; -+ } -+ while (ret == 0) { -+ current->policy |= SCHED_YIELD; -+ schedule(); -+ ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); -+ } -+ } -+ return ret; -+} -+ -+static void journal_free_journal_head(struct journal_head *jh) -+{ -+#ifdef CONFIG_JBD_DEBUG -+ atomic_dec(&nr_journal_heads); -+ memset(jh, 0x5b, sizeof(*jh)); -+#endif -+ kmem_cache_free(journal_head_cache, jh); -+} -+ -+/* -+ * A journal_head is attached to a buffer_head whenever JBD has an -+ * interest in the buffer. -+ * -+ * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit -+ * is set. This bit is tested in core kernel code where we need to take -+ * JBD-specific actions. Testing the zeroness of ->b_private is not reliable -+ * there. -+ * -+ * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. -+ * -+ * When a buffer has its BH_JBD bit set it is immune from being released by -+ * core kernel code, mainly via ->b_count. -+ * -+ * A journal_head may be detached from its buffer_head when the journal_head's -+ * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. -+ * Various places in JBD call journal_remove_journal_head() to indicate that the -+ * journal_head can be dropped if needed. -+ * -+ * Various places in the kernel want to attach a journal_head to a buffer_head -+ * _before_ attaching the journal_head to a transaction. To protect the -+ * journal_head in this situation, journal_add_journal_head elevates the -+ * journal_head's b_jcount refcount by one. The caller must call -+ * journal_unlock_journal_head() to undo this. -+ * -+ * So the typical usage would be: -+ * -+ * (Attach a journal_head if needed. Increments b_jcount) -+ * struct journal_head *jh = journal_add_journal_head(bh); -+ * ... 
-+ * jh->b_transaction = xxx; -+ * journal_unlock_journal_head(jh); -+ * -+ * Now, the journal_head's b_jcount is zero, but it is safe from being released -+ * because it has a non-zero b_transaction. -+ */ -+ -+/* -+ * Give a buffer_head a journal_head. -+ * -+ * Doesn't need the journal lock. -+ * May sleep. -+ * Cannot be called with journal_datalist_lock held. -+ */ -+struct journal_head *journal_add_journal_head(struct buffer_head *bh) -+{ -+ struct journal_head *jh; -+ -+ spin_lock(&journal_datalist_lock); -+ if (buffer_jbd(bh)) { -+ jh = bh2jh(bh); -+ } else { -+ J_ASSERT_BH(bh, -+ (atomic_read(&bh->b_count) > 0) || -+ (bh->b_page && bh->b_page->mapping)); -+ spin_unlock(&journal_datalist_lock); -+ jh = journal_alloc_journal_head(); -+ memset(jh, 0, sizeof(*jh)); -+ spin_lock(&journal_datalist_lock); -+ -+ if (buffer_jbd(bh)) { -+ /* Someone did it for us! */ -+ J_ASSERT_BH(bh, bh->b_private != NULL); -+ journal_free_journal_head(jh); -+ jh = bh->b_private; -+ } else { -+ /* -+ * We actually don't need jh_splice_lock when -+ * adding a journal_head - only on removal. -+ */ -+ spin_lock(&jh_splice_lock); -+ set_bit(BH_JBD, &bh->b_state); -+ bh->b_private = jh; -+ jh->b_bh = bh; -+ atomic_inc(&bh->b_count); -+ spin_unlock(&jh_splice_lock); -+ BUFFER_TRACE(bh, "added journal_head"); -+ } -+ } -+ jh->b_jcount++; -+ spin_unlock(&journal_datalist_lock); -+ return bh->b_private; -+} -+ -+/* -+ * journal_remove_journal_head(): if the buffer isn't attached to a transaction -+ * and has a zero b_jcount then remove and release its journal_head. If we did -+ * see that the buffer is not used by any transaction we also "logically" -+ * decrement ->b_count. -+ * -+ * We in fact take an additional increment on ->b_count as a convenience, -+ * because the caller usually wants to do additional things with the bh -+ * after calling here. -+ * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some -+ * time. 
Once the caller has run __brelse(), the buffer is eligible for -+ * reaping by try_to_free_buffers(). -+ * -+ * Requires journal_datalist_lock. -+ */ -+void __journal_remove_journal_head(struct buffer_head *bh) -+{ -+ struct journal_head *jh = bh2jh(bh); -+ -+ assert_spin_locked(&journal_datalist_lock); -+ J_ASSERT_JH(jh, jh->b_jcount >= 0); -+ atomic_inc(&bh->b_count); -+ if (jh->b_jcount == 0) { -+ if (jh->b_transaction == NULL && -+ jh->b_next_transaction == NULL && -+ jh->b_cp_transaction == NULL) { -+ J_ASSERT_BH(bh, buffer_jbd(bh)); -+ J_ASSERT_BH(bh, jh2bh(jh) == bh); -+ BUFFER_TRACE(bh, "remove journal_head"); -+ spin_lock(&jh_splice_lock); -+ bh->b_private = NULL; -+ jh->b_bh = NULL; /* debug, really */ -+ clear_bit(BH_JBD, &bh->b_state); -+ __brelse(bh); -+ spin_unlock(&jh_splice_lock); -+ journal_free_journal_head(jh); -+ } else { -+ BUFFER_TRACE(bh, "journal_head was locked"); -+ } -+ } -+} -+ -+void journal_unlock_journal_head(struct journal_head *jh) -+{ -+ spin_lock(&journal_datalist_lock); -+ J_ASSERT_JH(jh, jh->b_jcount > 0); -+ --jh->b_jcount; -+ if (!jh->b_jcount && !jh->b_transaction) { -+ struct buffer_head *bh; -+ bh = jh2bh(jh); -+ __journal_remove_journal_head(bh); -+ __brelse(bh); -+ } -+ -+ spin_unlock(&journal_datalist_lock); -+} -+ -+void journal_remove_journal_head(struct buffer_head *bh) -+{ -+ spin_lock(&journal_datalist_lock); -+ __journal_remove_journal_head(bh); -+ spin_unlock(&journal_datalist_lock); -+} -+ -+/* -+ * /proc tunables -+ */ -+#if defined(CONFIG_JBD_DEBUG) -+int journal_enable_debug; -+EXPORT_SYMBOL(journal_enable_debug); -+#endif -+ -+#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS) -+ -+static struct proc_dir_entry *proc_jbd_debug; -+ -+int read_jbd_debug(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int ret; -+ -+ ret = sprintf(page + off, "%d\n", journal_enable_debug); -+ *eof = 1; -+ return ret; -+} -+ -+int write_jbd_debug(struct file *file, const char *buffer, -+ 
unsigned long count, void *data) -+{ -+ char buf[32]; -+ -+ if (count > ARRAY_SIZE(buf) - 1) -+ count = ARRAY_SIZE(buf) - 1; -+ if (copy_from_user(buf, buffer, count)) -+ return -EFAULT; -+ buf[ARRAY_SIZE(buf) - 1] = '\0'; -+ journal_enable_debug = simple_strtoul(buf, NULL, 10); -+ return count; -+} -+ -+#define JBD_PROC_NAME "sys/fs/jbd-debug" -+ -+static void __init create_jbd_proc_entry(void) -+{ -+ proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL); -+ if (proc_jbd_debug) { -+ /* Why is this so hard? */ -+ proc_jbd_debug->read_proc = read_jbd_debug; -+ proc_jbd_debug->write_proc = write_jbd_debug; -+ } -+} -+ -+static void __exit remove_jbd_proc_entry(void) -+{ -+ if (proc_jbd_debug) -+ remove_proc_entry(JBD_PROC_NAME, NULL); -+} -+ -+#else -+ -+#define create_jbd_proc_entry() do {} while (0) -+#define remove_jbd_proc_entry() do {} while (0) -+ -+#endif -+ -+/* -+ * Module startup and shutdown -+ */ -+ -+static int __init journal_init_caches(void) -+{ -+ int ret; -+ -+ ret = journal_init_revoke_caches(); -+ if (ret == 0) -+ ret = journal_init_journal_head_cache(); -+ return ret; -+} -+ -+static void journal_destroy_caches(void) -+{ -+ journal_destroy_revoke_caches(); -+ journal_destroy_journal_head_cache(); -+} -+ -+static int __init journal_init(void) -+{ -+ int ret; -+ -+ printk(KERN_INFO "Journalled Block Device driver loaded\n"); -+ ret = journal_init_caches(); -+ if (ret != 0) -+ journal_destroy_caches(); -+ create_jbd_proc_entry(); -+ return ret; -+} -+ -+static void __exit journal_exit(void) -+{ -+#ifdef CONFIG_JBD_DEBUG -+ int n = atomic_read(&nr_journal_heads); -+ if (n) -+ printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); -+#endif -+ remove_jbd_proc_entry(); -+ journal_destroy_caches(); -+} -+ -+MODULE_LICENSE("GPL"); -+module_init(journal_init); -+module_exit(journal_exit); -+ -diff -ruP linux.mcp2/fs/jbd/recovery.c linuxppc_2.4.19_final/fs/jbd/recovery.c ---- linux.mcp2/fs/jbd/recovery.c 1969-12-31 16:00:00.000000000 -0800 -+++ 
linuxppc_2.4.19_final/fs/jbd/recovery.c 2004-05-17 13:56:17.000000000 -0700 -@@ -0,0 +1,589 @@ -+/* -+ * linux/fs/recovery.c -+ * -+ * Written by Stephen C. Tweedie , 1999 -+ * -+ * Copyright 1999-2000 Red Hat Software --- All Rights Reserved -+ * -+ * This file is part of the Linux kernel and is made available under -+ * the terms of the GNU General Public License, version 2, or at your -+ * option, any later version, incorporated herein by reference. -+ * -+ * Journal recovery routines for the generic filesystem journaling code; -+ * part of the ext2fs journaling system. -+ */ -+ -+#ifndef __KERNEL__ -+#include "jfs_user.h" -+#else -+#include -+#include -+#include -+#include -+#include -+#include -+#endif -+ -+/* -+ * Maintain information about the progress of the recovery job, so that -+ * the different passes can carry information between them. -+ */ -+struct recovery_info -+{ -+ tid_t start_transaction; -+ tid_t end_transaction; -+ -+ int nr_replays; -+ int nr_revokes; -+ int nr_revoke_hits; -+}; -+ -+enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; -+static int do_one_pass(journal_t *journal, -+ struct recovery_info *info, enum passtype pass); -+static int scan_revoke_records(journal_t *, struct buffer_head *, -+ tid_t, struct recovery_info *); -+ -+#ifdef __KERNEL__ -+ -+/* Release readahead buffers after use */ -+void journal_brelse_array(struct buffer_head *b[], int n) -+{ -+ while (--n >= 0) -+ brelse (b[n]); -+} -+ -+ -+/* -+ * When reading from the journal, we are going through the block device -+ * layer directly and so there is no readahead being done for us. We -+ * need to implement any readahead ourselves if we want it to happen at -+ * all. Recovery is basically one long sequential read, so make sure we -+ * do the IO in reasonably large chunks. -+ * -+ * This is not so critical that we need to be enormously clever about -+ * the readahead size, though. 128K is a purely arbitrary, good-enough -+ * fixed value. 
-+ */ -+ -+#define MAXBUF 8 -+static int do_readahead(journal_t *journal, unsigned int start) -+{ -+ int err; -+ unsigned int max, nbufs, next; -+ unsigned long blocknr; -+ struct buffer_head *bh; -+ -+ struct buffer_head * bufs[MAXBUF]; -+ -+ /* Do up to 128K of readahead */ -+ max = start + (128 * 1024 / journal->j_blocksize); -+ if (max > journal->j_maxlen) -+ max = journal->j_maxlen; -+ -+ /* Do the readahead itself. We'll submit MAXBUF buffer_heads at -+ * a time to the block device IO layer. */ -+ -+ nbufs = 0; -+ -+ for (next = start; next < max; next++) { -+ err = journal_bmap(journal, next, &blocknr); -+ -+ if (err) { -+ printk (KERN_ERR "JBD: bad block at offset %u\n", -+ next); -+ goto failed; -+ } -+ -+ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); -+ if (!bh) { -+ err = -ENOMEM; -+ goto failed; -+ } -+ -+ if (!buffer_uptodate(bh) && !buffer_locked(bh)) { -+ bufs[nbufs++] = bh; -+ if (nbufs == MAXBUF) { -+ ll_rw_block(READ, nbufs, bufs); -+ journal_brelse_array(bufs, nbufs); -+ nbufs = 0; -+ } -+ } else -+ brelse(bh); -+ } -+ -+ if (nbufs) -+ ll_rw_block(READ, nbufs, bufs); -+ err = 0; -+ -+failed: -+ if (nbufs) -+ journal_brelse_array(bufs, nbufs); -+ return err; -+} -+ -+#endif /* __KERNEL__ */ -+ -+ -+/* -+ * Read a block from the journal -+ */ -+ -+static int jread(struct buffer_head **bhp, journal_t *journal, -+ unsigned int offset) -+{ -+ int err; -+ unsigned long blocknr; -+ struct buffer_head *bh; -+ -+ *bhp = NULL; -+ -+ J_ASSERT (offset < journal->j_maxlen); -+ -+ err = journal_bmap(journal, offset, &blocknr); -+ -+ if (err) { -+ printk (KERN_ERR "JBD: bad block at offset %u\n", -+ offset); -+ return err; -+ } -+ -+ bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); -+ if (!bh) -+ return -ENOMEM; -+ -+ if (!buffer_uptodate(bh)) { -+ /* If this is a brand new buffer, start readahead. -+ Otherwise, we assume we are already reading it. 
*/ -+ if (!buffer_req(bh)) -+ do_readahead(journal, offset); -+ wait_on_buffer(bh); -+ } -+ -+ if (!buffer_uptodate(bh)) { -+ printk (KERN_ERR "JBD: Failed to read block at offset %u\n", -+ offset); -+ brelse(bh); -+ return -EIO; -+ } -+ -+ *bhp = bh; -+ return 0; -+} -+ -+ -+/* -+ * Count the number of in-use tags in a journal descriptor block. -+ */ -+ -+static int count_tags(struct buffer_head *bh, int size) -+{ -+ char * tagp; -+ journal_block_tag_t * tag; -+ int nr = 0; -+ -+ tagp = &bh->b_data[sizeof(journal_header_t)]; -+ -+ while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) { -+ tag = (journal_block_tag_t *) tagp; -+ -+ nr++; -+ tagp += sizeof(journal_block_tag_t); -+ if (!(tag->t_flags & htonl(JFS_FLAG_SAME_UUID))) -+ tagp += 16; -+ -+ if (tag->t_flags & htonl(JFS_FLAG_LAST_TAG)) -+ break; -+ } -+ -+ return nr; -+} -+ -+ -+/* Make sure we wrap around the log correctly! */ -+#define wrap(journal, var) \ -+do { \ -+ if (var >= (journal)->j_last) \ -+ var -= ((journal)->j_last - (journal)->j_first); \ -+} while (0) -+ -+/* -+ * journal_recover -+ * -+ * The primary function for recovering the log contents when mounting a -+ * journaled device. -+ * -+ * Recovery is done in three passes. In the first pass, we look for the -+ * end of the log. In the second, we assemble the list of revoke -+ * blocks. In the third and final pass, we replay any un-revoked blocks -+ * in the log. -+ */ -+ -+int journal_recover(journal_t *journal) -+{ -+ int err; -+ journal_superblock_t * sb; -+ -+ struct recovery_info info; -+ -+ memset(&info, 0, sizeof(info)); -+ sb = journal->j_superblock; -+ -+ /* -+ * The journal superblock's s_start field (the current log head) -+ * is always zero if, and only if, the journal was cleanly -+ * unmounted. 
-+ */ -+ -+ if (!sb->s_start) { -+ jbd_debug(1, "No recovery required, last transaction %d\n", -+ ntohl(sb->s_sequence)); -+ journal->j_transaction_sequence = ntohl(sb->s_sequence) + 1; -+ return 0; -+ } -+ -+ -+ err = do_one_pass(journal, &info, PASS_SCAN); -+ if (!err) -+ err = do_one_pass(journal, &info, PASS_REVOKE); -+ if (!err) -+ err = do_one_pass(journal, &info, PASS_REPLAY); -+ -+ jbd_debug(0, "JBD: recovery, exit status %d, " -+ "recovered transactions %u to %u\n", -+ err, info.start_transaction, info.end_transaction); -+ jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", -+ info.nr_replays, info.nr_revoke_hits, info.nr_revokes); -+ -+ /* Restart the log at the next transaction ID, thus invalidating -+ * any existing commit records in the log. */ -+ journal->j_transaction_sequence = ++info.end_transaction; -+ -+ journal_clear_revoke(journal); -+ fsync_no_super(journal->j_fs_dev); -+ return err; -+} -+ -+/* -+ * journal_skip_recovery -+ * -+ * Locate any valid recovery information from the journal and set up the -+ * journal structures in memory to ignore it (presumably because the -+ * caller has evidence that it is out of date). -+ * -+ * We perform one pass over the journal to allow us to tell the user how -+ * much recovery information is being erased, and to let us initialise -+ * the journal transaction sequence numbers to the next unused ID. -+ */ -+ -+int journal_skip_recovery(journal_t *journal) -+{ -+ int err; -+ journal_superblock_t * sb; -+ -+ struct recovery_info info; -+ -+ memset (&info, 0, sizeof(info)); -+ sb = journal->j_superblock; -+ -+ err = do_one_pass(journal, &info, PASS_SCAN); -+ -+ if (err) { -+ printk(KERN_ERR "JBD: error %d scanning journal\n", err); -+ ++journal->j_transaction_sequence; -+ } else { -+#ifdef CONFIG_JBD_DEBUG -+ int dropped = info.end_transaction - ntohl(sb->s_sequence); -+#endif -+ -+ jbd_debug(0, -+ "JBD: ignoring %d transaction%s from the journal.\n", -+ dropped, (dropped == 1) ? 
"" : "s"); -+ journal->j_transaction_sequence = ++info.end_transaction; -+ } -+ -+ journal->j_tail = 0; -+ -+ return err; -+} -+ -+static int do_one_pass(journal_t *journal, -+ struct recovery_info *info, enum passtype pass) -+{ -+ -+ unsigned int first_commit_ID, next_commit_ID; -+ unsigned long next_log_block; -+ int err, success = 0; -+ journal_superblock_t * sb; -+ journal_header_t * tmp; -+ struct buffer_head * bh; -+ unsigned int sequence; -+ int blocktype; -+ -+ /* Precompute the maximum metadata descriptors in a descriptor block */ -+ int MAX_BLOCKS_PER_DESC; -+ MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) -+ / sizeof(journal_block_tag_t)); -+ -+ /* -+ * First thing is to establish what we expect to find in the log -+ * (in terms of transaction IDs), and where (in terms of log -+ * block offsets): query the superblock. -+ */ -+ -+ sb = journal->j_superblock; -+ next_commit_ID = ntohl(sb->s_sequence); -+ next_log_block = ntohl(sb->s_start); -+ -+ first_commit_ID = next_commit_ID; -+ if (pass == PASS_SCAN) -+ info->start_transaction = first_commit_ID; -+ -+ jbd_debug(1, "Starting recovery pass %d\n", pass); -+ -+ /* -+ * Now we walk through the log, transaction by transaction, -+ * making sure that each transaction has a commit block in the -+ * expected place. Each complete transaction gets replayed back -+ * into the main filesystem. -+ */ -+ -+ while (1) { -+ int flags; -+ char * tagp; -+ journal_block_tag_t * tag; -+ struct buffer_head * obh; -+ struct buffer_head * nbh; -+ -+ /* If we already know where to stop the log traversal, -+ * check right now that we haven't gone past the end of -+ * the log. 
*/ -+ -+ if (pass != PASS_SCAN) -+ if (tid_geq(next_commit_ID, info->end_transaction)) -+ break; -+ -+ jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", -+ next_commit_ID, next_log_block, journal->j_last); -+ -+ /* Skip over each chunk of the transaction looking -+ * either the next descriptor block or the final commit -+ * record. */ -+ -+ jbd_debug(3, "JBD: checking block %ld\n", next_log_block); -+ err = jread(&bh, journal, next_log_block); -+ if (err) -+ goto failed; -+ -+ next_log_block++; -+ wrap(journal, next_log_block); -+ -+ /* What kind of buffer is it? -+ * -+ * If it is a descriptor block, check that it has the -+ * expected sequence number. Otherwise, we're all done -+ * here. */ -+ -+ tmp = (journal_header_t *)bh->b_data; -+ -+ if (tmp->h_magic != htonl(JFS_MAGIC_NUMBER)) { -+ brelse(bh); -+ break; -+ } -+ -+ blocktype = ntohl(tmp->h_blocktype); -+ sequence = ntohl(tmp->h_sequence); -+ jbd_debug(3, "Found magic %d, sequence %d\n", -+ blocktype, sequence); -+ -+ if (sequence != next_commit_ID) { -+ brelse(bh); -+ break; -+ } -+ -+ /* OK, we have a valid descriptor block which matches -+ * all of the sequence number checks. What are we going -+ * to do with it? That depends on the pass... */ -+ -+ switch(blocktype) { -+ case JFS_DESCRIPTOR_BLOCK: -+ /* If it is a valid descriptor block, replay it -+ * in pass REPLAY; otherwise, just skip over the -+ * blocks it describes. */ -+ if (pass != PASS_REPLAY) { -+ next_log_block += -+ count_tags(bh, journal->j_blocksize); -+ wrap(journal, next_log_block); -+ brelse(bh); -+ continue; -+ } -+ -+ /* A descriptor block: we can now write all of -+ * the data blocks. Yay, useful work is finally -+ * getting done here! 
*/ -+ -+ tagp = &bh->b_data[sizeof(journal_header_t)]; -+ while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) -+ <= journal->j_blocksize) { -+ unsigned long io_block; -+ -+ tag = (journal_block_tag_t *) tagp; -+ flags = ntohl(tag->t_flags); -+ -+ io_block = next_log_block++; -+ wrap(journal, next_log_block); -+ err = jread(&obh, journal, io_block); -+ if (err) { -+ /* Recover what we can, but -+ * report failure at the end. */ -+ success = err; -+ printk (KERN_ERR -+ "JBD: IO error %d recovering " -+ "block %ld in log\n", -+ err, io_block); -+ } else { -+ unsigned long blocknr; -+ -+ J_ASSERT(obh != NULL); -+ blocknr = ntohl(tag->t_blocknr); -+ -+ /* If the block has been -+ * revoked, then we're all done -+ * here. */ -+ if (journal_test_revoke -+ (journal, blocknr, -+ next_commit_ID)) { -+ brelse(obh); -+ ++info->nr_revoke_hits; -+ goto skip_write; -+ } -+ -+ /* Find a buffer for the new -+ * data being restored */ -+ nbh = getblk(journal->j_fs_dev, blocknr, -+ journal->j_blocksize); -+ if (nbh == NULL) { -+ printk(KERN_ERR -+ "JBD: Out of memory " -+ "during recovery.\n"); -+ err = -ENOMEM; -+ brelse(bh); -+ brelse(obh); -+ goto failed; -+ } -+ -+ lock_buffer(nbh); -+ memcpy(nbh->b_data, obh->b_data, -+ journal->j_blocksize); -+ if (flags & JFS_FLAG_ESCAPE) { -+ *((unsigned int *)bh->b_data) = -+ htonl(JFS_MAGIC_NUMBER); -+ } -+ -+ BUFFER_TRACE(nbh, "marking dirty"); -+ mark_buffer_dirty(nbh); -+ BUFFER_TRACE(nbh, "marking uptodate"); -+ mark_buffer_uptodate(nbh, 1); -+ unlock_buffer(nbh); -+ ++info->nr_replays; -+ /* ll_rw_block(WRITE, 1, &nbh); */ -+ brelse(obh); -+ brelse(nbh); -+ } -+ -+ skip_write: -+ tagp += sizeof(journal_block_tag_t); -+ if (!(flags & JFS_FLAG_SAME_UUID)) -+ tagp += 16; -+ -+ if (flags & JFS_FLAG_LAST_TAG) -+ break; -+ } -+ -+ brelse(bh); -+ continue; -+ -+ case JFS_COMMIT_BLOCK: -+ /* Found an expected commit block: not much to -+ * do other than move on to the next sequence -+ * number. 
*/ -+ brelse(bh); -+ next_commit_ID++; -+ continue; -+ -+ case JFS_REVOKE_BLOCK: -+ /* If we aren't in the REVOKE pass, then we can -+ * just skip over this block. */ -+ if (pass != PASS_REVOKE) { -+ brelse(bh); -+ continue; -+ } -+ -+ err = scan_revoke_records(journal, bh, -+ next_commit_ID, info); -+ brelse(bh); -+ if (err) -+ goto failed; -+ continue; -+ -+ default: -+ jbd_debug(3, "Unrecognised magic %d, end of scan.\n", -+ blocktype); -+ goto done; -+ } -+ } -+ -+ done: -+ /* -+ * We broke out of the log scan loop: either we came to the -+ * known end of the log or we found an unexpected block in the -+ * log. If the latter happened, then we know that the "current" -+ * transaction marks the end of the valid log. -+ */ -+ -+ if (pass == PASS_SCAN) -+ info->end_transaction = next_commit_ID; -+ else { -+ /* It's really bad news if different passes end up at -+ * different places (but possible due to IO errors). */ -+ if (info->end_transaction != next_commit_ID) { -+ printk (KERN_ERR "JBD: recovery pass %d ended at " -+ "transaction %u, expected %u\n", -+ pass, next_commit_ID, info->end_transaction); -+ if (!success) -+ success = -EIO; -+ } -+ } -+ -+ return success; -+ -+ failed: -+ return err; -+} -+ -+ -+/* Scan a revoke record, marking all blocks mentioned as revoked. 
*/ -+ -+static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, -+ tid_t sequence, struct recovery_info *info) -+{ -+ journal_revoke_header_t *header; -+ int offset, max; -+ -+ header = (journal_revoke_header_t *) bh->b_data; -+ offset = sizeof(journal_revoke_header_t); -+ max = ntohl(header->r_count); -+ -+ while (offset < max) { -+ unsigned long blocknr; -+ int err; -+ -+ blocknr = ntohl(* ((unsigned int *) (bh->b_data+offset))); -+ offset += 4; -+ err = journal_set_revoke(journal, blocknr, sequence); -+ if (err) -+ return err; -+ ++info->nr_revokes; -+ } -+ return 0; -+} -diff -ruP linux.mcp2/fs/jbd/revoke.c linuxppc_2.4.19_final/fs/jbd/revoke.c ---- linux.mcp2/fs/jbd/revoke.c 1969-12-31 16:00:00.000000000 -0800 -+++ linuxppc_2.4.19_final/fs/jbd/revoke.c 2004-05-17 13:56:17.000000000 -0700 -@@ -0,0 +1,636 @@ -+/* -+ * linux/fs/revoke.c -+ * -+ * Written by Stephen C. Tweedie , 2000 -+ * -+ * Copyright 2000 Red Hat corp --- All Rights Reserved -+ * -+ * This file is part of the Linux kernel and is made available under -+ * the terms of the GNU General Public License, version 2, or at your -+ * option, any later version, incorporated herein by reference. -+ * -+ * Journal revoke routines for the generic filesystem journaling code; -+ * part of the ext2fs journaling system. -+ * -+ * Revoke is the mechanism used to prevent old log records for deleted -+ * metadata from being replayed on top of newer data using the same -+ * blocks. The revoke mechanism is used in two separate places: -+ * -+ * + Commit: during commit we write the entire list of the current -+ * transaction's revoked blocks to the journal -+ * -+ * + Recovery: during recovery we record the transaction ID of all -+ * revoked blocks. If there are multiple revoke records in the log -+ * for a single block, only the last one counts, and if there is a log -+ * entry for a block beyond the last revoke, then that log entry still -+ * gets replayed. 
-+ * -+ * We can get interactions between revokes and new log data within a -+ * single transaction: -+ * -+ * Block is revoked and then journaled: -+ * The desired end result is the journaling of the new block, so we -+ * cancel the revoke before the transaction commits. -+ * -+ * Block is journaled and then revoked: -+ * The revoke must take precedence over the write of the block, so we -+ * need either to cancel the journal entry or to write the revoke -+ * later in the log than the log block. In this case, we choose the -+ * latter: journaling a block cancels any revoke record for that block -+ * in the current transaction, so any revoke for that block in the -+ * transaction must have happened after the block was journaled and so -+ * the revoke must take precedence. -+ * -+ * Block is revoked and then written as data: -+ * The data write is allowed to succeed, but the revoke is _not_ -+ * cancelled. We still need to prevent old log records from -+ * overwriting the new data. We don't even need to clear the revoke -+ * bit here. -+ * -+ * Revoke information on buffers is a tri-state value: -+ * -+ * RevokeValid clear: no cached revoke status, need to look it up -+ * RevokeValid set, Revoked clear: -+ * buffer has not been revoked, and cancel_revoke -+ * need do nothing. -+ * RevokeValid set, Revoked set: -+ * buffer has been revoked. -+ */ -+ -+#ifndef __KERNEL__ -+#include "jfs_user.h" -+#else -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#endif -+ -+static kmem_cache_t *revoke_record_cache; -+static kmem_cache_t *revoke_table_cache; -+ -+/* Each revoke record represents one single revoked block. During -+ journal replay, this involves recording the transaction ID of the -+ last transaction to revoke this block. 
*/ -+ -+struct jbd_revoke_record_s -+{ -+ struct list_head hash; -+ tid_t sequence; /* Used for recovery only */ -+ unsigned long blocknr; -+}; -+ -+ -+/* The revoke table is just a simple hash table of revoke records. */ -+struct jbd_revoke_table_s -+{ -+ /* It is conceivable that we might want a larger hash table -+ * for recovery. Must be a power of two. */ -+ int hash_size; -+ int hash_shift; -+ struct list_head *hash_table; -+}; -+ -+ -+#ifdef __KERNEL__ -+static void write_one_revoke_record(journal_t *, transaction_t *, -+ struct journal_head **, int *, -+ struct jbd_revoke_record_s *); -+static void flush_descriptor(journal_t *, struct journal_head *, int); -+#endif -+ -+/* Utility functions to maintain the revoke table */ -+ -+/* Borrowed from buffer.c: this is a tried and tested block hash function */ -+static inline int hash(journal_t *journal, unsigned long block) -+{ -+ struct jbd_revoke_table_s *table = journal->j_revoke; -+ int hash_shift = table->hash_shift; -+ -+ return ((block << (hash_shift - 6)) ^ -+ (block >> 13) ^ -+ (block << (hash_shift - 12))) & (table->hash_size - 1); -+} -+ -+int insert_revoke_hash(journal_t *journal, unsigned long blocknr, tid_t seq) -+{ -+ struct list_head *hash_list; -+ struct jbd_revoke_record_s *record; -+ -+repeat: -+ record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS); -+ if (!record) -+ goto oom; -+ -+ record->sequence = seq; -+ record->blocknr = blocknr; -+ hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; -+ list_add(&record->hash, hash_list); -+ return 0; -+ -+oom: -+ if (!journal_oom_retry) -+ return -ENOMEM; -+ jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n"); -+ current->policy |= SCHED_YIELD; -+ schedule(); -+ goto repeat; -+} -+ -+/* Find a revoke record in the journal's hash table. 
*/ -+ -+static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, -+ unsigned long blocknr) -+{ -+ struct list_head *hash_list; -+ struct jbd_revoke_record_s *record; -+ -+ hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; -+ -+ record = (struct jbd_revoke_record_s *) hash_list->next; -+ while (&(record->hash) != hash_list) { -+ if (record->blocknr == blocknr) -+ return record; -+ record = (struct jbd_revoke_record_s *) record->hash.next; -+ } -+ return NULL; -+} -+ -+int __init journal_init_revoke_caches(void) -+{ -+ revoke_record_cache = kmem_cache_create("revoke_record", -+ sizeof(struct jbd_revoke_record_s), -+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL); -+ if (revoke_record_cache == 0) -+ return -ENOMEM; -+ -+ revoke_table_cache = kmem_cache_create("revoke_table", -+ sizeof(struct jbd_revoke_table_s), -+ 0, 0, NULL, NULL); -+ if (revoke_table_cache == 0) { -+ kmem_cache_destroy(revoke_record_cache); -+ revoke_record_cache = NULL; -+ return -ENOMEM; -+ } -+ return 0; -+} -+ -+void journal_destroy_revoke_caches(void) -+{ -+ kmem_cache_destroy(revoke_record_cache); -+ revoke_record_cache = 0; -+ kmem_cache_destroy(revoke_table_cache); -+ revoke_table_cache = 0; -+} -+ -+/* Initialise the revoke table for a given journal to a given size. 
*/ -+ -+int journal_init_revoke(journal_t *journal, int hash_size) -+{ -+ int shift, tmp; -+ -+ J_ASSERT (journal->j_revoke == NULL); -+ -+ journal->j_revoke = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); -+ if (!journal->j_revoke) -+ return -ENOMEM; -+ -+ /* Check that the hash_size is a power of two */ -+ J_ASSERT ((hash_size & (hash_size-1)) == 0); -+ -+ journal->j_revoke->hash_size = hash_size; -+ -+ shift = 0; -+ tmp = hash_size; -+ while((tmp >>= 1UL) != 0UL) -+ shift++; -+ journal->j_revoke->hash_shift = shift; -+ -+ journal->j_revoke->hash_table = -+ kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); -+ if (!journal->j_revoke->hash_table) { -+ kmem_cache_free(revoke_table_cache, journal->j_revoke); -+ journal->j_revoke = NULL; -+ return -ENOMEM; -+ } -+ -+ for (tmp = 0; tmp < hash_size; tmp++) -+ INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); -+ -+ return 0; -+} -+ -+/* Destoy a journal's revoke table. The table must already be empty! */ -+ -+void journal_destroy_revoke(journal_t *journal) -+{ -+ struct jbd_revoke_table_s *table; -+ struct list_head *hash_list; -+ int i; -+ -+ table = journal->j_revoke; -+ if (!table) -+ return; -+ -+ for (i=0; ihash_size; i++) { -+ hash_list = &table->hash_table[i]; -+ J_ASSERT (list_empty(hash_list)); -+ } -+ -+ kfree(table->hash_table); -+ kmem_cache_free(revoke_table_cache, table); -+ journal->j_revoke = NULL; -+} -+ -+ -+#ifdef __KERNEL__ -+ -+/* -+ * journal_revoke: revoke a given buffer_head from the journal. This -+ * prevents the block from being replayed during recovery if we take a -+ * crash after this current transaction commits. Any subsequent -+ * metadata writes of the buffer in this transaction cancel the -+ * revoke. -+ * -+ * Note that this call may block --- it is up to the caller to make -+ * sure that there are no further calls to journal_write_metadata -+ * before the revoke is complete. 
In ext3, this implies calling the -+ * revoke before clearing the block bitmap when we are deleting -+ * metadata. -+ * -+ * Revoke performs a journal_forget on any buffer_head passed in as a -+ * parameter, but does _not_ forget the buffer_head if the bh was only -+ * found implicitly. -+ * -+ * bh_in may not be a journalled buffer - it may have come off -+ * the hash tables without an attached journal_head. -+ * -+ * If bh_in is non-zero, journal_revoke() will decrement its b_count -+ * by one. -+ */ -+ -+int journal_revoke(handle_t *handle, unsigned long blocknr, -+ struct buffer_head *bh_in) -+{ -+ struct buffer_head *bh = NULL; -+ journal_t *journal; -+ kdev_t dev; -+ int err; -+ -+ if (bh_in) -+ BUFFER_TRACE(bh_in, "enter"); -+ -+ journal = handle->h_transaction->t_journal; -+ if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){ -+ J_ASSERT (!"Cannot set revoke feature!"); -+ return -EINVAL; -+ } -+ -+ dev = journal->j_fs_dev; -+ bh = bh_in; -+ -+ if (!bh) { -+ bh = get_hash_table(dev, blocknr, journal->j_blocksize); -+ if (bh) -+ BUFFER_TRACE(bh, "found on hash"); -+ } -+#ifdef JBD_EXPENSIVE_CHECKING -+ else { -+ struct buffer_head *bh2; -+ -+ /* If there is a different buffer_head lying around in -+ * memory anywhere... */ -+ bh2 = get_hash_table(dev, blocknr, journal->j_blocksize); -+ if (bh2) { -+ /* ... and it has RevokeValid status... */ -+ if ((bh2 != bh) && -+ test_bit(BH_RevokeValid, &bh2->b_state)) -+ /* ...then it better be revoked too, -+ * since it's illegal to create a revoke -+ * record against a buffer_head which is -+ * not marked revoked --- that would -+ * risk missing a subsequent revoke -+ * cancel. */ -+ J_ASSERT_BH(bh2, test_bit(BH_Revoked, & -+ bh2->b_state)); -+ __brelse(bh2); -+ } -+ } -+#endif -+ -+ /* We really ought not ever to revoke twice in a row without -+ first having the revoke cancelled: it's illegal to free a -+ block twice without allocating it in between! 
*/ -+ if (bh) { -+ J_ASSERT_BH(bh, !test_bit(BH_Revoked, &bh->b_state)); -+ set_bit(BH_Revoked, &bh->b_state); -+ set_bit(BH_RevokeValid, &bh->b_state); -+ if (bh_in) { -+ BUFFER_TRACE(bh_in, "call journal_forget"); -+ journal_forget(handle, bh_in); -+ } else { -+ BUFFER_TRACE(bh, "call brelse"); -+ __brelse(bh); -+ } -+ } -+ -+ lock_journal(journal); -+ jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in); -+ err = insert_revoke_hash(journal, blocknr, -+ handle->h_transaction->t_tid); -+ unlock_journal(journal); -+ BUFFER_TRACE(bh_in, "exit"); -+ return err; -+} -+ -+/* -+ * Cancel an outstanding revoke. For use only internally by the -+ * journaling code (called from journal_get_write_access). -+ * -+ * We trust the BH_Revoked bit on the buffer if the buffer is already -+ * being journaled: if there is no revoke pending on the buffer, then we -+ * don't do anything here. -+ * -+ * This would break if it were possible for a buffer to be revoked and -+ * discarded, and then reallocated within the same transaction. In such -+ * a case we would have lost the revoked bit, but when we arrived here -+ * the second time we would still have a pending revoke to cancel. So, -+ * do not trust the Revoked bit on buffers unless RevokeValid is also -+ * set. -+ * -+ * The caller must have the journal locked. -+ */ -+int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) -+{ -+ struct jbd_revoke_record_s *record; -+ journal_t *journal = handle->h_transaction->t_journal; -+ int need_cancel; -+ int did_revoke = 0; /* akpm: debug */ -+ struct buffer_head *bh = jh2bh(jh); -+ -+ jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); -+ -+ /* Is the existing Revoke bit valid? If so, we trust it, and -+ * only perform the full cancel if the revoke bit is set. If -+ * not, we can't trust the revoke bit, and we need to do the -+ * full search for a revoke record. 
*/ -+ if (test_and_set_bit(BH_RevokeValid, &bh->b_state)) -+ need_cancel = (test_and_clear_bit(BH_Revoked, &bh->b_state)); -+ else { -+ need_cancel = 1; -+ clear_bit(BH_Revoked, &bh->b_state); -+ } -+ -+ if (need_cancel) { -+ record = find_revoke_record(journal, bh->b_blocknr); -+ if (record) { -+ jbd_debug(4, "cancelled existing revoke on " -+ "blocknr %lu\n", bh->b_blocknr); -+ list_del(&record->hash); -+ kmem_cache_free(revoke_record_cache, record); -+ did_revoke = 1; -+ } -+ } -+ -+#ifdef JBD_EXPENSIVE_CHECKING -+ /* There better not be one left behind by now! */ -+ record = find_revoke_record(journal, bh->b_blocknr); -+ J_ASSERT_JH(jh, record == NULL); -+#endif -+ -+ /* Finally, have we just cleared revoke on an unhashed -+ * buffer_head? If so, we'd better make sure we clear the -+ * revoked status on any hashed alias too, otherwise the revoke -+ * state machine will get very upset later on. */ -+ if (need_cancel && !bh->b_pprev) { -+ struct buffer_head *bh2; -+ bh2 = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); -+ if (bh2) { -+ clear_bit(BH_Revoked, &bh2->b_state); -+ __brelse(bh2); -+ } -+ } -+ -+ return did_revoke; -+} -+ -+ -+/* -+ * Write revoke records to the journal for all entries in the current -+ * revoke hash, deleting the entries as we go. -+ * -+ * Called with the journal lock held. 
-+ */ -+ -+void journal_write_revoke_records(journal_t *journal, -+ transaction_t *transaction) -+{ -+ struct journal_head *descriptor; -+ struct jbd_revoke_record_s *record; -+ struct jbd_revoke_table_s *revoke; -+ struct list_head *hash_list; -+ int i, offset, count; -+ -+ descriptor = NULL; -+ offset = 0; -+ count = 0; -+ revoke = journal->j_revoke; -+ -+ for (i = 0; i < revoke->hash_size; i++) { -+ hash_list = &revoke->hash_table[i]; -+ -+ while (!list_empty(hash_list)) { -+ record = (struct jbd_revoke_record_s *) -+ hash_list->next; -+ write_one_revoke_record(journal, transaction, -+ &descriptor, &offset, -+ record); -+ count++; -+ list_del(&record->hash); -+ kmem_cache_free(revoke_record_cache, record); -+ } -+ } -+ if (descriptor) -+ flush_descriptor(journal, descriptor, offset); -+ jbd_debug(1, "Wrote %d revoke records\n", count); -+} -+ -+/* -+ * Write out one revoke record. We need to create a new descriptor -+ * block if the old one is full or if we have not already created one. -+ */ -+ -+static void write_one_revoke_record(journal_t *journal, -+ transaction_t *transaction, -+ struct journal_head **descriptorp, -+ int *offsetp, -+ struct jbd_revoke_record_s *record) -+{ -+ struct journal_head *descriptor; -+ int offset; -+ journal_header_t *header; -+ -+ /* If we are already aborting, this all becomes a noop. We -+ still need to go round the loop in -+ journal_write_revoke_records in order to free all of the -+ revoke records: only the IO to the journal is omitted. 
*/ -+ if (is_journal_aborted(journal)) -+ return; -+ -+ descriptor = *descriptorp; -+ offset = *offsetp; -+ -+ /* Make sure we have a descriptor with space left for the record */ -+ if (descriptor) { -+ if (offset == journal->j_blocksize) { -+ flush_descriptor(journal, descriptor, offset); -+ descriptor = NULL; -+ } -+ } -+ -+ if (!descriptor) { -+ descriptor = journal_get_descriptor_buffer(journal); -+ if (!descriptor) -+ return; -+ header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; -+ header->h_magic = htonl(JFS_MAGIC_NUMBER); -+ header->h_blocktype = htonl(JFS_REVOKE_BLOCK); -+ header->h_sequence = htonl(transaction->t_tid); -+ -+ /* Record it so that we can wait for IO completion later */ -+ JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); -+ journal_file_buffer(descriptor, transaction, BJ_LogCtl); -+ -+ offset = sizeof(journal_revoke_header_t); -+ *descriptorp = descriptor; -+ } -+ -+ * ((unsigned int *)(&jh2bh(descriptor)->b_data[offset])) = -+ htonl(record->blocknr); -+ offset += 4; -+ *offsetp = offset; -+} -+ -+/* -+ * Flush a revoke descriptor out to the journal. If we are aborting, -+ * this is a noop; otherwise we are generating a buffer which needs to -+ * be waited for during commit, so it has to go onto the appropriate -+ * journal buffer list. 
-+ */ -+ -+static void flush_descriptor(journal_t *journal, -+ struct journal_head *descriptor, -+ int offset) -+{ -+ journal_revoke_header_t *header; -+ -+ if (is_journal_aborted(journal)) { -+ JBUFFER_TRACE(descriptor, "brelse"); -+ unlock_buffer(jh2bh(descriptor)); -+ __brelse(jh2bh(descriptor)); -+ return; -+ } -+ -+ header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data; -+ header->r_count = htonl(offset); -+ set_bit(BH_JWrite, &jh2bh(descriptor)->b_state); -+ { -+ struct buffer_head *bh = jh2bh(descriptor); -+ BUFFER_TRACE(bh, "write"); -+ clear_bit(BH_Dirty, &bh->b_state); -+ bh->b_end_io = journal_end_buffer_io_sync; -+ submit_bh(WRITE, bh); -+ } -+} -+ -+#endif -+ -+/* -+ * Revoke support for recovery. -+ * -+ * Recovery needs to be able to: -+ * -+ * record all revoke records, including the tid of the latest instance -+ * of each revoke in the journal -+ * -+ * check whether a given block in a given transaction should be replayed -+ * (ie. has not been revoked by a revoke record in that or a subsequent -+ * transaction) -+ * -+ * empty the revoke table after recovery. -+ */ -+ -+/* -+ * First, setting revoke records. We create a new revoke record for -+ * every block ever revoked in the log as we scan it for recovery, and -+ * we update the existing records if we find multiple revokes for a -+ * single block. -+ */ -+ -+int journal_set_revoke(journal_t *journal, -+ unsigned long blocknr, -+ tid_t sequence) -+{ -+ struct jbd_revoke_record_s *record; -+ -+ record = find_revoke_record(journal, blocknr); -+ if (record) { -+ /* If we have multiple occurences, only record the -+ * latest sequence number in the hashed record */ -+ if (tid_gt(sequence, record->sequence)) -+ record->sequence = sequence; -+ return 0; -+ } -+ return insert_revoke_hash(journal, blocknr, sequence); -+} -+ -+/* -+ * Test revoke records. For a given block referenced in the log, has -+ * that block been revoked? 
A revoke record with a given transaction -+ * sequence number revokes all blocks in that transaction and earlier -+ * ones, but later transactions still need replayed. -+ */ -+ -+int journal_test_revoke(journal_t *journal, -+ unsigned long blocknr, -+ tid_t sequence) -+{ -+ struct jbd_revoke_record_s *record; -+ -+ record = find_revoke_record(journal, blocknr); -+ if (!record) -+ return 0; -+ if (tid_gt(sequence, record->sequence)) -+ return 0; -+ return 1; -+} -+ -+/* -+ * Finally, once recovery is over, we need to clear the revoke table so -+ * that it can be reused by the running filesystem. -+ */ -+ -+void journal_clear_revoke(journal_t *journal) -+{ -+ int i; -+ struct list_head *hash_list; -+ struct jbd_revoke_record_s *record; -+ struct jbd_revoke_table_s *revoke; -+ -+ revoke = journal->j_revoke; -+ -+ for (i = 0; i < revoke->hash_size; i++) { -+ hash_list = &revoke->hash_table[i]; -+ while (!list_empty(hash_list)) { -+ record = (struct jbd_revoke_record_s*) hash_list->next; -+ list_del(&record->hash); -+ kmem_cache_free(revoke_record_cache, record); -+ } -+ } -+} -+ -diff -ruP linux.mcp2/fs/jbd/transaction.c linuxppc_2.4.19_final/fs/jbd/transaction.c ---- linux.mcp2/fs/jbd/transaction.c 1969-12-31 16:00:00.000000000 -0800 -+++ linuxppc_2.4.19_final/fs/jbd/transaction.c 2004-05-17 13:56:17.000000000 -0700 -@@ -0,0 +1,2055 @@ -+/* -+ * linux/fs/transaction.c -+ * -+ * Written by Stephen C. Tweedie , 1998 -+ * -+ * Copyright 1998 Red Hat corp --- All Rights Reserved -+ * -+ * This file is part of the Linux kernel and is made available under -+ * the terms of the GNU General Public License, version 2, or at your -+ * option, any later version, incorporated herein by reference. -+ * -+ * Generic filesystem transaction handling code; part of the ext2fs -+ * journaling system. -+ * -+ * This file manages transactions (compound commits managed by the -+ * journaling code) and handles (individual atomic operations by the -+ * filesystem). 
-+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+extern spinlock_t journal_datalist_lock; -+ -+/* -+ * get_transaction: obtain a new transaction_t object. -+ * -+ * Simply allocate and initialise a new transaction. Create it in -+ * RUNNING state and add it to the current journal (which should not -+ * have an existing running transaction: we only make a new transaction -+ * once we have started to commit the old one). -+ * -+ * Preconditions: -+ * The journal MUST be locked. We don't perform atomic mallocs on the -+ * new transaction and we can't block without protecting against other -+ * processes trying to touch the journal while it is in transition. -+ */ -+ -+static transaction_t * get_transaction (journal_t * journal, int is_try) -+{ -+ transaction_t * transaction; -+ -+ transaction = jbd_kmalloc (sizeof (transaction_t), GFP_NOFS); -+ if (!transaction) -+ return NULL; -+ -+ memset (transaction, 0, sizeof (transaction_t)); -+ -+ transaction->t_journal = journal; -+ transaction->t_state = T_RUNNING; -+ transaction->t_tid = journal->j_transaction_sequence++; -+ transaction->t_expires = jiffies + journal->j_commit_interval; -+ -+ /* Set up the commit timer for the new transaction. */ -+ J_ASSERT (!journal->j_commit_timer_active); -+ journal->j_commit_timer_active = 1; -+ journal->j_commit_timer->expires = transaction->t_expires; -+ add_timer(journal->j_commit_timer); -+ -+ J_ASSERT (journal->j_running_transaction == NULL); -+ journal->j_running_transaction = transaction; -+ -+ return transaction; -+} -+ -+/* -+ * Handle management. -+ * -+ * A handle_t is an object which represents a single atomic update to a -+ * filesystem, and which tracks all of the modifications which form part -+ * of that one update. -+ */ -+ -+/* -+ * start_this_handle: Given a handle, deal with any locking or stalling -+ * needed to make sure that there is enough journal space for the handle -+ * to begin. 
Attach the handle to a transaction and set up the -+ * transaction's buffer credits. -+ */ -+ -+static int start_this_handle(journal_t *journal, handle_t *handle) -+{ -+ transaction_t *transaction; -+ int needed; -+ int nblocks = handle->h_buffer_credits; -+ -+ jbd_debug(3, "New handle %p going live.\n", handle); -+ -+repeat: -+ -+ lock_journal(journal); -+ -+repeat_locked: -+ -+ if (is_journal_aborted(journal) || -+ (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { -+ unlock_journal(journal); -+ return -EROFS; -+ } -+ -+ /* Wait on the journal's transaction barrier if necessary */ -+ if (journal->j_barrier_count) { -+ unlock_journal(journal); -+ sleep_on(&journal->j_wait_transaction_locked); -+ goto repeat; -+ } -+ -+ if (!journal->j_running_transaction) -+ get_transaction(journal, 0); -+ /* @@@ Error? */ -+ J_ASSERT(journal->j_running_transaction); -+ -+ transaction = journal->j_running_transaction; -+ -+ /* If the current transaction is locked down for commit, wait -+ * for the lock to be released. */ -+ -+ if (transaction->t_state == T_LOCKED) { -+ unlock_journal(journal); -+ jbd_debug(3, "Handle %p stalling...\n", handle); -+ sleep_on(&journal->j_wait_transaction_locked); -+ goto repeat; -+ } -+ -+ /* If there is not enough space left in the log to write all -+ * potential buffers requested by this operation, we need to -+ * stall pending a log checkpoint to free some more log -+ * space. */ -+ -+ needed = transaction->t_outstanding_credits + nblocks; -+ -+ if (needed > journal->j_max_transaction_buffers) { -+ /* If the current transaction is already too large, then -+ * start to commit it: we can then go back and attach -+ * this handle to a new transaction. 
*/ -+ -+ jbd_debug(2, "Handle %p starting new commit...\n", handle); -+ log_start_commit(journal, transaction); -+ unlock_journal(journal); -+ sleep_on(&journal->j_wait_transaction_locked); -+ lock_journal(journal); -+ goto repeat_locked; -+ } -+ -+ /* -+ * The commit code assumes that it can get enough log space -+ * without forcing a checkpoint. This is *critical* for -+ * correctness: a checkpoint of a buffer which is also -+ * associated with a committing transaction creates a deadlock, -+ * so commit simply cannot force through checkpoints. -+ * -+ * We must therefore ensure the necessary space in the journal -+ * *before* starting to dirty potentially checkpointed buffers -+ * in the new transaction. -+ * -+ * The worst part is, any transaction currently committing can -+ * reduce the free space arbitrarily. Be careful to account for -+ * those buffers when checkpointing. -+ */ -+ -+ /* -+ * @@@ AKPM: This seems rather over-defensive. We're giving commit -+ * a _lot_ of headroom: 1/4 of the journal plus the size of -+ * the committing transaction. Really, we only need to give it -+ * committing_transaction->t_outstanding_credits plus "enough" for -+ * the log control blocks. -+ * Also, this test is inconsitent with the matching one in -+ * journal_extend(). -+ */ -+ needed = journal->j_max_transaction_buffers; -+ if (journal->j_committing_transaction) -+ needed += journal->j_committing_transaction-> -+ t_outstanding_credits; -+ -+ if (log_space_left(journal) < needed) { -+ jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); -+ log_wait_for_space(journal, needed); -+ goto repeat_locked; -+ } -+ -+ /* OK, account for the buffers that this operation expects to -+ * use and add the handle to the running transaction. 
*/ -+ -+ handle->h_transaction = transaction; -+ transaction->t_outstanding_credits += nblocks; -+ transaction->t_updates++; -+ transaction->t_handle_count++; -+ jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", -+ handle, nblocks, transaction->t_outstanding_credits, -+ log_space_left(journal)); -+ -+ unlock_journal(journal); -+ -+ return 0; -+} -+ -+/* -+ * Obtain a new handle. -+ * -+ * We make sure that the transaction can guarantee at least nblocks of -+ * modified buffers in the log. We block until the log can guarantee -+ * that much space. -+ * -+ * This function is visible to journal users (like ext2fs), so is not -+ * called with the journal already locked. -+ * -+ * Return a pointer to a newly allocated handle, or NULL on failure -+ */ -+ -+handle_t *journal_start(journal_t *journal, int nblocks) -+{ -+ handle_t *handle = journal_current_handle(); -+ int err; -+ -+ if (!journal) -+ return ERR_PTR(-EROFS); -+ -+ if (handle) { -+ J_ASSERT(handle->h_transaction->t_journal == journal); -+ handle->h_ref++; -+ return handle; -+ } -+ -+ handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ if (!handle) -+ return ERR_PTR(-ENOMEM); -+ memset (handle, 0, sizeof (handle_t)); -+ -+ handle->h_buffer_credits = nblocks; -+ handle->h_ref = 1; -+ current->journal_info = handle; -+ -+ err = start_this_handle(journal, handle); -+ if (err < 0) { -+ kfree(handle); -+ current->journal_info = NULL; -+ return ERR_PTR(err); -+ } -+ -+ return handle; -+} -+ -+/* -+ * Return zero on success -+ */ -+static int try_start_this_handle(journal_t *journal, handle_t *handle) -+{ -+ transaction_t *transaction; -+ int needed; -+ int nblocks = handle->h_buffer_credits; -+ int ret = 0; -+ -+ jbd_debug(3, "New handle %p maybe going live.\n", handle); -+ -+ lock_journal(journal); -+ -+ if (is_journal_aborted(journal) || -+ (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { -+ ret = -EROFS; -+ goto fail_unlock; -+ } -+ -+ if (journal->j_barrier_count) -+ goto 
fail_unlock; -+ -+ if (!journal->j_running_transaction && get_transaction(journal, 1) == 0) -+ goto fail_unlock; -+ -+ transaction = journal->j_running_transaction; -+ if (transaction->t_state == T_LOCKED) -+ goto fail_unlock; -+ -+ needed = transaction->t_outstanding_credits + nblocks; -+ /* We could run log_start_commit here */ -+ if (needed > journal->j_max_transaction_buffers) -+ goto fail_unlock; -+ -+ needed = journal->j_max_transaction_buffers; -+ if (journal->j_committing_transaction) -+ needed += journal->j_committing_transaction-> -+ t_outstanding_credits; -+ -+ if (log_space_left(journal) < needed) -+ goto fail_unlock; -+ -+ handle->h_transaction = transaction; -+ transaction->t_outstanding_credits += nblocks; -+ transaction->t_updates++; -+ jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", -+ handle, nblocks, transaction->t_outstanding_credits, -+ log_space_left(journal)); -+ unlock_journal(journal); -+ return 0; -+ -+fail_unlock: -+ unlock_journal(journal); -+ if (ret >= 0) -+ ret = -1; -+ return ret; -+} -+ -+/* -+ * Try to start a handle, but non-blockingly. If we weren't able -+ * to, return an ERR_PTR value. 
-+ */ -+handle_t *journal_try_start(journal_t *journal, int nblocks) -+{ -+ handle_t *handle = journal_current_handle(); -+ int err; -+ -+ if (!journal) -+ return ERR_PTR(-EROFS); -+ -+ if (handle) { -+ jbd_debug(4, "h_ref %d -> %d\n", -+ handle->h_ref, -+ handle->h_ref + 1); -+ J_ASSERT(handle->h_transaction->t_journal == journal); -+ if (is_handle_aborted(handle)) -+ return ERR_PTR(-EIO); -+ handle->h_ref++; -+ return handle; -+ } else { -+ jbd_debug(4, "no current transaction\n"); -+ } -+ -+ if (is_journal_aborted(journal)) -+ return ERR_PTR(-EIO); -+ -+ handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ if (!handle) -+ return ERR_PTR(-ENOMEM); -+ memset (handle, 0, sizeof (handle_t)); -+ -+ handle->h_buffer_credits = nblocks; -+ handle->h_ref = 1; -+ current->journal_info = handle; -+ -+ err = try_start_this_handle(journal, handle); -+ if (err < 0) { -+ kfree(handle); -+ current->journal_info = NULL; -+ return ERR_PTR(err); -+ } -+ -+ return handle; -+} -+ -+/* -+ * journal_extend: extend buffer credits. -+ * -+ * Some transactions, such as large extends and truncates, can be done -+ * atomically all at once or in several stages. The operation requests -+ * a credit for a number of buffer modications in advance, but can -+ * extend its credit if it needs more. -+ * -+ * journal_extend tries to give the running handle more buffer credits. -+ * It does not guarantee that allocation: this is a best-effort only. -+ * The calling process MUST be able to deal cleanly with a failure to -+ * extend here. -+ * -+ * Return 0 on success, non-zero on failure. -+ * -+ * return code < 0 implies an error -+ * return code > 0 implies normal transaction-full status. 
-+ */ -+ -+int journal_extend (handle_t *handle, int nblocks) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ int result; -+ int wanted; -+ -+ lock_journal (journal); -+ -+ result = -EIO; -+ if (is_handle_aborted(handle)) -+ goto error_out; -+ -+ result = 1; -+ -+ /* Don't extend a locked-down transaction! */ -+ if (handle->h_transaction->t_state != T_RUNNING) { -+ jbd_debug(3, "denied handle %p %d blocks: " -+ "transaction not running\n", handle, nblocks); -+ goto error_out; -+ } -+ -+ wanted = transaction->t_outstanding_credits + nblocks; -+ -+ if (wanted > journal->j_max_transaction_buffers) { -+ jbd_debug(3, "denied handle %p %d blocks: " -+ "transaction too large\n", handle, nblocks); -+ goto error_out; -+ } -+ -+ if (wanted > log_space_left(journal)) { -+ jbd_debug(3, "denied handle %p %d blocks: " -+ "insufficient log space\n", handle, nblocks); -+ goto error_out; -+ } -+ -+ handle->h_buffer_credits += nblocks; -+ transaction->t_outstanding_credits += nblocks; -+ result = 0; -+ -+ jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); -+ -+error_out: -+ unlock_journal (journal); -+ return result; -+} -+ -+ -+/* -+ * journal_restart: restart a handle for a multi-transaction filesystem -+ * operation. -+ * -+ * If the journal_extend() call above fails to grant new buffer credits -+ * to a running handle, a call to journal_restart will commit the -+ * handle's transaction so far and reattach the handle to a new -+ * transaction capabable of guaranteeing the requested number of -+ * credits. -+ */ -+ -+int journal_restart(handle_t *handle, int nblocks) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ int ret; -+ -+ /* If we've had an abort of any type, don't even think about -+ * actually doing the restart! 
*/ -+ if (is_handle_aborted(handle)) -+ return 0; -+ -+ /* First unlink the handle from its current transaction, and -+ * start the commit on that. */ -+ -+ J_ASSERT (transaction->t_updates > 0); -+ J_ASSERT (journal_current_handle() == handle); -+ -+ transaction->t_outstanding_credits -= handle->h_buffer_credits; -+ transaction->t_updates--; -+ -+ if (!transaction->t_updates) -+ wake_up(&journal->j_wait_updates); -+ -+ jbd_debug(2, "restarting handle %p\n", handle); -+ log_start_commit(journal, transaction); -+ -+ handle->h_buffer_credits = nblocks; -+ ret = start_this_handle(journal, handle); -+ return ret; -+} -+ -+ -+/* -+ * Barrier operation: establish a transaction barrier. -+ * -+ * This locks out any further updates from being started, and blocks -+ * until all existing updates have completed, returning only once the -+ * journal is in a quiescent state with no updates running. -+ * -+ * The journal lock should not be held on entry. -+ */ -+ -+void journal_lock_updates (journal_t *journal) -+{ -+ lock_journal(journal); -+ ++journal->j_barrier_count; -+ -+ /* Wait until there are no running updates */ -+ while (1) { -+ transaction_t *transaction = journal->j_running_transaction; -+ if (!transaction) -+ break; -+ if (!transaction->t_updates) -+ break; -+ -+ unlock_journal(journal); -+ sleep_on(&journal->j_wait_updates); -+ lock_journal(journal); -+ } -+ -+ unlock_journal(journal); -+ -+ /* We have now established a barrier against other normal -+ * updates, but we also need to barrier against other -+ * journal_lock_updates() calls to make sure that we serialise -+ * special journal-locked operations too. */ -+ down(&journal->j_barrier); -+} -+ -+/* -+ * Release a transaction barrier obtained with journal_lock_updates(). -+ * -+ * Should be called without the journal lock held. 
-+ */ -+ -+void journal_unlock_updates (journal_t *journal) -+{ -+ lock_journal(journal); -+ -+ J_ASSERT (journal->j_barrier_count != 0); -+ -+ up(&journal->j_barrier); -+ --journal->j_barrier_count; -+ wake_up(&journal->j_wait_transaction_locked); -+ unlock_journal(journal); -+} -+ -+/* -+ * journal_get_write_access: notify intent to modify a buffer for metadata -+ * (not data) update. -+ * -+ * If the buffer is already part of the current transaction, then there -+ * is nothing we need to do. If it is already part of a prior -+ * transaction which we are still committing to disk, then we need to -+ * make sure that we do not overwrite the old copy: we do copy-out to -+ * preserve the copy going to disk. We also account the buffer against -+ * the handle's metadata buffer credits (unless the buffer is already -+ * part of the transaction, that is). -+ * -+ * Returns an error code or 0 on success. -+ * -+ * In full data journalling mode the buffer may be of type BJ_AsyncData, -+ * because we're write()ing a buffer which is also part of a shared mapping. -+ */ -+ -+static int -+do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ int error; -+ char *frozen_buffer = NULL; -+ int need_copy = 0; -+ -+ jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); -+ -+ JBUFFER_TRACE(jh, "entry"); -+repeat: -+ /* @@@ Need to check for errors here at some point. */ -+ -+ /* -+ * AKPM: neither bdflush nor kupdate run with the BKL. There's -+ * nothing we can do to prevent them from starting writeout of a -+ * BUF_DIRTY buffer at any time. And checkpointing buffers are on -+ * BUF_DIRTY. So. We no longer assert that the buffer is unlocked. -+ * -+ * However. It is very wrong for us to allow ext3 to start directly -+ * altering the ->b_data of buffers which may at that very time be -+ * undergoing writeout to the client filesystem. 
This can leave -+ * the filesystem in an inconsistent, transient state if we crash. -+ * So what we do is to steal the buffer if it is in checkpoint -+ * mode and dirty. The journal lock will keep out checkpoint-mode -+ * state transitions within journal_remove_checkpoint() and the buffer -+ * is locked to keep bdflush/kupdate/whoever away from it as well. -+ * -+ * AKPM: we have replaced all the lock_journal_bh_wait() stuff with a -+ * simple lock_journal(). This code here will care for locked buffers. -+ */ -+ /* -+ * The buffer_locked() || buffer_dirty() tests here are simply an -+ * optimisation tweak. If anyone else in the system decides to -+ * lock this buffer later on, we'll blow up. There doesn't seem -+ * to be a good reason why they should do this. -+ */ -+ if (jh->b_cp_transaction && -+ (buffer_locked(jh2bh(jh)) || buffer_dirty(jh2bh(jh)))) { -+ unlock_journal(journal); -+ lock_buffer(jh2bh(jh)); -+ spin_lock(&journal_datalist_lock); -+ if (jh->b_cp_transaction && buffer_dirty(jh2bh(jh))) { -+ /* OK, we need to steal it */ -+ JBUFFER_TRACE(jh, "stealing from checkpoint mode"); -+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL); -+ J_ASSERT_JH(jh, jh->b_frozen_data == NULL); -+ -+ J_ASSERT(handle->h_buffer_credits > 0); -+ handle->h_buffer_credits--; -+ -+ /* This will clear BH_Dirty and set BH_JBDDirty. */ -+ JBUFFER_TRACE(jh, "file as BJ_Reserved"); -+ __journal_file_buffer(jh, transaction, BJ_Reserved); -+ -+ /* And pull it off BUF_DIRTY, onto BUF_CLEAN */ -+ refile_buffer(jh2bh(jh)); -+ -+ /* -+ * The buffer is now hidden from bdflush. It is -+ * metadata against the current transaction. 
-+ */ -+ JBUFFER_TRACE(jh, "steal from cp mode is complete"); -+ } -+ spin_unlock(&journal_datalist_lock); -+ unlock_buffer(jh2bh(jh)); -+ lock_journal(journal); -+ goto repeat; -+ } -+ -+ J_ASSERT_JH(jh, !buffer_locked(jh2bh(jh))); -+ -+ error = -EROFS; -+ if (is_handle_aborted(handle)) -+ goto out_unlocked; -+ error = 0; -+ -+ spin_lock(&journal_datalist_lock); -+ -+ /* The buffer is already part of this transaction if -+ * b_transaction or b_next_transaction points to it. */ -+ -+ if (jh->b_transaction == transaction || -+ jh->b_next_transaction == transaction) -+ goto done_locked; -+ -+ /* If there is already a copy-out version of this buffer, then -+ * we don't need to make another one. */ -+ -+ if (jh->b_frozen_data) { -+ JBUFFER_TRACE(jh, "has frozen data"); -+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL); -+ jh->b_next_transaction = transaction; -+ -+ J_ASSERT_JH(jh, handle->h_buffer_credits > 0); -+ handle->h_buffer_credits--; -+ goto done_locked; -+ } -+ -+ /* Is there data here we need to preserve? */ -+ -+ if (jh->b_transaction && jh->b_transaction != transaction) { -+ JBUFFER_TRACE(jh, "owned by older transaction"); -+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL); -+ J_ASSERT_JH(jh, jh->b_transaction == -+ journal->j_committing_transaction); -+ -+ /* There is one case we have to be very careful about. -+ * If the committing transaction is currently writing -+ * this buffer out to disk and has NOT made a copy-out, -+ * then we cannot modify the buffer contents at all -+ * right now. The essence of copy-out is that it is the -+ * extra copy, not the primary copy, which gets -+ * journaled. If the primary copy is already going to -+ * disk then we cannot do copy-out here. 
*/ -+ -+ if (jh->b_jlist == BJ_Shadow) { -+ JBUFFER_TRACE(jh, "on shadow: sleep"); -+ spin_unlock(&journal_datalist_lock); -+ unlock_journal(journal); -+ /* commit wakes up all shadow buffers after IO */ -+ sleep_on(&jh2bh(jh)->b_wait); -+ lock_journal(journal); -+ goto repeat; -+ } -+ -+ /* Only do the copy if the currently-owning transaction -+ * still needs it. If it is on the Forget list, the -+ * committing transaction is past that stage. The -+ * buffer had better remain locked during the kmalloc, -+ * but that should be true --- we hold the journal lock -+ * still and the buffer is already on the BUF_JOURNAL -+ * list so won't be flushed. -+ * -+ * Subtle point, though: if this is a get_undo_access, -+ * then we will be relying on the frozen_data to contain -+ * the new value of the committed_data record after the -+ * transaction, so we HAVE to force the frozen_data copy -+ * in that case. */ -+ -+ if (jh->b_jlist != BJ_Forget || force_copy) { -+ JBUFFER_TRACE(jh, "generate frozen data"); -+ if (!frozen_buffer) { -+ JBUFFER_TRACE(jh, "allocate memory for buffer"); -+ spin_unlock(&journal_datalist_lock); -+ unlock_journal(journal); -+ frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size, -+ GFP_NOFS); -+ lock_journal(journal); -+ if (!frozen_buffer) { -+ printk(KERN_EMERG __FUNCTION__ -+ "OOM for frozen_buffer\n"); -+ JBUFFER_TRACE(jh, "oom!"); -+ error = -ENOMEM; -+ spin_lock(&journal_datalist_lock); -+ goto done_locked; -+ } -+ goto repeat; -+ } -+ -+ jh->b_frozen_data = frozen_buffer; -+ frozen_buffer = NULL; -+ need_copy = 1; -+ } -+ jh->b_next_transaction = transaction; -+ } -+ -+ J_ASSERT(handle->h_buffer_credits > 0); -+ handle->h_buffer_credits--; -+ -+ /* Finally, if the buffer is not journaled right now, we need to -+ * make sure it doesn't get written to disk before the caller -+ * actually commits the new data. 
*/ -+ -+ if (!jh->b_transaction) { -+ JBUFFER_TRACE(jh, "no transaction"); -+ J_ASSERT_JH(jh, !jh->b_next_transaction); -+ jh->b_transaction = transaction; -+ JBUFFER_TRACE(jh, "file as BJ_Reserved"); -+ __journal_file_buffer(jh, transaction, BJ_Reserved); -+ } -+ -+done_locked: -+ spin_unlock(&journal_datalist_lock); -+ if (need_copy) { -+ struct page *page; -+ int offset; -+ char *source; -+ -+ J_ASSERT_JH(jh, buffer_uptodate(jh2bh(jh))); -+ page = jh2bh(jh)->b_page; -+ offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; -+ source = kmap(page); -+ memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); -+ kunmap(page); -+ } -+ -+ -+ /* If we are about to journal a buffer, then any revoke pending -+ on it is no longer valid. */ -+ journal_cancel_revoke(handle, jh); -+ -+out_unlocked: -+ if (frozen_buffer) -+ kfree(frozen_buffer); -+ -+ JBUFFER_TRACE(jh, "exit"); -+ return error; -+} -+ -+int journal_get_write_access (handle_t *handle, struct buffer_head *bh) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ struct journal_head *jh = journal_add_journal_head(bh); -+ int rc; -+ -+ /* We do not want to get caught playing with fields which the -+ * log thread also manipulates. Make sure that the buffer -+ * completes any outstanding IO before proceeding. */ -+ lock_journal(journal); -+ rc = do_get_write_access(handle, jh, 0); -+ journal_unlock_journal_head(jh); -+ unlock_journal(journal); -+ return rc; -+} -+ -+ -+/* -+ * When the user wants to journal a newly created buffer_head -+ * (ie. getblk() returned a new buffer and we are going to populate it -+ * manually rather than reading off disk), then we need to keep the -+ * buffer_head locked until it has been completely filled with new -+ * data. In this case, we should be able to make the assertion that -+ * the bh is not already part of an existing transaction. -+ * -+ * The buffer should already be locked by the caller by this point. 
-+ * There is no lock ranking violation: it was a newly created, -+ * unlocked buffer beforehand. */ -+ -+int journal_get_create_access (handle_t *handle, struct buffer_head *bh) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ struct journal_head *jh = journal_add_journal_head(bh); -+ int err; -+ -+ jbd_debug(5, "journal_head %p\n", jh); -+ lock_journal(journal); -+ err = -EROFS; -+ if (is_handle_aborted(handle)) -+ goto out; -+ err = 0; -+ -+ JBUFFER_TRACE(jh, "entry"); -+ /* The buffer may already belong to this transaction due to -+ * pre-zeroing in the filesystem's new_block code. It may also -+ * be on the previous, committing transaction's lists, but it -+ * HAS to be in Forget state in that case: the transaction must -+ * have deleted the buffer for it to be reused here. */ -+ J_ASSERT_JH(jh, (jh->b_transaction == transaction || -+ jh->b_transaction == NULL || -+ (jh->b_transaction == journal->j_committing_transaction && -+ jh->b_jlist == BJ_Forget))); -+ -+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL); -+ J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); -+ -+ J_ASSERT_JH(jh, handle->h_buffer_credits > 0); -+ handle->h_buffer_credits--; -+ -+ spin_lock(&journal_datalist_lock); -+ if (jh->b_transaction == NULL) { -+ jh->b_transaction = transaction; -+ JBUFFER_TRACE(jh, "file as BJ_Reserved"); -+ __journal_file_buffer(jh, transaction, BJ_Reserved); -+ JBUFFER_TRACE(jh, "refile"); -+ refile_buffer(jh2bh(jh)); -+ } else if (jh->b_transaction == journal->j_committing_transaction) { -+ JBUFFER_TRACE(jh, "set next transaction"); -+ jh->b_next_transaction = transaction; -+ } -+ spin_unlock(&journal_datalist_lock); -+ -+ /* -+ * akpm: I added this. ext3_alloc_branch can pick up new indirect -+ * blocks which contain freed but then revoked metadata. 
We need -+ * to cancel the revoke in case we end up freeing it yet again -+ * and the reallocating as data - this would cause a second revoke, -+ * which hits an assertion error. -+ */ -+ JBUFFER_TRACE(jh, "cancelling revoke"); -+ journal_cancel_revoke(handle, jh); -+ journal_unlock_journal_head(jh); -+out: -+ unlock_journal(journal); -+ return err; -+} -+ -+ -+ -+/* -+ * journal_get_undo_access: Notify intent to modify metadata with non- -+ * rewindable consequences -+ * -+ * Sometimes there is a need to distinguish between metadata which has -+ * been committed to disk and that which has not. The ext3fs code uses -+ * this for freeing and allocating space: we have to make sure that we -+ * do not reuse freed space until the deallocation has been committed, -+ * since if we overwrote that space we would make the delete -+ * un-rewindable in case of a crash. -+ * -+ * To deal with that, journal_get_undo_access requests write access to a -+ * buffer for parts of non-rewindable operations such as delete -+ * operations on the bitmaps. The journaling code must keep a copy of -+ * the buffer's contents prior to the undo_access call until such time -+ * as we know that the buffer has definitely been committed to disk. -+ * -+ * We never need to know which transaction the committed data is part -+ * of: buffers touched here are guaranteed to be dirtied later and so -+ * will be committed to a new transaction in due course, at which point -+ * we can discard the old committed data pointer. -+ * -+ * Returns error number or 0 on success. -+ */ -+ -+int journal_get_undo_access (handle_t *handle, struct buffer_head *bh) -+{ -+ journal_t *journal = handle->h_transaction->t_journal; -+ int err; -+ struct journal_head *jh = journal_add_journal_head(bh); -+ -+ JBUFFER_TRACE(jh, "entry"); -+ lock_journal(journal); -+ -+ /* Do this first --- it can drop the journal lock, so we want to -+ * make sure that obtaining the committed_data is done -+ * atomically wrt. 
completion of any outstanding commits. */ -+ err = do_get_write_access (handle, jh, 1); -+ if (err) -+ goto out; -+ -+ if (!jh->b_committed_data) { -+ /* Copy out the current buffer contents into the -+ * preserved, committed copy. */ -+ JBUFFER_TRACE(jh, "generate b_committed data"); -+ jh->b_committed_data = jbd_kmalloc(jh2bh(jh)->b_size, -+ GFP_NOFS); -+ if (!jh->b_committed_data) { -+ printk(KERN_EMERG __FUNCTION__ -+ ": No memory for committed data!\n"); -+ err = -ENOMEM; -+ goto out; -+ } -+ -+ memcpy (jh->b_committed_data, jh2bh(jh)->b_data, -+ jh2bh(jh)->b_size); -+ } -+ -+out: -+ if (!err) -+ J_ASSERT_JH(jh, jh->b_committed_data); -+ journal_unlock_journal_head(jh); -+ unlock_journal(journal); -+ return err; -+} -+ -+/* -+ * journal_dirty_data: mark a buffer as containing dirty data which -+ * needs to be flushed before we can commit the current transaction. -+ * -+ * The buffer is placed on the transaction's data list and is marked as -+ * belonging to the transaction. -+ * -+ * If `async' is set then the writebask will be initiated by the caller -+ * using submit_bh -> end_buffer_io_async. We put the buffer onto -+ * t_async_datalist. -+ * -+ * Returns error number or 0 on success. -+ * -+ * journal_dirty_data() can be called via page_launder->ext3_writepage -+ * by kswapd. So it cannot block. Happily, there's nothing here -+ * which needs lock_journal if `async' is set. -+ * -+ * When the buffer is on the current transaction we freely move it -+ * between BJ_AsyncData and BJ_SyncData according to who tried to -+ * change its state last. -+ */ -+ -+int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async) -+{ -+ journal_t *journal = handle->h_transaction->t_journal; -+ int need_brelse = 0; -+ int wanted_jlist = async ? 
BJ_AsyncData : BJ_SyncData; -+ struct journal_head *jh; -+ -+ if (is_handle_aborted(handle)) -+ return 0; -+ -+ jh = journal_add_journal_head(bh); -+ JBUFFER_TRACE(jh, "entry"); -+ -+ /* -+ * The buffer could *already* be dirty. Writeout can start -+ * at any time. -+ */ -+ jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); -+ -+ /* -+ * What if the buffer is already part of a running transaction? -+ * -+ * There are two cases: -+ * 1) It is part of the current running transaction. Refile it, -+ * just in case we have allocated it as metadata, deallocated -+ * it, then reallocated it as data. -+ * 2) It is part of the previous, still-committing transaction. -+ * If all we want to do is to guarantee that the buffer will be -+ * written to disk before this new transaction commits, then -+ * being sure that the *previous* transaction has this same -+ * property is sufficient for us! Just leave it on its old -+ * transaction. -+ * -+ * In case (2), the buffer must not already exist as metadata -+ * --- that would violate write ordering (a transaction is free -+ * to write its data at any point, even before the previous -+ * committing transaction has committed). The caller must -+ * never, ever allow this to happen: there's nothing we can do -+ * about it in this layer. -+ */ -+ spin_lock(&journal_datalist_lock); -+ if (jh->b_transaction) { -+ JBUFFER_TRACE(jh, "has transaction"); -+ if (jh->b_transaction != handle->h_transaction) { -+ JBUFFER_TRACE(jh, "belongs to older transaction"); -+ J_ASSERT_JH(jh, jh->b_transaction == -+ journal->j_committing_transaction); -+ -+ /* @@@ IS THIS TRUE ? */ -+ /* -+ * Not any more. Scenario: someone does a write() -+ * in data=journal mode. The buffer's transaction has -+ * moved into commit. Then someone does another -+ * write() to the file. We do the frozen data copyout -+ * and set b_next_transaction to point to j_running_t. 
-+ * And while we're in that state, someone does a -+ * writepage() in an attempt to pageout the same area -+ * of the file via a shared mapping. At present that -+ * calls journal_dirty_data(), and we get right here. -+ * It may be too late to journal the data. Simply -+ * falling through to the next test will suffice: the -+ * data will be dirty and wil be checkpointed. The -+ * ordering comments in the next comment block still -+ * apply. -+ */ -+ //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); -+ -+ /* -+ * If we're journalling data, and this buffer was -+ * subject to a write(), it could be metadata, forget -+ * or shadow against the committing transaction. Now, -+ * someone has dirtied the same darn page via a mapping -+ * and it is being writepage()'d. -+ * We *could* just steal the page from commit, with some -+ * fancy locking there. Instead, we just skip it - -+ * don't tie the page's buffers to the new transaction -+ * at all. -+ * Implication: if we crash before the writepage() data -+ * is written into the filesystem, recovery will replay -+ * the write() data. -+ */ -+ if (jh->b_jlist != BJ_None && -+ jh->b_jlist != BJ_SyncData && -+ jh->b_jlist != BJ_AsyncData) { -+ JBUFFER_TRACE(jh, "Not stealing"); -+ goto no_journal; -+ } -+ -+ /* -+ * This buffer may be undergoing writeout in commit. We -+ * can't return from here and let the caller dirty it -+ * again because that can cause the write-out loop in -+ * commit to never terminate. 
-+ */ -+ if (!async && buffer_dirty(bh)) { -+ atomic_inc(&bh->b_count); -+ spin_unlock(&journal_datalist_lock); -+ need_brelse = 1; -+ ll_rw_block(WRITE, 1, &bh); -+ wait_on_buffer(bh); -+ spin_lock(&journal_datalist_lock); -+ /* The buffer may become locked again at any -+ time if it is redirtied */ -+ } -+ -+ /* journal_clean_data_list() may have got there first */ -+ if (jh->b_transaction != NULL) { -+ JBUFFER_TRACE(jh, "unfile from commit"); -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = NULL; -+ } -+ /* The buffer will be refiled below */ -+ -+ } -+ /* -+ * Special case --- the buffer might actually have been -+ * allocated and then immediately deallocated in the previous, -+ * committing transaction, so might still be left on that -+ * transaction's metadata lists. -+ */ -+ if (jh->b_jlist != wanted_jlist) { -+ JBUFFER_TRACE(jh, "not on correct data list: unfile"); -+ J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = NULL; -+ JBUFFER_TRACE(jh, "file as data"); -+ __journal_file_buffer(jh, handle->h_transaction, -+ wanted_jlist); -+ } -+ } else { -+ JBUFFER_TRACE(jh, "not on a transaction"); -+ __journal_file_buffer(jh, handle->h_transaction, wanted_jlist); -+ } -+no_journal: -+ spin_unlock(&journal_datalist_lock); -+ if (need_brelse) { -+ BUFFER_TRACE(bh, "brelse"); -+ __brelse(bh); -+ } -+ JBUFFER_TRACE(jh, "exit"); -+ journal_unlock_journal_head(jh); -+ return 0; -+} -+ -+/* -+ * journal_dirty_metadata: mark a buffer as containing dirty metadata -+ * which needs to be journaled as part of the current transaction. -+ * -+ * The buffer is placed on the transaction's metadata list and is marked -+ * as belonging to the transaction. -+ * -+ * Special care needs to be taken if the buffer already belongs to the -+ * current committing transaction (in which case we should have frozen -+ * data present for that commit). 
In that case, we don't relink the -+ * buffer: that only gets done when the old transaction finally -+ * completes its commit. -+ * -+ * Returns error number or 0 on success. -+ */ -+ -+int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ struct journal_head *jh = bh2jh(bh); -+ -+ jbd_debug(5, "journal_head %p\n", jh); -+ JBUFFER_TRACE(jh, "entry"); -+ lock_journal(journal); -+ if (is_handle_aborted(handle)) -+ goto out_unlock; -+ -+ spin_lock(&journal_datalist_lock); -+ set_bit(BH_JBDDirty, &bh->b_state); -+ set_buffer_flushtime(bh); -+ -+ J_ASSERT_JH(jh, jh->b_transaction != NULL); -+ -+ /* -+ * Metadata already on the current transaction list doesn't -+ * need to be filed. Metadata on another transaction's list must -+ * be committing, and will be refiled once the commit completes: -+ * leave it alone for now. -+ */ -+ -+ if (jh->b_transaction != transaction) { -+ JBUFFER_TRACE(jh, "already on other transaction"); -+ J_ASSERT_JH(jh, jh->b_transaction == -+ journal->j_committing_transaction); -+ J_ASSERT_JH(jh, jh->b_next_transaction == transaction); -+ /* And this case is illegal: we can't reuse another -+ * transaction's data buffer, ever. */ -+ /* FIXME: writepage() should be journalled */ -+ J_ASSERT_JH(jh, jh->b_jlist != BJ_SyncData); -+ goto done_locked; -+ } -+ -+ /* That test should have eliminated the following case: */ -+ J_ASSERT_JH(jh, jh->b_frozen_data == 0); -+ -+ JBUFFER_TRACE(jh, "file as BJ_Metadata"); -+ __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); -+ -+done_locked: -+ spin_unlock(&journal_datalist_lock); -+ JBUFFER_TRACE(jh, "exit"); -+out_unlock: -+ unlock_journal(journal); -+ return 0; -+} -+ -+#if 0 -+/* -+ * journal_release_buffer: undo a get_write_access without any buffer -+ * updates, if the update decided in the end that it didn't need access. 
-+ * -+ * journal_get_write_access() can block, so it is quite possible for a -+ * journaling component to decide after the write access is returned -+ * that global state has changed and the update is no longer required. */ -+ -+void journal_release_buffer (handle_t *handle, struct buffer_head *bh) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ struct journal_head *jh = bh2jh(bh); -+ -+ lock_journal(journal); -+ JBUFFER_TRACE(jh, "entry"); -+ -+ /* If the buffer is reserved but not modified by this -+ * transaction, then it is safe to release it. In all other -+ * cases, just leave the buffer as it is. */ -+ -+ spin_lock(&journal_datalist_lock); -+ if (jh->b_jlist == BJ_Reserved && jh->b_transaction == transaction && -+ !buffer_jdirty(jh2bh(jh))) { -+ JBUFFER_TRACE(jh, "unused: refiling it"); -+ handle->h_buffer_credits++; -+ __journal_refile_buffer(jh); -+ } -+ spin_unlock(&journal_datalist_lock); -+ -+ JBUFFER_TRACE(jh, "exit"); -+ unlock_journal(journal); -+} -+#endif -+ -+/* -+ * journal_forget: bforget() for potentially-journaled buffers. We can -+ * only do the bforget if there are no commits pending against the -+ * buffer. If the buffer is dirty in the current running transaction we -+ * can safely unlink it. -+ * -+ * bh may not be a journalled buffer at all - it may be a non-JBD -+ * buffer which came off the hashtable. Check for this. -+ * -+ * Decrements bh->b_count by one. -+ * -+ * Allow this call even if the handle has aborted --- it may be part of -+ * the caller's cleanup after an abort. 
-+ */ -+ -+void journal_forget (handle_t *handle, struct buffer_head *bh) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ struct journal_head *jh; -+ -+ BUFFER_TRACE(bh, "entry"); -+ -+ lock_journal(journal); -+ spin_lock(&journal_datalist_lock); -+ -+ if (!buffer_jbd(bh)) -+ goto not_jbd; -+ jh = bh2jh(bh); -+ -+ if (jh->b_transaction == handle->h_transaction) { -+ J_ASSERT_JH(jh, !jh->b_frozen_data); -+ -+ /* If we are forgetting a buffer which is already part -+ * of this transaction, then we can just drop it from -+ * the transaction immediately. */ -+ clear_bit(BH_Dirty, &bh->b_state); -+ clear_bit(BH_JBDDirty, &bh->b_state); -+ -+ JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); -+ J_ASSERT_JH(jh, !jh->b_committed_data); -+ -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = 0; -+ -+ /* -+ * We are no longer going to journal this buffer. -+ * However, the commit of this transaction is still -+ * important to the buffer: the delete that we are now -+ * processing might obsolete an old log entry, so by -+ * committing, we can satisfy the buffer's checkpoint. -+ * -+ * So, if we have a checkpoint on the buffer, we should -+ * now refile the buffer on our BJ_Forget list so that -+ * we know to remove the checkpoint after we commit. -+ */ -+ -+ if (jh->b_cp_transaction) { -+ __journal_file_buffer(jh, transaction, BJ_Forget); -+ } else { -+ __journal_remove_journal_head(bh); -+ __brelse(bh); -+ if (!buffer_jbd(bh)) { -+ spin_unlock(&journal_datalist_lock); -+ unlock_journal(journal); -+ __bforget(bh); -+ return; -+ } -+ } -+ -+ } else if (jh->b_transaction) { -+ J_ASSERT_JH(jh, (jh->b_transaction == -+ journal->j_committing_transaction)); -+ /* However, if the buffer is still owned by a prior -+ * (committing) transaction, we can't drop it yet... */ -+ JBUFFER_TRACE(jh, "belongs to older transaction"); -+ /* ... 
but we CAN drop it from the new transaction if we -+ * have also modified it since the original commit. */ -+ -+ if (jh->b_next_transaction) { -+ J_ASSERT(jh->b_next_transaction == transaction); -+ jh->b_next_transaction = NULL; -+ } -+ } -+ -+not_jbd: -+ spin_unlock(&journal_datalist_lock); -+ unlock_journal(journal); -+ __brelse(bh); -+ return; -+} -+ -+#if 0 /* Unused */ -+/* -+ * journal_sync_buffer: flush a potentially-journaled buffer to disk. -+ * -+ * Used for O_SYNC filesystem operations. If the buffer is journaled, -+ * we need to complete the O_SYNC by waiting for the transaction to -+ * complete. It is an error to call journal_sync_buffer before -+ * journal_stop! -+ */ -+ -+void journal_sync_buffer(struct buffer_head *bh) -+{ -+ transaction_t *transaction; -+ journal_t *journal; -+ long sequence; -+ struct journal_head *jh; -+ -+ /* If the buffer isn't journaled, this is easy: just sync it to -+ * disk. */ -+ BUFFER_TRACE(bh, "entry"); -+ -+ spin_lock(&journal_datalist_lock); -+ if (!buffer_jbd(bh)) { -+ spin_unlock(&journal_datalist_lock); -+ return; -+ } -+ jh = bh2jh(bh); -+ if (jh->b_transaction == NULL) { -+ /* If the buffer has already been journaled, then this -+ * is a noop. */ -+ if (jh->b_cp_transaction == NULL) { -+ spin_unlock(&journal_datalist_lock); -+ return; -+ } -+ atomic_inc(&bh->b_count); -+ spin_unlock(&journal_datalist_lock); -+ ll_rw_block (WRITE, 1, &bh); -+ wait_on_buffer(bh); -+ __brelse(bh); -+ goto out; -+ } -+ -+ /* Otherwise, just wait until the transaction is synced to disk. 
*/ -+ transaction = jh->b_transaction; -+ journal = transaction->t_journal; -+ sequence = transaction->t_tid; -+ spin_unlock(&journal_datalist_lock); -+ -+ jbd_debug(2, "requesting commit for jh %p\n", jh); -+ log_start_commit (journal, transaction); -+ -+ while (tid_gt(sequence, journal->j_commit_sequence)) { -+ wake_up(&journal->j_wait_done_commit); -+ sleep_on(&journal->j_wait_done_commit); -+ } -+ JBUFFER_TRACE(jh, "exit"); -+out: -+ return; -+} -+#endif -+ -+/* -+ * All done for a particular handle. -+ * -+ * There is not much action needed here. We just return any remaining -+ * buffer credits to the transaction and remove the handle. The only -+ * complication is that we need to start a commit operation if the -+ * filesystem is marked for synchronous update. -+ * -+ * journal_stop itself will not usually return an error, but it may -+ * do so in unusual circumstances. In particular, expect it to -+ * return -EIO if a journal_abort has been executed since the -+ * transaction began. -+ */ -+ -+int journal_stop(handle_t *handle) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal = transaction->t_journal; -+ int old_handle_count, err; -+ -+ if (!handle) -+ return 0; -+ -+ J_ASSERT (transaction->t_updates > 0); -+ J_ASSERT (journal_current_handle() == handle); -+ -+ if (is_handle_aborted(handle)) -+ err = -EIO; -+ else -+ err = 0; -+ -+ if (--handle->h_ref > 0) { -+ jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, -+ handle->h_ref); -+ return err; -+ } -+ -+ jbd_debug(4, "Handle %p going down\n", handle); -+ -+ /* -+ * Implement synchronous transaction batching. If the handle -+ * was synchronous, don't force a commit immediately. Let's -+ * yield and let another thread piggyback onto this transaction. -+ * Keep doing that while new threads continue to arrive. -+ * It doesn't cost much - we're about to run a commit and sleep -+ * on IO anyway. Speeds up many-threaded, many-dir operations -+ * by 30x or more... 
-+ */ -+ if (handle->h_sync) { -+ do { -+ old_handle_count = transaction->t_handle_count; -+ set_current_state(TASK_RUNNING); -+ current->policy |= SCHED_YIELD; -+ schedule(); -+ } while (old_handle_count != transaction->t_handle_count); -+ } -+ -+ current->journal_info = NULL; -+ transaction->t_outstanding_credits -= handle->h_buffer_credits; -+ transaction->t_updates--; -+ if (!transaction->t_updates) { -+ wake_up(&journal->j_wait_updates); -+ if (journal->j_barrier_count) -+ wake_up(&journal->j_wait_transaction_locked); -+ } -+ -+ /* -+ * If the handle is marked SYNC, we need to set another commit -+ * going! We also want to force a commit if the current -+ * transaction is occupying too much of the log, or if the -+ * transaction is too old now. -+ */ -+ if (handle->h_sync || -+ transaction->t_outstanding_credits > -+ journal->j_max_transaction_buffers || -+ time_after_eq(jiffies, transaction->t_expires)) { -+ /* Do this even for aborted journals: an abort still -+ * completes the commit thread, it just doesn't write -+ * anything to disk. */ -+ tid_t tid = transaction->t_tid; -+ -+ jbd_debug(2, "transaction too old, requesting commit for " -+ "handle %p\n", handle); -+ /* This is non-blocking */ -+ log_start_commit(journal, transaction); -+ -+ /* -+ * Special case: JFS_SYNC synchronous updates require us -+ * to wait for the commit to complete. -+ */ -+ if (handle->h_sync && !(current->flags & PF_MEMALLOC)) -+ log_wait_commit(journal, tid); -+ } -+ kfree(handle); -+ return err; -+} -+ -+/* -+ * For synchronous operations: force any uncommitted trasnactions -+ * to disk. May seem kludgy, but it reuses all the handle batching -+ * code in a very simple manner. 
-+ */ -+int journal_force_commit(journal_t *journal) -+{ -+ handle_t *handle; -+ int ret = 0; -+ -+ lock_kernel(); -+ handle = journal_start(journal, 1); -+ if (IS_ERR(handle)) { -+ ret = PTR_ERR(handle); -+ goto out; -+ } -+ handle->h_sync = 1; -+ journal_stop(handle); -+out: -+ unlock_kernel(); -+ return ret; -+} -+ -+/* -+ * -+ * List management code snippets: various functions for manipulating the -+ * transaction buffer lists. -+ * -+ */ -+ -+/* -+ * Append a buffer to a transaction list, given the transaction's list head -+ * pointer. -+ * journal_datalist_lock is held. -+ */ -+ -+static inline void -+__blist_add_buffer(struct journal_head **list, struct journal_head *jh) -+{ -+ if (!*list) { -+ jh->b_tnext = jh->b_tprev = jh; -+ *list = jh; -+ } else { -+ /* Insert at the tail of the list to preserve order */ -+ struct journal_head *first = *list, *last = first->b_tprev; -+ jh->b_tprev = last; -+ jh->b_tnext = first; -+ last->b_tnext = first->b_tprev = jh; -+ } -+} -+ -+/* -+ * Remove a buffer from a transaction list, given the transaction's list -+ * head pointer. -+ * -+ * Called with journal_datalist_lock held, and the journal may not -+ * be locked. -+ */ -+ -+static inline void -+__blist_del_buffer(struct journal_head **list, struct journal_head *jh) -+{ -+ if (*list == jh) { -+ *list = jh->b_tnext; -+ if (*list == jh) -+ *list = 0; -+ } -+ jh->b_tprev->b_tnext = jh->b_tnext; -+ jh->b_tnext->b_tprev = jh->b_tprev; -+} -+ -+/* -+ * Remove a buffer from the appropriate transaction list. -+ * -+ * Note that this function can *change* the value of -+ * bh->b_transaction->t_sync_datalist, t_async_datalist, t_buffers, t_forget, -+ * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller -+ * is holding onto a copy of one of thee pointers, it could go bad. -+ * Generally the caller needs to re-read the pointer from the transaction_t. 
-+ * -+ * If bh->b_jlist is BJ_SyncData or BJ_AsyncData then we may have been called -+ * via journal_try_to_free_buffer() or journal_clean_data_list(). In that -+ * case, journal_datalist_lock will be held, and the journal may not be locked. -+ */ -+void __journal_unfile_buffer(struct journal_head *jh) -+{ -+ struct journal_head **list = 0; -+ transaction_t * transaction; -+ -+ assert_spin_locked(&journal_datalist_lock); -+ transaction = jh->b_transaction; -+ -+#ifdef __SMP__ -+ J_ASSERT (current->lock_depth >= 0); -+#endif -+ J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); -+ -+ if (jh->b_jlist != BJ_None) -+ J_ASSERT_JH(jh, transaction != 0); -+ -+ switch (jh->b_jlist) { -+ case BJ_None: -+ return; -+ case BJ_SyncData: -+ list = &transaction->t_sync_datalist; -+ break; -+ case BJ_AsyncData: -+ list = &transaction->t_async_datalist; -+ break; -+ case BJ_Metadata: -+ transaction->t_nr_buffers--; -+ J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); -+ list = &transaction->t_buffers; -+ break; -+ case BJ_Forget: -+ list = &transaction->t_forget; -+ break; -+ case BJ_IO: -+ list = &transaction->t_iobuf_list; -+ break; -+ case BJ_Shadow: -+ list = &transaction->t_shadow_list; -+ break; -+ case BJ_LogCtl: -+ list = &transaction->t_log_list; -+ break; -+ case BJ_Reserved: -+ list = &transaction->t_reserved_list; -+ break; -+ } -+ -+ __blist_del_buffer(list, jh); -+ jh->b_jlist = BJ_None; -+ if (test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state)) { -+ set_bit(BH_Dirty, &jh2bh(jh)->b_state); -+ } -+} -+ -+void journal_unfile_buffer(struct journal_head *jh) -+{ -+ spin_lock(&journal_datalist_lock); -+ __journal_unfile_buffer(jh); -+ spin_unlock(&journal_datalist_lock); -+} -+ -+/* -+ * Called from journal_try_to_free_buffers(). The journal is not -+ * locked. lru_list_lock is not held. -+ * -+ * Here we see why journal_datalist_lock is global and not per-journal. 
-+ * We cannot get back to this buffer's journal pointer without locking -+ * out journal_clean_data_list() in some manner. -+ * -+ * One could use journal_datalist_lock to get unracy access to a -+ * per-journal lock. -+ * -+ * Called with journal_datalist_lock held. -+ * -+ * Returns non-zero iff we were able to free the journal_head. -+ */ -+static int __journal_try_to_free_buffer(struct buffer_head *bh, -+ int *locked_or_dirty) -+{ -+ struct journal_head *jh; -+ -+ assert_spin_locked(&journal_datalist_lock); -+ -+ jh = bh2jh(bh); -+ -+ if (buffer_locked(bh) || buffer_dirty(bh)) { -+ *locked_or_dirty = 1; -+ goto out; -+ } -+ -+ if (!buffer_uptodate(bh)) -+ goto out; -+ -+ if (jh->b_next_transaction != 0) -+ goto out; -+ -+ if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) { -+ if (jh->b_jlist == BJ_SyncData || jh->b_jlist==BJ_AsyncData) { -+ /* A written-back ordered data buffer */ -+ JBUFFER_TRACE(jh, "release data"); -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = 0; -+ __journal_remove_journal_head(bh); -+ __brelse(bh); -+ } -+ } -+ else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) { -+ /* written-back checkpointed metadata buffer */ -+ if (jh->b_jlist == BJ_None) { -+ JBUFFER_TRACE(jh, "remove from checkpoint list"); -+ __journal_remove_checkpoint(jh); -+ __journal_remove_journal_head(bh); -+ __brelse(bh); -+ } -+ } -+ return !buffer_jbd(bh); -+ -+out: -+ return 0; -+} -+ -+/* -+ * journal_try_to_free_buffers(). For all the buffers on this page, -+ * if they are fully written out ordered data, move them onto BUF_CLEAN -+ * so try_to_free_buffers() can reap them. Called with lru_list_lock -+ * not held. Does its own locking. -+ * -+ * This complicates JBD locking somewhat. We aren't protected by the -+ * BKL here. We wish to remove the buffer from its committing or -+ * running transaction's ->t_datalist via __journal_unfile_buffer. 
-+ * -+ * This may *change* the value of transaction_t->t_datalist, so anyone -+ * who looks at t_datalist needs to lock against this function. -+ * -+ * Even worse, someone may be doing a journal_dirty_data on this -+ * buffer. So we need to lock against that. journal_dirty_data() -+ * will come out of the lock with the buffer dirty, which makes it -+ * ineligible for release here. -+ * -+ * Who else is affected by this? hmm... Really the only contender -+ * is do_get_write_access() - it could be looking at the buffer while -+ * journal_try_to_free_buffer() is changing its state. But that -+ * cannot happen because we never reallocate freed data as metadata -+ * while the data is part of a transaction. Yes? -+ * -+ * This function returns non-zero if we wish try_to_free_buffers() -+ * to be called. We do this is the page is releasable by try_to_free_buffers(). -+ * We also do it if the page has locked or dirty buffers and the caller wants -+ * us to perform sync or async writeout. -+ */ -+int journal_try_to_free_buffers(journal_t *journal, -+ struct page *page, int gfp_mask) -+{ -+ struct buffer_head *bh; -+ struct buffer_head *tmp; -+ int locked_or_dirty = 0; -+ int call_ttfb = 1; -+ -+ J_ASSERT(PageLocked(page)); -+ -+ bh = page->buffers; -+ tmp = bh; -+ spin_lock(&journal_datalist_lock); -+ do { -+ struct buffer_head *p = tmp; -+ -+ tmp = tmp->b_this_page; -+ if (buffer_jbd(p)) -+ if (!__journal_try_to_free_buffer(p, &locked_or_dirty)) -+ call_ttfb = 0; -+ } while (tmp != bh); -+ spin_unlock(&journal_datalist_lock); -+ -+ if (!(gfp_mask & (__GFP_IO|__GFP_WAIT))) -+ goto out; -+ if (!locked_or_dirty) -+ goto out; -+ /* -+ * The VM wants us to do writeout, or to block on IO, or both. -+ * So we allow try_to_free_buffers to be called even if the page -+ * still has journalled buffers. -+ */ -+ call_ttfb = 1; -+out: -+ return call_ttfb; -+} -+ -+/* -+ * This buffer is no longer needed. 
If it is on an older transaction's -+ * checkpoint list we need to record it on this transaction's forget list -+ * to pin this buffer (and hence its checkpointing transaction) down until -+ * this transaction commits. If the buffer isn't on a checkpoint list, we -+ * release it. -+ * Returns non-zero if JBD no longer has an interest in the buffer. -+ */ -+static int dispose_buffer(struct journal_head *jh, -+ transaction_t *transaction) -+{ -+ int may_free = 1; -+ struct buffer_head *bh = jh2bh(jh); -+ -+ spin_lock(&journal_datalist_lock); -+ __journal_unfile_buffer(jh); -+ jh->b_transaction = 0; -+ -+ if (jh->b_cp_transaction) { -+ JBUFFER_TRACE(jh, "on running+cp transaction"); -+ __journal_file_buffer(jh, transaction, BJ_Forget); -+ clear_bit(BH_JBDDirty, &bh->b_state); -+ may_free = 0; -+ } else { -+ JBUFFER_TRACE(jh, "on running transaction"); -+ __journal_remove_journal_head(bh); -+ __brelse(bh); -+ } -+ spin_unlock(&journal_datalist_lock); -+ return may_free; -+} -+ -+/* -+ * journal_flushpage -+ * -+ * This code is tricky. It has a number of cases to deal with. -+ * -+ * There are two invariants which this code relies on: -+ * -+ * i_size must be updated on disk before we start calling flushpage on the -+ * data. -+ * -+ * This is done in ext3 by defining an ext3_setattr method which -+ * updates i_size before truncate gets going. By maintaining this -+ * invariant, we can be sure that it is safe to throw away any buffers -+ * attached to the current transaction: once the transaction commits, -+ * we know that the data will not be needed. -+ * -+ * Note however that we can *not* throw away data belonging to the -+ * previous, committing transaction! 
-+ * -+ * Any disk blocks which *are* part of the previous, committing -+ * transaction (and which therefore cannot be discarded immediately) are -+ * not going to be reused in the new running transaction -+ * -+ * The bitmap committed_data images guarantee this: any block which is -+ * allocated in one transaction and removed in the next will be marked -+ * as in-use in the committed_data bitmap, so cannot be reused until -+ * the next transaction to delete the block commits. This means that -+ * leaving committing buffers dirty is quite safe: the disk blocks -+ * cannot be reallocated to a different file and so buffer aliasing is -+ * not possible. -+ * -+ * -+ * The above applies mainly to ordered data mode. In writeback mode we -+ * don't make guarantees about the order in which data hits disk --- in -+ * particular we don't guarantee that new dirty data is flushed before -+ * transaction commit --- so it is always safe just to discard data -+ * immediately in that mode. --sct -+ */ -+ -+/* -+ * The journal_unmap_buffer helper function returns zero if the buffer -+ * concerned remains pinned as an anonymous buffer belonging to an older -+ * transaction. -+ * -+ * We're outside-transaction here. Either or both of j_running_transaction -+ * and j_committing_transaction may be NULL. -+ */ -+static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) -+{ -+ transaction_t *transaction; -+ struct journal_head *jh; -+ int may_free = 1; -+ -+ BUFFER_TRACE(bh, "entry"); -+ -+ if (!buffer_mapped(bh)) -+ return 1; -+ -+ /* It is safe to proceed here without the -+ * journal_datalist_spinlock because the buffers cannot be -+ * stolen by try_to_free_buffers as long as we are holding the -+ * page lock. --sct */ -+ -+ if (!buffer_jbd(bh)) -+ goto zap_buffer; -+ -+ jh = bh2jh(bh); -+ transaction = jh->b_transaction; -+ if (transaction == NULL) { -+ /* First case: not on any transaction. 
If it -+ * has no checkpoint link, then we can zap it: -+ * it's a writeback-mode buffer so we don't care -+ * if it hits disk safely. */ -+ if (!jh->b_cp_transaction) { -+ JBUFFER_TRACE(jh, "not on any transaction: zap"); -+ goto zap_buffer; -+ } -+ -+ if (!buffer_dirty(bh)) { -+ /* bdflush has written it. We can drop it now */ -+ goto zap_buffer; -+ } -+ -+ /* OK, it must be in the journal but still not -+ * written fully to disk: it's metadata or -+ * journaled data... */ -+ -+ if (journal->j_running_transaction) { -+ /* ... and once the current transaction has -+ * committed, the buffer won't be needed any -+ * longer. */ -+ JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); -+ return dispose_buffer(jh, -+ journal->j_running_transaction); -+ } else { -+ /* There is no currently-running transaction. So the -+ * orphan record which we wrote for this file must have -+ * passed into commit. We must attach this buffer to -+ * the committing transaction, if it exists. */ -+ if (journal->j_committing_transaction) { -+ JBUFFER_TRACE(jh, "give to committing trans"); -+ return dispose_buffer(jh, -+ journal->j_committing_transaction); -+ } else { -+ /* The orphan record's transaction has -+ * committed. We can cleanse this buffer */ -+ clear_bit(BH_JBDDirty, &bh->b_state); -+ goto zap_buffer; -+ } -+ } -+ } else if (transaction == journal->j_committing_transaction) { -+ /* If it is committing, we simply cannot touch it. We -+ * can remove it's next_transaction pointer from the -+ * running transaction if that is set, but nothing -+ * else. */ -+ JBUFFER_TRACE(jh, "on committing transaction"); -+ if (jh->b_next_transaction) { -+ J_ASSERT(jh->b_next_transaction == -+ journal->j_running_transaction); -+ jh->b_next_transaction = NULL; -+ } -+ return 0; -+ } else { -+ /* Good, the buffer belongs to the running transaction. 
-+ * We are writing our own transaction's data, not any -+ * previous one's, so it is safe to throw it away -+ * (remember that we expect the filesystem to have set -+ * i_size already for this truncate so recovery will not -+ * expose the disk blocks we are discarding here.) */ -+ J_ASSERT_JH(jh, transaction == journal->j_running_transaction); -+ may_free = dispose_buffer(jh, transaction); -+ } -+ -+zap_buffer: -+ if (buffer_dirty(bh)) -+ mark_buffer_clean(bh); -+ J_ASSERT_BH(bh, !buffer_jdirty(bh)); -+ clear_bit(BH_Uptodate, &bh->b_state); -+ clear_bit(BH_Mapped, &bh->b_state); -+ clear_bit(BH_Req, &bh->b_state); -+ clear_bit(BH_New, &bh->b_state); -+ return may_free; -+} -+ -+/* -+ * Return non-zero if the page's buffers were successfully reaped -+ */ -+int journal_flushpage(journal_t *journal, -+ struct page *page, -+ unsigned long offset) -+{ -+ struct buffer_head *head, *bh, *next; -+ unsigned int curr_off = 0; -+ int may_free = 1; -+ -+ if (!PageLocked(page)) -+ BUG(); -+ if (!page->buffers) -+ return 1; -+ -+ /* We will potentially be playing with lists other than just the -+ * data lists (especially for journaled data mode), so be -+ * cautious in our locking. */ -+ lock_journal(journal); -+ -+ head = bh = page->buffers; -+ do { -+ unsigned int next_off = curr_off + bh->b_size; -+ next = bh->b_this_page; -+ -+ /* AKPM: doing lock_buffer here may be overly paranoid */ -+ if (offset <= curr_off) { -+ /* This block is wholly outside the truncation point */ -+ lock_buffer(bh); -+ may_free &= journal_unmap_buffer(journal, bh); -+ unlock_buffer(bh); -+ } -+ curr_off = next_off; -+ bh = next; -+ -+ } while (bh != head); -+ -+ unlock_journal(journal); -+ -+ if (!offset) { -+ if (!may_free || !try_to_free_buffers(page, 0)) -+ return 0; -+ J_ASSERT(page->buffers == NULL); -+ } -+ return 1; -+} -+ -+/* -+ * File a buffer on the given transaction list. 
-+ */ -+void __journal_file_buffer(struct journal_head *jh, -+ transaction_t *transaction, int jlist) -+{ -+ struct journal_head **list = 0; -+ -+ assert_spin_locked(&journal_datalist_lock); -+ -+#ifdef __SMP__ -+ J_ASSERT (current->lock_depth >= 0); -+#endif -+ J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); -+ J_ASSERT_JH(jh, jh->b_transaction == transaction || -+ jh->b_transaction == 0); -+ -+ if (jh->b_transaction) { -+ if (jh->b_jlist == jlist) -+ return; -+ __journal_unfile_buffer(jh); -+ } else { -+ jh->b_transaction = transaction; -+ } -+ -+ switch (jlist) { -+ case BJ_None: -+ J_ASSERT_JH(jh, !jh->b_committed_data); -+ J_ASSERT_JH(jh, !jh->b_frozen_data); -+ return; -+ case BJ_SyncData: -+ list = &transaction->t_sync_datalist; -+ break; -+ case BJ_AsyncData: -+ list = &transaction->t_async_datalist; -+ break; -+ case BJ_Metadata: -+ transaction->t_nr_buffers++; -+ list = &transaction->t_buffers; -+ break; -+ case BJ_Forget: -+ list = &transaction->t_forget; -+ break; -+ case BJ_IO: -+ list = &transaction->t_iobuf_list; -+ break; -+ case BJ_Shadow: -+ list = &transaction->t_shadow_list; -+ break; -+ case BJ_LogCtl: -+ list = &transaction->t_log_list; -+ break; -+ case BJ_Reserved: -+ list = &transaction->t_reserved_list; -+ break; -+ } -+ -+ __blist_add_buffer(list, jh); -+ jh->b_jlist = jlist; -+ -+ if (jlist == BJ_Metadata || jlist == BJ_Reserved || -+ jlist == BJ_Shadow || jlist == BJ_Forget) { -+ if (atomic_set_buffer_clean(jh2bh(jh))) { -+ set_bit(BH_JBDDirty, &jh2bh(jh)->b_state); -+ } -+ } -+} -+ -+void journal_file_buffer(struct journal_head *jh, -+ transaction_t *transaction, int jlist) -+{ -+ spin_lock(&journal_datalist_lock); -+ __journal_file_buffer(jh, transaction, jlist); -+ spin_unlock(&journal_datalist_lock); -+} -+ -+/* -+ * Remove a buffer from its current buffer list in preparation for -+ * dropping it from its current transaction entirely. 
If the buffer has -+ * already started to be used by a subsequent transaction, refile the -+ * buffer on that transaction's metadata list. -+ */ -+ -+void __journal_refile_buffer(struct journal_head *jh) -+{ -+ assert_spin_locked(&journal_datalist_lock); -+#ifdef __SMP__ -+ J_ASSERT_JH(jh, current->lock_depth >= 0); -+#endif -+ __journal_unfile_buffer(jh); -+ -+ /* If the buffer is now unused, just drop it. If it has been -+ modified by a later transaction, add it to the new -+ transaction's metadata list. */ -+ -+ jh->b_transaction = jh->b_next_transaction; -+ jh->b_next_transaction = NULL; -+ -+ if (jh->b_transaction != NULL) { -+ __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata); -+ J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); -+ } else { -+ /* Onto BUF_DIRTY for writeback */ -+ refile_buffer(jh2bh(jh)); -+ } -+} -+ -+/* -+ * For the unlocked version of this call, also make sure that any -+ * hanging journal_head is cleaned up if necessary. -+ * -+ * __journal_refile_buffer is usually called as part of a single locked -+ * operation on a buffer_head, in which the caller is probably going to -+ * be hooking the journal_head onto other lists. In that case it is up -+ * to the caller to remove the journal_head if necessary. For the -+ * unlocked journal_refile_buffer call, the caller isn't going to be -+ * doing anything else to the buffer so we need to do the cleanup -+ * ourselves to avoid a jh leak. -+ * -+ * *** The journal_head may be freed by this call! 
*** -+ */ -+void journal_refile_buffer(struct journal_head *jh) -+{ -+ struct buffer_head *bh; -+ -+ spin_lock(&journal_datalist_lock); -+ bh = jh2bh(jh); -+ -+ __journal_refile_buffer(jh); -+ __journal_remove_journal_head(bh); -+ -+ spin_unlock(&journal_datalist_lock); -+ __brelse(bh); -+} diff --git a/lustre/kernel_patches/patches/2.6-rhel4-kgdb-ga.patch b/lustre/kernel_patches/patches/2.6-rhel4-kgdb-ga.patch new file mode 100644 index 0000000..f3067fa --- /dev/null +++ b/lustre/kernel_patches/patches/2.6-rhel4-kgdb-ga.patch @@ -0,0 +1,6371 @@ + + +This kgdb will get called and will trap almost any kernel +fault WITHOUT BEING ARMED. + +It is entered at boot time via "kgdb" in the boot string, +not "gdb". This entry occurs when the first setup on the +boot string is called, not sometime later. You will not +find a "waiting for gdb" on your console, as the console has +not yet been enabled at this time. (Note, this early stuff +is a bit fragile as the full trap table has yet to be +loaded, something I might address, sometime... So don't try +to look at memory that can not be reached, for example. +Once the full trap table is loaded this restriction goes +away.) + +If you hard code it, you can put a breakpoint() as the FIRST +LINE OF C CODE. + +It does NOT use the serial driver, but if the serial driver +is loaded, it tells it to release the port to avoid +conflict. + +The threads stuff is not configurable, does not require +redirection of schedule() calls and does back track to the +first non schedule() caller on the info threads command. If +you switch to the thread, however, it will show it in the +switch code (as it should). + +It is MUCH more aggressive and paranoid about grabbing the +other cpus on entry. It issues a "send_nmi_all_but_self()" +rather than depending on them to interrupt or hit an NMI +sometime in the distant future. If a cpu does not come to +the party, it will continue without it so all is not lost. 
+ +It does not have anything to do with IOCTL calls, but does +do the control-C thing. + +There is a LOT of info in the patch which ends up in +.../Documentation/i386/kgdb/* + +There is a nifty little thing called kgdb_ts() (kgdb time +stamp) which is a function you can code calls to which puts +some useful stuff in a circular buffer which can be examined +with the supplied gdb macros. + +It also allows you to do "p foobar(...)" i.e. to call a +function from gdb, just like gdb allows in program +debugging. + +In an SMP system, you can choose to "hold" any given set of +cpus. It also defaults to holding other cpus on single step +(this can be overridden). + +This said, you can imagine my consternation when I found it +"lost it" on continues on 2.5. I found and fixed this +early this pm, a hold cpu on exit goof on my part. + +Oh, and a final point, the configure options are more +extensive (the serial port is set up here, for example, (can +not wait for a command line to do this)). There is one to +do system call exit tests. This is VERY new and causes the +kernel to hit a hard "int 3" if a system call attempts to +exit with preempt count other than zero. This is a fault, +of course, but the current 2.5 is full of them so I don't +recommend turning this on. + + +DESC +kgdbL warning fix +EDESC +From: Ingo Molnar + +this patch fixes a deprecated use of asm input operands. (and shuts up a +gcc 3.3 warning.) + +DESC +kgdb buffer overflow fix +EDESC +From: George Anzinger + + +DESC +kgdbL warning fix +EDESC +From: Ingo Molnar + +this patch fixes a deprecated use of asm input operands. (and shuts up a +gcc 3.3 warning.) + +DESC +kgdb: CONFIG_DEBUG_INFO fix +EDESC +From: Thomas Schlichter + +that patch sets DEBUG_INFO to y by default, even if neither DEBUG_KERNEL nor +KGDB is enabled. The attached patch changes this to enable DEBUG_INFO by +default only if KGDB is enabled. + +DESC +x86_64 fixes +EDESC +From Andi Kleen + +Fix x86_64 for kgdb. We forget why.
+DESC +correct kgdb.txt Documentation link (against 2.6.1-rc1-mm2) +EDESC +From: Jesper Juhl + +The help text for "config KGDB" in arch/i386/Kconfig refers to +Documentation/i386/kgdb.txt - the actual location is +Documentation/i386/kgdb/kgdb.txt - patch below to fix that. + +DESC +kgdb: fix for recent gcc +EDESC + +arch/i386/kernel/traps.c:97: error: conflicting types for 'int3' +arch/i386/kernel/traps.c:77: error: previous declaration of 'int3' was here +arch/i386/kernel/traps.c:97: error: conflicting types for 'int3' +arch/i386/kernel/traps.c:77: error: previous declaration of 'int3' was here +arch/i386/kernel/traps.c:99: error: conflicting types for 'debug' +arch/i386/kernel/traps.c:75: error: previous declaration of 'debug' was here +arch/i386/kernel/traps.c:99: error: conflicting types for 'debug' +arch/i386/kernel/traps.c:75: error: previous declaration of 'debug' was here + +DESC +kgdb warning fixes +EDESC + +arch/i386/kernel/kgdb_stub.c:1306: warning: 'time' might be used uninitialized in this function +arch/i386/kernel/kgdb_stub.c:1306: warning: 'dum' might be used uninitialized in this function +DESC +THREAD_SIZE fixes for kgdb +EDESC +From: Matt Mackall + +Noticed the THREAD_SIZE clean-ups are in -mm now. Here are the missing +bits for kgdb, tested in -tiny with 4k stacks. +DESC +Fix stack overflow test for non-8k stacks +EDESC +From: Matt Mackall + +This is needed to work properly with 4k and 16k stacks. +DESC +kgdb-ga.patch fix for i386 single-step into sysenter +EDESC +From: Roland McGrath + +Using kgdb-ga.patch from -mm, if userland single-steps (PTRACE_SINGLESTEP) +into the `sysenter' instruction, kgdb reports a bogus trap: + + Program received signal SIGTRAP, Trace/breakpoint trap. 
+ sysenter_past_esp () at arch/i386/kernel/entry.S:249 + 1: x/i $pc 0xc0106023 : sti + (gdb) + +The hackery in the "FIX_STACK" macro in entry.S changes the saved PC for +the spurious kernel-mode debug trap when TF was set on user-mode execution +of `sysenter', so sysenter_past_esp is where it actually lies in this case. + The following patch removes the kgdb hiccup when userland +PTRACE_SINGLESTEP's into sysenter. +DESC +fix TRAP_BAD_SYSCALL_EXITS on i386 +EDESC +From: Andy Whitcroft + +We are not using the right offset name, nor the right address when checking +for a non-zero preempt count. Move to TI_preempt_count(%ebp). + +Signed-off-by: Andy Whitcroft +DESC +add TRAP_BAD_SYSCALL_EXITS config for i386 +EDESC +From: Andy Whitcroft + +There seems to be code recently added to -bk and thereby -mm which supports +extra debug for preempt on system call exit. Oddly there doesn't seem to +be configuration options to enable them. Below is a possible patch to +allow enabling this on i386. Sadly the most obvious menu to add this to is +the Kernel Hacking menu, but that is defined in architecture specific +configuration. If this makes sense I could patch the other arches? + +Add a configuration option to allow enabling TRAP_BAD_SYSCALL_EXITS to the +Kernel Hacking menu.
+ +Signed-off-by: Andy Whitcroft +Signed-off-by: Andrew Morton +--- + + 25-akpm/Documentation/i386/kgdb/andthen | 100 + + 25-akpm/Documentation/i386/kgdb/debug-nmi.txt | 37 + 25-akpm/Documentation/i386/kgdb/gdb-globals.txt | 71 + 25-akpm/Documentation/i386/kgdb/gdbinit | 14 + 25-akpm/Documentation/i386/kgdb/gdbinit-modules | 146 + + 25-akpm/Documentation/i386/kgdb/gdbinit.hw | 117 + + 25-akpm/Documentation/i386/kgdb/kgdb.txt | 775 +++++++ + 25-akpm/Documentation/i386/kgdb/loadmodule.sh | 78 + 25-akpm/MAINTAINERS | 6 + 25-akpm/arch/i386/Kconfig | 8 + 25-akpm/arch/i386/Kconfig.debug | 2 + 25-akpm/arch/i386/Kconfig.kgdb | 175 + + 25-akpm/arch/i386/Makefile | 3 + 25-akpm/arch/i386/kernel/Makefile | 1 + 25-akpm/arch/i386/kernel/entry.S | 29 + 25-akpm/arch/i386/kernel/kgdb_stub.c | 2330 ++++++++++++++++++++++++ + 25-akpm/arch/i386/kernel/nmi.c | 25 + 25-akpm/arch/i386/kernel/smp.c | 12 + 25-akpm/arch/i386/kernel/traps.c | 77 + 25-akpm/arch/i386/lib/Makefile | 1 + 25-akpm/arch/i386/lib/kgdb_serial.c | 485 ++++ + 25-akpm/arch/i386/mm/fault.c | 6 + 25-akpm/arch/x86_64/boot/compressed/head.S | 1 + 25-akpm/arch/x86_64/boot/compressed/misc.c | 1 + 25-akpm/drivers/char/keyboard.c | 3 + 25-akpm/drivers/char/sysrq.c | 23 + 25-akpm/drivers/serial/8250.c | 40 + 25-akpm/drivers/serial/serial_core.c | 5 + 25-akpm/include/asm-i386/bugs.h | 21 + 25-akpm/include/asm-i386/kgdb.h | 59 + 25-akpm/include/asm-i386/kgdb_local.h | 102 + + 25-akpm/include/linux/config.h | 3 + 25-akpm/include/linux/dwarf2-lang.h | 132 + + 25-akpm/include/linux/dwarf2.h | 738 +++++++ + 25-akpm/include/linux/serial_core.h | 4 + 25-akpm/include/linux/spinlock.h | 12 + 25-akpm/kernel/pid.c | 6 + 25-akpm/kernel/sched.c | 7 + 38 files changed, 5645 insertions(+), 10 deletions(-) + +diff -puN arch/i386/Kconfig~kgdb-ga arch/i386/Kconfig +--- 25/arch/i386/Kconfig~kgdb-ga 2004-10-21 14:54:15.256604136 -0700 ++++ 25-akpm/arch/i386/Kconfig 2004-10-21 14:54:15.295598208 -0700 +@@ -1184,6 +1184,14 @@ menu "Executable file 
formats" + + source "fs/Kconfig.binfmt" + ++config TRAP_BAD_SYSCALL_EXITS ++ bool "Debug bad system call exits" ++ depends on KGDB ++ help ++ If you say Y here the kernel will check for system calls which ++ return without clearing preempt. ++ default n ++ + endmenu + + source "drivers/Kconfig" +diff -puN arch/i386/kernel/entry.S~kgdb-ga arch/i386/kernel/entry.S +--- 25/arch/i386/kernel/entry.S~kgdb-ga 2004-10-21 14:54:15.257603984 -0700 ++++ 25-akpm/arch/i386/kernel/entry.S 2004-10-21 14:54:15.296598056 -0700 +@@ -48,6 +48,18 @@ + #include + #include + #include "irq_vectors.h" ++ /* We do not recover from a stack overflow, but at least ++ * we know it happened and should be able to track it down. ++ */ ++#ifdef CONFIG_STACK_OVERFLOW_TEST ++#define STACK_OVERFLOW_TEST \ ++ testl $(THREAD_SIZE - 512),%esp; \ ++ jnz 10f; \ ++ call stack_overflow; \ ++10: ++#else ++#define STACK_OVERFLOW_TEST ++#endif + + #define nr_syscalls ((syscall_table_size)/4) + +@@ -94,7 +106,8 @@ VM_MASK = 0x00020000 + pushl %ebx; \ + movl $(__USER_DS), %edx; \ + movl %edx, %ds; \ +- movl %edx, %es; ++ movl %edx, %es; \ ++ STACK_OVERFLOW_TEST + + #define RESTORE_INT_REGS \ + popl %ebx; \ +@@ -198,6 +211,7 @@ need_resched: + # sysenter call handler stub + ENTRY(sysenter_entry) + movl TSS_sysenter_esp0(%esp),%esp ++ .globl sysenter_past_esp + sysenter_past_esp: + sti + pushl $(__USER_DS) +@@ -260,6 +274,19 @@ syscall_exit: + testw $_TIF_ALLWORK_MASK, %cx # current->work + jne syscall_exit_work + restore_all: ++#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS ++ movl EFLAGS(%esp), %eax # mix EFLAGS and CS ++ movb CS(%esp), %al ++ testl $(VM_MASK | 3), %eax ++ jz resume_kernelX # returning to kernel or vm86-space ++ ++ cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? 
++ jz resume_kernelX ++ ++ int $3 ++ ++resume_kernelX: ++#endif + RESTORE_ALL + + # perform work that needs to be done immediately before resumption +diff -puN /dev/null arch/i386/kernel/kgdb_stub.c +--- /dev/null 2003-09-15 06:40:47.000000000 -0700 ++++ 25-akpm/arch/i386/kernel/kgdb_stub.c 2004-10-21 14:54:15.307596384 -0700 +@@ -0,0 +1,2330 @@ ++/* ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2, or (at your option) any ++ * later version. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ */ ++ ++/* ++ * Copyright (c) 2000 VERITAS Software Corporation. ++ * ++ */ ++/**************************************************************************** ++ * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ ++ * ++ * Module name: remcom.c $ ++ * Revision: 1.34 $ ++ * Date: 91/03/09 12:29:49 $ ++ * Contributor: Lake Stevens Instrument Division$ ++ * ++ * Description: low level support for gdb debugger. $ ++ * ++ * Considerations: only works on target hardware $ ++ * ++ * Written by: Glenn Engel $ ++ * Updated by: David Grothe ++ * ModuleState: Experimental $ ++ * ++ * NOTES: See Below $ ++ * ++ * Modified for 386 by Jim Kingdon, Cygnus Support. ++ * Compatibility with 2.1.xx kernel by David Grothe ++ * ++ * Changes to allow auto initilization. All that is needed is that it ++ * be linked with the kernel and a break point (int 3) be executed. ++ * The header file defines BREAKPOINT to allow one to do ++ * this. It should also be possible, once the interrupt system is up, to ++ * call putDebugChar("+"). Once this is done, the remote debugger should ++ * get our attention by sending a ^C in a packet. 
George Anzinger ++ * ++ * Integrated into 2.2.5 kernel by Tigran Aivazian ++ * Added thread support, support for multiple processors, ++ * support for ia-32(x86) hardware debugging. ++ * Amit S. Kale ( akale@veritas.com ) ++ * ++ * ++ * To enable debugger support, two things need to happen. One, a ++ * call to set_debug_traps() is necessary in order to allow any breakpoints ++ * or error conditions to be properly intercepted and reported to gdb. ++ * Two, a breakpoint needs to be generated to begin communication. This ++ * is most easily accomplished by a call to breakpoint(). Breakpoint() ++ * simulates a breakpoint by executing an int 3. ++ * ++ ************* ++ * ++ * The following gdb commands are supported: ++ * ++ * command function Return value ++ * ++ * g return the value of the CPU registers hex data or ENN ++ * G set the value of the CPU registers OK or ENN ++ * ++ * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN ++ * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN ++ * ++ * c Resume at current address SNN ( signal NN) ++ * cAA..AA Continue at address AA..AA SNN ++ * ++ * s Step one instruction SNN ++ * sAA..AA Step one instruction from AA..AA SNN ++ * ++ * k kill ++ * ++ * ? What was the last sigval ? SNN (signal NN) ++ * ++ * All commands and responses are sent with a packet which includes a ++ * checksum. A packet consists of ++ * ++ * $#. ++ * ++ * where ++ * :: ++ * :: < two hex digits computed as modulo 256 sum of > ++ * ++ * When a packet is received, it is first acknowledged with either '+' or '-'. ++ * '+' indicates a successful transfer. '-' indicates a failed transfer. 
++ * ++ * Example: ++ * ++ * Host: Reply: ++ * $m0,10#2a +$00010203040506070809101112131415#42 ++ * ++ ****************************************************************************/ ++#define KGDB_VERSION "<20030915.1651.33>" ++#include ++#include ++#include /* for strcpy */ ++#include ++#include ++#include ++#include ++#include /* for linux pt_regs struct */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/************************************************************************ ++ * ++ * external low-level support routines ++ */ ++typedef void (*Function) (void); /* pointer to a function */ ++ ++/* Thread reference */ ++typedef unsigned char threadref[8]; ++ ++extern void putDebugChar(int); /* write a single character */ ++extern int getDebugChar(void); /* read and return a single char */ ++ ++/************************************************************************/ ++/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ ++/* at least NUMREGBYTES*2 are needed for register packets */ ++/* Longer buffer is needed to list all threads */ ++#define BUFMAX 400 ++ ++char *kgdb_version = KGDB_VERSION; ++ ++/* debug > 0 prints ill-formed commands in valid packets & checksum errors */ ++int debug_regs = 0; /* set to non-zero to print registers */ ++ ++/* filled in by an external module */ ++char *gdb_module_offsets; ++ ++static const char hexchars[] = "0123456789abcdef"; ++ ++/* Number of bytes of registers. */ ++#define NUMREGBYTES 64 ++/* ++ * Note that this register image is in a different order than ++ * the register image that Linux produces at interrupt time. ++ * ++ * Linux's register image is defined by struct pt_regs in ptrace.h. ++ * Just why GDB uses a different order is a historical mystery. 
++ */ ++enum regnames { _EAX, /* 0 */ ++ _ECX, /* 1 */ ++ _EDX, /* 2 */ ++ _EBX, /* 3 */ ++ _ESP, /* 4 */ ++ _EBP, /* 5 */ ++ _ESI, /* 6 */ ++ _EDI, /* 7 */ ++ _PC /* 8 also known as eip */ , ++ _PS /* 9 also known as eflags */ , ++ _CS, /* 10 */ ++ _SS, /* 11 */ ++ _DS, /* 12 */ ++ _ES, /* 13 */ ++ _FS, /* 14 */ ++ _GS /* 15 */ ++}; ++ ++/*************************** ASSEMBLY CODE MACROS *************************/ ++/* ++ * Put the error code here just in case the user cares. ++ * Likewise, the vector number here (since GDB only gets the signal ++ * number through the usual means, and that's not very specific). ++ * The called_from is the return address so he can tell how we entered kgdb. ++ * This will allow him to seperate out the various possible entries. ++ */ ++#define REMOTE_DEBUG 0 /* set != to turn on printing (also available in info) */ ++ ++#define PID_MAX PID_MAX_DEFAULT ++ ++#ifdef CONFIG_SMP ++void smp_send_nmi_allbutself(void); ++#define IF_SMP(x) x ++#undef MAX_NO_CPUS ++#ifndef CONFIG_NO_KGDB_CPUS ++#define CONFIG_NO_KGDB_CPUS 2 ++#endif ++#if CONFIG_NO_KGDB_CPUS > NR_CPUS ++#define MAX_NO_CPUS NR_CPUS ++#else ++#define MAX_NO_CPUS CONFIG_NO_KGDB_CPUS ++#endif ++#define hold_init hold_on_sstep: 1, ++#define MAX_CPU_MASK (unsigned long)((1LL << MAX_NO_CPUS) - 1LL) ++#define NUM_CPUS num_online_cpus() ++#else ++#define IF_SMP(x) ++#define hold_init ++#undef MAX_NO_CPUS ++#define MAX_NO_CPUS 1 ++#define NUM_CPUS 1 ++#endif ++#define NOCPU (struct task_struct *)0xbad1fbad ++/* *INDENT-OFF* */ ++struct kgdb_info { ++ int used_malloc; ++ void *called_from; ++ long long entry_tsc; ++ int errcode; ++ int vector; ++ int print_debug_info; ++#ifdef CONFIG_SMP ++ int hold_on_sstep; ++ struct { ++ volatile struct task_struct *task; ++ int pid; ++ int hold; ++ struct pt_regs *regs; ++ } cpus_waiting[MAX_NO_CPUS]; ++#endif ++} kgdb_info = {hold_init print_debug_info:REMOTE_DEBUG, vector:-1}; ++ ++/* *INDENT-ON* */ ++ ++#define used_m kgdb_info.used_malloc ++/* ++ 
* This is little area we set aside to contain the stack we ++ * need to build to allow gdb to call functions. We use one ++ * per cpu to avoid locking issues. We will do all this work ++ * with interrupts off so that should take care of the protection ++ * issues. ++ */ ++#define LOOKASIDE_SIZE 200 /* should be more than enough */ ++#define MALLOC_MAX 200 /* Max malloc size */ ++struct { ++ unsigned int esp; ++ int array[LOOKASIDE_SIZE]; ++} fn_call_lookaside[MAX_NO_CPUS]; ++ ++static int trap_cpu; ++static unsigned int OLD_esp; ++ ++#define END_OF_LOOKASIDE &fn_call_lookaside[trap_cpu].array[LOOKASIDE_SIZE] ++#define IF_BIT 0x200 ++#define TF_BIT 0x100 ++ ++#define MALLOC_ROUND 8-1 ++ ++static char malloc_array[MALLOC_MAX]; ++IF_SMP(static void to_gdb(const char *mess)); ++void * ++malloc(int size) ++{ ++ ++ if (size <= (MALLOC_MAX - used_m)) { ++ int old_used = used_m; ++ used_m += ((size + MALLOC_ROUND) & (~MALLOC_ROUND)); ++ return &malloc_array[old_used]; ++ } else { ++ return NULL; ++ } ++} ++ ++/* ++ * Gdb calls functions by pushing agruments, including a return address ++ * on the stack and the adjusting EIP to point to the function. The ++ * whole assumption in GDB is that we are on a different stack than the ++ * one the "user" i.e. code that hit the break point, is on. This, of ++ * course is not true in the kernel. Thus various dodges are needed to ++ * do the call without directly messing with EIP (which we can not change ++ * as it is just a location and not a register. To adjust it would then ++ * require that we move every thing below EIP up or down as needed. This ++ * will not work as we may well have stack relative pointer on the stack ++ * (such as the pointer to regs, for example). ++ ++ * So here is what we do: ++ * We detect gdb attempting to store into the stack area and instead, store ++ * into the fn_call_lookaside.array at the same relative location as if it ++ * were the area ESP pointed at. 
We also trap ESP modifications
++ * and use these to adjust fn_call_lookaside.esp.  On entry
++ * fn_call_lookaside.esp will be set to point at the last entry in
++ * fn_call_lookaside.array.  This allows us to check if it has changed, and
++ * if so, on exit, we add the registers we will use to do the move and a
++ * trap/ interrupt return exit sequence.  We then adjust the eflags in the
++ * regs array (remember we now have a copy in the fn_call_lookaside.array) to
++ * kill the interrupt bit, AND we change EIP to point at our set up stub.
++ * As part of the register set up we preset the registers to point at the
++ * beginning and end of the fn_call_lookaside.array, so all the stub needs to
++ * do is move words from the array to the stack until ESP= the desired value
++ * then do the iret.  This will then transfer to the desired function with
++ * all the correct registers.  Nifty huh?
++ */
++extern asmlinkage void fn_call_stub(void);
++extern asmlinkage void fn_rtn_stub(void);
++/* *INDENT-OFF* */
++__asm__("fn_rtn_stub:\n\t"
++	"movl %eax,%esp\n\t"		/* fn_rtn_stub entry only: load ESP from EAX */
++	"fn_call_stub:\n\t"
++	"1:\n\t"
++	"addl $-4,%ebx\n\t"		/* step EBX down by one 32-bit word */
++	"movl (%ebx), %eax\n\t"		/* fetch the word EBX now points at */
++	"pushl %eax\n\t"		/* and push it onto the stack */
++	"cmpl %esp,%ecx\n\t"		/* keep copying until ESP equals ECX */
++	"jne 1b\n\t"
++	"popl %eax\n\t"			/* reload EAX, EBX and ECX from the copied words */
++	"popl %ebx\n\t"
++	"popl %ecx\n\t"
++	"iret \n\t");			/* interrupt return using what is left on the stack */
++/* *INDENT-ON* */
++#define gdb_i386vector  kgdb_info.vector
++#define gdb_i386errcode kgdb_info.errcode
++#define waiting_cpus    kgdb_info.cpus_waiting
++#define remote_debug    kgdb_info.print_debug_info
++#define hold_cpu(cpu)   kgdb_info.cpus_waiting[cpu].hold
++/* gdb locks */
++
++#ifdef CONFIG_SMP
++static int in_kgdb_called;	/* set to 1 by in_kgdb() each time it is called */
++static spinlock_t waitlocks[MAX_NO_CPUS] =
++    {[0 ... MAX_NO_CPUS - 1] = SPIN_LOCK_UNLOCKED };	/* one per cpu; in_kgdb() waits on its own entry */
++/*
++ * The following array has the thread pointer of each of the "other"
++ * cpus.  We make it global so it can be seen by gdb.
++ */
++volatile int in_kgdb_entry_log[MAX_NO_CPUS];
++volatile struct pt_regs *in_kgdb_here_log[MAX_NO_CPUS];
++/*
++static spinlock_t continuelocks[MAX_NO_CPUS];
++*/
++spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED;
++/* waiters on our spinlock plus us */
++static atomic_t spinlock_waiters = ATOMIC_INIT(1);
++static int spinlock_count = 0;
++static int spinlock_cpu = 0;
++/*
++ * Note we use nested spin locks to account for the case where a break
++ * point is encountered when calling a function by user direction from
++ * kgdb. Also there is the memory exception recursion to account for.
++ * Well, yes, but this lets other cpus thru too.  Lets add a
++ * cpu id to the lock.
++ *
++ * KGDB_SPIN_LOCK expects a variable or macro named "regs" to be in scope
++ * at the point of use, as it passes &regs to in_kgdb() while spinning.
++ */
++#define KGDB_SPIN_LOCK(x) if( spinlock_count == 0 || \
++			      spinlock_cpu != smp_processor_id()){\
++				      atomic_inc(&spinlock_waiters); \
++				      while (! spin_trylock(x)) {\
++					    in_kgdb(&regs);\
++				      }\
++				      atomic_dec(&spinlock_waiters); \
++				      spinlock_count = 1; \
++				      spinlock_cpu = smp_processor_id(); \
++			  }else{  \
++				      spinlock_count++; \
++			  }
++#define KGDB_SPIN_UNLOCK(x) if( --spinlock_count == 0) spin_unlock(x)
++#else
++unsigned kgdb_spinlock = 0;
++#define KGDB_SPIN_LOCK(x) --*x
++#define KGDB_SPIN_UNLOCK(x) ++*x
++#endif
++
++/*
++ * Convert one ASCII hex digit (either case) to its value 0..15.
++ * Returns -1 when ch is not a hex digit.
++ */
++int
++hex(char ch)
++{
++	if ((ch >= 'a') && (ch <= 'f'))
++		return (ch - 'a' + 10);
++	if ((ch >= '0') && (ch <= '9'))
++		return (ch - '0');
++	if ((ch >= 'A') && (ch <= 'F'))
++		return (ch - 'A' + 10);
++	return (-1);
++}
++
++/*
++ * Scan for the sequence '$' data '#' checksum and store the data,
++ * NUL terminated, in buffer.  buffer must hold at least BUFMAX bytes.
++ * A packet is acknowledged with '+' when the two hex checksum digits
++ * match the modulo-256 sum of the data and with '-' otherwise; the
++ * function keeps reading until a packet with a good checksum arrives.
++ */
++void
++getpacket(char *buffer)
++{
++	unsigned char checksum;
++	unsigned char xmitcsum;
++	int i;
++	int count;
++	char ch;
++
++	do {
++		/* wait around for the start character, ignore all other characters */
++		while ((ch = (getDebugChar() & 0x7f)) != '$') ;
++		checksum = 0;
++		xmitcsum = -1;
++
++		count = 0;
++
++		/*
++		 * now, read until a # or end of buffer is found; stop one
++		 * short of BUFMAX so the terminator below stays in bounds
++		 */
++		while (count < BUFMAX - 1) {
++			ch = getDebugChar() & 0x7f;
++			if (ch == '#')
++				break;
++			checksum = checksum + ch;
++			buffer[count] = ch;
++			count = count + 1;
++		}
++		buffer[count] = 0;
++
++		if (ch == '#') {
++			xmitcsum = hex(getDebugChar() & 0x7f) << 4;
++			xmitcsum += hex(getDebugChar() & 0x7f);
++			if ((remote_debug) && (checksum != xmitcsum)) {
++				printk
++				    ("bad checksum. My count = 0x%x, sent=0x%x. buf=%s\n",
++				     checksum, xmitcsum, buffer);
++			}
++
++			if (checksum != xmitcsum)
++				putDebugChar('-');	/* failed checksum */
++			else {
++				putDebugChar('+');	/* successful transfer */
++				/*
++				 * if a sequence char is present, reply the
++				 * sequence ID; only look at buffer[2] when
++				 * this packet really is that long
++				 */
++				if (count > 2 && buffer[2] == ':') {
++					putDebugChar(buffer[0]);
++					putDebugChar(buffer[1]);
++					/* remove sequence chars from buffer */
++					count = strlen(buffer);
++					for (i = 3; i <= count; i++)
++						buffer[i - 3] = buffer[i];
++				}
++			}
++		}
++	} while (checksum != xmitcsum);
++
++	if (remote_debug)
++		printk("R:%s\n", buffer);
++}
++
++/*
++ * Send the NUL terminated packet in buffer as '$' data '#' checksum and
++ * resend it until the other side answers with '+'.
++ */
++
++void
++putpacket(char *buffer)
++{
++	unsigned char checksum;
++	int count;
++	char ch;
++
++	do {
++		if (remote_debug)
++			printk("T:%s\n", buffer);
++		putDebugChar('$');
++		checksum = 0;
++		count = 0;
++
++		while ((ch = buffer[count])) {
++			putDebugChar(ch);
++			checksum += ch;
++			count += 1;
++		}
++
++		putDebugChar('#');
++		putDebugChar(hexchars[checksum >> 4]);
++		putDebugChar(hexchars[checksum % 16]);
++
++	} while ((getDebugChar() & 0x7f) != '+');
++
++}
++
++static char remcomInBuffer[BUFMAX];
++static char remcomOutBuffer[BUFMAX];
++static short error;
++
++/* printk(format, parm), but only when remote_debug is switched on. */
++void
++debug_error(char *format, char *parm)
++{
++	if (remote_debug)
++		printk(format, parm);
++}
++
++/*
++ * Dump the trap frame with printk.  ESP is printed as the address of
++ * regs->esp and SS as __KERNEL_DS (see the note in regs_to_gdb_regs).
++ */
++static void
++print_regs(struct pt_regs *regs)
++{
++	printk("EAX=%08lx ", regs->eax);
++	printk("EBX=%08lx ", regs->ebx);
++	printk("ECX=%08lx ", regs->ecx);
++	printk("EDX=%08lx ", regs->edx);
++	printk("\n");
++	printk("ESI=%08lx ", regs->esi);
++	printk("EDI=%08lx ", regs->edi);
++	printk("EBP=%08lx ", regs->ebp);
++	printk("ESP=%08lx ", (long) &regs->esp);
++	printk("\n");
++	printk(" DS=%08x ", regs->xds);
++	printk(" ES=%08x ", regs->xes);
++	printk(" SS=%08x ", __KERNEL_DS);
++	printk(" FL=%08lx ", regs->eflags);
++	printk("\n");
++	printk(" CS=%08x ", regs->xcs);
++	printk(" IP=%08lx ", regs->eip);
++#if 0
++	printk(" FS=%08x ", regs->fs);
++	printk(" GS=%08x ", regs->gs);
++#endif
++	printk("\n");
++
++}				/* print_regs */
++
++#define NEW_esp fn_call_lookaside[trap_cpu].esp
++
++/*
++ * Fill the gdb register array (indexed by enum regnames) from a trap
++ * frame.  ESP comes from NEW_esp, SS is __KERNEL_DS and FS/GS are
++ * reported as 0xFFFF.
++ */
++static void
++regs_to_gdb_regs(int *gdb_regs, struct pt_regs *regs)
++{
++	gdb_regs[_EAX] = regs->eax;
++	gdb_regs[_EBX] = regs->ebx;
++	gdb_regs[_ECX] = regs->ecx;
++	gdb_regs[_EDX] = regs->edx;
++	gdb_regs[_ESI] = regs->esi;
++	gdb_regs[_EDI] = regs->edi;
++	gdb_regs[_EBP] = regs->ebp;
++	gdb_regs[_DS] = regs->xds;
++	gdb_regs[_ES] = regs->xes;
++	gdb_regs[_PS] = regs->eflags;
++	gdb_regs[_CS] = regs->xcs;
++	gdb_regs[_PC] = regs->eip;
++	/* Note, as we are debugging the kernel, we will always
++	 * trap in kernel code, this means no privilege change,
++	 * and so the pt_regs structure is not completely valid.  In a non
++	 * privilege change trap, only EFLAGS, CS and EIP are put on the stack,
++	 * SS and ESP are not stacked, this means that the last 2 elements of
++	 * pt_regs are not valid (they would normally refer to the user stack)
++	 * also, using regs+1 is no good because you end up with a value that is
++	 * 2 longs (8) too high.  This used to cause stepping over functions
++	 * to fail, so my fix is to use the address of regs->esp, which
++	 * should point at the end of the stack frame.  Note I have ignored
++	 * completely exceptions that cause an error code to be stacked, such
++	 * as double fault.  Stuart Hughes, Zentropix.
++	 * original code: gdb_regs[_ESP] = (int) (regs + 1) ;
++
++	 * this is now done on entry and moved to OLD_esp (as well as NEW_esp).
++	 */
++	gdb_regs[_ESP] = NEW_esp;
++	gdb_regs[_SS] = __KERNEL_DS;
++	gdb_regs[_FS] = 0xFFFF;
++	gdb_regs[_GS] = 0xFFFF;
++}				/* regs_to_gdb_regs */
++
++/*
++ * Copy a gdb register array back into a trap frame.  ESP is only
++ * remembered in NEW_esp; SS, FS and GS are not written back.
++ */
++static void
++gdb_regs_to_regs(int *gdb_regs, struct pt_regs *regs)
++{
++	regs->eax = gdb_regs[_EAX];
++	regs->ebx = gdb_regs[_EBX];
++	regs->ecx = gdb_regs[_ECX];
++	regs->edx = gdb_regs[_EDX];
++	regs->esi = gdb_regs[_ESI];
++	regs->edi = gdb_regs[_EDI];
++	regs->ebp = gdb_regs[_EBP];
++	regs->xds = gdb_regs[_DS];
++	regs->xes = gdb_regs[_ES];
++	regs->eflags = gdb_regs[_PS];
++	regs->xcs = gdb_regs[_CS];
++	regs->eip = gdb_regs[_PC];
++	NEW_esp = gdb_regs[_ESP];	/* keep the value */
++#if 0				/* can't change these */
++	regs->esp = gdb_regs[_ESP];
++	regs->xss = gdb_regs[_SS];
++	regs->fs = gdb_regs[_FS];
++	regs->gs = gdb_regs[_GS];
++#endif
++
++}				/* gdb_regs_to_regs */
++
++int thread_list = 0;
++
++/*
++ * Produce the gdb register array for task p.
++ *   - p NULL or the current task: taken from regs.
++ *   - (SMP) a task parked in kgdb_info.cpus_waiting[]: taken from the
++ *     registers recorded for that cpu.
++ *   - any other task: ESP and EIP come from p->thread, EBP/EDI/ESI from
++ *     the first three words at that ESP, everything else is zero.  When
++ *     thread_list is set and the task is not running, the frame chain is
++ *     followed (at most 17 frames) until the PC leaves the scheduler.
++ */
++void
++get_gdb_regs(struct task_struct *p, struct pt_regs *regs, int *gdb_regs)
++{
++	unsigned long stack_page;
++	int count = 0;
++	IF_SMP(int i);
++	if (!p || p == current) {
++		regs_to_gdb_regs(gdb_regs, regs);
++		return;
++	}
++#ifdef CONFIG_SMP
++	for (i = 0; i < MAX_NO_CPUS; i++) {
++		if (p == kgdb_info.cpus_waiting[i].task) {
++			regs_to_gdb_regs(gdb_regs,
++					 kgdb_info.cpus_waiting[i].regs);
++			gdb_regs[_ESP] =
++			    (int) &kgdb_info.cpus_waiting[i].regs->esp;
++
++			return;
++		}
++	}
++#endif
++	memset(gdb_regs, 0, NUMREGBYTES);
++	gdb_regs[_ESP] = p->thread.esp;
++	gdb_regs[_PC] = p->thread.eip;
++	gdb_regs[_EBP] = *(int *) gdb_regs[_ESP];
++	gdb_regs[_EDI] = *(int *) (gdb_regs[_ESP] + 4);
++	gdb_regs[_ESI] = *(int *) (gdb_regs[_ESP] + 8);
++
++/*
++ * This code is to give a more informative notion of where a process
++ * is waiting.  It is used only when the user asks for a thread info
++ * list.  If he then switches to the thread, s/he will find the task
++ * is in schedule, but a back trace should show the same info we come
++ * up with.  This code was shamelessly purloined from process.c.  It was
++ * then enhanced to provide more registers than simply the program
++ * counter.
++ */
++
++	if (!thread_list) {
++		return;
++	}
++
++	if (p->state == TASK_RUNNING)
++		return;
++	stack_page = (unsigned long) p->thread_info;
++	if (gdb_regs[_ESP] < stack_page || gdb_regs[_ESP] >
++	    THREAD_SIZE - sizeof(long) + stack_page)
++		return;
++	/* include/asm-i386/system.h:switch_to() pushes ebp last. */
++	do {
++		if (gdb_regs[_EBP] < stack_page ||
++		    gdb_regs[_EBP] > THREAD_SIZE - 2*sizeof(long) + stack_page)
++			return;
++		gdb_regs[_PC] = *(unsigned long *) (gdb_regs[_EBP] + 4);
++		gdb_regs[_ESP] = gdb_regs[_EBP] + 8;
++		gdb_regs[_EBP] = *(unsigned long *) gdb_regs[_EBP];
++		if (!in_sched_functions(gdb_regs[_PC]))
++			return;
++	} while (count++ < 16);
++	return;
++}
++
++/* Indicate to caller of mem2hex or hex2mem that there has been an
++   error.  */
++static volatile int mem_err = 0;
++static volatile int mem_err_expected = 0;
++static volatile int mem_err_cnt = 0;
++static int garbage_loc = -1;
++
++/* Fetch one byte from addr. */
++int
++get_char(char *addr)
++{
++	return *addr;
++}
++
++/* Store the byte val at addr, redirected as described below. */
++void
++set_char(char *addr, int val, int may_fault)
++{
++	/*
++	 * This code traps references to the area mapped to the kernel
++	 * stack as given by the regs and, instead, stores to the
++	 * fn_call_lookaside[cpu].array
++	 */
++	if (may_fault &&
++	    (unsigned int) addr < OLD_esp &&
++	    ((unsigned int) addr > (OLD_esp - (unsigned int) LOOKASIDE_SIZE))) {
++		addr = (char *) END_OF_LOOKASIDE - ((char *) OLD_esp - addr);
++	}
++	*addr = val;
++}
++
++/* convert the memory pointed to by mem into hex, placing result in buf */
++/* return a pointer to the last char put in buf (null) */
++/* If MAY_FAULT is non-zero, then we should set mem_err in response to
++   a fault; if zero treat a fault like any other fault in the stub.
*/
++char *
++mem2hex(char *mem, char *buf, int count, int may_fault)
++{
++	int i;
++	unsigned char ch;
++
++	if (may_fault) {
++		mem_err_expected = 1;
++		mem_err = 0;
++	}
++	for (i = 0; i < count; i++) {
++		/* printk("%lx = ", mem) ; */
++
++		ch = get_char(mem++);
++
++		/* printk("%02x\n", ch & 0xFF) ; */
++		if (may_fault && mem_err) {
++			if (remote_debug)
++				printk("Mem fault fetching from addr %lx\n",
++				       (long) (mem - 1));
++			*buf = 0;	/* truncate buffer */
++			return (buf);
++		}
++		*buf++ = hexchars[ch >> 4];
++		*buf++ = hexchars[ch % 16];
++	}
++	*buf = 0;
++	if (may_fault)
++		mem_err_expected = 0;
++	return (buf);
++}
++
++/* convert the hex array pointed to by buf into binary to be placed in mem */
++/* return a pointer to the character AFTER the last byte written */
++/* NOTE: We use the may fault flag to also indicate if the write is to
++ * the registers (0) or "other" memory (!=0)
++ */
++char *
++hex2mem(char *buf, char *mem, int count, int may_fault)
++{
++	int i;
++	unsigned char ch;
++
++	if (may_fault) {
++		mem_err_expected = 1;
++		mem_err = 0;
++	}
++	for (i = 0; i < count; i++) {
++		ch = hex(*buf++) << 4;
++		ch = ch + hex(*buf++);
++		set_char(mem++, ch, may_fault);
++
++		if (may_fault && mem_err) {
++			if (remote_debug)
++				printk("Mem fault storing to addr %lx\n",
++				       (long) (mem - 1));
++			return (mem);
++		}
++	}
++	if (may_fault)
++		mem_err_expected = 0;
++	return (mem);
++}
++
++/**********************************************/
++/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */
++/* RETURN NUMBER OF CHARS PROCESSED           */
++/* *ptr IS LEFT AT THE FIRST NON-HEX CHAR     */
++/**********************************************/
++int
++hexToInt(char **ptr, int *intValue)
++{
++	int numChars = 0;
++	int hexValue;
++
++	*intValue = 0;
++
++	while (**ptr) {
++		hexValue = hex(**ptr);
++		if (hexValue >= 0) {
++			*intValue = (*intValue << 4) | hexValue;
++			numChars++;
++		} else
++			break;
++
++		(*ptr)++;
++	}
++
++	return (numChars);
++}
++
++#define stubhex(h) hex(h)
++#ifdef old_thread_list
++
++/* Build an int from fieldlength hex digits starting at buff. */
++static int
++stub_unpack_int(char *buff, int fieldlength)
++{
++	int nibble;
++	int retval = 0;
++
++	while (fieldlength) {
++		nibble = stubhex(*buff++);
++		retval |= nibble;
++		fieldlength--;
++		if (fieldlength)
++			retval = retval << 4;
++	}
++	return retval;
++}
++#endif
++/* Write the low byte of "byte" as two hex digits; returns the advanced pkt. */
++static char *
++pack_hex_byte(char *pkt, int byte)
++{
++	*pkt++ = hexchars[(byte >> 4) & 0xf];
++	*pkt++ = hexchars[(byte & 0xf)];
++	return pkt;
++}
++
++#define BUF_THREAD_ID_SIZE 16
++
++/*
++ * Write a thread reference as BUF_THREAD_ID_SIZE hex digits (two per
++ * byte read from id); returns the advanced pkt.
++ */
++static char *
++pack_threadid(char *pkt, threadref * id)
++{
++	char *limit;
++	unsigned char *altid;
++
++	altid = (unsigned char *) id;
++	limit = pkt + BUF_THREAD_ID_SIZE;
++	while (pkt < limit)
++		pkt = pack_hex_byte(pkt, *altid++);
++	return pkt;
++}
++
++#ifdef old_thread_list
++/* Read one byte given as two hex digits; returns buf advanced past them. */
++static char *
++unpack_byte(char *buf, int *value)
++{
++	*value = stub_unpack_int(buf, 2);
++	return buf + 2;
++}
++
++/* Inverse of pack_threadid(); returns inbuf advanced past the digits. */
++static char *
++unpack_threadid(char *inbuf, threadref * id)
++{
++	char *altref;
++	char *limit = inbuf + BUF_THREAD_ID_SIZE;
++	int x, y;
++
++	altref = (char *) id;
++
++	while (inbuf < limit) {
++		x = stubhex(*inbuf++);
++		y = stubhex(*inbuf++);
++		*altref++ = (x << 4) | y;
++	}
++	return inbuf;
++}
++#endif
++/*
++ * Store value in a thread reference: four zero bytes followed by the
++ * four bytes of value, most significant first.
++ */
++void
++int_to_threadref(threadref * id, int value)
++{
++	unsigned char *scan;
++
++	scan = (unsigned char *) id;
++	{
++		int i = 4;
++		while (i--)
++			*scan++ = 0;
++	}
++	*scan++ = (value >> 24) & 0xff;
++	*scan++ = (value >> 16) & 0xff;
++	*scan++ = (value >> 8) & 0xff;
++	*scan++ = (value & 0xff);
++}
++/*
++ * Write value as hex digits without leading zeros (a single '0' for
++ * zero) at id.  No terminator is written.  Returns the digit count.
++ */
++int
++int_to_hex_v(unsigned char * id, int value)
++{
++	unsigned char *start = id;
++	int shift;
++	int ch;
++
++	for (shift = 28; shift >= 0; shift -= 4) {
++		if ((ch = (value >> shift) & 0xf) || (id != start)) {
++			*id = hexchars[ch];
++			id++;
++		}
++	}
++	if (id == start)
++		*id++ = '0';
++	return id - start;
++}
++#ifdef old_thread_list
++
++/* Inverse of int_to_threadref(): read the last four bytes as an int. */
++static int
++threadref_to_int(threadref * ref)
++{
++	int i, value = 0;
++	unsigned char *scan;
++
++	scan = (unsigned char *) ref;
++	scan += 4;
++	i = 4;
++	while (i-- > 0)
++		value = (value << 8) | ((*scan++) & 0xff);
++	return value;
++}
++#endif
++/* Returns 1 when the first count bytes of s1 and s2 are equal, else 0. */
++static int
++cmp_str(char *s1, char *s2, int count)
++{
++	while (count--) {
++		if (*s1++ != *s2++)
++			return 0;
++	}
++	return 1;
++}
++
++#if 1				/* this is a hold over from 2.4 where O(1) was "sometimes" */
++extern struct task_struct *kgdb_get_idle(int cpu);
++#define idle_task(cpu) kgdb_get_idle(cpu)
++#else
++#define idle_task(cpu) init_tasks[cpu]
++#endif
++
++extern int kgdb_pid_init_done;
++
++/*
++ * Map a gdb thread id to a task.  Ids PID_MAX .. PID_MAX + MAX_NO_CPUS - 1
++ * stand for the idle task of cpu (id - PID_MAX), matching the pid that
++ * in_kgdb() records for a task whose pid is 0.  Everything else is looked
++ * up as a pid.  Returns NULL when there is no such task.
++ */
++struct task_struct *
++getthread(int pid)
++{
++	struct task_struct *thread;
++
++	/* "<" rather than "<=": cpu numbers stop at MAX_NO_CPUS - 1 */
++	if (pid >= PID_MAX && pid < (PID_MAX + MAX_NO_CPUS)) {
++
++		return idle_task(pid - PID_MAX);
++	} else {
++		/*
++		 * find_task_by_pid is relatively safe all the time
++		 * Other pid functions require lock downs which imply
++		 * that we may be interrupting them (as we get here
++		 * in the middle of most any lock down).
++		 * Still we don't want to call until the table exists!
++		 */
++		if (kgdb_pid_init_done){
++			thread = find_task_by_pid(pid);
++			if (thread) {
++				return thread;
++			}
++		}
++	}
++	return NULL;
++}
++/* *INDENT-OFF* */
++struct hw_breakpoint {
++	unsigned enabled;
++	unsigned type;
++	unsigned len;
++	unsigned addr;
++} breakinfo[4] = { {enabled:0},
++		   {enabled:0},
++		   {enabled:0},
++		   {enabled:0}};
++/* *INDENT-ON* */
++unsigned hw_breakpoint_status;
++/*
++ * Bring the debug registers in line with breakinfo[]: every enabled
++ * entry whose enable bit is clear in dr7 gets its address, length and
++ * type programmed and its enable bit set; every disabled entry whose
++ * bit is still set gets cleared.  dr7 is written back only if something
++ * changed.
++ */
++void
++correct_hw_break(void)
++{
++	int breakno;
++	int correctit;
++	int breakbit;
++	unsigned dr7;
++
++	asm volatile ("movl %%db7, %0\n":"=r" (dr7)
++		      :);
++	/* *INDENT-OFF* */
++	/* the four address registers are read here but not used further */
++	do {
++		unsigned addr0, addr1, addr2, addr3;
++		asm volatile ("movl %%db0, %0\n"
++			      "movl %%db1, %1\n"
++			      "movl %%db2, %2\n"
++			      "movl %%db3, %3\n"
++			      :"=r" (addr0), "=r"(addr1),
++			      "=r"(addr2), "=r"(addr3)
++			      :);
++	} while (0);
++	/* *INDENT-ON* */
++	correctit = 0;
++	/* all four breakinfo[] slots; the loop used to stop after slot 2 */
++	for (breakno = 0; breakno < 4; breakno++) {
++		breakbit = 2 << (breakno << 1);
++		if (!(dr7 & breakbit) && breakinfo[breakno].enabled) {
++			correctit = 1;
++			dr7 |= breakbit;
++			/* unsigned constant: slot 3 shifts into bit 31 */
++			dr7 &= ~(0xf0000U << (breakno << 2));
++			dr7 |= (((breakinfo[breakno].len << 2) |
++				 breakinfo[breakno].type) << 16) <<
++			    (breakno << 2);
++			switch (breakno) {
++			case 0:
++				asm volatile ("movl %0, %%dr0\n"::"r"
++					      (breakinfo[breakno].addr));
++				break;
++
++			case 1:
++				asm volatile ("movl %0, %%dr1\n"::"r"
++					      (breakinfo[breakno].addr));
++				break;
++
++			case 2:
++				asm volatile ("movl %0, %%dr2\n"::"r"
++					      (breakinfo[breakno].addr));
++				break;
++
++			case 3:
++				asm volatile ("movl %0, %%dr3\n"::"r"
++					      (breakinfo[breakno].addr));
++				break;
++			}
++		} else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) {
++			correctit = 1;
++			dr7 &= ~breakbit;
++			dr7 &= ~(0xf0000U << (breakno << 2));
++		}
++	}
++	if (correctit) {
++		asm volatile ("movl %0, %%db7\n"::"r" (dr7));
++	}
++}
++
++/*
++ * Mark hardware breakpoint slot breakno as unused.
++ * Returns 0 on success, -1 if breakno is out of range or the slot was
++ * not in use.
++ */
++int
++remove_hw_break(unsigned breakno)
++{
++	if (breakno >= sizeof(breakinfo) / sizeof(breakinfo[0])) {
++		return -1;
++	}
++	if (!breakinfo[breakno].enabled) {
++		return -1;
++	}
++	breakinfo[breakno].enabled = 0;
++	return 0;
++}
++
++/*
++ * Record a hardware breakpoint in slot breakno; it reaches the debug
++ * registers the next time correct_hw_break() runs.
++ * Returns 0 on success, -1 if breakno is out of range or the slot is
++ * already in use.
++ */
++int
++set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr)
++{
++	if (breakno >= sizeof(breakinfo) / sizeof(breakinfo[0])) {
++		return -1;
++	}
++	if (breakinfo[breakno].enabled) {
++		return -1;
++	}
++	breakinfo[breakno].enabled = 1;
++	breakinfo[breakno].type = type;
++	breakinfo[breakno].len = len;
++	breakinfo[breakno].addr = addr;
++	return 0;
++}
++
++#ifdef CONFIG_SMP
++static int in_kgdb_console = 0;
++
++/*
++ * Called on a cpu to find out whether kgdb is active and, if it is, to
++ * park this cpu until kgdb lets go of its wait lock.
++ * Returns 0 when the kgdb spinlock is free and this cpu is neither
++ * logged in in_kgdb_here_log[] nor is in_kgdb_console set; 1 otherwise.
++ */
++int
++in_kgdb(struct pt_regs *regs)
++{
++	unsigned flags;
++	int cpu = smp_processor_id();
++	in_kgdb_called = 1;
++	if (!spin_is_locked(&kgdb_spinlock)) {
++		if (in_kgdb_here_log[cpu] ||	/* we are holding this cpu */
++		    in_kgdb_console) {	/* or we are doing slow i/o */
++			return 1;
++		}
++		return 0;
++	}
++
++	/* As I see it the only reason not to let all cpus spin on
++	 * the same spin_lock is to allow selected ones to proceed.
++	 * This would be a good thing, so we leave it this way.
++	 * Maybe someday....  Done !
++
++	 * in_kgdb() is called from an NMI so we don't pretend
++	 * to have any resources, like printk() for example.
++	 */
++
++	kgdb_local_irq_save(flags);	/* only local here, to avoid hanging */
++	/*
++	 * log arrival of this cpu
++	 * The NMI keeps on ticking.  Protect against recurring more
++	 * than once, and ignore the cpu that has the kgdb lock
++	 */
++	in_kgdb_entry_log[cpu]++;
++	in_kgdb_here_log[cpu] = regs;
++	if (cpu == spinlock_cpu || waiting_cpus[cpu].task) {
++		goto exit_in_kgdb;
++	}
++	/*
++	 * For protection of the initialization of the spin locks by kgdb
++	 * it locks the kgdb spinlock before it gets the wait locks set
++	 * up.  We wait here for the wait lock to be taken.  If the
++	 * kgdb lock goes away first??  Well, it could be a slow exit
++	 * sequence where the wait lock is removed prior to the kgdb lock
++	 * so if kgdb gets unlocked, we just exit.
++	 */
++	while (spin_is_locked(&kgdb_spinlock) &&
++	       !spin_is_locked(waitlocks + cpu)) ;
++	if (!spin_is_locked(&kgdb_spinlock)) {
++		goto exit_in_kgdb;
++	}
++	waiting_cpus[cpu].task = current;
++	waiting_cpus[cpu].pid = (current->pid) ? : (PID_MAX + cpu);
++	waiting_cpus[cpu].regs = regs;
++
++	spin_unlock_wait(waitlocks + cpu);
++	/*
++	 * log departure of this cpu
++	 */
++	waiting_cpus[cpu].task = 0;
++	waiting_cpus[cpu].pid = 0;
++	waiting_cpus[cpu].regs = 0;
++	correct_hw_break();
++      exit_in_kgdb:
++	in_kgdb_here_log[cpu] = 0;
++	kgdb_local_irq_restore(flags);
++	return 1;
++	/*
++	   spin_unlock(continuelocks + smp_processor_id());
++	 */
++}
++
++/* Acknowledge the APIC interrupt, then run in_kgdb() on the passed frame. */
++void
++smp__in_kgdb(struct pt_regs regs)
++{
++	ack_APIC_irq();
++	in_kgdb(&regs);
++}
++#else
++/* Uniprocessor version: simply reports the value of kgdb_spinlock. */
++int
++in_kgdb(struct pt_regs *regs)
++{
++	return (kgdb_spinlock);
++}
++#endif
++
++/*
++ * Write a short description of the trap into buffer.  Vector 3 is a
++ * software breakpoint; for vector 1 the debug status register is read
++ * to tell single step from hardware breakpoints 0-3.  errorcode is not
++ * used.
++ */
++void
++printexceptioninfo(int exceptionNo, int errorcode, char *buffer)
++{
++	unsigned dr6;
++	int i;
++	switch (exceptionNo) {
++	case 1:		/* debug exception */
++		break;
++	case 3:		/* breakpoint */
++		sprintf(buffer, "Software breakpoint");
++		return;
++	default:
++		sprintf(buffer, "Details not available");
++		return;
++	}
++	asm volatile ("movl %%db6, %0\n":"=r" (dr6)
++		      :);
++	if (dr6 & 0x4000) {
++		sprintf(buffer, "Single step");
++		return;
++	}
++	for (i = 0; i < 4; ++i) {
++		if (dr6 & (1 << i)) {
++			sprintf(buffer, "Hardware breakpoint %d", i);
++			return;
++		}
++	}
++	sprintf(buffer, "Unknown trap");
++	return;
++}
++
++/*
++ * This function does all command processing for interfacing to gdb.
++ *
++ * NOTE:  The INT nn instruction leaves the state of the interrupt
++ *	  enable flag UNCHANGED.  That means that when this routine
++ *	  is entered via a breakpoint (INT 3) instruction from code
++ *	  that has interrupts enabled, then interrupts will STILL BE
++ *	  enabled when this routine is entered.	 The first thing that
++ *	  we do here is disable interrupts so as to prevent recursive
++ *	  entries and bothersome serial interrupts while we are
++ *	  trying to run the serial port in polled mode.
++ *
++ * For kernel version 2.1.xx the kgdb_cli() actually gets a spin lock so
++ * it is always necessary to do a restore_flags before returning
++ * so as to let go of that lock.
++ */
++int
++kgdb_handle_exception(int exceptionVector,
++		      int signo, int err_code, struct pt_regs *linux_regs)
++{
++	struct task_struct *usethread = NULL;
++	struct task_struct *thread_list_start = 0, *thread = NULL;
++	int addr, length;
++	int breakno, breaktype;
++	char *ptr;
++	int newPC;
++	threadref thref;
++	int threadid;
++	int thread_min = PID_MAX + MAX_NO_CPUS;
++#ifdef old_thread_list
++	int maxthreads;
++#endif
++	int nothreads;
++	unsigned long flags;
++	int gdb_regs[NUMREGBYTES / 4];
++	int dr6;
++	IF_SMP(int entry_state = 0);	/* 0, ok, 1, no nmi, 2 sync failed */
++#define NO_NMI 1
++#define NO_SYNC 2
++#define	regs	(*linux_regs)
++#define NUMREGS NUMREGBYTES/4
++	/*
++	 * If the entry is not from the kernel then return to the Linux
++	 * trap handler and let it process the interrupt normally.
++ */ ++ if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->xcs)) { ++ printk("ignoring non-kernel exception\n"); ++ print_regs(®s); ++ return (0); ++ } ++ ++ kgdb_local_irq_save(flags); ++ ++ /* Get kgdb spinlock */ ++ ++ KGDB_SPIN_LOCK(&kgdb_spinlock); ++ rdtscll(kgdb_info.entry_tsc); ++ /* ++ * We depend on this spinlock and the NMI watch dog to control the ++ * other cpus. They will arrive at "in_kgdb()" as a result of the ++ * NMI and will wait there for the following spin locks to be ++ * released. ++ */ ++#ifdef CONFIG_SMP ++ ++#if 0 ++ if (cpu_callout_map & ~MAX_CPU_MASK) { ++ printk("kgdb : too many cpus, possibly not mapped" ++ " in contiguous space, change MAX_NO_CPUS" ++ " in kgdb_stub and make new kernel.\n" ++ " cpu_callout_map is %lx\n", cpu_callout_map); ++ goto exit_just_unlock; ++ } ++#endif ++ if (spinlock_count == 1) { ++ int time = 0, end_time, dum = 0; ++ int i; ++ int cpu_logged_in[MAX_NO_CPUS] = {[0 ... MAX_NO_CPUS - 1] = (0) ++ }; ++ if (remote_debug) { ++ printk("kgdb : cpu %d entry, syncing others\n", ++ smp_processor_id()); ++ } ++ for (i = 0; i < MAX_NO_CPUS; i++) { ++ /* ++ * Use trylock as we may already hold the lock if ++ * we are holding the cpu. Net result is all ++ * locked. ++ */ ++ spin_trylock(&waitlocks[i]); ++ } ++ for (i = 0; i < MAX_NO_CPUS; i++) ++ cpu_logged_in[i] = 0; ++ /* ++ * Wait for their arrival. We know the watch dog is active if ++ * in_kgdb() has ever been called, as it is always called on a ++ * watchdog tick. ++ */ ++ rdtsc(dum, time); ++ end_time = time + 2; /* Note: we use the High order bits! 
*/ ++ i = 1; ++ if (num_online_cpus() > 1) { ++ int me_in_kgdb = in_kgdb_entry_log[smp_processor_id()]; ++ smp_send_nmi_allbutself(); ++ while (i < num_online_cpus() && time != end_time) { ++ int j; ++ for (j = 0; j < MAX_NO_CPUS; j++) { ++ if (waiting_cpus[j].task && ++ !cpu_logged_in[j]) { ++ i++; ++ cpu_logged_in[j] = 1; ++ if (remote_debug) { ++ printk ++ ("kgdb : cpu %d arrived at kgdb\n", ++ j); ++ } ++ break; ++ } else if (!waiting_cpus[j].task && ++ !cpu_online(j)) { ++ waiting_cpus[j].task = NOCPU; ++ cpu_logged_in[j] = 1; ++ waiting_cpus[j].hold = 1; ++ break; ++ } ++ if (!waiting_cpus[j].task && ++ in_kgdb_here_log[j]) { ++ ++ int wait = 100000; ++ while (wait--) ; ++ if (!waiting_cpus[j].task && ++ in_kgdb_here_log[j]) { ++ printk ++ ("kgdb : cpu %d stall" ++ " in in_kgdb\n", ++ j); ++ i++; ++ cpu_logged_in[j] = 1; ++ waiting_cpus[j].task = ++ (struct task_struct ++ *) 1; ++ } ++ } ++ } ++ ++ if (in_kgdb_entry_log[smp_processor_id()] > ++ (me_in_kgdb + 10)) { ++ break; ++ } ++ ++ rdtsc(dum, time); ++ } ++ if (i < num_online_cpus()) { ++ printk ++ ("kgdb : time out, proceeding without sync\n"); ++#if 0 ++ printk("kgdb : Waiting_cpus: 0 = %d, 1 = %d\n", ++ waiting_cpus[0].task != 0, ++ waiting_cpus[1].task != 0); ++ printk("kgdb : Cpu_logged in: 0 = %d, 1 = %d\n", ++ cpu_logged_in[0], cpu_logged_in[1]); ++ printk ++ ("kgdb : in_kgdb_here_log in: 0 = %d, 1 = %d\n", ++ in_kgdb_here_log[0] != 0, ++ in_kgdb_here_log[1] != 0); ++#endif ++ entry_state = NO_SYNC; ++ } else { ++#if 0 ++ int ent = ++ in_kgdb_entry_log[smp_processor_id()] - ++ me_in_kgdb; ++ printk("kgdb : sync after %d entries\n", ent); ++#endif ++ } ++ } else { ++ if (remote_debug) { ++ printk ++ ("kgdb : %d cpus, but watchdog not active\n" ++ "proceeding without locking down other cpus\n", ++ num_online_cpus()); ++ entry_state = NO_NMI; ++ } ++ } ++ } ++#endif ++ ++ if (remote_debug) { ++ unsigned long *lp = (unsigned long *) &linux_regs; ++ ++ printk("handle_exception(exceptionVector=%d, " ++ 
"signo=%d, err_code=%d, linux_regs=%p)\n", ++ exceptionVector, signo, err_code, linux_regs); ++ if (debug_regs) { ++ print_regs(®s); ++ printk("Stk: %8lx %8lx %8lx %8lx" ++ " %8lx %8lx %8lx %8lx\n", ++ lp[0], lp[1], lp[2], lp[3], ++ lp[4], lp[5], lp[6], lp[7]); ++ printk(" %8lx %8lx %8lx %8lx" ++ " %8lx %8lx %8lx %8lx\n", ++ lp[8], lp[9], lp[10], lp[11], ++ lp[12], lp[13], lp[14], lp[15]); ++ printk(" %8lx %8lx %8lx %8lx " ++ "%8lx %8lx %8lx %8lx\n", ++ lp[16], lp[17], lp[18], lp[19], ++ lp[20], lp[21], lp[22], lp[23]); ++ printk(" %8lx %8lx %8lx %8lx " ++ "%8lx %8lx %8lx %8lx\n", ++ lp[24], lp[25], lp[26], lp[27], ++ lp[28], lp[29], lp[30], lp[31]); ++ } ++ } ++ ++ /* Disable hardware debugging while we are in kgdb */ ++ /* Get the debug register status register */ ++/* *INDENT-OFF* */ ++ __asm__("movl %0,%%db7" ++ : /* no output */ ++ :"r"(0)); ++ ++ asm volatile ("movl %%db6, %0\n" ++ :"=r" (hw_breakpoint_status) ++ :); ++ ++/* *INDENT-ON* */ ++ switch (exceptionVector) { ++ case 0: /* divide error */ ++ case 1: /* debug exception */ ++ case 2: /* NMI */ ++ case 3: /* breakpoint */ ++ case 4: /* overflow */ ++ case 5: /* bounds check */ ++ case 6: /* invalid opcode */ ++ case 7: /* device not available */ ++ case 8: /* double fault (errcode) */ ++ case 10: /* invalid TSS (errcode) */ ++ case 12: /* stack fault (errcode) */ ++ case 16: /* floating point error */ ++ case 17: /* alignment check (errcode) */ ++ default: /* any undocumented */ ++ break; ++ case 11: /* segment not present (errcode) */ ++ case 13: /* general protection (errcode) */ ++ case 14: /* page fault (special errcode) */ ++ case 19: /* cache flush denied */ ++ if (mem_err_expected) { ++ /* ++ * This fault occured because of the ++ * get_char or set_char routines. These ++ * two routines use either eax of edx to ++ * indirectly reference the location in ++ * memory that they are working with. 
++ * For a page fault, when we return the ++ * instruction will be retried, so we ++ * have to make sure that these ++ * registers point to valid memory. ++ */ ++ mem_err = 1; /* set mem error flag */ ++ mem_err_expected = 0; ++ mem_err_cnt++; /* helps in debugging */ ++ /* make valid address */ ++ regs.eax = (long) &garbage_loc; ++ /* make valid address */ ++ regs.edx = (long) &garbage_loc; ++ if (remote_debug) ++ printk("Return after memory error: " ++ "mem_err_cnt=%d\n", mem_err_cnt); ++ if (debug_regs) ++ print_regs(&regs); ++ goto exit_kgdb; ++ } ++ break; ++ } ++ if (remote_debug) ++ printk("kgdb : entered kgdb on cpu %d\n", smp_processor_id()); ++ ++ gdb_i386vector = exceptionVector; ++ gdb_i386errcode = err_code; ++ kgdb_info.called_from = __builtin_return_address(0); ++#ifdef CONFIG_SMP ++ /* ++ * OK, we can now communicate, let's tell gdb about the sync. ++ * but only if we had a problem. ++ */ ++ switch (entry_state) { ++ case NO_NMI: ++ to_gdb("NMI not active, other cpus not stopped\n"); ++ break; ++ case NO_SYNC: ++ to_gdb("Some cpus not stopped, see 'kgdb_info' for details\n"); ++ default:; ++ } ++ ++#endif ++/* ++ * Set up the gdb function call area. ++ */ ++ trap_cpu = smp_processor_id(); ++ OLD_esp = NEW_esp = (int) (&linux_regs->esp); ++ ++ IF_SMP(once_again:) ++ /* reply to host that an exception has occurred */ ++ remcomOutBuffer[0] = 'S'; ++ remcomOutBuffer[1] = hexchars[signo >> 4]; ++ remcomOutBuffer[2] = hexchars[signo % 16]; ++ remcomOutBuffer[3] = 0; ++ ++ putpacket(remcomOutBuffer); ++ ++ while (1 == 1) { ++ error = 0; ++ remcomOutBuffer[0] = 0; ++ getpacket(remcomInBuffer); ++ switch (remcomInBuffer[0]) { ++ case '?': ++ remcomOutBuffer[0] = 'S'; ++ remcomOutBuffer[1] = hexchars[signo >> 4]; ++ remcomOutBuffer[2] = hexchars[signo % 16]; ++ remcomOutBuffer[3] = 0; ++ break; ++ case 'd': ++ remote_debug = !(remote_debug); /* toggle debug flag */ ++ printk("Remote debug %s\n", ++ remote_debug ?
"on" : "off"); ++ break; ++ case 'g': /* return the value of the CPU registers */ ++ get_gdb_regs(usethread, &regs, gdb_regs); ++ mem2hex((char *) gdb_regs, ++ remcomOutBuffer, NUMREGBYTES, 0); ++ break; ++ case 'G': /* set the value of the CPU registers - return OK */ ++ hex2mem(&remcomInBuffer[1], ++ (char *) gdb_regs, NUMREGBYTES, 0); ++ if (!usethread || usethread == current) { ++ gdb_regs_to_regs(gdb_regs, &regs); ++ strcpy(remcomOutBuffer, "OK"); ++ } else { ++ strcpy(remcomOutBuffer, "E00"); ++ } ++ break; ++ ++ case 'P':{ /* set the value of a single CPU register - ++ return OK */ ++ /* ++ * For some reason, gdb wants to talk about pseudo ++ * registers (greater than 15). These may have ++ * meaning for ptrace, but for us it is safe to ++ * ignore them. We do this by dumping them into ++ * _GS which we also ignore, but do have memory for. ++ */ ++ int regno; ++ ++ ptr = &remcomInBuffer[1]; ++ regs_to_gdb_regs(gdb_regs, &regs); ++ if ((!usethread || usethread == current) && ++ hexToInt(&ptr, &regno) && ++ *ptr++ == '=' && (regno >= 0)) { ++ regno = ++ (regno >= NUMREGS ? _GS : regno); ++ hex2mem(ptr, (char *) &gdb_regs[regno], ++ 4, 0); ++ gdb_regs_to_regs(gdb_regs, &regs); ++ strcpy(remcomOutBuffer, "OK"); ++ break; ++ } ++ strcpy(remcomOutBuffer, "E01"); ++ break; ++ } ++ ++ /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ ++ case 'm': ++ /* TRY TO READ %x,%x.
IF SUCCEED, SET PTR = 0 */ ++ ptr = &remcomInBuffer[1]; ++ if (hexToInt(&ptr, &addr) && ++ (*(ptr++) == ',') && (hexToInt(&ptr, &length))) { ++ ptr = 0; ++ /* ++ * hex doubles the byte count ++ */ ++ if (length > (BUFMAX / 2)) ++ length = BUFMAX / 2; ++ mem2hex((char *) addr, ++ remcomOutBuffer, length, 1); ++ if (mem_err) { ++ strcpy(remcomOutBuffer, "E03"); ++ debug_error("memory fault\n", NULL); ++ } ++ } ++ ++ if (ptr) { ++ strcpy(remcomOutBuffer, "E01"); ++ debug_error ++ ("malformed read memory command: %s\n", ++ remcomInBuffer); ++ } ++ break; ++ ++ /* MAA..AA,LLLL: ++ Write LLLL bytes at address AA.AA return OK */ ++ case 'M': ++ /* TRY TO READ '%x,%x:'. IF SUCCEED, SET PTR = 0 */ ++ ptr = &remcomInBuffer[1]; ++ if (hexToInt(&ptr, &addr) && ++ (*(ptr++) == ',') && ++ (hexToInt(&ptr, &length)) && (*(ptr++) == ':')) { ++ hex2mem(ptr, (char *) addr, length, 1); ++ ++ if (mem_err) { ++ strcpy(remcomOutBuffer, "E03"); ++ debug_error("memory fault\n", NULL); ++ } else { ++ strcpy(remcomOutBuffer, "OK"); ++ } ++ ++ ptr = 0; ++ } ++ if (ptr) { ++ strcpy(remcomOutBuffer, "E02"); ++ debug_error ++ ("malformed write memory command: %s\n", ++ remcomInBuffer); ++ } ++ break; ++ case 'S': ++ remcomInBuffer[0] = 's'; ++ case 'C': ++ /* Csig;AA..AA where ;AA..AA is optional ++ * continue with signal ++ * Since signals are meaning less to us, delete that ++ * part and then fall into the 'c' code. 
++ */ ++ ptr = &remcomInBuffer[1]; ++ length = 2; ++ while (*ptr && *ptr != ';') { ++ length++; ++ ptr++; ++ } ++ if (*ptr) { ++ do { ++ ptr++; ++ *(ptr - length++) = *ptr; ++ } while (*ptr); ++ } else { ++ remcomInBuffer[1] = 0; ++ } ++ ++ /* cAA..AA Continue at address AA..AA(optional) */ ++ /* sAA..AA Step one instruction from AA..AA(optional) */ ++ /* D detach, reply OK and then continue */ ++ case 'c': ++ case 's': ++ case 'D': ++ ++ /* try to read optional parameter, ++ pc unchanged if no parm */ ++ ptr = &remcomInBuffer[1]; ++ if (hexToInt(&ptr, &addr)) { ++ if (remote_debug) ++ printk("Changing EIP to 0x%x\n", addr); ++ ++ regs.eip = addr; ++ } ++ ++ newPC = regs.eip; ++ ++ /* clear the trace bit */ ++ regs.eflags &= 0xfffffeff; ++ ++ /* set the trace bit if we're stepping */ ++ if (remcomInBuffer[0] == 's') ++ regs.eflags |= 0x100; ++ ++ /* detach is a friendly version of continue. Note that ++ debugging is still enabled (e.g. hit control C) ++ */ ++ if (remcomInBuffer[0] == 'D') { ++ strcpy(remcomOutBuffer, "OK"); ++ putpacket(remcomOutBuffer); ++ } ++ ++ if (remote_debug) { ++ printk("Resuming execution\n"); ++ print_regs(&regs); ++ } ++ asm volatile ("movl %%db6, %0\n":"=r" (dr6) ++ :); ++ if (!(dr6 & 0x4000)) { ++ for (breakno = 0; breakno < 4; ++breakno) { ++ if (dr6 & (1 << breakno) && ++ (breakinfo[breakno].type == 0)) { ++ /* Set restore flag */ ++ regs.eflags |= 0x10000; ++ break; ++ } ++ } ++ } ++ correct_hw_break(); ++ asm volatile ("movl %0, %%db6\n"::"r" (0)); ++ goto exit_kgdb; ++ ++ /* kill the program */ ++ case 'k': /* do nothing */ ++ break; ++ ++ /* query */ ++ case 'q': ++ nothreads = 0; ++ switch (remcomInBuffer[1]) { ++ case 'f': ++ threadid = 1; ++ thread_list = 2; ++ thread_list_start = (usethread ?
: current); ++ case 's': ++ if (!cmp_str(&remcomInBuffer[2], ++ "ThreadInfo", 10)) ++ break; ++ ++ remcomOutBuffer[nothreads++] = 'm'; ++ for (; threadid < PID_MAX + MAX_NO_CPUS; ++ threadid++) { ++ thread = getthread(threadid); ++ if (thread) { ++ nothreads += int_to_hex_v( ++ &remcomOutBuffer[ ++ nothreads], ++ threadid); ++ if (thread_min > threadid) ++ thread_min = threadid; ++ remcomOutBuffer[ ++ nothreads] = ','; ++ nothreads++; ++ if (nothreads > BUFMAX - 10) ++ break; ++ } ++ } ++ if (remcomOutBuffer[nothreads - 1] == 'm') { ++ remcomOutBuffer[nothreads - 1] = 'l'; ++ } else { ++ nothreads--; ++ } ++ remcomOutBuffer[nothreads] = 0; ++ break; ++ ++#ifdef old_thread_list /* Old thread info request */ ++ case 'L': ++ /* List threads */ ++ thread_list = 2; ++ thread_list_start = (usethread ? : current); ++ unpack_byte(remcomInBuffer + 3, &maxthreads); ++ unpack_threadid(remcomInBuffer + 5, &thref); ++ do { ++ int buf_thread_limit = ++ (BUFMAX - 22) / BUF_THREAD_ID_SIZE; ++ if (maxthreads > buf_thread_limit) { ++ maxthreads = buf_thread_limit; ++ } ++ } while (0); ++ remcomOutBuffer[0] = 'q'; ++ remcomOutBuffer[1] = 'M'; ++ remcomOutBuffer[4] = '0'; ++ pack_threadid(remcomOutBuffer + 5, &thref); ++ ++ threadid = threadref_to_int(&thref); ++ for (nothreads = 0; ++ nothreads < maxthreads && ++ threadid < PID_MAX + MAX_NO_CPUS; ++ threadid++) { ++ thread = getthread(threadid); ++ if (thread) { ++ int_to_threadref(&thref, ++ threadid); ++ pack_threadid(remcomOutBuffer + ++ 21 + ++ nothreads * 16, ++ &thref); ++ nothreads++; ++ if (thread_min > threadid) ++ thread_min = threadid; ++ } ++ } ++ ++ if (threadid == PID_MAX + MAX_NO_CPUS) { ++ remcomOutBuffer[4] = '1'; ++ } ++ pack_hex_byte(remcomOutBuffer + 2, nothreads); ++ remcomOutBuffer[21 + nothreads * 16] = '\0'; ++ break; ++#endif ++ case 'C': ++ /* Current thread id */ ++ remcomOutBuffer[0] = 'Q'; ++ remcomOutBuffer[1] = 'C'; ++ threadid = current->pid; ++ if (!threadid) { ++ /* ++ * idle thread ++ */ ++ for 
(threadid = PID_MAX; ++ threadid < PID_MAX + MAX_NO_CPUS; ++ threadid++) { ++ if (current == ++ idle_task(threadid - ++ PID_MAX)) ++ break; ++ } ++ } ++ int_to_threadref(&thref, threadid); ++ pack_threadid(remcomOutBuffer + 2, &thref); ++ remcomOutBuffer[18] = '\0'; ++ break; ++ ++ case 'E': ++ /* Print exception info */ ++ printexceptioninfo(exceptionVector, ++ err_code, remcomOutBuffer); ++ break; ++ case 'T':{ ++ char * nptr; ++ /* Thread extra info */ ++ if (!cmp_str(&remcomInBuffer[2], ++ "hreadExtraInfo,", 15)) { ++ break; ++ } ++ ptr = &remcomInBuffer[17]; ++ hexToInt(&ptr, &threadid); ++ thread = getthread(threadid); ++ nptr = &thread->comm[0]; ++ length = 0; ++ ptr = &remcomOutBuffer[0]; ++ do { ++ length++; ++ ptr = pack_hex_byte(ptr, *nptr++); ++ } while (*nptr && length < 16); ++ /* ++ * would like that 16 to be the size of ++ * task_struct.comm but don't know the ++ * syntax.. ++ */ ++ *ptr = 0; ++ } ++ } ++ break; ++ ++ /* task related */ ++ case 'H': ++ switch (remcomInBuffer[1]) { ++ case 'g': ++ ptr = &remcomInBuffer[2]; ++ hexToInt(&ptr, &threadid); ++ thread = getthread(threadid); ++ if (!thread) { ++ remcomOutBuffer[0] = 'E'; ++ remcomOutBuffer[1] = '\0'; ++ break; ++ } ++ /* ++ * Just in case I forget what this is all about, ++ * the "thread info" command to gdb causes it ++ * to ask for a thread list. It then switches ++ * to each thread and asks for the registers. ++ * For this (and only this) usage, we want to ++ * fudge the registers of tasks not on the run ++ * list (i.e. waiting) to show the routine that ++ * called schedule. Also, gdb, is a minimalist ++ * in that if the current thread is the last ++ * it will not re-read the info when done. ++ * This means that in this case we must show ++ * the real registers. So here is how we do it: ++ * Each entry we keep track of the min ++ * thread in the list (the last that gdb will) ++ * get info for. We also keep track of the ++ * starting thread. 
++ * "thread_list" is cleared when switching back ++ * to the min thread if it is was current, or ++ * if it was not current, thread_list is set ++ * to 1. When the switch to current comes, ++ * if thread_list is 1, clear it, else do ++ * nothing. ++ */ ++ usethread = thread; ++ if ((thread_list == 1) && ++ (thread == thread_list_start)) { ++ thread_list = 0; ++ } ++ if (thread_list && (threadid == thread_min)) { ++ if (thread == thread_list_start) { ++ thread_list = 0; ++ } else { ++ thread_list = 1; ++ } ++ } ++ /* follow through */ ++ case 'c': ++ remcomOutBuffer[0] = 'O'; ++ remcomOutBuffer[1] = 'K'; ++ remcomOutBuffer[2] = '\0'; ++ break; ++ } ++ break; ++ ++ /* Query thread status */ ++ case 'T': ++ ptr = &remcomInBuffer[1]; ++ hexToInt(&ptr, &threadid); ++ thread = getthread(threadid); ++ if (thread) { ++ remcomOutBuffer[0] = 'O'; ++ remcomOutBuffer[1] = 'K'; ++ remcomOutBuffer[2] = '\0'; ++ if (thread_min > threadid) ++ thread_min = threadid; ++ } else { ++ remcomOutBuffer[0] = 'E'; ++ remcomOutBuffer[1] = '\0'; ++ } ++ break; ++ ++ case 'Y': /* set up a hardware breakpoint */ ++ ptr = &remcomInBuffer[1]; ++ hexToInt(&ptr, &breakno); ++ ptr++; ++ hexToInt(&ptr, &breaktype); ++ ptr++; ++ hexToInt(&ptr, &length); ++ ptr++; ++ hexToInt(&ptr, &addr); ++ if (set_hw_break(breakno & 0x3, ++ breaktype & 0x3, ++ length & 0x3, addr) == 0) { ++ strcpy(remcomOutBuffer, "OK"); ++ } else { ++ strcpy(remcomOutBuffer, "ERROR"); ++ } ++ break; ++ ++ /* Remove hardware breakpoint */ ++ case 'y': ++ ptr = &remcomInBuffer[1]; ++ hexToInt(&ptr, &breakno); ++ if (remove_hw_break(breakno & 0x3) == 0) { ++ strcpy(remcomOutBuffer, "OK"); ++ } else { ++ strcpy(remcomOutBuffer, "ERROR"); ++ } ++ break; ++ ++ case 'r': /* reboot */ ++ strcpy(remcomOutBuffer, "OK"); ++ putpacket(remcomOutBuffer); ++ /*to_gdb("Rebooting\n"); */ ++ /* triplefault no return from here */ ++ { ++ static long no_idt[2]; ++ __asm__ __volatile__("lidt %0"::"m"(no_idt[0])); ++ BREAKPOINT; ++ } ++ ++ } /* switch 
*/ ++ ++ /* reply to the request */ ++ putpacket(remcomOutBuffer); ++ } /* while(1==1) */ ++ /* ++ * reached by goto only. ++ */ ++ exit_kgdb: ++ /* ++ * Here is where we set up to trap a gdb function call. NEW_esp ++ * will be changed if we are trying to do this. We handle both ++ * adding and subtracting, thus allowing gdb to put grung on ++ * the stack which it removes later. ++ */ ++ if (NEW_esp != OLD_esp) { ++ int *ptr = END_OF_LOOKASIDE; ++ if (NEW_esp < OLD_esp) ++ ptr -= (OLD_esp - NEW_esp) / sizeof (int); ++ *--ptr = linux_regs->eflags; ++ *--ptr = linux_regs->xcs; ++ *--ptr = linux_regs->eip; ++ *--ptr = linux_regs->ecx; ++ *--ptr = linux_regs->ebx; ++ *--ptr = linux_regs->eax; ++ linux_regs->ecx = NEW_esp - (sizeof (int) * 6); ++ linux_regs->ebx = (unsigned int) END_OF_LOOKASIDE; ++ if (NEW_esp < OLD_esp) { ++ linux_regs->eip = (unsigned int) fn_call_stub; ++ } else { ++ linux_regs->eip = (unsigned int) fn_rtn_stub; ++ linux_regs->eax = NEW_esp; ++ } ++ linux_regs->eflags &= ~(IF_BIT | TF_BIT); ++ } ++#ifdef CONFIG_SMP ++ /* ++ * Release gdb wait locks ++ * Sanity check time. Must have at least one cpu to run. Also single ++ * step must not be done if the current cpu is on hold. ++ */ ++ if (spinlock_count == 1) { ++ int ss_hold = (regs.eflags & 0x100) && kgdb_info.hold_on_sstep; ++ int cpu_avail = 0; ++ int i; ++ ++ for (i = 0; i < MAX_NO_CPUS; i++) { ++ if (!cpu_online(i)) ++ break; ++ if (!hold_cpu(i)) { ++ cpu_avail = 1; ++ } ++ } ++ /* ++ * Early in the bring up there will be NO cpus on line... 
++ */ ++ if (!cpu_avail && !cpus_empty(cpu_online_map)) { ++ to_gdb("No cpus unblocked, see 'kgdb_info.hold_cpu'\n"); ++ goto once_again; ++ } ++ if (hold_cpu(smp_processor_id()) && (regs.eflags & 0x100)) { ++ to_gdb ++ ("Current cpu must be unblocked to single step\n"); ++ goto once_again; ++ } ++ if (!(ss_hold)) { ++ int i; ++ for (i = 0; i < MAX_NO_CPUS; i++) { ++ if (!hold_cpu(i)) { ++ spin_unlock(&waitlocks[i]); ++ } ++ } ++ } else { ++ spin_unlock(&waitlocks[smp_processor_id()]); ++ } ++ /* Release kgdb spinlock */ ++ KGDB_SPIN_UNLOCK(&kgdb_spinlock); ++ /* ++ * If this cpu is on hold, this is where we ++ * do it. Note, the NMI will pull us out of here, ++ * but will return as the above lock is not held. ++ * We will stay here till another cpu releases the lock for us. ++ */ ++ spin_unlock_wait(waitlocks + smp_processor_id()); ++ kgdb_local_irq_restore(flags); ++ return (0); ++ } ++#if 0 ++exit_just_unlock: ++#endif ++#endif ++ /* Release kgdb spinlock */ ++ KGDB_SPIN_UNLOCK(&kgdb_spinlock); ++ kgdb_local_irq_restore(flags); ++ return (0); ++} ++ ++/* this function is used to set up exception handlers for tracing and ++ * breakpoints. ++ * This function is not needed as the above line does all that is needed. ++ * We leave it for backward compatitability... ++ */ ++void ++set_debug_traps(void) ++{ ++ /* ++ * linux_debug_hook is defined in traps.c. We store a pointer ++ * to our own exception handler into it. ++ ++ * But really folks, every hear of labeled common, an old Fortran ++ * concept. Lots of folks can reference it and it is define if ++ * anyone does. Only one can initialize it at link time. We do ++ * this with the hook. See the statement above. No need for any ++ * executable code and it is ready as soon as the kernel is ++ * loaded. Very desirable in kernel debugging. ++ ++ linux_debug_hook = handle_exception ; ++ */ ++ ++ /* In case GDB is started before us, ack any packets (presumably ++ "$?#xx") sitting there. 
++ putDebugChar ('+'); ++ ++ initialized = 1; ++ */ ++} ++ ++/* This function will generate a breakpoint exception. It is used at the ++ beginning of a program to sync up with a debugger and can be used ++ otherwise as a quick means to stop program execution and "break" into ++ the debugger. */ ++/* But really, just use the BREAKPOINT macro. We will handle the int stuff ++ */ ++ ++#ifdef later ++/* ++ * possibly we should not go thru the traps.c code at all? Someday. ++ */ ++void ++do_kgdb_int3(struct pt_regs *regs, long error_code) ++{ ++ kgdb_handle_exception(3, 5, error_code, regs); ++ return; ++} ++#endif ++#undef regs ++#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS ++asmlinkage void ++bad_sys_call_exit(int stuff) ++{ ++ struct pt_regs *regs = (struct pt_regs *) &stuff; ++ printk("Sys call %d return with %x preempt_count\n", ++ (int) regs->orig_eax, preempt_count()); ++} ++#endif ++#ifdef CONFIG_STACK_OVERFLOW_TEST ++#include ++asmlinkage void ++stack_overflow(void) ++{ ++#ifdef BREAKPOINT ++ BREAKPOINT; ++#else ++ printk("Kernel stack overflow, looping forever\n"); ++#endif ++ while (1) { ++ } ++} ++#endif ++ ++#if defined(CONFIG_SMP) || defined(CONFIG_KGDB_CONSOLE) ++char gdbconbuf[BUFMAX]; ++ ++static void ++kgdb_gdb_message(const char *s, unsigned count) ++{ ++ int i; ++ int wcount; ++ char *bufptr; ++ /* ++ * This takes care of NMI while spinning out chars to gdb. The count bytes at s go out as one or more 'O' packets built in gdbconbuf, two buffer chars per input byte (pack_hex_byte), at most (BUFMAX - 2) / 2 input bytes per packet. ++ */ ++ IF_SMP(in_kgdb_console = 1); ++ gdbconbuf[0] = 'O'; ++ while (count > 0) { ++ bufptr = gdbconbuf + 1; /* rewind to just after the 'O' for every packet; otherwise a second chunk is written past the end of gdbconbuf and the first chunk is re-sent */ ++ if ((count << 1) > (BUFMAX - 2)) { ++ wcount = (BUFMAX - 2) >> 1; ++ } else { ++ wcount = count; ++ } ++ count -= wcount; ++ for (i = 0; i < wcount; i++) { ++ bufptr = pack_hex_byte(bufptr, s[i]); ++ } ++ *bufptr = '\0'; ++ s += wcount; ++ ++ putpacket(gdbconbuf); ++ ++ } ++ IF_SMP(in_kgdb_console = 0); ++} ++#endif ++#ifdef CONFIG_SMP ++static void ++to_gdb(const char *s) ++{ ++ int count = 0; ++ while (s[count] && (count++ < BUFMAX)) ; ++ kgdb_gdb_message(s, count); ++} ++#endif ++#ifdef
CONFIG_KGDB_CONSOLE ++#include ++#include ++#include ++#include ++#include ++ ++void ++kgdb_console_write(struct console *co, const char *s, unsigned count) ++{ ++ ++ if (gdb_i386vector == -1) { ++ /* ++ * We have not yet talked to gdb. What to do... ++ * lets break, on continue we can do the write. ++ * But first tell him whats up. Uh, well no can do, ++ * as this IS the console. Oh well... ++ * We do need to wait or the messages will be lost. ++ * Other option would be to tell the above code to ++ * ignore this breakpoint and do an auto return, ++ * but that might confuse gdb. Also this happens ++ * early enough in boot up that we don't have the traps ++ * set up yet, so... ++ */ ++ breakpoint(); ++ } ++ kgdb_gdb_message(s, count); ++} ++ ++/* ++ * ------------------------------------------------------------ ++ * Serial KGDB driver ++ * ------------------------------------------------------------ ++ */ ++ ++static struct console kgdbcons = { ++ name:"kgdb", ++ write:kgdb_console_write, ++#ifdef CONFIG_KGDB_USER_CONSOLE ++ device:kgdb_console_device, ++#endif ++ flags:CON_PRINTBUFFER | CON_ENABLED, ++ index:-1, ++}; ++ ++/* ++ * The trick here is that this file gets linked before printk.o ++ * That means we get to peer at the console info in the command ++ * line before it does. If we are up, we register, otherwise, ++ * do nothing. By returning 0, we allow printk to look also. ++ */ ++static int kgdb_console_enabled; ++ ++int __init ++kgdb_console_init(char *str) ++{ ++ if ((strncmp(str, "kgdb", 4) == 0) || (strncmp(str, "gdb", 3) == 0)) { ++ register_console(&kgdbcons); ++ kgdb_console_enabled = 1; ++ } ++ return 0; /* let others look at the string */ ++} ++ ++__setup("console=", kgdb_console_init); ++ ++#ifdef CONFIG_KGDB_USER_CONSOLE ++static kdev_t kgdb_console_device(struct console *c); ++/* This stuff sort of works, but it knocks out telnet devices ++ * we are leaving it here in case we (or you) find time to figure it out ++ * better.. 
++ */ ++ ++/* ++ * We need a real char device as well for when the console is opened for user ++ * space activities. ++ */ ++ ++static int ++kgdb_consdev_open(struct inode *inode, struct file *file) ++{ ++ return 0; ++} ++ ++static ssize_t ++kgdb_consdev_write(struct file *file, const char *buf, ++ size_t count, loff_t * ppos) ++{ ++ int size, ret = 0; ++ static char kbuf[128]; ++ static DECLARE_MUTEX(sem); ++ ++ /* We are not reentrant... */ ++ if (down_interruptible(&sem)) ++ return -ERESTARTSYS; ++ ++ while (count > 0) { ++ /* need to copy the data from user space */ ++ size = count; ++ if (size > sizeof (kbuf)) ++ size = sizeof (kbuf); ++ if (copy_from_user(kbuf, buf, size)) { ++ ret = -EFAULT; ++ break;; ++ } ++ kgdb_console_write(&kgdbcons, kbuf, size); ++ count -= size; ++ ret += size; ++ buf += size; ++ } ++ ++ up(&sem); ++ ++ return ret; ++} ++ ++struct file_operations kgdb_consdev_fops = { ++ open:kgdb_consdev_open, ++ write:kgdb_consdev_write ++}; ++static kdev_t ++kgdb_console_device(struct console *c) ++{ ++ return MKDEV(TTYAUX_MAJOR, 1); ++} ++ ++/* ++ * This routine gets called from the serial stub in the i386/lib ++ * This is so it is done late in bring up (just before the console open). 
++ */ ++void ++kgdb_console_finit(void) ++{ ++ if (kgdb_console_enabled) { ++ char *cptr = cdevname(MKDEV(TTYAUX_MAJOR, 1)); ++ char *cp = cptr; ++ while (*cptr && *cptr != '(') ++ cptr++; ++ *cptr = 0; ++ unregister_chrdev(TTYAUX_MAJOR, cp); ++ register_chrdev(TTYAUX_MAJOR, "kgdb", &kgdb_consdev_fops); ++ } ++} ++#endif ++#endif ++#ifdef CONFIG_KGDB_TS ++#include /* time stamp code */ ++#include /* in_interrupt */ ++#ifdef CONFIG_KGDB_TS_64 ++#define DATA_POINTS 64 ++#endif ++#ifdef CONFIG_KGDB_TS_128 ++#define DATA_POINTS 128 ++#endif ++#ifdef CONFIG_KGDB_TS_256 ++#define DATA_POINTS 256 ++#endif ++#ifdef CONFIG_KGDB_TS_512 ++#define DATA_POINTS 512 ++#endif ++#ifdef CONFIG_KGDB_TS_1024 ++#define DATA_POINTS 1024 ++#endif ++#ifndef DATA_POINTS ++#define DATA_POINTS 128 /* must be a power of two */ ++#endif ++#define INDEX_MASK (DATA_POINTS - 1) ++#if (INDEX_MASK & DATA_POINTS) ++#error "CONFIG_KGDB_TS_COUNT must be a power of 2" ++#endif ++struct kgdb_and_then_struct { ++#ifdef CONFIG_SMP ++ int on_cpu; ++#endif ++ struct task_struct *task; ++ long long at_time; ++ int from_ln; ++ char *in_src; ++ void *from; ++ int *with_shpf; ++ int data0; ++ int data1; ++}; ++struct kgdb_and_then_struct2 { ++#ifdef CONFIG_SMP ++ int on_cpu; ++#endif ++ struct task_struct *task; ++ long long at_time; ++ int from_ln; ++ char *in_src; ++ void *from; ++ int *with_shpf; ++ struct task_struct *t1; ++ struct task_struct *t2; ++}; ++struct kgdb_and_then_struct kgdb_data[DATA_POINTS]; ++ ++struct kgdb_and_then_struct *kgdb_and_then = &kgdb_data[0]; ++int kgdb_and_then_count; ++ ++void ++kgdb_tstamp(int line, char *source, int data0, int data1) ++{ ++ static spinlock_t ts_spin = SPIN_LOCK_UNLOCKED; ++ int flags; ++ kgdb_local_irq_save(flags); ++ spin_lock(&ts_spin); ++ rdtscll(kgdb_and_then->at_time); ++#ifdef CONFIG_SMP ++ kgdb_and_then->on_cpu = smp_processor_id(); ++#endif ++ kgdb_and_then->task = current; ++ kgdb_and_then->from_ln = line; ++ kgdb_and_then->in_src = source; ++ 
kgdb_and_then->from = __builtin_return_address(0); ++ kgdb_and_then->with_shpf = (int *) (((flags & IF_BIT) >> 9) | ++ (preempt_count() << 8)); ++ kgdb_and_then->data0 = data0; ++ kgdb_and_then->data1 = data1; ++ kgdb_and_then = &kgdb_data[++kgdb_and_then_count & INDEX_MASK]; ++ spin_unlock(&ts_spin); ++ kgdb_local_irq_restore(flags); ++#ifdef CONFIG_PREEMPT ++ ++#endif ++ return; ++} ++#endif ++typedef int gdb_debug_hook(int exceptionVector, ++ int signo, int err_code, struct pt_regs *linux_regs); ++gdb_debug_hook *linux_debug_hook = &kgdb_handle_exception; /* histerical reasons... */ +diff -puN arch/i386/kernel/Makefile~kgdb-ga arch/i386/kernel/Makefile +--- 25/arch/i386/kernel/Makefile~kgdb-ga 2004-10-21 14:54:15.259603680 -0700 ++++ 25-akpm/arch/i386/kernel/Makefile 2004-10-21 14:54:15.308596232 -0700 +@@ -14,6 +14,7 @@ obj-y += timers/ + obj-$(CONFIG_ACPI_BOOT) += acpi/ + obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o + obj-$(CONFIG_MCA) += mca.o ++obj-$(CONFIG_KGDB) += kgdb_stub.o + obj-$(CONFIG_X86_MSR) += msr.o + obj-$(CONFIG_X86_CPUID) += cpuid.o + obj-$(CONFIG_MICROCODE) += microcode.o +diff -puN arch/i386/kernel/nmi.c~kgdb-ga arch/i386/kernel/nmi.c +--- 25/arch/i386/kernel/nmi.c~kgdb-ga 2004-10-21 14:54:15.261603376 -0700 ++++ 25-akpm/arch/i386/kernel/nmi.c 2004-10-21 14:54:15.308596232 -0700 +@@ -34,7 +34,17 @@ + + #include "mach_traps.h" + ++#ifdef CONFIG_KGDB ++#include ++#ifdef CONFIG_SMP ++unsigned int nmi_watchdog = NMI_IO_APIC; ++#else ++unsigned int nmi_watchdog = NMI_LOCAL_APIC; ++#endif ++#else + unsigned int nmi_watchdog = NMI_NONE; ++#endif ++ + extern int unknown_nmi_panic; + static unsigned int nmi_hz = HZ; + static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ +@@ -466,6 +476,9 @@ void touch_nmi_watchdog (void) + for (i = 0; i < NR_CPUS; i++) + alert_counter[i] = 0; + } ++#ifdef CONFIG_KGDB ++int tune_watchdog = 5*HZ; ++#endif + + extern void die_nmi(struct pt_regs *, const char *msg); + +@@ -481,12 +494,24 @@ void 
nmi_watchdog_tick (struct pt_regs * + */ + sum = irq_stat[cpu].apic_timer_irqs; + ++#ifdef CONFIG_KGDB ++ if (!in_kgdb(regs) && last_irq_sums[cpu] == sum) { ++ ++#else + if (last_irq_sums[cpu] == sum) { ++#endif + /* + * Ayiee, looks like this CPU is stuck ... + * wait a few IRQs (5 seconds) before doing the oops ... + */ + alert_counter[cpu]++; ++#ifdef CONFIG_KGDB ++ if (alert_counter[cpu] == tune_watchdog) { ++ kgdb_handle_exception(2, SIGPWR, 0, regs); ++ last_irq_sums[cpu] = sum; ++ alert_counter[cpu] = 0; ++ } ++#endif + if (alert_counter[cpu] == 30*nmi_hz) + die_nmi(regs, "NMI Watchdog detected LOCKUP"); + } else { +diff -puN arch/i386/kernel/smp.c~kgdb-ga arch/i386/kernel/smp.c +--- 25/arch/i386/kernel/smp.c~kgdb-ga 2004-10-21 14:54:15.262603224 -0700 ++++ 25-akpm/arch/i386/kernel/smp.c 2004-10-21 14:54:15.309596080 -0700 +@@ -466,7 +466,17 @@ void flush_tlb_all(void) + { + on_each_cpu(do_flush_tlb_all, NULL, 1, 1); + } +- ++#ifdef CONFIG_KGDB ++/* ++ * By using the NMI code instead of a vector we just sneak thru the ++ * word generator coming out with just what we want. AND it does ++ * not matter if clustered_apic_mode is set or not. ++ */ ++void smp_send_nmi_allbutself(void) ++{ ++ send_IPI_allbutself(APIC_DM_NMI); ++} ++#endif + /* + * this function sends a 'reschedule' IPI to another CPU. + * it goes straight through and wastes no time serializing +diff -puN arch/i386/kernel/traps.c~kgdb-ga arch/i386/kernel/traps.c +--- 25/arch/i386/kernel/traps.c~kgdb-ga 2004-10-21 14:54:15.264602920 -0700 ++++ 25-akpm/arch/i386/kernel/traps.c 2004-10-21 14:54:15.311595776 -0700 +@@ -105,6 +105,39 @@ int register_die_notifier(struct notifie + return err; + } + ++#ifdef CONFIG_KGDB ++extern void sysenter_past_esp(void); ++#include ++#include ++void set_intr_gate(unsigned int n, void *addr); ++static void set_intr_usr_gate(unsigned int n, void *addr); ++/* ++ * Should be able to call this breakpoint() very early in ++ * bring up. Just hard code the call where needed. 
++ * The breakpoint() code is here because set_?_gate() functions ++ * are local (static) to trap.c. They need be done only once, ++ * but it does not hurt to do them over. ++ */ ++void breakpoint(void) ++{ ++ set_intr_usr_gate(3,&int3); /* disable ints on trap */ ++ set_intr_gate(1,&debug); ++ set_intr_gate(14,&page_fault); ++ ++ BREAKPOINT; ++} ++#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ ++ { \ ++ if (!user_mode(regs) ) \ ++ { \ ++ kgdb_handle_exception(trapnr, signr, error_code, regs); \ ++ after; \ ++ } else if ((trapnr == 3) && (regs->eflags &0x200)) local_irq_enable(); \ ++ } ++#else ++#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) ++#endif ++ + static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) + { + return p > (void *)tinfo && +@@ -332,6 +365,15 @@ void die(const char * str, struct pt_reg + #endif + if (nl) + printk("\n"); ++#ifdef CONFIG_KGDB ++ /* This is about the only place we want to go to kgdb even if in ++ * user mode. But we must go in via a trap so within kgdb we will ++ * always be in kernel mode. 
++ */ ++ if (user_mode(regs)) ++ BREAKPOINT; ++#endif ++ CHK_REMOTE_DEBUG(0,SIGTRAP,err,regs,) + notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); + show_registers(regs); + } else +@@ -406,6 +448,7 @@ static inline void do_trap(int trapnr, i + #define DO_ERROR(trapnr, signr, str, name) \ + asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ + { \ ++ CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,) \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ +@@ -429,6 +472,7 @@ asmlinkage void do_##name(struct pt_regs + #define DO_VM86_ERROR(trapnr, signr, str, name) \ + asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ + { \ ++ CHK_REMOTE_DEBUG(trapnr, signr, error_code,regs, return) \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ +@@ -512,7 +556,8 @@ gp_in_vm86: + + gp_in_kernel: + if (!fixup_exception(regs)) { + die: ++ CHK_REMOTE_DEBUG(13,SIGSEGV,error_code,regs,) + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; +@@ -721,8 +766,18 @@ asmlinkage void do_debug(struct pt_regs + * allowing programs to debug themselves without the ptrace() + * interface. + */ ++#ifdef CONFIG_KGDB ++ /* ++ * I think this is the only "real" case of a TF in the kernel ++ * that really belongs to user space. Others are ++ * "Ours all ours!" 
++ */ ++ if (((regs->xcs & 3) == 0) && ((void *)regs->eip == sysenter_past_esp)) ++ goto clear_TF_reenable; ++#else + if ((regs->xcs & 3) == 0) + goto clear_TF_reenable; ++#endif + if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) + goto clear_TF; + } +@@ -734,6 +789,17 @@ asmlinkage void do_debug(struct pt_regs + info.si_errno = 0; + info.si_code = TRAP_BRKPT; + ++#ifdef CONFIG_KGDB ++ /* ++ * If this is a kernel mode trap, we need to reset db7 to allow us ++ * to continue sanely ALSO skip the signal delivery ++ */ ++ if ((regs->xcs & 3) == 0) ++ goto clear_dr7; ++ ++ /* if not kernel, allow ints but only if they were on */ ++ if ( regs->eflags & 0x200) local_irq_enable(); ++#endif + /* If this is a kernel mode trap, save the user PC on entry to + * the kernel, that's what the debugger can make sense of. + */ +@@ -748,6 +814,7 @@ clear_dr7: + __asm__("movl %0,%%db7" + : /* no output */ + : "r" (0)); ++ CHK_REMOTE_DEBUG(1,SIGTRAP,error_code,regs,) + return; + + debug_vm86: +@@ -1004,6 +1071,12 @@ static void __init set_task_gate(unsigne + { + _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); + } ++#ifdef CONFIG_KGDB ++void set_intr_usr_gate(unsigned int n, void *addr) ++{ ++ _set_gate(idt_table+n,14,3,addr,__KERNEL_CS); ++} ++#endif + + + void __init trap_init(void) +@@ -1021,7 +1094,11 @@ void __init trap_init(void) + set_trap_gate(0,&divide_error); + set_intr_gate(1,&debug); + set_intr_gate(2,&nmi); ++#ifndef CONFIG_KGDB + set_system_intr_gate(3, &int3); /* int3-5 can be called from all */ ++#else ++ set_intr_usr_gate(3,&int3); /* int3-5 can be called from all */ ++#endif + set_system_gate(4,&overflow); + set_system_gate(5,&bounds); + set_trap_gate(6,&invalid_op); +diff -puN /dev/null arch/i386/lib/kgdb_serial.c +--- /dev/null 2003-09-15 06:40:47.000000000 -0700 ++++ 25-akpm/arch/i386/lib/kgdb_serial.c 2004-10-21 14:54:15.313595472 -0700 +@@ -0,0 +1,485 @@ ++/* ++ * Serial interface GDB stub ++ * ++ * Written (hacked together) by David Grothe (dave@gcom.com) ++ *
Modified to allow invokation early in boot see also ++ * kgdb.h for instructions by George Anzinger(george@mvista.com) ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_KGDB_USER_CONSOLE ++extern void kgdb_console_finit(void); ++#endif ++#define PRNT_off ++#define TEST_EXISTANCE ++#ifdef PRNT ++#define dbprintk(s) printk s ++#else ++#define dbprintk(s) ++#endif ++#define TEST_INTERRUPT_off ++#ifdef TEST_INTERRUPT ++#define intprintk(s) printk s ++#else ++#define intprintk(s) ++#endif ++ ++#define IRQ_T(info) ((info->flags & ASYNC_SHARE_IRQ) ? SA_SHIRQ : SA_INTERRUPT) ++ ++#define GDB_BUF_SIZE 512 /* power of 2, please */ ++ ++static char gdb_buf[GDB_BUF_SIZE]; ++static int gdb_buf_in_inx; ++static atomic_t gdb_buf_in_cnt; ++static int gdb_buf_out_inx; ++ ++struct async_struct *gdb_async_info; ++static int gdb_async_irq; ++ ++#define outb_px(a,b) outb_p(b,a) ++ ++static void program_uart(struct async_struct *info); ++static void write_char(struct async_struct *info, int chr); ++/* ++ * Get a byte from the hardware data buffer and return it ++ */ ++static int ++read_data_bfr(struct async_struct *info) ++{ ++ char it = inb_p(info->port + UART_LSR); ++ ++ if (it & UART_LSR_DR) ++ return (inb_p(info->port + UART_RX)); ++ /* ++ * If we have a framing error assume somebody messed with ++ * our uart. Reprogram it and send '-' both ways... ++ */ ++ if (it & 0xc) { ++ program_uart(info); ++ write_char(info, '-'); ++ return ('-'); ++ } ++ return (-1); ++ ++} /* read_data_bfr */ ++ ++/* ++ * Get a char if available, return -1 if nothing available. ++ * Empty the receive buffer first, then look at the interface hardware. ++ ++ * Locking here is a bit of a problem. 
We MUST not lock out communication ++ * if we are trying to talk to gdb about a kgdb entry. ON the other hand ++ * we can loose chars in the console pass thru if we don't lock. It is also ++ * possible that we could hold the lock or be waiting for it when kgdb ++ * NEEDS to talk. Since kgdb locks down the world, it does not need locks. ++ * We do, of course have possible issues with interrupting a uart operation, ++ * but we will just depend on the uart status to help keep that straight. ++ ++ */ ++static spinlock_t uart_interrupt_lock = SPIN_LOCK_UNLOCKED; ++#ifdef CONFIG_SMP ++extern spinlock_t kgdb_spinlock; ++#endif ++ ++static int ++read_char(struct async_struct *info) ++{ ++ int chr; ++ unsigned long flags; ++ local_irq_save(flags); ++#ifdef CONFIG_SMP ++ if (!spin_is_locked(&kgdb_spinlock)) { ++ spin_lock(&uart_interrupt_lock); ++ } ++#endif ++ if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ ++ chr = gdb_buf[gdb_buf_out_inx++]; ++ gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); ++ atomic_dec(&gdb_buf_in_cnt); ++ } else { ++ chr = read_data_bfr(info); ++ } ++#ifdef CONFIG_SMP ++ if (!spin_is_locked(&kgdb_spinlock)) { ++ spin_unlock(&uart_interrupt_lock); ++ } ++#endif ++ local_irq_restore(flags); ++ return (chr); ++} ++ ++/* ++ * Wait until the interface can accept a char, then write it. ++ */ ++static void ++write_char(struct async_struct *info, int chr) ++{ ++ while (!(inb_p(info->port + UART_LSR) & UART_LSR_THRE)) ; ++ ++ outb_p(chr, info->port + UART_TX); ++ ++} /* write_char */ ++ ++/* ++ * Mostly we don't need a spinlock, but since the console goes ++ * thru here with interrutps on, well, we need to catch those ++ * chars. ++ */ ++/* ++ * This is the receiver interrupt routine for the GDB stub. ++ * It will receive a limited number of characters of input ++ * from the gdb host machine and save them up in a buffer. 
++ * ++ * When the gdb stub routine getDebugChar() is called it ++ * draws characters out of the buffer until it is empty and ++ * then reads directly from the serial port. ++ * ++ * We do not attempt to write chars from the interrupt routine ++ * since the stubs do all of that via putDebugChar() which ++ * writes one byte after waiting for the interface to become ++ * ready. ++ * ++ * The debug stubs like to run with interrupts disabled since, ++ * after all, they run as a consequence of a breakpoint in ++ * the kernel. ++ * ++ * Perhaps someone who knows more about the tty driver than I ++ * care to learn can make this work for any low level serial ++ * driver. ++ */ ++static irqreturn_t ++gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ struct async_struct *info; ++ unsigned long flags; ++ ++ info = gdb_async_info; ++ if (!info || !info->tty || irq != gdb_async_irq) ++ return IRQ_NONE; ++ ++ local_irq_save(flags); ++ spin_lock(&uart_interrupt_lock); ++ do { ++ int chr = read_data_bfr(info); ++ intprintk(("Debug char on int: %x hex\n", chr)); ++ if (chr < 0) ++ continue; ++ ++ if (chr == 3) { /* Ctrl-C means remote interrupt */ ++ BREAKPOINT; ++ continue; ++ } ++ ++ if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { ++ /* buffer overflow tosses early char */ ++ read_char(info); ++ } ++ gdb_buf[gdb_buf_in_inx++] = chr; ++ gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); ++ } while (inb_p(info->port + UART_IIR) & UART_IIR_RDI); ++ spin_unlock(&uart_interrupt_lock); ++ local_irq_restore(flags); ++ return IRQ_HANDLED; ++} /* gdb_interrupt */ ++ ++/* ++ * Just a NULL routine for testing. ++ */ ++void ++gdb_null(void) ++{ ++} /* gdb_null */ ++ ++/* These structure are filled in with values defined in asm/kgdb_local.h ++ */ ++static struct serial_state state = SB_STATE; ++static struct async_struct local_info = SB_INFO; ++static int ok_to_enable_ints = 0; ++static void kgdb_enable_ints_now(void); ++ ++extern char *kgdb_version; ++/* ++ * Hook an IRQ for KGDB. 
++ * ++ * This routine is called from putDebugChar, below. ++ */ ++static int ints_disabled = 1; ++int ++gdb_hook_interrupt(struct async_struct *info, int verb) ++{ ++ struct serial_state *state = info->state; ++ unsigned long flags; ++ int port; ++#ifdef TEST_EXISTANCE ++ int scratch, scratch2; ++#endif ++ ++ /* The above fails if memory managment is not set up yet. ++ * Rather than fail the set up, just keep track of the fact ++ * and pick up the interrupt thing later. ++ */ ++ gdb_async_info = info; ++ port = gdb_async_info->port; ++ gdb_async_irq = state->irq; ++ if (verb) { ++ printk("kgdb %s : port =%x, IRQ=%d, divisor =%d\n", ++ kgdb_version, ++ port, ++ gdb_async_irq, gdb_async_info->state->custom_divisor); ++ } ++ local_irq_save(flags); ++#ifdef TEST_EXISTANCE ++ /* Existance test */ ++ /* Should not need all this, but just in case.... */ ++ ++ scratch = inb_p(port + UART_IER); ++ outb_px(port + UART_IER, 0); ++ outb_px(0xff, 0x080); ++ scratch2 = inb_p(port + UART_IER); ++ outb_px(port + UART_IER, scratch); ++ if (scratch2) { ++ printk ++ ("gdb_hook_interrupt: Could not clear IER, not a UART!\n"); ++ local_irq_restore(flags); ++ return 1; /* We failed; there's nothing here */ ++ } ++ scratch2 = inb_p(port + UART_LCR); ++ outb_px(port + UART_LCR, 0xBF); /* set up for StarTech test */ ++ outb_px(port + UART_EFR, 0); /* EFR is the same as FCR */ ++ outb_px(port + UART_LCR, 0); ++ outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO); ++ scratch = inb_p(port + UART_IIR) >> 6; ++ if (scratch == 1) { ++ printk("gdb_hook_interrupt: Undefined UART type!" ++ " Not a UART! 
\n"); ++ local_irq_restore(flags); ++ return 1; ++ } else { ++ dbprintk(("gdb_hook_interrupt: UART type " ++ "is %d where 0=16450, 2=16550 3=16550A\n", scratch)); ++ } ++ scratch = inb_p(port + UART_MCR); ++ outb_px(port + UART_MCR, UART_MCR_LOOP | scratch); ++ outb_px(port + UART_MCR, UART_MCR_LOOP | 0x0A); ++ scratch2 = inb_p(port + UART_MSR) & 0xF0; ++ outb_px(port + UART_MCR, scratch); ++ if (scratch2 != 0x90) { ++ printk("gdb_hook_interrupt: " ++ "Loop back test failed! Not a UART!\n"); ++ local_irq_restore(flags); ++ return scratch2 + 1000; /* force 0 to fail */ ++ } ++#endif /* test existance */ ++ program_uart(info); ++ local_irq_restore(flags); ++ ++ return (0); ++ ++} /* gdb_hook_interrupt */ ++ ++static void ++program_uart(struct async_struct *info) ++{ ++ int port = info->port; ++ ++ (void) inb_p(port + UART_RX); ++ outb_px(port + UART_IER, 0); ++ ++ (void) inb_p(port + UART_RX); /* serial driver comments say */ ++ (void) inb_p(port + UART_IIR); /* this clears the interrupt regs */ ++ (void) inb_p(port + UART_MSR); ++ outb_px(port + UART_LCR, UART_LCR_WLEN8 | UART_LCR_DLAB); ++ outb_px(port + UART_DLL, info->state->custom_divisor & 0xff); /* LS */ ++ outb_px(port + UART_DLM, info->state->custom_divisor >> 8); /* MS */ ++ outb_px(port + UART_MCR, info->MCR); ++ ++ outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1 | UART_FCR_CLEAR_XMIT | UART_FCR_CLEAR_RCVR); /* set fcr */ ++ outb_px(port + UART_LCR, UART_LCR_WLEN8); /* reset DLAB */ ++ outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1); /* set fcr */ ++ if (!ints_disabled) { ++ intprintk(("KGDB: Sending %d to port %x offset %d\n", ++ gdb_async_info->IER, ++ (int) gdb_async_info->port, UART_IER)); ++ outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); ++ } ++ return; ++} ++ ++/* ++ * getDebugChar ++ * ++ * This is a GDB stub routine. It waits for a character from the ++ * serial interface and then returns it. 
If there is no serial ++ * interface connection then it returns a bogus value which will ++ * almost certainly cause the system to hang. In the ++ */ ++int kgdb_in_isr = 0; ++int kgdb_in_lsr = 0; ++extern spinlock_t kgdb_spinlock; ++ ++/* Caller takes needed protections */ ++ ++int ++getDebugChar(void) ++{ ++ volatile int chr, dum, time, end_time; ++ ++ dbprintk(("getDebugChar(port %x): ", gdb_async_info->port)); ++ ++ if (gdb_async_info == NULL) { ++ gdb_hook_interrupt(&local_info, 0); ++ } ++ /* ++ * This trick says if we wait a very long time and get ++ * no char, return the -1 and let the upper level deal ++ * with it. ++ */ ++ rdtsc(dum, time); ++ end_time = time + 2; ++ while (((chr = read_char(gdb_async_info)) == -1) && ++ (end_time - time) > 0) { ++ rdtsc(dum, time); ++ }; ++ /* ++ * This covers our butts if some other code messes with ++ * our uart, hay, it happens :o) ++ */ ++ if (chr == -1) ++ program_uart(gdb_async_info); ++ ++ dbprintk(("%c\n", chr > ' ' && chr < 0x7F ? chr : ' ')); ++ return (chr); ++ ++} /* getDebugChar */ ++ ++static int count = 3; ++static spinlock_t one_at_atime = SPIN_LOCK_UNLOCKED; ++ ++static int __init ++kgdb_enable_ints(void) ++{ ++ if (gdb_async_info == NULL) { ++ gdb_hook_interrupt(&local_info, 1); ++ } ++ ok_to_enable_ints = 1; ++ kgdb_enable_ints_now(); ++#ifdef CONFIG_KGDB_USER_CONSOLE ++ kgdb_console_finit(); ++#endif ++ return 0; ++} ++ ++#ifdef CONFIG_SERIAL_8250 ++void shutdown_for_kgdb(struct async_struct *gdb_async_info); ++#endif ++ ++#ifdef CONFIG_DISCONTIGMEM ++static inline int kgdb_mem_init_done(void) ++{ ++ return highmem_start_page != NULL; ++} ++#else ++static inline int kgdb_mem_init_done(void) ++{ ++ return max_mapnr != 0; ++} ++#endif ++ ++static void ++kgdb_enable_ints_now(void) ++{ ++ if (!spin_trylock(&one_at_atime)) ++ return; ++ if (!ints_disabled) ++ goto exit; ++ if (kgdb_mem_init_done() && ++ ints_disabled) { /* don't try till mem init */ ++#ifdef CONFIG_SERIAL_8250 ++ /* ++ * The ifdef here 
allows the system to be configured ++ * without the serial driver. ++ * Don't make it a module, however, it will steal the port ++ */ ++ shutdown_for_kgdb(gdb_async_info); ++#endif ++ ints_disabled = request_irq(gdb_async_info->state->irq, ++ gdb_interrupt, ++ IRQ_T(gdb_async_info), ++ "KGDB-stub", NULL); ++ intprintk(("KGDB: request_irq returned %d\n", ints_disabled)); ++ } ++ if (!ints_disabled) { ++ intprintk(("KGDB: Sending %d to port %x offset %d\n", ++ gdb_async_info->IER, ++ (int) gdb_async_info->port, UART_IER)); ++ outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); ++ } ++ exit: ++ spin_unlock(&one_at_atime); ++} ++ ++/* ++ * putDebugChar ++ * ++ * This is a GDB stub routine. It waits until the interface is ready ++ * to transmit a char and then sends it. If there is no serial ++ * interface connection then it simply returns to its caller, having ++ * pretended to send the char. Caller takes needed protections. ++ */ ++void ++putDebugChar(int chr) ++{ ++ dbprintk(("putDebugChar(port %x): chr=%02x '%c', ints_on=%d\n", ++ gdb_async_info->port, ++ chr, ++ chr > ' ' && chr < 0x7F ? chr : ' ', ints_disabled ? 0 : 1)); ++ ++ if (gdb_async_info == NULL) { ++ gdb_hook_interrupt(&local_info, 0); ++ } ++ ++ write_char(gdb_async_info, chr); /* this routine will wait */ ++ count = (chr == '#') ? 0 : count + 1; ++ if ((count == 2)) { /* try to enable after */ ++ if (ints_disabled & ok_to_enable_ints) ++ kgdb_enable_ints_now(); /* try to enable after */ ++ ++ /* We do this a lot because, well we really want to get these ++ * interrupts. The serial driver will clear these bits when it ++ * initializes the chip. Every thing else it does is ok, ++ * but this. 
++ */ ++ if (!ints_disabled) { ++ outb_px(gdb_async_info->port + UART_IER, ++ gdb_async_info->IER); ++ } ++ } ++ ++} /* putDebugChar */ ++ ++module_init(kgdb_enable_ints); +diff -puN arch/i386/lib/Makefile~kgdb-ga arch/i386/lib/Makefile +--- 25/arch/i386/lib/Makefile~kgdb-ga 2004-10-21 14:54:15.265602768 -0700 ++++ 25-akpm/arch/i386/lib/Makefile 2004-10-21 14:54:15.313595472 -0700 +@@ -8,3 +8,4 @@ lib-y = checksum.o delay.o usercopy.o ge + + lib-$(CONFIG_X86_USE_3DNOW) += mmx.o + lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o ++lib-$(CONFIG_KGDB) += kgdb_serial.o +diff -puN arch/i386/Makefile~kgdb-ga arch/i386/Makefile +--- 25/arch/i386/Makefile~kgdb-ga 2004-10-21 14:54:15.266602616 -0700 ++++ 25-akpm/arch/i386/Makefile 2004-10-21 14:54:15.314595320 -0700 +@@ -99,6 +99,9 @@ core-$(CONFIG_X86_ES7000) := arch/i386/m + # default subarch .h files + mflags-y += -Iinclude/asm-i386/mach-default + ++mflags-$(CONFIG_KGDB) += -gdwarf-2 ++mflags-$(CONFIG_KGDB_MORE) += $(shell echo $(CONFIG_KGDB_OPTIONS) | sed -e 's/"//g') ++ + head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o + + libs-y += arch/i386/lib/ +diff -puN arch/i386/mm/fault.c~kgdb-ga arch/i386/mm/fault.c +--- 25/arch/i386/mm/fault.c~kgdb-ga 2004-10-21 14:54:15.268602312 -0700 ++++ 25-akpm/arch/i386/mm/fault.c 2004-10-21 14:54:15.314595320 -0700 +@@ -430,6 +430,12 @@ no_context: + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. 
+ */ ++#ifdef CONFIG_KGDB ++ if (!user_mode(regs)){ ++ kgdb_handle_exception(14,SIGBUS, error_code, regs); ++ return; ++ } ++#endif + + bust_spinlocks(1); + +diff -puN arch/x86_64/boot/compressed/head.S~kgdb-ga arch/x86_64/boot/compressed/head.S +--- 25/arch/x86_64/boot/compressed/head.S~kgdb-ga 2004-10-21 14:54:15.269602160 -0700 ++++ 25-akpm/arch/x86_64/boot/compressed/head.S 2004-10-21 14:54:15.315595168 -0700 +@@ -26,6 +26,7 @@ + .code32 + .text + ++#define IN_BOOTLOADER + #include + #include + +diff -puN arch/x86_64/boot/compressed/misc.c~kgdb-ga arch/x86_64/boot/compressed/misc.c +--- 25/arch/x86_64/boot/compressed/misc.c~kgdb-ga 2004-10-21 14:54:15.270602008 -0700 ++++ 25-akpm/arch/x86_64/boot/compressed/misc.c 2004-10-21 14:54:15.315595168 -0700 +@@ -9,6 +9,7 @@ + * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 + */ + ++#define IN_BOOTLOADER + #include "miscsetup.h" + #include + +diff -puN /dev/null Documentation/i386/kgdb/andthen +--- /dev/null 2003-09-15 06:40:47.000000000 -0700 ++++ 25-akpm/Documentation/i386/kgdb/andthen 2004-10-21 14:54:15.316595016 -0700 +@@ -0,0 +1,100 @@ ++ ++define set_andthen ++ set var $thp=0 ++ set var $thp=(struct kgdb_and_then_struct *)&kgdb_data[0] ++ set var $at_size = (sizeof kgdb_data)/(sizeof *$thp) ++ set var $at_oc=kgdb_and_then_count ++ set var $at_cc=$at_oc ++end ++ ++define andthen_next ++ set var $at_cc=$arg0 ++end ++ ++define andthen ++ andthen_set_edge ++ if ($at_cc >= $at_oc) ++ printf "Outside window. 
Window size is %d\n",($at_oc-$at_low) ++ else ++ printf "%d: ",$at_cc ++ output *($thp+($at_cc++ % $at_size )) ++ printf "\n" ++ end ++end ++define andthen_set_edge ++ set var $at_oc=kgdb_and_then_count ++ set var $at_low = $at_oc - $at_size ++ if ($at_low < 0 ) ++ set var $at_low = 0 ++ end ++ if (( $at_cc > $at_oc) || ($at_cc < $at_low)) ++ printf "Count outside of window, setting count to " ++ if ($at_cc >= $at_oc) ++ set var $at_cc = $at_oc ++ else ++ set var $at_cc = $at_low ++ end ++ printf "%d\n",$at_cc ++ end ++end ++ ++define beforethat ++ andthen_set_edge ++ if ($at_cc <= $at_low) ++ printf "Outside window. Window size is %d\n",($at_oc-$at_low) ++ else ++ printf "%d: ",$at_cc-1 ++ output *($thp+(--$at_cc % $at_size )) ++ printf "\n" ++ end ++end ++ ++document andthen_next ++ andthen_next ++ . sets the number of the event to display next. If this event ++ . is not in the event pool, either andthen or beforethat will ++ . correct it to the nearest event pool edge. The event pool ++ . ends at the last event recorded and begins ++ . prior to that. If beforethat is used next, it will display ++ . event -1. ++. ++ andthen commands are: set_andthen, andthen_next, andthen and beforethat ++end ++ ++ ++document andthen ++ andthen ++. displays the next event in the list. sets up to display ++. the oldest saved event first. ++. (optional) count of the event to display. ++. note the number of events saved is specified at configure time. ++. if events are saved between calls to andthen the index will change ++. but the displayed event will be the next one (unless the event buffer ++. is overrun). ++. ++. andthen commands are: set_andthen, andthen_next, andthen and beforethat ++end ++ ++document set_andthen ++ set_andthen ++. sets up to use the and commands. ++. if you have defined your own struct, use the above and ++. then enter the following: ++. p $thp=(struct kgdb_and_then_structX *)&kgdb_data[0] ++. where is the name of your structure. ++. ++. 
andthen commands are: set_andthen, andthen_next, andthen and beforethat ++end ++ ++document beforethat ++ beforethat ++. displays the next prior event in the list. sets up to ++. display the last occuring event first. ++. ++. note the number of events saved is specified at configure time. ++. if events are saved between calls to beforethat the index will change ++. but the displayed event will be the next one (unless the event buffer ++. is overrun). ++. ++. andthen commands are: set_andthen, andthen_next, andthen and beforethat ++end +diff -puN /dev/null Documentation/i386/kgdb/debug-nmi.txt +--- /dev/null 2003-09-15 06:40:47.000000000 -0700 ++++ 25-akpm/Documentation/i386/kgdb/debug-nmi.txt 2004-10-21 14:54:15.316595016 -0700 +@@ -0,0 +1,37 @@ ++Subject: Debugging with NMI ++Date: Mon, 12 Jul 1999 11:28:31 -0500 ++From: David Grothe ++Organization: Gcom, Inc ++To: David Grothe ++ ++Kernel hackers: ++ ++Maybe this is old hat, but it is new to me -- ++ ++On an ISA bus machine, if you short out the A1 and B1 pins of an ISA ++slot you will generate an NMI to the CPU. This interrupts even a ++machine that is hung in a loop with interrupts disabled. Used in ++conjunction with kgdb < ++ftp://ftp.gcom.com/pub/linux/src/kgdb-2.3.35/kgdb-2.3.35.tgz > you can ++gain debugger control of a machine that is hung in the kernel! Even ++without kgdb the kernel will print a stack trace so you can find out ++where it was hung. ++ ++The A1/B1 pins are directly opposite one another and the farthest pins ++towards the bracket end of the ISA bus socket. You can stick a paper ++clip or multi-meter probe between them to short them out. ++ ++I had a spare ISA bus to PC104 bus adapter around. The PC104 end of the ++board consists of two rows of wire wrap pins. So I wired a push button ++between the A1/B1 pins and now have an ISA board that I can stick into ++any ISA bus slot for debugger entry. 
++ ++Microsoft has a circuit diagram of a PCI card at ++http://www.microsoft.com/hwdev/DEBUGGING/DMPSW.HTM. If you want to ++build one you will have to mail them and ask for the PAL equations. ++Nobody makes one comercially. ++ ++[THIS TIP COMES WITH NO WARRANTY WHATSOEVER. It works for me, but if ++your machine catches fire, it is your problem, not mine.] ++ ++-- Dave (the kgdb guy) +diff -puN /dev/null Documentation/i386/kgdb/gdb-globals.txt +--- /dev/null 2003-09-15 06:40:47.000000000 -0700 ++++ 25-akpm/Documentation/i386/kgdb/gdb-globals.txt 2004-10-21 14:54:15.317594864 -0700 +@@ -0,0 +1,71 @@ ++Sender: akale@veritas.com ++Date: Fri, 23 Jun 2000 19:26:35 +0530 ++From: "Amit S. Kale" ++Organization: Veritas Software (India) ++To: Dave Grothe , linux-kernel@vger.rutgers.edu ++CC: David Milburn , ++ "Edouard G. Parmelan" , ++ ezannoni@cygnus.com, Keith Owens ++Subject: Re: Module debugging using kgdb ++ ++Dave Grothe wrote: ++> ++> Amit: ++> ++> There is a 2.4.0 version of kgdb on our ftp site: ++> ftp://ftp.gcom.com/pub/linux/src/kgdb. I mirrored your version of gdb ++> and loadmodule.sh there. ++> ++> Have a look at the README file and see if I go it right. If not, send ++> me some corrections and I will update it. ++> ++> Does your version of gdb solve the global variable problem? ++ ++Yes. ++Thanks to Elena Zanoni, gdb (developement version) can now calculate ++correctly addresses of dynamically loaded object files. I have not been ++following gdb developement for sometime and am not sure when symbol ++address calculation fix is going to appear in a gdb stable version. ++ ++Elena, any idea when the fix will make it to a prebuilt gdb from a ++redhat release? ++ ++For the time being I have built a gdb developement version. It can be ++used for module debugging with loadmodule.sh script. 
++ ++The problem with calculating of module addresses with previous versions ++of gdb was as follows: ++gdb did not use base address of a section while calculating address of ++a symbol in the section in an object file loaded via 'add-symbol-file'. ++It used address of .text segment instead. Due to this addresses of ++symbols in .data, .bss etc. (e.g. global variables) were calculated incorrectly. ++ ++Above mentioned fix allow gdb to use base address of a segment while ++calculating address of a symbol in it. It adds a parameter '-s' to ++'add-symbol-file' command for specifying base address of a segment. ++ ++loadmodule.sh script works as follows. ++ ++1. Copy a module file to target machine. ++2. Load the module on the target machine using insmod with -m parameter. ++insmod produces a module load map which contains base addresses of all ++sections in the module and addresses of symbols in the module file. ++3. Find all sections and their base addresses in the module from ++the module map. ++4. Generate a script that loads the module file. The script uses ++'add-symbol-file' and specifies address of text segment followed by ++addresses of all segments in the module. ++ ++Here is an example gdb script produced by loadmodule.sh script. ++ ++add-symbol-file foo 0xd082c060 -s .text.lock 0xd08cbfb5 ++-s .fixup 0xd08cfbdf -s .rodata 0xd08cfde0 -s __ex_table 0xd08e3b38 ++-s .data 0xd08e3d00 -s .bss 0xd08ec8c0 -s __ksymtab 0xd08ee838 ++ ++With this command gdb can calculate addresses of symbols in ANY segment ++in a module file. ++ ++Regards. 
++-- ++Amit Kale ++Veritas Software ( http://www.veritas.com ) +diff -puN /dev/null Documentation/i386/kgdb/gdbinit +--- /dev/null 2003-09-15 06:40:47.000000000 -0700 ++++ 25-akpm/Documentation/i386/kgdb/gdbinit 2004-10-21 14:54:15.317594864 -0700 +@@ -0,0 +1,14 @@ ++shell echo -e "\003" >/dev/ttyS0 ++set remotebaud 38400 ++target remote /dev/ttyS0 ++define si ++stepi ++printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx ++printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp ++x/i $eip ++end ++define ni ++nexti ++printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx ++printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp ++x/i $eip +diff -puN /dev/null Documentation/i386/kgdb/gdbinit.hw +--- /dev/null 2003-09-15 06:40:47.000000000 -0700 ++++ 25-akpm/Documentation/i386/kgdb/gdbinit.hw 2004-10-21 14:54:15.318594712 -0700 +@@ -0,0 +1,117 @@ ++ ++#Using ia-32 hardware breakpoints. ++# ++#4 hardware breakpoints are available in ia-32 processors. These breakpoints ++#do not need code modification. They are set using debug registers. ++# ++#Each hardware breakpoint can be of one of the ++#three types: execution, write, access. ++#1. An Execution breakpoint is triggered when code at the breakpoint address is ++#executed. ++#2. A write breakpoint ( aka watchpoints ) is triggered when memory location ++#at the breakpoint address is written. ++#3. An access breakpoint is triggered when memory location at the breakpoint ++#address is either read or written. ++# ++#As hardware breakpoints are available in limited number, use software ++#breakpoints ( br command in gdb ) instead of execution hardware breakpoints. ++# ++#Length of an access or a write breakpoint defines length of the datatype to ++#be watched. Length is 1 for char, 2 short , 3 int. ++# ++#For placing execution, write and access breakpoints, use commands ++#hwebrk, hwwbrk, hwabrk ++#To remove a breakpoint use hwrmbrk command. 
++# ++#These commands take following types of arguments. For arguments associated ++#with each command, use help command. ++#1. breakpointno: 0 to 3 ++#2. length: 1 to 3 ++#3. address: Memory location in hex ( without 0x ) e.g c015e9bc ++# ++#Use the command exinfo to find which hardware breakpoint occured. ++ ++#hwebrk breakpointno address ++define hwebrk ++ maintenance packet Y$arg0,0,0,$arg1 ++end ++document hwebrk ++ hwebrk
<breakpointno> <address> ++ Places a hardware execution breakpoint ++ <breakpointno> = 0 - 3 ++ <address>
 = Hex digits without leading "0x". ++end ++ ++#hwwbrk breakpointno length address ++define hwwbrk ++ maintenance packet Y$arg0,1,$arg1,$arg2 ++end ++document hwwbrk ++ hwwbrk <breakpointno> <length> <address>
++ Places a hardware write breakpoint ++ <breakpointno> = 0 - 3 ++ <length> = 1 (1 byte), 2 (2 byte), 3 (4 byte) ++ <address>
 = Hex digits without leading "0x". ++end ++ ++#hwabrk breakpointno length address ++define hwabrk ++ maintenance packet Y$arg0,1,$arg1,$arg2 ++end ++document hwabrk ++ hwabrk <breakpointno> <length> <address>
++ Places a hardware access breakpoint ++ <breakpointno> = 0 - 3 ++ <length> = 1 (1 byte), 2 (2 byte), 3 (4 byte) ++ <address>
 = Hex digits without leading "0x". ++end ++ ++#hwrmbrk breakpointno ++define hwrmbrk ++ maintenance packet y$arg0 ++end ++document hwrmbrk ++ hwrmbrk <breakpointno> ++ <breakpointno> = 0 - 3 ++ Removes a hardware breakpoint ++end ++ ++define reboot ++ maintenance packet r ++end ++#exinfo ++define exinfo ++ maintenance packet qE ++end ++document exinfo ++ exinfo ++ Gives information about a breakpoint. ++end ++define get_th ++ p $th=(struct thread_info *)((int)$esp & ~8191) ++end ++document get_th ++ get_th ++ Gets and prints the current thread_info pointer. Defines $th to be it. ++end ++define get_cu ++ p $cu=((struct thread_info *)((int)$esp & ~8191))->task ++end ++document get_cu ++ get_cu ++ Gets and prints the "current" value. Defines $cu to be it. ++end ++define int_off ++ set var $flags=$eflags ++ set $eflags=$eflags&~0x200 ++ end ++define int_on ++ set var $eflags|=$flags&0x200 ++ end ++document int_off ++ saves the current interrupt state and clears the processor interrupt ++ flag. Use int_on to restore the saved flag. ++end ++document int_on ++ Restores the interrupt flag saved by int_off. ++end +diff -puN /dev/null Documentation/i386/kgdb/gdbinit-modules +--- /dev/null Thu Apr 11 07:25:15 2002 ++++ 25-akpm/Documentation/i386/kgdb/gdbinit-modules Fri Jan 13 17:54:25 2006 +@@ -0,0 +1,149 @@ ++# ++# Useful GDB user-commands to debug Linux Kernel Modules with gdbstub. ++# ++# This doesn't work for Linux-2.0 or older. ++# ++# Author Edouard G. Parmelan ++# ++# ++# Fri Apr 30 20:33:29 CEST 1999 ++# First public release. ++# ++# Major cleanup after experimenting with the Linux-2.0 kernel without success. ++# Symbols of a module are not in the correct order, I can't explain ++# why :( ++# ++# Fri Mar 19 15:41:40 CET 1999 ++# Initial version.
++# ++# Thu Jan 6 16:29:03 CST 2000 ++# A little fixing by Dave Grothe ++# ++# Mon Jun 19 09:33:13 CDT 2000 ++# Alignment changes from Edouard Parmelan ++# ++# The basic idea is to find where insmod loads the module and inform ++# GDB to load the symbol table of the module with the GDB command ++# ``add-symbol-file
<object> <address>''. ++# ++# The Linux kernel holds the list of all loaded modules in module_list, ++# this list ends with &kernel_module (exactly with module->next == NULL, ++# but the last module is not a real module). ++# ++# Insmod allocates the struct module before the object file. Since ++# Linux-2.1, this structure contains its size. The real address of ++# the object file is then (char*)module + module->size_of_struct. ++# ++# You can use three user functions ``mod-list'', ``mod-print-symbols'' ++# and ``add-module-symbols''. ++# ++# mod-list lists all loaded modules with the format: ++# <module-address> <module-name> ++# ++# As soon as you have found the address of your module, you can ++# print its exported symbols (mod-print-symbols) or inform GDB to add ++# symbols from your module file (mod-add-symbols). ++# ++# The argument that you give to mod-print-symbols or mod-add-symbols ++# is the <module-address> from the mod-list command. ++# ++# When using the mod-add-symbols command you must also give the full ++# pathname of the module's object code file. ++# ++# The command mod-add-lis is an example of how to make this easier. ++# You can edit this macro to contain the path name of your own ++# favorite module and then use it as a shorthand to load it. You ++# still need the module-address, however. ++# ++# The internal function ``mod-validate'' sets the GDB variable $mod ++# as a ``struct module*'' if the kernel knows the module; otherwise ++# $mod is set to NULL. This ensures symbols are not added for a wrong ++# address. ++# ++# ++# Sat Feb 12 20:05:47 CET 2005 ++# ++# Adapted to the 2.6.* module data structure. ++# (Getting miffed at gdb for not having "offsetof" in the process :-/ ) ++# ++# Autogenerate add-symbol-file statements from the module list instead ++# of relying on a no-longer-working loadmodule.sh program. ++# ++# Matthias Urlichs ++# ++# ++# Have a nice hacking day !
++# ++# ++define mod-list ++ set $lmod = modules->next ++ # This is a circular data structure ++ while $lmod != &modules ++ set $mod = (struct module *)(((char *)$lmod) - ((int)&(((struct module *)0) -> list))) ++ printf "%p\t%s\n", $mod, $mod->name ++ set $lmod = $lmod->next ++ end ++end ++document mod-list ++mod-list ++List all modules in the form: ++Use the as the argument for the other ++mod-commands: mod-print-symbols, mod-add-symbols. ++end ++ ++define mod-list-syms ++ set $lmod = modules->next ++ # This is a circular data structure ++ while $lmod != &modules ++ set $mod = (struct module *)(((char *)$lmod) - ((int)&(((struct module *)0) -> list))) ++ printf "add-symbol-file %s.ko %p\n", $mod->name, $mod->module_core ++ set $lmod = $lmod->next ++ end ++end ++document mod-list-syms ++mod-list-syms ++List all modules in the form: add-symbol-file ++for adding modules' symbol tables without loadmodule.sh. ++end ++ ++define mod-validate ++ set $lmod = modules->next ++ set $mod = (struct module *)(((char *)$lmod) - ((int)&(((struct module *)0) -> list))) ++ while ($lmod != &modules) && ($mod != $arg0) ++ set $lmod = $lmod->next ++ set $mod = (struct module *)(((char *)$lmod) - ((int)&(((struct module *)0) -> list))) ++ end ++ if $lmod == &modules ++ set $mod = 0 ++ printf "%p is not a module\n", $arg0 ++ end ++end ++document mod-validate ++mod-validate ++Internal user-command used to validate the module parameter. ++If is a real loaded module, set $mod to it, otherwise set $mod ++to 0. ++end ++ ++define mod-print-symbols ++ mod-validate $arg0 ++ if $mod != 0 ++ set $i = 0 ++ while $i < $mod->num_syms ++ set $sym = $mod->syms[$i] ++ printf "%p\t%s\n", $sym->value, $sym->name ++ set $i = $i + 1 ++ end ++ set $i = 0 ++ while $i < $mod->num_gpl_syms ++ set $sym = $mod->gpl_syms[$i] ++ printf "%p\t%s\n", $sym->value, $sym->name ++ set $i = $i + 1 ++ end ++ end ++end ++document mod-print-symbols ++mod-print-symbols ++Print all exported symbols of the module. 
See mod-list ++end ++ +diff -puN /dev/null Documentation/i386/kgdb/kgdb.txt +--- /dev/null 2003-09-15 06:40:47.000000000 -0700 ++++ 25-akpm/Documentation/i386/kgdb/kgdb.txt 2004-10-21 14:54:15.324593800 -0700 +@@ -0,0 +1,775 @@ ++Last edit: <20030806.1637.12> ++This file has information specific to the i386 kgdb option. Other ++platforms with the kgdb option may behave in a similar fashion. ++ ++New features: ++============ ++20030806.1557.37 ++This version was made against the 2.6.0-test2 kernel. We have made the ++following changes: ++ ++- The getthread() code in the stub calls find_task_by_pid(). It fails ++ if we are early in the bring up such that the pid arrays have yet to ++ be allocated. We have added a line to kernel/pid.c to make ++ "kgdb_pid_init_done" true once the arrays are allocated. This way the ++ getthread() code knows not to call. This is only used by the thread ++ debugging stuff and threads will not yet exist at this point in the ++ boot. ++ ++- For some reason, gdb was not asking for a new thread list when the ++ "info thread" command was given. We changed to the newer version of ++ the thread info command and gdb now seems to ask when needed. Result, ++ we now get all threads in the thread list. ++ ++- We now respond to the ThreadExtraInfo request from gdb with the thread ++ name from task_struct .comm. This then appears in the thread list. ++ Thoughts on additional options for this are welcome. Things such as ++ "has BKL" and "Preempted" come to mind. I think we could have a flag ++ word that could enable different bits of info here. ++ ++- We now honor, sort of, the C and S commands. These are continue and ++ single set after delivering a signal. We ignore the signal and do the ++ requested action. This only happens when we told gdb that a signal ++ was the reason for entry, which is only done on memory faults. The ++ result is that you can now continue into the Oops. ++ ++- We changed the -g to -gdwarf-2. 
This seems to be the same as -ggdb, ++ but it is more exact on what language to use. ++ ++- We added two dwarf2 include files and a bit of code at the end of ++ entry.S. This does not yet work, so it is disabled. Still we want to ++ keep track of the code and "maybe" someone out there can fix it. ++ ++- Randy Dunlap sent some fix ups for this file which are now merged. ++ ++- Hugh Dickins sent a fix to a bit of code in traps.c that prevents a ++ compiler warning if CONFIG_KGDB is off (now who would do that :). ++ ++- Andrew Morton sent a fix for the serial driver which is now merged. ++ ++- Andrew also sent a change to the stub around the cpu managment code ++ which is also merged. ++ ++- Andrew also sent a patch to make "f" as well as "g" work as SysRq ++ commands to enter kgdb, merged. ++ ++- If CONFIG_KGDB and CONFIG_DEBUG_SPINLOCKS are both set we added a ++ "who" field to the spinlock data struct. This is filled with ++ "current" when ever the spinlock suceeds. Useful if you want to know ++ who has the lock. ++ ++_ And last, but not least, we fixed the "get_cu" macro to properly get ++ the current value of "current". ++ ++New features: ++============ ++20030505.1827.27 ++We are starting to align with the sourceforge version, at least in ++commands. To this end, the boot command string to start kgdb at ++boot time has been changed from "kgdb" to "gdb". ++ ++Andrew Morton sent a couple of patches which are now included as follows: ++1.) We now return a flag to the interrupt handler. ++2.) We no longer use smp_num_cpus (a conflict with the lock meter). ++3.) And from William Lee Irwin III code to make ++ sure high-mem is set up before we attempt to register our interrupt ++ handler. ++We now include asm/kgdb.h from config.h so you will most likely never ++have to include it. It also 'NULLS' the kgdb macros you might have in ++your code when CONFIG_KGDB is not defined. This allows you to just ++turn off CONFIG_KGDB to turn off all the kgdb_ts() calls and such. 
++This include is conditioned on the machine being an x86 so as to not ++mess with other archs. ++ ++20020801.1129.03 ++This is currently the version for the 2.4.18 (and beyond?) kernel. ++ ++We have several new "features" beginning with this version: ++ ++1.) Kgdb now syncs the "other" CPUs with a cross-CPU NMI. No more ++ waiting and it will pull that guy out of an IRQ off spin lock :) ++ ++2.) We doctored up the code that tells where a task is waiting and ++ included it so that the "info thread" command will show a bit more ++ than "schedule()". Try it... ++ ++3.) Added the ability to call a function from gdb. All the standard gdb ++ issues apply, i.e. if you hit a breakpoint in the function, you are ++ not allowed to call another (gdb limitation, not kgdb). To help ++ this capability we added a memory allocation function. Gdb does not ++ return this memory (it is used for strings that you pass to that function ++ you are calling from gdb) so we fixed up a way to allow you to ++ manually return the memory (see below). ++ ++4.) Kgdb time stamps (kgdb_ts()) are enhanced to expand what was the ++ interrupt flag to now also include the preemption count and the ++ "in_interrupt" info. The flag is now called "with_pif" to indicate ++ the order, preempt_count, in_interrupt, flag. The preempt_count is ++ shifted left by 4 bits so you can read the count in hex by dropping ++ the low order digit. In_interrupt is in bit 1, and the flag is in ++ bit 0. ++ ++5.) The command: "p kgdb_info" is now expanded and prints something ++ like: ++(gdb) p kgdb_info ++$2 = {used_malloc = 0, called_from = 0xc0107506, entry_tsc = 67468627259, ++ errcode = 0, vector = 3, print_debug_info = 0, hold_on_sstep = 1, ++ cpus_waiting = {{task = 0xc027a000, pid = 32768, hold = 0, ++ regs = 0xc027bf84}, {task = 0x0, pid = 0, hold = 0, regs = 0x0}}} ++ ++ Things to note here: a.) used_malloc is the amount of memory that ++ has been malloc'ed to do calls from gdb. 
You can reclaim this ++ memory like this: "p kgdb_info.used_malloc=0" Cool, huh? b.) ++ cpus_waiting is now "sized" by the number of CPUs you enter at ++ configure time in the kgdb configure section. This is NOT used ++ anywhere else in the system, but it is "nice" here. c.) The task's ++ "pid" is now in the structure. This is the pid you will need to use ++ to decode to the thread id to get gdb to look at that thread. ++ Remember that the "info thread" command prints a list of threads ++ wherein it numbers each thread with its reference number followed ++ by the thread's pid. Note that the per-CPU idle threads actually ++ have pids of 0 (yes, there is more than one pid 0 in an SMP system). ++ To avoid confusion, kgdb numbers these threads with numbers beyond ++ the MAX_PID. That is why you see 32768 and above. ++ ++6.) A subtle change, we now provide the complete register set for tasks ++ that are active on the other CPUs. This allows better trace back on ++ those tasks. ++ ++ And, let's mention what we could not fix. Back-trace from all but the ++ thread that we trapped will, most likely, have a bogus entry in it. ++ The problem is that gdb does not recognize the entry code for ++ functions that use "current" near (at all?) the entry. The compiler ++ is putting the "current" decode as the first two instructions of the ++ function where gdb expects to find %ebp changing code. Back trace ++ also has trouble with interrupt frames. I am talking with Daniel ++ Jacobowitz about some way to fix this, but don't hold your breath. ++ ++20011220.0050.35 ++Major enhancement with this version is the ability to hold one or more ++CPUs in an SMP system while allowing the others to continue. Also, by ++default only the current CPU is enabled on single-step commands (please ++note that gdb issues single-step commands at times other than when you ++use the si command). ++ ++Another change is to collect some useful information in ++a global structure called "kgdb_info". 
You should be able to just: ++ ++p kgdb_info ++ ++although I have seen cases where the first time this is done gdb just ++prints the first member but prints the whole structure if you then enter ++CR (carriage return or enter). This also works: ++ ++p *&kgdb_info ++ ++Here is a sample: ++(gdb) p kgdb_info ++$4 = {called_from = 0xc010732c, entry_tsc = 32804123790856, errcode = 0, ++ vector = 3, print_debug_info = 0} ++ ++"Called_from" is the return address from the current entry into kgdb. ++Sometimes it is useful to know why you are in kgdb, for example, was ++it an NMI or a real breakpoint? The simple way to interrogate this ++return address is: ++ ++l *0xc010732c ++ ++which will print the surrounding few lines of source code. ++ ++"Entry_tsc" is the CPU TSC on entry to kgdb (useful to compare to the ++kgdb_ts entries). ++ ++"errcode" and "vector" are other entry parameters which may be helpful on ++some traps. ++ ++"print_debug_info" is the internal debugging kgdb print enable flag. Yes, ++you can modify it. ++ ++In SMP systems kgdb_info also includes the "cpus_waiting" structure and ++"hold_on_step": ++ ++(gdb) p kgdb_info ++$7 = {called_from = 0xc0112739, entry_tsc = 1034936624074, errcode = 0, ++ vector = 2, print_debug_info = 0, hold_on_sstep = 1, cpus_waiting = {{ ++ task = 0x0, hold = 0, regs = 0x0}, {task = 0xc71b8000, hold = 0, ++ regs = 0xc71b9f70}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, ++ hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, ++ hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, ++ hold = 0, regs = 0x0}}} ++ ++"Cpus_waiting" has an entry for each CPU other than the current one that ++has been stopped. Each entry contains the task_struct address for that ++CPU, the address of the regs for that task and a hold flag. All these ++have the proper typing so that, for example: ++ ++p *kgdb_info.cpus_waiting[1].regs ++ ++will print the registers for CPU 1. 
++ ++"Hold_on_sstep" is a new feature with this version and comes up set or ++true. What this means is that whenever kgdb is asked to single-step all ++other CPUs are held (i.e. not allowed to execute). The flag applies to ++all but the current CPU and, again, can be changed: ++ ++p kgdb_info.hold_on_sstep=0 ++ ++restores the old behavior of letting all CPUs run during single-stepping. ++ ++Likewise, each CPU has a "hold" flag, which if set, locks that CPU out ++of execution. Note that this has some risk in cases where the CPUs need ++to communicate with each other. If kgdb finds no CPU available on exit, ++it will push a message thru gdb and stay in kgdb. Note that it is legal ++to hold the current CPU as long as at least one CPU can execute. ++ ++20010621.1117.09 ++This version implements an event queue. Events are signaled by calling ++a function in the kgdb stub and may be examined from gdb. See EVENTS ++below for details. This version also tightens up the interrupt and SMP ++handling to not allow interrupts on the way to kgdb from a breakpoint ++trap. It is fine to allow these interrupts for user code, but not ++system debugging. ++ ++Version ++======= ++ ++This version of the kgdb package was developed and tested on ++kernel version 2.4.16. It will not install on any earlier kernels. ++It is possible that it will continue to work on later versions ++of 2.4 and then versions of 2.5 (I hope). ++ ++ ++Debugging Setup ++=============== ++ ++Designate one machine as the "development" machine. This is the ++machine on which you run your compiles and which has your source ++code for the kernel. Designate a second machine as the "target" ++machine. This is the machine that will run your experimental ++kernel. ++ ++The two machines will be connected together via a serial line out ++one or the other of the COM ports of the PC. You will need the ++appropriate modem eliminator (null modem) cable(s) for this. 
++ ++Decide on which tty port you want the machines to communicate, then ++connect them up back-to-back using the null modem cable. COM1 is ++/dev/ttyS0 and COM2 is /dev/ttyS1. You should test this connection ++with the two machines prior to trying to debug a kernel. Once you ++have it working, on the TARGET machine, enter: ++ ++setserial /dev/ttyS0 (or what ever tty you are using) ++ ++and record the port address and the IRQ number. ++ ++On the DEVELOPMENT machine you need to apply the patch for the kgdb ++hooks. You have probably already done that if you are reading this ++file. ++ ++On your DEVELOPMENT machine, go to your kernel source directory and do ++"make Xconfig" where X is one of "x", "menu", or "". If you are ++configuring in the standard serial driver, it must not be a module. ++Either yes or no is ok, but making the serial driver a module means it ++will initialize after kgdb has set up the UART interrupt code and may ++cause a failure of the control-C option discussed below. The configure ++question for the serial driver is under the "Character devices" heading ++and is: ++ ++"Standard/generic (8250/16550 and compatible UARTs) serial support" ++ ++Go down to the kernel debugging menu item and open it up. Enable the ++kernel kgdb stub code by selecting that item. You can also choose to ++turn on the "-ggdb -O1" compile options. The -ggdb causes the compiler ++to put more debug info (like local symbols) in the object file. On the ++i386 -g and -ggdb are the same so this option just reduces to "O1". The ++-O1 reduces the optimization level. This may be helpful in some cases, ++be aware, however, that this may also mask the problem you are looking ++for. ++ ++The baud rate. Default is 115200. What ever you choose be sure that ++the host machine is set to the same speed. I recommend the default. ++ ++The port. This is the I/O address of the serial UART that you should ++have gotten using setserial as described above. 
The standard COM1 port ++(3f8) using IRQ 4 is default. COM2 is 2f8 which by convention uses IRQ ++3. ++ ++The port IRQ (see above). ++ ++Stack overflow test. This option makes a minor change in the trap, ++system call and interrupt code to detect stack overflow and transfer ++control to kgdb if it happens. (Some platforms have this in the ++baseline code, but the i386 does not.) ++ ++You can also configure the system to recognize the boot option ++"console=kgdb" which if given will cause all console output during ++booting to be put thru gdb as well as other consoles. This option ++requires that gdb and kgdb be connected prior to sending console output ++so, if they are not, a breakpoint is executed to force the connection. ++This will happen before any kernel output (it is going thru gdb, right), ++and will stall the boot until the connection is made. ++ ++You can also configure in a patch to SysRq to enable the kGdb SysRq. ++This request generates a breakpoint. Since the serial port IRQ line is ++set up after any serial drivers, it is possible that this command will ++work when the control-C will not. ++ ++Save and exit the Xconfig program. Then do "make clean" , "make dep" ++and "make bzImage" (or whatever target you want to make). This gets the ++kernel compiled with the "-g" option set -- necessary for debugging. ++ ++You have just built the kernel on your DEVELOPMENT machine that you ++intend to run on your TARGET machine. ++ ++To install this new kernel, use the following installation procedure. ++Remember, you are on the DEVELOPMENT machine patching the kernel source ++for the kernel that you intend to run on the TARGET machine. ++ ++Copy this kernel to your target machine using your usual procedures. I ++usually arrange to copy development: ++/usr/src/linux/arch/i386/boot/bzImage to /vmlinuz on the TARGET machine ++via a LAN based NFS access. That is, I run the cp command on the target ++and copy from the development machine via the LAN. 
Run Lilo (see "man ++lilo" for details on how to set this up) on the new kernel on the target ++machine so that it will boot! Then boot the kernel on the target ++machine. ++ ++On the DEVELOPMENT machine, create a file called .gdbinit in the ++directory /usr/src/linux. An example .gdbinit file looks like this: ++ ++shell echo -e "\003" >/dev/ttyS0 ++set remotebaud 38400 (or what ever speed you have chosen) ++target remote /dev/ttyS0 ++ ++ ++Change the "echo" and "target" definition so that it specifies the tty ++port that you intend to use. Change the "remotebaud" definition to ++match the data rate that you are going to use for the com line. ++ ++You are now ready to try it out. ++ ++Boot your target machine with "kgdb" in the boot command i.e. something ++like: ++ ++lilo> test kgdb ++ ++or if you also want console output thru gdb: ++ ++lilo> test kgdb console=kgdb ++ ++You should see the lilo message saying it has loaded the kernel and then ++all output stops. The kgdb stub is trying to connect with gdb. Start ++gdb something like this: ++ ++ ++On your DEVELOPMENT machine, cd /usr/src/linux and enter "gdb vmlinux". ++When gdb gets the symbols loaded it will read your .gdbinit file and, if ++everything is working correctly, you should see gdb print out a few ++lines indicating that a breakpoint has been taken. It will actually ++show a line of code in the target kernel inside the kgdb activation ++code. ++ ++The gdb interaction should look something like this: ++ ++ linux-dev:/usr/src/linux# gdb vmlinux ++ GDB is free software and you are welcome to distribute copies of it ++ under certain conditions; type "show copying" to see the conditions. ++ There is absolutely no warranty for GDB; type "show warranty" for details. ++ GDB 4.15.1 (i486-slackware-linux), ++ Copyright 1995 Free Software Foundation, Inc... ++ breakpoint () at i386-stub.c:750 ++ 750 } ++ (gdb) ++ ++You can now use whatever gdb commands you like to set breakpoints. 
++Enter "continue" to start your target machine executing again. At this ++point the target system will run at full speed until it encounters ++your breakpoint or gets a segment violation in the kernel, or whatever. ++ ++If you have the kgdb console enabled when you continue, gdb will print ++out all the console messages. ++ ++The above example caused a breakpoint relatively early in the boot ++process. For the i386 kgdb it is possible to code a break instruction ++as the first C-language point in init/main.c, i.e. as the first instruction ++in start_kernel(). This could be done as follows: ++ ++#include <asm/kgdb.h> ++ breakpoint(); ++ ++This breakpoint() is really a function that sets up the breakpoint and ++single-step hardware trap cells and then executes a breakpoint. Any ++early hard coded breakpoint will need to use this function. Once the ++trap cells are set up they need not be set again, but doing it again ++does not hurt anything, so you don't need to be concerned about which ++breakpoint is hit first. Once the trap cells are set up (and the kernel ++sets them up in due course even if breakpoint() is never called) the ++macro: ++ ++BREAKPOINT; ++ ++will generate an inline breakpoint. This may be more useful as it stops ++the processor at the instruction instead of in a function a step removed ++from the location of interest. In either case <asm/kgdb.h> must be ++included to define both breakpoint() and BREAKPOINT. ++ ++Triggering kgdbstub at other times ++================================== ++ ++Often you don't need to enter the debugger until much later in the boot ++or even after the machine has been running for some time. Once the ++kernel is booted and interrupts are on, you can force the system to ++enter the debugger by sending a control-C to the debug port. This is ++what the first line of the recommended .gdbinit file does. This allows ++you to start gdb any time after the system is up as well as when the ++system is already at a breakpoint. 
(In the case where the system is ++already at a breakpoint the control-C is not needed, however, it will ++be ignored by the target so no harm is done. Also note that the echo ++command assumes that the port speed is already set. This will be true ++once gdb has connected, but it is best to set the port speed before you ++run gdb.) ++ ++Another simple way to do this is to put the following file in your ~/bin ++directory: ++ ++#!/bin/bash ++echo -e "\003" > /dev/ttyS0 ++ ++Here, the ttyS0 should be replaced with whatever port you are using. ++The "\003" is control-C. Once you are connected with gdb, you can enter ++control-C at the command prompt. ++ ++An alternative way to get control to the debugger is to enable the kGdb ++SysRq command. Then you would enter Alt-SysRq-g (all three keys at the ++same time, but push them down in the order given). To refresh your ++memory of the available SysRq commands try Alt-SysRq-=. Actually any ++undefined command could replace the "=", but I like to KNOW that what I ++am pushing will never be defined. ++ ++Debugging hints ++=============== ++ ++You can break into the target machine at any time from the development ++machine by typing ^C (see above paragraph). If the target machine has ++interrupts enabled this will stop it in the kernel and enter the ++debugger. ++ ++There is unfortunately no way of breaking into the kernel if it is ++in a loop with interrupts disabled, so if this happens to you then ++you need to place exploratory breakpoints or printk's into the kernel ++to find out where it is looping. The exploratory breakpoints can be ++entered either thru gdb or hard coded into the source. This is very ++handy if you do something like: ++ ++if () BREAKPOINT; ++ ++ ++There is a copy of an e-mail in the Documentation/i386/kgdb/ directory ++(debug-nmi.txt) which describes how to create an NMI on an ISA bus ++machine using a paper clip. 
I have a sophisticated version of this made ++by wiring a push button switch into a PC104/ISA bus adapter card. The ++adapter card nicely furnishes wire wrap pins for all the ISA bus ++signals. ++ ++When you are done debugging the kernel on the target machine it is a ++good idea to leave it in a running state. This makes reboots faster, ++bypassing the fsck. So do a gdb "continue" as the last gdb command if ++this is possible. To terminate gdb itself on the development machine ++and leave the target machine running, first clear all breakpoints and ++continue, then type ^Z to suspend gdb and then kill it with "kill %1" or ++something similar. ++ ++If gdbstub Does Not Work ++======================== ++ ++If it doesn't work, you will have to troubleshoot it. Do the easy ++things first like double checking your cabling and data rates. You ++might try some non-kernel based programs to see if the back-to-back ++connection works properly. Just something simple like cat /etc/hosts ++>/dev/ttyS0 on one machine and cat /dev/ttyS0 on the other will tell you ++if you can send data from one machine to the other. Make sure it works ++in both directions. There is no point in tearing out your hair in the ++kernel if the line doesn't work. ++ ++All of the real action takes place in the file ++/usr/src/linux/arch/i386/kernel/kgdb_stub.c. That is the code on the target ++machine that interacts with gdb on the development machine. In gdb you can ++turn on a debug switch with the following command: ++ ++ set remotedebug ++ ++This will print out the protocol messages that gdb is exchanging with ++the target machine. ++ ++Another place to look is /usr/src/arch/i386/lib/kgdb_serial.c. This is ++the code that talks to the serial port on the target side. There might ++be a problem there. In particular there is a section of this code that ++tests the UART which will tell you what UART you have if you define ++"PRNT" (just remove "_off" from the #define PRNT_off). 
To view this ++report you will need to boot the system without any breakpoints. This ++allows the kernel to run to the point where it calls kgdb to set up ++interrupts. At this time kgdb will test the UART and print out the type ++it finds. (You need to wait so that the printks are actually being ++printed. Early in the boot they are cached, waiting for the console to ++be enabled. Also, if kgdb is entered thru a breakpoint it is possible ++to cause a deadlock by calling printk when the console is locked. The ++stub thus avoids doing printks from breakpoints, especially in the ++serial code.) At this time, if the UART fails to do the expected thing, ++kgdb will print out (using printk) information on what failed. (These ++messages will be buried in all the other boot up messages. Look for ++lines that start with "gdb_hook_interrupt:". You may want to use dmesg ++once the system is up to view the log.) If this fails or if you still ++don't connect, review your answers for the port address. Use: ++ ++setserial /dev/ttyS0 ++ ++to get the current port and IRQ information. This command will also ++tell you what the system found for the UART type. The stub recognizes ++the following UART types: ++ ++16450, 16550, and 16550A ++ ++If you are really desperate you can use printk debugging in the ++kgdbstub code in the target kernel until you get it working. In particular, ++there is a global variable in /usr/src/linux/arch/i386/kernel/kgdb_stub.c ++named "remote_debug". Compile your kernel with this set to 1, rather ++than 0 and the debug stub will print out lots of stuff as it does ++what it does. Likewise there are debug printks in the kgdb_serial.c ++code that can be turned on with simple changes in the macro defines. 
++ ++ ++Debugging Loadable Modules ++========================== ++ ++This technique comes courtesy of Edouard Parmelan ++ ++ ++When you run gdb, enter the command ++ ++source gdbinit-modules ++ ++This will read in a file of gdb macros that was installed in your ++kernel source directory when kgdb was installed. This file implements ++the following commands: ++ ++mod-list ++ Lists the loaded modules in the form <module-address> <module-name> ++ ++mod-print-symbols <module-address> ++ Prints all the symbols in the indicated module. ++ ++mod-add-symbols <module-address> <object-file-path-name> ++ Loads the symbols from the object file and associates them ++ with the indicated module. ++ ++After you have loaded the module that you want to debug, use the command ++mod-list to find the <module-address> of your module. Then use that ++address in the mod-add-symbols command to load your module's symbols. ++From that point onward you can debug your module as if it were a part ++of the kernel. ++ ++The file gdbinit-modules also contains a command named mod-add-lis as ++an example of how to construct a command of your own to load your ++favorite module. The idea is to "can" the pathname of the module ++in the command so you don't have to type so much. ++ ++Threads ++======= ++ ++Each process in a target machine is seen as a gdb thread. gdb thread ++related commands (info threads, thread n) can be used. ++ ++ia-32 hardware breakpoints ++========================== ++ ++kgdb stub contains support for hardware breakpoints using debugging features ++of ia-32(x86) processors. These breakpoints do not need code modification. ++They use debugging registers. 4 hardware breakpoints are available in ia-32 ++processors. ++ ++Each hardware breakpoint can be of one of the following three types. ++ ++1. Execution breakpoint - An Execution breakpoint is triggered when code ++ at the breakpoint address is executed. 
++ ++ As a limited number of hardware breakpoints are available, it is ++ advisable to use software breakpoints ( break command ) instead ++ of execution hardware breakpoints, unless modification of code ++ is to be avoided. ++ ++2. Write breakpoint - A write breakpoint is triggered when memory ++ location at the breakpoint address is written. ++ ++ A write breakpoint can be placed for data of variable length. Length of ++ a write breakpoint indicates length of the datatype to be ++ watched. Length is 1 for 1 byte data, 2 for 2 byte data, 3 for ++ 4 byte data. ++ ++3. Access breakpoint - An access breakpoint is triggered when memory ++ location at the breakpoint address is either read or written. ++ ++ Access breakpoints also have lengths similar to write breakpoints. ++ ++IO breakpoints in ia-32 are not supported. ++ ++Since gdb stub at present does not use the protocol used by gdb for hardware ++breakpoints, hardware breakpoints are accessed through gdb macros. gdb macros ++for hardware breakpoints are described below. ++ ++hwebrk - Places an execution breakpoint ++ hwebrk breakpointno address ++hwwbrk - Places a write breakpoint ++ hwwbrk breakpointno length address ++hwabrk - Places an access breakpoint ++ hwabrk breakpointno length address ++hwrmbrk - Removes a breakpoint ++ hwrmbrk breakpointno ++exinfo - Tells whether a software or hardware breakpoint has occurred. ++ Prints number of the hardware breakpoint if a hardware breakpoint has ++ occurred. ++ ++Arguments required by these commands are as follows ++breakpointno - 0 to 3 ++length - 1 to 3 ++address - Memory location in hex digits ( without 0x ) e.g. c015e9bc ++ ++SMP support ++========== ++ ++When a breakpoint occurs or user issues a break ( Ctrl + C ) to gdb ++client, all the processors are forced to enter the debugger. Current ++thread corresponds to the thread running on the processor where ++breakpoint occurred. 
Threads running on other processor(s) appear ++similar to other non-running threads in the 'info threads' output. ++Within the kgdb stub there is a structure "waiting_cpus" in which kgdb ++records the values of "current" and "regs" for each CPU other than the ++one that hit the breakpoint. "current" is a pointer to the task ++structure for the task that CPU is running, while "regs" points to the ++saved registers for the task. This structure can be examined with the ++gdb "p" command. ++ ++ia-32 hardware debugging registers on all processors are set to same ++values. Hence any hardware breakpoints may occur on any processor. ++ ++gdb troubleshooting ++=================== ++ ++1. gdb hangs ++Kill it. restart gdb. Connect to target machine. ++ ++2. gdb cannot connect to target machine (after killing a gdb and ++restarting another) If the target machine was not inside debugger when ++you killed gdb, gdb cannot connect because the target machine won't ++respond. In this case echo "Ctrl+C"(ASCII 3) to the serial line. ++e.g. echo -e "\003" > /dev/ttyS1 ++This forces that target machine into the debugger, after which you ++can connect. ++ ++3. gdb cannot connect even after echoing Ctrl+C into serial line ++Try changing serial line settings min to 1 and time to 0 ++e.g. stty min 1 time 0 < /dev/ttyS1 ++Try echoing again ++ ++Check serial line speed and set it to correct value if required ++e.g. stty ispeed 115200 ospeed 115200 < /dev/ttyS1 ++ ++EVENTS ++====== ++ ++Ever want to know the order of things happening? Which CPU did what and ++when? How did the spinlock get the way it is? Then events are for ++you. Events are defined by calls to an event collection interface and ++saved for later examination. In this case, kgdb events are saved by a ++very fast bit of code in kgdb which is fully SMP and interrupt protected ++and they are examined by using gdb to display them. Kgdb keeps only ++the last N events, where N must be a power of two and is defined at ++configure time. 
++ ++ ++Events are signaled to kgdb by calling: ++ ++kgdb_ts(data0,data1) ++ ++For each call kgdb records each call in an array along with other info. ++Here is the array definition: ++ ++struct kgdb_and_then_struct { ++#ifdef CONFIG_SMP ++ int on_cpu; ++#endif ++ long long at_time; ++ int from_ln; ++ char * in_src; ++ void *from; ++ int with_if; ++ int data0; ++ int data1; ++}; ++ ++For SMP machines the CPU is recorded, for all machines the TSC is ++recorded (gets a time stamp) as well as the line number and source file ++the call was made from. The address of the (from), the "if" (interrupt ++flag) and the two data items are also recorded. The macro kgdb_ts casts ++the types to int, so you can put any 32-bit values here. There is a ++configure option to select the number of events you want to keep. A ++nice number might be 128, but you can keep up to 1024 if you want. The ++number must be a power of two. An "andthen" macro library is provided ++for gdb to help you look at these events. It is also possible to define ++a different structure for the event storage and cast the data to this ++structure. For example the following structure is defined in kgdb: ++ ++struct kgdb_and_then_struct2 { ++#ifdef CONFIG_SMP ++ int on_cpu; ++#endif ++ long long at_time; ++ int from_ln; ++ char * in_src; ++ void *from; ++ int with_if; ++ struct task_struct *t1; ++ struct task_struct *t2; ++}; ++ ++If you use this for display, the data elements will be displayed as ++pointers to task_struct entries. You may want to define your own ++structure to use in casting. You should only change the last two items ++and you must keep the structure size the same. Kgdb will handle these ++as 32-bit ints, but within that constraint you can define a structure to ++cast to any 32-bit quantity. This need only be available to gdb and is ++only used for casting in the display code. ++ ++Final Items ++=========== ++ ++I picked up this code from Amit S. Kale and enhanced it. 
++ ++If you make some really cool modification to this stuff, or if you ++fix a bug, please let me know. ++ ++George Anzinger ++ ++ ++Amit S. Kale ++ ++ ++(First kgdb by David Grothe ) ++ ++(modified by Tigran Aivazian ) ++ Putting gdbstub into the kernel config menu. ++ ++(modified by Scott Foehner ) ++ Hooks for entering gdbstub at boot time. ++ ++(modified by Amit S. Kale ) ++ Threads, ia-32 hw debugging, mp support, console support, ++ nmi watchdog handling. ++ ++(modified by George Anzinger ) ++ Extended threads to include the idle threads. ++ Enhancements to allow breakpoint() at first C code. ++ Use of module_init() and __setup() to automate the configure. ++ Enhanced the cpu "collection" code to work in early bring-up. ++ Added ability to call functions from gdb ++ Print info thread stuff without going back to schedule() ++ Now collect the "other" cpus with an IPI/ NMI. +diff -puN /dev/null Documentation/i386/kgdb/loadmodule.sh +--- /dev/null 2003-09-15 06:40:47.000000000 -0700 ++++ 25-akpm/Documentation/i386/kgdb/loadmodule.sh 2004-10-21 14:54:15.325593648 -0700 +@@ -0,0 +1,78 @@ ++#/bin/sh ++# This script loads a module on a target machine and generates a gdb script. ++# source generated gdb script to load the module file at appropriate addresses ++# in gdb. ++# ++# Usage: ++# Loading the module on target machine and generating gdb script ++# [foo]$ loadmodule.sh <modulename> ++# ++# Loading the module file into gdb ++# (gdb) source <gdbscriptpath> ++# ++# Modify following variables according to your setup. ++# TESTMACHINE - Name of the target machine ++# GDBSCRIPTS - The directory where a gdb script will be generated ++# ++# Author: Amit S. Kale (akale@veritas.com). ++# ++# If you run into problems, please check files pointed to by following ++# variables. ++# ERRFILE - /tmp/<modulename>.errs contains stderr output of insmod ++# MAPFILE - /tmp/<modulename>.map contains stdout output of insmod ++# GDBSCRIPT - $GDBSCRIPTS/load<modulename> gdb script. 
++ ++TESTMACHINE=foo ++GDBSCRIPTS=/home/bar ++ ++if [ $# -lt 1 ] ; then { ++ echo Usage: $0 modulefile ++ exit ++} ; fi ++ ++MODULEFILE=$1 ++MODULEFILEBASENAME=`basename $1` ++ ++if [ $MODULEFILE = $MODULEFILEBASENAME ] ; then { ++ MODULEFILE=`pwd`/$MODULEFILE ++} fi ++ ++ERRFILE=/tmp/$MODULEFILEBASENAME.errs ++MAPFILE=/tmp/$MODULEFILEBASENAME.map ++GDBSCRIPT=$GDBSCRIPTS/load$MODULEFILEBASENAME ++ ++function findaddr() { ++ local ADDR=0x$(echo "$SEGMENTS" | \ ++ grep "$1" | sed 's/^[^ ]*[ ]*[^ ]*[ ]*//' | \ ++ sed 's/[ ]*[^ ]*$//') ++ echo $ADDR ++} ++ ++function checkerrs() { ++ if [ "`cat $ERRFILE`" != "" ] ; then { ++ cat $ERRFILE ++ exit ++ } fi ++} ++ ++#load the module ++echo Copying $MODULEFILE to $TESTMACHINE ++rcp $MODULEFILE root@${TESTMACHINE}: ++ ++echo Loading module $MODULEFILE ++rsh -l root $TESTMACHINE /sbin/insmod -m ./`basename $MODULEFILE` \ ++ > $MAPFILE 2> $ERRFILE ++checkerrs ++ ++SEGMENTS=`head -n 11 $MAPFILE | tail -n 10` ++TEXTADDR=$(findaddr "\\.text[^.]") ++LOADSTRING="add-symbol-file $MODULEFILE $TEXTADDR" ++SEGADDRS=`echo "$SEGMENTS" | awk '//{ ++ if ($1 != ".text" && $1 != ".this" && ++ $1 != ".kstrtab" && $1 != ".kmodtab") { ++ print " -s " $1 " 0x" $3 " " ++ } ++}'` ++LOADSTRING="$LOADSTRING $SEGADDRS" ++echo Generating script $GDBSCRIPT ++echo $LOADSTRING > $GDBSCRIPT +diff -puN drivers/char/keyboard.c~kgdb-ga drivers/char/keyboard.c +--- 25/drivers/char/keyboard.c~kgdb-ga 2004-10-21 14:54:15.273601552 -0700 ++++ 25-akpm/drivers/char/keyboard.c 2004-10-21 14:54:15.326593496 -0700 +@@ -1081,6 +1081,9 @@ void kbd_keycode(unsigned int keycode, i + } + if (sysrq_down && down && !rep) { + handle_sysrq(kbd_sysrq_xlate[keycode], regs, tty); ++#ifdef CONFIG_KGDB_SYSRQ ++ sysrq_down = 0; /* in case we miss the "up" event */ ++#endif + return; + } + #endif +diff -puN drivers/char/sysrq.c~kgdb-ga drivers/char/sysrq.c +--- 25/drivers/char/sysrq.c~kgdb-ga 2004-10-21 14:54:15.275601248 -0700 ++++ 25-akpm/drivers/char/sysrq.c 2004-10-21 
14:54:15.326593496 -0700 +@@ -35,6 +35,25 @@ + #include + + #include ++#ifdef CONFIG_KGDB_SYSRQ ++ ++#define GDB_OP &kgdb_op ++static void kgdb_sysrq(int key, struct pt_regs *pt_regs, struct tty_struct *tty) ++{ ++ printk("kgdb sysrq\n"); ++ breakpoint(); ++} ++ ++static struct sysrq_key_op kgdb_op = { ++ .handler = kgdb_sysrq, ++ .help_msg = "kGdb|Fgdb", ++ .action_msg = "Debug breakpoint\n", ++}; ++ ++#else ++#define GDB_OP NULL ++#endif ++ + + extern void reset_vc(unsigned int); + +@@ -238,8 +257,8 @@ static struct sysrq_key_op *sysrq_key_ta + /* c */ NULL, + /* d */ NULL, + /* e */ &sysrq_term_op, +-/* f */ NULL, +-/* g */ NULL, ++/* f */ GDB_OP, ++/* g */ GDB_OP, + /* h */ NULL, + /* i */ &sysrq_kill_op, + /* j */ NULL, +diff -puN drivers/serial/8250.c~kgdb-ga drivers/serial/8250.c +--- 25/drivers/serial/8250.c~kgdb-ga 2004-10-21 14:54:15.276601096 -0700 ++++ 25-akpm/drivers/serial/8250.c 2004-10-21 14:54:15.328593192 -0700 +@@ -983,7 +983,7 @@ receive_chars(struct uart_8250_port *up, + if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) { + tty->flip.work.func((void *)tty); + if (tty->flip.count >= TTY_FLIPBUF_SIZE) +- return; // if TTY_DONT_FLIP is set ++ return; /* if TTY_DONT_FLIP is set */ + } + ch = serial_inp(up, UART_RX); + *tty->flip.char_buf_ptr = ch; +@@ -1348,12 +1348,21 @@ static void serial8250_break_ctl(struct + spin_unlock_irqrestore(&up->port.lock, flags); + } + ++#ifdef CONFIG_KGDB ++static int kgdb_irq = -1; ++#endif ++ + static int serial8250_startup(struct uart_port *port) + { + struct uart_8250_port *up = (struct uart_8250_port *)port; + unsigned long flags; + int retval; + ++#ifdef CONFIG_KGDB ++ if (up->port.irq == kgdb_irq) ++ return -EBUSY; ++#endif ++ + up->capabilities = uart_config[up->port.type].flags; + up->mcr = 0; + +@@ -1990,6 +1999,10 @@ serial8250_register_ports(struct uart_dr + for (i = 0; i < UART_NR; i++) { + struct uart_8250_port *up = &serial8250_ports[i]; + ++#ifdef CONFIG_KGDB ++ if (up->port.irq == kgdb_irq) ++ 
up->port.kgdb = 1; ++#endif + up->port.line = i; + up->port.ops = &serial8250_pops; + up->port.dev = dev; +@@ -2376,6 +2389,31 @@ void serial8250_unregister_port(int line + } + EXPORT_SYMBOL(serial8250_unregister_port); + ++#ifdef CONFIG_KGDB ++/* ++ * Find all the ports using the given irq and shut them down. ++ * Result should be that the irq will be released. ++ */ ++void shutdown_for_kgdb(struct async_struct * info) ++{ ++ int irq = info->state->irq; ++ struct uart_8250_port *up; ++ int ttyS; ++ ++ kgdb_irq = irq; /* save for later init */ ++ for (ttyS = 0; ttyS < UART_NR; ttyS++){ ++ up = &serial8250_ports[ttyS]; ++ if (up->port.irq == irq && (irq_lists + irq)->head) { ++#ifdef CONFIG_DEBUG_SPINLOCK /* ugly business... */ ++ if(up->port.lock.magic != SPINLOCK_MAGIC) ++ spin_lock_init(&up->port.lock); ++#endif ++ serial8250_shutdown(&up->port); ++ } ++ } ++} ++#endif /* CONFIG_KGDB */ ++ + static int __init serial8250_init(void) + { + int ret, i; +diff -puN drivers/serial/serial_core.c~kgdb-ga drivers/serial/serial_core.c +--- 25/drivers/serial/serial_core.c~kgdb-ga 2004-10-21 14:54:15.278600792 -0700 ++++ 25-akpm/drivers/serial/serial_core.c 2004-10-21 14:54:15.330592888 -0700 +@@ -1976,6 +1976,11 @@ uart_configure_port(struct uart_driver * + { + unsigned int flags; + ++#ifdef CONFIG_KGDB ++ if (port->kgdb) ++ return; ++#endif ++ + /* + * If there isn't a port here, don't do anything further. + */ +diff -puN include/asm-i386/bugs.h~kgdb-ga include/asm-i386/bugs.h +--- 25/include/asm-i386/bugs.h~kgdb-ga 2004-10-21 14:54:15.279600640 -0700 ++++ 25-akpm/include/asm-i386/bugs.h 2004-10-21 14:54:15.331592736 -0700 +@@ -1,11 +1,11 @@ + /* + * include/asm-i386/bugs.h + * +- * Copyright (C) 1994 Linus Torvalds ++ * Copyright (C) 1994 Linus Torvalds + * + * Cyrix stuff, June 1998 by: + * - Rafael R. Reilova (moved everything from head.S), +- * ++ * + * - Channing Corn (tests & fixes), + * - Andrew D. Balsa (code cleanup). 
+ * +@@ -25,7 +25,20 @@ + #include + #include + #include +- ++#ifdef CONFIG_KGDB ++/* ++ * Provide the command line "gdb" initial break ++ */ ++int __init kgdb_initial_break(char * str) ++{ ++ if (*str == '\0'){ ++ breakpoint(); ++ return 1; ++ } ++ return 0; ++} ++__setup("gdb",kgdb_initial_break); ++#endif + static int __init no_halt(char *s) + { + boot_cpu_data.hlt_works_ok = 0; +@@ -140,7 +153,7 @@ static void __init check_popad(void) + : "ecx", "edi" ); + /* If this fails, it means that any user program may lock the CPU hard. Too bad. */ + if (res != 12345678) printk( "Buggy.\n" ); +- else printk( "OK.\n" ); ++ else printk( "OK.\n" ); + #endif + } + +diff -puN /dev/null include/asm-i386/kgdb.h +--- /dev/null 2003-09-15 06:40:47.000000000 -0700 ++++ 25-akpm/include/asm-i386/kgdb.h 2004-10-21 14:54:15.331592736 -0700 +@@ -0,0 +1,59 @@ ++#ifndef __KGDB ++#define __KGDB ++ ++/* ++ * This file should not include ANY others. This makes it usable ++ * most anywhere without the fear of include order or inclusion. ++ * Make it so! ++ * ++ * This file may be included all the time. It is only active if ++ * CONFIG_KGDB is defined, otherwise it stubs out all the macros ++ * and entry points. ++ */ ++#if defined(CONFIG_KGDB) && !defined(__ASSEMBLY__) ++ ++extern void breakpoint(void); ++#define INIT_KGDB_INTS kgdb_enable_ints() ++ ++#ifndef BREAKPOINT ++#define BREAKPOINT asm(" int $3") ++#endif ++/* ++ * GDB debug stub (or any debug stub) can point the 'linux_debug_hook' ++ * pointer to its routine and it will be entered as the first thing ++ * when a trap occurs. ++ * ++ * Return values are, at present, undefined. ++ * ++ * The debug hook routine does not necessarily return to its caller. ++ * It has the register image and thus may choose to resume execution ++ * anywhere it pleases.
++ */ ++struct pt_regs; ++ ++extern int kgdb_handle_exception(int trapno, ++ int signo, int err_code, struct pt_regs *regs); ++extern int in_kgdb(struct pt_regs *regs); ++ ++#ifdef CONFIG_KGDB_TS ++void kgdb_tstamp(int line, char *source, int data0, int data1); ++/* ++ * This is the time stamp function. The macro adds the source info and ++ * does a cast on the data to allow most any 32-bit value. ++ */ ++ ++#define kgdb_ts(data0,data1) kgdb_tstamp(__LINE__,__FILE__,(int)data0,(int)data1) ++#else ++#define kgdb_ts(data0,data1) ++#endif ++#else /* CONFIG_KGDB && ! __ASSEMBLY__ ,stubs follow... */ ++#ifndef BREAKPOINT ++#define BREAKPOINT ++#endif ++#define kgdb_ts(data0,data1) ++#define in_kgdb ++#define kgdb_handle_exception ++#define breakpoint ++#define INIT_KGDB_INTS ++#endif ++#endif /* __KGDB */ +diff -puN /dev/null include/asm-i386/kgdb_local.h +--- /dev/null 2003-09-15 06:40:47.000000000 -0700 ++++ 25-akpm/include/asm-i386/kgdb_local.h 2004-10-21 14:54:15.332592584 -0700 +@@ -0,0 +1,102 @@ ++#ifndef __KGDB_LOCAL ++#define ___KGDB_LOCAL ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define PORT 0x3f8 ++#ifdef CONFIG_KGDB_PORT ++#undef PORT ++#define PORT CONFIG_KGDB_PORT ++#endif ++#define IRQ 4 ++#ifdef CONFIG_KGDB_IRQ ++#undef IRQ ++#define IRQ CONFIG_KGDB_IRQ ++#endif ++#define SB_CLOCK 1843200 ++#define SB_BASE (SB_CLOCK/16) ++#define SB_BAUD9600 SB_BASE/9600 ++#define SB_BAUD192 SB_BASE/19200 ++#define SB_BAUD384 SB_BASE/38400 ++#define SB_BAUD576 SB_BASE/57600 ++#define SB_BAUD1152 SB_BASE/115200 ++#ifdef CONFIG_KGDB_9600BAUD ++#define SB_BAUD SB_BAUD9600 ++#endif ++#ifdef CONFIG_KGDB_19200BAUD ++#define SB_BAUD SB_BAUD192 ++#endif ++#ifdef CONFIG_KGDB_38400BAUD ++#define SB_BAUD SB_BAUD384 ++#endif ++#ifdef CONFIG_KGDB_57600BAUD ++#define SB_BAUD SB_BAUD576 ++#endif ++#ifdef CONFIG_KGDB_115200BAUD ++#define SB_BAUD SB_BAUD1152 ++#endif ++#ifndef SB_BAUD ++#define SB_BAUD SB_BAUD1152 /* Start with this if 
not given */ ++#endif ++ ++#ifndef CONFIG_X86_TSC ++#undef rdtsc ++#define rdtsc(a,b) if (a++ > 10000){a = 0; b++;} ++#undef rdtscll ++#define rdtscll(s) s++ ++#endif ++ ++#ifdef _raw_read_unlock /* must use a name that is "define"ed, not an inline */ ++#undef spin_lock ++#undef spin_trylock ++#undef spin_unlock ++#define spin_lock _raw_spin_lock ++#define spin_trylock _raw_spin_trylock ++#define spin_unlock _raw_spin_unlock ++#else ++#endif ++#undef spin_unlock_wait ++#define spin_unlock_wait(x) do { cpu_relax(); barrier();} \ ++ while(spin_is_locked(x)) ++ ++#define SB_IER 1 ++#define SB_MCR UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS ++ ++#define FLAGS 0 ++#define SB_STATE { \ ++ magic: SSTATE_MAGIC, \ ++ baud_base: SB_BASE, \ ++ port: PORT, \ ++ irq: IRQ, \ ++ flags: FLAGS, \ ++ custom_divisor:SB_BAUD} ++#define SB_INFO { \ ++ magic: SERIAL_MAGIC, \ ++ port: PORT,0,FLAGS, \ ++ state: &state, \ ++ tty: (struct tty_struct *)&state, \ ++ IER: SB_IER, \ ++ MCR: SB_MCR} ++extern void putDebugChar(int); ++/* RTAI support needs us to really stop/start interrupts */ ++ ++#define kgdb_sti() __asm__ __volatile__("sti": : :"memory") ++#define kgdb_cli() __asm__ __volatile__("cli": : :"memory") ++#define kgdb_local_save_flags(x) __asm__ __volatile__(\ ++ "pushfl ; popl %0":"=g" (x): /* no input */) ++#define kgdb_local_irq_restore(x) __asm__ __volatile__(\ ++ "pushl %0 ; popfl": \ ++ /* no output */ :"g" (x):"memory", "cc") ++#define kgdb_local_irq_save(x) kgdb_local_save_flags(x); kgdb_cli() ++ ++#ifdef CONFIG_SERIAL ++extern void shutdown_for_kgdb(struct async_struct *info); ++#endif ++#define INIT_KDEBUG putDebugChar("+"); ++#endif /* __KGDB_LOCAL */ +diff -puN include/linux/config.h~kgdb-ga include/linux/config.h +--- 25/include/linux/config.h~kgdb-ga 2004-10-21 14:54:15.281600336 -0700 ++++ 25-akpm/include/linux/config.h 2004-10-21 14:54:15.332592584 -0700 +@@ -2,6 +2,9 @@ + #define _LINUX_CONFIG_H + + #include ++#if defined(__i386__) && !defined(IN_BOOTLOADER) 
++#include ++#endif + #if !defined (__KERNEL__) && !defined(__KERNGLUE__) + #error including kernel header in userspace; use the glibc headers instead! + #endif +diff -puN /dev/null include/linux/dwarf2.h +--- /dev/null 2003-09-15 06:40:47.000000000 -0700 ++++ 25-akpm/include/linux/dwarf2.h 2004-10-21 14:54:15.336591976 -0700 +@@ -0,0 +1,738 @@ ++/* Declarations and definitions of codes relating to the DWARF2 symbolic ++ debugging information format. ++ Copyright (C) 1992, 1993, 1995, 1996, 1997, 1999, 2000, 2001, 2002 ++ Free Software Foundation, Inc. ++ ++ Written by Gary Funck (gary@intrepid.com) The Ada Joint Program ++ Office (AJPO), Florida State Unviversity and Silicon Graphics Inc. ++ provided support for this effort -- June 21, 1995. ++ ++ Derived from the DWARF 1 implementation written by Ron Guilmette ++ (rfg@netcom.com), November 1990. ++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify it under ++ the terms of the GNU General Public License as published by the Free ++ Software Foundation; either version 2, or (at your option) any later ++ version. ++ ++ GCC is distributed in the hope that it will be useful, but WITHOUT ++ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public ++ License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING. If not, write to the Free ++ Software Foundation, 59 Temple Place - Suite 330, Boston, MA ++ 02111-1307, USA. */ ++ ++/* This file is derived from the DWARF specification (a public document) ++ Revision 2.0.0 (July 27, 1993) developed by the UNIX International ++ Programming Languages Special Interest Group (UI/PLSIG) and distributed ++ by UNIX International. Copies of this specification are available from ++ UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054. 
++ ++ This file also now contains definitions from the DWARF 3 specification. */ ++ ++/* This file is shared between GCC and GDB, and should not contain ++ prototypes. */ ++ ++#ifndef _ELF_DWARF2_H ++#define _ELF_DWARF2_H ++ ++/* Structure found in the .debug_line section. */ ++#ifndef __ASSEMBLY__ ++typedef struct ++{ ++ unsigned char li_length [4]; ++ unsigned char li_version [2]; ++ unsigned char li_prologue_length [4]; ++ unsigned char li_min_insn_length [1]; ++ unsigned char li_default_is_stmt [1]; ++ unsigned char li_line_base [1]; ++ unsigned char li_line_range [1]; ++ unsigned char li_opcode_base [1]; ++} ++DWARF2_External_LineInfo; ++ ++typedef struct ++{ ++ unsigned long li_length; ++ unsigned short li_version; ++ unsigned int li_prologue_length; ++ unsigned char li_min_insn_length; ++ unsigned char li_default_is_stmt; ++ int li_line_base; ++ unsigned char li_line_range; ++ unsigned char li_opcode_base; ++} ++DWARF2_Internal_LineInfo; ++ ++/* Structure found in .debug_pubnames section. */ ++typedef struct ++{ ++ unsigned char pn_length [4]; ++ unsigned char pn_version [2]; ++ unsigned char pn_offset [4]; ++ unsigned char pn_size [4]; ++} ++DWARF2_External_PubNames; ++ ++typedef struct ++{ ++ unsigned long pn_length; ++ unsigned short pn_version; ++ unsigned long pn_offset; ++ unsigned long pn_size; ++} ++DWARF2_Internal_PubNames; ++ ++/* Structure found in .debug_info section. 
*/ ++typedef struct ++{ ++ unsigned char cu_length [4]; ++ unsigned char cu_version [2]; ++ unsigned char cu_abbrev_offset [4]; ++ unsigned char cu_pointer_size [1]; ++} ++DWARF2_External_CompUnit; ++ ++typedef struct ++{ ++ unsigned long cu_length; ++ unsigned short cu_version; ++ unsigned long cu_abbrev_offset; ++ unsigned char cu_pointer_size; ++} ++DWARF2_Internal_CompUnit; ++ ++typedef struct ++{ ++ unsigned char ar_length [4]; ++ unsigned char ar_version [2]; ++ unsigned char ar_info_offset [4]; ++ unsigned char ar_pointer_size [1]; ++ unsigned char ar_segment_size [1]; ++} ++DWARF2_External_ARange; ++ ++typedef struct ++{ ++ unsigned long ar_length; ++ unsigned short ar_version; ++ unsigned long ar_info_offset; ++ unsigned char ar_pointer_size; ++ unsigned char ar_segment_size; ++} ++DWARF2_Internal_ARange; ++ ++#define ENUM(name) enum name { ++#define IF_NOT_ASM(a) a ++#define COMMA , ++#else ++#define ENUM(name) ++#define IF_NOT_ASM(a) ++#define COMMA ++ ++#endif ++ ++/* Tag names and codes. 
*/ ++ENUM(dwarf_tag) ++ ++ DW_TAG_padding = 0x00 COMMA ++ DW_TAG_array_type = 0x01 COMMA ++ DW_TAG_class_type = 0x02 COMMA ++ DW_TAG_entry_point = 0x03 COMMA ++ DW_TAG_enumeration_type = 0x04 COMMA ++ DW_TAG_formal_parameter = 0x05 COMMA ++ DW_TAG_imported_declaration = 0x08 COMMA ++ DW_TAG_label = 0x0a COMMA ++ DW_TAG_lexical_block = 0x0b COMMA ++ DW_TAG_member = 0x0d COMMA ++ DW_TAG_pointer_type = 0x0f COMMA ++ DW_TAG_reference_type = 0x10 COMMA ++ DW_TAG_compile_unit = 0x11 COMMA ++ DW_TAG_string_type = 0x12 COMMA ++ DW_TAG_structure_type = 0x13 COMMA ++ DW_TAG_subroutine_type = 0x15 COMMA ++ DW_TAG_typedef = 0x16 COMMA ++ DW_TAG_union_type = 0x17 COMMA ++ DW_TAG_unspecified_parameters = 0x18 COMMA ++ DW_TAG_variant = 0x19 COMMA ++ DW_TAG_common_block = 0x1a COMMA ++ DW_TAG_common_inclusion = 0x1b COMMA ++ DW_TAG_inheritance = 0x1c COMMA ++ DW_TAG_inlined_subroutine = 0x1d COMMA ++ DW_TAG_module = 0x1e COMMA ++ DW_TAG_ptr_to_member_type = 0x1f COMMA ++ DW_TAG_set_type = 0x20 COMMA ++ DW_TAG_subrange_type = 0x21 COMMA ++ DW_TAG_with_stmt = 0x22 COMMA ++ DW_TAG_access_declaration = 0x23 COMMA ++ DW_TAG_base_type = 0x24 COMMA ++ DW_TAG_catch_block = 0x25 COMMA ++ DW_TAG_const_type = 0x26 COMMA ++ DW_TAG_constant = 0x27 COMMA ++ DW_TAG_enumerator = 0x28 COMMA ++ DW_TAG_file_type = 0x29 COMMA ++ DW_TAG_friend = 0x2a COMMA ++ DW_TAG_namelist = 0x2b COMMA ++ DW_TAG_namelist_item = 0x2c COMMA ++ DW_TAG_packed_type = 0x2d COMMA ++ DW_TAG_subprogram = 0x2e COMMA ++ DW_TAG_template_type_param = 0x2f COMMA ++ DW_TAG_template_value_param = 0x30 COMMA ++ DW_TAG_thrown_type = 0x31 COMMA ++ DW_TAG_try_block = 0x32 COMMA ++ DW_TAG_variant_part = 0x33 COMMA ++ DW_TAG_variable = 0x34 COMMA ++ DW_TAG_volatile_type = 0x35 COMMA ++ /* DWARF 3. 
*/ ++ DW_TAG_dwarf_procedure = 0x36 COMMA ++ DW_TAG_restrict_type = 0x37 COMMA ++ DW_TAG_interface_type = 0x38 COMMA ++ DW_TAG_namespace = 0x39 COMMA ++ DW_TAG_imported_module = 0x3a COMMA ++ DW_TAG_unspecified_type = 0x3b COMMA ++ DW_TAG_partial_unit = 0x3c COMMA ++ DW_TAG_imported_unit = 0x3d COMMA ++ /* SGI/MIPS Extensions. */ ++ DW_TAG_MIPS_loop = 0x4081 COMMA ++ /* GNU extensions. */ ++ DW_TAG_format_label = 0x4101 COMMA /* For FORTRAN 77 and Fortran 90. */ ++ DW_TAG_function_template = 0x4102 COMMA /* For C++. */ ++ DW_TAG_class_template = 0x4103 COMMA /* For C++. */ ++ DW_TAG_GNU_BINCL = 0x4104 COMMA ++ DW_TAG_GNU_EINCL = 0x4105 COMMA ++ /* Extensions for UPC. See: http://upc.gwu.edu/~upc. */ ++ DW_TAG_upc_shared_type = 0x8765 COMMA ++ DW_TAG_upc_strict_type = 0x8766 COMMA ++ DW_TAG_upc_relaxed_type = 0x8767 ++IF_NOT_ASM(};) ++ ++#define DW_TAG_lo_user 0x4080 ++#define DW_TAG_hi_user 0xffff ++ ++/* Flag that tells whether entry has a child or not. */ ++#define DW_children_no 0 ++#define DW_children_yes 1 ++ ++/* Form names and codes. */ ++ENUM(dwarf_form) ++ ++ DW_FORM_addr = 0x01 COMMA ++ DW_FORM_block2 = 0x03 COMMA ++ DW_FORM_block4 = 0x04 COMMA ++ DW_FORM_data2 = 0x05 COMMA ++ DW_FORM_data4 = 0x06 COMMA ++ DW_FORM_data8 = 0x07 COMMA ++ DW_FORM_string = 0x08 COMMA ++ DW_FORM_block = 0x09 COMMA ++ DW_FORM_block1 = 0x0a COMMA ++ DW_FORM_data1 = 0x0b COMMA ++ DW_FORM_flag = 0x0c COMMA ++ DW_FORM_sdata = 0x0d COMMA ++ DW_FORM_strp = 0x0e COMMA ++ DW_FORM_udata = 0x0f COMMA ++ DW_FORM_ref_addr = 0x10 COMMA ++ DW_FORM_ref1 = 0x11 COMMA ++ DW_FORM_ref2 = 0x12 COMMA ++ DW_FORM_ref4 = 0x13 COMMA ++ DW_FORM_ref8 = 0x14 COMMA ++ DW_FORM_ref_udata = 0x15 COMMA ++ DW_FORM_indirect = 0x16 ++IF_NOT_ASM(};) ++ ++/* Attribute names and codes. 
*/ ++ ++ENUM(dwarf_attribute) ++ ++ DW_AT_sibling = 0x01 COMMA ++ DW_AT_location = 0x02 COMMA ++ DW_AT_name = 0x03 COMMA ++ DW_AT_ordering = 0x09 COMMA ++ DW_AT_subscr_data = 0x0a COMMA ++ DW_AT_byte_size = 0x0b COMMA ++ DW_AT_bit_offset = 0x0c COMMA ++ DW_AT_bit_size = 0x0d COMMA ++ DW_AT_element_list = 0x0f COMMA ++ DW_AT_stmt_list = 0x10 COMMA ++ DW_AT_low_pc = 0x11 COMMA ++ DW_AT_high_pc = 0x12 COMMA ++ DW_AT_language = 0x13 COMMA ++ DW_AT_member = 0x14 COMMA ++ DW_AT_discr = 0x15 COMMA ++ DW_AT_discr_value = 0x16 COMMA ++ DW_AT_visibility = 0x17 COMMA ++ DW_AT_import = 0x18 COMMA ++ DW_AT_string_length = 0x19 COMMA ++ DW_AT_common_reference = 0x1a COMMA ++ DW_AT_comp_dir = 0x1b COMMA ++ DW_AT_const_value = 0x1c COMMA ++ DW_AT_containing_type = 0x1d COMMA ++ DW_AT_default_value = 0x1e COMMA ++ DW_AT_inline = 0x20 COMMA ++ DW_AT_is_optional = 0x21 COMMA ++ DW_AT_lower_bound = 0x22 COMMA ++ DW_AT_producer = 0x25 COMMA ++ DW_AT_prototyped = 0x27 COMMA ++ DW_AT_return_addr = 0x2a COMMA ++ DW_AT_start_scope = 0x2c COMMA ++ DW_AT_stride_size = 0x2e COMMA ++ DW_AT_upper_bound = 0x2f COMMA ++ DW_AT_abstract_origin = 0x31 COMMA ++ DW_AT_accessibility = 0x32 COMMA ++ DW_AT_address_class = 0x33 COMMA ++ DW_AT_artificial = 0x34 COMMA ++ DW_AT_base_types = 0x35 COMMA ++ DW_AT_calling_convention = 0x36 COMMA ++ DW_AT_count = 0x37 COMMA ++ DW_AT_data_member_location = 0x38 COMMA ++ DW_AT_decl_column = 0x39 COMMA ++ DW_AT_decl_file = 0x3a COMMA ++ DW_AT_decl_line = 0x3b COMMA ++ DW_AT_declaration = 0x3c COMMA ++ DW_AT_discr_list = 0x3d COMMA ++ DW_AT_encoding = 0x3e COMMA ++ DW_AT_external = 0x3f COMMA ++ DW_AT_frame_base = 0x40 COMMA ++ DW_AT_friend = 0x41 COMMA ++ DW_AT_identifier_case = 0x42 COMMA ++ DW_AT_macro_info = 0x43 COMMA ++ DW_AT_namelist_items = 0x44 COMMA ++ DW_AT_priority = 0x45 COMMA ++ DW_AT_segment = 0x46 COMMA ++ DW_AT_specification = 0x47 COMMA ++ DW_AT_static_link = 0x48 COMMA ++ DW_AT_type = 0x49 COMMA ++ DW_AT_use_location = 0x4a COMMA ++ 
DW_AT_variable_parameter = 0x4b COMMA ++ DW_AT_virtuality = 0x4c COMMA ++ DW_AT_vtable_elem_location = 0x4d COMMA ++ /* DWARF 3 values. */ ++ DW_AT_allocated = 0x4e COMMA ++ DW_AT_associated = 0x4f COMMA ++ DW_AT_data_location = 0x50 COMMA ++ DW_AT_stride = 0x51 COMMA ++ DW_AT_entry_pc = 0x52 COMMA ++ DW_AT_use_UTF8 = 0x53 COMMA ++ DW_AT_extension = 0x54 COMMA ++ DW_AT_ranges = 0x55 COMMA ++ DW_AT_trampoline = 0x56 COMMA ++ DW_AT_call_column = 0x57 COMMA ++ DW_AT_call_file = 0x58 COMMA ++ DW_AT_call_line = 0x59 COMMA ++ /* SGI/MIPS extensions. */ ++ DW_AT_MIPS_fde = 0x2001 COMMA ++ DW_AT_MIPS_loop_begin = 0x2002 COMMA ++ DW_AT_MIPS_tail_loop_begin = 0x2003 COMMA ++ DW_AT_MIPS_epilog_begin = 0x2004 COMMA ++ DW_AT_MIPS_loop_unroll_factor = 0x2005 COMMA ++ DW_AT_MIPS_software_pipeline_depth = 0x2006 COMMA ++ DW_AT_MIPS_linkage_name = 0x2007 COMMA ++ DW_AT_MIPS_stride = 0x2008 COMMA ++ DW_AT_MIPS_abstract_name = 0x2009 COMMA ++ DW_AT_MIPS_clone_origin = 0x200a COMMA ++ DW_AT_MIPS_has_inlines = 0x200b COMMA ++ /* GNU extensions. */ ++ DW_AT_sf_names = 0x2101 COMMA ++ DW_AT_src_info = 0x2102 COMMA ++ DW_AT_mac_info = 0x2103 COMMA ++ DW_AT_src_coords = 0x2104 COMMA ++ DW_AT_body_begin = 0x2105 COMMA ++ DW_AT_body_end = 0x2106 COMMA ++ DW_AT_GNU_vector = 0x2107 COMMA ++ /* VMS extensions. */ ++ DW_AT_VMS_rtnbeg_pd_address = 0x2201 COMMA ++ /* UPC extension. */ ++ DW_AT_upc_threads_scaled = 0x3210 ++IF_NOT_ASM(};) ++ ++#define DW_AT_lo_user 0x2000 /* Implementation-defined range start. */ ++#define DW_AT_hi_user 0x3ff0 /* Implementation-defined range end. */ ++ ++/* Location atom names and codes. 
*/ ++ENUM(dwarf_location_atom) ++ ++ DW_OP_addr = 0x03 COMMA ++ DW_OP_deref = 0x06 COMMA ++ DW_OP_const1u = 0x08 COMMA ++ DW_OP_const1s = 0x09 COMMA ++ DW_OP_const2u = 0x0a COMMA ++ DW_OP_const2s = 0x0b COMMA ++ DW_OP_const4u = 0x0c COMMA ++ DW_OP_const4s = 0x0d COMMA ++ DW_OP_const8u = 0x0e COMMA ++ DW_OP_const8s = 0x0f COMMA ++ DW_OP_constu = 0x10 COMMA ++ DW_OP_consts = 0x11 COMMA ++ DW_OP_dup = 0x12 COMMA ++ DW_OP_drop = 0x13 COMMA ++ DW_OP_over = 0x14 COMMA ++ DW_OP_pick = 0x15 COMMA ++ DW_OP_swap = 0x16 COMMA ++ DW_OP_rot = 0x17 COMMA ++ DW_OP_xderef = 0x18 COMMA ++ DW_OP_abs = 0x19 COMMA ++ DW_OP_and = 0x1a COMMA ++ DW_OP_div = 0x1b COMMA ++ DW_OP_minus = 0x1c COMMA ++ DW_OP_mod = 0x1d COMMA ++ DW_OP_mul = 0x1e COMMA ++ DW_OP_neg = 0x1f COMMA ++ DW_OP_not = 0x20 COMMA ++ DW_OP_or = 0x21 COMMA ++ DW_OP_plus = 0x22 COMMA ++ DW_OP_plus_uconst = 0x23 COMMA ++ DW_OP_shl = 0x24 COMMA ++ DW_OP_shr = 0x25 COMMA ++ DW_OP_shra = 0x26 COMMA ++ DW_OP_xor = 0x27 COMMA ++ DW_OP_bra = 0x28 COMMA ++ DW_OP_eq = 0x29 COMMA ++ DW_OP_ge = 0x2a COMMA ++ DW_OP_gt = 0x2b COMMA ++ DW_OP_le = 0x2c COMMA ++ DW_OP_lt = 0x2d COMMA ++ DW_OP_ne = 0x2e COMMA ++ DW_OP_skip = 0x2f COMMA ++ DW_OP_lit0 = 0x30 COMMA ++ DW_OP_lit1 = 0x31 COMMA ++ DW_OP_lit2 = 0x32 COMMA ++ DW_OP_lit3 = 0x33 COMMA ++ DW_OP_lit4 = 0x34 COMMA ++ DW_OP_lit5 = 0x35 COMMA ++ DW_OP_lit6 = 0x36 COMMA ++ DW_OP_lit7 = 0x37 COMMA ++ DW_OP_lit8 = 0x38 COMMA ++ DW_OP_lit9 = 0x39 COMMA ++ DW_OP_lit10 = 0x3a COMMA ++ DW_OP_lit11 = 0x3b COMMA ++ DW_OP_lit12 = 0x3c COMMA ++ DW_OP_lit13 = 0x3d COMMA ++ DW_OP_lit14 = 0x3e COMMA ++ DW_OP_lit15 = 0x3f COMMA ++ DW_OP_lit16 = 0x40 COMMA ++ DW_OP_lit17 = 0x41 COMMA ++ DW_OP_lit18 = 0x42 COMMA ++ DW_OP_lit19 = 0x43 COMMA ++ DW_OP_lit20 = 0x44 COMMA ++ DW_OP_lit21 = 0x45 COMMA ++ DW_OP_lit22 = 0x46 COMMA ++ DW_OP_lit23 = 0x47 COMMA ++ DW_OP_lit24 = 0x48 COMMA ++ DW_OP_lit25 = 0x49 COMMA ++ DW_OP_lit26 = 0x4a COMMA ++ DW_OP_lit27 = 0x4b COMMA ++ DW_OP_lit28 = 0x4c COMMA ++ DW_OP_lit29 = 
0x4d COMMA ++ DW_OP_lit30 = 0x4e COMMA ++ DW_OP_lit31 = 0x4f COMMA ++ DW_OP_reg0 = 0x50 COMMA ++ DW_OP_reg1 = 0x51 COMMA ++ DW_OP_reg2 = 0x52 COMMA ++ DW_OP_reg3 = 0x53 COMMA ++ DW_OP_reg4 = 0x54 COMMA ++ DW_OP_reg5 = 0x55 COMMA ++ DW_OP_reg6 = 0x56 COMMA ++ DW_OP_reg7 = 0x57 COMMA ++ DW_OP_reg8 = 0x58 COMMA ++ DW_OP_reg9 = 0x59 COMMA ++ DW_OP_reg10 = 0x5a COMMA ++ DW_OP_reg11 = 0x5b COMMA ++ DW_OP_reg12 = 0x5c COMMA ++ DW_OP_reg13 = 0x5d COMMA ++ DW_OP_reg14 = 0x5e COMMA ++ DW_OP_reg15 = 0x5f COMMA ++ DW_OP_reg16 = 0x60 COMMA ++ DW_OP_reg17 = 0x61 COMMA ++ DW_OP_reg18 = 0x62 COMMA ++ DW_OP_reg19 = 0x63 COMMA ++ DW_OP_reg20 = 0x64 COMMA ++ DW_OP_reg21 = 0x65 COMMA ++ DW_OP_reg22 = 0x66 COMMA ++ DW_OP_reg23 = 0x67 COMMA ++ DW_OP_reg24 = 0x68 COMMA ++ DW_OP_reg25 = 0x69 COMMA ++ DW_OP_reg26 = 0x6a COMMA ++ DW_OP_reg27 = 0x6b COMMA ++ DW_OP_reg28 = 0x6c COMMA ++ DW_OP_reg29 = 0x6d COMMA ++ DW_OP_reg30 = 0x6e COMMA ++ DW_OP_reg31 = 0x6f COMMA ++ DW_OP_breg0 = 0x70 COMMA ++ DW_OP_breg1 = 0x71 COMMA ++ DW_OP_breg2 = 0x72 COMMA ++ DW_OP_breg3 = 0x73 COMMA ++ DW_OP_breg4 = 0x74 COMMA ++ DW_OP_breg5 = 0x75 COMMA ++ DW_OP_breg6 = 0x76 COMMA ++ DW_OP_breg7 = 0x77 COMMA ++ DW_OP_breg8 = 0x78 COMMA ++ DW_OP_breg9 = 0x79 COMMA ++ DW_OP_breg10 = 0x7a COMMA ++ DW_OP_breg11 = 0x7b COMMA ++ DW_OP_breg12 = 0x7c COMMA ++ DW_OP_breg13 = 0x7d COMMA ++ DW_OP_breg14 = 0x7e COMMA ++ DW_OP_breg15 = 0x7f COMMA ++ DW_OP_breg16 = 0x80 COMMA ++ DW_OP_breg17 = 0x81 COMMA ++ DW_OP_breg18 = 0x82 COMMA ++ DW_OP_breg19 = 0x83 COMMA ++ DW_OP_breg20 = 0x84 COMMA ++ DW_OP_breg21 = 0x85 COMMA ++ DW_OP_breg22 = 0x86 COMMA ++ DW_OP_breg23 = 0x87 COMMA ++ DW_OP_breg24 = 0x88 COMMA ++ DW_OP_breg25 = 0x89 COMMA ++ DW_OP_breg26 = 0x8a COMMA ++ DW_OP_breg27 = 0x8b COMMA ++ DW_OP_breg28 = 0x8c COMMA ++ DW_OP_breg29 = 0x8d COMMA ++ DW_OP_breg30 = 0x8e COMMA ++ DW_OP_breg31 = 0x8f COMMA ++ DW_OP_regx = 0x90 COMMA ++ DW_OP_fbreg = 0x91 COMMA ++ DW_OP_bregx = 0x92 COMMA ++ DW_OP_piece = 0x93 COMMA ++ 
DW_OP_deref_size = 0x94 COMMA ++ DW_OP_xderef_size = 0x95 COMMA ++ DW_OP_nop = 0x96 COMMA ++ /* DWARF 3 extensions. */ ++ DW_OP_push_object_address = 0x97 COMMA ++ DW_OP_call2 = 0x98 COMMA ++ DW_OP_call4 = 0x99 COMMA ++ DW_OP_call_ref = 0x9a COMMA ++ /* GNU extensions. */ ++ DW_OP_GNU_push_tls_address = 0xe0 ++IF_NOT_ASM(};) ++ ++#define DW_OP_lo_user 0xe0 /* Implementation-defined range start. */ ++#define DW_OP_hi_user 0xff /* Implementation-defined range end. */ ++ ++/* Type encodings. */ ++ENUM(dwarf_type) ++ ++ DW_ATE_void = 0x0 COMMA ++ DW_ATE_address = 0x1 COMMA ++ DW_ATE_boolean = 0x2 COMMA ++ DW_ATE_complex_float = 0x3 COMMA ++ DW_ATE_float = 0x4 COMMA ++ DW_ATE_signed = 0x5 COMMA ++ DW_ATE_signed_char = 0x6 COMMA ++ DW_ATE_unsigned = 0x7 COMMA ++ DW_ATE_unsigned_char = 0x8 COMMA ++ /* DWARF 3. */ ++ DW_ATE_imaginary_float = 0x9 ++IF_NOT_ASM(};) ++ ++#define DW_ATE_lo_user 0x80 ++#define DW_ATE_hi_user 0xff ++ ++/* Array ordering names and codes. */ ++ENUM(dwarf_array_dim_ordering) ++ ++ DW_ORD_row_major = 0 COMMA ++ DW_ORD_col_major = 1 ++IF_NOT_ASM(};) ++ ++/* Access attribute. */ ++ENUM(dwarf_access_attribute) ++ ++ DW_ACCESS_public = 1 COMMA ++ DW_ACCESS_protected = 2 COMMA ++ DW_ACCESS_private = 3 ++IF_NOT_ASM(};) ++ ++/* Visibility. */ ++ENUM(dwarf_visibility_attribute) ++ ++ DW_VIS_local = 1 COMMA ++ DW_VIS_exported = 2 COMMA ++ DW_VIS_qualified = 3 ++IF_NOT_ASM(};) ++ ++/* Virtuality. */ ++ENUM(dwarf_virtuality_attribute) ++ ++ DW_VIRTUALITY_none = 0 COMMA ++ DW_VIRTUALITY_virtual = 1 COMMA ++ DW_VIRTUALITY_pure_virtual = 2 ++IF_NOT_ASM(};) ++ ++/* Case sensitivity. */ ++ENUM(dwarf_id_case) ++ ++ DW_ID_case_sensitive = 0 COMMA ++ DW_ID_up_case = 1 COMMA ++ DW_ID_down_case = 2 COMMA ++ DW_ID_case_insensitive = 3 ++IF_NOT_ASM(};) ++ ++/* Calling convention. 
*/ ++ENUM(dwarf_calling_convention) ++ ++ DW_CC_normal = 0x1 COMMA ++ DW_CC_program = 0x2 COMMA ++ DW_CC_nocall = 0x3 ++IF_NOT_ASM(};) ++ ++#define DW_CC_lo_user 0x40 ++#define DW_CC_hi_user 0xff ++ ++/* Inline attribute. */ ++ENUM(dwarf_inline_attribute) ++ ++ DW_INL_not_inlined = 0 COMMA ++ DW_INL_inlined = 1 COMMA ++ DW_INL_declared_not_inlined = 2 COMMA ++ DW_INL_declared_inlined = 3 ++IF_NOT_ASM(};) ++ ++/* Discriminant lists. */ ++ENUM(dwarf_discrim_list) ++ ++ DW_DSC_label = 0 COMMA ++ DW_DSC_range = 1 ++IF_NOT_ASM(};) ++ ++/* Line number opcodes. */ ++ENUM(dwarf_line_number_ops) ++ ++ DW_LNS_extended_op = 0 COMMA ++ DW_LNS_copy = 1 COMMA ++ DW_LNS_advance_pc = 2 COMMA ++ DW_LNS_advance_line = 3 COMMA ++ DW_LNS_set_file = 4 COMMA ++ DW_LNS_set_column = 5 COMMA ++ DW_LNS_negate_stmt = 6 COMMA ++ DW_LNS_set_basic_block = 7 COMMA ++ DW_LNS_const_add_pc = 8 COMMA ++ DW_LNS_fixed_advance_pc = 9 COMMA ++ /* DWARF 3. */ ++ DW_LNS_set_prologue_end = 10 COMMA ++ DW_LNS_set_epilogue_begin = 11 COMMA ++ DW_LNS_set_isa = 12 ++IF_NOT_ASM(};) ++ ++/* Line number extended opcodes. */ ++ENUM(dwarf_line_number_x_ops) ++ ++ DW_LNE_end_sequence = 1 COMMA ++ DW_LNE_set_address = 2 COMMA ++ DW_LNE_define_file = 3 ++IF_NOT_ASM(};) ++ ++/* Call frame information. */ ++ENUM(dwarf_call_frame_info) ++ ++ DW_CFA_advance_loc = 0x40 COMMA ++ DW_CFA_offset = 0x80 COMMA ++ DW_CFA_restore = 0xc0 COMMA ++ DW_CFA_nop = 0x00 COMMA ++ DW_CFA_set_loc = 0x01 COMMA ++ DW_CFA_advance_loc1 = 0x02 COMMA ++ DW_CFA_advance_loc2 = 0x03 COMMA ++ DW_CFA_advance_loc4 = 0x04 COMMA ++ DW_CFA_offset_extended = 0x05 COMMA ++ DW_CFA_restore_extended = 0x06 COMMA ++ DW_CFA_undefined = 0x07 COMMA ++ DW_CFA_same_value = 0x08 COMMA ++ DW_CFA_register = 0x09 COMMA ++ DW_CFA_remember_state = 0x0a COMMA ++ DW_CFA_restore_state = 0x0b COMMA ++ DW_CFA_def_cfa = 0x0c COMMA ++ DW_CFA_def_cfa_register = 0x0d COMMA ++ DW_CFA_def_cfa_offset = 0x0e COMMA ++ ++ /* DWARF 3. 
*/ ++ DW_CFA_def_cfa_expression = 0x0f COMMA ++ DW_CFA_expression = 0x10 COMMA ++ DW_CFA_offset_extended_sf = 0x11 COMMA ++ DW_CFA_def_cfa_sf = 0x12 COMMA ++ DW_CFA_def_cfa_offset_sf = 0x13 COMMA ++ ++ /* SGI/MIPS specific. */ ++ DW_CFA_MIPS_advance_loc8 = 0x1d COMMA ++ ++ /* GNU extensions. */ ++ DW_CFA_GNU_window_save = 0x2d COMMA ++ DW_CFA_GNU_args_size = 0x2e COMMA ++ DW_CFA_GNU_negative_offset_extended = 0x2f ++IF_NOT_ASM(};) ++ ++#define DW_CIE_ID 0xffffffff ++#define DW_CIE_VERSION 1 ++ ++#define DW_CFA_extended 0 ++#define DW_CFA_lo_user 0x1c ++#define DW_CFA_hi_user 0x3f ++ ++#define DW_CHILDREN_no 0x00 ++#define DW_CHILDREN_yes 0x01 ++ ++#define DW_ADDR_none 0 ++ ++/* Source language names and codes. */ ++ENUM(dwarf_source_language) ++ ++ DW_LANG_C89 = 0x0001 COMMA ++ DW_LANG_C = 0x0002 COMMA ++ DW_LANG_Ada83 = 0x0003 COMMA ++ DW_LANG_C_plus_plus = 0x0004 COMMA ++ DW_LANG_Cobol74 = 0x0005 COMMA ++ DW_LANG_Cobol85 = 0x0006 COMMA ++ DW_LANG_Fortran77 = 0x0007 COMMA ++ DW_LANG_Fortran90 = 0x0008 COMMA ++ DW_LANG_Pascal83 = 0x0009 COMMA ++ DW_LANG_Modula2 = 0x000a COMMA ++ DW_LANG_Java = 0x000b COMMA ++ /* DWARF 3. */ ++ DW_LANG_C99 = 0x000c COMMA ++ DW_LANG_Ada95 = 0x000d COMMA ++ DW_LANG_Fortran95 = 0x000e COMMA ++ /* MIPS. */ ++ DW_LANG_Mips_Assembler = 0x8001 COMMA ++ /* UPC. */ ++ DW_LANG_Upc = 0x8765 ++IF_NOT_ASM(};) ++ ++#define DW_LANG_lo_user 0x8000 /* Implementation-defined range start. */ ++#define DW_LANG_hi_user 0xffff /* Implementation-defined range end. */ ++ ++/* Names and codes for macro information. */ ++ENUM(dwarf_macinfo_record_type) ++ ++ DW_MACINFO_define = 1 COMMA ++ DW_MACINFO_undef = 2 COMMA ++ DW_MACINFO_start_file = 3 COMMA ++ DW_MACINFO_end_file = 4 COMMA ++ DW_MACINFO_vendor_ext = 255 ++IF_NOT_ASM(};) ++ ++/* @@@ For use with GNU frame unwind information.
*/ ++ ++#define DW_EH_PE_absptr 0x00 ++#define DW_EH_PE_omit 0xff ++ ++#define DW_EH_PE_uleb128 0x01 ++#define DW_EH_PE_udata2 0x02 ++#define DW_EH_PE_udata4 0x03 ++#define DW_EH_PE_udata8 0x04 ++#define DW_EH_PE_sleb128 0x09 ++#define DW_EH_PE_sdata2 0x0A ++#define DW_EH_PE_sdata4 0x0B ++#define DW_EH_PE_sdata8 0x0C ++#define DW_EH_PE_signed 0x08 ++ ++#define DW_EH_PE_pcrel 0x10 ++#define DW_EH_PE_textrel 0x20 ++#define DW_EH_PE_datarel 0x30 ++#define DW_EH_PE_funcrel 0x40 ++#define DW_EH_PE_aligned 0x50 ++ ++#define DW_EH_PE_indirect 0x80 ++ ++#endif /* _ELF_DWARF2_H */ +diff -puN /dev/null include/linux/dwarf2-lang.h +--- /dev/null 2003-09-15 06:40:47.000000000 -0700 ++++ 25-akpm/include/linux/dwarf2-lang.h 2004-10-21 14:54:15.337591824 -0700 +@@ -0,0 +1,132 @@ ++#ifndef DWARF2_LANG ++#define DWARF2_LANG ++#include ++ ++/* ++ * This is free software; you can redistribute it and/or modify it under ++ * the terms of the GNU General Public License as published by the Free ++ * Software Foundation; either version 2, or (at your option) any later ++ * version. ++ */ ++/* ++ * This file defines macros that allow generation of DWARF debug records ++ * for asm files. This file is platform independent. Register numbers ++ * (which are about the only thing that is platform dependent) are to be ++ * supplied by a platform defined file. ++ */ ++#define DWARF_preamble() .section .debug_frame,"",@progbits ++/* ++ * This macro starts a debug frame section. The debug_frame describes ++ * where to find the registers that the enclosing function saved on ++ * entry. ++ * ++ * ORD is use by the label generator and should be the same as what is ++ * passed to CFI_postamble. ++ * ++ * pc, pc register gdb ordinal. ++ * ++ * code_align this is the factor used to define locations or regions ++ * where the given definitions apply. If you use labels to define these ++ * this should be 1. ++ * ++ * data_align this is the factor used to define register offsets. 
If ++ * you use struct offset, this should be the size of the register in ++ * bytes or the negative of that. This is how it is used: you will ++ * define a register as the reference register, say the stack pointer, ++ * then you will say where a register is located relative to this ++ * reference register's value, say 40 for register 3 (the gdb register ++ * number). The <40> will be multiplied by data_align to define the ++ * byte offset of the given register (3, in this example). So if your ++ * <40> is the byte offset and the reference register points at the ++ * beginning, you would want 1 for the data_align. If <40> was the 40th ++ * 4-byte element in that structure you would want 4. And if your ++ * reference register points at the end of the structure you would want ++ * a negative data_align value (and you would have to do other math as ++ * well). ++ */ ++ ++#define CFI_preamble(ORD, pc, code_align, data_align) \ ++.section .debug_frame,"",@progbits ; \ ++frame/**/_/**/ORD: \ ++ .long end/**/_/**/ORD-start/**/_/**/ORD; \ ++start/**/_/**/ORD: \ ++ .long DW_CIE_ID; \ ++ .byte DW_CIE_VERSION; \ ++ .byte 0 ; \ ++ .uleb128 code_align; \ ++ .sleb128 data_align; \ ++ .byte pc; ++ ++/* ++ * After the above macro and prior to the CFI_postamble, you need to ++ * define the initial state. This starts with defining the reference ++ * register and, usually the pc. Here are some helper macros: ++ */ ++ ++#define CFA_define_reference(reg, offset) \ ++ .byte DW_CFA_def_cfa; \ ++ .uleb128 reg; \ ++ .uleb128 (offset); ++ ++#define CFA_define_offset(reg, offset) \ ++ .byte (DW_CFA_offset + reg); \ ++ .uleb128 (offset); ++ ++#define CFI_postamble(ORD) \ ++ .align 4; \ ++end/**/_/**/ORD: ++/* ++ * So now your code pushes stuff on the stack, you need a new location ++ * and the rules for what to do. This starts a running description of ++ * the call frame. You need to describe what changes with respect to ++ * the call registers as the location of the pc moves through the code.
++ * The following builds an FDE (frame description entry). Like the ++ * above, it has a preamble and a postamble. It also is tied to the CFI ++ * above. ++ * The first entry after the preamble must be the location in the code ++ * that the call frame is being described for. ++ */ ++#define FDE_preamble(ORD, fde_no, initial_address, length) \ ++ .long FDE_end/**/_/**/fde_no-FDE_start/**/_/**/fde_no; \ ++FDE_start/**/_/**/fde_no: \ ++ .long frame/**/_/**/ORD; \ ++ .long initial_address; \ ++ .long length; ++ ++#define FDE_postamble(fde_no) \ ++ .align 4; \ ++FDE_end/**/_/**/fde_no: ++/* ++ * That done, you can now add registers, subtract registers, move the ++ * reference and even change the reference. You can also define a new ++ * area of code the info applies to. For discontinuous bits you should ++ * start a new FDE. You may have as many as you like. ++ */ ++ ++/* ++ * To advance the address by the given number of bytes (';' separates the directives) ++ */ ++ ++#define FDE_advance(bytes) \ ++ .byte DW_CFA_advance_loc4; \ ++ .long bytes; ++ ++ ++ ++/* ++ * With the above you can define all the register locations. But ++ * suppose the reference register moves... Takes the new offset NOT an ++ * increment. This is how esp is tracked if it is not saved. ++ */ ++ ++#define CFA_define_cfa_offset(offset) \ ++ .byte DW_CFA_def_cfa_offset; \ ++ .uleb128 (offset); ++/* ++ * Or suppose you want to use a different reference register...
++ */ ++#define CFA_define_cfa_register(reg) \ ++ .byte DW_CFA_def_cfa_register; \ ++ .uleb128 reg; ++ ++#endif +diff -puN include/linux/serial_core.h~kgdb-ga include/linux/serial_core.h +--- 25/include/linux/serial_core.h~kgdb-ga 2004-10-21 14:54:15.282600184 -0700 ++++ 25-akpm/include/linux/serial_core.h 2004-10-21 14:54:15.338591672 -0700 +@@ -172,7 +172,9 @@ struct uart_port { + unsigned char x_char; /* xon/xoff char */ + unsigned char regshift; /* reg offset shift */ + unsigned char iotype; /* io access style */ +- ++#ifdef CONFIG_KGDB ++ int kgdb; /* in use by kgdb */ ++#endif + #define UPIO_PORT (0) + #define UPIO_HUB6 (1) + #define UPIO_MEM (2) +diff -puN include/linux/spinlock.h~kgdb-ga include/linux/spinlock.h +--- 25/include/linux/spinlock.h~kgdb-ga 2004-10-21 14:54:15.284599880 -0700 ++++ 25-akpm/include/linux/spinlock.h 2004-10-21 14:54:15.338591672 -0700 +@@ -15,6 +15,12 @@ + + #include /* for cpu relax */ + #include ++#ifdef CONFIG_KGDB ++#include ++#define SET_WHO(x, him) (x)->who = him; ++#else ++#define SET_WHO(x, him) ++#endif + + /* + * Must define these before including other files, inline functions need them +@@ -88,6 +94,9 @@ typedef struct { + const char *module; + char *owner; + int oline; ++#ifdef CONFIG_KGDB ++ struct task_struct *who; ++#endif + } spinlock_t; + #define SPIN_LOCK_UNLOCKED (spinlock_t) { SPINLOCK_MAGIC, 0, 10, __FILE__ , NULL, 0} + +@@ -99,6 +108,7 @@ typedef struct { + (x)->module = __FILE__; \ + (x)->owner = NULL; \ + (x)->oline = 0; \ ++ SET_WHO(x, NULL) \ + } while (0) + + #define CHECK_LOCK(x) \ +@@ -121,6 +131,7 @@ typedef struct { + (x)->lock = 1; \ + (x)->owner = __FILE__; \ + (x)->oline = __LINE__; \ ++ SET_WHO(x, current) \ + } while (0) + + /* without debugging, spin_is_locked on UP always says +@@ -151,6 +162,7 @@ typedef struct { + (x)->lock = 1; \ + (x)->owner = __FILE__; \ + (x)->oline = __LINE__; \ ++ SET_WHO(x, current) \ + 1; \ + }) + +diff -puN kernel/pid.c~kgdb-ga kernel/pid.c +--- 
25/kernel/pid.c~kgdb-ga 2004-10-21 14:54:15.285599728 -0700 ++++ 25-akpm/kernel/pid.c 2004-10-21 14:54:15.339591520 -0700 +@@ -252,6 +252,9 @@ void switch_exec_pids(task_t *leader, ta + * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or + * more. + */ ++#ifdef CONFIG_KGDB ++int kgdb_pid_init_done; /* so we don't call prior to... */ ++#endif + void __init pidhash_init(void) + { + int i, j, pidhash_size; +@@ -273,6 +276,9 @@ void __init pidhash_init(void) + for (j = 0; j < pidhash_size; j++) + INIT_HLIST_HEAD(&pid_hash[i][j]); + } ++#ifdef CONFIG_KGDB ++ kgdb_pid_init_done++; ++#endif + } + + void __init pidmap_init(void) +diff -puN kernel/sched.c~kgdb-ga kernel/sched.c +--- 25/kernel/sched.c~kgdb-ga 2004-10-21 14:54:15.287599424 -0700 ++++ 25-akpm/kernel/sched.c 2004-10-21 14:54:15.342591064 -0700 +@@ -2931,6 +2931,13 @@ out_unlock: + + EXPORT_SYMBOL(set_user_nice); + ++#ifdef CONFIG_KGDB ++struct task_struct *kgdb_get_idle(int this_cpu) ++{ ++ return cpu_rq(this_cpu)->idle; ++} ++#endif ++ + #ifdef __ARCH_WANT_SYS_NICE + + /* +diff -puN MAINTAINERS~kgdb-ga MAINTAINERS +--- 25/MAINTAINERS~kgdb-ga 2004-10-21 14:54:15.288599272 -0700 ++++ 25-akpm/MAINTAINERS 2004-10-21 14:54:15.344590760 -0700 +@@ -1242,6 +1242,12 @@ W: http://sf.net/projects/kernel-janitor + W: http://developer.osdl.org/rddunlap/kj-patches/ + S: Maintained + ++KGDB FOR I386 PLATFORM ++P: George Anzinger ++M: george@mvista.com ++L: linux-net@vger.kernel.org ++S: Supported ++ + KERNEL NFSD + P: Neil Brown + M: neilb@cse.unsw.edu.au +diff -puN arch/i386/Kconfig.debug~kgdb-ga arch/i386/Kconfig.debug +--- 25/arch/i386/Kconfig.debug~kgdb-ga 2004-10-21 14:54:15.290598968 -0700 ++++ 25-akpm/arch/i386/Kconfig.debug 2004-10-21 14:54:15.344590760 -0700 +@@ -65,4 +65,6 @@ config X86_MPPARSE + depends on X86_LOCAL_APIC && !X86_VISWS + default y + ++source "arch/i386/Kconfig.kgdb" ++ + endmenu +diff -puN /dev/null arch/i386/Kconfig.kgdb +--- /dev/null 2003-09-15 06:40:47.000000000 -0700 ++++ 
25-akpm/arch/i386/Kconfig.kgdb 2004-10-21 14:54:15.345590608 -0700 +@@ -0,0 +1,175 @@ ++config KGDB ++ bool "Include kgdb kernel debugger" ++ depends on DEBUG_KERNEL ++ help ++ If you say Y here, the system will be compiled with the debug ++ option (-g) and a debugging stub will be included in the ++ kernel. This stub communicates with gdb on another (host) ++ computer via a serial port. The host computer should have ++ access to the kernel binary file (vmlinux) and a serial port ++ that is connected to the target machine. Gdb can be made to ++ configure the serial port or you can use stty and setserial to ++ do this. See the 'target' command in gdb. This option also ++ configures in the ability to request a breakpoint early in the ++ boot process. To request the breakpoint just include 'kgdb' ++ as a boot option when booting the target machine. The system ++ will then break as soon as it looks at the boot options. This ++ option also installs a breakpoint in panic and sends any ++ kernel faults to the debugger. For more information see the ++ Documentation/i386/kgdb/kgdb.txt file. ++ ++choice ++ depends on KGDB ++ prompt "Debug serial port BAUD" ++ default KGDB_115200BAUD ++ help ++ Gdb and the kernel stub need to agree on the baud rate to be ++ used. Some systems (x86 family at this writing) allow this to ++ be configured. ++ ++config KGDB_9600BAUD ++ bool "9600" ++ ++config KGDB_19200BAUD ++ bool "19200" ++ ++config KGDB_38400BAUD ++ bool "38400" ++ ++config KGDB_57600BAUD ++ bool "57600" ++ ++config KGDB_115200BAUD ++ bool "115200" ++endchoice ++ ++config KGDB_PORT ++ hex "hex I/O port address of the debug serial port" ++ depends on KGDB ++ default 3f8 ++ help ++ Some systems (x86 family at this writing) allow the port ++ address to be configured. The number entered is assumed to be ++ hex, don't put 0x in front of it. The standard address are: ++ COM1 3f8 , irq 4 and COM2 2f8 irq 3. Setserial /dev/ttySx ++ will tell you what you have. 
It is good to test the serial ++ connection with a live system before trying to debug. ++ ++config KGDB_IRQ ++ int "IRQ of the debug serial port" ++ depends on KGDB ++ default 4 ++ help ++ This is the irq for the debug port. If everything is working ++ correctly and the kernel has interrupts on, a control C to the ++ port should cause a break into the kernel debug stub. ++ ++config DEBUG_INFO ++ bool ++ depends on KGDB ++ default y ++ ++config KGDB_MORE ++ bool "Add any additional compile options" ++ depends on KGDB ++ default n ++ help ++ Saying yes here turns on the ability to enter additional ++ compile options. ++ ++ ++config KGDB_OPTIONS ++ depends on KGDB_MORE ++ string "Additional compile arguments" ++ default "-O1" ++ help ++ This option allows you to enter additional compile options for ++ the whole kernel compile. Each platform will have a default ++ that seems right for it. For example on PPC "-ggdb -O1", and ++ for i386 "-O1". Note that by configuring KGDB "-g" is already ++ turned on. In addition, on i386 platforms ++ "-fomit-frame-pointer" is deleted from the standard compile ++ options. ++ ++config NO_KGDB_CPUS ++ int "Number of CPUs" ++ depends on KGDB && SMP ++ default NR_CPUS ++ help ++ ++ This option sets the number of cpus for kgdb ONLY. It is used ++ to prune some internal structures so they look "nice" when ++ displayed with gdb. This is to overcome possibly larger ++ numbers that may have been entered above. Enter the real ++ number to get nice clean kgdb_info displays. ++ ++config KGDB_TS ++ bool "Enable kgdb time stamp macros?" ++ depends on KGDB ++ default n ++ help ++ Kgdb event macros allow you to instrument your code with calls ++ to the kgdb event recording function. The event log may be ++ examined with gdb at a break point. Turning on this ++ capability also allows you to choose how many events to ++ keep. Kgdb always keeps the latest events. ++ ++choice ++ depends on KGDB_TS ++ prompt "Max number of time stamps to save?"
++ default KGDB_TS_128 ++ ++config KGDB_TS_64 ++ bool "64" ++ ++config KGDB_TS_128 ++ bool "128" ++ ++config KGDB_TS_256 ++ bool "256" ++ ++config KGDB_TS_512 ++ bool "512" ++ ++config KGDB_TS_1024 ++ bool "1024" ++ ++endchoice ++ ++config STACK_OVERFLOW_TEST ++ bool "Turn on kernel stack overflow testing?" ++ depends on KGDB ++ default n ++ help ++ This option enables code in the front line interrupt handlers ++ to check for kernel stack overflow on interrupts and system ++ calls. This is part of the kgdb code on x86 systems. ++ ++config KGDB_CONSOLE ++ bool "Enable serial console thru kgdb port" ++ depends on KGDB ++ default n ++ help ++ This option enables the command line "console=kgdb" option. ++ When the system is booted with this option in the command line ++ all kernel printk output is sent to gdb (as well as to other ++ consoles). For this to work gdb must be connected. For this ++ reason, this command line option will generate a breakpoint if ++ gdb has not yet connected. After the gdb continue command is ++ given all pent up console output will be printed by gdb on the ++ host machine. Neither this option, nor KGDB require the ++ serial driver to be configured. ++ ++config KGDB_SYSRQ ++ bool "Turn on SysRq 'G' command to do a break?" ++ depends on KGDB ++ default y ++ help ++ This option includes an option in the SysRq code that allows ++ you to enter SysRq G which generates a breakpoint to the KGDB ++ stub. This will work if the keyboard is alive and can ++ interrupt the system. Because of constraints on when the ++ serial port interrupt can be enabled, this code may allow you ++ to interrupt the system before the serial port control C is ++ available. Just say yes here. 
++ +_ diff --git a/lustre/kernel_patches/patches/8kstack-2.6.12.patch b/lustre/kernel_patches/patches/8kstack-2.6.12.patch new file mode 100644 index 0000000..f3a2160 --- /dev/null +++ b/lustre/kernel_patches/patches/8kstack-2.6.12.patch @@ -0,0 +1,13 @@ +Index: linux-2.6.9-5.0.3.EL/include/asm-i386/thread_info.h +=================================================================== +--- linux-2.6.9-5.0.3.EL.orig/include/asm-i386/thread_info.h 2005-02-25 10:25:33.000000000 +0200 ++++ linux-2.6.9-5.0.3.EL/include/asm-i386/thread_info.h 2005-02-25 20:19:11.676139032 +0200 +@@ -54,7 +54,7 @@ + + #define PREEMPT_ACTIVE 0x10000000 + #ifdef CONFIG_4KSTACKS +-#define THREAD_SIZE (4096) ++#define THREAD_SIZE (8192) + #else + #define THREAD_SIZE (8192) + #endif diff --git a/lustre/kernel_patches/patches/add_page_private-2.4.19-bgl.patch b/lustre/kernel_patches/patches/add_page_private-2.4.19-bgl.patch deleted file mode 100644 index 9bb754a..0000000 --- a/lustre/kernel_patches/patches/add_page_private-2.4.19-bgl.patch +++ /dev/null @@ -1,15 +0,0 @@ - include/linux/mm.h | 1 + - 1 files changed, 1 insertion(+) - -Index: linux.mcp2/include/linux/mm.h -=================================================================== ---- linux.mcp2.orig/include/linux/mm.h 2004-05-05 14:32:29.000000000 -0700 -+++ linux.mcp2/include/linux/mm.h 2004-05-05 14:46:54.000000000 -0700 -@@ -162,6 +162,7 @@ - protected by pagemap_lru_lock !! */ - struct page **pprev_hash; /* Complement to *next_hash. */ - struct buffer_head * buffers; /* Buffer maps us to a disk block. 
*/ -+ unsigned long private; - - /* - * On machines where all RAM is mapped into kernel address space, diff --git a/lustre/kernel_patches/patches/compile-fixes-2.6.9-rhel4-22.patch b/lustre/kernel_patches/patches/compile-fixes-2.6.9-rhel4-22.patch new file mode 100644 index 0000000..98b8715 --- /dev/null +++ b/lustre/kernel_patches/patches/compile-fixes-2.6.9-rhel4-22.patch @@ -0,0 +1,76 @@ +--- linux-2.6.9/arch/i386/kernel/apic.c.orig 2005-08-04 08:11:13.000000000 -0400 ++++ linux-2.6.9/arch/i386/kernel/apic.c 2005-08-04 08:27:04.000000000 -0400 +@@ -1125,8 +1125,10 @@ asmlinkage void smp_local_timer_interrup + + void smp_apic_timer_interrupt(struct pt_regs regs) + { ++#ifdef CONFIG_4KSTACKS + union irq_ctx *curctx; + union irq_ctx *irqctx; ++#endif + int cpu; + u32 *isp; + +@@ -1147,11 +1149,11 @@ void smp_apic_timer_interrupt(struct pt_ + * interrupt lock, which is the WrongThing (tm) to do. + */ + irq_enter(); ++ ++#ifdef CONFIG_4KSTACKS + curctx = (union irq_ctx *) current_thread_info(); + irqctx = hardirq_ctx[cpu]; +- if (curctx == irqctx) { +- smp_local_timer_interrupt(&regs); +- } else { ++ if (curctx != irqctx) { + /* build the stack frame on the IRQ stack */ + isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); + irqctx->tinfo.task = curctx->tinfo.task; +@@ -1167,7 +1169,10 @@ void smp_apic_timer_interrupt(struct pt_ + : : "b"(isp) + : "memory", "cc", "edx", "ecx" + ); +- } ++ } else ++#endif ++ smp_local_timer_interrupt(&regs); ++ + irq_exit(); + } + +--- linux-2.6.9/include/asm-i386/crashdump.h.orig 2005-08-04 08:11:22.000000000 -0400 ++++ linux-2.6.9/include/asm-i386/crashdump.h 2005-08-04 08:27:04.000000000 -0400 +@@ -48,12 +48,14 @@ extern unsigned long next_ram_page (unsi + + static inline void platform_init_stack(void **stackptr) + { ++#ifdef CONFIG_4KSTACKS + *stackptr = (void *)kmalloc(sizeof(union irq_ctx), GFP_KERNEL); + if (*stackptr) + memset(*stackptr, 0, sizeof(union irq_ctx)); + else + printk(KERN_WARNING + "crashdump: unable to allocate separate
stack\n"); ++#endif + } + + typedef asmlinkage void (*crashdump_func_t)(struct pt_regs *, void *); +@@ -62,6 +64,7 @@ static inline void platform_start_crashd + crashdump_func_t dumpfunc, + struct pt_regs *regs) + { ++#ifdef CONFIG_4KSTACKS + u32 *dsp; + union irq_ctx * curctx; + union irq_ctx * dumpctx; +@@ -90,6 +93,10 @@ static inline void platform_start_crashd + : "memory", "cc", "edx", "ecx" + ); + } ++#else ++ dumpfunc(regs, NULL); ++#endif ++ + } + + #define platform_cleanup_stack(stackptr) \ diff --git a/lustre/kernel_patches/patches/dcache-qstr-api-fix-2.6-suse.patch b/lustre/kernel_patches/patches/dcache-qstr-api-fix-2.6-suse.patch new file mode 100644 index 0000000..64b8bd3 --- /dev/null +++ b/lustre/kernel_patches/patches/dcache-qstr-api-fix-2.6-suse.patch @@ -0,0 +1,148 @@ +Index: linux-2.6.5-7.201/include/linux/dcache.h +=================================================================== +--- linux-2.6.5-7.201.orig/include/linux/dcache.h 2005-10-11 00:12:48.000000000 +0400 ++++ linux-2.6.5-7.201/include/linux/dcache.h 2005-12-20 23:16:31.000000000 +0300 +@@ -38,7 +38,6 @@ struct qstr { + const unsigned char * name; + unsigned int len; + unsigned int hash; +- char name_str[0]; + }; + + #include +@@ -104,7 +103,6 @@ struct dentry { + struct rcu_head d_rcu; + struct dcookie_struct * d_cookie; /* cookie, if any */ + unsigned long d_move_count; /* to indicated moved dentry while lockless lookup */ +- struct qstr * d_qstr; /* quick str ptr used in lockless lookup and concurrent d_move */ + struct dentry * d_parent; /* parent directory */ + struct qstr d_name; + struct hlist_node d_hash; /* lookup hash list */ +Index: linux-2.6.5-7.201/fs/dcache.c +=================================================================== +--- linux-2.6.5-7.201.orig/fs/dcache.c 2005-10-11 00:12:45.000000000 +0400 ++++ linux-2.6.5-7.201/fs/dcache.c 2005-12-20 23:16:31.000000000 +0300 +@@ -41,6 +41,8 @@ EXPORT_SYMBOL(dcache_lock); + + static kmem_cache_t *dentry_cache; + ++#define 
DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname)) ++ + /* + * This is the single most critical data structure when it comes + * to the dcache: the hashtable for lookups. Somebody should try +@@ -67,7 +69,7 @@ static void d_callback(void *arg) + struct dentry * dentry = (struct dentry *)arg; + + if (dname_external(dentry)) { +- kfree(dentry->d_qstr); ++ kfree(dentry->d_name.name); + } + kmem_cache_free(dentry_cache, dentry); + } +@@ -678,8 +680,6 @@ static int shrink_dcache_memory(int nr, + return dentry_stat.nr_unused; + } + +-#define NAME_ALLOC_LEN(len) ((len+16) & ~15) +- + /** + * d_alloc - allocate a dcache entry + * @parent: parent of entry to allocate +@@ -694,26 +694,18 @@ struct dentry * d_alloc(struct dentry * + { + char * str; + struct dentry *dentry; +- struct qstr * qstr; + + dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); + if (!dentry) + return NULL; + + if (name->len > DNAME_INLINE_LEN-1) { +- qstr = kmalloc(sizeof(*qstr) + NAME_ALLOC_LEN(name->len), +- GFP_KERNEL); +- if (!qstr) { ++ str = kmalloc(name->len + 1, GFP_KERNEL); ++ if (!str) { + kmem_cache_free(dentry_cache, dentry); + return NULL; + } +- qstr->name = qstr->name_str; +- qstr->len = name->len; +- qstr->hash = name->hash; +- dentry->d_qstr = qstr; +- str = qstr->name_str; + } else { +- dentry->d_qstr = &dentry->d_name; + str = dentry->d_iname; + } + +@@ -1010,7 +1002,7 @@ struct dentry * __d_lookup(struct dentry + if (dentry->d_parent != parent) + continue; + +- qstr = dentry->d_qstr; ++ qstr = &dentry->d_name; + smp_read_barrier_depends(); + if (parent->d_op && parent->d_op->d_compare) { + if (parent->d_op->d_compare(parent, qstr, name)) +@@ -1163,26 +1155,38 @@ void d_rehash(struct dentry * entry) + */ + static inline void switch_names(struct dentry * dentry, struct dentry * target) + { +- const unsigned char *old_name, *new_name; +- struct qstr *old_qstr, *new_qstr; +- +- memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); +- old_qstr = 
target->d_qstr; +- old_name = target->d_name.name; +- new_qstr = dentry->d_qstr; +- new_name = dentry->d_name.name; +- if (old_name == target->d_iname) { +- old_name = dentry->d_iname; +- old_qstr = &dentry->d_name; +- } +- if (new_name == dentry->d_iname) { +- new_name = target->d_iname; +- new_qstr = &target->d_name; +- } +- target->d_name.name = new_name; +- dentry->d_name.name = old_name; +- target->d_qstr = new_qstr; +- dentry->d_qstr = old_qstr; ++ if (dname_external(target)) { ++ if (dname_external(dentry)) { ++ /* ++ * Both external: swap the pointers ++ */ ++ do_switch(target->d_name.name, dentry->d_name.name); ++ } else { ++ /* ++ * dentry:internal, target:external. Steal target's ++ * storage and make target internal. ++ */ ++ dentry->d_name.name = target->d_name.name; ++ target->d_name.name = target->d_iname; ++ } ++ } else { ++ if (dname_external(dentry)) { ++ /* ++ * dentry:external, target:internal. Give dentry's ++ * storage to target and make dentry internal ++ */ ++ memcpy(dentry->d_iname, target->d_name.name, ++ target->d_name.len + 1); ++ target->d_name.name = dentry->d_name.name; ++ dentry->d_name.name = dentry->d_iname; ++ } else { ++ /* ++ * Both are internal. 
Just copy target to dentry ++ */ ++ memcpy(dentry->d_iname, target->d_name.name, ++ target->d_name.len + 1); ++ } ++ } + } + + /* diff --git a/lustre/kernel_patches/patches/export-show_task-2.4-bgl.patch b/lustre/kernel_patches/patches/export-show_task-2.4-bgl.patch deleted file mode 100644 index a7bdb63..0000000 --- a/lustre/kernel_patches/patches/export-show_task-2.4-bgl.patch +++ /dev/null @@ -1,32 +0,0 @@ -Index: linux-bgl/kernel/sched.c -=================================================================== ---- linux-bgl.orig/kernel/sched.c 2003-07-02 08:43:33.000000000 -0700 -+++ linux-bgl/kernel/sched.c 2004-10-26 23:37:44.314193755 -0700 -@@ -1124,7 +1124,7 @@ - return retval; - } - --static void show_task(struct task_struct * p) -+void show_task(struct task_struct * p) - { - unsigned long free = 0; - int state; -Index: linux-bgl/kernel/ksyms.c -=================================================================== ---- linux-bgl.orig/kernel/ksyms.c 2004-10-26 23:23:00.518654978 -0700 -+++ linux-bgl/kernel/ksyms.c 2004-10-26 23:38:29.289071295 -0700 -@@ -76,6 +76,7 @@ - }; - #endif - -+void show_task(struct task_struct *); - - EXPORT_SYMBOL(inter_module_register); - EXPORT_SYMBOL(inter_module_unregister); -@@ -595,3 +596,6 @@ - - EXPORT_SYMBOL(tasklist_lock); - EXPORT_SYMBOL(pidhash); -+ -+/* debug */ -+EXPORT_SYMBOL(show_task); diff --git a/lustre/kernel_patches/patches/export-truncate-bgl.patch b/lustre/kernel_patches/patches/export-truncate-bgl.patch deleted file mode 100644 index 9508215..0000000 --- a/lustre/kernel_patches/patches/export-truncate-bgl.patch +++ /dev/null @@ -1,37 +0,0 @@ - include/linux/mm.h | 1 + - mm/filemap.c | 3 ++- - 2 files changed, 3 insertions(+), 1 deletion(-) - -Index: linux-ion/include/linux/mm.h -=================================================================== ---- linux-ion.orig/include/linux/mm.h 2004-07-28 14:34:57.000000000 -0700 -+++ linux-ion/include/linux/mm.h 2004-09-27 15:07:50.000000000 -0700 -@@ -593,6 +593,7 @@ - 
/* filemap.c */ - extern void remove_inode_page(struct page *); - extern unsigned long page_unuse(struct page *); -+extern void truncate_complete_page(struct page *); - extern void truncate_inode_pages(struct address_space *, loff_t); - - /* generic vm_area_ops exported for stackable file systems */ -Index: linux-ion/mm/filemap.c -=================================================================== ---- linux-ion.orig/mm/filemap.c 2004-07-28 14:34:57.000000000 -0700 -+++ linux-ion/mm/filemap.c 2004-09-27 15:08:13.000000000 -0700 -@@ -231,7 +231,7 @@ - do_flushpage(page, partial); - } - --static void truncate_complete_page(struct page *page) -+void truncate_complete_page(struct page *page) - { - /* Leave it on the LRU if it gets converted into anonymous buffers */ - if (!page->buffers || do_flushpage(page, 0)) -@@ -249,6 +249,7 @@ - remove_inode_page(page); - page_cache_release(page); - } -+EXPORT_SYMBOL(truncate_complete_page); - - static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *)); - static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial) diff --git a/lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch b/lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch index 16f26b0..8d9ab40 100644 --- a/lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch +++ b/lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch @@ -79,6 +79,24 @@ Index: linux-2.6.9-5.0.3.EL/kernel/exit.c =================================================================== --- linux-2.6.9-5.0.3.EL.orig/kernel/exit.c 2005-02-26 13:47:31.300655280 +0200 +++ linux-2.6.9-5.0.3.EL/kernel/exit.c 2005-02-26 13:53:13.805586616 +0200 +@@ -244,6 +244,8 @@ + write_unlock_irq(&tasklist_lock); + } + ++EXPORT_SYMBOL(reparent_to_init); ++ + void __set_special_pids(pid_t session, pid_t pgrp) + { + struct task_struct *curr = current; +@@ -428,6 +430,8 @@ + __exit_files(tsk); + } + ++EXPORT_SYMBOL(exit_files); ++ + 
static inline void __put_fs_struct(struct fs_struct *fs) + { + /* No need to hold fs->lock if we are killing it */ @@ -516,6 +516,7 @@ { __exit_mm(tsk); diff --git a/lustre/kernel_patches/patches/export_symbols-2.6-suse.patch b/lustre/kernel_patches/patches/export_symbols-2.6-suse.patch index b22d925..de1bf20 100644 --- a/lustre/kernel_patches/patches/export_symbols-2.6-suse.patch +++ b/lustre/kernel_patches/patches/export_symbols-2.6-suse.patch @@ -42,6 +42,28 @@ Index: linux-2.6.4-51.0/include/linux/ext2_fs_sb.h /* * second extended-fs super-block data in memory */ +Index: linux-2.6.5-12.1/kernel/exit.c +=================================================================== +--- linux-2.6.5-12.1.orig/kernel/exit.c 2004-05-10 12:21:56.000000000 -0400 ++++ linux-2.6.5-12.1/kernel/exit.c 2004-06-03 18:31:28.000000000 -0400 +@@ -260,6 +260,8 @@ + write_unlock_irq(&tasklist_lock); + } + ++EXPORT_SYMBOL(reparent_to_init); ++ + void __set_special_pids(pid_t session, pid_t pgrp) + { + struct task_struct *curr = current; +@@ -429,6 +431,8 @@ + __exit_files(tsk); + } + ++EXPORT_SYMBOL(exit_files); ++ + static inline void __put_fs_struct(struct fs_struct *fs) + { + /* No need to hold fs->lock if we are killing it */ Index: linux-2.6.4-51.0/kernel/kallsyms.c =================================================================== --- linux-2.6.4-51.0.orig/kernel/kallsyms.c 2004-04-05 12:42:08.000000000 -0400 diff --git a/lustre/kernel_patches/patches/export_symbols-2.6.12.patch b/lustre/kernel_patches/patches/export_symbols-2.6.12.patch new file mode 100644 index 0000000..c08e30f --- /dev/null +++ b/lustre/kernel_patches/patches/export_symbols-2.6.12.patch @@ -0,0 +1,105 @@ +Index: linux-2.6.12-rc6/fs/filesystems.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/filesystems.c 2005-06-06 17:22:29.000000000 +0200 ++++ linux-2.6.12-rc6/fs/filesystems.c 2005-06-14 15:53:58.298522852 +0200 +@@ -28,7 +28,9 @@ + */ + + static struct 
file_system_type *file_systems; +-static DEFINE_RWLOCK(file_systems_lock); ++DEFINE_RWLOCK(file_systems_lock); ++ ++EXPORT_SYMBOL(file_systems_lock); + + /* WARNING: This can be used only if we _already_ own a reference */ + void get_filesystem(struct file_system_type *fs) +Index: linux-2.6.12-rc6/include/linux/fs.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/fs.h 2005-06-14 15:53:18.356140529 +0200 ++++ linux-2.6.12-rc6/include/linux/fs.h 2005-06-14 15:53:58.309265039 +0200 +@@ -1563,6 +1563,7 @@ + + extern struct file_operations generic_ro_fops; + ++extern rwlock_t file_systems_lock; + #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) + + extern int vfs_readlink(struct dentry *, char __user *, int, const char *); +Index: linux-2.6.12-rc6/net/core/sock.c +=================================================================== +--- linux-2.6.12-rc6.orig/net/core/sock.c 2005-06-06 17:22:29.000000000 +0200 ++++ linux-2.6.12-rc6/net/core/sock.c 2005-06-14 15:53:58.349304101 +0200 +@@ -613,6 +613,7 @@ + return -EFAULT; + return 0; + } ++EXPORT_SYMBOL(sock_getsockopt); + + /** + * sk_alloc - All socket objects are allocated here +Index: linux-2.6.12-rc6/fs/namespace.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/namespace.c 2005-06-14 15:53:17.868835847 +0200 ++++ linux-2.6.12-rc6/fs/namespace.c 2005-06-14 15:53:58.361022851 +0200 +@@ -1240,6 +1240,7 @@ + mntput(old_pwdmnt); + } + } ++EXPORT_SYMBOL(set_fs_pwd); + + static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd) + { +Index: linux-2.6.12.5/kernel/exit.c +=================================================================== +--- linux-2.6.12.5.orig/kernel/exit.c 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/kernel/exit.c 2005-08-17 17:51:44.000000000 +0200 +@@ -250,6 +250,8 @@ + switch_uid(INIT_USER); + } + ++EXPORT_SYMBOL(reparent_to_init); ++ 
+ void __set_special_pids(pid_t session, pid_t pgrp) + { + struct task_struct *curr = current; +@@ -432,6 +434,8 @@ + __exit_files(tsk); + } + ++EXPORT_SYMBOL(exit_files); ++ + static inline void __put_fs_struct(struct fs_struct *fs) + { + /* No need to hold fs->lock if we are killing it */ +@@ -515,6 +515,7 @@ + task_unlock(tsk); + mmput(mm); + } ++EXPORT_SYMBOL(exit_mm); + + static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) + { +Index: linux-2.6.12-rc6/fs/dcache.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/dcache.c 2005-06-14 15:53:19.812195198 +0200 ++++ linux-2.6.12-rc6/fs/dcache.c 2005-06-14 15:53:58.385436913 +0200 +@@ -1581,6 +1581,7 @@ + + return result; + } ++EXPORT_SYMBOL(is_subdir); + + void d_genocide(struct dentry *root) + { +Index: linux-2.6.12-rc6/fs/file_table.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/file_table.c 2005-06-06 17:22:29.000000000 +0200 ++++ linux-2.6.12-rc6/fs/file_table.c 2005-06-14 15:53:58.396179101 +0200 +@@ -197,6 +197,7 @@ + file_free(file); + } + } ++EXPORT_SYMBOL(put_filp); + + void file_move(struct file *file, struct list_head *list) + { diff --git a/lustre/kernel_patches/patches/exports_2.4.19-bgl.patch b/lustre/kernel_patches/patches/exports_2.4.19-bgl.patch deleted file mode 100644 index 82a0182..0000000 --- a/lustre/kernel_patches/patches/exports_2.4.19-bgl.patch +++ /dev/null @@ -1,42 +0,0 @@ - - - -Index: linux-ion/kernel/ksyms.c -=================================================================== ---- linux-ion.orig/kernel/ksyms.c 2004-07-28 14:34:57.000000000 -0700 -+++ linux-ion/kernel/ksyms.c 2004-09-27 15:04:52.000000000 -0700 -@@ -286,6 +286,10 @@ - EXPORT_SYMBOL(dcache_readdir); - EXPORT_SYMBOL(dcache_dir_ops); - -+/* lustre */ -+EXPORT_SYMBOL(panic_notifier_list); -+EXPORT_SYMBOL(do_kern_mount); -+ - /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) 
*/ - EXPORT_SYMBOL(default_llseek); - EXPORT_SYMBOL(dentry_open); -Index: linux-ion/include/linux/fs.h -=================================================================== ---- linux-ion.orig/include/linux/fs.h 2004-07-28 14:34:57.000000000 -0700 -+++ linux-ion/include/linux/fs.h 2004-09-27 15:04:52.000000000 -0700 -@@ -1050,6 +1050,7 @@ - extern struct vfsmount *kern_mount(struct file_system_type *); - extern int may_umount(struct vfsmount *); - extern long do_mount(char *, char *, char *, unsigned long, void *); -+struct vfsmount *do_kern_mount(const char *fstype, int flags, char *name, void *data); - extern void umount_tree(struct vfsmount *); - - #define kern_umount mntput -Index: linux-ion/mm/memory.c -=================================================================== ---- linux-ion.orig/mm/memory.c 2004-07-28 14:34:57.000000000 -0700 -+++ linux-ion/mm/memory.c 2004-09-27 15:05:56.000000000 -0700 -@@ -401,6 +401,7 @@ - mm->rss = 0; - spin_unlock(&mm->page_table_lock); - } -+EXPORT_SYMBOL(zap_page_range); - - /* - * Do a quick page-table lookup for a single page. 
diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-1-2.4.19-suse.patch b/lustre/kernel_patches/patches/ext-2.4-patch-1-2.4.19-suse.patch deleted file mode 100644 index 1cdaa93..0000000 --- a/lustre/kernel_patches/patches/ext-2.4-patch-1-2.4.19-suse.patch +++ /dev/null @@ -1,2560 +0,0 @@ - fs/ext3/Makefile | 2 - fs/ext3/dir.c | 299 +++++++++ - fs/ext3/file.c | 3 - fs/ext3/hash.c | 215 ++++++ - fs/ext3/namei.c | 1388 ++++++++++++++++++++++++++++++++++++++++----- - fs/ext3/super.c | 7 - include/linux/ext3_fs.h | 85 ++ - include/linux/ext3_fs_sb.h | 2 - include/linux/ext3_jbd.h | 2 - include/linux/rbtree.h | 2 - lib/rbtree.c | 42 + - 11 files changed, 1887 insertions(+), 160 deletions(-) - -Index: linux-2.4.19.SuSE/fs/ext3/Makefile -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/Makefile 2004-05-27 11:07:21.000000000 -0700 -+++ linux-2.4.19.SuSE/fs/ext3/Makefile 2004-05-27 11:08:28.000000000 -0700 -@@ -12,7 +12,7 @@ - export-objs := super.o inode.o - - obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -- ioctl.o namei.o super.o symlink.o -+ ioctl.o namei.o super.o symlink.o hash.o - obj-m := $(O_TARGET) - - obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o -Index: linux-2.4.19.SuSE/fs/ext3/dir.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/dir.c 2001-11-09 14:25:04.000000000 -0800 -+++ linux-2.4.19.SuSE/fs/ext3/dir.c 2004-05-27 11:08:28.000000000 -0700 -@@ -21,12 +21,16 @@ - #include - #include - #include -+#include -+#include - - static unsigned char ext3_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK - }; - - static int ext3_readdir(struct file *, void *, filldir_t); -+static int ext3_dx_readdir(struct file * filp, -+ void * dirent, filldir_t filldir); - - struct file_operations ext3_dir_operations = { - read: generic_read_dir, -@@ -35,6 +39,17 @@ - fsync: ext3_sync_file, /* BKL held */ - }; - -+ 
-+static unsigned char get_dtype(struct super_block *sb, int filetype) -+{ -+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || -+ (filetype >= EXT3_FT_MAX)) -+ return DT_UNKNOWN; -+ -+ return (ext3_filetype_table[filetype]); -+} -+ -+ - int ext3_check_dir_entry (const char * function, struct inode * dir, - struct ext3_dir_entry_2 * de, - struct buffer_head * bh, -@@ -79,6 +94,16 @@ - - sb = inode->i_sb; - -+ if (is_dx(inode)) { -+ err = ext3_dx_readdir(filp, dirent, filldir); -+ if (err != ERR_BAD_DX_DIR) -+ return err; -+ /* -+ * We don't set the inode dirty flag since it's not -+ * critical that it get flushed back to the disk. -+ */ -+ EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; -+ } - stored = 0; - bh = NULL; - offset = filp->f_pos & (sb->s_blocksize - 1); -@@ -162,18 +187,12 @@ - * during the copy operation. - */ - unsigned long version = filp->f_version; -- unsigned char d_type = DT_UNKNOWN; - -- if (EXT3_HAS_INCOMPAT_FEATURE(sb, -- EXT3_FEATURE_INCOMPAT_FILETYPE) -- && de->file_type < EXT3_FT_MAX) -- d_type = -- ext3_filetype_table[de->file_type]; - error = filldir(dirent, de->name, - de->name_len, - filp->f_pos, - le32_to_cpu(de->inode), -- d_type); -+ get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) -@@ -188,3 +207,269 @@ - UPDATE_ATIME(inode); - return 0; - } -+ -+#ifdef CONFIG_EXT3_INDEX -+/* -+ * These functions convert from the major/minor hash to an f_pos -+ * value. -+ * -+ * Currently we only use major hash numer. This is unfortunate, but -+ * on 32-bit machines, the same VFS interface is used for lseek and -+ * llseek, so if we use the 64 bit offset, then the 32-bit versions of -+ * lseek/telldir/seekdir will blow out spectacularly, and from within -+ * the ext2 low-level routine, we don't know if we're being called by -+ * a 64-bit version of the system call or the 32-bit version of the -+ * system call. Worse yet, NFSv2 only allows for a 32-bit readdir -+ * cookie. Sigh. 
-+ */ -+#define hash2pos(major, minor) (major >> 1) -+#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) -+#define pos2min_hash(pos) (0) -+ -+/* -+ * This structure holds the nodes of the red-black tree used to store -+ * the directory entry in hash order. -+ */ -+struct fname { -+ __u32 hash; -+ __u32 minor_hash; -+ rb_node_t rb_hash; -+ struct fname *next; -+ __u32 inode; -+ __u8 name_len; -+ __u8 file_type; -+ char name[0]; -+}; -+ -+/* -+ * This functoin implements a non-recursive way of freeing all of the -+ * nodes in the red-black tree. -+ */ -+static void free_rb_tree_fname(rb_root_t *root) -+{ -+ rb_node_t *n = root->rb_node; -+ rb_node_t *parent; -+ struct fname *fname; -+ -+ while (n) { -+ /* Do the node's children first */ -+ if ((n)->rb_left) { -+ n = n->rb_left; -+ continue; -+ } -+ if (n->rb_right) { -+ n = n->rb_right; -+ continue; -+ } -+ /* -+ * The node has no children; free it, and then zero -+ * out parent's link to it. Finally go to the -+ * beginning of the loop and try to free the parent -+ * node. -+ */ -+ parent = n->rb_parent; -+ fname = rb_entry(n, struct fname, rb_hash); -+ kfree(fname); -+ if (!parent) -+ root->rb_node = 0; -+ else if (parent->rb_left == n) -+ parent->rb_left = 0; -+ else if (parent->rb_right == n) -+ parent->rb_right = 0; -+ n = parent; -+ } -+ root->rb_node = 0; -+} -+ -+ -+struct dir_private_info *create_dir_info(loff_t pos) -+{ -+ struct dir_private_info *p; -+ -+ p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); -+ if (!p) -+ return NULL; -+ p->root.rb_node = 0; -+ p->curr_node = 0; -+ p->extra_fname = 0; -+ p->last_pos = 0; -+ p->curr_hash = pos2maj_hash(pos); -+ p->curr_minor_hash = pos2min_hash(pos); -+ p->next_hash = 0; -+ return p; -+} -+ -+void ext3_htree_free_dir_info(struct dir_private_info *p) -+{ -+ free_rb_tree_fname(&p->root); -+ kfree(p); -+} -+ -+/* -+ * Given a directory entry, enter it into the fname rb tree. 
-+ */ -+void ext3_htree_store_dirent(struct file *dir_file, __u32 hash, -+ __u32 minor_hash, -+ struct ext3_dir_entry_2 *dirent) -+{ -+ rb_node_t **p, *parent = NULL; -+ struct fname * fname, *new_fn; -+ struct dir_private_info *info; -+ int len; -+ -+ info = (struct dir_private_info *) dir_file->private_data; -+ p = &info->root.rb_node; -+ -+ /* Create and allocate the fname structure */ -+ len = sizeof(struct fname) + dirent->name_len + 1; -+ new_fn = kmalloc(len, GFP_KERNEL); -+ memset(new_fn, 0, len); -+ new_fn->hash = hash; -+ new_fn->minor_hash = minor_hash; -+ new_fn->inode = le32_to_cpu(dirent->inode); -+ new_fn->name_len = dirent->name_len; -+ new_fn->file_type = dirent->file_type; -+ memcpy(new_fn->name, dirent->name, dirent->name_len); -+ new_fn->name[dirent->name_len] = 0; -+ -+ while (*p) { -+ parent = *p; -+ fname = rb_entry(parent, struct fname, rb_hash); -+ -+ /* -+ * If the hash and minor hash match up, then we put -+ * them on a linked list. This rarely happens... -+ */ -+ if ((new_fn->hash == fname->hash) && -+ (new_fn->minor_hash == fname->minor_hash)) { -+ new_fn->next = fname->next; -+ fname->next = new_fn; -+ return; -+ } -+ -+ if (new_fn->hash < fname->hash) -+ p = &(*p)->rb_left; -+ else if (new_fn->hash > fname->hash) -+ p = &(*p)->rb_right; -+ else if (new_fn->minor_hash < fname->minor_hash) -+ p = &(*p)->rb_left; -+ else /* if (new_fn->minor_hash > fname->minor_hash) */ -+ p = &(*p)->rb_right; -+ } -+ -+ rb_link_node(&new_fn->rb_hash, parent, p); -+ rb_insert_color(&new_fn->rb_hash, &info->root); -+} -+ -+ -+ -+/* -+ * This is a helper function for ext3_dx_readdir. It calls filldir -+ * for all entres on the fname linked list. (Normally there is only -+ * one entry on the linked list, unless there are 62 bit hash collisions.) 
-+ */ -+static int call_filldir(struct file * filp, void * dirent, -+ filldir_t filldir, struct fname *fname) -+{ -+ struct dir_private_info *info = filp->private_data; -+ loff_t curr_pos; -+ struct inode *inode = filp->f_dentry->d_inode; -+ struct super_block * sb; -+ int error; -+ -+ sb = inode->i_sb; -+ -+ if (!fname) { -+ printk("call_filldir: called with null fname?!?\n"); -+ return 0; -+ } -+ curr_pos = hash2pos(fname->hash, fname->minor_hash); -+ while (fname) { -+ error = filldir(dirent, fname->name, -+ fname->name_len, curr_pos, -+ fname->inode, -+ get_dtype(sb, fname->file_type)); -+ if (error) { -+ filp->f_pos = curr_pos; -+ info->extra_fname = fname->next; -+ return error; -+ } -+ fname = fname->next; -+ } -+ return 0; -+} -+ -+static int ext3_dx_readdir(struct file * filp, -+ void * dirent, filldir_t filldir) -+{ -+ struct dir_private_info *info = filp->private_data; -+ struct inode *inode = filp->f_dentry->d_inode; -+ struct fname *fname; -+ int ret; -+ -+ if (!info) { -+ info = create_dir_info(filp->f_pos); -+ if (!info) -+ return -ENOMEM; -+ filp->private_data = info; -+ } -+ -+ /* Some one has messed with f_pos; reset the world */ -+ if (info->last_pos != filp->f_pos) { -+ free_rb_tree_fname(&info->root); -+ info->curr_node = 0; -+ info->extra_fname = 0; -+ info->curr_hash = pos2maj_hash(filp->f_pos); -+ info->curr_minor_hash = pos2min_hash(filp->f_pos); -+ } -+ -+ /* -+ * If there are any leftover names on the hash collision -+ * chain, return them first. -+ */ -+ if (info->extra_fname && -+ call_filldir(filp, dirent, filldir, info->extra_fname)) -+ goto finished; -+ -+ if (!info->curr_node) -+ info->curr_node = rb_get_first(&info->root); -+ -+ while (1) { -+ /* -+ * Fill the rbtree if we have no more entries, -+ * or the inode has changed since we last read in the -+ * cached entries. 
-+ */ -+ if ((!info->curr_node) || -+ (filp->f_version != inode->i_version)) { -+ info->curr_node = 0; -+ free_rb_tree_fname(&info->root); -+ filp->f_version = inode->i_version; -+ ret = ext3_htree_fill_tree(filp, info->curr_hash, -+ info->curr_minor_hash, -+ &info->next_hash); -+ if (ret < 0) -+ return ret; -+ if (ret == 0) -+ break; -+ info->curr_node = rb_get_first(&info->root); -+ } -+ -+ fname = rb_entry(info->curr_node, struct fname, rb_hash); -+ info->curr_hash = fname->hash; -+ info->curr_minor_hash = fname->minor_hash; -+ if (call_filldir(filp, dirent, filldir, fname)) -+ break; -+ -+ info->curr_node = rb_get_next(info->curr_node); -+ if (!info->curr_node) { -+ info->curr_hash = info->next_hash; -+ info->curr_minor_hash = 0; -+ } -+ } -+finished: -+ info->last_pos = filp->f_pos; -+ UPDATE_ATIME(inode); -+ return 0; -+} -+#endif -Index: linux-2.4.19.SuSE/fs/ext3/namei.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/namei.c 2002-12-04 09:46:03.000000000 -0800 -+++ linux-2.4.19.SuSE/fs/ext3/namei.c 2004-05-27 11:08:52.000000000 -0700 -@@ -16,6 +16,12 @@ - * David S. 
Miller (davem@caip.rutgers.edu), 1995 - * Directory entry file type support and forward compatibility hooks - * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 -+ * Hash Tree Directory indexing (c) -+ * Daniel Phillips, 2001 -+ * Hash Tree Directory indexing porting -+ * Christopher Li, 2002 -+ * Hash Tree Directory indexing cleanup -+ * Theodore Ts'o, 2002 - */ - - #include -@@ -40,6 +46,630 @@ - #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) - #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) - -+static struct buffer_head *ext3_append(handle_t *handle, -+ struct inode *inode, -+ u32 *block, int *err) -+{ -+ struct buffer_head *bh; -+ -+ *block = inode->i_size >> inode->i_sb->s_blocksize_bits; -+ -+ if ((bh = ext3_bread(handle, inode, *block, 1, err))) { -+ inode->i_size += inode->i_sb->s_blocksize; -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ ext3_journal_get_write_access(handle,bh); -+ } -+ return bh; -+} -+ -+#ifndef assert -+#define assert(test) J_ASSERT(test) -+#endif -+ -+#ifndef swap -+#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) -+#endif -+ -+typedef struct { u32 v; } le_u32; -+typedef struct { u16 v; } le_u16; -+ -+#ifdef DX_DEBUG -+#define dxtrace(command) command -+#else -+#define dxtrace(command) -+#endif -+ -+struct fake_dirent -+{ -+ /*le*/u32 inode; -+ /*le*/u16 rec_len; -+ u8 name_len; -+ u8 file_type; -+}; -+ -+struct dx_countlimit -+{ -+ le_u16 limit; -+ le_u16 count; -+}; -+ -+struct dx_entry -+{ -+ le_u32 hash; -+ le_u32 block; -+}; -+ -+/* -+ * dx_root_info is laid out so that if it should somehow get overlaid by a -+ * dirent the two low bits of the hash version will be zero. Therefore, the -+ * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
-+ */ -+ -+struct dx_root -+{ -+ struct fake_dirent dot; -+ char dot_name[4]; -+ struct fake_dirent dotdot; -+ char dotdot_name[4]; -+ struct dx_root_info -+ { -+ le_u32 reserved_zero; -+ u8 hash_version; -+ u8 info_length; /* 8 */ -+ u8 indirect_levels; -+ u8 unused_flags; -+ } -+ info; -+ struct dx_entry entries[0]; -+}; -+ -+struct dx_node -+{ -+ struct fake_dirent fake; -+ struct dx_entry entries[0]; -+}; -+ -+ -+struct dx_frame -+{ -+ struct buffer_head *bh; -+ struct dx_entry *entries; -+ struct dx_entry *at; -+}; -+ -+struct dx_map_entry -+{ -+ u32 hash; -+ u32 offs; -+}; -+ -+#ifdef CONFIG_EXT3_INDEX -+static inline unsigned dx_get_block (struct dx_entry *entry); -+static void dx_set_block (struct dx_entry *entry, unsigned value); -+static inline unsigned dx_get_hash (struct dx_entry *entry); -+static void dx_set_hash (struct dx_entry *entry, unsigned value); -+static unsigned dx_get_count (struct dx_entry *entries); -+static unsigned dx_get_limit (struct dx_entry *entries); -+static void dx_set_count (struct dx_entry *entries, unsigned value); -+static void dx_set_limit (struct dx_entry *entries, unsigned value); -+static unsigned dx_root_limit (struct inode *dir, unsigned infosize); -+static unsigned dx_node_limit (struct inode *dir); -+static struct dx_frame *dx_probe(struct dentry *dentry, -+ struct inode *dir, -+ struct dx_hash_info *hinfo, -+ struct dx_frame *frame, -+ int *err); -+static void dx_release (struct dx_frame *frames); -+static int dx_make_map (struct ext3_dir_entry_2 *de, int size, -+ struct dx_hash_info *hinfo, struct dx_map_entry map[]); -+static void dx_sort_map(struct dx_map_entry *map, unsigned count); -+static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, -+ struct dx_map_entry *offsets, int count); -+static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); -+static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); -+static int ext3_htree_next_block(struct inode *dir, __u32 hash, -+ 
struct dx_frame *frame, -+ struct dx_frame *frames, int *err, -+ __u32 *start_hash); -+static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, -+ struct ext3_dir_entry_2 **res_dir, int *err); -+static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode); -+ -+/* -+ * Future: use high four bits of block for coalesce-on-delete flags -+ * Mask them off for now. -+ */ -+ -+static inline unsigned dx_get_block (struct dx_entry *entry) -+{ -+ return le32_to_cpu(entry->block.v) & 0x00ffffff; -+} -+ -+static inline void dx_set_block (struct dx_entry *entry, unsigned value) -+{ -+ entry->block.v = cpu_to_le32(value); -+} -+ -+static inline unsigned dx_get_hash (struct dx_entry *entry) -+{ -+ return le32_to_cpu(entry->hash.v); -+} -+ -+static inline void dx_set_hash (struct dx_entry *entry, unsigned value) -+{ -+ entry->hash.v = cpu_to_le32(value); -+} -+ -+static inline unsigned dx_get_count (struct dx_entry *entries) -+{ -+ return le16_to_cpu(((struct dx_countlimit *) entries)->count.v); -+} -+ -+static inline unsigned dx_get_limit (struct dx_entry *entries) -+{ -+ return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v); -+} -+ -+static inline void dx_set_count (struct dx_entry *entries, unsigned value) -+{ -+ ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value); -+} -+ -+static inline void dx_set_limit (struct dx_entry *entries, unsigned value) -+{ -+ ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value); -+} -+ -+static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) -+{ -+ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - -+ EXT3_DIR_REC_LEN(2) - infosize; -+ return 0? 20: entry_space / sizeof(struct dx_entry); -+} -+ -+static inline unsigned dx_node_limit (struct inode *dir) -+{ -+ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); -+ return 0? 
22: entry_space / sizeof(struct dx_entry); -+} -+ -+/* -+ * Debug -+ */ -+#ifdef DX_DEBUG -+struct stats -+{ -+ unsigned names; -+ unsigned space; -+ unsigned bcount; -+}; -+ -+static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de, -+ int size, int show_names) -+{ -+ unsigned names = 0, space = 0; -+ char *base = (char *) de; -+ struct dx_hash_info h = *hinfo; -+ -+ printk("names: "); -+ while ((char *) de < base + size) -+ { -+ if (de->inode) -+ { -+ if (show_names) -+ { -+ int len = de->name_len; -+ char *name = de->name; -+ while (len--) printk("%c", *name++); -+ ext3fs_dirhash(de->name, de->name_len, &h); -+ printk(":%x.%u ", h.hash, -+ ((char *) de - base)); -+ } -+ space += EXT3_DIR_REC_LEN(de->name_len); -+ names++; -+ } -+ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ printk("(%i)\n", names); -+ return (struct stats) { names, space, 1 }; -+} -+ -+struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, -+ struct dx_entry *entries, int levels) -+{ -+ unsigned blocksize = dir->i_sb->s_blocksize; -+ unsigned count = dx_get_count (entries), names = 0, space = 0, i; -+ unsigned bcount = 0; -+ struct buffer_head *bh; -+ int err; -+ printk("%i indexed blocks...\n", count); -+ for (i = 0; i < count; i++, entries++) -+ { -+ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; -+ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; -+ struct stats stats; -+ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); -+ if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue; -+ stats = levels? 
-+ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): -+ dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0); -+ names += stats.names; -+ space += stats.space; -+ bcount += stats.bcount; -+ brelse (bh); -+ } -+ if (bcount) -+ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", -+ names, space/bcount,(space/bcount)*100/blocksize); -+ return (struct stats) { names, space, bcount}; -+} -+#endif /* DX_DEBUG */ -+ -+/* -+ * Probe for a directory leaf block to search. -+ * -+ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format -+ * error in the directory index, and the caller should fall back to -+ * searching the directory normally. The callers of dx_probe **MUST** -+ * check for this error code, and make sure it never gets reflected -+ * back to userspace. -+ */ -+static struct dx_frame * -+dx_probe(struct dentry *dentry, struct inode *dir, -+ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) -+{ -+ unsigned count, indirect; -+ struct dx_entry *at, *entries, *p, *q, *m; -+ struct dx_root *root; -+ struct buffer_head *bh; -+ struct dx_frame *frame = frame_in; -+ u32 hash; -+ -+ frame->bh = NULL; -+ if (dentry) -+ dir = dentry->d_parent->d_inode; -+ if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) -+ goto fail; -+ root = (struct dx_root *) bh->b_data; -+ if (root->info.hash_version != DX_HASH_TEA && -+ root->info.hash_version != DX_HASH_HALF_MD4 && -+ root->info.hash_version != DX_HASH_LEGACY) { -+ ext3_warning(dir->i_sb, __FUNCTION__, -+ "Unrecognised inode hash code %d", -+ root->info.hash_version); -+ brelse(bh); -+ *err = ERR_BAD_DX_DIR; -+ goto fail; -+ } -+ hinfo->hash_version = root->info.hash_version; -+ hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed; -+ if (dentry) -+ ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); -+ hash = hinfo->hash; -+ -+ if (root->info.unused_flags & 1) { -+ ext3_warning(dir->i_sb, __FUNCTION__, -+ "Unimplemented inode hash flags: 
%#06x", -+ root->info.unused_flags); -+ brelse(bh); -+ *err = ERR_BAD_DX_DIR; -+ goto fail; -+ } -+ -+ if ((indirect = root->info.indirect_levels) > 1) { -+ ext3_warning(dir->i_sb, __FUNCTION__, -+ "Unimplemented inode hash depth: %#06x", -+ root->info.indirect_levels); -+ brelse(bh); -+ *err = ERR_BAD_DX_DIR; -+ goto fail; -+ } -+ -+ entries = (struct dx_entry *) (((char *)&root->info) + -+ root->info.info_length); -+ assert(dx_get_limit(entries) == dx_root_limit(dir, -+ root->info.info_length)); -+ dxtrace (printk("Look up %x", hash)); -+ while (1) -+ { -+ count = dx_get_count(entries); -+ assert (count && count <= dx_get_limit(entries)); -+ p = entries + 1; -+ q = entries + count - 1; -+ while (p <= q) -+ { -+ m = p + (q - p)/2; -+ dxtrace(printk(".")); -+ if (dx_get_hash(m) > hash) -+ q = m - 1; -+ else -+ p = m + 1; -+ } -+ -+ if (0) // linear search cross check -+ { -+ unsigned n = count - 1; -+ at = entries; -+ while (n--) -+ { -+ dxtrace(printk(",")); -+ if (dx_get_hash(++at) > hash) -+ { -+ at--; -+ break; -+ } -+ } -+ assert (at == p - 1); -+ } -+ -+ at = p - 1; -+ dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); -+ frame->bh = bh; -+ frame->entries = entries; -+ frame->at = at; -+ if (!indirect--) return frame; -+ if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) -+ goto fail2; -+ at = entries = ((struct dx_node *) bh->b_data)->entries; -+ assert (dx_get_limit(entries) == dx_node_limit (dir)); -+ frame++; -+ } -+fail2: -+ while (frame >= frame_in) { -+ brelse(frame->bh); -+ frame--; -+ } -+fail: -+ return NULL; -+} -+ -+static void dx_release (struct dx_frame *frames) -+{ -+ if (frames[0].bh == NULL) -+ return; -+ -+ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) -+ brelse(frames[1].bh); -+ brelse(frames[0].bh); -+} -+ -+/* -+ * This function increments the frame pointer to search the next leaf -+ * block, and reads in the necessary intervening nodes if the search -+ * should be necessary. Whether or not the search is necessary is -+ * controlled by the hash parameter. If the hash value is even, then -+ * the search is only continued if the next block starts with that -+ * hash value. This is used if we are searching for a specific file. -+ * -+ * If the hash value is HASH_NB_ALWAYS, then always go to the next block. -+ * -+ * This function returns 1 if the caller should continue to search, -+ * or 0 if it should not. If there is an error reading one of the -+ * index blocks, it will return -1. -+ * -+ * If start_hash is non-null, it will be filled in with the starting -+ * hash of the next page. -+ */ -+static int ext3_htree_next_block(struct inode *dir, __u32 hash, -+ struct dx_frame *frame, -+ struct dx_frame *frames, int *err, -+ __u32 *start_hash) -+{ -+ struct dx_frame *p; -+ struct buffer_head *bh; -+ int num_frames = 0; -+ __u32 bhash; -+ -+ *err = ENOENT; -+ p = frame; -+ /* -+ * Find the next leaf page by incrementing the frame pointer. -+ * If we run out of entries in the interior node, loop around and -+ * increment pointer in the parent node. 
When we break out of -+ * this loop, num_frames indicates the number of interior -+ * nodes need to be read. -+ */ -+ while (1) { -+ if (++(p->at) < p->entries + dx_get_count(p->entries)) -+ break; -+ if (p == frames) -+ return 0; -+ num_frames++; -+ p--; -+ } -+ -+ /* -+ * If the hash is 1, then continue only if the next page has a -+ * continuation hash of any value. This is used for readdir -+ * handling. Otherwise, check to see if the hash matches the -+ * desired contiuation hash. If it doesn't, return since -+ * there's no point to read in the successive index pages. -+ */ -+ bhash = dx_get_hash(p->at); -+ if (start_hash) -+ *start_hash = bhash; -+ if ((hash & 1) == 0) { -+ if ((bhash & ~1) != hash) -+ return 0; -+ } -+ /* -+ * If the hash is HASH_NB_ALWAYS, we always go to the next -+ * block so no check is necessary -+ */ -+ while (num_frames--) { -+ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), -+ 0, err))) -+ return -1; /* Failure */ -+ p++; -+ brelse (p->bh); -+ p->bh = bh; -+ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; -+ } -+ return 1; -+} -+ -+ -+/* -+ * p is at least 6 bytes before the end of page -+ */ -+static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p) -+{ -+ return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len)); -+} -+ -+/* -+ * This function fills a red-black tree with information from a -+ * directory. We start scanning the directory in hash order, starting -+ * at start_hash and start_minor_hash. -+ * -+ * This function returns the number of entries inserted into the tree, -+ * or a negative error code. 
-+ */ -+int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, -+ __u32 start_minor_hash, __u32 *next_hash) -+{ -+ struct dx_hash_info hinfo; -+ struct buffer_head *bh; -+ struct ext3_dir_entry_2 *de, *top; -+ static struct dx_frame frames[2], *frame; -+ struct inode *dir; -+ int block, err; -+ int count = 0; -+ int ret; -+ __u32 hashval; -+ -+ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, -+ start_minor_hash)); -+ dir = dir_file->f_dentry->d_inode; -+ hinfo.hash = start_hash; -+ hinfo.minor_hash = 0; -+ frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err); -+ if (!frame) -+ return err; -+ -+ while (1) { -+ block = dx_get_block(frame->at); -+ dxtrace(printk("Reading block %d\n", block)); -+ if (!(bh = ext3_bread (NULL, dir, block, 0, &err))) -+ goto errout; -+ -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - -+ EXT3_DIR_REC_LEN(0)); -+ for (; de < top; de = ext3_next_entry(de)) { -+ ext3fs_dirhash(de->name, de->name_len, &hinfo); -+ if ((hinfo.hash < start_hash) || -+ ((hinfo.hash == start_hash) && -+ (hinfo.minor_hash < start_minor_hash))) -+ continue; -+ ext3_htree_store_dirent(dir_file, hinfo.hash, -+ hinfo.minor_hash, de); -+ count++; -+ } -+ brelse (bh); -+ hashval = ~1; -+ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, -+ frame, frames, &err, &hashval); -+ if (next_hash) -+ *next_hash = hashval; -+ if (ret == -1) -+ goto errout; -+ /* -+ * Stop if: (a) there are no more entries, or -+ * (b) we have inserted at least one entry and the -+ * next hash value is not a continuation -+ */ -+ if ((ret == 0) || -+ (count && ((hashval & 1) == 0))) -+ break; -+ } -+ dx_release(frames); -+ dxtrace(printk("Fill tree: returned %d entries\n", count)); -+ return count; -+errout: -+ dx_release(frames); -+ return (err); -+} -+ -+ -+/* -+ * Directory block splitting, compacting -+ */ -+ -+static int dx_make_map (struct ext3_dir_entry_2 *de, int 
size, -+ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) -+{ -+ int count = 0; -+ char *base = (char *) de; -+ struct dx_hash_info h = *hinfo; -+ -+ while ((char *) de < base + size) -+ { -+ if (de->name_len && de->inode) { -+ ext3fs_dirhash(de->name, de->name_len, &h); -+ map_tail--; -+ map_tail->hash = h.hash; -+ map_tail->offs = (u32) ((char *) de - base); -+ count++; -+ } -+ /* XXX: do we need to check rec_len == 0 case? -Chris */ -+ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ return count; -+} -+ -+static void dx_sort_map (struct dx_map_entry *map, unsigned count) -+{ -+ struct dx_map_entry *p, *q, *top = map + count - 1; -+ int more; -+ /* Combsort until bubble sort doesn't suck */ -+ while (count > 2) -+ { -+ count = count*10/13; -+ if (count - 9 < 2) /* 9, 10 -> 11 */ -+ count = 11; -+ for (p = top, q = p - count; q >= map; p--, q--) -+ if (p->hash < q->hash) -+ swap(*p, *q); -+ } -+ /* Garden variety bubble sort */ -+ do { -+ more = 0; -+ q = top; -+ while (q-- > map) -+ { -+ if (q[1].hash >= q[0].hash) -+ continue; -+ swap(*(q+1), *q); -+ more = 1; -+ } -+ } while(more); -+} -+ -+static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) -+{ -+ struct dx_entry *entries = frame->entries; -+ struct dx_entry *old = frame->at, *new = old + 1; -+ int count = dx_get_count(entries); -+ -+ assert(count < dx_get_limit(entries)); -+ assert(old < entries + count); -+ memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); -+ dx_set_hash(new, hash); -+ dx_set_block(new, block); -+ dx_set_count(entries, count + 1); -+} -+#endif -+ -+ -+static void ext3_update_dx_flag(struct inode *inode) -+{ -+ if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb, -+ EXT3_FEATURE_COMPAT_DIR_INDEX)) -+ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; -+} -+ - /* - * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. 
- * -@@ -96,6 +726,7 @@ - return 0; - } - -+ - /* - * ext3_find_entry() - * -@@ -107,6 +738,8 @@ - * The returned buffer_head has ->b_count elevated. The caller is expected - * to brelse() it when appropriate. - */ -+ -+ - static struct buffer_head * ext3_find_entry (struct dentry *dentry, - struct ext3_dir_entry_2 ** res_dir) - { -@@ -121,12 +754,32 @@ - int num = 0; - int nblocks, i, err; - struct inode *dir = dentry->d_parent->d_inode; -+ int namelen; -+ const u8 *name; -+ unsigned blocksize; - - *res_dir = NULL; - sb = dir->i_sb; -- -+ blocksize = sb->s_blocksize; -+ namelen = dentry->d_name.len; -+ name = dentry->d_name.name; -+ if (namelen > EXT3_NAME_LEN) -+ return NULL; -+#ifdef CONFIG_EXT3_INDEX -+ if (is_dx(dir)) { -+ bh = ext3_dx_find_entry(dentry, res_dir, &err); -+ /* -+ * On success, or if the error was file not found, -+ * return. Otherwise, fall back to doing a search the -+ * old fashioned way. -+ */ -+ if (bh || (err != ERR_BAD_DX_DIR)) -+ return bh; -+ dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); -+ } -+#endif - nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); -- start = dir->u.ext3_i.i_dir_start_lookup; -+ start = EXT3_I(dir)->i_dir_start_lookup; - if (start >= nblocks) - start = 0; - block = start; -@@ -167,7 +820,7 @@ - i = search_dirblock(bh, dir, dentry, - block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); - if (i == 1) { -- dir->u.ext3_i.i_dir_start_lookup = block; -+ EXT3_I(dir)->i_dir_start_lookup = block; - ret = bh; - goto cleanup_and_exit; - } else { -@@ -198,6 +851,74 @@ - return ret; - } - -+#ifdef CONFIG_EXT3_INDEX -+static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, -+ struct ext3_dir_entry_2 **res_dir, int *err) -+{ -+ struct super_block * sb; -+ struct dx_hash_info hinfo; -+ u32 hash; -+ struct dx_frame frames[2], *frame; -+ struct ext3_dir_entry_2 *de, *top; -+ struct buffer_head *bh; -+ unsigned long block; -+ int retval; -+ int namelen = dentry->d_name.len; -+ const u8 *name = 
dentry->d_name.name; -+ struct inode *dir = dentry->d_parent->d_inode; -+ -+ sb = dir->i_sb; -+ /* NFS may look up ".." - look at dx_root directory block */ -+ if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ -+ if (!(frame = dx_probe(dentry, 0, &hinfo, frames, err))) -+ return NULL; -+ } else { -+ frame = frames; -+ frame->bh = NULL; /* for dx_release() */ -+ frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ -+ dx_set_block(frame->at, 0); /* dx_root block is 0 */ -+ } -+ hash = hinfo.hash; -+ do { -+ block = dx_get_block(frame->at); -+ if (!(bh = ext3_bread (NULL,dir, block, 0, err))) -+ goto errout; -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ top = (struct ext3_dir_entry_2 *)((char *)de + sb->s_blocksize - -+ EXT3_DIR_REC_LEN(0)); -+ for (; de < top; de = ext3_next_entry(de)) -+ if (ext3_match (namelen, name, de)) { -+ if (!ext3_check_dir_entry("ext3_find_entry", -+ dir, de, bh, -+ (block<b_data))) { -+ brelse (bh); -+ goto errout; -+ } -+ *res_dir = de; -+ dx_release (frames); -+ return bh; -+ } -+ brelse (bh); -+ /* Check to see if we should continue to search */ -+ retval = ext3_htree_next_block(dir, hash, frame, -+ frames, err, 0); -+ if (retval == -1) { -+ ext3_warning(sb, __FUNCTION__, -+ "error reading index page in directory #%lu", -+ dir->i_ino); -+ goto errout; -+ } -+ } while (retval == 1); -+ -+ *err = -ENOENT; -+errout: -+ dxtrace(printk("%s not found\n", name)); -+ dx_release (frames); -+ return NULL; -+} -+#endif -+ - static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) - { - struct inode * inode; -@@ -214,8 +927,9 @@ - brelse (bh); - inode = iget(dir->i_sb, ino); - -- if (!inode) -+ if (!inode) { - return ERR_PTR(-EACCES); -+ } - } - d_add(dentry, inode); - return NULL; -@@ -239,6 +953,301 @@ - de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; - } - -+#ifdef CONFIG_EXT3_INDEX -+static struct ext3_dir_entry_2 * -+dx_move_dirents(char *from, char *to, struct dx_map_entry 
*map, int count) -+{ -+ unsigned rec_len = 0; -+ -+ while (count--) { -+ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); -+ rec_len = EXT3_DIR_REC_LEN(de->name_len); -+ memcpy (to, de, rec_len); -+ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); -+ de->inode = 0; -+ map++; -+ to += rec_len; -+ } -+ return (struct ext3_dir_entry_2 *) (to - rec_len); -+} -+ -+static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) -+{ -+ struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; -+ unsigned rec_len = 0; -+ -+ prev = to = de; -+ while ((char*)de < base + size) { -+ next = (struct ext3_dir_entry_2 *) ((char *) de + -+ le16_to_cpu(de->rec_len)); -+ if (de->inode && de->name_len) { -+ rec_len = EXT3_DIR_REC_LEN(de->name_len); -+ if (de > to) -+ memmove(to, de, rec_len); -+ to->rec_len = cpu_to_le16(rec_len); -+ prev = to; -+ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); -+ } -+ de = next; -+ } -+ return prev; -+} -+ -+static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, -+ struct buffer_head **bh,struct dx_frame *frame, -+ struct dx_hash_info *hinfo, int *error) -+{ -+ unsigned blocksize = dir->i_sb->s_blocksize; -+ unsigned count, continued; -+ struct buffer_head *bh2; -+ u32 newblock; -+ u32 hash2; -+ struct dx_map_entry *map; -+ char *data1 = (*bh)->b_data, *data2; -+ unsigned split; -+ struct ext3_dir_entry_2 *de = NULL, *de2; -+ int err; -+ -+ bh2 = ext3_append (handle, dir, &newblock, error); -+ if (!(bh2)) { -+ brelse(*bh); -+ *bh = NULL; -+ goto errout; -+ } -+ -+ BUFFER_TRACE(*bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, *bh); -+ if (err) { -+ journal_error: -+ brelse(*bh); -+ brelse(bh2); -+ *bh = NULL; -+ ext3_std_error(dir->i_sb, err); -+ goto errout; -+ } -+ BUFFER_TRACE(frame->bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, frame->bh); -+ if (err) -+ goto journal_error; -+ -+ data2 = 
bh2->b_data; -+ -+ /* create map in the end of data2 block */ -+ map = (struct dx_map_entry *) (data2 + blocksize); -+ count = dx_make_map ((struct ext3_dir_entry_2 *) data1, -+ blocksize, hinfo, map); -+ map -= count; -+ split = count/2; // need to adjust to actual middle -+ dx_sort_map (map, count); -+ hash2 = map[split].hash; -+ continued = hash2 == map[split - 1].hash; -+ dxtrace(printk("Split block %i at %x, %i/%i\n", -+ dx_get_block(frame->at), hash2, split, count-split)); -+ -+ /* Fancy dance to stay within two buffers */ -+ de2 = dx_move_dirents(data1, data2, map + split, count - split); -+ de = dx_pack_dirents(data1,blocksize); -+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); -+ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); -+ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); -+ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); -+ -+ /* Which block gets the new entry? */ -+ if (hinfo->hash >= hash2) -+ { -+ swap(*bh, bh2); -+ de = de2; -+ } -+ dx_insert_block (frame, hash2 + continued, newblock); -+ err = ext3_journal_dirty_metadata (handle, bh2); -+ if (err) -+ goto journal_error; -+ err = ext3_journal_dirty_metadata (handle, frame->bh); -+ if (err) -+ goto journal_error; -+ brelse (bh2); -+ dxtrace(dx_show_index ("frame", frame->entries)); -+errout: -+ return de; -+} -+#endif -+ -+ -+/* -+ * Add a new entry into a directory (leaf) block. If de is non-NULL, -+ * it points to a directory entry which is guaranteed to be large -+ * enough for new directory entry. If de is NULL, then -+ * add_dirent_to_buf will attempt search the directory block for -+ * space. It will return -ENOSPC if no space is available, and -EIO -+ * and -EEXIST if directory entry already exists. -+ * -+ * NOTE! bh is NOT released in the case where ENOSPC is returned. In -+ * all other cases bh is released. 
-+ */ -+static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, -+ struct inode *inode, struct ext3_dir_entry_2 *de, -+ struct buffer_head * bh) -+{ -+ struct inode *dir = dentry->d_parent->d_inode; -+ const char *name = dentry->d_name.name; -+ int namelen = dentry->d_name.len; -+ unsigned long offset = 0; -+ unsigned short reclen; -+ int nlen, rlen, err; -+ char *top; -+ -+ reclen = EXT3_DIR_REC_LEN(namelen); -+ if (!de) { -+ de = (struct ext3_dir_entry_2 *)bh->b_data; -+ top = bh->b_data + dir->i_sb->s_blocksize - reclen; -+ while ((char *) de <= top) { -+ if (!ext3_check_dir_entry("ext3_add_entry", dir, de, -+ bh, offset)) { -+ brelse (bh); -+ return -EIO; -+ } -+ if (ext3_match (namelen, name, de)) { -+ brelse (bh); -+ return -EEXIST; -+ } -+ nlen = EXT3_DIR_REC_LEN(de->name_len); -+ rlen = le16_to_cpu(de->rec_len); -+ if ((de->inode? rlen - nlen: rlen) >= reclen) -+ break; -+ de = (struct ext3_dir_entry_2 *)((char *)de + rlen); -+ offset += rlen; -+ } -+ if ((char *) de > top) -+ return -ENOSPC; -+ } -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) { -+ ext3_std_error(dir->i_sb, err); -+ brelse(bh); -+ return err; -+ } -+ -+ /* By now the buffer is marked for journaling */ -+ nlen = EXT3_DIR_REC_LEN(de->name_len); -+ rlen = le16_to_cpu(de->rec_len); -+ if (de->inode) { -+ struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); -+ de1->rec_len = cpu_to_le16(rlen - nlen); -+ de->rec_len = cpu_to_le16(nlen); -+ de = de1; -+ } -+ de->file_type = EXT3_FT_UNKNOWN; -+ if (inode) { -+ de->inode = cpu_to_le32(inode->i_ino); -+ ext3_set_de_type(dir->i_sb, de, inode->i_mode); -+ } else -+ de->inode = 0; -+ de->name_len = namelen; -+ memcpy (de->name, name, namelen); -+ /* -+ * XXX shouldn't update any times until successful -+ * completion of syscall, but too many callers depend -+ * on this. 
-+ * -+ * XXX similarly, too many callers depend on -+ * ext3_new_inode() setting the times, but error -+ * recovery deletes the inode, so the worst that can -+ * happen is that the times are slightly out of date -+ * and/or different from the directory change time. -+ */ -+ dir->i_mtime = dir->i_ctime = CURRENT_TIME; -+ ext3_update_dx_flag(dir); -+ dir->i_version = ++event; -+ ext3_mark_inode_dirty(handle, dir); -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) -+ ext3_std_error(dir->i_sb, err); -+ brelse(bh); -+ return 0; -+} -+ -+#ifdef CONFIG_EXT3_INDEX -+/* -+ * This converts a one block unindexed directory to a 3 block indexed -+ * directory, and adds the dentry to the indexed directory. -+ */ -+static int make_indexed_dir(handle_t *handle, struct dentry *dentry, -+ struct inode *inode, struct buffer_head *bh) -+{ -+ struct inode *dir = dentry->d_parent->d_inode; -+ const char *name = dentry->d_name.name; -+ int namelen = dentry->d_name.len; -+ struct buffer_head *bh2; -+ struct dx_root *root; -+ struct dx_frame frames[2], *frame; -+ struct dx_entry *entries; -+ struct ext3_dir_entry_2 *de, *de2; -+ char *data1, *top; -+ unsigned len; -+ int retval; -+ unsigned blocksize; -+ struct dx_hash_info hinfo; -+ u32 block; -+ -+ blocksize = dir->i_sb->s_blocksize; -+ dxtrace(printk("Creating index\n")); -+ retval = ext3_journal_get_write_access(handle, bh); -+ if (retval) { -+ ext3_std_error(dir->i_sb, retval); -+ brelse(bh); -+ return retval; -+ } -+ root = (struct dx_root *) bh->b_data; -+ -+ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; -+ bh2 = ext3_append (handle, dir, &block, &retval); -+ if (!(bh2)) { -+ brelse(bh); -+ return retval; -+ } -+ data1 = bh2->b_data; -+ -+ /* The 0th block becomes the root, move the dirents out */ -+ de = (struct ext3_dir_entry_2 *)&root->dotdot; -+ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); -+ len = ((char *) root) + blocksize - (char *) 
de; -+ memcpy (data1, de, len); -+ de = (struct ext3_dir_entry_2 *) data1; -+ top = data1 + len; -+ while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top) -+ de = de2; -+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); -+ /* Initialize the root; the dot dirents already exist */ -+ de = (struct ext3_dir_entry_2 *) (&root->dotdot); -+ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2)); -+ memset (&root->info, 0, sizeof(root->info)); -+ root->info.info_length = sizeof(root->info); -+ root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version; -+ entries = root->entries; -+ dx_set_block (entries, 1); -+ dx_set_count (entries, 1); -+ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); -+ -+ /* Initialize as for dx_probe */ -+ hinfo.hash_version = root->info.hash_version; -+ hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed; -+ ext3fs_dirhash(name, namelen, &hinfo); -+ frame = frames; -+ frame->entries = entries; -+ frame->at = entries; -+ frame->bh = bh; -+ bh = bh2; -+ de = do_split(handle,dir, &bh, frame, &hinfo, &retval); -+ dx_release (frames); -+ if (!(de)) -+ return retval; -+ -+ return add_dirent_to_buf(handle, dentry, inode, de, bh); -+} -+#endif -+ - /* - * ext3_add_entry() - * -@@ -249,127 +1258,198 @@ - * may not sleep between calling this and putting something into - * the entry, as someone else might have used it while you slept. 
- */ -- --/* -- * AKPM: the journalling code here looks wrong on the error paths -- */ - static int ext3_add_entry (handle_t *handle, struct dentry *dentry, - struct inode *inode) - { - struct inode *dir = dentry->d_parent->d_inode; -- const char *name = dentry->d_name.name; -- int namelen = dentry->d_name.len; - unsigned long offset; -- unsigned short rec_len; - struct buffer_head * bh; -- struct ext3_dir_entry_2 * de, * de1; -+ struct ext3_dir_entry_2 *de; - struct super_block * sb; - int retval; -+#ifdef CONFIG_EXT3_INDEX -+ int dx_fallback=0; -+#endif -+ unsigned blocksize; -+ unsigned nlen, rlen; -+ u32 block, blocks; - - sb = dir->i_sb; -- -- if (!namelen) -+ blocksize = sb->s_blocksize; -+ if (!dentry->d_name.len) - return -EINVAL; -- bh = ext3_bread (handle, dir, 0, 0, &retval); -+#ifdef CONFIG_EXT3_INDEX -+ if (is_dx(dir)) { -+ retval = ext3_dx_add_entry(handle, dentry, inode); -+ if (!retval || (retval != ERR_BAD_DX_DIR)) -+ return retval; -+ EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL; -+ dx_fallback++; -+ ext3_mark_inode_dirty(handle, dir); -+ } -+#endif -+ blocks = dir->i_size >> sb->s_blocksize_bits; -+ for (block = 0, offset = 0; block < blocks; block++) { -+ bh = ext3_bread(handle, dir, block, 0, &retval); -+ if(!bh) -+ return retval; -+ retval = add_dirent_to_buf(handle, dentry, inode, 0, bh); -+ if (retval != -ENOSPC) -+ return retval; -+ -+#ifdef CONFIG_EXT3_INDEX -+ if (blocks == 1 && !dx_fallback && -+ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) -+ return make_indexed_dir(handle, dentry, inode, bh); -+#endif -+ brelse(bh); -+ } -+ bh = ext3_append(handle, dir, &block, &retval); - if (!bh) - return retval; -- rec_len = EXT3_DIR_REC_LEN(namelen); -- offset = 0; - de = (struct ext3_dir_entry_2 *) bh->b_data; -- while (1) { -- if ((char *)de >= sb->s_blocksize + bh->b_data) { -- brelse (bh); -- bh = NULL; -- bh = ext3_bread (handle, dir, -- offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval); -- if (!bh) -- return retval; -- if (dir->i_size 
<= offset) { -- if (dir->i_size == 0) { -- brelse(bh); -- return -ENOENT; -- } -- -- ext3_debug ("creating next block\n"); -- -- BUFFER_TRACE(bh, "get_write_access"); -- ext3_journal_get_write_access(handle, bh); -- de = (struct ext3_dir_entry_2 *) bh->b_data; -- de->inode = 0; -- de->rec_len = le16_to_cpu(sb->s_blocksize); -- dir->u.ext3_i.i_disksize = -- dir->i_size = offset + sb->s_blocksize; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -- ext3_mark_inode_dirty(handle, dir); -- } else { -- -- ext3_debug ("skipping to next block\n"); -+ de->inode = 0; -+ de->rec_len = cpu_to_le16(rlen = blocksize); -+ nlen = 0; -+ return add_dirent_to_buf(handle, dentry, inode, de, bh); -+} - -- de = (struct ext3_dir_entry_2 *) bh->b_data; -- } -- } -- if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh, -- offset)) { -- brelse (bh); -- return -ENOENT; -- } -- if (ext3_match (namelen, name, de)) { -- brelse (bh); -- return -EEXIST; -+#ifdef CONFIG_EXT3_INDEX -+/* -+ * Returns 0 for success, or a negative error value -+ */ -+static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode) -+{ -+ struct dx_frame frames[2], *frame; -+ struct dx_entry *entries, *at; -+ struct dx_hash_info hinfo; -+ struct buffer_head * bh; -+ struct inode *dir = dentry->d_parent->d_inode; -+ struct super_block * sb = dir->i_sb; -+ struct ext3_dir_entry_2 *de; -+ int err; -+ -+ frame = dx_probe(dentry, 0, &hinfo, frames, &err); -+ if (!frame) -+ return err; -+ entries = frame->entries; -+ at = frame->at; -+ -+ if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) -+ goto cleanup; -+ -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ goto journal_error; -+ -+ err = add_dirent_to_buf(handle, dentry, inode, 0, bh); -+ if (err != -ENOSPC) { -+ bh = 0; -+ goto cleanup; -+ } -+ -+ /* Block full, should compress but for now just split */ -+ dxtrace(printk("using %u of %u node entries\n", -+ 
dx_get_count(entries), dx_get_limit(entries))); -+ /* Need to split index? */ -+ if (dx_get_count(entries) == dx_get_limit(entries)) { -+ u32 newblock; -+ unsigned icount = dx_get_count(entries); -+ int levels = frame - frames; -+ struct dx_entry *entries2; -+ struct dx_node *node2; -+ struct buffer_head *bh2; -+ -+ if (levels && (dx_get_count(frames->entries) == -+ dx_get_limit(frames->entries))) { -+ ext3_warning(sb, __FUNCTION__, -+ "Directory index full!\n"); -+ err = -ENOSPC; -+ goto cleanup; - } -- if ((le32_to_cpu(de->inode) == 0 && -- le16_to_cpu(de->rec_len) >= rec_len) || -- (le16_to_cpu(de->rec_len) >= -- EXT3_DIR_REC_LEN(de->name_len) + rec_len)) { -- BUFFER_TRACE(bh, "get_write_access"); -- ext3_journal_get_write_access(handle, bh); -- /* By now the buffer is marked for journaling */ -- offset += le16_to_cpu(de->rec_len); -- if (le32_to_cpu(de->inode)) { -- de1 = (struct ext3_dir_entry_2 *) ((char *) de + -- EXT3_DIR_REC_LEN(de->name_len)); -- de1->rec_len = -- cpu_to_le16(le16_to_cpu(de->rec_len) - -- EXT3_DIR_REC_LEN(de->name_len)); -- de->rec_len = cpu_to_le16( -- EXT3_DIR_REC_LEN(de->name_len)); -- de = de1; -+ -+ bh2 = ext3_append (handle, dir, &newblock, &err); -+ if (!(bh2)) -+ goto cleanup; -+ node2 = (struct dx_node *)(bh2->b_data); -+ entries2 = node2->entries; -+ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); -+ node2->fake.inode = 0; -+ BUFFER_TRACE(frame->bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, frame->bh); -+ if (err) -+ goto journal_error; -+ if (levels) { -+ unsigned icount1 = icount/2, icount2 = icount - icount1; -+ unsigned hash2 = dx_get_hash(entries + icount1); -+ dxtrace(printk("Split index %i/%i\n", icount1, icount2)); -+ -+ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ -+ err = ext3_journal_get_write_access(handle, -+ frames[0].bh); -+ if (err) -+ goto journal_error; -+ -+ memcpy ((char *) entries2, (char *) (entries + icount1),+ icount2 * sizeof(struct dx_entry)); -+ 
dx_set_count (entries, icount1); -+ dx_set_count (entries2, icount2); -+ dx_set_limit (entries2, dx_node_limit(dir)); -+ -+ /* Which index block gets the new entry? */ -+ if (at - entries >= icount1) { -+ frame->at = at = at - entries - icount1 + entries2; -+ frame->entries = entries = entries2; -+ swap(frame->bh, bh2); - } -- de->file_type = EXT3_FT_UNKNOWN; -- if (inode) { -- de->inode = cpu_to_le32(inode->i_ino); -- ext3_set_de_type(dir->i_sb, de, inode->i_mode); -- } else -- de->inode = 0; -- de->name_len = namelen; -- memcpy (de->name, name, namelen); -- /* -- * XXX shouldn't update any times until successful -- * completion of syscall, but too many callers depend -- * on this. -- * -- * XXX similarly, too many callers depend on -- * ext3_new_inode() setting the times, but error -- * recovery deletes the inode, so the worst that can -- * happen is that the times are slightly out of date -- * and/or different from the directory change time. -- */ -- dir->i_mtime = dir->i_ctime = CURRENT_TIME; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -- ext3_mark_inode_dirty(handle, dir); -- dir->i_version = ++event; -- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -- ext3_journal_dirty_metadata(handle, bh); -- brelse(bh); -- return 0; -+ dx_insert_block (frames + 0, hash2, newblock); -+ dxtrace(dx_show_index ("node", frames[1].entries)); -+ dxtrace(dx_show_index ("node", -+ ((struct dx_node *) bh2->b_data)->entries)); -+ err = ext3_journal_dirty_metadata(handle, bh2); -+ if (err) -+ goto journal_error; -+ brelse (bh2); -+ } else { -+ dxtrace(printk("Creating second level index...\n")); -+ memcpy((char *) entries2, (char *) entries, -+ icount * sizeof(struct dx_entry)); -+ dx_set_limit(entries2, dx_node_limit(dir)); -+ -+ /* Set up root */ -+ dx_set_count(entries, 1); -+ dx_set_block(entries + 0, newblock); -+ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; -+ -+ /* Add new access path frame */ -+ frame = frames + 1; -+ frame->at = at = at - 
entries + entries2; -+ frame->entries = entries = entries2; -+ frame->bh = bh2; -+ err = ext3_journal_get_write_access(handle, -+ frame->bh); -+ if (err) -+ goto journal_error; - } -- offset += le16_to_cpu(de->rec_len); -- de = (struct ext3_dir_entry_2 *) -- ((char *) de + le16_to_cpu(de->rec_len)); -+ ext3_journal_dirty_metadata(handle, frames[0].bh); - } -- brelse (bh); -- return -ENOSPC; -+ de = do_split(handle, dir, &bh, frame, &hinfo, &err); -+ if (!de) -+ goto cleanup; -+ err = add_dirent_to_buf(handle, dentry, inode, de, bh); -+ bh = 0; -+ goto cleanup; -+ -+journal_error: -+ ext3_std_error(dir->i_sb, err); -+cleanup: -+ if (bh) -+ brelse(bh); -+ dx_release(frames); -+ return err; - } -+#endif - - /* - * ext3_delete_entry deletes a directory entry by merging it with the -@@ -453,9 +1533,11 @@ - struct inode * inode; - int err; - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -480,9 +1562,11 @@ - struct inode *inode; - int err; - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -512,9 +1596,11 @@ - if (dir->i_nlink >= EXT3_LINK_MAX) - return -EMLINK; - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -526,7 +1612,8 @@ - - inode->i_op = &ext3_dir_inode_operations; - inode->i_fop = &ext3_dir_operations; -- inode->i_size = 
inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize; -+ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; -+ inode->i_blocks = 0; - dir_block = ext3_bread (handle, inode, 0, 1, &err); - if (!dir_block) { - inode->i_nlink--; /* is this nlink == 0? */ -@@ -555,21 +1642,19 @@ - brelse (dir_block); - ext3_mark_inode_dirty(handle, inode); - err = ext3_add_entry (handle, dentry, inode); -- if (err) -- goto out_no_entry; -+ if (err) { -+ inode->i_nlink = 0; -+ ext3_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+ } - dir->i_nlink++; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - d_instantiate(dentry, inode); - out_stop: - ext3_journal_stop(handle, dir); - return err; -- --out_no_entry: -- inode->i_nlink = 0; -- ext3_mark_inode_dirty(handle, inode); -- iput (inode); -- goto out_stop; - } - - /* -@@ -656,7 +1741,7 @@ - int err = 0, rc; - - lock_super(sb); -- if (!list_empty(&inode->u.ext3_i.i_orphan)) -+ if (!list_empty(&EXT3_I(inode)->i_orphan)) - goto out_unlock; - - /* Orphan handling is only valid for files with data blocks -@@ -697,7 +1782,7 @@ - * This is safe: on error we're going to ignore the orphan list - * anyway on the next recovery. 
*/ - if (!err) -- list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); -+ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); - - jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); - jbd_debug(4, "orphan inode %ld will point to %d\n", -@@ -715,25 +1800,26 @@ - int ext3_orphan_del(handle_t *handle, struct inode *inode) - { - struct list_head *prev; -+ struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_sb_info *sbi; - ino_t ino_next; - struct ext3_iloc iloc; - int err = 0; - - lock_super(inode->i_sb); -- if (list_empty(&inode->u.ext3_i.i_orphan)) { -+ if (list_empty(&ei->i_orphan)) { - unlock_super(inode->i_sb); - return 0; - } - - ino_next = NEXT_ORPHAN(inode); -- prev = inode->u.ext3_i.i_orphan.prev; -+ prev = ei->i_orphan.prev; - sbi = EXT3_SB(inode->i_sb); - - jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino); - -- list_del(&inode->u.ext3_i.i_orphan); -- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); -+ list_del(&ei->i_orphan); -+ INIT_LIST_HEAD(&ei->i_orphan); - - /* If we're on an error path, we may not have a valid - * transaction handle with which to update the orphan list on -@@ -794,8 +1880,9 @@ - handle_t *handle; - - handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); -- if (IS_ERR(handle)) -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - retval = -ENOENT; - bh = ext3_find_entry (dentry, &de); -@@ -833,7 +1920,7 @@ - ext3_mark_inode_dirty(handle, inode); - dir->i_nlink--; - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - - end_rmdir: -@@ -851,8 +1938,9 @@ - handle_t *handle; - - handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); -- if (IS_ERR(handle)) -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -879,7 +1967,7 @@ - if (retval) - goto end_unlink; - dir->i_ctime = dir->i_mtime = CURRENT_TIME; -- 
dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - inode->i_nlink--; - if (!inode->i_nlink) -@@ -905,9 +1993,11 @@ - if (l > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -917,7 +2007,7 @@ - if (IS_ERR(inode)) - goto out_stop; - -- if (l > sizeof (inode->u.ext3_i.i_data)) { -+ if (l > sizeof (EXT3_I(inode)->i_data)) { - inode->i_op = &ext3_symlink_inode_operations; - inode->i_mapping->a_ops = &ext3_aops; - /* -@@ -926,25 +2016,23 @@ - * i_size in generic_commit_write(). - */ - err = block_symlink(inode, symname, l); -- if (err) -- goto out_no_entry; -+ if (err) { -+ ext3_dec_count(handle, inode); -+ ext3_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+ } - } else { - inode->i_op = &ext3_fast_symlink_inode_operations; -- memcpy((char*)&inode->u.ext3_i.i_data,symname,l); -+ memcpy((char*)&EXT3_I(inode)->i_data,symname,l); - inode->i_size = l-1; - } -- inode->u.ext3_i.i_disksize = inode->i_size; -+ EXT3_I(inode)->i_disksize = inode->i_size; - ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - out_stop: - ext3_journal_stop(handle, dir); - return err; -- --out_no_entry: -- ext3_dec_count(handle, inode); -- ext3_mark_inode_dirty(handle, inode); -- iput (inode); -- goto out_stop; - } - - static int ext3_link (struct dentry * old_dentry, -@@ -957,12 +2045,15 @@ - if (S_ISDIR(inode->i_mode)) - return -EPERM; - -- if (inode->i_nlink >= EXT3_LINK_MAX) -+ if (inode->i_nlink >= EXT3_LINK_MAX) { - return -EMLINK; -+ } - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ 
EXT3_INDEX_EXTRA_TRANS_BLOCKS); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -996,9 +2087,11 @@ - - old_bh = new_bh = dir_bh = NULL; - -- handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) - handle->h_sync = 1; -@@ -1078,7 +2171,7 @@ - new_inode->i_ctime = CURRENT_TIME; - } - old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; -- old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(old_dir); - if (dir_bh) { - BUFFER_TRACE(dir_bh, "get_write_access"); - ext3_journal_get_write_access(handle, dir_bh); -@@ -1090,7 +2183,7 @@ - new_inode->i_nlink--; - } else { - new_dir->i_nlink++; -- new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(new_dir); - ext3_mark_inode_dirty(handle, new_dir); - } - } -Index: linux-2.4.19.SuSE/fs/ext3/super.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/super.c 2004-05-27 11:07:21.000000000 -0700 -+++ linux-2.4.19.SuSE/fs/ext3/super.c 2004-05-27 11:08:28.000000000 -0700 -@@ -741,6 +741,7 @@ - es->s_mtime = cpu_to_le32(CURRENT_TIME); - ext3_update_dynamic_rev(sb); - EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -+ - ext3_commit_super (sb, es, 1); - if (test_opt (sb, DEBUG)) - printk (KERN_INFO -@@ -751,6 +752,7 @@ - EXT3_BLOCKS_PER_GROUP(sb), - EXT3_INODES_PER_GROUP(sb), - sbi->s_mount_opt); -+ - printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ", - bdevname(sb->s_dev)); - if (EXT3_SB(sb)->s_journal->j_inode == NULL) { -@@ -925,6 +927,7 @@ - return res; - } - -+ - struct super_block * ext3_read_super (struct super_block * sb, void * data, - int silent) - { -@@ -1113,6 +1116,9 @@ - sbi->s_mount_state = 
le16_to_cpu(es->s_state); - sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); - sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb)); -+ for (i=0; i < 4; i++) -+ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); -+ sbi->s_def_hash_version = es->s_def_hash_version; - - if (sbi->s_blocks_per_group > blocksize * 8) { - printk (KERN_ERR -@@ -1821,6 +1827,7 @@ - exit_ext3_xattr(); - } - -+EXPORT_SYMBOL(ext3_force_commit); - EXPORT_SYMBOL(ext3_bread); - - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -Index: linux-2.4.19.SuSE/fs/ext3/file.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/file.c 2002-12-04 09:46:18.000000000 -0800 -+++ linux-2.4.19.SuSE/fs/ext3/file.c 2004-05-27 11:08:28.000000000 -0700 -@@ -38,6 +38,9 @@ - { - if (filp->f_mode & FMODE_WRITE) - ext3_discard_prealloc (inode); -+ if (is_dx(inode) && filp->private_data) -+ ext3_htree_free_dir_info(filp->private_data); -+ - return 0; - } - -Index: linux-2.4.19.SuSE/fs/ext3/hash.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/hash.c 1970-01-02 14:15:01.000000000 -0800 -+++ linux-2.4.19.SuSE/fs/ext3/hash.c 2004-05-27 11:08:28.000000000 -0700 -@@ -0,0 +1,215 @@ -+/* -+ * linux/fs/ext3/hash.c -+ * -+ * Copyright (C) 2002 by Theodore Ts'o -+ * -+ * This file is released under the GPL v2. -+ * -+ * This file may be redistributed under the terms of the GNU Public -+ * License. 
-+ */ -+ -+#include -+#include -+#include -+#include -+ -+#define DELTA 0x9E3779B9 -+ -+static void TEA_transform(__u32 buf[4], __u32 const in[]) -+{ -+ __u32 sum = 0; -+ __u32 b0 = buf[0], b1 = buf[1]; -+ __u32 a = in[0], b = in[1], c = in[2], d = in[3]; -+ int n = 16; -+ -+ do { -+ sum += DELTA; -+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); -+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); -+ } while(--n); -+ -+ buf[0] += b0; -+ buf[1] += b1; -+} -+ -+/* F, G and H are basic MD4 functions: selection, majority, parity */ -+#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) -+#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z))) -+#define H(x, y, z) ((x) ^ (y) ^ (z)) -+ -+/* -+ * The generic round function. The application is so specific that -+ * we don't bother protecting all the arguments with parens, as is generally -+ * good macro practice, in favor of extra legibility. -+ * Rotation is separate from addition to prevent recomputation -+ */ -+#define ROUND(f, a, b, c, d, x, s) \ -+ (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s))) -+#define K1 0 -+#define K2 013240474631UL -+#define K3 015666365641UL -+ -+/* -+ * Basic cut-down MD4 transform. Returns only 32 bits of result. 
-+ */ -+static void halfMD4Transform (__u32 buf[4], __u32 const in[]) -+{ -+ __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; -+ -+ /* Round 1 */ -+ ROUND(F, a, b, c, d, in[0] + K1, 3); -+ ROUND(F, d, a, b, c, in[1] + K1, 7); -+ ROUND(F, c, d, a, b, in[2] + K1, 11); -+ ROUND(F, b, c, d, a, in[3] + K1, 19); -+ ROUND(F, a, b, c, d, in[4] + K1, 3); -+ ROUND(F, d, a, b, c, in[5] + K1, 7); -+ ROUND(F, c, d, a, b, in[6] + K1, 11); -+ ROUND(F, b, c, d, a, in[7] + K1, 19); -+ -+ /* Round 2 */ -+ ROUND(G, a, b, c, d, in[1] + K2, 3); -+ ROUND(G, d, a, b, c, in[3] + K2, 5); -+ ROUND(G, c, d, a, b, in[5] + K2, 9); -+ ROUND(G, b, c, d, a, in[7] + K2, 13); -+ ROUND(G, a, b, c, d, in[0] + K2, 3); -+ ROUND(G, d, a, b, c, in[2] + K2, 5); -+ ROUND(G, c, d, a, b, in[4] + K2, 9); -+ ROUND(G, b, c, d, a, in[6] + K2, 13); -+ -+ /* Round 3 */ -+ ROUND(H, a, b, c, d, in[3] + K3, 3); -+ ROUND(H, d, a, b, c, in[7] + K3, 9); -+ ROUND(H, c, d, a, b, in[2] + K3, 11); -+ ROUND(H, b, c, d, a, in[6] + K3, 15); -+ ROUND(H, a, b, c, d, in[1] + K3, 3); -+ ROUND(H, d, a, b, c, in[5] + K3, 9); -+ ROUND(H, c, d, a, b, in[0] + K3, 11); -+ ROUND(H, b, c, d, a, in[4] + K3, 15); -+ -+ buf[0] += a; -+ buf[1] += b; -+ buf[2] += c; -+ buf[3] += d; -+} -+ -+#undef ROUND -+#undef F -+#undef G -+#undef H -+#undef K1 -+#undef K2 -+#undef K3 -+ -+/* The old legacy hash */ -+static __u32 dx_hack_hash (const char *name, int len) -+{ -+ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; -+ while (len--) { -+ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); -+ -+ if (hash & 0x80000000) hash -= 0x7fffffff; -+ hash1 = hash0; -+ hash0 = hash; -+ } -+ return (hash0 << 1); -+} -+ -+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) -+{ -+ __u32 pad, val; -+ int i; -+ -+ pad = (__u32)len | ((__u32)len << 8); -+ pad |= pad << 16; -+ -+ val = pad; -+ if (len > num*4) -+ len = num * 4; -+ for (i=0; i < len; i++) { -+ if ((i % 4) == 0) -+ val = pad; -+ val = msg[i] + (val << 8); -+ if ((i % 4) == 3) { -+ 
*buf++ = val; -+ val = pad; -+ num--; -+ } -+ } -+ if (--num >= 0) -+ *buf++ = val; -+ while (--num >= 0) -+ *buf++ = pad; -+} -+ -+/* -+ * Returns the hash of a filename. If len is 0 and name is NULL, then -+ * this function can be used to test whether or not a hash version is -+ * supported. -+ * -+ * The seed is an 4 longword (32 bits) "secret" which can be used to -+ * uniquify a hash. If the seed is all zero's, then some default seed -+ * may be used. -+ * -+ * A particular hash version specifies whether or not the seed is -+ * represented, and whether or not the returned hash is 32 bits or 64 -+ * bits. 32 bit hashes will return 0 for the minor hash. -+ */ -+int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) -+{ -+ __u32 hash; -+ __u32 minor_hash = 0; -+ const char *p; -+ int i; -+ __u32 in[8], buf[4]; -+ -+ /* Initialize the default seed for the hash checksum functions */ -+ buf[0] = 0x67452301; -+ buf[1] = 0xefcdab89; -+ buf[2] = 0x98badcfe; -+ buf[3] = 0x10325476; -+ -+ /* Check to see if the seed is all zero's */ -+ if (hinfo->seed) { -+ for (i=0; i < 4; i++) { -+ if (hinfo->seed[i]) -+ break; -+ } -+ if (i < 4) -+ memcpy(buf, hinfo->seed, sizeof(buf)); -+ } -+ -+ switch (hinfo->hash_version) { -+ case DX_HASH_LEGACY: -+ hash = dx_hack_hash(name, len); -+ break; -+ case DX_HASH_HALF_MD4: -+ p = name; -+ while (len > 0) { -+ str2hashbuf(p, len, in, 8); -+ halfMD4Transform(buf, in); -+ len -= 32; -+ p += 32; -+ } -+ minor_hash = buf[2]; -+ hash = buf[1]; -+ break; -+ case DX_HASH_TEA: -+ p = name; -+ while (len > 0) { -+ str2hashbuf(p, len, in, 4); -+ TEA_transform(buf, in); -+ len -= 16; -+ p += 16; -+ } -+ hash = buf[0]; -+ minor_hash = buf[1]; -+ break; -+ default: -+ hinfo->hash = 0; -+ return -1; -+ } -+ hinfo->hash = hash & ~1; -+ hinfo->minor_hash = minor_hash; -+ return 0; -+} -Index: linux-2.4.19.SuSE/lib/rbtree.c -=================================================================== ---- linux-2.4.19.SuSE.orig/lib/rbtree.c 
2002-08-02 17:39:46.000000000 -0700 -+++ linux-2.4.19.SuSE/lib/rbtree.c 2004-05-27 11:08:28.000000000 -0700 -@@ -17,6 +17,8 @@ - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - linux/lib/rbtree.c -+ -+ rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002 - */ - - #include -@@ -294,3 +296,43 @@ - __rb_erase_color(child, parent, root); - } - EXPORT_SYMBOL(rb_erase); -+ -+/* -+ * This function returns the first node (in sort order) of the tree. -+ */ -+rb_node_t *rb_get_first(rb_root_t *root) -+{ -+ rb_node_t *n; -+ -+ n = root->rb_node; -+ if (!n) -+ return 0; -+ while (n->rb_left) -+ n = n->rb_left; -+ return n; -+} -+EXPORT_SYMBOL(rb_get_first); -+ -+/* -+ * Given a node, this function will return the next node in the tree. -+ */ -+rb_node_t *rb_get_next(rb_node_t *n) -+{ -+ rb_node_t *parent; -+ -+ if (n->rb_right) { -+ n = n->rb_right; -+ while (n->rb_left) -+ n = n->rb_left; -+ return n; -+ } else { -+ while ((parent = n->rb_parent)) { -+ if (n == parent->rb_left) -+ return parent; -+ n = parent; -+ } -+ return 0; -+ } -+} -+EXPORT_SYMBOL(rb_get_next); -+ -Index: linux-2.4.19.SuSE/include/linux/ext3_fs.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs.h 2003-10-05 09:30:34.000000000 -0700 -+++ linux-2.4.19.SuSE/include/linux/ext3_fs.h 2004-05-27 11:08:28.000000000 -0700 -@@ -40,6 +40,11 @@ - #define EXT3FS_VERSION "2.4-0.9.18" - - /* -+ * Always enable hashed directories -+ */ -+#define CONFIG_EXT3_INDEX -+ -+/* - * Debug code - */ - #ifdef EXT3FS_DEBUG -@@ -414,8 +419,11 @@ - /*E0*/ __u32 s_journal_inum; /* inode number of journal file */ - __u32 s_journal_dev; /* device number of journal file */ - __u32 s_last_orphan; /* start of list of inodes to delete */ -- --/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */ -+ __u32 s_hash_seed[4]; /* HTREE hash seed */ -+ __u8 s_def_hash_version; /* Default hash version to use */ -+ __u8 
s_reserved_char_pad; -+ __u16 s_reserved_word_pad; -+ __u32 s_reserved[192]; /* Padding to the end of the block */ - }; - - #ifdef __KERNEL__ -@@ -552,9 +560,46 @@ - #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) - #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ - ~EXT3_DIR_ROUND) -+/* -+ * Hash Tree Directory indexing -+ * (c) Daniel Phillips, 2001 -+ */ -+ -+#ifdef CONFIG_EXT3_INDEX -+ #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ -+ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) -+#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) -+#else -+ #define is_dx(dir) 0 -+#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) -+#endif -+ -+/* Legal values for the dx_root hash_version field: */ -+ -+#define DX_HASH_LEGACY 0 -+#define DX_HASH_HALF_MD4 1 -+#define DX_HASH_TEA 2 -+ -+/* hash info structure used by the directory hash */ -+struct dx_hash_info -+{ -+ u32 hash; -+ u32 minor_hash; -+ int hash_version; -+ u32 *seed; -+}; - - #ifdef __KERNEL__ - /* -+ * Control parameters used by ext3_htree_next_block -+ */ -+#define HASH_NB_ALWAYS 1 -+ -+ -+/* - * Describe an inode's exact location on disk and in memory - */ - struct ext3_iloc -@@ -564,6 +609,27 @@ - unsigned long block_group; - }; - -+ -+/* -+ * This structure is stuffed into the struct file's private_data field -+ * for directories. It is where we put information so that we can do -+ * readdir operations in hash tree order. -+ */ -+struct dir_private_info { -+ rb_root_t root; -+ rb_node_t *curr_node; -+ struct fname *extra_fname; -+ loff_t last_pos; -+ __u32 curr_hash; -+ __u32 curr_minor_hash; -+ __u32 next_hash; -+}; -+ -+/* -+ * Special error return code only used by dx_probe() and its callers. 
-+ */ -+#define ERR_BAD_DX_DIR -75000 -+ - /* - * Function prototypes - */ -@@ -591,11 +657,20 @@ - - /* dir.c */ - extern int ext3_check_dir_entry(const char *, struct inode *, -- struct ext3_dir_entry_2 *, struct buffer_head *, -- unsigned long); -+ struct ext3_dir_entry_2 *, -+ struct buffer_head *, unsigned long); -+extern void ext3_htree_store_dirent(struct file *dir_file, __u32 hash, -+ __u32 minor_hash, -+ struct ext3_dir_entry_2 *dirent); -+extern void ext3_htree_free_dir_info(struct dir_private_info *p); -+ - /* fsync.c */ - extern int ext3_sync_file (struct file *, struct dentry *, int); - -+/* hash.c */ -+extern int ext3fs_dirhash(const char *name, int len, struct -+ dx_hash_info *hinfo); -+ - /* ialloc.c */ - extern struct inode * ext3_new_inode (handle_t *, struct inode *, int); - extern void ext3_free_inode (handle_t *, struct inode *); -@@ -628,6 +703,8 @@ - /* namei.c */ - extern int ext3_orphan_add(handle_t *, struct inode *); - extern int ext3_orphan_del(handle_t *, struct inode *); -+extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, -+ __u32 start_minor_hash, __u32 *next_hash); - - /* super.c */ - extern void ext3_error (struct super_block *, const char *, const char *, ...) 
-Index: linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs_sb.h 2003-10-05 09:16:36.000000000 -0700 -+++ linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h 2004-05-27 11:08:28.000000000 -0700 -@@ -62,6 +62,8 @@ - int s_inode_size; - int s_first_ino; - u32 s_next_generation; -+ u32 s_hash_seed[4]; -+ int s_def_hash_version; - - /* Journaling */ - struct inode * s_journal_inode; -Index: linux-2.4.19.SuSE/include/linux/ext3_jbd.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/ext3_jbd.h 2003-10-05 09:30:34.000000000 -0700 -+++ linux-2.4.19.SuSE/include/linux/ext3_jbd.h 2004-05-27 11:08:28.000000000 -0700 -@@ -69,6 +69,8 @@ - - #define EXT3_RESERVE_TRANS_BLOCKS 12 - -+#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 -+ - int - ext3_mark_iloc_dirty(handle_t *handle, - struct inode *inode, -Index: linux-2.4.19.SuSE/include/linux/rbtree.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/rbtree.h 2003-10-05 09:16:36.000000000 -0700 -+++ linux-2.4.19.SuSE/include/linux/rbtree.h 2004-05-27 11:08:28.000000000 -0700 -@@ -120,6 +120,8 @@ - - extern void rb_insert_color(rb_node_t *, rb_root_t *); - extern void rb_erase(rb_node_t *, rb_root_t *); -+extern rb_node_t *rb_get_first(rb_root_t *root); -+extern rb_node_t *rb_get_next(rb_node_t *n); - - static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link) - { diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.19-suse.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.19-suse.patch deleted file mode 100644 index 4bcefce..0000000 --- a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.19-suse.patch +++ /dev/null @@ -1,481 +0,0 @@ - fs/ext3/file.c | 4 - fs/ext3/inode.c | 116 ++++++++++++++++++++++ - fs/ext3/super.c | 230 
+++++++++++++++++++++++++++++++++++++++++++++ - include/linux/ext3_fs.h | 5 - include/linux/ext3_fs_sb.h | 10 + - 5 files changed, 365 insertions(+) - -Index: linux-2.4.19.SuSE/fs/ext3/super.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/super.c Sun Nov 16 01:18:04 2003 -+++ linux-2.4.19.SuSE/fs/ext3/super.c Sun Nov 16 01:19:22 2003 -@@ -401,6 +401,220 @@ - } - } - -+#ifdef EXT3_DELETE_THREAD -+/* -+ * Delete inodes in a loop until there are no more to be deleted. -+ * Normally, we run in the background doing the deletes and sleeping again, -+ * and clients just add new inodes to be deleted onto the end of the list. -+ * If someone is concerned about free space (e.g. block allocation or similar) -+ * then they can sleep on s_delete_waiter_queue and be woken up when space -+ * has been freed. -+ */ -+int ext3_delete_thread(void *data) -+{ -+ struct super_block *sb = data; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct task_struct *tsk = current; -+ -+ /* Almost like daemonize, but not quite */ -+ exit_mm(current); -+ tsk->session = 1; -+ tsk->pgrp = 1; -+ tsk->tty = NULL; -+ exit_files(current); -+ reparent_to_init(); -+ -+ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev)); -+ sigfillset(&tsk->blocked); -+ -+ /*tsk->flags |= PF_KERNTHREAD;*/ -+ -+ INIT_LIST_HEAD(&sbi->s_delete_list); -+ wake_up(&sbi->s_delete_waiter_queue); -+ ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev)); -+ -+ /* main loop */ -+ for (;;) { -+ wait_event_interruptible(sbi->s_delete_thread_queue, -+ !list_empty(&sbi->s_delete_list) || -+ !test_opt(sb, ASYNCDEL)); -+ ext3_debug("%s woken up: %lu inodes, %lu blocks\n", -+ tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks); -+ -+ spin_lock(&sbi->s_delete_lock); -+ if (list_empty(&sbi->s_delete_list)) { -+ clear_opt(sbi->s_mount_opt, ASYNCDEL); -+ memset(&sbi->s_delete_list, 0, -+ sizeof(sbi->s_delete_list)); -+ spin_unlock(&sbi->s_delete_lock); -+ 
ext3_debug("delete thread on %s exiting\n", -+ kdevname(sb->s_dev)); -+ wake_up(&sbi->s_delete_waiter_queue); -+ break; -+ } -+ -+ while (!list_empty(&sbi->s_delete_list)) { -+ struct inode *inode=list_entry(sbi->s_delete_list.next, -+ struct inode, i_dentry); -+ unsigned long blocks = inode->i_blocks >> -+ (inode->i_blkbits - 9); -+ -+ list_del_init(&inode->i_dentry); -+ spin_unlock(&sbi->s_delete_lock); -+ ext3_debug("%s delete ino %lu blk %lu\n", -+ tsk->comm, inode->i_ino, blocks); -+ -+ iput(inode); -+ -+ spin_lock(&sbi->s_delete_lock); -+ sbi->s_delete_blocks -= blocks; -+ sbi->s_delete_inodes--; -+ } -+ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) { -+ ext3_warning(sb, __FUNCTION__, -+ "%lu blocks, %lu inodes on list?\n", -+ sbi->s_delete_blocks,sbi->s_delete_inodes); -+ sbi->s_delete_blocks = 0; -+ sbi->s_delete_inodes = 0; -+ } -+ spin_unlock(&sbi->s_delete_lock); -+ wake_up(&sbi->s_delete_waiter_queue); -+ } -+ -+ return 0; -+} -+ -+static void ext3_start_delete_thread(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int rc; -+ -+ spin_lock_init(&sbi->s_delete_lock); -+ init_waitqueue_head(&sbi->s_delete_thread_queue); -+ init_waitqueue_head(&sbi->s_delete_waiter_queue); -+ -+ if (!test_opt(sb, ASYNCDEL)) -+ return; -+ -+ rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES); -+ if (rc < 0) -+ printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n", -+ rc); -+ else -+ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next); -+} -+ -+static void ext3_stop_delete_thread(struct ext3_sb_info *sbi) -+{ -+ if (sbi->s_delete_list.next == 0) /* thread never started */ -+ return; -+ -+ clear_opt(sbi->s_mount_opt, ASYNCDEL); -+ wake_up(&sbi->s_delete_thread_queue); -+ wait_event(sbi->s_delete_waiter_queue, list_empty(&sbi->s_delete_list)); -+} -+ -+/* Instead of playing games with the inode flags, destruction, etc we just -+ * create a new inode locally and put it on a list for the truncate thread. 
-+ * We need large parts of the inode struct in order to complete the -+ * truncate and unlink, so we may as well just have a real inode to do it. -+ * -+ * If we have any problem deferring the delete, just delete it right away. -+ * If we defer it, we also mark how many blocks it would free, so that we -+ * can keep the statfs data correct, and we know if we should sleep on the -+ * delete thread when we run out of space. -+ */ -+static void ext3_delete_inode_thread(struct inode *old_inode) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); -+ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); -+ struct inode *new_inode; -+ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); -+ -+ if (is_bad_inode(old_inode)) { -+ clear_inode(old_inode); -+ return; -+ } -+ -+ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) -+ goto out_delete; -+ -+ /* We may want to delete the inode immediately and not defer it */ -+ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS) -+ goto out_delete; -+ -+ /* We can't use the delete thread as-is during real orphan recovery, -+ * as we add to the orphan list here, causing ext3_orphan_cleanup() -+ * to loop endlessly. It would be nice to do so, but needs work. -+ */ -+ if (oei->i_state & EXT3_STATE_DELETE || -+ sbi->s_mount_state & EXT3_ORPHAN_FS) { -+ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", -+ old_inode->i_ino, blocks); -+ goto out_delete; -+ } -+ -+ /* We can iget this inode again here, because our caller has unhashed -+ * old_inode, so new_inode will be in a different inode struct. -+ * -+ * We need to ensure that the i_orphan pointers in the other inodes -+ * point at the new inode copy instead of the old one so the orphan -+ * list doesn't get corrupted when the old orphan inode is freed. 
-+ */ -+ down(&sbi->s_orphan_lock); -+ -+ sbi->s_mount_state |= EXT3_ORPHAN_FS; -+ new_inode = iget(old_inode->i_sb, old_inode->i_ino); -+ sbi->s_mount_state &= ~EXT3_ORPHAN_FS; -+ if (is_bad_inode(new_inode)) { -+ printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino); -+ iput(new_inode); -+ new_inode = NULL; -+ } -+ if (!new_inode) { -+ up(&sbi->s_orphan_lock); -+ ext3_debug("delete inode %lu directly (bad read)\n", -+ old_inode->i_ino); -+ goto out_delete; -+ } -+ J_ASSERT(new_inode != old_inode); -+ -+ J_ASSERT(!list_empty(&oei->i_orphan)); -+ -+ nei = EXT3_I(new_inode); -+ /* Ugh. We need to insert new_inode into the same spot on the list -+ * as old_inode was, to ensure the in-memory orphan list is still -+ * in the same order as the on-disk orphan list (badness otherwise). -+ */ -+ nei->i_orphan = oei->i_orphan; -+ nei->i_orphan.next->prev = &nei->i_orphan; -+ nei->i_orphan.prev->next = &nei->i_orphan; -+ nei->i_state |= EXT3_STATE_DELETE; -+ up(&sbi->s_orphan_lock); -+ -+ clear_inode(old_inode); -+ -+ spin_lock(&sbi->s_delete_lock); -+ J_ASSERT(list_empty(&new_inode->i_dentry)); -+ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); -+ sbi->s_delete_blocks += blocks; -+ sbi->s_delete_inodes++; -+ spin_unlock(&sbi->s_delete_lock); -+ -+ ext3_debug("delete inode %lu (%lu blocks) by thread\n", -+ new_inode->i_ino, blocks); -+ -+ wake_up(&sbi->s_delete_thread_queue); -+ return; -+ -+out_delete: -+ ext3_delete_inode(old_inode); -+} -+#else -+#define ext3_start_delete_thread(sbi) do {} while(0) -+#define ext3_stop_delete_thread(sbi) do {} while(0) -+#endif /* EXT3_DELETE_THREAD */ -+ - void ext3_put_super (struct super_block * sb) - { - struct ext3_sb_info *sbi = EXT3_SB(sb); -@@ -408,6 +622,7 @@ - kdev_t j_dev = sbi->s_journal->j_dev; - int i; - -+ ext3_stop_delete_thread(sbi); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); - if (!(sb->s_flags & MS_RDONLY)) { -@@ -476,7 +691,11 @@ - write_inode: ext3_write_inode, /* BKL not held. 
Don't need */ - dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ - put_inode: ext3_put_inode, /* BKL not held. Don't need */ -+#ifdef EXT3_DELETE_THREAD -+ delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */ -+#else - delete_inode: ext3_delete_inode, /* BKL not held. We take it */ -+#endif - put_super: ext3_put_super, /* BKL held */ - write_super: ext3_write_super, /* BKL held */ - sync_fs: ext3_sync_fs, -@@ -553,6 +772,13 @@ - clear_opt (*mount_options, POSIX_ACL); - else - #endif -+#ifdef EXT3_DELETE_THREAD -+ if (!strcmp(this_char, "asyncdel")) -+ set_opt(*mount_options, ASYNCDEL); -+ else if (!strcmp(this_char, "noasyncdel")) -+ clear_opt(*mount_options, ASYNCDEL); -+ else -+#endif - if (!strcmp (this_char, "bsddf")) - clear_opt (*mount_options, MINIX_DF); - else if (!strcmp (this_char, "nouid32")) { -@@ -1254,6 +1480,7 @@ - } - - ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); -+ ext3_start_delete_thread(sb); - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock -@@ -1692,6 +1919,9 @@ - if (!parse_options(data, &tmp, sbi, &tmp, 1)) - return -EINVAL; - -+ if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY)) -+ ext3_stop_delete_thread(sbi); -+ - if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) - ext3_abort(sb, __FUNCTION__, "Abort forced by user"); - -Index: linux-2.4.19.SuSE/fs/ext3/inode.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/inode.c Sun Nov 16 01:02:56 2003 -+++ linux-2.4.19.SuSE/fs/ext3/inode.c Sun Nov 16 01:19:22 2003 -@@ -2114,6 +2114,118 @@ - ext3_journal_stop(handle, inode); - } - -+#ifdef EXT3_DELETE_THREAD -+/* Move blocks from to-be-truncated inode over to a new inode, and delete -+ * that one from the delete thread instead. This avoids a lot of latency -+ * when truncating large files. 
-+ * -+ * If we have any problem deferring the truncate, just truncate it right away. -+ * If we defer it, we also mark how many blocks it would free, so that we -+ * can keep the statfs data correct, and we know if we should sleep on the -+ * delete thread when we run out of space. -+ */ -+void ext3_truncate_thread(struct inode *old_inode) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); -+ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); -+ struct inode *new_inode; -+ handle_t *handle; -+ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); -+ -+ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) -+ goto out_truncate; -+ -+ /* XXX This is a temporary limitation for code simplicity. -+ * We could truncate to arbitrary sizes at some later time. -+ */ -+ if (old_inode->i_size != 0) -+ goto out_truncate; -+ -+ /* We may want to truncate the inode immediately and not defer it */ -+ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || -+ old_inode->i_size > oei->i_disksize) -+ goto out_truncate; -+ -+ /* We can't use the delete thread as-is during real orphan recovery, -+ * as we add to the orphan list here, causing ext3_orphan_cleanup() -+ * to loop endlessly. It would be nice to do so, but needs work. 
-+ */ -+ if (oei->i_state & EXT3_STATE_DELETE || -+ sbi->s_mount_state & EXT3_ORPHAN_FS) { -+ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", -+ old_inode->i_ino, blocks); -+ goto out_truncate; -+ } -+ -+ ext3_discard_prealloc(old_inode); -+ -+ /* old_inode = 1 -+ * new_inode = sb + GDT + ibitmap -+ * orphan list = 1 inode/superblock for add, 2 inodes for del -+ * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS -+ */ -+ handle = ext3_journal_start(old_inode, 7); -+ if (IS_ERR(handle)) -+ goto out_truncate; -+ -+ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode); -+ if (IS_ERR(new_inode)) { -+ ext3_debug("truncate inode %lu directly (no new inodes)\n", -+ old_inode->i_ino); -+ goto out_journal; -+ } -+ -+ nei = EXT3_I(new_inode); -+ -+ down_write(&oei->truncate_sem); -+ new_inode->i_size = old_inode->i_size; -+ new_inode->i_blocks = old_inode->i_blocks; -+ new_inode->i_uid = old_inode->i_uid; -+ new_inode->i_gid = old_inode->i_gid; -+ new_inode->i_nlink = 0; -+ -+ /* FIXME when we do arbitrary truncates */ -+ old_inode->i_blocks = oei->i_file_acl ? 
old_inode->i_blksize / 512 : 0; -+ old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME; -+ -+ memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); -+ memset(oei->i_data, 0, sizeof(oei->i_data)); -+ -+ nei->i_disksize = oei->i_disksize; -+ nei->i_state |= EXT3_STATE_DELETE; -+ up_write(&oei->truncate_sem); -+ -+ if (ext3_orphan_add(handle, new_inode) < 0) -+ goto out_journal; -+ -+ if (ext3_orphan_del(handle, old_inode) < 0) { -+ ext3_orphan_del(handle, new_inode); -+ iput(new_inode); -+ goto out_journal; -+ } -+ -+ ext3_journal_stop(handle, old_inode); -+ -+ spin_lock(&sbi->s_delete_lock); -+ J_ASSERT(list_empty(&new_inode->i_dentry)); -+ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); -+ sbi->s_delete_blocks += blocks; -+ sbi->s_delete_inodes++; -+ spin_unlock(&sbi->s_delete_lock); -+ -+ ext3_debug("delete inode %lu (%lu blocks) by thread\n", -+ new_inode->i_ino, blocks); -+ -+ wake_up(&sbi->s_delete_thread_queue); -+ return; -+ -+out_journal: -+ ext3_journal_stop(handle, old_inode); -+out_truncate: -+ ext3_truncate(old_inode); -+} -+#endif /* EXT3_DELETE_THREAD */ -+ - /* - * ext3_get_inode_loc returns with an extra refcount against the - * inode's underlying buffer_head on success. 
-Index: linux-2.4.19.SuSE/fs/ext3/file.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/file.c Sun Nov 16 00:40:59 2003 -+++ linux-2.4.19.SuSE/fs/ext3/file.c Sun Nov 16 01:19:22 2003 -@@ -132,7 +132,11 @@ - }; - - struct inode_operations ext3_file_inode_operations = { -+#ifdef EXT3_DELETE_THREAD -+ truncate: ext3_truncate_thread, /* BKL held */ -+#else - truncate: ext3_truncate, /* BKL held */ -+#endif - setattr: ext3_setattr, /* BKL held */ - setxattr: ext3_setxattr, /* BKL held */ - getxattr: ext3_getxattr, /* BKL held */ -Index: linux-2.4.19.SuSE/include/linux/ext3_fs.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs.h Sun Nov 16 01:02:51 2003 -+++ linux-2.4.19.SuSE/include/linux/ext3_fs.h Sun Nov 16 01:20:06 2003 -@@ -193,6 +193,7 @@ - */ - #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ - #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ -+#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */ - - /* - * ioctl commands -@@ -321,6 +322,7 @@ - #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ - #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ - #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ -+#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H -@@ -695,6 +697,9 @@ - extern void ext3_dirty_inode(struct inode *); - extern int ext3_change_inode_journal_flag(struct inode *, int); - extern void ext3_truncate (struct inode *); -+#ifdef EXT3_DELETE_THREAD -+extern void ext3_truncate_thread(struct inode *inode); -+#endif - - /* ioctl.c */ - extern int ext3_ioctl (struct inode *, struct file *, unsigned int, -Index: linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h -=================================================================== ---- 
linux-2.4.19.SuSE.orig/include/linux/ext3_fs_sb.h Sun Nov 16 01:18:41 2003 -+++ linux-2.4.19.SuSE/include/linux/ext3_fs_sb.h Sun Nov 16 01:19:22 2003 -@@ -29,6 +29,8 @@ - - #define EXT3_MAX_GROUP_LOADED 8 - -+#define EXT3_DELETE_THREAD -+ - /* - * third extended-fs super-block data in memory - */ -@@ -75,6 +77,14 @@ - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ - #endif -+#ifdef EXT3_DELETE_THREAD -+ spinlock_t s_delete_lock; -+ struct list_head s_delete_list; -+ unsigned long s_delete_blocks; -+ unsigned long s_delete_inodes; -+ wait_queue_head_t s_delete_thread_queue; -+ wait_queue_head_t s_delete_waiter_queue; -+#endif - }; - - #endif /* _LINUX_EXT3_FS_SB */ diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch deleted file mode 100644 index ca05893..0000000 --- a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch +++ /dev/null @@ -1,541 +0,0 @@ - fs/ext3/file.c | 4 - fs/ext3/inode.c | 116 ++++++++++++++++++++++ - fs/ext3/super.c | 230 +++++++++++++++++++++++++++++++++++++++++++++ - include/linux/ext3_fs.h | 5 - include/linux/ext3_fs_sb.h | 10 + - 5 files changed, 365 insertions(+) - -Index: linux-2.4.20/fs/ext3/super.c -=================================================================== ---- linux-2.4.20.orig/fs/ext3/super.c 2004-01-12 20:13:37.000000000 +0300 -+++ linux-2.4.20/fs/ext3/super.c 2004-01-13 16:59:54.000000000 +0300 -@@ -48,6 +48,8 @@ - static void ext3_clear_journal_err(struct super_block * sb, - struct ext3_super_block * es); - -+static int ext3_sync_fs(struct super_block * sb); -+ - #ifdef CONFIG_JBD_DEBUG - int journal_no_write[2]; - -@@ -398,6 +400,221 @@ - } - } - -+#ifdef EXT3_DELETE_THREAD -+/* -+ * Delete inodes in a loop until there are no more to be deleted. 
-+ * Normally, we run in the background doing the deletes and sleeping again, -+ * and clients just add new inodes to be deleted onto the end of the list. -+ * If someone is concerned about free space (e.g. block allocation or similar) -+ * then they can sleep on s_delete_waiter_queue and be woken up when space -+ * has been freed. -+ */ -+int ext3_delete_thread(void *data) -+{ -+ struct super_block *sb = data; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct task_struct *tsk = current; -+ -+ /* Almost like daemonize, but not quite */ -+ exit_mm(current); -+ tsk->session = 1; -+ tsk->pgrp = 1; -+ tsk->tty = NULL; -+ exit_files(current); -+ reparent_to_init(); -+ -+ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev)); -+ sigfillset(&tsk->blocked); -+ -+ /*tsk->flags |= PF_KERNTHREAD;*/ -+ -+ INIT_LIST_HEAD(&sbi->s_delete_list); -+ wake_up(&sbi->s_delete_waiter_queue); -+ ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev)); -+ -+ /* main loop */ -+ for (;;) { -+ wait_event_interruptible(sbi->s_delete_thread_queue, -+ !list_empty(&sbi->s_delete_list) || -+ !test_opt(sb, ASYNCDEL)); -+ ext3_debug("%s woken up: %lu inodes, %lu blocks\n", -+ tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks); -+ -+ spin_lock(&sbi->s_delete_lock); -+ if (list_empty(&sbi->s_delete_list)) { -+ clear_opt(sbi->s_mount_opt, ASYNCDEL); -+ memset(&sbi->s_delete_list, 0, -+ sizeof(sbi->s_delete_list)); -+ spin_unlock(&sbi->s_delete_lock); -+ ext3_debug("delete thread on %s exiting\n", -+ kdevname(sb->s_dev)); -+ wake_up(&sbi->s_delete_waiter_queue); -+ break; -+ } -+ -+ while (!list_empty(&sbi->s_delete_list)) { -+ struct inode *inode=list_entry(sbi->s_delete_list.next, -+ struct inode, i_dentry); -+ unsigned long blocks = inode->i_blocks >> -+ (inode->i_blkbits - 9); -+ -+ list_del_init(&inode->i_dentry); -+ spin_unlock(&sbi->s_delete_lock); -+ ext3_debug("%s delete ino %lu blk %lu\n", -+ tsk->comm, inode->i_ino, blocks); -+ -+ iput(inode); -+ -+ 
spin_lock(&sbi->s_delete_lock); -+ sbi->s_delete_blocks -= blocks; -+ sbi->s_delete_inodes--; -+ } -+ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) { -+ ext3_warning(sb, __FUNCTION__, -+ "%lu blocks, %lu inodes on list?\n", -+ sbi->s_delete_blocks,sbi->s_delete_inodes); -+ sbi->s_delete_blocks = 0; -+ sbi->s_delete_inodes = 0; -+ } -+ spin_unlock(&sbi->s_delete_lock); -+ wake_up(&sbi->s_delete_waiter_queue); -+ } -+ -+ return 0; -+} -+ -+static void ext3_start_delete_thread(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int rc; -+ -+ spin_lock_init(&sbi->s_delete_lock); -+ init_waitqueue_head(&sbi->s_delete_thread_queue); -+ init_waitqueue_head(&sbi->s_delete_waiter_queue); -+ -+ if (!test_opt(sb, ASYNCDEL)) -+ return; -+ -+ rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES); -+ if (rc < 0) -+ printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n", -+ rc); -+ else -+ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next); -+} -+ -+static void ext3_stop_delete_thread(struct ext3_sb_info *sbi) -+{ -+ if (sbi->s_delete_list.next == 0) /* thread never started */ -+ return; -+ -+ clear_opt(sbi->s_mount_opt, ASYNCDEL); -+ wake_up(&sbi->s_delete_thread_queue); -+ wait_event(sbi->s_delete_waiter_queue, -+ sbi->s_delete_list.next == 0 && sbi->s_delete_inodes == 0); -+} -+ -+/* Instead of playing games with the inode flags, destruction, etc we just -+ * create a new inode locally and put it on a list for the truncate thread. -+ * We need large parts of the inode struct in order to complete the -+ * truncate and unlink, so we may as well just have a real inode to do it. -+ * -+ * If we have any problem deferring the delete, just delete it right away. -+ * If we defer it, we also mark how many blocks it would free, so that we -+ * can keep the statfs data correct, and we know if we should sleep on the -+ * delete thread when we run out of space. 
-+ */ -+static void ext3_delete_inode_thread(struct inode *old_inode) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); -+ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); -+ struct inode *new_inode; -+ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); -+ -+ if (is_bad_inode(old_inode)) { -+ clear_inode(old_inode); -+ return; -+ } -+ -+ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) -+ goto out_delete; -+ -+ /* We may want to delete the inode immediately and not defer it */ -+ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS) -+ goto out_delete; -+ -+ /* We can't use the delete thread as-is during real orphan recovery, -+ * as we add to the orphan list here, causing ext3_orphan_cleanup() -+ * to loop endlessly. It would be nice to do so, but needs work. -+ */ -+ if (oei->i_state & EXT3_STATE_DELETE || -+ sbi->s_mount_state & EXT3_ORPHAN_FS) { -+ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", -+ old_inode->i_ino, blocks); -+ goto out_delete; -+ } -+ -+ /* We can iget this inode again here, because our caller has unhashed -+ * old_inode, so new_inode will be in a different inode struct. -+ * -+ * We need to ensure that the i_orphan pointers in the other inodes -+ * point at the new inode copy instead of the old one so the orphan -+ * list doesn't get corrupted when the old orphan inode is freed. -+ */ -+ down(&sbi->s_orphan_lock); -+ -+ sbi->s_mount_state |= EXT3_ORPHAN_FS; -+ new_inode = iget(old_inode->i_sb, old_inode->i_ino); -+ sbi->s_mount_state &= ~EXT3_ORPHAN_FS; -+ if (is_bad_inode(new_inode)) { -+ printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino); -+ iput(new_inode); -+ new_inode = NULL; -+ } -+ if (!new_inode) { -+ up(&sbi->s_orphan_lock); -+ ext3_debug("delete inode %lu directly (bad read)\n", -+ old_inode->i_ino); -+ goto out_delete; -+ } -+ J_ASSERT(new_inode != old_inode); -+ -+ J_ASSERT(!list_empty(&oei->i_orphan)); -+ -+ nei = EXT3_I(new_inode); -+ /* Ugh. 
We need to insert new_inode into the same spot on the list -+ * as old_inode was, to ensure the in-memory orphan list is still -+ * in the same order as the on-disk orphan list (badness otherwise). -+ */ -+ nei->i_orphan = oei->i_orphan; -+ nei->i_orphan.next->prev = &nei->i_orphan; -+ nei->i_orphan.prev->next = &nei->i_orphan; -+ nei->i_state |= EXT3_STATE_DELETE; -+ up(&sbi->s_orphan_lock); -+ -+ clear_inode(old_inode); -+ -+ spin_lock(&sbi->s_delete_lock); -+ J_ASSERT(list_empty(&new_inode->i_dentry)); -+ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); -+ sbi->s_delete_blocks += blocks; -+ sbi->s_delete_inodes++; -+ spin_unlock(&sbi->s_delete_lock); -+ -+ ext3_debug("delete inode %lu (%lu blocks) by thread\n", -+ new_inode->i_ino, blocks); -+ -+ wake_up(&sbi->s_delete_thread_queue); -+ return; -+ -+out_delete: -+ ext3_delete_inode(old_inode); -+} -+#else -+#define ext3_start_delete_thread(sbi) do {} while(0) -+#define ext3_stop_delete_thread(sbi) do {} while(0) -+#endif /* EXT3_DELETE_THREAD */ -+ - void ext3_put_super (struct super_block * sb) - { - struct ext3_sb_info *sbi = EXT3_SB(sb); -@@ -405,6 +622,7 @@ - kdev_t j_dev = sbi->s_journal->j_dev; - int i; - -+ J_ASSERT(sbi->s_delete_inodes == 0); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); - if (!(sb->s_flags & MS_RDONLY)) { -@@ -453,9 +671,14 @@ - write_inode: ext3_write_inode, /* BKL not held. Don't need */ - dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ - put_inode: ext3_put_inode, /* BKL not held. Don't need */ -+#ifdef EXT3_DELETE_THREAD -+ delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */ -+#else - delete_inode: ext3_delete_inode, /* BKL not held. We take it */ -+#endif - put_super: ext3_put_super, /* BKL held */ - write_super: ext3_write_super, /* BKL held */ -+ sync_fs: ext3_sync_fs, - write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */ - unlockfs: ext3_unlockfs, /* BKL not held. 
We take it */ - statfs: ext3_statfs, /* BKL held */ -@@ -521,6 +744,13 @@ - clear_opt (*mount_options, XATTR_USER); - else - #endif -+#ifdef EXT3_DELETE_THREAD -+ if (!strcmp(this_char, "asyncdel")) -+ set_opt(*mount_options, ASYNCDEL); -+ else if (!strcmp(this_char, "noasyncdel")) -+ clear_opt(*mount_options, ASYNCDEL); -+ else -+#endif - if (!strcmp (this_char, "bsddf")) - clear_opt (*mount_options, MINIX_DF); - else if (!strcmp (this_char, "nouid32")) { -@@ -1220,6 +1450,7 @@ - } - - ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); -+ ext3_start_delete_thread(sb); - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock -@@ -1625,6 +1856,21 @@ - } - } - -+static int ext3_sync_fs(struct super_block *sb) -+{ -+ tid_t target; -+ -+ if (atomic_read(&sb->s_active) == 0) { -+ /* fs is being umounted: time to stop delete thread */ -+ ext3_stop_delete_thread(EXT3_SB(sb)); -+ } -+ -+ sb->s_dirt = 0; -+ target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); -+ log_wait_commit(EXT3_SB(sb)->s_journal, target); -+ return 0; -+} -+ - /* - * LVM calls this function before a (read-only) snapshot is created. This - * gives us a chance to flush the journal completely and mark the fs clean. 
-@@ -1682,6 +1928,9 @@ - if (!parse_options(data, &tmp, sbi, &tmp, 1)) - return -EINVAL; - -+ if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY)) -+ ext3_stop_delete_thread(sbi); -+ - if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) - ext3_abort(sb, __FUNCTION__, "Abort forced by user"); - -Index: linux-2.4.20/fs/ext3/inode.c -=================================================================== ---- linux-2.4.20.orig/fs/ext3/inode.c 2004-01-12 20:13:37.000000000 +0300 -+++ linux-2.4.20/fs/ext3/inode.c 2004-01-13 16:55:45.000000000 +0300 -@@ -2552,6 +2552,118 @@ - return err; - } - -+#ifdef EXT3_DELETE_THREAD -+/* Move blocks from to-be-truncated inode over to a new inode, and delete -+ * that one from the delete thread instead. This avoids a lot of latency -+ * when truncating large files. -+ * -+ * If we have any problem deferring the truncate, just truncate it right away. -+ * If we defer it, we also mark how many blocks it would free, so that we -+ * can keep the statfs data correct, and we know if we should sleep on the -+ * delete thread when we run out of space. -+ */ -+void ext3_truncate_thread(struct inode *old_inode) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); -+ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); -+ struct inode *new_inode; -+ handle_t *handle; -+ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); -+ -+ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) -+ goto out_truncate; -+ -+ /* XXX This is a temporary limitation for code simplicity. -+ * We could truncate to arbitrary sizes at some later time. 
-+ */ -+ if (old_inode->i_size != 0) -+ goto out_truncate; -+ -+ /* We may want to truncate the inode immediately and not defer it */ -+ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || -+ old_inode->i_size > oei->i_disksize) -+ goto out_truncate; -+ -+ /* We can't use the delete thread as-is during real orphan recovery, -+ * as we add to the orphan list here, causing ext3_orphan_cleanup() -+ * to loop endlessly. It would be nice to do so, but needs work. -+ */ -+ if (oei->i_state & EXT3_STATE_DELETE || -+ sbi->s_mount_state & EXT3_ORPHAN_FS) { -+ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", -+ old_inode->i_ino, blocks); -+ goto out_truncate; -+ } -+ -+ ext3_discard_prealloc(old_inode); -+ -+ /* old_inode = 1 -+ * new_inode = sb + GDT + ibitmap -+ * orphan list = 1 inode/superblock for add, 2 inodes for del -+ * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS -+ */ -+ handle = ext3_journal_start(old_inode, 7); -+ if (IS_ERR(handle)) -+ goto out_truncate; -+ -+ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode); -+ if (IS_ERR(new_inode)) { -+ ext3_debug("truncate inode %lu directly (no new inodes)\n", -+ old_inode->i_ino); -+ goto out_journal; -+ } -+ -+ nei = EXT3_I(new_inode); -+ -+ down_write(&oei->truncate_sem); -+ new_inode->i_size = old_inode->i_size; -+ new_inode->i_blocks = old_inode->i_blocks; -+ new_inode->i_uid = old_inode->i_uid; -+ new_inode->i_gid = old_inode->i_gid; -+ new_inode->i_nlink = 0; -+ -+ /* FIXME when we do arbitrary truncates */ -+ old_inode->i_blocks = oei->i_file_acl ? 
old_inode->i_blksize / 512 : 0; -+ old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME; -+ -+ memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); -+ memset(oei->i_data, 0, sizeof(oei->i_data)); -+ -+ nei->i_disksize = oei->i_disksize; -+ nei->i_state |= EXT3_STATE_DELETE; -+ up_write(&oei->truncate_sem); -+ -+ if (ext3_orphan_add(handle, new_inode) < 0) -+ goto out_journal; -+ -+ if (ext3_orphan_del(handle, old_inode) < 0) { -+ ext3_orphan_del(handle, new_inode); -+ iput(new_inode); -+ goto out_journal; -+ } -+ -+ ext3_journal_stop(handle, old_inode); -+ -+ spin_lock(&sbi->s_delete_lock); -+ J_ASSERT(list_empty(&new_inode->i_dentry)); -+ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); -+ sbi->s_delete_blocks += blocks; -+ sbi->s_delete_inodes++; -+ spin_unlock(&sbi->s_delete_lock); -+ -+ ext3_debug("delete inode %lu (%lu blocks) by thread\n", -+ new_inode->i_ino, blocks); -+ -+ wake_up(&sbi->s_delete_thread_queue); -+ return; -+ -+out_journal: -+ ext3_journal_stop(handle, old_inode); -+out_truncate: -+ ext3_truncate(old_inode); -+} -+#endif /* EXT3_DELETE_THREAD */ -+ - /* - * On success, We end up with an outstanding reference count against - * iloc->bh. This _must_ be cleaned up later. 
-Index: linux-2.4.20/fs/ext3/file.c -=================================================================== ---- linux-2.4.20.orig/fs/ext3/file.c 2004-01-12 20:13:36.000000000 +0300 -+++ linux-2.4.20/fs/ext3/file.c 2004-01-13 16:55:45.000000000 +0300 -@@ -125,7 +125,11 @@ - }; - - struct inode_operations ext3_file_inode_operations = { -+#ifdef EXT3_DELETE_THREAD -+ truncate: ext3_truncate_thread, /* BKL held */ -+#else - truncate: ext3_truncate, /* BKL held */ -+#endif - setattr: ext3_setattr, /* BKL held */ - setxattr: ext3_setxattr, /* BKL held */ - getxattr: ext3_getxattr, /* BKL held */ -Index: linux-2.4.20/fs/buffer.c -=================================================================== ---- linux-2.4.20.orig/fs/buffer.c 2003-05-16 05:29:12.000000000 +0400 -+++ linux-2.4.20/fs/buffer.c 2004-01-13 16:55:45.000000000 +0300 -@@ -328,6 +328,8 @@ - if (sb->s_dirt && sb->s_op && sb->s_op->write_super) - sb->s_op->write_super(sb); - unlock_super(sb); -+ if (sb->s_op && sb->s_op->sync_fs) -+ sb->s_op->sync_fs(sb); - unlock_kernel(); - - return sync_buffers(dev, 1); -Index: linux-2.4.20/include/linux/ext3_fs.h -=================================================================== ---- linux-2.4.20.orig/include/linux/ext3_fs.h 2004-01-12 20:13:37.000000000 +0300 -+++ linux-2.4.20/include/linux/ext3_fs.h 2004-01-13 16:55:45.000000000 +0300 -@@ -193,6 +193,7 @@ - */ - #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ - #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ -+#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */ - - /* - * ioctl commands -@@ -320,6 +321,7 @@ - #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ - #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ - #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ -+#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H 
-@@ -696,6 +698,9 @@ - extern void ext3_dirty_inode(struct inode *); - extern int ext3_change_inode_journal_flag(struct inode *, int); - extern void ext3_truncate (struct inode *); -+#ifdef EXT3_DELETE_THREAD -+extern void ext3_truncate_thread(struct inode *inode); -+#endif - - /* ioctl.c */ - extern int ext3_ioctl (struct inode *, struct file *, unsigned int, -Index: linux-2.4.20/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.4.20.orig/include/linux/ext3_fs_sb.h 2004-01-12 20:13:37.000000000 +0300 -+++ linux-2.4.20/include/linux/ext3_fs_sb.h 2004-01-13 16:55:45.000000000 +0300 -@@ -29,6 +29,8 @@ - - #define EXT3_MAX_GROUP_LOADED 8 - -+#define EXT3_DELETE_THREAD -+ - /* - * third extended-fs super-block data in memory - */ -@@ -76,6 +78,14 @@ - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ - #endif -+#ifdef EXT3_DELETE_THREAD -+ spinlock_t s_delete_lock; -+ struct list_head s_delete_list; -+ unsigned long s_delete_blocks; -+ unsigned long s_delete_inodes; -+ wait_queue_head_t s_delete_thread_queue; -+ wait_queue_head_t s_delete_waiter_queue; -+#endif - }; - - #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.4.20/include/linux/fs.h -=================================================================== ---- linux-2.4.20.orig/include/linux/fs.h 2004-01-12 20:13:36.000000000 +0300 -+++ linux-2.4.20/include/linux/fs.h 2004-01-13 16:55:45.000000000 +0300 -@@ -917,6 +917,7 @@ - void (*delete_inode) (struct inode *); - void (*put_super) (struct super_block *); - void (*write_super) (struct super_block *); -+ int (*sync_fs) (struct super_block *); - void (*write_super_lockfs) (struct super_block *); - void (*unlockfs) (struct super_block *); - int (*statfs) (struct super_block *, struct statfs *); diff --git a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch 
b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch index 507b044..3f5687b 100644 --- a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch +++ b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch @@ -27,7 +27,7 @@ Index: linux-stage/fs/ext3/inode.c struct ext3_iloc *iloc, int in_mem) { unsigned long block; -@@ -2484,6 +2484,11 @@ +@@ -2484,6 +2484,11 @@ void ext3_read_inode(struct inode * inod ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); @@ -39,7 +39,7 @@ Index: linux-stage/fs/ext3/inode.c if (S_ISREG(inode->i_mode)) { inode->i_op = &ext3_file_inode_operations; inode->i_fop = &ext3_file_operations; -@@ -2619,6 +2624,9 @@ +@@ -2619,6 +2624,9 @@ static int ext3_do_update_inode(handle_t } else for (block = 0; block < EXT3_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; @@ -49,7 +49,7 @@ Index: linux-stage/fs/ext3/inode.c BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); rc = ext3_journal_dirty_metadata(handle, bh); if (!err) -@@ -2849,7 +2857,8 @@ +@@ -2849,7 +2857,8 @@ ext3_reserve_inode_write(handle_t *handl { int err = 0; if (handle) { diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch index 68e52bb..588916f 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch @@ -2540,10 +2540,10 @@ Index: linux-2.4.21-rhel/include/linux/ext3_fs.h * Structure of an inode on the disk @@ -332,6 +336,8 @@ #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ - #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ -+#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ -+#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ + #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 
0x100000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.21-suse2.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.21-suse2.patch index fb7d2cb..305ef8e 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.21-suse2.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.21-suse2.patch @@ -2539,10 +2539,10 @@ Index: linux-2.4.21-suse2/include/linux/ext3_fs.h * Structure of an inode on the disk @@ -328,6 +332,8 @@ #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ - #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ -+#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ -+#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ + #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch index 7246be1..8e84625 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch @@ -2527,10 +2527,10 @@ Index: linux-2.4.24/include/linux/ext3_fs.h * Structure of an inode on the disk @@ -327,6 +331,8 @@ #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ - #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ -+#define EXT3_MOUNT_EXTENTS 
0x100000/* Extents support */ -+#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ + #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.29.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.29.patch index d030f04..d77d9a7 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.29.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.29.patch @@ -2527,10 +2527,10 @@ Index: linux-2.4.29/include/linux/ext3_fs.h * Structure of an inode on the disk @@ -327,6 +331,8 @@ #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ - #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ -+#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ -+#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ + #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/ext3-extents-2.6.12.patch b/lustre/kernel_patches/patches/ext3-extents-2.6.12.patch new file mode 100644 index 0000000..657ecf4 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-2.6.12.patch @@ -0,0 +1,2924 @@ +Index: linux-2.6.12-rc6/fs/ext3/extents.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/extents.c 2005-06-14 16:31:25.756503133 +0200 ++++ 
linux-2.6.12-rc6/fs/ext3/extents.c 2005-06-14 16:31:25.836581257 +0200 +@@ -0,0 +1,2347 @@ ++/* ++ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++/* ++ * Extents support for EXT3 ++ * ++ * TODO: ++ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() ++ * - ext3_ext_calc_credits() could take 'mergable' into account ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - smart tree reduction ++ * - arch-independence ++ * common on-disk format for big/little-endian arch ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++static inline int ext3_ext_check_header(struct ext3_extent_header *eh) ++{ ++ if (eh->eh_magic != EXT3_EXT_MAGIC) { ++ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", ++ (unsigned)eh->eh_magic); ++ return -EIO; ++ } ++ if (eh->eh_max == 0) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n", ++ (unsigned)eh->eh_max); ++ return -EIO; ++ } ++ if (eh->eh_entries > eh->eh_max) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", ++ (unsigned)eh->eh_entries); ++ return -EIO; ++ } ++ return 0; ++} ++ ++static handle_t *ext3_ext_journal_restart(handle_t 
*handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} ++ ++static int inline ++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->get_write_access) ++ return tree->ops->get_write_access(h,tree->buffer); ++ else ++ return 0; ++} ++ ++static int inline ++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->mark_buffer_dirty) ++ return tree->ops->mark_buffer_dirty(h,tree->buffer); ++ else ++ return 0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ ++ if (path->p_bh) { ++ /* path points to block */ ++ err = ext3_journal_get_write_access(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_get_access_for_root(handle, tree); ++ } ++ return err; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ if (path->p_bh) { ++ /* path points to block */ ++ err =ext3_journal_dirty_metadata(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_mark_root_dirty(handle, tree); ++ } ++ return err; ++} ++ ++static int inline ++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, struct ext3_extent *ex, ++ int *err) ++{ ++ int goal, depth, newblock; ++ struct inode *inode; ++ ++ EXT_ASSERT(tree); ++ if (tree->ops->new_block) ++ return tree->ops->new_block(handle, tree, path, ex, err); ++ ++ inode = tree->inode; ++ depth = EXT_DEPTH(tree); ++ if (path && depth > 0) { ++ goal = 
path[depth-1].p_block; ++ } else { ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ ++ bg_start = (ei->i_block_group * ++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ goal = bg_start + colour; ++ } ++ ++ newblock = ext3_new_block(handle, inode, goal, err); ++ return newblock; ++} ++ ++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *neh; ++ neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation++; ++} ++ ++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 5; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int k, l = path->p_depth; ++ ++ ext_debug(tree, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(tree, 
" %d->%d", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ } else if (path->p_ext) { ++ ext_debug(tree, " %d:%d:%d", ++ path->p_ext->ee_block, ++ path->p_ext->ee_len, ++ path->p_ext->ee_start); ++ } else ++ ext_debug(tree, " []"); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; ++ int i; ++ ++ if (!path) ++ return; ++ ++ eh = path[depth].p_hdr; ++ ex = EXT_FIRST_EXTENT(eh); ++ ++ for (i = 0; i < eh->eh_entries; i++, ex++) { ++ ext_debug(tree, "%d:%d:%d ", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_drop_refs(struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) { ++ if (path->p_bh) { ++ brelse(path->p_bh); ++ path->p_bh = NULL; ++ } ++ } ++} ++ ++/* ++ * binary search for closest index by given block ++ */ ++static inline void ++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent_idx *ix; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_entries > 0); ++ ++ ext_debug(tree, "binsearch for %d(idx): ", block); ++ ++ path->p_idx = ix = EXT_FIRST_INDEX(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ix[l + k].ei_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ix += l; ++ path->p_idx = ix; ++ ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf); ++ ++ while (l++ < r) { ++ if (block < ix->ei_block) ++ break; ++ path->p_idx = ix++; ++ } ++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ ++#ifdef 
CHECK_BINSEARCH ++ { ++ struct ext3_extent_idx *chix; ++ ++ chix = ix = EXT_FIRST_INDEX(eh); ++ for (k = 0; k < eh->eh_entries; k++, ix++) { ++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { ++ printk("k=%d, ix=0x%p, first=0x%p\n", k, ++ ix, EXT_FIRST_INDEX(eh)); ++ printk("%u <= %u\n", ++ ix->ei_block,ix[-1].ei_block); ++ } ++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); ++ if (block < ix->ei_block) ++ break; ++ chix = ix; ++ } ++ EXT_ASSERT(chix == path->p_idx); ++ } ++#endif ++} ++ ++/* ++ * binary search for closest extent by given block ++ */ ++static inline void ++ext3_ext_binsearch(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent *ex; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ ++ if (eh->eh_entries == 0) { ++ /* ++ * this leaf is empty yet: ++ * we get such a leaf in split/add case ++ */ ++ return; ++ } ++ ++ ext_debug(tree, "binsearch for %d: ", block); ++ ++ path->p_ext = ex = EXT_FIRST_EXTENT(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ex[l + k].ee_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ex += l; ++ path->p_ext = ex; ++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++ while (l++ < r) { ++ if (block < ex->ee_block) ++ break; ++ path->p_ext = ex++; ++ } ++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent *chex; ++ ++ chex = ex = EXT_FIRST_EXTENT(eh); ++ for (k = 0; k < eh->eh_entries; k++, ex++) { ++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); ++ if (block < ex->ee_block) ++ break; ++ chex = ex; ++ } ++ EXT_ASSERT(chex == path->p_ext); ++ } ++#endif ++} ++ ++int ext3_extent_tree_init(handle_t *handle, 
struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *eh; ++ ++ BUG_ON(tree->buffer_len == 0); ++ ext3_ext_get_access_for_root(handle, tree); ++ eh = EXT_ROOT_HDR(tree); ++ eh->eh_depth = 0; ++ eh->eh_entries = 0; ++ eh->eh_magic = EXT3_EXT_MAGIC; ++ eh->eh_max = ext3_ext_space_root(tree); ++ ext3_ext_mark_root_dirty(handle, tree); ++ ext3_ext_invalidate_cache(tree); ++ return 0; ++} ++ ++struct ext3_ext_path * ++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ struct buffer_head *bh; ++ int depth, i, ppos = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ eh = EXT_ROOT_HDR(tree); ++ EXT_ASSERT(eh); ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ ++ i = depth = EXT_DEPTH(tree); ++ EXT_ASSERT(eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* account possible depth increase */ ++ if (!path) { ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), ++ GFP_NOFS); ++ if (!path) ++ return ERR_PTR(-ENOMEM); ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[0].p_hdr = eh; ++ ++ /* walk through the tree */ ++ while (i) { ++ ext_debug(tree, "depth %d: num %d, max %d\n", ++ ppos, eh->eh_entries, eh->eh_max); ++ ext3_ext_binsearch_idx(tree, path + ppos, block); ++ path[ppos].p_block = path[ppos].p_idx->ei_leaf; ++ path[ppos].p_depth = i; ++ path[ppos].p_ext = NULL; ++ ++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); ++ if (!bh) ++ goto err; ++ ++ eh = EXT_BLOCK_HDR(bh); ++ ppos++; ++ EXT_ASSERT(ppos <= depth); ++ path[ppos].p_bh = bh; ++ path[ppos].p_hdr = eh; ++ i--; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ } ++ ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ path[ppos].p_idx = NULL; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ ++ /* find extent */ ++ ext3_ext_binsearch(tree, path + ppos, block); ++ ++ 
ext3_ext_show_path(tree, path); ++ ++ return path; ++ ++err: ++ printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ return ERR_PTR(-EIO); ++} ++ ++/* ++ * insert new index [logical;ptr] into the block at cupr ++ * it check where to insert: before curp or after curp ++ */ ++static int ext3_ext_insert_index(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *curp, ++ int logical, int ptr) ++{ ++ struct ext3_extent_idx *ix; ++ int len, err; ++ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ return err; ++ ++ EXT_ASSERT(logical != curp->p_idx->ei_block); ++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; ++ if (logical > curp->p_idx->ei_block) { ++ /* insert after */ ++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d after: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ } ++ ix = curp->p_idx + 1; ++ } else { ++ /* insert before */ ++ len = len * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d before: %d. 
" ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->ei_block = logical; ++ ix->ei_leaf = ptr; ++ curp->p_hdr->eh_entries++; ++ ++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); ++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); ++ ++ err = ext3_ext_dirty(handle, tree, curp); ++ ext3_std_error(tree->inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at: ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extens and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ int i = at, k, m, a; ++ unsigned long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? */ ++ /* FIXME: now desicion is simplest: at current extent */ ++ ++ /* if current leaf will be splitted, then we should use ++ * border from split point */ ++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].ee_block; ++ ext_debug(tree, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->ee_block; ++ ext_debug(tree, "leaf will be added." ++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. 
so, index won't ++ * be inserted and tree will be in consistent ++ * state. next mount will repair buffers too ++ */ ++ ++ /* ++ * get array to track all allocated blocks ++ * we need this to handle errors and free blocks ++ * upon them ++ */ ++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); ++ if (!ablocks) ++ return -ENOMEM; ++ memset(ablocks, 0, sizeof(unsigned long) * depth); ++ ++ /* allocate all needed blocks */ ++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); ++ for (a = 0; a < depth - at; a++) { ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ goto cleanup; ++ ablocks[a] = newblock; ++ } ++ ++ /* initialize new leaf */ ++ newblock = ablocks[--a]; ++ EXT_ASSERT(newblock); ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 0; ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_depth = 0; ++ ex = EXT_FIRST_EXTENT(neh); ++ ++ /* move remain of path[depth] to the new leaf */ ++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); ++ /* start copy from next extent */ ++ /* TODO: we could do it by single memmove */ ++ m = 0; ++ path[depth].p_ext++; ++ while (path[depth].p_ext <= ++ EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", ++ path[depth].p_ext->ee_block, ++ path[depth].p_ext->ee_start, ++ path[depth].p_ext->ee_len, ++ newblock); ++ memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent)); ++ neh->eh_entries++; ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old leaf */ ++ if (m) { ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) 
++ goto cleanup; ++ path[depth].p_hdr->eh_entries -= m; ++ if ((err = ext3_ext_dirty(handle, tree, path + depth))) ++ goto cleanup; ++ ++ } ++ ++ /* create intermediate indexes */ ++ k = depth - at - 1; ++ EXT_ASSERT(k >= 0); ++ if (k) ++ ext_debug(tree, "create %d intermediate indices\n", k); ++ /* insert new index into current index block */ ++ /* current depth stored in i var */ ++ i = depth - 1; ++ while (k--) { ++ oldblock = newblock; ++ newblock = ablocks[--a]; ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 1; ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ neh->eh_depth = depth - i; ++ fidx = EXT_FIRST_INDEX(neh); ++ fidx->ei_block = border; ++ fidx->ei_leaf = oldblock; ++ ++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", ++ i, newblock, border, oldblock); ++ /* copy indexes */ ++ m = 0; ++ path[i].p_idx++; ++ ++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); ++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == ++ EXT_LAST_INDEX(path[i].p_hdr)); ++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ++ ext_debug(tree, "%d: move %d:%d in new index %lu\n", ++ i, path[i].p_idx->ei_block, ++ path[i].p_idx->ei_leaf, newblock); ++ memmove(++fidx, path[i].p_idx++, ++ sizeof(struct ext3_extent_idx)); ++ neh->eh_entries++; ++ EXT_ASSERT(neh->eh_entries <= neh->eh_max); ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old index */ ++ if (m) { ++ err = ext3_ext_get_access(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ path[i].p_hdr->eh_entries -= m; ++ err = ext3_ext_dirty(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ } ++ 
++ i--; ++ } ++ ++ /* insert new index */ ++ if (!err) ++ err = ext3_ext_insert_index(handle, tree, path + at, ++ border, newblock); ++ ++cleanup: ++ if (bh) { ++ if (buffer_locked(bh)) ++ unlock_buffer(bh); ++ brelse(bh); ++ } ++ ++ if (err) { ++ /* free all allocated blocks in error case */ ++ for (i = 0; i < depth; i++) { ++ if (!ablocks[i]) ++ continue; ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ } ++ } ++ kfree(ablocks); ++ ++ return err; ++} ++ ++/* ++ * routine implements tree growing procedure: ++ * - allocates new block ++ * - moves top-level data (index block or leaf) into the new block ++ * - initialize new top-level, creating index that points to the ++ * just created block ++ */ ++static int ext3_ext_grow_indepth(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp = path; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct buffer_head *bh; ++ unsigned long newblock; ++ int err = 0; ++ ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ return err; ++ ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ ext3_std_error(tree->inode->i_sb, err); ++ return err; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) { ++ unlock_buffer(bh); ++ goto out; ++ } ++ ++ /* move top-level index/leaf into new block */ ++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); ++ ++ /* set size of new block */ ++ neh = EXT_BLOCK_HDR(bh); ++ /* old root could have indexes or leaves ++ * so calculate eh_max right way */ ++ if (EXT_DEPTH(tree)) ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ else ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto out; ++ ++ /* create index in new top-level 
index: num,max,pointer */ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ goto out; ++ ++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; ++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); ++ curp->p_hdr->eh_entries = 1; ++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); ++ /* FIXME: it works, but actually path[0] can be index */ ++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; ++ curp->p_idx->ei_leaf = newblock; ++ ++ neh = EXT_ROOT_HDR(tree); ++ fidx = EXT_FIRST_INDEX(neh); ++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", ++ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); ++ ++ neh->eh_depth = path->p_depth + 1; ++ err = ext3_ext_dirty(handle, tree, curp); ++out: ++ brelse(bh); ++ ++ return err; ++} ++ ++/* ++ * routine finds empty index and adds new leaf. if no free index found ++ * then it requests in-depth growing ++ */ ++static int ext3_ext_create_new_leaf(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp; ++ int depth, i, err = 0; ++ ++repeat: ++ i = depth = EXT_DEPTH(tree); ++ ++ /* walk up to the tree and look for free index entry */ ++ curp = path + depth; ++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { ++ i--; ++ curp--; ++ } ++ ++ /* we use already allocated block for index block ++ * so, subsequent data blocks should be contigoues */ ++ if (EXT_HAS_FREE_INDEX(curp)) { ++ /* if we found index with free entry, then use that ++ * entry: create all needed subtree and add new leaf */ ++ err = ext3_ext_split(handle, tree, path, newext, i); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ } else { ++ /* tree is full, time to grow in depth */ ++ err = ext3_ext_grow_indepth(handle, tree, path, newext); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, 
newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ ++ /* ++ * only first (depth 0 -> 1) produces free space ++ * in all other cases we have to split growed tree ++ */ ++ depth = EXT_DEPTH(tree); ++ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { ++ /* now we need split */ ++ goto repeat; ++ } ++ } ++ ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++/* ++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK ++ * NOTE: it consider block number from index entry as ++ * allocated block. thus, index entries have to be consistent ++ * with leafs ++ */ ++static unsigned long ++ext3_ext_next_allocated_block(struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ if (depth == 0 && path->p_ext == NULL) ++ return EXT_MAX_BLOCK; ++ ++ /* FIXME: what if index isn't full ?! */ ++ while (depth >= 0) { ++ if (depth == path->p_depth) { ++ /* leaf */ ++ if (path[depth].p_ext != ++ EXT_LAST_EXTENT(path[depth].p_hdr)) ++ return path[depth].p_ext[1].ee_block; ++ } else { ++ /* index */ ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ } ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * returns first allocated block from next leaf or EXT_MAX_BLOCK ++ */ ++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ /* zero-tree has no leaf blocks at all */ ++ if (depth == 0) ++ return EXT_MAX_BLOCK; ++ ++ /* go to index block */ ++ depth--; ++ ++ while (depth >= 0) { ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * if leaf gets modified and modified extent is first in the leaf ++ * then we have to correct all indexes above ++ * TODO: do we need to correct tree in all 
cases? ++ */ ++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent *ex; ++ unsigned long border; ++ int k, err = 0; ++ ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(eh); ++ ++ if (depth == 0) { ++ /* there is no tree at all */ ++ return 0; ++ } ++ ++ if (ex != EXT_FIRST_EXTENT(eh)) { ++ /* we correct tree if first leaf got modified only */ ++ return 0; ++ } ++ ++ /* ++ * TODO: we need correction if border is smaller then current one ++ */ ++ k = depth - 1; ++ border = path[depth].p_ext->ee_block; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ return err; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ return err; ++ ++ while (k--) { ++ /* change all left-side indexes */ ++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) ++ break; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ break; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ break; ++ } ++ ++ return err; ++} ++ ++static int inline ++ext3_can_extents_be_merged(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block) ++ return 0; ++ ++#ifdef AGRESSIVE_TEST ++ if (ex1->ee_len >= 4) ++ return 0; ++#endif ++ ++ if (!tree->ops->mergable) ++ return 1; ++ ++ return tree->ops->mergable(ex1, ex2); ++} ++ ++/* ++ * this routine tries to merge requsted extent into the existing ++ * extent or inserts requested extent as new one into the tree, ++ * creating new leaf in no-space case ++ */ ++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_extent_header * eh; ++ struct ext3_extent *ex, *fex; ++ struct 
ext3_extent *nearex; /* nearest extent */ ++ struct ext3_ext_path *npath = NULL; ++ int depth, len, err, next; ++ ++ EXT_ASSERT(newext->ee_len > 0); ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(path[depth].p_hdr); ++ ++ /* try to insert block into found extent and return */ ++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { ++ ext_debug(tree, "append %d block to %d:%d (from %d)\n", ++ newext->ee_len, ex->ee_block, ex->ee_len, ++ ex->ee_start); ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ return err; ++ ex->ee_len += newext->ee_len; ++ eh = path[depth].p_hdr; ++ nearex = ex; ++ goto merge; ++ } ++ ++repeat: ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) ++ goto has_space; ++ ++ /* probably next leaf has space for us? */ ++ fex = EXT_LAST_EXTENT(eh); ++ next = ext3_ext_next_leaf_block(tree, path); ++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { ++ ext_debug(tree, "next leaf block - %d\n", next); ++ EXT_ASSERT(!npath); ++ npath = ext3_ext_find_extent(tree, next, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ EXT_ASSERT(npath->p_depth == path->p_depth); ++ eh = npath[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) { ++ ext_debug(tree, "next leaf isnt full(%d)\n", ++ eh->eh_entries); ++ path = npath; ++ goto repeat; ++ } ++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", ++ eh->eh_entries, eh->eh_max); ++ } ++ ++ /* ++ * there is no free space in found leaf ++ * we're gonna add new leaf in the tree ++ */ ++ err = ext3_ext_create_new_leaf(handle, tree, path, newext); ++ if (err) ++ goto cleanup; ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ ++has_space: ++ nearex = path[depth].p_ext; ++ ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ ++ if (!nearex) { ++ /* there is no extent in this leaf, create first one */ ++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", ++ 
newext->ee_block, newext->ee_start, ++ newext->ee_len); ++ path[depth].p_ext = EXT_FIRST_EXTENT(eh); ++ } else if (newext->ee_block > nearex->ee_block) { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ if (nearex != EXT_LAST_EXTENT(eh)) { ++ len = EXT_MAX_EXTENT(eh) - nearex; ++ len = (len - 1) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 2, nearex + 1, len); ++ } ++ path[depth].p_ext = nearex + 1; ++ } else { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 1, nearex, len); ++ path[depth].p_ext = nearex; ++ } ++ ++ eh->eh_entries++; ++ nearex = path[depth].p_ext; ++ nearex->ee_block = newext->ee_block; ++ nearex->ee_start = newext->ee_start; ++ nearex->ee_len = newext->ee_len; ++ /* FIXME: support for large fs */ ++ nearex->ee_start_hi = 0; ++ ++merge: ++ /* try to merge extents to the right */ ++ while (nearex < EXT_LAST_EXTENT(eh)) { ++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) ++ break; ++ /* merge with next extent! 
*/ ++ nearex->ee_len += nearex[1].ee_len; ++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { ++ len = (EXT_LAST_EXTENT(eh) - nearex - 1) * ++ sizeof(struct ext3_extent); ++ memmove(nearex + 1, nearex + 2, len); ++ } ++ eh->eh_entries--; ++ EXT_ASSERT(eh->eh_entries > 0); ++ } ++ ++ /* try to merge extents to the left */ ++ ++ /* time to correct all indexes above */ ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ if (err) ++ goto cleanup; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ ++cleanup: ++ if (npath) { ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ } ++ ext3_ext_tree_changed(tree); ++ ext3_ext_invalidate_cache(tree); ++ return err; ++} ++ ++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, ++ unsigned long num, ext_prepare_callback func) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_ext_cache cbex; ++ struct ext3_extent *ex; ++ unsigned long next, start = 0, end = 0; ++ unsigned long last = block + num; ++ int depth, exists, err = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(func); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ while (block < last && block != EXT_MAX_BLOCK) { ++ num = last - block; ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(tree, block, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ break; ++ } ++ ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(path[depth].p_hdr); ++ ex = path[depth].p_ext; ++ next = ext3_ext_next_allocated_block(path); ++ ++ exists = 0; ++ if (!ex) { ++ /* there is no extent yet, so try to allocate ++ * all requested space */ ++ start = block; ++ end = block + num; ++ } else if (ex->ee_block > block) { ++ /* need to allocate space before found extent */ ++ start = block; ++ end = ex->ee_block; ++ if (block + num < end) ++ end = block + num; ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ /* need to allocate space after found extent */ ++ start = block; ++ end = block + num; ++ if (end >= next) ++ end = next; 
++ } else if (block >= ex->ee_block) { ++ /* ++ * some part of requested space is covered ++ * by found extent ++ */ ++ start = block; ++ end = ex->ee_block + ex->ee_len; ++ if (block + num < end) ++ end = block + num; ++ exists = 1; ++ } else { ++ BUG(); ++ } ++ EXT_ASSERT(end > start); ++ ++ if (!exists) { ++ cbex.ec_block = start; ++ cbex.ec_len = end - start; ++ cbex.ec_start = 0; ++ cbex.ec_type = EXT3_EXT_CACHE_GAP; ++ } else { ++ cbex.ec_block = ex->ee_block; ++ cbex.ec_len = ex->ee_len; ++ cbex.ec_start = ex->ee_start; ++ cbex.ec_type = EXT3_EXT_CACHE_EXTENT; ++ } ++ ++ EXT_ASSERT(cbex.ec_len > 0); ++ EXT_ASSERT(path[depth].p_hdr); ++ err = func(tree, path, &cbex); ++ ext3_ext_drop_refs(path); ++ ++ if (err < 0) ++ break; ++ if (err == EXT_REPEAT) ++ continue; ++ else if (err == EXT_BREAK) { ++ err = 0; ++ break; ++ } ++ ++ if (EXT_DEPTH(tree) != depth) { ++ /* depth was changed. we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ block = cbex.ec_block + cbex.ec_len; ++ } ++ ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ return err; ++} ++ ++static inline void ++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, ++ __u32 len, __u32 start, int type) ++{ ++ EXT_ASSERT(len > 0); ++ if (tree->cex) { ++ tree->cex->ec_type = type; ++ tree->cex->ec_block = block; ++ tree->cex->ec_len = len; ++ tree->cex->ec_start = start; ++ } ++} ++ ++/* ++ * this routine calculate boundaries of the gap requested block fits into ++ * and cache this gap ++ */ ++static inline void ++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ unsigned long block) ++{ ++ int depth = EXT_DEPTH(tree); ++ unsigned long lblock, len; ++ struct ext3_extent *ex; ++ ++ if (!tree->cex) ++ return; ++ ++ ex = path[depth].p_ext; ++ if (ex == NULL) { ++ /* there is no extent yet, so gap is [0;-] */ ++ lblock = 0; ++ len = EXT_MAX_BLOCK; ++ ext_debug(tree, "cache gap(whole file):"); ++ } else if (block < 
ex->ee_block) { ++ lblock = block; ++ len = ex->ee_block - block; ++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len); ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ lblock = ex->ee_block + ex->ee_len; ++ len = ext3_ext_next_allocated_block(path); ++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) block); ++ EXT_ASSERT(len > lblock); ++ len = len - lblock; ++ } else { ++ lblock = len = 0; ++ BUG(); ++ } ++ ++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len); ++ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP); ++} ++ ++static inline int ++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, ++ struct ext3_extent *ex) ++{ ++ struct ext3_ext_cache *cex = tree->cex; ++ ++ /* is there cache storage at all? */ ++ if (!cex) ++ return EXT3_EXT_CACHE_NO; ++ ++ /* has cache valid data? */ ++ if (cex->ec_type == EXT3_EXT_CACHE_NO) ++ return EXT3_EXT_CACHE_NO; ++ ++ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || ++ cex->ec_type == EXT3_EXT_CACHE_EXTENT); ++ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { ++ ex->ee_block = cex->ec_block; ++ ex->ee_start = cex->ec_start; ++ ex->ee_len = cex->ec_len; ++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) ex->ee_start); ++ return cex->ec_type; ++ } ++ ++ /* not in cache */ ++ return EXT3_EXT_CACHE_NO; ++} ++ ++/* ++ * routine removes index from the index block ++ * it's used in truncate case only. 
thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->eh_entries); ++ if ((err = ext3_ext_get_access(handle, tree, path))) ++ return err; ++ path->p_hdr->eh_entries--; ++ if ((err = ext3_ext_dirty(handle, tree, path))) ++ return err; ++ ext_debug(tree, "index is empty, remove it, free block %d\n", ++ path->p_idx->ei_leaf); ++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ return err; ++} ++ ++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth = EXT_DEPTH(tree); ++ int needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) ++ return 1; ++ } ++ ++ /* ++ * the worste case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ depth = depth + 1; ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. 
we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ return needed; ++} ++ ++static int ++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, tex; ++ struct ext3_ext_path *npath; ++ int depth, creds, err; ++ ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); ++ EXT_ASSERT(ex->ee_block < start); ++ ++ /* calculate tail extent */ ++ tex.ee_block = end + 1; ++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); ++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; ++ ++ creds = ext3_ext_calc_credits_for_insert(tree, path); ++ handle = ext3_ext_journal_restart(handle, creds); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ /* calculate head extent. use primary extent */ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ return err; ++ ex->ee_len = start - ex->ee_block; ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ return err; ++ ++ /* FIXME: some callback to free underlying resource ++ * and correct ee_start? 
*/ ++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", ++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); ++ ++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); ++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); ++ ++ err = ext3_ext_insert_extent(handle, tree, npath, &tex); ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ ++ return err; ++} ++ ++static int ++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, *fu = NULL, *lu, *le; ++ int err = 0, correct_index = 0; ++ int depth = EXT_DEPTH(tree), credits; ++ struct ext3_extent_header *eh; ++ unsigned a, b, block, num; ++ ++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); ++ if (!path[depth].p_hdr) ++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); ++ eh = path[depth].p_hdr; ++ EXT_ASSERT(eh); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* find where to start removing */ ++ le = ex = EXT_LAST_EXTENT(eh); ++ while (ex != EXT_FIRST_EXTENT(eh)) { ++ if (ex->ee_block <= end) ++ break; ++ ex--; ++ } ++ ++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { ++ /* removal of internal part of the extent requested ++ * tail and head must be placed in different extent ++ * so, we have to insert one more extent */ ++ path[depth].p_ext = ex; ++ return ext3_ext_split_for_rm(handle, tree, path, start, end); ++ } ++ ++ lu = ex; ++ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) { ++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); ++ path[depth].p_ext = ex; ++ ++ a = ex->ee_block > start ? ex->ee_block : start; ++ b = ex->ee_block + ex->ee_len - 1 < end ? 
++ ex->ee_block + ex->ee_len - 1 : end; ++ ++ ext_debug(tree, " border %u:%u\n", a, b); ++ ++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { ++ block = 0; ++ num = 0; ++ BUG(); ++ } else if (a != ex->ee_block) { ++ /* remove tail of the extent */ ++ block = ex->ee_block; ++ num = a - block; ++ } else if (b != ex->ee_block + ex->ee_len - 1) { ++ /* remove head of the extent */ ++ block = a; ++ num = b - a; ++ } else { ++ /* remove whole extent: excelent! */ ++ block = ex->ee_block; ++ num = 0; ++ EXT_ASSERT(a == ex->ee_block && ++ b == ex->ee_block + ex->ee_len - 1); ++ } ++ ++ if (ex == EXT_FIRST_EXTENT(eh)) ++ correct_index = 1; ++ ++ credits = 1; ++ if (correct_index) ++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; ++ if (tree->ops->remove_extent_credits) ++ credits+=tree->ops->remove_extent_credits(tree,ex,a,b); ++ ++ handle = ext3_ext_journal_restart(handle, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ goto out; ++ } ++ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ if (tree->ops->remove_extent) ++ err = tree->ops->remove_extent(tree, ex, a, b); ++ if (err) ++ goto out; ++ ++ if (num == 0) { ++ /* this extent is removed entirely mark slot unused */ ++ ex->ee_start = 0; ++ eh->eh_entries--; ++ fu = ex; ++ } ++ ++ ex->ee_block = block; ++ ex->ee_len = num; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ ext_debug(tree, "new extent: %u:%u:%u\n", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ ex--; ++ } ++ ++ if (fu) { ++ /* reuse unused slots */ ++ while (lu < le) { ++ if (lu->ee_start) { ++ *fu = *lu; ++ lu->ee_start = 0; ++ fu++; ++ } ++ lu++; ++ } ++ } ++ ++ if (correct_index && eh->eh_entries) ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ ++ /* if this leaf is free, then we should ++ * remove it from index block above */ ++ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) ++ err = ext3_ext_rm_idx(handle, tree, 
path + depth); ++ ++out: ++ return err; ++} ++ ++ ++static struct ext3_extent_idx * ++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) ++{ ++ struct ext3_extent_idx *ix; ++ ++ ix = EXT_LAST_INDEX(hdr); ++ while (ix != EXT_FIRST_INDEX(hdr)) { ++ if (ix->ei_block <= block) ++ break; ++ ix--; ++ } ++ return ix; ++} ++ ++/* ++ * returns 1 if current index have to be freed (even partial) ++ */ ++static int inline ++ext3_ext_more_to_rm(struct ext3_ext_path *path) ++{ ++ EXT_ASSERT(path->p_idx); ++ ++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) ++ return 0; ++ ++ /* ++ * if truncate on deeper level happened it it wasn't partial ++ * so we have to consider current index for truncation ++ */ ++ if (path->p_hdr->eh_entries == path->p_block) ++ return 0; ++ return 1; ++} ++ ++int ext3_ext_remove_space(struct ext3_extents_tree *tree, ++ unsigned long start, unsigned long end) ++{ ++ struct inode *inode = tree->inode; ++ struct super_block *sb = inode->i_sb; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_ext_path *path; ++ handle_t *handle; ++ int i = 0, err = 0; ++ ++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); ++ ++ /* probably first extent we're gonna free will be last in block */ ++ handle = ext3_journal_start(inode, depth + 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ ext3_ext_invalidate_cache(tree); ++ ++ /* ++ * we start scanning from right side freeing all the blocks ++ * after i_size and walking into the deep ++ */ ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); ++ if (IS_ERR(path)) { ++ ext3_error(sb, __FUNCTION__, "Can't allocate path array"); ++ ext3_journal_stop(handle); ++ return -ENOMEM; ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[i].p_hdr = EXT_ROOT_HDR(tree); ++ ++ while (i >= 0 && err == 0) { ++ if (i == depth) { ++ /* this is leaf block */ ++ err = ext3_ext_rm_leaf(handle, tree, path, start, end); ++ /* root level have p_bh == NULL, 
brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ continue; ++ } ++ ++ /* this is index block */ ++ if (!path[i].p_hdr) { ++ ext_debug(tree, "initialize header\n"); ++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); ++ } ++ ++ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); ++ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); ++ ++ if (!path[i].p_idx) { ++ /* this level hasn't touched yet */ ++ path[i].p_idx = ++ ext3_ext_last_covered(path[i].p_hdr, end); ++ path[i].p_block = path[i].p_hdr->eh_entries + 1; ++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", ++ path[i].p_hdr, path[i].p_hdr->eh_entries); ++ } else { ++ /* we've already was here, see at next index */ ++ path[i].p_idx--; ++ } ++ ++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", ++ i, EXT_FIRST_INDEX(path[i].p_hdr), ++ path[i].p_idx); ++ if (ext3_ext_more_to_rm(path + i)) { ++ /* go to the next level */ ++ ext_debug(tree, "move to level %d (block %d)\n", ++ i + 1, path[i].p_idx->ei_leaf); ++ memset(path + i + 1, 0, sizeof(*path)); ++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); ++ if (!path[i+1].p_bh) { ++ /* should we reset i_size? 
*/ ++ err = -EIO; ++ break; ++ } ++ /* put actual number of indexes to know is this ++ * number got changed at the next iteration */ ++ path[i].p_block = path[i].p_hdr->eh_entries; ++ i++; ++ } else { ++ /* we finish processing this index, go up */ ++ if (path[i].p_hdr->eh_entries == 0 && i > 0) { ++ /* index is empty, remove it ++ * handle must be already prepared by the ++ * truncatei_leaf() */ ++ err = ext3_ext_rm_idx(handle, tree, path + i); ++ } ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ ext_debug(tree, "return to level %d\n", i); ++ } ++ } ++ ++ /* TODO: flexible tree reduction should be here */ ++ if (path->p_hdr->eh_entries == 0) { ++ /* ++ * truncate to zero freed all the tree ++ * so, we need to correct eh_depth ++ */ ++ err = ext3_ext_get_access(handle, tree, path); ++ if (err == 0) { ++ EXT_ROOT_HDR(tree)->eh_depth = 0; ++ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree); ++ err = ext3_ext_dirty(handle, tree, path); ++ } ++ } ++ ext3_ext_tree_changed(tree); ++ ++ kfree(path); ++ ext3_journal_stop(handle); ++ ++ return err; ++} ++ ++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks) ++{ ++ int lcap, icap, rcap, leafs, idxs, num; ++ ++ rcap = ext3_ext_space_root(tree); ++ if (blocks <= rcap) { ++ /* all extents fit to the root */ ++ return 0; ++ } ++ ++ rcap = ext3_ext_space_root_idx(tree); ++ lcap = ext3_ext_space_block(tree); ++ icap = ext3_ext_space_block_idx(tree); ++ ++ num = leafs = (blocks + lcap - 1) / lcap; ++ if (leafs <= rcap) { ++ /* all pointers to leafs fit to the root */ ++ return leafs; ++ } ++ ++ /* ok. 
we need separate index block(s) to link all leaf blocks */ ++ idxs = (leafs + icap - 1) / icap; ++ do { ++ num += idxs; ++ idxs = (idxs + icap - 1) / icap; ++ } while (idxs > rcap); ++ ++ return num; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) { ++ printk("EXT3-fs: file extents enabled"); ++#ifdef AGRESSIVE_TEST ++ printk(", agressive tests"); ++#endif ++#ifdef CHECK_BINSEARCH ++ printk(", check binsearch"); ++#endif ++ printk("\n"); ++ } ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++} ++ ++/************************************************************************ ++ * VFS related routines ++ ************************************************************************/ ++ ++static int ext3_get_inode_write_access(handle_t *handle, void *buffer) ++{ ++ /* we use in-core data, not bh */ ++ return 0; ++} ++ ++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct inode *inode = buffer; ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static int ext3_ext_mergable(struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ /* FIXME: support for large fs */ ++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) ++ return 1; ++ return 0; ++} ++ ++static int ++ext3_remove_blocks_credits(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed; ++ ++ /* at present, extent can't cross block group */; ++ needed = 4; /* bitmap + group desc + sb + inode */ ++ ++#ifdef CONFIG_QUOTA ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ return needed; ++} ++ ++static int ++ext3_remove_blocks(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed = ext3_remove_blocks_credits(tree, ex, from, to); ++ handle_t *handle = 
ext3_journal_start(tree->inode, needed); ++ struct buffer_head *bh; ++ int i; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { ++ /* tail removal */ ++ unsigned long num, start; ++ num = ex->ee_block + ex->ee_len - from; ++ start = ex->ee_start + ex->ee_len - num; ++ ext_debug(tree, "free last %lu blocks starting %lu\n", ++ num, start); ++ for (i = 0; i < num; i++) { ++ bh = sb_find_get_block(tree->inode->i_sb, start + i); ++ ext3_forget(handle, 0, tree->inode, bh, start + i); ++ } ++ ext3_free_blocks(handle, tree->inode, start, num); ++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { ++ printk("strange request: removal %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } else { ++ printk("strange request: removal(2) %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } ++ ext3_journal_stop(handle); ++ return 0; ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, ++ struct ext3_ext_path *path, unsigned long block) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ struct ext3_extent *ex; ++ depth = path->p_depth; ++ ++ /* try to predict block placement */ ++ if ((ex = path[depth].p_ext)) ++ return ex->ee_start + (block - ex->ee_block); ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. 
use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour + block; ++} ++ ++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int *err) ++{ ++ struct inode *inode = tree->inode; ++ int newblock, goal; ++ ++ EXT_ASSERT(path); ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->ee_start); ++ EXT_ASSERT(ex->ee_len); ++ ++ /* reuse block from the extent to order data/metadata */ ++ newblock = ex->ee_start++; ++ ex->ee_len--; ++ if (ex->ee_len == 0) { ++ ex->ee_len = 1; ++ /* allocate new block for the extent */ ++ goal = ext3_ext_find_goal(inode, path, ex->ee_block); ++ ex->ee_start = ext3_new_block(handle, inode, goal, err); ++ if (ex->ee_start == 0) { ++ /* error occured: restore old extent */ ++ ex->ee_start = newblock; ++ return 0; ++ } ++ } ++ return newblock; ++} ++ ++static struct ext3_extents_helpers ext3_blockmap_helpers = { ++ .get_write_access = ext3_get_inode_write_access, ++ .mark_buffer_dirty = ext3_mark_buffer_dirty, ++ .mergable = ext3_ext_mergable, ++ .new_block = ext3_new_block_cb, ++ .remove_extent = ext3_remove_blocks, ++ .remove_extent_credits = ext3_remove_blocks_credits, ++}; ++ ++void ext3_init_tree_desc(struct ext3_extents_tree *tree, ++ struct inode *inode) ++{ ++ tree->inode = inode; ++ tree->root = (void *) EXT3_I(inode)->i_data; ++ tree->buffer = (void *) inode; ++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); ++ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; ++ tree->ops = &ext3_blockmap_helpers; ++} ++ ++int ext3_ext_get_block(handle_t *handle, struct inode *inode, ++ long iblock, struct buffer_head *bh_result, ++ int create, int extend_disksize) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent newex; ++ struct 
ext3_extent *ex; ++ int goal, newblock, err = 0, depth; ++ struct ext3_extents_tree tree; ++ ++ clear_buffer_new(bh_result); ++ ext3_init_tree_desc(&tree, inode); ++ ext_debug(&tree, "block %d requested for inode %u\n", ++ (int) iblock, (unsigned) inode->i_ino); ++ down(&EXT3_I(inode)->truncate_sem); ++ ++ /* check in cache */ ++ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { ++ if (goal == EXT3_EXT_CACHE_GAP) { ++ if (!create) { ++ /* block isn't allocated yet and ++ * user don't want to allocate it */ ++ goto out2; ++ } ++ /* we should allocate requested block */ ++ } else if (goal == EXT3_EXT_CACHE_EXTENT) { ++ /* block is already allocated */ ++ newblock = iblock - newex.ee_block + newex.ee_start; ++ goto out; ++ } else { ++ EXT_ASSERT(0); ++ } ++ } ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(&tree, iblock, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ goto out2; ++ } ++ ++ depth = EXT_DEPTH(&tree); ++ ++ /* ++ * consistent leaf must not be empty ++ * this situations is possible, though, _during_ tree modification ++ * this is why assert can't be put in ext3_ext_find_extent() ++ */ ++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); ++ ++ if ((ex = path[depth].p_ext)) { ++ /* if found exent covers block, simple return it */ ++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { ++ newblock = iblock - ex->ee_block + ex->ee_start; ++ ext_debug(&tree, "%d fit into %d:%d -> %d\n", ++ (int) iblock, ex->ee_block, ex->ee_len, ++ newblock); ++ ext3_ext_put_in_cache(&tree, ex->ee_block, ++ ex->ee_len, ex->ee_start, ++ EXT3_EXT_CACHE_EXTENT); ++ goto out; ++ } ++ } ++ ++ /* ++ * requested block isn't allocated yet ++ * we couldn't try to create block if create flag is zero ++ */ ++ if (!create) { ++ /* put just found gap into cache to speedup subsequest reqs */ ++ ext3_ext_put_gap_in_cache(&tree, path, iblock); ++ goto out2; ++ } ++ ++ /* allocate new block */ ++ goal = 
ext3_ext_find_goal(inode, path, iblock); ++ newblock = ext3_new_block(handle, inode, goal, &err); ++ if (!newblock) ++ goto out2; ++ ext_debug(&tree, "allocate new block: goal %d, found %d\n", ++ goal, newblock); ++ ++ /* try to insert new extent into found leaf and return */ ++ newex.ee_block = iblock; ++ newex.ee_start = newblock; ++ newex.ee_len = 1; ++ err = ext3_ext_insert_extent(handle, &tree, path, &newex); ++ if (err) ++ goto out2; ++ ++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ++ /* previous routine could use block we allocated */ ++ newblock = newex.ee_start; ++ set_buffer_new(bh_result); ++ ++ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, ++ newex.ee_start, EXT3_EXT_CACHE_EXTENT); ++out: ++ ext3_ext_show_leaf(&tree, path); ++ map_bh(bh_result, inode->i_sb, newblock); ++out2: ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ up(&EXT3_I(inode)->truncate_sem); ++ ++ return err; ++} ++ ++void ext3_ext_truncate(struct inode * inode, struct page *page) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct super_block *sb = inode->i_sb; ++ struct ext3_extents_tree tree; ++ unsigned long last_block; ++ handle_t *handle; ++ int err = 0; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ err = ext3_writepage_trans_blocks(inode) + 3; ++ handle = ext3_journal_start(inode, err); ++ if (IS_ERR(handle)) { ++ if (page) { ++ clear_highpage(page); ++ flush_dcache_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ return; ++ } ++ ++ if (page) ++ ext3_block_truncate_page(handle, page, mapping, inode->i_size); ++ ++ down(&EXT3_I(inode)->truncate_sem); ++ ext3_ext_invalidate_cache(&tree); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need not scaning at all, ++ * because page truncation is enough ++ */ ++ if (ext3_orphan_add(handle, inode)) ++ 
goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ last_block = (inode->i_size + sb->s_blocksize - 1) >> ++ EXT3_BLOCK_SIZE_BITS(sb); ++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. ++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ up(&EXT3_I(inode)->truncate_sem); ++ ext3_journal_stop(handle); ++} ++ ++/* ++ * this routine calculate max number of blocks we could modify ++ * in order to allocate new block for an inode ++ */ ++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) ++{ ++ struct ext3_extents_tree tree; ++ int needed; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); ++ ++ /* caller want to allocate num blocks */ ++ needed *= num; ++ ++#ifdef CONFIG_QUOTA ++ /* ++ * FIXME: real calculation should be here ++ * it depends on blockmap format of qouta file ++ */ ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return needed; ++} ++ ++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ext3_extent_tree_init(handle, &tree); ++} ++ ++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ return ext3_ext_calc_metadata_amount(&tree, blocks); ++} ++ ++static int 
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *newex) ++{ ++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; ++ ++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ if (buf->err < 0) ++ return EXT_BREAK; ++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) ++ return EXT_BREAK; ++ ++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { ++ buf->err++; ++ buf->cur += sizeof(*newex); ++ } else { ++ buf->err = -EFAULT; ++ return EXT_BREAK; ++ } ++ return EXT_CONTINUE; ++} ++ ++static int ++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *ex) ++{ ++ struct ext3_extent_tree_stats *buf = ++ (struct ext3_extent_tree_stats *) tree->private; ++ int depth; ++ ++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ depth = EXT_DEPTH(tree); ++ buf->extents_num++; ++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) ++ buf->leaf_num++; ++ return EXT_CONTINUE; ++} ++ ++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err = 0; ++ ++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) ++ return -EINVAL; ++ ++ if (cmd == EXT3_IOC_GET_EXTENTS) { ++ struct ext3_extent_buf buf; ++ struct ext3_extents_tree tree; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ ++ ext3_init_tree_desc(&tree, inode); ++ buf.cur = buf.buffer; ++ buf.err = 0; ++ tree.private = &buf; ++ down(&EXT3_I(inode)->truncate_sem); ++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, ++ ext3_ext_store_extent_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (err == 0) ++ err = buf.err; ++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { ++ struct ext3_extent_tree_stats buf; ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ down(&EXT3_I(inode)->truncate_sem); ++ buf.depth = 
EXT_DEPTH(&tree); ++ buf.extents_num = 0; ++ buf.leaf_num = 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, ++ ext3_ext_collect_stats_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (!err) ++ err = copy_to_user((void *) arg, &buf, sizeof(buf)); ++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { ++ struct ext3_extents_tree tree; ++ ext3_init_tree_desc(&tree, inode); ++ down(&EXT3_I(inode)->truncate_sem); ++ err = EXT_DEPTH(&tree); ++ up(&EXT3_I(inode)->truncate_sem); ++ } ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(ext3_init_tree_desc); ++EXPORT_SYMBOL(ext3_mark_inode_dirty); ++EXPORT_SYMBOL(ext3_ext_invalidate_cache); ++EXPORT_SYMBOL(ext3_ext_insert_extent); ++EXPORT_SYMBOL(ext3_ext_walk_space); ++EXPORT_SYMBOL(ext3_ext_find_goal); ++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); +Index: linux-2.6.12-rc6/fs/ext3/ialloc.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/ialloc.c 2005-06-14 16:31:08.634433030 +0200 ++++ linux-2.6.12-rc6/fs/ext3/ialloc.c 2005-06-14 16:31:25.846346882 +0200 +@@ -598,7 +598,7 @@ + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + +- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; ++ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); + if (S_ISLNK(mode)) + ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); + /* dirsync only applies to directories */ +@@ -639,6 +639,18 @@ + DQUOT_FREE_INODE(inode); + goto fail2; + } ++ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++ if (err) goto fail; ++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 
++ } ++ } ++ + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); +Index: linux-2.6.12-rc6/fs/ext3/inode.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/inode.c 2005-06-14 16:31:09.701815830 +0200 ++++ linux-2.6.12-rc6/fs/ext3/inode.c 2005-06-14 16:31:25.861971882 +0200 +@@ -40,7 +40,7 @@ + #include "iopen.h" + #include "acl.h" + +-static int ext3_writepage_trans_blocks(struct inode *inode); ++int ext3_writepage_trans_blocks(struct inode *inode); + + /* + * Test whether an inode is a fast symlink. +@@ -784,6 +784,17 @@ + return err; + } + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create, int extend_disksize) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create, ++ extend_disksize); ++ return ext3_get_block_handle(handle, inode, block, bh, create, ++ extend_disksize); ++} ++ + static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) + { +@@ -794,8 +805,8 @@ + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, +- bh_result, create, 1); ++ ret = ext3_get_block_wrap(handle, inode, iblock, ++ bh_result, create, 1); + return ret; + } + +@@ -839,7 +850,7 @@ + + get_block: + if (ret == 0) +- ret = ext3_get_block_handle(handle, inode, iblock, ++ ret = ext3_get_block_wrap(handle, inode, iblock, + bh_result, create, 0); + bh_result->b_size = (1 << inode->i_blkbits); + return ret; +@@ -859,7 +870,7 @@ + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); ++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = 
sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1593,7 +1604,7 @@ + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, struct page *page, ++int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) + { + unsigned long index = from >> PAGE_CACHE_SHIFT; +@@ -2104,6 +2115,9 @@ + return; + } + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode, page); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { +@@ -2850,12 +2864,15 @@ + * block and work out the exact number of indirects which are touched. Pah. + */ + +-static int ext3_writepage_trans_blocks(struct inode *inode) ++int ext3_writepage_trans_blocks(struct inode *inode) + { + int bpp = ext3_journal_blocks_per_page(inode); + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +Index: linux-2.6.12-rc6/fs/ext3/Makefile +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/Makefile 2005-06-14 16:31:09.179354899 +0200 ++++ linux-2.6.12-rc6/fs/ext3/Makefile 2005-06-14 16:31:25.872714069 +0200 +@@ -5,7 +5,8 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ +- ioctl.o namei.o super.o symlink.o hash.o resize.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o \ ++ extents.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-2.6.12-rc6/fs/ext3/super.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/super.c 
2005-06-14 16:31:09.950839264 +0200 ++++ linux-2.6.12-rc6/fs/ext3/super.c 2005-06-14 16:31:25.886385944 +0200 +@@ -387,6 +387,7 @@ + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -451,6 +452,8 @@ + #endif + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; ++ ++ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); + return &ei->vfs_inode; + } + +@@ -593,6 +596,7 @@ + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, ++ Opt_extents, Opt_extdebug, + }; + + static match_table_t tokens = { +@@ -644,6 +647,8 @@ + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, ++ {Opt_extents, "extents"}, ++ {Opt_extdebug, "extdebug"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -953,6 +958,12 @@ + case Opt_nobh: + set_opt(sbi->s_mount_opt, NOBH); + break; ++ case Opt_extents: ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_extdebug: ++ set_opt (sbi->s_mount_opt, EXTDEBUG); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1668,6 +1681,7 @@ + percpu_counter_mod(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + ++ ext3_ext_init(sb); + lock_kernel(); + return 0; + +Index: linux-2.6.12-rc6/fs/ext3/ioctl.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/ioctl.c 2005-06-14 16:31:08.646151780 +0200 ++++ linux-2.6.12-rc6/fs/ext3/ioctl.c 2005-06-14 16:31:25.897128131 +0200 +@@ -124,6 +124,10 @@ + err = ext3_change_inode_journal_flag(inode, jflag); + return err; + } ++ case EXT3_IOC_GET_EXTENTS: ++ case EXT3_IOC_GET_TREE_STATS: ++ case EXT3_IOC_GET_TREE_DEPTH: ++ return ext3_ext_ioctl(inode, filp, cmd, arg); + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return 
put_user(inode->i_generation, (int __user *) arg); +Index: linux-2.6.12-rc6/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_fs.h 2005-06-14 16:31:10.185214261 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_fs.h 2005-06-14 16:31:52.859041864 +0200 +@@ -186,8 +186,9 @@ + #define EXT3_NOTAIL_FL 0x00008000 /* don't merge file tail */ + #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ + #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +-#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ ++#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + +@@ -237,6 +238,9 @@ + #endif + #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) + #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) ++#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) ++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) ++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) + + /* + * Structure of an inode on the disk +@@ -360,6 +364,8 @@ + #define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */ + #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -548,11 +554,13 @@ + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 ++#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ + + #define EXT3_FEATURE_COMPAT_SUPP 
EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ +- EXT3_FEATURE_INCOMPAT_META_BG) ++ EXT3_FEATURE_INCOMPAT_META_BG| \ ++ EXT3_FEATURE_INCOMPAT_EXTENTS) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) +@@ -759,6 +767,7 @@ + + + /* inode.c */ ++extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); + extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +@@ -828,6 +837,16 @@ + extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int, int); ++extern void ext3_ext_truncate(struct inode *, struct page *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); ++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); ++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); + + #endif /* __KERNEL__ */ + +Index: linux-2.6.12-rc6/include/linux/ext3_extents.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_extents.h 2005-06-14 16:31:25.780917195 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_extents.h 2005-06-14 16:31:25.932284381 +0200 +@@ -0,0 +1,264 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can 
redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++#ifndef _LINUX_EXT3_EXTENTS ++#define _LINUX_EXT3_EXTENTS ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if CHECK_BINSEARCH defined, then results of binary search ++ * will be checked by linear search ++ */ ++#define CHECK_BINSEARCH_ ++ ++/* ++ * if EXT_DEBUG is defined you can use 'extdebug' mount option ++ * to get lots of info what's going on ++ */ ++#define EXT_DEBUG_ ++#ifdef EXT_DEBUG ++#define ext_debug(tree,fmt,a...) \ ++do { \ ++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0); ++#else ++#define ext_debug(tree,fmt,a...) ++#endif ++ ++/* ++ * if EXT_STATS is defined then stats numbers are collected ++ * these number will be displayed at umount time ++ */ ++#define EXT_STATS_ ++ ++ ++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 mean there is no tree yet. 
all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 ee_block; /* first logical block extent covers */ ++ __u16 ee_len; /* number of blocks covered by extent */ ++ __u16 ee_start_hi; /* high 16 bits of physical block */ ++ __u32 ee_start; /* low 32 bigs of physical block */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 ei_block; /* index covers logical blocks from 'block' */ ++ __u32 ei_leaf; /* pointer to the physical block of the next * ++ * level. leaf or next index could bet here */ ++ __u16 ei_leaf_hi; /* high 16 bits of physical block */ ++ __u16 ei_unused; ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 eh_magic; /* probably will support different formats */ ++ __u16 eh_entries; /* number of valid entries */ ++ __u16 eh_max; /* capacity of store in entries */ ++ __u16 eh_depth; /* has tree real underlaying blocks? 
*/ ++ __u32 eh_generation; /* generation of the tree */ ++}; ++ ++#define EXT3_EXT_MAGIC 0xf30a ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++/* ++ * structure for external API ++ */ ++ ++/* ++ * storage for cached extent ++ */ ++struct ext3_ext_cache { ++ __u32 ec_start; ++ __u32 ec_block; ++ __u32 ec_len; ++ __u32 ec_type; ++}; ++ ++#define EXT3_EXT_CACHE_NO 0 ++#define EXT3_EXT_CACHE_GAP 1 ++#define EXT3_EXT_CACHE_EXTENT 2 ++ ++/* ++ * ext3_extents_tree is used to pass initial information ++ * to top-level extents API ++ */ ++struct ext3_extents_helpers; ++struct ext3_extents_tree { ++ struct inode *inode; /* inode which tree belongs to */ ++ void *root; /* ptr to data top of tree resides at */ ++ void *buffer; /* will be passed as arg to ^^ routines */ ++ int buffer_len; ++ void *private; ++ struct ext3_ext_cache *cex;/* last found extent */ ++ struct ext3_extents_helpers *ops; ++}; ++ ++struct ext3_extents_helpers { ++ int (*get_write_access)(handle_t *h, void *buffer); ++ int (*mark_buffer_dirty)(handle_t *h, void *buffer); ++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); ++ int (*remove_extent_credits)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*remove_extent)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*new_block)(handle_t *, struct ext3_extents_tree *, ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); ++}; ++ ++/* ++ * to be called by ext3_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext3_ext_walk_space(), see below ++ * callback must return valid 
extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, ++ struct ext3_ext_path *, ++ struct ext3_ext_cache *); ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++ ++#define EXT_MAX_BLOCK 0xffffffff ++ ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++ ++#define EXT_ROOT_HDR(tree) \ ++ ((struct ext3_extent_header *) (tree)->root) ++#define EXT_BLOCK_HDR(bh) \ ++ ((struct ext3_extent_header *) (bh)->b_data) ++#define EXT_DEPTH(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) ++#define EXT_GENERATION(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++ ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++#define EXT_CHECK_PATH(tree,path) \ ++{ \ ++ int depth = EXT_DEPTH(tree); \ ++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_idx < \ ++ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_ext < \ ++ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ ++ && depth != 0); \ ++ BUG_ON((path)[0].p_depth != depth); \ ++} ++ ++ ++/* ++ * this structure is used to 
gather extents from the tree via ioctl ++ */ ++struct ext3_extent_buf { ++ unsigned long start; ++ int buflen; ++ void *buffer; ++ void *cur; ++ int err; ++}; ++ ++/* ++ * this structure is used to collect stats info about the tree ++ */ ++struct ext3_extent_tree_stats { ++ int depth; ++ int extents_num; ++ int leaf_num; ++}; ++ ++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); ++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); ++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); ++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); ++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); ++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); ++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); ++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); ++ ++static inline void ++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) ++{ ++ if (tree->cex) ++ tree->cex->ec_type = EXT3_EXT_CACHE_NO; ++} ++ ++ ++#endif /* _LINUX_EXT3_EXTENTS */ +Index: linux-2.6.12-rc6/include/linux/ext3_fs_i.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_fs_i.h 2005-06-06 17:22:29.000000000 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_fs_i.h 2005-06-14 16:31:25.941073443 +0200 +@@ -133,6 +133,8 @@ + */ + struct semaphore truncate_sem; + struct inode vfs_inode; ++ ++ __u32 i_cached_extent[4]; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff --git a/lustre/kernel_patches/patches/ext3-extents-2.6.5.patch b/lustre/kernel_patches/patches/ext3-extents-2.6.5.patch index f69e16c..0ee8d28 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.6.5.patch +++ 
b/lustre/kernel_patches/patches/ext3-extents-2.6.5.patch @@ -2471,12 +2471,13 @@ Index: linux-2.6.5-sles9/fs/ext3/Makefile =================================================================== --- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2005-02-23 01:01:46.501172896 +0300 +++ linux-2.6.5-sles9/fs/ext3/Makefile 2005-02-23 01:02:37.405434272 +0300 -@@ -5,7 +5,7 @@ +@@ -5,7 +5,8 @@ obj-$(CONFIG_EXT3_FS) += ext3.o ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o -+ ioctl.o namei.o super.o symlink.o hash.o extents.o ++ ioctl.o namei.o super.o symlink.o hash.o \ ++ extents.o ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o @@ -2501,12 +2502,11 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c return &ei->vfs_inode; } -@@ -537,7 +540,7 @@ - Opt_commit, Opt_journal_update, Opt_journal_inum, - Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -- Opt_err, -+ Opt_err, Opt_extents, Opt_extdebug +@@ -537,6 +540,7 @@ + Opt_ignore, Opt_barrier, + Opt_err, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, ++ Opt_extents, Opt_extdebug, }; static match_table_t tokens = { @@ -2516,9 +2516,9 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, + {Opt_barrier, "barrier=%u"}, {Opt_err, NULL} }; - @@ -797,6 +802,12 @@ break; case Opt_ignore: @@ -2583,10 +2583,10 @@ Index: linux-2.6.5-sles9/include/linux/ext3_fs.h * Structure of an inode on the disk @@ -333,6 +337,8 @@ #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ - #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ -+#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ -+#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ + #define EXT3_MOUNT_IOPEN 
0x80000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef clear_opt diff --git a/lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch b/lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch index 3b873c2..56fe653 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch @@ -2466,12 +2466,13 @@ Index: linux-stage/fs/ext3/Makefile =================================================================== --- linux-stage.orig/fs/ext3/Makefile 2005-02-25 14:49:42.168561008 +0200 +++ linux-stage/fs/ext3/Makefile 2005-02-25 15:39:28.384587168 +0200 -@@ -5,7 +5,7 @@ +@@ -5,7 +5,8 @@ obj-$(CONFIG_EXT3_FS) += ext3.o - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\ + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o -+ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o \ ++ extents.o ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o @@ -2496,19 +2497,18 @@ Index: linux-stage/fs/ext3/super.c return &ei->vfs_inode; } -@@ -589,7 +594,7 @@ - Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, +@@ -589,6 +594,7 @@ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -- Opt_ignore, Opt_barrier, Opt_err, Opt_resize, -+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_extdebug, }; static match_table_t tokens = { @@ -639,6 +644,8 @@ - {Opt_iopen, "iopen"}, - {Opt_noiopen, "noiopen"}, - 
{Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, {Opt_barrier, "barrier=%u"}, @@ -2578,10 +2578,10 @@ Index: linux-stage/include/linux/ext3_fs.h * Structure of an inode on the disk @@ -359,6 +363,8 @@ #define EXT3_MOUNT_RESERVATION 0x20000 /* Preallocation */ - #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ -+#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ -+#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ + #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/ext3-extents-asyncdel-2.4.21-chaos.patch b/lustre/kernel_patches/patches/ext3-extents-asyncdel-2.4.21-chaos.patch index e5a5616..cd37db4 100644 --- a/lustre/kernel_patches/patches/ext3-extents-asyncdel-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-extents-asyncdel-2.4.21-chaos.patch @@ -2,13 +2,15 @@ Index: 57chaos/fs/ext3/inode.c =================================================================== --- 57chaos.orig/fs/ext3/inode.c 2004-06-21 14:15:31.000000000 -0700 +++ 57chaos/fs/ext3/inode.c 2004-06-21 14:19:27.000000000 -0700 -@@ -2270,6 +2270,10 @@ void ext3_truncate_thread(struct inode * +@@ -2270,6 +2270,12 @@ void ext3_truncate_thread(struct inode * memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); memset(oei->i_data, 0, sizeof(oei->i_data)); + if (EXT3_I(old_inode)->i_flags & EXT3_EXTENTS_FL) { + EXT3_I(new_inode)->i_flags |= EXT3_EXTENTS_FL; + ext3_extents_initialize_blockmap(handle, old_inode); ++ } else { 
++ EXT3_I(new_inode)->i_flags &= ~EXT3_EXTENTS_FL; + } nei->i_disksize = oei->i_disksize; diff --git a/lustre/kernel_patches/patches/ext3-extents-asyncdel-2.4.24.patch b/lustre/kernel_patches/patches/ext3-extents-asyncdel-2.4.24.patch index 43681a6..bc752e5 100644 --- a/lustre/kernel_patches/patches/ext3-extents-asyncdel-2.4.24.patch +++ b/lustre/kernel_patches/patches/ext3-extents-asyncdel-2.4.24.patch @@ -16,13 +16,15 @@ Index: linux-2.4.24/fs/ext3/inode.c if (S_ISREG(inode->i_mode)) { inode->i_op = &ext3_file_inode_operations; inode->i_fop = &ext3_file_operations; -@@ -2659,6 +2665,10 @@ +@@ -2659,6 +2665,12 @@ memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); memset(oei->i_data, 0, sizeof(oei->i_data)); + if (EXT3_I(old_inode)->i_flags & EXT3_EXTENTS_FL) { + EXT3_I(new_inode)->i_flags |= EXT3_EXTENTS_FL; + ext3_extents_initialize_blockmap(handle, old_inode); ++ } else { ++ EXT3_I(new_inode)->i_flags &= ~EXT3_EXTENTS_FL; + } nei->i_disksize = oei->i_disksize; diff --git a/lustre/kernel_patches/patches/ext3-external-journal-2.6.12.patch b/lustre/kernel_patches/patches/ext3-external-journal-2.6.12.patch new file mode 100644 index 0000000..bcfdae2 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-external-journal-2.6.12.patch @@ -0,0 +1,148 @@ +Signed-off-by: Johann Lombardi + +--- linux-2.6.12.orig/fs/ext3/super.c 2005-06-17 21:48:29.000000000 +0200 ++++ linux-2.6.12/fs/ext3/super.c 2005-11-07 13:37:30.000000000 +0100 +@@ -39,7 +39,8 @@ + #include "xattr.h" + #include "acl.h" + +-static int ext3_load_journal(struct super_block *, struct ext3_super_block *); ++static int ext3_load_journal(struct super_block *, struct ext3_super_block *, ++ unsigned long journal_devnum); + static int ext3_create_journal(struct super_block *, struct ext3_super_block *, + int); + static void ext3_commit_super (struct super_block * sb, +@@ -586,7 +587,7 @@ enum { + Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_user_xattr, Opt_nouser_xattr, 
Opt_acl, Opt_noacl, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, +- Opt_commit, Opt_journal_update, Opt_journal_inum, ++ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, +@@ -624,6 +625,7 @@ static match_table_t tokens = { + {Opt_commit, "commit=%u"}, + {Opt_journal_update, "journal=update"}, + {Opt_journal_inum, "journal=%u"}, ++ {Opt_journal_dev, "journal_dev=%u"}, + {Opt_abort, "abort"}, + {Opt_data_journal, "data=journal"}, + {Opt_data_ordered, "data=ordered"}, +@@ -663,8 +665,9 @@ static unsigned long get_sb_block(void * + return sb_block; + } + +-static int parse_options (char * options, struct super_block *sb, +- unsigned long * inum, unsigned long *n_blocks_count, int is_remount) ++static int parse_options (char *options, struct super_block *sb, ++ unsigned long *inum, unsigned long *journal_devnum, ++ unsigned long *n_blocks_count, int is_remount) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); + char * p; +@@ -805,6 +808,16 @@ static int parse_options (char * options + return 0; + *inum = option; + break; ++ case Opt_journal_dev: ++ if (is_remount) { ++ printk(KERN_ERR "EXT3-fs: cannot specify " ++ "journal on remount\n"); ++ return 0; ++ } ++ if (match_int(&args[0], &option)) ++ return 0; ++ *journal_devnum = option; ++ break; + case Opt_noload: + set_opt (sbi->s_mount_opt, NOLOAD); + break; +@@ -1250,6 +1263,7 @@ static int ext3_fill_super (struct super + unsigned long logic_sb_block; + unsigned long offset = 0; + unsigned long journal_inum = 0; ++ unsigned long journal_devnum = 0; + unsigned long def_mount_opts; + struct inode *root; + int blocksize; +@@ -1330,7 +1344,8 @@ static int ext3_fill_super (struct super + + set_opt(sbi->s_mount_opt, RESERVATION); + +- if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0)) ++ if (!parse_options 
((char *) data, sb, &journal_inum, &journal_devnum, ++ NULL, 0)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | +@@ -1541,7 +1556,7 @@ static int ext3_fill_super (struct super + */ + if (!test_opt(sb, NOLOAD) && + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { +- if (ext3_load_journal(sb, es)) ++ if (ext3_load_journal(sb, es, journal_devnum)) + goto failed_mount2; + } else if (journal_inum) { + if (ext3_create_journal(sb, es, journal_inum)) +@@ -1821,15 +1836,24 @@ out_bdev: + return NULL; + } + +-static int ext3_load_journal(struct super_block * sb, +- struct ext3_super_block * es) ++static int ext3_load_journal(struct super_block *sb, ++ struct ext3_super_block *es, ++ unsigned long journal_devnum) + { + journal_t *journal; + int journal_inum = le32_to_cpu(es->s_journal_inum); +- dev_t journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); ++ dev_t journal_dev; + int err = 0; + int really_read_only; + ++ if (journal_devnum && ++ journal_devnum != le32_to_cpu(es->s_journal_dev)) { ++ printk(KERN_INFO "EXT3-fs: external journal device major/minor " ++ "numbers have changed\n"); ++ journal_dev = new_decode_dev(journal_devnum); ++ } else ++ journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); ++ + really_read_only = bdev_read_only(sb->s_bdev); + + /* +@@ -1888,6 +1912,16 @@ static int ext3_load_journal(struct supe + + EXT3_SB(sb)->s_journal = journal; + ext3_clear_journal_err(sb, es); ++ ++ if (journal_devnum && ++ journal_devnum != le32_to_cpu(es->s_journal_dev)) { ++ es->s_journal_dev = cpu_to_le32(journal_devnum); ++ sb->s_dirt = 1; ++ ++ /* Make sure we flush the recovery flag to disk. 
*/ ++ ext3_commit_super(sb, es, 1); ++ } ++ + return 0; + } + +@@ -2093,13 +2127,13 @@ static int ext3_remount (struct super_bl + { + struct ext3_super_block * es; + struct ext3_sb_info *sbi = EXT3_SB(sb); +- unsigned long tmp; ++ unsigned long tmp1, tmp2; + unsigned long n_blocks_count = 0; + + /* + * Allow the "check" option to be passed as a remount option. + */ +- if (!parse_options(data, sb, &tmp, &n_blocks_count, 1)) ++ if (!parse_options(data, sb, &tmp1, &tmp2, &n_blocks_count, 1)) + return -EINVAL; + + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) diff --git a/lustre/kernel_patches/patches/ext3-htree-2.4.19-bgl.patch b/lustre/kernel_patches/patches/ext3-htree-2.4.19-bgl.patch deleted file mode 100644 index 6e4c834..0000000 --- a/lustre/kernel_patches/patches/ext3-htree-2.4.19-bgl.patch +++ /dev/null @@ -1,2584 +0,0 @@ - fs/ext3/Makefile | 2 - fs/ext3/dir.c | 302 +++++++++ - fs/ext3/file.c | 3 - fs/ext3/hash.c | 215 ++++++ - fs/ext3/namei.c | 1420 ++++++++++++++++++++++++++++++++++++++++----- - fs/ext3/super.c | 7 - include/linux/ext3_fs.h | 85 ++ - include/linux/ext3_fs_sb.h | 2 - include/linux/ext3_jbd.h | 2 - include/linux/rbtree.h | 2 - lib/rbtree.c | 42 + - 11 files changed, 1921 insertions(+), 161 deletions(-) - -Index: linux.mcp2/fs/ext3/dir.c -=================================================================== ---- linux.mcp2.orig/fs/ext3/dir.c 2004-05-17 15:03:55.000000000 -0700 -+++ linux.mcp2/fs/ext3/dir.c 2004-05-17 15:07:06.000000000 -0700 -@@ -21,12 +21,16 @@ - #include - #include - #include -+#include -+#include - - static unsigned char ext3_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK - }; - - static int ext3_readdir(struct file *, void *, filldir_t); -+static int ext3_dx_readdir(struct file * filp, -+ void * dirent, filldir_t filldir); - - struct file_operations ext3_dir_operations = { - read: generic_read_dir, -@@ -35,6 +39,17 @@ - fsync: ext3_sync_file, /* BKL held */ - }; - -+ -+static unsigned 
char get_dtype(struct super_block *sb, int filetype) -+{ -+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || -+ (filetype >= EXT3_FT_MAX)) -+ return DT_UNKNOWN; -+ -+ return (ext3_filetype_table[filetype]); -+} -+ -+ - int ext3_check_dir_entry (const char * function, struct inode * dir, - struct ext3_dir_entry_2 * de, - struct buffer_head * bh, -@@ -79,6 +94,16 @@ - - sb = inode->i_sb; - -+ if (is_dx(inode)) { -+ err = ext3_dx_readdir(filp, dirent, filldir); -+ if (err != ERR_BAD_DX_DIR) -+ return err; -+ /* -+ * We don't set the inode dirty flag since it's not -+ * critical that it get flushed back to the disk. -+ */ -+ EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; -+ } - stored = 0; - bh = NULL; - offset = filp->f_pos & (sb->s_blocksize - 1); -@@ -162,18 +187,12 @@ - * during the copy operation. - */ - unsigned long version = filp->f_version; -- unsigned char d_type = DT_UNKNOWN; - -- if (EXT3_HAS_INCOMPAT_FEATURE(sb, -- EXT3_FEATURE_INCOMPAT_FILETYPE) -- && de->file_type < EXT3_FT_MAX) -- d_type = -- ext3_filetype_table[de->file_type]; - error = filldir(dirent, de->name, - de->name_len, - filp->f_pos, - le32_to_cpu(de->inode), -- d_type); -+ get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) -@@ -188,3 +207,272 @@ - UPDATE_ATIME(inode); - return 0; - } -+ -+#ifdef CONFIG_EXT3_INDEX -+/* -+ * These functions convert from the major/minor hash to an f_pos -+ * value. -+ * -+ * Currently we only use major hash numer. This is unfortunate, but -+ * on 32-bit machines, the same VFS interface is used for lseek and -+ * llseek, so if we use the 64 bit offset, then the 32-bit versions of -+ * lseek/telldir/seekdir will blow out spectacularly, and from within -+ * the ext2 low-level routine, we don't know if we're being called by -+ * a 64-bit version of the system call or the 32-bit version of the -+ * system call. Worse yet, NFSv2 only allows for a 32-bit readdir -+ * cookie. Sigh. 
-+ */ -+#define hash2pos(major, minor) (major >> 1) -+#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) -+#define pos2min_hash(pos) (0) -+ -+/* -+ * This structure holds the nodes of the red-black tree used to store -+ * the directory entry in hash order. -+ */ -+struct fname { -+ __u32 hash; -+ __u32 minor_hash; -+ rb_node_t rb_hash; -+ struct fname *next; -+ __u32 inode; -+ __u8 name_len; -+ __u8 file_type; -+ char name[0]; -+}; -+ -+/* -+ * This functoin implements a non-recursive way of freeing all of the -+ * nodes in the red-black tree. -+ */ -+static void free_rb_tree_fname(rb_root_t *root) -+{ -+ rb_node_t *n = root->rb_node; -+ rb_node_t *parent; -+ struct fname *fname; -+ -+ while (n) { -+ /* Do the node's children first */ -+ if ((n)->rb_left) { -+ n = n->rb_left; -+ continue; -+ } -+ if (n->rb_right) { -+ n = n->rb_right; -+ continue; -+ } -+ /* -+ * The node has no children; free it, and then zero -+ * out parent's link to it. Finally go to the -+ * beginning of the loop and try to free the parent -+ * node. -+ */ -+ parent = n->rb_parent; -+ fname = rb_entry(n, struct fname, rb_hash); -+ kfree(fname); -+ if (!parent) -+ root->rb_node = 0; -+ else if (parent->rb_left == n) -+ parent->rb_left = 0; -+ else if (parent->rb_right == n) -+ parent->rb_right = 0; -+ n = parent; -+ } -+ root->rb_node = 0; -+} -+ -+ -+struct dir_private_info *create_dir_info(loff_t pos) -+{ -+ struct dir_private_info *p; -+ -+ p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); -+ if (!p) -+ return NULL; -+ p->root.rb_node = 0; -+ p->curr_node = 0; -+ p->extra_fname = 0; -+ p->last_pos = 0; -+ p->curr_hash = pos2maj_hash(pos); -+ p->curr_minor_hash = pos2min_hash(pos); -+ p->next_hash = 0; -+ return p; -+} -+ -+void ext3_htree_free_dir_info(struct dir_private_info *p) -+{ -+ free_rb_tree_fname(&p->root); -+ kfree(p); -+} -+ -+/* -+ * Given a directory entry, enter it into the fname rb tree. 
-+ */ -+int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, -+ __u32 minor_hash, -+ struct ext3_dir_entry_2 *dirent) -+{ -+ rb_node_t **p, *parent = NULL; -+ struct fname * fname, *new_fn; -+ struct dir_private_info *info; -+ int len; -+ -+ info = (struct dir_private_info *) dir_file->private_data; -+ p = &info->root.rb_node; -+ -+ /* Create and allocate the fname structure */ -+ len = sizeof(struct fname) + dirent->name_len + 1; -+ new_fn = kmalloc(len, GFP_KERNEL); -+ if (!new_fn) -+ return -ENOMEM; -+ memset(new_fn, 0, len); -+ new_fn->hash = hash; -+ new_fn->minor_hash = minor_hash; -+ new_fn->inode = le32_to_cpu(dirent->inode); -+ new_fn->name_len = dirent->name_len; -+ new_fn->file_type = dirent->file_type; -+ memcpy(new_fn->name, dirent->name, dirent->name_len); -+ new_fn->name[dirent->name_len] = 0; -+ -+ while (*p) { -+ parent = *p; -+ fname = rb_entry(parent, struct fname, rb_hash); -+ -+ /* -+ * If the hash and minor hash match up, then we put -+ * them on a linked list. This rarely happens... -+ */ -+ if ((new_fn->hash == fname->hash) && -+ (new_fn->minor_hash == fname->minor_hash)) { -+ new_fn->next = fname->next; -+ fname->next = new_fn; -+ return 0; -+ } -+ -+ if (new_fn->hash < fname->hash) -+ p = &(*p)->rb_left; -+ else if (new_fn->hash > fname->hash) -+ p = &(*p)->rb_right; -+ else if (new_fn->minor_hash < fname->minor_hash) -+ p = &(*p)->rb_left; -+ else /* if (new_fn->minor_hash > fname->minor_hash) */ -+ p = &(*p)->rb_right; -+ } -+ -+ rb_link_node(&new_fn->rb_hash, parent, p); -+ rb_insert_color(&new_fn->rb_hash, &info->root); -+ return 0; -+} -+ -+ -+ -+/* -+ * This is a helper function for ext3_dx_readdir. It calls filldir -+ * for all entres on the fname linked list. (Normally there is only -+ * one entry on the linked list, unless there are 62 bit hash collisions.) 
-+ */ -+static int call_filldir(struct file * filp, void * dirent, -+ filldir_t filldir, struct fname *fname) -+{ -+ struct dir_private_info *info = filp->private_data; -+ loff_t curr_pos; -+ struct inode *inode = filp->f_dentry->d_inode; -+ struct super_block * sb; -+ int error; -+ -+ sb = inode->i_sb; -+ -+ if (!fname) { -+ printk("call_filldir: called with null fname?!?\n"); -+ return 0; -+ } -+ curr_pos = hash2pos(fname->hash, fname->minor_hash); -+ while (fname) { -+ error = filldir(dirent, fname->name, -+ fname->name_len, curr_pos, -+ fname->inode, -+ get_dtype(sb, fname->file_type)); -+ if (error) { -+ filp->f_pos = curr_pos; -+ info->extra_fname = fname->next; -+ return error; -+ } -+ fname = fname->next; -+ } -+ return 0; -+} -+ -+static int ext3_dx_readdir(struct file * filp, -+ void * dirent, filldir_t filldir) -+{ -+ struct dir_private_info *info = filp->private_data; -+ struct inode *inode = filp->f_dentry->d_inode; -+ struct fname *fname; -+ int ret; -+ -+ if (!info) { -+ info = create_dir_info(filp->f_pos); -+ if (!info) -+ return -ENOMEM; -+ filp->private_data = info; -+ } -+ -+ /* Some one has messed with f_pos; reset the world */ -+ if (info->last_pos != filp->f_pos) { -+ free_rb_tree_fname(&info->root); -+ info->curr_node = 0; -+ info->extra_fname = 0; -+ info->curr_hash = pos2maj_hash(filp->f_pos); -+ info->curr_minor_hash = pos2min_hash(filp->f_pos); -+ } -+ -+ /* -+ * If there are any leftover names on the hash collision -+ * chain, return them first. -+ */ -+ if (info->extra_fname && -+ call_filldir(filp, dirent, filldir, info->extra_fname)) -+ goto finished; -+ -+ if (!info->curr_node) -+ info->curr_node = rb_get_first(&info->root); -+ -+ while (1) { -+ /* -+ * Fill the rbtree if we have no more entries, -+ * or the inode has changed since we last read in the -+ * cached entries. 
-+ */ -+ if ((!info->curr_node) || -+ (filp->f_version != inode->i_version)) { -+ info->curr_node = 0; -+ free_rb_tree_fname(&info->root); -+ filp->f_version = inode->i_version; -+ ret = ext3_htree_fill_tree(filp, info->curr_hash, -+ info->curr_minor_hash, -+ &info->next_hash); -+ if (ret < 0) -+ return ret; -+ if (ret == 0) -+ break; -+ info->curr_node = rb_get_first(&info->root); -+ } -+ -+ fname = rb_entry(info->curr_node, struct fname, rb_hash); -+ info->curr_hash = fname->hash; -+ info->curr_minor_hash = fname->minor_hash; -+ if (call_filldir(filp, dirent, filldir, fname)) -+ break; -+ -+ info->curr_node = rb_get_next(info->curr_node); -+ if (!info->curr_node) { -+ info->curr_hash = info->next_hash; -+ info->curr_minor_hash = 0; -+ } -+ } -+finished: -+ info->last_pos = filp->f_pos; -+ UPDATE_ATIME(inode); -+ return 0; -+} -+#endif -Index: linux.mcp2/fs/ext3/file.c -=================================================================== ---- linux.mcp2.orig/fs/ext3/file.c 2004-05-17 15:03:55.000000000 -0700 -+++ linux.mcp2/fs/ext3/file.c 2004-05-17 15:07:06.000000000 -0700 -@@ -35,6 +35,9 @@ - { - if (filp->f_mode & FMODE_WRITE) - ext3_discard_prealloc (inode); -+ if (is_dx(inode) && filp->private_data) -+ ext3_htree_free_dir_info(filp->private_data); -+ - return 0; - } - -Index: linux.mcp2/fs/ext3/hash.c -=================================================================== ---- linux.mcp2.orig/fs/ext3/hash.c 2002-04-11 07:25:15.000000000 -0700 -+++ linux.mcp2/fs/ext3/hash.c 2004-05-17 15:07:06.000000000 -0700 -@@ -0,0 +1,215 @@ -+/* -+ * linux/fs/ext3/hash.c -+ * -+ * Copyright (C) 2002 by Theodore Ts'o -+ * -+ * This file is released under the GPL v2. -+ * -+ * This file may be redistributed under the terms of the GNU Public -+ * License. 
-+ */ -+ -+#include -+#include -+#include -+#include -+ -+#define DELTA 0x9E3779B9 -+ -+static void TEA_transform(__u32 buf[4], __u32 const in[]) -+{ -+ __u32 sum = 0; -+ __u32 b0 = buf[0], b1 = buf[1]; -+ __u32 a = in[0], b = in[1], c = in[2], d = in[3]; -+ int n = 16; -+ -+ do { -+ sum += DELTA; -+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); -+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); -+ } while(--n); -+ -+ buf[0] += b0; -+ buf[1] += b1; -+} -+ -+/* F, G and H are basic MD4 functions: selection, majority, parity */ -+#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) -+#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z))) -+#define H(x, y, z) ((x) ^ (y) ^ (z)) -+ -+/* -+ * The generic round function. The application is so specific that -+ * we don't bother protecting all the arguments with parens, as is generally -+ * good macro practice, in favor of extra legibility. -+ * Rotation is separate from addition to prevent recomputation -+ */ -+#define ROUND(f, a, b, c, d, x, s) \ -+ (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s))) -+#define K1 0 -+#define K2 013240474631UL -+#define K3 015666365641UL -+ -+/* -+ * Basic cut-down MD4 transform. Returns only 32 bits of result. 
-+ */ -+static void halfMD4Transform (__u32 buf[4], __u32 const in[]) -+{ -+ __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; -+ -+ /* Round 1 */ -+ ROUND(F, a, b, c, d, in[0] + K1, 3); -+ ROUND(F, d, a, b, c, in[1] + K1, 7); -+ ROUND(F, c, d, a, b, in[2] + K1, 11); -+ ROUND(F, b, c, d, a, in[3] + K1, 19); -+ ROUND(F, a, b, c, d, in[4] + K1, 3); -+ ROUND(F, d, a, b, c, in[5] + K1, 7); -+ ROUND(F, c, d, a, b, in[6] + K1, 11); -+ ROUND(F, b, c, d, a, in[7] + K1, 19); -+ -+ /* Round 2 */ -+ ROUND(G, a, b, c, d, in[1] + K2, 3); -+ ROUND(G, d, a, b, c, in[3] + K2, 5); -+ ROUND(G, c, d, a, b, in[5] + K2, 9); -+ ROUND(G, b, c, d, a, in[7] + K2, 13); -+ ROUND(G, a, b, c, d, in[0] + K2, 3); -+ ROUND(G, d, a, b, c, in[2] + K2, 5); -+ ROUND(G, c, d, a, b, in[4] + K2, 9); -+ ROUND(G, b, c, d, a, in[6] + K2, 13); -+ -+ /* Round 3 */ -+ ROUND(H, a, b, c, d, in[3] + K3, 3); -+ ROUND(H, d, a, b, c, in[7] + K3, 9); -+ ROUND(H, c, d, a, b, in[2] + K3, 11); -+ ROUND(H, b, c, d, a, in[6] + K3, 15); -+ ROUND(H, a, b, c, d, in[1] + K3, 3); -+ ROUND(H, d, a, b, c, in[5] + K3, 9); -+ ROUND(H, c, d, a, b, in[0] + K3, 11); -+ ROUND(H, b, c, d, a, in[4] + K3, 15); -+ -+ buf[0] += a; -+ buf[1] += b; -+ buf[2] += c; -+ buf[3] += d; -+} -+ -+#undef ROUND -+#undef F -+#undef G -+#undef H -+#undef K1 -+#undef K2 -+#undef K3 -+ -+/* The old legacy hash */ -+static __u32 dx_hack_hash (const char *name, int len) -+{ -+ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; -+ while (len--) { -+ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); -+ -+ if (hash & 0x80000000) hash -= 0x7fffffff; -+ hash1 = hash0; -+ hash0 = hash; -+ } -+ return (hash0 << 1); -+} -+ -+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) -+{ -+ __u32 pad, val; -+ int i; -+ -+ pad = (__u32)len | ((__u32)len << 8); -+ pad |= pad << 16; -+ -+ val = pad; -+ if (len > num*4) -+ len = num * 4; -+ for (i=0; i < len; i++) { -+ if ((i % 4) == 0) -+ val = pad; -+ val = msg[i] + (val << 8); -+ if ((i % 4) == 3) { -+ 
*buf++ = val; -+ val = pad; -+ num--; -+ } -+ } -+ if (--num >= 0) -+ *buf++ = val; -+ while (--num >= 0) -+ *buf++ = pad; -+} -+ -+/* -+ * Returns the hash of a filename. If len is 0 and name is NULL, then -+ * this function can be used to test whether or not a hash version is -+ * supported. -+ * -+ * The seed is an 4 longword (32 bits) "secret" which can be used to -+ * uniquify a hash. If the seed is all zero's, then some default seed -+ * may be used. -+ * -+ * A particular hash version specifies whether or not the seed is -+ * represented, and whether or not the returned hash is 32 bits or 64 -+ * bits. 32 bit hashes will return 0 for the minor hash. -+ */ -+int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) -+{ -+ __u32 hash; -+ __u32 minor_hash = 0; -+ const char *p; -+ int i; -+ __u32 in[8], buf[4]; -+ -+ /* Initialize the default seed for the hash checksum functions */ -+ buf[0] = 0x67452301; -+ buf[1] = 0xefcdab89; -+ buf[2] = 0x98badcfe; -+ buf[3] = 0x10325476; -+ -+ /* Check to see if the seed is all zero's */ -+ if (hinfo->seed) { -+ for (i=0; i < 4; i++) { -+ if (hinfo->seed[i]) -+ break; -+ } -+ if (i < 4) -+ memcpy(buf, hinfo->seed, sizeof(buf)); -+ } -+ -+ switch (hinfo->hash_version) { -+ case DX_HASH_LEGACY: -+ hash = dx_hack_hash(name, len); -+ break; -+ case DX_HASH_HALF_MD4: -+ p = name; -+ while (len > 0) { -+ str2hashbuf(p, len, in, 8); -+ halfMD4Transform(buf, in); -+ len -= 32; -+ p += 32; -+ } -+ minor_hash = buf[2]; -+ hash = buf[1]; -+ break; -+ case DX_HASH_TEA: -+ p = name; -+ while (len > 0) { -+ str2hashbuf(p, len, in, 4); -+ TEA_transform(buf, in); -+ len -= 16; -+ p += 16; -+ } -+ hash = buf[0]; -+ minor_hash = buf[1]; -+ break; -+ default: -+ hinfo->hash = 0; -+ return -1; -+ } -+ hinfo->hash = hash & ~1; -+ hinfo->minor_hash = minor_hash; -+ return 0; -+} -Index: linux.mcp2/fs/ext3/Makefile -=================================================================== ---- linux.mcp2.orig/fs/ext3/Makefile 
2004-05-17 15:03:55.000000000 -0700 -+++ linux.mcp2/fs/ext3/Makefile 2004-05-17 15:07:06.000000000 -0700 -@@ -10,7 +10,7 @@ - O_TARGET := ext3.o - - obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -- ioctl.o namei.o super.o symlink.o -+ ioctl.o namei.o super.o symlink.o hash.o - obj-m := $(O_TARGET) - - include $(TOPDIR)/Rules.make -Index: linux.mcp2/fs/ext3/namei.c -=================================================================== ---- linux.mcp2.orig/fs/ext3/namei.c 2004-05-17 15:03:55.000000000 -0700 -+++ linux.mcp2/fs/ext3/namei.c 2004-05-17 15:07:06.000000000 -0700 -@@ -16,6 +16,12 @@ - * David S. Miller (davem@caip.rutgers.edu), 1995 - * Directory entry file type support and forward compatibility hooks - * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 -+ * Hash Tree Directory indexing (c) -+ * Daniel Phillips, 2001 -+ * Hash Tree Directory indexing porting -+ * Christopher Li, 2002 -+ * Hash Tree Directory indexing cleanup -+ * Theodore Ts'o, 2002 - */ - - #include -@@ -38,6 +44,642 @@ - #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) - #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) - -+static struct buffer_head *ext3_append(handle_t *handle, -+ struct inode *inode, -+ u32 *block, int *err) -+{ -+ struct buffer_head *bh; -+ -+ *block = inode->i_size >> inode->i_sb->s_blocksize_bits; -+ -+ if ((bh = ext3_bread(handle, inode, *block, 1, err))) { -+ inode->i_size += inode->i_sb->s_blocksize; -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ ext3_journal_get_write_access(handle,bh); -+ } -+ return bh; -+} -+ -+#ifndef assert -+#define assert(test) J_ASSERT(test) -+#endif -+ -+#ifndef swap -+#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) -+#endif -+ -+typedef struct { u32 v; } le_u32; -+typedef struct { u16 v; } le_u16; -+ -+#ifdef DX_DEBUG -+#define dxtrace(command) command -+#else -+#define dxtrace(command) -+#endif -+ -+struct fake_dirent -+{ -+ /*le*/u32 inode; -+ /*le*/u16 rec_len; 
-+ u8 name_len; -+ u8 file_type; -+}; -+ -+struct dx_countlimit -+{ -+ le_u16 limit; -+ le_u16 count; -+}; -+ -+struct dx_entry -+{ -+ le_u32 hash; -+ le_u32 block; -+}; -+ -+/* -+ * dx_root_info is laid out so that if it should somehow get overlaid by a -+ * dirent the two low bits of the hash version will be zero. Therefore, the -+ * hash version mod 4 should never be 0. Sincerely, the paranoia department. -+ */ -+ -+struct dx_root -+{ -+ struct fake_dirent dot; -+ char dot_name[4]; -+ struct fake_dirent dotdot; -+ char dotdot_name[4]; -+ struct dx_root_info -+ { -+ le_u32 reserved_zero; -+ u8 hash_version; -+ u8 info_length; /* 8 */ -+ u8 indirect_levels; -+ u8 unused_flags; -+ } -+ info; -+ struct dx_entry entries[0]; -+}; -+ -+struct dx_node -+{ -+ struct fake_dirent fake; -+ struct dx_entry entries[0]; -+}; -+ -+ -+struct dx_frame -+{ -+ struct buffer_head *bh; -+ struct dx_entry *entries; -+ struct dx_entry *at; -+}; -+ -+struct dx_map_entry -+{ -+ u32 hash; -+ u32 offs; -+}; -+ -+#ifdef CONFIG_EXT3_INDEX -+static inline unsigned dx_get_block (struct dx_entry *entry); -+static void dx_set_block (struct dx_entry *entry, unsigned value); -+static inline unsigned dx_get_hash (struct dx_entry *entry); -+static void dx_set_hash (struct dx_entry *entry, unsigned value); -+static unsigned dx_get_count (struct dx_entry *entries); -+static unsigned dx_get_limit (struct dx_entry *entries); -+static void dx_set_count (struct dx_entry *entries, unsigned value); -+static void dx_set_limit (struct dx_entry *entries, unsigned value); -+static unsigned dx_root_limit (struct inode *dir, unsigned infosize); -+static unsigned dx_node_limit (struct inode *dir); -+static struct dx_frame *dx_probe(struct dentry *dentry, -+ struct inode *dir, -+ struct dx_hash_info *hinfo, -+ struct dx_frame *frame, -+ int *err); -+static void dx_release (struct dx_frame *frames); -+static int dx_make_map (struct ext3_dir_entry_2 *de, int size, -+ struct dx_hash_info *hinfo, struct dx_map_entry 
map[]); -+static void dx_sort_map(struct dx_map_entry *map, unsigned count); -+static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, -+ struct dx_map_entry *offsets, int count); -+static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); -+static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); -+static int ext3_htree_next_block(struct inode *dir, __u32 hash, -+ struct dx_frame *frame, -+ struct dx_frame *frames, int *err, -+ __u32 *start_hash); -+static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, -+ struct ext3_dir_entry_2 **res_dir, int *err); -+static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode); -+ -+/* -+ * Future: use high four bits of block for coalesce-on-delete flags -+ * Mask them off for now. -+ */ -+ -+static inline unsigned dx_get_block (struct dx_entry *entry) -+{ -+ return le32_to_cpu(entry->block.v) & 0x00ffffff; -+} -+ -+static inline void dx_set_block (struct dx_entry *entry, unsigned value) -+{ -+ entry->block.v = cpu_to_le32(value); -+} -+ -+static inline unsigned dx_get_hash (struct dx_entry *entry) -+{ -+ return le32_to_cpu(entry->hash.v); -+} -+ -+static inline void dx_set_hash (struct dx_entry *entry, unsigned value) -+{ -+ entry->hash.v = cpu_to_le32(value); -+} -+ -+static inline unsigned dx_get_count (struct dx_entry *entries) -+{ -+ return le16_to_cpu(((struct dx_countlimit *) entries)->count.v); -+} -+ -+static inline unsigned dx_get_limit (struct dx_entry *entries) -+{ -+ return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v); -+} -+ -+static inline void dx_set_count (struct dx_entry *entries, unsigned value) -+{ -+ ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value); -+} -+ -+static inline void dx_set_limit (struct dx_entry *entries, unsigned value) -+{ -+ ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value); -+} -+ -+static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) 
-+{ -+ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - -+ EXT3_DIR_REC_LEN(2) - infosize; -+ return 0? 20: entry_space / sizeof(struct dx_entry); -+} -+ -+static inline unsigned dx_node_limit (struct inode *dir) -+{ -+ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); -+ return 0? 22: entry_space / sizeof(struct dx_entry); -+} -+ -+/* -+ * Debug -+ */ -+#ifdef DX_DEBUG -+struct stats -+{ -+ unsigned names; -+ unsigned space; -+ unsigned bcount; -+}; -+ -+static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de, -+ int size, int show_names) -+{ -+ unsigned names = 0, space = 0; -+ char *base = (char *) de; -+ struct dx_hash_info h = *hinfo; -+ -+ printk("names: "); -+ while ((char *) de < base + size) -+ { -+ if (de->inode) -+ { -+ if (show_names) -+ { -+ int len = de->name_len; -+ char *name = de->name; -+ while (len--) printk("%c", *name++); -+ ext3fs_dirhash(de->name, de->name_len, &h); -+ printk(":%x.%u ", h.hash, -+ ((char *) de - base)); -+ } -+ space += EXT3_DIR_REC_LEN(de->name_len); -+ names++; -+ } -+ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ printk("(%i)\n", names); -+ return (struct stats) { names, space, 1 }; -+} -+ -+struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, -+ struct dx_entry *entries, int levels) -+{ -+ unsigned blocksize = dir->i_sb->s_blocksize; -+ unsigned count = dx_get_count (entries), names = 0, space = 0, i; -+ unsigned bcount = 0; -+ struct buffer_head *bh; -+ int err; -+ printk("%i indexed blocks...\n", count); -+ for (i = 0; i < count; i++, entries++) -+ { -+ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; -+ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; -+ struct stats stats; -+ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); -+ if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue; -+ stats = levels? 
-+ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): -+ dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0); -+ names += stats.names; -+ space += stats.space; -+ bcount += stats.bcount; -+ brelse (bh); -+ } -+ if (bcount) -+ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", -+ names, space/bcount,(space/bcount)*100/blocksize); -+ return (struct stats) { names, space, bcount}; -+} -+#endif /* DX_DEBUG */ -+ -+/* -+ * Probe for a directory leaf block to search. -+ * -+ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format -+ * error in the directory index, and the caller should fall back to -+ * searching the directory normally. The callers of dx_probe **MUST** -+ * check for this error code, and make sure it never gets reflected -+ * back to userspace. -+ */ -+static struct dx_frame * -+dx_probe(struct dentry *dentry, struct inode *dir, -+ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) -+{ -+ unsigned count, indirect; -+ struct dx_entry *at, *entries, *p, *q, *m; -+ struct dx_root *root; -+ struct buffer_head *bh; -+ struct dx_frame *frame = frame_in; -+ u32 hash; -+ -+ frame->bh = NULL; -+ if (dentry) -+ dir = dentry->d_parent->d_inode; -+ if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) -+ goto fail; -+ root = (struct dx_root *) bh->b_data; -+ if (root->info.hash_version != DX_HASH_TEA && -+ root->info.hash_version != DX_HASH_HALF_MD4 && -+ root->info.hash_version != DX_HASH_LEGACY) { -+ ext3_warning(dir->i_sb, __FUNCTION__, -+ "Unrecognised inode hash code %d", -+ root->info.hash_version); -+ brelse(bh); -+ *err = ERR_BAD_DX_DIR; -+ goto fail; -+ } -+ hinfo->hash_version = root->info.hash_version; -+ hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed; -+ if (dentry) -+ ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); -+ hash = hinfo->hash; -+ -+ if (root->info.unused_flags & 1) { -+ ext3_warning(dir->i_sb, __FUNCTION__, -+ "Unimplemented inode hash flags: 
%#06x", -+ root->info.unused_flags); -+ brelse(bh); -+ *err = ERR_BAD_DX_DIR; -+ goto fail; -+ } -+ -+ if ((indirect = root->info.indirect_levels) > 1) { -+ ext3_warning(dir->i_sb, __FUNCTION__, -+ "Unimplemented inode hash depth: %#06x", -+ root->info.indirect_levels); -+ brelse(bh); -+ *err = ERR_BAD_DX_DIR; -+ goto fail; -+ } -+ -+ entries = (struct dx_entry *) (((char *)&root->info) + -+ root->info.info_length); -+ assert(dx_get_limit(entries) == dx_root_limit(dir, -+ root->info.info_length)); -+ dxtrace (printk("Look up %x", hash)); -+ while (1) -+ { -+ count = dx_get_count(entries); -+ assert (count && count <= dx_get_limit(entries)); -+ p = entries + 1; -+ q = entries + count - 1; -+ while (p <= q) -+ { -+ m = p + (q - p)/2; -+ dxtrace(printk(".")); -+ if (dx_get_hash(m) > hash) -+ q = m - 1; -+ else -+ p = m + 1; -+ } -+ -+ if (0) // linear search cross check -+ { -+ unsigned n = count - 1; -+ at = entries; -+ while (n--) -+ { -+ dxtrace(printk(",")); -+ if (dx_get_hash(++at) > hash) -+ { -+ at--; -+ break; -+ } -+ } -+ assert (at == p - 1); -+ } -+ -+ at = p - 1; -+ dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); -+ frame->bh = bh; -+ frame->entries = entries; -+ frame->at = at; -+ if (!indirect--) return frame; -+ if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) -+ goto fail2; -+ at = entries = ((struct dx_node *) bh->b_data)->entries; -+ assert (dx_get_limit(entries) == dx_node_limit (dir)); -+ frame++; -+ } -+fail2: -+ while (frame >= frame_in) { -+ brelse(frame->bh); -+ frame--; -+ } -+fail: -+ return NULL; -+} -+ -+static void dx_release (struct dx_frame *frames) -+{ -+ if (frames[0].bh == NULL) -+ return; -+ -+ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) -+ brelse(frames[1].bh); -+ brelse(frames[0].bh); -+} -+ -+/* -+ * This function increments the frame pointer to search the next leaf -+ * block, and reads in the necessary intervening nodes if the search -+ * should be necessary. Whether or not the search is necessary is -+ * controlled by the hash parameter. If the hash value is even, then -+ * the search is only continued if the next block starts with that -+ * hash value. This is used if we are searching for a specific file. -+ * -+ * If the hash value is HASH_NB_ALWAYS, then always go to the next block. -+ * -+ * This function returns 1 if the caller should continue to search, -+ * or 0 if it should not. If there is an error reading one of the -+ * index blocks, it will return -1. -+ * -+ * If start_hash is non-null, it will be filled in with the starting -+ * hash of the next page. -+ */ -+static int ext3_htree_next_block(struct inode *dir, __u32 hash, -+ struct dx_frame *frame, -+ struct dx_frame *frames, int *err, -+ __u32 *start_hash) -+{ -+ struct dx_frame *p; -+ struct buffer_head *bh; -+ int num_frames = 0; -+ __u32 bhash; -+ -+ *err = ENOENT; -+ p = frame; -+ /* -+ * Find the next leaf page by incrementing the frame pointer. -+ * If we run out of entries in the interior node, loop around and -+ * increment pointer in the parent node. 
When we break out of -+ * this loop, num_frames indicates the number of interior -+ * nodes need to be read. -+ */ -+ while (1) { -+ if (++(p->at) < p->entries + dx_get_count(p->entries)) -+ break; -+ if (p == frames) -+ return 0; -+ num_frames++; -+ p--; -+ } -+ -+ /* -+ * If the hash is 1, then continue only if the next page has a -+ * continuation hash of any value. This is used for readdir -+ * handling. Otherwise, check to see if the hash matches the -+ * desired contiuation hash. If it doesn't, return since -+ * there's no point to read in the successive index pages. -+ */ -+ bhash = dx_get_hash(p->at); -+ if (start_hash) -+ *start_hash = bhash; -+ if ((hash & 1) == 0) { -+ if ((bhash & ~1) != hash) -+ return 0; -+ } -+ /* -+ * If the hash is HASH_NB_ALWAYS, we always go to the next -+ * block so no check is necessary -+ */ -+ while (num_frames--) { -+ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), -+ 0, err))) -+ return -1; /* Failure */ -+ p++; -+ brelse (p->bh); -+ p->bh = bh; -+ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; -+ } -+ return 1; -+} -+ -+ -+/* -+ * p is at least 6 bytes before the end of page -+ */ -+static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p) -+{ -+ return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len)); -+} -+ -+/* -+ * This function fills a red-black tree with information from a -+ * directory. We start scanning the directory in hash order, starting -+ * at start_hash and start_minor_hash. -+ * -+ * This function returns the number of entries inserted into the tree, -+ * or a negative error code. 
-+ */ -+int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, -+ __u32 start_minor_hash, __u32 *next_hash) -+{ -+ struct dx_hash_info hinfo; -+ struct buffer_head *bh; -+ struct ext3_dir_entry_2 *de, *top; -+ static struct dx_frame frames[2], *frame; -+ struct inode *dir; -+ int block, err; -+ int count = 0; -+ int ret; -+ __u32 hashval; -+ -+ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, -+ start_minor_hash)); -+ dir = dir_file->f_dentry->d_inode; -+ hinfo.hash = start_hash; -+ hinfo.minor_hash = 0; -+ frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err); -+ if (!frame) -+ return err; -+ -+ /* Add '.' and '..' from the htree header */ -+ if (!start_hash && !start_minor_hash) { -+ de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; -+ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) -+ goto errout; -+ de = ext3_next_entry(de); -+ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) -+ goto errout; -+ count += 2; -+ } -+ -+ while (1) { -+ block = dx_get_block(frame->at); -+ dxtrace(printk("Reading block %d\n", block)); -+ if (!(bh = ext3_bread (NULL, dir, block, 0, &err))) -+ goto errout; -+ -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - -+ EXT3_DIR_REC_LEN(0)); -+ for (; de < top; de = ext3_next_entry(de)) { -+ ext3fs_dirhash(de->name, de->name_len, &hinfo); -+ if ((hinfo.hash < start_hash) || -+ ((hinfo.hash == start_hash) && -+ (hinfo.minor_hash < start_minor_hash))) -+ continue; -+ if ((err = ext3_htree_store_dirent(dir_file, -+ hinfo.hash, hinfo.minor_hash, de)) != 0) -+ goto errout; -+ count++; -+ } -+ brelse (bh); -+ hashval = ~1; -+ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, -+ frame, frames, &err, &hashval); -+ if (next_hash) -+ *next_hash = hashval; -+ if (ret == -1) -+ goto errout; -+ /* -+ * Stop if: (a) there are no more entries, or -+ * (b) we have inserted at least one entry and the -+ * 
next hash value is not a continuation -+ */ -+ if ((ret == 0) || -+ (count && ((hashval & 1) == 0))) -+ break; -+ } -+ dx_release(frames); -+ dxtrace(printk("Fill tree: returned %d entries\n", count)); -+ return count; -+errout: -+ dx_release(frames); -+ return (err); -+} -+ -+ -+/* -+ * Directory block splitting, compacting -+ */ -+ -+static int dx_make_map (struct ext3_dir_entry_2 *de, int size, -+ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) -+{ -+ int count = 0; -+ char *base = (char *) de; -+ struct dx_hash_info h = *hinfo; -+ -+ while ((char *) de < base + size) -+ { -+ if (de->name_len && de->inode) { -+ ext3fs_dirhash(de->name, de->name_len, &h); -+ map_tail--; -+ map_tail->hash = h.hash; -+ map_tail->offs = (u32) ((char *) de - base); -+ count++; -+ } -+ /* XXX: do we need to check rec_len == 0 case? -Chris */ -+ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ return count; -+} -+ -+static void dx_sort_map (struct dx_map_entry *map, unsigned count) -+{ -+ struct dx_map_entry *p, *q, *top = map + count - 1; -+ int more; -+ /* Combsort until bubble sort doesn't suck */ -+ while (count > 2) -+ { -+ count = count*10/13; -+ if (count - 9 < 2) /* 9, 10 -> 11 */ -+ count = 11; -+ for (p = top, q = p - count; q >= map; p--, q--) -+ if (p->hash < q->hash) -+ swap(*p, *q); -+ } -+ /* Garden variety bubble sort */ -+ do { -+ more = 0; -+ q = top; -+ while (q-- > map) -+ { -+ if (q[1].hash >= q[0].hash) -+ continue; -+ swap(*(q+1), *q); -+ more = 1; -+ } -+ } while(more); -+} -+ -+static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) -+{ -+ struct dx_entry *entries = frame->entries; -+ struct dx_entry *old = frame->at, *new = old + 1; -+ int count = dx_get_count(entries); -+ -+ assert(count < dx_get_limit(entries)); -+ assert(old < entries + count); -+ memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); -+ dx_set_hash(new, hash); -+ dx_set_block(new, block); -+ dx_set_count(entries, 
count + 1); -+} -+#endif -+ -+ -+static void ext3_update_dx_flag(struct inode *inode) -+{ -+ if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb, -+ EXT3_FEATURE_COMPAT_DIR_INDEX)) -+ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; -+} -+ - /* - * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. - * -@@ -94,6 +736,7 @@ - return 0; - } - -+ - /* - * ext3_find_entry() - * -@@ -105,6 +748,8 @@ - * The returned buffer_head has ->b_count elevated. The caller is expected - * to brelse() it when appropriate. - */ -+ -+ - static struct buffer_head * ext3_find_entry (struct dentry *dentry, - struct ext3_dir_entry_2 ** res_dir) - { -@@ -119,12 +764,32 @@ - int num = 0; - int nblocks, i, err; - struct inode *dir = dentry->d_parent->d_inode; -+ int namelen; -+ const u8 *name; -+ unsigned blocksize; - - *res_dir = NULL; - sb = dir->i_sb; -- -+ blocksize = sb->s_blocksize; -+ namelen = dentry->d_name.len; -+ name = dentry->d_name.name; -+ if (namelen > EXT3_NAME_LEN) -+ return NULL; -+#ifdef CONFIG_EXT3_INDEX -+ if (is_dx(dir)) { -+ bh = ext3_dx_find_entry(dentry, res_dir, &err); -+ /* -+ * On success, or if the error was file not found, -+ * return. Otherwise, fall back to doing a search the -+ * old fashioned way. 
-+ */ -+ if (bh || (err != ERR_BAD_DX_DIR)) -+ return bh; -+ dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); -+ } -+#endif - nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); -- start = dir->u.ext3_i.i_dir_start_lookup; -+ start = EXT3_I(dir)->i_dir_start_lookup; - if (start >= nblocks) - start = 0; - block = start; -@@ -165,7 +830,7 @@ - i = search_dirblock(bh, dir, dentry, - block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); - if (i == 1) { -- dir->u.ext3_i.i_dir_start_lookup = block; -+ EXT3_I(dir)->i_dir_start_lookup = block; - ret = bh; - goto cleanup_and_exit; - } else { -@@ -196,6 +861,66 @@ - return ret; - } - -+#ifdef CONFIG_EXT3_INDEX -+static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, -+ struct ext3_dir_entry_2 **res_dir, int *err) -+{ -+ struct super_block * sb; -+ struct dx_hash_info hinfo; -+ u32 hash; -+ struct dx_frame frames[2], *frame; -+ struct ext3_dir_entry_2 *de, *top; -+ struct buffer_head *bh; -+ unsigned long block; -+ int retval; -+ int namelen = dentry->d_name.len; -+ const u8 *name = dentry->d_name.name; -+ struct inode *dir = dentry->d_parent->d_inode; -+ -+ sb = dir->i_sb; -+ if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err))) -+ return NULL; -+ hash = hinfo.hash; -+ do { -+ block = dx_get_block(frame->at); -+ if (!(bh = ext3_bread (NULL,dir, block, 0, err))) -+ goto errout; -+ de = (struct ext3_dir_entry_2 *) bh->b_data; -+ top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - -+ EXT3_DIR_REC_LEN(0)); -+ for (; de < top; de = ext3_next_entry(de)) -+ if (ext3_match (namelen, name, de)) { -+ if (!ext3_check_dir_entry("ext3_find_entry", -+ dir, de, bh, -+ (block<b_data))) { -+ brelse (bh); -+ goto errout; -+ } -+ *res_dir = de; -+ dx_release (frames); -+ return bh; -+ } -+ brelse (bh); -+ /* Check to see if we should continue to search */ -+ retval = ext3_htree_next_block(dir, hash, frame, -+ frames, err, 0); -+ if (retval == -1) { -+ ext3_warning(sb, __FUNCTION__, -+ "error reading index 
page in directory #%lu", -+ dir->i_ino); -+ goto errout; -+ } -+ } while (retval == 1); -+ -+ *err = -ENOENT; -+errout: -+ dxtrace(printk("%s not found\n", name)); -+ dx_release (frames); -+ return NULL; -+} -+#endif -+ - static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) - { - struct inode * inode; -@@ -212,8 +937,9 @@ - brelse (bh); - inode = iget(dir->i_sb, ino); - -- if (!inode) -+ if (!inode) { - return ERR_PTR(-EACCES); -+ } - } - d_add(dentry, inode); - return NULL; -@@ -237,6 +963,301 @@ - de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; - } - -+#ifdef CONFIG_EXT3_INDEX -+static struct ext3_dir_entry_2 * -+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) -+{ -+ unsigned rec_len = 0; -+ -+ while (count--) { -+ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); -+ rec_len = EXT3_DIR_REC_LEN(de->name_len); -+ memcpy (to, de, rec_len); -+ ((struct ext3_dir_entry_2 *)to)->rec_len = cpu_to_le16(rec_len); -+ de->inode = 0; -+ map++; -+ to += rec_len; -+ } -+ return (struct ext3_dir_entry_2 *) (to - rec_len); -+} -+ -+static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) -+{ -+ struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; -+ unsigned rec_len = 0; -+ -+ prev = to = de; -+ while ((char*)de < base + size) { -+ next = (struct ext3_dir_entry_2 *) ((char *) de + -+ le16_to_cpu(de->rec_len)); -+ if (de->inode && de->name_len) { -+ rec_len = EXT3_DIR_REC_LEN(de->name_len); -+ if (de > to) -+ memmove(to, de, rec_len); -+ to->rec_len = cpu_to_le16(rec_len); -+ prev = to; -+ to = (struct ext3_dir_entry_2 *)((char *)to + rec_len); -+ } -+ de = next; -+ } -+ return prev; -+} -+ -+static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, -+ struct buffer_head **bh,struct dx_frame *frame, -+ struct dx_hash_info *hinfo, int *error) -+{ -+ unsigned blocksize = dir->i_sb->s_blocksize; -+ unsigned count, continued; -+ 
struct buffer_head *bh2; -+ u32 newblock; -+ u32 hash2; -+ struct dx_map_entry *map; -+ char *data1 = (*bh)->b_data, *data2; -+ unsigned split; -+ struct ext3_dir_entry_2 *de = NULL, *de2; -+ int err; -+ -+ bh2 = ext3_append (handle, dir, &newblock, error); -+ if (!(bh2)) { -+ brelse(*bh); -+ *bh = NULL; -+ goto errout; -+ } -+ -+ BUFFER_TRACE(*bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, *bh); -+ if (err) { -+ journal_error: -+ brelse(*bh); -+ brelse(bh2); -+ *bh = NULL; -+ ext3_std_error(dir->i_sb, err); -+ goto errout; -+ } -+ BUFFER_TRACE(frame->bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, frame->bh); -+ if (err) -+ goto journal_error; -+ -+ data2 = bh2->b_data; -+ -+ /* create map in the end of data2 block */ -+ map = (struct dx_map_entry *) (data2 + blocksize); -+ count = dx_make_map ((struct ext3_dir_entry_2 *) data1, -+ blocksize, hinfo, map); -+ map -= count; -+ split = count/2; // need to adjust to actual middle -+ dx_sort_map (map, count); -+ hash2 = map[split].hash; -+ continued = hash2 == map[split - 1].hash; -+ dxtrace(printk("Split block %i at %x, %i/%i\n", -+ dx_get_block(frame->at), hash2, split, count-split)); -+ -+ /* Fancy dance to stay within two buffers */ -+ de2 = dx_move_dirents(data1, data2, map + split, count - split); -+ de = dx_pack_dirents(data1,blocksize); -+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); -+ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); -+ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); -+ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); -+ -+ /* Which block gets the new entry? 
*/ -+ if (hinfo->hash >= hash2) -+ { -+ swap(*bh, bh2); -+ de = de2; -+ } -+ dx_insert_block (frame, hash2 + continued, newblock); -+ err = ext3_journal_dirty_metadata (handle, bh2); -+ if (err) -+ goto journal_error; -+ err = ext3_journal_dirty_metadata (handle, frame->bh); -+ if (err) -+ goto journal_error; -+ brelse (bh2); -+ dxtrace(dx_show_index ("frame", frame->entries)); -+errout: -+ return de; -+} -+#endif -+ -+ -+/* -+ * Add a new entry into a directory (leaf) block. If de is non-NULL, -+ * it points to a directory entry which is guaranteed to be large -+ * enough for new directory entry. If de is NULL, then -+ * add_dirent_to_buf will attempt search the directory block for -+ * space. It will return -ENOSPC if no space is available, and -EIO -+ * and -EEXIST if directory entry already exists. -+ * -+ * NOTE! bh is NOT released in the case where ENOSPC is returned. In -+ * all other cases bh is released. -+ */ -+static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, -+ struct inode *inode, struct ext3_dir_entry_2 *de, -+ struct buffer_head * bh) -+{ -+ struct inode *dir = dentry->d_parent->d_inode; -+ const char *name = dentry->d_name.name; -+ int namelen = dentry->d_name.len; -+ unsigned long offset = 0; -+ unsigned short reclen; -+ int nlen, rlen, err; -+ char *top; -+ -+ reclen = EXT3_DIR_REC_LEN(namelen); -+ if (!de) { -+ de = (struct ext3_dir_entry_2 *)bh->b_data; -+ top = bh->b_data + dir->i_sb->s_blocksize - reclen; -+ while ((char *) de <= top) { -+ if (!ext3_check_dir_entry("ext3_add_entry", dir, de, -+ bh, offset)) { -+ brelse (bh); -+ return -EIO; -+ } -+ if (ext3_match (namelen, name, de)) { -+ brelse (bh); -+ return -EEXIST; -+ } -+ nlen = EXT3_DIR_REC_LEN(de->name_len); -+ rlen = le16_to_cpu(de->rec_len); -+ if ((de->inode? 
rlen - nlen: rlen) >= reclen) -+ break; -+ de = (struct ext3_dir_entry_2 *)((char *)de + rlen); -+ offset += rlen; -+ } -+ if ((char *) de > top) -+ return -ENOSPC; -+ } -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) { -+ ext3_std_error(dir->i_sb, err); -+ brelse(bh); -+ return err; -+ } -+ -+ /* By now the buffer is marked for journaling */ -+ nlen = EXT3_DIR_REC_LEN(de->name_len); -+ rlen = le16_to_cpu(de->rec_len); -+ if (de->inode) { -+ struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); -+ de1->rec_len = cpu_to_le16(rlen - nlen); -+ de->rec_len = cpu_to_le16(nlen); -+ de = de1; -+ } -+ de->file_type = EXT3_FT_UNKNOWN; -+ if (inode) { -+ de->inode = cpu_to_le32(inode->i_ino); -+ ext3_set_de_type(dir->i_sb, de, inode->i_mode); -+ } else -+ de->inode = 0; -+ de->name_len = namelen; -+ memcpy (de->name, name, namelen); -+ /* -+ * XXX shouldn't update any times until successful -+ * completion of syscall, but too many callers depend -+ * on this. -+ * -+ * XXX similarly, too many callers depend on -+ * ext3_new_inode() setting the times, but error -+ * recovery deletes the inode, so the worst that can -+ * happen is that the times are slightly out of date -+ * and/or different from the directory change time. -+ */ -+ dir->i_mtime = dir->i_ctime = CURRENT_TIME; -+ ext3_update_dx_flag(dir); -+ dir->i_version = ++event; -+ ext3_mark_inode_dirty(handle, dir); -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) -+ ext3_std_error(dir->i_sb, err); -+ brelse(bh); -+ return 0; -+} -+ -+#ifdef CONFIG_EXT3_INDEX -+/* -+ * This converts a one block unindexed directory to a 3 block indexed -+ * directory, and adds the dentry to the indexed directory. 
-+ */ -+static int make_indexed_dir(handle_t *handle, struct dentry *dentry, -+ struct inode *inode, struct buffer_head *bh) -+{ -+ struct inode *dir = dentry->d_parent->d_inode; -+ const char *name = dentry->d_name.name; -+ int namelen = dentry->d_name.len; -+ struct buffer_head *bh2; -+ struct dx_root *root; -+ struct dx_frame frames[2], *frame; -+ struct dx_entry *entries; -+ struct ext3_dir_entry_2 *de, *de2; -+ char *data1, *top; -+ unsigned len; -+ int retval; -+ unsigned blocksize; -+ struct dx_hash_info hinfo; -+ u32 block; -+ -+ blocksize = dir->i_sb->s_blocksize; -+ dxtrace(printk("Creating index\n")); -+ retval = ext3_journal_get_write_access(handle, bh); -+ if (retval) { -+ ext3_std_error(dir->i_sb, retval); -+ brelse(bh); -+ return retval; -+ } -+ root = (struct dx_root *) bh->b_data; -+ -+ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; -+ bh2 = ext3_append (handle, dir, &block, &retval); -+ if (!(bh2)) { -+ brelse(bh); -+ return retval; -+ } -+ data1 = bh2->b_data; -+ -+ /* The 0th block becomes the root, move the dirents out */ -+ de = (struct ext3_dir_entry_2 *)&root->dotdot; -+ de = (struct ext3_dir_entry_2 *)((char *)de + le16_to_cpu(de->rec_len)); -+ len = ((char *) root) + blocksize - (char *) de; -+ memcpy (data1, de, len); -+ de = (struct ext3_dir_entry_2 *) data1; -+ top = data1 + len; -+ while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top) -+ de = de2; -+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); -+ /* Initialize the root; the dot dirents already exist */ -+ de = (struct ext3_dir_entry_2 *) (&root->dotdot); -+ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2)); -+ memset (&root->info, 0, sizeof(root->info)); -+ root->info.info_length = sizeof(root->info); -+ root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version; -+ entries = root->entries; -+ dx_set_block (entries, 1); -+ dx_set_count (entries, 1); -+ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); -+ -+ /* Initialize as for dx_probe 
*/ -+ hinfo.hash_version = root->info.hash_version; -+ hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed; -+ ext3fs_dirhash(name, namelen, &hinfo); -+ frame = frames; -+ frame->entries = entries; -+ frame->at = entries; -+ frame->bh = bh; -+ bh = bh2; -+ de = do_split(handle,dir, &bh, frame, &hinfo, &retval); -+ dx_release (frames); -+ if (!(de)) -+ return retval; -+ -+ return add_dirent_to_buf(handle, dentry, inode, de, bh); -+} -+#endif -+ - /* - * ext3_add_entry() - * -@@ -247,127 +1268,198 @@ - * may not sleep between calling this and putting something into - * the entry, as someone else might have used it while you slept. - */ -- --/* -- * AKPM: the journalling code here looks wrong on the error paths -- */ - static int ext3_add_entry (handle_t *handle, struct dentry *dentry, - struct inode *inode) - { - struct inode *dir = dentry->d_parent->d_inode; -- const char *name = dentry->d_name.name; -- int namelen = dentry->d_name.len; - unsigned long offset; -- unsigned short rec_len; - struct buffer_head * bh; -- struct ext3_dir_entry_2 * de, * de1; -+ struct ext3_dir_entry_2 *de; - struct super_block * sb; - int retval; -+#ifdef CONFIG_EXT3_INDEX -+ int dx_fallback=0; -+#endif -+ unsigned blocksize; -+ unsigned nlen, rlen; -+ u32 block, blocks; - - sb = dir->i_sb; -- -- if (!namelen) -+ blocksize = sb->s_blocksize; -+ if (!dentry->d_name.len) - return -EINVAL; -- bh = ext3_bread (handle, dir, 0, 0, &retval); -+#ifdef CONFIG_EXT3_INDEX -+ if (is_dx(dir)) { -+ retval = ext3_dx_add_entry(handle, dentry, inode); -+ if (!retval || (retval != ERR_BAD_DX_DIR)) -+ return retval; -+ EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL; -+ dx_fallback++; -+ ext3_mark_inode_dirty(handle, dir); -+ } -+#endif -+ blocks = dir->i_size >> sb->s_blocksize_bits; -+ for (block = 0, offset = 0; block < blocks; block++) { -+ bh = ext3_bread(handle, dir, block, 0, &retval); -+ if(!bh) -+ return retval; -+ retval = add_dirent_to_buf(handle, dentry, inode, 0, bh); -+ if (retval != -ENOSPC) -+ return 
retval; -+ -+#ifdef CONFIG_EXT3_INDEX -+ if (blocks == 1 && !dx_fallback && -+ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) -+ return make_indexed_dir(handle, dentry, inode, bh); -+#endif -+ brelse(bh); -+ } -+ bh = ext3_append(handle, dir, &block, &retval); - if (!bh) - return retval; -- rec_len = EXT3_DIR_REC_LEN(namelen); -- offset = 0; - de = (struct ext3_dir_entry_2 *) bh->b_data; -- while (1) { -- if ((char *)de >= sb->s_blocksize + bh->b_data) { -- brelse (bh); -- bh = NULL; -- bh = ext3_bread (handle, dir, -- offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval); -- if (!bh) -- return retval; -- if (dir->i_size <= offset) { -- if (dir->i_size == 0) { -- brelse(bh); -- return -ENOENT; -- } -+ de->inode = 0; -+ de->rec_len = cpu_to_le16(rlen = blocksize); -+ nlen = 0; -+ return add_dirent_to_buf(handle, dentry, inode, de, bh); -+} - -- ext3_debug ("creating next block\n"); -+#ifdef CONFIG_EXT3_INDEX -+/* -+ * Returns 0 for success, or a negative error value -+ */ -+static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode) -+{ -+ struct dx_frame frames[2], *frame; -+ struct dx_entry *entries, *at; -+ struct dx_hash_info hinfo; -+ struct buffer_head * bh; -+ struct inode *dir = dentry->d_parent->d_inode; -+ struct super_block * sb = dir->i_sb; -+ struct ext3_dir_entry_2 *de; -+ int err; - -- BUFFER_TRACE(bh, "get_write_access"); -- ext3_journal_get_write_access(handle, bh); -- de = (struct ext3_dir_entry_2 *) bh->b_data; -- de->inode = 0; -- de->rec_len = le16_to_cpu(sb->s_blocksize); -- dir->u.ext3_i.i_disksize = -- dir->i_size = offset + sb->s_blocksize; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -- ext3_mark_inode_dirty(handle, dir); -- } else { -+ frame = dx_probe(dentry, 0, &hinfo, frames, &err); -+ if (!frame) -+ return err; -+ entries = frame->entries; -+ at = frame->at; - -- ext3_debug ("skipping to next block\n"); -+ if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) -+ goto cleanup; - -- 
de = (struct ext3_dir_entry_2 *) bh->b_data; -- } -- } -- if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh, -- offset)) { -- brelse (bh); -- return -ENOENT; -- } -- if (ext3_match (namelen, name, de)) { -- brelse (bh); -- return -EEXIST; -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ goto journal_error; -+ -+ err = add_dirent_to_buf(handle, dentry, inode, 0, bh); -+ if (err != -ENOSPC) { -+ bh = 0; -+ goto cleanup; -+ } -+ -+ /* Block full, should compress but for now just split */ -+ dxtrace(printk("using %u of %u node entries\n", -+ dx_get_count(entries), dx_get_limit(entries))); -+ /* Need to split index? */ -+ if (dx_get_count(entries) == dx_get_limit(entries)) { -+ u32 newblock; -+ unsigned icount = dx_get_count(entries); -+ int levels = frame - frames; -+ struct dx_entry *entries2; -+ struct dx_node *node2; -+ struct buffer_head *bh2; -+ -+ if (levels && (dx_get_count(frames->entries) == -+ dx_get_limit(frames->entries))) { -+ ext3_warning(sb, __FUNCTION__, -+ "Directory index full!\n"); -+ err = -ENOSPC; -+ goto cleanup; - } -- if ((le32_to_cpu(de->inode) == 0 && -- le16_to_cpu(de->rec_len) >= rec_len) || -- (le16_to_cpu(de->rec_len) >= -- EXT3_DIR_REC_LEN(de->name_len) + rec_len)) { -- BUFFER_TRACE(bh, "get_write_access"); -- ext3_journal_get_write_access(handle, bh); -- /* By now the buffer is marked for journaling */ -- offset += le16_to_cpu(de->rec_len); -- if (le32_to_cpu(de->inode)) { -- de1 = (struct ext3_dir_entry_2 *) ((char *) de + -- EXT3_DIR_REC_LEN(de->name_len)); -- de1->rec_len = -- cpu_to_le16(le16_to_cpu(de->rec_len) - -- EXT3_DIR_REC_LEN(de->name_len)); -- de->rec_len = cpu_to_le16( -- EXT3_DIR_REC_LEN(de->name_len)); -- de = de1; -+ bh2 = ext3_append (handle, dir, &newblock, &err); -+ if (!(bh2)) -+ goto cleanup; -+ node2 = (struct dx_node *)(bh2->b_data); -+ entries2 = node2->entries; -+ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); -+ node2->fake.inode = 0; -+ 
BUFFER_TRACE(frame->bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, frame->bh); -+ if (err) -+ goto journal_error; -+ if (levels) { -+ unsigned icount1 = icount/2, icount2 = icount - icount1; -+ unsigned hash2 = dx_get_hash(entries + icount1); -+ dxtrace(printk("Split index %i/%i\n", icount1, icount2)); -+ -+ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ -+ err = ext3_journal_get_write_access(handle, -+ frames[0].bh); -+ if (err) -+ goto journal_error; -+ -+ memcpy ((char *) entries2, (char *) (entries + icount1), -+ icount2 * sizeof(struct dx_entry)); -+ dx_set_count (entries, icount1); -+ dx_set_count (entries2, icount2); -+ dx_set_limit (entries2, dx_node_limit(dir)); -+ -+ /* Which index block gets the new entry? */ -+ if (at - entries >= icount1) { -+ frame->at = at = at - entries - icount1 + entries2; -+ frame->entries = entries = entries2; -+ swap(frame->bh, bh2); - } -- de->file_type = EXT3_FT_UNKNOWN; -- if (inode) { -- de->inode = cpu_to_le32(inode->i_ino); -- ext3_set_de_type(dir->i_sb, de, inode->i_mode); -- } else -- de->inode = 0; -- de->name_len = namelen; -- memcpy (de->name, name, namelen); -- /* -- * XXX shouldn't update any times until successful -- * completion of syscall, but too many callers depend -- * on this. -- * -- * XXX similarly, too many callers depend on -- * ext3_new_inode() setting the times, but error -- * recovery deletes the inode, so the worst that can -- * happen is that the times are slightly out of date -- * and/or different from the directory change time. 
-- */ -- dir->i_mtime = dir->i_ctime = CURRENT_TIME; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -- ext3_mark_inode_dirty(handle, dir); -- dir->i_version = ++event; -- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -- ext3_journal_dirty_metadata(handle, bh); -- brelse(bh); -- return 0; -+ dx_insert_block (frames + 0, hash2, newblock); -+ dxtrace(dx_show_index ("node", frames[1].entries)); -+ dxtrace(dx_show_index ("node", -+ ((struct dx_node *) bh2->b_data)->entries)); -+ err = ext3_journal_dirty_metadata(handle, bh2); -+ if (err) -+ goto journal_error; -+ brelse (bh2); -+ } else { -+ dxtrace(printk("Creating second level index...\n")); -+ memcpy((char *) entries2, (char *) entries, -+ icount * sizeof(struct dx_entry)); -+ dx_set_limit(entries2, dx_node_limit(dir)); -+ -+ /* Set up root */ -+ dx_set_count(entries, 1); -+ dx_set_block(entries + 0, newblock); -+ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; -+ -+ /* Add new access path frame */ -+ frame = frames + 1; -+ frame->at = at = at - entries + entries2; -+ frame->entries = entries = entries2; -+ frame->bh = bh2; -+ err = ext3_journal_get_write_access(handle, -+ frame->bh); -+ if (err) -+ goto journal_error; - } -- offset += le16_to_cpu(de->rec_len); -- de = (struct ext3_dir_entry_2 *) -- ((char *) de + le16_to_cpu(de->rec_len)); -+ ext3_journal_dirty_metadata(handle, frames[0].bh); - } -- brelse (bh); -- return -ENOSPC; -+ de = do_split(handle, dir, &bh, frame, &hinfo, &err); -+ if (!de) -+ goto cleanup; -+ err = add_dirent_to_buf(handle, dentry, inode, de, bh); -+ bh = 0; -+ goto cleanup; -+ -+journal_error: -+ ext3_std_error(dir->i_sb, err); -+cleanup: -+ if (bh) -+ brelse(bh); -+ dx_release(frames); -+ return err; - } -+#endif - - /* - * ext3_delete_entry deletes a directory entry by merging it with the -@@ -451,9 +1543,11 @@ - struct inode * inode; - int err; - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -- if (IS_ERR(handle)) -+ handle = 
ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -478,9 +1572,11 @@ - struct inode *inode; - int err; - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -507,9 +1603,11 @@ - if (dir->i_nlink >= EXT3_LINK_MAX) - return -EMLINK; - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -521,7 +1619,7 @@ - - inode->i_op = &ext3_dir_inode_operations; - inode->i_fop = &ext3_dir_operations; -- inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize; -+ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; - inode->i_blocks = 0; - dir_block = ext3_bread (handle, inode, 0, 1, &err); - if (!dir_block) { -@@ -554,21 +1652,19 @@ - inode->i_mode |= S_ISGID; - ext3_mark_inode_dirty(handle, inode); - err = ext3_add_entry (handle, dentry, inode); -- if (err) -- goto out_no_entry; -+ if (err) { -+ inode->i_nlink = 0; -+ ext3_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+ } - dir->i_nlink++; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - d_instantiate(dentry, inode); - out_stop: - ext3_journal_stop(handle, dir); - return err; -- --out_no_entry: -- inode->i_nlink = 0; -- ext3_mark_inode_dirty(handle, inode); -- iput (inode); -- goto out_stop; - } - - /* -@@ -655,7 +1751,7 @@ - int err = 0, rc; - - lock_super(sb); -- if 
(!list_empty(&inode->u.ext3_i.i_orphan)) -+ if (!list_empty(&EXT3_I(inode)->i_orphan)) - goto out_unlock; - - /* Orphan handling is only valid for files with data blocks -@@ -696,7 +1792,7 @@ - * This is safe: on error we're going to ignore the orphan list - * anyway on the next recovery. */ - if (!err) -- list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); -+ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); - - jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); - jbd_debug(4, "orphan inode %ld will point to %d\n", -@@ -714,25 +1810,26 @@ - int ext3_orphan_del(handle_t *handle, struct inode *inode) - { - struct list_head *prev; -+ struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_sb_info *sbi; - ino_t ino_next; - struct ext3_iloc iloc; - int err = 0; - - lock_super(inode->i_sb); -- if (list_empty(&inode->u.ext3_i.i_orphan)) { -+ if (list_empty(&ei->i_orphan)) { - unlock_super(inode->i_sb); - return 0; - } - - ino_next = NEXT_ORPHAN(inode); -- prev = inode->u.ext3_i.i_orphan.prev; -+ prev = ei->i_orphan.prev; - sbi = EXT3_SB(inode->i_sb); - - jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino); - -- list_del(&inode->u.ext3_i.i_orphan); -- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); -+ list_del(&ei->i_orphan); -+ INIT_LIST_HEAD(&ei->i_orphan); - - /* If we're on an error path, we may not have a valid - * transaction handle with which to update the orphan list on -@@ -793,8 +1890,9 @@ - handle_t *handle; - - handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); -- if (IS_ERR(handle)) -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - retval = -ENOENT; - bh = ext3_find_entry (dentry, &de); -@@ -832,7 +1930,7 @@ - ext3_mark_inode_dirty(handle, inode); - dir->i_nlink--; - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - - end_rmdir: -@@ -850,8 +1948,9 @@ - handle_t *handle; - - 
handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); -- if (IS_ERR(handle)) -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -878,7 +1977,7 @@ - if (retval) - goto end_unlink; - dir->i_ctime = dir->i_mtime = CURRENT_TIME; -- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - inode->i_nlink--; - if (!inode->i_nlink) -@@ -904,9 +2003,11 @@ - if (l > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -916,7 +2017,7 @@ - if (IS_ERR(inode)) - goto out_stop; - -- if (l > sizeof (inode->u.ext3_i.i_data)) { -+ if (l > sizeof (EXT3_I(inode)->i_data)) { - inode->i_op = &page_symlink_inode_operations; - inode->i_mapping->a_ops = &ext3_aops; - /* -@@ -925,8 +2026,12 @@ - * i_size in generic_commit_write(). 
- */ - err = block_symlink(inode, symname, l); -- if (err) -- goto out_no_entry; -+ if (err) { -+ ext3_dec_count(handle, inode); -+ ext3_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+ } - } else { - inode->i_op = &ext3_fast_symlink_inode_operations; - memcpy((char*)&inode->u.ext3_i.i_data,symname,l); -@@ -938,12 +2043,6 @@ - out_stop: - ext3_journal_stop(handle, dir); - return err; -- --out_no_entry: -- ext3_dec_count(handle, inode); -- ext3_mark_inode_dirty(handle, inode); -- iput (inode); -- goto out_stop; - } - - static int ext3_link (struct dentry * old_dentry, -@@ -956,12 +2055,15 @@ - if (S_ISDIR(inode->i_mode)) - return -EPERM; - -- if (inode->i_nlink >= EXT3_LINK_MAX) -+ if (inode->i_nlink >= EXT3_LINK_MAX) { - return -EMLINK; -+ } - -- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(dir)) - handle->h_sync = 1; -@@ -995,9 +2097,11 @@ - - old_bh = new_bh = dir_bh = NULL; - -- handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2); -- if (IS_ERR(handle)) -+ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + -+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); -+ if (IS_ERR(handle)) { - return PTR_ERR(handle); -+ } - - if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) - handle->h_sync = 1; -@@ -1070,14 +2174,33 @@ - /* - * ok, that's it - */ -- ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ retval = ext3_delete_entry(handle, old_dir, old_de, old_bh); -+ if (retval == -ENOENT) { -+ /* -+ * old_de could have moved out from under us. 
-+ */ -+ struct buffer_head *old_bh2; -+ struct ext3_dir_entry_2 *old_de2; -+ -+ old_bh2 = ext3_find_entry(old_dentry, &old_de2); -+ if (old_bh2) { -+ retval = ext3_delete_entry(handle, old_dir, -+ old_de2, old_bh2); -+ brelse(old_bh2); -+ } -+ } -+ if (retval) { -+ ext3_warning(old_dir->i_sb, "ext3_rename", -+ "Deleting old file (%lu), %d, error=%d", -+ old_dir->i_ino, old_dir->i_nlink, retval); -+ } - - if (new_inode) { - new_inode->i_nlink--; - new_inode->i_ctime = CURRENT_TIME; - } - old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; -- old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(old_dir); - if (dir_bh) { - BUFFER_TRACE(dir_bh, "get_write_access"); - ext3_journal_get_write_access(handle, dir_bh); -@@ -1089,7 +2212,7 @@ - new_inode->i_nlink--; - } else { - new_dir->i_nlink++; -- new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -+ ext3_update_dx_flag(new_dir); - ext3_mark_inode_dirty(handle, new_dir); - } - } -Index: linux.mcp2/fs/ext3/super.c -=================================================================== ---- linux.mcp2.orig/fs/ext3/super.c 2004-05-17 15:03:55.000000000 -0700 -+++ linux.mcp2/fs/ext3/super.c 2004-05-17 15:08:50.000000000 -0700 -@@ -702,6 +702,7 @@ - es->s_mtime = cpu_to_le32(CURRENT_TIME); - ext3_update_dynamic_rev(sb); - EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -+ - ext3_commit_super (sb, es, 1); - if (test_opt (sb, DEBUG)) - printk (KERN_INFO -@@ -712,6 +713,7 @@ - EXT3_BLOCKS_PER_GROUP(sb), - EXT3_INODES_PER_GROUP(sb), - sbi->s_mount_opt); -+ - printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ", - bdevname(sb->s_dev)); - if (EXT3_SB(sb)->s_journal->j_inode == NULL) { -@@ -886,6 +888,7 @@ - return res; - } - -+ - struct super_block * ext3_read_super (struct super_block * sb, void * data, - int silent) - { -@@ -1062,6 +1065,9 @@ - sbi->s_mount_state = le16_to_cpu(es->s_state); - sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); - sbi->s_desc_per_block_bits = 
log2(EXT3_DESC_PER_BLOCK(sb)); -+ for (i=0; i < 4; i++) -+ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); -+ sbi->s_def_hash_version = es->s_def_hash_version; - - if (sbi->s_blocks_per_group > blocksize * 8) { - printk (KERN_ERR -@@ -1744,7 +1750,7 @@ - unregister_filesystem(&ext3_fs_type); - } - --EXPORT_NO_SYMBOLS; -+EXPORT_SYMBOL(ext3_force_commit); - - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); - MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); -Index: linux.mcp2/include/linux/ext3_fs.h -=================================================================== ---- linux.mcp2.orig/include/linux/ext3_fs.h 2004-05-17 14:53:17.000000000 -0700 -+++ linux.mcp2/include/linux/ext3_fs.h 2004-05-17 15:07:07.000000000 -0700 -@@ -40,6 +40,11 @@ - #define EXT3FS_VERSION "2.4-0.9.17" - - /* -+ * Always enable hashed directories -+ */ -+#define CONFIG_EXT3_INDEX -+ -+/* - * Debug code - */ - #ifdef EXT3FS_DEBUG -@@ -437,8 +442,11 @@ - /*E0*/ __u32 s_journal_inum; /* inode number of journal file */ - __u32 s_journal_dev; /* device number of journal file */ - __u32 s_last_orphan; /* start of list of inodes to delete */ -- --/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */ -+ __u32 s_hash_seed[4]; /* HTREE hash seed */ -+ __u8 s_def_hash_version; /* Default hash version to use */ -+ __u8 s_reserved_char_pad; -+ __u16 s_reserved_word_pad; -+ __u32 s_reserved[192]; /* Padding to the end of the block */ - }; - - #ifdef __KERNEL__ -@@ -575,9 +583,46 @@ - #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) - #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ - ~EXT3_DIR_ROUND) -+/* -+ * Hash Tree Directory indexing -+ * (c) Daniel Phillips, 2001 -+ */ -+ -+#ifdef CONFIG_EXT3_INDEX -+ #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ -+ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) -+#define EXT3_DIR_LINK_MAX(dir) 
(!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) -+#else -+ #define is_dx(dir) 0 -+#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) -+#endif -+ -+/* Legal values for the dx_root hash_version field: */ -+ -+#define DX_HASH_LEGACY 0 -+#define DX_HASH_HALF_MD4 1 -+#define DX_HASH_TEA 2 -+ -+/* hash info structure used by the directory hash */ -+struct dx_hash_info -+{ -+ u32 hash; -+ u32 minor_hash; -+ int hash_version; -+ u32 *seed; -+}; - - #ifdef __KERNEL__ - /* -+ * Control parameters used by ext3_htree_next_block -+ */ -+#define HASH_NB_ALWAYS 1 -+ -+ -+/* - * Describe an inode's exact location on disk and in memory - */ - struct ext3_iloc -@@ -587,6 +632,27 @@ - unsigned long block_group; - }; - -+ -+/* -+ * This structure is stuffed into the struct file's private_data field -+ * for directories. It is where we put information so that we can do -+ * readdir operations in hash tree order. -+ */ -+struct dir_private_info { -+ rb_root_t root; -+ rb_node_t *curr_node; -+ struct fname *extra_fname; -+ loff_t last_pos; -+ __u32 curr_hash; -+ __u32 curr_minor_hash; -+ __u32 next_hash; -+}; -+ -+/* -+ * Special error return code only used by dx_probe() and its callers. 
-+ */ -+#define ERR_BAD_DX_DIR -75000 -+ - /* - * Function prototypes - */ -@@ -614,11 +680,20 @@ - - /* dir.c */ - extern int ext3_check_dir_entry(const char *, struct inode *, -- struct ext3_dir_entry_2 *, struct buffer_head *, -- unsigned long); -+ struct ext3_dir_entry_2 *, -+ struct buffer_head *, unsigned long); -+extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, -+ __u32 minor_hash, -+ struct ext3_dir_entry_2 *dirent); -+extern void ext3_htree_free_dir_info(struct dir_private_info *p); -+ - /* fsync.c */ - extern int ext3_sync_file (struct file *, struct dentry *, int); - -+/* hash.c */ -+extern int ext3fs_dirhash(const char *name, int len, struct -+ dx_hash_info *hinfo); -+ - /* ialloc.c */ - extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int); - extern void ext3_free_inode (handle_t *, struct inode *); -@@ -650,6 +725,8 @@ - /* namei.c */ - extern int ext3_orphan_add(handle_t *, struct inode *); - extern int ext3_orphan_del(handle_t *, struct inode *); -+extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, -+ __u32 start_minor_hash, __u32 *next_hash); - - /* super.c */ - extern void ext3_error (struct super_block *, const char *, const char *, ...) 
-Index: linux.mcp2/include/linux/ext3_fs_sb.h -=================================================================== ---- linux.mcp2.orig/include/linux/ext3_fs_sb.h 2004-05-17 14:41:25.000000000 -0700 -+++ linux.mcp2/include/linux/ext3_fs_sb.h 2004-05-17 15:07:07.000000000 -0700 -@@ -62,6 +62,8 @@ - int s_inode_size; - int s_first_ino; - u32 s_next_generation; -+ u32 s_hash_seed[4]; -+ int s_def_hash_version; - - /* Journaling */ - struct inode * s_journal_inode; -Index: linux.mcp2/include/linux/ext3_jbd.h -=================================================================== ---- linux.mcp2.orig/include/linux/ext3_jbd.h 2004-05-17 14:53:17.000000000 -0700 -+++ linux.mcp2/include/linux/ext3_jbd.h 2004-05-17 15:07:07.000000000 -0700 -@@ -63,6 +63,8 @@ - - #define EXT3_RESERVE_TRANS_BLOCKS 12 - -+#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 -+ - int - ext3_mark_iloc_dirty(handle_t *handle, - struct inode *inode, -Index: linux.mcp2/include/linux/rbtree.h -=================================================================== ---- linux.mcp2.orig/include/linux/rbtree.h 2004-05-17 14:41:25.000000000 -0700 -+++ linux.mcp2/include/linux/rbtree.h 2004-05-17 15:07:07.000000000 -0700 -@@ -120,6 +120,8 @@ - - extern void rb_insert_color(rb_node_t *, rb_root_t *); - extern void rb_erase(rb_node_t *, rb_root_t *); -+extern rb_node_t *rb_get_first(rb_root_t *root); -+extern rb_node_t *rb_get_next(rb_node_t *n); - - static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link) - { -Index: linux.mcp2/lib/rbtree.c -=================================================================== ---- linux.mcp2.orig/lib/rbtree.c 2004-01-19 07:49:44.000000000 -0800 -+++ linux.mcp2/lib/rbtree.c 2004-05-17 15:10:39.000000000 -0700 -@@ -17,6 +17,8 @@ - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - linux/lib/rbtree.c -+ -+ rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002 - */ - - #include -@@ -294,3 +296,42 @@ - __rb_erase_color(child, 
parent, root); - } - EXPORT_SYMBOL(rb_erase); -+ -+/* -+ * This function returns the first node (in sort order) of the tree. -+ */ -+rb_node_t *rb_get_first(rb_root_t *root) -+{ -+ rb_node_t *n; -+ -+ n = root->rb_node; -+ if (!n) -+ return 0; -+ while (n->rb_left) -+ n = n->rb_left; -+ return n; -+} -+EXPORT_SYMBOL(rb_get_first); -+ -+/* -+ * Given a node, this function will return the next node in the tree. -+ */ -+rb_node_t *rb_get_next(rb_node_t *n) -+{ -+ rb_node_t *parent; -+ -+ if (n->rb_right) { -+ n = n->rb_right; -+ while (n->rb_left) -+ n = n->rb_left; -+ return n; -+ } else { -+ while ((parent = n->rb_parent)) { -+ if (n == parent->rb_left) -+ return parent; -+ n = parent; -+ } -+ return 0; -+ } -+} -+EXPORT_SYMBOL(rb_get_next); diff --git a/lustre/kernel_patches/patches/ext3-htree-path-ops.patch b/lustre/kernel_patches/patches/ext3-htree-path-ops.patch new file mode 100644 index 0000000..9a2edbd --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-htree-path-ops.patch @@ -0,0 +1,894 @@ +Index: iam-src/fs/ext3/namei.c +=================================================================== +--- iam-src.orig/fs/ext3/namei.c 2006-02-12 16:43:57.000000000 +0300 ++++ iam-src/fs/ext3/namei.c 2006-02-12 23:22:12.000000000 +0300 +@@ -83,22 +83,21 @@ static struct buffer_head *ext3_append(h + #define dxtrace(command) + #endif + +-struct fake_dirent +-{ ++struct fake_dirent { + __le32 inode; + __le16 rec_len; + u8 name_len; + u8 file_type; + }; + +-struct dx_countlimit +-{ ++struct dx_countlimit { + __le16 limit; + __le16 count; + }; + +-struct dx_entry +-{ ++struct dx_entry; /* incomplete type */ ++ ++struct dx_entry_compat { + __le32 hash; + __le32 block; + }; +@@ -109,8 +108,7 @@ struct dx_entry + * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
+ */ + +-struct dx_root +-{ ++struct dx_root { + struct fake_dirent dot; + char dot_name[4]; + struct fake_dirent dotdot; +@@ -124,13 +122,13 @@ struct dx_root + u8 unused_flags; + } + info; +- struct dx_entry entries[0]; ++ struct {} entries[0]; + }; + + struct dx_node + { + struct fake_dirent fake; +- struct dx_entry entries[0]; ++ struct {} entries[0]; + }; + + +@@ -147,38 +145,76 @@ struct dx_map_entry + u32 offs; + }; + ++struct dx_path; ++struct dx_param { ++ size_t dpo_key_size; ++ size_t dpo_ptr_size; ++ size_t dpo_node_gap; ++ size_t dpo_root_gap; ++ ++ u32 (*dpo_root_ptr)(struct dx_path *path); ++ int (*dpo_node_check)(struct dx_path *path, ++ struct dx_frame *frame, void *cookie); ++ int (*dpo_node_init)(struct dx_path *path, ++ struct buffer_head *bh, int root); ++}; ++ + /* + * Structure to keep track of a path drilled through htree. + */ + struct dx_path { +- struct inode *dp_object; +- struct dx_frame dp_frames[DX_MAX_TREE_HEIGHT]; +- struct dx_frame *dp_frame; ++ struct inode *dp_object; ++ struct dx_param *dp_param; ++ int dp_indirect; ++ struct dx_frame dp_frames[DX_MAX_TREE_HEIGHT]; ++ struct dx_frame *dp_frame; ++ void *dp_key_target; ++ void *dp_key; + }; + ++static u32 htree_root_ptr(struct dx_path *p); ++static int htree_node_check(struct dx_path *path, ++ struct dx_frame *frame, void *cookie); ++static int htree_node_init(struct dx_path *path, ++ struct buffer_head *bh, int root); ++ ++static struct dx_param htree_compat_param = { ++ .dpo_key_size = sizeof ((struct dx_map_entry *)NULL)->hash, ++ .dpo_ptr_size = sizeof ((struct dx_map_entry *)NULL)->offs, ++ .dpo_node_gap = offsetof(struct dx_node, entries), ++ .dpo_root_gap = offsetof(struct dx_root, entries), ++ ++ .dpo_root_ptr = htree_root_ptr, ++ .dpo_node_check = htree_node_check, ++ .dpo_node_init = htree_node_init ++}; ++ ++ + #ifdef CONFIG_EXT3_INDEX +-static inline unsigned dx_get_block (struct dx_entry *entry); +-static void dx_set_block (struct dx_entry *entry, unsigned value); 
+-static inline unsigned dx_get_hash (struct dx_entry *entry); +-static void dx_set_hash (struct dx_entry *entry, unsigned value); +-static unsigned dx_get_count (struct dx_entry *entries); +-static unsigned dx_get_limit (struct dx_entry *entries); +-static void dx_set_count (struct dx_entry *entries, unsigned value); +-static void dx_set_limit (struct dx_entry *entries, unsigned value); +-static unsigned dx_root_limit (struct inode *dir, unsigned infosize); +-static unsigned dx_node_limit (struct inode *dir); +-static struct dx_frame *dx_probe(struct dentry *dentry, +- struct inode *dir, +- struct dx_hash_info *hinfo, +- struct dx_path *path, +- int *err); ++static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry); ++static void dx_set_block(struct dx_path *p, ++ struct dx_entry *entry, unsigned value); ++static inline void *dx_get_key(struct dx_path *p, ++ struct dx_entry *entry, void *key); ++static void dx_set_key(struct dx_path *p, struct dx_entry *entry, void *key); ++static unsigned dx_get_count(struct dx_entry *entries); ++static unsigned dx_get_limit(struct dx_entry *entries); ++static void dx_set_count(struct dx_entry *entries, unsigned value); ++static void dx_set_limit(struct dx_entry *entries, unsigned value); ++static unsigned dx_root_limit(struct dx_path *p); ++static unsigned dx_node_limit(struct dx_path *p); ++static int dx_probe(struct dentry *dentry, ++ struct inode *dir, ++ struct dx_hash_info *hinfo, ++ struct dx_path *path); + static int dx_make_map (struct ext3_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); + static void dx_sort_map(struct dx_map_entry *map, unsigned count); + static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); + static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); ++static void dx_insert_block (struct dx_path 
*path, ++ struct dx_frame *frame, u32 hash, u32 block); + static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_path *path, __u32 *start_hash); + static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, +@@ -186,29 +222,65 @@ static struct buffer_head * ext3_dx_find + static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); + ++static inline void dx_path_init(struct dx_path *path, struct inode *inode); ++static inline void dx_path_fini(struct dx_path *path); ++ ++ + /* + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. + */ + +-static inline unsigned dx_get_block (struct dx_entry *entry) ++static inline void *entry_off(struct dx_entry *entry, ptrdiff_t off) ++{ ++ return (void *)((char *)entry + off); ++} ++ ++static inline size_t dx_entry_size(struct dx_path *p) + { +- return le32_to_cpu(entry->block) & 0x00ffffff; ++ return p->dp_param->dpo_key_size + p->dp_param->dpo_ptr_size; + } + +-static inline void dx_set_block (struct dx_entry *entry, unsigned value) ++static inline struct dx_entry *dx_entry_shift(struct dx_path *p, ++ struct dx_entry *entry, int shift) + { +- entry->block = cpu_to_le32(value); ++ void *e = entry; ++ return e + shift * dx_entry_size(p); + } + +-static inline unsigned dx_get_hash (struct dx_entry *entry) ++static inline ptrdiff_t dx_entry_diff(struct dx_path *p, ++ struct dx_entry *e1, struct dx_entry *e2) + { +- return le32_to_cpu(entry->hash); ++ ptrdiff_t diff; ++ ++ diff = (void *)e1 - (void *)e2; ++ assert(diff / dx_entry_size(p) * dx_entry_size(p) == diff); ++ return diff / dx_entry_size(p); ++} ++ ++static inline unsigned dx_get_block(struct dx_path *p, struct dx_entry *entry) ++{ ++ return le32_to_cpu(*(u32 *)entry_off(entry, p->dp_param->dpo_key_size)) ++ & 0x00ffffff; + } + +-static inline void dx_set_hash (struct dx_entry *entry, unsigned value) ++static inline void dx_set_block(struct dx_path *p, ++ struct 
dx_entry *entry, unsigned value) + { +- entry->hash = cpu_to_le32(value); ++ *(u32*)entry_off(entry, p->dp_param->dpo_key_size) = cpu_to_le32(value); ++} ++ ++static inline void *dx_get_key(struct dx_path *p, ++ struct dx_entry *entry, void *key) ++{ ++ memcpy(key, entry, p->dp_param->dpo_key_size); ++ return key; ++} ++ ++static inline void dx_set_key(struct dx_path *p, ++ struct dx_entry *entry, void *key) ++{ ++ memcpy(entry, key, p->dp_param->dpo_key_size); + } + + static inline unsigned dx_get_count (struct dx_entry *entries) +@@ -231,17 +303,123 @@ static inline void dx_set_limit (struct + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); + } + +-static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) ++static inline unsigned dx_root_limit(struct dx_path *p) + { +- unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - +- EXT3_DIR_REC_LEN(2) - infosize; +- return 0? 20: entry_space / sizeof(struct dx_entry); ++ struct dx_param *param = p->dp_param; ++ unsigned entry_space = p->dp_object->i_sb->s_blocksize - ++ param->dpo_root_gap; ++ return entry_space / (param->dpo_key_size + param->dpo_ptr_size); ++} ++ ++static inline unsigned dx_node_limit(struct dx_path *p) ++{ ++ struct dx_param *param = p->dp_param; ++ unsigned entry_space = p->dp_object->i_sb->s_blocksize - ++ param->dpo_node_gap; ++ return entry_space / (param->dpo_key_size + param->dpo_ptr_size); ++} ++ ++static inline int dx_index_is_compat(struct dx_path *path) ++{ ++ return path->dp_param == &htree_compat_param; ++} ++ ++static struct dx_entry *dx_get_entries(struct dx_path *path, void *data, ++ int root) ++{ ++ return data + ++ (root ? 
++ path->dp_param->dpo_root_gap : path->dp_param->dpo_node_gap); ++} ++ ++static struct dx_entry *dx_node_get_entries(struct dx_path *path, ++ struct dx_frame *frame) ++{ ++ return dx_get_entries(path, ++ frame->bh->b_data, frame == path->dp_frames); ++} ++ ++static u32 htree_root_ptr(struct dx_path *path) ++{ ++ return 0; ++} ++ ++struct htree_cookie { ++ struct dx_hash_info *hinfo; ++ struct dentry *dentry; ++}; ++ ++static int htree_node_check(struct dx_path *path, struct dx_frame *frame, ++ void *cookie) ++{ ++ void *data; ++ struct dx_entry *entries; ++ struct super_block *sb; ++ ++ data = frame->bh->b_data; ++ entries = dx_node_get_entries(path, frame); ++ sb = path->dp_object->i_sb; ++ if (frame == path->dp_frames) { ++ /* root node */ ++ struct dx_root *root; ++ struct htree_cookie *hc = cookie; ++ ++ root = data; ++ if (root->info.hash_version != DX_HASH_TEA && ++ root->info.hash_version != DX_HASH_HALF_MD4 && ++ root->info.hash_version != DX_HASH_R5 && ++ root->info.hash_version != DX_HASH_LEGACY) { ++ ext3_warning(sb, __FUNCTION__, ++ "Unrecognised inode hash code %d", ++ root->info.hash_version); ++ return ERR_BAD_DX_DIR; ++ } ++ ++ if (root->info.unused_flags & 1) { ++ ext3_warning(sb, __FUNCTION__, ++ "Unimplemented inode hash flags: %#06x", ++ root->info.unused_flags); ++ return ERR_BAD_DX_DIR; ++ } ++ ++ path->dp_indirect = root->info.indirect_levels; ++ if (path->dp_indirect > DX_MAX_TREE_HEIGHT - 1) { ++ ext3_warning(sb, __FUNCTION__, ++ "Unimplemented inode hash depth: %#06x", ++ root->info.indirect_levels); ++ return ERR_BAD_DX_DIR; ++ } ++ ++ assert((char *)entries == (((char *)&root->info) + ++ root->info.info_length)); ++ assert(dx_get_limit(entries) == dx_root_limit(path)); ++ ++ hc->hinfo->hash_version = root->info.hash_version; ++ hc->hinfo->seed = EXT3_SB(sb)->s_hash_seed; ++ if (hc->dentry) ++ ext3fs_dirhash(hc->dentry->d_name.name, ++ hc->dentry->d_name.len, hc->hinfo); ++ path->dp_key_target = &hc->hinfo->hash; ++ } else { ++ /* 
non-root index */ ++ assert(entries == data + path->dp_param->dpo_node_gap); ++ assert(dx_get_limit(entries) == dx_node_limit(path)); ++ } ++ frame->entries = frame->at = entries; ++ return 0; + } + +-static inline unsigned dx_node_limit (struct inode *dir) ++static int htree_node_init(struct dx_path *path, ++ struct buffer_head *bh, int root) + { +- unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); +- return 0? 22: entry_space / sizeof(struct dx_entry); ++ struct dx_node *node; ++ ++ assert(!root); ++ ++ node = (void *)bh->b_data; ++ node->fake.rec_len = cpu_to_le16(path->dp_object->i_sb->s_blocksize); ++ node->fake.inode = 0; ++ return 0; + } + + /* +@@ -327,123 +505,101 @@ struct stats dx_show_entries(struct dx_h + } + #endif /* DX_DEBUG */ + +-/* +- * Probe for a directory leaf block to search. +- * +- * dx_probe can return ERR_BAD_DX_DIR, which means there was a format +- * error in the directory index, and the caller should fall back to +- * searching the directory normally. The callers of dx_probe **MUST** +- * check for this error code, and make sure it never gets reflected +- * back to userspace. 
+- */ +-static struct dx_frame * +-dx_probe(struct dentry *dentry, struct inode *dir, +- struct dx_hash_info *hinfo, struct dx_path *path, int *err) +-{ +- unsigned count, indirect; +- struct dx_entry *at, *entries, *p, *q, *m; +- struct dx_root *root; +- struct buffer_head *bh; +- struct dx_frame *frame = path->dp_frames; +- u32 hash; ++static int dx_lookup(struct dx_path *path, void *cookie) ++{ ++ u32 ptr; ++ int err; ++ int i; + +- frame->bh = NULL; +- if (dentry) +- dir = dentry->d_parent->d_inode; +- if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) +- goto fail; +- root = (struct dx_root *) bh->b_data; +- if (root->info.hash_version != DX_HASH_TEA && +- root->info.hash_version != DX_HASH_HALF_MD4 && +- root->info.hash_version != DX_HASH_R5 && +- root->info.hash_version != DX_HASH_LEGACY) { +- ext3_warning(dir->i_sb, __FUNCTION__, +- "Unrecognised inode hash code %d", root->info.hash_version); +- brelse(bh); +- *err = ERR_BAD_DX_DIR; +- goto fail; +- } +- hinfo->hash_version = root->info.hash_version; +- hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; +- if (dentry) +- ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); +- hash = hinfo->hash; +- +- if (root->info.unused_flags & 1) { +- ext3_warning(dir->i_sb, __FUNCTION__, +- "Unimplemented inode hash flags: %#06x", +- root->info.unused_flags); +- brelse(bh); +- *err = ERR_BAD_DX_DIR; +- goto fail; +- } ++ struct dx_param *param; ++ struct dx_frame *frame; + +- if ((indirect = root->info.indirect_levels) > DX_MAX_TREE_HEIGHT - 1) { +- ext3_warning(dir->i_sb, __FUNCTION__, +- "Unimplemented inode hash depth: %#06x", +- root->info.indirect_levels); +- brelse(bh); +- *err = ERR_BAD_DX_DIR; +- goto fail; +- } ++ param = path->dp_param; + +- entries = (struct dx_entry *) (((char *)&root->info) + +- root->info.info_length); +- assert(dx_get_limit(entries) == dx_root_limit(dir, +- root->info.info_length)); +- dxtrace (printk("Look up %x", hash)); +- while (1) +- { ++ for (frame = path->dp_frames, i = 0, ++ 
ptr = param->dpo_root_ptr(path); i <= path->dp_indirect; ++ ptr = dx_get_block(path, frame->at), ++frame, ++i) { ++ struct dx_entry *entries; ++ struct dx_entry *p; ++ struct dx_entry *q; ++ struct dx_entry *m; ++ unsigned count; ++ ++ frame->bh = ext3_bread(NULL, path->dp_object, ptr, 0, &err); ++ if (frame->bh == NULL) { ++ err = -EIO; ++ break; ++ } ++ err = param->dpo_node_check(path, frame, cookie); ++ if (err != 0) ++ break; ++ ++ entries = frame->entries; + count = dx_get_count(entries); +- assert (count && count <= dx_get_limit(entries)); +- p = entries + 1; +- q = entries + count - 1; +- while (p <= q) +- { +- m = p + (q - p)/2; ++ assert(count && count <= dx_get_limit(entries)); ++ p = dx_entry_shift(path, entries, 1); ++ q = dx_entry_shift(path, entries, count - 1); ++ while (p <= q) { ++ m = dx_entry_shift(path, ++ p, dx_entry_diff(path, q, p) / 2); + dxtrace(printk(".")); +- if (dx_get_hash(m) > hash) +- q = m - 1; ++ if (memcmp(dx_get_key(path, m, path->dp_key), ++ path->dp_key_target, ++ param->dpo_key_size) > 0) ++ q = dx_entry_shift(path, m, -1); + else +- p = m + 1; ++ p = dx_entry_shift(path, m, +1); + } + +- if (0) // linear search cross check +- { ++ frame->at = dx_entry_shift(path, p, -1); ++ if (1) { // linear search cross check + unsigned n = count - 1; ++ struct dx_entry *at; ++ + at = entries; +- while (n--) +- { ++ while (n--) { + dxtrace(printk(",")); +- if (dx_get_hash(++at) > hash) +- { +- at--; ++ at = dx_entry_shift(path, at, +1); ++ if (memcmp(dx_get_key(path, at, path->dp_key), ++ path->dp_key_target, ++ param->dpo_key_size) > 0) { ++ at = dx_entry_shift(path, at, -1); + break; + } + } +- assert (at == p - 1); ++ assert(at == frame->at); + } +- +- at = p - 1; +- dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); +- frame->bh = bh; +- frame->entries = entries; +- frame->at = at; +- if (!indirect--) +- return path->dp_frame = frame; +- if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) +- goto fail2; +- at = entries = ((struct dx_node *) bh->b_data)->entries; +- assert (dx_get_limit(entries) == dx_node_limit (dir)); +- frame++; +- } +-fail2: +- while (frame >= path->dp_frames) { +- brelse(frame->bh); +- frame--; + } +-fail: +- return NULL; ++ if (err != 0) ++ dx_path_fini(path); ++ path->dp_frame = --frame; ++ return err; ++} ++ ++/* ++ * Probe for a directory leaf block to search. ++ * ++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format ++ * error in the directory index, and the caller should fall back to ++ * searching the directory normally. The callers of dx_probe **MUST** ++ * check for this error code, and make sure it never gets reflected ++ * back to userspace. ++ */ ++static int dx_probe(struct dentry *dentry, struct inode *dir, ++ struct dx_hash_info *hinfo, struct dx_path *path) ++{ ++ int err; ++ __u32 hash_storage; ++ struct htree_cookie hc = { ++ .dentry = dentry, ++ .hinfo = hinfo ++ }; ++ ++ assert(dx_index_is_compat(path)); ++ path->dp_key = &hash_storage; ++ err = dx_lookup(path, &hc); ++ assert(err != 0 || path->dp_frames[path->dp_indirect].bh != NULL); ++ return err; + } + + static inline void dx_path_init(struct dx_path *path, struct inode *inode) +@@ -458,8 +614,10 @@ static inline void dx_path_fini(struct d + int i; + + for (i = 0; i < ARRAY_SIZE(path->dp_frames); i--) { +- if (path->dp_frames[i].bh != NULL) ++ if (path->dp_frames[i].bh != NULL) { + brelse(path->dp_frames[i].bh); ++ path->dp_frames[i].bh = NULL; ++ } + } + } + +@@ -488,6 +646,8 @@ static int ext3_htree_next_block(struct + int err, num_frames = 0; + __u32 bhash; + ++ assert(dx_index_is_compat(path)); ++ + p = path->dp_frame; + /* + * Find the next leaf page by incrementing the frame pointer. 
+@@ -497,7 +657,9 @@ static int ext3_htree_next_block(struct + * nodes need to be read. + */ + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ p->at = dx_entry_shift(path, p->at, +1); ++ if (p->at < dx_entry_shift(path, p->entries, ++ dx_get_count(p->entries))) + break; + if (p == path->dp_frames) + return 0; +@@ -512,7 +674,7 @@ static int ext3_htree_next_block(struct + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. + */ +- bhash = dx_get_hash(p->at); ++ dx_get_key(path, p->at, &bhash); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { +@@ -524,12 +686,13 @@ static int ext3_htree_next_block(struct + * block so no check is necessary + */ + while (num_frames--) { +- if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), 0, &err))) ++ if (!(bh = ext3_bread(NULL, dir, ++ dx_get_block(path, p->at), 0, &err))) + return err; /* Failure */ + ++p; + brelse (p->bh); + p->bh = bh; +- p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; ++ p->at = p->entries = dx_node_get_entries(path, p); + } + return 1; + } +@@ -609,6 +772,7 @@ int ext3_htree_fill_tree(struct file *di + start_minor_hash)); + dir = dir_file->f_dentry->d_inode; + dx_path_init(&path, dir); ++ path.dp_param = &htree_compat_param; + if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { + hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; +@@ -619,7 +783,8 @@ int ext3_htree_fill_tree(struct file *di + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- if (!dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, &path, &err)) ++ err = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, &path); ++ if (err != 0) + return err; + + /* Add '.' and '..' 
from the htree header */ +@@ -634,7 +799,7 @@ int ext3_htree_fill_tree(struct file *di + } + + while (1) { +- block = dx_get_block(path.dp_frame->at); ++ block = dx_get_block(&path, path.dp_frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { +@@ -722,17 +887,19 @@ static void dx_sort_map (struct dx_map_e + } while(more); + } + +-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) ++static void dx_insert_block(struct dx_path *path, ++ struct dx_frame *frame, u32 hash, u32 block) + { + struct dx_entry *entries = frame->entries; +- struct dx_entry *old = frame->at, *new = old + 1; ++ struct dx_entry *old = frame->at, *new = dx_entry_shift(path, old, +1); + int count = dx_get_count(entries); + + assert(count < dx_get_limit(entries)); +- assert(old < entries + count); +- memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); +- dx_set_hash(new, hash); +- dx_set_block(new, block); ++ assert(old < dx_entry_shift(path, entries, count)); ++ memmove(dx_entry_shift(path, new, 1), new, ++ (char *)dx_entry_shift(path, entries, count) - (char *)new); ++ dx_set_key(path, new, &hash); ++ dx_set_block(path, new, block); + dx_set_count(entries, count + 1); + } + #endif +@@ -934,7 +1101,9 @@ static struct buffer_head * ext3_dx_find + struct dx_hash_info hinfo; + u32 hash; + struct dx_path path; +- struct dx_entry dummy_dot; ++ struct dx_entry_compat dummy_dot = { ++ .block = 0 ++ }; + struct ext3_dir_entry_2 *de, *top; + struct buffer_head *bh; + unsigned long block; +@@ -944,19 +1113,21 @@ static struct buffer_head * ext3_dx_find + struct inode *dir = dentry->d_parent->d_inode; + + dx_path_init(&path, dir); ++ path.dp_param = &htree_compat_param; ++ + sb = dir->i_sb; + /* NFS may look up ".." - look at dx_root directory block */ + if (namelen > 2 || name[0] != '.'||(name[1] != '.' 
&& name[1] != '\0')){ +- if (!(dx_probe(dentry, NULL, &hinfo, &path, err))) ++ *err = dx_probe(dentry, NULL, &hinfo, &path); ++ if (*err != 0) + return NULL; + } else { +- path.dp_frame->bh = NULL; /* for dx_path_fini() */ +- path.dp_frame->at = &dummy_dot; /* hack for zero entry*/ +- dx_set_block(path.dp_frame->at, 0); /* dx_root block is 0 */ ++ path.dp_frame->bh = NULL; /* for dx_path_fini() */ ++ path.dp_frame->at = (void *)&dummy_dot; /* hack for zero entry*/ + } + hash = hinfo.hash; + do { +- block = dx_get_block(path.dp_frame->at); ++ block = dx_get_block(&path, path.dp_frame->at); + if (!(bh = ext3_bread (NULL,dir, block, 0, err))) + goto errout; + de = (struct ext3_dir_entry_2 *) bh->b_data; +@@ -1115,10 +1286,11 @@ static struct ext3_dir_entry_2* dx_pack_ + + /* Allocate new node, and split leaf node @bh into it, inserting new pointer + * into parent node identified by @frame */ +-static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, ++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct dx_path *path, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo, int *error) + { ++ struct inode *dir = path->dp_object; + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; + struct buffer_head *bh2; +@@ -1180,7 +1352,7 @@ static struct ext3_dir_entry_2 *do_split + swap(*bh, bh2); + de = de2; + } +- dx_insert_block (frame, hash2 + continued, newblock); ++ dx_insert_block(path, frame, hash2 + continued, newblock); + err = ext3_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; +@@ -1315,6 +1487,7 @@ static int make_indexed_dir(handle_t *ha + struct fake_dirent *fde; + + dx_path_init(&path, dir); ++ path.dp_param = &htree_compat_param; + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk("Creating index\n")); + retval = ext3_journal_get_write_access(handle, bh); +@@ -1350,10 +1523,10 @@ static int make_indexed_dir(handle_t *ha + root->info.info_length = 
sizeof(root->info); + root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + root->info.hash_version = DX_HASH_R5; +- entries = root->entries; +- dx_set_block (entries, 1); ++ entries = (void *)root->entries; ++ dx_set_block (&path, entries, 1); + dx_set_count (entries, 1); +- dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); ++ dx_set_limit (entries, dx_root_limit(&path)); + + /* Initialize as for dx_probe */ + hinfo.hash_version = root->info.hash_version; +@@ -1363,7 +1536,7 @@ static int make_indexed_dir(handle_t *ha + path.dp_frame->at = entries; + path.dp_frame->bh = bh; + bh = bh2; +- de = do_split(handle,dir, &bh, path.dp_frame, &hinfo, &retval); ++ de = do_split(handle, &path, &bh, path.dp_frame, &hinfo, &retval); + dx_path_fini(&path); + if (!de) + return retval; +@@ -1446,8 +1619,8 @@ static int ext3_dx_add_entry(handle_t *h + struct inode *inode) + { + struct dx_path path; ++ struct dx_param *param; + struct dx_frame *frame, *safe; +- struct dx_node *node2; + struct dx_entry *entries; /* old block contents */ + struct dx_entry *entries2; /* new block contents */ + struct dx_hash_info hinfo; +@@ -1463,7 +1636,10 @@ static int ext3_dx_add_entry(handle_t *h + size_t isize; + + dx_path_init(&path, dir); +- if (!dx_probe(dentry, NULL, &hinfo, &path, &err)) ++ param = path.dp_param = &htree_compat_param; ++ ++ err = dx_probe(dentry, NULL, &hinfo, &path); ++ if (err != 0) + return err; + frame = path.dp_frame; + entries = frame->entries; +@@ -1471,7 +1647,8 @@ static int ext3_dx_add_entry(handle_t *h + /* XXX nikita: global serialization! */ + isize = dir->i_size; + +- if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) ++ if (!(bh = ext3_bread(handle, dir, ++ dx_get_block(&path, frame->at), 0, &err))) + goto cleanup; + + BUFFER_TRACE(bh, "get_write_access"); +@@ -1519,12 +1696,9 @@ static int ext3_dx_add_entry(handle_t *h + * transaction... 
*/ + for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) { + bh_new[i] = ext3_append (handle, dir, &newblock[i], &err); +- if (!bh_new[i]) ++ if (!bh_new[i] || ++ param->dpo_node_init(&path, bh_new[i], 0) != 0) + goto cleanup; +- node2 = (struct dx_node *)(bh_new[i]->b_data); +- entries2 = node2->entries; +- node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); +- node2->fake.inode = 0; + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) +@@ -1545,11 +1719,10 @@ static int ext3_dx_add_entry(handle_t *h + + entries = frame->entries; + count = dx_get_count(entries); +- idx = frame->at - entries; ++ idx = dx_entry_diff(&path, frame->at, entries); + + bh2 = bh_new[i]; +- node2 = (struct dx_node *)(bh2->b_data); +- entries2 = node2->entries; ++ entries2 = dx_get_entries(&path, bh2->b_data, 0); + + if (frame == path.dp_frames) { + /* splitting root node. Tricky point: +@@ -1571,19 +1744,19 @@ static int ext3_dx_add_entry(handle_t *h + indirects = root->info.indirect_levels; + dxtrace(printk("Creating new root %d\n", indirects)); + memcpy((char *) entries2, (char *) entries, +- count * sizeof(struct dx_entry)); +- dx_set_limit(entries2, dx_node_limit(dir)); ++ count * dx_entry_size(&path)); ++ dx_set_limit(entries2, dx_node_limit(&path)); + + /* Set up root */ + dx_set_count(entries, 1); +- dx_set_block(entries + 0, newblock[i]); ++ dx_set_block(&path, entries, newblock[i]); + root->info.indirect_levels = indirects + 1; + + /* Shift frames in the path */ + memmove(frames + 2, frames + 1, + (sizeof path.dp_frames) - 2 * sizeof frames[0]); + /* Add new access path frame */ +- frames[1].at = entries2 + idx; ++ frames[1].at = dx_entry_shift(&path, entries2, idx); + frames[1].entries = entries = entries2; + frames[1].bh = bh2; + ++ frame; +@@ -1594,23 +1767,30 @@ static int ext3_dx_add_entry(handle_t *h + } else { + /* splitting non-root index node. 
*/ + unsigned count1 = count/2, count2 = count - count1; +- unsigned hash2 = dx_get_hash(entries + count1); ++ unsigned hash2; ++ ++ dx_get_key(&path, ++ dx_entry_shift(&path, entries, count1), ++ &hash2); ++ + dxtrace(printk("Split index %i/%i\n", count1, count2)); + +- memcpy ((char *) entries2, (char *) (entries + count1), +- count2 * sizeof(struct dx_entry)); ++ memcpy ((char *) entries2, ++ (char *) dx_entry_shift(&path, entries, count1), ++ count2 * dx_entry_size(&path)); + dx_set_count (entries, count1); + dx_set_count (entries2, count2); +- dx_set_limit (entries2, dx_node_limit(dir)); ++ dx_set_limit (entries2, dx_node_limit(&path)); + + /* Which index block gets the new entry? */ + if (idx >= count1) { +- frame->at = entries2 + idx - count1; ++ frame->at = dx_entry_shift(&path, entries2, ++ idx - count1); + frame->entries = entries = entries2; + swap(frame->bh, bh2); + bh_new[i] = bh2; + } +- dx_insert_block (frame - 1, hash2, newblock[i]); ++ dx_insert_block(&path, frame - 1, hash2, newblock[i]); + dxtrace(dx_show_index ("node", frame->entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); +@@ -1619,7 +1799,7 @@ static int ext3_dx_add_entry(handle_t *h + goto journal_error; + } + } +- de = do_split(handle, dir, &bh, --frame, &hinfo, &err); ++ de = do_split(handle, &path, &bh, --frame, &hinfo, &err); + if (!de) + goto cleanup; + err = add_dirent_to_buf(handle, dentry, inode, de, bh); diff --git a/lustre/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch b/lustre/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch index 49528cf..52e5521 100644 --- a/lustre/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch +++ b/lustre/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch @@ -3,7 +3,7 @@ Index: linux-stage/include/linux/ext3_fs.h --- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 14:53:56.424908168 +0200 +++ linux-stage/include/linux/ext3_fs.h 2005-02-25 14:53:59.376459464 +0200 @@ -361,12 
+361,13 @@ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ -#ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/ext3-include-fixes-2.6-suse.patch b/lustre/kernel_patches/patches/ext3-include-fixes-2.6-suse.patch index acf97dd..1ac944b 100644 --- a/lustre/kernel_patches/patches/ext3-include-fixes-2.6-suse.patch +++ b/lustre/kernel_patches/patches/ext3-include-fixes-2.6-suse.patch @@ -3,7 +3,7 @@ Index: linux-stage/include/linux/ext3_fs.h --- linux-stage.orig/include/linux/ext3_fs.h 2004-04-02 16:43:37.000000000 -0500 +++ linux-stage/include/linux/ext3_fs.h 2004-04-02 16:43:37.000000000 -0500 @@ -331,12 +331,13 @@ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ -#ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.4.24.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.4.24.patch deleted file mode 100644 index 172432a..0000000 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.4.24.patch +++ /dev/null @@ -1,1766 +0,0 @@ -Index: linux-2.4.20-rh-20.9/fs/ext3/mballoc.c -=================================================================== ---- linux-2.4.20-rh-20.9.orig/fs/ext3/mballoc.c 2003-01-30 13:24:37.000000000 +0300 -+++ linux-2.4.20-rh-20.9/fs/ext3/mballoc.c 2004-10-20 22:28:51.000000000 +0400 -@@ -0,0 +1,1459 @@ -+/* -+ * Copyright (c) 2004, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. 
-+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+ -+/* -+ * mballoc.c contains the multiblocks allocation routines -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * TODO: -+ * - do not scan from the beginning, try to remember first free block -+ * - mb_mark_used_* may allocate chunk right after splitting buddy -+ * - special flag to advice allocator to look for requested + N blocks -+ * this may improve interaction between extents and mballoc -+ */ -+ -+/* -+ * with AGRESSIVE_CHECK allocator runs consistency checks over -+ * structures. this checks slow things down a lot -+ */ -+#define AGGRESSIVE_CHECK__ -+ -+/* -+ */ -+#define MB_DEBUG__ -+#ifdef MB_DEBUG -+#define mb_debug(fmt,a...) printk(fmt, ##a) -+#else -+#define mb_debug(fmt,a...) -+#endif -+ -+/* -+ * where to save buddies structures beetween umount/mount (clean case only) -+ */ -+#define EXT3_BUDDY_FILE ".buddy" -+ -+/* -+ * max. number of chunks to be tracked in ext3_free_extent struct -+ */ -+#define MB_ARR_SIZE 32 -+ -+struct ext3_allocation_context { -+ struct super_block *ac_sb; -+ -+ /* search goals */ -+ int ac_g_group; -+ int ac_g_start; -+ int ac_g_len; -+ int ac_g_flags; -+ -+ /* the best found extent */ -+ int ac_b_group; -+ int ac_b_start; -+ int ac_b_len; -+ -+ /* number of iterations done. 
we have to track to limit searching */ -+ int ac_repeats; -+ int ac_groups_scanned; -+ int ac_status; -+}; -+ -+#define AC_STATUS_CONTINUE 1 -+#define AC_STATUS_FOUND 2 -+ -+ -+struct ext3_buddy { -+ void *bd_bitmap; -+ void *bd_buddy; -+ int bd_blkbits; -+ struct buffer_head *bd_bh; -+ struct buffer_head *bd_bh2; -+ struct ext3_buddy_group_blocks *bd_bd; -+ struct super_block *bd_sb; -+}; -+ -+struct ext3_free_extent { -+ int fe_start; -+ int fe_len; -+ unsigned char fe_orders[MB_ARR_SIZE]; -+ unsigned char fe_nums; -+ unsigned char fe_back; -+}; -+ -+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) -+ -+ -+int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); -+void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long, unsigned long); -+int ext3_new_block_old(handle_t *, struct inode *, unsigned long, u32 *, u32 *, int *); -+int ext3_mb_reserve_blocks(struct super_block *, int); -+void ext3_mb_release_blocks(struct super_block *, int); -+void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); -+void ext3_mb_free_committed_blocks(struct super_block *); -+int load_block_bitmap (struct super_block *, unsigned int); -+ -+#define mb_correct_addr_and_bit(bit,addr) \ -+{ \ -+ if ((unsigned long) addr & 1) { \ -+ bit += 8; \ -+ addr--; \ -+ } \ -+ if ((unsigned long) addr & 2) { \ -+ bit += 16; \ -+ addr--; \ -+ addr--; \ -+ } \ -+} -+ -+static inline int mb_test_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ return test_bit(bit, addr); -+} -+ -+static inline void mb_set_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ set_bit(bit, addr); -+} -+ -+static inline void mb_clear_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ clear_bit(bit, addr); -+} -+ -+struct buffer_head * -+read_block_bitmap_bh(struct super_block *sb, unsigned int block_group) -+{ -+ struct buffer_head *bh; -+ int bitmap_nr; -+ -+ bitmap_nr = load_block_bitmap(sb, 
block_group); -+ if (bitmap_nr < 0) -+ return NULL; -+ -+ bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr]; -+ return bh; -+} -+ -+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) -+{ -+ int i = 1; -+ void *bb; -+ -+ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy); -+ J_ASSERT(max != NULL); -+ -+ if (order > e3b->bd_blkbits + 1) -+ return NULL; -+ -+ /* at order 0 we see each particular block */ -+ *max = 1 << (e3b->bd_blkbits + 3); -+ if (order == 0) -+ return e3b->bd_bitmap; -+ -+ bb = e3b->bd_buddy; -+ *max = *max >> 1; -+ while (i < order) { -+ bb += 1 << (e3b->bd_blkbits - i); -+ i++; -+ *max = *max >> 1; -+ } -+ return bb; -+} -+ -+static int ext3_mb_load_desc(struct super_block *sb, int group, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap); -+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy); -+ -+ /* load bitmap */ -+ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap); -+ if (e3b->bd_bh == NULL) { -+ ext3_error(sb, "ext3_mb_load_desc", -+ "can't get block for buddy bitmap\n"); -+ goto out; -+ } -+ if (!buffer_uptodate(e3b->bd_bh)) { -+ ll_rw_block(READ, 1, &e3b->bd_bh); -+ wait_on_buffer(e3b->bd_bh); -+ } -+ J_ASSERT(buffer_uptodate(e3b->bd_bh)); -+ -+ /* load buddy */ -+ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy); -+ if (e3b->bd_bh2 == NULL) { -+ ext3_error(sb, "ext3_mb_load_desc", -+ "can't get block for buddy bitmap\n"); -+ goto out; -+ } -+ if (!buffer_uptodate(e3b->bd_bh2)) { -+ ll_rw_block(READ, 1, &e3b->bd_bh2); -+ wait_on_buffer(e3b->bd_bh2); -+ } -+ J_ASSERT(buffer_uptodate(e3b->bd_bh2)); -+ -+ e3b->bd_bitmap = e3b->bd_bh->b_data; -+ e3b->bd_buddy = e3b->bd_bh2->b_data; -+ e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_bd = sbi->s_buddy_blocks[group]; -+ e3b->bd_sb = sb; -+ -+ return 0; -+out: -+ brelse(e3b->bd_bh); -+ brelse(e3b->bd_bh2); -+ e3b->bd_bh = NULL; -+ e3b->bd_bh2 = NULL; -+ return -EIO; -+} -+ 
-+static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b) -+{ -+ mark_buffer_dirty(e3b->bd_bh); -+ mark_buffer_dirty(e3b->bd_bh2); -+} -+ -+static void ext3_mb_release_desc(struct ext3_buddy *e3b) -+{ -+ brelse(e3b->bd_bh); -+ brelse(e3b->bd_bh2); -+} -+ -+#ifdef AGGRESSIVE_CHECK -+static void mb_check_buddy(struct ext3_buddy *e3b) -+{ -+ int order = e3b->bd_blkbits + 1; -+ int max, max2, i, j, k, count; -+ void *buddy, *buddy2; -+ -+ if (!test_opt(e3b->bd_sb, MBALLOC)) -+ return; -+ -+ while (order > 1) { -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); -+ buddy2 = mb_find_buddy(e3b, order - 1, &max2); -+ J_ASSERT(buddy2); -+ J_ASSERT(buddy != buddy2); -+ J_ASSERT(max * 2 == max2); -+ -+ count = 0; -+ for (i = 0; i < max; i++) { -+ -+ if (!mb_test_bit(i, buddy)) { -+ /* only single bit in buddy2 may be 1 */ -+ if (mb_test_bit(i << 1, buddy2)) -+ J_ASSERT(!mb_test_bit((i<<1)+1, buddy2)); -+ else if (mb_test_bit((i << 1) + 1, buddy2)) -+ J_ASSERT(!mb_test_bit(i << 1, buddy2)); -+ continue; -+ } -+ -+ /* both bits in buddy2 must be 0 */ -+ J_ASSERT(!mb_test_bit(i << 1, buddy2)); -+ J_ASSERT(!mb_test_bit((i << 1) + 1, buddy2)); -+ -+ for (j = 0; j < (1 << order); j++) { -+ k = (i * (1 << order)) + j; -+ J_ASSERT(mb_test_bit(k, e3b->bd_bitmap)); -+ } -+ count++; -+ } -+ J_ASSERT(e3b->bd_bd->bb_counters[order] == count); -+ order--; -+ } -+ -+ buddy = mb_find_buddy(e3b, 0, &max); -+ for (i = 0; i < max; i++) { -+ if (mb_test_bit(i, buddy)) -+ continue; -+ /* check used bits only */ -+ for (j = 0; j < e3b->bd_blkbits + 1; j++) { -+ buddy2 = mb_find_buddy(e3b, j, &max2); -+ k = i >> j; -+ J_ASSERT(k < max2); -+ J_ASSERT(!mb_test_bit(k, buddy2)); -+ } -+ } -+} -+#else -+#define mb_check_buddy(e3b) -+#endif -+ -+static inline void -+ext3_lock_group(struct super_block *sb, int group) -+{ -+ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); -+} -+ -+static inline void -+ext3_unlock_group(struct super_block *sb, int group) -+{ -+ 
spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); -+} -+ -+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) -+{ -+ int order = 1; -+ void *bb; -+ -+ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy); -+ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); -+ -+ bb = e3b->bd_buddy; -+ while (order <= e3b->bd_blkbits + 1) { -+ block = block >> 1; -+ if (mb_test_bit(block, bb)) { -+ /* this block is part of buddy of order 'order' */ -+ return order; -+ } -+ bb += 1 << (e3b->bd_blkbits - order); -+ order++; -+ } -+ return 0; -+} -+ -+static inline void mb_clear_bits(void *bm, int cur, int len) -+{ -+ __u32 *addr; -+ -+ len = cur + len; -+ while (cur < len) { -+ if ((cur & 31) == 0 && (len - cur) >= 32) { -+ /* fast path: clear whole word at once */ -+ addr = bm + (cur >> 3); -+ *addr = 0; -+ cur += 32; -+ continue; -+ } -+ mb_clear_bit(cur, bm); -+ cur++; -+ } -+} -+ -+static inline void mb_set_bits(void *bm, int cur, int len) -+{ -+ __u32 *addr; -+ -+ len = cur + len; -+ while (cur < len) { -+ if ((cur & 31) == 0 && (len - cur) >= 32) { -+ /* fast path: clear whole word at once */ -+ addr = bm + (cur >> 3); -+ *addr = 0xffffffff; -+ cur += 32; -+ continue; -+ } -+ mb_set_bit(cur, bm); -+ cur++; -+ } -+} -+ -+static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) -+{ -+ int block, max, order; -+ void *buddy, *buddy2; -+ -+ mb_check_buddy(e3b); -+ while (count-- > 0) { -+ block = first++; -+ order = 0; -+ -+ J_ASSERT(!mb_test_bit(block, e3b->bd_bitmap)); -+ mb_set_bit(block, e3b->bd_bitmap); -+ e3b->bd_bd->bb_counters[order]++; -+ -+ /* start of the buddy */ -+ buddy = mb_find_buddy(e3b, order, &max); -+ -+ do { -+ block &= ~1UL; -+ if (!mb_test_bit(block, buddy) || -+ !mb_test_bit(block + 1, buddy)) -+ break; -+ -+ /* both the buddies are free, try to coalesce them */ -+ buddy2 = mb_find_buddy(e3b, order + 1, &max); -+ -+ if (!buddy2) -+ break; -+ -+ if (order > 0) { -+ /* for special purposes, we don't clear -+ * free bits in bitmap 
*/ -+ mb_clear_bit(block, buddy); -+ mb_clear_bit(block + 1, buddy); -+ } -+ e3b->bd_bd->bb_counters[order]--; -+ e3b->bd_bd->bb_counters[order]--; -+ -+ block = block >> 1; -+ order++; -+ e3b->bd_bd->bb_counters[order]++; -+ -+ mb_set_bit(block, buddy2); -+ buddy = buddy2; -+ } while (1); -+ } -+ mb_check_buddy(e3b); -+ -+ return 0; -+} -+ -+/* -+ * returns 1 if out extent is enough to fill needed space -+ */ -+int mb_make_backward_extent(struct ext3_free_extent *in, -+ struct ext3_free_extent *out, int needed) -+{ -+ int i; -+ -+ J_ASSERT(in); -+ J_ASSERT(out); -+ J_ASSERT(in->fe_nums < MB_ARR_SIZE); -+ -+ out->fe_len = 0; -+ out->fe_start = in->fe_start + in->fe_len; -+ out->fe_nums = 0; -+ -+ /* for single-chunk extent we need not back order -+ * also, if an extent doesn't fill needed space -+ * then it makes no sense to try back order becase -+ * if we select this extent then it'll be use as is */ -+ if (in->fe_nums < 2 || in->fe_len < needed) -+ return 0; -+ -+ i = in->fe_nums - 1; -+ while (i >= 0 && out->fe_len < needed) { -+ out->fe_len += (1 << in->fe_orders[i]); -+ out->fe_start -= (1 << in->fe_orders[i]); -+ i--; -+ } -+ /* FIXME: in some situation fe_orders may be too small to hold -+ * all the buddies */ -+ J_ASSERT(out->fe_len >= needed); -+ -+ for (i++; i < in->fe_nums; i++) -+ out->fe_orders[out->fe_nums++] = in->fe_orders[i]; -+ J_ASSERT(out->fe_nums < MB_ARR_SIZE); -+ out->fe_back = 1; -+ -+ return 1; -+} -+ -+int mb_find_extent(struct ext3_buddy *e3b, int order, int block, -+ int needed, struct ext3_free_extent *ex) -+{ -+ int space = needed; -+ int next, max, ord; -+ void *buddy; -+ -+ J_ASSERT(ex != NULL); -+ -+ ex->fe_nums = 0; -+ ex->fe_len = 0; -+ -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); -+ J_ASSERT(block < max); -+ if (!mb_test_bit(block, buddy)) -+ goto nofree; -+ -+ if (order == 0) { -+ /* find actual order */ -+ order = mb_find_order_for_block(e3b, block); -+ block = block >> order; -+ } -+ -+ 
ex->fe_orders[ex->fe_nums++] = order; -+ ex->fe_len = 1 << order; -+ ex->fe_start = block << order; -+ ex->fe_back = 0; -+ -+ while ((space = space - (1 << order)) > 0) { -+ -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); -+ -+ if (block + 1 >= max) -+ break; -+ -+ next = (block + 1) * (1 << order); -+ if (!mb_test_bit(next, e3b->bd_bitmap)) -+ break; -+ -+ ord = mb_find_order_for_block(e3b, next); -+ -+ if ((1 << ord) >= needed) { -+ /* we dont want to coalesce with self-enough buddies */ -+ break; -+ } -+ order = ord; -+ block = next >> order; -+ ex->fe_len += 1 << order; -+ -+ if (ex->fe_nums < MB_ARR_SIZE) -+ ex->fe_orders[ex->fe_nums++] = order; -+ } -+ -+nofree: -+ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); -+ return ex->fe_len; -+} -+ -+static int mb_mark_used_backward(struct ext3_buddy *e3b, -+ struct ext3_free_extent *ex, int len) -+{ -+ int start = ex->fe_start, len0 = len; -+ int ord, mlen, max, cur; -+ void *buddy; -+ -+ start = ex->fe_start + ex->fe_len - 1; -+ while (len) { -+ ord = mb_find_order_for_block(e3b, start); -+ if (((start >> ord) << ord) == (start - (1 << ord) + 1) && -+ len >= (1 << ord)) { -+ /* the whole chunk may be allocated at once! 
*/ -+ mlen = 1 << ord; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ J_ASSERT((start >> ord) < max); -+ mb_clear_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; -+ start -= mlen; -+ len -= mlen; -+ J_ASSERT(len >= 0); -+ J_ASSERT(start >= 0); -+ continue; -+ } -+ -+ /* we have to split large buddy */ -+ J_ASSERT(ord > 0); -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_clear_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; -+ -+ ord--; -+ cur = (start >> ord) & ~1U; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_set_bit(cur, buddy); -+ mb_set_bit(cur + 1, buddy); -+ e3b->bd_bd->bb_counters[ord]++; -+ e3b->bd_bd->bb_counters[ord]++; -+ } -+ -+ /* now drop all the bits in bitmap */ -+ mb_clear_bits(e3b->bd_bitmap, ex->fe_start + ex->fe_len - len0, len0); -+ -+ mb_check_buddy(e3b); -+ -+ return 0; -+} -+ -+static int mb_mark_used_forward(struct ext3_buddy *e3b, -+ struct ext3_free_extent *ex, int len) -+{ -+ int start = ex->fe_start, len0 = len; -+ int ord, mlen, max, cur; -+ void *buddy; -+ -+ while (len) { -+ ord = mb_find_order_for_block(e3b, start); -+ -+ if (((start >> ord) << ord) == start && len >= (1 << ord)) { -+ /* the whole chunk may be allocated at once! 
*/ -+ mlen = 1 << ord; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ J_ASSERT((start >> ord) < max); -+ mb_clear_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; -+ start += mlen; -+ len -= mlen; -+ J_ASSERT(len >= 0); -+ continue; -+ } -+ -+ /* we have to split large buddy */ -+ J_ASSERT(ord > 0); -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_clear_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; -+ -+ ord--; -+ cur = (start >> ord) & ~1U; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_set_bit(cur, buddy); -+ mb_set_bit(cur + 1, buddy); -+ e3b->bd_bd->bb_counters[ord]++; -+ e3b->bd_bd->bb_counters[ord]++; -+ } -+ -+ /* now drop all the bits in bitmap */ -+ mb_clear_bits(e3b->bd_bitmap, ex->fe_start, len0); -+ -+ mb_check_buddy(e3b); -+ -+ return 0; -+} -+ -+int inline mb_mark_used(struct ext3_buddy *e3b, -+ struct ext3_free_extent *ex, int len) -+{ -+ int err; -+ -+ J_ASSERT(ex); -+ if (ex->fe_back == 0) -+ err = mb_mark_used_forward(e3b, ex, len); -+ else -+ err = mb_mark_used_backward(e3b, ex, len); -+ return err; -+} -+ -+int ext3_mb_new_in_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b, int group) -+{ -+ struct super_block *sb = ac->ac_sb; -+ int err, gorder, max, i; -+ struct ext3_free_extent curex; -+ -+ /* let's know order of allocation */ -+ gorder = 0; -+ while (ac->ac_g_len > (1 << gorder)) -+ gorder++; -+ -+ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) { -+ /* someone asks for space at this specified block -+ * probably he wants to merge it into existing extent */ -+ if (mb_test_bit(ac->ac_g_start, e3b->bd_bitmap)) { -+ /* good. 
at least one block is free */ -+ max = mb_find_extent(e3b, 0, ac->ac_g_start, -+ ac->ac_g_len, &curex); -+ max = min(curex.fe_len, ac->ac_g_len); -+ mb_mark_used(e3b, &curex, max); -+ -+ ac->ac_b_group = group; -+ ac->ac_b_start = curex.fe_start; -+ ac->ac_b_len = max; -+ ac->ac_status = AC_STATUS_FOUND; -+ err = 0; -+ goto out; -+ } -+ /* don't try to find goal anymore */ -+ ac->ac_g_flags &= ~1; -+ } -+ -+ i = 0; -+ while (1) { -+ i = find_next_bit(e3b->bd_bitmap, sb->s_blocksize * 8, i); -+ if (i >= sb->s_blocksize * 8) -+ break; -+ -+ max = mb_find_extent(e3b, 0, i, ac->ac_g_len, &curex); -+ if (max >= ac->ac_g_len) { -+ max = min(curex.fe_len, ac->ac_g_len); -+ mb_mark_used(e3b, &curex, max); -+ -+ ac->ac_b_group = group; -+ ac->ac_b_start = curex.fe_start; -+ ac->ac_b_len = max; -+ ac->ac_status = AC_STATUS_FOUND; -+ break; -+ } -+ i += max; -+ } -+ -+ return 0; -+ -+out: -+ return err; -+} -+ -+int mb_good_group(struct ext3_allocation_context *ac, int group, int cr) -+{ -+ struct ext3_group_desc *gdp; -+ int free_blocks; -+ -+ gdp = ext3_get_group_desc(ac->ac_sb, group, NULL); -+ if (!gdp) -+ return 0; -+ free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); -+ if (free_blocks == 0) -+ return 0; -+ -+ /* someone wants this block very much */ -+ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) -+ return 1; -+ -+ /* FIXME: I'd like to take fragmentation into account here */ -+ if (cr == 0) { -+ if (free_blocks >= ac->ac_g_len >> 1) -+ return 1; -+ } else if (cr == 1) { -+ if (free_blocks >= ac->ac_g_len >> 2) -+ return 1; -+ } else if (cr == 2) { -+ return 1; -+ } else { -+ BUG(); -+ } -+ return 0; -+} -+ -+int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, -+ unsigned long goal, int *len, int flags, int *errp) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_allocation_context ac; -+ int i, group, block, cr, err = 0; -+ struct ext3_group_desc *gdp; -+ struct ext3_super_block *es; -+ struct buffer_head *gdp_bh; -+ struct ext3_sb_info 
*sbi; -+ struct super_block *sb; -+ struct ext3_buddy e3b; -+ -+ J_ASSERT(len != NULL); -+ J_ASSERT(*len > 0); -+ -+ sb = inode->i_sb; -+ if (!sb) { -+ printk("ext3_mb_new_nblocks: nonexistent device"); -+ return 0; -+ } -+ -+ if (!test_opt(sb, MBALLOC)) { -+ static int ext3_mballoc_warning = 0; -+ if (ext3_mballoc_warning == 0) { -+ printk(KERN_ERR "EXT3-fs: multiblock request with " -+ "mballoc disabled!\n"); -+ ext3_mballoc_warning++; -+ } -+ *len = 1; -+ err = ext3_new_block_old(handle, inode, goal, NULL,NULL, errp); -+ return err; -+ } -+ -+ ext3_mb_poll_new_transaction(sb, handle); -+ -+ sbi = EXT3_SB(sb); -+ es = EXT3_SB(sb)->s_es; -+ -+ if (!(flags & 2)) { -+ /* someone asks for non-reserved blocks */ -+ BUG_ON(*len > 1); -+ err = ext3_mb_reserve_blocks(sb, 1); -+ if (err) { -+ *errp = err; -+ return 0; -+ } -+ } -+ -+ /* -+ * Check quota for allocation of this blocks. -+ */ -+ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) -+ *len -= 1; -+ if (*len == 0) { -+ *errp = -EDQUOT; -+ block = 0; -+ goto out; -+ } -+ -+ /* start searching from the goal */ -+ if (goal < le32_to_cpu(es->s_first_data_block) || -+ goal >= le32_to_cpu(es->s_blocks_count)) -+ goal = le32_to_cpu(es->s_first_data_block); -+ group = (goal - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ block = ((goal - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb)); -+ -+ /* set up allocation goals */ -+ ac.ac_b_group = ac.ac_b_start = ac.ac_b_len = 0; -+ ac.ac_status = 0; -+ ac.ac_groups_scanned = 0; -+ ac.ac_sb = inode->i_sb; -+ ac.ac_g_group = group; -+ ac.ac_g_start = block; -+ ac.ac_g_len = *len; -+ ac.ac_g_flags = flags; -+ -+ /* loop over the groups */ -+ for (cr = 0; cr < 3 && ac.ac_status != AC_STATUS_FOUND; cr++) { -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { -+ if (group == EXT3_SB(sb)->s_groups_count) -+ group = 0; -+ -+ /* check is group good for our criteries */ -+ if (!mb_good_group(&ac, group, cr)) -+ continue; -+ -+ err = 
ext3_mb_load_desc(ac.ac_sb, group, &e3b); -+ if (err) -+ goto out_err; -+ -+ ext3_lock_group(sb, group); -+ if (!mb_good_group(&ac, group, cr)) { -+ /* someone did allocation from this group */ -+ ext3_unlock_group(sb, group); -+ ext3_mb_release_desc(&e3b); -+ continue; -+ } -+ -+ err = ext3_mb_new_in_group(&ac, &e3b, group); -+ ext3_unlock_group(sb, group); -+ if (ac.ac_status == AC_STATUS_FOUND) -+ ext3_mb_dirty_buddy(&e3b); -+ ext3_mb_release_desc(&e3b); -+ if (err) -+ goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) -+ break; -+ } -+ } -+ -+ if (ac.ac_status != AC_STATUS_FOUND) { -+ /* unfortunately, we can't satisfy this request */ -+ J_ASSERT(ac.ac_b_len == 0); -+ DQUOT_FREE_BLOCK(inode, *len); -+ *errp = -ENOSPC; -+ block = 0; -+ goto out; -+ } -+ -+ /* good news - free block(s) have been found. now it's time -+ * to mark block(s) in good old journaled bitmap */ -+ block = ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + ac.ac_b_start + le32_to_cpu(es->s_first_data_block); -+ -+ /* we made a desicion, now mark found blocks in good old -+ * bitmap to be journaled */ -+ -+ ext3_debug("using block group %d(%d)\n", -+ ac.ac_b_group.group, gdp->bg_free_blocks_count); -+ -+ bitmap_bh = read_block_bitmap_bh(sb, ac.ac_b_group); -+ if (!bitmap_bh) { -+ *errp = -EIO; -+ goto out_err; -+ } -+ -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) { -+ *errp = err; -+ goto out_err; -+ } -+ -+ gdp = ext3_get_group_desc(sb, ac.ac_b_group, &gdp_bh); -+ if (!gdp) { -+ *errp = -EIO; -+ goto out_err; -+ } -+ -+ err = ext3_journal_get_write_access(handle, gdp_bh); -+ if (err) -+ goto out_err; -+ -+ block = ac.ac_b_start + ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + le32_to_cpu(es->s_first_data_block); -+ -+ if (block == le32_to_cpu(gdp->bg_block_bitmap) || -+ block == le32_to_cpu(gdp->bg_inode_bitmap) || -+ in_range(block, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group)) -+ ext3_error(sb, "ext3_new_block", -+ "Allocating block in system 
zone - " -+ "block = %u", block); -+#if 0 -+ for (i = 0; i < ac.ac_b_len; i++) -+ J_ASSERT(!mb_test_bit(ac.ac_b_start + i, bitmap_bh->b_data)); -+#endif -+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_start, ac.ac_b_len); -+ -+ ext3_lock_group(sb, ac.ac_b_group); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - -+ ac.ac_b_len); -+ ext3_unlock_group(sb, ac.ac_b_group); -+ spin_lock(&sbi->s_md_lock); -+ es->s_free_blocks_count = -+ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - ac.ac_b_len); -+ spin_unlock(&sbi->s_md_lock); -+ -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ if (err) -+ goto out_err; -+ err = ext3_journal_dirty_metadata(handle, gdp_bh); -+ if (err) -+ goto out_err; -+ -+ sb->s_dirt = 1; -+ *errp = 0; -+ -+ /* drop non-allocated, but dquote'd blocks */ -+ J_ASSERT(*len >= ac.ac_b_len); -+ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_len); -+ -+ *len = ac.ac_b_len; -+ J_ASSERT(block != 0); -+ goto out; -+ -+out_err: -+ /* if we've already allocated something, roll it back */ -+ if (ac.ac_status == AC_STATUS_FOUND) { -+ /* FIXME: free blocks here */ -+ } -+ -+ DQUOT_FREE_BLOCK(inode, *len); -+ *errp = err; -+ block = 0; -+out: -+ if (!(flags & 2)) { -+ /* block wasn't reserved before and we reserved it -+ * at the beginning of allocation. it doesn't matter -+ * whether we allocated anything or we failed: time -+ * to release reservation. 
NOTE: because I expect -+ * any multiblock request from delayed allocation -+ * path only, here is single block always */ -+ ext3_mb_release_blocks(sb, 1); -+ } -+ return block; -+} -+ -+int ext3_mb_generate_buddy(struct super_block *sb, int group) -+{ -+ struct buffer_head *bh; -+ int i, err, count = 0; -+ struct ext3_buddy e3b; -+ -+ err = ext3_mb_load_desc(sb, group, &e3b); -+ if (err) -+ goto out; -+ memset(e3b.bd_bh->b_data, 0, sb->s_blocksize); -+ memset(e3b.bd_bh2->b_data, 0, sb->s_blocksize); -+ -+ bh = read_block_bitmap_bh(sb, group); -+ if (bh == NULL) { -+ err = -EIO; -+ goto out2; -+ } -+ -+ /* loop over the blocks, nad create buddies for free ones */ -+ for (i = 0; i < sb->s_blocksize * 8; i++) { -+ if (!mb_test_bit(i, (void *) bh->b_data)) { -+ mb_free_blocks(&e3b, i, 1); -+ count++; -+ } -+ } -+ mb_check_buddy(&e3b); -+ ext3_mb_dirty_buddy(&e3b); -+ -+out2: -+ ext3_mb_release_desc(&e3b); -+out: -+ return err; -+} -+ -+EXPORT_SYMBOL(ext3_mb_new_blocks); -+ -+#define MB_CREDITS \ -+ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS) -+ -+int ext3_mb_init_backend(struct super_block *sb) -+{ -+ struct inode *root = sb->s_root->d_inode; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct dentry *db; -+ tid_t target; -+ int err, i; -+ -+ sbi->s_buddy_blocks = kmalloc(sizeof(struct ext3_buddy_group_blocks *) * -+ sbi->s_groups_count, GFP_KERNEL); -+ if (sbi->s_buddy_blocks == NULL) { -+ printk("EXT3-fs: can't allocate mem for buddy maps\n"); -+ return -ENOMEM; -+ } -+ memset(sbi->s_buddy_blocks, 0, -+ sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count); -+ sbi->s_buddy = NULL; -+ -+ down(&root->i_sem); -+ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, -+ strlen(EXT3_BUDDY_FILE)); -+ if (IS_ERR(db)) { -+ err = PTR_ERR(db); -+ printk("EXT3-fs: can't lookup buddy file: %d\n", err); -+ goto out; -+ } -+ -+ if (db->d_inode != NULL) { -+ sbi->s_buddy = igrab(db->d_inode); -+ goto map; -+ } -+ -+ err = ext3_create(root, db, S_IFREG, NULL); 
-+ if (err) { -+ printk("error while creation buddy file: %d\n", err); -+ } else { -+ sbi->s_buddy = igrab(db->d_inode); -+ } -+ -+map: -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ struct buffer_head *bh = NULL; -+ handle_t *handle; -+ -+ sbi->s_buddy_blocks[i] = -+ kmalloc(sizeof(struct ext3_buddy_group_blocks), -+ GFP_KERNEL); -+ if (sbi->s_buddy_blocks[i] == NULL) { -+ printk("EXT3-fs: can't allocate mem for buddy\n"); -+ err = -ENOMEM; -+ goto out2; -+ } -+ -+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); -+ if (IS_ERR(handle)) { -+ err = PTR_ERR(handle); -+ goto out2; -+ } -+ -+ /* allocate block for bitmap */ -+ bh = ext3_getblk(handle, sbi->s_buddy, i * 2, 1, &err); -+ if (bh == NULL) { -+ printk("can't get block for buddy bitmap: %d\n", err); -+ goto out2; -+ } -+ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; -+ brelse(bh); -+ -+ /* allocate block for buddy */ -+ bh = ext3_getblk(handle, sbi->s_buddy, i * 2 + 1, 1, &err); -+ if (bh == NULL) { -+ printk("can't get block for buddy: %d\n", err); -+ goto out2; -+ } -+ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; -+ brelse(bh); -+ ext3_journal_stop(handle, sbi->s_buddy); -+ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock); -+ sbi->s_buddy_blocks[i]->bb_md_cur = NULL; -+ sbi->s_buddy_blocks[i]->bb_tid = 0; -+ } -+ -+ if ((target = log_start_commit(sbi->s_journal, NULL))) -+ log_wait_commit(sbi->s_journal, target); -+ -+out2: -+ dput(db); -+out: -+ up(&root->i_sem); -+ return err; -+} -+ -+int ext3_mb_release(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ /* release freed, non-committed blocks */ -+ spin_lock(&sbi->s_md_lock); -+ list_splice_init(&sbi->s_closed_transaction, -+ &sbi->s_committed_transaction); -+ list_splice_init(&sbi->s_active_transaction, -+ &sbi->s_committed_transaction); -+ spin_unlock(&sbi->s_md_lock); -+ ext3_mb_free_committed_blocks(sb); -+ -+ if (sbi->s_buddy_blocks) { -+ for (i = 
0; i < sbi->s_groups_count; i++) -+ if (sbi->s_buddy_blocks[i]) -+ kfree(sbi->s_buddy_blocks[i]); -+ kfree(sbi->s_buddy_blocks); -+ } -+ if (sbi->s_buddy) -+ iput(sbi->s_buddy); -+ if (sbi->s_blocks_reserved) -+ printk("ext3-fs: %ld blocks being reserved at umount!\n", -+ sbi->s_blocks_reserved); -+ return 0; -+} -+ -+int ext3_mb_init(struct super_block *sb) -+{ -+ struct ext3_super_block *es; -+ int i; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ /* init file for buddy data */ -+ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ ext3_mb_init_backend(sb); -+ -+ es = EXT3_SB(sb)->s_es; -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ ext3_mb_generate_buddy(sb, i); -+ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); -+ spin_lock_init(&EXT3_SB(sb)->s_md_lock); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); -+ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ printk("EXT3-fs: mballoc enabled\n"); -+ return 0; -+} -+ -+void ext3_mb_free_committed_blocks(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int err, i, count = 0, count2 = 0; -+ struct ext3_free_metadata *md; -+ struct ext3_buddy e3b; -+ -+ if (list_empty(&sbi->s_committed_transaction)) -+ return; -+ -+ /* there is committed blocks to be freed yet */ -+ do { -+ /* get next array of blocks */ -+ md = NULL; -+ spin_lock(&sbi->s_md_lock); -+ if (!list_empty(&sbi->s_committed_transaction)) { -+ md = list_entry(sbi->s_committed_transaction.next, -+ struct ext3_free_metadata, list); -+ list_del(&md->list); -+ } -+ spin_unlock(&sbi->s_md_lock); -+ -+ if (md == NULL) -+ break; -+ -+ mb_debug("gonna free %u blocks in group %u (0x%p):", -+ md->num, md->group, md); -+ -+ err = ext3_mb_load_desc(sb, md->group, &e3b); -+ BUG_ON(err != 0); -+ -+ /* there are blocks to put in buddy to make them really free */ -+ count += md->num; -+ count2++; -+ ext3_lock_group(sb, 
md->group); -+ for (i = 0; i < md->num; i++) { -+ mb_debug(" %u", md->blocks[i]); -+ mb_free_blocks(&e3b, md->blocks[i], 1); -+ } -+ mb_debug("\n"); -+ ext3_unlock_group(sb, md->group); -+ -+ kfree(md); -+ ext3_mb_dirty_buddy(&e3b); -+ ext3_mb_release_desc(&e3b); -+ -+ } while (md); -+ mb_debug("freed %u blocks in %u structures\n", count, count2); -+} -+ -+void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ if (sbi->s_last_transaction == handle->h_transaction->t_tid) -+ return; -+ -+ /* new transaction! time to close last one and free blocks for -+ * committed transaction. we know that only transaction can be -+ * active, so previos transaction can be being logged and we -+ * know that transaction before previous is known to be alreade -+ * logged. this means that now we may free blocks freed in all -+ * transactions before previous one. hope I'm clear enough ... */ -+ -+ spin_lock(&sbi->s_md_lock); -+ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { -+ mb_debug("new transaction %lu, old %lu\n", -+ (unsigned long) handle->h_transaction->t_tid, -+ (unsigned long) sbi->s_last_transaction); -+ list_splice_init(&sbi->s_closed_transaction, -+ &sbi->s_committed_transaction); -+ list_splice_init(&sbi->s_active_transaction, -+ &sbi->s_closed_transaction); -+ sbi->s_last_transaction = handle->h_transaction->t_tid; -+ } -+ spin_unlock(&sbi->s_md_lock); -+ -+ ext3_mb_free_committed_blocks(sb); -+} -+ -+int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, -+ int group, int block, int count) -+{ -+ struct ext3_buddy_group_blocks *db = e3b->bd_bd; -+ struct super_block *sb = e3b->bd_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_free_metadata *md; -+ int i; -+ -+ ext3_lock_group(sb, group); -+ for (i = 0; i < count; i++) { -+ md = db->bb_md_cur; -+ if (md && db->bb_tid != handle->h_transaction->t_tid) { -+ db->bb_md_cur = NULL; -+ md = NULL; -+ } -+ -+ if (md 
== NULL) { -+ ext3_unlock_group(sb, group); -+ md = kmalloc(sizeof(*md), GFP_KERNEL); -+ if (md == NULL) -+ return -ENOMEM; -+ md->num = 0; -+ md->group = group; -+ -+ ext3_lock_group(sb, group); -+ if (db->bb_md_cur == NULL) { -+ spin_lock(&sbi->s_md_lock); -+ list_add(&md->list, &sbi->s_active_transaction); -+ spin_unlock(&sbi->s_md_lock); -+ db->bb_md_cur = md; -+ db->bb_tid = handle->h_transaction->t_tid; -+ mb_debug("new md 0x%p for group %u\n", -+ md, md->group); -+ } else { -+ kfree(md); -+ md = db->bb_md_cur; -+ } -+ } -+ -+ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); -+ md->blocks[md->num] = block + i; -+ md->num++; -+ if (md->num == EXT3_BB_MAX_BLOCKS) { -+ /* no more space, put full container on a sb's list */ -+ db->bb_md_cur = NULL; -+ } -+ } -+ ext3_unlock_group(sb, group); -+ return 0; -+} -+ -+void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, -+ unsigned long block, unsigned long count, int metadata) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_group_desc *gdp; -+ struct ext3_super_block *es; -+ unsigned long bit, overflow; -+ struct buffer_head *gd_bh; -+ unsigned long block_group; -+ struct ext3_sb_info *sbi; -+ struct super_block *sb; -+ struct ext3_buddy e3b; -+ int err = 0, ret; -+ -+ sb = inode->i_sb; -+ if (!sb) { -+ printk ("ext3_free_blocks: nonexistent device"); -+ return; -+ } -+ -+ ext3_mb_poll_new_transaction(sb, handle); -+ -+ sbi = EXT3_SB(sb); -+ es = EXT3_SB(sb)->s_es; -+ if (block < le32_to_cpu(es->s_first_data_block) || -+ block + count < block || -+ block + count > le32_to_cpu(es->s_blocks_count)) { -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks not in datazone - " -+ "block = %lu, count = %lu", block, count); -+ goto error_return; -+ } -+ -+ ext3_debug("freeing block %lu\n", block); -+ -+do_more: -+ overflow = 0; -+ block_group = (block - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ bit = (block - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb); -+ 
/* -+ * Check to see if we are freeing blocks across a group -+ * boundary. -+ */ -+ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { -+ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); -+ count -= overflow; -+ } -+ bitmap_bh = read_block_bitmap_bh(sb, block_group); -+ if (!bitmap_bh) -+ goto error_return; -+ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); -+ if (!gdp) -+ goto error_return; -+ -+ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || -+ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || -+ in_range (block, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group) || -+ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group)) -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks in system zones - " -+ "Block = %lu, count = %lu", -+ block, count); -+ -+ BUFFER_TRACE(bitmap_bh, "getting write access"); -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) -+ goto error_return; -+ -+ /* -+ * We are about to modify some metadata. Call the journal APIs -+ * to unshare ->b_data if a currently-committing transaction is -+ * using it -+ */ -+ BUFFER_TRACE(gd_bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, gd_bh); -+ if (err) -+ goto error_return; -+ -+ err = ext3_mb_load_desc(sb, block_group, &e3b); -+ if (err) -+ goto error_return; -+ -+ if (metadata) { -+ /* blocks being freed are metadata. 
these blocks shouldn't -+ * be used until this transaction is committed */ -+ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); -+ } else { -+ ext3_lock_group(sb, block_group); -+ mb_free_blocks(&e3b, bit, count); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); -+ ext3_unlock_group(sb, block_group); -+ spin_lock(&sbi->s_md_lock); -+ es->s_free_blocks_count = -+ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) + count); -+ spin_unlock(&sbi->s_md_lock); -+ } -+ -+ ext3_mb_dirty_buddy(&e3b); -+ ext3_mb_release_desc(&e3b); -+ -+ /* FIXME: undo logic will be implemented later and another way */ -+ mb_clear_bits(bitmap_bh->b_data, bit, count); -+ DQUOT_FREE_BLOCK(inode, count); -+ -+ /* We dirtied the bitmap block */ -+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ -+ /* And the group descriptor block */ -+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); -+ ret = ext3_journal_dirty_metadata(handle, gd_bh); -+ if (!err) err = ret; -+ -+ if (overflow && !err) { -+ block += count; -+ count = overflow; -+ goto do_more; -+ } -+ sb->s_dirt = 1; -+error_return: -+ ext3_std_error(sb, err); -+ return; -+} -+ -+int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_super_block *es; -+ int free, ret = -ENOSPC; -+ -+ BUG_ON(blocks < 0); -+ es = EXT3_SB(sb)->s_es; -+ spin_lock(&sbi->s_reserve_lock); -+ free = le32_to_cpu(es->s_free_blocks_count); -+ if (blocks <= free - sbi->s_blocks_reserved) { -+ sbi->s_blocks_reserved += blocks; -+ ret = 0; -+ } -+ spin_unlock(&sbi->s_reserve_lock); -+ return ret; -+} -+ -+void ext3_mb_release_blocks(struct super_block *sb, int blocks) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ BUG_ON(blocks < 0); -+ spin_lock(&sbi->s_reserve_lock); -+ sbi->s_blocks_reserved -= blocks; -+ if (sbi->s_blocks_reserved < 0) -+ printk("EXT3-fs: reserve leak 
%ld\n", sbi->s_blocks_reserved); -+ if (sbi->s_blocks_reserved < 0) -+ sbi->s_blocks_reserved = 0; -+ spin_unlock(&sbi->s_reserve_lock); -+} -+ -+int ext3_new_block(handle_t *handle, struct inode *inode, -+ unsigned long goal, u32 *pc, u32 *pb, int *errp) -+{ -+ int ret, len; -+ -+ if (!test_opt(inode->i_sb, MBALLOC)) { -+ ret = ext3_new_block_old(handle, inode, goal, pc, pb, errp); -+ goto out; -+ } -+ len = 1; -+ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); -+out: -+ return ret; -+} -+ -+ -+void ext3_free_blocks(handle_t *handle, struct inode * inode, -+ unsigned long block, unsigned long count, int metadata) -+{ -+ if (!test_opt(inode->i_sb, MBALLOC)) -+ ext3_free_blocks_old(handle, inode, block, count); -+ else -+ ext3_mb_free_blocks(handle, inode, block, count, metadata); -+ return; -+} -+ -Index: linux-2.4.20-rh-20.9/fs/ext3/super.c -=================================================================== ---- linux-2.4.20-rh-20.9.orig/fs/ext3/super.c 2004-10-15 20:43:32.000000000 +0400 -+++ linux-2.4.20-rh-20.9/fs/ext3/super.c 2004-10-15 20:57:33.000000000 +0400 -@@ -622,6 +622,7 @@ - kdev_t j_dev = sbi->s_journal->j_dev; - int i; - -+ ext3_mb_release(sb); - J_ASSERT(sbi->s_delete_inodes == 0); - ext3_ext_release(sb); - ext3_xattr_put_super(sb); -@@ -877,6 +878,8 @@ - else if (want_numeric(value, "journal", inum)) - return 0; - } -+ else if (!strcmp (this_char, "mballoc")) -+ set_opt (*mount_options, MBALLOC); - else if (!strcmp (this_char, "noload")) - set_opt (*mount_options, NOLOAD); - else if (!strcmp (this_char, "data")) { -@@ -1506,6 +1509,7 @@ - } - - ext3_ext_init(sb); -+ ext3_mb_init(sb); - - return sb; - -Index: linux-2.4.20-rh-20.9/fs/ext3/Makefile -=================================================================== ---- linux-2.4.20-rh-20.9.orig/fs/ext3/Makefile 2004-10-15 20:43:32.000000000 +0400 -+++ linux-2.4.20-rh-20.9/fs/ext3/Makefile 2004-10-15 22:00:29.000000000 +0400 -@@ -13,8 +13,8 @@ - - obj-y := balloc.o bitmap.o dir.o 
file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o \ -- xattr_trusted.o extents.o --export-objs += extents.o -+ xattr_trusted.o extents.o mballoc.o -+export-objs += extents.o mballoc.o - - obj-m := $(O_TARGET) - -Index: linux-2.4.20-rh-20.9/fs/ext3/balloc.c -=================================================================== ---- linux-2.4.20-rh-20.9.orig/fs/ext3/balloc.c 2004-10-15 20:43:28.000000000 +0400 -+++ linux-2.4.20-rh-20.9/fs/ext3/balloc.c 2004-10-15 20:57:33.000000000 +0400 -@@ -203,8 +203,7 @@ - * differentiating between a group for which we have never performed a bitmap - * IO request, and a group for which the last bitmap read request failed. - */ --static inline int load_block_bitmap (struct super_block * sb, -- unsigned int block_group) -+int load_block_bitmap (struct super_block * sb, unsigned int block_group) - { - int slot; - -@@ -253,8 +252,8 @@ - } - - /* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks (handle_t *handle, struct inode * inode, -- unsigned long block, unsigned long count) -+void ext3_free_blocks_old (handle_t *handle, struct inode * inode, -+ unsigned long block, unsigned long count) - { - struct buffer_head *bitmap_bh; - struct buffer_head *gd_bh; -@@ -531,9 +530,9 @@ - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. 
- */ --int ext3_new_block (handle_t *handle, struct inode * inode, -- unsigned long goal, u32 * prealloc_count, -- u32 * prealloc_block, int * errp) -+int ext3_new_block_old (handle_t *handle, struct inode * inode, -+ unsigned long goal, u32 * prealloc_count, -+ u32 * prealloc_block, int * errp) - { - struct buffer_head * bh, *bhtmp; - struct buffer_head * bh2; -Index: linux-2.4.20-rh-20.9/fs/ext3/namei.c -=================================================================== ---- linux-2.4.20-rh-20.9.orig/fs/ext3/namei.c 2004-10-15 20:43:30.000000000 +0400 -+++ linux-2.4.20-rh-20.9/fs/ext3/namei.c 2004-10-15 20:57:33.000000000 +0400 -@@ -1877,7 +1877,7 @@ - * If the create succeeds, we fill in the inode information - * with d_instantiate(). - */ --static int ext3_create (struct inode * dir, struct dentry * dentry, int mode) -+int ext3_create (struct inode * dir, struct dentry * dentry, int mode) - { - handle_t *handle; - struct inode * inode; -Index: linux-2.4.20-rh-20.9/fs/ext3/inode.c -=================================================================== ---- linux-2.4.20-rh-20.9.orig/fs/ext3/inode.c 2004-10-15 20:43:32.000000000 +0400 -+++ linux-2.4.20-rh-20.9/fs/ext3/inode.c 2004-10-15 20:57:33.000000000 +0400 -@@ -255,7 +255,7 @@ - inode->u.ext3_i.i_prealloc_count = 0; - inode->u.ext3_i.i_prealloc_block = 0; - /* Writer: end */ -- ext3_free_blocks (inode, block, total); -+ ext3_free_blocks (inode, block, total, 1); - } - unlock_kernel(); - #endif -@@ -619,7 +619,7 @@ - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < keys; i++) -- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); - return err; - } - -@@ -723,7 +723,7 @@ - if (err == -EAGAIN) - for (i = 0; i < num; i++) - ext3_free_blocks(handle, inode, -- le32_to_cpu(where[i].key), 1); -+ le32_to_cpu(where[i].key), 1, 1); - return err; - } - -@@ -1751,7 +1751,7 @@ - } - } - -- ext3_free_blocks(handle, inode, 
block_to_free, count); -+ ext3_free_blocks(handle, inode, block_to_free, count, 1); - } - - /** -@@ -1923,7 +1923,7 @@ - ext3_journal_test_restart(handle, inode); - } - -- ext3_free_blocks(handle, inode, nr, 1); -+ ext3_free_blocks(handle, inode, nr, 1, 1); - - if (parent_bh) { - /* -Index: linux-2.4.20-rh-20.9/fs/ext3/extents.c -=================================================================== ---- linux-2.4.20-rh-20.9.orig/fs/ext3/extents.c 2004-10-15 20:43:32.000000000 +0400 -+++ linux-2.4.20-rh-20.9/fs/ext3/extents.c 2004-10-15 20:57:33.000000000 +0400 -@@ -741,7 +741,7 @@ - for (i = 0; i < depth; i++) { - if (!ablocks[i]) - continue; -- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); -+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); - } - } - kfree(ablocks); -@@ -1389,7 +1389,7 @@ - path->p_idx->ei_leaf); - bh = sb_get_hash_table(tree->inode->i_sb, path->p_idx->ei_leaf); - ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); -- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); -+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); - return err; - } - -@@ -1847,10 +1847,12 @@ - int needed = ext3_remove_blocks_credits(tree, ex, from, to); - handle_t *handle = ext3_journal_start(tree->inode, needed); - struct buffer_head *bh; -- int i; -+ int i, metadata = 0; - - if (IS_ERR(handle)) - return PTR_ERR(handle); -+ if (S_ISDIR(tree->inode->i_mode)) -+ metadata = 1; - if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { - /* tail removal */ - unsigned long num, start; -@@ -1862,7 +1864,7 @@ - bh = sb_get_hash_table(tree->inode->i_sb, start + i); - ext3_forget(handle, 0, tree->inode, bh, start + i); - } -- ext3_free_blocks(handle, tree->inode, start, num); -+ ext3_free_blocks(handle, tree->inode, start, num, metadata); - } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { - printk("strange request: removal %lu-%lu from %u:%u\n", - from, to, ex->ee_block, ex->ee_len); -Index: 
linux-2.4.20-rh-20.9/fs/ext3/xattr.c -=================================================================== ---- linux-2.4.20-rh-20.9.orig/fs/ext3/xattr.c 2004-10-15 20:43:31.000000000 +0400 -+++ linux-2.4.20-rh-20.9/fs/ext3/xattr.c 2004-10-15 20:57:33.000000000 +0400 -@@ -174,7 +174,7 @@ - ext3_xattr_free_block(handle_t *handle, struct inode * inode, - unsigned long block) - { -- ext3_free_blocks(handle, inode, block, 1); -+ ext3_free_blocks(handle, inode, block, 1, 1); - inode->i_blocks -= inode->i_sb->s_blocksize >> 9; - } - -@@ -182,7 +182,7 @@ - # define ext3_xattr_quota_free(inode) \ - DQUOT_FREE_BLOCK(inode, 1) - # define ext3_xattr_free_block(handle, inode, block) \ -- ext3_free_blocks(handle, inode, block, 1) -+ ext3_free_blocks(handle, inode, block, 1, 1) - #endif - - #if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18) -Index: linux-2.4.20-rh-20.9/include/linux/ext3_fs.h -=================================================================== ---- linux-2.4.20-rh-20.9.orig/include/linux/ext3_fs.h 2004-10-15 20:43:32.000000000 +0400 -+++ linux-2.4.20-rh-20.9/include/linux/ext3_fs.h 2004-10-15 20:57:33.000000000 +0400 -@@ -334,6 +334,7 @@ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x400000/* buddy allocation support */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H -@@ -664,7 +665,7 @@ - extern int ext3_new_block (handle_t *, struct inode *, unsigned long, - __u32 *, __u32 *, int *); - extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, -- unsigned long); -+ unsigned long, int); - extern unsigned long ext3_count_free_blocks (struct super_block *); - extern void ext3_check_blocks_bitmap (struct super_block *); - extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, -@@ -727,6 +728,13 @@ - 
extern int ext3_ioctl (struct inode *, struct file *, unsigned int, - unsigned long); - -+/* mballoc.c */ -+extern int ext3_mb_init(struct super_block *sb); -+extern int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, -+ unsigned long goal,int *len, int flags,int *errp); -+extern int ext3_mb_release(struct super_block *sb); -+extern void ext3_mb_release_blocks(struct super_block *, int); -+ - /* namei.c */ - extern int ext3_orphan_add(handle_t *, struct inode *); - extern int ext3_orphan_del(handle_t *, struct inode *); -Index: linux-2.4.20-rh-20.9/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.4.20-rh-20.9.orig/include/linux/ext3_fs_sb.h 2004-10-15 20:43:29.000000000 +0400 -+++ linux-2.4.20-rh-20.9/include/linux/ext3_fs_sb.h 2004-10-20 22:08:40.000000000 +0400 -@@ -19,6 +19,7 @@ - #ifdef __KERNEL__ - #include - #include -+#include - #endif - - /* -@@ -31,6 +32,25 @@ - - #define EXT3_DELETE_THREAD - -+#define EXT3_BB_MAX_BLOCKS 30 -+struct ext3_free_metadata { -+ unsigned short group; -+ unsigned short num; -+ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; -+ struct list_head list; -+}; -+ -+#define EXT3_BB_MAX_ORDER 14 -+ -+struct ext3_buddy_group_blocks { -+ unsigned long bb_bitmap; -+ unsigned long bb_buddy; -+ spinlock_t bb_lock; -+ unsigned bb_counters[EXT3_BB_MAX_ORDER]; -+ struct ext3_free_metadata *bb_md_cur; -+ unsigned long bb_tid; -+}; -+ - /* - * third extended-fs super-block data in memory - */ -@@ -86,6 +106,17 @@ - wait_queue_head_t s_delete_thread_queue; - wait_queue_head_t s_delete_waiter_queue; - #endif -+ -+ /* for buddy allocator */ -+ struct ext3_buddy_group_blocks **s_buddy_blocks; -+ struct inode *s_buddy; -+ long s_blocks_reserved; -+ spinlock_t s_reserve_lock; -+ struct list_head s_active_transaction; -+ struct list_head s_closed_transaction; -+ struct list_head s_committed_transaction; -+ spinlock_t s_md_lock; -+ unsigned int s_last_transaction; - }; - - #endif /* 
_LINUX_EXT3_FS_SB */ diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch index 31e7e38..bb9928a 100644 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch @@ -1,71 +1,8 @@ -Index: linux-2.6.5-7.201/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.6.5-7.201.orig/include/linux/ext3_fs_sb.h 2005-10-14 08:59:35.000000000 +0400 -+++ linux-2.6.5-7.201/include/linux/ext3_fs_sb.h 2005-10-14 08:59:39.000000000 +0400 -@@ -23,10 +23,30 @@ - #define EXT_INCLUDE - #include - #include -+#include - #endif - #endif - #include - -+#define EXT3_BB_MAX_BLOCKS 30 -+struct ext3_free_metadata { -+ unsigned short group; -+ unsigned short num; -+ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; -+ struct list_head list; -+}; -+ -+struct ext3_buddy_group_blocks { -+ __u32 bb_bitmap; -+ __u32 bb_buddy; -+ spinlock_t bb_lock; -+ unsigned long bb_tid; -+ struct ext3_free_metadata *bb_md_cur; -+ unsigned short bb_first_free; -+ unsigned short bb_free; -+ unsigned bb_counters[]; -+}; -+ - /* - * third extended-fs super-block data in memory - */ -@@ -78,6 +98,27 @@ - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ - #endif -+ -+ /* for buddy allocator */ -+ struct ext3_buddy_group_blocks **s_buddy_blocks; -+ struct inode *s_buddy; -+ long s_blocks_reserved; -+ spinlock_t s_reserve_lock; -+ struct list_head s_active_transaction; -+ struct list_head s_closed_transaction; -+ struct list_head s_committed_transaction; -+ spinlock_t s_md_lock; -+ tid_t s_last_transaction; -+ int s_mb_factor; -+ -+ /* stats for buddy allocator */ -+ spinlock_t s_bal_lock; -+ unsigned long s_bal_reqs; /* number of reqs with len > 1 */ -+ unsigned long s_bal_success; /* we found long enough 
chunks */ -+ unsigned long s_bal_allocated; /* in blocks */ -+ unsigned long s_bal_ex_scanned; /* total extents scanned */ -+ unsigned long s_bal_goals; /* goal hits */ -+ unsigned long s_bal_breaks; /* too long searches */ - }; - - #endif /* _LINUX_EXT3_FS_SB */ Index: linux-2.6.5-7.201/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.5-7.201.orig/include/linux/ext3_fs.h 2005-10-14 08:59:38.000000000 +0400 -+++ linux-2.6.5-7.201/include/linux/ext3_fs.h 2005-10-14 09:02:36.000000000 +0400 -@@ -57,6 +57,14 @@ +--- linux-2.6.5-7.201.orig/include/linux/ext3_fs.h 2005-12-17 02:53:30.000000000 +0300 ++++ linux-2.6.5-7.201/include/linux/ext3_fs.h 2005-12-17 03:13:38.000000000 +0300 +@@ -57,6 +57,14 @@ struct statfs; #define ext3_debug(f, a...) do {} while (0) #endif @@ -80,29 +17,30 @@ Index: linux-2.6.5-7.201/include/linux/ext3_fs.h /* * Special inodes numbers */ -@@ -339,6 +347,7 @@ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */ +@@ -339,6 +347,7 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef clear_opt -@@ -700,7 +709,7 @@ +@@ -700,7 +709,9 @@ extern int ext3_bg_has_super(struct supe extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, - unsigned long); + unsigned long, int); ++extern void ext3_free_blocks_old (handle_t 
*, struct inode *, unsigned long, ++ unsigned long); extern unsigned long ext3_count_free_blocks (struct super_block *); extern void ext3_check_blocks_bitmap (struct super_block *); extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, -@@ -822,6 +831,44 @@ +@@ -822,6 +833,17 @@ extern void ext3_extents_initialize_bloc extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); +/* mballoc.c */ -+extern long ext3_mb_aggressive; +extern long ext3_mb_stats; +extern long ext3_mb_max_to_scan; +extern int ext3_mb_init(struct super_block *, int); @@ -110,74 +48,146 @@ Index: linux-2.6.5-7.201/include/linux/ext3_fs.h +extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); +extern int ext3_mb_reserve_blocks(struct super_block *, int); +extern void ext3_mb_release_blocks(struct super_block *, int); -+ -+/* writeback.c */ -+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); -+extern int ext3_wb_prepare_write(struct file *file, struct page *page, -+ unsigned from, unsigned to); -+extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); -+extern int ext3_wb_writepage(struct page *, struct writeback_control *); -+extern int ext3_wb_invalidatepage(struct page *, unsigned long); -+extern int ext3_wb_releasepage(struct page *, int); -+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); -+extern void ext3_wb_init(struct super_block *); -+extern void ext3_wb_release(struct super_block *); -+ -+/* writeback.c */ -+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); -+extern int ext3_wb_prepare_write(struct file *file, struct page *page, -+ unsigned from, unsigned to); -+extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); -+extern int ext3_wb_writepage(struct page *, struct writeback_control *); -+extern int 
ext3_wb_invalidatepage(struct page *, unsigned long); -+extern int ext3_wb_releasepage(struct page *, int); -+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); -+extern void ext3_wb_init(struct super_block *); -+extern void ext3_wb_release(struct super_block *); -+ -+/* proc.c */ -+extern int init_ext3_proc(void); -+extern void exit_ext3_proc(void); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); + #endif /* __KERNEL__ */ #define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) -Index: linux-2.6.5-7.201/fs/ext3/balloc.c +Index: linux-2.6.5-7.201/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/balloc.c 2005-10-11 00:12:45.000000000 +0400 -+++ linux-2.6.5-7.201/fs/ext3/balloc.c 2005-10-14 08:59:39.000000000 +0400 -@@ -78,7 +78,7 @@ - * - * Return buffer_head on success or NULL in case of failure. - */ --static struct buffer_head * -+struct buffer_head * - read_block_bitmap(struct super_block *sb, unsigned int block_group) +--- linux-2.6.5-7.201.orig/include/linux/ext3_fs_sb.h 2005-12-17 02:53:25.000000000 +0300 ++++ linux-2.6.5-7.201/include/linux/ext3_fs_sb.h 2005-12-17 03:10:23.000000000 +0300 +@@ -23,9 +23,15 @@ + #define EXT_INCLUDE + #include + #include ++#include + #endif + #endif + #include ++#include ++ ++struct ext3_buddy_group_blocks; ++struct ext3_mb_history; ++#define EXT3_BB_MAX_BLOCKS + + /* + * third extended-fs super-block data in memory +@@ -78,6 +84,38 @@ struct ext3_sb_info { + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_group_info **s_group_info; ++ struct inode *s_buddy_cache; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head 
s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ unsigned short *s_mb_offsets, *s_mb_maxs; ++ ++ /* history to debug policy */ ++ struct ext3_mb_history *s_mb_history; ++ int s_mb_history_cur; ++ int s_mb_history_max; ++ struct proc_dir_entry *s_mb_proc; ++ spinlock_t s_mb_history_lock; ++ ++ /* stats for buddy allocator */ ++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ++ atomic_t s_bal_success; /* we found long enough chunks */ ++ atomic_t s_bal_allocated; /* in blocks */ ++ atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_goals; /* goal hits */ ++ atomic_t s_bal_breaks; /* too long searches */ ++ atomic_t s_bal_2orders; /* 2^order hits */ ++ spinlock_t s_bal_lock; ++ unsigned long s_mb_buddies_generated; ++ unsigned long long s_mb_generation_time; + }; + + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-2.6.5-7.201/fs/ext3/super.c +=================================================================== +--- linux-2.6.5-7.201.orig/fs/ext3/super.c 2005-12-17 02:53:30.000000000 +0300 ++++ linux-2.6.5-7.201/fs/ext3/super.c 2005-12-17 03:10:23.000000000 +0300 +@@ -389,6 +389,7 @@ void ext3_put_super (struct super_block + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -543,7 +544,7 @@ enum { + Opt_ignore, Opt_barrier, + Opt_err, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, +- Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_extdebug, Opt_mballoc, + }; + + static match_table_t tokens = { +@@ -590,6 +591,7 @@ static match_table_t tokens = { + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL} + }; +@@ -811,6 +813,9 @@ static int parse_options (char * options + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ 
set_opt (sbi->s_mount_opt, MBALLOC); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1464,6 +1469,7 @@ static int ext3_fill_super (struct super + ext3_count_dirs(sb)); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + + return 0; + +@@ -2112,7 +2118,13 @@ static struct file_system_type ext3_fs_t + + static int __init init_ext3_fs(void) { - struct ext3_group_desc * desc; -@@ -274,7 +274,7 @@ +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2141,6 +2153,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); } - /* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks(handle_t *handle, struct inode *inode, -+void ext3_free_blocks_old(handle_t *handle, struct inode *inode, - unsigned long block, unsigned long count) - { - struct buffer_head *bitmap_bh = NULL; -@@ -1142,7 +1142,7 @@ - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. 
- */ --int ext3_new_block(handle_t *handle, struct inode *inode, -+int ext3_new_block_old(handle_t *handle, struct inode *inode, - unsigned long goal, int *errp) - { - struct buffer_head *bitmap_bh = NULL; + int ext3_prep_san_write(struct inode *inode, long *blocks, Index: linux-2.6.5-7.201/fs/ext3/extents.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/extents.c 2005-10-14 08:59:38.000000000 +0400 -+++ linux-2.6.5-7.201/fs/ext3/extents.c 2005-10-14 08:59:39.000000000 +0400 -@@ -771,7 +771,7 @@ +--- linux-2.6.5-7.201.orig/fs/ext3/extents.c 2005-12-17 02:53:29.000000000 +0300 ++++ linux-2.6.5-7.201/fs/ext3/extents.c 2005-12-17 03:10:23.000000000 +0300 +@@ -771,7 +771,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -186,7 +196,7 @@ Index: linux-2.6.5-7.201/fs/ext3/extents.c } } kfree(ablocks); -@@ -1428,7 +1428,7 @@ +@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); @@ -195,7 +205,7 @@ Index: linux-2.6.5-7.201/fs/ext3/extents.c return err; } -@@ -1913,10 +1913,12 @@ +@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; @@ -204,12 +214,12 @@ Index: linux-2.6.5-7.201/fs/ext3/extents.c if (IS_ERR(handle)) return PTR_ERR(handle); -+ if (S_ISDIR(tree->inode->i_mode)) ++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) + metadata = 1; if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail removal */ unsigned long num, start; -@@ -1928,7 +1930,7 @@ +@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t bh = sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } @@ -218,24 +228,82 @@ 
Index: linux-2.6.5-7.201/fs/ext3/extents.c } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.5-7.201/fs/ext3/namei.c +Index: linux-2.6.5-7.201/fs/ext3/inode.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/namei.c 2005-10-14 08:59:35.000000000 +0400 -+++ linux-2.6.5-7.201/fs/ext3/namei.c 2005-10-14 08:59:39.000000000 +0400 -@@ -1640,7 +1640,7 @@ - * If the create succeeds, we fill in the inode information - * with d_instantiate(). +--- linux-2.6.5-7.201.orig/fs/ext3/inode.c 2005-12-17 02:53:30.000000000 +0300 ++++ linux-2.6.5-7.201/fs/ext3/inode.c 2005-12-17 03:10:23.000000000 +0300 +@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -673,7 +673,7 @@ err_out: + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, +- le32_to_cpu(where[i].key), 1); ++ le32_to_cpu(where[i].key), 1, 1); + return err; + } + +@@ -1835,7 +1835,7 @@ ext3_clear_blocks(handle_t *handle, stru + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2006,7 +2006,7 @@ static void ext3_free_branches(handle_t + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-2.6.5-7.201/fs/ext3/balloc.c +=================================================================== +--- linux-2.6.5-7.201.orig/fs/ext3/balloc.c 2005-10-11 00:12:45.000000000 +0400 ++++ linux-2.6.5-7.201/fs/ext3/balloc.c 2005-12-17 03:10:23.000000000 +0300 +@@ -78,7 
+78,7 @@ struct ext3_group_desc * ext3_get_group_ + * + * Return buffer_head on success or NULL in case of failure. */ --static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, -+int ext3_create (struct inode * dir, struct dentry * dentry, int mode, - struct nameidata *nd) +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) { - handle_t *handle; + struct ext3_group_desc * desc; +@@ -274,7 +274,7 @@ void ext3_discard_reservation(struct ino + } + + /* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, ++void ext3_free_blocks_old(handle_t *handle, struct inode *inode, + unsigned long block, unsigned long count) + { + struct buffer_head *bitmap_bh = NULL; +@@ -1142,7 +1142,7 @@ int ext3_should_retry_alloc(struct super + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. + */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; Index: linux-2.6.5-7.201/fs/ext3/xattr.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/xattr.c 2005-10-14 08:59:36.000000000 +0400 -+++ linux-2.6.5-7.201/fs/ext3/xattr.c 2005-10-14 08:59:39.000000000 +0400 -@@ -1371,7 +1371,7 @@ +--- linux-2.6.5-7.201.orig/fs/ext3/xattr.c 2005-12-17 02:53:26.000000000 +0300 ++++ linux-2.6.5-7.201/fs/ext3/xattr.c 2005-12-17 03:10:41.000000000 +0300 +@@ -1371,7 +1371,7 @@ ext3_xattr_set_handle2(handle_t *handle, new_bh = sb_getblk(sb, block); if (!new_bh) { getblk_failed: @@ -244,7 +312,7 @@ Index: linux-2.6.5-7.201/fs/ext3/xattr.c error = -EIO; goto cleanup; } -@@ -1411,7 +1411,7 @@ +@@ -1411,7 +1411,7 @@ getblk_failed: if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { /* Free the old block. 
*/ ea_bdebug(old_bh, "freeing"); @@ -253,7 +321,7 @@ Index: linux-2.6.5-7.201/fs/ext3/xattr.c /* ext3_forget() calls bforget() for us, but we let our caller release old_bh, so we need to -@@ -1519,7 +1519,7 @@ +@@ -1519,7 +1519,7 @@ ext3_xattr_delete_inode(handle_t *handle mb_cache_entry_free(ce); ce = NULL; } @@ -262,26 +330,13 @@ Index: linux-2.6.5-7.201/fs/ext3/xattr.c get_bh(bh); ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); } else { -Index: linux-2.6.5-7.201/fs/ext3/Makefile -=================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/Makefile 2005-10-14 08:59:38.000000000 +0400 -+++ linux-2.6.5-7.201/fs/ext3/Makefile 2005-10-14 08:59:39.000000000 +0400 -@@ -5,7 +5,7 @@ - obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ -- ioctl.o namei.o super.o symlink.o hash.o extents.o -+ ioctl.o namei.o super.o symlink.o hash.o extents.o mballoc.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o Index: linux-2.6.5-7.201/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/mballoc.c 2005-10-13 19:40:57.851699336 +0400 -+++ linux-2.6.5-7.201/fs/ext3/mballoc.c 2005-10-14 09:02:36.000000000 +0400 -@@ -0,0 +1,1868 @@ +--- linux-2.6.5-7.201.orig/fs/ext3/mballoc.c 2005-12-09 13:08:53.191437750 +0300 ++++ linux-2.6.5-7.201/fs/ext3/mballoc.c 2005-12-17 03:15:04.000000000 +0300 +@@ -0,0 +1,2430 @@ +/* -+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify @@ -313,12 +368,15 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +#include +#include +#include ++#include ++#include ++#include ++#include + +/* + * TODO: -+ * - 
bitmap/buddy read-ahead (proposed by Oleg Drokin aka green) ++ * - bitmap read-ahead (proposed by Oleg Drokin aka green) + * - track min/max extents in each group for better group selection -+ * - is it worthwhile to use buddies directly if req is 2^N blocks? + * - mb_mark_used() may allocate chunk right after splitting buddy + * - special flag to advice allocator to look for requested + N blocks + * this may improve interaction between extents and mballoc @@ -328,17 +386,10 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + */ + +/* -+ * with 'ext3_mb_aggressive' set the allocator runs consistency checks over ++ * with AGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ -+long ext3_mb_aggressive = 0; -+ -+ -+/* -+ * with 'ext3_mb_stats' allocator will collect stats that will be -+ * shown at umount. The collecting costs though! -+ */ -+long ext3_mb_stats = 1; ++#define AGGRESSIVE_CHECK__ + +/* + */ @@ -350,33 +401,56 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +#endif + +/* -+ * where to save buddies structures beetween umount/mount (clean case only) ++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory ++ * and you can monitor it in /proc/fs/ext3//mb_history + */ -+#define EXT3_BUDDY_FILE ".buddy" ++#define EXT3_MB_HISTORY + +/* + * How long mballoc can look for a best extent (in found extents) + */ -+long ext3_mb_max_to_scan = 100; ++long ext3_mb_max_to_scan = 500; + +/* -+ * This structure is on-disk description of a group for mballoc ++ * How long mballoc must look for a best extent + */ -+struct ext3_mb_group_descr { -+ __u16 mgd_first_free; /* first free block in the group */ -+ __u16 mgd_free; /* number of free blocks in the group */ -+ __u16 mgd_counters[16]; /* number of free blocks by order */ -+}; ++long ext3_mb_min_to_scan = 30; + +/* -+ * This structure is header of mballoc's file ++ * with 'ext3_mb_stats' allocator will collect stats that will be ++ * shown at umount. 
The collecting costs though! + */ -+struct ext3_mb_grp_header { -+ __u32 mh_magic; ++ ++long ext3_mb_stats = 1; ++ ++#ifdef EXT3_BB_MAX_BLOCKS ++#undef EXT3_BB_MAX_BLOCKS ++#endif ++#define EXT3_BB_MAX_BLOCKS 30 ++ ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++struct ext3_group_info { ++ unsigned long bb_state; ++ unsigned long bb_tid; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned short bb_fragments; ++ unsigned short bb_counters[]; +}; + -+#define EXT3_MB_MAGIC_V1 0xbabd16fd + ++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 ++#define EXT3_GROUP_INFO_LOCKED_BIT 1 ++ ++#define EXT3_MB_GRP_NEED_INIT(grp) \ ++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) + +struct ext3_free_extent { + __u16 fe_start; @@ -397,28 +471,55 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; + __u16 ac_found; ++ __u16 ac_tail; ++ __u16 ac_buddy; + __u8 ac_status; + __u8 ac_flags; /* allocation hints */ ++ __u8 ac_criteria; + __u8 ac_repeats; ++ __u8 ac_2order; /* if request is to allocate 2^N blocks and ++ * N > 0, the field stores N, otherwise 0 */ +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 +#define AC_STATUS_BREAK 3 + ++struct ext3_mb_history { ++ struct ext3_free_extent goal; /* goal allocation */ ++ struct ext3_free_extent result; /* result allocation */ ++ __u16 found; /* how many extents have been found */ ++ __u16 groups; /* how many groups have been scanned */ ++ __u16 tail; /* what tail broke some buddy */ ++ __u16 buddy; /* buddy the tail ^^^ broke */ ++ __u8 cr; /* which phase the result extent was found at */ ++ __u8 merged; ++}; ++ +struct ext3_buddy { -+ struct buffer_head *bd_bh; -+ struct buffer_head *bd_bh2; -+ struct ext3_buddy_group_blocks *bd_bd; ++ struct page *bd_buddy_page; ++ void *bd_buddy; ++ struct page 
*bd_bitmap_page; ++ void *bd_bitmap; ++ struct ext3_group_info *bd_info; + struct super_block *bd_sb; + __u16 bd_blkbits; + __u16 bd_group; +}; -+#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data) -+#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data) ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) ++ ++#ifndef EXT3_MB_HISTORY ++#define ext3_mb_store_history(sb,ac) ++#else ++static void ext3_mb_store_history(struct super_block *, ++ struct ext3_allocation_context *ac); ++#endif + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + ++static struct proc_dir_entry *proc_root_ext3; ++ +int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); +struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); +int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); @@ -473,9 +574,25 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + ext2_clear_bit_atomic(NULL, bit, addr); +} + ++static inline int mb_find_next_zero_bit(void *addr, int max, int start) ++{ ++ int fix; ++#if BITS_PER_LONG == 64 ++ fix = ((unsigned long) addr & 7UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~7UL); ++#elif BITS_PER_LONG == 32 ++ fix = ((unsigned long) addr & 3UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~3UL); ++#else ++#error "how many bits you are?!" 
++#endif ++ max += fix; ++ start += fix; ++ return ext2_find_next_zero_bit(addr, max, start) - fix; ++} ++ +static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) +{ -+ int i = 1; + char *bb; + + J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); @@ -491,89 +608,30 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + if (order == 0) + return EXT3_MB_BITMAP(e3b); + -+ bb = EXT3_MB_BUDDY(e3b); -+ *max = *max >> 1; -+ while (i < order) { -+ bb += 1 << (e3b->bd_blkbits - i); -+ i++; -+ *max = *max >> 1; -+ } -+ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) < -+ e3b->bd_sb->s_blocksize); -+ return bb; -+} -+ -+static int ext3_mb_load_buddy(struct super_block *sb, int group, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap); -+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy); -+ -+ /* load bitmap */ -+ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap); -+ if (e3b->bd_bh == NULL) { -+ ext3_error(sb, "ext3_mb_load_buddy", -+ "can't get block for buddy bitmap\n"); -+ goto out; -+ } -+ /* load buddy */ -+ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy); -+ if (e3b->bd_bh2 == NULL) { -+ ext3_error(sb, "ext3_mb_load_buddy", -+ "can't get block for buddy bitmap\n"); -+ goto out; -+ } -+ -+ if (!buffer_uptodate(e3b->bd_bh)) -+ ll_rw_block(READ, 1, &e3b->bd_bh); -+ if (!buffer_uptodate(e3b->bd_bh2)) -+ ll_rw_block(READ, 1, &e3b->bd_bh2); -+ -+ wait_on_buffer(e3b->bd_bh); -+ J_ASSERT(buffer_uptodate(e3b->bd_bh)); -+ wait_on_buffer(e3b->bd_bh2); -+ J_ASSERT(buffer_uptodate(e3b->bd_bh2)); -+ -+ e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_bd = sbi->s_buddy_blocks[group]; -+ e3b->bd_sb = sb; -+ e3b->bd_group = group; -+ -+ return 0; -+out: -+ brelse(e3b->bd_bh); -+ brelse(e3b->bd_bh2); -+ e3b->bd_bh = NULL; -+ e3b->bd_bh2 = NULL; -+ return -EIO; -+} ++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; ++ *max = 
EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; + -+static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b) -+{ -+ mark_buffer_dirty(e3b->bd_bh); -+ mark_buffer_dirty(e3b->bd_bh2); ++ return bb; +} + -+static void ext3_mb_release_desc(struct ext3_buddy *e3b) -+{ -+ brelse(e3b->bd_bh); -+ brelse(e3b->bd_bh2); -+} ++#ifdef AGGRESSIVE_CHECK + +static void mb_check_buddy(struct ext3_buddy *e3b) +{ + int order = e3b->bd_blkbits + 1; + int max, max2, i, j, k, count; ++ int fragments = 0, fstart; + void *buddy, *buddy2; + -+ if (likely(!ext3_mb_aggressive)) -+ return; -+ + if (!test_opt(e3b->bd_sb, MBALLOC)) + return; + ++ { ++ static int mb_check_counter = 0; ++ if (mb_check_counter++ % 300 != 0) ++ return; ++ } ++ + while (order > 1) { + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); @@ -604,14 +662,22 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + } + count++; + } -+ J_ASSERT(e3b->bd_bd->bb_counters[order] == count); ++ J_ASSERT(e3b->bd_info->bb_counters[order] == count); + order--; + } + ++ fstart = -1; + buddy = mb_find_buddy(e3b, 0, &max); + for (i = 0; i < max; i++) { -+ if (!mb_test_bit(i, buddy)) ++ if (!mb_test_bit(i, buddy)) { ++ J_ASSERT(i >= e3b->bd_info->bb_first_free); ++ if (fstart == -1) { ++ fragments++; ++ fstart = i; ++ } + continue; ++ } ++ fstart = -1; + /* check used bits only */ + for (j = 0; j < e3b->bd_blkbits + 1; j++) { + buddy2 = mb_find_buddy(e3b, j, &max2); @@ -620,18 +686,325 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + J_ASSERT(mb_test_bit(k, buddy2)); + } + } ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); ++ J_ASSERT(e3b->bd_info->bb_fragments == fragments); ++} ++ ++#else ++#define mb_check_buddy(e3b) ++#endif ++ ++/* find most significant bit */ ++static int inline fmsb(unsigned short word) ++{ ++ int order; ++ ++ if (word > 255) { ++ order = 7; ++ word >>= 8; ++ } else { ++ order = -1; ++ } ++ ++ do { ++ order++; ++ word >>= 1; ++ } while (word != 0); ++ ++ return order; ++} ++ ++static void inline 
++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, ++ int len, struct ext3_group_info *grp) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned short min, max, chunk, border; ++ ++ mb_debug("mark %u/%u free\n", first, len); ++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ border = 2 << sb->s_blocksize_bits; ++ ++ while (len > 0) { ++ /* find how many blocks can be covered since this position */ ++ max = ffs(first | border) - 1; ++ ++ /* find how many blocks of power 2 we need to mark */ ++ min = fmsb(len); ++ ++ mb_debug(" %u/%u -> max %u, min %u\n", ++ first & ((2 << sb->s_blocksize_bits) - 1), ++ len, max, min); ++ ++ if (max < min) ++ min = max; ++ chunk = 1 << min; ++ ++ /* mark multiblock chunks only */ ++ grp->bb_counters[min]++; ++ if (min > 0) { ++ mb_debug(" set %u at %u \n", first >> min, ++ sbi->s_mb_offsets[min]); ++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); ++ } ++ ++ len -= chunk; ++ first += chunk; ++ } ++} ++ ++static void ++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ++ struct ext3_group_info *grp) ++{ ++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); ++ unsigned short i = 0, first, len; ++ unsigned free = 0, fragments = 0; ++ unsigned long long period = get_cycles(); ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ grp->bb_first_free = i; ++ while (i < max) { ++ fragments++; ++ first = i; ++ i = find_next_bit(bitmap, max, i); ++ len = i - first; ++ free += len; ++ if (len > 1) ++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); ++ else ++ grp->bb_counters[0]++; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ grp->bb_fragments = fragments; ++ ++ /* bb_state shouldn't being modified because all ++ * others waits for init completion on page lock */ ++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); ++ if (free != grp->bb_free) { ++ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n", ++ free, grp->bb_free); ++ 
grp->bb_free = free; ++ } ++ ++ period = get_cycles() - period; ++ spin_lock(&EXT3_SB(sb)->s_bal_lock); ++ EXT3_SB(sb)->s_mb_buddies_generated++; ++ EXT3_SB(sb)->s_mb_generation_time += period; ++ spin_unlock(&EXT3_SB(sb)->s_bal_lock); ++} ++ ++static int ext3_mb_init_cache(struct page *page) ++{ ++ int blocksize, blocks_per_page, groups_per_page; ++ int err = 0, i, first_group, first_block; ++ struct super_block *sb; ++ struct buffer_head *bhs; ++ struct buffer_head **bh; ++ struct inode *inode; ++ char *data, *bitmap; ++ ++ mb_debug("init page %lu\n", page->index); ++ ++ inode = page->mapping->host; ++ sb = inode->i_sb; ++ blocksize = 1 << inode->i_blkbits; ++ blocks_per_page = PAGE_CACHE_SIZE / blocksize; ++ ++ groups_per_page = blocks_per_page >> 1; ++ if (groups_per_page == 0) ++ groups_per_page = 1; ++ ++ /* allocate buffer_heads to read bitmaps */ ++ if (groups_per_page > 1) { ++ err = -ENOMEM; ++ i = sizeof(struct buffer_head *) * groups_per_page; ++ bh = kmalloc(i, GFP_NOFS); ++ if (bh == NULL) ++ goto out; ++ memset(bh, 0, i); ++ } else ++ bh = &bhs; ++ ++ first_group = page->index * blocks_per_page / 2; ++ ++ /* read all groups the page covers into the cache */ ++ for (i = 0; i < groups_per_page; i++) { ++ struct ext3_group_desc * desc; ++ ++ if (first_group + i >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ err = -EIO; ++ desc = ext3_get_group_desc(sb, first_group + i, NULL); ++ if (desc == NULL) ++ goto out; ++ ++ err = -ENOMEM; ++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); ++ if (bh[i] == NULL) ++ goto out; ++ ++ if (buffer_uptodate(bh[i])) ++ continue; ++ ++ lock_buffer(bh[i]); ++ if (buffer_uptodate(bh[i])) { ++ unlock_buffer(bh[i]); ++ continue; ++ } ++ ++ get_bh(bh[i]); ++ bh[i]->b_end_io = end_buffer_read_sync; ++ submit_bh(READ, bh[i]); ++ mb_debug("read bitmap for group %u\n", first_group + i); ++ } ++ ++ /* wait for I/O completion */ ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ wait_on_buffer(bh[i]); ++ ++ /* XXX: I/O 
error handling here */ ++ ++ first_block = page->index * blocks_per_page; ++ for (i = 0; i < blocks_per_page; i++) { ++ int group; ++ ++ group = (first_block + i) >> 1; ++ if (group >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ data = page_address(page) + (i * blocksize); ++ bitmap = bh[group - first_group]->b_data; ++ ++ if ((first_block + i) & 1) { ++ /* this is block of buddy */ ++ mb_debug("put buddy for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memset(data, 0xff, blocksize); ++ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0; ++ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0, ++ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); ++ ext3_mb_generate_buddy(sb, data, bitmap, ++ EXT3_SB(sb)->s_group_info[group]); ++ } else { ++ /* this is block of bitmap */ ++ mb_debug("put bitmap for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memcpy(data, bitmap, blocksize); ++ } ++ } ++ SetPageUptodate(page); ++ ++out: ++ if (bh) { ++ for (i = 0; bh && i < groups_per_page && bh[i]; i++) ++ brelse(bh[i]); ++ if (bh != &bhs) ++ kfree(bh); ++ } ++ return err; ++} ++ ++static int ext3_mb_load_buddy(struct super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *inode = sbi->s_buddy_cache; ++ int blocks_per_page, block, pnum, poff; ++ struct page *page; ++ ++ mb_debug("load group %u\n", group); ++ ++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_info = sbi->s_group_info[group]; ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ e3b->bd_buddy_page = NULL; ++ e3b->bd_bitmap_page = NULL; ++ ++ block = group * 2; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { 
++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_bitmap_page = page; ++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ block++; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_buddy_page = page; ++ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ return 0; ++ ++err: ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++ e3b->bd_buddy = NULL; ++ e3b->bd_bitmap = NULL; ++ return -EIO; ++} ++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); +} + ++ +static inline void +ext3_lock_group(struct super_block *sb, int group) +{ -+ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_SB(sb)->s_group_info[group]->bb_state); +} + +static inline void +ext3_unlock_group(struct super_block *sb, int group) +{ -+ spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_SB(sb)->s_group_info[group]->bb_state); +} + +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) @@ -693,22 +1066,33 @@ Index: 
linux-2.6.5-7.201/fs/ext3/mballoc.c + +static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) +{ -+ int block, max, order; ++ int block = 0, max = 0, order; + void *buddy, *buddy2; + + mb_check_buddy(e3b); + -+ e3b->bd_bd->bb_free += count; -+ if (first < e3b->bd_bd->bb_first_free) -+ e3b->bd_bd->bb_first_free = first; -+ ++ e3b->bd_info->bb_free += count; ++ if (first < e3b->bd_info->bb_first_free) ++ e3b->bd_info->bb_first_free = first; ++ ++ /* let's maintain fragments counter */ ++ if (first != 0) ++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); ++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); ++ if (block && max) ++ e3b->bd_info->bb_fragments--; ++ else if (!block && !max) ++ e3b->bd_info->bb_fragments++; ++ ++ /* let's maintain buddy itself */ + while (count-- > 0) { + block = first++; + order = 0; + + J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); + mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); -+ e3b->bd_bd->bb_counters[order]++; ++ e3b->bd_info->bb_counters[order]++; + + /* start of the buddy */ + buddy = mb_find_buddy(e3b, order, &max); @@ -731,12 +1115,12 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + mb_set_bit(block, buddy); + mb_set_bit(block + 1, buddy); + } -+ e3b->bd_bd->bb_counters[order]--; -+ e3b->bd_bd->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; + + block = block >> 1; + order++; -+ e3b->bd_bd->bb_counters[order]++; ++ e3b->bd_info->bb_counters[order]++; + + mb_clear_bit(block, buddy2); + buddy = buddy2; @@ -748,7 +1132,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +} + +static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, -+ int needed, struct ext3_free_extent *ex) ++ int needed, struct ext3_free_extent *ex) +{ + int next, max, ord; + void *buddy; @@ -765,7 +1149,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + return 0; + } + -+ if (order == 0) { ++ if (likely(order == 0)) 
{ + /* find actual order */ + order = mb_find_order_for_block(e3b, block); + block = block >> order; @@ -775,7 +1159,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + ex->fe_start = block << order; + ex->fe_group = e3b->bd_group; + -+ while ((buddy = mb_find_buddy(e3b, order, &max))) { ++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { + + if (block + 1 >= max) + break; @@ -797,16 +1181,30 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + +static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) +{ ++ int ord, mlen = 0, max = 0, cur; + int start = ex->fe_start; + int len = ex->fe_len; -+ int ord, mlen, max, cur; ++ unsigned ret = 0; + int len0 = len; + void *buddy; + -+ e3b->bd_bd->bb_free -= len; -+ if (e3b->bd_bd->bb_first_free == start) -+ e3b->bd_bd->bb_first_free += len; ++ mb_check_buddy(e3b); + ++ e3b->bd_info->bb_free -= len; ++ if (e3b->bd_info->bb_first_free == start) ++ e3b->bd_info->bb_first_free += len; ++ ++ /* let's maintain fragments counter */ ++ if (start != 0) ++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); ++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); ++ if (mlen && max) ++ e3b->bd_info->bb_fragments++; ++ else if (!mlen && !max) ++ e3b->bd_info->bb_fragments--; ++ ++ /* let's maintain buddy itself */ + while (len) { + ord = mb_find_order_for_block(e3b, start); + @@ -816,26 +1214,30 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + buddy = mb_find_buddy(e3b, ord, &max); + J_ASSERT((start >> ord) < max); + mb_set_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; ++ e3b->bd_info->bb_counters[ord]--; + start += mlen; + len -= mlen; + J_ASSERT(len >= 0); + continue; + } + ++ /* store for history */ ++ if (ret == 0) ++ ret = len | (ord << 16); ++ + /* we have to split large buddy */ + J_ASSERT(ord > 0); + buddy = mb_find_buddy(e3b, ord, &max); + mb_set_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; ++ 
e3b->bd_info->bb_counters[ord]--; + + ord--; + cur = (start >> ord) & ~1U; + buddy = mb_find_buddy(e3b, ord, &max); + mb_clear_bit(cur, buddy); + mb_clear_bit(cur + 1, buddy); -+ e3b->bd_bd->bb_counters[ord]++; -+ e3b->bd_bd->bb_counters[ord]++; ++ e3b->bd_info->bb_counters[ord]++; ++ e3b->bd_info->bb_counters[ord]++; + } + + /* now drop all the bits in bitmap */ @@ -843,7 +1245,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + + mb_check_buddy(e3b); + -+ return 0; ++ return ret; +} + +/* @@ -852,9 +1254,14 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, + struct ext3_buddy *e3b) +{ ++ unsigned long ret; ++ + ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); -+ mb_mark_used(e3b, &ac->ac_b_ex); ++ ret = mb_mark_used(e3b, &ac->ac_b_ex); ++ + ac->ac_status = AC_STATUS_FOUND; ++ ac->ac_tail = ret & 0xffff; ++ ac->ac_buddy = ret >> 16; +} + +/* @@ -871,9 +1278,8 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + struct ext3_free_extent *ex, + struct ext3_buddy *e3b) +{ -+ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor; + struct ext3_free_extent *bex = &ac->ac_b_ex; -+ int diff = ac->ac_g_ex.fe_len - ex->fe_len; ++ struct ext3_free_extent *gex = &ac->ac_g_ex; + + J_ASSERT(ex->fe_len > 0); + J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); @@ -884,7 +1290,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + /* + * The special case - take what you catch first + */ -+ if (ac->ac_flags & EXT3_MB_HINT_FIRST) { ++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { + *bex = *ex; + ext3_mb_use_best_found(ac, e3b); + return; @@ -893,26 +1299,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + /* + * Let's check whether the chuck is good enough + */ -+ if (ex->fe_len >= ac->ac_g_ex.fe_len) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * If the request is vey large, then it makes sense to use large -+ * chunks for it. Even if they don't satisfy whole request. 
-+ */ -+ if (ex->fe_len > 1000) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * Sometimes it's worty to take close chunk -+ */ -+ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) { ++ if (ex->fe_len == gex->fe_len) { + *bex = *ex; + ext3_mb_use_best_found(ac, e3b); + return; @@ -928,13 +1315,26 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + + /* + * If new found extent is better, store it in the context -+ * FIXME: possible the policy should be more complex? + */ -+ if (ex->fe_len > bex->fe_len) { ++ if (bex->fe_len < gex->fe_len) { ++ /* if the request isn't satisfied, any found extent ++ * larger than previous best one is better */ ++ if (ex->fe_len > bex->fe_len) ++ *bex = *ex; ++ } else if (ex->fe_len > gex->fe_len) { ++ /* if the request is satisfied, then we try to find ++ * an extent that still satisfy the request, but is ++ * smaller than previous one */ + *bex = *ex; + } + + /* ++ * Let's scan at least few extents and don't pick up a first one ++ */ ++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++ ++ /* + * We don't want to scan for a whole year + */ + if (ac->ac_found > ext3_mb_max_to_scan) @@ -955,13 +1355,13 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); + -+ if (max > 0) ++ if (max > 0) { ++ ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); ++ } + + ext3_unlock_group(ac->ac_sb, group); + -+ if (ac->ac_status == AC_STATUS_FOUND) -+ ext3_mb_dirty_buddy(e3b); + ext3_mb_release_desc(e3b); + + return 0; @@ -985,37 +1385,79 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; + ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); + } + ext3_unlock_group(ac->ac_sb, group); + -+ if (ac->ac_status == 
AC_STATUS_FOUND) -+ ext3_mb_dirty_buddy(e3b); + ext3_mb_release_desc(e3b); + + return 0; +} ++ ++/* ++ * The routine scans buddy structures (not bitmap!) from given order ++ * to max order and tries to find big enough chunk to satisfy the req ++ */ ++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_group_info *grp = e3b->bd_info; ++ void *buddy; ++ int i, k, max; ++ ++ J_ASSERT(ac->ac_2order > 0); ++ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) { ++ if (grp->bb_counters[i] == 0) ++ continue; ++ ++ buddy = mb_find_buddy(e3b, i, &max); ++ if (buddy == NULL) { ++ printk(KERN_ALERT "looking for wrong order?\n"); ++ break; ++ } ++ ++ k = mb_find_next_zero_bit(buddy, max, 0); ++ J_ASSERT(k < max); ++ ++ ac->ac_found++; ++ ++ ac->ac_b_ex.fe_len = 1 << i; ++ ac->ac_b_ex.fe_start = k << i; ++ ac->ac_b_ex.fe_group = e3b->bd_group; ++ ++ ext3_mb_use_best_found(ac, e3b); ++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); ++ ++ if (unlikely(ext3_mb_stats)) ++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); ++ ++ break; ++ } ++} ++ +/* + * The routine scans the group and measures all found extents. + * In order to optimize scanning, caller must pass number of -+ * free blocks in the group, so the routine can upper limit. ++ * free blocks in the group, so the routine can know upper limit. 
+ */ -+static void ext3_mb_scan_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) ++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) +{ + struct super_block *sb = ac->ac_sb; + void *bitmap = EXT3_MB_BITMAP(e3b); + struct ext3_free_extent ex; + int i, free; + -+ free = e3b->bd_bd->bb_free; ++ free = e3b->bd_info->bb_free; + J_ASSERT(free > 0); + -+ i = e3b->bd_bd->bb_first_free; ++ i = e3b->bd_info->bb_first_free; + -+ while (free && ac->ac_status != AC_STATUS_FOUND) { -+ i = ext2_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ while (free && ac->ac_status == AC_STATUS_CONTINUE) { ++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); + if (i >= sb->s_blocksize * 8) { + J_ASSERT(free == 0); + break; @@ -1035,23 +1477,39 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +static int ext3_mb_good_group(struct ext3_allocation_context *ac, + int group, int cr) +{ -+ int free; ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_group_info *grp = sbi->s_group_info[group]; ++ unsigned free, fragments, i, bits; + -+ J_ASSERT(cr >= 0 && cr < 3); ++ J_ASSERT(cr >= 0 && cr < 4); ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); + -+ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free; ++ free = grp->bb_free; ++ fragments = grp->bb_fragments; + if (free == 0) + return 0; ++ if (fragments == 0) ++ return 0; + -+ if (cr == 0) { -+ if (free >= ac->ac_g_ex.fe_len >> 1) -+ return 1; -+ } else if (cr == 1) { -+ if (free >= ac->ac_g_ex.fe_len >> 2) ++ switch (cr) { ++ case 0: ++ J_ASSERT(ac->ac_2order != 0); ++ bits = ac->ac_sb->s_blocksize_bits + 1; ++ for (i = ac->ac_2order; i < bits; i++) ++ if (grp->bb_counters[i] > 0) ++ return 1; ++ case 1: ++ if ((free / fragments) >= ac->ac_g_ex.fe_len) ++ return 1; ++ case 2: ++ if (free >= ac->ac_g_ex.fe_len) ++ return 1; ++ case 3: + return 1; -+ } else if (cr == 2) { -+ return 1; ++ default: ++ BUG(); + } ++ + return 0; +} + @@ -1143,11 +1601,19 @@ 
Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + ac.ac_g_ex.fe_start = block; + ac.ac_g_ex.fe_len = *len; + ac.ac_flags = flags; ++ ac.ac_2order = 0; ++ ac.ac_criteria = 0; + -+ /* -+ * Sometimes, caller may want to merge even small number -+ * of blocks to an existing extent -+ */ ++ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ ++ i = ffs(*len); ++ if (i >= 8) { ++ i--; ++ if ((*len & (~(1 << i))) == 0) ++ ac.ac_2order = i; ++ } ++ ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ + if (ac.ac_flags & EXT3_MB_HINT_MERGE) { + err = ext3_mb_find_by_goal(&ac, &e3b); + if (err) @@ -1156,23 +1622,24 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + goto found; + } + -+ /* -+ * FIXME -+ * If requested chunk is power of 2 length, we can try -+ * to exploit buddy nature to speed allocation up -+ */ -+ -+ -+ /* -+ * Let's just scan groups to find more-less suitable blocks -+ */ -+ cr = 0; ++ /* Let's just scan groups to find more-less suitable blocks */ ++ cr = ac.ac_2order ? 
0 : 1; +repeat: -+ for (; cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ ac.ac_criteria = cr; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + ++ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) { ++ /* we need full data about the group ++ * to make a good selection */ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ext3_mb_release_desc(&e3b); ++ } ++ + /* check is group good for our criteries */ + if (!ext3_mb_good_group(&ac, group, cr)) + continue; @@ -1189,29 +1656,32 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + continue; + } + -+ ext3_mb_scan_group(&ac, &e3b); ++ ac.ac_groups_scanned++; ++ if (cr == 0) ++ ext3_mb_simple_scan_group(&ac, &e3b); ++ else ++ ext3_mb_complex_scan_group(&ac, &e3b); ++ + ext3_unlock_group(sb, group); + -+ if (ac.ac_status == AC_STATUS_FOUND) -+ ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + -+ if (err) -+ goto out_err; + if (ac.ac_status != AC_STATUS_CONTINUE) + break; + } + } + -+ if (ac.ac_status == AC_STATUS_BREAK && ++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && + !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { + /* + * We've been searching too long. 
Let's try to allocate + * the best chunk we've found so far + */ -+ ext3_warning(inode->i_sb, __FUNCTION__, -+ "too long searching: got %d want %d\n", -+ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); ++ ++ /*if (ac.ac_found > ext3_mb_max_to_scan) ++ printk(KERN_ERR "EXT3-fs: too long searching at " ++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, ++ ac.ac_g_ex.fe_len);*/ + ext3_mb_try_best_found(&ac, &e3b); + if (ac.ac_status != AC_STATUS_FOUND) { + /* @@ -1225,7 +1695,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + ac.ac_b_ex.fe_len = 0; + ac.ac_status = AC_STATUS_CONTINUE; + ac.ac_flags |= EXT3_MB_HINT_FIRST; -+ cr = 2; ++ cr = 3; + goto repeat; + } + } @@ -1248,7 +1718,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + printk("EXT3-fs: groups: "); + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) + printk("%d: %d ", i, -+ sbi->s_buddy_blocks[i]->bb_free); ++ sbi->s_group_info[i]->bb_free); + printk("\n"); +#endif + goto out; @@ -1302,12 +1772,10 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + ext3_error(sb, "ext3_new_block", + "Allocating block in system zone - " + "block = %u", block); -+ if (unlikely(ext3_mb_aggressive)) { -+ for (i = 0; i < ac.ac_b_ex.fe_len; i++) -+ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, -+ bitmap_bh->b_data)); -+ } -+ ++#ifdef AGGRESSIVE_CHECK ++ for (i = 0; i < ac.ac_b_ex.fe_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); ++#endif + mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); + + spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); @@ -1358,368 +1826,358 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + ext3_mb_release_blocks(sb, 1); + } + -+ if ((ext3_mb_stats) && (ac.ac_g_ex.fe_len > 1)) { -+ spin_lock(&sbi->s_bal_lock); -+ sbi->s_bal_reqs++; -+ sbi->s_bal_allocated += *len; ++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { ++ atomic_inc(&sbi->s_bal_reqs); ++ atomic_add(*len, &sbi->s_bal_allocated); + if (*len >= ac.ac_g_ex.fe_len) -+ sbi->s_bal_success++; -+ 
sbi->s_bal_ex_scanned += ac.ac_found; ++ atomic_inc(&sbi->s_bal_success); ++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); + if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && + ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) -+ sbi->s_bal_goals++; ++ atomic_inc(&sbi->s_bal_goals); + if (ac.ac_found > ext3_mb_max_to_scan) -+ sbi->s_bal_breaks++; -+ spin_unlock(&sbi->s_bal_lock); ++ atomic_inc(&sbi->s_bal_breaks); + } + ++ ext3_mb_store_history(sb, &ac); ++ + return block; +} ++EXPORT_SYMBOL(ext3_mb_new_blocks); + -+int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh, -+ struct ext3_mb_group_descr **grp) -+{ -+ struct super_block *sb = e3b->bd_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int descr_per_block, err, offset; -+ struct ext3_mb_grp_header *hdr; -+ unsigned long block; -+ -+ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) -+ / sizeof(struct ext3_mb_group_descr); -+ block = e3b->bd_group / descr_per_block; -+ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err); -+ if (*bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n", -+ e3b->bd_group, err); -+ return err; -+ } -+ -+ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data; -+ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { -+ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n", -+ e3b->bd_group); -+ brelse(*bh); -+ *bh = NULL; -+ return -EIO; -+ } ++#ifdef EXT3_MB_HISTORY ++struct ext3_mb_proc_session { ++ struct ext3_mb_history *history; ++ struct super_block *sb; ++ int start; ++ int max; ++}; + -+ offset = e3b->bd_group % descr_per_block -+ * sizeof(struct ext3_mb_group_descr) -+ + sizeof(struct ext3_mb_grp_header); -+ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset); ++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, ++ struct ext3_mb_history *hs, ++ int first) ++{ ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (!first && hs == s->history + s->start) ++ return NULL; ++ while 
(hs->goal.fe_len == 0) { ++ hs++; ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (hs == s->history + s->start) ++ return NULL; ++ } ++ return hs; ++} + -+ return 0; ++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs; ++ int l = *pos; ++ ++ if (l == 0) ++ return SEQ_START_TOKEN; ++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ if (!hs) ++ return NULL; ++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); ++ return hs; +} + -+int ext3_mb_load_descr(struct ext3_buddy *e3b) ++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) +{ -+ struct ext3_mb_group_descr *grp; -+ struct ext3_group_desc *gdp; -+ struct buffer_head *bh; -+ int err, i; ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs = v; ++ ++ ++*pos; ++ if (v == SEQ_START_TOKEN) ++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ else ++ return ext3_mb_history_skip_empty(s, ++hs, 0); ++} + -+ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); -+ if (err) -+ return err; -+ -+ e3b->bd_bd->bb_first_free = grp->mgd_first_free; -+ e3b->bd_bd->bb_free = grp->mgd_free; -+ for (i = 0; i <= e3b->bd_blkbits + 1; i++) { -+ J_ASSERT(i < 16); -+ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i]; -+ } -+ brelse(bh); ++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) ++{ ++ struct ext3_mb_history *hs = v; ++ char buf[20], buf2[20]; + -+ /* additional checks against old group descriptor */ -+ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); -+ if (!gdp) -+ return -EIO; -+ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { -+ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", -+ e3b->bd_group, e3b->bd_bd->bb_free, -+ le16_to_cpu(gdp->bg_free_blocks_count)); -+ return -ENODATA; ++ if (v == SEQ_START_TOKEN) { ++ seq_printf(seq, "%-17s %-17s %-5s 
%-5s %-2s %-5s %-5s %-6s\n", ++ "goal", "result", "found", "grps", "cr", "merge", ++ "tail", "broken"); ++ return 0; + } + ++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, ++ hs->goal.fe_start, hs->goal.fe_len); ++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, ++ hs->result.fe_start, hs->result.fe_len); ++ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, ++ buf2, hs->found, hs->groups, hs->cr, ++ hs->merged ? "M" : "", hs->tail, ++ hs->buddy ? 1 << hs->buddy : 0); + return 0; +} + ++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_history_ops = { ++ .start = ext3_mb_seq_history_start, ++ .next = ext3_mb_seq_history_next, ++ .stop = ext3_mb_seq_history_stop, ++ .show = ext3_mb_seq_history_show, ++}; + -+int ext3_mb_update_descr(struct ext3_buddy *e3b) ++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) +{ -+ struct ext3_mb_group_descr *grp; -+ struct ext3_group_desc *gdp; -+ struct buffer_head *bh; -+ handle_t *handle; -+ int err, i; ++ struct super_block *sb = PDE(inode)->data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_proc_session *s; ++ int rc, size; + -+ /* additional checks against old group descriptor */ -+ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); -+ if (!gdp) ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (s == NULL) + return -EIO; -+ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { -+ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", -+ e3b->bd_group, e3b->bd_bd->bb_free, -+ le16_to_cpu(gdp->bg_free_blocks_count)); -+ return -ENODATA; -+ } -+ -+ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); -+ if (err) -+ return err; ++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; ++ s->history = kmalloc(size, GFP_KERNEL); ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(s->history, sbi->s_mb_history, size); ++ s->max = sbi->s_mb_history_max; ++ s->start = 
sbi->s_mb_history_cur % s->max; ++ spin_unlock(&sbi->s_mb_history_lock); + -+ handle = ext3_journal_start(EXT3_SB(e3b->bd_sb)->s_buddy, 1); -+ if (IS_ERR(handle)) { -+ err = PTR_ERR(handle); -+ handle = NULL; -+ goto out; ++ rc = seq_open(file, &ext3_mb_seq_history_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = s; ++ } else { ++ kfree(s->history); ++ kfree(s); + } ++ return rc; + -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ goto out; -+ grp->mgd_first_free = e3b->bd_bd->bb_first_free; -+ grp->mgd_free = e3b->bd_bd->bb_free; -+ for (i = 0; i <= e3b->bd_blkbits + 1; i++) { -+ J_ASSERT(i < 16); -+ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i]; -+ } -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) -+ goto out; -+ err = 0; -+out: -+ brelse(bh); -+ if (handle) -+ ext3_journal_stop(handle); -+ return err; +} + -+int ext3_mb_generate_buddy(struct ext3_buddy *e3b) ++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) ++{ ++ struct seq_file *seq = (struct seq_file *)file->private_data; ++ struct ext3_mb_proc_session *s = seq->private; ++ kfree(s->history); ++ kfree(s); ++ return seq_release(inode, file); ++} ++ ++static struct file_operations ext3_mb_seq_history_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, ++}; ++ ++static void ext3_mb_history_release(struct super_block *sb) +{ -+ struct super_block *sb = e3b->bd_sb; -+ struct buffer_head *bh; -+ int i, count = 0; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; + -+ mb_debug("generate buddy for group %d\n", e3b->bd_group); -+ memset(e3b->bd_bh->b_data, 0xff, sb->s_blocksize); -+ memset(e3b->bd_bh2->b_data, 0xff, sb->s_blocksize); ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_history", sbi->s_mb_proc); ++ remove_proc_entry(name, 
proc_root_ext3); + -+ bh = read_block_bitmap(sb, e3b->bd_group); -+ if (bh == NULL) -+ return -EIO; ++ if (sbi->s_mb_history) ++ kfree(sbi->s_mb_history); ++} + -+ /* mb_free_blocks will set real free */ -+ e3b->bd_bd->bb_free = 0; -+ e3b->bd_bd->bb_first_free = 1 << 15; -+ /* -+ * if change bb_counters size, don't forget about -+ * ext3_mb_init_backend() -bzzz -+ */ -+ memset(e3b->bd_bd->bb_counters, 0, -+ sizeof(unsigned) * (sb->s_blocksize_bits + 2)); ++static void ext3_mb_history_init(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ int i; + -+ /* loop over the blocks, and create buddies for free ones */ -+ for (i = 0; i < sb->s_blocksize * 8; i++) { -+ if (!mb_test_bit(i, (void *) bh->b_data)) { -+ mb_free_blocks(e3b, i, 1); -+ count++; ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); ++ if (sbi->s_mb_proc != NULL) { ++ struct proc_dir_entry *p; ++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_history_fops; ++ p->data = sb; + } + } -+ brelse(bh); -+ mb_check_buddy(e3b); -+ ext3_mb_dirty_buddy(e3b); + -+ return 0; ++ sbi->s_mb_history_max = 1000; ++ sbi->s_mb_history_cur = 0; ++ spin_lock_init(&sbi->s_mb_history_lock); ++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); ++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); ++ memset(sbi->s_mb_history, 0, i); ++ /* if we can't allocate history, then we simple won't use it */ +} + -+EXPORT_SYMBOL(ext3_mb_new_blocks); ++static void ++ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_history h; ++ ++ if (likely(sbi->s_mb_history == NULL)) ++ return; ++ ++ h.goal = ac->ac_g_ex; ++ h.result = ac->ac_b_ex; ++ h.found = ac->ac_found; ++ h.cr = ac->ac_criteria; ++ h.groups = ac->ac_groups_scanned; ++ h.tail = ac->ac_tail; ++ h.buddy = 
ac->ac_buddy; ++ h.merged = 0; ++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) ++ h.merged = 1; ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); ++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) ++ sbi->s_mb_history_cur = 0; ++ spin_unlock(&sbi->s_mb_history_lock); ++} + -+#define MB_CREDITS \ -+ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \ -+ 2 * EXT3_SINGLEDATA_TRANS_BLOCKS) ++#else ++#define ext3_mb_history_release(sb) ++#define ext3_mb_history_init(sb) ++#endif + -+int ext3_mb_init_backend(struct super_block *sb, int *created) ++int ext3_mb_init_backend(struct super_block *sb) +{ -+ int err, i, len, descr_per_block, buddy_offset, size; -+ struct inode *root = sb->s_root->d_inode; + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_mb_grp_header *hdr; -+ struct buffer_head *bh = NULL; -+ unsigned long block; -+ struct dentry *db; -+ handle_t *handle; -+ tid_t target; -+ -+ *created = 0; ++ int i, len; ++ + len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; -+ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL); -+ if (sbi->s_buddy_blocks == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ sbi->s_group_info = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n"); + return -ENOMEM; + } -+ memset(sbi->s_buddy_blocks, 0, len); -+ sbi->s_buddy = NULL; -+ -+ down(&root->i_sem); -+ len = strlen(EXT3_BUDDY_FILE); -+ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len); -+ if (IS_ERR(db)) { -+ err = PTR_ERR(db); -+ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err); -+ up(&root->i_sem); -+ goto out; -+ } ++ memset(sbi->s_group_info, 0, len); + -+ if (db->d_inode == NULL) { -+ err = ext3_create(root, db, S_IFREG, NULL); -+ if (err) { -+ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err); -+ 
up(&root->i_sem); -+ goto out; -+ } -+ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME; -+ *created = 1; -+ mb_debug("no buddy file, regenerate\n"); -+ } -+ up(&root->i_sem); -+ sbi->s_buddy = igrab(db->d_inode); -+ -+ /* calculate needed size */ -+ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) -+ / sizeof(struct ext3_mb_group_descr); -+ buddy_offset = (sbi->s_groups_count + descr_per_block - 1) -+ / descr_per_block; -+ len = sbi->s_groups_count * sb->s_blocksize * 2 + -+ buddy_offset * sb->s_blocksize; -+ if (len != i_size_read(sbi->s_buddy)) { -+ if (*created == 0) -+ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n", -+ (unsigned) len, -+ (unsigned) i_size_read(sbi->s_buddy)); -+ *created = 1; -+ } -+ -+ /* read/create mb group descriptors */ -+ for (i = 0; i < buddy_offset; i++) { -+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); -+ if (IS_ERR(handle)) { -+ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); -+ err = PTR_ERR(handle); -+ goto err_out; -+ } -+ -+ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err); -+ if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err); -+ goto err_out; -+ } -+ hdr = (struct ext3_mb_grp_header *) bh->b_data; -+ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ goto err_out; -+ if (*created == 0) -+ printk(KERN_ERR -+ "EXT3-fs: invalid header 0x%x in %d," -+ "regenerate\n", hdr->mh_magic, i); -+ *created = 1; -+ hdr->mh_magic = EXT3_MB_MAGIC_V1; -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) -+ goto err_out; -+ } -+ brelse(bh); -+ ext3_journal_stop(handle); ++ sbi->s_buddy_cache = new_inode(sb); ++ if (sbi->s_buddy_cache == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); ++ kfree(sbi->s_group_info); ++ return -ENOMEM; + } + + /* -+ * if change bb_counters size, don't forget about ext3_mb_generate_buddy() ++ * calculate needed size. 
if change bb_counters size, ++ * don't forget about ext3_mb_generate_buddy() + */ -+ len = sizeof(struct ext3_buddy_group_blocks); -+ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2); ++ len = sizeof(struct ext3_group_info); ++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); + for (i = 0; i < sbi->s_groups_count; i++) { ++ struct ext3_group_desc * desc; + -+ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL); -+ if (sbi->s_buddy_blocks[i] == NULL) { ++ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_group_info[i] == NULL) { + printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); -+ err = -ENOMEM; -+ goto out2; -+ } -+ memset(sbi->s_buddy_blocks[i], 0, len); -+ -+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); -+ if (IS_ERR(handle)) { -+ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); -+ err = PTR_ERR(handle); -+ goto out2; -+ } -+ -+ /* allocate block for bitmap */ -+ block = buddy_offset + i * 2; -+ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); -+ if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err); -+ goto out2; -+ } -+ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; -+ brelse(bh); -+ -+ /* allocate block for buddy */ -+ block = buddy_offset + i * 2 + 1; -+ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); -+ if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err); -+ goto out2; ++ goto err_out; + } -+ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; -+ brelse(bh); -+ -+ size = (block + 1) << sbi->s_buddy->i_blkbits; -+ if (size > sbi->s_buddy->i_size) { -+ *created = 1; -+ EXT3_I(sbi->s_buddy)->i_disksize = size; -+ i_size_write(sbi->s_buddy, size); -+ mark_inode_dirty(sbi->s_buddy); ++ desc = ext3_get_group_desc(sb, i, NULL); ++ if (desc == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant read descriptor %u\n", i); ++ goto err_out; + } -+ ext3_journal_stop(handle); -+ -+ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock); -+ 
sbi->s_buddy_blocks[i]->bb_md_cur = NULL; -+ sbi->s_buddy_blocks[i]->bb_tid = 0; ++ memset(sbi->s_group_info[i], 0, len); ++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, ++ &sbi->s_group_info[i]->bb_state); ++ sbi->s_group_info[i]->bb_free = ++ le16_to_cpu(desc->bg_free_blocks_count); + } + -+ if (journal_start_commit(sbi->s_journal, &target)) -+ log_wait_commit(sbi->s_journal, target); -+ -+out2: -+ dput(db); -+out: -+ return err; ++ return 0; + +err_out: -+ return err; ++ while (--i >= 0) ++ kfree(sbi->s_group_info[i]); ++ iput(sbi->s_buddy_cache); ++ ++ return -ENOMEM; +} + -+int ext3_mb_write_descriptors(struct super_block *sb) ++int ext3_mb_init(struct super_block *sb, int needs_recovery) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_buddy e3b; -+ int ret = 0, i, err; ++ struct inode *root = sb->s_root->d_inode; ++ unsigned i, offset, max; ++ struct dentry *dentry; + -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ if (sbi->s_buddy_blocks[i] == NULL) -+ continue; ++ if (!test_opt(sb, MBALLOC)) ++ return 0; + -+ err = ext3_mb_load_buddy(sb, i, &e3b); -+ if (err == 0) { -+ ext3_mb_update_descr(&e3b); -+ ext3_mb_release_desc(&e3b); -+ } else -+ ret = err; ++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); ++ ++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_offsets == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ return -ENOMEM; + } -+ return ret; ++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_maxs == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ ++ /* order 0 is regular bitmap */ ++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; ++ sbi->s_mb_offsets[0] = 0; ++ ++ i = 1; ++ offset = 0; ++ max = sb->s_blocksize << 2; ++ do { ++ sbi->s_mb_offsets[i] = offset; ++ sbi->s_mb_maxs[i] = max; ++ offset += 1 << (sb->s_blocksize_bits - i); ++ max = max >> 1; ++ i++; ++ } while (i <= sb->s_blocksize_bits + 1); ++ ++ ++ /* init file for buddy data */ ++ if ((i = 
ext3_mb_init_backend(sb))) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return i; ++ } ++ ++ spin_lock_init(&sbi->s_reserve_lock); ++ spin_lock_init(&sbi->s_md_lock); ++ INIT_LIST_HEAD(&sbi->s_active_transaction); ++ INIT_LIST_HEAD(&sbi->s_closed_transaction); ++ INIT_LIST_HEAD(&sbi->s_committed_transaction); ++ spin_lock_init(&sbi->s_bal_lock); ++ ++ /* remove old on-disk buddy file */ ++ down(&root->i_sem); ++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy")); ++ if (dentry->d_inode != NULL) { ++ i = vfs_unlink(root, dentry); ++ if (i != 0) ++ printk("EXT3-fs: can't remove .buddy file: %d\n", i); ++ } ++ dput(dentry); ++ up(&root->i_sem); ++ ++ ext3_mb_history_init(sb); ++ ++ printk("EXT3-fs: mballoc enabled\n"); ++ return 0; +} + +int ext3_mb_release(struct super_block *sb) @@ -1739,78 +2197,40 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + spin_unlock(&sbi->s_md_lock); + ext3_mb_free_committed_blocks(sb); + -+ if (sbi->s_buddy_blocks) { -+ ext3_mb_write_descriptors(sb); ++ if (sbi->s_group_info) { + for (i = 0; i < sbi->s_groups_count; i++) { -+ if (sbi->s_buddy_blocks[i] == NULL) ++ if (sbi->s_group_info[i] == NULL) + continue; -+ kfree(sbi->s_buddy_blocks[i]); ++ kfree(sbi->s_group_info[i]); + } -+ kfree(sbi->s_buddy_blocks); -+ } -+ if (sbi->s_buddy) -+ iput(sbi->s_buddy); ++ kfree(sbi->s_group_info); ++ } ++ if (sbi->s_mb_offsets) ++ kfree(sbi->s_mb_offsets); ++ if (sbi->s_mb_maxs) ++ kfree(sbi->s_mb_maxs); ++ if (sbi->s_buddy_cache) ++ iput(sbi->s_buddy_cache); + if (sbi->s_blocks_reserved) + printk("ext3-fs: %ld blocks being reserved at umount!\n", + sbi->s_blocks_reserved); + if (ext3_mb_stats) { -+ printk("EXT3-fs: mballoc: %lu blocks %lu reqs " -+ "(%lu success)\n", sbi->s_bal_allocated, -+ sbi->s_bal_reqs, sbi->s_bal_success); -+ printk("EXT3-fs: mballoc: %lu extents scanned, " -+ "%lu goal hits, %lu breaks\n", sbi->s_bal_ex_scanned, -+ sbi->s_bal_goals, sbi->s_bal_breaks); -+ } -+ 
-+ return 0; -+} -+ -+int ext3_mb_init(struct super_block *sb, int needs_recovery) -+{ -+ struct ext3_buddy e3b; -+ int i, err, created; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ /* init file for buddy data */ -+ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ if ((err = ext3_mb_init_backend(sb, &created))) -+ return err; -+ -+repeat: -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { -+ err = ext3_mb_load_buddy(sb, i, &e3b); -+ if (err) { -+ /* FIXME: release backend */ -+ return err; -+ } -+ if (created || needs_recovery) -+ ext3_mb_generate_buddy(&e3b); -+ else -+ err = ext3_mb_load_descr(&e3b); -+ ext3_mb_release_desc(&e3b); -+ if (err == -ENODATA) { -+ created = 1; -+ goto repeat; -+ } -+ } -+ if (created || needs_recovery) -+ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n", -+ EXT3_SB(sb)->s_groups_count); -+ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); -+ spin_lock_init(&EXT3_SB(sb)->s_md_lock); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); -+ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ -+ spin_lock_init(&EXT3_SB(sb)->s_bal_lock); -+ if (ext3_mb_stats) { -+ printk("EXT3-fs: mballoc enabled (stats)\n"); -+ } else { -+ printk("EXT3-fs: mballoc enabled\n"); -+ } ++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", ++ atomic_read(&sbi->s_bal_allocated), ++ atomic_read(&sbi->s_bal_reqs), ++ atomic_read(&sbi->s_bal_success)); ++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " ++ "%u 2^N hits, %u breaks\n", ++ atomic_read(&sbi->s_bal_ex_scanned), ++ atomic_read(&sbi->s_bal_goals), ++ atomic_read(&sbi->s_bal_2orders), ++ atomic_read(&sbi->s_bal_breaks)); ++ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", ++ sbi->s_mb_buddies_generated++, ++ sbi->s_mb_generation_time); ++ } ++ ++ ext3_mb_history_release(sb); + + return 0; +} @@ -1857,8 +2277,11 @@ Index: 
linux-2.6.5-7.201/fs/ext3/mballoc.c + mb_debug("\n"); + ext3_unlock_group(sb, md->group); + ++ /* balance refcounts from ext3_mb_free_metadata() */ ++ page_cache_release(e3b.bd_buddy_page); ++ page_cache_release(e3b.bd_bitmap_page); ++ + kfree(md); -+ ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + + } while (md); @@ -1875,7 +2298,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + /* new transaction! time to close last one and free blocks for + * committed transaction. we know that only transaction can be + * active, so previos transaction can be being logged and we -+ * know that transaction before previous is known to be alreade ++ * know that transaction before previous is known to be already + * logged. this means that now we may free blocks freed in all + * transactions before previous one. hope I'm clear enough ... */ + @@ -1898,12 +2321,15 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, + int group, int block, int count) +{ -+ struct ext3_buddy_group_blocks *db = e3b->bd_bd; ++ struct ext3_group_info *db = e3b->bd_info; + struct super_block *sb = e3b->bd_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_free_metadata *md; + int i; + ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ + ext3_lock_group(sb, group); + for (i = 0; i < count; i++) { + md = db->bb_md_cur; @@ -1925,6 +2351,12 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + spin_lock(&sbi->s_md_lock); + list_add(&md->list, &sbi->s_active_transaction); + spin_unlock(&sbi->s_md_lock); ++ /* protect buddy cache from being freed, ++ * otherwise we'll refresh it from ++ * on-disk bitmap and lose not-yet-available ++ * blocks */ ++ page_cache_get(e3b->bd_buddy_page); ++ page_cache_get(e3b->bd_bitmap_page); + db->bb_md_cur = md; + db->bb_tid = handle->h_transaction->t_tid; + mb_debug("new md 0x%p for group %u\n", @@ -2036,12 +2468,13 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + if 
(err) + goto error_return; + -+ if (unlikely(ext3_mb_aggressive)) { ++#ifdef AGGRESSIVE_CHECK ++ { + int i; + for (i = 0; i < count; i++) + J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); + } -+ ++#endif + mb_clear_bits(bitmap_bh->b_data, bit, count); + + /* We dirtied the bitmap block */ @@ -2064,7 +2497,6 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); + -+ ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + + *freed = count; @@ -2131,52 +2563,30 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +} + + -+extern void ext3_free_blocks_old(handle_t *handle, struct inode *inode, -+ unsigned long block, unsigned long count); -+void ext3_free_blocks(handle_t *handle, struct inode *inode, -+ unsigned long block, unsigned long count, int metadata) ++void ext3_free_blocks(handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count, int metadata) +{ ++ struct super_block *sb; + int freed; + -+ if (!test_opt(inode->i_sb, MBALLOC) || -+ EXT3_SB(inode->i_sb)->s_buddy_blocks == NULL) ++ sb = inode->i_sb; ++ if (!test_opt(sb, MBALLOC)) + ext3_free_blocks_old(handle, inode, block, count); + else { -+ ext3_mb_free_blocks(handle, inode, block,count,metadata,&freed); ++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); + if (freed) + DQUOT_FREE_BLOCK(inode, freed); + } + return; +} -Index: linux-2.6.5-7.201/fs/ext3/proc.c -=================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/proc.c 2005-10-13 19:40:57.851699336 +0400 -+++ linux-2.6.5-7.201/fs/ext3/proc.c 2005-10-14 09:02:36.000000000 +0400 -@@ -0,0 +1,195 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ + +#define EXT3_ROOT "ext3" -+#define EXT3_MB_AGGRESSIVE_NAME "mb_aggressive" +#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME 
"mb_max_to_scan" ++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" + -+ -+static struct proc_dir_entry *proc_root_ext3; -+ -+ -+static int ext3_mb_aggressive_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) ++static int ext3_mb_stats_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) +{ + int len; + @@ -2184,19 +2594,19 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + if (off != 0) + return 0; + -+ len = sprintf(page, "%ld\n", ext3_mb_aggressive); ++ len = sprintf(page, "%ld\n", ext3_mb_stats); + *start = page; + return len; +} + -+static int ext3_mb_aggressive_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) ++static int ext3_mb_stats_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) +{ + char str[32]; + + if (count >= sizeof(str)) { + printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", -+ EXT3_MB_AGGRESSIVE_NAME, sizeof(str)); ++ EXT3_MB_STATS_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } + @@ -2204,12 +2614,12 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + return -EFAULT; + + /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ ext3_mb_aggressive = (simple_strtol(str, NULL, 0) != 0); ++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); + return count; +} + -+static int ext3_mb_stats_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) ++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) +{ + int len; + @@ -2217,19 +2627,20 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + if (off != 0) + return 0; + -+ len = sprintf(page, "%ld\n", ext3_mb_stats); ++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); + *start = page; + return len; +} + -+static int ext3_mb_stats_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) ++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void 
*data) +{ + char str[32]; ++ long value; + + if (count >= sizeof(str)) { + printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", -+ EXT3_MB_STATS_NAME, sizeof(str)); ++ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } + @@ -2237,12 +2648,17 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + return -EFAULT; + + /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_max_to_scan = value; ++ + return count; +} + -+static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) ++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) +{ + int len; + @@ -2250,20 +2666,20 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + if (off != 0) + return 0; + -+ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); ++ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan); + *start = page; + return len; +} + -+static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) ++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) +{ + char str[32]; + long value; + + if (count >= sizeof(str)) { + printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", -+ EXT3_MB_MAX_TO_SCAN_NAME, sizeof(str)); ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } + @@ -2272,47 +2688,32 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + + /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ + value = simple_strtol(str, NULL, 0); -+ if (value <= 0) ++ if (value <= 0) + return -ERANGE; + -+ ext3_mb_max_to_scan = value; ++ ext3_mb_min_to_scan = value; + + return count; +} + +int __init init_ext3_proc(void) +{ -+ struct proc_dir_entry *proc_ext3_mb_aggressive; + struct proc_dir_entry *proc_ext3_mb_stats; + struct proc_dir_entry 
*proc_ext3_mb_max_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_min_to_scan; + + proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); + if (proc_root_ext3 == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT); -+ return -EIO; -+ } -+ -+ /* Initialize EXT3_MB_AGGRESSIVE_NAME */ -+ proc_ext3_mb_aggressive = create_proc_entry(EXT3_MB_AGGRESSIVE_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_aggressive == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_AGGRESSIVE_NAME); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT); + return -EIO; + } + -+ proc_ext3_mb_aggressive->data = NULL; -+ proc_ext3_mb_aggressive->read_proc = ext3_mb_aggressive_read; -+ proc_ext3_mb_aggressive->write_proc = ext3_mb_aggressive_write; -+ + /* Initialize EXT3_MB_STATS_NAME */ + proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_stats == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_STATS_NAME); -+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_STATS_NAME); + remove_proc_entry(EXT3_ROOT, proc_root_fs); + return -EIO; + } @@ -2323,13 +2724,12 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + + /* Initialize EXT3_MAX_TO_SCAN_NAME */ + proc_ext3_mb_max_to_scan = create_proc_entry( -+ EXT3_MB_MAX_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ EXT3_MB_MAX_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_max_to_scan == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_STATS_NAME); ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_MAX_TO_SCAN_NAME); + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); + 
remove_proc_entry(EXT3_ROOT, proc_root_fs); + return -EIO; + } @@ -2338,130 +2738,43 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; + proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; + ++ /* Initialize EXT3_MIN_TO_SCAN_NAME */ ++ proc_ext3_mb_min_to_scan = create_proc_entry( ++ EXT3_MB_MIN_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_min_to_scan == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_MIN_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_min_to_scan->data = NULL; ++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; ++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; ++ + return 0; +} + +void exit_ext3_proc(void) +{ -+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); +} -Index: linux-2.6.5-7.201/fs/ext3/inode.c -=================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/inode.c 2005-10-14 08:59:38.000000000 +0400 -+++ linux-2.6.5-7.201/fs/ext3/inode.c 2005-10-14 08:59:39.000000000 +0400 -@@ -572,7 +572,7 @@ - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < keys; i++) -- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); - return err; - } - -@@ -673,7 +673,7 @@ - if (err == -EAGAIN) - for (i = 0; i < num; i++) - ext3_free_blocks(handle, inode, -- le32_to_cpu(where[i].key), 1); -+ le32_to_cpu(where[i].key), 1, 1); - return err; - } - -@@ 
-1835,7 +1835,7 @@ - } - } - -- ext3_free_blocks(handle, inode, block_to_free, count); -+ ext3_free_blocks(handle, inode, block_to_free, count, 1); - } - - /** -@@ -2006,7 +2006,7 @@ - ext3_journal_test_restart(handle, inode); - } - -- ext3_free_blocks(handle, inode, nr, 1); -+ ext3_free_blocks(handle, inode, nr, 1, 1); - - if (parent_bh) { - /* -Index: linux-2.6.5-7.201/fs/ext3/super.c +Index: linux-2.6.5-7.201/fs/ext3/Makefile =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/super.c 2005-10-14 08:59:38.000000000 +0400 -+++ linux-2.6.5-7.201/fs/ext3/super.c 2005-10-14 09:02:36.000000000 +0400 -@@ -389,6 +389,7 @@ - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_mb_release(sb); - ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); -@@ -543,6 +544,7 @@ - Opt_commit, Opt_journal_update, Opt_journal_inum, - Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -+ Opt_mballoc, Opt_mbfactor, - Opt_err, Opt_extents, Opt_extdebug - }; - -@@ -590,6 +592,8 @@ - {Opt_iopen_nopriv, "iopen_nopriv"}, - {Opt_extents, "extents"}, - {Opt_extdebug, "extdebug"}, -+ {Opt_mballoc, "mballoc"}, -+ {Opt_mballoc, "mbfactor=%u"}, - {Opt_err, NULL} - }; - -@@ -811,6 +815,16 @@ - case Opt_extdebug: - set_opt (sbi->s_mount_opt, EXTDEBUG); - break; -+ case Opt_mballoc: -+ set_opt (sbi->s_mount_opt, MBALLOC); -+ break; -+ case Opt_mbfactor: -+ if (match_int(&args[0], &option)) -+ return 0; -+ if (option < 0) -+ return 0; -+ sbi->s_mb_factor = option; -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1464,6 +1478,7 @@ - ext3_count_dirs(sb)); - - ext3_ext_init(sb); -+ ext3_mb_init(sb, needs_recovery); - - return 0; - -@@ -2112,7 +2127,13 @@ +--- linux-2.6.5-7.201.orig/fs/ext3/Makefile 2005-12-17 02:53:30.000000000 +0300 ++++ linux-2.6.5-7.201/fs/ext3/Makefile 2005-12-17 
03:10:23.000000000 +0300 +@@ -6,7 +6,7 @@ - static int __init init_ext3_fs(void) - { -- int err = init_ext3_xattr(); -+ int err; -+ -+ err = init_ext3_proc(); -+ if (err) -+ return err; -+ -+ err = init_ext3_xattr(); - if (err) - return err; - err = init_inodecache(); -@@ -2141,6 +2162,7 @@ - unregister_filesystem(&ext3_fs_type); - destroy_inodecache(); - exit_ext3_xattr(); -+ exit_ext3_proc(); - } + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o \ +- extents.o ++ extents.o mballoc.o - int ext3_prep_san_write(struct inode *inode, long *blocks, + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch new file mode 100644 index 0000000..a2b9caf --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch @@ -0,0 +1,2774 @@ +Index: linux-2.6.12.6/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.12.6.orig/include/linux/ext3_fs.h 2005-12-17 02:17:16.000000000 +0300 ++++ linux-2.6.12.6/include/linux/ext3_fs.h 2005-12-17 02:21:21.000000000 +0300 +@@ -57,6 +57,14 @@ struct statfs; + #define ext3_debug(f, a...) 
do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -366,6 +374,7 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -727,7 +736,7 @@ extern int ext3_bg_has_super(struct supe + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, +- unsigned long); ++ unsigned long, int); + extern void ext3_free_blocks_sb (handle_t *, struct super_block *, + unsigned long, unsigned long, int *); + extern unsigned long ext3_count_free_blocks (struct super_block *); +@@ -848,6 +857,17 @@ extern void ext3_extents_initialize_bloc + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern long ext3_mb_stats; ++extern long ext3_mb_max_to_scan; ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct super_block *, int); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); ++ + #endif /* __KERNEL__ */ + + /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ +Index: linux-2.6.12.6/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.6.12.6.orig/include/linux/ext3_fs_sb.h 2005-08-29 20:55:27.000000000 +0400 ++++ linux-2.6.12.6/include/linux/ext3_fs_sb.h 2005-12-17 02:21:21.000000000 +0300 +@@ -21,8 +21,14 @@ + #include + #include + #include ++#include + #endif + #include ++#include ++ ++struct ext3_buddy_group_blocks; ++struct ext3_mb_history; ++#define EXT3_BB_MAX_BLOCKS + + /* + * third extended-fs super-block data in memory +@@ -78,6 +84,38 @@ struct ext3_sb_info { + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_group_info **s_group_info; ++ struct inode *s_buddy_cache; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ unsigned short *s_mb_offsets, *s_mb_maxs; ++ ++ /* history to debug policy */ ++ struct ext3_mb_history *s_mb_history; ++ int s_mb_history_cur; ++ int s_mb_history_max; ++ struct proc_dir_entry *s_mb_proc; ++ spinlock_t s_mb_history_lock; ++ ++ /* stats for buddy allocator */ ++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ++ atomic_t s_bal_success; /* we found long enough chunks */ ++ atomic_t s_bal_allocated; /* in blocks */ ++ atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_goals; /* goal hits */ ++ atomic_t s_bal_breaks; /* too long searches */ ++ atomic_t s_bal_2orders; /* 2^order hits */ ++ spinlock_t s_bal_lock; ++ unsigned long s_mb_buddies_generated; ++ unsigned long long s_mb_generation_time; + }; + + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-2.6.12.6/fs/ext3/super.c +=================================================================== +--- 
linux-2.6.12.6.orig/fs/ext3/super.c 2005-12-17 02:17:16.000000000 +0300 ++++ linux-2.6.12.6/fs/ext3/super.c 2005-12-17 02:21:21.000000000 +0300 +@@ -387,6 +387,7 @@ static void ext3_put_super (struct super + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -597,7 +598,7 @@ enum { + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, +- Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_extdebug, Opt_mballoc, + }; + + static match_table_t tokens = { +@@ -649,6 +651,7 @@ static match_table_t tokens = { + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -964,6 +967,9 @@ clear_qf_name: + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt (sbi->s_mount_opt, MBALLOC); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1669,6 +1675,7 @@ static int ext3_fill_super (struct super + ext3_count_dirs(sb)); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + lock_kernel(); + return 0; + +@@ -2548,7 +2555,13 @@ static struct file_system_type ext3_fs_t + + static int __init init_ext3_fs(void) + { +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2570,6 +2583,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); + } + + int ext3_prep_san_write(struct inode *inode, long *blocks, +Index: linux-2.6.12.6/fs/ext3/extents.c +=================================================================== +--- 
linux-2.6.12.6.orig/fs/ext3/extents.c 2005-12-17 02:17:16.000000000 +0300 ++++ linux-2.6.12.6/fs/ext3/extents.c 2005-12-17 02:21:21.000000000 +0300 +@@ -771,7 +771,7 @@ cleanup: + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st + path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + +@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t + bh = sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-2.6.12.6/fs/ext3/inode.c +=================================================================== +--- linux-2.6.12.6.orig/fs/ext3/inode.c 2005-12-17 02:17:16.000000000 +0300 ++++ linux-2.6.12.6/fs/ext3/inode.c 2005-12-17 02:21:21.000000000 +0300 +@@ 
-564,7 +564,7 @@ static int ext3_alloc_branch(handle_t *h + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -1850,7 +1850,7 @@ ext3_clear_blocks(handle_t *handle, stru + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2023,7 +2023,7 @@ static void ext3_free_branches(handle_t + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-2.6.12.6/fs/ext3/balloc.c +=================================================================== +--- linux-2.6.12.6.orig/fs/ext3/balloc.c 2005-08-29 20:55:27.000000000 +0400 ++++ linux-2.6.12.6/fs/ext3/balloc.c 2005-12-17 02:21:21.000000000 +0300 +@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ + * + * Return buffer_head on success or NULL in case of failure. + */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -490,24 +490,6 @@ error_return: + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- unsigned long block, unsigned long count) +-{ +- struct super_block * sb; +- int dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. 
This +@@ -1162,7 +1144,7 @@ int ext3_should_retry_alloc(struct super + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. + */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-2.6.12.6/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.12.6.orig/fs/ext3/xattr.c 2005-08-29 20:55:27.000000000 +0400 ++++ linux-2.6.12.6/fs/ext3/xattr.c 2005-12-17 02:21:33.000000000 +0300 +@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); +- ext3_free_blocks(handle, inode, bh->b_blocknr, 1); ++ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { +@@ -804,7 +804,7 @@ inserted: + new_bh = sb_getblk(sb, block); + if (!new_bh) { + getblk_failed: +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + error = -EIO; + goto cleanup; + } +Index: linux-2.6.12.6/fs/ext3/mballoc.c +=================================================================== +--- linux-2.6.12.6.orig/fs/ext3/mballoc.c 2005-12-09 13:08:53.191437750 +0300 ++++ linux-2.6.12.6/fs/ext3/mballoc.c 2005-12-17 02:21:21.000000000 +0300 +@@ -0,0 +1,2429 @@ ++/* ++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++ ++/* ++ * mballoc.c contains the multiblocks allocation routines ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * TODO: ++ * - bitmap read-ahead (proposed by Oleg Drokin aka green) ++ * - track min/max extents in each group for better group selection ++ * - mb_mark_used() may allocate chunk right after splitting buddy ++ * - special flag to advice allocator to look for requested + N blocks ++ * this may improve interaction between extents and mballoc ++ * - tree of groups sorted by number of free blocks ++ * - percpu reservation code (hotpath) ++ * - error handling ++ */ ++ ++/* ++ * with AGRESSIVE_CHECK allocator runs consistency checks over ++ * structures. these checks slow things down a lot ++ */ ++#define AGGRESSIVE_CHECK__ ++ ++/* ++ */ ++#define MB_DEBUG__ ++#ifdef MB_DEBUG ++#define mb_debug(fmt,a...) printk(fmt, ##a) ++#else ++#define mb_debug(fmt,a...) ++#endif ++ ++/* ++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory ++ * and you can monitor it in /proc/fs/ext3//mb_history ++ */ ++#define EXT3_MB_HISTORY ++ ++/* ++ * How long mballoc can look for a best extent (in found extents) ++ */ ++long ext3_mb_max_to_scan = 500; ++ ++/* ++ * How long mballoc must look for a best extent ++ */ ++long ext3_mb_min_to_scan = 30; ++ ++/* ++ * with 'ext3_mb_stats' allocator will collect stats that will be ++ * shown at umount. The collecting costs though! 
++ */ ++ ++long ext3_mb_stats = 1; ++ ++#ifdef EXT3_BB_MAX_BLOCKS ++#undef EXT3_BB_MAX_BLOCKS ++#endif ++#define EXT3_BB_MAX_BLOCKS 30 ++ ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++struct ext3_group_info { ++ unsigned long bb_state; ++ unsigned long bb_tid; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned short bb_fragments; ++ unsigned short bb_counters[]; ++}; ++ ++ ++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 ++#define EXT3_GROUP_INFO_LOCKED_BIT 1 ++ ++#define EXT3_MB_GRP_NEED_INIT(grp) \ ++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) ++ ++struct ext3_free_extent { ++ __u16 fe_start; ++ __u16 fe_len; ++ __u16 fe_group; ++}; ++ ++struct ext3_allocation_context { ++ struct super_block *ac_sb; ++ ++ /* search goals */ ++ struct ext3_free_extent ac_g_ex; ++ ++ /* the best found extent */ ++ struct ext3_free_extent ac_b_ex; ++ ++ /* number of iterations done. 
we have to track to limit searching */ ++ unsigned long ac_ex_scanned; ++ __u16 ac_groups_scanned; ++ __u16 ac_found; ++ __u16 ac_tail; ++ __u16 ac_buddy; ++ __u8 ac_status; ++ __u8 ac_flags; /* allocation hints */ ++ __u8 ac_criteria; ++ __u8 ac_repeats; ++ __u8 ac_2order; /* if request is to allocate 2^N blocks and ++ * N > 0, the field stores N, otherwise 0 */ ++}; ++ ++#define AC_STATUS_CONTINUE 1 ++#define AC_STATUS_FOUND 2 ++#define AC_STATUS_BREAK 3 ++ ++struct ext3_mb_history { ++ struct ext3_free_extent goal; /* goal allocation */ ++ struct ext3_free_extent result; /* result allocation */ ++ __u16 found; /* how many extents have been found */ ++ __u16 groups; /* how many groups have been scanned */ ++ __u16 tail; /* what tail broke some buddy */ ++ __u16 buddy; /* buddy the tail ^^^ broke */ ++ __u8 cr; /* which phase the result extent was found at */ ++ __u8 merged; ++}; ++ ++struct ext3_buddy { ++ struct page *bd_buddy_page; ++ void *bd_buddy; ++ struct page *bd_bitmap_page; ++ void *bd_bitmap; ++ struct ext3_group_info *bd_info; ++ struct super_block *bd_sb; ++ __u16 bd_blkbits; ++ __u16 bd_group; ++}; ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) ++ ++#ifndef EXT3_MB_HISTORY ++#define ext3_mb_store_history(sb,ac) ++#else ++static void ext3_mb_store_history(struct super_block *, ++ struct ext3_allocation_context *ac); ++#endif ++ ++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) ++ ++static struct proc_dir_entry *proc_root_ext3; ++ ++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); ++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); ++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); ++int ext3_mb_reserve_blocks(struct super_block *, int); ++void ext3_mb_release_blocks(struct super_block *, int); ++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); ++void 
ext3_mb_free_committed_blocks(struct super_block *); ++ ++#if BITS_PER_LONG == 64 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 7UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~7UL); \ ++} ++#elif BITS_PER_LONG == 32 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 3UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~3UL); \ ++} ++#else ++#error "how many bits you are?!" ++#endif ++ ++static inline int mb_test_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ return ext2_test_bit(bit, addr); ++} ++ ++static inline void mb_set_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit(bit, addr); ++} ++ ++static inline void mb_set_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit_atomic(NULL, bit, addr); ++} ++ ++static inline void mb_clear_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit(bit, addr); ++} ++ ++static inline void mb_clear_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit_atomic(NULL, bit, addr); ++} ++ ++static inline int mb_find_next_zero_bit(void *addr, int max, int start) ++{ ++ int fix; ++#if BITS_PER_LONG == 64 ++ fix = ((unsigned long) addr & 7UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~7UL); ++#elif BITS_PER_LONG == 32 ++ fix = ((unsigned long) addr & 3UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~3UL); ++#else ++#error "how many bits you are?!" 
++#endif ++ max += fix; ++ start += fix; ++ return ext2_find_next_zero_bit(addr, max, start) - fix; ++} ++ ++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) ++{ ++ char *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(max != NULL); ++ ++ if (order > e3b->bd_blkbits + 1) { ++ *max = 0; ++ return NULL; ++ } ++ ++ /* at order 0 we see each particular block */ ++ *max = 1 << (e3b->bd_blkbits + 3); ++ if (order == 0) ++ return EXT3_MB_BITMAP(e3b); ++ ++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; ++ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; ++ ++ return bb; ++} ++ ++#ifdef AGGRESSIVE_CHECK ++ ++static void mb_check_buddy(struct ext3_buddy *e3b) ++{ ++ int order = e3b->bd_blkbits + 1; ++ int max, max2, i, j, k, count; ++ int fragments = 0, fstart; ++ void *buddy, *buddy2; ++ ++ if (!test_opt(e3b->bd_sb, MBALLOC)) ++ return; ++ ++ { ++ static int mb_check_counter = 0; ++ if (mb_check_counter++ % 300 != 0) ++ return; ++ } ++ ++ while (order > 1) { ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ buddy2 = mb_find_buddy(e3b, order - 1, &max2); ++ J_ASSERT(buddy2); ++ J_ASSERT(buddy != buddy2); ++ J_ASSERT(max * 2 == max2); ++ ++ count = 0; ++ for (i = 0; i < max; i++) { ++ ++ if (mb_test_bit(i, buddy)) { ++ /* only single bit in buddy2 may be 1 */ ++ if (!mb_test_bit(i << 1, buddy2)) ++ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); ++ else if (!mb_test_bit((i << 1) + 1, buddy2)) ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ continue; ++ } ++ ++ /* both bits in buddy2 must be 0 */ ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); ++ ++ for (j = 0; j < (1 << order); j++) { ++ k = (i * (1 << order)) + j; ++ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); ++ } ++ count++; ++ } ++ J_ASSERT(e3b->bd_info->bb_counters[order] == count); ++ order--; ++ } ++ ++ fstart = -1; ++ buddy = mb_find_buddy(e3b, 0, &max); ++ for (i = 0; i < max; i++) { ++ if 
(!mb_test_bit(i, buddy)) { ++ J_ASSERT(i >= e3b->bd_info->bb_first_free); ++ if (fstart == -1) { ++ fragments++; ++ fstart = i; ++ } ++ continue; ++ } ++ fstart = -1; ++ /* check used bits only */ ++ for (j = 0; j < e3b->bd_blkbits + 1; j++) { ++ buddy2 = mb_find_buddy(e3b, j, &max2); ++ k = i >> j; ++ J_ASSERT(k < max2); ++ J_ASSERT(mb_test_bit(k, buddy2)); ++ } ++ } ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); ++ J_ASSERT(e3b->bd_info->bb_fragments == fragments); ++} ++ ++#else ++#define mb_check_buddy(e3b) ++#endif ++ ++/* find most significant bit */ ++static int inline fmsb(unsigned short word) ++{ ++ int order; ++ ++ if (word > 255) { ++ order = 7; ++ word >>= 8; ++ } else { ++ order = -1; ++ } ++ ++ do { ++ order++; ++ word >>= 1; ++ } while (word != 0); ++ ++ return order; ++} ++ ++static void inline ++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, ++ int len, struct ext3_group_info *grp) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned short min, max, chunk, border; ++ ++ mb_debug("mark %u/%u free\n", first, len); ++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ border = 2 << sb->s_blocksize_bits; ++ ++ while (len > 0) { ++ /* find how many blocks can be covered since this position */ ++ max = ffs(first | border) - 1; ++ ++ /* find how many blocks of power 2 we need to mark */ ++ min = fmsb(len); ++ ++ mb_debug(" %u/%u -> max %u, min %u\n", ++ first & ((2 << sb->s_blocksize_bits) - 1), ++ len, max, min); ++ ++ if (max < min) ++ min = max; ++ chunk = 1 << min; ++ ++ /* mark multiblock chunks only */ ++ grp->bb_counters[min]++; ++ if (min > 0) { ++ mb_debug(" set %u at %u \n", first >> min, ++ sbi->s_mb_offsets[min]); ++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); ++ } ++ ++ len -= chunk; ++ first += chunk; ++ } ++} ++ ++static void ++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ++ struct ext3_group_info *grp) ++{ ++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); 
++	unsigned short i = 0, first, len;
++	unsigned free = 0, fragments = 0;
++	unsigned long long period = get_cycles();
++
++	i = mb_find_next_zero_bit(bitmap, max, 0);
++	grp->bb_first_free = i;
++	while (i < max) {
++		fragments++;
++		first = i;
++		i = find_next_bit(bitmap, max, i);
++		len = i - first;
++		free += len;
++		if (len > 1)
++			ext3_mb_mark_free_simple(sb, buddy, first, len, grp);
++		else
++			grp->bb_counters[0]++;
++		if (i < max)
++			i = mb_find_next_zero_bit(bitmap, max, i);
++	}
++	grp->bb_fragments = fragments;
++
++	/* bb_state shouldn't being modified because all
++	 * others waits for init completion on page lock */
++	clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state);
++	if (free != grp->bb_free) {
++		printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n",
++			free, grp->bb_free);
++		grp->bb_free = free;
++	}
++
++	period = get_cycles() - period;
++	spin_lock(&EXT3_SB(sb)->s_bal_lock);
++	EXT3_SB(sb)->s_mb_buddies_generated++;
++	EXT3_SB(sb)->s_mb_generation_time += period;
++	spin_unlock(&EXT3_SB(sb)->s_bal_lock);
++}
++
++/*
++ * Fill one page of the buddy cache.
++ *
++ * The page is split into blocks of the inode's block size.  Cache block
++ * number N belongs to group N >> 1: even blocks receive a copy of the
++ * group's on-disk block bitmap, odd blocks receive the buddy data that
++ * ext3_mb_generate_buddy() builds from that bitmap.
++ *
++ * Returns 0 and marks the page uptodate on success.  On failure returns
++ * -ENOMEM or -EIO and leaves the page not uptodate, which is what
++ * ext3_mb_load_buddy() checks after the call.
++ */
++static int ext3_mb_init_cache(struct page *page)
++{
++	int blocksize, blocks_per_page, groups_per_page;
++	int err = 0, i, first_group, first_block;
++	struct super_block *sb;
++	/* must start as NULL: the 'out' path walks bh[] until a NULL slot */
++	struct buffer_head *bhs = NULL;
++	struct buffer_head **bh;
++	struct inode *inode;
++	char *data, *bitmap;
++
++	mb_debug("init page %lu\n", page->index);
++
++	inode = page->mapping->host;
++	sb = inode->i_sb;
++	blocksize = 1 << inode->i_blkbits;
++	blocks_per_page = PAGE_CACHE_SIZE / blocksize;
++
++	/* two cache blocks (bitmap + buddy) per group */
++	groups_per_page = blocks_per_page >> 1;
++	if (groups_per_page == 0)
++		groups_per_page = 1;
++
++	/* allocate buffer_heads to read bitmaps */
++	if (groups_per_page > 1) {
++		err = -ENOMEM;
++		i = sizeof(struct buffer_head *) * groups_per_page;
++		bh = kmalloc(i, GFP_NOFS);
++		if (bh == NULL)
++			goto out;
++		memset(bh, 0, i);
++	} else
++		bh = &bhs;
++
++	first_group = page->index * blocks_per_page / 2;
++
++	/* read all groups the page covers into the cache */
++	for (i = 0; i < groups_per_page; i++) {
++		struct ext3_group_desc * desc;
++
++		if (first_group + i >= EXT3_SB(sb)->s_groups_count)
++			break;
++
++		err = -EIO;
++		desc = ext3_get_group_desc(sb, first_group + i, NULL);
++		if (desc == NULL)
++			goto out;
++
++		err = -ENOMEM;
++		bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap));
++		if (bh[i] == NULL)
++			goto out;
++
++		if (buffer_uptodate(bh[i]))
++			continue;
++
++		lock_buffer(bh[i]);
++		if (buffer_uptodate(bh[i])) {
++			unlock_buffer(bh[i]);
++			continue;
++		}
++
++		get_bh(bh[i]);
++		bh[i]->b_end_io = end_buffer_read_sync;
++		submit_bh(READ, bh[i]);
++		mb_debug("read bitmap for group %u\n", first_group + i);
++	}
++
++	/* wait for I/O completion */
++	for (i = 0; i < groups_per_page && bh[i]; i++)
++		wait_on_buffer(bh[i]);
++
++	/* never build a buddy from a bitmap that failed to read */
++	err = -EIO;
++	for (i = 0; i < groups_per_page && bh[i]; i++)
++		if (!buffer_uptodate(bh[i]))
++			goto out;
++
++	/* all bitmaps are in memory; from here on we cannot fail */
++	err = 0;
++
++	first_block = page->index * blocks_per_page;
++	for (i = 0; i < blocks_per_page; i++) {
++		int group;
++
++		group = (first_block + i) >> 1;
++		if (group >= EXT3_SB(sb)->s_groups_count)
++			break;
++
++		data = page_address(page) + (i * blocksize);
++		bitmap = bh[group - first_group]->b_data;
++
++		if ((first_block + i) & 1) {
++			/* this is block of buddy */
++			mb_debug("put buddy for group %u in page %lu/%x\n",
++				group, page->index, i * blocksize);
++			memset(data, 0xff, blocksize);
++			EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0;
++			memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0,
++			       sizeof(unsigned short)*(sb->s_blocksize_bits+2));
++			ext3_mb_generate_buddy(sb, data, bitmap,
++						EXT3_SB(sb)->s_group_info[group]);
++		} else {
++			/* this is block of bitmap */
++			mb_debug("put bitmap for group %u in page %lu/%x\n",
++				group, page->index, i * blocksize);
++			memcpy(data, bitmap, blocksize);
++		}
++	}
++	SetPageUptodate(page);
++
++out:
++	if (bh) {
++		for (i = 0; bh && i < groups_per_page && bh[i]; i++)
++			brelse(bh[i]);
++		if (bh != &bhs)
++			kfree(bh);
++	}
++	return err;
++}
++
++/*
++ * Pin the buddy-cache pages that hold the bitmap copy and the buddy data
++ * of @group and describe them in @e3b.  Cache block 2*group is the bitmap,
++ * block 2*group+1 the buddy; pages that are missing or not uptodate are
++ * (re)initialised through ext3_mb_init_cache().
++ *
++ * Returns 0 with a reference held on both pages (dropped again by
++ * ext3_mb_release_desc()), or -EIO with no page reference left behind.
++ */
++static int ext3_mb_load_buddy(struct super_block *sb, int group,
++		struct ext3_buddy *e3b)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	struct inode *inode = sbi->s_buddy_cache;
++	int blocks_per_page, block, pnum, poff;
++	struct page *page;
++
++	mb_debug("load group %u\n", group);
++
++	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
++
++	e3b->bd_blkbits = sb->s_blocksize_bits;
++	e3b->bd_info = sbi->s_group_info[group];
++	e3b->bd_sb = sb;
++	e3b->bd_group = group;
++	e3b->bd_buddy_page = NULL;
++	e3b->bd_bitmap_page = NULL;
++
++	/* first the block with the bitmap copy */
++	block = group * 2;
++	pnum = block / blocks_per_page;
++	poff = block % blocks_per_page;
++
++	page = find_get_page(inode->i_mapping, pnum);
++	if (page == NULL || !PageUptodate(page)) {
++		if (page)
++			page_cache_release(page);
++		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
++		if (page) {
++			if (!PageUptodate(page))
++				ext3_mb_init_cache(page);
++			unlock_page(page);
++		}
++	}
++	if (page == NULL || !PageUptodate(page))
++		goto err;
++	e3b->bd_bitmap_page = page;
++	e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
++	mark_page_accessed(page);
++
++	/* then the block with the buddy data */
++	block++;
++	pnum = block / blocks_per_page;
++	poff = block % blocks_per_page;
++
++	page = find_get_page(inode->i_mapping, pnum);
++	if (page == NULL || !PageUptodate(page)) {
++		if (page)
++			page_cache_release(page);
++		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
++		if (page) {
++			if (!PageUptodate(page))
++				ext3_mb_init_cache(page);
++			unlock_page(page);
++		}
++	}
++	if (page == NULL || !PageUptodate(page))
++		goto err;
++	e3b->bd_buddy_page = page;
++	e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
++	mark_page_accessed(page);
++
++	J_ASSERT(e3b->bd_bitmap_page != NULL);
++	J_ASSERT(e3b->bd_buddy_page != NULL);
++
++	return 0;
++
++err:
++	/* 'page' is the one we failed on; it was never stored in e3b,
++	 * so its reference has to be dropped here */
++	if (page)
++		page_cache_release(page);
++	if (e3b->bd_bitmap_page)
++		page_cache_release(e3b->bd_bitmap_page);
++	if (e3b->bd_buddy_page)
++		page_cache_release(e3b->bd_buddy_page);
++	e3b->bd_buddy = NULL;
++	e3b->bd_bitmap = NULL;
++	return -EIO;
++}
++
++/*
++ * Drop the page references recorded in @e3b (the counterpart of
++ * ext3_mb_load_buddy()).  Either page pointer may be NULL.
++ */
++static void ext3_mb_release_desc(struct ext3_buddy *e3b)
++{
++	if (e3b->bd_bitmap_page)
++		page_cache_release(e3b->bd_bitmap_page);
++	if (e3b->bd_buddy_page)
++		page_cache_release(e3b->bd_buddy_page);
++}
++
++
++/* take the per-group bit spinlock kept in bb_state of @group */
++static inline void
++ext3_lock_group(struct super_block *sb, int group)
++{
++	bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT,
++		      &EXT3_SB(sb)->s_group_info[group]->bb_state);
++}
++
++/* release the per-group bit spinlock taken by ext3_lock_group() */
++static inline void
++ext3_unlock_group(struct super_block *sb, int group)
++{
++	bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT,
++		        &EXT3_SB(sb)->s_group_info[group]->bb_state);
++}
++
++/*
++ * Walk the buddy bitmaps from order 1 upwards and return the first order
++ * whose bit covering @block is clear.  Returns 0 when no order from 1 to
++ * bd_blkbits + 1 has a clear bit for the block.
++ */
++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
++{
++	int order = 1;
++	void *bb;
++
++	J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
++	J_ASSERT(block < (1 << (e3b->bd_blkbits + 3)));
++
++	bb = EXT3_MB_BUDDY(e3b);
++	while (order <= e3b->bd_blkbits + 1) {
++		block = block >> 1;
++		if (!mb_test_bit(block, bb)) {
++			/* this block is part of buddy of order 'order' */
++			return order;
++		}
++		/* step to the bitmap of the next order */
++		bb += 1 << (e3b->bd_blkbits - order);
++		order++;
++	}
++	return 0;
++}
++
++/*
++ * Clear @len bits of @bm starting at bit @cur.  Runs that start on a
++ * 32-bit boundary and span at least 32 bits are written as whole words;
++ * everything else goes through mb_clear_bit_atomic().
++ */
++static inline void mb_clear_bits(void *bm, int cur, int len)
++{
++	__u32 *addr;
++
++	len = cur + len;
++	while (cur < len) {
++		if ((cur & 31) == 0 && (len - cur) >= 32) {
++			/* fast path: clear whole word at once */
++			addr = bm + (cur >> 3);
++			*addr = 0;
++			cur += 32;
++			continue;
++		}
++		mb_clear_bit_atomic(cur, bm);
++		cur++;
++	}
++}
++
++/*
++ * Set @len bits of @bm starting at bit @cur; the mirror image of
++ * mb_clear_bits(), using mb_set_bit_atomic() for partial words.
++ */
++static inline void mb_set_bits(void *bm, int cur, int len)
++{
++	__u32 *addr;
++
++	len = cur + len;
++	while (cur < len) {
++		if ((cur & 31) == 0 && (len - cur) >= 32) {
++			/* fast path: set whole word at once */
++			addr = bm + (cur >> 3);
++			*addr = 0xffffffff;
++			cur += 32;
++			continue;
++		}
++		mb_set_bit_atomic(cur, bm);
++		cur++;
++	}
++}
++
++/*
++ * Return @count blocks starting at @first to the in-core bitmap and buddy
++ * of @e3b.  Updates bb_free, bb_first_free, bb_fragments and the per-order
++ * bb_counters, merging pairs of free buddies into the next order as long
++ * as both halves are free.  Each block must currently be marked used in
++ * the bitmap (asserted).  Always returns 0.
++ */
++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count)
++{
++	int block = 0, max = 0, order;
++	void *buddy, *buddy2;
++
++	mb_check_buddy(e3b);
++
++	e3b->bd_info->bb_free += count;
++	if (first < e3b->bd_info->bb_first_free)
++		e3b->bd_info->bb_first_free = first;
++
++	/* let's maintain fragments counter */
++	/* 'block' / 'max' are reused here as "left / right neighbour is free" */
++	if (first != 0)
++		block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b));
++	if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0])
++		max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b));
++	if (block && max)
++		e3b->bd_info->bb_fragments--;
++	else if (!block && !max)
++		e3b->bd_info->bb_fragments++;
++
++	/* let's maintain buddy itself */
++	while (count-- > 0) {
++		block = first++;
++		order = 0;
++
++		J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b)));
++		mb_clear_bit(block, EXT3_MB_BITMAP(e3b));
++		e3b->bd_info->bb_counters[order]++;
++
++		/* start of the buddy */
++		buddy = mb_find_buddy(e3b, order, &max);
++
++		do {
++			block &= ~1UL;
++			if (mb_test_bit(block, buddy) ||
++					mb_test_bit(block + 1, buddy))
++				break;
++
++			/* both the buddies are free, try to coalesce them */
++			buddy2 = mb_find_buddy(e3b, order + 1, &max);
++
++			if (!buddy2)
++				break;
++
++			if (order > 0) {
++				/* for special purposes, we don't set
++				 * free bits in bitmap */
++				mb_set_bit(block, buddy);
++				mb_set_bit(block + 1, buddy);
++			}
++			e3b->bd_info->bb_counters[order]--;
++			e3b->bd_info->bb_counters[order]--;
++
++			block = block >> 1;
++			order++;
++			e3b->bd_info->bb_counters[order]++;
++
++			mb_clear_bit(block, buddy2);
++			buddy = buddy2;
++		} while (1);
++	}
++	mb_check_buddy(e3b);
++
++	return 0;
++}
++
++/*
++ * Describe in @ex the free extent found at @block of the given @order.
++ * If the bit for @block is set in that order's map, @ex is zeroed and 0
++ * is returned.  Otherwise the extent starts with the free chunk holding
++ * the block and is extended chunk by chunk while it is shorter than
++ * @needed and the following block is free.  Returns the extent length.
++ */
++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
++				int needed, struct ext3_free_extent *ex)
++{
++	int next, max, ord;
++	void *buddy;
++
++	J_ASSERT(ex != NULL);
++
++	buddy = mb_find_buddy(e3b, order, &max);
++	J_ASSERT(buddy);
++	J_ASSERT(block < max);
++	if (mb_test_bit(block, buddy)) {
++		ex->fe_len = 0;
++		ex->fe_start = 0;
++		ex->fe_group = 0;
++		return 0;
++	}
++
++	if (likely(order == 0)) {
++		/* find actual order */
++		order = mb_find_order_for_block(e3b, block);
++		block = block >> order;
++	}
++
++	ex->fe_len = 1 << order;
++	ex->fe_start = block << order;
++	ex->fe_group = e3b->bd_group;
++
++	while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) {
++
++		if (block + 1 >= max)
++			break;
++
++		/* first block right after the current chunk */
++		next = (block + 1) * (1 << order);
++		if (mb_test_bit(next, EXT3_MB_BITMAP(e3b)))
++			break;
++
++		ord = mb_find_order_for_block(e3b, next);
++
++		order = ord;
++		block = next >> order;
++		ex->fe_len += 1 << order;
++	}
++
++	J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3)));
++	return ex->fe_len;
++}
++
++/*
++ * Mark the extent @ex as used in the in-core buddy and bitmap of @e3b,
++ * splitting larger free chunks where the extent covers only part of
++ * them.  Updates bb_free, bb_first_free, bb_fragments and bb_counters.
++ *
++ * Returns 0 if no chunk had to be split; otherwise the remaining length
++ * at the first split in the low 16 bits and the order of the split chunk
++ * above them (the caller stores both for the allocation history).
++ */
++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex)
++{
++	int ord, mlen = 0, max = 0, cur;
++	int start = ex->fe_start;
++	int len = ex->fe_len;
++	unsigned ret = 0;
++	int len0 = len;
++	void *buddy;
++
++	mb_check_buddy(e3b);
++
++	e3b->bd_info->bb_free -= len;
++	if (e3b->bd_info->bb_first_free == start)
++		e3b->bd_info->bb_first_free += len;
++
++	/* let's maintain fragments counter */
++	/* 'mlen' / 'max' are reused here as "left / right neighbour is free" */
++	if (start != 0)
++		mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b));
++	if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0])
++		max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b));
++	if (mlen && max)
++		e3b->bd_info->bb_fragments++;
++	else if (!mlen && !max)
++		e3b->bd_info->bb_fragments--;
++
++	/* let's maintain buddy itself */
++	while (len) {
++		ord = mb_find_order_for_block(e3b, start);
++
++		if (((start >> ord) << ord) == start && len >= (1 << ord)) {
++			/* the whole chunk may be allocated at once! */
++			mlen = 1 << ord;
++			buddy = mb_find_buddy(e3b, ord, &max);
++			J_ASSERT((start >> ord) < max);
++			mb_set_bit(start >> ord, buddy);
++			e3b->bd_info->bb_counters[ord]--;
++			start += mlen;
++			len -= mlen;
++			J_ASSERT(len >= 0);
++			continue;
++		}
++
++		/* store for history */
++		if (ret == 0)
++			ret = len | (ord << 16);
++
++		/* we have to split large buddy */
++		J_ASSERT(ord > 0);
++		buddy = mb_find_buddy(e3b, ord, &max);
++		mb_set_bit(start >> ord, buddy);
++		e3b->bd_info->bb_counters[ord]--;
++
++		/* both halves become free chunks one order below */
++		ord--;
++		cur = (start >> ord) & ~1U;
++		buddy = mb_find_buddy(e3b, ord, &max);
++		mb_clear_bit(cur, buddy);
++		mb_clear_bit(cur + 1, buddy);
++		e3b->bd_info->bb_counters[ord]++;
++		e3b->bd_info->bb_counters[ord]++;
++	}
++
++	/* now drop all the bits in bitmap */
++	mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0);
++
++	mb_check_buddy(e3b);
++
++	return ret;
++}
++
++/*
++ * Must be called under group lock!
++ *
++ * Trims the best found extent to the goal length, marks it used in the
++ * buddy and records the result in @ac: status becomes AC_STATUS_FOUND,
++ * ac_tail / ac_buddy receive the two halves of mb_mark_used()'s result.
++ */
++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac,
++					struct ext3_buddy *e3b)
++{
++	unsigned long ret;
++
++	ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
++	ret = mb_mark_used(e3b, &ac->ac_b_ex);
++
++	ac->ac_status = AC_STATUS_FOUND;
++	ac->ac_tail = ret & 0xffff;
++	ac->ac_buddy = ret >> 16;
++}
++
++/*
++ * The routine checks whether found extent is good enough. If it is,
++ * then the extent gets marked used and flag is set to the context
++ * to stop scanning. Otherwise, the extent is compared with the
++ * previous found extent and if new one is better, then it's stored
++ * in the context. Later, the best found extent will be used, if
++ * mballoc can't find good enough extent.
++ *
++ * FIXME: real allocation policy is to be designed yet!
++ */
++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac,
++					struct ext3_free_extent *ex,
++					struct ext3_buddy *e3b)
++{
++	struct ext3_free_extent *bex = &ac->ac_b_ex;
++	struct ext3_free_extent *gex = &ac->ac_g_ex;
++
++	J_ASSERT(ex->fe_len > 0);
++	J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8);
++	J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8);
++
++	ac->ac_found++;
++
++	/*
++	 * The special case - take what you catch first
++	 */
++	if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) {
++		*bex = *ex;
++		ext3_mb_use_best_found(ac, e3b);
++		return;
++	}
++
++	/*
++	 * Let's check whether the chunk is good enough
++	 */
++	if (ex->fe_len == gex->fe_len) {
++		*bex = *ex;
++		ext3_mb_use_best_found(ac, e3b);
++		return;
++	}
++
++	/*
++	 * If this is first found extent, just store it in the context
++	 */
++	if (bex->fe_len == 0) {
++		*bex = *ex;
++		return;
++	}
++
++	/*
++	 * If new found extent is better, store it in the context
++	 */
++	if (bex->fe_len < gex->fe_len) {
++		/* if the request isn't satisfied, any found extent
++		 * larger than previous best one is better */
++		if (ex->fe_len > bex->fe_len)
++			*bex = *ex;
++	} else if (ex->fe_len > gex->fe_len) {
++		/* if the request is satisfied, then we try to find
++		 * an extent that still satisfy the request, but is
++		 * smaller than previous one; without the size check
++		 * the last large extent seen would win instead of
++		 * the tightest fit */
++		if (ex->fe_len < bex->fe_len)
++			*bex = *ex;
++	}
++
++	/*
++	 * Let's scan at least few extents and don't pick up a first one
++	 */
++	if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan)
++		ac->ac_status = AC_STATUS_BREAK;
++
++	/*
++	 * We don't want to scan for a whole year
++	 */
++	if (ac->ac_found > ext3_mb_max_to_scan)
++		ac->ac_status = AC_STATUS_BREAK;
++}
++
++/*
++ * Re-check the best extent remembered in @ac under the group lock and,
++ * if the blocks at its start are still free, take them through
++ * ext3_mb_use_best_found().  When someone else got there first,
++ * ac_status is left unchanged so the caller can see that nothing was
++ * allocated.
++ *
++ * Returns 0, or the error from ext3_mb_load_buddy().
++ */
++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac,
++					struct ext3_buddy *e3b)
++{
++	struct ext3_free_extent ex = ac->ac_b_ex;
++	int group = ex.fe_group, max, err;
++
++	J_ASSERT(ex.fe_len > 0);
++	err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
++	if (err)
++		return err;
++
++	ext3_lock_group(ac->ac_sb, group);
++	max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
++
++	if (max > 0) {
++		ac->ac_b_ex = ex;
++		ext3_mb_use_best_found(ac, e3b);
++	}
++
++	ext3_unlock_group(ac->ac_sb, group);
++
++	ext3_mb_release_desc(e3b);
++
++	return 0;
++}
++
++/*
++ * Try to allocate exactly at the goal position stored in ac_g_ex.  If the
++ * goal block is free, whatever free extent starts there is taken via
++ * ext3_mb_use_best_found(); otherwise @ac is left untouched.
++ *
++ * Returns 0, or the error from ext3_mb_load_buddy().
++ */
++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac,
++				struct ext3_buddy *e3b)
++{
++	int group = ac->ac_g_ex.fe_group, max, err;
++	struct ext3_free_extent ex;
++
++	err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
++	if (err)
++		return err;
++
++	ext3_lock_group(ac->ac_sb, group);
++	max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start,
++				ac->ac_g_ex.fe_len, &ex);
++
++	if (max > 0) {
++		J_ASSERT(ex.fe_len > 0);
++		J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
++		J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
++		ac->ac_found++;
++		ac->ac_b_ex = ex;
++		ext3_mb_use_best_found(ac, e3b);
++	}
++	ext3_unlock_group(ac->ac_sb, group);
++
++	ext3_mb_release_desc(e3b);
++
++	return 0;
++}
++
++/*
++ * The routine scans buddy structures (not bitmap!)
from given order
++ * to max order and tries to find big enough chunk to satisfy the req
++ */
++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac,
++					struct ext3_buddy *e3b)
++{
++	struct super_block *sb = ac->ac_sb;
++	struct ext3_group_info *grp = e3b->bd_info;
++	int order, bit, nbits;
++	void *map;
++
++	J_ASSERT(ac->ac_2order > 0);
++
++	/* look for the lowest order, starting at the requested one,
++	 * that still has a free chunk in this group */
++	order = ac->ac_2order;
++	while (order <= sb->s_blocksize_bits && grp->bb_counters[order] == 0)
++		order++;
++	if (order > sb->s_blocksize_bits)
++		return;
++
++	map = mb_find_buddy(e3b, order, &nbits);
++	if (map == NULL) {
++		printk(KERN_ALERT "looking for wrong order?\n");
++		return;
++	}
++
++	/* the counter says a free chunk exists, so a clear bit must too */
++	bit = mb_find_next_zero_bit(map, nbits, 0);
++	J_ASSERT(bit < nbits);
++
++	ac->ac_found++;
++
++	/* describe the chunk in blocks and take it right away */
++	ac->ac_b_ex.fe_len = 1 << order;
++	ac->ac_b_ex.fe_start = bit << order;
++	ac->ac_b_ex.fe_group = e3b->bd_group;
++
++	ext3_mb_use_best_found(ac, e3b);
++	J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len);
++
++	if (unlikely(ext3_mb_stats))
++		atomic_inc(&EXT3_SB(sb)->s_bal_2orders);
++}
++
++/*
++ * The routine scans the group and measures all found extents.
++ * In order to optimize scanning, caller must pass number of
++ * free blocks in the group, so the routine can know upper limit.
++ */
++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac,
++					struct ext3_buddy *e3b)
++{
++	struct super_block *sb = ac->ac_sb;
++	void *bitmap = EXT3_MB_BITMAP(e3b);
++	struct ext3_free_extent ex;
++	int i, free;
++
++	/* the free-block count bounds how far we have to look */
++	free = e3b->bd_info->bb_free;
++	J_ASSERT(free > 0);
++
++	i = e3b->bd_info->bb_first_free;
++
++	while (free && ac->ac_status == AC_STATUS_CONTINUE) {
++		i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i);
++		if (i >= sb->s_blocksize * 8) {
++			J_ASSERT(free == 0);
++			break;
++		}
++
++		mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex);
++		J_ASSERT(ex.fe_len > 0);
++		J_ASSERT(free >= ex.fe_len);
++
++		ext3_mb_measure_extent(ac, &ex, e3b);
++
++		i += ex.fe_len;
++		free -= ex.fe_len;
++	}
++}
++
++/*
++ * Decide whether @group is worth scanning under criterion @cr:
++ *   0 - the group has a free chunk of order >= ac_2order;
++ *   1 - the average free fragment is at least as long as the goal;
++ *   2 - the group has at least as many free blocks as the goal;
++ *   3 - any group with free blocks will do.
++ * Groups with no free blocks or no fragments are never good.
++ *
++ * Returns 1 if the group qualifies, 0 otherwise.
++ */
++static int ext3_mb_good_group(struct ext3_allocation_context *ac,
++				int group, int cr)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
++	struct ext3_group_info *grp = sbi->s_group_info[group];
++	unsigned free, fragments, i, bits;
++
++	J_ASSERT(cr >= 0 && cr < 4);
++	J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp));
++
++	free = grp->bb_free;
++	fragments = grp->bb_fragments;
++	if (free == 0)
++		return 0;
++	if (fragments == 0)
++		return 0;
++
++	/* each criterion is judged on its own: without the breaks every
++	 * case fell through to case 3 and all groups looked good */
++	switch (cr) {
++		case 0:
++			J_ASSERT(ac->ac_2order != 0);
++			bits = ac->ac_sb->s_blocksize_bits + 1;
++			for (i = ac->ac_2order; i < bits; i++)
++				if (grp->bb_counters[i] > 0)
++					return 1;
++			break;
++		case 1:
++			if ((free / fragments) >= ac->ac_g_ex.fe_len)
++				return 1;
++			break;
++		case 2:
++			if (free >= ac->ac_g_ex.fe_len)
++				return 1;
++			break;
++		case 3:
++			return 1;
++		default:
++			BUG();
++	}
++
++	return 0;
++}
++
++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode,
++		       unsigned long goal, int *len, int flags, int *errp)
++{
++	struct buffer_head *bitmap_bh = NULL;
++	struct ext3_allocation_context ac;
++	int i, group, block, cr, err = 0;
++	struct ext3_group_desc *gdp;
++	struct ext3_super_block *es;
++	struct buffer_head *gdp_bh;
++	struct ext3_sb_info *sbi;
++	struct super_block *sb;
++	struct
ext3_buddy e3b; ++ ++ J_ASSERT(len != NULL); ++ J_ASSERT(*len > 0); ++ ++ sb = inode->i_sb; ++ if (!sb) { ++ printk("ext3_mb_new_nblocks: nonexistent device"); ++ return 0; ++ } ++ ++ if (!test_opt(sb, MBALLOC)) { ++ static int ext3_mballoc_warning = 0; ++ if (ext3_mballoc_warning == 0) { ++ printk(KERN_ERR "EXT3-fs: multiblock request with " ++ "mballoc disabled!\n"); ++ ext3_mballoc_warning++; ++ } ++ *len = 1; ++ err = ext3_new_block_old(handle, inode, goal, errp); ++ return err; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ ++ /* ++ * We can't allocate > group size ++ */ ++ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) ++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* someone asks for non-reserved blocks */ ++ BUG_ON(*len > 1); ++ err = ext3_mb_reserve_blocks(sb, 1); ++ if (err) { ++ *errp = err; ++ return 0; ++ } ++ } ++ ++ /* ++ * Check quota for allocation of this blocks. ++ */ ++ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) ++ *len -= 1; ++ if (*len == 0) { ++ *errp = -EDQUOT; ++ block = 0; ++ goto out; ++ } ++ ++ /* start searching from the goal */ ++ if (goal < le32_to_cpu(es->s_first_data_block) || ++ goal >= le32_to_cpu(es->s_blocks_count)) ++ goal = le32_to_cpu(es->s_first_data_block); ++ group = (goal - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ block = ((goal - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ /* set up allocation goals */ ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_groups_scanned = 0; ++ ac.ac_ex_scanned = 0; ++ ac.ac_found = 0; ++ ac.ac_sb = inode->i_sb; ++ ac.ac_g_ex.fe_group = group; ++ ac.ac_g_ex.fe_start = block; ++ ac.ac_g_ex.fe_len = *len; ++ ac.ac_flags = flags; ++ ac.ac_2order = 0; ++ ac.ac_criteria = 0; ++ ++ /* probably, the request is for 2^8+ blocks (1/2/3/... 
MB) */ ++ i = ffs(*len); ++ if (i >= 8) { ++ i--; ++ if ((*len & (~(1 << i))) == 0) ++ ac.ac_2order = i; ++ } ++ ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ ++ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; ++ } ++ ++ /* Let's just scan groups to find more-less suitable blocks */ ++ cr = ac.ac_2order ? 0 : 1; ++repeat: ++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ ac.ac_criteria = cr; ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { ++ if (group == EXT3_SB(sb)->s_groups_count) ++ group = 0; ++ ++ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) { ++ /* we need full data about the group ++ * to make a good selection */ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ext3_mb_release_desc(&e3b); ++ } ++ ++ /* check is group good for our criteries */ ++ if (!ext3_mb_good_group(&ac, group, cr)) ++ continue; ++ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ++ ext3_lock_group(sb, group); ++ if (!ext3_mb_good_group(&ac, group, cr)) { ++ /* someone did allocation from this group */ ++ ext3_unlock_group(sb, group); ++ ext3_mb_release_desc(&e3b); ++ continue; ++ } ++ ++ ac.ac_groups_scanned++; ++ if (cr == 0) ++ ext3_mb_simple_scan_group(&ac, &e3b); ++ else ++ ext3_mb_complex_scan_group(&ac, &e3b); ++ ++ ext3_unlock_group(sb, group); ++ ++ ext3_mb_release_desc(&e3b); ++ ++ if (ac.ac_status != AC_STATUS_CONTINUE) ++ break; ++ } ++ } ++ ++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ /* ++ * We've been searching too long. 
Let's try to allocate ++ * the best chunk we've found so far ++ */ ++ ++ /*if (ac.ac_found > ext3_mb_max_to_scan) ++ printk(KERN_ERR "EXT3-fs: too long searching at " ++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, ++ ac.ac_g_ex.fe_len);*/ ++ ext3_mb_try_best_found(&ac, &e3b); ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * Someone more lucky has already allocated it. ++ * The only thing we can do is just take first ++ * found block(s) ++ */ ++ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n"); ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_flags |= EXT3_MB_HINT_FIRST; ++ cr = 3; ++ goto repeat; ++ } ++ } ++ ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * We aren't lucky definitely ++ */ ++ DQUOT_FREE_BLOCK(inode, *len); ++ *errp = -ENOSPC; ++ block = 0; ++#if 1 ++ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n", ++ ac.ac_status, ac.ac_flags); ++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n", ++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, ++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); ++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", ++ sbi->s_blocks_reserved, ac.ac_found); ++ printk("EXT3-fs: groups: "); ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) ++ printk("%d: %d ", i, ++ sbi->s_group_info[i]->bb_free); ++ printk("\n"); ++#endif ++ goto out; ++ } ++ ++found: ++ J_ASSERT(ac.ac_b_ex.fe_len > 0); ++ ++ /* good news - free block(s) have been found. 
now it's time ++ * to mark block(s) in good old journaled bitmap */ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ /* we made a desicion, now mark found blocks in good old ++ * bitmap to be journaled */ ++ ++ ext3_debug("using block group %d(%d)\n", ++ ac.ac_b_group.group, gdp->bg_free_blocks_count); ++ ++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); ++ if (!bitmap_bh) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) { ++ *errp = err; ++ goto out_err; ++ } ++ ++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); ++ if (!gdp) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ if (block == le32_to_cpu(gdp->bg_block_bitmap) || ++ block == le32_to_cpu(gdp->bg_inode_bitmap) || ++ in_range(block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error(sb, "ext3_new_block", ++ "Allocating block in system zone - " ++ "block = %u", block); ++#ifdef AGGRESSIVE_CHECK ++ for (i = 0; i < ac.ac_b_ex.fe_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); ++#endif ++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); ++ ++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) ++ - ac.ac_b_ex.fe_len); ++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); ++ ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ if (err) ++ goto out_err; ++ err = ext3_journal_dirty_metadata(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ sb->s_dirt = 1; ++ *errp = 0; ++ 
brelse(bitmap_bh); ++ ++ /* drop non-allocated, but dquote'd blocks */ ++ J_ASSERT(*len >= ac.ac_b_ex.fe_len); ++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); ++ ++ *len = ac.ac_b_ex.fe_len; ++ J_ASSERT(*len > 0); ++ J_ASSERT(block != 0); ++ goto out; ++ ++out_err: ++ /* if we've already allocated something, roll it back */ ++ if (ac.ac_status == AC_STATUS_FOUND) { ++ /* FIXME: free blocks here */ ++ } ++ ++ DQUOT_FREE_BLOCK(inode, *len); ++ brelse(bitmap_bh); ++ *errp = err; ++ block = 0; ++out: ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* block wasn't reserved before and we reserved it ++ * at the beginning of allocation. it doesn't matter ++ * whether we allocated anything or we failed: time ++ * to release reservation. NOTE: because I expect ++ * any multiblock request from delayed allocation ++ * path only, here is single block always */ ++ ext3_mb_release_blocks(sb, 1); ++ } ++ ++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { ++ atomic_inc(&sbi->s_bal_reqs); ++ atomic_add(*len, &sbi->s_bal_allocated); ++ if (*len >= ac.ac_g_ex.fe_len) ++ atomic_inc(&sbi->s_bal_success); ++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); ++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && ++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) ++ atomic_inc(&sbi->s_bal_goals); ++ if (ac.ac_found > ext3_mb_max_to_scan) ++ atomic_inc(&sbi->s_bal_breaks); ++ } ++ ++ ext3_mb_store_history(sb, &ac); ++ ++ return block; ++} ++EXPORT_SYMBOL(ext3_mb_new_blocks); ++ ++#ifdef EXT3_MB_HISTORY ++struct ext3_mb_proc_session { ++ struct ext3_mb_history *history; ++ struct super_block *sb; ++ int start; ++ int max; ++}; ++ ++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, ++ struct ext3_mb_history *hs, ++ int first) ++{ ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (!first && hs == s->history + s->start) ++ return NULL; ++ while (hs->goal.fe_len == 0) { ++ hs++; ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (hs == s->history + 
s->start) ++ return NULL; ++ } ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs; ++ int l = *pos; ++ ++ if (l == 0) ++ return SEQ_START_TOKEN; ++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ if (!hs) ++ return NULL; ++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs = v; ++ ++ ++*pos; ++ if (v == SEQ_START_TOKEN) ++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ else ++ return ext3_mb_history_skip_empty(s, ++hs, 0); ++} ++ ++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) ++{ ++ struct ext3_mb_history *hs = v; ++ char buf[20], buf2[20]; ++ ++ if (v == SEQ_START_TOKEN) { ++ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "goal", "result", "found", "grps", "cr", "merge", ++ "tail", "broken"); ++ return 0; ++ } ++ ++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, ++ hs->goal.fe_start, hs->goal.fe_len); ++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, ++ hs->result.fe_start, hs->result.fe_len); ++ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, ++ buf2, hs->found, hs->groups, hs->cr, ++ hs->merged ? "M" : "", hs->tail, ++ hs->buddy ? 
1 << hs->buddy : 0); ++ return 0; ++} ++ ++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_history_ops = { ++ .start = ext3_mb_seq_history_start, ++ .next = ext3_mb_seq_history_next, ++ .stop = ext3_mb_seq_history_stop, ++ .show = ext3_mb_seq_history_show, ++}; ++ ++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_proc_session *s; ++ int rc, size; ++ ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (s == NULL) ++ return -EIO; ++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; ++ s->history = kmalloc(size, GFP_KERNEL); ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(s->history, sbi->s_mb_history, size); ++ s->max = sbi->s_mb_history_max; ++ s->start = sbi->s_mb_history_cur % s->max; ++ spin_unlock(&sbi->s_mb_history_lock); ++ ++ rc = seq_open(file, &ext3_mb_seq_history_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = s; ++ } else { ++ kfree(s->history); ++ kfree(s); ++ } ++ return rc; ++ ++} ++ ++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) ++{ ++ struct seq_file *seq = (struct seq_file *)file->private_data; ++ struct ext3_mb_proc_session *s = seq->private; ++ kfree(s->history); ++ kfree(s); ++ return seq_release(inode, file); ++} ++ ++static struct file_operations ext3_mb_seq_history_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, ++}; ++ ++static void ext3_mb_history_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_history", sbi->s_mb_proc); ++ remove_proc_entry(name, proc_root_ext3); ++ ++ if 
(sbi->s_mb_history) ++ kfree(sbi->s_mb_history); ++} ++ ++static void ext3_mb_history_init(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ int i; ++ ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); ++ if (sbi->s_mb_proc != NULL) { ++ struct proc_dir_entry *p; ++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_history_fops; ++ p->data = sb; ++ } ++ } ++ ++ sbi->s_mb_history_max = 1000; ++ sbi->s_mb_history_cur = 0; ++ spin_lock_init(&sbi->s_mb_history_lock); ++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); ++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_history != NULL) /* if we can't allocate history, then we simply won't use it */ ++ memset(sbi->s_mb_history, 0, i); ++} ++ ++static void ++ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_history h; ++ ++ if (likely(sbi->s_mb_history == NULL)) ++ return; ++ ++ h.goal = ac->ac_g_ex; ++ h.result = ac->ac_b_ex; ++ h.found = ac->ac_found; ++ h.cr = ac->ac_criteria; ++ h.groups = ac->ac_groups_scanned; ++ h.tail = ac->ac_tail; ++ h.buddy = ac->ac_buddy; ++ h.merged = 0; ++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) ++ h.merged = 1; ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); ++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) ++ sbi->s_mb_history_cur = 0; ++ spin_unlock(&sbi->s_mb_history_lock); ++} ++ ++#else ++#define ext3_mb_history_release(sb) ++#define ext3_mb_history_init(sb) ++#endif ++ ++int ext3_mb_init_backend(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i, len; ++ ++ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; ++ sbi->s_group_info = kmalloc(len,
GFP_KERNEL); ++ if (sbi->s_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n"); ++ return -ENOMEM; ++ } ++ memset(sbi->s_group_info, 0, len); ++ ++ sbi->s_buddy_cache = new_inode(sb); ++ if (sbi->s_buddy_cache == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); ++ kfree(sbi->s_group_info); ++ return -ENOMEM; ++ } ++ ++ /* ++ * calculate needed size. if change bb_counters size, ++ * don't forget about ext3_mb_generate_buddy() ++ */ ++ len = sizeof(struct ext3_group_info); ++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ struct ext3_group_desc * desc; ++ ++ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_group_info[i] == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ goto err_out; ++ } ++ desc = ext3_get_group_desc(sb, i, NULL); ++ if (desc == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant read descriptor %u\n", i); ++ goto err_out; ++ } ++ memset(sbi->s_group_info[i], 0, len); ++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, ++ &sbi->s_group_info[i]->bb_state); ++ sbi->s_group_info[i]->bb_free = ++ le16_to_cpu(desc->bg_free_blocks_count); ++ } ++ ++ return 0; ++ ++err_out: ++ while (i >= 0) /* start at group i: it is allocated when the descriptor read failed, NULL otherwise */ ++ kfree(sbi->s_group_info[i--]); ++ iput(sbi->s_buddy_cache); ++ kfree(sbi->s_group_info); /* also release the pointer array itself */ ++ return -ENOMEM; ++} ++ ++int ext3_mb_init(struct super_block *sb, int needs_recovery) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *root = sb->s_root->d_inode; ++ unsigned i, offset, max; ++ struct dentry *dentry; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); ++ ++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_offsets == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ return -ENOMEM; ++ } ++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_maxs == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); /* s_mb_maxs is NULL here; free the earlier allocation */ ++ return -ENOMEM; ++ } ++ ++ /* order 0 is
regular bitmap */ ++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; ++ sbi->s_mb_offsets[0] = 0; ++ ++ i = 1; ++ offset = 0; ++ max = sb->s_blocksize << 2; ++ do { ++ sbi->s_mb_offsets[i] = offset; ++ sbi->s_mb_maxs[i] = max; ++ offset += 1 << (sb->s_blocksize_bits - i); ++ max = max >> 1; ++ i++; ++ } while (i <= sb->s_blocksize_bits + 1); ++ ++ ++ /* init file for buddy data */ ++ if ((i = ext3_mb_init_backend(sb))) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return i; ++ } ++ ++ spin_lock_init(&sbi->s_reserve_lock); ++ spin_lock_init(&sbi->s_md_lock); ++ INIT_LIST_HEAD(&sbi->s_active_transaction); ++ INIT_LIST_HEAD(&sbi->s_closed_transaction); ++ INIT_LIST_HEAD(&sbi->s_committed_transaction); ++ spin_lock_init(&sbi->s_bal_lock); ++ ++ /* remove old on-disk buddy file */ ++ down(&root->i_sem); ++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy")); ++ if (dentry->d_inode != NULL) { ++ i = vfs_unlink(root, dentry); ++ if (i != 0) ++ printk("EXT3-fs: can't remove .buddy file: %d\n", i); ++ } ++ dput(dentry); ++ up(&root->i_sem); ++ ++ ext3_mb_history_init(sb); ++ ++ printk("EXT3-fs: mballoc enabled\n"); ++ return 0; ++} ++ ++int ext3_mb_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* release freed, non-committed blocks */ ++ spin_lock(&sbi->s_md_lock); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_committed_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ ext3_mb_free_committed_blocks(sb); ++ ++ if (sbi->s_group_info) { ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_group_info[i] == NULL) ++ continue; ++ kfree(sbi->s_group_info[i]); ++ } ++ kfree(sbi->s_group_info); ++ } ++ if (sbi->s_mb_offsets) ++ kfree(sbi->s_mb_offsets); ++ if (sbi->s_mb_maxs) ++ kfree(sbi->s_mb_maxs); ++ if 
(sbi->s_buddy_cache) ++ iput(sbi->s_buddy_cache); ++ if (sbi->s_blocks_reserved) ++ printk("ext3-fs: %ld blocks being reserved at umount!\n", ++ sbi->s_blocks_reserved); ++ if (ext3_mb_stats) { ++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", ++ atomic_read(&sbi->s_bal_allocated), ++ atomic_read(&sbi->s_bal_reqs), ++ atomic_read(&sbi->s_bal_success)); ++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " ++ "%u 2^N hits, %u breaks\n", ++ atomic_read(&sbi->s_bal_ex_scanned), ++ atomic_read(&sbi->s_bal_goals), ++ atomic_read(&sbi->s_bal_2orders), ++ atomic_read(&sbi->s_bal_breaks)); ++ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", ++ sbi->s_mb_buddies_generated++, ++ sbi->s_mb_generation_time); ++ } ++ ++ ext3_mb_history_release(sb); ++ ++ return 0; ++} ++ ++void ext3_mb_free_committed_blocks(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int err, i, count = 0, count2 = 0; ++ struct ext3_free_metadata *md; ++ struct ext3_buddy e3b; ++ ++ if (list_empty(&sbi->s_committed_transaction)) ++ return; ++ ++ /* there is committed blocks to be freed yet */ ++ do { ++ /* get next array of blocks */ ++ md = NULL; ++ spin_lock(&sbi->s_md_lock); ++ if (!list_empty(&sbi->s_committed_transaction)) { ++ md = list_entry(sbi->s_committed_transaction.next, ++ struct ext3_free_metadata, list); ++ list_del(&md->list); ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ if (md == NULL) ++ break; ++ ++ mb_debug("gonna free %u blocks in group %u (0x%p):", ++ md->num, md->group, md); ++ ++ err = ext3_mb_load_buddy(sb, md->group, &e3b); ++ BUG_ON(err != 0); ++ ++ /* there are blocks to put in buddy to make them really free */ ++ count += md->num; ++ count2++; ++ ext3_lock_group(sb, md->group); ++ for (i = 0; i < md->num; i++) { ++ mb_debug(" %u", md->blocks[i]); ++ mb_free_blocks(&e3b, md->blocks[i], 1); ++ } ++ mb_debug("\n"); ++ ext3_unlock_group(sb, md->group); ++ ++ /* balance refcounts from ext3_mb_free_metadata() */ ++ 
page_cache_release(e3b.bd_buddy_page); ++ page_cache_release(e3b.bd_bitmap_page); ++ ++ kfree(md); ++ ext3_mb_release_desc(&e3b); ++ ++ } while (md); ++ mb_debug("freed %u blocks in %u structures\n", count, count2); ++} ++ ++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ if (sbi->s_last_transaction == handle->h_transaction->t_tid) ++ return; ++ ++ /* new transaction! time to close last one and free blocks for ++ * committed transaction. we know that only transaction can be ++ * active, so previos transaction can be being logged and we ++ * know that transaction before previous is known to be already ++ * logged. this means that now we may free blocks freed in all ++ * transactions before previous one. hope I'm clear enough ... */ ++ ++ spin_lock(&sbi->s_md_lock); ++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { ++ mb_debug("new transaction %lu, old %lu\n", ++ (unsigned long) handle->h_transaction->t_tid, ++ (unsigned long) sbi->s_last_transaction); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_closed_transaction); ++ sbi->s_last_transaction = handle->h_transaction->t_tid; ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ ext3_mb_free_committed_blocks(sb); ++} ++ ++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, ++ int group, int block, int count) ++{ ++ struct ext3_group_info *db = e3b->bd_info; ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_free_metadata *md; ++ int i; ++ ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ ext3_lock_group(sb, group); ++ for (i = 0; i < count; i++) { ++ md = db->bb_md_cur; ++ if (md && db->bb_tid != handle->h_transaction->t_tid) { ++ db->bb_md_cur = NULL; ++ md = NULL; ++ } ++ ++ if (md == NULL) { ++ ext3_unlock_group(sb, group); ++ md = 
kmalloc(sizeof(*md), GFP_KERNEL); ++ if (md == NULL) ++ return -ENOMEM; ++ md->num = 0; ++ md->group = group; ++ ++ ext3_lock_group(sb, group); ++ if (db->bb_md_cur == NULL) { ++ spin_lock(&sbi->s_md_lock); ++ list_add(&md->list, &sbi->s_active_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ /* protect buddy cache from being freed, ++ * otherwise we'll refresh it from ++ * on-disk bitmap and lose not-yet-available ++ * blocks */ ++ page_cache_get(e3b->bd_buddy_page); ++ page_cache_get(e3b->bd_bitmap_page); ++ db->bb_md_cur = md; ++ db->bb_tid = handle->h_transaction->t_tid; ++ mb_debug("new md 0x%p for group %u\n", ++ md, md->group); ++ } else { ++ kfree(md); ++ md = db->bb_md_cur; ++ } ++ } ++ ++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); ++ md->blocks[md->num] = block + i; ++ md->num++; ++ if (md->num == EXT3_BB_MAX_BLOCKS) { ++ /* no more space, put full container on a sb's list */ ++ db->bb_md_cur = NULL; ++ } ++ } ++ ext3_unlock_group(sb, group); ++ return 0; ++} ++ ++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, ++ unsigned long block, unsigned long count, ++ int metadata, int *freed) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ unsigned long bit, overflow; ++ struct buffer_head *gd_bh; ++ unsigned long block_group; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ int err = 0, ret; ++ ++ *freed = 0; ++ sb = inode->i_sb; ++ if (!sb) { ++ printk ("ext3_free_blocks: nonexistent device"); ++ return; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ if (block < le32_to_cpu(es->s_first_data_block) || ++ block + count < block || ++ block + count > le32_to_cpu(es->s_blocks_count)) { ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks not in datazone - " ++ "block = %lu, count = %lu", block, count); ++ goto error_return; ++ } ++ ++ ext3_debug("freeing block %lu\n", block); ++ ++do_more: 
++ overflow = 0; ++ block_group = (block - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ bit = (block - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb); ++ /* ++ * Check to see if we are freeing blocks across a group ++ * boundary. ++ */ ++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { ++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); ++ count -= overflow; ++ } ++ brelse(bitmap_bh); ++ bitmap_bh = read_block_bitmap(sb, block_group); ++ if (!bitmap_bh) ++ goto error_return; ++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); ++ if (!gdp) ++ goto error_return; ++ ++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || ++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || ++ in_range (block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group) || ++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks in system zones - " ++ "Block = %lu, count = %lu", ++ block, count); ++ ++ BUFFER_TRACE(bitmap_bh, "getting write access"); ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) ++ goto error_return; ++ ++ /* ++ * We are about to modify some metadata. 
Call the journal APIs ++ * to unshare ->b_data if a currently-committing transaction is ++ * using it ++ */ ++ BUFFER_TRACE(gd_bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, gd_bh); ++ if (err) ++ goto error_return; ++ ++ err = ext3_mb_load_buddy(sb, block_group, &e3b); ++ if (err) ++ goto error_return; ++ ++#ifdef AGGRESSIVE_CHECK ++ { ++ int i; ++ for (i = 0; i < count; i++) ++ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); ++ } ++#endif ++ mb_clear_bits(bitmap_bh->b_data, bit, count); ++ ++ /* We dirtied the bitmap block */ ++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ ++ if (metadata) { ++ /* blocks being freed are metadata. these blocks shouldn't ++ * be used until this transaction is committed */ ++ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); ++ } else { ++ ext3_lock_group(sb, block_group); ++ mb_free_blocks(&e3b, bit, count); ++ ext3_unlock_group(sb, block_group); ++ } ++ ++ spin_lock(sb_bgl_lock(sbi, block_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); ++ spin_unlock(sb_bgl_lock(sbi, block_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, count); ++ ++ ext3_mb_release_desc(&e3b); ++ ++ *freed = count; ++ ++ /* And the group descriptor block */ ++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ++ ret = ext3_journal_dirty_metadata(handle, gd_bh); ++ if (!err) err = ret; ++ ++ if (overflow && !err) { ++ block += count; ++ count = overflow; ++ goto do_more; ++ } ++ sb->s_dirt = 1; ++error_return: ++ brelse(bitmap_bh); ++ ext3_std_error(sb, err); ++ return; ++} ++ ++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int free, ret = -ENOSPC; ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); ++ if (blocks <= free - sbi->s_blocks_reserved) { 
++ sbi->s_blocks_reserved += blocks; ++ ret = 0; ++ } ++ spin_unlock(&sbi->s_reserve_lock); ++ return ret; ++} ++ ++void ext3_mb_release_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ sbi->s_blocks_reserved -= blocks; ++ WARN_ON(sbi->s_blocks_reserved < 0); ++ if (sbi->s_blocks_reserved < 0) ++ sbi->s_blocks_reserved = 0; ++ spin_unlock(&sbi->s_reserve_lock); ++} ++ ++int ext3_new_block(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *errp) ++{ ++ int ret, len; ++ ++ if (!test_opt(inode->i_sb, MBALLOC)) { ++ ret = ext3_new_block_old(handle, inode, goal, errp); ++ goto out; ++ } ++ len = 1; ++ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); ++out: ++ return ret; ++} ++ ++ ++void ext3_free_blocks(handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count, int metadata) ++{ ++ struct super_block *sb; ++ int freed; ++ ++ sb = inode->i_sb; ++ if (!test_opt(sb, MBALLOC)) ++ ext3_free_blocks_sb(handle, sb, block, count, &freed); ++ else ++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); ++ if (freed) ++ DQUOT_FREE_BLOCK(inode, freed); ++ return; ++} ++ ++#define EXT3_ROOT "ext3" ++#define EXT3_MB_STATS_NAME "mb_stats" ++#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" ++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" ++ ++static int ext3_mb_stats_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_stats); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_stats_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", ++ EXT3_MB_STATS_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if 
(copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ str[count] = '\0'; /* count < sizeof(str) was checked above; simple_strtol() needs a terminated string */ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); ++ return count; ++} ++ ++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", ++ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ str[count] = '\0'; /* count < sizeof(str) was checked above; simple_strtol() needs a terminated string */ ++ /* Only positive values are accepted; anything else is rejected with -ERANGE */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_max_to_scan = value; ++ ++ return count; ++} ++ ++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ str[count] = '\0'; /* count < sizeof(str) was checked above; simple_strtol() needs a terminated string */ ++ /* Only positive values are accepted; anything else is rejected with -ERANGE */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_min_to_scan = value; ++ ++ return count; ++} ++ ++int __init init_ext3_proc(void) ++{ ++ struct proc_dir_entry
*proc_ext3_mb_stats; ++ struct proc_dir_entry *proc_ext3_mb_max_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_min_to_scan; ++ ++ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); ++ if (proc_root_ext3 == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT); ++ return -EIO; ++ } ++ ++ /* Initialize EXT3_MB_STATS_NAME */ ++ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_stats == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_STATS_NAME); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_stats->data = NULL; ++ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; ++ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; ++ ++ /* Initialize EXT3_MAX_TO_SCAN_NAME */ ++ proc_ext3_mb_max_to_scan = create_proc_entry( ++ EXT3_MB_MAX_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_max_to_scan == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_MAX_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_max_to_scan->data = NULL; ++ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; ++ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; ++ ++ /* Initialize EXT3_MIN_TO_SCAN_NAME */ ++ proc_ext3_mb_min_to_scan = create_proc_entry( ++ EXT3_MB_MIN_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_min_to_scan == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_MIN_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_min_to_scan->data = NULL; ++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; ++ 
proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; ++ ++ return 0; ++} ++ ++void exit_ext3_proc(void) ++{ ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++} +Index: linux-2.6.12.6/fs/ext3/Makefile +=================================================================== +--- linux-2.6.12.6.orig/fs/ext3/Makefile 2005-12-17 02:17:16.000000000 +0300 ++++ linux-2.6.12.6/fs/ext3/Makefile 2005-12-17 02:21:21.000000000 +0300 +@@ -6,7 +6,7 @@ + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o ++ extents.o mballoc.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch index b3d9f73..d12c678 100644 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch @@ -1,71 +1,8 @@ -Index: linux-2.6.9/include/linux/ext3_fs_sb.h +Index: linux-2.6.9-full/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.9.orig/include/linux/ext3_fs_sb.h 2005-10-14 09:10:05.000000000 +0400 -+++ linux-2.6.9/include/linux/ext3_fs_sb.h 2005-10-14 09:10:13.000000000 +0400 -@@ -23,10 +23,30 @@ - #define EXT_INCLUDE - #include - #include -+#include - #endif - #endif - #include - -+#define EXT3_BB_MAX_BLOCKS 30 -+struct ext3_free_metadata { -+ unsigned short group; -+ unsigned short num; -+ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; -+ struct list_head list; -+}; -+ -+struct ext3_buddy_group_blocks { -+ __u32 bb_bitmap; -+ __u32 bb_buddy; -+ spinlock_t bb_lock; -+ unsigned long bb_tid; -+ 
struct ext3_free_metadata *bb_md_cur; -+ unsigned short bb_first_free; -+ unsigned short bb_free; -+ unsigned bb_counters[]; -+}; -+ - /* - * third extended-fs super-block data in memory - */ -@@ -81,6 +101,27 @@ - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ - #endif -+ -+ /* for buddy allocator */ -+ struct ext3_buddy_group_blocks **s_buddy_blocks; -+ struct inode *s_buddy; -+ long s_blocks_reserved; -+ spinlock_t s_reserve_lock; -+ struct list_head s_active_transaction; -+ struct list_head s_closed_transaction; -+ struct list_head s_committed_transaction; -+ spinlock_t s_md_lock; -+ tid_t s_last_transaction; -+ int s_mb_factor; -+ -+ /* stats for buddy allocator */ -+ spinlock_t s_bal_lock; -+ unsigned long s_bal_reqs; /* number of reqs with len > 1 */ -+ unsigned long s_bal_success; /* we found long enough chunks */ -+ unsigned long s_bal_allocated; /* in blocks */ -+ unsigned long s_bal_ex_scanned; /* total extents scanned */ -+ unsigned long s_bal_goals; /* goal hits */ -+ unsigned long s_bal_breaks; /* too long searches */ - }; - - #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.6.9/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.9.orig/include/linux/ext3_fs.h 2005-10-14 09:10:12.000000000 +0400 -+++ linux-2.6.9/include/linux/ext3_fs.h 2005-10-14 09:10:31.000000000 +0400 -@@ -57,6 +57,14 @@ +--- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2005-12-16 23:16:41.000000000 +0300 ++++ linux-2.6.9-full/include/linux/ext3_fs.h 2005-12-16 23:16:42.000000000 +0300 +@@ -57,6 +57,14 @@ struct statfs; #define ext3_debug(f, a...) 
do {} while (0) #endif @@ -80,15 +17,15 @@ Index: linux-2.6.9/include/linux/ext3_fs.h /* * Special inodes numbers */ -@@ -365,6 +373,7 @@ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */ +@@ -365,6 +373,7 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef clear_opt -@@ -726,7 +735,7 @@ +@@ -726,7 +735,7 @@ extern int ext3_bg_has_super(struct supe extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, @@ -97,12 +34,11 @@ Index: linux-2.6.9/include/linux/ext3_fs.h extern void ext3_free_blocks_sb (handle_t *, struct super_block *, unsigned long, unsigned long, int *); extern unsigned long ext3_count_free_blocks (struct super_block *); -@@ -857,6 +866,44 @@ +@@ -857,6 +866,17 @@ extern void ext3_extents_initialize_bloc extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); +/* mballoc.c */ -+extern long ext3_mb_aggressive; +extern long ext3_mb_stats; +extern long ext3_mb_max_to_scan; +extern int ext3_mb_init(struct super_block *, int); @@ -110,90 +46,146 @@ Index: linux-2.6.9/include/linux/ext3_fs.h +extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); +extern int ext3_mb_reserve_blocks(struct super_block *, int); +extern void ext3_mb_release_blocks(struct super_block *, int); -+ -+/* writeback.c */ 
-+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); -+extern int ext3_wb_prepare_write(struct file *file, struct page *page, -+ unsigned from, unsigned to); -+extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); -+extern int ext3_wb_writepage(struct page *, struct writeback_control *); -+extern int ext3_wb_invalidatepage(struct page *, unsigned long); -+extern int ext3_wb_releasepage(struct page *, int); -+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); -+extern void ext3_wb_init(struct super_block *); -+extern void ext3_wb_release(struct super_block *); -+ -+/* writeback.c */ -+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); -+extern int ext3_wb_prepare_write(struct file *file, struct page *page, -+ unsigned from, unsigned to); -+extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); -+extern int ext3_wb_writepage(struct page *, struct writeback_control *); -+extern int ext3_wb_invalidatepage(struct page *, unsigned long); -+extern int ext3_wb_releasepage(struct page *, int); -+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); -+extern void ext3_wb_init(struct super_block *); -+extern void ext3_wb_release(struct super_block *); -+ -+/* proc.c */ -+extern int init_ext3_proc(void); -+extern void exit_ext3_proc(void); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); + #endif /* __KERNEL__ */ /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ -Index: linux-2.6.9/fs/ext3/balloc.c +Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.9.orig/fs/ext3/balloc.c 2005-05-13 21:39:03.000000000 +0400 -+++ linux-2.6.9/fs/ext3/balloc.c 2005-10-14 09:10:13.000000000 +0400 -@@ -79,7 +79,7 @@ - * - * Return buffer_head on success or NULL in case of failure. - */ --static struct buffer_head * -+struct buffer_head * - read_block_bitmap(struct super_block *sb, unsigned int block_group) - { - struct ext3_group_desc * desc; -@@ -450,24 +450,6 @@ - return; - } +--- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2005-12-16 23:16:39.000000000 +0300 ++++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2005-12-16 23:16:42.000000000 +0300 +@@ -23,9 +23,15 @@ + #define EXT_INCLUDE + #include + #include ++#include + #endif + #endif + #include ++#include ++ ++struct ext3_buddy_group_blocks; ++struct ext3_mb_history; ++#define EXT3_BB_MAX_BLOCKS --/* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks(handle_t *handle, struct inode *inode, -- unsigned long block, unsigned long count) --{ -- struct super_block * sb; -- int dquot_freed_blocks; -- -- sb = inode->i_sb; -- if (!sb) { -- printk ("ext3_free_blocks: nonexistent device"); -- return; -- } -- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); -- if (dquot_freed_blocks) -- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); -- return; --} -- /* - * For ext3 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This -@@ -1140,7 +1122,7 @@ - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. 
- */ --int ext3_new_block(handle_t *handle, struct inode *inode, -+int ext3_new_block_old(handle_t *handle, struct inode *inode, - unsigned long goal, int *errp) + * third extended-fs super-block data in memory +@@ -81,6 +87,38 @@ struct ext3_sb_info { + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_group_info **s_group_info; ++ struct inode *s_buddy_cache; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ unsigned short *s_mb_offsets, *s_mb_maxs; ++ ++ /* history to debug policy */ ++ struct ext3_mb_history *s_mb_history; ++ int s_mb_history_cur; ++ int s_mb_history_max; ++ struct proc_dir_entry *s_mb_proc; ++ spinlock_t s_mb_history_lock; ++ ++ /* stats for buddy allocator */ ++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ++ atomic_t s_bal_success; /* we found long enough chunks */ ++ atomic_t s_bal_allocated; /* in blocks */ ++ atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_goals; /* goal hits */ ++ atomic_t s_bal_breaks; /* too long searches */ ++ atomic_t s_bal_2orders; /* 2^order hits */ ++ spinlock_t s_bal_lock; ++ unsigned long s_mb_buddies_generated; ++ unsigned long long s_mb_generation_time; + }; + + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-2.6.9-full/fs/ext3/super.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/super.c 2005-12-16 23:16:41.000000000 +0300 ++++ linux-2.6.9-full/fs/ext3/super.c 2005-12-16 23:16:42.000000000 +0300 +@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + 
ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -596,7 +597,7 @@ enum { + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, +- Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_extdebug, Opt_mballoc, + }; + + static match_table_t tokens = { +@@ -647,6 +649,7 @@ static match_table_t tokens = { + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -957,6 +960,9 @@ clear_qf_name: + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt (sbi->s_mount_opt, MBALLOC); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1646,6 +1652,7 @@ static int ext3_fill_super (struct super + ext3_count_dirs(sb)); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + + return 0; + +@@ -2428,7 +2435,13 @@ static struct file_system_type ext3_fs_t + + static int __init init_ext3_fs(void) { - struct buffer_head *bitmap_bh = NULL; -Index: linux-2.6.9/fs/ext3/extents.c +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2450,6 +2463,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); + } + + int ext3_prep_san_write(struct inode *inode, long *blocks, +Index: linux-2.6.9-full/fs/ext3/extents.c =================================================================== ---- linux-2.6.9.orig/fs/ext3/extents.c 2005-10-14 09:10:12.000000000 +0400 -+++ linux-2.6.9/fs/ext3/extents.c 2005-10-14 09:10:13.000000000 +0400 -@@ -771,7 +771,7 @@ +--- linux-2.6.9-full.orig/fs/ext3/extents.c 2005-12-16 23:16:41.000000000 +0300 ++++ 
linux-2.6.9-full/fs/ext3/extents.c 2005-12-16 23:16:42.000000000 +0300 +@@ -771,7 +771,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -202,7 +194,7 @@ Index: linux-2.6.9/fs/ext3/extents.c } } kfree(ablocks); -@@ -1428,7 +1428,7 @@ +@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); @@ -211,7 +203,7 @@ Index: linux-2.6.9/fs/ext3/extents.c return err; } -@@ -1913,10 +1913,12 @@ +@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; @@ -220,12 +212,12 @@ Index: linux-2.6.9/fs/ext3/extents.c if (IS_ERR(handle)) return PTR_ERR(handle); -+ if (S_ISDIR(tree->inode->i_mode)) ++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) + metadata = 1; if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail removal */ unsigned long num, start; -@@ -1928,7 +1930,7 @@ +@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t bh = sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } @@ -234,24 +226,98 @@ Index: linux-2.6.9/fs/ext3/extents.c } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.9/fs/ext3/namei.c +Index: linux-2.6.9-full/fs/ext3/inode.c =================================================================== ---- linux-2.6.9.orig/fs/ext3/namei.c 2005-10-14 09:10:04.000000000 +0400 -+++ linux-2.6.9/fs/ext3/namei.c 2005-10-14 09:10:13.000000000 +0400 -@@ -1639,7 +1639,7 @@ - * If the create succeeds, we fill in the inode information - * with d_instantiate(). 
+--- linux-2.6.9-full.orig/fs/ext3/inode.c 2005-12-16 23:16:41.000000000 +0300 ++++ linux-2.6.9-full/fs/ext3/inode.c 2005-12-16 23:16:42.000000000 +0300 +@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -673,7 +673,7 @@ err_out: + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, +- le32_to_cpu(where[i].key), 1); ++ le32_to_cpu(where[i].key), 1, 1); + return err; + } + +@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-2.6.9-full/fs/ext3/balloc.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/balloc.c 2005-10-27 21:44:24.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/balloc.c 2005-12-16 23:16:42.000000000 +0300 +@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ + * + * Return buffer_head on success or NULL in case of failure. 
*/ --static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, -+int ext3_create (struct inode * dir, struct dentry * dentry, int mode, - struct nameidata *nd) +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) { - handle_t *handle; -Index: linux-2.6.9/fs/ext3/xattr.c + struct ext3_group_desc * desc; +@@ -450,24 +450,6 @@ error_return: + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- unsigned long block, unsigned long count) +-{ +- struct super_block * sb; +- int dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1140,7 +1122,7 @@ int ext3_should_retry_alloc(struct super + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. 
+ */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-2.6.9-full/fs/ext3/xattr.c =================================================================== ---- linux-2.6.9.orig/fs/ext3/xattr.c 2005-10-14 09:10:08.000000000 +0400 -+++ linux-2.6.9/fs/ext3/xattr.c 2005-10-14 09:10:13.000000000 +0400 -@@ -1281,7 +1281,7 @@ +--- linux-2.6.9-full.orig/fs/ext3/xattr.c 2005-12-16 23:16:40.000000000 +0300 ++++ linux-2.6.9-full/fs/ext3/xattr.c 2005-12-16 23:16:42.000000000 +0300 +@@ -1281,7 +1281,7 @@ ext3_xattr_set_handle2(handle_t *handle, new_bh = sb_getblk(sb, block); if (!new_bh) { getblk_failed: @@ -260,7 +326,7 @@ Index: linux-2.6.9/fs/ext3/xattr.c error = -EIO; goto cleanup; } -@@ -1328,7 +1328,7 @@ +@@ -1328,7 +1328,7 @@ getblk_failed: if (ce) mb_cache_entry_free(ce); ea_bdebug(old_bh, "freeing"); @@ -269,7 +335,7 @@ Index: linux-2.6.9/fs/ext3/xattr.c /* ext3_forget() calls bforget() for us, but we let our caller release old_bh, so we need to -@@ -1427,7 +1427,7 @@ +@@ -1427,7 +1427,7 @@ ext3_xattr_delete_inode(handle_t *handle if (HDR(bh)->h_refcount == cpu_to_le32(1)) { if (ce) mb_cache_entry_free(ce); @@ -278,27 +344,13 @@ Index: linux-2.6.9/fs/ext3/xattr.c get_bh(bh); ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); } else { -Index: linux-2.6.9/fs/ext3/Makefile +Index: linux-2.6.9-full/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.9.orig/fs/ext3/Makefile 2005-10-14 09:10:12.000000000 +0400 -+++ linux-2.6.9/fs/ext3/Makefile 2005-10-14 09:10:13.000000000 +0400 -@@ -5,7 +5,8 @@ - obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\ -- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o -+ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ -+ mballoc.o - - 
ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-2.6.9/fs/ext3/mballoc.c -=================================================================== ---- linux-2.6.9.orig/fs/ext3/mballoc.c 2005-10-13 19:40:57.851699336 +0400 -+++ linux-2.6.9/fs/ext3/mballoc.c 2005-10-14 09:10:31.000000000 +0400 -@@ -0,0 +1,1865 @@ +--- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2005-12-16 17:46:19.148560250 +0300 ++++ linux-2.6.9-full/fs/ext3/mballoc.c 2005-12-17 00:10:15.000000000 +0300 +@@ -0,0 +1,2429 @@ +/* -+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify @@ -330,12 +382,15 @@ Index: linux-2.6.9/fs/ext3/mballoc.c +#include +#include +#include ++#include ++#include ++#include ++#include + +/* + * TODO: -+ * - bitmap/buddy read-ahead (proposed by Oleg Drokin aka green) ++ * - bitmap read-ahead (proposed by Oleg Drokin aka green) + * - track min/max extents in each group for better group selection -+ * - is it worthwhile to use buddies directly if req is 2^N blocks? + * - mb_mark_used() may allocate chunk right after splitting buddy + * - special flag to advice allocator to look for requested + N blocks + * this may improve interaction between extents and mballoc @@ -345,17 +400,10 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + */ + +/* -+ * with 'ext3_mb_aggressive' set the allocator runs consistency checks over ++ * with AGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ -+long ext3_mb_aggressive = 0; -+ -+ -+/* -+ * with 'ext3_mb_stats' allocator will collect stats that will be -+ * shown at umount. The collecting costs though! 
-+ */ -+long ext3_mb_stats = 1; ++#define AGGRESSIVE_CHECK__ + +/* + */ @@ -367,33 +415,56 @@ Index: linux-2.6.9/fs/ext3/mballoc.c +#endif + +/* -+ * where to save buddies structures beetween umount/mount (clean case only) ++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory ++ * and you can monitor it in /proc/fs/ext3//mb_history + */ -+#define EXT3_BUDDY_FILE ".buddy" ++#define EXT3_MB_HISTORY + +/* + * How long mballoc can look for a best extent (in found extents) + */ -+long ext3_mb_max_to_scan = 100; ++long ext3_mb_max_to_scan = 500; + +/* -+ * This structure is on-disk description of a group for mballoc ++ * How long mballoc must look for a best extent + */ -+struct ext3_mb_group_descr { -+ __u16 mgd_first_free; /* first free block in the group */ -+ __u16 mgd_free; /* number of free blocks in the group */ -+ __u16 mgd_counters[16]; /* number of free blocks by order */ -+}; ++long ext3_mb_min_to_scan = 30; + +/* -+ * This structure is header of mballoc's file ++ * with 'ext3_mb_stats' allocator will collect stats that will be ++ * shown at umount. The collecting costs though! 
+ */ -+struct ext3_mb_grp_header { -+ __u32 mh_magic; ++ ++long ext3_mb_stats = 1; ++ ++#ifdef EXT3_BB_MAX_BLOCKS ++#undef EXT3_BB_MAX_BLOCKS ++#endif ++#define EXT3_BB_MAX_BLOCKS 30 ++ ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; +}; + -+#define EXT3_MB_MAGIC_V1 0xbabd16fd ++struct ext3_group_info { ++ unsigned long bb_state; ++ unsigned long bb_tid; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned short bb_fragments; ++ unsigned short bb_counters[]; ++}; ++ ++ ++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 ++#define EXT3_GROUP_INFO_LOCKED_BIT 1 + ++#define EXT3_MB_GRP_NEED_INIT(grp) \ ++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) + +struct ext3_free_extent { + __u16 fe_start; @@ -414,28 +485,55 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; + __u16 ac_found; ++ __u16 ac_tail; ++ __u16 ac_buddy; + __u8 ac_status; + __u8 ac_flags; /* allocation hints */ ++ __u8 ac_criteria; + __u8 ac_repeats; ++ __u8 ac_2order; /* if request is to allocate 2^N blocks and ++ * N > 0, the field stores N, otherwise 0 */ +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 +#define AC_STATUS_BREAK 3 + ++struct ext3_mb_history { ++ struct ext3_free_extent goal; /* goal allocation */ ++ struct ext3_free_extent result; /* result allocation */ ++ __u16 found; /* how many extents have been found */ ++ __u16 groups; /* how many groups have been scanned */ ++ __u16 tail; /* what tail broke some buddy */ ++ __u16 buddy; /* buddy the tail ^^^ broke */ ++ __u8 cr; /* which phase the result extent was found at */ ++ __u8 merged; ++}; ++ +struct ext3_buddy { -+ struct buffer_head *bd_bh; -+ struct buffer_head *bd_bh2; -+ struct ext3_buddy_group_blocks *bd_bd; ++ struct page *bd_buddy_page; ++ void *bd_buddy; ++ struct page *bd_bitmap_page; ++ void *bd_bitmap; 
++ struct ext3_group_info *bd_info; + struct super_block *bd_sb; + __u16 bd_blkbits; + __u16 bd_group; +}; -+#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data) -+#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data) ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) ++ ++#ifndef EXT3_MB_HISTORY ++#define ext3_mb_store_history(sb,ac) ++#else ++static void ext3_mb_store_history(struct super_block *, ++ struct ext3_allocation_context *ac); ++#endif + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + ++static struct proc_dir_entry *proc_root_ext3; ++ +int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); +struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); +int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); @@ -490,9 +588,25 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + ext2_clear_bit_atomic(NULL, bit, addr); +} + ++static inline int mb_find_next_zero_bit(void *addr, int max, int start) ++{ ++ int fix; ++#if BITS_PER_LONG == 64 ++ fix = ((unsigned long) addr & 7UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~7UL); ++#elif BITS_PER_LONG == 32 ++ fix = ((unsigned long) addr & 3UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~3UL); ++#else ++#error "how many bits you are?!" 
++#endif ++ max += fix; ++ start += fix; ++ return ext2_find_next_zero_bit(addr, max, start) - fix; ++} ++ +static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) +{ -+ int i = 1; + char *bb; + + J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); @@ -508,89 +622,30 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + if (order == 0) + return EXT3_MB_BITMAP(e3b); + -+ bb = EXT3_MB_BUDDY(e3b); -+ *max = *max >> 1; -+ while (i < order) { -+ bb += 1 << (e3b->bd_blkbits - i); -+ i++; -+ *max = *max >> 1; -+ } -+ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) < -+ e3b->bd_sb->s_blocksize); -+ return bb; -+} -+ -+static int ext3_mb_load_buddy(struct super_block *sb, int group, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap); -+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy); -+ -+ /* load bitmap */ -+ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap); -+ if (e3b->bd_bh == NULL) { -+ ext3_error(sb, "ext3_mb_load_buddy", -+ "can't get block for buddy bitmap\n"); -+ goto out; -+ } -+ /* load buddy */ -+ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy); -+ if (e3b->bd_bh2 == NULL) { -+ ext3_error(sb, "ext3_mb_load_buddy", -+ "can't get block for buddy bitmap\n"); -+ goto out; -+ } -+ -+ if (!buffer_uptodate(e3b->bd_bh)) -+ ll_rw_block(READ, 1, &e3b->bd_bh); -+ if (!buffer_uptodate(e3b->bd_bh2)) -+ ll_rw_block(READ, 1, &e3b->bd_bh2); -+ -+ wait_on_buffer(e3b->bd_bh); -+ J_ASSERT(buffer_uptodate(e3b->bd_bh)); -+ wait_on_buffer(e3b->bd_bh2); -+ J_ASSERT(buffer_uptodate(e3b->bd_bh2)); -+ -+ e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_bd = sbi->s_buddy_blocks[group]; -+ e3b->bd_sb = sb; -+ e3b->bd_group = group; -+ -+ return 0; -+out: -+ brelse(e3b->bd_bh); -+ brelse(e3b->bd_bh2); -+ e3b->bd_bh = NULL; -+ e3b->bd_bh2 = NULL; -+ return -EIO; -+} ++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; ++ *max = 
EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; + -+static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b) -+{ -+ mark_buffer_dirty(e3b->bd_bh); -+ mark_buffer_dirty(e3b->bd_bh2); ++ return bb; +} + -+static void ext3_mb_release_desc(struct ext3_buddy *e3b) -+{ -+ brelse(e3b->bd_bh); -+ brelse(e3b->bd_bh2); -+} ++#ifdef AGGRESSIVE_CHECK + +static void mb_check_buddy(struct ext3_buddy *e3b) +{ + int order = e3b->bd_blkbits + 1; + int max, max2, i, j, k, count; ++ int fragments = 0, fstart; + void *buddy, *buddy2; + -+ if (likely(!ext3_mb_aggressive)) -+ return; -+ + if (!test_opt(e3b->bd_sb, MBALLOC)) + return; + ++ { ++ static int mb_check_counter = 0; ++ if (mb_check_counter++ % 300 != 0) ++ return; ++ } ++ + while (order > 1) { + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); @@ -621,14 +676,22 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + } + count++; + } -+ J_ASSERT(e3b->bd_bd->bb_counters[order] == count); ++ J_ASSERT(e3b->bd_info->bb_counters[order] == count); + order--; + } + ++ fstart = -1; + buddy = mb_find_buddy(e3b, 0, &max); + for (i = 0; i < max; i++) { -+ if (!mb_test_bit(i, buddy)) ++ if (!mb_test_bit(i, buddy)) { ++ J_ASSERT(i >= e3b->bd_info->bb_first_free); ++ if (fstart == -1) { ++ fragments++; ++ fstart = i; ++ } + continue; ++ } ++ fstart = -1; + /* check used bits only */ + for (j = 0; j < e3b->bd_blkbits + 1; j++) { + buddy2 = mb_find_buddy(e3b, j, &max2); @@ -637,18 +700,325 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + J_ASSERT(mb_test_bit(k, buddy2)); + } + } ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); ++ J_ASSERT(e3b->bd_info->bb_fragments == fragments); ++} ++ ++#else ++#define mb_check_buddy(e3b) ++#endif ++ ++/* find most significant bit */ ++static int inline fmsb(unsigned short word) ++{ ++ int order; ++ ++ if (word > 255) { ++ order = 7; ++ word >>= 8; ++ } else { ++ order = -1; ++ } ++ ++ do { ++ order++; ++ word >>= 1; ++ } while (word != 0); ++ ++ return order; ++} ++ ++static void inline ++ext3_mb_mark_free_simple(struct 
super_block *sb, void *buddy, unsigned first, ++ int len, struct ext3_group_info *grp) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned short min, max, chunk, border; ++ ++ mb_debug("mark %u/%u free\n", first, len); ++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ border = 2 << sb->s_blocksize_bits; ++ ++ while (len > 0) { ++ /* find how many blocks can be covered since this position */ ++ max = ffs(first | border) - 1; ++ ++ /* find how many blocks of power 2 we need to mark */ ++ min = fmsb(len); ++ ++ mb_debug(" %u/%u -> max %u, min %u\n", ++ first & ((2 << sb->s_blocksize_bits) - 1), ++ len, max, min); ++ ++ if (max < min) ++ min = max; ++ chunk = 1 << min; ++ ++ /* mark multiblock chunks only */ ++ grp->bb_counters[min]++; ++ if (min > 0) { ++ mb_debug(" set %u at %u \n", first >> min, ++ sbi->s_mb_offsets[min]); ++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); ++ } ++ ++ len -= chunk; ++ first += chunk; ++ } ++} ++ ++static void ++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ++ struct ext3_group_info *grp) ++{ ++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); ++ unsigned short i = 0, first, len; ++ unsigned free = 0, fragments = 0; ++ unsigned long long period = get_cycles(); ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ grp->bb_first_free = i; ++ while (i < max) { ++ fragments++; ++ first = i; ++ i = find_next_bit(bitmap, max, i); ++ len = i - first; ++ free += len; ++ if (len > 1) ++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); ++ else ++ grp->bb_counters[0]++; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ grp->bb_fragments = fragments; ++ ++ /* bb_state shouldn't being modified because all ++ * others waits for init completion on page lock */ ++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); ++ if (free != grp->bb_free) { ++ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n", ++ free, grp->bb_free); ++ grp->bb_free = free; ++ } ++ ++ 
period = get_cycles() - period; ++ spin_lock(&EXT3_SB(sb)->s_bal_lock); ++ EXT3_SB(sb)->s_mb_buddies_generated++; ++ EXT3_SB(sb)->s_mb_generation_time += period; ++ spin_unlock(&EXT3_SB(sb)->s_bal_lock); ++} ++ ++static int ext3_mb_init_cache(struct page *page) ++{ ++ int blocksize, blocks_per_page, groups_per_page; ++ int err = 0, i, first_group, first_block; ++ struct super_block *sb; ++ struct buffer_head *bhs; ++ struct buffer_head **bh; ++ struct inode *inode; ++ char *data, *bitmap; ++ ++ mb_debug("init page %lu\n", page->index); ++ ++ inode = page->mapping->host; ++ sb = inode->i_sb; ++ blocksize = 1 << inode->i_blkbits; ++ blocks_per_page = PAGE_CACHE_SIZE / blocksize; ++ ++ groups_per_page = blocks_per_page >> 1; ++ if (groups_per_page == 0) ++ groups_per_page = 1; ++ ++ /* allocate buffer_heads to read bitmaps */ ++ if (groups_per_page > 1) { ++ err = -ENOMEM; ++ i = sizeof(struct buffer_head *) * groups_per_page; ++ bh = kmalloc(i, GFP_NOFS); ++ if (bh == NULL) ++ goto out; ++ memset(bh, 0, i); ++ } else ++ bh = &bhs; ++ ++ first_group = page->index * blocks_per_page / 2; ++ ++ /* read all groups the page covers into the cache */ ++ for (i = 0; i < groups_per_page; i++) { ++ struct ext3_group_desc * desc; ++ ++ if (first_group + i >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ err = -EIO; ++ desc = ext3_get_group_desc(sb, first_group + i, NULL); ++ if (desc == NULL) ++ goto out; ++ ++ err = -ENOMEM; ++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); ++ if (bh[i] == NULL) ++ goto out; ++ ++ if (buffer_uptodate(bh[i])) ++ continue; ++ ++ lock_buffer(bh[i]); ++ if (buffer_uptodate(bh[i])) { ++ unlock_buffer(bh[i]); ++ continue; ++ } ++ ++ get_bh(bh[i]); ++ bh[i]->b_end_io = end_buffer_read_sync; ++ submit_bh(READ, bh[i]); ++ mb_debug("read bitmap for group %u\n", first_group + i); ++ } ++ ++ /* wait for I/O completion */ ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ wait_on_buffer(bh[i]); ++ ++ /* XXX: I/O error handling here */ ++ ++ 
first_block = page->index * blocks_per_page; ++ for (i = 0; i < blocks_per_page; i++) { ++ int group; ++ ++ group = (first_block + i) >> 1; ++ if (group >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ data = page_address(page) + (i * blocksize); ++ bitmap = bh[group - first_group]->b_data; ++ ++ if ((first_block + i) & 1) { ++ /* this is block of buddy */ ++ mb_debug("put buddy for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memset(data, 0xff, blocksize); ++ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0; ++ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0, ++ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); ++ ext3_mb_generate_buddy(sb, data, bitmap, ++ EXT3_SB(sb)->s_group_info[group]); ++ } else { ++ /* this is block of bitmap */ ++ mb_debug("put bitmap for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memcpy(data, bitmap, blocksize); ++ } ++ } ++ SetPageUptodate(page); ++ ++out: ++ if (bh) { ++ for (i = 0; bh && i < groups_per_page && bh[i]; i++) ++ brelse(bh[i]); ++ if (bh != &bhs) ++ kfree(bh); ++ } ++ return err; ++} ++ ++static int ext3_mb_load_buddy(struct super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *inode = sbi->s_buddy_cache; ++ int blocks_per_page, block, pnum, poff; ++ struct page *page; ++ ++ mb_debug("load group %u\n", group); ++ ++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_info = sbi->s_group_info[group]; ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ e3b->bd_buddy_page = NULL; ++ e3b->bd_bitmap_page = NULL; ++ ++ block = group * 2; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ if (!PageUptodate(page)) ++ 
ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_bitmap_page = page; ++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ block++; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_buddy_page = page; ++ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ return 0; ++ ++err: ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++ e3b->bd_buddy = NULL; ++ e3b->bd_bitmap = NULL; ++ return -EIO; ++} ++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); +} + ++ +static inline void +ext3_lock_group(struct super_block *sb, int group) +{ -+ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_SB(sb)->s_group_info[group]->bb_state); +} + +static inline void +ext3_unlock_group(struct super_block *sb, int group) +{ -+ spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_SB(sb)->s_group_info[group]->bb_state); +} + +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) @@ -710,22 +1080,33 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + +static int mb_free_blocks(struct 
ext3_buddy *e3b, int first, int count) +{ -+ int block, max, order; ++ int block = 0, max = 0, order; + void *buddy, *buddy2; + + mb_check_buddy(e3b); + -+ e3b->bd_bd->bb_free += count; -+ if (first < e3b->bd_bd->bb_first_free) -+ e3b->bd_bd->bb_first_free = first; -+ ++ e3b->bd_info->bb_free += count; ++ if (first < e3b->bd_info->bb_first_free) ++ e3b->bd_info->bb_first_free = first; ++ ++ /* let's maintain fragments counter */ ++ if (first != 0) ++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); ++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); ++ if (block && max) ++ e3b->bd_info->bb_fragments--; ++ else if (!block && !max) ++ e3b->bd_info->bb_fragments++; ++ ++ /* let's maintain buddy itself */ + while (count-- > 0) { + block = first++; + order = 0; + + J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); + mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); -+ e3b->bd_bd->bb_counters[order]++; ++ e3b->bd_info->bb_counters[order]++; + + /* start of the buddy */ + buddy = mb_find_buddy(e3b, order, &max); @@ -748,12 +1129,12 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + mb_set_bit(block, buddy); + mb_set_bit(block + 1, buddy); + } -+ e3b->bd_bd->bb_counters[order]--; -+ e3b->bd_bd->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; + + block = block >> 1; + order++; -+ e3b->bd_bd->bb_counters[order]++; ++ e3b->bd_info->bb_counters[order]++; + + mb_clear_bit(block, buddy2); + buddy = buddy2; @@ -765,7 +1146,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c +} + +static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, -+ int needed, struct ext3_free_extent *ex) ++ int needed, struct ext3_free_extent *ex) +{ + int next, max, ord; + void *buddy; @@ -782,7 +1163,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + return 0; + } + -+ if (order == 0) { ++ if (likely(order == 0)) { + /* find actual order */ + order = mb_find_order_for_block(e3b, block); + block = 
block >> order; @@ -792,7 +1173,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + ex->fe_start = block << order; + ex->fe_group = e3b->bd_group; + -+ while ((buddy = mb_find_buddy(e3b, order, &max))) { ++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { + + if (block + 1 >= max) + break; @@ -814,16 +1195,30 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + +static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) +{ ++ int ord, mlen = 0, max = 0, cur; + int start = ex->fe_start; + int len = ex->fe_len; -+ int ord, mlen, max, cur; ++ unsigned ret = 0; + int len0 = len; + void *buddy; + -+ e3b->bd_bd->bb_free -= len; -+ if (e3b->bd_bd->bb_first_free == start) -+ e3b->bd_bd->bb_first_free += len; ++ mb_check_buddy(e3b); + ++ e3b->bd_info->bb_free -= len; ++ if (e3b->bd_info->bb_first_free == start) ++ e3b->bd_info->bb_first_free += len; ++ ++ /* let's maintain fragments counter */ ++ if (start != 0) ++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); ++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); ++ if (mlen && max) ++ e3b->bd_info->bb_fragments++; ++ else if (!mlen && !max) ++ e3b->bd_info->bb_fragments--; ++ ++ /* let's maintain buddy itself */ + while (len) { + ord = mb_find_order_for_block(e3b, start); + @@ -833,26 +1228,30 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + buddy = mb_find_buddy(e3b, ord, &max); + J_ASSERT((start >> ord) < max); + mb_set_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; ++ e3b->bd_info->bb_counters[ord]--; + start += mlen; + len -= mlen; + J_ASSERT(len >= 0); + continue; + } + ++ /* store for history */ ++ if (ret == 0) ++ ret = len | (ord << 16); ++ + /* we have to split large buddy */ + J_ASSERT(ord > 0); + buddy = mb_find_buddy(e3b, ord, &max); + mb_set_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; ++ e3b->bd_info->bb_counters[ord]--; + + ord--; + cur = (start >> ord) & ~1U; + buddy = mb_find_buddy(e3b, ord, 
&max); + mb_clear_bit(cur, buddy); + mb_clear_bit(cur + 1, buddy); -+ e3b->bd_bd->bb_counters[ord]++; -+ e3b->bd_bd->bb_counters[ord]++; ++ e3b->bd_info->bb_counters[ord]++; ++ e3b->bd_info->bb_counters[ord]++; + } + + /* now drop all the bits in bitmap */ @@ -860,7 +1259,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + + mb_check_buddy(e3b); + -+ return 0; ++ return ret; +} + +/* @@ -869,9 +1268,14 @@ Index: linux-2.6.9/fs/ext3/mballoc.c +static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, + struct ext3_buddy *e3b) +{ ++ unsigned long ret; ++ + ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); -+ mb_mark_used(e3b, &ac->ac_b_ex); ++ ret = mb_mark_used(e3b, &ac->ac_b_ex); ++ + ac->ac_status = AC_STATUS_FOUND; ++ ac->ac_tail = ret & 0xffff; ++ ac->ac_buddy = ret >> 16; +} + +/* @@ -888,9 +1292,8 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + struct ext3_free_extent *ex, + struct ext3_buddy *e3b) +{ -+ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor; + struct ext3_free_extent *bex = &ac->ac_b_ex; -+ int diff = ac->ac_g_ex.fe_len - ex->fe_len; ++ struct ext3_free_extent *gex = &ac->ac_g_ex; + + J_ASSERT(ex->fe_len > 0); + J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); @@ -901,7 +1304,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + /* + * The special case - take what you catch first + */ -+ if (ac->ac_flags & EXT3_MB_HINT_FIRST) { ++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { + *bex = *ex; + ext3_mb_use_best_found(ac, e3b); + return; @@ -910,26 +1313,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + /* + * Let's check whether the chuck is good enough + */ -+ if (ex->fe_len >= ac->ac_g_ex.fe_len) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * If the request is vey large, then it makes sense to use large -+ * chunks for it. Even if they don't satisfy whole request. 
-+ */ -+ if (ex->fe_len > 1000) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * Sometimes it's worty to take close chunk -+ */ -+ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) { ++ if (ex->fe_len == gex->fe_len) { + *bex = *ex; + ext3_mb_use_best_found(ac, e3b); + return; @@ -945,13 +1329,26 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + + /* + * If new found extent is better, store it in the context -+ * FIXME: possible the policy should be more complex? + */ -+ if (ex->fe_len > bex->fe_len) { ++ if (bex->fe_len < gex->fe_len) { ++ /* if the request isn't satisfied, any found extent ++ * larger than previous best one is better */ ++ if (ex->fe_len > bex->fe_len) ++ *bex = *ex; ++ } else if (ex->fe_len > gex->fe_len) { ++ /* if the request is satisfied, then we try to find ++ * an extent that still satisfy the request, but is ++ * smaller than previous one */ + *bex = *ex; + } + + /* ++ * Let's scan at least few extents and don't pick up a first one ++ */ ++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++ ++ /* + * We don't want to scan for a whole year + */ + if (ac->ac_found > ext3_mb_max_to_scan) @@ -972,13 +1369,13 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); + -+ if (max > 0) ++ if (max > 0) { ++ ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); ++ } + + ext3_unlock_group(ac->ac_sb, group); + -+ if (ac->ac_status == AC_STATUS_FOUND) -+ ext3_mb_dirty_buddy(e3b); + ext3_mb_release_desc(e3b); + + return 0; @@ -1002,37 +1399,79 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; + ac->ac_b_ex = ex; + ext3_mb_use_best_found(ac, e3b); + } + ext3_unlock_group(ac->ac_sb, group); + -+ if (ac->ac_status == AC_STATUS_FOUND) -+ 
ext3_mb_dirty_buddy(e3b); + ext3_mb_release_desc(e3b); + + return 0; +} ++ ++/* ++ * The routine scans buddy structures (not bitmap!) from given order ++ * to max order and tries to find big enough chunk to satisfy the req ++ */ ++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_group_info *grp = e3b->bd_info; ++ void *buddy; ++ int i, k, max; ++ ++ J_ASSERT(ac->ac_2order > 0); ++ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) { ++ if (grp->bb_counters[i] == 0) ++ continue; ++ ++ buddy = mb_find_buddy(e3b, i, &max); ++ if (buddy == NULL) { ++ printk(KERN_ALERT "looking for wrong order?\n"); ++ break; ++ } ++ ++ k = mb_find_next_zero_bit(buddy, max, 0); ++ J_ASSERT(k < max); ++ ++ ac->ac_found++; ++ ++ ac->ac_b_ex.fe_len = 1 << i; ++ ac->ac_b_ex.fe_start = k << i; ++ ac->ac_b_ex.fe_group = e3b->bd_group; ++ ++ ext3_mb_use_best_found(ac, e3b); ++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); ++ ++ if (unlikely(ext3_mb_stats)) ++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); ++ ++ break; ++ } ++} ++ +/* + * The routine scans the group and measures all found extents. + * In order to optimize scanning, caller must pass number of -+ * free blocks in the group, so the routine can upper limit. ++ * free blocks in the group, so the routine can know upper limit. 
+ */ -+static void ext3_mb_scan_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) ++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) +{ + struct super_block *sb = ac->ac_sb; + void *bitmap = EXT3_MB_BITMAP(e3b); + struct ext3_free_extent ex; + int i, free; + -+ free = e3b->bd_bd->bb_free; ++ free = e3b->bd_info->bb_free; + J_ASSERT(free > 0); + -+ i = e3b->bd_bd->bb_first_free; ++ i = e3b->bd_info->bb_first_free; + -+ while (free && ac->ac_status != AC_STATUS_FOUND) { -+ i = ext2_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ while (free && ac->ac_status == AC_STATUS_CONTINUE) { ++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); + if (i >= sb->s_blocksize * 8) { + J_ASSERT(free == 0); + break; @@ -1052,23 +1491,39 @@ Index: linux-2.6.9/fs/ext3/mballoc.c +static int ext3_mb_good_group(struct ext3_allocation_context *ac, + int group, int cr) +{ -+ int free; ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_group_info *grp = sbi->s_group_info[group]; ++ unsigned free, fragments, i, bits; + -+ J_ASSERT(cr >= 0 && cr < 3); ++ J_ASSERT(cr >= 0 && cr < 4); ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); + -+ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free; ++ free = grp->bb_free; ++ fragments = grp->bb_fragments; + if (free == 0) + return 0; ++ if (fragments == 0) ++ return 0; + -+ if (cr == 0) { -+ if (free >= ac->ac_g_ex.fe_len >> 1) ++ switch (cr) { ++ case 0: ++ J_ASSERT(ac->ac_2order != 0); ++ bits = ac->ac_sb->s_blocksize_bits + 1; ++ for (i = ac->ac_2order; i < bits; i++) ++ if (grp->bb_counters[i] > 0) ++ return 1; ++ case 1: ++ if ((free / fragments) >= ac->ac_g_ex.fe_len) ++ return 1; ++ case 2: ++ if (free >= ac->ac_g_ex.fe_len) ++ return 1; ++ case 3: + return 1; -+ } else if (cr == 1) { -+ if (free >= ac->ac_g_ex.fe_len >> 2) -+ return 1; -+ } else if (cr == 2) { -+ return 1; ++ default: ++ BUG(); + } ++ + return 0; +} + @@ -1160,11 +1615,19 @@ Index: 
linux-2.6.9/fs/ext3/mballoc.c + ac.ac_g_ex.fe_start = block; + ac.ac_g_ex.fe_len = *len; + ac.ac_flags = flags; ++ ac.ac_2order = 0; ++ ac.ac_criteria = 0; + -+ /* -+ * Sometimes, caller may want to merge even small number -+ * of blocks to an existing extent -+ */ ++ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ ++ i = ffs(*len); ++ if (i >= 8) { ++ i--; ++ if ((*len & (~(1 << i))) == 0) ++ ac.ac_2order = i; ++ } ++ ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ + if (ac.ac_flags & EXT3_MB_HINT_MERGE) { + err = ext3_mb_find_by_goal(&ac, &e3b); + if (err) @@ -1173,23 +1636,24 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + goto found; + } + -+ /* -+ * FIXME -+ * If requested chunk is power of 2 length, we can try -+ * to exploit buddy nature to speed allocation up -+ */ -+ -+ -+ /* -+ * Let's just scan groups to find more-less suitable blocks -+ */ -+ cr = 0; ++ /* Let's just scan groups to find more-less suitable blocks */ ++ cr = ac.ac_2order ? 
0 : 1; +repeat: -+ for (; cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ ac.ac_criteria = cr; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + ++ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) { ++ /* we need full data about the group ++ * to make a good selection */ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ext3_mb_release_desc(&e3b); ++ } ++ + /* check is group good for our criteries */ + if (!ext3_mb_good_group(&ac, group, cr)) + continue; @@ -1206,29 +1670,32 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + continue; + } + -+ ext3_mb_scan_group(&ac, &e3b); ++ ac.ac_groups_scanned++; ++ if (cr == 0) ++ ext3_mb_simple_scan_group(&ac, &e3b); ++ else ++ ext3_mb_complex_scan_group(&ac, &e3b); ++ + ext3_unlock_group(sb, group); + -+ if (ac.ac_status == AC_STATUS_FOUND) -+ ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + -+ if (err) -+ goto out_err; + if (ac.ac_status != AC_STATUS_CONTINUE) + break; + } + } + -+ if (ac.ac_status == AC_STATUS_BREAK && ++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && + !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { + /* + * We've been searching too long. 
Let's try to allocate + * the best chunk we've found so far + */ -+ ext3_warning(inode->i_sb, __FUNCTION__, -+ "too long searching: got %d want %d\n", -+ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); ++ ++ /*if (ac.ac_found > ext3_mb_max_to_scan) ++ printk(KERN_ERR "EXT3-fs: too long searching at " ++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, ++ ac.ac_g_ex.fe_len);*/ + ext3_mb_try_best_found(&ac, &e3b); + if (ac.ac_status != AC_STATUS_FOUND) { + /* @@ -1242,7 +1709,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + ac.ac_b_ex.fe_len = 0; + ac.ac_status = AC_STATUS_CONTINUE; + ac.ac_flags |= EXT3_MB_HINT_FIRST; -+ cr = 2; ++ cr = 3; + goto repeat; + } + } @@ -1265,7 +1732,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + printk("EXT3-fs: groups: "); + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) + printk("%d: %d ", i, -+ sbi->s_buddy_blocks[i]->bb_free); ++ sbi->s_group_info[i]->bb_free); + printk("\n"); +#endif + goto out; @@ -1319,12 +1786,10 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + ext3_error(sb, "ext3_new_block", + "Allocating block in system zone - " + "block = %u", block); -+ if (unlikely(ext3_mb_aggressive)) { -+ for (i = 0; i < ac.ac_b_ex.fe_len; i++) -+ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, -+ bitmap_bh->b_data)); -+ } -+ ++#ifdef AGGRESSIVE_CHECK ++ for (i = 0; i < ac.ac_b_ex.fe_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); ++#endif + mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); + + spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); @@ -1374,369 +1839,359 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + * path only, here is single block always */ + ext3_mb_release_blocks(sb, 1); + } -+ -+ if ((ext3_mb_stats) && (ac.ac_g_ex.fe_len > 1)) { -+ spin_lock(&sbi->s_bal_lock); -+ sbi->s_bal_reqs++; -+ sbi->s_bal_allocated += *len; ++ ++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { ++ atomic_inc(&sbi->s_bal_reqs); ++ atomic_add(*len, &sbi->s_bal_allocated); + if (*len >= ac.ac_g_ex.fe_len) -+ 
sbi->s_bal_success++; -+ sbi->s_bal_ex_scanned += ac.ac_found; ++ atomic_inc(&sbi->s_bal_success); ++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); + if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && + ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) -+ sbi->s_bal_goals++; ++ atomic_inc(&sbi->s_bal_goals); + if (ac.ac_found > ext3_mb_max_to_scan) -+ sbi->s_bal_breaks++; -+ spin_unlock(&sbi->s_bal_lock); ++ atomic_inc(&sbi->s_bal_breaks); + } + ++ ext3_mb_store_history(sb, &ac); ++ + return block; +} ++EXPORT_SYMBOL(ext3_mb_new_blocks); ++ ++#ifdef EXT3_MB_HISTORY ++struct ext3_mb_proc_session { ++ struct ext3_mb_history *history; ++ struct super_block *sb; ++ int start; ++ int max; ++}; + -+int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh, -+ struct ext3_mb_group_descr **grp) ++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, ++ struct ext3_mb_history *hs, ++ int first) +{ -+ struct super_block *sb = e3b->bd_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int descr_per_block, err, offset; -+ struct ext3_mb_grp_header *hdr; -+ unsigned long block; -+ -+ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) -+ / sizeof(struct ext3_mb_group_descr); -+ block = e3b->bd_group / descr_per_block; -+ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err); -+ if (*bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n", -+ e3b->bd_group, err); -+ return err; -+ } ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (!first && hs == s->history + s->start) ++ return NULL; ++ while (hs->goal.fe_len == 0) { ++ hs++; ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (hs == s->history + s->start) ++ return NULL; ++ } ++ return hs; ++} + -+ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data; -+ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { -+ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n", -+ e3b->bd_group); -+ brelse(*bh); -+ *bh = NULL; -+ return -EIO; -+ } ++static void 
*ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs; ++ int l = *pos; ++ ++ if (l == 0) ++ return SEQ_START_TOKEN; ++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ if (!hs) ++ return NULL; ++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); ++ return hs; ++} + -+ offset = e3b->bd_group % descr_per_block -+ * sizeof(struct ext3_mb_group_descr) -+ + sizeof(struct ext3_mb_grp_header); -+ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset); ++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs = v; + -+ return 0; ++ ++*pos; ++ if (v == SEQ_START_TOKEN) ++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ else ++ return ext3_mb_history_skip_empty(s, ++hs, 0); +} + -+int ext3_mb_load_descr(struct ext3_buddy *e3b) ++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) +{ -+ struct ext3_mb_group_descr *grp; -+ struct ext3_group_desc *gdp; -+ struct buffer_head *bh; -+ int err, i; -+ -+ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); -+ if (err) -+ return err; -+ -+ e3b->bd_bd->bb_first_free = grp->mgd_first_free; -+ e3b->bd_bd->bb_free = grp->mgd_free; -+ for (i = 0; i <= e3b->bd_blkbits + 1; i++) { -+ J_ASSERT(i < 16); -+ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i]; -+ } -+ brelse(bh); ++ struct ext3_mb_history *hs = v; ++ char buf[20], buf2[20]; + -+ /* additional checks against old group descriptor */ -+ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); -+ if (!gdp) -+ return -EIO; -+ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { -+ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", -+ e3b->bd_group, e3b->bd_bd->bb_free, -+ le16_to_cpu(gdp->bg_free_blocks_count)); -+ return -ENODATA; ++ if (v == SEQ_START_TOKEN) { ++ seq_printf(seq, "%-17s 
%-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "goal", "result", "found", "grps", "cr", "merge", ++ "tail", "broken"); ++ return 0; + } + ++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, ++ hs->goal.fe_start, hs->goal.fe_len); ++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, ++ hs->result.fe_start, hs->result.fe_len); ++ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, ++ buf2, hs->found, hs->groups, hs->cr, ++ hs->merged ? "M" : "", hs->tail, ++ hs->buddy ? 1 << hs->buddy : 0); + return 0; +} + ++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_history_ops = { ++ .start = ext3_mb_seq_history_start, ++ .next = ext3_mb_seq_history_next, ++ .stop = ext3_mb_seq_history_stop, ++ .show = ext3_mb_seq_history_show, ++}; + -+int ext3_mb_update_descr(struct ext3_buddy *e3b) ++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) +{ -+ struct ext3_mb_group_descr *grp; -+ struct ext3_group_desc *gdp; -+ struct buffer_head *bh; -+ handle_t *handle; -+ int err, i; ++ struct super_block *sb = PDE(inode)->data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_proc_session *s; ++ int rc, size; + -+ /* additional checks against old group descriptor */ -+ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); -+ if (!gdp) ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (s == NULL) + return -EIO; -+ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { -+ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", -+ e3b->bd_group, e3b->bd_bd->bb_free, -+ le16_to_cpu(gdp->bg_free_blocks_count)); -+ return -ENODATA; -+ } -+ -+ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); -+ if (err) -+ return err; ++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; ++ s->history = kmalloc(size, GFP_KERNEL); ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(s->history, sbi->s_mb_history, size); ++ s->max = sbi->s_mb_history_max; ++ s->start = 
sbi->s_mb_history_cur % s->max; ++ spin_unlock(&sbi->s_mb_history_lock); + -+ handle = ext3_journal_start_sb(e3b->bd_sb, 1); -+ if (IS_ERR(handle)) { -+ err = PTR_ERR(handle); -+ handle = NULL; -+ goto out; -+ } -+ -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ goto out; -+ grp->mgd_first_free = e3b->bd_bd->bb_first_free; -+ grp->mgd_free = e3b->bd_bd->bb_free; -+ for (i = 0; i <= e3b->bd_blkbits + 1; i++) { -+ J_ASSERT(i < 16); -+ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i]; ++ rc = seq_open(file, &ext3_mb_seq_history_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = s; ++ } else { ++ kfree(s->history); ++ kfree(s); + } -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) -+ goto out; -+ err = 0; -+out: -+ brelse(bh); -+ if (handle) -+ ext3_journal_stop(handle); -+ return err; ++ return rc; ++ +} + -+int ext3_mb_generate_buddy(struct ext3_buddy *e3b) ++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) +{ -+ struct super_block *sb = e3b->bd_sb; -+ struct buffer_head *bh; -+ int i, count = 0; ++ struct seq_file *seq = (struct seq_file *)file->private_data; ++ struct ext3_mb_proc_session *s = seq->private; ++ kfree(s->history); ++ kfree(s); ++ return seq_release(inode, file); ++} + -+ mb_debug("generate buddy for group %d\n", e3b->bd_group); -+ memset(e3b->bd_bh->b_data, 0xff, sb->s_blocksize); -+ memset(e3b->bd_bh2->b_data, 0xff, sb->s_blocksize); ++static struct file_operations ext3_mb_seq_history_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, ++}; + -+ bh = read_block_bitmap(sb, e3b->bd_group); -+ if (bh == NULL) -+ return -EIO; ++static void ext3_mb_history_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; + -+ /* mb_free_blocks will set real free */ -+ e3b->bd_bd->bb_free = 0; -+ 
e3b->bd_bd->bb_first_free = 1 << 15; -+ /* -+ * if change bb_counters size, don't forget about -+ * ext3_mb_init_backend() -bzzz -+ */ -+ memset(e3b->bd_bd->bb_counters, 0, -+ sizeof(unsigned) * (sb->s_blocksize_bits + 2)); ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_history", sbi->s_mb_proc); ++ remove_proc_entry(name, proc_root_ext3); + -+ /* loop over the blocks, and create buddies for free ones */ -+ for (i = 0; i < sb->s_blocksize * 8; i++) { -+ if (!mb_test_bit(i, (void *) bh->b_data)) { -+ mb_free_blocks(e3b, i, 1); -+ count++; ++ if (sbi->s_mb_history) ++ kfree(sbi->s_mb_history); ++} ++ ++static void ext3_mb_history_init(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ int i; ++ ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); ++ if (sbi->s_mb_proc != NULL) { ++ struct proc_dir_entry *p; ++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_history_fops; ++ p->data = sb; + } + } -+ brelse(bh); -+ mb_check_buddy(e3b); -+ ext3_mb_dirty_buddy(e3b); + -+ return 0; ++ sbi->s_mb_history_max = 1000; ++ sbi->s_mb_history_cur = 0; ++ spin_lock_init(&sbi->s_mb_history_lock); ++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); ++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); ++ memset(sbi->s_mb_history, 0, i); ++ /* if we can't allocate history, then we simple won't use it */ +} + -+EXPORT_SYMBOL(ext3_mb_new_blocks); ++static void ++ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_history h; ++ ++ if (likely(sbi->s_mb_history == NULL)) ++ return; ++ ++ h.goal = ac->ac_g_ex; ++ h.result = ac->ac_b_ex; ++ h.found = ac->ac_found; ++ h.cr = ac->ac_criteria; ++ h.groups = ac->ac_groups_scanned; ++ h.tail = ac->ac_tail; ++ h.buddy = ac->ac_buddy; 
++ h.merged = 0; ++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) ++ h.merged = 1; ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); ++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) ++ sbi->s_mb_history_cur = 0; ++ spin_unlock(&sbi->s_mb_history_lock); ++} + -+#define MB_CREDITS \ -+ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \ -+ 2 * EXT3_SINGLEDATA_TRANS_BLOCKS) ++#else ++#define ext3_mb_history_release(sb) ++#define ext3_mb_history_init(sb) ++#endif + -+int ext3_mb_init_backend(struct super_block *sb, int *created) ++int ext3_mb_init_backend(struct super_block *sb) +{ -+ int err, i, len, descr_per_block, buddy_offset, size; -+ struct inode *root = sb->s_root->d_inode; + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_mb_grp_header *hdr; -+ struct buffer_head *bh = NULL; -+ unsigned long block; -+ struct dentry *db; -+ handle_t *handle; -+ tid_t target; -+ -+ *created = 0; ++ int i, len; ++ + len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; -+ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL); -+ if (sbi->s_buddy_blocks == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ sbi->s_group_info = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n"); + return -ENOMEM; + } -+ memset(sbi->s_buddy_blocks, 0, len); -+ sbi->s_buddy = NULL; -+ -+ down(&root->i_sem); -+ len = strlen(EXT3_BUDDY_FILE); -+ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len); -+ if (IS_ERR(db)) { -+ err = PTR_ERR(db); -+ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err); -+ up(&root->i_sem); -+ goto out; -+ } ++ memset(sbi->s_group_info, 0, len); + -+ if (db->d_inode == NULL) { -+ err = ext3_create(root, db, S_IFREG, NULL); -+ if (err) { -+ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err); -+ up(&root->i_sem); -+ goto out; 
-+ } -+ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME; -+ *created = 1; -+ mb_debug("no buddy file, regenerate\n"); -+ } -+ up(&root->i_sem); -+ sbi->s_buddy = igrab(db->d_inode); -+ -+ /* calculate needed size */ -+ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) -+ / sizeof(struct ext3_mb_group_descr); -+ buddy_offset = (sbi->s_groups_count + descr_per_block - 1) -+ / descr_per_block; -+ len = sbi->s_groups_count * sb->s_blocksize * 2 + -+ buddy_offset * sb->s_blocksize; -+ if (len != i_size_read(sbi->s_buddy)) { -+ if (*created == 0) -+ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n", -+ (unsigned) len, -+ (unsigned) i_size_read(sbi->s_buddy)); -+ *created = 1; -+ } -+ -+ /* read/create mb group descriptors */ -+ for (i = 0; i < buddy_offset; i++) { -+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); -+ if (IS_ERR(handle)) { -+ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); -+ err = PTR_ERR(handle); -+ goto err_out; -+ } -+ -+ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err); -+ if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err); -+ goto err_out; -+ } -+ hdr = (struct ext3_mb_grp_header *) bh->b_data; -+ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ goto err_out; -+ if (*created == 0) -+ printk(KERN_ERR -+ "EXT3-fs: invalid header 0x%x in %d," -+ "regenerate\n", hdr->mh_magic, i); -+ *created = 1; -+ hdr->mh_magic = EXT3_MB_MAGIC_V1; -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) -+ goto err_out; -+ } -+ brelse(bh); -+ ext3_journal_stop(handle); ++ sbi->s_buddy_cache = new_inode(sb); ++ if (sbi->s_buddy_cache == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); ++ kfree(sbi->s_group_info); ++ return -ENOMEM; + } + + /* -+ * if change bb_counters size, don't forget about ext3_mb_generate_buddy() ++ * calculate needed size. 
if change bb_counters size, ++ * don't forget about ext3_mb_generate_buddy() + */ -+ len = sizeof(struct ext3_buddy_group_blocks); -+ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2); ++ len = sizeof(struct ext3_group_info); ++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); + for (i = 0; i < sbi->s_groups_count; i++) { ++ struct ext3_group_desc * desc; + -+ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL); -+ if (sbi->s_buddy_blocks[i] == NULL) { ++ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_group_info[i] == NULL) { + printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); -+ err = -ENOMEM; -+ goto out2; -+ } -+ memset(sbi->s_buddy_blocks[i], 0, len); -+ -+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); -+ if (IS_ERR(handle)) { -+ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); -+ err = PTR_ERR(handle); -+ goto out2; -+ } -+ -+ /* allocate block for bitmap */ -+ block = buddy_offset + i * 2; -+ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); -+ if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err); -+ goto out2; -+ } -+ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; -+ brelse(bh); -+ -+ /* allocate block for buddy */ -+ block = buddy_offset + i * 2 + 1; -+ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); -+ if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err); -+ goto out2; ++ goto err_out; + } -+ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; -+ brelse(bh); -+ -+ size = (block + 1) << sbi->s_buddy->i_blkbits; -+ if (size > sbi->s_buddy->i_size) { -+ *created = 1; -+ EXT3_I(sbi->s_buddy)->i_disksize = size; -+ i_size_write(sbi->s_buddy, size); -+ mark_inode_dirty(sbi->s_buddy); ++ desc = ext3_get_group_desc(sb, i, NULL); ++ if (desc == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant read descriptor %u\n", i); ++ goto err_out; + } -+ ext3_journal_stop(handle); -+ -+ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock); -+ 
sbi->s_buddy_blocks[i]->bb_md_cur = NULL; -+ sbi->s_buddy_blocks[i]->bb_tid = 0; ++ memset(sbi->s_group_info[i], 0, len); ++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, ++ &sbi->s_group_info[i]->bb_state); ++ sbi->s_group_info[i]->bb_free = ++ le16_to_cpu(desc->bg_free_blocks_count); + } + -+ if (journal_start_commit(sbi->s_journal, &target)) -+ log_wait_commit(sbi->s_journal, target); -+ -+out2: -+ dput(db); -+out: -+ return err; ++ return 0; + +err_out: -+ return err; ++ while (--i >= 0) ++ kfree(sbi->s_group_info[i]); ++ iput(sbi->s_buddy_cache); ++ ++ return -ENOMEM; +} + -+int ext3_mb_write_descriptors(struct super_block *sb) ++int ext3_mb_init(struct super_block *sb, int needs_recovery) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_buddy e3b; -+ int ret = 0, i, err; ++ struct inode *root = sb->s_root->d_inode; ++ unsigned i, offset, max; ++ struct dentry *dentry; + -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ if (sbi->s_buddy_blocks[i] == NULL) -+ continue; ++ if (!test_opt(sb, MBALLOC)) ++ return 0; + -+ err = ext3_mb_load_buddy(sb, i, &e3b); -+ if (err == 0) { -+ ext3_mb_update_descr(&e3b); -+ ext3_mb_release_desc(&e3b); -+ } else -+ ret = err; ++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); ++ ++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_offsets == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ return -ENOMEM; + } -+ return ret; ++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_maxs == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ ++ /* order 0 is regular bitmap */ ++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; ++ sbi->s_mb_offsets[0] = 0; ++ ++ i = 1; ++ offset = 0; ++ max = sb->s_blocksize << 2; ++ do { ++ sbi->s_mb_offsets[i] = offset; ++ sbi->s_mb_maxs[i] = max; ++ offset += 1 << (sb->s_blocksize_bits - i); ++ max = max >> 1; ++ i++; ++ } while (i <= sb->s_blocksize_bits + 1); ++ ++ ++ /* init file for buddy data */ ++ if ((i = 
ext3_mb_init_backend(sb))) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return i; ++ } ++ ++ spin_lock_init(&sbi->s_reserve_lock); ++ spin_lock_init(&sbi->s_md_lock); ++ INIT_LIST_HEAD(&sbi->s_active_transaction); ++ INIT_LIST_HEAD(&sbi->s_closed_transaction); ++ INIT_LIST_HEAD(&sbi->s_committed_transaction); ++ spin_lock_init(&sbi->s_bal_lock); ++ ++ /* remove old on-disk buddy file */ ++ down(&root->i_sem); ++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy")); ++ if (dentry->d_inode != NULL) { ++ i = vfs_unlink(root, dentry); ++ if (i != 0) ++ printk("EXT3-fs: can't remove .buddy file: %d\n", i); ++ } ++ dput(dentry); ++ up(&root->i_sem); ++ ++ ext3_mb_history_init(sb); ++ ++ printk("EXT3-fs: mballoc enabled\n"); ++ return 0; +} + +int ext3_mb_release(struct super_block *sb) @@ -1756,78 +2211,40 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + spin_unlock(&sbi->s_md_lock); + ext3_mb_free_committed_blocks(sb); + -+ if (sbi->s_buddy_blocks) { -+ ext3_mb_write_descriptors(sb); ++ if (sbi->s_group_info) { + for (i = 0; i < sbi->s_groups_count; i++) { -+ if (sbi->s_buddy_blocks[i] == NULL) ++ if (sbi->s_group_info[i] == NULL) + continue; -+ kfree(sbi->s_buddy_blocks[i]); ++ kfree(sbi->s_group_info[i]); + } -+ kfree(sbi->s_buddy_blocks); -+ } -+ if (sbi->s_buddy) -+ iput(sbi->s_buddy); ++ kfree(sbi->s_group_info); ++ } ++ if (sbi->s_mb_offsets) ++ kfree(sbi->s_mb_offsets); ++ if (sbi->s_mb_maxs) ++ kfree(sbi->s_mb_maxs); ++ if (sbi->s_buddy_cache) ++ iput(sbi->s_buddy_cache); + if (sbi->s_blocks_reserved) + printk("ext3-fs: %ld blocks being reserved at umount!\n", + sbi->s_blocks_reserved); + if (ext3_mb_stats) { -+ printk("EXT3-fs: mballoc: %lu blocks %lu reqs " -+ "(%lu success)\n", sbi->s_bal_allocated, -+ sbi->s_bal_reqs, sbi->s_bal_success); -+ printk("EXT3-fs: mballoc: %lu extents scanned, " -+ "%lu goal hits, %lu breaks\n", sbi->s_bal_ex_scanned, -+ sbi->s_bal_goals, sbi->s_bal_breaks); -+ } -+ -+ 
return 0; -+} -+ -+int ext3_mb_init(struct super_block *sb, int needs_recovery) -+{ -+ struct ext3_buddy e3b; -+ int i, err, created; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ /* init file for buddy data */ -+ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ if ((err = ext3_mb_init_backend(sb, &created))) -+ return err; -+ -+repeat: -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { -+ err = ext3_mb_load_buddy(sb, i, &e3b); -+ if (err) { -+ /* FIXME: release backend */ -+ return err; -+ } -+ if (created || needs_recovery) -+ ext3_mb_generate_buddy(&e3b); -+ else -+ err = ext3_mb_load_descr(&e3b); -+ ext3_mb_release_desc(&e3b); -+ if (err == -ENODATA) { -+ created = 1; -+ goto repeat; -+ } -+ } -+ if (created || needs_recovery) -+ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n", -+ EXT3_SB(sb)->s_groups_count); -+ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); -+ spin_lock_init(&EXT3_SB(sb)->s_md_lock); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); -+ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ -+ spin_lock_init(&EXT3_SB(sb)->s_bal_lock); -+ if (ext3_mb_stats) { -+ printk("EXT3-fs: mballoc enabled (stats)\n"); -+ } else { -+ printk("EXT3-fs: mballoc enabled\n"); -+ } ++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", ++ atomic_read(&sbi->s_bal_allocated), ++ atomic_read(&sbi->s_bal_reqs), ++ atomic_read(&sbi->s_bal_success)); ++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " ++ "%u 2^N hits, %u breaks\n", ++ atomic_read(&sbi->s_bal_ex_scanned), ++ atomic_read(&sbi->s_bal_goals), ++ atomic_read(&sbi->s_bal_2orders), ++ atomic_read(&sbi->s_bal_breaks)); ++ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", ++ sbi->s_mb_buddies_generated++, ++ sbi->s_mb_generation_time); ++ } ++ ++ ext3_mb_history_release(sb); + + return 0; +} @@ -1874,8 +2291,11 @@ Index: 
linux-2.6.9/fs/ext3/mballoc.c + mb_debug("\n"); + ext3_unlock_group(sb, md->group); + ++ /* balance refcounts from ext3_mb_free_metadata() */ ++ page_cache_release(e3b.bd_buddy_page); ++ page_cache_release(e3b.bd_bitmap_page); ++ + kfree(md); -+ ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + + } while (md); @@ -1892,7 +2312,7 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + /* new transaction! time to close last one and free blocks for + * committed transaction. we know that only transaction can be + * active, so previos transaction can be being logged and we -+ * know that transaction before previous is known to be alreade ++ * know that transaction before previous is known to be already + * logged. this means that now we may free blocks freed in all + * transactions before previous one. hope I'm clear enough ... */ + @@ -1915,12 +2335,15 @@ Index: linux-2.6.9/fs/ext3/mballoc.c +int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, + int group, int block, int count) +{ -+ struct ext3_buddy_group_blocks *db = e3b->bd_bd; ++ struct ext3_group_info *db = e3b->bd_info; + struct super_block *sb = e3b->bd_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_free_metadata *md; + int i; + ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ + ext3_lock_group(sb, group); + for (i = 0; i < count; i++) { + md = db->bb_md_cur; @@ -1942,6 +2365,12 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + spin_lock(&sbi->s_md_lock); + list_add(&md->list, &sbi->s_active_transaction); + spin_unlock(&sbi->s_md_lock); ++ /* protect buddy cache from being freed, ++ * otherwise we'll refresh it from ++ * on-disk bitmap and lose not-yet-available ++ * blocks */ ++ page_cache_get(e3b->bd_buddy_page); ++ page_cache_get(e3b->bd_bitmap_page); + db->bb_md_cur = md; + db->bb_tid = handle->h_transaction->t_tid; + mb_debug("new md 0x%p for group %u\n", @@ -2053,12 +2482,13 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + if (err) + goto error_return; + -+ 
if (unlikely(ext3_mb_aggressive)) { ++#ifdef AGGRESSIVE_CHECK ++ { + int i; + for (i = 0; i < count; i++) + J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); + } -+ ++#endif + mb_clear_bits(bitmap_bh->b_data, bit, count); + + /* We dirtied the bitmap block */ @@ -2081,7 +2511,6 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); + -+ ext3_mb_dirty_buddy(&e3b); + ext3_mb_release_desc(&e3b); + + *freed = count; @@ -2147,50 +2576,30 @@ Index: linux-2.6.9/fs/ext3/mballoc.c + return ret; +} + -+void ext3_free_blocks(handle_t *handle, struct inode *inode, -+ unsigned long block, unsigned long count, int metadata) ++ ++void ext3_free_blocks(handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count, int metadata) +{ ++ struct super_block *sb; + int freed; + -+ if (!test_opt(inode->i_sb, MBALLOC) || -+ EXT3_SB(inode->i_sb)->s_buddy_blocks == NULL) -+ ext3_free_blocks_sb(handle, inode->i_sb, block, count, &freed); ++ sb = inode->i_sb; ++ if (!test_opt(sb, MBALLOC)) ++ ext3_free_blocks_sb(handle, sb, block, count, &freed); + else -+ ext3_mb_free_blocks(handle, inode, block,count,metadata,&freed); -+ ++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); + if (freed) + DQUOT_FREE_BLOCK(inode, freed); + return; +} -Index: linux-2.6.5-7.201/fs/ext3/proc.c -=================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/proc.c 2005-10-13 19:40:57.851699336 +0400 -+++ linux-2.6.5-7.201/fs/ext3/proc.c 2005-10-14 09:02:36.000000000 +0400 -@@ -0,0 +1,195 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ + +#define EXT3_ROOT "ext3" -+#define EXT3_MB_AGGRESSIVE_NAME "mb_aggressive" +#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" ++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" + -+ -+static 
struct proc_dir_entry *proc_root_ext3; -+ -+ -+static int ext3_mb_aggressive_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) ++static int ext3_mb_stats_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) +{ + int len; + @@ -2198,19 +2607,19 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + if (off != 0) + return 0; + -+ len = sprintf(page, "%ld\n", ext3_mb_aggressive); ++ len = sprintf(page, "%ld\n", ext3_mb_stats); + *start = page; + return len; +} + -+static int ext3_mb_aggressive_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) ++static int ext3_mb_stats_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) +{ + char str[32]; + + if (count >= sizeof(str)) { + printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", -+ EXT3_MB_AGGRESSIVE_NAME, sizeof(str)); ++ EXT3_MB_STATS_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } + @@ -2218,12 +2627,12 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + return -EFAULT; + + /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ ext3_mb_aggressive = (simple_strtol(str, NULL, 0) != 0); ++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); + return count; +} + -+static int ext3_mb_stats_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) ++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) +{ + int len; + @@ -2231,19 +2640,20 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + if (off != 0) + return 0; + -+ len = sprintf(page, "%ld\n", ext3_mb_stats); ++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); + *start = page; + return len; +} + -+static int ext3_mb_stats_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) ++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) +{ + char str[32]; ++ long value; + + if (count >= sizeof(str)) { + 
printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", -+ EXT3_MB_STATS_NAME, sizeof(str)); ++ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } + @@ -2251,12 +2661,17 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + return -EFAULT; + + /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_max_to_scan = value; ++ + return count; +} + -+static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) ++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) +{ + int len; + @@ -2264,20 +2679,20 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + if (off != 0) + return 0; + -+ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); ++ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan); + *start = page; + return len; +} + -+static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) ++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) +{ + char str[32]; + long value; + + if (count >= sizeof(str)) { + printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", -+ EXT3_MB_MAX_TO_SCAN_NAME, sizeof(str)); ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } + @@ -2286,47 +2701,32 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + + /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ + value = simple_strtol(str, NULL, 0); -+ if (value <= 0) ++ if (value <= 0) + return -ERANGE; + -+ ext3_mb_max_to_scan = value; ++ ext3_mb_min_to_scan = value; + + return count; +} + +int __init init_ext3_proc(void) +{ -+ struct proc_dir_entry *proc_ext3_mb_aggressive; + struct proc_dir_entry *proc_ext3_mb_stats; + struct proc_dir_entry *proc_ext3_mb_max_to_scan; ++ struct proc_dir_entry 
*proc_ext3_mb_min_to_scan; + + proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); + if (proc_root_ext3 == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT); -+ return -EIO; -+ } -+ -+ /* Initialize EXT3_MB_AGGRESSIVE_NAME */ -+ proc_ext3_mb_aggressive = create_proc_entry(EXT3_MB_AGGRESSIVE_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_aggressive == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_AGGRESSIVE_NAME); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT); + return -EIO; + } + -+ proc_ext3_mb_aggressive->data = NULL; -+ proc_ext3_mb_aggressive->read_proc = ext3_mb_aggressive_read; -+ proc_ext3_mb_aggressive->write_proc = ext3_mb_aggressive_write; -+ + /* Initialize EXT3_MB_STATS_NAME */ + proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_stats == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_STATS_NAME); -+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_STATS_NAME); + remove_proc_entry(EXT3_ROOT, proc_root_fs); + return -EIO; + } @@ -2337,13 +2737,12 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + + /* Initialize EXT3_MAX_TO_SCAN_NAME */ + proc_ext3_mb_max_to_scan = create_proc_entry( -+ EXT3_MB_MAX_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ EXT3_MB_MAX_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_max_to_scan == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", -+ EXT3_MB_STATS_NAME); ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_MAX_TO_SCAN_NAME); + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); + return -EIO; + } @@ 
-2352,131 +2751,43 @@ Index: linux-2.6.5-7.201/fs/ext3/proc.c + proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; + proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; + ++ /* Initialize EXT3_MIN_TO_SCAN_NAME */ ++ proc_ext3_mb_min_to_scan = create_proc_entry( ++ EXT3_MB_MIN_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_min_to_scan == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_MIN_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_min_to_scan->data = NULL; ++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; ++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; ++ + return 0; +} + +void exit_ext3_proc(void) +{ -+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); +} -Index: linux-2.6.9/fs/ext3/inode.c -=================================================================== ---- linux-2.6.9.orig/fs/ext3/inode.c 2005-10-14 09:10:12.000000000 +0400 -+++ linux-2.6.9/fs/ext3/inode.c 2005-10-14 09:10:13.000000000 +0400 -@@ -572,7 +572,7 @@ - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < keys; i++) -- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); - return err; - } - -@@ -673,7 +673,7 @@ - if (err == -EAGAIN) - for (i = 0; i < num; i++) - ext3_free_blocks(handle, inode, -- le32_to_cpu(where[i].key), 1); -+ le32_to_cpu(where[i].key), 1, 1); - return err; - } - -@@ -1831,7 +1831,7 @@ - } - } - -- ext3_free_blocks(handle, inode, block_to_free, count); 
-+ ext3_free_blocks(handle, inode, block_to_free, count, 1); - } - - /** -@@ -2004,7 +2004,7 @@ - ext3_journal_test_restart(handle, inode); - } - -- ext3_free_blocks(handle, inode, nr, 1); -+ ext3_free_blocks(handle, inode, nr, 1, 1); - - if (parent_bh) { - /* -Index: linux-2.6.9/fs/ext3/super.c +Index: linux-2.6.9-full/fs/ext3/Makefile =================================================================== ---- linux-2.6.9.orig/fs/ext3/super.c 2005-10-14 09:10:12.000000000 +0400 -+++ linux-2.6.9/fs/ext3/super.c 2005-10-14 09:10:31.000000000 +0400 -@@ -394,6 +394,7 @@ - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_mb_release(sb); - ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); -@@ -590,7 +591,7 @@ - Opt_commit, Opt_journal_update, Opt_journal_inum, - Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, -- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, -+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_mballoc, Opt_mbfactor, - Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug, - }; -@@ -644,6 +645,8 @@ - {Opt_iopen_nopriv, "iopen_nopriv"}, - {Opt_extents, "extents"}, - {Opt_extdebug, "extdebug"}, -+ {Opt_mballoc, "mballoc"}, -+ {Opt_mballoc, "mbfactor=%u"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL}, - {Opt_resize, "resize"}, -@@ -954,6 +957,16 @@ - case Opt_extdebug: - set_opt (sbi->s_mount_opt, EXTDEBUG); - break; -+ case Opt_mballoc: -+ set_opt (sbi->s_mount_opt, MBALLOC); -+ break; -+ case Opt_mbfactor: -+ if (match_int(&args[0], &option)) -+ return 0; -+ if (option < 0) -+ return 0; -+ sbi->s_mb_factor = option; -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1637,6 +1650,7 @@ - ext3_count_dirs(sb)); - - ext3_ext_init(sb); -+ ext3_mb_init(sb, needs_recovery); +--- linux-2.6.9-full.orig/fs/ext3/Makefile 2005-12-16 23:16:41.000000000 +0300 
++++ linux-2.6.9-full/fs/ext3/Makefile 2005-12-16 23:16:42.000000000 +0300 +@@ -6,7 +6,7 @@ - return 0; - -@@ -2419,7 +2433,13 @@ - - static int __init init_ext3_fs(void) - { -- int err = init_ext3_xattr(); -+ int err; -+ -+ err = init_ext3_proc(); -+ if (err) -+ return err; -+ -+ err = init_ext3_xattr(); - if (err) - return err; - err = init_inodecache(); -@@ -2441,6 +2461,7 @@ - unregister_filesystem(&ext3_fs_type); - destroy_inodecache(); - exit_ext3_xattr(); -+ exit_ext3_proc(); - } + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o ++ extents.o mballoc.o - int ext3_prep_san_write(struct inode *inode, long *blocks, + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff --git a/lustre/kernel_patches/patches/ext3-nlinks-2.6.9.patch b/lustre/kernel_patches/patches/ext3-nlinks-2.6.9.patch new file mode 100644 index 0000000..62bf156 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-nlinks-2.6.9.patch @@ -0,0 +1,163 @@ +diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c +--- orig/fs/ext3/namei.c 2005-10-12 13:58:19.000000000 -0700 ++++ patch/fs/ext3/namei.c 2005-10-12 14:00:33.000000000 -0700 +@@ -1603,11 +1603,17 @@ + static inline void ext3_inc_count(handle_t *handle, struct inode *inode) + { + inode->i_nlink++; ++ if (is_dx(inode) && inode->i_nlink > 1) { ++ /* limit is 16-bit i_links_count */ ++ if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2) ++ inode->i_nlink = 1; ++ } + } + + static inline void ext3_dec_count(handle_t *handle, struct inode *inode) + { +- inode->i_nlink--; ++ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) ++ inode->i_nlink--; + } + + static int ext3_add_nondir(handle_t *handle, +@@ -1706,7 +1712,7 @@ + struct ext3_dir_entry_2 * de; + int err, retries = 0; + +- if (dir->i_nlink >= EXT3_LINK_MAX) ++ if (EXT3_DIR_LINK_MAXED(dir)) + return -EMLINK; + + retry: +@@ -1729,7 
+1735,7 @@ + inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { +- inode->i_nlink--; /* is this nlink == 0? */ ++ ext3_dec_count(handle, inode); /* is this nlink == 0? */ + ext3_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; +@@ -1761,7 +1767,7 @@ + iput (inode); + goto out_stop; + } +- dir->i_nlink++; ++ ext3_inc_count(handle, dir); + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + d_instantiate(dentry, inode); +@@ -2026,10 +2032,10 @@ + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_rmdir; +- if (inode->i_nlink != 2) +- ext3_warning (inode->i_sb, "ext3_rmdir", +- "empty directory has nlink!=2 (%d)", +- inode->i_nlink); ++ if (!EXT3_DIR_LINK_EMPTY(inode)) ++ ext3_warning(inode->i_sb, "ext3_rmdir", ++ "empty directory has too many links (%d)", ++ inode->i_nlink); + inode->i_version++; + inode->i_nlink = 0; + /* There's no need to set i_disksize: the fact that i_nlink is +@@ -2039,7 +2045,7 @@ + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); +- dir->i_nlink--; ++ ext3_dec_count(handle, dir); + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + +@@ -2090,7 +2096,7 @@ + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); +- inode->i_nlink--; ++ ext3_dec_count(handle, inode); + if (!inode->i_nlink) + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime; +@@ -2165,7 +2171,7 @@ + struct inode *inode = old_dentry->d_inode; + int err, retries = 0; + +- if (inode->i_nlink >= EXT3_LINK_MAX) ++ if (EXT3_DIR_LINK_MAXED(inode)) + return -EMLINK; + + retry: +@@ -2252,8 +2258,8 @@ + if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) + goto end_rename; + retval = -EMLINK; +- if (!new_inode && new_dir!=old_dir && +- new_dir->i_nlink >= 
EXT3_LINK_MAX) ++ if (!new_inode && new_dir != old_dir && ++ EXT3_DIR_LINK_MAXED(new_dir)) + goto end_rename; + } + if (!new_bh) { +@@ -2310,7 +2316,7 @@ + } + + if (new_inode) { +- new_inode->i_nlink--; ++ ext3_dec_count(handle, new_inode); + new_inode->i_ctime = CURRENT_TIME_SEC; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; +@@ -2321,11 +2327,13 @@ + PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); + BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, dir_bh); +- old_dir->i_nlink--; ++ ext3_dec_count(handle, old_dir); + if (new_inode) { +- new_inode->i_nlink--; ++ /* checked empty_dir above, can't have another parent, ++ * ext3_dec_count() won't work for many-linked dirs */ ++ new_inode->i_nlink = 0; + } else { +- new_dir->i_nlink++; ++ ext3_inc_count(handle, new_dir); + ext3_update_dx_flag(new_dir); + ext3_mark_inode_dirty(handle, new_dir); + } + +Index: linux-2.6.7/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.7.orig/include/linux/ext3_fs.h 2004-06-15 23:19:36.000000000 -0600 ++++ linux-2.6.7/include/linux/ext3_fs.h 2004-08-20 17:41:27.000000000 -0600 +@@ -79,7 +81,7 @@ + /* + * Maximal count of links to a file + */ +-#define EXT3_LINK_MAX 32000 ++#define EXT3_LINK_MAX 65000 + + /* + * Macro-instructions used to manage several block sizes +@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 { + */ + + #ifdef CONFIG_EXT3_INDEX +- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ +- EXT3_FEATURE_COMPAT_DIR_INDEX) && \ ++#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ ++ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ + (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) +-#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) +-#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) ++#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_EMPTY(dir) 
((dir)->i_nlink == 2 || \ ++ (is_dx(dir) && (dir)->i_nlink == 1)) + #else + #define is_dx(dir) 0 +-#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) + #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) + #endif + diff --git a/lustre/kernel_patches/patches/ext3-no-write-super.patch b/lustre/kernel_patches/patches/ext3-no-write-super.patch deleted file mode 100644 index d2dcdae..0000000 --- a/lustre/kernel_patches/patches/ext3-no-write-super.patch +++ /dev/null @@ -1,22 +0,0 @@ - 0 files changed - ---- linux-2.4.20/fs/ext3/super.c~ext3-no-write-super 2003-08-11 13:20:17.000000000 +0400 -+++ linux-2.4.20-alexey/fs/ext3/super.c 2003-08-11 13:31:35.000000000 +0400 -@@ -1849,7 +1849,6 @@ void ext3_write_super (struct super_bloc - if (down_trylock(&sb->s_lock) == 0) - BUG(); /* aviro detector */ - sb->s_dirt = 0; -- target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); - - /* - * Tricky --- if we are unmounting, the write really does need -@@ -1857,6 +1856,7 @@ void ext3_write_super (struct super_bloc - * sb->s_root. 
- */ - if (do_sync_supers || !sb->s_root) { -+ target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); - unlock_super(sb); - log_wait_commit(EXT3_SB(sb)->s_journal, target); - lock_super(sb); - -_ diff --git a/lustre/kernel_patches/patches/ext3-orphan_lock-2.4.19-suse.patch b/lustre/kernel_patches/patches/ext3-orphan_lock-2.4.19-suse.patch deleted file mode 100644 index 4c16fe6..0000000 --- a/lustre/kernel_patches/patches/ext3-orphan_lock-2.4.19-suse.patch +++ /dev/null @@ -1,85 +0,0 @@ -Index: linux-2.4.19/fs/ext3/namei.c -=================================================================== ---- linux-2.4.19.orig/fs/ext3/namei.c 2004-04-23 22:36:03.000000000 -0400 -+++ linux-2.4.19/fs/ext3/namei.c 2004-04-23 22:37:37.000000000 -0400 -@@ -1751,8 +1751,8 @@ - struct super_block *sb = inode->i_sb; - struct ext3_iloc iloc; - int err = 0, rc; -- -- lock_super(sb); -+ -+ down(&EXT3_SB(sb)->s_orphan_lock); - if (!list_empty(&EXT3_I(inode)->i_orphan)) - goto out_unlock; - -@@ -1800,7 +1800,7 @@ - jbd_debug(4, "orphan inode %ld will point to %d\n", - inode->i_ino, NEXT_ORPHAN(inode)); - out_unlock: -- unlock_super(sb); -+ up(&EXT3_SB(sb)->s_orphan_lock); - ext3_std_error(inode->i_sb, err); - return err; - } -@@ -1813,20 +1813,19 @@ - { - struct list_head *prev; - struct ext3_inode_info *ei = EXT3_I(inode); -- struct ext3_sb_info *sbi; -+ struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb); - unsigned long ino_next; - struct ext3_iloc iloc; - int err = 0; - -- lock_super(inode->i_sb); -+ down(&sbi->s_orphan_lock); - if (list_empty(&ei->i_orphan)) { -- unlock_super(inode->i_sb); -+ up(&sbi->s_orphan_lock); - return 0; - } - - ino_next = NEXT_ORPHAN(inode); - prev = ei->i_orphan.prev; -- sbi = EXT3_SB(inode->i_sb); - - jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); - -@@ -1872,10 +1871,10 @@ - if (err) - goto out_brelse; - --out_err: -+out_err: - ext3_std_error(inode->i_sb, err); - out: -- unlock_super(inode->i_sb); -+ up(&sbi->s_orphan_lock); - return err; - 
- out_brelse: -Index: linux-2.4.19/fs/ext3/super.c -=================================================================== ---- linux-2.4.19.orig/fs/ext3/super.c 2004-04-23 22:30:41.000000000 -0400 -+++ linux-2.4.19/fs/ext3/super.c 2004-04-23 22:36:22.000000000 -0400 -@@ -1179,6 +1179,7 @@ - */ - sb->s_op = &ext3_sops; - INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ -+ sema_init(&sbi->s_orphan_lock, 1); - - sb->s_root = 0; - -Index: linux-2.4.19/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.4.19.orig/include/linux/ext3_fs_sb.h 2004-04-23 18:26:27.000000000 -0400 -+++ linux-2.4.19/include/linux/ext3_fs_sb.h 2004-04-23 22:36:22.000000000 -0400 -@@ -69,6 +69,7 @@ - struct inode * s_journal_inode; - struct journal_s * s_journal; - struct list_head s_orphan; -+ struct semaphore s_orphan_lock; - struct block_device *journal_bdev; - #ifdef CONFIG_JBD_DEBUG - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ diff --git a/lustre/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch b/lustre/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch new file mode 100644 index 0000000..57898d5 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch @@ -0,0 +1,29 @@ +Index: linux-stage/fs/ext3/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/ialloc.c 2005-06-26 10:59:43.048185981 +0200 ++++ linux-stage/fs/ext3/ialloc.c 2005-06-26 11:01:21.317716027 +0200 +@@ -775,7 +775,6 @@ + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); +- cond_resched(); + } + return desc_count; + #endif +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2005-06-26 10:59:43.205412542 +0200 ++++ linux-stage/fs/ext3/super.c 2005-06-26 11:02:29.599941754 +0200 +@@ 
-2236,11 +2232,9 @@ + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ +- for (i = 0; i < ngroups; i++) { ++ for (i = 0; i < ngroups; i++) + overhead += ext3_bg_has_super(sb, i) + + ext3_bg_num_gdb(sb, i); +- cond_resched(); +- } + + /* + * Every block group has an inode bitmap, a block diff --git a/lustre/kernel_patches/patches/ext3-statfs-2.6.12.patch b/lustre/kernel_patches/patches/ext3-statfs-2.6.12.patch new file mode 100644 index 0000000..ad7d79b --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-statfs-2.6.12.patch @@ -0,0 +1,177 @@ +Index: linux-2.6.12/fs/ext3/super.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/super.c 2005-06-17 13:48:29.000000000 -0600 ++++ linux-2.6.12/fs/ext3/super.c 2005-11-25 05:59:47.000000000 -0700 +@@ -2165,13 +2165,13 @@ + { + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + unsigned long overhead; +- int i; + + if (test_opt (sb, MINIX_DF)) + overhead = 0; + else { +- unsigned long ngroups; +- ngroups = EXT3_SB(sb)->s_groups_count; ++ unsigned long ngroups = EXT3_SB(sb)->s_groups_count, group; ++ unsigned long three = 1, five = 5, seven = 7; ++ unsigned long metabg = -1UL; + smp_rmb(); + + /* +@@ -2189,11 +2188,14 @@ + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ +- for (i = 0; i < ngroups; i++) { +- overhead += ext3_bg_has_super(sb, i) + +- ext3_bg_num_gdb(sb, i); +- cond_resched(); +- } ++ overhead += 1 + EXT3_SB(sb)->s_gdb_count; /* group 0 */ ++ if (EXT3_HAS_INCOMPAT_FEATURE(sb,EXT3_FEATURE_INCOMPAT_META_BG)) ++ metabg =le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg); ++ ++ while ((group = ext3_list_backups(sb, &three, &five, &seven)) < ++ ngroups) /* sb + group descriptors backups */ ++ overhead += 1 + (group >= metabg ? 
1 : ++ EXT3_SB(sb)->s_gdb_count); + + /* + * Every block group has an inode bitmap, a block +@@ -2205,12 +2204,16 @@ + buf->f_type = EXT3_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; +- buf->f_bfree = ext3_count_free_blocks (sb); ++ buf->f_bfree = percpu_counter_read(&EXT3_SB(sb)->s_freeblocks_counter); ++ if (buf->f_bfree < 0) ++ buf->f_bfree = 0; + buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); + if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); +- buf->f_ffree = ext3_count_free_inodes (sb); ++ buf->f_ffree = percpu_counter_read(&EXT3_SB(sb)->s_freeinodes_counter); ++ if (buf->f_ffree < 0) ++ buf->f_ffree = 0; + buf->f_namelen = EXT3_NAME_LEN; + return 0; + } +Index: linux-2.6.12/fs/ext3/resize.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/resize.c 2005-11-24 15:17:06.000000000 -0700 ++++ linux-2.6.12/fs/ext3/resize.c 2005-11-25 06:01:01.000000000 -0700 +@@ -285,17 +285,17 @@ + * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... + * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... 
+ */ +-static unsigned ext3_list_backups(struct super_block *sb, unsigned *three, +- unsigned *five, unsigned *seven) ++unsigned long ext3_list_backups(struct super_block *sb, unsigned long *three, ++ unsigned long *five, unsigned long *seven) + { +- unsigned *min = three; ++ unsigned long metabg = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg); ++ unsigned long *min = three, ret; + int mult = 3; +- unsigned ret; + + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { +- ret = *min; +- *min += 1; ++ ret = *three; ++ *three += 1; + return ret; + } + +@@ -308,8 +307,26 @@ + mult = 7; + } + +- ret = *min; +- *min *= mult; ++ if (EXT3_HAS_INCOMPAT_FEATURE(sb,EXT3_FEATURE_INCOMPAT_META_BG) && ++ *min >= metabg * EXT3_DESC_PER_BLOCK(sb)) { ++ ret = *min; ++ switch (ret & (EXT3_DESC_PER_BLOCK(sb) - 1)) { ++ case 0: ++ *three = ret + 1; ++ break; ++ case 1: ++ *three = ret + EXT3_DESC_PER_BLOCK(sb) - 2; ++ break; ++ default: ++ *three = (ret | (EXT3_DESC_PER_BLOCK(sb) - 1)) + 1; ++ break; ++ } ++ *five = -1UL; ++ *seven = -1UL; ++ } else { ++ ret = *min; ++ *min *= mult; ++ } + + return ret; + } +@@ -324,17 +337,17 @@ + { + const unsigned long blk = primary->b_blocknr; + const unsigned long end = EXT3_SB(sb)->s_groups_count; +- unsigned three = 1; +- unsigned five = 5; +- unsigned seven = 7; +- unsigned grp; ++ unsigned long three = 1; ++ unsigned long five = 5; ++ unsigned long seven = 7; ++ unsigned long grp; + __u32 *p = (__u32 *)primary->b_data; + int gdbackups = 0; + + while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { + if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){ + ext3_warning(sb, __FUNCTION__, +- "reserved GDT %ld missing grp %d (%ld)\n", ++ "reserved GDT %ld missing grp %ld (%ld)\n", + blk, grp, + grp * EXT3_BLOCKS_PER_GROUP(sb) + blk); + return -EINVAL; +@@ -618,10 +631,8 @@ + struct ext3_sb_info *sbi = EXT3_SB(sb); + const unsigned long last = sbi->s_groups_count; + const int bpg = 
EXT3_BLOCKS_PER_GROUP(sb); +- unsigned three = 1; +- unsigned five = 5; +- unsigned seven = 7; +- unsigned group; ++ unsigned long three = 1, five = 5, seven = 7; ++ unsigned long group; + int rest = sb->s_blocksize - size; + handle_t *handle; + int err = 0, err2; +@@ -672,7 +683,7 @@ + exit_err: + if (err) { + ext3_warning(sb, __FUNCTION__, +- "can't update backup for group %d (err %d), " ++ "can't update backup for group %ld (err %d), " + "forcing fsck on next reboot\n", group, err); + sbi->s_mount_state &= ~EXT3_VALID_FS; + sbi->s_es->s_state &= ~cpu_to_le16(EXT3_VALID_FS); +Index: linux-2.6.12/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.12.orig/include/linux/ext3_fs.h 2005-06-17 13:48:29.000000000 -0600 ++++ linux-2.6.12/include/linux/ext3_fs.h 2005-11-25 05:59:47.000000000 -0700 +@@ -788,6 +788,10 @@ + extern int ext3_group_extend(struct super_block *sb, + struct ext3_super_block *es, + unsigned long n_blocks_count); ++extern unsigned long ext3_list_backups(struct super_block *sb, ++ unsigned long *three, ++ unsigned long *five, ++ unsigned long *seven); + + /* super.c */ + extern void ext3_error (struct super_block *, const char *, const char *, ...) 
diff --git a/lustre/kernel_patches/patches/ext3-unmount_sync.patch b/lustre/kernel_patches/patches/ext3-unmount_sync.patch deleted file mode 100644 index c57903c..0000000 --- a/lustre/kernel_patches/patches/ext3-unmount_sync.patch +++ /dev/null @@ -1,21 +0,0 @@ - fs/ext3/super.c | 7 ++++++- - 1 files changed, 6 insertions(+), 1 deletion(-) - ---- linux-2.4.20/fs/ext3/super.c~ext3-unmount_sync 2003-04-08 23:35:44.000000000 -0600 -+++ linux-2.4.20-braam/fs/ext3/super.c 2003-04-08 23:35:44.000000000 -0600 -@@ -1612,7 +1612,12 @@ void ext3_write_super (struct super_bloc - sb->s_dirt = 0; - target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); - -- if (do_sync_supers) { -+ /* -+ * Tricky --- if we are unmounting, the write really does need -+ * to be synchronous. We can detect that by looking for NULL in -+ * sb->s_root. -+ */ -+ if (do_sync_supers || !sb->s_root) { - unlock_super(sb); - log_wait_commit(EXT3_SB(sb)->s_journal, target); - lock_super(sb); - -_ diff --git a/lustre/kernel_patches/patches/ext3-use-after-free-2.4.19-pre1.patch b/lustre/kernel_patches/patches/ext3-use-after-free-2.4.19-pre1.patch deleted file mode 100644 index 595db54..0000000 --- a/lustre/kernel_patches/patches/ext3-use-after-free-2.4.19-pre1.patch +++ /dev/null @@ -1,53 +0,0 @@ - ./fs/ext3/namei.c | 11 +++++------ - 1 files changed, 5 insertions(+), 6 deletions(-) - -Index: linux-2.4.19-pre1/./fs/ext3/namei.c -=================================================================== ---- linux-2.4.19-pre1.orig/./fs/ext3/namei.c 2003-11-21 01:52:06.000000000 +0300 -+++ linux-2.4.19-pre1/./fs/ext3/namei.c 2003-11-21 01:58:15.000000000 +0300 -@@ -1522,8 +1522,11 @@ - { - int err = ext3_add_entry(handle, dentry, inode); - if (!err) { -- d_instantiate(dentry, inode); -- return 0; -+ err = ext3_mark_inode_dirty(handle, inode); -+ if (err == 0) { -+ d_instantiate(dentry, inode); -+ return 0; -+ } - } - ext3_dec_count(handle, inode); - iput(inode); -@@ -1559,7 +1562,6 @@ - inode->i_op = 
&ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; - inode->i_mapping->a_ops = &ext3_aops; -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - } - ext3_journal_stop(handle, dir); -@@ -1586,7 +1588,6 @@ - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - init_special_inode(inode, mode, rdev); -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - } - ext3_journal_stop(handle, dir); -@@ -2035,7 +2036,6 @@ - inode->i_size = l-1; - } - inode->u.ext3_i.i_disksize = inode->i_size; -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - out_stop: - ext3_journal_stop(handle, dir); -@@ -2069,7 +2069,6 @@ - ext3_inc_count(handle, inode); - atomic_inc(&inode->i_count); - -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - ext3_journal_stop(handle, dir); - return err; diff --git a/lustre/kernel_patches/patches/ext3-use-after-free-suse.patch b/lustre/kernel_patches/patches/ext3-use-after-free-suse.patch deleted file mode 100644 index 7899354..0000000 --- a/lustre/kernel_patches/patches/ext3-use-after-free-suse.patch +++ /dev/null @@ -1,53 +0,0 @@ - ./fs/ext3/namei.c | 11 +++++------ - 1 files changed, 5 insertions(+), 6 deletions(-) - -Index: linux-2.4.19/fs/ext3/namei.c -=================================================================== ---- linux-2.4.19.orig/fs/ext3/namei.c 2004-04-23 22:30:41.000000000 -0400 -+++ linux-2.4.19/fs/ext3/namei.c 2004-04-23 22:36:03.000000000 -0400 -@@ -1522,8 +1522,11 @@ - { - int err = ext3_add_entry(handle, dentry, inode); - if (!err) { -- d_instantiate(dentry, inode); -- return 0; -+ err = ext3_mark_inode_dirty(handle, inode); -+ if (err == 0) { -+ d_instantiate(dentry, inode); -+ return 0; -+ } - } - ext3_dec_count(handle, inode); - iput(inode); -@@ -1559,7 +1562,6 @@ - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; - inode->i_mapping->a_ops = 
&ext3_aops; -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - } - ext3_journal_stop(handle, dir); -@@ -1589,7 +1591,6 @@ - #ifdef CONFIG_EXT3_FS_XATTR - inode->i_op = &ext3_special_inode_operations; - #endif -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - } - ext3_journal_stop(handle, dir); -@@ -2039,7 +2040,6 @@ - inode->i_size = l-1; - } - EXT3_I(inode)->i_disksize = inode->i_size; -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - out_stop: - ext3_journal_stop(handle, dir); -@@ -2073,7 +2073,6 @@ - ext3_inc_count(handle, inode); - atomic_inc(&inode->i_count); - -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); - ext3_journal_stop(handle, dir); - return err; diff --git a/lustre/kernel_patches/patches/extN-wantedi-2.4.19-suse.patch b/lustre/kernel_patches/patches/extN-wantedi-2.4.19-suse.patch deleted file mode 100644 index 02cfef1..0000000 --- a/lustre/kernel_patches/patches/extN-wantedi-2.4.19-suse.patch +++ /dev/null @@ -1,226 +0,0 @@ - fs/ext3/ialloc.c | 40 ++++++++++++++++++++++++++++++++++++++-- - fs/ext3/inode.c | 2 +- - fs/ext3/ioctl.c | 25 +++++++++++++++++++++++++ - fs/ext3/namei.c | 21 +++++++++++++++++---- - include/linux/dcache.h | 5 +++++ - include/linux/ext3_fs.h | 5 ++++- - 6 files changed, 90 insertions(+), 8 deletions(-) - -Index: linux-2.4.19.SuSE/fs/ext3/namei.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/namei.c Sun Nov 16 01:18:04 2003 -+++ linux-2.4.19.SuSE/fs/ext3/namei.c Sun Nov 16 01:23:20 2003 -@@ -1534,6 +1534,19 @@ - return err; - } - -+static struct inode * ext3_new_inode_wantedi(handle_t *handle, struct inode *dir, -+ int mode, struct dentry *dentry) -+{ -+ unsigned long inum = 0; -+ -+ if (dentry->d_fsdata != NULL) { -+ struct dentry_params *param = -+ (struct dentry_params *) dentry->d_fsdata; -+ inum = 
param->p_inum; -+ } -+ return ext3_new_inode(handle, dir, mode, inum); -+} -+ - /* - * By the time this is called, we already have created - * the directory cache entry for the new file, but it -@@ -1557,7 +1570,7 @@ - if (IS_SYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, mode); -+ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ext3_file_inode_operations; -@@ -1585,7 +1598,7 @@ - if (IS_SYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, mode); -+ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - init_special_inode(inode, inode->i_mode, rdev); -@@ -1618,7 +1631,7 @@ - if (IS_SYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, S_IFDIR | mode); -+ inode = ext3_new_inode_wantedi (handle, dir, S_IFDIR | mode, dentry); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; -@@ -2013,7 +2026,7 @@ - if (IS_SYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); -+ inode = ext3_new_inode_wantedi (handle, dir, S_IFLNK|S_IRWXUGO, dentry); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; -Index: linux-2.4.19.SuSE/fs/ext3/ialloc.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/ialloc.c Sun Nov 16 01:20:17 2003 -+++ linux-2.4.19.SuSE/fs/ext3/ialloc.c Sun Nov 16 01:24:49 2003 -@@ -330,7 +330,8 @@ - * For other inodes, search forward from the parent directory's block - * group to find a free inode. 
- */ --struct inode * ext3_new_inode (handle_t *handle, struct inode * dir, int mode) -+struct inode * ext3_new_inode(handle_t *handle, const struct inode * dir, -+ int mode, unsigned long goal) - { - struct super_block * sb; - struct buffer_head * bh; -@@ -355,7 +356,41 @@ - init_rwsem(&inode->u.ext3_i.truncate_sem); - - lock_super (sb); -- es = sb->u.ext3_sb.s_es; -+ es = EXT3_SB(sb)->s_es; -+ -+ if (goal) { -+ i = (goal - 1) / EXT3_INODES_PER_GROUP(sb); -+ j = (goal - 1) % EXT3_INODES_PER_GROUP(sb); -+ gdp = ext3_get_group_desc(sb, i, &bh2); -+ -+ bitmap_nr = load_inode_bitmap (sb, i); -+ if (bitmap_nr < 0) { -+ err = bitmap_nr; -+ goto fail; -+ } -+ -+ bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr]; -+ -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) goto fail; -+ -+ if (ext3_set_bit(j, bh->b_data)) { -+ printk(KERN_ERR "goal inode %lu unavailable\n", goal); -+ /* Oh well, we tried. */ -+ goto repeat; -+ } -+ -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bh); -+ if (err) goto fail; -+ -+ /* We've shortcircuited the allocation system successfully, -+ * now finish filling in the inode. 
-+ */ -+ goto have_bit_and_group; -+ } -+ - repeat: - gdp = NULL; - i = 0; -@@ -470,6 +505,7 @@ - } - goto repeat; - } -+ have_bit_and_group: - j += i * EXT3_INODES_PER_GROUP(sb) + 1; - if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) { - ext3_error (sb, "ext3_new_inode", -Index: linux-2.4.19.SuSE/fs/ext3/inode.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/inode.c Sun Nov 16 01:20:17 2003 -+++ linux-2.4.19.SuSE/fs/ext3/inode.c Sun Nov 16 01:23:20 2003 -@@ -2168,7 +2168,7 @@ - if (IS_ERR(handle)) - goto out_truncate; - -- new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode); -+ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode, 0); - if (IS_ERR(new_inode)) { - ext3_debug("truncate inode %lu directly (no new inodes)\n", - old_inode->i_ino); -Index: linux-2.4.19.SuSE/fs/ext3/ioctl.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/ioctl.c Fri Nov 9 14:25:04 2001 -+++ linux-2.4.19.SuSE/fs/ext3/ioctl.c Sun Nov 16 01:23:20 2003 -@@ -23,6 +23,31 @@ - ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); - - switch (cmd) { -+ case EXT3_IOC_CREATE_INUM: { -+ char name[32]; -+ struct dentry *dchild, *dparent; -+ int rc = 0; -+ -+ dparent = list_entry(inode->i_dentry.next, struct dentry, -+ d_alias); -+ snprintf(name, sizeof name, "%lu", arg); -+ dchild = lookup_one_len(name, dparent, strlen(name)); -+ if (dchild->d_inode) { -+ printk(KERN_ERR "%*s/%lu already exists (ino %lu)\n", -+ dparent->d_name.len, dparent->d_name.name, arg, -+ dchild->d_inode->i_ino); -+ rc = -EEXIST; -+ } else { -+ dchild->d_fsdata = (void *)arg; -+ rc = vfs_create(inode, dchild, 0644); -+ if (rc) -+ printk(KERN_ERR "vfs_create: %d\n", rc); -+ else if (dchild->d_inode->i_ino != arg) -+ rc = -EEXIST; -+ } -+ dput(dchild); -+ return rc; -+ } - case EXT3_IOC_GETFLAGS: - flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE; - return put_user(flags, 
(int *) arg); -Index: linux-2.4.19.SuSE/include/linux/ext3_fs.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/ext3_fs.h Sun Nov 16 01:20:17 2003 -+++ linux-2.4.19.SuSE/include/linux/ext3_fs.h Sun Nov 16 01:25:42 2003 -@@ -202,6 +202,7 @@ - #define EXT3_IOC_SETFLAGS _IOW('f', 2, long) - #define EXT3_IOC_GETVERSION _IOR('f', 3, long) - #define EXT3_IOC_SETVERSION _IOW('f', 4, long) -+/* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ - #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long) - #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long) - #ifdef CONFIG_JBD_DEBUG -@@ -674,7 +675,8 @@ - dx_hash_info *hinfo); - - /* ialloc.c */ --extern struct inode * ext3_new_inode (handle_t *, struct inode *, int); -+extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int, -+ unsigned long); - extern void ext3_free_inode (handle_t *, struct inode *); - extern struct inode * ext3_orphan_get (struct super_block *, unsigned long); - extern unsigned long ext3_count_free_inodes (struct super_block *); -@@ -765,4 +767,5 @@ - - #endif /* __KERNEL__ */ - -+#define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) - #endif /* _LINUX_EXT3_FS_H */ -Index: linux-2.4.19.SuSE/include/linux/dcache.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/dcache.h Sat Nov 15 17:35:46 2003 -+++ linux-2.4.19.SuSE/include/linux/dcache.h Sun Nov 16 01:23:20 2003 -@@ -62,6 +62,11 @@ - - #define IS_ROOT(x) ((x) == (x)->d_parent) - -+struct dentry_params { -+ unsigned long p_inum; -+ void *p_ptr; -+}; -+ - /* - * "quick string" -- eases parameter passing, but more importantly - * saves "metadata" about the string (ie length and the hash). 
diff --git a/lustre/kernel_patches/patches/fc3_to_rhel4_updates.patch b/lustre/kernel_patches/patches/fc3_to_rhel4_updates.patch new file mode 100644 index 0000000..2286707 --- /dev/null +++ b/lustre/kernel_patches/patches/fc3_to_rhel4_updates.patch @@ -0,0 +1,12 @@ +Index: linux-2.6.10/include/linux/namei.h +=================================================================== +--- linux-2.6.10.orig/include/linux/namei.h 2005-04-06 09:38:35.000000000 -0600 ++++ linux-2.6.10/include/linux/namei.h 2006-01-03 15:32:11.000000000 -0700 +@@ -46,6 +46,7 @@ + #define LOOKUP_PARENT 16 + #define LOOKUP_NOALT 32 + #define LOOKUP_ATOMIC 64 ++#define LOOKUP_REVAL 128 + + /* + * Intent data diff --git a/lustre/kernel_patches/patches/invalidate_show-2.4.19-bgl.patch b/lustre/kernel_patches/patches/invalidate_show-2.4.19-bgl.patch deleted file mode 100644 index 85bdf9e..0000000 --- a/lustre/kernel_patches/patches/invalidate_show-2.4.19-bgl.patch +++ /dev/null @@ -1,121 +0,0 @@ - - - - fs/inode.c | 21 ++++++++++++++------- - fs/smbfs/inode.c | 2 +- - fs/super.c | 4 ++-- - include/linux/fs.h | 2 +- - 4 files changed, 18 insertions(+), 11 deletions(-) - -Index: linux.mcp2/fs/inode.c -=================================================================== ---- linux.mcp2.orig/fs/inode.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/inode.c 2004-05-05 14:31:31.000000000 -0700 -@@ -553,7 +553,8 @@ - /* - * Invalidate all inodes for a device. 
- */ --static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) -+static int invalidate_list(struct list_head *head, struct super_block * sb, -+ struct list_head * dispose, int show) - { - struct list_head *next; - int busy = 0, count = 0; -@@ -578,6 +579,11 @@ - count++; - continue; - } -+ if (show) -+ printk(KERN_ERR -+ "inode busy: dev %s:%lu (%p) mode %o count %u\n", -+ kdevname(sb->s_dev), inode->i_ino, inode, -+ inode->i_mode, atomic_read(&inode->i_count)); - busy = 1; - } - /* only unused inodes may be cached with i_count zero */ -@@ -596,22 +602,23 @@ - /** - * invalidate_inodes - discard the inodes on a device - * @sb: superblock -+ * @show: whether we should display any busy inodes found - * - * Discard all of the inodes for a given superblock. If the discard - * fails because there are busy inodes then a non zero value is returned. - * If the discard is successful all the inodes have been discarded. - */ - --int invalidate_inodes(struct super_block * sb) -+int invalidate_inodes(struct super_block * sb, int show) - { - int busy; - LIST_HEAD(throw_away); - - spin_lock(&inode_lock); -- busy = invalidate_list(&inode_in_use, sb, &throw_away); -- busy |= invalidate_list(&inode_unused, sb, &throw_away); -- busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); -- busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away); -+ busy = invalidate_list(&inode_in_use, sb, &throw_away, show); -+ busy |= invalidate_list(&inode_unused, sb, &throw_away, show); -+ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away, show); -+ busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away, show); - spin_unlock(&inode_lock); - - dispose_list(&throw_away); -@@ -637,7 +644,7 @@ - * hold). 
- */ - shrink_dcache_sb(sb); -- res = invalidate_inodes(sb); -+ res = invalidate_inodes(sb, 0); - drop_super(sb); - } - invalidate_buffers(dev); -Index: linux.mcp2/fs/super.c -=================================================================== ---- linux.mcp2.orig/fs/super.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/super.c 2004-05-05 14:32:06.000000000 -0700 -@@ -838,7 +838,7 @@ - lock_super(sb); - lock_kernel(); - sb->s_flags &= ~MS_ACTIVE; -- invalidate_inodes(sb); /* bad name - it should be evict_inodes() */ -+ invalidate_inodes(sb, 0); /* bad name - it should be evict_inodes() */ - if (sop) { - if (sop->write_super && sb->s_dirt) - sop->write_super(sb); -@@ -847,7 +847,7 @@ - } - - /* Forget any remaining inodes */ -- if (invalidate_inodes(sb)) { -+ if (invalidate_inodes(sb, 1)) { - printk(KERN_ERR "VFS: Busy inodes after unmount. " - "Self-destruct in 5 seconds. Have a nice day...\n"); - } -Index: linux.mcp2/fs/smbfs/inode.c -=================================================================== ---- linux.mcp2.orig/fs/smbfs/inode.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/smbfs/inode.c 2004-05-05 14:31:31.000000000 -0700 -@@ -166,7 +166,7 @@ - { - VERBOSE("\n"); - shrink_dcache_sb(SB_of(server)); -- invalidate_inodes(SB_of(server)); -+ invalidate_inodes(SB_of(server), 0); - } - - /* -Index: linux.mcp2/include/linux/fs.h -=================================================================== ---- linux.mcp2.orig/include/linux/fs.h 2004-05-05 14:31:06.000000000 -0700 -+++ linux.mcp2/include/linux/fs.h 2004-05-05 14:31:31.000000000 -0700 -@@ -1283,7 +1283,7 @@ - extern void set_buffer_flushtime(struct buffer_head *); - extern void balance_dirty(void); - extern int check_disk_change(kdev_t); --extern int invalidate_inodes(struct super_block *); -+extern int invalidate_inodes(struct super_block *, int); - extern int invalidate_device(kdev_t, int); - extern void invalidate_inode_pages(struct inode *); - extern void 
invalidate_inode_pages2(struct address_space *); diff --git a/lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-bgl.patch b/lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-bgl.patch deleted file mode 100644 index 2466af6..0000000 --- a/lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-bgl.patch +++ /dev/null @@ -1,52 +0,0 @@ - fs/Makefile | 2 +- - fs/inode.c | 4 +++- - mm/page_alloc.c | 1 + - 3 files changed, 5 insertions(+), 2 deletions(-) - -Index: linux-ion/fs/inode.c -=================================================================== ---- linux-ion.orig/fs/inode.c 2004-09-27 14:58:03.000000000 -0700 -+++ linux-ion/fs/inode.c 2004-09-27 14:58:34.000000000 -0700 -@@ -5,6 +5,7 @@ - */ - - #include -+#include - #include - #include - #include -@@ -66,7 +67,8 @@ - * NOTE! You also have to own the lock if you change - * the i_state of an inode while it is in use.. - */ --static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; -+spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; -+EXPORT_SYMBOL(inode_lock); - - /* - * Statistics gathering.. 
-Index: linux-ion/fs/Makefile -=================================================================== ---- linux-ion.orig/fs/Makefile 2004-07-28 14:34:57.000000000 -0700 -+++ linux-ion/fs/Makefile 2004-09-27 14:59:37.000000000 -0700 -@@ -7,7 +7,7 @@ - - O_TARGET := fs.o - --export-objs := filesystems.o open.o dcache.o buffer.o -+export-objs := filesystems.o open.o dcache.o buffer.o inode.o - mod-subdirs := nls - - obj-y := open.o read_write.o devices.o file_table.o buffer.o \ -Index: linux-ion/mm/page_alloc.c -=================================================================== ---- linux-ion.orig/mm/page_alloc.c 2004-07-28 14:34:57.000000000 -0700 -+++ linux-ion/mm/page_alloc.c 2004-09-27 14:58:34.000000000 -0700 -@@ -28,6 +28,7 @@ - LIST_HEAD(inactive_list); - LIST_HEAD(active_list); - pg_data_t *pgdat_list; -+EXPORT_SYMBOL(pgdat_list); - - /* Used to look up the address of the struct zone encoded in page->zone */ - zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES]; diff --git a/lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-suse.patch b/lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-suse.patch deleted file mode 100644 index 2040fcd..0000000 --- a/lustre/kernel_patches/patches/iod-stock-24-exports-2.4.19-suse.patch +++ /dev/null @@ -1,52 +0,0 @@ - fs/Makefile | 2 +- - fs/inode.c | 4 +++- - mm/page_alloc.c | 1 + - 3 files changed, 5 insertions(+), 2 deletions(-) - -Index: linux-2.4.19.SuSE/fs/inode.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/inode.c Sat Nov 15 18:02:13 2003 -+++ linux-2.4.19.SuSE/fs/inode.c Sat Nov 15 18:03:04 2003 -@@ -5,6 +5,7 @@ - */ - - #include -+#include - #include - #include - #include -@@ -67,7 +68,8 @@ - * NOTE! You also have to own the lock if you change - * the i_state of an inode while it is in use.. - */ --static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; -+spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; -+EXPORT_SYMBOL(inode_lock); - - /* - * Statistics gathering.. 
-Index: linux-2.4.19.SuSE/fs/Makefile -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/Makefile Mon Jan 27 05:08:56 2003 -+++ linux-2.4.19.SuSE/fs/Makefile Sat Nov 15 18:03:54 2003 -@@ -7,7 +7,7 @@ - - O_TARGET := fs.o - --export-objs := filesystems.o open.o dcache.o buffer.o -+export-objs := filesystems.o open.o dcache.o buffer.o inode.o - mod-subdirs := nls - - obj-y := open.o read_write.o devices.o file_table.o buffer.o \ -Index: linux-2.4.19.SuSE/mm/page_alloc.c -=================================================================== ---- linux-2.4.19.SuSE.orig/mm/page_alloc.c Mon Jan 27 05:08:55 2003 -+++ linux-2.4.19.SuSE/mm/page_alloc.c Sat Nov 15 18:03:04 2003 -@@ -32,6 +32,7 @@ - LIST_HEAD(inactive_list); - LIST_HEAD(active_list); - pg_data_t *pgdat_list; -+EXPORT_SYMBOL(pgdat_list); - - /* Used to look up the address of the struct zone encoded in page->zone */ - zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES]; diff --git a/lustre/kernel_patches/patches/iopen-2.4.20.patch b/lustre/kernel_patches/patches/iopen-2.4.20.patch index 4b869a6..d5a28e6 100644 --- a/lustre/kernel_patches/patches/iopen-2.4.20.patch +++ b/lustre/kernel_patches/patches/iopen-2.4.20.patch @@ -489,7 +489,7 @@ Index: lum/include/linux/ext3_fs.h @@ -324,4 +324,6 @@ #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ -+#define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ ++#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ diff --git a/lustre/kernel_patches/patches/iopen-2.4.21-chaos.patch b/lustre/kernel_patches/patches/iopen-2.4.21-chaos.patch index 94d8ab9..1510c9b 100644 --- 
a/lustre/kernel_patches/patches/iopen-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/iopen-2.4.21-chaos.patch @@ -490,8 +490,8 @@ Index: linux-ia64/include/linux/ext3_fs.h #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ -+#define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ ++#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch b/lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch deleted file mode 100644 index a7aa775..0000000 --- a/lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch +++ /dev/null @@ -1,497 +0,0 @@ - Documentation/filesystems/ext2.txt | 16 ++ - fs/ext3/Makefile | 2 - fs/ext3/inode.c | 4 - fs/ext3/iopen.c | 259 +++++++++++++++++++++++++++++++++++++ - fs/ext3/iopen.h | 13 + - fs/ext3/namei.c | 13 + - fs/ext3/super.c | 11 + - include/linux/ext3_fs.h | 2 - 8 files changed, 318 insertions(+), 2 deletions(-) - -Index: kernel-2.4.212l35/Documentation/filesystems/ext2.txt -=================================================================== ---- kernel-2.4.212l35.orig/Documentation/filesystems/ext2.txt 2001-07-11 15:44:45.000000000 -0700 -+++ kernel-2.4.212l35/Documentation/filesystems/ext2.txt 2004-05-06 19:48:32.000000000 -0700 -@@ -35,6 +35,22 @@ - - sb=n Use alternate superblock at this location. - -+iopen Makes an invisible pseudo-directory called -+ __iopen__ available in the root directory -+ of the filesystem. Allows open-by-inode- -+ number. 
i.e., inode 3145 can be accessed -+ via /mntpt/__iopen__/3145 -+ -+iopen_nopriv This option makes the iopen directory be -+ world-readable. This may be safer since it -+ allows daemons to run as an unprivileged user, -+ however it significantly changes the security -+ model of a Unix filesystem, since previously -+ all files under a mode 700 directory were not -+ generally avilable even if the -+ permissions on the file itself is -+ world-readable. -+ - grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. - - -Index: kernel-2.4.212l35/fs/ext3/Makefile -=================================================================== ---- kernel-2.4.212l35.orig/fs/ext3/Makefile 2004-05-06 19:46:22.000000000 -0700 -+++ kernel-2.4.212l35/fs/ext3/Makefile 2004-05-06 19:48:32.000000000 -0700 -@@ -11,7 +11,7 @@ - - export-objs := ext3-exports.o - --obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -+obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o - obj-m := $(O_TARGET) - -Index: kernel-2.4.212l35/fs/ext3/inode.c -=================================================================== ---- kernel-2.4.212l35.orig/fs/ext3/inode.c 2004-05-06 19:46:24.000000000 -0700 -+++ kernel-2.4.212l35/fs/ext3/inode.c 2004-05-06 19:48:32.000000000 -0700 -@@ -34,6 +34,7 @@ - #include - #include - #include -+#include "iopen.h" - - /* - * SEARCH_FROM_ZERO forces each block allocation to search from the start -@@ -2252,6 +2253,9 @@ - struct buffer_head *bh; - int block; - -+ if (ext3_iopen_get_inode(inode)) -+ return; -+ - if(ext3_get_inode_loc(inode, &iloc)) - goto bad_inode; - bh = iloc.bh; -Index: kernel-2.4.212l35/fs/ext3/iopen.c -=================================================================== ---- kernel-2.4.212l35.orig/fs/ext3/iopen.c 2003-03-27 11:16:05.000000000 -0800 -+++ kernel-2.4.212l35/fs/ext3/iopen.c 2004-05-06 19:48:41.000000000 -0700 -@@ -0,0 +1,285 @@ -+/* -+ * 
linux/fs/ext3/iopen.c -+ * -+ * Special support for open by inode number -+ * -+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). -+ * -+ * This file may be redistributed under the terms of the GNU General -+ * Public License. -+ * -+ * -+ * Invariants: -+ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias -+ * for an inode at one time. -+ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry -+ * aliases on an inode at the same time. -+ * -+ * If we have any connected dentry aliases for an inode, use one of those -+ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED -+ * dentry for this inode, which thereafter will be found by the dcache -+ * when looking up this inode number in __iopen__, so we don't return here -+ * until it is gone. -+ * -+ * If we get an inode via a regular name lookup, then we "rename" the -+ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures -+ * existing users of the disconnected dentry will continue to use the same -+ * dentry as the connected users, and there will never be both kinds of -+ * dentry aliases at one time. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "iopen.h" -+ -+#ifndef assert -+#define assert(test) J_ASSERT(test) -+#endif -+ -+#define IOPEN_NAME_LEN 32 -+ -+/* -+ * This implements looking up an inode by number. 
-+ */ -+static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry) -+{ -+ struct inode *inode; -+ unsigned long ino; -+ struct list_head *lp; -+ struct dentry *alternate; -+ char buf[IOPEN_NAME_LEN]; -+ -+ if (dentry->d_name.len >= IOPEN_NAME_LEN) -+ return ERR_PTR(-ENAMETOOLONG); -+ -+ memcpy(buf, dentry->d_name.name, dentry->d_name.len); -+ buf[dentry->d_name.len] = 0; -+ -+ if (strcmp(buf, ".") == 0) -+ ino = dir->i_ino; -+ else if (strcmp(buf, "..") == 0) -+ ino = EXT3_ROOT_INO; -+ else -+ ino = simple_strtoul(buf, 0, 0); -+ -+ if ((ino != EXT3_ROOT_INO && -+ //ino != EXT3_ACL_IDX_INO && -+ //ino != EXT3_ACL_DATA_INO && -+ ino < EXT3_FIRST_INO(dir->i_sb)) || -+ ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) -+ return ERR_PTR(-ENOENT); -+ -+ inode = iget(dir->i_sb, ino); -+ if (!inode) -+ return ERR_PTR(-EACCES); -+ if (is_bad_inode(inode)) { -+ iput(inode); -+ return ERR_PTR(-ENOENT); -+ } -+ -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(list_empty(&dentry->d_hash)); /* d_rehash */ -+ -+ /* preferrably return a connected dentry */ -+ spin_lock(&dcache_lock); -+ list_for_each(lp, &inode->i_dentry) { -+ alternate = list_entry(lp, struct dentry, d_alias); -+ assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED)); -+ } -+ -+ if (!list_empty(&inode->i_dentry)) { -+ alternate = list_entry(inode->i_dentry.next, -+ struct dentry, d_alias); -+ dget_locked(alternate); -+ alternate->d_vfs_flags |= DCACHE_REFERENCED; -+ iput(inode); -+ spin_unlock(&dcache_lock); -+ return alternate; -+ } -+ dentry->d_flags |= DCACHE_NFSD_DISCONNECTED; -+ -+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ -+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ -+ dentry->d_inode = inode; -+ -+ __d_rehash(dentry, 0); /* d_rehash */ -+ spin_unlock(&dcache_lock); -+ -+ return NULL; -+} -+ -+#define do_switch(x,y) do { \ -+ __typeof__ (x) __tmp = x; \ -+ x = y; y = __tmp; } while (0) -+ -+static 
inline void switch_names(struct dentry *dentry, struct dentry *target) -+{ -+ const unsigned char *old_name, *new_name; -+ -+ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); -+ old_name = target->d_name.name; -+ new_name = dentry->d_name.name; -+ if (old_name == target->d_iname) -+ old_name = dentry->d_iname; -+ if (new_name == dentry->d_iname) -+ new_name = target->d_iname; -+ target->d_name.name = new_name; -+ dentry->d_name.name = old_name; -+} -+ -+/* This function is spliced into ext3_lookup and does the move of a -+ * disconnected dentry (if it exists) to a connected dentry. -+ */ -+struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, -+ int rehash) -+{ -+ struct dentry *tmp, *goal = NULL; -+ struct list_head *lp; -+ -+ /* verify this dentry is really new */ -+ assert(dentry->d_inode == NULL); -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ if (rehash) -+ assert(list_empty(&dentry->d_hash)); /* d_rehash */ -+ assert(list_empty(&dentry->d_subdirs)); -+ -+ spin_lock(&dcache_lock); -+ if (!inode) -+ goto do_rehash; -+ -+ if (!test_opt(inode->i_sb, IOPEN)) -+ goto do_instantiate; -+ -+ /* preferrably return a connected dentry */ -+ list_for_each(lp, &inode->i_dentry) { -+ tmp = list_entry(lp, struct dentry, d_alias); -+ if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) { -+ assert(tmp->d_alias.next == &inode->i_dentry); -+ assert(tmp->d_alias.prev == &inode->i_dentry); -+ goal = tmp; -+ dget_locked(goal); -+ break; -+ } -+ } -+ -+ if (!goal) -+ goto do_instantiate; -+ -+ /* Move the goal to the de hash queue - like d_move() */ -+ goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED; -+ list_del_init(&goal->d_hash); -+ -+ list_del(&goal->d_child); -+ list_del(&dentry->d_child); -+ -+ /* Switch the parents and the names.. 
*/ -+ switch_names(goal, dentry); -+ do_switch(goal->d_parent, dentry->d_parent); -+ do_switch(goal->d_name.len, dentry->d_name.len); -+ do_switch(goal->d_name.hash, dentry->d_name.hash); -+ -+ /* And add them back to the (new) parent lists */ -+ list_add(&goal->d_child, &goal->d_parent->d_subdirs); -+ list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); -+ __d_rehash(goal, 0); -+ spin_unlock(&dcache_lock); -+ iput(inode); -+ -+ return goal; -+ -+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ -+do_instantiate: -+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ -+ dentry->d_inode = inode; -+do_rehash: -+ if (rehash) -+ __d_rehash(dentry, 0); /* d_rehash */ -+ spin_unlock(&dcache_lock); -+ -+ return NULL; -+} -+ -+/* -+ * These are the special structures for the iopen pseudo directory. -+ */ -+ -+static struct inode_operations iopen_inode_operations = { -+ lookup: iopen_lookup, /* BKL held */ -+}; -+ -+static struct file_operations iopen_file_operations = { -+ read: generic_read_dir, -+}; -+ -+static int match_dentry(struct dentry *dentry, const char *name) -+{ -+ int len; -+ -+ len = strlen(name); -+ if (dentry->d_name.len != len) -+ return 0; -+ if (strncmp(dentry->d_name.name, name, len)) -+ return 0; -+ return 1; -+} -+ -+/* -+ * This function is spliced into ext3_lookup and returns 1 the file -+ * name is __iopen__ and dentry has been filled in appropriately. -+ */ -+int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) -+{ -+ struct inode *inode; -+ -+ if (dir->i_ino != EXT3_ROOT_INO || -+ !test_opt(dir->i_sb, IOPEN) || -+ !match_dentry(dentry, "__iopen__")) -+ return 0; -+ -+ inode = iget(dir->i_sb, EXT3_BAD_INO); -+ -+ if (!inode) -+ return 0; -+ d_add(dentry, inode); -+ return 1; -+} -+ -+/* -+ * This function is spliced into read_inode; it returns 1 if inode -+ * number is the one for /__iopen__, in which case the inode is filled -+ * in appropriately. Otherwise, this fuction returns 0. 
-+ */ -+int ext3_iopen_get_inode(struct inode *inode) -+{ -+ if (inode->i_ino != EXT3_BAD_INO) -+ return 0; -+ -+ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; -+ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) -+ inode->i_mode |= 0777; -+ inode->i_uid = 0; -+ inode->i_gid = 0; -+ inode->i_nlink = 1; -+ inode->i_size = 4096; -+ inode->i_atime = CURRENT_TIME; -+ inode->i_ctime = CURRENT_TIME; -+ inode->i_mtime = CURRENT_TIME; -+ inode->u.ext3_i.i_dtime = 0; -+ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size -+ * (for stat), not the fs block -+ * size */ -+ inode->i_blocks = 0; -+ inode->i_version = 1; -+ inode->i_generation = 0; -+ -+ inode->i_op = &iopen_inode_operations; -+ inode->i_fop = &iopen_file_operations; -+ inode->i_mapping->a_ops = 0; -+ -+ return 1; -+} -Index: kernel-2.4.212l35/fs/ext3/iopen.h -=================================================================== ---- kernel-2.4.212l35.orig/fs/ext3/iopen.h 2003-03-27 11:16:05.000000000 -0800 -+++ kernel-2.4.212l35/fs/ext3/iopen.h 2004-05-06 19:48:41.000000000 -0700 -@@ -0,0 +1,15 @@ -+/* -+ * iopen.h -+ * -+ * Special support for opening files by inode number. -+ * -+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). -+ * -+ * This file may be redistributed under the terms of the GNU General -+ * Public License. -+ */ -+ -+extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); -+extern int ext3_iopen_get_inode(struct inode *inode); -+extern struct dentry *iopen_connect_dentry(struct dentry *dentry, -+ struct inode *inode, int rehash); -Index: kernel-2.4.212l35/fs/ext3/namei.c -=================================================================== ---- kernel-2.4.212l35.orig/fs/ext3/namei.c 2004-05-06 19:46:23.000000000 -0700 -+++ kernel-2.4.212l35/fs/ext3/namei.c 2004-05-06 19:51:48.000000000 -0700 -@@ -36,7 +36,7 @@ - #include - #include - #include -- -+#include "iopen.h" - - /* - * define how far ahead to read directories while searching them. 
-@@ -932,6 +932,9 @@ - if (dentry->d_name.len > EXT3_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - -+ if (ext3_check_for_iopen(dir, dentry)) -+ return NULL; -+ - bh = ext3_find_entry(dentry, &de); - inode = NULL; - if (bh) { -@@ -943,8 +946,8 @@ - return ERR_PTR(-EACCES); - } - } -- d_add(dentry, inode); -- return NULL; -+ -+ return iopen_connect_dentry(dentry, inode, 1); - } - - #define S_SHIFT 12 -@@ -1936,10 +1940,6 @@ - inode->i_nlink); - inode->i_version = ++event; - inode->i_nlink = 0; -- /* There's no need to set i_disksize: the fact that i_nlink is -- * zero will ensure that the right thing happens during any -- * recovery. */ -- inode->i_size = 0; - ext3_orphan_add(handle, inode); - dir->i_nlink--; - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; -@@ -2058,6 +2058,23 @@ - return err; - } - -+/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ -+static int ext3_add_link(handle_t *handle, struct dentry *dentry, -+ struct inode *inode) -+{ -+ int err = ext3_add_entry(handle, dentry, inode); -+ if (!err) { -+ err = ext3_mark_inode_dirty(handle, inode); -+ if (err == 0) { -+ dput(iopen_connect_dentry(dentry, inode, 0)); -+ return 0; -+ } -+ } -+ ext3_dec_count(handle, inode); -+ iput(inode); -+ return err; -+} -+ - static int ext3_link (struct dentry * old_dentry, - struct inode * dir, struct dentry *dentry) - { -@@ -2085,7 +2102,8 @@ - ext3_inc_count(handle, inode); - atomic_inc(&inode->i_count); - -- err = ext3_add_nondir(handle, dentry, inode); -+ err = ext3_add_link(handle, dentry, inode); -+ ext3_orphan_del(handle, inode); - ext3_journal_stop(handle, dir); - return err; - } -Index: kernel-2.4.212l35/fs/ext3/super.c -=================================================================== ---- kernel-2.4.212l35.orig/fs/ext3/super.c 2004-05-06 19:46:23.000000000 -0700 -+++ kernel-2.4.212l35/fs/ext3/super.c 2004-05-06 19:48:32.000000000 -0700 -@@ -869,6 +869,18 @@ - || !strcmp (this_char, "quota") - || !strcmp (this_char, "usrquota")) 
- /* Don't do anything ;-) */ ; -+ else if (!strcmp (this_char, "iopen")) { -+ set_opt (sbi->s_mount_opt, IOPEN); -+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ } -+ else if (!strcmp (this_char, "noiopen")) { -+ clear_opt (sbi->s_mount_opt, IOPEN); -+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ } -+ else if (!strcmp (this_char, "iopen_nopriv")) { -+ set_opt (sbi->s_mount_opt, IOPEN); -+ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ } - else if (!strcmp (this_char, "journal")) { - /* @@@ FIXME */ - /* Eventually we will want to be able to create -Index: kernel-2.4.212l35/include/linux/ext3_fs.h -=================================================================== ---- kernel-2.4.212l35.orig/include/linux/ext3_fs.h 2004-05-06 19:46:24.000000000 -0700 -+++ kernel-2.4.212l35/include/linux/ext3_fs.h 2004-05-06 19:48:32.000000000 -0700 -@@ -324,6 +324,8 @@ - #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ - #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ - #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ -+#define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/iopen-2.6-rhel4.patch b/lustre/kernel_patches/patches/iopen-2.6-rhel4.patch index cbb024a..98dbca4 100644 --- a/lustre/kernel_patches/patches/iopen-2.6-rhel4.patch +++ b/lustre/kernel_patches/patches/iopen-2.6-rhel4.patch @@ -7,7 +7,7 @@ Index: linux-stage/fs/ext3/Makefile obj-$(CONFIG_EXT3_FS) += ext3.o -ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -+ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\ ++ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o 
xattr_trusted.o @@ -36,7 +36,7 @@ Index: linux-stage/fs/ext3/iopen.c =================================================================== --- linux-stage.orig/fs/ext3/iopen.c 2005-02-25 14:41:01.017787968 +0200 +++ linux-stage/fs/ext3/iopen.c 2005-02-25 14:41:01.045783712 +0200 -@@ -0,0 +1,277 @@ +@@ -0,0 +1,278 @@ +/* + * linux/fs/ext3/iopen.c + * @@ -124,7 +124,7 @@ Index: linux-stage/fs/ext3/iopen.c + } + + assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(d_unhashed(dentry)); /* d_rehash */ ++ assert(d_unhashed(dentry)); /* d_rehash */ + + /* preferrably return a connected dentry */ + spin_lock(&dcache_lock); @@ -188,7 +188,7 @@ Index: linux-stage/fs/ext3/iopen.c + assert(dentry->d_inode == NULL); + assert(list_empty(&dentry->d_alias)); /* d_instantiate */ + if (rehash) -+ assert(d_unhashed(dentry)); /* d_rehash */ ++ assert(d_unhashed(dentry)); /* d_rehash */ + assert(list_empty(&dentry->d_subdirs)); + + spin_lock(&dcache_lock); @@ -214,8 +214,9 @@ Index: linux-stage/fs/ext3/iopen.c + goto do_instantiate; + + /* Move the goal to the de hash queue */ -+ goal->d_flags &= ~ DCACHE_DISCONNECTED; ++ goal->d_flags &= ~DCACHE_DISCONNECTED; + security_d_instantiate(goal, inode); ++ __d_drop(dentry); + __d_rehash(dentry, 0); + __d_move(goal, dentry); + spin_unlock(&dcache_lock); @@ -410,7 +411,7 @@ Index: linux-stage/fs/ext3/namei.c - err = ext3_add_nondir(handle, dentry, inode); + err = ext3_add_link(handle, dentry, inode); -+ ext3_orphan_del(handle,inode); ++ ext3_orphan_del(handle, inode); ext3_journal_stop(handle); if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; @@ -419,20 +420,20 @@ Index: linux-stage/fs/ext3/super.c --- linux-stage.orig/fs/ext3/super.c 2005-02-25 14:37:30.987717392 +0200 +++ linux-stage/fs/ext3/super.c 2005-02-25 14:44:50.495901992 +0200 @@ -586,6 +586,7 @@ - Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, -+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, }; + static match_table_t tokens = { @@ -633,6 +634,9 @@ {Opt_ignore, "noquota"}, {Opt_ignore, "quota"}, {Opt_ignore, "usrquota"}, -+ {Opt_iopen, "iopen"}, -+ {Opt_noiopen, "noiopen"}, -+ {Opt_iopen_nopriv, "iopen_nopriv"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, @@ -463,8 +464,8 @@ Index: linux-stage/include/linux/ext3_fs.h #define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT3_MOUNT_BARRIER 0x10000 /* Use block barriers */ #define EXT3_MOUNT_RESERVATION 0x20000 /* Preallocation */ -+#define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ ++#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/iopen-2.6-suse.patch b/lustre/kernel_patches/patches/iopen-2.6-suse.patch index 9aba4f6..1c5e900 100644 --- a/lustre/kernel_patches/patches/iopen-2.6-suse.patch +++ b/lustre/kernel_patches/patches/iopen-2.6-suse.patch @@ -1,15 +1,7 @@ - fs/ext3/inode.c | 3 - fs/ext3/iopen.c | 239 +++++++++++++++++++++++++++++++++++++ - fs/ext3/iopen.h | 15 ++ - fs/ext3/namei.c | 13 ++ - fs/ext3/super.c | 17 ++ - include/linux/ext3_fs.h | 2 - 7 files changed, 304 insertions(+), 1 deletion(-) - -Index: linux-2.6.5-sles9/fs/ext3/Makefile +Index: linux-stage/fs/ext3/Makefile =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2004-04-04 07:36:18.000000000 +0400 -+++ linux-2.6.5-sles9/fs/ext3/Makefile 2004-11-09 
02:18:27.604914376 +0300 +--- linux-stage.orig/fs/ext3/Makefile 2005-02-25 14:31:53.151076368 +0200 ++++ linux-stage/fs/ext3/Makefile 2005-02-25 14:41:51.259150120 +0200 @@ -4,7 +4,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o @@ -19,10 +11,10 @@ Index: linux-2.6.5-sles9/fs/ext3/Makefile ioctl.o namei.o super.o symlink.o hash.o ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o -Index: linux-2.6.5-sles9/fs/ext3/inode.c +Index: linux-stage/fs/ext3/inode.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2004-11-09 02:15:44.739673656 +0300 -+++ linux-2.6.5-sles9/fs/ext3/inode.c 2004-11-09 02:18:27.608913768 +0300 +--- linux-stage.orig/fs/ext3/inode.c 2005-02-25 14:37:30.983718000 +0200 ++++ linux-stage/fs/ext3/inode.c 2005-02-25 14:47:42.069818792 +0200 @@ -37,6 +37,7 @@ #include #include @@ -31,7 +23,7 @@ Index: linux-2.6.5-sles9/fs/ext3/inode.c #include "acl.h" /* -@@ -2402,6 +2403,9 @@ +@@ -2408,6 +2409,9 @@ #endif ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; @@ -41,11 +33,11 @@ Index: linux-2.6.5-sles9/fs/ext3/inode.c if (ext3_get_inode_loc(inode, &iloc, 0)) goto bad_inode; bh = iloc.bh; -Index: linux-2.6.5-sles9/fs/ext3/iopen.c +Index: linux-stage/fs/ext3/iopen.c =================================================================== --- linux-2.6.5-sles9.orig/fs/ext3/iopen.c 2003-01-30 13:24:37.000000000 +0300 +++ linux-2.6.5-sles9/fs/ext3/iopen.c 2004-11-09 02:18:27.611913312 +0300 -@@ -0,0 +1,275 @@ +@@ -0,0 +1,278 @@ +/* + * linux/fs/ext3/iopen.c + * @@ -133,7 +125,7 @@ Index: linux-2.6.5-sles9/fs/ext3/iopen.c + } + + assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(d_unhashed(dentry)); /* d_rehash */ ++ assert(d_unhashed(dentry)); /* d_rehash */ + + /* preferrably return a connected dentry */ + spin_lock(&dcache_lock); @@ -146,7 +138,9 @@ Index: linux-2.6.5-sles9/fs/ext3/iopen.c + alternate = list_entry(inode->i_dentry.next, + struct dentry, d_alias); + 
dget_locked(alternate); ++ spin_lock(&alternate->d_lock); + alternate->d_vfs_flags |= DCACHE_REFERENCED; ++ spin_unlock(&alternate->d_lock); + iput(inode); + spin_unlock(&dcache_lock); + return alternate; @@ -221,8 +215,9 @@ Index: linux-2.6.5-sles9/fs/ext3/iopen.c + goto do_instantiate; + + /* Move the goal to the de hash queue */ -+ goal->d_flags &= ~ DCACHE_DISCONNECTED; ++ goal->d_flags &= ~DCACHE_DISCONNECTED; + security_d_instantiate(goal, inode); ++ __d_drop(dentry); + __d_rehash(dentry, 0); + __d_move(goal, dentry); + spin_unlock(&dcache_lock); @@ -321,10 +316,10 @@ Index: linux-2.6.5-sles9/fs/ext3/iopen.c + + return 1; +} -Index: linux-2.6.5-sles9/fs/ext3/iopen.h +Index: linux-stage/fs/ext3/iopen.h =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/iopen.h 2003-01-30 13:24:37.000000000 +0300 -+++ linux-2.6.5-sles9/fs/ext3/iopen.h 2004-11-09 02:18:27.613913008 +0300 +--- linux-stage.orig/fs/ext3/iopen.h 2005-02-25 14:41:01.017787968 +0200 ++++ linux-stage/fs/ext3/iopen.h 2005-02-25 14:41:01.045783712 +0200 @@ -0,0 +1,15 @@ +/* + * iopen.h @@ -341,10 +336,10 @@ Index: linux-2.6.5-sles9/fs/ext3/iopen.h +extern int ext3_iopen_get_inode(struct inode *inode); +extern struct dentry *iopen_connect_dentry(struct dentry *dentry, + struct inode *inode, int rehash); -Index: linux-2.6.5-sles9/fs/ext3/namei.c +Index: linux-stage/fs/ext3/namei.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/namei.c 2004-11-09 02:15:44.614692656 +0300 -+++ linux-2.6.5-sles9/fs/ext3/namei.c 2004-11-09 02:18:27.616912552 +0300 +--- linux-stage.orig/fs/ext3/namei.c 2005-02-25 14:37:28.975023368 +0200 ++++ linux-stage/fs/ext3/namei.c 2005-02-25 14:46:43.090784968 +0200 @@ -37,6 +37,7 @@ #include #include @@ -353,7 +348,7 @@ Index: linux-2.6.5-sles9/fs/ext3/namei.c #include "acl.h" /* -@@ -979,6 +980,9 @@ +@@ -980,6 +981,9 @@ if (dentry->d_name.len > EXT3_NAME_LEN) return 
ERR_PTR(-ENAMETOOLONG); @@ -363,7 +358,7 @@ Index: linux-2.6.5-sles9/fs/ext3/namei.c bh = ext3_find_entry(dentry, &de); inode = NULL; if (bh) { -@@ -989,10 +993,8 @@ +@@ -990,10 +994,8 @@ if (!inode) return ERR_PTR(-EACCES); } @@ -376,7 +371,7 @@ Index: linux-2.6.5-sles9/fs/ext3/namei.c } -@@ -2029,10 +2031,6 @@ +@@ -2037,10 +2039,6 @@ inode->i_nlink); inode->i_version++; inode->i_nlink = 0; @@ -387,7 +382,7 @@ Index: linux-2.6.5-sles9/fs/ext3/namei.c ext3_orphan_add(handle, inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); -@@ -2152,6 +2150,23 @@ +@@ -2163,6 +2161,23 @@ return err; } @@ -411,40 +406,39 @@ Index: linux-2.6.5-sles9/fs/ext3/namei.c static int ext3_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { -@@ -2175,7 +2190,8 @@ +@@ -2186,7 +2201,8 @@ ext3_inc_count(handle, inode); atomic_inc(&inode->i_count); - err = ext3_add_nondir(handle, dentry, inode); + err = ext3_add_link(handle, dentry, inode); -+ ext3_orphan_del(handle,inode); ++ ext3_orphan_del(handle, inode); ext3_journal_stop(handle); if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; -Index: linux-2.6.5-sles9/fs/ext3/super.c +Index: linux-stage/fs/ext3/super.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/super.c 2004-11-09 02:15:44.743673048 +0300 -+++ linux-2.6.5-sles9/fs/ext3/super.c 2004-11-09 02:18:27.620911944 +0300 -@@ -534,7 +534,7 @@ - Opt_reservation, Opt_noreservation, Opt_noload, - Opt_commit, Opt_journal_update, Opt_journal_inum, +--- linux-stage.orig/fs/ext3/super.c 2005-02-25 14:37:30.987717392 +0200 ++++ linux-stage/fs/ext3/super.c 2005-02-25 14:44:50.495901992 +0200 +@@ -586,6 +586,7 @@ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, -- Opt_ignore, Opt_barrier, -+ Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_ignore, Opt_barrier, Opt_err, ++ Opt_iopen, Opt_noiopen, 
Opt_iopen_nopriv, }; -@@ -577,6 +577,9 @@ + static match_table_t tokens = { +@@ -633,6 +634,9 @@ + {Opt_ignore, "noquota"}, {Opt_ignore, "quota"}, {Opt_ignore, "usrquota"}, - {Opt_barrier, "barrier=%u"}, + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_barrier, "barrier=%u"}, {Opt_err, NULL} }; - -@@ -778,6 +781,18 @@ +@@ -914,6 +918,18 @@ else clear_opt(sbi->s_mount_opt, BARRIER); break; @@ -463,16 +457,16 @@ Index: linux-2.6.5-sles9/fs/ext3/super.c case Opt_ignore: break; default: -Index: linux-2.6.5-sles9/include/linux/ext3_fs.h +Index: linux-stage/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:15:44.616692352 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:18:27.622911640 +0300 -@@ -329,6 +329,8 @@ +--- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 14:37:28.977023064 +0200 ++++ linux-stage/include/linux/ext3_fs.h 2005-02-25 14:49:00.569884968 +0200 +@@ -355,6 +355,8 @@ #define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ -+#define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ ++#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/iopen-2.4.19-bgl.patch b/lustre/kernel_patches/patches/iopen-2.6.12.patch similarity index 61% rename from lustre/kernel_patches/patches/iopen-2.4.19-bgl.patch rename to lustre/kernel_patches/patches/iopen-2.6.12.patch index 511cf37..8d456ac 100644 --- a/lustre/kernel_patches/patches/iopen-2.4.19-bgl.patch +++ 
b/lustre/kernel_patches/patches/iopen-2.6.12.patch @@ -1,80 +1,42 @@ - Documentation/filesystems/ext2.txt | 16 ++ - fs/ext3/Makefile | 2 - fs/ext3/inode.c | 4 - fs/ext3/iopen.c | 259 +++++++++++++++++++++++++++++++++++++ - fs/ext3/iopen.h | 13 + - fs/ext3/namei.c | 13 + - fs/ext3/super.c | 11 + - include/linux/ext3_fs.h | 2 - 8 files changed, 318 insertions(+), 2 deletions(-) - -Index: linux-2.4.19/Documentation/filesystems/ext2.txt +Index: linux-2.6.12-rc6/fs/ext3/Makefile =================================================================== ---- linux-2.4.19.orig/Documentation/filesystems/ext2.txt 2001-07-11 18:44:45.000000000 -0400 -+++ linux-2.4.19/Documentation/filesystems/ext2.txt 2004-04-23 22:37:48.000000000 -0400 -@@ -35,6 +35,22 @@ +--- linux-2.6.12-rc6.orig/fs/ext3/Makefile 2005-06-14 16:00:45.206720992 +0200 ++++ linux-2.6.12-rc6/fs/ext3/Makefile 2005-06-14 16:14:33.595382720 +0200 +@@ -4,7 +4,7 @@ - sb=n Use alternate superblock at this location. + obj-$(CONFIG_EXT3_FS) += ext3.o -+iopen Makes an invisible pseudo-directory called -+ __iopen__ available in the root directory -+ of the filesystem. Allows open-by-inode- -+ number. i.e., inode 3145 can be accessed -+ via /mntpt/__iopen__/3145 -+ -+iopen_nopriv This option makes the iopen directory be -+ world-readable. This may be safer since it -+ allows daemons to run as an unprivileged user, -+ however it significantly changes the security -+ model of a Unix filesystem, since previously -+ all files under a mode 700 directory were not -+ generally avilable even if the -+ permissions on the file itself is -+ world-readable. -+ - grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. 
+-ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o - -Index: linux.mcp2/fs/ext3/Makefile -=================================================================== ---- linux.mcp2.orig/fs/ext3/Makefile 2004-05-17 15:20:52.000000000 -0700 -+++ linux.mcp2/fs/ext3/Makefile 2004-05-17 15:21:55.000000000 -0700 -@@ -11,7 +11,7 @@ - - export-objs := ext3-exports.o - --obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -+obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o - obj-m := $(O_TARGET) - -Index: linux.mcp2/fs/ext3/inode.c + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +Index: linux-2.6.12-rc6/fs/ext3/inode.c =================================================================== ---- linux.mcp2.orig/fs/ext3/inode.c 2004-05-17 15:20:59.000000000 -0700 -+++ linux.mcp2/fs/ext3/inode.c 2004-05-17 15:21:55.000000000 -0700 -@@ -31,6 +31,7 @@ - #include - #include - #include +--- linux-2.6.12-rc6.orig/fs/ext3/inode.c 2005-06-14 16:01:16.272150299 +0200 ++++ linux-2.6.12-rc6/fs/ext3/inode.c 2005-06-14 16:24:55.686195412 +0200 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" +#include "iopen.h" + #include "acl.h" - /* - * SEARCH_FROM_ZERO forces each block allocation to search from the start -@@ -2125,6 +2126,9 @@ - struct buffer_head *bh; - int block; - -+ if (ext3_iopen_get_inode(inode)) -+ return; -+ - if(ext3_get_inode_loc(inode, &iloc)) + static int ext3_writepage_trans_blocks(struct inode *inode); +@@ -2437,6 +2438,8 @@ + ei->i_default_acl = EXT3_ACL_NOT_CACHED; + #endif + ei->i_block_alloc_info = NULL; ++ if (ext3_iopen_get_inode(inode)) ++ return; + + if (__ext3_get_inode_loc(inode, &iloc, 0)) goto bad_inode; - bh = iloc.bh; -Index: linux.mcp2/fs/ext3/iopen.c +Index: 
linux-2.6.12-rc6/fs/ext3/iopen.c =================================================================== ---- linux.mcp2.orig/fs/ext3/iopen.c 2002-04-11 07:25:15.000000000 -0700 -+++ linux.mcp2/fs/ext3/iopen.c 2004-05-17 15:21:55.000000000 -0700 -@@ -0,0 +1,282 @@ +--- linux-2.6.12-rc6.orig/fs/ext3/iopen.c 2005-06-14 16:14:33.530929595 +0200 ++++ linux-2.6.12-rc6/fs/ext3/iopen.c 2005-06-14 16:14:33.626632719 +0200 +@@ -0,0 +1,278 @@ +/* + * linux/fs/ext3/iopen.c + * @@ -107,11 +69,12 @@ Index: linux.mcp2/fs/ext3/iopen.c + +#include +#include -+#include +#include +#include +#include +#include ++#include ++#include +#include "iopen.h" + +#ifndef assert @@ -123,7 +86,8 @@ Index: linux.mcp2/fs/ext3/iopen.c +/* + * This implements looking up an inode by number. + */ -+static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry) ++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, ++ struct nameidata *nd) +{ + struct inode *inode; + unsigned long ino; @@ -148,7 +112,7 @@ Index: linux.mcp2/fs/ext3/iopen.c + //ino != EXT3_ACL_IDX_INO && + //ino != EXT3_ACL_DATA_INO && + ino < EXT3_FIRST_INO(dir->i_sb)) || -+ ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) ++ ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) + return ERR_PTR(-ENOENT); + + inode = iget(dir->i_sb, ino); @@ -160,31 +124,33 @@ Index: linux.mcp2/fs/ext3/iopen.c + } + + assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(list_empty(&dentry->d_hash)); /* d_rehash */ ++ assert(d_unhashed(dentry)); /* d_rehash */ + + /* preferrably return a connected dentry */ + spin_lock(&dcache_lock); + list_for_each(lp, &inode->i_dentry) { + alternate = list_entry(lp, struct dentry, d_alias); -+ assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED)); ++ assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); + } + + if (!list_empty(&inode->i_dentry)) { + alternate = list_entry(inode->i_dentry.next, + struct dentry, d_alias); + dget_locked(alternate); 
-+ alternate->d_vfs_flags |= DCACHE_REFERENCED; ++ spin_lock(&alternate->d_lock); ++ alternate->d_flags |= DCACHE_REFERENCED; ++ spin_unlock(&alternate->d_lock); + iput(inode); + spin_unlock(&dcache_lock); + return alternate; + } -+ dentry->d_flags |= DCACHE_NFSD_DISCONNECTED; ++ dentry->d_flags |= DCACHE_DISCONNECTED; + + /* d_add(), but don't drop dcache_lock before adding dentry to inode */ + list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ + dentry->d_inode = inode; + -+ __d_rehash(dentry, 0); /* d_rehash */ ++ d_rehash_cond(dentry, 0); /* d_rehash */ + spin_unlock(&dcache_lock); + + return NULL; @@ -198,7 +164,7 @@ Index: linux.mcp2/fs/ext3/iopen.c +{ + const unsigned char *old_name, *new_name; + -+ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); ++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN_MIN); + old_name = target->d_name.name; + new_name = dentry->d_name.name; + if (old_name == target->d_iname) @@ -222,17 +188,20 @@ Index: linux.mcp2/fs/ext3/iopen.c + assert(dentry->d_inode == NULL); + assert(list_empty(&dentry->d_alias)); /* d_instantiate */ + if (rehash) -+ assert(list_empty(&dentry->d_hash)); /* d_rehash */ ++ assert(d_unhashed(dentry)); /* d_rehash */ + assert(list_empty(&dentry->d_subdirs)); + + spin_lock(&dcache_lock); + if (!inode) + goto do_rehash; + ++ if (!test_opt(inode->i_sb, IOPEN)) ++ goto do_instantiate; ++ + /* preferrably return a connected dentry */ + list_for_each(lp, &inode->i_dentry) { + tmp = list_entry(lp, struct dentry, d_alias); -+ if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) { ++ if (tmp->d_flags & DCACHE_DISCONNECTED) { + assert(tmp->d_alias.next == &inode->i_dentry); + assert(tmp->d_alias.prev == &inode->i_dentry); + goal = tmp; @@ -244,23 +213,12 @@ Index: linux.mcp2/fs/ext3/iopen.c + if (!goal) + goto do_instantiate; + -+ /* Move the goal to the de hash queue - like d_move() */ -+ goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED; -+ list_del_init(&goal->d_hash); -+ -+ 
list_del(&goal->d_child); -+ list_del(&dentry->d_child); -+ -+ /* Switch the parents and the names.. */ -+ switch_names(goal, dentry); -+ do_switch(goal->d_parent, dentry->d_parent); -+ do_switch(goal->d_name.len, dentry->d_name.len); -+ do_switch(goal->d_name.hash, dentry->d_name.hash); -+ -+ /* And add them back to the (new) parent lists */ -+ list_add(&goal->d_child, &goal->d_parent->d_subdirs); -+ list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); -+ __d_rehash(goal, 0); ++ /* Move the goal to the de hash queue */ ++ goal->d_flags &= ~DCACHE_DISCONNECTED; ++ security_d_instantiate(goal, inode); ++ __d_drop(dentry); ++ d_rehash_cond(dentry, 0); ++ __d_move(goal, dentry); + spin_unlock(&dcache_lock); + iput(inode); + @@ -272,7 +230,7 @@ Index: linux.mcp2/fs/ext3/iopen.c + dentry->d_inode = inode; +do_rehash: + if (rehash) -+ __d_rehash(dentry, 0); /* d_rehash */ ++ d_rehash_cond(dentry, 0); /* d_rehash */ + spin_unlock(&dcache_lock); + + return NULL; @@ -343,7 +301,7 @@ Index: linux.mcp2/fs/ext3/iopen.c + inode->i_atime = CURRENT_TIME; + inode->i_ctime = CURRENT_TIME; + inode->i_mtime = CURRENT_TIME; -+ inode->u.ext3_i.i_dtime = 0; ++ EXT3_I(inode)->i_dtime = 0; + inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size + * (for stat), not the fs block + * size */ @@ -357,10 +315,10 @@ Index: linux.mcp2/fs/ext3/iopen.c + + return 1; +} -Index: linux.mcp2/fs/ext3/iopen.h +Index: linux-2.6.12-rc6/fs/ext3/iopen.h =================================================================== ---- linux.mcp2.orig/fs/ext3/iopen.h 2002-04-11 07:25:15.000000000 -0700 -+++ linux.mcp2/fs/ext3/iopen.h 2004-05-17 15:21:55.000000000 -0700 +--- linux-2.6.12-rc6.orig/fs/ext3/iopen.h 2005-06-14 16:14:33.534835845 +0200 ++++ linux-2.6.12-rc6/fs/ext3/iopen.h 2005-06-14 16:14:33.633468657 +0200 @@ -0,0 +1,15 @@ +/* + * iopen.h @@ -377,20 +335,19 @@ Index: linux.mcp2/fs/ext3/iopen.h +extern int ext3_iopen_get_inode(struct inode *inode); +extern struct dentry 
*iopen_connect_dentry(struct dentry *dentry, + struct inode *inode, int rehash); -Index: linux.mcp2/fs/ext3/namei.c +Index: linux-2.6.12-rc6/fs/ext3/namei.c =================================================================== ---- linux.mcp2.orig/fs/ext3/namei.c 2004-05-17 15:20:59.000000000 -0700 -+++ linux.mcp2/fs/ext3/namei.c 2004-05-17 15:21:55.000000000 -0700 -@@ -35,7 +35,7 @@ - #include - #include - #include -- +--- linux-2.6.12-rc6.orig/fs/ext3/namei.c 2005-06-14 16:01:14.701837819 +0200 ++++ linux-2.6.12-rc6/fs/ext3/namei.c 2005-06-14 16:14:33.644210844 +0200 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" +#include "iopen.h" + #include "acl.h" /* - * define how far ahead to read directories while searching them. -@@ -931,6 +931,9 @@ +@@ -985,6 +986,9 @@ if (dentry->d_name.len > EXT3_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -400,29 +357,31 @@ Index: linux.mcp2/fs/ext3/namei.c bh = ext3_find_entry(dentry, &de); inode = NULL; if (bh) { -@@ -942,8 +945,8 @@ +@@ -995,10 +999,8 @@ + if (!inode) return ERR_PTR(-EACCES); - } } +- if (inode) +- return d_splice_alias(inode, dentry); - d_add(dentry, inode); - return NULL; + + return iopen_connect_dentry(dentry, inode, 1); } - #define S_SHIFT 12 -@@ -1932,10 +1935,6 @@ + +@@ -2042,10 +2044,6 @@ inode->i_nlink); - inode->i_version = ++event; + inode->i_version++; inode->i_nlink = 0; - /* There's no need to set i_disksize: the fact that i_nlink is - * zero will ensure that the right thing happens during any - * recovery. 
*/ - inode->i_size = 0; ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, inode); - dir->i_nlink--; -@@ -2054,6 +2053,23 @@ +@@ -2168,6 +2166,23 @@ return err; } @@ -446,49 +405,67 @@ Index: linux.mcp2/fs/ext3/namei.c static int ext3_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { -@@ -2081,7 +2097,8 @@ +@@ -2191,7 +2206,8 @@ ext3_inc_count(handle, inode); atomic_inc(&inode->i_count); - err = ext3_add_nondir(handle, dentry, inode); + err = ext3_add_link(handle, dentry, inode); + ext3_orphan_del(handle, inode); - ext3_journal_stop(handle, dir); - return err; - } -Index: linux.mcp2/fs/ext3/super.c + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; +Index: linux-2.6.12-rc6/fs/ext3/super.c =================================================================== ---- linux.mcp2.orig/fs/ext3/super.c 2004-05-17 15:20:59.000000000 -0700 -+++ linux.mcp2/fs/ext3/super.c 2004-05-17 15:21:55.000000000 -0700 -@@ -836,6 +836,18 @@ - || !strcmp (this_char, "quota") - || !strcmp (this_char, "usrquota")) - /* Don't do anything ;-) */ ; -+ else if (!strcmp (this_char, "iopen")) { +--- linux-2.6.12-rc6.orig/fs/ext3/super.c 2005-06-14 16:01:16.287775299 +0200 ++++ linux-2.6.12-rc6/fs/ext3/super.c 2005-06-14 16:14:33.656906156 +0200 +@@ -590,6 +590,7 @@ + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + }; + + static match_table_t tokens = { +@@ -638,6 +639,9 @@ + {Opt_ignore, "noquota"}, + {Opt_ignore, "quota"}, + {Opt_ignore, "usrquota"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -921,6 +925,18 @@ + else + clear_opt(sbi->s_mount_opt, BARRIER); + 
break; ++ case Opt_iopen: + set_opt (sbi->s_mount_opt, IOPEN); + clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ } -+ else if (!strcmp (this_char, "noiopen")) { ++ break; ++ case Opt_noiopen: + clear_opt (sbi->s_mount_opt, IOPEN); + clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ } -+ else if (!strcmp (this_char, "iopen_nopriv")) { ++ break; ++ case Opt_iopen_nopriv: + set_opt (sbi->s_mount_opt, IOPEN); + set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ } - else if (!strcmp (this_char, "journal")) { - /* @@@ FIXME */ - /* Eventually we will want to be able to create -Index: linux.mcp2/include/linux/ext3_fs.h ++ break; + case Opt_ignore: + break; + case Opt_resize: +Index: linux-2.6.12-rc6/include/linux/ext3_fs.h =================================================================== ---- linux.mcp2.orig/include/linux/ext3_fs.h 2004-05-17 15:20:59.000000000 -0700 -+++ linux.mcp2/include/linux/ext3_fs.h 2004-05-17 15:21:55.000000000 -0700 -@@ -323,6 +323,8 @@ - #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ - #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ - #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ -+#define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ +--- linux-2.6.12-rc6.orig/include/linux/ext3_fs.h 2005-06-14 16:01:14.709650318 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_fs.h 2005-06-14 16:28:38.452794245 +0200 +@@ -358,6 +358,8 @@ + #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ + #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ + #define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */ ++#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/iopen-misc-2.6.12.patch 
b/lustre/kernel_patches/patches/iopen-misc-2.6.12.patch new file mode 100644 index 0000000..48d8ab9 --- /dev/null +++ b/lustre/kernel_patches/patches/iopen-misc-2.6.12.patch @@ -0,0 +1,82 @@ +Index: linux-2.6.4-51.0/Documentation/filesystems/ext2.txt +=================================================================== +--- linux-2.6.4-51.0.orig/Documentation/filesystems/ext2.txt 2004-05-06 22:21:26.000000000 -0400 ++++ linux-2.6.4-51.0/Documentation/filesystems/ext2.txt 2004-05-06 22:24:42.000000000 -0400 +@@ -35,6 +35,22 @@ + + sb=n Use alternate superblock at this location. + ++iopen Makes an invisible pseudo-directory called ++ __iopen__ available in the root directory ++ of the filesystem. Allows open-by-inode- ++ number. i.e., inode 3145 can be accessed ++ via /mntpt/__iopen__/3145 ++ ++iopen_nopriv This option makes the iopen directory be ++ world-readable. This may be safer since it ++ allows daemons to run as an unprivileged user, ++ however it significantly changes the security ++ model of a Unix filesystem, since previously ++ all files under a mode 700 directory were not ++ generally available even if the ++ permissions on the file itself are ++ world-readable. ++ + grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. + + +Index: linux-2.6.4-51.0/fs/dcache.c +=================================================================== +--- linux-2.6.4-51.0.orig/fs/dcache.c 2004-05-06 22:24:42.000000000 -0400 ++++ linux-2.6.4-51.0/fs/dcache.c 2004-05-06 22:58:37.000000000 -0400 +@@ -1195,14 +1195,13 @@ + * dcache entries should not be moved in this way. + */ + +-void d_move(struct dentry * dentry, struct dentry * target) ++void __d_move(struct dentry * dentry, struct dentry * target) + { + struct hlist_head *list; + + if (!dentry->d_inode) + printk(KERN_WARNING "VFS: moving negative dcache entry\n"); + +- spin_lock(&dcache_lock); + write_seqlock(&rename_lock); + /* + * XXXX: do we really need to take target->d_lock?
+@@ -1253,6 +1252,14 @@ + spin_unlock(&target->d_lock); + spin_unlock(&dentry->d_lock); + write_sequnlock(&rename_lock); ++} ++ ++EXPORT_SYMBOL(__d_move); ++ ++void d_move(struct dentry *dentry, struct dentry *target) ++{ ++ spin_lock(&dcache_lock); ++ __d_move(dentry, target); + spin_unlock(&dcache_lock); + } + +Index: linux-2.6.4-51.0/include/linux/dcache.h +=================================================================== +--- linux-2.6.4-51.0.orig/include/linux/dcache.h 2004-05-06 22:24:42.000000000 -0400 ++++ linux-2.6.4-51.0/include/linux/dcache.h 2004-05-06 23:03:43.000000000 -0400 +@@ -234,6 +234,7 @@ + * This adds the entry to the hash queues. + */ + extern void d_rehash(struct dentry *); ++extern void d_rehash_cond(struct dentry *, int lock); + + /** + * d_add - add dentry to hash queues +@@ -252,6 +253,7 @@ + + /* used for rename() and baskets */ + extern void d_move(struct dentry *, struct dentry *); ++extern void __d_move(struct dentry *, struct dentry *); + + /* appendix may either be NULL or be used for transname suffixes */ + extern struct dentry * d_lookup(struct dentry *, struct qstr *); diff --git a/lustre/kernel_patches/patches/jbd-2.4.18-jcberr.patch b/lustre/kernel_patches/patches/jbd-2.4.18-jcberr.patch deleted file mode 100644 index 81b4136..0000000 --- a/lustre/kernel_patches/patches/jbd-2.4.18-jcberr.patch +++ /dev/null @@ -1,274 +0,0 @@ -Index: linux-2.4.19.SuSE/include/linux/jbd.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/jbd.h Sun Nov 16 13:51:03 2003 -+++ linux-2.4.19.SuSE/include/linux/jbd.h Sun Nov 16 15:10:48 2003 -@@ -283,6 +283,13 @@ - return bh->b_private; - } - -+#define HAVE_JOURNAL_CALLBACK_STATUS -+struct journal_callback { -+ struct list_head jcb_list; -+ void (*jcb_func)(struct journal_callback *jcb, int error); -+ /* user data goes here */ -+}; -+ - struct jbd_revoke_table_s; - - /* The handle_t type represents a single atomic update being performed -@@ 
-313,6 +320,12 @@ - operations */ - int h_err; - -+ /* List of application registered callbacks for this handle. -+ * The function(s) will be called after the transaction that -+ * this handle is part of has been committed to disk. -+ */ -+ struct list_head h_jcb; -+ - /* Flags */ - unsigned int h_sync: 1; /* sync-on-close */ - unsigned int h_jdata: 1; /* force data journaling */ -@@ -432,6 +445,10 @@ - - /* How many handles used this transaction? */ - int t_handle_count; -+ -+ /* List of registered callback functions for this transaction. -+ * Called when the transaction is committed. */ -+ struct list_head t_jcb; - }; - - -@@ -676,6 +693,9 @@ - extern int journal_try_to_free_buffers(journal_t *, struct page *, int); - extern int journal_stop(handle_t *); - extern int journal_flush (journal_t *); -+extern void journal_callback_set(handle_t *handle, -+ void (*fn)(struct journal_callback *,int), -+ struct journal_callback *jcb); - - extern void journal_lock_updates (journal_t *); - extern void journal_unlock_updates (journal_t *); -Index: linux-2.4.19.SuSE/fs/jbd/checkpoint.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/jbd/checkpoint.c Mon Feb 25 11:38:08 2002 -+++ linux-2.4.19.SuSE/fs/jbd/checkpoint.c Sun Nov 16 15:10:48 2003 -@@ -594,7 +594,8 @@ - J_ASSERT (transaction->t_log_list == NULL); - J_ASSERT (transaction->t_checkpoint_list == NULL); - J_ASSERT (transaction->t_updates == 0); -- -+ J_ASSERT (list_empty(&transaction->t_jcb)); -+ - J_ASSERT (transaction->t_journal->j_committing_transaction != - transaction); - -Index: linux-2.4.19.SuSE/fs/jbd/commit.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/jbd/commit.c Mon Jan 27 05:08:04 2003 -+++ linux-2.4.19.SuSE/fs/jbd/commit.c Sun Nov 16 15:13:53 2003 -@@ -485,7 +485,7 @@ - transaction's t_log_list queue, and metadata buffers are on - the t_iobuf_list queue. 
- -- Wait for the transactions in reverse order. That way we are -+ Wait for the buffers in reverse order. That way we are - less likely to be woken up until all IOs have completed, and - so we incur less scheduling load. - */ -@@ -576,8 +576,10 @@ - - jbd_debug(3, "JBD: commit phase 6\n"); - -- if (is_journal_aborted(journal)) -+ if (is_journal_aborted(journal)) { -+ unlock_journal(journal); - goto skip_commit; -+ } - - /* Done it all: now write the commit record. We should have - * cleaned up our previous buffers by now, so if we are in abort -@@ -587,9 +589,10 @@ - descriptor = journal_get_descriptor_buffer(journal); - if (!descriptor) { - __journal_abort_hard(journal); -+ unlock_journal(journal); - goto skip_commit; - } -- -+ - /* AKPM: buglet - add `i' to tmp! */ - for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) { - journal_header_t *tmp = -@@ -610,14 +614,32 @@ - put_bh(bh); /* One for getblk() */ - journal_unlock_journal_head(descriptor); - } -- lock_journal(journal); - - /* End of a transaction! Finally, we can do checkpoint - processing: any buffers committed as a result of this - transaction can be removed from any checkpoint list it was on - before. */ - --skip_commit: -+skip_commit: /* The journal should be unlocked by now. */ -+ -+ /* Call any callbacks that had been registered for handles in this -+ * transaction. It is up to the callback to free any allocated -+ * memory. 
-+ */ -+ if (!list_empty(&commit_transaction->t_jcb)) { -+ struct list_head *p, *n; -+ int error = is_journal_aborted(journal); -+ -+ list_for_each_safe(p, n, &commit_transaction->t_jcb) { -+ struct journal_callback *jcb; -+ -+ jcb = list_entry(p, struct journal_callback, jcb_list); -+ list_del(p); -+ jcb->jcb_func(jcb, error); -+ } -+ } -+ -+ lock_journal(journal); - - jbd_debug(3, "JBD: commit phase 7\n"); - -Index: linux-2.4.19.SuSE/fs/jbd/journal.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/jbd/journal.c Mon Jan 27 05:08:00 2003 -+++ linux-2.4.19.SuSE/fs/jbd/journal.c Sun Nov 16 15:10:48 2003 -@@ -59,6 +59,7 @@ - #endif - EXPORT_SYMBOL(journal_flush); - EXPORT_SYMBOL(journal_revoke); -+EXPORT_SYMBOL(journal_callback_set); - - EXPORT_SYMBOL(journal_init_dev); - EXPORT_SYMBOL(journal_init_inode); -Index: linux-2.4.19.SuSE/fs/jbd/transaction.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/jbd/transaction.c Sun Nov 16 01:45:26 2003 -+++ linux-2.4.19.SuSE/fs/jbd/transaction.c Sun Nov 16 15:15:34 2003 -@@ -58,6 +58,7 @@ - transaction->t_state = T_RUNNING; - transaction->t_tid = journal->j_transaction_sequence++; - transaction->t_expires = jiffies + bdflush_interval(); -+ INIT_LIST_HEAD(&transaction->t_jcb); - - /* Set up the commit timer for the new transaction. */ - J_ASSERT (!journal->j_commit_timer_active); -@@ -91,7 +92,14 @@ - transaction_t *transaction; - int needed; - int nblocks = handle->h_buffer_credits; -- -+ -+ if (nblocks > journal->j_max_transaction_buffers) { -+ jbd_debug(1, "JBD: %s wants too many credits (%d > %d)\n", -+ current->comm, nblocks, -+ journal->j_max_transaction_buffers); -+ return -ENOSPC; -+ } -+ - jbd_debug(3, "New handle %p going live.\n", handle); - - repeat: -@@ -202,6 +210,20 @@ - return 0; - } - -+/* Allocate a new handle. This should probably be in a slab... 
*/ -+static handle_t *new_handle(int nblocks) -+{ -+ handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ if (!handle) -+ return NULL; -+ memset(handle, 0, sizeof (handle_t)); -+ handle->h_buffer_credits = nblocks; -+ handle->h_ref = 1; -+ INIT_LIST_HEAD(&handle->h_jcb); -+ -+ return handle; -+} -+ - /* - * Obtain a new handle. - * -@@ -228,14 +250,11 @@ - handle->h_ref++; - return handle; - } -- -- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ -+ handle = new_handle(nblocks); - if (!handle) - return ERR_PTR(-ENOMEM); -- memset (handle, 0, sizeof (handle_t)); - -- handle->h_buffer_credits = nblocks; -- handle->h_ref = 1; - current->journal_info = handle; - - err = start_this_handle(journal, handle); -@@ -334,14 +353,11 @@ - - if (is_journal_aborted(journal)) - return ERR_PTR(-EIO); -- -- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ -+ handle = new_handle(nblocks); - if (!handle) - return ERR_PTR(-ENOMEM); -- memset (handle, 0, sizeof (handle_t)); - -- handle->h_buffer_credits = nblocks; -- handle->h_ref = 1; - current->journal_info = handle; - - err = try_start_this_handle(journal, handle); -@@ -1321,6 +1337,28 @@ - #endif - - /* -+ * Register a callback function for this handle. The function will be -+ * called when the transaction that this handle is part of has been -+ * committed to disk with the original callback data struct and the -+ * error status of the journal as parameters. There is no guarantee of -+ * ordering between handles within a single transaction, nor between -+ * callbacks registered on the same handle. -+ * -+ * The caller is responsible for allocating the journal_callback struct. -+ * This is to allow the caller to add as much extra data to the callback -+ * as needed, but reduce the overhead of multiple allocations. The caller -+ * allocated struct must start with a struct journal_callback at offset 0, -+ * and has the caller-specific data afterwards. 
-+ */ -+void journal_callback_set(handle_t *handle, -+ void (*func)(struct journal_callback *jcb, int error), -+ struct journal_callback *jcb) -+{ -+ list_add_tail(&jcb->jcb_list, &handle->h_jcb); -+ jcb->jcb_func = func; -+} -+ -+/* - * All done for a particular handle. - * - * There is not much action needed here. We just return any remaining -@@ -1385,7 +1423,10 @@ - wake_up(&journal->j_wait_transaction_locked); - } - -- /* -+ /* Move callbacks from the handle to the transaction. */ -+ list_splice(&handle->h_jcb, &transaction->t_jcb); -+ -+ /* - * If the handle is marked SYNC, we need to set another commit - * going! We also want to force a commit if the current - * transaction is occupying too much of the log, or if the diff --git a/lustre/kernel_patches/patches/jbd-2.4.19-pre1-jcberr.patch b/lustre/kernel_patches/patches/jbd-2.4.19-pre1-jcberr.patch deleted file mode 100644 index bbbf613..0000000 --- a/lustre/kernel_patches/patches/jbd-2.4.19-pre1-jcberr.patch +++ /dev/null @@ -1,274 +0,0 @@ -Index: linux-2.4.19-pre1/include/linux/jbd.h -=================================================================== ---- linux-2.4.19-pre1.orig/include/linux/jbd.h 2003-11-21 03:00:11.000000000 +0300 -+++ linux-2.4.19-pre1/include/linux/jbd.h 2003-11-21 03:04:47.000000000 +0300 -@@ -275,6 +275,13 @@ - return bh->b_private; - } - -+#define HAVE_JOURNAL_CALLBACK_STATUS -+struct journal_callback { -+ struct list_head jcb_list; -+ void (*jcb_func)(struct journal_callback *jcb, int error); -+ /* user data goes here */ -+}; -+ - struct jbd_revoke_table_s; - - /* The handle_t type represents a single atomic update being performed -@@ -305,6 +312,12 @@ - operations */ - int h_err; - -+ /* List of application registered callbacks for this handle. -+ * The function(s) will be called after the transaction that -+ * this handle is part of has been committed to disk. 
-+ */ -+ struct list_head h_jcb; -+ - /* Flags */ - unsigned int h_sync: 1; /* sync-on-close */ - unsigned int h_jdata: 1; /* force data journaling */ -@@ -424,6 +437,10 @@ - - /* How many handles used this transaction? */ - int t_handle_count; -+ -+ /* List of registered callback functions for this transaction. -+ * Called when the transaction is committed. */ -+ struct list_head t_jcb; - }; - - -@@ -672,6 +689,9 @@ - extern int journal_try_to_free_buffers(journal_t *, struct page *, int); - extern int journal_stop(handle_t *); - extern int journal_flush (journal_t *); -+extern void journal_callback_set(handle_t *handle, -+ void (*fn)(struct journal_callback *,int), -+ struct journal_callback *jcb); - - extern void journal_lock_updates (journal_t *); - extern void journal_unlock_updates (journal_t *); -Index: linux-2.4.19-pre1/fs/jbd/checkpoint.c -=================================================================== ---- linux-2.4.19-pre1.orig/fs/jbd/checkpoint.c 2003-11-21 02:53:20.000000000 +0300 -+++ linux-2.4.19-pre1/fs/jbd/checkpoint.c 2003-11-21 03:04:47.000000000 +0300 -@@ -601,7 +601,8 @@ - J_ASSERT (transaction->t_log_list == NULL); - J_ASSERT (transaction->t_checkpoint_list == NULL); - J_ASSERT (transaction->t_updates == 0); -- -+ J_ASSERT (list_empty(&transaction->t_jcb)); -+ - J_ASSERT (transaction->t_journal->j_committing_transaction != - transaction); - -Index: linux-2.4.19-pre1/fs/jbd/commit.c -=================================================================== ---- linux-2.4.19-pre1.orig/fs/jbd/commit.c 2003-11-21 02:53:20.000000000 +0300 -+++ linux-2.4.19-pre1/fs/jbd/commit.c 2003-11-21 03:04:47.000000000 +0300 -@@ -480,7 +480,7 @@ - transaction's t_log_list queue, and metadata buffers are on - the t_iobuf_list queue. - -- Wait for the transactions in reverse order. That way we are -+ Wait for the buffers in reverse order. That way we are - less likely to be woken up until all IOs have completed, and - so we incur less scheduling load. 
- */ -@@ -571,8 +571,10 @@ - - jbd_debug(3, "JBD: commit phase 6\n"); - -- if (is_journal_aborted(journal)) -+ if (is_journal_aborted(journal)) { -+ unlock_journal(journal); - goto skip_commit; -+ } - - /* Done it all: now write the commit record. We should have - * cleaned up our previous buffers by now, so if we are in abort -@@ -582,9 +584,10 @@ - descriptor = journal_get_descriptor_buffer(journal); - if (!descriptor) { - __journal_abort_hard(journal); -+ unlock_journal(journal); - goto skip_commit; - } -- -+ - /* AKPM: buglet - add `i' to tmp! */ - for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) { - journal_header_t *tmp = -@@ -605,14 +608,32 @@ - put_bh(bh); /* One for getblk() */ - journal_unlock_journal_head(descriptor); - } -- lock_journal(journal); - - /* End of a transaction! Finally, we can do checkpoint - processing: any buffers committed as a result of this - transaction can be removed from any checkpoint list it was on - before. */ - --skip_commit: -+skip_commit: /* The journal should be unlocked by now. */ -+ -+ /* Call any callbacks that had been registered for handles in this -+ * transaction. It is up to the callback to free any allocated -+ * memory. 
-+ */ -+ if (!list_empty(&commit_transaction->t_jcb)) { -+ struct list_head *p, *n; -+ int error = is_journal_aborted(journal); -+ -+ list_for_each_safe(p, n, &commit_transaction->t_jcb) { -+ struct journal_callback *jcb; -+ -+ jcb = list_entry(p, struct journal_callback, jcb_list); -+ list_del(p); -+ jcb->jcb_func(jcb, error); -+ } -+ } -+ -+ lock_journal(journal); - - jbd_debug(3, "JBD: commit phase 7\n"); - -Index: linux-2.4.19-pre1/fs/jbd/journal.c -=================================================================== ---- linux-2.4.19-pre1.orig/fs/jbd/journal.c 2003-11-21 02:53:20.000000000 +0300 -+++ linux-2.4.19-pre1/fs/jbd/journal.c 2003-11-21 03:04:47.000000000 +0300 -@@ -58,6 +58,7 @@ - #endif - EXPORT_SYMBOL(journal_flush); - EXPORT_SYMBOL(journal_revoke); -+EXPORT_SYMBOL(journal_callback_set); - - EXPORT_SYMBOL(journal_init_dev); - EXPORT_SYMBOL(journal_init_inode); -Index: linux-2.4.19-pre1/fs/jbd/transaction.c -=================================================================== ---- linux-2.4.19-pre1.orig/fs/jbd/transaction.c 2003-11-21 02:53:20.000000000 +0300 -+++ linux-2.4.19-pre1/fs/jbd/transaction.c 2003-11-21 03:05:14.000000000 +0300 -@@ -57,6 +57,7 @@ - transaction->t_state = T_RUNNING; - transaction->t_tid = journal->j_transaction_sequence++; - transaction->t_expires = jiffies + journal->j_commit_interval; -+ INIT_LIST_HEAD(&transaction->t_jcb); - - /* Set up the commit timer for the new transaction. */ - J_ASSERT (!journal->j_commit_timer_active); -@@ -90,7 +91,14 @@ - transaction_t *transaction; - int needed; - int nblocks = handle->h_buffer_credits; -- -+ -+ if (nblocks > journal->j_max_transaction_buffers) { -+ jbd_debug(1, "JBD: %s wants too many credits (%d > %d)\n", -+ current->comm, nblocks, -+ journal->j_max_transaction_buffers); -+ return -ENOSPC; -+ } -+ - jbd_debug(3, "New handle %p going live.\n", handle); - - repeat: -@@ -196,6 +204,20 @@ - return 0; - } - -+/* Allocate a new handle. This should probably be in a slab... 
*/ -+static handle_t *new_handle(int nblocks) -+{ -+ handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ if (!handle) -+ return NULL; -+ memset(handle, 0, sizeof (handle_t)); -+ handle->h_buffer_credits = nblocks; -+ handle->h_ref = 1; -+ INIT_LIST_HEAD(&handle->h_jcb); -+ -+ return handle; -+} -+ - /* - * Obtain a new handle. - * -@@ -222,14 +244,11 @@ - handle->h_ref++; - return handle; - } -- -- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ -+ handle = new_handle(nblocks); - if (!handle) - return ERR_PTR(-ENOMEM); -- memset (handle, 0, sizeof (handle_t)); - -- handle->h_buffer_credits = nblocks; -- handle->h_ref = 1; - current->journal_info = handle; - - err = start_this_handle(journal, handle); -@@ -328,14 +347,11 @@ - - if (is_journal_aborted(journal)) - return ERR_PTR(-EIO); -- -- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ -+ handle = new_handle(nblocks); - if (!handle) - return ERR_PTR(-ENOMEM); -- memset (handle, 0, sizeof (handle_t)); - -- handle->h_buffer_credits = nblocks; -- handle->h_ref = 1; - current->journal_info = handle; - - err = try_start_this_handle(journal, handle); -@@ -1324,6 +1340,28 @@ - #endif - - /* -+ * Register a callback function for this handle. The function will be -+ * called when the transaction that this handle is part of has been -+ * committed to disk with the original callback data struct and the -+ * error status of the journal as parameters. There is no guarantee of -+ * ordering between handles within a single transaction, nor between -+ * callbacks registered on the same handle. -+ * -+ * The caller is responsible for allocating the journal_callback struct. -+ * This is to allow the caller to add as much extra data to the callback -+ * as needed, but reduce the overhead of multiple allocations. The caller -+ * allocated struct must start with a struct journal_callback at offset 0, -+ * and has the caller-specific data afterwards. 
-+ */ -+void journal_callback_set(handle_t *handle, -+ void (*func)(struct journal_callback *jcb, int error), -+ struct journal_callback *jcb) -+{ -+ list_add_tail(&jcb->jcb_list, &handle->h_jcb); -+ jcb->jcb_func = func; -+} -+ -+/* - * All done for a particular handle. - * - * There is not much action needed here. We just return any remaining -@@ -1389,7 +1427,10 @@ - wake_up(&journal->j_wait_transaction_locked); - } - -- /* -+ /* Move callbacks from the handle to the transaction. */ -+ list_splice(&handle->h_jcb, &transaction->t_jcb); -+ -+ /* - * If the handle is marked SYNC, we need to set another commit - * going! We also want to force a commit if the current - * transaction is occupying too much of the log, or if the diff --git a/lustre/kernel_patches/patches/jbd-commit-tricks-rhel3.patch b/lustre/kernel_patches/patches/jbd-commit-tricks-rhel3.patch new file mode 100644 index 0000000..725931c --- /dev/null +++ b/lustre/kernel_patches/patches/jbd-commit-tricks-rhel3.patch @@ -0,0 +1,132 @@ + +--- linux-2.4.18/fs/jbd/checkpoint.c~jbd-commit-tricks Mon Jul 28 13:52:05 2003 ++++ linux-2.4.18-alexey/fs/jbd/checkpoint.c Mon Jul 28 14:03:59 2003 +@@ -77,19 +77,23 @@ static int __try_to_free_cp_buf(struct j + * to wait for a checkpoint to free up some space in the log. 
+ */ + +-void log_wait_for_space(journal_t *journal, int nblocks) ++void log_wait_for_space(journal_t *journal) + { ++ int nblocks; ++ ++ nblocks = jbd_space_needed(journal); + while (log_space_left(journal) < nblocks) { + if (journal->j_flags & JFS_ABORT) + return; + unlock_journal(journal); + down(&journal->j_checkpoint_sem); + lock_journal(journal); ++ nblocks = jbd_space_needed(journal); + + /* Test again, another process may have checkpointed + * while we were waiting for the checkpoint lock */ + if (log_space_left(journal) < nblocks) { +- log_do_checkpoint(journal, nblocks); ++ log_do_checkpoint(journal); + } + up(&journal->j_checkpoint_sem); + } +@@ -260,8 +264,7 @@ static int __flush_buffer(journal_t *jou + * The journal should be locked before calling this function. + */ + +-/* @@@ `nblocks' is unused. Should it be used? */ +-int log_do_checkpoint (journal_t *journal, int nblocks) ++int log_do_checkpoint (journal_t *journal) + { + transaction_t *transaction, *last_transaction, *next_transaction; + int result; +@@ -315,6 +318,8 @@ repeat: + retry = __flush_buffer(journal, jh, bhs, &batch_count, + &drop_count); + } while (jh != last_jh && !retry); ++ if (journal->j_checkpoint_transactions != transaction) ++ goto done; + if (batch_count) { + __flush_batch(bhs, &batch_count); + goto repeat; +@@ -328,6 +333,8 @@ repeat: + */ + cleanup_ret = __cleanup_transaction(journal, transaction); + J_ASSERT(drop_count != 0 || cleanup_ret != 0); ++ if (journal->j_checkpoint_transactions != transaction) ++ goto done; + goto repeat; /* __cleanup may have dropped lock */ + } while (transaction != last_transaction); + +--- linux-2.4.18/fs/jbd/journal.c~jbd-commit-tricks Mon Jul 28 13:52:05 2003 ++++ linux-2.4.18-alexey/fs/jbd/journal.c Mon Jul 28 14:03:59 2003 +@@ -1115,7 +1115,7 @@ void journal_destroy (journal_t *journal + /* Force any old transactions to disk */ + lock_journal(journal); + while (journal->j_checkpoint_transactions != NULL) +- log_do_checkpoint(journal, 1); ++ 
log_do_checkpoint(journal); + + J_ASSERT(journal->j_running_transaction == NULL); + J_ASSERT(journal->j_committing_transaction == NULL); +@@ -1302,7 +1302,7 @@ int journal_flush (journal_t *journal) + /* ...and flush everything in the log out to disk. */ + lock_journal(journal); + while (!err && journal->j_checkpoint_transactions != NULL) +- err = log_do_checkpoint(journal, journal->j_maxlen); ++ err = log_do_checkpoint(journal); + cleanup_journal_tail(journal); + + /* Finally, mark the journal as really needing no recovery. +--- linux-2.4.18/fs/jbd/transaction.c~jbd-commit-tricks Mon Jul 28 13:52:05 2003 ++++ linux-2.4.18-alexey/fs/jbd/transaction.c Mon Jul 28 14:03:59 2003 +@@ -182,14 +182,9 @@ repeat_locked: + * Also, this test is inconsitent with the matching one in + * journal_extend(). + */ +- needed = journal->j_max_transaction_buffers; +- if (journal->j_committing_transaction) +- needed += journal->j_committing_transaction-> +- t_outstanding_credits; +- +- if (log_space_left(journal) < needed) { ++ if (log_space_left(journal) < jbd_space_needed(journal)) { + jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); +- log_wait_for_space(journal, needed); ++ log_wait_for_space(journal); + goto repeat_locked; + } + +--- linux-2.4.18/include/linux/jbd.h~jbd-commit-tricks Mon Jul 28 13:52:17 2003 ++++ linux-2.4.18-alexey/include/linux/jbd.h Mon Jul 28 14:03:59 2003 +@@ -740,9 +740,9 @@ extern void journal_brelse_array(stru + extern int log_space_left (journal_t *); /* Called with journal locked */ + extern tid_t log_start_commit (journal_t *, transaction_t *); + extern int log_wait_commit (journal_t *, tid_t); +-extern int log_do_checkpoint (journal_t *, int); ++extern int log_do_checkpoint (journal_t *); + +-extern void log_wait_for_space(journal_t *, int nblocks); ++extern void log_wait_for_space(journal_t *); + extern void __journal_drop_transaction(journal_t *, transaction_t *); + extern int cleanup_journal_tail(journal_t *); + +@@ -815,6 +815,19 @@ 
static inline int tid_geq(tid_t x, tid_t + } + + extern int journal_blocks_per_page(struct inode *inode); ++ ++/* ++ * Return the minimum number of blocks which must be free in the journal ++ * before a new transaction may be started. Must be called under j_state_lock. ++ */ ++static inline int jbd_space_needed(journal_t *journal) ++{ ++ int nblocks = journal->j_max_transaction_buffers; ++ if (journal->j_committing_transaction) ++ nblocks += journal->j_committing_transaction-> ++ t_outstanding_credits; ++ return nblocks; ++} + + /* + * Definitions which augment the buffer_head layer + +_ diff --git a/lustre/kernel_patches/patches/jbd-flushtime-2.4.19-suse.patch b/lustre/kernel_patches/patches/jbd-flushtime-2.4.19-suse.patch deleted file mode 100644 index 8411137..0000000 --- a/lustre/kernel_patches/patches/jbd-flushtime-2.4.19-suse.patch +++ /dev/null @@ -1,35 +0,0 @@ -Index: linux-2.4.19.SuSE/fs/jbd/transaction.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/jbd/transaction.c Sun Nov 16 01:38:25 2003 -+++ linux-2.4.19.SuSE/fs/jbd/transaction.c Sun Nov 16 01:44:26 2003 -@@ -1094,7 +1094,6 @@ - - spin_lock(&journal_datalist_lock); - set_bit(BH_JBDDirty, &bh->b_state); -- set_buffer_flushtime(bh); - - J_ASSERT_JH(jh, jh->b_transaction != NULL); - -@@ -1995,6 +1994,13 @@ - spin_unlock(&journal_datalist_lock); - } - -+static void jbd_refile_buffer(struct buffer_head *bh) -+{ -+ if (buffer_dirty(bh) && (bh->b_list != BUF_DIRTY)) -+ set_buffer_flushtime(bh); -+ refile_buffer(bh); -+} -+ - /* - * Remove a buffer from its current buffer list in preparation for - * dropping it from its current transaction entirely. 
If the buffer has -@@ -2022,7 +2028,7 @@ - J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); - } else { - /* Onto BUF_DIRTY for writeback */ -- refile_buffer(jh2bh(jh)); -+ jbd_refile_buffer(jh2bh(jh)); - } - } - diff --git a/lustre/kernel_patches/patches/jbd-stats-2.6.13.4.patch b/lustre/kernel_patches/patches/jbd-stats-2.6.13.4.patch new file mode 100644 index 0000000..4db8dd3 --- /dev/null +++ b/lustre/kernel_patches/patches/jbd-stats-2.6.13.4.patch @@ -0,0 +1,735 @@ +Index: linux-2.6.13.4/include/linux/jbd.h +=================================================================== +--- linux-2.6.13.4.orig/include/linux/jbd.h 2005-10-10 22:54:29.000000000 +0400 ++++ linux-2.6.13.4/include/linux/jbd.h 2005-11-20 01:35:08.000000000 +0300 +@@ -394,6 +394,16 @@ + }; + + ++/* ++ * Some stats for checkpoint phase ++ */ ++struct transaction_chp_stats_s { ++ unsigned long cs_chp_time; ++ unsigned long cs_forced_to_close; ++ unsigned long cs_written; ++ unsigned long cs_dropped; ++}; ++ + /* The transaction_t type is the guts of the journaling mechanism. 
It + * tracks a compound transaction through its various states: + * +@@ -523,6 +533,21 @@ + spinlock_t t_handle_lock; + + /* ++ * Longest time some handle had to wait for running transaction ++ */ ++ unsigned long t_max_wait; ++ ++ /* ++ * When transaction started ++ */ ++ unsigned long t_start; ++ ++ /* ++ * Checkpointing stats [j_checkpoint_sem] ++ */ ++ struct transaction_chp_stats_s t_chp_stats; ++ ++ /* + * Number of outstanding updates running on this transaction + * [t_handle_lock] + */ +@@ -553,6 +578,57 @@ + + }; + ++struct transaction_run_stats_s { ++ unsigned long rs_wait; ++ unsigned long rs_running; ++ unsigned long rs_locked; ++ unsigned long rs_flushing; ++ unsigned long rs_logging; ++ ++ unsigned long rs_handle_count; ++ unsigned long rs_blocks; ++ unsigned long rs_blocks_logged; ++}; ++ ++struct transaction_stats_s ++{ ++ int ts_type; ++ unsigned long ts_tid; ++ union { ++ struct transaction_run_stats_s run; ++ struct transaction_chp_stats_s chp; ++ } u; ++}; ++ ++#define JBD_STATS_RUN 1 ++#define JBD_STATS_CHECKPOINT 2 ++ ++#define ts_wait u.run.rs_wait ++#define ts_running u.run.rs_running ++#define ts_locked u.run.rs_locked ++#define ts_flushing u.run.rs_flushing ++#define ts_logging u.run.rs_logging ++#define ts_handle_count u.run.rs_handle_count ++#define ts_blocks u.run.rs_blocks ++#define ts_blocks_logged u.run.rs_blocks_logged ++ ++#define ts_chp_time u.chp.cs_chp_time ++#define ts_forced_to_close u.chp.cs_forced_to_close ++#define ts_written u.chp.cs_written ++#define ts_dropped u.chp.cs_dropped ++ ++#define CURRENT_MSECS (jiffies_to_msecs(jiffies)) ++ ++static inline unsigned int ++jbd_time_diff(unsigned int start, unsigned int end) ++{ ++ if (unlikely(start > end)) ++ end = end + (~0UL - start); ++ else ++ end -= start; ++ return end; ++} ++ + /** + * struct journal_s - The journal_s type is the concrete type associated with + * journal_t. 
+@@ -800,6 +876,16 @@ + int j_wbufsize; + + /* ++ * ++ */ ++ struct transaction_stats_s *j_history; ++ int j_history_max; ++ int j_history_cur; ++ spinlock_t j_history_lock; ++ struct proc_dir_entry *j_proc_entry; ++ struct transaction_stats_s j_stats; ++ ++ /* + * An opaque pointer to fs-private information. ext3 puts its + * superblock pointer here + */ +Index: linux-2.6.13.4/fs/jbd/transaction.c +=================================================================== +--- linux-2.6.13.4.orig/fs/jbd/transaction.c 2005-10-10 22:54:29.000000000 +0400 ++++ linux-2.6.13.4/fs/jbd/transaction.c 2005-11-20 01:31:23.000000000 +0300 +@@ -58,6 +58,8 @@ + + J_ASSERT(journal->j_running_transaction == NULL); + journal->j_running_transaction = transaction; ++ transaction->t_max_wait = 0; ++ transaction->t_start = CURRENT_MSECS; + + return transaction; + } +@@ -84,6 +86,7 @@ + int nblocks = handle->h_buffer_credits; + transaction_t *new_transaction = NULL; + int ret = 0; ++ unsigned long ts = CURRENT_MSECS; + + if (nblocks > journal->j_max_transaction_buffers) { + printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", +@@ -217,6 +220,12 @@ + /* OK, account for the buffers that this operation expects to + * use and add the handle to the running transaction. 
*/ + ++ if (time_after(transaction->t_start, ts)) { ++ ts = jbd_time_diff(ts, transaction->t_start); ++ if (ts > transaction->t_max_wait) ++ transaction->t_max_wait= ts; ++ } ++ + handle->h_transaction = transaction; + transaction->t_outstanding_credits += nblocks; + transaction->t_updates++; +Index: linux-2.6.13.4/fs/jbd/journal.c +=================================================================== +--- linux-2.6.13.4.orig/fs/jbd/journal.c 2005-10-10 22:54:29.000000000 +0400 ++++ linux-2.6.13.4/fs/jbd/journal.c 2005-11-20 02:07:44.000000000 +0300 +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + + EXPORT_SYMBOL(journal_start); + EXPORT_SYMBOL(journal_restart); +@@ -646,6 +647,300 @@ + return journal_add_journal_head(bh); + } + ++struct jbd_stats_proc_session { ++ journal_t *journal; ++ struct transaction_stats_s *stats; ++ int start; ++ int max; ++}; ++ ++static void *jbd_history_skip_empty(struct jbd_stats_proc_session *s, ++ struct transaction_stats_s *ts, ++ int first) ++{ ++ if (ts == s->stats + s->max) ++ ts = s->stats; ++ if (!first && ts == s->stats + s->start) ++ return NULL; ++ while (ts->ts_type == 0) { ++ ts++; ++ if (ts == s->stats + s->max) ++ ts = s->stats; ++ if (ts == s->stats + s->start) ++ return NULL; ++ } ++ return ts; ++ ++} ++ ++static void *jbd_seq_history_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct jbd_stats_proc_session *s = seq->private; ++ struct transaction_stats_s *ts; ++ int l = *pos; ++ ++ if (l == 0) ++ return SEQ_START_TOKEN; ++ ts = jbd_history_skip_empty(s, s->stats + s->start, 1); ++ if (!ts) ++ return NULL; ++ while (--l && (ts = jbd_history_skip_empty(s, ++ts, 0)) != NULL); ++ return ts; ++} ++ ++static void *jbd_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct jbd_stats_proc_session *s = seq->private; ++ struct transaction_stats_s *ts = v; ++ ++ ++*pos; ++ if (v == SEQ_START_TOKEN) ++ return jbd_history_skip_empty(s, s->stats + s->start, 1); ++ else ++ return 
jbd_history_skip_empty(s, ++ts, 0); ++} ++ ++static int jbd_seq_history_show(struct seq_file *seq, void *v) ++{ ++ struct transaction_stats_s *ts = v; ++ if (v == SEQ_START_TOKEN) { ++ seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s " ++ "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid", ++ "wait", "run", "lock", "flush", "log", "hndls", ++ "block", "inlog", "ctime", "write", "drop", ++ "close"); ++ return 0; ++ } ++ if (ts->ts_type == JBD_STATS_RUN) ++ seq_printf(seq, "%-4s %-5lu %-5lu %-5lu %-5lu %-5lu %-5lu " ++ "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid, ++ ts->ts_wait, ts->ts_running, ts->ts_locked, ++ ts->ts_flushing, ts->ts_logging, ++ ts->ts_handle_count, ts->ts_blocks, ++ ts->ts_blocks_logged); ++ else if (ts->ts_type == JBD_STATS_CHECKPOINT) ++ seq_printf(seq, "%-4s %-5lu %48s %-5lu %-5lu %-5lu %-5lu\n", ++ "C", ts->ts_tid, " ", ts->ts_chp_time, ++ ts->ts_written, ts->ts_dropped, ++ ts->ts_forced_to_close); ++ else ++ J_ASSERT(0); ++ return 0; ++} ++ ++static void jbd_seq_history_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations jbd_seq_history_ops = { ++ .start = jbd_seq_history_start, ++ .next = jbd_seq_history_next, ++ .stop = jbd_seq_history_stop, ++ .show = jbd_seq_history_show, ++}; ++ ++static int jbd_seq_history_open(struct inode *inode, struct file *file) ++{ ++ journal_t *journal = PDE(inode)->data; ++ struct jbd_stats_proc_session *s; ++ int rc, size; ++ ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (s == NULL) ++ return -EIO; ++ size = sizeof(struct transaction_stats_s) * journal->j_history_max; ++ s->stats = kmalloc(size, GFP_KERNEL); ++ if (s->stats == NULL) { ++ kfree(s); ++ return -EIO; ++ } ++ spin_lock(&journal->j_history_lock); ++ memcpy(s->stats, journal->j_history, size); ++ s->max = journal->j_history_max; ++ s->start = journal->j_history_cur % s->max; ++ spin_unlock(&journal->j_history_lock); ++ ++ rc = seq_open(file, &jbd_seq_history_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file 
*)file->private_data; ++ m->private = s; ++ } else { ++ kfree(s->stats); ++ kfree(s); ++ } ++ return rc; ++ ++} ++ ++static int jbd_seq_history_release(struct inode *inode, struct file *file) ++{ ++ struct seq_file *seq = (struct seq_file *)file->private_data; ++ struct jbd_stats_proc_session *s = seq->private; ++ kfree(s->stats); ++ kfree(s); ++ return seq_release(inode, file); ++} ++ ++static struct file_operations jbd_seq_history_fops = { ++ .owner = THIS_MODULE, ++ .open = jbd_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = jbd_seq_history_release, ++}; ++ ++static void *jbd_seq_info_start(struct seq_file *seq, loff_t *pos) ++{ ++ return *pos ? NULL : SEQ_START_TOKEN; ++} ++ ++static void *jbd_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ return NULL; ++} ++ ++static int jbd_seq_info_show(struct seq_file *seq, void *v) ++{ ++ struct jbd_stats_proc_session *s = seq->private; ++ if (v != SEQ_START_TOKEN) ++ return 0; ++ seq_printf(seq, "%lu transaction, each upto %u blocks\n", ++ s->stats->ts_tid, ++ s->journal->j_max_transaction_buffers); ++ if (s->stats->ts_tid == 0) ++ return 0; ++ seq_printf(seq, "average: \n %lums waiting for transaction\n", ++ s->stats->ts_wait / s->stats->ts_tid); ++ seq_printf(seq, " %lums running transaction\n", ++ s->stats->ts_running / s->stats->ts_tid); ++ seq_printf(seq, " %lums transaction was being locked\n", ++ s->stats->ts_locked / s->stats->ts_tid); ++ seq_printf(seq, " %lums flushing data (in ordered mode)\n", ++ s->stats->ts_flushing / s->stats->ts_tid); ++ seq_printf(seq, " %lums logging transaction\n", ++ s->stats->ts_logging / s->stats->ts_tid); ++ seq_printf(seq, " %lu handles per transaction\n", ++ s->stats->ts_handle_count / s->stats->ts_tid); ++ seq_printf(seq, " %lu blocks per transaction\n", ++ s->stats->ts_blocks / s->stats->ts_tid); ++ seq_printf(seq, " %lu logged blocks per transaction\n", ++ s->stats->ts_blocks_logged / s->stats->ts_tid); ++ return 0; ++} ++ ++static 
void jbd_seq_info_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations jbd_seq_info_ops = { ++ .start = jbd_seq_info_start, ++ .next = jbd_seq_info_next, ++ .stop = jbd_seq_info_stop, ++ .show = jbd_seq_info_show, ++}; ++ ++static int jbd_seq_info_open(struct inode *inode, struct file *file) ++{ ++ journal_t *journal = PDE(inode)->data; ++ struct jbd_stats_proc_session *s; ++ int rc, size; ++ ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (s == NULL) ++ return -EIO; ++ size = sizeof(struct transaction_stats_s); ++ s->stats = kmalloc(size, GFP_KERNEL); ++ if (s->stats == NULL) { ++ kfree(s); ++ return -EIO; ++ } ++ spin_lock(&journal->j_history_lock); ++ memcpy(s->stats, &journal->j_stats, size); ++ s->journal = journal; ++ spin_unlock(&journal->j_history_lock); ++ ++ rc = seq_open(file, &jbd_seq_info_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = s; ++ } else { ++ kfree(s->stats); ++ kfree(s); ++ } ++ return rc; ++ ++} ++ ++static int jbd_seq_info_release(struct inode *inode, struct file *file) ++{ ++ struct seq_file *seq = (struct seq_file *)file->private_data; ++ struct jbd_stats_proc_session *s = seq->private; ++ kfree(s->stats); ++ kfree(s); ++ return seq_release(inode, file); ++} ++ ++static struct file_operations jbd_seq_info_fops = { ++ .owner = THIS_MODULE, ++ .open = jbd_seq_info_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = jbd_seq_info_release, ++}; ++ ++static struct proc_dir_entry *proc_jbd_stats = NULL; ++ ++static void jbd_stats_proc_init(journal_t *journal) ++{ ++ char name[64]; ++ ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name)); ++ journal->j_proc_entry = proc_mkdir(name, proc_jbd_stats); ++ if (journal->j_proc_entry) { ++ struct proc_dir_entry *p; ++ p = create_proc_entry("history", S_IRUGO, ++ journal->j_proc_entry); ++ if (p) { ++ p->proc_fops = &jbd_seq_history_fops; ++ p->data = journal; ++ p = create_proc_entry("info", 
S_IRUGO, ++ journal->j_proc_entry); ++ if (p) { ++ p->proc_fops = &jbd_seq_info_fops; ++ p->data = journal; ++ } ++ } ++ } ++} ++ ++static void jbd_stats_proc_exit(journal_t *journal) ++{ ++ char name[64]; ++ ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name)); ++ remove_proc_entry("info", journal->j_proc_entry); ++ remove_proc_entry("history", journal->j_proc_entry); ++ remove_proc_entry(name, proc_jbd_stats); ++} ++ ++static void journal_init_stats(journal_t *journal) ++{ ++ int size; ++ ++ if (proc_jbd_stats == NULL) ++ return; ++ ++ journal->j_history_max = 100; ++ size = sizeof(struct transaction_stats_s) * journal->j_history_max; ++ journal->j_history = kmalloc(size, GFP_KERNEL); ++ if (journal->j_history == NULL) { ++ journal->j_history_max = 0; ++ return; ++ } ++ memset(journal->j_history, 0, size); ++ spin_lock_init(&journal->j_history_lock); ++} ++ + /* + * Management for journal control blocks: functions to create and + * destroy journal_t structures, and to initialise and read existing +@@ -688,6 +983,9 @@ + kfree(journal); + goto fail; + } ++ ++ journal_init_stats(journal); ++ + return journal; + fail: + return NULL; +@@ -731,6 +1029,7 @@ + journal->j_blk_offset = start; + journal->j_maxlen = len; + journal->j_blocksize = blocksize; ++ jbd_stats_proc_init(journal); + + bh = __getblk(journal->j_dev, start, journal->j_blocksize); + J_ASSERT(bh != NULL); +@@ -780,6 +1079,7 @@ + + journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; + journal->j_blocksize = inode->i_sb->s_blocksize; ++ jbd_stats_proc_init(journal); + + /* journal descriptor can store up to n blocks -bzzz */ + n = journal->j_blocksize / sizeof(journal_block_tag_t); +@@ -1161,6 +1461,8 @@ + brelse(journal->j_sb_buffer); + } + ++ if (journal->j_proc_entry) ++ jbd_stats_proc_exit(journal); + if (journal->j_inode) + iput(journal->j_inode); + if (journal->j_revoke) +@@ -1929,6 +2231,28 @@ + + #endif + ++#if defined(CONFIG_PROC_FS) ++ ++#define 
JBD_STATS_PROC_NAME "fs/jbd" ++ ++static void __init create_jbd_stats_proc_entry(void) ++{ ++ proc_jbd_stats = proc_mkdir(JBD_STATS_PROC_NAME, NULL); ++} ++ ++static void __exit remove_jbd_stats_proc_entry(void) ++{ ++ if (proc_jbd_stats) ++ remove_proc_entry(JBD_STATS_PROC_NAME, NULL); ++} ++ ++#else ++ ++#define create_jbd_stats_proc_entry() do {} while (0) ++#define remove_jbd_stats_proc_entry() do {} while (0) ++ ++#endif ++ + kmem_cache_t *jbd_handle_cache; + + static int __init journal_init_handle_cache(void) +@@ -1983,6 +2307,7 @@ + if (ret != 0) + journal_destroy_caches(); + create_jbd_proc_entry(); ++ create_jbd_stats_proc_entry(); + return ret; + } + +@@ -1994,6 +2319,7 @@ + printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); + #endif + remove_jbd_proc_entry(); ++ remove_jbd_stats_proc_entry(); + journal_destroy_caches(); + } + +Index: linux-2.6.13.4/fs/jbd/checkpoint.c +=================================================================== +--- linux-2.6.13.4.orig/fs/jbd/checkpoint.c 2005-11-19 22:46:03.000000000 +0300 ++++ linux-2.6.13.4/fs/jbd/checkpoint.c 2005-11-20 02:24:09.000000000 +0300 +@@ -166,6 +166,7 @@ + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + ++ transaction->t_chp_stats.cs_forced_to_close++; + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + log_start_commit(journal, tid); +@@ -226,7 +227,7 @@ + */ + static int __flush_buffer(journal_t *journal, struct journal_head *jh, + struct buffer_head **bhs, int *batch_count, +- int *drop_count) ++ int *drop_count, transaction_t *transaction) + { + struct buffer_head *bh = jh2bh(jh); + int ret = 0; +@@ -247,6 +248,7 @@ + set_buffer_jwrite(bh); + bhs[*batch_count] = bh; + jbd_unlock_bh_state(bh); ++ transaction->t_chp_stats.cs_written++; + (*batch_count)++; + if (*batch_count == NR_BATCH) { + __flush_batch(journal, bhs, batch_count); +@@ -315,6 +317,8 @@ + tid_t this_tid; + + transaction = journal->j_checkpoint_transactions; ++ if 
(transaction->t_chp_stats.cs_chp_time == 0) ++ transaction->t_chp_stats.cs_chp_time = CURRENT_MSECS; + this_tid = transaction->t_tid; + jh = transaction->t_checkpoint_list; + last_jh = jh->b_cpprev; +@@ -331,7 +335,8 @@ + retry = 1; + break; + } +- retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count); ++ retry = __flush_buffer(journal, jh, bhs, &batch_count, ++ &drop_count, transaction); + if (cond_resched_lock(&journal->j_list_lock)) { + retry = 1; + break; +@@ -609,6 +614,8 @@ + + void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) + { ++ struct transaction_stats_s stats; ++ + assert_spin_locked(&journal->j_list_lock); + if (transaction->t_cpnext) { + transaction->t_cpnext->t_cpprev = transaction->t_cpprev; +@@ -633,5 +640,25 @@ + J_ASSERT(journal->j_running_transaction != transaction); + + jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); ++ ++ /* ++ * File the transaction for history ++ */ ++ if (transaction->t_chp_stats.cs_written != 0 || ++ transaction->t_chp_stats.cs_chp_time != 0) { ++ stats.ts_type = JBD_STATS_CHECKPOINT; ++ stats.ts_tid = transaction->t_tid; ++ stats.u.chp = transaction->t_chp_stats; ++ if (stats.ts_chp_time) ++ stats.ts_chp_time = ++ jbd_time_diff(stats.ts_chp_time, CURRENT_MSECS); ++ spin_lock(&journal->j_history_lock); ++ memcpy(journal->j_history + journal->j_history_cur, &stats, ++ sizeof(stats)); ++ if (++journal->j_history_cur == journal->j_history_max) ++ journal->j_history_cur = 0; ++ spin_unlock(&journal->j_history_lock); ++ } ++ + kfree(transaction); + } +Index: linux-2.6.13.4/fs/jbd/commit.c +=================================================================== +--- linux-2.6.13.4.orig/fs/jbd/commit.c 2005-10-10 22:54:29.000000000 +0400 ++++ linux-2.6.13.4/fs/jbd/commit.c 2005-11-20 00:54:10.000000000 +0300 +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + + /* + * Default IO end handler for temporary BJ_IO buffer_heads. 
+@@ -168,6 +169,7 @@ + */ + void journal_commit_transaction(journal_t *journal) + { ++ struct transaction_stats_s stats; + transaction_t *commit_transaction; + struct journal_head *jh, *new_jh, *descriptor; + struct buffer_head **wbuf = journal->j_wbuf; +@@ -214,6 +216,11 @@ + spin_lock(&journal->j_state_lock); + commit_transaction->t_state = T_LOCKED; + ++ stats.ts_wait = commit_transaction->t_max_wait; ++ stats.ts_locked = CURRENT_MSECS; ++ stats.ts_running = jbd_time_diff(commit_transaction->t_start, ++ stats.ts_locked); ++ + spin_lock(&commit_transaction->t_handle_lock); + while (commit_transaction->t_updates) { + DEFINE_WAIT(wait); +@@ -286,6 +293,9 @@ + */ + journal_switch_revoke_table(journal); + ++ stats.ts_flushing = CURRENT_MSECS; ++ stats.ts_locked = jbd_time_diff(stats.ts_locked, stats.ts_flushing); ++ + commit_transaction->t_state = T_FLUSH; + journal->j_committing_transaction = commit_transaction; + journal->j_running_transaction = NULL; +@@ -444,6 +454,11 @@ + */ + commit_transaction->t_state = T_COMMIT; + ++ stats.ts_logging = CURRENT_MSECS; ++ stats.ts_flushing = jbd_time_diff(stats.ts_flushing, stats.ts_logging); ++ stats.ts_blocks = commit_transaction->t_outstanding_credits; ++ stats.ts_blocks_logged = 0; ++ + descriptor = NULL; + bufs = 0; + while (commit_transaction->t_buffers) { +@@ -592,6 +607,7 @@ + submit_bh(WRITE, bh); + } + cond_resched(); ++ stats.ts_blocks_logged += bufs; + + /* Force a new descriptor to be generated next + time round the loop. 
*/ +@@ -756,6 +772,7 @@ + cp_transaction = jh->b_cp_transaction; + if (cp_transaction) { + JBUFFER_TRACE(jh, "remove from old cp transaction"); ++ cp_transaction->t_chp_stats.cs_dropped++; + __journal_remove_checkpoint(jh); + } + +@@ -803,6 +820,36 @@ + + J_ASSERT(commit_transaction->t_state == T_COMMIT); + ++ commit_transaction->t_start = CURRENT_MSECS; ++ stats.ts_logging = jbd_time_diff(stats.ts_logging, ++ commit_transaction->t_start); ++ ++ /* ++ * File the transaction for history ++ */ ++ stats.ts_type = JBD_STATS_RUN; ++ stats.ts_tid = commit_transaction->t_tid; ++ stats.ts_handle_count = commit_transaction->t_handle_count; ++ spin_lock(&journal->j_history_lock); ++ memcpy(journal->j_history + journal->j_history_cur, &stats, ++ sizeof(stats)); ++ if (++journal->j_history_cur == journal->j_history_max) ++ journal->j_history_cur = 0; ++ ++ /* ++ * Calculate overall stats ++ */ ++ journal->j_stats.ts_tid++; ++ journal->j_stats.ts_wait += stats.ts_wait; ++ journal->j_stats.ts_running += stats.ts_running; ++ journal->j_stats.ts_locked += stats.ts_locked; ++ journal->j_stats.ts_flushing += stats.ts_flushing; ++ journal->j_stats.ts_logging += stats.ts_logging; ++ journal->j_stats.ts_handle_count += stats.ts_handle_count; ++ journal->j_stats.ts_blocks += stats.ts_blocks; ++ journal->j_stats.ts_blocks_logged += stats.ts_blocks_logged; ++ spin_unlock(&journal->j_history_lock); ++ + /* + * This is a bit sleazy. We borrow j_list_lock to protect + * journal->j_committing_transaction in __journal_remove_checkpoint. 
diff --git a/lustre/kernel_patches/patches/kallsyms-2.4-bgl.patch b/lustre/kernel_patches/patches/kallsyms-2.4-bgl.patch deleted file mode 100644 index 25f7954..0000000 --- a/lustre/kernel_patches/patches/kallsyms-2.4-bgl.patch +++ /dev/null @@ -1,685 +0,0 @@ -Index: linux-bgl/arch/arm/vmlinux-armo.lds.in -=================================================================== ---- linux-bgl.orig/arch/arm/vmlinux-armo.lds.in 2003-07-02 08:44:12.000000000 -0700 -+++ linux-bgl/arch/arm/vmlinux-armo.lds.in 2004-10-26 22:52:50.037677957 -0700 -@@ -62,6 +62,10 @@ - *(__ksymtab) - __stop___ksymtab = .; - -+ __start___kallsyms = .; /* All kernel symbols */ -+ *(__kallsyms) -+ __stop___kallsyms = .; -+ - *(.got) /* Global offset table */ - - _etext = .; /* End of text section */ -Index: linux-bgl/arch/arm/vmlinux-armv.lds.in -=================================================================== ---- linux-bgl.orig/arch/arm/vmlinux-armv.lds.in 2003-07-02 08:44:12.000000000 -0700 -+++ linux-bgl/arch/arm/vmlinux-armv.lds.in 2004-10-26 22:52:50.038677801 -0700 -@@ -67,6 +67,12 @@ - __stop___ksymtab = .; - } - -+ __kallsyms : { /* Kernel debugging table */ -+ __start___kallsyms = .; /* All kernel symbols */ -+ *(__kallsyms) -+ __stop___kallsyms = .; -+ } -+ - . 
= ALIGN(8192); - - .data : { -Index: linux-bgl/arch/ppc/config.in -=================================================================== ---- linux-bgl.orig/arch/ppc/config.in 2004-10-04 09:55:49.000000000 -0700 -+++ linux-bgl/arch/ppc/config.in 2004-10-26 23:11:56.416643929 -0700 -@@ -732,6 +732,7 @@ - string 'Additional compile arguments' CONFIG_COMPILE_OPTIONS "-g -ggdb" - fi - fi -+bool 'Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS - - if [ "$CONFIG_ALL_PPC" = "y" ]; then - bool 'Support for early boot text console (BootX or OpenFirmware only)' CONFIG_BOOTX_TEXT -Index: linux-bgl/arch/ppc/vmlinux.lds -=================================================================== ---- linux-bgl.orig/arch/ppc/vmlinux.lds 2003-07-02 08:43:30.000000000 -0700 -+++ linux-bgl/arch/ppc/vmlinux.lds 2004-10-26 22:52:50.043677020 -0700 -@@ -73,6 +73,10 @@ - __ksymtab : { *(__ksymtab) } - __stop___ksymtab = .; - -+ __start___kallsyms = .; /* All kernel symbols */ -+ __kallsyms : { *(__kallsyms) } -+ __stop___kallsyms = .; -+ - __start___ftr_fixup = .; - __ftr_fixup : { *(__ftr_fixup) } - __stop___ftr_fixup = .; -Index: linux-bgl/arch/i386/config.in -=================================================================== ---- linux-bgl.orig/arch/i386/config.in 2003-07-02 08:43:46.000000000 -0700 -+++ linux-bgl/arch/i386/config.in 2004-10-26 22:52:50.040677488 -0700 -@@ -363,6 +363,7 @@ - if [ "$CONFIG_ISDN" != "n" ]; then - source drivers/isdn/Config.in - fi -+ bool ' Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS - fi - endmenu - -Index: linux-bgl/arch/i386/vmlinux.lds -=================================================================== ---- linux-bgl.orig/arch/i386/vmlinux.lds 2003-07-02 08:44:32.000000000 -0700 -+++ linux-bgl/arch/i386/vmlinux.lds 2004-10-26 22:52:50.040677488 -0700 -@@ -27,6 +27,9 @@ - __start___ksymtab = .; /* Kernel symbol table */ - __ksymtab : { *(__ksymtab) } - __stop___ksymtab = .; -+ __start___kallsyms = .; /* All kernel symbols */ -+ 
__kallsyms : { *(__kallsyms) } -+ __stop___kallsyms = .; - - .data : { /* Data */ - *(.data) -Index: linux-bgl/arch/ia64/config.in -=================================================================== ---- linux-bgl.orig/arch/ia64/config.in 2003-07-02 08:44:12.000000000 -0700 -+++ linux-bgl/arch/ia64/config.in 2004-10-26 22:52:50.055675147 -0700 -@@ -278,4 +278,6 @@ - bool ' Turn on irq debug checks (slow!)' CONFIG_IA64_DEBUG_IRQ - fi - -+bool ' Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS -+ - endmenu -Index: linux-bgl/arch/alpha/vmlinux.lds.in -=================================================================== ---- linux-bgl.orig/arch/alpha/vmlinux.lds.in 2003-07-02 08:43:45.000000000 -0700 -+++ linux-bgl/arch/alpha/vmlinux.lds.in 2004-10-26 22:52:50.036678113 -0700 -@@ -28,6 +28,10 @@ - __stop___ksymtab = .; - .kstrtab : { *(.kstrtab) } - -+ __start___kallsyms = .; /* All kernel symbols */ -+ __kallsyms : { *(__kallsyms) } -+ __stop___kallsyms = .; -+ - /* Startup code */ - . 
= ALIGN(8192); - __init_begin = .; -Index: linux-bgl/Makefile -=================================================================== ---- linux-bgl.orig/Makefile 2004-10-04 09:55:49.000000000 -0700 -+++ linux-bgl/Makefile 2004-10-26 22:54:44.018588371 -0700 -@@ -38,10 +38,13 @@ - MAKEFILES = $(TOPDIR)/.config - GENKSYMS = /sbin/genksyms - DEPMOD = /sbin/depmod -+KALLSYMS = /sbin/kallsyms - MODFLAGS = -DMODULE - CFLAGS_KERNEL = - PERL = perl - -+TMPPREFIX = -+ - export VERSION PATCHLEVEL SUBLEVEL EXTRAVERSION KERNELRELEASE ARCH \ - CONFIG_SHELL TOPDIR HPATH HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC \ - CPP AR NM STRIP OBJCOPY OBJDUMP MAKE MAKEFILES GENKSYMS MODFLAGS PERL -@@ -198,7 +201,7 @@ - CLEAN_FILES = \ - kernel/ksyms.lst include/linux/compile.h \ - vmlinux System.map \ -- .tmp* \ -+ $(TMPPREFIX).tmp* \ - drivers/char/consolemap_deftbl.c drivers/video/promcon_tbl.c \ - drivers/char/conmakehash \ - drivers/char/drm/*-mod.c \ -@@ -278,16 +281,39 @@ - boot: vmlinux - @$(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" -C arch/$(ARCH)/boot - -+LD_VMLINUX := $(LD) $(LINKFLAGS) $(HEAD) init/main.o init/version.o init/do_mounts.o \ -+ --start-group \ -+ $(CORE_FILES) \ -+ $(DRIVERS) \ -+ $(NETWORKS) \ -+ $(LIBS) \ -+ --end-group -+ifeq ($(CONFIG_KALLSYMS),y) -+LD_VMLINUX_KALLSYMS := $(TMPPREFIX).tmp_kallsyms3.o -+else -+LD_VMLINUX_KALLSYMS := -+endif -+ - vmlinux: include/linux/version.h $(CONFIGURATION) init/main.o init/version.o init/do_mounts.o linuxsubdirs -- $(LD) $(LINKFLAGS) $(HEAD) init/main.o init/version.o init/do_mounts.o \ -- --start-group \ -- $(CORE_FILES) \ -- $(DRIVERS) \ -- $(NETWORKS) \ -- $(LIBS) \ -- --end-group \ -- -o vmlinux -+ @$(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" kallsyms -+ -+.PHONY: kallsyms -+ -+kallsyms: -+ifeq ($(CONFIG_KALLSYMS),y) -+ @echo kallsyms pass 1 -+ $(LD_VMLINUX) -o $(TMPPREFIX).tmp_vmlinux1 -+ @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux1 > $(TMPPREFIX).tmp_kallsyms1.o -+ @echo kallsyms pass 2 -+ @$(LD_VMLINUX) 
$(TMPPREFIX).tmp_kallsyms1.o -o $(TMPPREFIX).tmp_vmlinux2 -+ @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux2 > $(TMPPREFIX).tmp_kallsyms2.o -+ @echo kallsyms pass 3 -+ @$(LD_VMLINUX) $(TMPPREFIX).tmp_kallsyms2.o -o $(TMPPREFIX).tmp_vmlinux3 -+ @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux3 > $(TMPPREFIX).tmp_kallsyms3.o -+endif -+ $(LD_VMLINUX) $(LD_VMLINUX_KALLSYMS) -o vmlinux - $(NM) vmlinux | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw] \)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' | sort > System.map -+ @rm -f $(TMPPREFIX).tmp_vmlinux* $(TMPPREFIX).tmp_kallsyms* - - symlinks: - rm -f include/asm -Index: linux-bgl/kernel/Makefile -=================================================================== ---- linux-bgl.orig/kernel/Makefile 2003-07-02 08:44:29.000000000 -0700 -+++ linux-bgl/kernel/Makefile 2004-10-26 22:59:34.101037916 -0700 -@@ -19,6 +19,7 @@ - obj-$(CONFIG_UID16) += uid16.o - obj-$(CONFIG_MODULES) += ksyms.o - obj-$(CONFIG_PM) += pm.o -+obj-$(CONFIG_KALLSYMS) += kallsyms.o - - ifneq ($(CONFIG_IA64),y) - # According to Alan Modra , the -fno-omit-frame-pointer is -Index: linux-bgl/kernel/ksyms.c -=================================================================== ---- linux-bgl.orig/kernel/ksyms.c 2004-10-26 21:49:59.922431839 -0700 -+++ linux-bgl/kernel/ksyms.c 2004-10-26 22:52:50.050675927 -0700 -@@ -56,6 +56,9 @@ - #ifdef CONFIG_KMOD - #include - #endif -+#ifdef CONFIG_KALLSYMS -+#include -+#endif - - extern void set_device_ro(kdev_t dev,int flag); - -@@ -81,6 +84,15 @@ - EXPORT_SYMBOL(inter_module_put); - EXPORT_SYMBOL(try_inc_mod_count); - -+#ifdef CONFIG_KALLSYMS -+extern const char __start___kallsyms[]; -+extern const char __stop___kallsyms[]; -+EXPORT_SYMBOL(__start___kallsyms); -+EXPORT_SYMBOL(__stop___kallsyms); -+ -+ -+#endif -+ - /* process memory management */ - EXPORT_SYMBOL(do_mmap_pgoff); - EXPORT_SYMBOL(do_munmap); -Index: linux-bgl/kernel/kallsyms.c -=================================================================== ---- linux-bgl.orig/kernel/kallsyms.c 2004-10-26 
17:10:51.404753448 -0700 -+++ linux-bgl/kernel/kallsyms.c 2004-10-26 22:52:50.048676240 -0700 -@@ -0,0 +1,306 @@ -+/* An example of using kallsyms data in a kernel debugger. -+ -+ Copyright 2000 Keith Owens April 2000 -+ -+ This file is part of the Linux modutils. -+ -+ This program is free software; you can redistribute it and/or modify it -+ under the terms of the GNU General Public License as published by the -+ Free Software Foundation; either version 2 of the License, or (at your -+ option) any later version. -+ -+ This program is distributed in the hope that it will be useful, but -+ WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ General Public License for more details. -+ -+ You should have received a copy of the GNU General Public License -+ along with this program; if not, write to the Free Software Foundation, -+ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -+ */ -+ -+#ident "$Id: kallsyms-2.4-bgl.patch,v 1.1.20.2 2005/04/01 21:30:19 green Exp $" -+ -+/* -+ This code uses the list of all kernel and module symbols to :- -+ -+ * Find any non-stack symbol in a kernel or module. Symbols do -+ not have to be exported for debugging. -+ -+ * Convert an address to the module (or kernel) that owns it, the -+ section it is in and the nearest symbol. This finds all non-stack -+ symbols, not just exported ones. -+ -+ You need modutils >= 2.3.11 and a kernel with the kallsyms patch -+ which was compiled with CONFIG_KALLSYMS. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+ -+/* These external symbols are only set on kernels compiled with -+ * CONFIG_KALLSYMS. 
-+ */ -+ -+extern const char __start___kallsyms[]; -+extern const char __stop___kallsyms[]; -+ -+static struct module **kallsyms_module_list; -+ -+static void kallsyms_get_module_list(void) -+{ -+ const struct kallsyms_header *ka_hdr; -+ const struct kallsyms_section *ka_sec; -+ const struct kallsyms_symbol *ka_sym; -+ const char *ka_str; -+ int i; -+ const char *p; -+ -+ if (__start___kallsyms >= __stop___kallsyms) -+ return; -+ ka_hdr = (struct kallsyms_header *)__start___kallsyms; -+ ka_sec = (struct kallsyms_section *) -+ ((char *)(ka_hdr) + ka_hdr->section_off); -+ ka_sym = (struct kallsyms_symbol *) -+ ((char *)(ka_hdr) + ka_hdr->symbol_off); -+ ka_str = -+ ((char *)(ka_hdr) + ka_hdr->string_off); -+ -+ for (i = 0; i < ka_hdr->symbols; kallsyms_next_sym(ka_hdr, ka_sym), ++i) { -+ p = ka_str + ka_sym->name_off; -+ if (strcmp(p, "module_list") == 0) { -+ if (ka_sym->symbol_addr) -+ kallsyms_module_list = (struct module **)(ka_sym->symbol_addr); -+ break; -+ } -+ } -+} -+ -+static inline void kallsyms_do_first_time(void) -+{ -+ static int first_time = 1; -+ if (first_time) -+ kallsyms_get_module_list(); -+ first_time = 0; -+} -+ -+/* A symbol can appear in more than one module. A token is used to -+ * restart the scan at the next module, set the token to 0 for the -+ * first scan of each symbol. 
-+ */ -+ -+int kallsyms_symbol_to_address( -+ const char *name, /* Name to lookup */ -+ unsigned long *token, /* Which module to start at */ -+ const char **mod_name, /* Set to module name */ -+ unsigned long *mod_start, /* Set to start address of module */ -+ unsigned long *mod_end, /* Set to end address of module */ -+ const char **sec_name, /* Set to section name */ -+ unsigned long *sec_start, /* Set to start address of section */ -+ unsigned long *sec_end, /* Set to end address of section */ -+ const char **sym_name, /* Set to full symbol name */ -+ unsigned long *sym_start, /* Set to start address of symbol */ -+ unsigned long *sym_end /* Set to end address of symbol */ -+ ) -+{ -+ const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */ -+ const struct kallsyms_section *ka_sec; -+ const struct kallsyms_symbol *ka_sym = NULL; -+ const char *ka_str = NULL; -+ const struct module *m; -+ int i = 0, l; -+ const char *p, *pt_R; -+ char *p2; -+ -+ kallsyms_do_first_time(); -+ if (!kallsyms_module_list) -+ return(0); -+ -+ /* Restart? 
*/ -+ m = *kallsyms_module_list; -+ if (token && *token) { -+ for (; m; m = m->next) -+ if ((unsigned long)m == *token) -+ break; -+ if (m) -+ m = m->next; -+ } -+ -+ for (; m; m = m->next) { -+ if (!mod_member_present(m, kallsyms_start) || -+ !mod_member_present(m, kallsyms_end) || -+ m->kallsyms_start >= m->kallsyms_end) -+ continue; -+ ka_hdr = (struct kallsyms_header *)m->kallsyms_start; -+ ka_sym = (struct kallsyms_symbol *) -+ ((char *)(ka_hdr) + ka_hdr->symbol_off); -+ ka_str = -+ ((char *)(ka_hdr) + ka_hdr->string_off); -+ for (i = 0; i < ka_hdr->symbols; ++i, kallsyms_next_sym(ka_hdr, ka_sym)) { -+ p = ka_str + ka_sym->name_off; -+ if (strcmp(p, name) == 0) -+ break; -+ /* Unversioned requests match versioned names */ -+ if (!(pt_R = strstr(p, "_R"))) -+ continue; -+ l = strlen(pt_R); -+ if (l < 10) -+ continue; /* Not _R.*xxxxxxxx */ -+ (void)simple_strtoul(pt_R+l-8, &p2, 16); -+ if (*p2) -+ continue; /* Not _R.*xxxxxxxx */ -+ if (strncmp(p, name, pt_R-p) == 0) -+ break; /* Match with version */ -+ } -+ if (i < ka_hdr->symbols) -+ break; -+ } -+ -+ if (token) -+ *token = (unsigned long)m; -+ if (!m) -+ return(0); /* not found */ -+ -+ ka_sec = (const struct kallsyms_section *) -+ ((char *)ka_hdr + ka_hdr->section_off + ka_sym->section_off); -+ *mod_name = *(m->name) ? 
m->name : "kernel"; -+ *mod_start = ka_hdr->start; -+ *mod_end = ka_hdr->end; -+ *sec_name = ka_sec->name_off + ka_str; -+ *sec_start = ka_sec->start; -+ *sec_end = ka_sec->start + ka_sec->size; -+ *sym_name = ka_sym->name_off + ka_str; -+ *sym_start = ka_sym->symbol_addr; -+ if (i < ka_hdr->symbols-1) { -+ const struct kallsyms_symbol *ka_symn = ka_sym; -+ kallsyms_next_sym(ka_hdr, ka_symn); -+ *sym_end = ka_symn->symbol_addr; -+ } -+ else -+ *sym_end = *sec_end; -+ return(1); -+} -+ -+int kallsyms_address_to_symbol( -+ unsigned long address, /* Address to lookup */ -+ const char **mod_name, /* Set to module name */ -+ unsigned long *mod_start, /* Set to start address of module */ -+ unsigned long *mod_end, /* Set to end address of module */ -+ const char **sec_name, /* Set to section name */ -+ unsigned long *sec_start, /* Set to start address of section */ -+ unsigned long *sec_end, /* Set to end address of section */ -+ const char **sym_name, /* Set to full symbol name */ -+ unsigned long *sym_start, /* Set to start address of symbol */ -+ unsigned long *sym_end /* Set to end address of symbol */ -+ ) -+{ -+ const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */ -+ const struct kallsyms_section *ka_sec = NULL; -+ const struct kallsyms_symbol *ka_sym; -+ const char *ka_str; -+ const struct module *m; -+ int i; -+ unsigned long end; -+ -+ kallsyms_do_first_time(); -+ if (!kallsyms_module_list) -+ return(0); -+ -+ for (m = *kallsyms_module_list; m; m = m->next) { -+ if (!mod_member_present(m, kallsyms_start) || -+ !mod_member_present(m, kallsyms_end) || -+ m->kallsyms_start >= m->kallsyms_end) -+ continue; -+ ka_hdr = (struct kallsyms_header *)m->kallsyms_start; -+ ka_sec = (const struct kallsyms_section *) -+ ((char *)ka_hdr + ka_hdr->section_off); -+ /* Is the address in any section in this module? 
*/ -+ for (i = 0; i < ka_hdr->sections; ++i, kallsyms_next_sec(ka_hdr, ka_sec)) { -+ if (ka_sec->start <= address && -+ (ka_sec->start + ka_sec->size) > address) -+ break; -+ } -+ if (i < ka_hdr->sections) -+ break; /* Found a matching section */ -+ } -+ -+ if (!m) -+ return(0); /* not found */ -+ -+ ka_sym = (struct kallsyms_symbol *) -+ ((char *)(ka_hdr) + ka_hdr->symbol_off); -+ ka_str = -+ ((char *)(ka_hdr) + ka_hdr->string_off); -+ *mod_name = *(m->name) ? m->name : "kernel"; -+ *mod_start = ka_hdr->start; -+ *mod_end = ka_hdr->end; -+ *sec_name = ka_sec->name_off + ka_str; -+ *sec_start = ka_sec->start; -+ *sec_end = ka_sec->start + ka_sec->size; -+ *sym_name = *sec_name; /* In case we find no matching symbol */ -+ *sym_start = *sec_start; -+ *sym_end = *sec_end; -+ -+ for (i = 0; i < ka_hdr->symbols; ++i, kallsyms_next_sym(ka_hdr, ka_sym)) { -+ if (ka_sym->symbol_addr > address) -+ continue; -+ if (i < ka_hdr->symbols-1) { -+ const struct kallsyms_symbol *ka_symn = ka_sym; -+ kallsyms_next_sym(ka_hdr, ka_symn); -+ end = ka_symn->symbol_addr; -+ } -+ else -+ end = *sec_end; -+ if (end <= address) -+ continue; -+ if ((char *)ka_hdr + ka_hdr->section_off + ka_sym->section_off -+ != (char *)ka_sec) -+ continue; /* wrong section */ -+ *sym_name = ka_str + ka_sym->name_off; -+ *sym_start = ka_sym->symbol_addr; -+ *sym_end = end; -+ break; -+ } -+ return(1); -+} -+ -+/* List all sections in all modules. The callback routine is invoked with -+ * token, module name, section name, section start, section end, section flags. 
-+ */ -+int kallsyms_sections(void *token, -+ int (*callback)(void *, const char *, const char *, ElfW(Addr), ElfW(Addr), ElfW(Word))) -+{ -+ const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */ -+ const struct kallsyms_section *ka_sec = NULL; -+ const char *ka_str; -+ const struct module *m; -+ int i; -+ -+ kallsyms_do_first_time(); -+ if (!kallsyms_module_list) -+ return(0); -+ -+ for (m = *kallsyms_module_list; m; m = m->next) { -+ if (!mod_member_present(m, kallsyms_start) || -+ !mod_member_present(m, kallsyms_end) || -+ m->kallsyms_start >= m->kallsyms_end) -+ continue; -+ ka_hdr = (struct kallsyms_header *)m->kallsyms_start; -+ ka_sec = (const struct kallsyms_section *) ((char *)ka_hdr + ka_hdr->section_off); -+ ka_str = ((char *)(ka_hdr) + ka_hdr->string_off); -+ for (i = 0; i < ka_hdr->sections; ++i, kallsyms_next_sec(ka_hdr, ka_sec)) { -+ if (callback( -+ token, -+ *(m->name) ? m->name : "kernel", -+ ka_sec->name_off + ka_str, -+ ka_sec->start, -+ ka_sec->start + ka_sec->size, -+ ka_sec->flags)) -+ return(0); -+ } -+ } -+ return(1); -+} -Index: linux-bgl/include/linux/kallsyms.h -=================================================================== ---- linux-bgl.orig/include/linux/kallsyms.h 2004-10-26 17:10:51.404753448 -0700 -+++ linux-bgl/include/linux/kallsyms.h 2004-10-26 22:52:50.045676708 -0700 -@@ -0,0 +1,141 @@ -+/* kallsyms headers -+ Copyright 2000 Keith Owens -+ -+ This file is part of the Linux modutils. It is exported to kernel -+ space so debuggers can access the kallsyms data. -+ -+ The kallsyms data contains all the non-stack symbols from a kernel -+ or a module. The kernel symbols are held between __start___kallsyms -+ and __stop___kallsyms. The symbols for a module are accessed via -+ the struct module chain which is based at module_list. 
-+ -+ This program is free software; you can redistribute it and/or modify it -+ under the terms of the GNU General Public License as published by the -+ Free Software Foundation; either version 2 of the License, or (at your -+ option) any later version. -+ -+ This program is distributed in the hope that it will be useful, but -+ WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ General Public License for more details. -+ -+ You should have received a copy of the GNU General Public License -+ along with this program; if not, write to the Free Software Foundation, -+ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -+ */ -+ -+#ident "$Id: kallsyms-2.4-bgl.patch,v 1.1.20.2 2005/04/01 21:30:19 green Exp $" -+ -+#ifndef MODUTILS_KALLSYMS_H -+#define MODUTILS_KALLSYMS_H 1 -+ -+/* Have to (re)define these ElfW entries here because external kallsyms -+ * code does not have access to modutils/include/obj.h. This code is -+ * included from user spaces tools (modutils) and kernel, they need -+ * different includes. -+ */ -+ -+#ifndef ELFCLASS32 -+#ifdef __KERNEL__ -+#include -+#else /* __KERNEL__ */ -+#include -+#endif /* __KERNEL__ */ -+#endif /* ELFCLASS32 */ -+ -+#ifndef ELFCLASSM -+#define ELFCLASSM ELF_CLASS -+#endif -+ -+#ifndef ElfW -+# if ELFCLASSM == ELFCLASS32 -+# define ElfW(x) Elf32_ ## x -+# define ELFW(x) ELF32_ ## x -+# else -+# define ElfW(x) Elf64_ ## x -+# define ELFW(x) ELF64_ ## x -+# endif -+#endif -+ -+/* Format of data in the kallsyms section. -+ * Most of the fields are small numbers but the total size and all -+ * offsets can be large so use the 32/64 bit types for these fields. -+ * -+ * Do not use sizeof() on these structures, modutils may be using extra -+ * fields. Instead use the size fields in the header to access the -+ * other bits of data. 
-+ */ -+ -+struct kallsyms_header { -+ int size; /* Size of this header */ -+ ElfW(Word) total_size; /* Total size of kallsyms data */ -+ int sections; /* Number of section entries */ -+ ElfW(Off) section_off; /* Offset to first section entry */ -+ int section_size; /* Size of one section entry */ -+ int symbols; /* Number of symbol entries */ -+ ElfW(Off) symbol_off; /* Offset to first symbol entry */ -+ int symbol_size; /* Size of one symbol entry */ -+ ElfW(Off) string_off; /* Offset to first string */ -+ ElfW(Addr) start; /* Start address of first section */ -+ ElfW(Addr) end; /* End address of last section */ -+}; -+ -+struct kallsyms_section { -+ ElfW(Addr) start; /* Start address of section */ -+ ElfW(Word) size; /* Size of this section */ -+ ElfW(Off) name_off; /* Offset to section name */ -+ ElfW(Word) flags; /* Flags from section */ -+}; -+ -+struct kallsyms_symbol { -+ ElfW(Off) section_off; /* Offset to section that owns this symbol */ -+ ElfW(Addr) symbol_addr; /* Address of symbol */ -+ ElfW(Off) name_off; /* Offset to symbol name */ -+}; -+ -+#define KALLSYMS_SEC_NAME "__kallsyms" -+#define KALLSYMS_IDX 2 /* obj_kallsyms creates kallsyms as section 2 */ -+ -+#define kallsyms_next_sec(h,s) \ -+ ((s) = (struct kallsyms_section *)((char *)(s) + (h)->section_size)) -+#define kallsyms_next_sym(h,s) \ -+ ((s) = (struct kallsyms_symbol *)((char *)(s) + (h)->symbol_size)) -+ -+int kallsyms_symbol_to_address( -+ const char *name, /* Name to lookup */ -+ unsigned long *token, /* Which module to start with */ -+ const char **mod_name, /* Set to module name or "kernel" */ -+ unsigned long *mod_start, /* Set to start address of module */ -+ unsigned long *mod_end, /* Set to end address of module */ -+ const char **sec_name, /* Set to section name */ -+ unsigned long *sec_start, /* Set to start address of section */ -+ unsigned long *sec_end, /* Set to end address of section */ -+ const char **sym_name, /* Set to full symbol name */ -+ unsigned long *sym_start, /* 
Set to start address of symbol */ -+ unsigned long *sym_end /* Set to end address of symbol */ -+ ); -+ -+int kallsyms_address_to_symbol( -+ unsigned long address, /* Address to lookup */ -+ const char **mod_name, /* Set to module name */ -+ unsigned long *mod_start, /* Set to start address of module */ -+ unsigned long *mod_end, /* Set to end address of module */ -+ const char **sec_name, /* Set to section name */ -+ unsigned long *sec_start, /* Set to start address of section */ -+ unsigned long *sec_end, /* Set to end address of section */ -+ const char **sym_name, /* Set to full symbol name */ -+ unsigned long *sym_start, /* Set to start address of symbol */ -+ unsigned long *sym_end /* Set to end address of symbol */ -+ ); -+ -+int kallsyms_sections(void *token, -+ int (*callback)(void *, /* token */ -+ const char *, /* module name */ -+ const char *, /* section name */ -+ ElfW(Addr), /* Section start */ -+ ElfW(Addr), /* Section end */ -+ ElfW(Word) /* Section flags */ -+ ) -+ ); -+ -+#endif /* kallsyms.h */ diff --git a/lustre/kernel_patches/patches/kksymoops-2.4-bgl.patch b/lustre/kernel_patches/patches/kksymoops-2.4-bgl.patch deleted file mode 100644 index 9d33973..0000000 --- a/lustre/kernel_patches/patches/kksymoops-2.4-bgl.patch +++ /dev/null @@ -1,678 +0,0 @@ -Index: linux-bgl/arch/i386/kernel/traps.c -=================================================================== ---- linux-bgl.orig/arch/i386/kernel/traps.c 2003-07-02 08:43:23.000000000 -0700 -+++ linux-bgl/arch/i386/kernel/traps.c 2004-10-26 23:25:17.950442396 -0700 -@@ -24,6 +24,7 @@ - #include - #include - #include -+#include - - #ifdef CONFIG_MCA - #include -@@ -135,6 +136,8 @@ - { - int i; - unsigned long addr; -+ /* static to not take up stackspace; if we race here too bad */ -+ static char buffer[512]; - - if (!stack) - stack = (unsigned long*)&stack; -@@ -144,9 +147,8 @@ - while (((long) stack & (THREAD_SIZE-1)) != 0) { - addr = *stack++; - if (kernel_text_address(addr)) { -- if (i && ((i 
% 6) == 0)) -- printk("\n "); -- printk(" [<%08lx>]", addr); -+ lookup_symbol(addr, buffer, 512); -+ printk("[<%08lx>] %s (0x%p)\n", addr,buffer,stack-1); - i++; - } - } -@@ -186,12 +188,19 @@ - show_trace(esp); - } - -+#ifdef CONFIG_MK7 -+#define ARCHIT "/athlon" -+#else -+#define ARCHIT "/i686" -+#endif -+ - void show_registers(struct pt_regs *regs) - { - int i; - int in_kernel = 1; - unsigned long esp; - unsigned short ss; -+ static char buffer[512]; - - esp = (unsigned long) (®s->esp); - ss = __KERNEL_DS; -@@ -200,8 +209,12 @@ - esp = regs->esp; - ss = regs->xss & 0xffff; - } -+ -+ print_modules(); -+ lookup_symbol(regs->eip, buffer, 512); - printk("CPU: %d\nEIP: %04x:[<%08lx>] %s\nEFLAGS: %08lx\n", - smp_processor_id(), 0xffff & regs->xcs, regs->eip, print_tainted(), regs->eflags); -+ printk("\nEIP is at %s (" UTS_RELEASE ARCHIT ")\n",buffer); - printk("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", - regs->eax, regs->ebx, regs->ecx, regs->edx); - printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", -@@ -261,7 +274,7 @@ - if (__get_user(file, (char **)(eip + 4)) || - (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) - file = ""; -- -+ printk("------------[ cut here ]------------\n"); - printk("kernel BUG at %s:%d!\n", file, line); - - no_bug: -Index: linux-bgl/arch/i386/kernel/process.c -=================================================================== ---- linux-bgl.orig/arch/i386/kernel/process.c 2003-07-02 08:44:07.000000000 -0700 -+++ linux-bgl/arch/i386/kernel/process.c 2004-10-26 23:28:53.017015082 -0700 -@@ -33,6 +33,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -437,10 +438,14 @@ - void show_regs(struct pt_regs * regs) - { - unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; -+ static char buffer[512]; -+ -+ lookup_symbol(regs->eip, buffer, 512); - - printk("\n"); - printk("Pid: %d, comm: %20s\n", current->pid, current->comm); - printk("EIP: %04x:[<%08lx>] CPU: %d",0xffff & regs->xcs,regs->eip, 
smp_processor_id()); -+ printk("\nEIP is at %s (" UTS_RELEASE ")\n", buffer); - if (regs->xcs & 3) - printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); - printk(" EFLAGS: %08lx %s\n",regs->eflags, print_tainted()); -Index: linux-bgl/arch/ia64/kernel/process.c -=================================================================== ---- linux-bgl.orig/arch/ia64/kernel/process.c 2003-07-02 08:43:26.000000000 -0700 -+++ linux-bgl/arch/ia64/kernel/process.c 2004-10-26 23:29:56.340005959 -0700 -@@ -18,6 +18,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -33,9 +34,10 @@ - #include - #endif - --static void --do_show_stack (struct unw_frame_info *info, void *arg) -+void -+ia64_do_show_stack (struct unw_frame_info *info, void *arg) - { -+ static char buffer[512]; - unsigned long ip, sp, bsp; - - printk("\nCall Trace: "); -@@ -46,7 +48,8 @@ - - unw_get_sp(info, &sp); - unw_get_bsp(info, &bsp); -- printk("[<%016lx>] sp=0x%016lx bsp=0x%016lx\n", ip, sp, bsp); -+ lookup_symbol(ip, buffer, 512); -+ printk("[<%016lx>] sp=0x%016lx bsp=0x%016lx %s\n", ip, sp, bsp, buffer); - } while (unw_unwind(info) >= 0); - } - -@@ -56,19 +59,19 @@ - struct unw_frame_info info; - - unw_init_from_blocked_task(&info, task); -- do_show_stack(&info, 0); -+ ia64_do_show_stack(&info, 0); - } - - void - show_stack (struct task_struct *task) - { - if (!task) -- unw_init_running(do_show_stack, 0); -+ unw_init_running(ia64_do_show_stack, 0); - else { - struct unw_frame_info info; - - unw_init_from_blocked_task(&info, task); -- do_show_stack(&info, 0); -+ ia64_do_show_stack(&info, 0); - } - } - -@@ -76,8 +79,11 @@ - show_regs (struct pt_regs *regs) - { - unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; -+ static char buffer[512]; - - printk("\nPid: %d, comm: %20s\n", current->pid, current->comm); -+ lookup_symbol(ip, buffer, 512); -+ printk("EIP is at %s (" UTS_RELEASE ")\n", buffer); - printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n", - regs->cr_ipsr, 
regs->cr_ifs, ip, print_tainted()); - printk("unat: %016lx pfs : %016lx rsc : %016lx\n", -Index: linux-bgl/arch/s390/config.in -=================================================================== ---- linux-bgl.orig/arch/s390/config.in 2003-07-02 08:43:27.000000000 -0700 -+++ linux-bgl/arch/s390/config.in 2004-10-26 23:25:17.961440685 -0700 -@@ -73,5 +73,6 @@ - # bool 'Remote GDB kernel debugging' CONFIG_REMOTE_DEBUG - #fi - bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ -+bool 'Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS - endmenu - -Index: linux-bgl/arch/s390/kernel/traps.c -=================================================================== ---- linux-bgl.orig/arch/s390/kernel/traps.c 2003-07-02 08:44:02.000000000 -0700 -+++ linux-bgl/arch/s390/kernel/traps.c 2004-10-26 23:25:17.964440218 -0700 -@@ -27,6 +27,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -108,27 +109,26 @@ - - void show_trace(unsigned long * stack) - { -+ static char buffer[512]; - unsigned long backchain, low_addr, high_addr, ret_addr; - int i; - - if (!stack) - stack = (unsigned long*)&stack; - -- printk("Call Trace: "); - low_addr = ((unsigned long) stack) & PSW_ADDR_MASK; - high_addr = (low_addr & (-THREAD_SIZE)) + THREAD_SIZE; - /* Skip the first frame (biased stack) */ - backchain = *((unsigned long *) low_addr) & PSW_ADDR_MASK; -- /* Print up to 8 lines */ -- for (i = 0; i < 8; i++) { -+ /* Print up to 20 lines */ -+ for (i = 0; i < 20; i++) { - if (backchain < low_addr || backchain >= high_addr) - break; - ret_addr = *((unsigned long *) (backchain+56)) & PSW_ADDR_MASK; - if (!kernel_text_address(ret_addr)) - break; -- if (i && ((i % 6) == 0)) -- printk("\n "); -- printk("[<%08lx>] ", ret_addr); -+ lookup_symbol(ret_addr, buffer, 512); -+ printk("[<%08lx>] %s (0x%lx)\n", ret_addr,buffer,backchain+56); - low_addr = backchain; - backchain = *((unsigned long *) backchain) & PSW_ADDR_MASK; - } -@@ -171,6 +171,7 @@ - - void show_registers(struct pt_regs 
*regs) - { -+ static char buffer[512]; - mm_segment_t old_fs; - char *mode; - int i; -@@ -179,6 +180,10 @@ - printk("%s PSW : %08lx %08lx\n", - mode, (unsigned long) regs->psw.mask, - (unsigned long) regs->psw.addr); -+ if (!(regs->psw.mask & PSW_PROBLEM_STATE)) { -+ lookup_symbol(regs->psw.addr & 0x7FFFFFFF, buffer, 512); -+ printk(" %s (" UTS_RELEASE ")\n", buffer); -+ } - printk("%s GPRS: %08x %08x %08x %08x\n", mode, - regs->gprs[0], regs->gprs[1], regs->gprs[2], regs->gprs[3]); - printk(" %08x %08x %08x %08x\n", -Index: linux-bgl/arch/s390x/config.in -=================================================================== ---- linux-bgl.orig/arch/s390x/config.in 2003-07-02 08:43:07.000000000 -0700 -+++ linux-bgl/arch/s390x/config.in 2004-10-26 23:25:17.964440218 -0700 -@@ -75,5 +75,6 @@ - # bool 'Remote GDB kernel debugging' CONFIG_REMOTE_DEBUG - #fi - bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ -+bool 'Load all symbols for debugging/kksymoops' CONFIG_KALLSYMS - endmenu - -Index: linux-bgl/arch/s390x/kernel/traps.c -=================================================================== ---- linux-bgl.orig/arch/s390x/kernel/traps.c 2003-07-02 08:43:25.000000000 -0700 -+++ linux-bgl/arch/s390x/kernel/traps.c 2004-10-26 23:25:17.966439907 -0700 -@@ -27,6 +27,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -112,25 +113,25 @@ - { - unsigned long backchain, low_addr, high_addr, ret_addr; - int i; -+ /* static to not take up stackspace; if we race here too bad */ -+ static char buffer[512]; - - if (!stack) - stack = (unsigned long*)&stack; - -- printk("Call Trace: "); - low_addr = ((unsigned long) stack) & PSW_ADDR_MASK; - high_addr = (low_addr & (-THREAD_SIZE)) + THREAD_SIZE; - /* Skip the first frame (biased stack) */ - backchain = *((unsigned long *) low_addr) & PSW_ADDR_MASK; -- /* Print up to 8 lines */ -- for (i = 0; i < 8; i++) { -+ /* Print up to 20 lines */ -+ for (i = 0; i < 20; i++) { - if (backchain < low_addr || backchain >= 
high_addr) - break; - ret_addr = *((unsigned long *) (backchain+112)) & PSW_ADDR_MASK; - if (!kernel_text_address(ret_addr)) - break; -- if (i && ((i % 3) == 0)) -- printk("\n "); -- printk("[<%016lx>] ", ret_addr); -+ lookup_symbol(ret_addr, buffer, 512); -+ printk("[<%016lx>] %s (0x%lx)\n", ret_addr, buffer, backchain+112); - low_addr = backchain; - backchain = *((unsigned long *) backchain) & PSW_ADDR_MASK; - } -@@ -173,6 +174,7 @@ - - void show_registers(struct pt_regs *regs) - { -+ static char buffer[512]; - mm_segment_t old_fs; - char *mode; - int i; -@@ -181,6 +183,10 @@ - printk("%s PSW : %016lx %016lx\n", - mode, (unsigned long) regs->psw.mask, - (unsigned long) regs->psw.addr); -+ if (!(regs->psw.mask & PSW_PROBLEM_STATE)) { -+ lookup_symbol(regs->psw.addr, buffer, 512); -+ printk(" %s (" UTS_RELEASE ")\n", buffer); -+ } - printk("%s GPRS: %016lx %016lx %016lx %016lx\n", mode, - regs->gprs[0], regs->gprs[1], regs->gprs[2], regs->gprs[3]); - printk(" %016lx %016lx %016lx %016lx\n", -Index: linux-bgl/arch/ppc64/mm/fault.c -=================================================================== ---- linux-bgl.orig/arch/ppc64/mm/fault.c 2003-07-02 08:43:12.000000000 -0700 -+++ linux-bgl/arch/ppc64/mm/fault.c 2004-10-26 23:30:24.467942247 -0700 -@@ -224,7 +224,6 @@ - if (debugger_kernel_faults) - debugger(regs); - #endif -- print_backtrace( (unsigned long *)regs->gpr[1] ); - panic("kernel access of bad area pc %lx lr %lx address %lX tsk %s/%d", - regs->nip,regs->link,address,current->comm,current->pid); - } -Index: linux-bgl/arch/ppc64/kernel/traps.c -=================================================================== ---- linux-bgl.orig/arch/ppc64/kernel/traps.c 2003-07-02 08:44:03.000000000 -0700 -+++ linux-bgl/arch/ppc64/kernel/traps.c 2004-10-26 23:33:45.297572484 -0700 -@@ -89,7 +89,6 @@ - #if defined(CONFIG_KDB) - kdb(KDB_REASON_OOPS, 0, (kdb_eframe_t) regs); - #endif -- print_backtrace((unsigned long *)regs->gpr[1]); - panic("Exception in kernel pc %lx 
signal %d",regs->nip,signr); - #if defined(CONFIG_PPCDBG) && (defined(CONFIG_XMON) || defined(CONFIG_KGDB)) - /* Allow us to catch SIGILLs for 64-bit app/glibc debugging. -Peter */ -@@ -187,7 +186,6 @@ - if (kdb(KDB_REASON_FAULT, 0, regs)) - return ; - #endif -- print_backtrace((unsigned long *)regs->gpr[1]); - panic("machine check"); - } - _exception(SIGSEGV, regs); -@@ -209,7 +207,6 @@ - } - #endif - show_regs(regs); -- print_backtrace((unsigned long *)regs->gpr[1]); - panic("System Management Interrupt"); - } - -Index: linux-bgl/arch/ppc64/kernel/process.c -=================================================================== ---- linux-bgl.orig/arch/ppc64/kernel/process.c 2003-07-02 08:44:31.000000000 -0700 -+++ linux-bgl/arch/ppc64/kernel/process.c 2004-10-26 23:33:01.060713583 -0700 -@@ -30,6 +30,8 @@ - #include - #include - #include -+#include -+#include - - #include - #include -@@ -130,12 +132,61 @@ - __restore_flags(s); - } - -+/* -+ * If the address is either in the .text section of the -+ * kernel, or in the vmalloc'ed module regions, it *may* -+ * be the address of a calling routine -+ */ -+ -+#ifdef CONFIG_MODULES -+ -+extern struct module *module_list; -+extern struct module kernel_module; -+extern char _stext[], _etext[]; -+ -+static inline int kernel_text_address(unsigned long addr) -+{ -+ int retval = 0; -+ struct module *mod; -+ -+ if (addr >= (unsigned long) &_stext && -+ addr <= (unsigned long) &_etext) -+ return 1; -+ -+ for (mod = module_list; mod != &kernel_module; mod = mod->next) { -+ /* mod_bound tests for addr being inside the vmalloc'ed -+ * module area. Of course it'd be better to test only -+ * for the .text subset... 
*/ -+ if (mod_bound(addr, 0, mod)) { -+ retval = 1; -+ break; -+ } -+ } -+ -+ return retval; -+} -+ -+#else -+ -+static inline int kernel_text_address(unsigned long addr) -+{ -+ return (addr >= (unsigned long) &_stext && -+ addr <= (unsigned long) &_etext); -+} -+ -+#endif -+ -+ - void show_regs(struct pt_regs * regs) - { - int i; -+ static char buffer[512]; - -- printk("NIP: %016lX XER: %016lX LR: %016lX REGS: %p TRAP: %04lx %s\n", -+ print_modules(); -+ printk("NIP: %016lx XER: %016lx LR: %016lx REGS: %p TRAP: %04lx %s\n", - regs->nip, regs->xer, regs->link, regs,regs->trap, print_tainted()); -+ lookup_symbol(regs->nip, buffer, 512); -+ printk("NIP is at %s (" UTS_RELEASE ")\n", buffer); - printk("MSR: %016lx EE: %01x PR: %01x FP: %01x ME: %01x IR/DR: %01x%01x\n", - regs->msr, regs->msr&MSR_EE ? 1 : 0, regs->msr&MSR_PR ? 1 : 0, - regs->msr & MSR_FP ? 1 : 0,regs->msr&MSR_ME ? 1 : 0, -@@ -147,27 +198,22 @@ - printk("\nlast math %p ", last_task_used_math); - - #ifdef CONFIG_SMP -- /* printk(" CPU: %d last CPU: %d", current->processor,current->last_processor); */ -+ printk("CPU: %d", smp_processor_id()); - #endif /* CONFIG_SMP */ - -- printk("\n"); - for (i = 0; i < 32; i++) - { - long r; - if ((i % 4) == 0) -- { -- printk("GPR%02d: ", i); -- } -+ printk("\nGPR%02d: ", i); - - if ( __get_user(r, &(regs->gpr[i])) ) - return; - -- printk("%016lX ", r); -- if ((i % 4) == 3) -- { -- printk("\n"); -- } -+ printk("%016lx ", r); - } -+ printk("\n"); -+ print_backtrace((unsigned long *)regs->gpr[1]); - } - - void exit_thread(void) -@@ -415,67 +461,24 @@ - } - } - --extern char _stext[], _etext[]; -- --char * ppc_find_proc_name( unsigned * p, char * buf, unsigned buflen ) --{ -- unsigned long tb_flags; -- unsigned short name_len; -- unsigned long tb_start, code_start, code_ptr, code_offset; -- unsigned code_len; -- strcpy( buf, "Unknown" ); -- code_ptr = (unsigned long)p; -- code_offset = 0; -- if ( ( (unsigned long)p >= (unsigned long)_stext ) && ( (unsigned long)p <= 
(unsigned long)_etext ) ) { -- while ( (unsigned long)p <= (unsigned long)_etext ) { -- if ( *p == 0 ) { -- tb_start = (unsigned long)p; -- ++p; /* Point to traceback flags */ -- tb_flags = *((unsigned long *)p); -- p += 2; /* Skip over traceback flags */ -- if ( tb_flags & TB_NAME_PRESENT ) { -- if ( tb_flags & TB_PARMINFO ) -- ++p; /* skip over parminfo data */ -- if ( tb_flags & TB_HAS_TBOFF ) { -- code_len = *p; /* get code length */ -- code_start = tb_start - code_len; -- code_offset = code_ptr - code_start + 1; -- if ( code_offset > 0x100000 ) -- break; -- ++p; /* skip over code size */ -- } -- name_len = *((unsigned short *)p); -- if ( name_len > (buflen-20) ) -- name_len = buflen-20; -- memcpy( buf, ((char *)p)+2, name_len ); -- buf[name_len] = 0; -- if ( code_offset ) -- sprintf( buf+name_len, "+0x%lx", code_offset-1 ); -- } -- break; -- } -- ++p; -- } -- } -- return buf; --} -- - void - print_backtrace(unsigned long *sp) - { - int cnt = 0; - unsigned long i; -- char name_buf[256]; -+ char buffer[512]; - -- printk("Call backtrace: \n"); -+ printk("Call Trace: \n"); - while (sp) { - if (__get_user( i, &sp[2] )) - break; -- printk("%016lX ", i); -- printk("%s\n", ppc_find_proc_name( (unsigned *)i, name_buf, 256 )); -+ if (kernel_text_address(i)) { -+ if (__get_user(sp, (unsigned long **)sp)) -+ break; -+ lookup_symbol(i, buffer, 512); -+ printk("[<%016lx>] %s\n", i, buffer); -+ } - if (cnt > 32) break; -- if (__get_user(sp, (unsigned long **)sp)) -- break; - } - printk("\n"); - } -@@ -515,6 +518,7 @@ - unsigned long ip, sp; - unsigned long stack_page = (unsigned long)p; - int count = 0; -+ static char buffer[512]; - - if (!p) - return; -@@ -528,7 +532,8 @@ - break; - if (count > 0) { - ip = *(unsigned long *)(sp + 16); -- printk("[%016lx] ", ip); -+ lookup_symbol(ip, buffer, 512); -+ printk("[<%016lx>] %s\n", ip, buffer); - } - } while (count++ < 16); - printk("\n"); -Index: linux-bgl/kernel/Makefile 
-=================================================================== ---- linux-bgl.orig/kernel/Makefile 2004-10-26 23:23:00.516655289 -0700 -+++ linux-bgl/kernel/Makefile 2004-10-26 23:35:04.930451186 -0700 -@@ -14,7 +14,7 @@ - obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \ - module.o exit.o itimer.o info.o time.o softirq.o resource.o \ - sysctl.o acct.o capability.o ptrace.o timer.o user.o \ -- signal.o sys.o kmod.o context.o -+ signal.o sys.o kmod.o context.o kksymoops.o - - obj-$(CONFIG_UID16) += uid16.o - obj-$(CONFIG_MODULES) += ksyms.o -Index: linux-bgl/kernel/kksymoops.c -=================================================================== ---- linux-bgl.orig/kernel/kksymoops.c 2004-10-26 17:10:51.404753448 -0700 -+++ linux-bgl/kernel/kksymoops.c 2004-10-26 23:25:17.971439129 -0700 -@@ -0,0 +1,82 @@ -+#include -+#include -+#include -+#include -+#include -+#ifdef CONFIG_KALLSYMS -+#include -+#endif -+ -+ -+ -+int lookup_symbol(unsigned long address, char *buffer, int buflen) -+{ -+ struct module *this_mod; -+ unsigned long bestsofar; -+ -+ const char *mod_name = NULL, *sec_name = NULL, *sym_name = NULL; -+ unsigned long mod_start,mod_end,sec_start,sec_end,sym_start,sym_end; -+ -+ if (!buffer) -+ return -EFAULT; -+ -+ if (buflen<256) -+ return -ENOMEM; -+ -+ memset(buffer,0,buflen); -+ -+#ifdef CONFIG_KALLSYMS -+ if (!kallsyms_address_to_symbol(address,&mod_name,&mod_start,&mod_end,&sec_name, -+ &sec_start, &sec_end, &sym_name, &sym_start, &sym_end)) { -+ /* kallsyms doesn't have a clue; lets try harder */ -+ bestsofar = 0; -+ snprintf(buffer,buflen-1,"[unresolved]"); -+ -+ this_mod = module_list; -+ -+ while (this_mod != NULL) { -+ int i; -+ /* walk the symbol list of this module. 
Only symbols -+ who's address is smaller than the searched for address -+ are relevant; and only if it's better than the best so far */ -+ for (i=0; i< this_mod->nsyms; i++) -+ if ((this_mod->syms[i].value<=address) && -+ (bestsofarsyms[i].value)) { -+ snprintf(buffer,buflen-1,"%s [%s] 0x%x", -+ this_mod->syms[i].name, -+ this_mod->name, -+ (unsigned int)(address - this_mod->syms[i].value)); -+ bestsofar = this_mod->syms[i].value; -+ } -+ this_mod = this_mod->next; -+ } -+ -+ } else { /* kallsyms success */ -+ snprintf(buffer,buflen-1,"%s [%s] 0x%x",sym_name,mod_name,(unsigned int)(address-sym_start)); -+ } -+#endif -+ return strlen(buffer); -+} -+ -+static char modlist[4096]; -+/* this function isn't smp safe but that's not really a problem; it's called from -+ * oops context only and any locking could actually prevent the oops from going out; -+ * the line that is generated is informational only and should NEVER prevent the real oops -+ * from going out. -+ */ -+void print_modules(void) -+{ -+ struct module *this_mod; -+ int pos = 0, i; -+ memset(modlist,0,4096); -+ -+#ifdef CONFIG_KALLSYMS -+ this_mod = module_list; -+ while (this_mod != NULL) { -+ if (this_mod->name != NULL) -+ pos +=snprintf(modlist+pos,160-pos-1,"%s ",this_mod->name); -+ this_mod = this_mod->next; -+ } -+ printk("%s\n",modlist); -+#endif -+} -Index: linux-bgl/include/linux/kernel.h -=================================================================== ---- linux-bgl.orig/include/linux/kernel.h 2003-07-02 08:44:16.000000000 -0700 -+++ linux-bgl/include/linux/kernel.h 2004-10-26 23:25:17.968439596 -0700 -@@ -107,6 +107,9 @@ - extern int tainted; - extern const char *print_tainted(void); - -+extern int lookup_symbol(unsigned long address, char *buffer, int buflen); -+extern void print_modules(void); -+ - #if DEBUG - #define pr_debug(fmt,arg...) 
\ - printk(KERN_DEBUG fmt,##arg) diff --git a/lustre/kernel_patches/patches/linux-2.4.18-netdump.patch b/lustre/kernel_patches/patches/linux-2.4.18-netdump.patch deleted file mode 100644 index f8db708..0000000 --- a/lustre/kernel_patches/patches/linux-2.4.18-netdump.patch +++ /dev/null @@ -1,1842 +0,0 @@ -Index: linux-2.4.24/arch/i386/kernel/i386_ksyms.c -=================================================================== ---- linux-2.4.24.orig/arch/i386/kernel/i386_ksyms.c 2003-11-28 13:26:19.000000000 -0500 -+++ linux-2.4.24/arch/i386/kernel/i386_ksyms.c 2004-05-07 16:58:39.000000000 -0400 -@@ -186,3 +186,8 @@ - EXPORT_SYMBOL(edd); - EXPORT_SYMBOL(eddnr); - #endif -+ -+EXPORT_SYMBOL_GPL(show_mem); -+EXPORT_SYMBOL_GPL(show_state); -+EXPORT_SYMBOL_GPL(show_regs); -+ -Index: linux-2.4.24/arch/i386/kernel/process.c -=================================================================== ---- linux-2.4.24.orig/arch/i386/kernel/process.c 2003-11-28 13:26:19.000000000 -0500 -+++ linux-2.4.24/arch/i386/kernel/process.c 2004-05-07 17:08:18.000000000 -0400 -@@ -400,7 +400,8 @@ - * Stop all CPUs and turn off local APICs and the IO-APIC, so - * other OSs see a clean IRQ state. 
- */ -- smp_send_stop(); -+ if (!netdump_func) -+ smp_send_stop(); - #elif CONFIG_X86_LOCAL_APIC - if (cpu_has_apic) { - __cli(); -Index: linux-2.4.24/arch/i386/kernel/traps.c -=================================================================== ---- linux-2.4.24.orig/arch/i386/kernel/traps.c 2004-05-07 16:57:00.000000000 -0400 -+++ linux-2.4.24/arch/i386/kernel/traps.c 2004-05-07 17:09:17.000000000 -0400 -@@ -280,6 +280,9 @@ - printk("Kernel BUG\n"); - } - -+void (*netdump_func) (struct pt_regs *regs) = NULL; -+int netdump_mode = 0; -+ - spinlock_t die_lock = SPIN_LOCK_UNLOCKED; - - void die(const char * str, struct pt_regs * regs, long err) -@@ -290,6 +293,8 @@ - handle_BUG(regs); - printk("%s: %04lx\n", str, err & 0xffff); - show_registers(regs); -+ if (netdump_func) -+ netdump_func(regs); - bust_spinlocks(0); - spin_unlock_irq(&die_lock); - do_exit(SIGSEGV); -@@ -1041,3 +1046,9 @@ - - EXPORT_SYMBOL_GPL(is_kernel_text_address); - EXPORT_SYMBOL_GPL(lookup_symbol); -+ -+EXPORT_SYMBOL_GPL(netdump_func); -+EXPORT_SYMBOL_GPL(netdump_mode); -+#if CONFIG_X86_LOCAL_APIC -+EXPORT_SYMBOL_GPL(nmi_watchdog); -+#endif -Index: linux-2.4.24/arch/x86_64/kernel/x8664_ksyms.c -=================================================================== ---- linux-2.4.24.orig/arch/x86_64/kernel/x8664_ksyms.c 2003-11-28 13:26:19.000000000 -0500 -+++ linux-2.4.24/arch/x86_64/kernel/x8664_ksyms.c 2004-05-07 17:01:51.000000000 -0400 -@@ -41,6 +41,9 @@ - EXPORT_SYMBOL(drive_info); - #endif - -+//extern void (*netdump_func) (struct pt_regs *regs) = NULL; -+int netdump_mode = 0; -+ - /* platform dependent support */ - EXPORT_SYMBOL(boot_cpu_data); - EXPORT_SYMBOL(dump_fpu); -@@ -229,3 +232,6 @@ - EXPORT_SYMBOL(touch_nmi_watchdog); - - EXPORT_SYMBOL(do_fork); -+ -+EXPORT_SYMBOL_GPL(netdump_func); -+EXPORT_SYMBOL_GPL(netdump_mode); -Index: linux-2.4.24/drivers/net/3c59x.c -=================================================================== ---- linux-2.4.24.orig/drivers/net/3c59x.c 2003-11-28 
13:26:20.000000000 -0500 -+++ linux-2.4.24/drivers/net/3c59x.c 2004-05-07 17:01:00.000000000 -0400 -@@ -874,6 +874,7 @@ - static int vortex_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); - static void vortex_tx_timeout(struct net_device *dev); - static void acpi_set_WOL(struct net_device *dev); -+static void vorboom_poll(struct net_device *dev); - static struct ethtool_ops vortex_ethtool_ops; - - /* This driver uses 'options' to pass the media type, full-duplex flag, etc. */ -@@ -1343,6 +1344,9 @@ - dev->set_multicast_list = set_rx_mode; - dev->tx_timeout = vortex_tx_timeout; - dev->watchdog_timeo = (watchdog * HZ) / 1000; -+#ifdef HAVE_POLL_CONTROLLER -+ dev->poll_controller = &vorboom_poll; -+#endif - if (pdev && vp->enable_wol) { - vp->pm_state_valid = 1; - pci_save_state(vp->pdev, vp->power_state); -@@ -2322,6 +2326,29 @@ - spin_unlock(&vp->lock); - } - -+#ifdef HAVE_POLL_CONTROLLER -+ -+/* -+ * Polling 'interrupt' - used by things like netconsole to send skbs -+ * without having to re-enable interrupts. It's not called while -+ * the interrupt routine is executing. 
-+ */ -+ -+static void vorboom_poll (struct net_device *dev) -+{ -+ struct vortex_private *vp = (struct vortex_private *)dev->priv; -+ -+ if (!netdump_mode) disable_irq(dev->irq); -+ if (vp->full_bus_master_tx) -+ boomerang_interrupt(dev->irq, dev, 0); -+ else -+ vortex_interrupt(dev->irq, dev, 0); -+ if (!netdump_mode) enable_irq(dev->irq); -+} -+ -+#endif -+ -+ - static int vortex_rx(struct net_device *dev) - { - struct vortex_private *vp = (struct vortex_private *)dev->priv; -Index: linux-2.4.24/drivers/net/Config.in -=================================================================== ---- linux-2.4.24.orig/drivers/net/Config.in 2003-11-28 13:26:20.000000000 -0500 -+++ linux-2.4.24/drivers/net/Config.in 2004-05-07 16:58:39.000000000 -0400 -@@ -295,6 +295,8 @@ - dep_tristate ' SysKonnect FDDI PCI support' CONFIG_SKFP $CONFIG_PCI - fi - -+tristate 'Network logging support' CONFIG_NETCONSOLE -+ - if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then - if [ "$CONFIG_INET" = "y" ]; then - bool 'HIPPI driver support (EXPERIMENTAL)' CONFIG_HIPPI -Index: linux-2.4.24/drivers/net/eepro100.c -=================================================================== ---- linux-2.4.24.orig/drivers/net/eepro100.c 2003-08-25 07:44:42.000000000 -0400 -+++ linux-2.4.24/drivers/net/eepro100.c 2004-05-07 16:58:39.000000000 -0400 -@@ -543,6 +543,7 @@ - static int speedo_rx(struct net_device *dev); - static void speedo_tx_buffer_gc(struct net_device *dev); - static void speedo_interrupt(int irq, void *dev_instance, struct pt_regs *regs); -+static void poll_speedo (struct net_device *dev); - static int speedo_close(struct net_device *dev); - static struct net_device_stats *speedo_get_stats(struct net_device *dev); - static int speedo_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); -@@ -879,6 +880,9 @@ - dev->get_stats = &speedo_get_stats; - dev->set_multicast_list = &set_rx_mode; - dev->do_ioctl = &speedo_ioctl; -+#ifdef HAVE_POLL_CONTROLLER -+ dev->poll_controller = &poll_speedo; -+#endif 
- - return 0; - } -@@ -1176,10 +1180,8 @@ - - - /* Media monitoring and control. */ --static void speedo_timer(unsigned long data) -+static void speedo_timeout(struct net_device *dev, struct speedo_private *sp) - { -- struct net_device *dev = (struct net_device *)data; -- struct speedo_private *sp = (struct speedo_private *)dev->priv; - long ioaddr = dev->base_addr; - int phy_num = sp->phy[0] & 0x1f; - -@@ -1217,6 +1219,15 @@ - dev->name, sp->rx_mode, jiffies, sp->last_rx_time); - set_rx_mode(dev); - } -+} -+ -+static void speedo_timer(unsigned long data) -+{ -+ struct net_device *dev = (struct net_device *)data; -+ struct speedo_private *sp = (struct speedo_private *)dev->priv; -+ -+ speedo_timeout(dev, sp); -+ - /* We must continue to monitor the media. */ - sp->timer.expires = RUN_AT(2*HZ); /* 2.0 sec. */ - add_timer(&sp->timer); -@@ -1661,6 +1672,29 @@ - return; - } - -+#ifdef HAVE_POLL_CONTROLLER -+ -+/* -+ * Polling 'interrupt' - used by things like netconsole to send skbs -+ * without having to re-enable interrupts. It's not called while -+ * the interrupt routine is executing. 
-+ */ -+ -+static void poll_speedo (struct net_device *dev) -+{ -+ struct speedo_private *sp = (struct speedo_private *)dev->priv; -+ -+ if (!netdump_mode) disable_irq(dev->irq); -+ if (sp->timer.expires == jiffies) { -+ sp->timer.expires = RUN_AT(2*HZ); -+ speedo_timeout(dev, sp); -+ } -+ speedo_interrupt (dev->irq, dev, NULL); -+ if (!netdump_mode) enable_irq(dev->irq); -+} -+ -+#endif -+ - static inline struct RxFD *speedo_rx_alloc(struct net_device *dev, int entry) - { - struct speedo_private *sp = (struct speedo_private *)dev->priv; -Index: linux-2.4.24/drivers/net/Makefile -=================================================================== ---- linux-2.4.24.orig/drivers/net/Makefile 2003-11-28 13:26:20.000000000 -0500 -+++ linux-2.4.24/drivers/net/Makefile 2004-05-07 16:58:39.000000000 -0400 -@@ -250,6 +250,8 @@ - obj-y += ../acorn/net/acorn-net.o - endif - -+obj-$(CONFIG_NETCONSOLE) += netconsole.o -+ - # - # HIPPI adapters - # -Index: linux-2.4.24/drivers/net/netconsole.c -=================================================================== ---- linux-2.4.24.orig/drivers/net/netconsole.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.4.24/drivers/net/netconsole.c 2004-05-07 16:58:39.000000000 -0400 -@@ -0,0 +1,1246 @@ -+/* -+ * linux/drivers/net/netconsole.c -+ * -+ * Copyright (C) 2001 Ingo Molnar -+ * Copyright (C) 2002 Red Hat, Inc. -+ * -+ * This file contains the implementation of an IRQ-safe, crash-safe -+ * kernel console implementation that outputs kernel messages to the -+ * network. -+ * -+ * Modification history: -+ * -+ * 2001-09-17 started by Ingo Molnar. -+ * 2002-03-14 simultaneous syslog packet option by Michael K. Johnson -+ */ -+ -+/**************************************************************** -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2, or (at your option) -+ * any later version. 
-+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -+ * -+ ****************************************************************/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#if CONFIG_X86_LOCAL_APIC -+#include -+#endif -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static struct net_device *netconsole_dev; -+static u16 source_port, netdump_target_port, netlog_target_port, syslog_target_port; -+static u32 source_ip, netdump_target_ip, netlog_target_ip, syslog_target_ip; -+static unsigned char netdump_daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ; -+static unsigned char netlog_daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ; -+static unsigned char syslog_daddr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} ; -+ -+static unsigned int mhz = 500, idle_timeout; -+static unsigned long long mhz_cycles, jiffy_cycles; -+ -+#include "netconsole.h" -+ -+#define MAX_UDP_CHUNK 1460 -+#define MAX_PRINT_CHUNK (MAX_UDP_CHUNK-HEADER_LEN) -+ -+#define DEBUG 0 -+#if DEBUG -+# define Dprintk(x...) printk(KERN_INFO x) -+#else -+# define Dprintk(x...) -+#endif -+/* -+ * We maintain a small pool of fully-sized skbs, -+ * to make sure the message gets out even in -+ * extreme OOM situations. 
-+ */ -+#define MAX_NETCONSOLE_SKBS 128 -+ -+static spinlock_t netconsole_lock = SPIN_LOCK_UNLOCKED; -+static int nr_netconsole_skbs; -+static struct sk_buff *netconsole_skbs; -+ -+#define MAX_SKB_SIZE \ -+ (MAX_UDP_CHUNK + sizeof(struct udphdr) + \ -+ sizeof(struct iphdr) + sizeof(struct ethhdr)) -+ -+static int new_arp = 0; -+static unsigned char arp_sha[ETH_ALEN], arp_tha[ETH_ALEN]; -+static u32 arp_sip, arp_tip; -+ -+static void send_netconsole_arp(struct net_device *dev); -+ -+static void __refill_netconsole_skbs(void) -+{ -+ struct sk_buff *skb; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&netconsole_lock, flags); -+ while (nr_netconsole_skbs < MAX_NETCONSOLE_SKBS) { -+ skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); -+ if (!skb) -+ break; -+ if (netconsole_skbs) -+ skb->next = netconsole_skbs; -+ else -+ skb->next = NULL; -+ netconsole_skbs = skb; -+ nr_netconsole_skbs++; -+ } -+ spin_unlock_irqrestore(&netconsole_lock, flags); -+} -+ -+static struct sk_buff * get_netconsole_skb(void) -+{ -+ struct sk_buff *skb; -+ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&netconsole_lock, flags); -+ skb = netconsole_skbs; -+ if (skb) { -+ netconsole_skbs = skb->next; -+ skb->next = NULL; -+ nr_netconsole_skbs--; -+ } -+ spin_unlock_irqrestore(&netconsole_lock, flags); -+ -+ return skb; -+} -+ -+static unsigned long long t0; -+ -+/* -+ * Do cleanups: -+ * - zap completed output skbs. -+ * - send ARPs if requested -+ * - reboot the box if inactive for more than N seconds. 
-+ */ -+static void zap_completion_queue(void) -+{ -+ unsigned long long t1; -+ int cpu = smp_processor_id(); -+ -+ if (softnet_data[cpu].completion_queue) { -+ struct sk_buff *clist; -+ -+ local_irq_disable(); -+ clist = softnet_data[cpu].completion_queue; -+ softnet_data[cpu].completion_queue = NULL; -+ local_irq_enable(); -+ -+ while (clist != NULL) { -+ struct sk_buff *skb = clist; -+ clist = clist->next; -+ __kfree_skb(skb); -+ } -+ } -+ -+ if (new_arp) { -+ Dprintk("got ARP req - sending reply.\n"); -+ new_arp = 0; -+ send_netconsole_arp(netconsole_dev); -+ } -+ -+ rdtscll(t1); -+ if (idle_timeout) { -+ if (t0) { -+ if (((t1 - t0) >> 20) > mhz_cycles * (unsigned long long)idle_timeout) { -+ t0 = t1; -+ printk("netdump idle timeout - rebooting in 3 seconds.\n"); -+ mdelay(3000); -+ machine_restart(NULL); -+ } -+ } -+ } -+ /* maintain jiffies in a polling fashion, based on rdtsc. */ -+ { -+ static unsigned long long prev_tick; -+ -+ if (t1 - prev_tick >= jiffy_cycles) { -+ prev_tick += jiffy_cycles; -+ jiffies++; -+ } -+ } -+} -+ -+static struct sk_buff * alloc_netconsole_skb(struct net_device *dev, int len, int reserve) -+{ -+ int once = 1; -+ int count = 0; -+ struct sk_buff *skb = NULL; -+ -+repeat: -+ zap_completion_queue(); -+ if (nr_netconsole_skbs < MAX_NETCONSOLE_SKBS) -+ __refill_netconsole_skbs(); -+ -+ skb = alloc_skb(len, GFP_ATOMIC); -+ if (!skb) { -+ skb = get_netconsole_skb(); -+ if (!skb) { -+ count++; -+ if (once && (count == 1000000)) { -+ printk("possibly FATAL: out of netconsole skbs!!! 
will keep retrying.\n"); -+ once = 0; -+ } -+ Dprintk("alloc skb: polling controller ...\n"); -+ dev->poll_controller(dev); -+ goto repeat; -+ } -+ } -+ -+ atomic_set(&skb->users, 1); -+ skb_reserve(skb, reserve); -+ return skb; -+} -+ -+static void transmit_raw_skb(struct sk_buff *skb, struct net_device *dev) -+{ -+ -+repeat_poll: -+ spin_lock(&dev->xmit_lock); -+ dev->xmit_lock_owner = smp_processor_id(); -+ -+ if (netif_queue_stopped(dev)) { -+ dev->xmit_lock_owner = -1; -+ spin_unlock(&dev->xmit_lock); -+ -+ Dprintk("xmit skb: polling controller ...\n"); -+ dev->poll_controller(dev); -+ zap_completion_queue(); -+ goto repeat_poll; -+ } -+ -+ dev->hard_start_xmit(skb, dev); -+ -+ dev->xmit_lock_owner = -1; -+ spin_unlock(&dev->xmit_lock); -+} -+ -+static void transmit_netconsole_skb(struct sk_buff *skb, struct net_device *dev, -+ int ip_len, int udp_len, -+ u16 source_port, u16 target_port, u32 source_ip, u32 target_ip, -+ unsigned char * macdaddr) -+{ -+ struct udphdr *udph; -+ struct iphdr *iph; -+ struct ethhdr *eth; -+ -+ udph = (struct udphdr *) skb_push(skb, sizeof(*udph)); -+ udph->source = source_port; -+ udph->dest = target_port; -+ udph->len = htons(udp_len); -+ udph->check = 0; -+ -+ iph = (struct iphdr *)skb_push(skb, sizeof(*iph)); -+ -+ iph->version = 4; -+ iph->ihl = 5; -+ iph->tos = 0; -+ iph->tot_len = htons(ip_len); -+ iph->id = 0; -+ iph->frag_off = 0; -+ iph->ttl = 64; -+ iph->protocol = IPPROTO_UDP; -+ iph->check = 0; -+ iph->saddr = source_ip; -+ iph->daddr = target_ip; -+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); -+ -+ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); -+ -+ eth->h_proto = htons(ETH_P_IP); -+ memcpy(eth->h_source, dev->dev_addr, dev->addr_len); -+ memcpy(eth->h_dest, macdaddr, dev->addr_len); -+ -+ transmit_raw_skb(skb, dev); -+} -+ -+static void send_netconsole_arp(struct net_device *dev) -+{ -+ int total_len, arp_len, arp_data_len; -+ struct sk_buff *skb; -+ unsigned char *arp; -+ struct arphdr *arph; -+ 
struct ethhdr *eth; -+ -+ arp_data_len = 2*4 + 2*ETH_ALEN; -+ arp_len = arp_data_len + sizeof(struct arphdr); -+ total_len = arp_len + ETH_HLEN; -+ -+ skb = alloc_netconsole_skb(dev, total_len, total_len - arp_data_len); -+ -+ arp = skb->data; -+ -+ memcpy(arp, dev->dev_addr, ETH_ALEN); -+ arp += ETH_ALEN; -+ -+ memcpy(arp, &source_ip, 4); -+ arp += 4; -+ -+ memcpy(arp, arp_sha, ETH_ALEN); -+ arp += ETH_ALEN; -+ -+ memcpy(arp, &arp_sip, 4); -+ arp += 4; -+ -+ skb->len += 2*4 + 2*ETH_ALEN; -+ -+ arph = (struct arphdr *)skb_push(skb, sizeof(*arph)); -+ -+ arph->ar_hrd = htons(dev->type); -+ arph->ar_pro = __constant_htons(ETH_P_IP); -+ arph->ar_hln = ETH_ALEN; -+ arph->ar_pln = 4; -+ arph->ar_op = __constant_htons(ARPOP_REPLY); -+ -+ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); -+ -+ eth->h_proto = htons(ETH_P_ARP); -+ memcpy(eth->h_source, dev->dev_addr, dev->addr_len); -+ memcpy(eth->h_dest, arp_sha, dev->addr_len); -+ -+ transmit_raw_skb(skb, dev); -+} -+ -+static void send_netdump_skb(struct net_device *dev, const char *msg, unsigned int msg_len, reply_t *reply) -+{ -+ int total_len, ip_len, udp_len; -+ struct sk_buff *skb; -+ -+ udp_len = msg_len + HEADER_LEN + sizeof(struct udphdr); -+ ip_len = udp_len + sizeof(struct iphdr); -+ total_len = ip_len + ETH_HLEN; -+ -+ skb = alloc_netconsole_skb(dev, total_len, total_len - msg_len - HEADER_LEN); -+ -+ skb->data[0] = NETCONSOLE_VERSION; -+ put_unaligned(htonl(reply->nr), (u32 *) (skb->data + 1)); -+ put_unaligned(htonl(reply->code), (u32 *) (skb->data + 5)); -+ put_unaligned(htonl(reply->info), (u32 *) (skb->data + 9)); -+ -+ memcpy(skb->data + HEADER_LEN, msg, msg_len); -+ skb->len += msg_len + HEADER_LEN; -+ -+ transmit_netconsole_skb(skb, dev, ip_len, udp_len, -+ source_port, netdump_target_port, source_ip, netdump_target_ip, netdump_daddr); -+} -+ -+#define SYSLOG_HEADER_LEN 4 -+ -+static void send_netlog_skb(struct net_device *dev, const char *msg, unsigned int msg_len, reply_t *reply) -+{ -+ int total_len, 
ip_len, udp_len; -+ struct sk_buff *skb; -+ -+ udp_len = msg_len + HEADER_LEN + sizeof(struct udphdr); -+ ip_len = udp_len + sizeof(struct iphdr); -+ total_len = ip_len + ETH_HLEN; -+ -+ skb = alloc_netconsole_skb(dev, total_len, total_len - msg_len - HEADER_LEN); -+ -+ skb->data[0] = NETCONSOLE_VERSION; -+ put_unaligned(htonl(reply->nr), (u32 *) (skb->data + 1)); -+ put_unaligned(htonl(reply->code), (u32 *) (skb->data + 5)); -+ put_unaligned(htonl(reply->info), (u32 *) (skb->data + 9)); -+ -+ memcpy(skb->data + HEADER_LEN, msg, msg_len); -+ skb->len += msg_len + HEADER_LEN; -+ -+ transmit_netconsole_skb(skb, dev, ip_len, udp_len, -+ source_port, netlog_target_port, source_ip, netlog_target_ip, netlog_daddr); -+} -+ -+#define SYSLOG_HEADER_LEN 4 -+ -+static void send_syslog_skb(struct net_device *dev, const char *msg, unsigned int msg_len, int pri) -+{ -+ int total_len, ip_len, udp_len; -+ struct sk_buff *skb; -+ -+ udp_len = msg_len + SYSLOG_HEADER_LEN + sizeof(struct udphdr); -+ ip_len = udp_len + sizeof(struct iphdr); -+ total_len = ip_len + ETH_HLEN; -+ -+ skb = alloc_netconsole_skb(dev, total_len, total_len - msg_len - SYSLOG_HEADER_LEN); -+ -+ skb->data[0] = '<'; -+ skb->data[1] = pri + '0'; -+ skb->data[2]= '>'; -+ skb->data[3]= ' '; -+ -+ memcpy(skb->data + SYSLOG_HEADER_LEN, msg, msg_len); -+ skb->len += msg_len + SYSLOG_HEADER_LEN; -+ -+ transmit_netconsole_skb(skb, dev, ip_len, udp_len, source_port, -+ syslog_target_port, source_ip, syslog_target_ip, syslog_daddr); -+} -+ -+#define MAX_SYSLOG_CHARS 1000 -+ -+static spinlock_t syslog_lock = SPIN_LOCK_UNLOCKED; -+static int syslog_chars; -+static unsigned char syslog_line [MAX_SYSLOG_CHARS + 10]; -+ -+/* -+ * We feed kernel messages char by char, and send the UDP packet -+ * one linefeed. We buffer all characters received. 
-+ */ -+static inline void feed_syslog_char(struct net_device *dev, const unsigned char c) -+{ -+ if (syslog_chars == MAX_SYSLOG_CHARS) -+ syslog_chars--; -+ syslog_line[syslog_chars] = c; -+ syslog_chars++; -+ if (c == '\n') { -+ send_syslog_skb(dev, syslog_line, syslog_chars, 5); -+ syslog_chars = 0; -+ } -+} -+ -+static spinlock_t sequence_lock = SPIN_LOCK_UNLOCKED; -+static unsigned int log_offset; -+ -+static void write_netconsole_msg(struct console *con, const char *msg0, unsigned int msg_len) -+{ -+ int len, left, i; -+ struct net_device *dev; -+ const char *msg = msg0; -+ reply_t reply; -+ -+ dev = netconsole_dev; -+ if (!dev || netdump_mode) -+ return; -+ -+ if (dev->poll_controller && netif_running(dev)) { -+ unsigned long flags; -+ -+ __save_flags(flags); -+ __cli(); -+ left = msg_len; -+ if (netlog_target_ip) { -+ while (left) { -+ if (left > MAX_PRINT_CHUNK) -+ len = MAX_PRINT_CHUNK; -+ else -+ len = left; -+ reply.code = REPLY_LOG; -+ reply.nr = 0; -+ spin_lock(&sequence_lock); -+ reply.info = log_offset; -+ log_offset += len; -+ spin_unlock(&sequence_lock); -+ send_netlog_skb(dev, msg, len, &reply); -+ msg += len; -+ left -= len; -+ } -+ } -+ if (syslog_target_ip) { -+ spin_lock(&syslog_lock); -+ for (i = 0; i < msg_len; i++) -+ feed_syslog_char(dev, msg0[i]); -+ spin_unlock(&syslog_lock); -+ } -+ -+ __restore_flags(flags); -+ } -+} -+ -+static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base) -+{ -+ return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base)); -+} -+ -+static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, -+ unsigned short ulen, u32 saddr, u32 daddr) -+{ -+ if (uh->check == 0) { -+ skb->ip_summed = CHECKSUM_UNNECESSARY; -+ } else if (skb->ip_summed == CHECKSUM_HW) { -+ skb->ip_summed = CHECKSUM_UNNECESSARY; -+ if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) -+ return 0; -+ skb->ip_summed = CHECKSUM_NONE; -+ } -+ if (skb->ip_summed != 
CHECKSUM_UNNECESSARY) -+ skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, -+0); -+ /* Probably, we should checksum udp header (it should be in cache -+ * in any case) and data in tiny packets (< rx copybreak). -+ */ -+ return 0; -+} -+ -+static __inline__ int __udp_checksum_complete(struct sk_buff *skb) -+{ -+ return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); -+} -+ -+static __inline__ int udp_checksum_complete(struct sk_buff *skb) -+{ -+ return skb->ip_summed != CHECKSUM_UNNECESSARY && -+ __udp_checksum_complete(skb); -+} -+ -+/* -+ * NOTE: security depends on the trusted path between the netconsole -+ * server and netconsole client, since none of the packets are -+ * encrypted. The random magic number protects the protocol -+ * against spoofing. -+ */ -+static u64 netconsole_magic; -+static u32 magic1, magic2; -+ -+static spinlock_t req_lock = SPIN_LOCK_UNLOCKED; -+static int nr_req = 0; -+static LIST_HEAD(request_list); -+ -+static void add_new_req(req_t *req) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&req_lock, flags); -+ list_add_tail(&req->list, &request_list); -+ nr_req++; -+ Dprintk("pending requests: %d.\n", nr_req); -+ spin_unlock_irqrestore(&req_lock, flags); -+ -+ rdtscll(t0); -+} -+ -+static req_t *get_new_req(void) -+{ -+ req_t *req = NULL; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&req_lock, flags); -+ if (nr_req) { -+ req = list_entry(request_list.next, req_t, list); -+ list_del(&req->list); -+ nr_req--; -+ } -+ spin_unlock_irqrestore(&req_lock, flags); -+ -+ return req; -+} -+ -+static req_t *alloc_req(void) -+{ -+ req_t *req; -+ -+ req = (req_t *) kmalloc(sizeof(*req), GFP_ATOMIC); -+ return req; -+} -+ -+static int netconsole_rx_hook(struct sk_buff *skb) -+{ -+ int proto; -+ struct iphdr *iph; -+ struct udphdr *uh; -+ __u32 len, saddr, daddr, ulen; -+ req_t *__req; -+ req_t *req; -+ struct net_device *dev; -+ -+ if (!netdump_mode) -+ return NET_RX_SUCCESS; -+#if DEBUG -+ { -+ static int 
packet_count; -+ Dprintk(" %d\r", ++packet_count); -+ } -+#endif -+ dev = skb->dev; -+ if (dev->type != ARPHRD_ETHER) -+ goto out; -+ proto = ntohs(skb->mac.ethernet->h_proto); -+ Dprintk("rx got skb %p (len: %d, users: %d), dev %s, h_proto: %04x.\n", skb, skb->len, atomic_read(&skb->users), dev->name, proto); -+ #define D(x) skb->mac.ethernet->h_dest[x] -+ Dprintk("... h_dest: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5)); -+ #undef D -+ #define D(x) skb->mac.ethernet->h_source[x] -+ Dprintk("... h_source: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5)); -+ #undef D -+ if (skb->pkt_type == PACKET_OTHERHOST) -+ goto out; -+ if (skb_shared(skb)) -+ goto out; -+ if (proto == ETH_P_ARP) { -+ struct arphdr *arp; -+ unsigned char *arp_ptr; -+ -+ Dprintk("got arp skb.\n"); -+ arp = (struct arphdr *)skb->data; -+ if (!pskb_may_pull(skb, sizeof(struct arphdr) + 2*4 + 2*ETH_ALEN)) -+ goto out; -+ if (htons(dev->type) != arp->ar_hrd) -+ goto out; -+ if (arp->ar_pro != __constant_htons(ETH_P_IP)) -+ goto out; -+ if (arp->ar_hln != ETH_ALEN) -+ goto out; -+ if (arp->ar_pln != 4) -+ goto out; -+ if (arp->ar_op != __constant_htons(ARPOP_REQUEST)) -+ goto out; -+ /* -+ * ARP header looks ok so far, extract fields: -+ */ -+ arp_ptr = (unsigned char *)(arp + 1); -+ -+ memcpy(arp_sha, arp_ptr, ETH_ALEN); -+ arp_ptr += ETH_ALEN; -+ -+ memcpy(&arp_sip, arp_ptr, 4); -+ arp_ptr += 4; -+ -+ memcpy(arp_tha, arp_ptr, ETH_ALEN); -+ arp_ptr += ETH_ALEN; -+ -+ memcpy(&arp_tip, arp_ptr, 4); -+ -+ #define D(x) arp_sha[x] -+ Dprintk("... arp_sha: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5)); -+ #undef D -+ #define D(x) ((unsigned char *)&arp_sip)[x] -+ Dprintk("... arp_sip: %d.%d.%d.%d.\n", D(0), D(1), D(2), D(3)); -+ #undef D -+ #define D(x) arp_tha[x] -+ Dprintk("... arp_tha: %02X:%02X:%02X:%02X:%02X:%02X.\n", D(0), D(1), D(2), D(3), D(4), D(5)); -+ #undef D -+ #define D(x) ((unsigned char *)&arp_tip)[x] -+ Dprintk("... 
arp_tip: %d.%d.%d.%d.\n", D(0), D(1), D(2), D(3)); -+ #undef D -+ #define D(x) ((unsigned char *)&source_ip)[x] -+ Dprintk("... (source_ip): %d.%d.%d.%d.\n", D(0), D(1), D(2), D(3)); -+ #undef D -+ -+ if (LOOPBACK(arp_tip) || MULTICAST(arp_tip)) -+ goto out; -+ -+ if (arp_tip != source_ip) -+ goto out; -+ new_arp = 1; -+ goto out; -+ } -+ if (proto != ETH_P_IP) -+ goto out; -+ /* -+ * IP header correctness testing: -+ */ -+ iph = (struct iphdr *)skb->data; -+ if (!pskb_may_pull(skb, sizeof(struct iphdr))) -+ goto out; -+ Dprintk("... IP ihl*4: %d, version: %d.\n", iph->ihl*4, iph->version); -+ if (iph->ihl < 5 || iph->version != 4) -+ goto out; -+ if (!pskb_may_pull(skb, iph->ihl*4)) -+ goto out; -+ if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) -+ goto out; -+ len = ntohs(iph->tot_len); -+ Dprintk("... IP len: %d.\n", len); -+ if (skb->len < len || len < iph->ihl*4) -+ goto out; -+ saddr = iph->saddr; -+ daddr = iph->daddr; -+ Dprintk("... IP src: %08x, dst: %08x.\n", saddr, daddr); -+ Dprintk("... IP protocol: %d.\n", iph->protocol); -+ if (iph->protocol != IPPROTO_UDP) -+ goto out; -+ Dprintk("... netdump src: %08x, dst: %08x.\n", source_ip, netlog_target_ip); -+ if (source_ip != daddr) -+ goto out; -+ if (netlog_target_ip != saddr) -+ goto out; -+ len -= iph->ihl*4; -+ uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); -+ ulen = ntohs(uh->len); -+ Dprintk("... UDP len: %d (left %d).\n", ulen, len); -+ -+#define MIN_COMM_SIZE (sizeof(*uh) + NETDUMP_REQ_SIZE) -+ if (ulen != len || ulen < MIN_COMM_SIZE) { -+ Dprintk("... UDP, hm, len not ok.\n"); -+ goto out; -+ } -+ if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) { -+ Dprintk("... UDP, hm, checksum init not ok.\n"); -+ goto out; -+ } -+ if (udp_checksum_complete(skb)) { -+ Dprintk("... UDP, hm, checksum complete not ok.\n"); -+ goto out; -+ } -+ Dprintk("... UDP packet OK!\n"); -+ Dprintk("... 
UDP src port: %d, dst port: %d.\n", uh->source, uh->dest); -+ if (source_port != uh->source) -+ goto out; -+ if (netlog_target_port != uh->dest) -+ goto out; -+ __req = (req_t *)(uh + 1); -+ Dprintk("... UDP netdump packet OK!\n"); -+ -+ req = alloc_req(); -+ if (!req) { -+ printk("no more RAM to allocate request - dropping it.\n"); -+ goto out; -+ } -+ -+ req->magic = ntohl(__req->magic); -+ req->command = ntohl(__req->command); -+ req->from = ntohl(__req->from); -+ req->to = ntohl(__req->to); -+ req->nr = ntohl(__req->nr); -+ -+ Dprintk("... netdump magic: %08Lx.\n", req->magic); -+ Dprintk("... netdump command: %08x.\n", req->command); -+ Dprintk("... netdump from: %08x.\n", req->from); -+ Dprintk("... netdump to: %08x.\n", req->to); -+ -+ add_new_req(req); -+out: -+ return NET_RX_DROP; -+} -+ -+#define INVALID_PAGE "page is not valid!\n" -+ -+static void send_netdump_mem (struct net_device *dev, req_t *req) -+{ -+ int i; -+ char *kaddr; -+ char str[1024]; -+ struct page *page; -+ unsigned long nr = req->from; -+ int nr_chunks = PAGE_SIZE/1024; -+ reply_t reply; -+ -+ reply.nr = req->nr; -+ reply.info = 0; -+ if (req->from >= max_mapnr) { -+ sprintf(str, "page %08lx is bigger than max page # %08lx!\n", nr, max_mapnr); -+ reply.code = REPLY_ERROR; -+ send_netdump_skb(dev, str, strlen(str), &reply); -+ return; -+ } -+ page = mem_map + nr; -+ if (PageReserved(page)) -+ page = ZERO_PAGE(0); -+ -+ kaddr = (char *)kmap_atomic(page, KM_NETDUMP); -+ -+ for (i = 0; i < nr_chunks; i++) { -+ unsigned int offset = i*1024; -+ reply.code = REPLY_MEM; -+ reply.info = offset; -+ send_netdump_skb(dev, kaddr + offset, 1024, &reply); -+ } -+ -+ kunmap_atomic(kaddr, KM_NETDUMP); -+} -+ -+/* -+ * This function waits for the client to acknowledge the receipt -+ * of the netdump startup reply, with the possibility of packets -+ * getting lost. We resend the startup packet if no ACK is received, -+ * after a 1 second delay. 
-+ * -+ * (The client can test the success of the handshake via the HELLO -+ * command, and send ACKs until we enter netdump mode.) -+ */ -+static void netdump_startup_handshake(struct net_device *dev) -+{ -+ char tmp[200]; -+ reply_t reply; -+ req_t *req = NULL; -+ int i; -+ -+ netdump_mode = 1; -+ -+repeat: -+ sprintf(tmp, "NETDUMP start, waiting for start-ACK.\n"); -+ reply.code = REPLY_START_NETDUMP; -+ reply.nr = 0; -+ reply.info = 0; -+ send_netdump_skb(dev, tmp, strlen(tmp), &reply); -+ -+ for (i = 0; i < 10000; i++) { -+ // wait 1 sec. -+ udelay(100); -+ Dprintk("handshake: polling controller ...\n"); -+ dev->poll_controller(dev); -+ zap_completion_queue(); -+ req = get_new_req(); -+ if (req) -+ break; -+ } -+ if (!req) -+ goto repeat; -+ if (req->command != COMM_START_NETDUMP_ACK) { -+ kfree(req); -+ goto repeat; -+ } -+ kfree(req); -+ -+ printk("NETDUMP START!\n"); -+} -+ -+#if 0 -+ -+static inline void print_status (req_t *req) -+{ -+ static int count = 0; -+ -+ switch (++count & 3) { -+ case 0: printk("/\r"); break; -+ case 1: printk("|\r"); break; -+ case 2: printk("\\\r"); break; -+ case 3: printk("-\r"); break; -+ } -+} -+ -+#else -+ -+static inline void print_status (req_t *req) -+{ -+ static int count = 0; -+ static int prev_jiffies = 0; -+ -+ if (jiffies/HZ != prev_jiffies/HZ) { -+ prev_jiffies = jiffies; -+ count++; -+ switch (count & 3) { -+ case 0: printk("%d(%ld)/\r", nr_req, jiffies); break; -+ case 1: printk("%d(%ld)|\r", nr_req, jiffies); break; -+ case 2: printk("%d(%ld)\\\r", nr_req, jiffies); break; -+ case 3: printk("%d(%ld)-\r", nr_req, jiffies); break; -+ } -+ } -+} -+ -+#endif -+ -+#define CLI 1 -+ -+#if CONFIG_SMP -+static void freeze_cpu (void * dummy) -+{ -+ printk("CPU#%d is frozen.\n", smp_processor_id()); -+#if CLI -+ for (;;) __cli(); -+#else -+ for (;;) __sti(); -+#endif -+} -+#endif -+ -+static void netconsole_netdump (struct pt_regs *regs) -+{ -+ reply_t reply; -+ char tmp[200]; -+ unsigned long flags; -+ struct net_device 
*dev = netconsole_dev; -+ unsigned long esp; -+ unsigned short ss; -+ struct pt_regs myregs; -+ req_t *req; -+ -+ __save_flags(flags); -+ __cli(); -+#if CONFIG_X86_LOCAL_APIC -+ nmi_watchdog = 0; -+#endif -+#if CONFIG_SMP -+ smp_call_function(freeze_cpu, NULL, 1, 0); -+#endif -+ mdelay(1000); -+ /* -+ * Just in case we are crashing within the networking code -+ * ... attempt to fix up. -+ */ -+ spin_lock_init(&dev->xmit_lock); -+ -+ esp = (unsigned long) ((char *)regs + sizeof (struct pt_regs)); -+ ss = __KERNEL_DS; -+ if (regs->xcs & 3) { -+ esp = regs->esp; -+ ss = regs->xss & 0xffff; -+ } -+ myregs = *regs; -+ myregs.esp = esp; -+ myregs.xss = (myregs.xss & 0xffff0000) | ss; -+ -+ rdtscll(t0); -+ -+ printk("< netdump activated - performing handshake with the client. >\n"); -+ netdump_startup_handshake(dev); -+ -+ printk("< handshake completed - listening for dump requests. >\n"); -+ -+ while (netdump_mode) { -+ __cli(); -+ Dprintk("main netdump loop: polling controller ...\n"); -+ dev->poll_controller(dev); -+ zap_completion_queue(); -+#if !CLI -+ __sti(); -+#endif -+ req = get_new_req(); -+ if (!req) -+ continue; -+ Dprintk("got new req, command %d.\n", req->command); -+ print_status(req); -+ switch (req->command) { -+ case COMM_NONE: -+ Dprintk("got NO command.\n"); -+ break; -+ -+ case COMM_SEND_MEM: -+ Dprintk("got MEM command.\n"); -+ // send ->from ->to. 
-+ send_netdump_mem(dev, req); -+ break; -+ -+ case COMM_EXIT: -+ Dprintk("got EXIT command.\n"); -+ netdump_mode = 0; -+ break; -+ -+ case COMM_REBOOT: -+ Dprintk("got REBOOT command.\n"); -+ printk("netdump: rebooting in 3 seconds.\n"); -+ mdelay(3000); -+ machine_restart(NULL); -+ break; -+ -+ case COMM_HELLO: -+ sprintf(tmp, "Hello, this is netdump version 0.%02d\n", NETCONSOLE_VERSION); -+ reply.code = REPLY_HELLO; -+ reply.nr = req->nr; -+ reply.info = NETCONSOLE_VERSION; -+ send_netdump_skb(dev, tmp, strlen(tmp), &reply); -+ break; -+ -+ case COMM_GET_PAGE_SIZE: -+ sprintf(tmp, "PAGE_SIZE: %ld\n", PAGE_SIZE); -+ reply.code = REPLY_PAGE_SIZE; -+ reply.nr = req->nr; -+ reply.info = PAGE_SIZE; -+ send_netdump_skb(dev, tmp, strlen(tmp), &reply); -+ break; -+ -+ case COMM_GET_REGS: -+ { -+ char *tmp2 = tmp; -+ elf_gregset_t elf_regs; -+ -+ reply.code = REPLY_REGS; -+ reply.nr = req->nr; -+ reply.info = max_mapnr; -+ tmp2 = tmp + sprintf(tmp, "Sending register info.\n"); -+ ELF_CORE_COPY_REGS(elf_regs, regs); -+ memcpy(tmp2, &elf_regs, sizeof(elf_regs)); -+ send_netdump_skb(dev, tmp, strlen(tmp) + sizeof(elf_regs), &reply); -+ break; -+ } -+ -+ case COMM_GET_NR_PAGES: -+ reply.code = REPLY_NR_PAGES; -+ reply.nr = req->nr; -+ reply.info = max_mapnr; -+ sprintf(tmp, "Number of pages: %ld\n", max_mapnr); -+ send_netdump_skb(dev, tmp, strlen(tmp), &reply); -+ break; -+ -+ case COMM_SHOW_STATE: -+ netdump_mode = 0; -+ if (regs) -+ show_regs(regs); -+ show_state(); -+ show_mem(); -+ netdump_mode = 1; -+ reply.code = REPLY_SHOW_STATE; -+ reply.nr = req->nr; -+ reply.info = 0; -+ send_netdump_skb(dev, tmp, strlen(tmp), &reply); -+ break; -+ -+ default: -+ reply.code = REPLY_ERROR; -+ reply.nr = req->nr; -+ reply.info = req->command; -+ Dprintk("got UNKNOWN command!\n"); -+ sprintf(tmp, "Got unknown command code %d!\n", req->command); -+ send_netdump_skb(dev, tmp, strlen(tmp), &reply); -+ break; -+ } -+ kfree(req); -+ req = NULL; -+ } -+ sprintf(tmp, "NETDUMP end.\n"); -+ 
reply.code = REPLY_END_NETDUMP; -+ reply.nr = 0; -+ reply.info = 0; -+ send_netdump_skb(dev, tmp, strlen(tmp), &reply); -+ printk("NETDUMP END!\n"); -+ __restore_flags(flags); -+} -+ -+static char *dev; -+static int netdump_target_eth_byte0 = 255; -+static int netdump_target_eth_byte1 = 255; -+static int netdump_target_eth_byte2 = 255; -+static int netdump_target_eth_byte3 = 255; -+static int netdump_target_eth_byte4 = 255; -+static int netdump_target_eth_byte5 = 255; -+ -+static int netlog_target_eth_byte0 = 255; -+static int netlog_target_eth_byte1 = 255; -+static int netlog_target_eth_byte2 = 255; -+static int netlog_target_eth_byte3 = 255; -+static int netlog_target_eth_byte4 = 255; -+static int netlog_target_eth_byte5 = 255; -+ -+static int syslog_target_eth_byte0 = 255; -+static int syslog_target_eth_byte1 = 255; -+static int syslog_target_eth_byte2 = 255; -+static int syslog_target_eth_byte3 = 255; -+static int syslog_target_eth_byte4 = 255; -+static int syslog_target_eth_byte5 = 255; -+ -+MODULE_PARM(netdump_target_ip, "i"); -+MODULE_PARM_DESC(netdump_target_ip, -+ "remote netdump IP address as a native (not network) endian integer"); -+MODULE_PARM(netlog_target_ip, "i"); -+MODULE_PARM_DESC(netlog_target_ip, -+ "remote netlog IP address as a native (not network) endian integer"); -+MODULE_PARM(syslog_target_ip, "i"); -+MODULE_PARM_DESC(syslog_target_ip, -+ "remote syslog IP address as a native (not network) endian integer"); -+ -+MODULE_PARM(source_port, "h"); -+MODULE_PARM_DESC(source_port, -+ "local port from which to send netdump packets"); -+ -+MODULE_PARM(netdump_target_port, "h"); -+MODULE_PARM_DESC(netdump_target_port, -+ "remote port to which to send netdump packets"); -+MODULE_PARM(netlog_target_port, "h"); -+MODULE_PARM_DESC(netlog_target_port, -+ "remote port to which to send netlog packets"); -+MODULE_PARM(syslog_target_port, "h"); -+MODULE_PARM_DESC(syslog_target_port, -+ "remote port to which to send syslog packets"); -+ -+#define 
ETH_BYTE(name,nr) \ -+ MODULE_PARM(name##_target_eth_byte##nr, "i"); \ -+ MODULE_PARM_DESC(name##_target_eth_byte##nr, \ -+ "byte "#nr" of the netdump server MAC address") -+ -+#define ETH_BYTES(name) \ -+ ETH_BYTE(name, 0); ETH_BYTE(name, 1); ETH_BYTE(name, 2); \ -+ ETH_BYTE(name, 3); ETH_BYTE(name, 4); ETH_BYTE(name, 5); -+ -+ETH_BYTES(netdump); -+ETH_BYTES(netlog); -+ETH_BYTES(syslog); -+ -+MODULE_PARM(magic1, "i"); -+MODULE_PARM_DESC(magic1, -+ "lower 32 bits of magic cookie shared between client and server"); -+MODULE_PARM(magic2, "i"); -+MODULE_PARM_DESC(magic2, -+ "upper 32 bits of magic cookie shared between client and server"); -+MODULE_PARM(dev, "s"); -+MODULE_PARM_DESC(dev, -+ "name of the device from which to send netdump and syslog packets"); -+MODULE_PARM(mhz, "i"); -+MODULE_PARM_DESC(mhz, -+ "one second wall clock time takes this many million CPU cycles"); -+MODULE_PARM(idle_timeout, "i"); -+MODULE_PARM_DESC(idle_timeout, -+ "reboot system after this many idle seconds"); -+ -+static struct console netconsole = -+ { flags: CON_ENABLED, write: write_netconsole_msg }; -+ -+static int init_netconsole(void) -+{ -+ struct net_device *ndev = NULL; -+ struct in_device *in_dev; -+ -+ printk(KERN_INFO "netlog: using network device <%s>\n", dev); -+ // this will be valid once the device goes up. 
-+ if (dev) -+ ndev = dev_get_by_name(dev); -+ if (!ndev) { -+ printk(KERN_ERR "netlog: network device %s does not exist, aborting.\n", dev); -+ return -1; -+ } -+ if (!ndev->poll_controller) { -+ printk(KERN_ERR "netlog: %s's network driver does not implement netlogging yet, aborting.\n", dev); -+ return -1; -+ } -+ in_dev = in_dev_get(ndev); -+ if (!in_dev) { -+ printk(KERN_ERR "netlog: network device %s is not an IP protocol device, aborting.\n", dev); -+ return -1; -+ } -+ -+ if (!magic1 || !magic2) { -+ printk(KERN_ERR "netlog: magic cookie (magic1,magic2) not specified.\n"); -+ return -1; -+ } -+ netconsole_magic = magic1 + (((u64)magic2)<<32); -+ -+ source_ip = ntohl(in_dev->ifa_list->ifa_local); -+ if (!source_ip) { -+ printk(KERN_ERR "netlog: network device %s has no local address, aborting.\n", dev); -+ return -1; -+ } -+#define IP(x) ((unsigned char *)&source_ip)[x] -+ printk(KERN_INFO "netlog: using source IP %u.%u.%u.%u\n", -+ IP(3), IP(2), IP(1), IP(0)); -+#undef IP -+ source_ip = htonl(source_ip); -+ if (!source_port) { -+ printk(KERN_ERR "netlog: source_port parameter not specified, aborting.\n"); -+ return -1; -+ } -+ printk(KERN_INFO "netlog: using source UDP port: %u\n", source_port); -+ source_port = htons(source_port); -+ -+ if (!netdump_target_ip && !netlog_target_ip && !syslog_target_ip) { -+ printk(KERN_ERR "netlog: target_ip parameter not specified, aborting.\n"); -+ return -1; -+ } -+ if (netdump_target_ip) { -+#define IP(x) ((unsigned char *)&netdump_target_ip)[x] -+ printk(KERN_INFO "netlog: using netdump target IP %u.%u.%u.%u\n", -+ IP(3), IP(2), IP(1), IP(0)); -+#undef IP -+ netdump_target_ip = htonl(netdump_target_ip); -+ } -+ if (netlog_target_ip) { -+#define IP(x) ((unsigned char *)&netlog_target_ip)[x] -+ printk(KERN_INFO "netlog: using netlog target IP %u.%u.%u.%u\n", -+ IP(3), IP(2), IP(1), IP(0)); -+#undef IP -+ netlog_target_ip = htonl(netlog_target_ip); -+ } -+ if (syslog_target_ip) { -+ if (!syslog_target_port) -+ 
syslog_target_port = 514; -+#define IP(x) ((unsigned char *)&syslog_target_ip)[x] -+ printk("netlog: using syslog target IP %u.%u.%u.%u, port: %d\n", IP(3), IP(2), IP(1), IP(0), syslog_target_port); -+#undef IP -+ syslog_target_ip = htonl(syslog_target_ip); -+ syslog_target_port = htons(syslog_target_port); -+ } -+ if (!netdump_target_port && !netlog_target_port && !syslog_target_port) { -+ printk(KERN_ERR "netlog: target_port parameter not specified, aborting.\n"); -+ return -1; -+ } -+ if (netdump_target_port) { -+ printk(KERN_INFO "netlog: using target UDP port: %u\n", netdump_target_port); -+ netdump_target_port = htons(netdump_target_port); -+ } -+ if (netlog_target_port) { -+ printk(KERN_INFO "netlog: using target UDP port: %u\n", netlog_target_port); -+ netlog_target_port = htons(netlog_target_port); -+ } -+ -+ netdump_daddr[0] = netdump_target_eth_byte0; -+ netdump_daddr[1] = netdump_target_eth_byte1; -+ netdump_daddr[2] = netdump_target_eth_byte2; -+ netdump_daddr[3] = netdump_target_eth_byte3; -+ netdump_daddr[4] = netdump_target_eth_byte4; -+ netdump_daddr[5] = netdump_target_eth_byte5; -+ -+ if ((netdump_daddr[0] & netdump_daddr[1] & netdump_daddr[2] & netdump_daddr[3] & netdump_daddr[4] & netdump_daddr[5]) == 255) -+ printk(KERN_INFO "netlog: using broadcast ethernet frames to send netdump packets.\n"); -+ else -+ printk(KERN_INFO "netlog: using netdump target ethernet address %02x:%02x:%02x:%02x:%02x:%02x.\n", -+ netdump_daddr[0], netdump_daddr[1], netdump_daddr[2], netdump_daddr[3], netdump_daddr[4], netdump_daddr[5]); -+ -+ netlog_daddr[0] = netlog_target_eth_byte0; -+ netlog_daddr[1] = netlog_target_eth_byte1; -+ netlog_daddr[2] = netlog_target_eth_byte2; -+ netlog_daddr[3] = netlog_target_eth_byte3; -+ netlog_daddr[4] = netlog_target_eth_byte4; -+ netlog_daddr[5] = netlog_target_eth_byte5; -+ -+ if ((netlog_daddr[0] & netlog_daddr[1] & netlog_daddr[2] & netlog_daddr[3] & netlog_daddr[4] & netlog_daddr[5]) == 255) -+ printk(KERN_INFO "netlog: using 
broadcast ethernet frames to send netdump packets.\n"); -+ else -+ printk(KERN_INFO "netlog: using netdump target ethernet address %02x:%02x:%02x:%02x:%02x:%02x.\n", -+ netlog_daddr[0], netlog_daddr[1], netlog_daddr[2], netlog_daddr[3], netlog_daddr[4], netlog_daddr[5]); -+ syslog_daddr[0] = syslog_target_eth_byte0; -+ syslog_daddr[1] = syslog_target_eth_byte1; -+ syslog_daddr[2] = syslog_target_eth_byte2; -+ syslog_daddr[3] = syslog_target_eth_byte3; -+ syslog_daddr[4] = syslog_target_eth_byte4; -+ syslog_daddr[5] = syslog_target_eth_byte5; -+ -+ if ((syslog_daddr[0] & syslog_daddr[1] & syslog_daddr[2] & syslog_daddr[3] & syslog_daddr[4] & syslog_daddr[5]) == 255) -+ printk(KERN_INFO "netlog: using broadcast ethernet frames to send syslog packets.\n"); -+ else -+ printk(KERN_INFO "netlog: using syslog target ethernet address %02x:%02x:%02x:%02x:%02x:%02x.\n", -+ syslog_daddr[0], syslog_daddr[1], syslog_daddr[2], syslog_daddr[3], syslog_daddr[4], syslog_daddr[5]); -+ -+ mhz_cycles = (unsigned long long)mhz * 1000000ULL; -+ jiffy_cycles = (unsigned long long)mhz * (1000000/HZ); -+ -+ INIT_LIST_HEAD(&request_list); -+ -+ ndev->rx_hook = netconsole_rx_hook; -+ netdump_func = netconsole_netdump; -+ netconsole_dev = ndev; -+#define STARTUP_MSG "[...network console startup...]\n" -+ write_netconsole_msg(NULL, STARTUP_MSG, strlen(STARTUP_MSG)); -+ -+ register_console(&netconsole); -+ printk(KERN_INFO "netlog: network logging started up successfully!\n"); -+ return 0; -+} -+ -+static void cleanup_netconsole(void) -+{ -+ printk(KERN_INFO "netlog: network logging shut down.\n"); -+ unregister_console(&netconsole); -+ -+#define SHUTDOWN_MSG "[...network console shutdown...]\n" -+ write_netconsole_msg(NULL, SHUTDOWN_MSG, strlen(SHUTDOWN_MSG)); -+ netconsole_dev->rx_hook = NULL; -+ netconsole_dev = NULL; -+} -+ -+module_init(init_netconsole); -+module_exit(cleanup_netconsole); -+ -+MODULE_LICENSE("GPL"); -+ -Index: linux-2.4.24/drivers/net/netconsole.h 
-=================================================================== ---- linux-2.4.24.orig/drivers/net/netconsole.h 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.4.24/drivers/net/netconsole.h 2004-05-07 16:58:39.000000000 -0400 -@@ -0,0 +1,81 @@ -+/* -+ * linux/drivers/net/netconsole.h -+ * -+ * Copyright (C) 2001 Ingo Molnar -+ * -+ * This file contains the implementation of an IRQ-safe, crash-safe -+ * kernel console implementation that outputs kernel messages to the -+ * network. -+ * -+ * Modification history: -+ * -+ * 2001-09-17 started by Ingo Molnar. -+ */ -+ -+/**************************************************************** -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2, or (at your option) -+ * any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
-+ * -+ ****************************************************************/ -+ -+#define NETCONSOLE_VERSION 0x04 -+ -+enum netdump_commands { -+ COMM_NONE = 0, -+ COMM_SEND_MEM = 1, -+ COMM_EXIT = 2, -+ COMM_REBOOT = 3, -+ COMM_HELLO = 4, -+ COMM_GET_NR_PAGES = 5, -+ COMM_GET_PAGE_SIZE = 6, -+ COMM_START_NETDUMP_ACK = 7, -+ COMM_GET_REGS = 8, -+ COMM_SHOW_STATE = 9, -+}; -+ -+#define NETDUMP_REQ_SIZE (8+4*4) -+ -+typedef struct netdump_req_s { -+ u64 magic; -+ u32 nr; -+ u32 command; -+ u32 from; -+ u32 to; -+ struct list_head list; -+} req_t; -+ -+enum netdump_replies { -+ REPLY_NONE = 0, -+ REPLY_ERROR = 1, -+ REPLY_LOG = 2, -+ REPLY_MEM = 3, -+ REPLY_RESERVED = 4, -+ REPLY_HELLO = 5, -+ REPLY_NR_PAGES = 6, -+ REPLY_PAGE_SIZE = 7, -+ REPLY_START_NETDUMP = 8, -+ REPLY_END_NETDUMP = 9, -+ REPLY_REGS = 10, -+ REPLY_MAGIC = 11, -+ REPLY_SHOW_STATE = 12, -+}; -+ -+typedef struct netdump_reply_s { -+ u32 nr; -+ u32 code; -+ u32 info; -+} reply_t; -+ -+#define HEADER_LEN (1 + sizeof(reply_t)) -+ -Index: linux-2.4.24/drivers/net/tlan.c -=================================================================== ---- linux-2.4.24.orig/drivers/net/tlan.c 2003-11-28 13:26:20.000000000 -0500 -+++ linux-2.4.24/drivers/net/tlan.c 2004-05-07 16:58:39.000000000 -0400 -@@ -345,6 +345,8 @@ - static void TLan_EeReceiveByte( u16, u8 *, int ); - static int TLan_EeReadByte( struct net_device *, u8, u8 * ); - -+static void TLan_Poll(struct net_device *); -+ - - static void - TLan_StoreSKB( struct tlan_list_tag *tag, struct sk_buff *skb) -@@ -891,6 +893,9 @@ - dev->get_stats = &TLan_GetStats; - dev->set_multicast_list = &TLan_SetMulticastList; - dev->do_ioctl = &TLan_ioctl; -+#ifdef HAVE_POLL_CONTROLLER -+ dev->poll_controller = &TLan_Poll; -+#endif - dev->tx_timeout = &TLan_tx_timeout; - dev->watchdog_timeo = TX_TIMEOUT; - -@@ -1176,7 +1181,14 @@ - - } /* TLan_HandleInterrupts */ - -- -+#ifdef HAVE_POLL_CONTROLLER -+static void TLan_Poll(struct net_device *dev) -+{ -+ if (!netdump_mode) 
disable_irq(dev->irq); -+ TLan_HandleInterrupt(dev->irq, dev, NULL); -+ if (!netdump_mode) enable_irq(dev->irq); -+} -+#endif - - - /*************************************************************** -Index: linux-2.4.24/drivers/net/tulip/tulip_core.c -=================================================================== ---- linux-2.4.24.orig/drivers/net/tulip/tulip_core.c 2003-11-28 13:26:20.000000000 -0500 -+++ linux-2.4.24/drivers/net/tulip/tulip_core.c 2004-05-07 16:58:39.000000000 -0400 -@@ -266,6 +266,7 @@ - static struct net_device_stats *tulip_get_stats(struct net_device *dev); - static int private_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); - static void set_rx_mode(struct net_device *dev); -+static void poll_tulip(struct net_device *dev); - - - -@@ -1728,6 +1729,9 @@ - dev->get_stats = tulip_get_stats; - dev->do_ioctl = private_ioctl; - dev->set_multicast_list = set_rx_mode; -+#ifdef HAVE_POLL_CONTROLLER -+ dev->poll_controller = &poll_tulip; -+#endif - - if (register_netdev(dev)) - goto err_out_free_ring; -@@ -1902,6 +1906,24 @@ - } - - -+#ifdef HAVE_POLL_CONTROLLER -+ -+/* -+ * Polling 'interrupt' - used by things like netconsole to send skbs -+ * without having to re-enable interrupts. It's not called while -+ * the interrupt routine is executing. 
-+ */ -+ -+static void poll_tulip (struct net_device *dev) -+{ -+ if (!netdump_mode) disable_irq(dev->irq); -+ tulip_interrupt (dev->irq, dev, NULL); -+ if (!netdump_mode) enable_irq(dev->irq); -+} -+ -+#endif -+ -+ - static struct pci_driver tulip_driver = { - name: DRV_NAME, - id_table: tulip_pci_tbl, -Index: linux-2.4.24/drivers/net/e100/e100_main.c -=================================================================== ---- linux-2.4.24.orig/drivers/net/e100/e100_main.c 2004-05-07 16:58:39.000000000 -0400 -+++ linux-2.4.24/drivers/net/e100/e100_main.c 2004-05-07 17:00:21.000000000 -0400 -@@ -664,6 +664,10 @@ - goto err_unregister_netdev; - } - -+#ifdef HAVE_POLL_CONTROLLER -+ dev->poll_controller = e100_netpoll; -+#endif -+ - e100nics++; - - e100_get_speed_duplex_caps(bdp); -Index: linux-2.4.24/drivers/net/e1000/e1000_main.c -=================================================================== ---- linux-2.4.24.orig/drivers/net/e1000/e1000_main.c 2003-11-28 13:26:20.000000000 -0500 -+++ linux-2.4.24/drivers/net/e1000/e1000_main.c 2004-05-07 16:58:39.000000000 -0400 -@@ -182,6 +182,9 @@ - static int e1000_resume(struct pci_dev *pdev); - #endif - -+/* for netdump / net console */ -+static void e1000_netpoll (struct net_device *dev); -+ - struct notifier_block e1000_notifier_reboot = { - .notifier_call = e1000_notify_reboot, - .next = NULL, -@@ -434,6 +437,10 @@ - netdev->vlan_rx_add_vid = e1000_vlan_rx_add_vid; - netdev->vlan_rx_kill_vid = e1000_vlan_rx_kill_vid; - -+#ifdef HAVE_POLL_CONTROLLER -+ netdev->poll_controller = e1000_netpoll; -+#endif -+ - netdev->irq = pdev->irq; - netdev->mem_start = mmio_start; - netdev->mem_end = mmio_start + mmio_len; -@@ -2899,4 +2906,20 @@ - } - #endif - -+#ifdef HAVE_POLL_CONTROLLER -+/* -+ * Polling 'interrupt' - used by things like netconsole to send skbs -+ * without having to re-enable interrupts. It's not called while -+ * the interrupt routine is executing. 
-+ */ -+ -+static void e1000_netpoll (struct net_device *dev) -+{ -+ if (!netdump_mode) disable_irq(dev->irq); -+ e1000_intr (dev->irq, dev, NULL); -+ if (!netdump_mode) enable_irq(dev->irq); -+} -+ -+#endif -+ - /* e1000_main.c */ -Index: linux-2.4.24/drivers/net/tg3.c -=================================================================== ---- linux-2.4.24.orig/drivers/net/tg3.c 2003-11-28 13:26:20.000000000 -0500 -+++ linux-2.4.24/drivers/net/tg3.c 2004-05-07 16:58:39.000000000 -0400 -@@ -216,6 +216,9 @@ - #define tr16(reg) readw(tp->regs + (reg)) - #define tr8(reg) readb(tp->regs + (reg)) - -+/* Added by mark.fasheh@oracle.com to help enable netdump on these cards */ -+static void poll_tg3 (struct net_device *dev); -+ - static void tg3_write_mem(struct tg3 *tp, u32 off, u32 val) - { - unsigned long flags; -@@ -7630,6 +7633,9 @@ - dev->watchdog_timeo = TG3_TX_TIMEOUT; - dev->change_mtu = tg3_change_mtu; - dev->irq = pdev->irq; -+#ifdef HAVE_POLL_CONTROLLER -+ dev->poll_controller = &poll_tg3; -+#endif - - err = tg3_get_invariants(tp); - if (err) { -@@ -7862,5 +7868,23 @@ - pci_unregister_driver(&tg3_driver); - } - -+#ifdef HAVE_POLL_CONTROLLER -+ -+/* -+ * Polling 'interrupt' - used by things like netconsole to send skbs -+ * without having to re-enable interrupts. It's not called while -+ * the interrupt routine is executing. 
-+ */ -+ -+static void poll_tg3 (struct net_device *dev) -+{ -+ if (!netdump_mode) disable_irq(dev->irq); -+ tg3_interrupt (dev->irq, dev, NULL); -+ if (!netdump_mode) enable_irq(dev->irq); -+} -+ -+#endif -+ -+ - module_init(tg3_init); - module_exit(tg3_cleanup); -Index: linux-2.4.24/include/asm-i386/kmap_types.h -=================================================================== ---- linux-2.4.24.orig/include/asm-i386/kmap_types.h 2003-08-25 07:44:43.000000000 -0400 -+++ linux-2.4.24/include/asm-i386/kmap_types.h 2004-05-07 16:59:12.000000000 -0400 -@@ -10,6 +10,7 @@ - KM_BH_IRQ, - KM_SOFTIRQ0, - KM_SOFTIRQ1, -+ KM_NETDUMP, - KM_TYPE_NR - }; - -Index: linux-2.4.24/include/linux/kernel.h -=================================================================== ---- linux-2.4.24.orig/include/linux/kernel.h 2004-05-07 16:56:55.000000000 -0400 -+++ linux-2.4.24/include/linux/kernel.h 2004-05-07 16:58:39.000000000 -0400 -@@ -104,6 +104,9 @@ - - extern void bust_spinlocks(int yes); - extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ -+struct pt_regs; -+extern void (*netdump_func) (struct pt_regs *regs); -+extern int netdump_mode; - - extern int tainted; - extern const char *print_tainted(void); -Index: linux-2.4.24/include/linux/netdevice.h -=================================================================== ---- linux-2.4.24.orig/include/linux/netdevice.h 2003-11-28 13:26:21.000000000 -0500 -+++ linux-2.4.24/include/linux/netdevice.h 2004-05-07 16:58:39.000000000 -0400 -@@ -435,6 +435,9 @@ - unsigned char *haddr); - int (*neigh_setup)(struct net_device *dev, struct neigh_parms *); - int (*accept_fastpath)(struct net_device *, struct dst_entry*); -+#define HAVE_POLL_CONTROLLER -+ void (*poll_controller)(struct net_device *dev); -+ int (*rx_hook)(struct sk_buff *skb); - - /* open/release and usage marking */ - struct module *owner; -Index: linux-2.4.24/kernel/panic.c 
-=================================================================== ---- linux-2.4.24.orig/kernel/panic.c 2004-05-07 16:56:56.000000000 -0400 -+++ linux-2.4.24/kernel/panic.c 2004-05-07 16:58:39.000000000 -0400 -@@ -62,6 +62,8 @@ - vsprintf(buf, fmt, args); - va_end(args); - printk(KERN_EMERG "Kernel panic: %s\n",buf); -+ if (netdump_func) -+ BUG(); - if (in_interrupt()) - printk(KERN_EMERG "In interrupt handler - not syncing\n"); - else if (!current->pid) -Index: linux-2.4.24/net/core/dev.c -=================================================================== ---- linux-2.4.24.orig/net/core/dev.c 2003-11-28 13:26:21.000000000 -0500 -+++ linux-2.4.24/net/core/dev.c 2004-05-07 16:58:39.000000000 -0400 -@@ -1288,6 +1288,13 @@ - - local_irq_save(flags); - -+ if (unlikely(skb->dev->rx_hook != NULL)) { -+ int ret; -+ -+ ret = skb->dev->rx_hook(skb); -+ if (ret == NET_RX_DROP) -+ goto drop; -+ } - netdev_rx_stat[this_cpu].total++; - if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { - if (queue->input_pkt_queue.qlen) { diff --git a/lustre/kernel_patches/patches/linux-2.4.19-bgl-xattr-0.8.54.patch b/lustre/kernel_patches/patches/linux-2.4.19-bgl-xattr-0.8.54.patch deleted file mode 100644 index a6a7e12..0000000 --- a/lustre/kernel_patches/patches/linux-2.4.19-bgl-xattr-0.8.54.patch +++ /dev/null @@ -1,5242 +0,0 @@ - Documentation/Configure.help | 66 ++ - arch/alpha/defconfig | 7 - arch/alpha/kernel/entry.S | 12 - arch/arm/defconfig | 7 - arch/arm/kernel/calls.S | 24 - arch/i386/defconfig | 7 - arch/ia64/defconfig | 7 - arch/ia64/kernel/entry.S | 24 - arch/m68k/defconfig | 7 - arch/mips/defconfig | 7 - arch/mips64/defconfig | 7 - arch/ppc/defconfig | 14 - arch/ppc64/kernel/misc.S | 2 - arch/s390/defconfig | 7 - arch/s390/kernel/entry.S | 24 - arch/s390x/defconfig | 7 - arch/s390x/kernel/entry.S | 24 - arch/s390x/kernel/wrapper32.S | 92 +++ - arch/sparc/defconfig | 7 - arch/sparc/kernel/systbls.S | 10 - arch/sparc64/defconfig | 7 - arch/sparc64/kernel/systbls.S | 20 - 
fs/Config.in | 14 - fs/Makefile | 3 - fs/ext2/Makefile | 4 - fs/ext2/file.c | 5 - fs/ext2/ialloc.c | 2 - fs/ext2/inode.c | 34 - - fs/ext2/namei.c | 14 - fs/ext2/super.c | 29 - fs/ext2/symlink.c | 14 - fs/ext2/xattr.c | 1212 +++++++++++++++++++++++++++++++++++++++++ - fs/ext2/xattr_user.c | 103 +++ - fs/ext3/Makefile | 10 - fs/ext3/file.c | 5 - fs/ext3/ialloc.c | 2 - fs/ext3/inode.c | 35 - - fs/ext3/namei.c | 21 - fs/ext3/super.c | 36 + - fs/ext3/symlink.c | 14 - fs/ext3/xattr.c | 1225 ++++++++++++++++++++++++++++++++++++++++++ - fs/ext3/xattr_user.c | 111 +++ - fs/jfs/jfs_xattr.h | 6 - fs/jfs/xattr.c | 6 - fs/mbcache.c | 648 ++++++++++++++++++++++ - include/asm-arm/unistd.h | 2 - include/asm-ia64/unistd.h | 13 - include/asm-ppc64/unistd.h | 2 - include/asm-s390/unistd.h | 15 - include/asm-s390x/unistd.h | 15 - include/asm-sparc/unistd.h | 24 - include/asm-sparc64/unistd.h | 24 - include/linux/cache_def.h | 15 - include/linux/errno.h | 4 - include/linux/ext2_fs.h | 31 - - include/linux/ext2_xattr.h | 157 +++++ - include/linux/ext3_fs.h | 31 - - include/linux/ext3_jbd.h | 8 - include/linux/ext3_xattr.h | 157 +++++ - include/linux/fs.h | 2 - include/linux/mbcache.h | 69 ++ - kernel/ksyms.c | 4 - mm/vmscan.c | 35 + - fs/ext3/ext3-exports.c | 14 + - 64 files changed, 4355 insertions(+), 195 deletions(-) - -Index: linux-DRV401/arch/ppc/defconfig -=================================================================== ---- linux-DRV401.orig/arch/ppc/defconfig 2004-10-15 10:24:32.000000000 -0700 -+++ linux-DRV401/arch/ppc/defconfig 2004-10-15 11:03:51.000000000 -0700 -@@ -1,6 +1,13 @@ - # - # Automatically generated by make menuconfig: don't edit - # -+CONFIG_EXT3_FS_XATTR=y -+# CONFIG_EXT3_FS_XATTR_SHARING is not set -+# CONFIG_EXT3_FS_XATTR_USER is not set -+# CONFIG_EXT2_FS_XATTR is not set -+# CONFIG_EXT2_FS_XATTR_SHARING is not set -+# CONFIG_EXT2_FS_XATTR_USER is not set -+# CONFIG_FS_MBCACHE is not set - # CONFIG_UID16 is not set - # CONFIG_RWSEM_GENERIC_SPINLOCK is not 
set - CONFIG_RWSEM_XCHGADD_ALGORITHM=y -Index: linux-DRV401/fs/Config.in -=================================================================== ---- linux-DRV401.orig/fs/Config.in 2004-10-15 10:24:06.000000000 -0700 -+++ linux-DRV401/fs/Config.in 2004-10-15 11:03:51.000000000 -0700 -@@ -22,6 +22,11 @@ - dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL - - tristate 'Ext3 journalling file system support' CONFIG_EXT3_FS -+dep_mbool ' Ext3 extended attributes' CONFIG_EXT3_FS_XATTR $CONFIG_EXT3_FS -+dep_bool ' Ext3 extended attribute block sharing' \ -+ CONFIG_EXT3_FS_XATTR_SHARING $CONFIG_EXT3_FS_XATTR -+dep_bool ' Ext3 extended user attributes' \ -+ CONFIG_EXT3_FS_XATTR_USER $CONFIG_EXT3_FS_XATTR - # CONFIG_JBD could be its own option (even modular), but until there are - # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS - # dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS -@@ -77,6 +82,11 @@ - tristate 'ROM file system support' CONFIG_ROMFS_FS - - tristate 'Second extended fs support' CONFIG_EXT2_FS -+dep_mbool ' Ext2 extended attributes' CONFIG_EXT2_FS_XATTR $CONFIG_EXT2_FS -+dep_bool ' Ext2 extended attribute block sharing' \ -+ CONFIG_EXT2_FS_XATTR_SHARING $CONFIG_EXT2_FS_XATTR -+dep_bool ' Ext2 extended user attributes' \ -+ CONFIG_EXT2_FS_XATTR_USER $CONFIG_EXT2_FS_XATTR - - tristate 'System V/Xenix/V7/Coherent file system support' CONFIG_SYSV_FS - -@@ -156,6 +166,10 @@ - fi - fi - -+# Meta block cache for Extended Attributes (ext2/ext3) -+#tristate 'Meta block cache' CONFIG_FS_MBCACHE -+define_tristate CONFIG_FS_MBCACHE y -+ - mainmenu_option next_comment - comment 'Partition Types' - source fs/partitions/Config.in -Index: linux-DRV401/fs/Makefile -=================================================================== ---- linux-DRV401.orig/fs/Makefile 2004-10-15 10:39:15.000000000 -0700 -+++ linux-DRV401/fs/Makefile 2004-10-15 11:03:51.000000000 -0700 -@@ -14,7 
+14,7 @@ - super.o block_dev.o char_dev.o stat.o exec.o pipe.o namei.o \ - fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ - dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \ -- filesystems.o namespace.o seq_file.o quota.o -+ filesystems.o namespace.o seq_file.o quota.o xattr.o - - ifeq ($(CONFIG_QUOTA),y) - obj-y += dquot.o -@@ -76,6 +76,9 @@ - - obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o - -+export-objs += mbcache.o -+obj-$(CONFIG_FS_MBCACHE) += mbcache.o -+ - # persistent filesystems - obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o)) - -Index: linux-DRV401/fs/ext2/Makefile -=================================================================== ---- linux-DRV401.orig/fs/ext2/Makefile 2004-10-15 10:23:59.000000000 -0700 -+++ linux-DRV401/fs/ext2/Makefile 2004-10-15 11:03:51.000000000 -0700 -@@ -13,4 +13,8 @@ - ioctl.o namei.o super.o symlink.o - obj-m := $(O_TARGET) - -+export-objs += xattr.o -+obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o -+obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o -+ - include $(TOPDIR)/Rules.make -Index: linux-DRV401/fs/ext2/file.c -=================================================================== ---- linux-DRV401.orig/fs/ext2/file.c 2004-10-15 10:23:59.000000000 -0700 -+++ linux-DRV401/fs/ext2/file.c 2004-10-15 11:03:51.000000000 -0700 -@@ -20,6 +20,7 @@ - - #include - #include -+#include - #include - - /* -@@ -51,4 +52,8 @@ - - struct inode_operations ext2_file_inode_operations = { - truncate: ext2_truncate, -+ setxattr: ext2_setxattr, -+ getxattr: ext2_getxattr, -+ listxattr: ext2_listxattr, -+ removexattr: ext2_removexattr, - }; -Index: linux-DRV401/fs/ext2/ialloc.c -=================================================================== ---- linux-DRV401.orig/fs/ext2/ialloc.c 2004-10-15 10:23:59.000000000 -0700 -+++ linux-DRV401/fs/ext2/ialloc.c 2004-10-15 11:03:51.000000000 -0700 -@@ -15,6 +15,7 @@ - #include - #include - #include -+#include - #include - #include - -@@ -167,6 +168,7 @@ - */ - if (!is_bad_inode(inode)) { - /* 
Quota is already initialized in iput() */ -+ ext2_xattr_delete_inode(inode); - DQUOT_FREE_INODE(inode); - DQUOT_DROP(inode); - } -Index: linux-DRV401/fs/ext2/inode.c -=================================================================== ---- linux-DRV401.orig/fs/ext2/inode.c 2004-10-15 10:24:00.000000000 -0700 -+++ linux-DRV401/fs/ext2/inode.c 2004-10-15 11:03:51.000000000 -0700 -@@ -39,6 +39,18 @@ - static int ext2_update_inode(struct inode * inode, int do_sync); - - /* -+ * Test whether an inode is a fast symlink. -+ */ -+static inline int ext2_inode_is_fast_symlink(struct inode *inode) -+{ -+ int ea_blocks = inode->u.ext2_i.i_file_acl ? -+ (inode->i_sb->s_blocksize >> 9) : 0; -+ -+ return (S_ISLNK(inode->i_mode) && -+ inode->i_blocks - ea_blocks == 0); -+} -+ -+/* - * Called at each iput() - */ - void ext2_put_inode (struct inode * inode) -@@ -53,9 +65,7 @@ - { - lock_kernel(); - -- if (is_bad_inode(inode) || -- inode->i_ino == EXT2_ACL_IDX_INO || -- inode->i_ino == EXT2_ACL_DATA_INO) -+ if (is_bad_inode(inode)) - goto no_delete; - inode->u.ext2_i.i_dtime = CURRENT_TIME; - mark_inode_dirty(inode); -@@ -792,6 +802,8 @@ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; -+ if (ext2_inode_is_fast_symlink(inode)) -+ return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - -@@ -879,8 +891,7 @@ - unsigned long offset; - struct ext2_group_desc * gdp; - -- if ((inode->i_ino != EXT2_ROOT_INO && inode->i_ino != EXT2_ACL_IDX_INO && -- inode->i_ino != EXT2_ACL_DATA_INO && -+ if ((inode->i_ino != EXT2_ROOT_INO && - inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) || - inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) { - ext2_error (inode->i_sb, "ext2_read_inode", -@@ -965,10 +976,7 @@ - for (block = 0; block < EXT2_N_BLOCKS; block++) - inode->u.ext2_i.i_data[block] = raw_inode->i_block[block]; - -- if (inode->i_ino == EXT2_ACL_IDX_INO || -- inode->i_ino == EXT2_ACL_DATA_INO) -- /* Nothing to do */ ; -- 
else if (S_ISREG(inode->i_mode)) { -+ if (S_ISREG(inode->i_mode)) { - inode->i_op = &ext2_file_inode_operations; - inode->i_fop = &ext2_file_operations; - inode->i_mapping->a_ops = &ext2_aops; -@@ -977,15 +985,17 @@ - inode->i_fop = &ext2_dir_operations; - inode->i_mapping->a_ops = &ext2_aops; - } else if (S_ISLNK(inode->i_mode)) { -- if (!inode->i_blocks) -+ if (ext2_inode_is_fast_symlink(inode)) - inode->i_op = &ext2_fast_symlink_inode_operations; - else { -- inode->i_op = &page_symlink_inode_operations; -+ inode->i_op = &ext2_symlink_inode_operations; - inode->i_mapping->a_ops = &ext2_aops; - } -- } else -+ } else { -+ inode->i_op = &ext2_special_inode_operations; - init_special_inode(inode, inode->i_mode, - le32_to_cpu(raw_inode->i_block[0])); -+ } - brelse (bh); - inode->i_attr_flags = 0; - if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL) { -Index: linux-DRV401/fs/ext2/namei.c -=================================================================== ---- linux-DRV401.orig/fs/ext2/namei.c 2004-10-15 10:23:59.000000000 -0700 -+++ linux-DRV401/fs/ext2/namei.c 2004-10-15 11:03:51.000000000 -0700 -@@ -31,6 +31,7 @@ - - #include - #include -+#include - #include - - /* -@@ -136,7 +137,7 @@ - - if (l > sizeof (inode->u.ext2_i.i_data)) { - /* slow symlink */ -- inode->i_op = &page_symlink_inode_operations; -+ inode->i_op = &ext2_symlink_inode_operations; - inode->i_mapping->a_ops = &ext2_aops; - err = block_symlink(inode, symname, l); - if (err) -@@ -345,4 +346,15 @@ - rmdir: ext2_rmdir, - mknod: ext2_mknod, - rename: ext2_rename, -+ setxattr: ext2_setxattr, -+ getxattr: ext2_getxattr, -+ listxattr: ext2_listxattr, -+ removexattr: ext2_removexattr, -+}; -+ -+struct inode_operations ext2_special_inode_operations = { -+ setxattr: ext2_setxattr, -+ getxattr: ext2_getxattr, -+ listxattr: ext2_listxattr, -+ removexattr: ext2_removexattr, - }; -Index: linux-DRV401/fs/ext2/super.c -=================================================================== ---- 
linux-DRV401.orig/fs/ext2/super.c 2004-10-15 10:23:59.000000000 -0700 -+++ linux-DRV401/fs/ext2/super.c 2004-10-15 11:03:51.000000000 -0700 -@@ -21,6 +21,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -125,6 +126,7 @@ - int db_count; - int i; - -+ ext2_xattr_put_super(sb); - if (!(sb->s_flags & MS_RDONLY)) { - struct ext2_super_block *es = EXT2_SB(sb)->s_es; - -@@ -175,6 +177,13 @@ - this_char = strtok (NULL, ",")) { - if ((value = strchr (this_char, '=')) != NULL) - *value++ = 0; -+#ifdef CONFIG_EXT2_FS_XATTR_USER -+ if (!strcmp (this_char, "user_xattr")) -+ set_opt (*mount_options, XATTR_USER); -+ else if (!strcmp (this_char, "nouser_xattr")) -+ clear_opt (*mount_options, XATTR_USER); -+ else -+#endif - if (!strcmp (this_char, "bsddf")) - clear_opt (*mount_options, MINIX_DF); - else if (!strcmp (this_char, "nouid32")) { -@@ -424,6 +433,9 @@ - blocksize = BLOCK_SIZE; - - sb->u.ext2_sb.s_mount_opt = 0; -+#ifdef CONFIG_EXT2_FS_XATTR_USER -+ /* set_opt (sb->u.ext2_sb.s_mount_opt, XATTR_USER); */ -+#endif - if (!parse_options ((char *) data, &sb_block, &resuid, &resgid, - &sb->u.ext2_sb.s_mount_opt)) { - return NULL; -@@ -810,12 +822,27 @@ - - static int __init init_ext2_fs(void) - { -- return register_filesystem(&ext2_fs_type); -+ int error = init_ext2_xattr(); -+ if (error) -+ return error; -+ error = init_ext2_xattr_user(); -+ if (error) -+ goto fail; -+ error = register_filesystem(&ext2_fs_type); -+ if (!error) -+ return 0; -+ -+ exit_ext2_xattr_user(); -+fail: -+ exit_ext2_xattr(); -+ return error; - } - - static void __exit exit_ext2_fs(void) - { - unregister_filesystem(&ext2_fs_type); -+ exit_ext2_xattr_user(); -+ exit_ext2_xattr(); - } - - EXPORT_NO_SYMBOLS; -Index: linux-DRV401/fs/ext2/symlink.c -=================================================================== ---- linux-DRV401.orig/fs/ext2/symlink.c 2004-10-15 10:23:59.000000000 -0700 -+++ linux-DRV401/fs/ext2/symlink.c 2004-10-15 11:03:51.000000000 -0700 -@@ -19,6 
+19,7 @@ - - #include - #include -+#include - - static int ext2_readlink(struct dentry *dentry, char *buffer, int buflen) - { -@@ -32,7 +33,20 @@ - return vfs_follow_link(nd, s); - } - -+struct inode_operations ext2_symlink_inode_operations = { -+ readlink: page_readlink, -+ follow_link: page_follow_link, -+ setxattr: ext2_setxattr, -+ getxattr: ext2_getxattr, -+ listxattr: ext2_listxattr, -+ removexattr: ext2_removexattr, -+}; -+ - struct inode_operations ext2_fast_symlink_inode_operations = { - readlink: ext2_readlink, - follow_link: ext2_follow_link, -+ setxattr: ext2_setxattr, -+ getxattr: ext2_getxattr, -+ listxattr: ext2_listxattr, -+ removexattr: ext2_removexattr, - }; -Index: linux-DRV401/fs/ext2/xattr.c -=================================================================== ---- linux-DRV401.orig/fs/ext2/xattr.c 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/fs/ext2/xattr.c 2004-10-15 11:03:51.000000000 -0700 -@@ -0,0 +1,1212 @@ -+/* -+ * linux/fs/ext2/xattr.c -+ * -+ * Copyright (C) 2001 by Andreas Gruenbacher, -+ * -+ * Fix by Harrison Xing . -+ * Extended attributes for symlinks and special files added per -+ * suggestion of Luka Renko . -+ */ -+ -+/* -+ * Extended attributes are stored on disk blocks allocated outside of -+ * any inode. The i_file_acl field is then made to point to this allocated -+ * block. If all extended attributes of an inode are identical, these -+ * inodes may share the same extended attribute block. Such situations -+ * are automatically detected by keeping a cache of recent attribute block -+ * numbers and hashes over the block's contents in memory. -+ * -+ * -+ * Extended attribute block layout: -+ * -+ * +------------------+ -+ * | header | -+ * | entry 1 | | -+ * | entry 2 | | growing downwards -+ * | entry 3 | v -+ * | four null bytes | -+ * | . . . | -+ * | value 1 | ^ -+ * | value 3 | | growing upwards -+ * | value 2 | | -+ * +------------------+ -+ * -+ * The block header is followed by multiple entry descriptors. 
These entry -+ * descriptors are variable in size, and aligned to EXT2_XATTR_PAD -+ * byte boundaries. The entry descriptors are sorted by attribute name, -+ * so that two extended attribute blocks can be compared efficiently. -+ * -+ * Attribute values are aligned to the end of the block, stored in -+ * no specific order. They are also padded to EXT2_XATTR_PAD byte -+ * boundaries. No additional gaps are left between them. -+ * -+ * Locking strategy -+ * ---------------- -+ * The VFS already holds the BKL and the inode->i_sem semaphore when any of -+ * the xattr inode operations are called, so we are guaranteed that only one -+ * process accesses extended attributes of an inode at any time. -+ * -+ * For writing we also grab the ext2_xattr_sem semaphore. This ensures that -+ * only a single process is modifying an extended attribute block, even -+ * if the block is shared among inodes. -+ * -+ * Note for porting to 2.5 -+ * ----------------------- -+ * The BKL will no longer be held in the xattr inode operations. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* These symbols may be needed by a module. */ -+EXPORT_SYMBOL(ext2_xattr_register); -+EXPORT_SYMBOL(ext2_xattr_unregister); -+EXPORT_SYMBOL(ext2_xattr_get); -+EXPORT_SYMBOL(ext2_xattr_list); -+EXPORT_SYMBOL(ext2_xattr_set); -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) -+# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1) -+#endif -+ -+#define HDR(bh) ((struct ext2_xattr_header *)((bh)->b_data)) -+#define ENTRY(ptr) ((struct ext2_xattr_entry *)(ptr)) -+#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1) -+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) -+ -+#ifdef EXT2_XATTR_DEBUG -+# define ea_idebug(inode, f...) do { \ -+ printk(KERN_DEBUG "inode %s:%ld: ", \ -+ kdevname(inode->i_dev), inode->i_ino); \ -+ printk(f); \ -+ printk("\n"); \ -+ } while (0) -+# define ea_bdebug(bh, f...) 
do { \ -+ printk(KERN_DEBUG "block %s:%ld: ", \ -+ kdevname(bh->b_dev), bh->b_blocknr); \ -+ printk(f); \ -+ printk("\n"); \ -+ } while (0) -+#else -+# define ea_idebug(f...) -+# define ea_bdebug(f...) -+#endif -+ -+static int ext2_xattr_set2(struct inode *, struct buffer_head *, -+ struct ext2_xattr_header *); -+ -+#ifdef CONFIG_EXT2_FS_XATTR_SHARING -+ -+static int ext2_xattr_cache_insert(struct buffer_head *); -+static struct buffer_head *ext2_xattr_cache_find(struct inode *, -+ struct ext2_xattr_header *); -+static void ext2_xattr_cache_remove(struct buffer_head *); -+static void ext2_xattr_rehash(struct ext2_xattr_header *, -+ struct ext2_xattr_entry *); -+ -+static struct mb_cache *ext2_xattr_cache; -+ -+#else -+# define ext2_xattr_cache_insert(bh) 0 -+# define ext2_xattr_cache_find(inode, header) NULL -+# define ext2_xattr_cache_remove(bh) while(0) {} -+# define ext2_xattr_rehash(header, entry) while(0) {} -+#endif -+ -+/* -+ * If a file system does not share extended attributes among inodes, -+ * we should not need the ext2_xattr_sem semaphore. However, the -+ * filesystem may still contain shared blocks, so we always take -+ * the lock. -+ */ -+ -+DECLARE_MUTEX(ext2_xattr_sem); -+ -+static inline int -+ext2_xattr_new_block(struct inode *inode, int * errp, int force) -+{ -+ struct super_block *sb = inode->i_sb; -+ int goal = le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block) + -+ EXT2_I(inode)->i_block_group * EXT2_BLOCKS_PER_GROUP(sb); -+ -+ /* How can we enforce the allocation? */ -+ int block = ext2_new_block(inode, goal, 0, 0, errp); -+#ifdef OLD_QUOTAS -+ if (!*errp) -+ inode->i_blocks += inode->i_sb->s_blocksize >> 9; -+#endif -+ return block; -+} -+ -+static inline int -+ext2_xattr_quota_alloc(struct inode *inode, int force) -+{ -+ /* How can we enforce the allocation? 
*/ -+#ifdef OLD_QUOTAS -+ int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1); -+ if (!error) -+ inode->i_blocks += inode->i_sb->s_blocksize >> 9; -+#else -+ int error = DQUOT_ALLOC_BLOCK(inode, 1); -+#endif -+ return error; -+} -+ -+#ifdef OLD_QUOTAS -+ -+static inline void -+ext2_xattr_quota_free(struct inode *inode) -+{ -+ DQUOT_FREE_BLOCK(inode->i_sb, inode, 1); -+ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; -+} -+ -+static inline void -+ext2_xattr_free_block(struct inode * inode, unsigned long block) -+{ -+ ext2_free_blocks(inode, block, 1); -+ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; -+} -+ -+#else -+# define ext2_xattr_quota_free(inode) \ -+ DQUOT_FREE_BLOCK(inode, 1) -+# define ext2_xattr_free_block(inode, block) \ -+ ext2_free_blocks(inode, block, 1) -+#endif -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18) -+ -+static inline struct buffer_head * -+sb_bread(struct super_block *sb, int block) -+{ -+ return bread(sb->s_dev, block, sb->s_blocksize); -+} -+ -+static inline struct buffer_head * -+sb_getblk(struct super_block *sb, int block) -+{ -+ return getblk(sb->s_dev, block, sb->s_blocksize); -+} -+ -+#endif -+ -+struct ext2_xattr_handler *ext2_xattr_handlers[EXT2_XATTR_INDEX_MAX]; -+rwlock_t ext2_handler_lock = RW_LOCK_UNLOCKED; -+ -+int -+ext2_xattr_register(int name_index, struct ext2_xattr_handler *handler) -+{ -+ int error = -EINVAL; -+ -+ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) { -+ write_lock(&ext2_handler_lock); -+ if (!ext2_xattr_handlers[name_index-1]) { -+ ext2_xattr_handlers[name_index-1] = handler; -+ error = 0; -+ } -+ write_unlock(&ext2_handler_lock); -+ } -+ return error; -+} -+ -+void -+ext2_xattr_unregister(int name_index, struct ext2_xattr_handler *handler) -+{ -+ /* Both bounds must hold: name_index-1 indexes ext2_xattr_handlers[]. */ -+ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) { -+ write_lock(&ext2_handler_lock); -+ ext2_xattr_handlers[name_index-1] = NULL; -+ write_unlock(&ext2_handler_lock); -+ } -+} -+ -+static inline const char * -+strcmp_prefix(const 
char *a, const char *a_prefix) -+{ -+ while (*a_prefix && *a == *a_prefix) { -+ a++; -+ a_prefix++; -+ } -+ return *a_prefix ? NULL : a; -+} -+ -+/* -+ * Decode the extended attribute name, and translate it into -+ * the name_index and name suffix. -+ */ -+static struct ext2_xattr_handler * -+ext2_xattr_resolve_name(const char **name) -+{ -+ struct ext2_xattr_handler *handler = NULL; -+ int i; -+ -+ if (!*name) -+ return NULL; -+ read_lock(&ext2_handler_lock); -+ for (i=0; i<EXT2_XATTR_INDEX_MAX; i++) { -+ if (ext2_xattr_handlers[i]) { -+ const char *n = strcmp_prefix(*name, -+ ext2_xattr_handlers[i]->prefix); -+ if (n) { -+ handler = ext2_xattr_handlers[i]; -+ *name = n; -+ break; -+ } -+ } -+ } -+ read_unlock(&ext2_handler_lock); -+ return handler; -+} -+ -+static inline struct ext2_xattr_handler * -+ext2_xattr_handler(int name_index) -+{ -+ struct ext2_xattr_handler *handler = NULL; -+ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) { -+ read_lock(&ext2_handler_lock); -+ handler = ext2_xattr_handlers[name_index-1]; -+ read_unlock(&ext2_handler_lock); -+ } -+ return handler; -+} -+ -+/* -+ * Inode operation getxattr() -+ * -+ * dentry->d_inode->i_sem down -+ * BKL held [before 2.5.x] -+ */ -+ssize_t -+ext2_getxattr(struct dentry *dentry, const char *name, -+ void *buffer, size_t size) -+{ -+ struct ext2_xattr_handler *handler; -+ struct inode *inode = dentry->d_inode; -+ -+ handler = ext2_xattr_resolve_name(&name); -+ if (!handler) -+ return -ENOTSUP; -+ return handler->get(inode, name, buffer, size); -+} -+ -+/* -+ * Inode operation listxattr() -+ * -+ * dentry->d_inode->i_sem down -+ * BKL held [before 2.5.x] -+ */ -+ssize_t -+ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) -+{ -+ return ext2_xattr_list(dentry->d_inode, buffer, size); -+} -+ -+/* -+ * Inode operation setxattr() -+ * -+ * dentry->d_inode->i_sem down -+ * BKL held [before 2.5.x] -+ */ -+int -+ext2_setxattr(struct dentry *dentry, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ struct ext2_xattr_handler *handler; -+ struct inode *inode = dentry->d_inode; -+ -+ if (size == 0) 
-+ value = ""; /* empty EA, do not remove */ -+ handler = ext2_xattr_resolve_name(&name); -+ if (!handler) -+ return -ENOTSUP; -+ return handler->set(inode, name, value, size, flags); -+} -+ -+/* -+ * Inode operation removexattr() -+ * -+ * dentry->d_inode->i_sem down -+ * BKL held [before 2.5.x] -+ */ -+int -+ext2_removexattr(struct dentry *dentry, const char *name) -+{ -+ struct ext2_xattr_handler *handler; -+ struct inode *inode = dentry->d_inode; -+ -+ handler = ext2_xattr_resolve_name(&name); -+ if (!handler) -+ return -ENOTSUP; -+ return handler->set(inode, name, NULL, 0, XATTR_REPLACE); -+} -+ -+/* -+ * ext2_xattr_get() -+ * -+ * Copy an extended attribute into the buffer -+ * provided, or compute the buffer size required. -+ * Buffer is NULL to compute the size of the buffer required. -+ * -+ * Returns a negative error number on failure, or the number of bytes -+ * used / required on success. -+ */ -+int -+ext2_xattr_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t buffer_size) -+{ -+ struct buffer_head *bh = NULL; -+ struct ext2_xattr_entry *entry; -+ unsigned int block, size; -+ char *end; -+ int name_len, error; -+ -+ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", -+ name_index, name, buffer, (long)buffer_size); -+ -+ if (name == NULL) -+ return -EINVAL; -+ if (!EXT2_I(inode)->i_file_acl) -+ return -ENOATTR; -+ block = EXT2_I(inode)->i_file_acl; -+ ea_idebug(inode, "reading block %d", block); -+ bh = sb_bread(inode->i_sb, block); -+ if (!bh) -+ return -EIO; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); -+ end = bh->b_data + bh->b_size; -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ /* find named attribute */ -+ name_len = strlen(name); -+ -+ 
error = -ERANGE; -+ if (name_len > 255) -+ goto cleanup; -+ entry = FIRST_ENTRY(bh); -+ while (!IS_LAST_ENTRY(entry)) { -+ struct ext2_xattr_entry *next = -+ EXT2_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (name_index == entry->e_name_index && -+ name_len == entry->e_name_len && -+ memcmp(name, entry->e_name, name_len) == 0) -+ goto found; -+ entry = next; -+ } -+ /* Check the remaining name entries */ -+ while (!IS_LAST_ENTRY(entry)) { -+ struct ext2_xattr_entry *next = -+ EXT2_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ entry = next; -+ } -+ if (ext2_xattr_cache_insert(bh)) -+ ea_idebug(inode, "cache insert failed"); -+ error = -ENOATTR; -+ goto cleanup; -+found: -+ /* check the buffer size */ -+ if (entry->e_value_block != 0) -+ goto bad_block; -+ size = le32_to_cpu(entry->e_value_size); -+ if (size > inode->i_sb->s_blocksize || -+ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) -+ goto bad_block; -+ -+ if (ext2_xattr_cache_insert(bh)) -+ ea_idebug(inode, "cache insert failed"); -+ if (buffer) { -+ error = -ERANGE; -+ if (size > buffer_size) -+ goto cleanup; -+ /* return value of attribute */ -+ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), -+ size); -+ } -+ error = size; -+ -+cleanup: -+ brelse(bh); -+ -+ return error; -+} -+ -+/* -+ * ext2_xattr_list() -+ * -+ * Copy a list of attribute names into the buffer -+ * provided, or compute the buffer size required. -+ * Buffer is NULL to compute the size of the buffer required. -+ * -+ * Returns a negative error number on failure, or the number of bytes -+ * used / required on success. 
-+ */ -+int -+ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) -+{ -+ struct buffer_head *bh = NULL; -+ struct ext2_xattr_entry *entry; -+ unsigned int block, size = 0; -+ char *buf, *end; -+ int error; -+ -+ ea_idebug(inode, "buffer=%p, buffer_size=%ld", -+ buffer, (long)buffer_size); -+ -+ if (!EXT2_I(inode)->i_file_acl) -+ return 0; -+ block = EXT2_I(inode)->i_file_acl; -+ ea_idebug(inode, "reading block %d", block); -+ bh = sb_bread(inode->i_sb, block); -+ if (!bh) -+ return -EIO; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); -+ end = bh->b_data + bh->b_size; -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ /* compute the size required for the list of attribute names */ -+ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); -+ entry = EXT2_XATTR_NEXT(entry)) { -+ struct ext2_xattr_handler *handler; -+ struct ext2_xattr_entry *next = -+ EXT2_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ -+ handler = ext2_xattr_handler(entry->e_name_index); -+ if (handler) -+ size += handler->list(NULL, inode, entry->e_name, -+ entry->e_name_len); -+ } -+ -+ if (ext2_xattr_cache_insert(bh)) -+ ea_idebug(inode, "cache insert failed"); -+ if (!buffer) { -+ error = size; -+ goto cleanup; -+ } else { -+ error = -ERANGE; -+ if (size > buffer_size) -+ goto cleanup; -+ } -+ -+ /* list the attribute names */ -+ buf = buffer; -+ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); -+ entry = EXT2_XATTR_NEXT(entry)) { -+ struct ext2_xattr_handler *handler; -+ -+ handler = ext2_xattr_handler(entry->e_name_index); -+ if (handler) -+ buf += handler->list(buf, inode, entry->e_name, -+ entry->e_name_len); -+ } -+ error = size; -+ -+cleanup: -+ brelse(bh); -+ -+ return error; -+} -+ -+/* 
-+ * If the EXT2_FEATURE_COMPAT_EXT_ATTR feature of this file system is -+ * not set, set it. -+ */ -+static void ext2_xattr_update_super_block(struct super_block *sb) -+{ -+ if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR)) -+ return; -+ -+ lock_super(sb); -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) -+ EXT2_SB(sb)->s_feature_compat |= EXT2_FEATURE_COMPAT_EXT_ATTR; -+#endif -+ EXT2_SB(sb)->s_es->s_feature_compat |= -+ cpu_to_le32(EXT2_FEATURE_COMPAT_EXT_ATTR); -+ sb->s_dirt = 1; -+ mark_buffer_dirty(EXT2_SB(sb)->s_sbh); -+ unlock_super(sb); -+} -+ -+/* -+ * ext2_xattr_set() -+ * -+ * Create, replace or remove an extended attribute for this inode. Buffer -+ * is NULL to remove an existing extended attribute, and non-NULL to -+ * either replace an existing extended attribute, or create a new extended -+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE -+ * specify that an extended attribute must exist and must not exist -+ * previous to the call, respectively. -+ * -+ * Returns 0, or a negative error number on failure. -+ */ -+int -+ext2_xattr_set(struct inode *inode, int name_index, const char *name, -+ const void *value, size_t value_len, int flags) -+{ -+ struct super_block *sb = inode->i_sb; -+ struct buffer_head *bh = NULL; -+ struct ext2_xattr_header *header = NULL; -+ struct ext2_xattr_entry *here, *last; -+ unsigned int name_len; -+ int block = EXT2_I(inode)->i_file_acl; -+ int min_offs = sb->s_blocksize, not_found = 1, free, error; -+ char *end; -+ -+ /* -+ * header -- Points either into bh, or to a temporarily -+ * allocated buffer. -+ * here -- The named entry found, or the place for inserting, within -+ * the block pointed to by header. -+ * last -- Points right after the last named entry within the block -+ * pointed to by header. -+ * min_offs -- The offset of the first value (values are aligned -+ * towards the end of the block). -+ * end -- Points right after the block pointed to by header. 
-+ */ -+ -+ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", -+ name_index, name, value, (long)value_len); -+ -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -+ return -EPERM; -+ if (value == NULL) -+ value_len = 0; -+ if (name == NULL) -+ return -EINVAL; -+ name_len = strlen(name); -+ if (name_len > 255 || value_len > sb->s_blocksize) -+ return -ERANGE; -+ down(&ext2_xattr_sem); -+ -+ if (block) { -+ /* The inode already has an extended attribute block. */ -+ -+ bh = sb_bread(sb, block); -+ error = -EIO; -+ if (!bh) -+ goto cleanup; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), -+ le32_to_cpu(HDR(bh)->h_refcount)); -+ header = HDR(bh); -+ end = bh->b_data + bh->b_size; -+ if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || -+ header->h_blocks != cpu_to_le32(1)) { -+bad_block: ext2_error(sb, "ext2_xattr_set", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ /* Find the named attribute. */ -+ here = FIRST_ENTRY(bh); -+ while (!IS_LAST_ENTRY(here)) { -+ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (!here->e_value_block && here->e_value_size) { -+ int offs = le16_to_cpu(here->e_value_offs); -+ if (offs < min_offs) -+ min_offs = offs; -+ } -+ not_found = name_index - here->e_name_index; -+ if (!not_found) -+ not_found = name_len - here->e_name_len; -+ if (!not_found) -+ not_found = memcmp(name, here->e_name,name_len); -+ if (not_found <= 0) -+ break; -+ here = next; -+ } -+ last = here; -+ /* We still need to compute min_offs and last. 
*/ -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (!last->e_value_block && last->e_value_size) { -+ int offs = le16_to_cpu(last->e_value_offs); -+ if (offs < min_offs) -+ min_offs = offs; -+ } -+ last = next; -+ } -+ -+ /* Check whether we have enough space left. */ -+ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); -+ } else { -+ /* We will use a new extended attribute block. */ -+ free = sb->s_blocksize - -+ sizeof(struct ext2_xattr_header) - sizeof(__u32); -+ here = last = NULL; /* avoid gcc uninitialized warning. */ -+ } -+ -+ if (not_found) { -+ /* Request to remove a nonexistent attribute? */ -+ error = -ENOATTR; -+ if (flags & XATTR_REPLACE) -+ goto cleanup; -+ error = 0; -+ if (value == NULL) -+ goto cleanup; -+ else -+ free -= EXT2_XATTR_LEN(name_len); -+ } else { -+ /* Request to create an existing attribute? */ -+ error = -EEXIST; -+ if (flags & XATTR_CREATE) -+ goto cleanup; -+ if (!here->e_value_block && here->e_value_size) { -+ unsigned int size = le32_to_cpu(here->e_value_size); -+ -+ if (le16_to_cpu(here->e_value_offs) + size > -+ sb->s_blocksize || size > sb->s_blocksize) -+ goto bad_block; -+ free += EXT2_XATTR_SIZE(size); -+ } -+ } -+ free -= EXT2_XATTR_SIZE(value_len); -+ error = -ENOSPC; -+ if (free < 0) -+ goto cleanup; -+ -+ /* Here we know that we can set the new attribute. 
*/ -+ -+ if (header) { -+ if (header->h_refcount == cpu_to_le32(1)) { -+ ea_bdebug(bh, "modifying in-place"); -+ ext2_xattr_cache_remove(bh); -+ } else { -+ int offset; -+ -+ ea_bdebug(bh, "cloning"); -+ header = kmalloc(bh->b_size, GFP_KERNEL); -+ error = -ENOMEM; -+ if (header == NULL) -+ goto cleanup; -+ memcpy(header, HDR(bh), bh->b_size); -+ header->h_refcount = cpu_to_le32(1); -+ offset = (char *)header - bh->b_data; -+ here = ENTRY((char *)here + offset); -+ last = ENTRY((char *)last + offset); -+ } -+ } else { -+ /* Allocate a buffer where we construct the new block. */ -+ header = kmalloc(sb->s_blocksize, GFP_KERNEL); -+ error = -ENOMEM; -+ if (header == NULL) -+ goto cleanup; -+ memset(header, 0, sb->s_blocksize); -+ end = (char *)header + sb->s_blocksize; -+ header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC); -+ header->h_blocks = header->h_refcount = cpu_to_le32(1); -+ last = here = ENTRY(header+1); -+ } -+ -+ if (not_found) { -+ /* Insert the new name. */ -+ int size = EXT2_XATTR_LEN(name_len); -+ int rest = (char *)last - (char *)here; -+ memmove((char *)here + size, here, rest); -+ memset(here, 0, size); -+ here->e_name_index = name_index; -+ here->e_name_len = name_len; -+ memcpy(here->e_name, name, name_len); -+ } else { -+ /* Remove the old value. */ -+ if (!here->e_value_block && here->e_value_size) { -+ char *first_val = (char *)header + min_offs; -+ int offs = le16_to_cpu(here->e_value_offs); -+ char *val = (char *)header + offs; -+ size_t size = EXT2_XATTR_SIZE( -+ le32_to_cpu(here->e_value_size)); -+ memmove(first_val + size, first_val, val - first_val); -+ memset(first_val, 0, size); -+ here->e_value_offs = 0; -+ min_offs += size; -+ -+ /* Adjust all value offsets. 
*/ -+ last = ENTRY(header+1); -+ while (!IS_LAST_ENTRY(last)) { -+ int o = le16_to_cpu(last->e_value_offs); -+ if (!last->e_value_block && o < offs) -+ last->e_value_offs = -+ cpu_to_le16(o + size); -+ last = EXT2_XATTR_NEXT(last); -+ } -+ } -+ if (value == NULL) { -+ /* Remove this attribute. */ -+ if (EXT2_XATTR_NEXT(ENTRY(header+1)) == last) { -+ /* This block is now empty. */ -+ error = ext2_xattr_set2(inode, bh, NULL); -+ goto cleanup; -+ } else { -+ /* Remove the old name. */ -+ int size = EXT2_XATTR_LEN(name_len); -+ last = ENTRY((char *)last - size); -+ memmove(here, (char*)here + size, -+ (char*)last - (char*)here); -+ memset(last, 0, size); -+ } -+ } -+ } -+ -+ if (value != NULL) { -+ /* Insert the new value. */ -+ here->e_value_size = cpu_to_le32(value_len); -+ if (value_len) { -+ size_t size = EXT2_XATTR_SIZE(value_len); -+ char *val = (char *)header + min_offs - size; -+ here->e_value_offs = -+ cpu_to_le16((char *)val - (char *)header); -+ memset(val + size - EXT2_XATTR_PAD, 0, -+ EXT2_XATTR_PAD); /* Clear the pad bytes. */ -+ memcpy(val, value, value_len); -+ } -+ } -+ ext2_xattr_rehash(header, here); -+ -+ error = ext2_xattr_set2(inode, bh, header); -+ -+cleanup: -+ brelse(bh); -+ if (!(bh && header == HDR(bh))) -+ kfree(header); -+ up(&ext2_xattr_sem); -+ -+ return error; -+} -+ -+/* -+ * Second half of ext2_xattr_set(): Update the file system. -+ */ -+static int -+ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, -+ struct ext2_xattr_header *header) -+{ -+ struct super_block *sb = inode->i_sb; -+ struct buffer_head *new_bh = NULL; -+ int error; -+ -+ if (header) { -+ new_bh = ext2_xattr_cache_find(inode, header); -+ if (new_bh) { -+ /* -+ * We found an identical block in the cache. -+ * The old block will be released after updating -+ * the inode. 
-+ */ -+ ea_bdebug(old_bh, "reusing block %ld", -+ new_bh->b_blocknr); -+ -+ error = -EDQUOT; -+ if (ext2_xattr_quota_alloc(inode, 1)) -+ goto cleanup; -+ -+ HDR(new_bh)->h_refcount = cpu_to_le32( -+ le32_to_cpu(HDR(new_bh)->h_refcount) + 1); -+ ea_bdebug(new_bh, "refcount now=%d", -+ le32_to_cpu(HDR(new_bh)->h_refcount)); -+ } else if (old_bh && header == HDR(old_bh)) { -+ /* Keep this block. */ -+ new_bh = old_bh; -+ ext2_xattr_cache_insert(new_bh); -+ } else { -+ /* We need to allocate a new block */ -+ int force = EXT2_I(inode)->i_file_acl != 0; -+ int block = ext2_xattr_new_block(inode, &error, force); -+ if (error) -+ goto cleanup; -+ ea_idebug(inode, "creating block %d", block); -+ -+ new_bh = sb_getblk(sb, block); -+ if (!new_bh) { -+ ext2_xattr_free_block(inode, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(new_bh); -+ memcpy(new_bh->b_data, header, new_bh->b_size); -+ mark_buffer_uptodate(new_bh, 1); -+ unlock_buffer(new_bh); -+ ext2_xattr_cache_insert(new_bh); -+ -+ ext2_xattr_update_super_block(sb); -+ } -+ mark_buffer_dirty(new_bh); -+ if (IS_SYNC(inode)) { -+ ll_rw_block(WRITE, 1, &new_bh); -+ wait_on_buffer(new_bh); -+ error = -EIO; -+ if (buffer_req(new_bh) && !buffer_uptodate(new_bh)) -+ goto cleanup; -+ } -+ } -+ -+ /* Update the inode. */ -+ EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; -+ inode->i_ctime = CURRENT_TIME; -+ if (IS_SYNC(inode)) { -+ error = ext2_sync_inode (inode); -+ if (error) -+ goto cleanup; -+ } else -+ mark_inode_dirty(inode); -+ -+ error = 0; -+ if (old_bh && old_bh != new_bh) { -+ /* -+ * If there was an old block, and we are not still using it, -+ * we now release the old block. -+ */ -+ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount); -+ -+ if (refcount == 1) { -+ /* Free the old block. */ -+ ea_bdebug(old_bh, "freeing"); -+ ext2_xattr_free_block(inode, old_bh->b_blocknr); -+ mark_buffer_clean(old_bh); -+ } else { -+ /* Decrement the refcount only. 
*/ -+ refcount--; -+ HDR(old_bh)->h_refcount = cpu_to_le32(refcount); -+ ext2_xattr_quota_free(inode); -+ mark_buffer_dirty(old_bh); -+ ea_bdebug(old_bh, "refcount now=%d", refcount); -+ } -+ } -+ -+cleanup: -+ if (old_bh != new_bh) -+ brelse(new_bh); -+ -+ return error; -+} -+ -+/* -+ * ext2_xattr_delete_inode() -+ * -+ * Free extended attribute resources associated with this inode. This -+ * is called immediately before an inode is freed. -+ */ -+void -+ext2_xattr_delete_inode(struct inode *inode) -+{ -+ struct buffer_head *bh; -+ unsigned int block = EXT2_I(inode)->i_file_acl; -+ -+ if (!block) -+ return; -+ down(&ext2_xattr_sem); -+ -+ bh = sb_bread(inode->i_sb, block); -+ if (!bh) { -+ ext2_error(inode->i_sb, "ext2_xattr_delete_inode", -+ "inode %ld: block %d read error", inode->i_ino, block); -+ goto cleanup; -+ } -+ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+ ext2_error(inode->i_sb, "ext2_xattr_delete_inode", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ goto cleanup; -+ } -+ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); -+ if (HDR(bh)->h_refcount == cpu_to_le32(1)) { -+ ext2_xattr_cache_remove(bh); -+ ext2_xattr_free_block(inode, block); -+ bforget(bh); -+ bh = NULL; -+ } else { -+ HDR(bh)->h_refcount = cpu_to_le32( -+ le32_to_cpu(HDR(bh)->h_refcount) - 1); -+ mark_buffer_dirty(bh); -+ if (IS_SYNC(inode)) { -+ ll_rw_block(WRITE, 1, &bh); -+ wait_on_buffer(bh); -+ } -+ ext2_xattr_quota_free(inode); -+ } -+ EXT2_I(inode)->i_file_acl = 0; -+ -+cleanup: -+ brelse(bh); -+ up(&ext2_xattr_sem); -+} -+ -+/* -+ * ext2_xattr_put_super() -+ * -+ * This is called when a file system is unmounted. 
-+ */ -+void -+ext2_xattr_put_super(struct super_block *sb) -+{ -+#ifdef CONFIG_EXT2_FS_XATTR_SHARING -+ mb_cache_shrink(ext2_xattr_cache, sb->s_dev); -+#endif -+} -+ -+#ifdef CONFIG_EXT2_FS_XATTR_SHARING -+ -+/* -+ * ext2_xattr_cache_insert() -+ * -+ * Create a new entry in the extended attribute cache, and insert -+ * it unless such an entry is already in the cache. -+ * -+ * Returns 0, or a negative error number on failure. -+ */ -+static int -+ext2_xattr_cache_insert(struct buffer_head *bh) -+{ -+ __u32 hash = le32_to_cpu(HDR(bh)->h_hash); -+ struct mb_cache_entry *ce; -+ int error; -+ -+ ce = mb_cache_entry_alloc(ext2_xattr_cache); -+ if (!ce) -+ return -ENOMEM; -+ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash); -+ if (error) { -+ mb_cache_entry_free(ce); -+ if (error == -EBUSY) { -+ ea_bdebug(bh, "already in cache (%d cache entries)", -+ atomic_read(&ext2_xattr_cache->c_entry_count)); -+ error = 0; -+ } -+ } else { -+ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, -+ atomic_read(&ext2_xattr_cache->c_entry_count)); -+ mb_cache_entry_release(ce); -+ } -+ return error; -+} -+ -+/* -+ * ext2_xattr_cmp() -+ * -+ * Compare two extended attribute blocks for equality. -+ * -+ * Returns 0 if the blocks are equal, 1 if they differ, and -+ * a negative error number on errors. 
-+ */ -+static int -+ext2_xattr_cmp(struct ext2_xattr_header *header1, -+ struct ext2_xattr_header *header2) -+{ -+ struct ext2_xattr_entry *entry1, *entry2; -+ -+ entry1 = ENTRY(header1+1); -+ entry2 = ENTRY(header2+1); -+ while (!IS_LAST_ENTRY(entry1)) { -+ if (IS_LAST_ENTRY(entry2)) -+ return 1; -+ if (entry1->e_hash != entry2->e_hash || -+ entry1->e_name_len != entry2->e_name_len || -+ entry1->e_value_size != entry2->e_value_size || -+ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) -+ return 1; -+ if (entry1->e_value_block != 0 || entry2->e_value_block != 0) -+ return -EIO; -+ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), -+ (char *)header2 + le16_to_cpu(entry2->e_value_offs), -+ le32_to_cpu(entry1->e_value_size))) -+ return 1; -+ -+ entry1 = EXT2_XATTR_NEXT(entry1); -+ entry2 = EXT2_XATTR_NEXT(entry2); -+ } -+ if (!IS_LAST_ENTRY(entry2)) -+ return 1; -+ return 0; -+} -+ -+/* -+ * ext2_xattr_cache_find() -+ * -+ * Find an identical extended attribute block. -+ * -+ * Returns a pointer to the block found, or NULL if such a block was -+ * not found or an error occurred. 
-+ */ -+static struct buffer_head * -+ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) -+{ -+ __u32 hash = le32_to_cpu(header->h_hash); -+ struct mb_cache_entry *ce; -+ -+ if (!header->h_hash) -+ return NULL; /* never share */ -+ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); -+ ce = mb_cache_entry_find_first(ext2_xattr_cache, 0, inode->i_dev, hash); -+ while (ce) { -+ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block); -+ -+ if (!bh) { -+ ext2_error(inode->i_sb, "ext2_xattr_cache_find", -+ "inode %ld: block %ld read error", -+ inode->i_ino, ce->e_block); -+ } else if (le32_to_cpu(HDR(bh)->h_refcount) > -+ EXT2_XATTR_REFCOUNT_MAX) { -+ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block, -+ le32_to_cpu(HDR(bh)->h_refcount), -+ EXT2_XATTR_REFCOUNT_MAX); -+ } else if (!ext2_xattr_cmp(header, HDR(bh))) { -+ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count))); -+ mb_cache_entry_release(ce); -+ return bh; -+ } -+ brelse(bh); -+ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash); -+ } -+ return NULL; -+} -+ -+/* -+ * ext2_xattr_cache_remove() -+ * -+ * Remove the cache entry of a block from the cache. Called when a -+ * block becomes invalid. -+ */ -+static void -+ext2_xattr_cache_remove(struct buffer_head *bh) -+{ -+ struct mb_cache_entry *ce; -+ -+ ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_dev, bh->b_blocknr); -+ if (ce) { -+ ea_bdebug(bh, "removing (%d cache entries remaining)", -+ atomic_read(&ext2_xattr_cache->c_entry_count)-1); -+ mb_cache_entry_free(ce); -+ } else -+ ea_bdebug(bh, "no cache entry"); -+} -+ -+#define NAME_HASH_SHIFT 5 -+#define VALUE_HASH_SHIFT 16 -+ -+/* -+ * ext2_xattr_hash_entry() -+ * -+ * Compute the hash of an extended attribute. 
-+ */ -+static inline void ext2_xattr_hash_entry(struct ext2_xattr_header *header, -+ struct ext2_xattr_entry *entry) -+{ -+ __u32 hash = 0; -+ char *name = entry->e_name; -+ int n; -+ -+ for (n=0; n < entry->e_name_len; n++) { -+ hash = (hash << NAME_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ -+ *name++; -+ } -+ -+ if (entry->e_value_block == 0 && entry->e_value_size != 0) { -+ __u32 *value = (__u32 *)((char *)header + -+ le16_to_cpu(entry->e_value_offs)); -+ for (n = (le32_to_cpu(entry->e_value_size) + -+ EXT2_XATTR_ROUND) >> EXT2_XATTR_PAD_BITS; n; n--) { -+ hash = (hash << VALUE_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ -+ le32_to_cpu(*value++); -+ } -+ } -+ entry->e_hash = cpu_to_le32(hash); -+} -+ -+#undef NAME_HASH_SHIFT -+#undef VALUE_HASH_SHIFT -+ -+#define BLOCK_HASH_SHIFT 16 -+ -+/* -+ * ext2_xattr_rehash() -+ * -+ * Re-compute the extended attribute hash value after an entry has changed. -+ */ -+static void ext2_xattr_rehash(struct ext2_xattr_header *header, -+ struct ext2_xattr_entry *entry) -+{ -+ struct ext2_xattr_entry *here; -+ __u32 hash = 0; -+ -+ ext2_xattr_hash_entry(header, entry); -+ here = ENTRY(header+1); -+ while (!IS_LAST_ENTRY(here)) { -+ if (!here->e_hash) { -+ /* Block is not shared if an entry's hash value == 0 */ -+ hash = 0; -+ break; -+ } -+ hash = (hash << BLOCK_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ -+ le32_to_cpu(here->e_hash); -+ here = EXT2_XATTR_NEXT(here); -+ } -+ header->h_hash = cpu_to_le32(hash); -+} -+ -+#undef BLOCK_HASH_SHIFT -+ -+int __init -+init_ext2_xattr(void) -+{ -+ ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL, -+ sizeof(struct mb_cache_entry) + -+ sizeof(struct mb_cache_entry_index), 1, 61); -+ if (!ext2_xattr_cache) -+ return -ENOMEM; -+ -+ return 0; -+} -+ -+void -+exit_ext2_xattr(void) -+{ -+ mb_cache_destroy(ext2_xattr_cache); -+} -+ -+#else /* CONFIG_EXT2_FS_XATTR_SHARING */ -+ -+int __init -+init_ext2_xattr(void) -+{ -+ return 
0; -+} -+ -+void -+exit_ext2_xattr(void) -+{ -+} -+ -+#endif /* CONFIG_EXT2_FS_XATTR_SHARING */ -Index: linux-DRV401/fs/ext2/xattr_user.c -=================================================================== ---- linux-DRV401.orig/fs/ext2/xattr_user.c 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/fs/ext2/xattr_user.c 2004-10-15 11:03:51.000000000 -0700 -@@ -0,0 +1,103 @@ -+/* -+ * linux/fs/ext2/xattr_user.c -+ * Handler for extended user attributes. -+ * -+ * Copyright (C) 2001 by Andreas Gruenbacher, -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef CONFIG_EXT2_FS_POSIX_ACL -+# include -+#endif -+ -+#define XATTR_USER_PREFIX "user." -+ -+static size_t -+ext2_xattr_user_list(char *list, struct inode *inode, -+ const char *name, int name_len) -+{ -+ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1; -+ -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return 0; -+ -+ if (list) { -+ memcpy(list, XATTR_USER_PREFIX, prefix_len); -+ memcpy(list+prefix_len, name, name_len); -+ list[prefix_len + name_len] = '\0'; -+ } -+ return prefix_len + name_len + 1; -+} -+ -+static int -+ext2_xattr_user_get(struct inode *inode, const char *name, -+ void *buffer, size_t size) -+{ -+ int error; -+ -+ if (strcmp(name, "") == 0) -+ return -EINVAL; -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return -ENOTSUP; -+#ifdef CONFIG_EXT2_FS_POSIX_ACL -+ error = ext2_permission_locked(inode, MAY_READ); -+#else -+ error = permission(inode, MAY_READ); -+#endif -+ if (error) -+ return error; -+ -+ return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, -+ buffer, size); -+} -+ -+static int -+ext2_xattr_user_set(struct inode *inode, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ int error; -+ -+ if (strcmp(name, "") == 0) -+ return -EINVAL; -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return -ENOTSUP; -+ if ( !S_ISREG(inode->i_mode) && -+ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) -+ return -EPERM; -+#ifdef CONFIG_EXT2_FS_POSIX_ACL 
-+ error = ext2_permission_locked(inode, MAY_WRITE); -+#else -+ error = permission(inode, MAY_WRITE); -+#endif -+ if (error) -+ return error; -+ -+ return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, -+ value, size, flags); -+} -+ -+struct ext2_xattr_handler ext2_xattr_user_handler = { -+ prefix: XATTR_USER_PREFIX, -+ list: ext2_xattr_user_list, -+ get: ext2_xattr_user_get, -+ set: ext2_xattr_user_set, -+}; -+ -+int __init -+init_ext2_xattr_user(void) -+{ -+ return ext2_xattr_register(EXT2_XATTR_INDEX_USER, -+ &ext2_xattr_user_handler); -+} -+ -+void -+exit_ext2_xattr_user(void) -+{ -+ ext2_xattr_unregister(EXT2_XATTR_INDEX_USER, -+ &ext2_xattr_user_handler); -+} -Index: linux-DRV401/fs/ext3/Makefile -=================================================================== ---- linux-DRV401.orig/fs/ext3/Makefile 2004-10-15 10:39:16.000000000 -0700 -+++ linux-DRV401/fs/ext3/Makefile 2004-10-15 11:03:51.000000000 -0700 -@@ -1,5 +1,5 @@ - # --# Makefile for the linux ext2-filesystem routines. -+# Makefile for the linux ext3-filesystem routines. - # - # Note! Dependencies are done automagically by 'make dep', which also - # removes any old dependencies. 
DON'T put your own dependencies here -@@ -9,8 +9,14 @@ - - O_TARGET := ext3.o - -+export-objs := ext3-exports.o -+ - obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -- ioctl.o namei.o super.o symlink.o hash.o -+ ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o - obj-m := $(O_TARGET) - -+export-objs += xattr.o -+obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o -+obj-$(CONFIG_EXT3_FS_XATTR_USER) += xattr_user.o -+ - include $(TOPDIR)/Rules.make -Index: linux-DRV401/fs/ext3/file.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/file.c 2004-10-15 10:39:16.000000000 -0700 -+++ linux-DRV401/fs/ext3/file.c 2004-10-15 11:03:51.000000000 -0700 -@@ -23,6 +23,7 @@ - #include - #include - #include -+#include - #include - #include - -@@ -93,5 +94,9 @@ - struct inode_operations ext3_file_inode_operations = { - truncate: ext3_truncate, /* BKL held */ - setattr: ext3_setattr, /* BKL held */ -+ setxattr: ext3_setxattr, /* BKL held */ -+ getxattr: ext3_getxattr, /* BKL held */ -+ listxattr: ext3_listxattr, /* BKL held */ -+ removexattr: ext3_removexattr, /* BKL held */ - }; - -Index: linux-DRV401/fs/ext3/ialloc.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/ialloc.c 2004-10-15 10:24:00.000000000 -0700 -+++ linux-DRV401/fs/ext3/ialloc.c 2004-10-15 11:03:52.000000000 -0700 -@@ -17,6 +17,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -216,6 +217,7 @@ - * as writing the quota to disk may need the lock as well. 
- */ - DQUOT_INIT(inode); -+ ext3_xattr_delete_inode(handle, inode); - DQUOT_FREE_INODE(inode); - DQUOT_DROP(inode); - -Index: linux-DRV401/fs/ext3/inode.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/inode.c 2004-10-15 10:24:00.000000000 -0700 -+++ linux-DRV401/fs/ext3/inode.c 2004-10-15 11:03:52.000000000 -0700 -@@ -39,6 +39,18 @@ - */ - #undef SEARCH_FROM_ZERO - -+/* -+ * Test whether an inode is a fast symlink. -+ */ -+static inline int ext3_inode_is_fast_symlink(struct inode *inode) -+{ -+ int ea_blocks = inode->u.ext3_i.i_file_acl ? -+ (inode->i_sb->s_blocksize >> 9) : 0; -+ -+ return (S_ISLNK(inode->i_mode) && -+ inode->i_blocks - ea_blocks == 0); -+} -+ - /* The ext3 forget function must perform a revoke if we are freeing data - * which has been journaled. Metadata (eg. indirect blocks) must be - * revoked in all cases. -@@ -48,7 +60,7 @@ - * still needs to be revoked. - */ - --static int ext3_forget(handle_t *handle, int is_metadata, -+int ext3_forget(handle_t *handle, int is_metadata, - struct inode *inode, struct buffer_head *bh, - int blocknr) - { -@@ -164,9 +176,7 @@ - { - handle_t *handle; - -- if (is_bad_inode(inode) || -- inode->i_ino == EXT3_ACL_IDX_INO || -- inode->i_ino == EXT3_ACL_DATA_INO) -+ if (is_bad_inode(inode)) - goto no_delete; - - lock_kernel(); -@@ -1843,6 +1853,8 @@ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; -+ if (ext3_inode_is_fast_symlink(inode)) -+ return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - -@@ -1990,8 +2002,6 @@ - struct ext3_group_desc * gdp; - - if ((inode->i_ino != EXT3_ROOT_INO && -- inode->i_ino != EXT3_ACL_IDX_INO && -- inode->i_ino != EXT3_ACL_DATA_INO && - inode->i_ino != EXT3_JOURNAL_INO && - inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || - inode->i_ino > le32_to_cpu( -@@ -2118,10 +2128,7 @@ - - brelse (iloc.bh); - -- if (inode->i_ino == EXT3_ACL_IDX_INO || -- inode->i_ino == 
EXT3_ACL_DATA_INO) -- /* Nothing to do */ ; -- else if (S_ISREG(inode->i_mode)) { -+ if (S_ISREG(inode->i_mode)) { - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; - inode->i_mapping->a_ops = &ext3_aops; -@@ -2129,15 +2136,17 @@ - inode->i_op = &ext3_dir_inode_operations; - inode->i_fop = &ext3_dir_operations; - } else if (S_ISLNK(inode->i_mode)) { -- if (!inode->i_blocks) -+ if (ext3_inode_is_fast_symlink(inode)) - inode->i_op = &ext3_fast_symlink_inode_operations; - else { -- inode->i_op = &page_symlink_inode_operations; -+ inode->i_op = &ext3_symlink_inode_operations; - inode->i_mapping->a_ops = &ext3_aops; - } -- } else -+ } else { -+ inode->i_op = &ext3_special_inode_operations; - init_special_inode(inode, inode->i_mode, - le32_to_cpu(iloc.raw_inode->i_block[0])); -+ } - /* inode->i_attr_flags = 0; unused */ - if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) { - /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */ -Index: linux-DRV401/fs/ext3/namei.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/namei.c 2004-10-15 10:39:16.000000000 -0700 -+++ linux-DRV401/fs/ext3/namei.c 2004-10-15 11:03:52.000000000 -0700 -@@ -29,6 +29,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -1612,7 +1613,7 @@ - if (IS_SYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, S_IFDIR); -+ inode = ext3_new_inode (handle, dir, S_IFDIR | mode); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; -@@ -1620,7 +1621,6 @@ - inode->i_op = &ext3_dir_inode_operations; - inode->i_fop = &ext3_dir_operations; - inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; -- inode->i_blocks = 0; - dir_block = ext3_bread (handle, inode, 0, 1, &err); - if (!dir_block) { - inode->i_nlink--; /* is this nlink == 0? 
*/ -@@ -1647,9 +1647,6 @@ - BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, dir_block); - brelse (dir_block); -- inode->i_mode = S_IFDIR | mode; -- if (dir->i_mode & S_ISGID) -- inode->i_mode |= S_ISGID; - ext3_mark_inode_dirty(handle, inode); - err = ext3_add_entry (handle, dentry, inode); - if (err) { -@@ -2018,7 +2015,7 @@ - goto out_stop; - - if (l > sizeof (EXT3_I(inode)->i_data)) { -- inode->i_op = &page_symlink_inode_operations; -+ inode->i_op = &ext3_symlink_inode_operations; - inode->i_mapping->a_ops = &ext3_aops; - /* - * block_symlink() calls back into ext3_prepare/commit_write. -@@ -2245,4 +2242,16 @@ - rmdir: ext3_rmdir, /* BKL held */ - mknod: ext3_mknod, /* BKL held */ - rename: ext3_rename, /* BKL held */ -+ setxattr: ext3_setxattr, /* BKL held */ -+ getxattr: ext3_getxattr, /* BKL held */ -+ listxattr: ext3_listxattr, /* BKL held */ -+ removexattr: ext3_removexattr, /* BKL held */ - }; -+ -+struct inode_operations ext3_special_inode_operations = { -+ setxattr: ext3_setxattr, /* BKL held */ -+ getxattr: ext3_getxattr, /* BKL held */ -+ listxattr: ext3_listxattr, /* BKL held */ -+ removexattr: ext3_removexattr, /* BKL held */ -+}; -+ -Index: linux-DRV401/fs/ext3/super.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/super.c 2004-10-15 10:39:16.000000000 -0700 -+++ linux-DRV401/fs/ext3/super.c 2004-10-15 11:03:52.000000000 -0700 -@@ -24,6 +24,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -404,6 +405,7 @@ - kdev_t j_dev = sbi->s_journal->j_dev; - int i; - -+ ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); - if (!(sb->s_flags & MS_RDONLY)) { - EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -@@ -499,6 +501,7 @@ - int is_remount) - { - unsigned long *mount_options = &sbi->s_mount_opt; -+ - uid_t *resuid = &sbi->s_resuid; - gid_t *resgid = &sbi->s_resgid; - char * this_char; -@@ -511,6 
+514,13 @@ - this_char = strtok (NULL, ",")) { - if ((value = strchr (this_char, '=')) != NULL) - *value++ = 0; -+#ifdef CONFIG_EXT3_FS_XATTR_USER -+ if (!strcmp (this_char, "user_xattr")) -+ set_opt (*mount_options, XATTR_USER); -+ else if (!strcmp (this_char, "nouser_xattr")) -+ clear_opt (*mount_options, XATTR_USER); -+ else -+#endif - if (!strcmp (this_char, "bsddf")) - clear_opt (*mount_options, MINIX_DF); - else if (!strcmp (this_char, "nouid32")) { -@@ -924,6 +934,12 @@ - sbi->s_mount_opt = 0; - sbi->s_resuid = EXT3_DEF_RESUID; - sbi->s_resgid = EXT3_DEF_RESGID; -+ -+ /* Default extended attribute flags */ -+#ifdef CONFIG_EXT3_FS_XATTR_USER -+ /* set_opt(sbi->s_mount_opt, XATTR_USER); */ -+#endif -+ - if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) { - sb->s_dev = 0; - goto out_fail; -@@ -1742,12 +1758,27 @@ - - static int __init init_ext3_fs(void) - { -- return register_filesystem(&ext3_fs_type); -+ int error = init_ext3_xattr(); -+ if (error) -+ return error; -+ error = init_ext3_xattr_user(); -+ if (error) -+ goto fail; -+ error = register_filesystem(&ext3_fs_type); -+ if (!error) -+ return 0; -+ -+ exit_ext3_xattr_user(); -+fail: -+ exit_ext3_xattr(); -+ return error; - } - - static void __exit exit_ext3_fs(void) - { - unregister_filesystem(&ext3_fs_type); -+ exit_ext3_xattr_user(); -+ exit_ext3_xattr(); - } - - EXPORT_SYMBOL(ext3_force_commit); -Index: linux-DRV401/fs/ext3/symlink.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/symlink.c 2004-10-15 10:24:00.000000000 -0700 -+++ linux-DRV401/fs/ext3/symlink.c 2004-10-15 11:03:52.000000000 -0700 -@@ -20,6 +20,7 @@ - #include - #include - #include -+#include - - static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen) - { -@@ -33,7 +34,20 @@ - return vfs_follow_link(nd, s); - } - -+struct inode_operations ext3_symlink_inode_operations = { -+ readlink: page_readlink, /* BKL not held. 
Don't need */ -+ follow_link: page_follow_link, /* BKL not held. Don't need */ -+ setxattr: ext3_setxattr, /* BKL held */ -+ getxattr: ext3_getxattr, /* BKL held */ -+ listxattr: ext3_listxattr, /* BKL held */ -+ removexattr: ext3_removexattr, /* BKL held */ -+}; -+ - struct inode_operations ext3_fast_symlink_inode_operations = { - readlink: ext3_readlink, /* BKL not held. Don't need */ - follow_link: ext3_follow_link, /* BKL not held. Don't need */ -+ setxattr: ext3_setxattr, /* BKL held */ -+ getxattr: ext3_getxattr, /* BKL held */ -+ listxattr: ext3_listxattr, /* BKL held */ -+ removexattr: ext3_removexattr, /* BKL held */ - }; -Index: linux-DRV401/fs/ext3/xattr.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/xattr.c 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/fs/ext3/xattr.c 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,1225 @@ -+/* -+ * linux/fs/ext3/xattr.c -+ * -+ * Copyright (C) 2001 by Andreas Gruenbacher, -+ * -+ * Fix by Harrison Xing . -+ * Ext3 code with a lot of help from Eric Jarman . -+ * Extended attributes for symlinks and special files added per -+ * suggestion of Luka Renko . -+ */ -+ -+/* -+ * Extended attributes are stored on disk blocks allocated outside of -+ * any inode. The i_file_acl field is then made to point to this allocated -+ * block. If all extended attributes of an inode are identical, these -+ * inodes may share the same extended attribute block. Such situations -+ * are automatically detected by keeping a cache of recent attribute block -+ * numbers and hashes over the block's contents in memory. -+ * -+ * -+ * Extended attribute block layout: -+ * -+ * +------------------+ -+ * | header | -+ * | entry 1 | | -+ * | entry 2 | | growing downwards -+ * | entry 3 | v -+ * | four null bytes | -+ * | . . . 
| -+ * | value 1 | ^ -+ * | value 3 | | growing upwards -+ * | value 2 | | -+ * +------------------+ -+ * -+ * The block header is followed by multiple entry descriptors. These entry -+ * descriptors are variable in size, and alligned to EXT3_XATTR_PAD -+ * byte boundaries. The entry descriptors are sorted by attribute name, -+ * so that two extended attribute blocks can be compared efficiently. -+ * -+ * Attribute values are aligned to the end of the block, stored in -+ * no specific order. They are also padded to EXT3_XATTR_PAD byte -+ * boundaries. No additional gaps are left between them. -+ * -+ * Locking strategy -+ * ---------------- -+ * The VFS already holds the BKL and the inode->i_sem semaphore when any of -+ * the xattr inode operations are called, so we are guaranteed that only one -+ * processes accesses extended attributes of an inode at any time. -+ * -+ * For writing we also grab the ext3_xattr_sem semaphore. This ensures that -+ * only a single process is modifying an extended attribute block, even -+ * if the block is shared among inodes. -+ * -+ * Note for porting to 2.5 -+ * ----------------------- -+ * The BKL will no longer be held in the xattr inode operations. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#define EXT3_EA_USER "user." -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) -+# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1) -+#endif -+ -+#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data)) -+#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr)) -+#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1) -+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) -+ -+#ifdef EXT3_XATTR_DEBUG -+# define ea_idebug(inode, f...) do { \ -+ printk(KERN_DEBUG "inode %s:%ld: ", \ -+ kdevname(inode->i_dev), inode->i_ino); \ -+ printk(f); \ -+ printk("\n"); \ -+ } while (0) -+# define ea_bdebug(bh, f...) 
do { \ -+ printk(KERN_DEBUG "block %s:%ld: ", \ -+ kdevname(bh->b_dev), bh->b_blocknr); \ -+ printk(f); \ -+ printk("\n"); \ -+ } while (0) -+#else -+# define ea_idebug(f...) -+# define ea_bdebug(f...) -+#endif -+ -+static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *, -+ struct ext3_xattr_header *); -+ -+#ifdef CONFIG_EXT3_FS_XATTR_SHARING -+ -+static int ext3_xattr_cache_insert(struct buffer_head *); -+static struct buffer_head *ext3_xattr_cache_find(struct inode *, -+ struct ext3_xattr_header *); -+static void ext3_xattr_cache_remove(struct buffer_head *); -+static void ext3_xattr_rehash(struct ext3_xattr_header *, -+ struct ext3_xattr_entry *); -+ -+static struct mb_cache *ext3_xattr_cache; -+ -+#else -+# define ext3_xattr_cache_insert(bh) 0 -+# define ext3_xattr_cache_find(inode, header) NULL -+# define ext3_xattr_cache_remove(bh) while(0) {} -+# define ext3_xattr_rehash(header, entry) while(0) {} -+#endif -+ -+/* -+ * If a file system does not share extended attributes among inodes, -+ * we should not need the ext3_xattr_sem semaphore. However, the -+ * filesystem may still contain shared blocks, so we always take -+ * the lock. -+ */ -+ -+DECLARE_MUTEX(ext3_xattr_sem); -+ -+static inline int -+ext3_xattr_new_block(handle_t *handle, struct inode *inode, -+ int * errp, int force) -+{ -+ struct super_block *sb = inode->i_sb; -+ int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + -+ EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb); -+ -+ /* How can we enforce the allocation? */ -+ int block = ext3_new_block(handle, inode, goal, 0, 0, errp); -+#ifdef OLD_QUOTAS -+ if (!*errp) -+ inode->i_blocks += inode->i_sb->s_blocksize >> 9; -+#endif -+ return block; -+} -+ -+static inline int -+ext3_xattr_quota_alloc(struct inode *inode, int force) -+{ -+ /* How can we enforce the allocation? 
*/ -+#ifdef OLD_QUOTAS -+ int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1); -+ if (!error) -+ inode->i_blocks += inode->i_sb->s_blocksize >> 9; -+#else -+ int error = DQUOT_ALLOC_BLOCK(inode, 1); -+#endif -+ return error; -+} -+ -+#ifdef OLD_QUOTAS -+ -+static inline void -+ext3_xattr_quota_free(struct inode *inode) -+{ -+ DQUOT_FREE_BLOCK(inode->i_sb, inode, 1); -+ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; -+} -+ -+static inline void -+ext3_xattr_free_block(handle_t *handle, struct inode * inode, -+ unsigned long block) -+{ -+ ext3_free_blocks(handle, inode, block, 1); -+ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; -+} -+ -+#else -+# define ext3_xattr_quota_free(inode) \ -+ DQUOT_FREE_BLOCK(inode, 1) -+# define ext3_xattr_free_block(handle, inode, block) \ -+ ext3_free_blocks(handle, inode, block, 1) -+#endif -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18) -+ -+static inline struct buffer_head * -+sb_bread(struct super_block *sb, int block) -+{ -+ return bread(sb->s_dev, block, sb->s_blocksize); -+} -+ -+static inline struct buffer_head * -+sb_getblk(struct super_block *sb, int block) -+{ -+ return getblk(sb->s_dev, block, sb->s_blocksize); -+} -+ -+#endif -+ -+struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX]; -+rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED; -+ -+int -+ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler) -+{ -+ int error = -EINVAL; -+ -+ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { -+ write_lock(&ext3_handler_lock); -+ if (!ext3_xattr_handlers[name_index-1]) { -+ ext3_xattr_handlers[name_index-1] = handler; -+ error = 0; -+ } -+ write_unlock(&ext3_handler_lock); -+ } -+ return error; -+} -+ -+void -+ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler) -+{ -+ if (name_index > 0 || name_index <= EXT3_XATTR_INDEX_MAX) { -+ write_lock(&ext3_handler_lock); -+ ext3_xattr_handlers[name_index-1] = NULL; -+ write_unlock(&ext3_handler_lock); -+ } -+} -+ 
-+static inline const char * -+strcmp_prefix(const char *a, const char *a_prefix) -+{ -+ while (*a_prefix && *a == *a_prefix) { -+ a++; -+ a_prefix++; -+ } -+ return *a_prefix ? NULL : a; -+} -+ -+/* -+ * Decode the extended attribute name, and translate it into -+ * the name_index and name suffix. -+ */ -+static inline struct ext3_xattr_handler * -+ext3_xattr_resolve_name(const char **name) -+{ -+ struct ext3_xattr_handler *handler = NULL; -+ int i; -+ -+ if (!*name) -+ return NULL; -+ read_lock(&ext3_handler_lock); -+ for (i=0; iprefix); -+ if (n) { -+ handler = ext3_xattr_handlers[i]; -+ *name = n; -+ break; -+ } -+ } -+ } -+ read_unlock(&ext3_handler_lock); -+ return handler; -+} -+ -+static inline struct ext3_xattr_handler * -+ext3_xattr_handler(int name_index) -+{ -+ struct ext3_xattr_handler *handler = NULL; -+ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { -+ read_lock(&ext3_handler_lock); -+ handler = ext3_xattr_handlers[name_index-1]; -+ read_unlock(&ext3_handler_lock); -+ } -+ return handler; -+} -+ -+/* -+ * Inode operation getxattr() -+ * -+ * dentry->d_inode->i_sem down -+ * BKL held [before 2.5.x] -+ */ -+ssize_t -+ext3_getxattr(struct dentry *dentry, const char *name, -+ void *buffer, size_t size) -+{ -+ struct ext3_xattr_handler *handler; -+ struct inode *inode = dentry->d_inode; -+ -+ handler = ext3_xattr_resolve_name(&name); -+ if (!handler) -+ return -ENOTSUP; -+ return handler->get(inode, name, buffer, size); -+} -+ -+/* -+ * Inode operation listxattr() -+ * -+ * dentry->d_inode->i_sem down -+ * BKL held [before 2.5.x] -+ */ -+ssize_t -+ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) -+{ -+ return ext3_xattr_list(dentry->d_inode, buffer, size); -+} -+ -+/* -+ * Inode operation setxattr() -+ * -+ * dentry->d_inode->i_sem down -+ * BKL held [before 2.5.x] -+ */ -+int -+ext3_setxattr(struct dentry *dentry, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ struct ext3_xattr_handler *handler; -+ 
struct inode *inode = dentry->d_inode; -+ -+ if (size == 0) -+ value = ""; /* empty EA, do not remove */ -+ handler = ext3_xattr_resolve_name(&name); -+ if (!handler) -+ return -ENOTSUP; -+ return handler->set(inode, name, value, size, flags); -+} -+ -+/* -+ * Inode operation removexattr() -+ * -+ * dentry->d_inode->i_sem down -+ * BKL held [before 2.5.x] -+ */ -+int -+ext3_removexattr(struct dentry *dentry, const char *name) -+{ -+ struct ext3_xattr_handler *handler; -+ struct inode *inode = dentry->d_inode; -+ -+ handler = ext3_xattr_resolve_name(&name); -+ if (!handler) -+ return -ENOTSUP; -+ return handler->set(inode, name, NULL, 0, XATTR_REPLACE); -+} -+ -+/* -+ * ext3_xattr_get() -+ * -+ * Copy an extended attribute into the buffer -+ * provided, or compute the buffer size required. -+ * Buffer is NULL to compute the size of the buffer required. -+ * -+ * Returns a negative error number on failure, or the number of bytes -+ * used / required on success. -+ */ -+int -+ext3_xattr_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t buffer_size) -+{ -+ struct buffer_head *bh = NULL; -+ struct ext3_xattr_entry *entry; -+ unsigned int block, size; -+ char *end; -+ int name_len, error; -+ -+ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", -+ name_index, name, buffer, (long)buffer_size); -+ -+ if (name == NULL) -+ return -EINVAL; -+ if (!EXT3_I(inode)->i_file_acl) -+ return -ENOATTR; -+ block = EXT3_I(inode)->i_file_acl; -+ ea_idebug(inode, "reading block %d", block); -+ bh = sb_bread(inode->i_sb, block); -+ if (!bh) -+ return -EIO; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); -+ end = bh->b_data + bh->b_size; -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+bad_block: ext3_error(inode->i_sb, "ext3_xattr_get", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ /* 
find named attribute */ -+ name_len = strlen(name); -+ -+ error = -ERANGE; -+ if (name_len > 255) -+ goto cleanup; -+ entry = FIRST_ENTRY(bh); -+ while (!IS_LAST_ENTRY(entry)) { -+ struct ext3_xattr_entry *next = -+ EXT3_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (name_index == entry->e_name_index && -+ name_len == entry->e_name_len && -+ memcmp(name, entry->e_name, name_len) == 0) -+ goto found; -+ entry = next; -+ } -+ /* Check the remaining name entries */ -+ while (!IS_LAST_ENTRY(entry)) { -+ struct ext3_xattr_entry *next = -+ EXT3_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ entry = next; -+ } -+ if (ext3_xattr_cache_insert(bh)) -+ ea_idebug(inode, "cache insert failed"); -+ error = -ENOATTR; -+ goto cleanup; -+found: -+ /* check the buffer size */ -+ if (entry->e_value_block != 0) -+ goto bad_block; -+ size = le32_to_cpu(entry->e_value_size); -+ if (size > inode->i_sb->s_blocksize || -+ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) -+ goto bad_block; -+ -+ if (ext3_xattr_cache_insert(bh)) -+ ea_idebug(inode, "cache insert failed"); -+ if (buffer) { -+ error = -ERANGE; -+ if (size > buffer_size) -+ goto cleanup; -+ /* return value of attribute */ -+ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), -+ size); -+ } -+ error = size; -+ -+cleanup: -+ brelse(bh); -+ -+ return error; -+} -+ -+/* -+ * ext3_xattr_list() -+ * -+ * Copy a list of attribute names into the buffer -+ * provided, or compute the buffer size required. -+ * Buffer is NULL to compute the size of the buffer required. -+ * -+ * Returns a negative error number on failure, or the number of bytes -+ * used / required on success. 
-+ */ -+int -+ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) -+{ -+ struct buffer_head *bh = NULL; -+ struct ext3_xattr_entry *entry; -+ unsigned int block, size = 0; -+ char *buf, *end; -+ int error; -+ -+ ea_idebug(inode, "buffer=%p, buffer_size=%ld", -+ buffer, (long)buffer_size); -+ -+ if (!EXT3_I(inode)->i_file_acl) -+ return 0; -+ block = EXT3_I(inode)->i_file_acl; -+ ea_idebug(inode, "reading block %d", block); -+ bh = sb_bread(inode->i_sb, block); -+ if (!bh) -+ return -EIO; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); -+ end = bh->b_data + bh->b_size; -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+bad_block: ext3_error(inode->i_sb, "ext3_xattr_list", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ /* compute the size required for the list of attribute names */ -+ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); -+ entry = EXT3_XATTR_NEXT(entry)) { -+ struct ext3_xattr_handler *handler; -+ struct ext3_xattr_entry *next = -+ EXT3_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ -+ handler = ext3_xattr_handler(entry->e_name_index); -+ if (handler) -+ size += handler->list(NULL, inode, entry->e_name, -+ entry->e_name_len); -+ } -+ -+ if (ext3_xattr_cache_insert(bh)) -+ ea_idebug(inode, "cache insert failed"); -+ if (!buffer) { -+ error = size; -+ goto cleanup; -+ } else { -+ error = -ERANGE; -+ if (size > buffer_size) -+ goto cleanup; -+ } -+ -+ /* list the attribute names */ -+ buf = buffer; -+ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); -+ entry = EXT3_XATTR_NEXT(entry)) { -+ struct ext3_xattr_handler *handler; -+ -+ handler = ext3_xattr_handler(entry->e_name_index); -+ if (handler) -+ buf += handler->list(buf, inode, entry->e_name, -+ entry->e_name_len); -+ } -+ error = size; -+ -+cleanup: -+ brelse(bh); -+ -+ return error; -+} -+ -+/* 
-+ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is -+ * not set, set it. -+ */ -+static void ext3_xattr_update_super_block(handle_t *handle, -+ struct super_block *sb) -+{ -+ if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR)) -+ return; -+ -+ lock_super(sb); -+ ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) -+ EXT3_SB(sb)->s_feature_compat |= EXT3_FEATURE_COMPAT_EXT_ATTR; -+#endif -+ EXT3_SB(sb)->s_es->s_feature_compat |= -+ cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR); -+ sb->s_dirt = 1; -+ ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); -+ unlock_super(sb); -+} -+ -+/* -+ * ext3_xattr_set() -+ * -+ * Create, replace or remove an extended attribute for this inode. Buffer -+ * is NULL to remove an existing extended attribute, and non-NULL to -+ * either replace an existing extended attribute, or create a new extended -+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE -+ * specify that an extended attribute must exist and must not exist -+ * previous to the call, respectively. -+ * -+ * Returns 0, or a negative error number on failure. -+ */ -+int -+ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index, -+ const char *name, const void *value, size_t value_len, int flags) -+{ -+ struct super_block *sb = inode->i_sb; -+ struct buffer_head *bh = NULL; -+ struct ext3_xattr_header *header = NULL; -+ struct ext3_xattr_entry *here, *last; -+ unsigned int name_len; -+ int block = EXT3_I(inode)->i_file_acl; -+ int min_offs = sb->s_blocksize, not_found = 1, free, error; -+ char *end; -+ -+ /* -+ * header -- Points either into bh, or to a temporarily -+ * allocated buffer. -+ * here -- The named entry found, or the place for inserting, within -+ * the block pointed to by header. -+ * last -- Points right after the last named entry within the block -+ * pointed to by header. 
-+ * min_offs -- The offset of the first value (values are aligned -+ * towards the end of the block). -+ * end -- Points right after the block pointed to by header. -+ */ -+ -+ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", -+ name_index, name, value, (long)value_len); -+ -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -+ return -EPERM; -+ if (value == NULL) -+ value_len = 0; -+ if (name == NULL) -+ return -EINVAL; -+ name_len = strlen(name); -+ if (name_len > 255 || value_len > sb->s_blocksize) -+ return -ERANGE; -+ down(&ext3_xattr_sem); -+ -+ if (block) { -+ /* The inode already has an extended attribute block. */ -+ bh = sb_bread(sb, block); -+ error = -EIO; -+ if (!bh) -+ goto cleanup; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), -+ le32_to_cpu(HDR(bh)->h_refcount)); -+ header = HDR(bh); -+ end = bh->b_data + bh->b_size; -+ if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || -+ header->h_blocks != cpu_to_le32(1)) { -+bad_block: ext3_error(sb, "ext3_xattr_set", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ /* Find the named attribute. */ -+ here = FIRST_ENTRY(bh); -+ while (!IS_LAST_ENTRY(here)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (!here->e_value_block && here->e_value_size) { -+ int offs = le16_to_cpu(here->e_value_offs); -+ if (offs < min_offs) -+ min_offs = offs; -+ } -+ not_found = name_index - here->e_name_index; -+ if (!not_found) -+ not_found = name_len - here->e_name_len; -+ if (!not_found) -+ not_found = memcmp(name, here->e_name,name_len); -+ if (not_found <= 0) -+ break; -+ here = next; -+ } -+ last = here; -+ /* We still need to compute min_offs and last. 
*/ -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (!last->e_value_block && last->e_value_size) { -+ int offs = le16_to_cpu(last->e_value_offs); -+ if (offs < min_offs) -+ min_offs = offs; -+ } -+ last = next; -+ } -+ -+ /* Check whether we have enough space left. */ -+ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); -+ } else { -+ /* We will use a new extended attribute block. */ -+ free = sb->s_blocksize - -+ sizeof(struct ext3_xattr_header) - sizeof(__u32); -+ here = last = NULL; /* avoid gcc uninitialized warning. */ -+ } -+ -+ if (not_found) { -+ /* Request to remove a nonexistent attribute? */ -+ error = -ENOATTR; -+ if (flags & XATTR_REPLACE) -+ goto cleanup; -+ error = 0; -+ if (value == NULL) -+ goto cleanup; -+ else -+ free -= EXT3_XATTR_LEN(name_len); -+ } else { -+ /* Request to create an existing attribute? */ -+ error = -EEXIST; -+ if (flags & XATTR_CREATE) -+ goto cleanup; -+ if (!here->e_value_block && here->e_value_size) { -+ unsigned int size = le32_to_cpu(here->e_value_size); -+ -+ if (le16_to_cpu(here->e_value_offs) + size > -+ sb->s_blocksize || size > sb->s_blocksize) -+ goto bad_block; -+ free += EXT3_XATTR_SIZE(size); -+ } -+ } -+ free -= EXT3_XATTR_SIZE(value_len); -+ error = -ENOSPC; -+ if (free < 0) -+ goto cleanup; -+ -+ /* Here we know that we can set the new attribute. 
*/ -+ -+ if (header) { -+ if (header->h_refcount == cpu_to_le32(1)) { -+ ea_bdebug(bh, "modifying in-place"); -+ ext3_xattr_cache_remove(bh); -+ error = ext3_journal_get_write_access(handle, bh); -+ if (error) -+ goto cleanup; -+ } else { -+ int offset; -+ -+ ea_bdebug(bh, "cloning"); -+ header = kmalloc(bh->b_size, GFP_KERNEL); -+ error = -ENOMEM; -+ if (header == NULL) -+ goto cleanup; -+ memcpy(header, HDR(bh), bh->b_size); -+ header->h_refcount = cpu_to_le32(1); -+ offset = (char *)header - bh->b_data; -+ here = ENTRY((char *)here + offset); -+ last = ENTRY((char *)last + offset); -+ } -+ } else { -+ /* Allocate a buffer where we construct the new block. */ -+ header = kmalloc(sb->s_blocksize, GFP_KERNEL); -+ error = -ENOMEM; -+ if (header == NULL) -+ goto cleanup; -+ memset(header, 0, sb->s_blocksize); -+ end = (char *)header + sb->s_blocksize; -+ header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); -+ header->h_blocks = header->h_refcount = cpu_to_le32(1); -+ last = here = ENTRY(header+1); -+ } -+ -+ if (not_found) { -+ /* Insert the new name. */ -+ int size = EXT3_XATTR_LEN(name_len); -+ int rest = (char *)last - (char *)here; -+ memmove((char *)here + size, here, rest); -+ memset(here, 0, size); -+ here->e_name_index = name_index; -+ here->e_name_len = name_len; -+ memcpy(here->e_name, name, name_len); -+ } else { -+ /* Remove the old value. */ -+ if (!here->e_value_block && here->e_value_size) { -+ char *first_val = (char *)header + min_offs; -+ int offs = le16_to_cpu(here->e_value_offs); -+ char *val = (char *)header + offs; -+ size_t size = EXT3_XATTR_SIZE( -+ le32_to_cpu(here->e_value_size)); -+ memmove(first_val + size, first_val, val - first_val); -+ memset(first_val, 0, size); -+ here->e_value_offs = 0; -+ min_offs += size; -+ -+ /* Adjust all value offsets. 
*/ -+ last = ENTRY(header+1); -+ while (!IS_LAST_ENTRY(last)) { -+ int o = le16_to_cpu(last->e_value_offs); -+ if (!last->e_value_block && o < offs) -+ last->e_value_offs = -+ cpu_to_le16(o + size); -+ last = EXT3_XATTR_NEXT(last); -+ } -+ } -+ if (value == NULL) { -+ /* Remove this attribute. */ -+ if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) { -+ /* This block is now empty. */ -+ error = ext3_xattr_set2(handle, inode, bh,NULL); -+ goto cleanup; -+ } else { -+ /* Remove the old name. */ -+ int size = EXT3_XATTR_LEN(name_len); -+ last = ENTRY((char *)last - size); -+ memmove(here, (char*)here + size, -+ (char*)last - (char*)here); -+ memset(last, 0, size); -+ } -+ } -+ } -+ -+ if (value != NULL) { -+ /* Insert the new value. */ -+ here->e_value_size = cpu_to_le32(value_len); -+ if (value_len) { -+ size_t size = EXT3_XATTR_SIZE(value_len); -+ char *val = (char *)header + min_offs - size; -+ here->e_value_offs = -+ cpu_to_le16((char *)val - (char *)header); -+ memset(val + size - EXT3_XATTR_PAD, 0, -+ EXT3_XATTR_PAD); /* Clear the pad bytes. */ -+ memcpy(val, value, value_len); -+ } -+ } -+ ext3_xattr_rehash(header, here); -+ -+ error = ext3_xattr_set2(handle, inode, bh, header); -+ -+cleanup: -+ brelse(bh); -+ if (!(bh && header == HDR(bh))) -+ kfree(header); -+ up(&ext3_xattr_sem); -+ -+ return error; -+} -+ -+/* -+ * Second half of ext3_xattr_set(): Update the file system. -+ */ -+static int -+ext3_xattr_set2(handle_t *handle, struct inode *inode, -+ struct buffer_head *old_bh, struct ext3_xattr_header *header) -+{ -+ struct super_block *sb = inode->i_sb; -+ struct buffer_head *new_bh = NULL; -+ int error; -+ -+ if (header) { -+ new_bh = ext3_xattr_cache_find(inode, header); -+ if (new_bh) { -+ /* -+ * We found an identical block in the cache. -+ * The old block will be released after updating -+ * the inode. 
-+ */ -+ ea_bdebug(old_bh, "reusing block %ld", -+ new_bh->b_blocknr); -+ -+ error = -EDQUOT; -+ if (ext3_xattr_quota_alloc(inode, 1)) -+ goto cleanup; -+ -+ error = ext3_journal_get_write_access(handle, new_bh); -+ if (error) -+ goto cleanup; -+ HDR(new_bh)->h_refcount = cpu_to_le32( -+ le32_to_cpu(HDR(new_bh)->h_refcount) + 1); -+ ea_bdebug(new_bh, "refcount now=%d", -+ le32_to_cpu(HDR(new_bh)->h_refcount)); -+ } else if (old_bh && header == HDR(old_bh)) { -+ /* Keep this block. */ -+ new_bh = old_bh; -+ ext3_xattr_cache_insert(new_bh); -+ } else { -+ /* We need to allocate a new block */ -+ int force = EXT3_I(inode)->i_file_acl != 0; -+ int block = ext3_xattr_new_block(handle, inode, -+ &error, force); -+ if (error) -+ goto cleanup; -+ ea_idebug(inode, "creating block %d", block); -+ -+ new_bh = sb_getblk(sb, block); -+ if (!new_bh) { -+getblk_failed: ext3_xattr_free_block(handle, inode, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(new_bh); -+ error = ext3_journal_get_create_access(handle, new_bh); -+ if (error) { -+ unlock_buffer(new_bh); -+ goto getblk_failed; -+ } -+ memcpy(new_bh->b_data, header, new_bh->b_size); -+ mark_buffer_uptodate(new_bh, 1); -+ unlock_buffer(new_bh); -+ ext3_xattr_cache_insert(new_bh); -+ -+ ext3_xattr_update_super_block(handle, sb); -+ } -+ error = ext3_journal_dirty_metadata(handle, new_bh); -+ if (error) -+ goto cleanup; -+ } -+ -+ /* Update the inode. */ -+ EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; -+ inode->i_ctime = CURRENT_TIME; -+ ext3_mark_inode_dirty(handle, inode); -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ -+ error = 0; -+ if (old_bh && old_bh != new_bh) { -+ /* -+ * If there was an old block, and we are not still using it, -+ * we now release the old block. -+ */ -+ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount); -+ -+ error = ext3_journal_get_write_access(handle, old_bh); -+ if (error) -+ goto cleanup; -+ if (refcount == 1) { -+ /* Free the old block. 
*/ -+ ea_bdebug(old_bh, "freeing"); -+ ext3_xattr_free_block(handle, inode, old_bh->b_blocknr); -+ -+ /* ext3_forget() calls bforget() for us, but we -+ let our caller release old_bh, so we need to -+ duplicate the handle before. */ -+ get_bh(old_bh); -+ ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr); -+ } else { -+ /* Decrement the refcount only. */ -+ refcount--; -+ HDR(old_bh)->h_refcount = cpu_to_le32(refcount); -+ ext3_xattr_quota_free(inode); -+ ext3_journal_dirty_metadata(handle, old_bh); -+ ea_bdebug(old_bh, "refcount now=%d", refcount); -+ } -+ } -+ -+cleanup: -+ if (old_bh != new_bh) -+ brelse(new_bh); -+ -+ return error; -+} -+ -+/* -+ * ext3_xattr_delete_inode() -+ * -+ * Free extended attribute resources associated with this inode. This -+ * is called immediately before an inode is freed. -+ */ -+void -+ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) -+{ -+ struct buffer_head *bh; -+ unsigned int block = EXT3_I(inode)->i_file_acl; -+ -+ if (!block) -+ return; -+ down(&ext3_xattr_sem); -+ -+ bh = sb_bread(inode->i_sb, block); -+ if (!bh) { -+ ext3_error(inode->i_sb, "ext3_xattr_delete_inode", -+ "inode %ld: block %d read error", inode->i_ino, block); -+ goto cleanup; -+ } -+ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+ ext3_error(inode->i_sb, "ext3_xattr_delete_inode", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ goto cleanup; -+ } -+ ext3_journal_get_write_access(handle, bh); -+ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); -+ if (HDR(bh)->h_refcount == cpu_to_le32(1)) { -+ ext3_xattr_cache_remove(bh); -+ ext3_xattr_free_block(handle, inode, block); -+ ext3_forget(handle, 1, inode, bh, block); -+ bh = NULL; -+ } else { -+ HDR(bh)->h_refcount = cpu_to_le32( -+ le32_to_cpu(HDR(bh)->h_refcount) - 1); -+ ext3_journal_dirty_metadata(handle, bh); -+ if (IS_SYNC(inode)) -+ 
handle->h_sync = 1; -+ ext3_xattr_quota_free(inode); -+ } -+ EXT3_I(inode)->i_file_acl = 0; -+ -+cleanup: -+ brelse(bh); -+ up(&ext3_xattr_sem); -+} -+ -+/* -+ * ext3_xattr_put_super() -+ * -+ * This is called when a file system is unmounted. -+ */ -+void -+ext3_xattr_put_super(struct super_block *sb) -+{ -+#ifdef CONFIG_EXT3_FS_XATTR_SHARING -+ mb_cache_shrink(ext3_xattr_cache, sb->s_dev); -+#endif -+} -+ -+#ifdef CONFIG_EXT3_FS_XATTR_SHARING -+ -+/* -+ * ext3_xattr_cache_insert() -+ * -+ * Create a new entry in the extended attribute cache, and insert -+ * it unless such an entry is already in the cache. -+ * -+ * Returns 0, or a negative error number on failure. -+ */ -+static int -+ext3_xattr_cache_insert(struct buffer_head *bh) -+{ -+ __u32 hash = le32_to_cpu(HDR(bh)->h_hash); -+ struct mb_cache_entry *ce; -+ int error; -+ -+ ce = mb_cache_entry_alloc(ext3_xattr_cache); -+ if (!ce) -+ return -ENOMEM; -+ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash); -+ if (error) { -+ mb_cache_entry_free(ce); -+ if (error == -EBUSY) { -+ ea_bdebug(bh, "already in cache (%d cache entries)", -+ atomic_read(&ext3_xattr_cache->c_entry_count)); -+ error = 0; -+ } -+ } else { -+ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, -+ atomic_read(&ext3_xattr_cache->c_entry_count)); -+ mb_cache_entry_release(ce); -+ } -+ return error; -+} -+ -+/* -+ * ext3_xattr_cmp() -+ * -+ * Compare two extended attribute blocks for equality. -+ * -+ * Returns 0 if the blocks are equal, 1 if they differ, and -+ * a negative error number on errors. 
-+ */ -+static int -+ext3_xattr_cmp(struct ext3_xattr_header *header1, -+ struct ext3_xattr_header *header2) -+{ -+ struct ext3_xattr_entry *entry1, *entry2; -+ -+ entry1 = ENTRY(header1+1); -+ entry2 = ENTRY(header2+1); -+ while (!IS_LAST_ENTRY(entry1)) { -+ if (IS_LAST_ENTRY(entry2)) -+ return 1; -+ if (entry1->e_hash != entry2->e_hash || -+ entry1->e_name_len != entry2->e_name_len || -+ entry1->e_value_size != entry2->e_value_size || -+ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) -+ return 1; -+ if (entry1->e_value_block != 0 || entry2->e_value_block != 0) -+ return -EIO; -+ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), -+ (char *)header2 + le16_to_cpu(entry2->e_value_offs), -+ le32_to_cpu(entry1->e_value_size))) -+ return 1; -+ -+ entry1 = EXT3_XATTR_NEXT(entry1); -+ entry2 = EXT3_XATTR_NEXT(entry2); -+ } -+ if (!IS_LAST_ENTRY(entry2)) -+ return 1; -+ return 0; -+} -+ -+/* -+ * ext3_xattr_cache_find() -+ * -+ * Find an identical extended attribute block. -+ * -+ * Returns a pointer to the block found, or NULL if such a block was -+ * not found or an error occurred. 
-+ */ -+static struct buffer_head * -+ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header) -+{ -+ __u32 hash = le32_to_cpu(header->h_hash); -+ struct mb_cache_entry *ce; -+ -+ if (!header->h_hash) -+ return NULL; /* never share */ -+ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); -+ ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_dev, hash); -+ while (ce) { -+ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block); -+ -+ if (!bh) { -+ ext3_error(inode->i_sb, "ext3_xattr_cache_find", -+ "inode %ld: block %ld read error", -+ inode->i_ino, ce->e_block); -+ } else if (le32_to_cpu(HDR(bh)->h_refcount) > -+ EXT3_XATTR_REFCOUNT_MAX) { -+ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block, -+ le32_to_cpu(HDR(bh)->h_refcount), -+ EXT3_XATTR_REFCOUNT_MAX); -+ } else if (!ext3_xattr_cmp(header, HDR(bh))) { -+ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count))); -+ mb_cache_entry_release(ce); -+ return bh; -+ } -+ brelse(bh); -+ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash); -+ } -+ return NULL; -+} -+ -+/* -+ * ext3_xattr_cache_remove() -+ * -+ * Remove the cache entry of a block from the cache. Called when a -+ * block becomes invalid. -+ */ -+static void -+ext3_xattr_cache_remove(struct buffer_head *bh) -+{ -+ struct mb_cache_entry *ce; -+ -+ ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_dev, bh->b_blocknr); -+ if (ce) { -+ ea_bdebug(bh, "removing (%d cache entries remaining)", -+ atomic_read(&ext3_xattr_cache->c_entry_count)-1); -+ mb_cache_entry_free(ce); -+ } else -+ ea_bdebug(bh, "no cache entry"); -+} -+ -+#define NAME_HASH_SHIFT 5 -+#define VALUE_HASH_SHIFT 16 -+ -+/* -+ * ext3_xattr_hash_entry() -+ * -+ * Compute the hash of an extended attribute. 
-+ */ -+static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header, -+ struct ext3_xattr_entry *entry) -+{ -+ __u32 hash = 0; -+ char *name = entry->e_name; -+ int n; -+ -+ for (n=0; n < entry->e_name_len; n++) { -+ hash = (hash << NAME_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ -+ *name++; -+ } -+ -+ if (entry->e_value_block == 0 && entry->e_value_size != 0) { -+ __u32 *value = (__u32 *)((char *)header + -+ le16_to_cpu(entry->e_value_offs)); -+ for (n = (le32_to_cpu(entry->e_value_size) + -+ EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) { -+ hash = (hash << VALUE_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ -+ le32_to_cpu(*value++); -+ } -+ } -+ entry->e_hash = cpu_to_le32(hash); -+} -+ -+#undef NAME_HASH_SHIFT -+#undef VALUE_HASH_SHIFT -+ -+#define BLOCK_HASH_SHIFT 16 -+ -+/* -+ * ext3_xattr_rehash() -+ * -+ * Re-compute the extended attribute hash value after an entry has changed. -+ */ -+static void ext3_xattr_rehash(struct ext3_xattr_header *header, -+ struct ext3_xattr_entry *entry) -+{ -+ struct ext3_xattr_entry *here; -+ __u32 hash = 0; -+ -+ ext3_xattr_hash_entry(header, entry); -+ here = ENTRY(header+1); -+ while (!IS_LAST_ENTRY(here)) { -+ if (!here->e_hash) { -+ /* Block is not shared if an entry's hash value == 0 */ -+ hash = 0; -+ break; -+ } -+ hash = (hash << BLOCK_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ -+ le32_to_cpu(here->e_hash); -+ here = EXT3_XATTR_NEXT(here); -+ } -+ header->h_hash = cpu_to_le32(hash); -+} -+ -+#undef BLOCK_HASH_SHIFT -+ -+int __init -+init_ext3_xattr(void) -+{ -+ ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL, -+ sizeof(struct mb_cache_entry) + -+ sizeof(struct mb_cache_entry_index), 1, 61); -+ if (!ext3_xattr_cache) -+ return -ENOMEM; -+ -+ return 0; -+} -+ -+void -+exit_ext3_xattr(void) -+{ -+ if (ext3_xattr_cache) -+ mb_cache_destroy(ext3_xattr_cache); -+ ext3_xattr_cache = NULL; -+} -+ -+#else /* CONFIG_EXT3_FS_XATTR_SHARING */ -+ 
-+int __init -+init_ext3_xattr(void) -+{ -+ return 0; -+} -+ -+void -+exit_ext3_xattr(void) -+{ -+} -+ -+#endif /* CONFIG_EXT3_FS_XATTR_SHARING */ -Index: linux-DRV401/fs/ext3/xattr_user.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/xattr_user.c 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/fs/ext3/xattr_user.c 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,111 @@ -+/* -+ * linux/fs/ext3/xattr_user.c -+ * Handler for extended user attributes. -+ * -+ * Copyright (C) 2001 by Andreas Gruenbacher, -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef CONFIG_EXT3_FS_POSIX_ACL -+# include -+#endif -+ -+#define XATTR_USER_PREFIX "user." -+ -+static size_t -+ext3_xattr_user_list(char *list, struct inode *inode, -+ const char *name, int name_len) -+{ -+ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1; -+ -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return 0; -+ -+ if (list) { -+ memcpy(list, XATTR_USER_PREFIX, prefix_len); -+ memcpy(list+prefix_len, name, name_len); -+ list[prefix_len + name_len] = '\0'; -+ } -+ return prefix_len + name_len + 1; -+} -+ -+static int -+ext3_xattr_user_get(struct inode *inode, const char *name, -+ void *buffer, size_t size) -+{ -+ int error; -+ -+ if (strcmp(name, "") == 0) -+ return -EINVAL; -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return -ENOTSUP; -+#ifdef CONFIG_EXT3_FS_POSIX_ACL -+ error = ext3_permission_locked(inode, MAY_READ); -+#else -+ error = permission(inode, MAY_READ); -+#endif -+ if (error) -+ return error; -+ -+ return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, -+ buffer, size); -+} -+ -+static int -+ext3_xattr_user_set(struct inode *inode, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ handle_t *handle; -+ int error; -+ -+ if (strcmp(name, "") == 0) -+ return -EINVAL; -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return -ENOTSUP; -+ if ( !S_ISREG(inode->i_mode) && -+ (!S_ISDIR(inode->i_mode) 
|| inode->i_mode & S_ISVTX)) -+ return -EPERM; -+#ifdef CONFIG_EXT3_FS_POSIX_ACL -+ error = ext3_permission_locked(inode, MAY_WRITE); -+#else -+ error = permission(inode, MAY_WRITE); -+#endif -+ if (error) -+ return error; -+ -+ handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER, name, -+ value, size, flags); -+ ext3_journal_stop(handle, inode); -+ -+ return error; -+} -+ -+struct ext3_xattr_handler ext3_xattr_user_handler = { -+ prefix: XATTR_USER_PREFIX, -+ list: ext3_xattr_user_list, -+ get: ext3_xattr_user_get, -+ set: ext3_xattr_user_set, -+}; -+ -+int __init -+init_ext3_xattr_user(void) -+{ -+ return ext3_xattr_register(EXT3_XATTR_INDEX_USER, -+ &ext3_xattr_user_handler); -+} -+ -+void -+exit_ext3_xattr_user(void) -+{ -+ ext3_xattr_unregister(EXT3_XATTR_INDEX_USER, -+ &ext3_xattr_user_handler); -+} -Index: linux-DRV401/fs/ext3/ext3-exports.c -=================================================================== ---- linux-DRV401.orig/fs/ext3/ext3-exports.c 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/fs/ext3/ext3-exports.c 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,13 @@ -+#include -+#include -+#include -+#include -+#include -+ -+EXPORT_SYMBOL(ext3_force_commit); -+EXPORT_SYMBOL(ext3_bread); -+EXPORT_SYMBOL(ext3_xattr_register); -+EXPORT_SYMBOL(ext3_xattr_unregister); -+EXPORT_SYMBOL(ext3_xattr_get); -+EXPORT_SYMBOL(ext3_xattr_list); -+EXPORT_SYMBOL(ext3_xattr_set); -Index: linux-DRV401/fs/mbcache.c -=================================================================== ---- linux-DRV401.orig/fs/mbcache.c 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/fs/mbcache.c 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,648 @@ -+/* -+ * linux/fs/mbcache.c -+ * (C) 2001-2002 Andreas Gruenbacher, -+ */ -+ -+/* -+ * Filesystem Meta Information Block Cache (mbcache) -+ * -+ * The mbcache caches blocks of block devices that need to be 
located -+ * by their device/block number, as well as by other criteria (such -+ * as the block's contents). -+ * -+ * There can only be one cache entry in a cache per device and block number. -+ * Additional indexes need not be unique in this sense. The number of -+ * additional indexes (=other criteria) can be hardwired at compile time -+ * or specified at cache create time. -+ * -+ * Each cache entry is of fixed size. An entry may be `valid' or `invalid' -+ * in the cache. A valid entry is in the main hash tables of the cache, -+ * and may also be in the lru list. An invalid entry is not in any hashes -+ * or lists. -+ * -+ * A valid cache entry is only in the lru list if no handles refer to it. -+ * Invalid cache entries will be freed when the last handle to the cache -+ * entry is released. Entries that cannot be freed immediately are put -+ * back on the lru list. -+ */ -+ -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+ -+#ifdef MB_CACHE_DEBUG -+# define mb_debug(f...) do { \ -+ printk(KERN_DEBUG f); \ -+ printk("\n"); \ -+ } while (0) -+#define mb_assert(c) do { if (!(c)) \ -+ printk(KERN_ERR "assertion " #c " failed\n"); \ -+ } while(0) -+#else -+# define mb_debug(f...) do { } while(0) -+# define mb_assert(c) do { } while(0) -+#endif -+#define mb_error(f...) 
do { \ -+ printk(KERN_ERR f); \ -+ printk("\n"); \ -+ } while(0) -+ -+MODULE_AUTHOR("Andreas Gruenbacher "); -+MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0) -+MODULE_LICENSE("GPL"); -+#endif -+ -+EXPORT_SYMBOL(mb_cache_create); -+EXPORT_SYMBOL(mb_cache_shrink); -+EXPORT_SYMBOL(mb_cache_destroy); -+EXPORT_SYMBOL(mb_cache_entry_alloc); -+EXPORT_SYMBOL(mb_cache_entry_insert); -+EXPORT_SYMBOL(mb_cache_entry_release); -+EXPORT_SYMBOL(mb_cache_entry_takeout); -+EXPORT_SYMBOL(mb_cache_entry_free); -+EXPORT_SYMBOL(mb_cache_entry_dup); -+EXPORT_SYMBOL(mb_cache_entry_get); -+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) -+EXPORT_SYMBOL(mb_cache_entry_find_first); -+EXPORT_SYMBOL(mb_cache_entry_find_next); -+#endif -+ -+ -+/* -+ * Global data: list of all mbcache's, lru list, and a spinlock for -+ * accessing cache data structures on SMP machines. The lru list is -+ * global across all mbcaches. -+ */ -+ -+static LIST_HEAD(mb_cache_list); -+static LIST_HEAD(mb_cache_lru_list); -+static spinlock_t mb_cache_spinlock = SPIN_LOCK_UNLOCKED; -+ -+static inline int -+mb_cache_indexes(struct mb_cache *cache) -+{ -+#ifdef MB_CACHE_INDEXES_COUNT -+ return MB_CACHE_INDEXES_COUNT; -+#else -+ return cache->c_indexes_count; -+#endif -+} -+ -+/* -+ * What the mbcache registers as to get shrunk dynamically. 
-+ */ -+ -+static void -+mb_cache_memory_pressure(int priority, unsigned int gfp_mask); -+ -+static struct cache_definition mb_cache_definition = { -+ "mb_cache", -+ mb_cache_memory_pressure -+}; -+ -+ -+static inline int -+__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) -+{ -+ return !list_empty(&ce->e_block_list); -+} -+ -+ -+static inline void -+__mb_cache_entry_unhash(struct mb_cache_entry *ce) -+{ -+ int n; -+ -+ if (__mb_cache_entry_is_hashed(ce)) { -+ list_del_init(&ce->e_block_list); -+ for (n=0; ne_cache); n++) -+ list_del(&ce->e_indexes[n].o_list); -+ } -+} -+ -+ -+static inline void -+__mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask) -+{ -+ struct mb_cache *cache = ce->e_cache; -+ -+ mb_assert(atomic_read(&ce->e_used) == 0); -+ if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) { -+ /* free failed -- put back on the lru list -+ for freeing later. */ -+ spin_lock(&mb_cache_spinlock); -+ list_add(&ce->e_lru_list, &mb_cache_lru_list); -+ spin_unlock(&mb_cache_spinlock); -+ } else { -+ kmem_cache_free(cache->c_entry_cache, ce); -+ atomic_dec(&cache->c_entry_count); -+ } -+} -+ -+ -+static inline void -+__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) -+{ -+ if (atomic_dec_and_test(&ce->e_used)) { -+ if (__mb_cache_entry_is_hashed(ce)) -+ list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); -+ else { -+ spin_unlock(&mb_cache_spinlock); -+ __mb_cache_entry_forget(ce, GFP_KERNEL); -+ return; -+ } -+ } -+ spin_unlock(&mb_cache_spinlock); -+} -+ -+ -+/* -+ * mb_cache_memory_pressure() memory pressure callback -+ * -+ * This function is called by the kernel memory management when memory -+ * gets low. 
-+ * -+ * @priority: Amount by which to shrink the cache (0 = highes priority) -+ * @gfp_mask: (ignored) -+ */ -+static void -+mb_cache_memory_pressure(int priority, unsigned int gfp_mask) -+{ -+ LIST_HEAD(free_list); -+ struct list_head *l, *ltmp; -+ int count = 0; -+ -+ spin_lock(&mb_cache_spinlock); -+ list_for_each(l, &mb_cache_list) { -+ struct mb_cache *cache = -+ list_entry(l, struct mb_cache, c_cache_list); -+ mb_debug("cache %s (%d)", cache->c_name, -+ atomic_read(&cache->c_entry_count)); -+ count += atomic_read(&cache->c_entry_count); -+ } -+ mb_debug("trying to free %d of %d entries", -+ count / (priority ? priority : 1), count); -+ if (priority) -+ count /= priority; -+ while (count-- && !list_empty(&mb_cache_lru_list)) { -+ struct mb_cache_entry *ce = -+ list_entry(mb_cache_lru_list.next, -+ struct mb_cache_entry, e_lru_list); -+ list_del(&ce->e_lru_list); -+ __mb_cache_entry_unhash(ce); -+ list_add_tail(&ce->e_lru_list, &free_list); -+ } -+ spin_unlock(&mb_cache_spinlock); -+ list_for_each_safe(l, ltmp, &free_list) { -+ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, -+ e_lru_list), gfp_mask); -+ } -+} -+ -+ -+/* -+ * mb_cache_create() create a new cache -+ * -+ * All entries in one cache are equal size. Cache entries may be from -+ * multiple devices. If this is the first mbcache created, registers -+ * the cache with kernel memory management. Returns NULL if no more -+ * memory was available. -+ * -+ * @name: name of the cache (informal) -+ * @cache_op: contains the callback called when freeing a cache entry -+ * @entry_size: The size of a cache entry, including -+ * struct mb_cache_entry -+ * @indexes_count: number of additional indexes in the cache. Must equal -+ * MB_CACHE_INDEXES_COUNT if the number of indexes is -+ * hardwired. 
-+ * @bucket_count: number of hash buckets -+ */ -+struct mb_cache * -+mb_cache_create(const char *name, struct mb_cache_op *cache_op, -+ size_t entry_size, int indexes_count, int bucket_count) -+{ -+ int m=0, n; -+ struct mb_cache *cache = NULL; -+ -+ if(entry_size < sizeof(struct mb_cache_entry) + -+ indexes_count * sizeof(struct mb_cache_entry_index)) -+ return NULL; -+ -+ MOD_INC_USE_COUNT; -+ cache = kmalloc(sizeof(struct mb_cache) + -+ indexes_count * sizeof(struct list_head), GFP_KERNEL); -+ if (!cache) -+ goto fail; -+ cache->c_name = name; -+ cache->c_op.free = NULL; -+ if (cache_op) -+ cache->c_op.free = cache_op->free; -+ atomic_set(&cache->c_entry_count, 0); -+ cache->c_bucket_count = bucket_count; -+#ifdef MB_CACHE_INDEXES_COUNT -+ mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT); -+#else -+ cache->c_indexes_count = indexes_count; -+#endif -+ cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), -+ GFP_KERNEL); -+ if (!cache->c_block_hash) -+ goto fail; -+ for (n=0; nc_block_hash[n]); -+ for (m=0; mc_indexes_hash[m] = kmalloc(bucket_count * -+ sizeof(struct list_head), -+ GFP_KERNEL); -+ if (!cache->c_indexes_hash[m]) -+ goto fail; -+ for (n=0; nc_indexes_hash[m][n]); -+ } -+ cache->c_entry_cache = kmem_cache_create(name, entry_size, 0, -+ 0 /*SLAB_POISON | SLAB_RED_ZONE*/, NULL, NULL); -+ if (!cache->c_entry_cache) -+ goto fail; -+ -+ spin_lock(&mb_cache_spinlock); -+ list_add(&cache->c_cache_list, &mb_cache_list); -+ spin_unlock(&mb_cache_spinlock); -+ return cache; -+ -+fail: -+ if (cache) { -+ while (--m >= 0) -+ kfree(cache->c_indexes_hash[m]); -+ if (cache->c_block_hash) -+ kfree(cache->c_block_hash); -+ kfree(cache); -+ } -+ MOD_DEC_USE_COUNT; -+ return NULL; -+} -+ -+ -+/* -+ * mb_cache_shrink() -+ * -+ * Removes all cache entires of a device from the cache. All cache entries -+ * currently in use cannot be freed, and thus remain in the cache. 
-+ * -+ * @cache: which cache to shrink -+ * @dev: which device's cache entries to shrink -+ */ -+void -+mb_cache_shrink(struct mb_cache *cache, kdev_t dev) -+{ -+ LIST_HEAD(free_list); -+ struct list_head *l, *ltmp; -+ -+ spin_lock(&mb_cache_spinlock); -+ list_for_each_safe(l, ltmp, &mb_cache_lru_list) { -+ struct mb_cache_entry *ce = -+ list_entry(l, struct mb_cache_entry, e_lru_list); -+ if (ce->e_dev == dev) { -+ list_del(&ce->e_lru_list); -+ list_add_tail(&ce->e_lru_list, &free_list); -+ __mb_cache_entry_unhash(ce); -+ } -+ } -+ spin_unlock(&mb_cache_spinlock); -+ list_for_each_safe(l, ltmp, &free_list) { -+ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, -+ e_lru_list), GFP_KERNEL); -+ } -+} -+ -+ -+/* -+ * mb_cache_destroy() -+ * -+ * Shrinks the cache to its minimum possible size (hopefully 0 entries), -+ * and then destroys it. If this was the last mbcache, un-registers the -+ * mbcache from kernel memory management. -+ */ -+void -+mb_cache_destroy(struct mb_cache *cache) -+{ -+ LIST_HEAD(free_list); -+ struct list_head *l, *ltmp; -+ int n; -+ -+ spin_lock(&mb_cache_spinlock); -+ list_for_each_safe(l, ltmp, &mb_cache_lru_list) { -+ struct mb_cache_entry *ce = -+ list_entry(l, struct mb_cache_entry, e_lru_list); -+ if (ce->e_cache == cache) { -+ list_del(&ce->e_lru_list); -+ list_add_tail(&ce->e_lru_list, &free_list); -+ __mb_cache_entry_unhash(ce); -+ } -+ } -+ list_del(&cache->c_cache_list); -+ spin_unlock(&mb_cache_spinlock); -+ list_for_each_safe(l, ltmp, &free_list) { -+ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, -+ e_lru_list), GFP_KERNEL); -+ } -+ -+ if (atomic_read(&cache->c_entry_count) > 0) { -+ mb_error("cache %s: %d orphaned entries", -+ cache->c_name, -+ atomic_read(&cache->c_entry_count)); -+ } -+ -+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0)) -+ /* We don't have kmem_cache_destroy() in 2.2.x */ -+ kmem_cache_shrink(cache->c_entry_cache); -+#else -+ kmem_cache_destroy(cache->c_entry_cache); -+#endif -+ for 
(n=0; n < mb_cache_indexes(cache); n++) -+ kfree(cache->c_indexes_hash[n]); -+ kfree(cache->c_block_hash); -+ kfree(cache); -+ -+ MOD_DEC_USE_COUNT; -+} -+ -+ -+/* -+ * mb_cache_entry_alloc() -+ * -+ * Allocates a new cache entry. The new entry will not be valid initially, -+ * and thus cannot be looked up yet. It should be filled with data, and -+ * then inserted into the cache using mb_cache_entry_insert(). Returns NULL -+ * if no more memory was available. -+ */ -+struct mb_cache_entry * -+mb_cache_entry_alloc(struct mb_cache *cache) -+{ -+ struct mb_cache_entry *ce; -+ -+ atomic_inc(&cache->c_entry_count); -+ ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL); -+ if (ce) { -+ INIT_LIST_HEAD(&ce->e_lru_list); -+ INIT_LIST_HEAD(&ce->e_block_list); -+ ce->e_cache = cache; -+ atomic_set(&ce->e_used, 1); -+ } -+ return ce; -+} -+ -+ -+/* -+ * mb_cache_entry_insert() -+ * -+ * Inserts an entry that was allocated using mb_cache_entry_alloc() into -+ * the cache. After this, the cache entry can be looked up, but is not yet -+ * in the lru list as the caller still holds a handle to it. Returns 0 on -+ * success, or -EBUSY if a cache entry for that device + inode exists -+ * already (this may happen after a failed lookup, if another process has -+ * inserted the same cache entry in the meantime). -+ * -+ * @dev: device the cache entry belongs to -+ * @block: block number -+ * @keys: array of additional keys. There must be indexes_count entries -+ * in the array (as specified when creating the cache). 
-+ */ -+int -+mb_cache_entry_insert(struct mb_cache_entry *ce, kdev_t dev, -+ unsigned long block, unsigned int keys[]) -+{ -+ struct mb_cache *cache = ce->e_cache; -+ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count; -+ struct list_head *l; -+ int error = -EBUSY, n; -+ -+ spin_lock(&mb_cache_spinlock); -+ list_for_each(l, &cache->c_block_hash[bucket]) { -+ struct mb_cache_entry *ce = -+ list_entry(l, struct mb_cache_entry, e_block_list); -+ if (ce->e_dev == dev && ce->e_block == block) -+ goto out; -+ } -+ __mb_cache_entry_unhash(ce); -+ ce->e_dev = dev; -+ ce->e_block = block; -+ list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); -+ for (n=0; ne_indexes[n].o_key = keys[n]; -+ bucket = keys[n] % cache->c_bucket_count; -+ list_add(&ce->e_indexes[n].o_list, -+ &cache->c_indexes_hash[n][bucket]); -+ } -+out: -+ spin_unlock(&mb_cache_spinlock); -+ return error; -+} -+ -+ -+/* -+ * mb_cache_entry_release() -+ * -+ * Release a handle to a cache entry. When the last handle to a cache entry -+ * is released it is either freed (if it is invalid) or otherwise inserted -+ * in to the lru list. -+ */ -+void -+mb_cache_entry_release(struct mb_cache_entry *ce) -+{ -+ spin_lock(&mb_cache_spinlock); -+ __mb_cache_entry_release_unlock(ce); -+} -+ -+ -+/* -+ * mb_cache_entry_takeout() -+ * -+ * Take a cache entry out of the cache, making it invalid. The entry can later -+ * be re-inserted using mb_cache_entry_insert(), or released using -+ * mb_cache_entry_release(). -+ */ -+void -+mb_cache_entry_takeout(struct mb_cache_entry *ce) -+{ -+ spin_lock(&mb_cache_spinlock); -+ mb_assert(list_empty(&ce->e_lru_list)); -+ __mb_cache_entry_unhash(ce); -+ spin_unlock(&mb_cache_spinlock); -+} -+ -+ -+/* -+ * mb_cache_entry_free() -+ * -+ * This is equivalent to the sequence mb_cache_entry_takeout() -- -+ * mb_cache_entry_release(). 
-+ */ -+void -+mb_cache_entry_free(struct mb_cache_entry *ce) -+{ -+ spin_lock(&mb_cache_spinlock); -+ mb_assert(list_empty(&ce->e_lru_list)); -+ __mb_cache_entry_unhash(ce); -+ __mb_cache_entry_release_unlock(ce); -+} -+ -+ -+/* -+ * mb_cache_entry_dup() -+ * -+ * Duplicate a handle to a cache entry (does not duplicate the cache entry -+ * itself). After the call, both the old and the new handle must be released. -+ */ -+struct mb_cache_entry * -+mb_cache_entry_dup(struct mb_cache_entry *ce) -+{ -+ atomic_inc(&ce->e_used); -+ return ce; -+} -+ -+ -+/* -+ * mb_cache_entry_get() -+ * -+ * Get a cache entry by device / block number. (There can only be one entry -+ * in the cache per device and block.) Returns NULL if no such cache entry -+ * exists. -+ */ -+struct mb_cache_entry * -+mb_cache_entry_get(struct mb_cache *cache, kdev_t dev, unsigned long block) -+{ -+ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count; -+ struct list_head *l; -+ struct mb_cache_entry *ce; -+ -+ spin_lock(&mb_cache_spinlock); -+ list_for_each(l, &cache->c_block_hash[bucket]) { -+ ce = list_entry(l, struct mb_cache_entry, e_block_list); -+ if (ce->e_dev == dev && ce->e_block == block) { -+ if (!list_empty(&ce->e_lru_list)) -+ list_del_init(&ce->e_lru_list); -+ atomic_inc(&ce->e_used); -+ goto cleanup; -+ } -+ } -+ ce = NULL; -+ -+cleanup: -+ spin_unlock(&mb_cache_spinlock); -+ return ce; -+} -+ -+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) -+ -+static struct mb_cache_entry * -+__mb_cache_entry_find(struct list_head *l, struct list_head *head, -+ int index, kdev_t dev, unsigned int key) -+{ -+ while (l != head) { -+ struct mb_cache_entry *ce = -+ list_entry(l, struct mb_cache_entry, -+ e_indexes[index].o_list); -+ if (ce->e_dev == dev && ce->e_indexes[index].o_key == key) { -+ if (!list_empty(&ce->e_lru_list)) -+ list_del_init(&ce->e_lru_list); -+ atomic_inc(&ce->e_used); -+ return ce; -+ } -+ l = l->next; -+ } -+ return NULL; -+} -+ -+ -+/* -+ * 
mb_cache_entry_find_first() -+ * -+ * Find the first cache entry on a given device with a certain key in -+ * an additional index. Additonal matches can be found with -+ * mb_cache_entry_find_next(). Returns NULL if no match was found. -+ * -+ * @cache: the cache to search -+ * @index: the number of the additonal index to search (0<=indexc_bucket_count; -+ struct list_head *l; -+ struct mb_cache_entry *ce; -+ -+ mb_assert(index < mb_cache_indexes(cache)); -+ spin_lock(&mb_cache_spinlock); -+ l = cache->c_indexes_hash[index][bucket].next; -+ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], -+ index, dev, key); -+ spin_unlock(&mb_cache_spinlock); -+ return ce; -+} -+ -+ -+/* -+ * mb_cache_entry_find_next() -+ * -+ * Find the next cache entry on a given device with a certain key in an -+ * additional index. Returns NULL if no match could be found. The previous -+ * entry is atomatically released, so that mb_cache_entry_find_next() can -+ * be called like this: -+ * -+ * entry = mb_cache_entry_find_first(); -+ * while (entry) { -+ * ... 
-+ * entry = mb_cache_entry_find_next(entry, ...); -+ * } -+ * -+ * @prev: The previous match -+ * @index: the number of the additonal index to search (0<=indexe_cache; -+ unsigned int bucket = key % cache->c_bucket_count; -+ struct list_head *l; -+ struct mb_cache_entry *ce; -+ -+ mb_assert(index < mb_cache_indexes(cache)); -+ spin_lock(&mb_cache_spinlock); -+ l = prev->e_indexes[index].o_list.next; -+ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], -+ index, dev, key); -+ __mb_cache_entry_release_unlock(prev); -+ return ce; -+} -+ -+#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ -+ -+static int __init init_mbcache(void) -+{ -+ register_cache(&mb_cache_definition); -+ return 0; -+} -+ -+static void __exit exit_mbcache(void) -+{ -+ unregister_cache(&mb_cache_definition); -+} -+ -+module_init(init_mbcache) -+module_exit(exit_mbcache) -+ -Index: linux-DRV401/fs/xattr.c -=================================================================== ---- linux-DRV401.orig/fs/xattr.c 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/fs/xattr.c 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,355 @@ -+/* -+ File: fs/xattr.c -+ -+ Extended attribute handling. -+ -+ Copyright (C) 2001 by Andreas Gruenbacher -+ Copyright (C) 2001 SGI - Silicon Graphics, Inc -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * Extended attribute memory allocation wrappers, originally -+ * based on the Intermezzo PRESTO_ALLOC/PRESTO_FREE macros. -+ * The vmalloc use here is very uncommon - extended attributes -+ * are supposed to be small chunks of metadata, and it is quite -+ * unusual to have very many extended attributes, so lists tend -+ * to be quite short as well. The 64K upper limit is derived -+ * from the extended attribute size limit used by XFS. -+ * Intentionally allow zero @size for value/list size requests. 
-+ */ -+static void * -+xattr_alloc(size_t size, size_t limit) -+{ -+ void *ptr; -+ -+ if (size > limit) -+ return ERR_PTR(-E2BIG); -+ -+ if (!size) /* size request, no buffer is needed */ -+ return NULL; -+ else if (size <= PAGE_SIZE) -+ ptr = kmalloc((unsigned long) size, GFP_KERNEL); -+ else -+ ptr = vmalloc((unsigned long) size); -+ if (!ptr) -+ return ERR_PTR(-ENOMEM); -+ return ptr; -+} -+ -+static void -+xattr_free(void *ptr, size_t size) -+{ -+ if (!size) /* size request, no buffer was needed */ -+ return; -+ else if (size <= PAGE_SIZE) -+ kfree(ptr); -+ else -+ vfree(ptr); -+} -+ -+/* -+ * Extended attribute SET operations -+ */ -+static long -+setxattr(struct dentry *d, char *name, void *value, size_t size, int flags) -+{ -+ int error; -+ void *kvalue; -+ char kname[XATTR_NAME_MAX + 1]; -+ -+ if (flags & ~(XATTR_CREATE|XATTR_REPLACE)) -+ return -EINVAL; -+ -+ error = strncpy_from_user(kname, name, sizeof(kname)); -+ if (error == 0 || error == sizeof(kname)) -+ error = -ERANGE; -+ if (error < 0) -+ return error; -+ -+ kvalue = xattr_alloc(size, XATTR_SIZE_MAX); -+ if (IS_ERR(kvalue)) -+ return PTR_ERR(kvalue); -+ -+ if (size > 0 && copy_from_user(kvalue, value, size)) { -+ xattr_free(kvalue, size); -+ return -EFAULT; -+ } -+ -+ error = -EOPNOTSUPP; -+ if (d->d_inode->i_op && d->d_inode->i_op->setxattr) { -+ down(&d->d_inode->i_sem); -+ lock_kernel(); -+ error = d->d_inode->i_op->setxattr(d, kname, kvalue, size, flags); -+ unlock_kernel(); -+ up(&d->d_inode->i_sem); -+ } -+ -+ xattr_free(kvalue, size); -+ return error; -+} -+ -+asmlinkage long -+sys_setxattr(char *path, char *name, void *value, size_t size, int flags) -+{ -+ struct nameidata nd; -+ int error; -+ -+ error = user_path_walk(path, &nd); -+ if (error) -+ return error; -+ error = setxattr(nd.dentry, name, value, size, flags); -+ path_release(&nd); -+ return error; -+} -+ -+asmlinkage long -+sys_lsetxattr(char *path, char *name, void *value, size_t size, int flags) -+{ -+ struct nameidata nd; -+ 
int error; -+ -+ error = user_path_walk_link(path, &nd); -+ if (error) -+ return error; -+ error = setxattr(nd.dentry, name, value, size, flags); -+ path_release(&nd); -+ return error; -+} -+ -+asmlinkage long -+sys_fsetxattr(int fd, char *name, void *value, size_t size, int flags) -+{ -+ struct file *f; -+ int error = -EBADF; -+ -+ f = fget(fd); -+ if (!f) -+ return error; -+ error = setxattr(f->f_dentry, name, value, size, flags); -+ fput(f); -+ return error; -+} -+ -+/* -+ * Extended attribute GET operations -+ */ -+static ssize_t -+getxattr(struct dentry *d, char *name, void *value, size_t size) -+{ -+ ssize_t error; -+ void *kvalue; -+ char kname[XATTR_NAME_MAX + 1]; -+ -+ error = strncpy_from_user(kname, name, sizeof(kname)); -+ if (error == 0 || error == sizeof(kname)) -+ error = -ERANGE; -+ if (error < 0) -+ return error; -+ -+ kvalue = xattr_alloc(size, XATTR_SIZE_MAX); -+ if (IS_ERR(kvalue)) -+ return PTR_ERR(kvalue); -+ -+ error = -EOPNOTSUPP; -+ if (d->d_inode->i_op && d->d_inode->i_op->getxattr) { -+ down(&d->d_inode->i_sem); -+ lock_kernel(); -+ error = d->d_inode->i_op->getxattr(d, kname, kvalue, size); -+ unlock_kernel(); -+ up(&d->d_inode->i_sem); -+ } -+ -+ if (kvalue && error > 0) -+ if (copy_to_user(value, kvalue, error)) -+ error = -EFAULT; -+ xattr_free(kvalue, size); -+ return error; -+} -+ -+asmlinkage ssize_t -+sys_getxattr(char *path, char *name, void *value, size_t size) -+{ -+ struct nameidata nd; -+ ssize_t error; -+ -+ error = user_path_walk(path, &nd); -+ if (error) -+ return error; -+ error = getxattr(nd.dentry, name, value, size); -+ path_release(&nd); -+ return error; -+} -+ -+asmlinkage ssize_t -+sys_lgetxattr(char *path, char *name, void *value, size_t size) -+{ -+ struct nameidata nd; -+ ssize_t error; -+ -+ error = user_path_walk_link(path, &nd); -+ if (error) -+ return error; -+ error = getxattr(nd.dentry, name, value, size); -+ path_release(&nd); -+ return error; -+} -+ -+asmlinkage ssize_t -+sys_fgetxattr(int fd, char *name, 
void *value, size_t size) -+{ -+ struct file *f; -+ ssize_t error = -EBADF; -+ -+ f = fget(fd); -+ if (!f) -+ return error; -+ error = getxattr(f->f_dentry, name, value, size); -+ fput(f); -+ return error; -+} -+ -+/* -+ * Extended attribute LIST operations -+ */ -+static ssize_t -+listxattr(struct dentry *d, char *list, size_t size) -+{ -+ ssize_t error; -+ char *klist; -+ -+ klist = (char *)xattr_alloc(size, XATTR_LIST_MAX); -+ if (IS_ERR(klist)) -+ return PTR_ERR(klist); -+ -+ error = -EOPNOTSUPP; -+ if (d->d_inode->i_op && d->d_inode->i_op->listxattr) { -+ down(&d->d_inode->i_sem); -+ lock_kernel(); -+ error = d->d_inode->i_op->listxattr(d, klist, size); -+ unlock_kernel(); -+ up(&d->d_inode->i_sem); -+ } -+ -+ if (klist && error > 0) -+ if (copy_to_user(list, klist, error)) -+ error = -EFAULT; -+ xattr_free(klist, size); -+ return error; -+} -+ -+asmlinkage ssize_t -+sys_listxattr(char *path, char *list, size_t size) -+{ -+ struct nameidata nd; -+ ssize_t error; -+ -+ error = user_path_walk(path, &nd); -+ if (error) -+ return error; -+ error = listxattr(nd.dentry, list, size); -+ path_release(&nd); -+ return error; -+} -+ -+asmlinkage ssize_t -+sys_llistxattr(char *path, char *list, size_t size) -+{ -+ struct nameidata nd; -+ ssize_t error; -+ -+ error = user_path_walk_link(path, &nd); -+ if (error) -+ return error; -+ error = listxattr(nd.dentry, list, size); -+ path_release(&nd); -+ return error; -+} -+ -+asmlinkage ssize_t -+sys_flistxattr(int fd, char *list, size_t size) -+{ -+ struct file *f; -+ ssize_t error = -EBADF; -+ -+ f = fget(fd); -+ if (!f) -+ return error; -+ error = listxattr(f->f_dentry, list, size); -+ fput(f); -+ return error; -+} -+ -+/* -+ * Extended attribute REMOVE operations -+ */ -+static long -+removexattr(struct dentry *d, char *name) -+{ -+ int error; -+ char kname[XATTR_NAME_MAX + 1]; -+ -+ error = strncpy_from_user(kname, name, sizeof(kname)); -+ if (error == 0 || error == sizeof(kname)) -+ error = -ERANGE; -+ if (error < 0) -+ 
return error; -+ -+ error = -EOPNOTSUPP; -+ if (d->d_inode->i_op && d->d_inode->i_op->removexattr) { -+ down(&d->d_inode->i_sem); -+ lock_kernel(); -+ error = d->d_inode->i_op->removexattr(d, kname); -+ unlock_kernel(); -+ up(&d->d_inode->i_sem); -+ } -+ return error; -+} -+ -+asmlinkage long -+sys_removexattr(char *path, char *name) -+{ -+ struct nameidata nd; -+ int error; -+ -+ error = user_path_walk(path, &nd); -+ if (error) -+ return error; -+ error = removexattr(nd.dentry, name); -+ path_release(&nd); -+ return error; -+} -+ -+asmlinkage long -+sys_lremovexattr(char *path, char *name) -+{ -+ struct nameidata nd; -+ int error; -+ -+ error = user_path_walk_link(path, &nd); -+ if (error) -+ return error; -+ error = removexattr(nd.dentry, name); -+ path_release(&nd); -+ return error; -+} -+ -+asmlinkage long -+sys_fremovexattr(int fd, char *name) -+{ -+ struct file *f; -+ int error = -EBADF; -+ -+ f = fget(fd); -+ if (!f) -+ return error; -+ error = removexattr(f->f_dentry, name); -+ fput(f); -+ return error; -+} -Index: linux-DRV401/include/linux/cache_def.h -=================================================================== ---- linux-DRV401.orig/include/linux/cache_def.h 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/include/linux/cache_def.h 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,15 @@ -+/* -+ * linux/cache_def.h -+ * Handling of caches defined in drivers, filesystems, ... 
-+ * -+ * Copyright (C) 2002 by Andreas Gruenbacher, -+ */ -+ -+struct cache_definition { -+ const char *name; -+ void (*shrink)(int, unsigned int); -+ struct list_head link; -+}; -+ -+extern void register_cache(struct cache_definition *); -+extern void unregister_cache(struct cache_definition *); -Index: linux-DRV401/include/linux/errno.h -=================================================================== ---- linux-DRV401.orig/include/linux/errno.h 2004-10-15 10:26:15.000000000 -0700 -+++ linux-DRV401/include/linux/errno.h 2004-10-15 11:03:52.000000000 -0700 -@@ -23,4 +23,8 @@ - - #endif - -+/* Defined for extended attributes */ -+#define ENOATTR ENODATA /* No such attribute */ -+#define ENOTSUP EOPNOTSUPP /* Operation not supported */ -+ - #endif -Index: linux-DRV401/include/linux/ext2_fs.h -=================================================================== ---- linux-DRV401.orig/include/linux/ext2_fs.h 2004-10-15 10:26:11.000000000 -0700 -+++ linux-DRV401/include/linux/ext2_fs.h 2004-10-15 11:03:52.000000000 -0700 -@@ -57,8 +57,6 @@ - */ - #define EXT2_BAD_INO 1 /* Bad blocks inode */ - #define EXT2_ROOT_INO 2 /* Root inode */ --#define EXT2_ACL_IDX_INO 3 /* ACL inode */ --#define EXT2_ACL_DATA_INO 4 /* ACL inode */ - #define EXT2_BOOT_LOADER_INO 5 /* Boot loader inode */ - #define EXT2_UNDEL_DIR_INO 6 /* Undelete directory inode */ - -@@ -86,7 +84,6 @@ - #else - # define EXT2_BLOCK_SIZE(s) (EXT2_MIN_BLOCK_SIZE << (s)->s_log_block_size) - #endif --#define EXT2_ACLE_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (struct ext2_acl_entry)) - #define EXT2_ADDR_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (__u32)) - #ifdef __KERNEL__ - # define EXT2_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) -@@ -121,28 +118,6 @@ - #endif - - /* -- * ACL structures -- */ --struct ext2_acl_header /* Header of Access Control Lists */ --{ -- __u32 aclh_size; -- __u32 aclh_file_count; -- __u32 aclh_acle_count; -- __u32 aclh_first_acle; --}; -- --struct ext2_acl_entry /* Access Control List 
Entry */ --{ -- __u32 acle_size; -- __u16 acle_perms; /* Access permissions */ -- __u16 acle_type; /* Type of entry */ -- __u16 acle_tag; /* User or group identity */ -- __u16 acle_pad1; -- __u32 acle_next; /* Pointer on next entry for the */ -- /* same inode or on next free entry */ --}; -- --/* - * Structure of a blocks group descriptor - */ - struct ext2_group_desc -@@ -314,6 +289,7 @@ - #define EXT2_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ - #define EXT2_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */ - #define EXT2_MOUNT_NO_UID32 0x0200 /* Disable 32-bit UIDs */ -+#define EXT2_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ - - #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt - #define set_opt(o, opt) o |= EXT2_MOUNT_##opt -@@ -397,6 +373,7 @@ - - #ifdef __KERNEL__ - #define EXT2_SB(sb) (&((sb)->u.ext2_sb)) -+#define EXT2_I(inode) (&((inode)->u.ext2_i)) - #else - /* Assume that user mode programs are passing in an ext2fs superblock, not - * a kernel struct super_block. 
This will allow us to call the feature-test -@@ -466,7 +443,7 @@ - #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 - #define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff - --#define EXT2_FEATURE_COMPAT_SUPP 0 -+#define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR - #define EXT2_FEATURE_INCOMPAT_SUPP EXT2_FEATURE_INCOMPAT_FILETYPE - #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \ -@@ -623,8 +600,10 @@ - - /* namei.c */ - extern struct inode_operations ext2_dir_inode_operations; -+extern struct inode_operations ext2_special_inode_operations; - - /* symlink.c */ -+extern struct inode_operations ext2_symlink_inode_operations; - extern struct inode_operations ext2_fast_symlink_inode_operations; - - #endif /* __KERNEL__ */ -Index: linux-DRV401/include/linux/ext2_xattr.h -=================================================================== ---- linux-DRV401.orig/include/linux/ext2_xattr.h 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/include/linux/ext2_xattr.h 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,157 @@ -+/* -+ File: linux/ext2_xattr.h -+ -+ On-disk format of extended attributes for the ext2 filesystem. 
-+ -+ (C) 2001 Andreas Gruenbacher, -+*/ -+ -+#include -+#include -+#include -+ -+/* Magic value in attribute blocks */ -+#define EXT2_XATTR_MAGIC 0xEA020000 -+ -+/* Maximum number of references to one attribute block */ -+#define EXT2_XATTR_REFCOUNT_MAX 1024 -+ -+/* Name indexes */ -+#define EXT2_XATTR_INDEX_MAX 10 -+#define EXT2_XATTR_INDEX_USER 1 -+#define EXT2_XATTR_INDEX_POSIX_ACL_ACCESS 2 -+#define EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT 3 -+ -+struct ext2_xattr_header { -+ __u32 h_magic; /* magic number for identification */ -+ __u32 h_refcount; /* reference count */ -+ __u32 h_blocks; /* number of disk blocks used */ -+ __u32 h_hash; /* hash value of all attributes */ -+ __u32 h_reserved[4]; /* zero right now */ -+}; -+ -+struct ext2_xattr_entry { -+ __u8 e_name_len; /* length of name */ -+ __u8 e_name_index; /* attribute name index */ -+ __u16 e_value_offs; /* offset in disk block of value */ -+ __u32 e_value_block; /* disk block attribute is stored on (n/i) */ -+ __u32 e_value_size; /* size of attribute value */ -+ __u32 e_hash; /* hash value of name and value */ -+ char e_name[0]; /* attribute name */ -+}; -+ -+#define EXT2_XATTR_PAD_BITS 2 -+#define EXT2_XATTR_PAD (1<e_name_len)) ) -+#define EXT2_XATTR_SIZE(size) \ -+ (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) -+ -+#ifdef __KERNEL__ -+ -+# ifdef CONFIG_EXT2_FS_XATTR -+ -+struct ext2_xattr_handler { -+ char *prefix; -+ size_t (*list)(char *list, struct inode *inode, const char *name, -+ int name_len); -+ int (*get)(struct inode *inode, const char *name, void *buffer, -+ size_t size); -+ int (*set)(struct inode *inode, const char *name, const void *buffer, -+ size_t size, int flags); -+}; -+ -+extern int ext2_xattr_register(int, struct ext2_xattr_handler *); -+extern void ext2_xattr_unregister(int, struct ext2_xattr_handler *); -+ -+extern int ext2_setxattr(struct dentry *, const char *, const void *, size_t, int); -+extern ssize_t ext2_getxattr(struct dentry *, const char *, void *, size_t); -+extern 
ssize_t ext2_listxattr(struct dentry *, char *, size_t); -+extern int ext2_removexattr(struct dentry *, const char *); -+ -+extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t); -+extern int ext2_xattr_list(struct inode *, char *, size_t); -+extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); -+ -+extern void ext2_xattr_delete_inode(struct inode *); -+extern void ext2_xattr_put_super(struct super_block *); -+ -+extern int init_ext2_xattr(void) __init; -+extern void exit_ext2_xattr(void); -+ -+# else /* CONFIG_EXT2_FS_XATTR */ -+# define ext2_setxattr NULL -+# define ext2_getxattr NULL -+# define ext2_listxattr NULL -+# define ext2_removexattr NULL -+ -+static inline int -+ext2_xattr_get(struct inode *inode, int name_index, -+ const char *name, void *buffer, size_t size) -+{ -+ return -ENOTSUP; -+} -+ -+static inline int -+ext2_xattr_list(struct inode *inode, char *buffer, size_t size) -+{ -+ return -ENOTSUP; -+} -+ -+static inline int -+ext2_xattr_set(struct inode *inode, int name_index, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ return -ENOTSUP; -+} -+ -+static inline void -+ext2_xattr_delete_inode(struct inode *inode) -+{ -+} -+ -+static inline void -+ext2_xattr_put_super(struct super_block *sb) -+{ -+} -+ -+static inline int -+init_ext2_xattr(void) -+{ -+ return 0; -+} -+ -+static inline void -+exit_ext2_xattr(void) -+{ -+} -+ -+# endif /* CONFIG_EXT2_FS_XATTR */ -+ -+# ifdef CONFIG_EXT2_FS_XATTR_USER -+ -+extern int init_ext2_xattr_user(void) __init; -+extern void exit_ext2_xattr_user(void); -+ -+# else /* CONFIG_EXT2_FS_XATTR_USER */ -+ -+static inline int -+init_ext2_xattr_user(void) -+{ -+ return 0; -+} -+ -+static inline void -+exit_ext2_xattr_user(void) -+{ -+} -+ -+# endif /* CONFIG_EXT2_FS_XATTR_USER */ -+ -+#endif /* __KERNEL__ */ -+ -Index: linux-DRV401/include/linux/ext3_fs.h -=================================================================== ---- 
linux-DRV401.orig/include/linux/ext3_fs.h 2004-10-15 10:39:16.000000000 -0700 -+++ linux-DRV401/include/linux/ext3_fs.h 2004-10-15 11:03:52.000000000 -0700 -@@ -63,8 +63,6 @@ - */ - #define EXT3_BAD_INO 1 /* Bad blocks inode */ - #define EXT3_ROOT_INO 2 /* Root inode */ --#define EXT3_ACL_IDX_INO 3 /* ACL inode */ --#define EXT3_ACL_DATA_INO 4 /* ACL inode */ - #define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ - #define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ - #define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ -@@ -94,7 +92,6 @@ - #else - # define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size) - #endif --#define EXT3_ACLE_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry)) - #define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) - #ifdef __KERNEL__ - # define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) -@@ -129,28 +126,6 @@ - #endif - - /* -- * ACL structures -- */ --struct ext3_acl_header /* Header of Access Control Lists */ --{ -- __u32 aclh_size; -- __u32 aclh_file_count; -- __u32 aclh_acle_count; -- __u32 aclh_first_acle; --}; -- --struct ext3_acl_entry /* Access Control List Entry */ --{ -- __u32 acle_size; -- __u16 acle_perms; /* Access permissions */ -- __u16 acle_type; /* Type of entry */ -- __u16 acle_tag; /* User or group identity */ -- __u16 acle_pad1; -- __u32 acle_next; /* Pointer on next entry for the */ -- /* same inode or on next free entry */ --}; -- --/* - * Structure of a blocks group descriptor - */ - struct ext3_group_desc -@@ -344,6 +319,7 @@ - #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */ - #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ - #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ -+#define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H -@@ -520,7 +496,7 @@ - #define 
EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ - #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ - --#define EXT3_FEATURE_COMPAT_SUPP 0 -+#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR - #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ - EXT3_FEATURE_INCOMPAT_RECOVER) - #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ -@@ -703,6 +679,7 @@ - extern unsigned long ext3_count_free (struct buffer_head *, unsigned); - - /* inode.c */ -+extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); - extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); - extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); - -@@ -771,8 +748,10 @@ - - /* namei.c */ - extern struct inode_operations ext3_dir_inode_operations; -+extern struct inode_operations ext3_special_inode_operations; - - /* symlink.c */ -+extern struct inode_operations ext3_symlink_inode_operations; - extern struct inode_operations ext3_fast_symlink_inode_operations; - - -Index: linux-DRV401/include/linux/ext3_jbd.h -=================================================================== ---- linux-DRV401.orig/include/linux/ext3_jbd.h 2004-10-15 10:39:16.000000000 -0700 -+++ linux-DRV401/include/linux/ext3_jbd.h 2004-10-15 11:03:52.000000000 -0700 -@@ -30,13 +30,19 @@ - - #define EXT3_SINGLEDATA_TRANS_BLOCKS 8 - -+/* Extended attributes may touch two data buffers, two bitmap buffers, -+ * and two group and summaries. */ -+ -+#define EXT3_XATTR_TRANS_BLOCKS 8 -+ - /* Define the minimum size for a transaction which modifies data. This - * needs to take into account the fact that we may end up modifying two - * quota files too (one for the group, one for the user quota). The - * superblock only gets updated once, of course, so don't bother - * counting that again for the quota updates. 
*/ - --#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2) -+#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \ -+ EXT3_XATTR_TRANS_BLOCKS - 2) - - extern int ext3_writepage_trans_blocks(struct inode *inode); - -Index: linux-DRV401/include/linux/ext3_xattr.h -=================================================================== ---- linux-DRV401.orig/include/linux/ext3_xattr.h 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/include/linux/ext3_xattr.h 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,157 @@ -+/* -+ File: linux/ext3_xattr.h -+ -+ On-disk format of extended attributes for the ext3 filesystem. -+ -+ (C) 2001 Andreas Gruenbacher, -+*/ -+ -+#include -+#include -+#include -+ -+/* Magic value in attribute blocks */ -+#define EXT3_XATTR_MAGIC 0xEA020000 -+ -+/* Maximum number of references to one attribute block */ -+#define EXT3_XATTR_REFCOUNT_MAX 1024 -+ -+/* Name indexes */ -+#define EXT3_XATTR_INDEX_MAX 10 -+#define EXT3_XATTR_INDEX_USER 1 -+#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2 -+#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3 -+ -+struct ext3_xattr_header { -+ __u32 h_magic; /* magic number for identification */ -+ __u32 h_refcount; /* reference count */ -+ __u32 h_blocks; /* number of disk blocks used */ -+ __u32 h_hash; /* hash value of all attributes */ -+ __u32 h_reserved[4]; /* zero right now */ -+}; -+ -+struct ext3_xattr_entry { -+ __u8 e_name_len; /* length of name */ -+ __u8 e_name_index; /* attribute name index */ -+ __u16 e_value_offs; /* offset in disk block of value */ -+ __u32 e_value_block; /* disk block attribute is stored on (n/i) */ -+ __u32 e_value_size; /* size of attribute value */ -+ __u32 e_hash; /* hash value of name and value */ -+ char e_name[0]; /* attribute name */ -+}; -+ -+#define EXT3_XATTR_PAD_BITS 2 -+#define EXT3_XATTR_PAD (1<e_name_len)) ) -+#define EXT3_XATTR_SIZE(size) \ -+ (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND) -+ -+#ifdef __KERNEL__ -+ -+# ifdef 
CONFIG_EXT3_FS_XATTR -+ -+struct ext3_xattr_handler { -+ char *prefix; -+ size_t (*list)(char *list, struct inode *inode, const char *name, -+ int name_len); -+ int (*get)(struct inode *inode, const char *name, void *buffer, -+ size_t size); -+ int (*set)(struct inode *inode, const char *name, const void *buffer, -+ size_t size, int flags); -+}; -+ -+extern int ext3_xattr_register(int, struct ext3_xattr_handler *); -+extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *); -+ -+extern int ext3_setxattr(struct dentry *, const char *, const void *, size_t, int); -+extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t); -+extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); -+extern int ext3_removexattr(struct dentry *, const char *); -+ -+extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); -+extern int ext3_xattr_list(struct inode *, char *, size_t); -+extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, const void *, size_t, int); -+ -+extern void ext3_xattr_delete_inode(handle_t *, struct inode *); -+extern void ext3_xattr_put_super(struct super_block *); -+ -+extern int init_ext3_xattr(void) __init; -+extern void exit_ext3_xattr(void); -+ -+# else /* CONFIG_EXT3_FS_XATTR */ -+# define ext3_setxattr NULL -+# define ext3_getxattr NULL -+# define ext3_listxattr NULL -+# define ext3_removexattr NULL -+ -+static inline int -+ext3_xattr_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t size) -+{ -+ return -ENOTSUP; -+} -+ -+static inline int -+ext3_xattr_list(struct inode *inode, void *buffer, size_t size) -+{ -+ return -ENOTSUP; -+} -+ -+static inline int -+ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index, -+ const char *name, const void *value, size_t size, int flags) -+{ -+ return -ENOTSUP; -+} -+ -+static inline void -+ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) -+{ -+} -+ -+static inline void 
-+ext3_xattr_put_super(struct super_block *sb) -+{ -+} -+ -+static inline int -+init_ext3_xattr(void) -+{ -+ return 0; -+} -+ -+static inline void -+exit_ext3_xattr(void) -+{ -+} -+ -+# endif /* CONFIG_EXT3_FS_XATTR */ -+ -+# ifdef CONFIG_EXT3_FS_XATTR_USER -+ -+extern int init_ext3_xattr_user(void) __init; -+extern void exit_ext3_xattr_user(void); -+ -+# else /* CONFIG_EXT3_FS_XATTR_USER */ -+ -+static inline int -+init_ext3_xattr_user(void) -+{ -+ return 0; -+} -+ -+static inline void -+exit_ext3_xattr_user(void) -+{ -+} -+ -+#endif /* CONFIG_EXT3_FS_XATTR_USER */ -+ -+#endif /* __KERNEL__ */ -+ -Index: linux-DRV401/include/linux/fs.h -=================================================================== ---- linux-DRV401.orig/include/linux/fs.h 2004-10-15 10:39:15.000000000 -0700 -+++ linux-DRV401/include/linux/fs.h 2004-10-15 11:03:52.000000000 -0700 -@@ -936,6 +936,10 @@ - int (*setattr) (struct dentry *, struct iattr *); - int (*setattr_raw) (struct inode *, struct iattr *); - int (*getattr) (struct dentry *, struct iattr *); -+ int (*setxattr) (struct dentry *, const char *, const void *, size_t, int); -+ ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); -+ ssize_t (*listxattr) (struct dentry *, char *, size_t); -+ int (*removexattr) (struct dentry *, const char *); - }; - - struct seq_file; -Index: linux-DRV401/include/linux/mbcache.h -=================================================================== ---- linux-DRV401.orig/include/linux/mbcache.h 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/include/linux/mbcache.h 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,69 @@ -+/* -+ File: linux/mbcache.h -+ -+ (C) 2001 by Andreas Gruenbacher, -+*/ -+ -+/* Hardwire the number of additional indexes */ -+#define MB_CACHE_INDEXES_COUNT 1 -+ -+struct mb_cache_entry; -+ -+struct mb_cache_op { -+ int (*free)(struct mb_cache_entry *, int); -+}; -+ -+struct mb_cache { -+ struct list_head c_cache_list; -+ const char *c_name; -+ struct mb_cache_op 
c_op; -+ atomic_t c_entry_count; -+ int c_bucket_count; -+#ifndef MB_CACHE_INDEXES_COUNT -+ int c_indexes_count; -+#endif -+ kmem_cache_t *c_entry_cache; -+ struct list_head *c_block_hash; -+ struct list_head *c_indexes_hash[0]; -+}; -+ -+struct mb_cache_entry_index { -+ struct list_head o_list; -+ unsigned int o_key; -+}; -+ -+struct mb_cache_entry { -+ struct list_head e_lru_list; -+ struct mb_cache *e_cache; -+ atomic_t e_used; -+ kdev_t e_dev; -+ unsigned long e_block; -+ struct list_head e_block_list; -+ struct mb_cache_entry_index e_indexes[0]; -+}; -+ -+/* Functions on caches */ -+ -+struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t, -+ int, int); -+void mb_cache_shrink(struct mb_cache *, kdev_t); -+void mb_cache_destroy(struct mb_cache *); -+ -+/* Functions on cache entries */ -+ -+struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *); -+int mb_cache_entry_insert(struct mb_cache_entry *, kdev_t, unsigned long, -+ unsigned int[]); -+void mb_cache_entry_rehash(struct mb_cache_entry *, unsigned int[]); -+void mb_cache_entry_release(struct mb_cache_entry *); -+void mb_cache_entry_takeout(struct mb_cache_entry *); -+void mb_cache_entry_free(struct mb_cache_entry *); -+struct mb_cache_entry *mb_cache_entry_dup(struct mb_cache_entry *); -+struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, kdev_t, -+ unsigned long); -+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) -+struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, int, -+ kdev_t, unsigned int); -+struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, int, -+ kdev_t, unsigned int); -+#endif -Index: linux-DRV401/include/linux/xattr.h -=================================================================== ---- linux-DRV401.orig/include/linux/xattr.h 2004-10-12 08:56:38.404764448 -0700 -+++ linux-DRV401/include/linux/xattr.h 2004-10-15 11:03:52.000000000 -0700 -@@ -0,0 +1,15 @@ -+/* -+ File: linux/xattr.h -+ -+ 
Extended attributes handling. -+ -+ Copyright (C) 2001 by Andreas Gruenbacher -+ Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. -+*/ -+#ifndef _LINUX_XATTR_H -+#define _LINUX_XATTR_H -+ -+#define XATTR_CREATE 0x1 /* set the value, fail if attr already exists */ -+#define XATTR_REPLACE 0x2 /* set the value, fail if attr does not exist */ -+ -+#endif /* _LINUX_XATTR_H */ -Index: linux-DRV401/include/linux/limits.h -=================================================================== ---- linux-DRV401.orig/include/linux/limits.h 2004-10-15 10:26:20.000000000 -0700 -+++ linux-DRV401/include/linux/limits.h 2004-10-15 11:03:52.000000000 -0700 -@@ -13,6 +13,9 @@ - #define NAME_MAX 255 /* # chars in a file name */ - #define PATH_MAX 4096 /* # chars in a path name including nul */ - #define PIPE_BUF 4096 /* # bytes in atomic write to a pipe */ -+#define XATTR_NAME_MAX 255 /* # chars in an extended attribute name */ -+#define XATTR_SIZE_MAX 65536 /* size of an extended attribute value (64k) */ -+#define XATTR_LIST_MAX 65536 /* size of extended attribute namelist (64k) */ - - #define RTSIG_MAX 32 - -Index: linux-DRV401/kernel/ksyms.c -=================================================================== ---- linux-DRV401.orig/kernel/ksyms.c 2004-10-15 10:39:15.000000000 -0700 -+++ linux-DRV401/kernel/ksyms.c 2004-10-15 11:03:52.000000000 -0700 -@@ -11,6 +11,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -88,6 +89,7 @@ - EXPORT_SYMBOL(exit_files); - EXPORT_SYMBOL(exit_fs); - EXPORT_SYMBOL(exit_sighand); -+EXPORT_SYMBOL(copy_fs_struct); - EXPORT_SYMBOL(unshare_files); - - /* internal kernel memory management */ -@@ -105,6 +107,8 @@ - EXPORT_SYMBOL(kmem_cache_shrink); - EXPORT_SYMBOL(kmem_cache_alloc); - EXPORT_SYMBOL(kmem_cache_free); -+EXPORT_SYMBOL(register_cache); -+EXPORT_SYMBOL(unregister_cache); - EXPORT_SYMBOL(kmalloc); - EXPORT_SYMBOL(kfree); - EXPORT_SYMBOL(vfree); -Index: linux-DRV401/mm/vmscan.c 
-=================================================================== ---- linux-DRV401.orig/mm/vmscan.c 2004-10-15 10:24:07.000000000 -0700 -+++ linux-DRV401/mm/vmscan.c 2004-10-15 11:08:53.000000000 -0700 -@@ -15,6 +15,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -31,6 +32,39 @@ - */ - #define DEF_PRIORITY (6) - -+static DECLARE_MUTEX(other_caches_sem); -+static LIST_HEAD(cache_definitions); -+ -+void register_cache(struct cache_definition *cache) -+{ -+ down(&other_caches_sem); -+ list_add(&cache->link, &cache_definitions); -+ up(&other_caches_sem); -+} -+ -+void unregister_cache(struct cache_definition *cache) -+{ -+ down(&other_caches_sem); -+ list_del(&cache->link); -+ up(&other_caches_sem); -+} -+ -+static void shrink_other_caches(unsigned int priority, int gfp_mask) -+{ -+ struct list_head *p; -+ -+ if (down_trylock(&other_caches_sem)) -+ return; -+ -+ list_for_each_prev(p, &cache_definitions) { -+ struct cache_definition *cache = -+ list_entry(p, struct cache_definition, link); -+ -+ cache->shrink(priority, gfp_mask); -+ } -+ up(&other_caches_sem); -+} -+ - /* - * The swap-out function returns 1 if it successfully - * scanned all the pages it was asked to (`count'). 
-@@ -584,6 +618,7 @@ - - shrink_dcache_memory(priority, gfp_mask); - shrink_icache_memory(priority, gfp_mask); -+ shrink_other_caches(priority, gfp_mask); - #ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); - #endif diff --git a/lustre/kernel_patches/patches/linux-2.4.19-suse-xattr-0.8.54-hp.patch b/lustre/kernel_patches/patches/linux-2.4.19-suse-xattr-0.8.54-hp.patch deleted file mode 100644 index 1becfbc..0000000 --- a/lustre/kernel_patches/patches/linux-2.4.19-suse-xattr-0.8.54-hp.patch +++ /dev/null @@ -1,346 +0,0 @@ - Documentation/Configure.help | 66 ++ - arch/ia64/defconfig | 7 - fs/Config.in | 14 - fs/Makefile | 3 - fs/ext2/Makefile | 4 - fs/ext2/file.c | 5 - fs/ext2/ialloc.c | 2 - fs/ext2/inode.c | 34 - - fs/ext2/namei.c | 14 - fs/ext2/super.c | 29 - fs/ext2/symlink.c | 14 - fs/ext2/xattr.c | 1212 +++++++++++++++++++++++++++++++++++++++++ - fs/ext2/xattr_user.c | 103 +++ - fs/ext3/Makefile | 9 - fs/ext3/ext3-exports.c | 13 - fs/ext3/file.c | 5 - fs/ext3/ialloc.c | 2 - fs/ext3/inode.c | 35 - - fs/ext3/namei.c | 21 - fs/ext3/super.c | 36 + - fs/ext3/symlink.c | 14 - fs/ext3/xattr.c | 1225 ++++++++++++++++++++++++++++++++++++++++++ - fs/ext3/xattr_user.c | 111 +++ - fs/jfs/jfs_xattr.h | 6 - fs/jfs/xattr.c | 6 - fs/mbcache.c | 648 ++++++++++++++++++++++ - include/linux/cache_def.h | 15 - include/linux/errno.h | 4 - include/linux/ext2_fs.h | 31 - - include/linux/ext2_xattr.h | 157 +++++ - include/linux/ext3_fs.h | 31 - - include/linux/ext3_jbd.h | 8 - include/linux/ext3_xattr.h | 157 +++++ - include/linux/fs.h | 2 - include/linux/mbcache.h | 69 ++ - kernel/ksyms.c | 4 - mm/vmscan.c | 35 + - 62 files changed, 4343 insertions(+), 182 deletions(-) - -Index: linux-2.4.19.SuSE/Documentation/Configure.help -=================================================================== ---- linux-2.4.19.SuSE.orig/Documentation/Configure.help 2004-05-03 11:20:17.000000000 -0700 -+++ linux-2.4.19.SuSE/Documentation/Configure.help 2004-05-03 11:50:22.000000000 
-0700 -@@ -15296,6 +15296,39 @@ - - If unsure, say N. - -+Ext2 extended attributes -+CONFIG_EXT2_FS_XATTR -+ Extended attributes are name:value pairs associated with inodes by -+ the kernel or by users (see the attr(5) manual page, or visit -+ for details). -+ -+ If unsure, say N. -+ -+Ext2 extended attribute block sharing -+CONFIG_EXT2_FS_XATTR_SHARING -+ This options enables code for sharing identical extended attribute -+ blocks among multiple inodes. -+ -+ Usually, say Y. -+ -+Ext2 extended user attributes -+CONFIG_EXT2_FS_XATTR_USER -+ This option enables extended user attributes on ext2. Processes can -+ associate extended user attributes with inodes to store additional -+ information such as the character encoding of files, etc. (see the -+ attr(5) manual page, or visit for details). -+ -+ If unsure, say N. -+ -+Ext2 trusted extended attributes -+CONFIG_EXT2_FS_XATTR_TRUSTED -+ This option enables extended attributes on ext2 that are accessible -+ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this -+ is only the super user. Trusted extended attributes are meant for -+ implementing system/security services. -+ -+ If unsure, say N. -+ - Ext3 journalling file system support (EXPERIMENTAL) - CONFIG_EXT3_FS - This is the journalling version of the Second extended file system -@@ -15354,6 +15387,39 @@ - - If unsure, say N. - -+Ext3 extended attributes -+CONFIG_EXT3_FS_XATTR -+ Extended attributes are name:value pairs associated with inodes by -+ the kernel or by users (see the attr(5) manual page, or visit -+ for details). -+ -+ If unsure, say N. -+ -+Ext3 extended attribute block sharing -+CONFIG_EXT3_FS_XATTR_SHARING -+ This options enables code for sharing identical extended attribute -+ blocks among multiple inodes. -+ -+ Usually, say Y. -+ -+Ext3 extended user attributes -+CONFIG_EXT3_FS_XATTR_USER -+ This option enables extended user attributes on ext3. 
Processes can -+ associate extended user attributes with inodes to store additional -+ information such as the character encoding of files, etc. (see the -+ attr(5) manual page, or visit for details). -+ -+ If unsure, say N. -+ -+Ext3 trusted extended attributes -+CONFIG_EXT3_FS_XATTR_TRUSTED -+ This option enables extended attributes on ext3 that are accessible -+ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this -+ is only the super user. Trusted extended attributes are meant for -+ implementing system/security services. -+ -+ If unsure, say N. -+ - Journal Block Device support (JBD for ext3) (EXPERIMENTAL) - CONFIG_JBD - This is a generic journalling layer for block devices. It is -Index: linux-2.4.19.SuSE/arch/ia64/defconfig -=================================================================== ---- linux-2.4.19.SuSE.orig/arch/ia64/defconfig 2004-05-03 11:19:10.000000000 -0700 -+++ linux-2.4.19.SuSE/arch/ia64/defconfig 2004-05-03 11:50:22.000000000 -0700 -@@ -1,6 +1,13 @@ - # - # Automatically generated make config: don't edit - # -+CONFIG_EXT3_FS_XATTR=y -+# CONFIG_EXT3_FS_XATTR_SHARING is not set -+# CONFIG_EXT3_FS_XATTR_USER is not set -+# CONFIG_EXT2_FS_XATTR is not set -+# CONFIG_EXT2_FS_XATTR_SHARING is not set -+# CONFIG_EXT2_FS_XATTR_USER is not set -+# CONFIG_FS_MBCACHE is not set - - # - # Code maturity level options -Index: linux-2.4.19.SuSE/fs/Config.in -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/Config.in 2004-05-03 11:18:52.000000000 -0700 -+++ linux-2.4.19.SuSE/fs/Config.in 2004-05-03 11:50:22.000000000 -0700 -@@ -203,6 +203,10 @@ - #tristate 'Meta block cache' CONFIG_FS_MBCACHE - define_tristate CONFIG_FS_MBCACHE y - -+# Meta block cache for Extended Attributes (ext2/ext3) -+#tristate 'Meta block cache' CONFIG_FS_MBCACHE -+define_tristate CONFIG_FS_MBCACHE y -+ - mainmenu_option next_comment - comment 'Partition Types' - source fs/partitions/Config.in -Index: 
linux-2.4.19.SuSE/fs/Makefile -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/Makefile 2004-05-03 11:22:49.000000000 -0700 -+++ linux-2.4.19.SuSE/fs/Makefile 2004-05-03 11:50:22.000000000 -0700 -@@ -104,6 +104,9 @@ - obj-$(CONFIG_FS_MBCACHE) += mbcache.o - obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o - -+export-objs += mbcache.o -+obj-$(CONFIG_FS_MBCACHE) += mbcache.o -+ - # persistent filesystems - obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o)) - -Index: linux-2.4.19.SuSE/fs/ext2/Makefile -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext2/Makefile 2004-05-03 11:18:46.000000000 -0700 -+++ linux-2.4.19.SuSE/fs/ext2/Makefile 2004-05-03 11:50:22.000000000 -0700 -@@ -18,4 +18,8 @@ - obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o - obj-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o - -+export-objs += xattr.o -+obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o -+obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o -+ - include $(TOPDIR)/Rules.make -Index: linux-2.4.19.SuSE/fs/ext2/inode.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext2/inode.c 2004-05-03 11:18:47.000000000 -0700 -+++ linux-2.4.19.SuSE/fs/ext2/inode.c 2004-05-03 11:50:22.000000000 -0700 -@@ -52,6 +52,18 @@ - } - - /* -+ * Test whether an inode is a fast symlink. -+ */ -+static inline int ext2_inode_is_fast_symlink(struct inode *inode) -+{ -+ int ea_blocks = inode->u.ext2_i.i_file_acl ? 
-+ (inode->i_sb->s_blocksize >> 9) : 0; -+ -+ return (S_ISLNK(inode->i_mode) && -+ inode->i_blocks - ea_blocks == 0); -+} -+ -+/* - * Called at each iput() - */ - void ext2_put_inode (struct inode * inode) -@@ -806,6 +818,8 @@ - return; - if (ext2_inode_is_fast_symlink(inode)) - return; -+ if (ext2_inode_is_fast_symlink(inode)) -+ return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - -Index: linux-2.4.19.SuSE/fs/ext2/super.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext2/super.c 2004-05-03 11:18:47.000000000 -0700 -+++ linux-2.4.19.SuSE/fs/ext2/super.c 2004-05-03 11:50:22.000000000 -0700 -@@ -70,6 +70,7 @@ - { - va_list args; - -+ ext2_xattr_put_super(sb); - if (!(sb->s_flags & MS_RDONLY)) { - sb->u.ext2_sb.s_mount_state |= EXT2_ERROR_FS; - sb->u.ext2_sb.s_es->s_state = -Index: linux-2.4.19.SuSE/fs/ext3/inode.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/inode.c 2004-05-03 11:18:47.000000000 -0700 -+++ linux-2.4.19.SuSE/fs/ext3/inode.c 2004-05-03 11:50:22.000000000 -0700 -@@ -54,6 +54,18 @@ - inode->i_blocks - ea_blocks == 0); - } - -+/* -+ * Test whether an inode is a fast symlink. -+ */ -+static inline int ext3_inode_is_fast_symlink(struct inode *inode) -+{ -+ int ea_blocks = inode->u.ext3_i.i_file_acl ? -+ (inode->i_sb->s_blocksize >> 9) : 0; -+ -+ return (S_ISLNK(inode->i_mode) && -+ inode->i_blocks - ea_blocks == 0); -+} -+ - /* The ext3 forget function must perform a revoke if we are freeing data - * which has been journaled. Metadata (eg. indirect blocks) must be - * revoked in all cases. 
-@@ -1968,6 +1980,8 @@ - return; - if (ext3_inode_is_fast_symlink(inode)) - return; -+ if (ext3_inode_is_fast_symlink(inode)) -+ return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - -Index: linux-2.4.19.SuSE/fs/ext3/ext3-exports.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/ext3-exports.c 2004-02-18 07:26:44.000000000 -0800 -+++ linux-2.4.19.SuSE/fs/ext3/ext3-exports.c 2004-05-03 11:50:22.000000000 -0700 -@@ -0,0 +1,13 @@ -+#include -+#include -+#include -+#include -+#include -+ -+EXPORT_SYMBOL(ext3_force_commit); -+EXPORT_SYMBOL(ext3_bread); -+EXPORT_SYMBOL(ext3_xattr_register); -+EXPORT_SYMBOL(ext3_xattr_unregister); -+EXPORT_SYMBOL(ext3_xattr_get); -+EXPORT_SYMBOL(ext3_xattr_list); -+EXPORT_SYMBOL(ext3_xattr_set); -Index: linux-2.4.19.SuSE/include/linux/errno.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/errno.h 2004-05-03 11:20:21.000000000 -0700 -+++ linux-2.4.19.SuSE/include/linux/errno.h 2004-05-03 11:50:22.000000000 -0700 -@@ -30,4 +30,8 @@ - - #endif - -+/* Defined for extended attributes */ -+#define ENOATTR ENODATA /* No such attribute */ -+#define ENOTSUP EOPNOTSUPP /* Operation not supported */ -+ - #endif -Index: linux-2.4.19.SuSE/kernel/ksyms.c -=================================================================== ---- linux-2.4.19.SuSE.orig/kernel/ksyms.c 2004-05-03 11:22:48.000000000 -0700 -+++ linux-2.4.19.SuSE/kernel/ksyms.c 2004-05-03 11:50:22.000000000 -0700 -@@ -12,6 +12,7 @@ - #define __KERNEL_SYSCALLS__ - #include - #include -+#include - #include - #include - #include -Index: linux-2.4.19.SuSE/mm/vmscan.c -=================================================================== ---- linux-2.4.19.SuSE.orig/mm/vmscan.c 2004-05-03 11:18:53.000000000 -0700 -+++ linux-2.4.19.SuSE/mm/vmscan.c 2004-05-03 11:50:22.000000000 -0700 -@@ -32,6 +32,39 @@ - */ - int vm_passes = 60; - -+static 
DECLARE_MUTEX(other_caches_sem); -+static LIST_HEAD(cache_definitions); -+ -+void register_cache(struct cache_definition *cache) -+{ -+ down(&other_caches_sem); -+ list_add(&cache->link, &cache_definitions); -+ up(&other_caches_sem); -+} -+ -+void unregister_cache(struct cache_definition *cache) -+{ -+ down(&other_caches_sem); -+ list_del(&cache->link); -+ up(&other_caches_sem); -+} -+ -+static void shrink_other_caches(unsigned int priority, int gfp_mask) -+{ -+ struct list_head *p; -+ -+ if (down_trylock(&other_caches_sem)) -+ return; -+ -+ list_for_each_prev(p, &cache_definitions) { -+ struct cache_definition *cache = -+ list_entry(p, struct cache_definition, link); -+ -+ cache->shrink(priority, gfp_mask); -+ } -+ up(&other_caches_sem); -+} -+ - /* - * "vm_cache_scan_ratio" is how much of the inactive LRU queue we will scan - * in one go. A value of 6 for vm_cache_scan_ratio implies that we'll diff --git a/lustre/kernel_patches/patches/linux-2.4.19-xattr-0.8.54-suse.patch b/lustre/kernel_patches/patches/linux-2.4.19-xattr-0.8.54-suse.patch deleted file mode 100644 index 26d3af9..0000000 --- a/lustre/kernel_patches/patches/linux-2.4.19-xattr-0.8.54-suse.patch +++ /dev/null @@ -1,47 +0,0 @@ - ext2/super.c | 3 +-- - ext3/ext3-exports.c | 13 +++++++++++++ - 2 files changed, 14 insertions(+), 2 deletions(-) - -Index: linux-2.4.19.SuSE/fs/ext2/super.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext2/super.c Mon Jan 27 05:08:00 2003 -+++ linux-2.4.19.SuSE/fs/ext2/super.c Sun Nov 16 00:40:59 2003 -@@ -70,6 +70,7 @@ - { - va_list args; - -+ ext2_xattr_put_super(sb); - if (!(sb->s_flags & MS_RDONLY)) { - sb->u.ext2_sb.s_mount_state |= EXT2_ERROR_FS; - sb->u.ext2_sb.s_es->s_state = -Index: linux-2.4.19.SuSE/fs/ext3/super.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/super.c Mon Jan 27 05:08:00 2003 -+++ linux-2.4.19.SuSE/fs/ext3/super.c Sun Nov 16 00:40:59 
2003 -@@ -1822,8 +1828,6 @@ - exit_ext3_xattr(); - } - --EXPORT_SYMBOL(ext3_force_commit); --EXPORT_SYMBOL(ext3_bread); - - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); - MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); -Index: linux-2.4.19.SuSE/fs/ext3/ext3-exports.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/ext3/ext3-exports.c Sun Nov 16 00:40:58 2003 -+++ linux-2.4.19.SuSE/fs/ext3/ext3-exports.c Sun Nov 16 00:40:59 2003 -@@ -0,0 +1,13 @@ -+#include -+#include -+#include -+#include -+#include -+ -+EXPORT_SYMBOL(ext3_force_commit); -+EXPORT_SYMBOL(ext3_bread); -+EXPORT_SYMBOL(ext3_xattr_register); -+EXPORT_SYMBOL(ext3_xattr_unregister); -+EXPORT_SYMBOL(ext3_xattr_get); -+EXPORT_SYMBOL(ext3_xattr_list); -+EXPORT_SYMBOL(ext3_xattr_set); diff --git a/lustre/kernel_patches/patches/linux-2.4.21-xattr-0.8.54-chaos.patch b/lustre/kernel_patches/patches/linux-2.4.21-xattr-0.8.54-chaos.patch index 196ae17..686b1ea 100644 --- a/lustre/kernel_patches/patches/linux-2.4.21-xattr-0.8.54-chaos.patch +++ b/lustre/kernel_patches/patches/linux-2.4.21-xattr-0.8.54-chaos.patch @@ -247,25 +247,6 @@ Index: linux-2.4.21-chaos/fs/ext2/inode.c =================================================================== --- linux-2.4.21-chaos.orig/fs/ext2/inode.c 2003-07-15 04:41:01.000000000 +0400 +++ linux-2.4.21-chaos/fs/ext2/inode.c 2003-12-14 15:11:46.000000000 +0300 -@@ -39,6 +39,18 @@ - static int ext2_update_inode(struct inode * inode, int do_sync); - - /* -+ * Test whether an inode is a fast symlink. -+ */ -+static inline int ext2_inode_is_fast_symlink(struct inode *inode) -+{ -+ int ea_blocks = inode->u.ext2_i.i_file_acl ? 
-+ (inode->i_sb->s_blocksize >> 9) : 0; -+ -+ return (S_ISLNK(inode->i_mode) && -+ inode->i_blocks - ea_blocks == 0); -+} -+ -+/* - * Called at each iput() - */ - void ext2_put_inode (struct inode * inode) @@ -53,9 +65,7 @@ { lock_kernel(); @@ -308,12 +289,8 @@ Index: linux-2.4.21-chaos/fs/ext2/inode.c inode->i_op = &ext2_file_inode_operations; inode->i_fop = &ext2_file_operations; inode->i_mapping->a_ops = &ext2_aops; -@@ -1002,15 +1010,17 @@ - inode->i_fop = &ext2_dir_operations; - inode->i_mapping->a_ops = &ext2_aops; - } else if (S_ISLNK(inode->i_mode)) { -- if (!inode->i_blocks) -+ if (ext2_inode_is_fast_symlink(inode)) +@@ -1002,12 +1010,14 @@ + if (ext2_inode_is_fast_symlink(inode)) inode->i_op = &ext2_fast_symlink_inode_operations; else { - inode->i_op = &page_symlink_inode_operations; diff --git a/lustre/kernel_patches/patches/linux-2.4.24-jbd-handle-EIO-rhel3.patch b/lustre/kernel_patches/patches/linux-2.4.24-jbd-handle-EIO-rhel3.patch new file mode 100644 index 0000000..bc66351 --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.4.24-jbd-handle-EIO-rhel3.patch @@ -0,0 +1,23 @@ +diff -X /home/nikita/src/linux-git/linux-2.6.git/Documentation/dontdiff -rupbB linux-2.4.24.orig/fs/jbd/commit.c linux-2.4.24/fs/jbd/commit.c +--- linux-2.4.24.orig/fs/jbd/commit.c 2005-06-23 17:39:32.000000000 +0400 ++++ linux-2.4.24/fs/jbd/commit.c 2005-06-23 15:56:05.000000000 +0400 +@@ -505,6 +505,9 @@ start_journal_io: + goto wait_for_iobuf; + } + ++ if (unlikely(!buffer_uptodate(bh))) ++ err = -EIO; ++ + clear_bit(BH_JWrite, &jh2bh(jh)->b_state); + + JBUFFER_TRACE(jh, "ph4: unfile after journal write"); +@@ -566,6 +569,9 @@ start_journal_io: + goto wait_for_ctlbuf; + } + ++ if (unlikely(!buffer_uptodate(bh))) ++ err = -EIO; ++ + BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); + clear_bit(BH_JWrite, &bh->b_state); + journal_unfile_buffer(jh); diff --git a/lustre/kernel_patches/patches/linux-2.6-binutils-2.16.patch 
b/lustre/kernel_patches/patches/linux-2.6-binutils-2.16.patch new file mode 100644 index 0000000..5c2e7dc --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.6-binutils-2.16.patch @@ -0,0 +1,102 @@ +--- linux/arch/i386/kernel/process.c.seg 2005-03-27 13:07:14.000000000 -0800 ++++ linux/arch/i386/kernel/process.c 2005-03-28 10:28:47.000000000 -0800 +@@ -597,8 +597,8 @@ struct task_struct fastcall * __switch_t + * Save away %fs and %gs. No need to save %es and %ds, as + * those are always kernel segments while inside the kernel. + */ +- asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs)); +- asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs)); ++ asm volatile("mov %%fs,%0":"=m" (prev->fs)); ++ asm volatile("mov %%gs,%0":"=m" (prev->gs)); + + /* + * Restore %fs and %gs if needed. +--- linux/arch/i386/kernel/vm86.c.seg 2005-03-27 13:07:14.000000000 -0800 ++++ linux/arch/i386/kernel/vm86.c 2005-03-28 10:28:47.000000000 -0800 +@@ -294,8 +294,8 @@ static void do_sys_vm86(struct kernel_vm + */ + info->regs32->eax = 0; + tsk->thread.saved_esp0 = tsk->thread.esp0; +- asm volatile("movl %%fs,%0":"=m" (tsk->thread.saved_fs)); +- asm volatile("movl %%gs,%0":"=m" (tsk->thread.saved_gs)); ++ asm volatile("mov %%fs,%0":"=m" (tsk->thread.saved_fs)); ++ asm volatile("mov %%gs,%0":"=m" (tsk->thread.saved_gs)); + + tss = &per_cpu(init_tss, get_cpu()); + tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; +--- linux/arch/x86_64/kernel/process.c.seg 2005-03-27 13:07:49.000000000 -0800 ++++ linux/arch/x86_64/kernel/process.c 2005-03-28 11:11:04.206766410 -0800 +@@ -391,10 +391,10 @@ int copy_thread(int nr, unsigned long cl + p->thread.fs = me->thread.fs; + p->thread.gs = me->thread.gs; + +- asm("movl %%gs,%0" : "=m" (p->thread.gsindex)); +- asm("movl %%fs,%0" : "=m" (p->thread.fsindex)); +- asm("movl %%es,%0" : "=m" (p->thread.es)); +- asm("movl %%ds,%0" : "=m" (p->thread.ds)); ++ asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); ++ asm("mov %%fs,%0" : "=m" 
(p->thread.fsindex)); ++ asm("mov %%es,%0" : "=m" (p->thread.es)); ++ asm("mov %%ds,%0" : "=m" (p->thread.ds)); + + if (unlikely(me->thread.io_bitmap_ptr != NULL)) { + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); +@@ -457,11 +457,11 @@ struct task_struct *__switch_to(struct t + * Switch DS and ES. + * This won't pick up thread selector changes, but I guess that is ok. + */ +- asm volatile("movl %%es,%0" : "=m" (prev->es)); ++ asm volatile("mov %%es,%0" : "=m" (prev->es)); + if (unlikely(next->es | prev->es)) + loadsegment(es, next->es); + +- asm volatile ("movl %%ds,%0" : "=m" (prev->ds)); ++ asm volatile ("mov %%ds,%0" : "=m" (prev->ds)); + if (unlikely(next->ds | prev->ds)) + loadsegment(ds, next->ds); + +@@ -472,7 +472,7 @@ struct task_struct *__switch_to(struct t + */ + { + unsigned fsindex; +- asm volatile("movl %%fs,%0" : "=g" (fsindex)); ++ asm volatile("movl %%fs,%0" : "=r" (fsindex)); + /* segment register != 0 always requires a reload. + also reload when it has changed. 
+ when prev process used 64bit base always reload +@@ -493,7 +493,7 @@ struct task_struct *__switch_to(struct t + } + { + unsigned gsindex; +- asm volatile("movl %%gs,%0" : "=g" (gsindex)); ++ asm volatile("movl %%gs,%0" : "=r" (gsindex)); + if (unlikely(gsindex | next->gsindex | prev->gs)) { + load_gs_index(next->gsindex); + if (gsindex) +--- linux/include/asm-i386/system.h.seg 2005-03-27 13:09:12.000000000 -0800 ++++ linux/include/asm-i386/system.h 2005-03-28 10:28:47.000000000 -0800 +@@ -81,7 +81,7 @@ static inline unsigned long _get_base(ch + #define loadsegment(seg,value) \ + asm volatile("\n" \ + "1:\t" \ +- "movl %0,%%" #seg "\n" \ ++ "mov %0,%%" #seg "\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3:\t" \ +@@ -93,13 +93,13 @@ static inline unsigned long _get_base(ch + ".align 4\n\t" \ + ".long 1b,3b\n" \ + ".previous" \ +- : :"m" (*(unsigned int *)&(value))) ++ : :"m" (value)) + + /* + * Save a segment register away + */ + #define savesegment(seg, value) \ +- asm volatile("movl %%" #seg ",%0":"=m" (*(int *)&(value))) ++ asm volatile("mov %%" #seg ",%0":"=m" (value)) + + /* + * Clear and set 'TS' bit respectively diff --git a/lustre/kernel_patches/patches/linux-2.6.9-ext3-sub-second-timestamp.patch b/lustre/kernel_patches/patches/linux-2.6.9-ext3-sub-second-timestamp.patch new file mode 100644 index 0000000..3b46795 --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.6.9-ext3-sub-second-timestamp.patch @@ -0,0 +1,631 @@ +Index: linux-2.6.10/drivers/char/qtronix.c +=================================================================== +--- linux-2.6.10.orig/drivers/char/qtronix.c 2004-12-24 14:35:50.000000000 -0700 ++++ linux-2.6.10/drivers/char/qtronix.c 2006-01-03 16:16:52.000000000 -0700 +@@ -537,7 +537,7 @@ + i--; + } + if (count-i) { +- file->f_dentry->d_inode->i_atime = CURRENT_TIME; ++ file->f_dentry->d_inode->i_atime = current_fs_time(inode->i_sb); + return count-i; + } + if (signal_pending(current)) +Index: linux-2.6.10/drivers/char/random.c 
+=================================================================== +--- linux-2.6.10.orig/drivers/char/random.c 2005-04-06 09:38:33.000000000 -0600 ++++ linux-2.6.10/drivers/char/random.c 2006-01-03 16:16:52.000000000 -0700 +@@ -1743,8 +1743,9 @@ + if (p == buffer) { + return (ssize_t)ret; + } else { +- file->f_dentry->d_inode->i_mtime = CURRENT_TIME; +- mark_inode_dirty(file->f_dentry->d_inode); ++ struct inode *inode = file->f_dentry->d_inode; ++ inode->i_mtime = current_fs_time(inode->i_sb); ++ mark_inode_dirty(inode); + return (ssize_t)(p - buffer); + } + } +Index: linux-2.6.10/drivers/char/sonypi.c +=================================================================== +--- linux-2.6.10.orig/drivers/char/sonypi.c 2004-12-24 14:35:23.000000000 -0700 ++++ linux-2.6.10/drivers/char/sonypi.c 2006-01-03 16:18:31.000000000 -0700 +@@ -537,7 +537,8 @@ + } + + if (ret > 0) +- file->f_dentry->d_inode->i_atime = CURRENT_TIME; ++ file->f_dentry->d_inode->i_atime = ++ current_fs_time(file->f_dentry->d_inode->i_sb); + + return ret; + } +Index: linux-2.6.10/drivers/char/tty_io.c +=================================================================== +--- linux-2.6.10.orig/drivers/char/tty_io.c 2005-04-06 09:38:33.000000000 -0600 ++++ linux-2.6.10/drivers/char/tty_io.c 2006-01-03 16:16:52.000000000 -0700 +@@ -1018,7 +1018,7 @@ + tty_ldisc_deref(ld); + unlock_kernel(); + if (i > 0) +- inode->i_atime = CURRENT_TIME; ++ inode->i_atime = current_fs_time(inode->i_sb); + return i; + } + +@@ -1095,7 +1095,8 @@ + cond_resched(); + } + if (written) { +- file->f_dentry->d_inode->i_mtime = CURRENT_TIME; ++ struct inode *inode = file->f_dentry->d_inode; ++ inode->i_mtime = current_fs_time(inode->i_sb); + ret = written; + } + up(&tty->atomic_write); +Index: linux-2.6.10/fs/attr.c +=================================================================== +--- linux-2.6.10.orig/fs/attr.c 2004-12-24 14:34:00.000000000 -0700 ++++ linux-2.6.10/fs/attr.c 2006-01-03 16:16:52.000000000 -0700 +@@ 
-14,6 +14,7 @@ + #include + #include + #include ++#include + + /* Taken over from the old code... */ + +@@ -87,11 +88,14 @@ + if (ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + if (ia_valid & ATTR_ATIME) +- inode->i_atime = attr->ia_atime; ++ inode->i_atime = timespec_trunc(attr->ia_atime, ++ get_sb_time_gran(inode->i_sb)); + if (ia_valid & ATTR_MTIME) +- inode->i_mtime = attr->ia_mtime; ++ inode->i_mtime = timespec_trunc(attr->ia_mtime, ++ get_sb_time_gran(inode->i_sb)); + if (ia_valid & ATTR_CTIME) +- inode->i_ctime = attr->ia_ctime; ++ inode->i_ctime = timespec_trunc(attr->ia_ctime, ++ get_sb_time_gran(inode->i_sb)); + if (ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + +@@ -131,14 +135,17 @@ + int notify_change(struct dentry * dentry, struct iattr * attr) + { + struct inode *inode = dentry->d_inode; +- mode_t mode = inode->i_mode; ++ mode_t mode; + int error; +- struct timespec now = CURRENT_TIME; ++ struct timespec now; + unsigned int ia_valid = attr->ia_valid; + + if (!inode) + BUG(); + ++ mode = inode->i_mode; ++ now = current_fs_time(inode->i_sb); ++ + attr->ia_ctime = now; + if (!(ia_valid & ATTR_ATIME_SET)) + attr->ia_atime = now; +Index: linux-2.6.10/fs/bad_inode.c +=================================================================== +--- linux-2.6.10.orig/fs/bad_inode.c 2004-12-24 14:35:50.000000000 -0700 ++++ linux-2.6.10/fs/bad_inode.c 2006-01-03 16:16:52.000000000 -0700 +@@ -105,7 +105,8 @@ + remove_inode_hash(inode); + + inode->i_mode = S_IFREG; +- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; ++ inode->i_atime = inode->i_mtime = inode->i_ctime = ++ current_fs_time(inode->i_sb); + inode->i_op = &bad_inode_ops; + inode->i_fop = &bad_file_ops; + } +Index: linux-2.6.10/fs/binfmt_misc.c +=================================================================== +--- linux-2.6.10.orig/fs/binfmt_misc.c 2004-12-24 14:34:31.000000000 -0700 ++++ linux-2.6.10/fs/binfmt_misc.c 2006-01-03 16:16:52.000000000 -0700 +@@ -509,7 +509,8 @@ 
+ inode->i_gid = 0; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blocks = 0; +- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; ++ inode->i_atime = inode->i_mtime = inode->i_ctime = ++ current_fs_time(inode->i_sb); + } + return inode; + } +Index: linux-2.6.10/fs/ext2/dir.c +=================================================================== +--- linux-2.6.10.orig/fs/ext2/dir.c 2004-12-24 14:34:58.000000000 -0700 ++++ linux-2.6.10/fs/ext2/dir.c 2006-01-03 16:16:52.000000000 -0700 +@@ -426,7 +426,7 @@ + ext2_set_de_type (de, inode); + err = ext2_commit_chunk(page, from, to); + ext2_put_page(page); +- dir->i_mtime = dir->i_ctime = CURRENT_TIME; ++ dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; + mark_inode_dirty(dir); + } +@@ -516,7 +516,7 @@ + de->inode = cpu_to_le32(inode->i_ino); + ext2_set_de_type (de, inode); + err = ext2_commit_chunk(page, from, to); +- dir->i_mtime = dir->i_ctime = CURRENT_TIME; ++ dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; + mark_inode_dirty(dir); + /* OFFSET_CACHE */ +@@ -564,7 +564,7 @@ + pde->rec_len = cpu_to_le16(to-from); + dir->inode = 0; + err = ext2_commit_chunk(page, from, to); +- inode->i_ctime = inode->i_mtime = CURRENT_TIME; ++ inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL; + mark_inode_dirty(inode); + out: +Index: linux-2.6.10/fs/ext2/ialloc.c +=================================================================== +--- linux-2.6.10.orig/fs/ext2/ialloc.c 2004-12-24 14:34:47.000000000 -0700 ++++ linux-2.6.10/fs/ext2/ialloc.c 2006-01-03 16:16:52.000000000 -0700 +@@ -577,7 +577,7 @@ + inode->i_ino = ino; + inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blocks = 0; +- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + 
memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL; + if (S_ISLNK(mode)) +Index: linux-2.6.10/fs/ext2/inode.c +=================================================================== +--- linux-2.6.10.orig/fs/ext2/inode.c 2004-12-24 14:33:51.000000000 -0700 ++++ linux-2.6.10/fs/ext2/inode.c 2006-01-03 16:16:52.000000000 -0700 +@@ -493,7 +493,7 @@ + + /* We are done with atomic stuff, now do the rest of housekeeping */ + +- inode->i_ctime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME_SEC; + + /* had we spliced it onto indirect block? */ + if (where->bh) +@@ -953,7 +953,7 @@ + case EXT2_TIND_BLOCK: + ; + } +- inode->i_mtime = inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + if (inode_needs_sync(inode)) { + sync_mapping_buffers(inode->i_mapping); + ext2_sync_inode (inode); +Index: linux-2.6.10/fs/ext2/ioctl.c +=================================================================== +--- linux-2.6.10.orig/fs/ext2/ioctl.c 2004-12-24 14:35:49.000000000 -0700 ++++ linux-2.6.10/fs/ext2/ioctl.c 2006-01-03 16:16:52.000000000 -0700 +@@ -59,7 +59,7 @@ + ei->i_flags = flags; + + ext2_set_inode_flags(inode); +- inode->i_ctime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + return 0; + } +@@ -72,7 +72,7 @@ + return -EROFS; + if (get_user(inode->i_generation, (int __user *) arg)) + return -EFAULT; +- inode->i_ctime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + return 0; + default: +Index: linux-2.6.10/fs/ext2/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/ext2/namei.c 2004-12-24 14:35:25.000000000 -0700 ++++ linux-2.6.10/fs/ext2/namei.c 2006-01-03 16:16:52.000000000 -0700 +@@ -211,7 +211,7 @@ + if (inode->i_nlink >= EXT2_LINK_MAX) + return -EMLINK; + +- inode->i_ctime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME_SEC; + ext2_inc_count(inode); + atomic_inc(&inode->i_count); + 
+@@ -337,7 +337,7 @@ + goto out_dir; + ext2_inc_count(old_inode); + ext2_set_link(new_dir, new_de, new_page, old_inode); +- new_inode->i_ctime = CURRENT_TIME; ++ new_inode->i_ctime = CURRENT_TIME_SEC; + if (dir_de) + new_inode->i_nlink--; + ext2_dec_count(new_inode); +@@ -362,7 +362,7 @@ + * rename. + * ext2_dec_count() will mark the inode dirty. + */ +- old_inode->i_ctime = CURRENT_TIME; ++ old_inode->i_ctime = CURRENT_TIME_SEC; + + ext2_delete_entry (old_de, old_page); + ext2_dec_count(old_inode); +Index: linux-2.6.10/fs/ext2/super.c +=================================================================== +--- linux-2.6.10.orig/fs/ext2/super.c 2004-12-24 14:35:01.000000000 -0700 ++++ linux-2.6.10/fs/ext2/super.c 2006-01-03 16:19:06.000000000 -0700 +@@ -595,7 +595,7 @@ + es = (struct ext2_super_block *) (((char *)bh->b_data) + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); +- sb->s_flags |= MS_ONE_SECOND; ++ set_sb_time_gran(sb, 1000000000U); + + if (sb->s_magic != EXT2_SUPER_MAGIC) + goto cantfind_ext2; +Index: linux-2.6.10/fs/ext2/xattr.c +=================================================================== +--- linux-2.6.10.orig/fs/ext2/xattr.c 2005-04-06 09:38:35.000000000 -0600 ++++ linux-2.6.10/fs/ext2/xattr.c 2006-01-03 16:16:52.000000000 -0700 +@@ -702,7 +702,7 @@ + + /* Update the inode. */ + EXT2_I(inode)->i_file_acl = new_bh ? 
new_bh->b_blocknr : 0; +- inode->i_ctime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME_SEC; + if (IS_SYNC(inode)) { + error = ext2_sync_inode (inode); + if (error) +Index: linux-2.6.10/fs/ext3/ialloc.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/ialloc.c 2004-12-24 14:34:45.000000000 -0700 ++++ linux-2.6.10/fs/ext3/ialloc.c 2006-01-03 16:16:52.000000000 -0700 +@@ -558,7 +558,7 @@ + /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blksize = PAGE_SIZE; + inode->i_blocks = 0; +- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_next_alloc_block = 0; +Index: linux-2.6.10/fs/ext3/inode.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/inode.c 2005-04-06 09:38:35.000000000 -0600 ++++ linux-2.6.10/fs/ext3/inode.c 2006-01-03 16:16:52.000000000 -0700 +@@ -626,7 +626,7 @@ + + /* We are done with atomic stuff, now do the rest of housekeeping */ + +- inode->i_ctime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + + /* had we spliced it onto indirect block? 
*/ +@@ -2199,7 +2199,7 @@ + ; + } + up(&ei->truncate_sem); +- inode->i_mtime = inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + + /* In a multi-transaction truncate, we only make the final +Index: linux-2.6.10/fs/ext3/ioctl.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/ioctl.c 2004-12-24 14:34:31.000000000 -0700 ++++ linux-2.6.10/fs/ext3/ioctl.c 2006-01-03 16:16:52.000000000 -0700 +@@ -87,7 +87,7 @@ + ei->i_flags = flags; + + ext3_set_inode_flags(inode); +- inode->i_ctime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME_SEC; + + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + flags_err: +@@ -121,7 +121,7 @@ + return PTR_ERR(handle); + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err == 0) { +- inode->i_ctime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME_SEC; + inode->i_generation = generation; + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + } +Index: linux-2.6.10/fs/ext3/namei.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/namei.c 2004-12-24 14:34:58.000000000 -0700 ++++ linux-2.6.10/fs/ext3/namei.c 2006-01-03 16:16:52.000000000 -0700 +@@ -1251,7 +1251,7 @@ + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ +- dir->i_mtime = dir->i_ctime = CURRENT_TIME; ++ dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + ext3_update_dx_flag(dir); + dir->i_version++; + ext3_mark_inode_dirty(handle, dir); +@@ -2029,7 +2029,7 @@ + * recovery. 
*/ + inode->i_size = 0; + ext3_orphan_add(handle, inode); +- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; ++ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + dir->i_nlink--; + ext3_update_dx_flag(dir); +@@ -2079,7 +2079,7 @@ + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_unlink; +- dir->i_ctime = dir->i_mtime = CURRENT_TIME; ++ dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + inode->i_nlink--; +@@ -2169,7 +2169,7 @@ + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode->i_ctime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME_SEC; + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); + +@@ -2270,7 +2270,7 @@ + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ +- old_inode->i_ctime = CURRENT_TIME; ++ old_inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, old_inode); + + /* +@@ -2303,9 +2303,9 @@ + + if (new_inode) { + new_inode->i_nlink--; +- new_inode->i_ctime = CURRENT_TIME; ++ new_inode->i_ctime = CURRENT_TIME_SEC; + } +- old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; ++ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; + ext3_update_dx_flag(old_dir); + if (dir_bh) { + BUFFER_TRACE(dir_bh, "get_write_access"); +Index: linux-2.6.10/fs/ext3/super.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/super.c 2005-04-06 09:38:35.000000000 -0600 ++++ linux-2.6.10/fs/ext3/super.c 2006-01-03 16:16:52.000000000 -0700 +@@ -1318,7 +1318,7 @@ + if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0)) + goto failed_mount; + +- sb->s_flags |= MS_ONE_SECOND; ++ set_sb_time_gran(sb, 1000000000U); + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? 
MS_POSIXACL : 0); + +Index: linux-2.6.10/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.10.orig/fs/ext3/xattr.c 2005-04-06 09:38:35.000000000 -0600 ++++ linux-2.6.10/fs/ext3/xattr.c 2006-01-03 16:16:52.000000000 -0700 +@@ -723,7 +723,7 @@ + + /* Update the inode. */ + EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; +- inode->i_ctime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + if (IS_SYNC(inode)) + handle->h_sync = 1; +Index: linux-2.6.10/fs/inode.c +=================================================================== +--- linux-2.6.10.orig/fs/inode.c 2006-01-03 15:33:21.000000000 -0700 ++++ linux-2.6.10/fs/inode.c 2006-01-03 16:16:52.000000000 -0700 +@@ -1131,19 +1131,6 @@ + + EXPORT_SYMBOL(bmap); + +-/* +- * Return true if the filesystem which backs this inode considers the two +- * passed timespecs to be sufficiently different to warrant flushing the +- * altered time out to disk. 
+- */ +-static int inode_times_differ(struct inode *inode, +- struct timespec *old, struct timespec *new) +-{ +- if (IS_ONE_SECOND(inode)) +- return old->tv_sec != new->tv_sec; +- return !timespec_equal(old, new); +-} +- + /** + * update_atime - update the access time + * @inode: inode accessed +@@ -1163,8 +1150,8 @@ + if (IS_RDONLY(inode)) + return; + +- now = current_kernel_time(); +- if (inode_times_differ(inode, &inode->i_atime, &now)) { ++ now = current_fs_time(inode->i_sb); ++ if (!timespec_equal(&inode->i_atime, &now)) { + inode->i_atime = now; + mark_inode_dirty_sync(inode); + } else { +@@ -1194,14 +1181,13 @@ + if (IS_RDONLY(inode)) + return; + +- now = current_kernel_time(); +- +- if (inode_times_differ(inode, &inode->i_mtime, &now)) ++ now = current_fs_time(inode->i_sb); ++ if (!timespec_equal(&inode->i_mtime, &now)) + sync_it = 1; + inode->i_mtime = now; + + if (ctime_too) { +- if (inode_times_differ(inode, &inode->i_ctime, &now)) ++ if (!timespec_equal(&inode->i_ctime, &now)) + sync_it = 1; + inode->i_ctime = now; + } +Index: linux-2.6.10/fs/locks.c +=================================================================== +--- linux-2.6.10.orig/fs/locks.c 2004-12-24 14:35:28.000000000 -0700 ++++ linux-2.6.10/fs/locks.c 2006-01-03 16:16:52.000000000 -0700 +@@ -1228,7 +1228,7 @@ + { + struct file_lock *flock = inode->i_flock; + if (flock && IS_LEASE(flock) && (flock->fl_type & F_WRLCK)) +- *time = CURRENT_TIME; ++ *time = current_fs_time(inode->i_sb); + else + *time = inode->i_mtime; + } +Index: linux-2.6.10/include/linux/fs.h +=================================================================== +--- linux-2.6.10.orig/include/linux/fs.h 2006-01-03 16:04:26.000000000 -0700 ++++ linux-2.6.10/include/linux/fs.h 2006-01-03 16:16:52.000000000 -0700 +@@ -124,7 +124,8 @@ + #define MS_REC 16384 + #define MS_VERBOSE 32768 + #define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ +-#define MS_ONE_SECOND (1<<17) /* fs has 1 sec a/m/ctime resolution */ ++#define 
MS_ONE_SECOND (1<<17) /* fs has 1 sec time resolution (obsolete) */ ++#define MS_TIME_GRAN (1<<18) /* fs has s_time_gran field */ + #define MS_ACTIVE (1<<30) + #define MS_NOUSER (1<<31) + +@@ -803,8 +804,33 @@ + * even looking at it. You had been warned. + */ + struct semaphore s_vfs_rename_sem; /* Kludge */ ++ ++ /* Granularity of c/m/atime in ns. ++ Cannot be worse than a second */ ++#ifndef __GENKSYMS__ ++ u32 s_time_gran; ++#endif + }; + ++extern struct timespec current_fs_time(struct super_block *sb); ++ ++static inline u32 get_sb_time_gran(struct super_block *sb) ++{ ++ if (sb->s_flags & MS_TIME_GRAN) ++ return sb->s_time_gran; ++ if (sb->s_flags & MS_ONE_SECOND) ++ return 1000000000U; ++ return 1; ++} ++ ++static inline void set_sb_time_gran(struct super_block *sb, u32 time_gran) ++{ ++ sb->s_time_gran = time_gran; ++ sb->s_flags |= MS_TIME_GRAN; ++ if (time_gran == 1000000000U) ++ sb->s_flags |= MS_ONE_SECOND; ++} ++ + /* + * Snapshotting support. + */ +Index: linux-2.6.10/include/linux/time.h +=================================================================== +--- linux-2.6.10.orig/include/linux/time.h 2004-12-24 14:35:00.000000000 -0700 ++++ linux-2.6.10/include/linux/time.h 2006-01-03 16:16:52.000000000 -0700 +@@ -90,6 +90,7 @@ + struct timespec current_kernel_time(void); + + #define CURRENT_TIME (current_kernel_time()) ++#define CURRENT_TIME_SEC ((struct timespec) { xtime.tv_sec, 0 }) + + extern void do_gettimeofday(struct timeval *tv); + extern int do_settimeofday(struct timespec *tv); +@@ -103,6 +104,8 @@ + extern int do_getitimer(int which, struct itimerval *value); + extern void getnstimeofday (struct timespec *tv); + ++extern struct timespec timespec_trunc(struct timespec t, unsigned gran); ++ + static inline void + set_normalized_timespec (struct timespec *ts, time_t sec, long nsec) + { +Index: linux-2.6.10/kernel/time.c +=================================================================== +--- linux-2.6.10.orig/kernel/time.c 2004-12-24 
14:34:26.000000000 -0700 ++++ linux-2.6.10/kernel/time.c 2006-01-03 16:16:52.000000000 -0700 +@@ -36,6 +36,7 @@ + + #include + #include ++#include + + /* + * The timezone where the local system is located. Used as a default by some +@@ -433,6 +434,50 @@ + + EXPORT_SYMBOL(current_kernel_time); + ++/** ++ * current_fs_time - Return FS time ++ * @sb: Superblock. ++ * ++ * Return the current time truncated to the time granularity supported by ++ * the fs. ++ */ ++struct timespec current_fs_time(struct super_block *sb) ++{ ++ struct timespec now = current_kernel_time(); ++ return timespec_trunc(now, get_sb_time_gran(sb)); ++} ++EXPORT_SYMBOL(current_fs_time); ++ ++/** ++ * timespec_trunc - Truncate timespec to a granularity ++ * @t: Timespec ++ * @gran: Granularity in ns. ++ * ++ * Truncate a timespec to a granularity. gran must not exceed one second. ++ * Always rounds down. ++ * ++ * This function should be only used for timestamps returned by ++ * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because ++ * it doesn't handle the better resolution of the latter. ++ */ ++struct timespec timespec_trunc(struct timespec t, unsigned gran) ++{ ++ /* ++ * Division is pretty slow so avoid it for common cases. ++ * Currently current_kernel_time() never returns better than ++ * jiffies resolution. Exploit that. 
++ */ ++ if (gran <= jiffies_to_usecs(1) * 1000) { ++ /* nothing */ ++ } else if (gran == 1000000000) { ++ t.tv_nsec = 0; ++ } else { ++ t.tv_nsec -= t.tv_nsec % gran; ++ } ++ return t; ++} ++EXPORT_SYMBOL(timespec_trunc); ++ + #ifdef CONFIG_TIME_INTERPOLATION + void getnstimeofday (struct timespec *tv) + { diff --git a/lustre/kernel_patches/patches/listman-2.4.19-bgl.patch b/lustre/kernel_patches/patches/listman-2.4.19-bgl.patch deleted file mode 100644 index 19ad959..0000000 --- a/lustre/kernel_patches/patches/listman-2.4.19-bgl.patch +++ /dev/null @@ -1,72 +0,0 @@ -Index: linux-2.4.18-chaos/include/linux/list.h -=================================================================== ---- linux-2.4.18-chaos.orig/include/linux/list.h 2003-11-23 00:07:05.000000000 +0300 -+++ linux-2.4.18-chaos/include/linux/list.h 2003-12-11 00:25:15.000000000 +0300 -@@ -173,6 +173,67 @@ - for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \ - pos = pos->prev, prefetch(pos->prev)) - -+/** -+ * list_for_each_entry - iterate over list of given type -+ * @pos: the type * to use as a loop counter. -+ * @head: the head for your list. -+ * @member: the name of the list_struct within the struct. -+ */ -+#define list_for_each_entry(pos, head, member) \ -+ for (pos = list_entry((head)->next, typeof(*pos), member), \ -+ prefetch(pos->member.next); \ -+ &pos->member != (head); \ -+ pos = list_entry(pos->member.next, typeof(*pos), member), \ -+ prefetch(pos->member.next)) -+ -+#ifndef list_for_each_entry_safe -+/** -+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry -+ * @pos: the type * to use as a loop counter. -+ * @n: another type * to use as temporary storage -+ * @head: the head for your list. -+ * @member: the name of the list_struct within the struct. 
-+ */ -+#define list_for_each_entry_safe(pos, n, head, member) \ -+ for (pos = list_entry((head)->next, typeof(*pos), member), \ -+ n = list_entry(pos->member.next, typeof(*pos), member); \ -+ &pos->member != (head); \ -+ pos = n, n = list_entry(n->member.next, typeof(*n), member)) -+#endif -+ -+/** -+ * list_move - delete from one list and add as another's head -+ * @list: the entry to move -+ * @head: the head that will precede our entry -+ */ -+static inline void list_move(struct list_head *list, struct list_head *head) -+{ -+ __list_del(list->prev, list->next); -+ list_add(list, head); -+} -+ -+/** -+ * list_move_tail - delete from one list and add as another's tail -+ * @list: the entry to move -+ * @head: the head that will follow our entry -+ */ -+static inline void list_move_tail(struct list_head *list, -+ struct list_head *head) -+{ -+ __list_del(list->prev, list->next); -+ list_add_tail(list, head); -+} -+ -+/* 2.5 uses hlists for some things, like the d_hash. we'll treat them -+ * as 2.5 and let macros drop back.. */ -+#define hlist_entry list_entry -+#define hlist_head list_head -+#define hlist_node list_head -+#define HLIST_HEAD LIST_HEAD -+#define INIT_HLIST_HEAD INIT_LIST_HEAD -+#define hlist_del_init list_del_init -+#define hlist_add_head list_add -+#define hlist_for_each_safe list_for_each_safe - - #endif /* __KERNEL__ || _LVM_H_INCLUDE */ - diff --git a/lustre/kernel_patches/patches/mcore-2.4.20-8.patch b/lustre/kernel_patches/patches/mcore-2.4.20-8.patch deleted file mode 100644 index c8b80eb..0000000 --- a/lustre/kernel_patches/patches/mcore-2.4.20-8.patch +++ /dev/null @@ -1,2738 +0,0 @@ -? linux/.config -? linux/include/linux/autoconf.h -? 
linux/include/linux/modules -Index: linux/Makefile -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/Makefile,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.1 -diff -u -r1.3.2.1 -r1.3.2.1.2.1 ---- linux/Makefile 12 Mar 2003 19:48:52 -0000 1.3.2.1 -+++ linux/Makefile 1 Apr 2003 12:17:40 -0000 1.3.2.1.2.1 -@@ -99,6 +99,10 @@ - CFLAGS += -fomit-frame-pointer - endif - AFLAGS := -D__ASSEMBLY__ $(CPPFLAGS) -+ifeq ($(CONFIG_MCL_COREDUMP),y) -+ CFLAGS += -g -+endif -+ - - # - # ROOT_DEV specifies the default root-device when making the image. -Index: linux/Documentation/Configure.help -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/Documentation/Configure.help,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.1 -diff -u -r1.3.2.1 -r1.3.2.1.2.1 ---- linux/Documentation/Configure.help 12 Mar 2003 19:48:52 -0000 1.3.2.1 -+++ linux/Documentation/Configure.help 1 Apr 2003 12:17:40 -0000 1.3.2.1.2.1 -@@ -21660,6 +21660,35 @@ - This option allows you to run the kernel with data cache disabled. - Say Y if you experience CPM lock-ups. - -+Boot kernel image support -+CONFIG_BOOTIMG -+ Add support for booting a new Linux kernel from a running Linux -+ system. You need to download the bootimg(8) utility from -+ ftp://icaftp.epfl.ch/pub/people/almesber/misc/bootimg-current.tar.gz -+ in order to use this functionality. -+ -+Protect SMP configuration tables -+CONFIG_BOOTIMG_SMP -+ On SMP systems, the BIOS stores tables with configuration data in -+ memory and an SMP-enabled kernel reads these tables. However, a -+ kernel without SMP support will overwrite such tables. If a kernel -+ without SMP support used bootimg to boot an SMP-enabled kernel, the -+ latter will probably crash when trying to read the SMP tables. 
The -+ CONFIG_BOOTIMG_SMP option enables minimal support for scanning and -+ protecting of SMP configuration tables also for kernels without SMP -+ support. -+ -+In-memory kernel core dump facility -+CONFIG_MCL_COREDUMP -+ In conjunction with bootimg, this allows you to get kernel core dumps -+ of your system at panic() time. The panic call is modified so that it -+ calls the core dump facility and reboots the system. On the way back -+ up, the kernel dump image is written out to disk by the accompanying -+ init script. You can use the crash analysis tool to analyze the core -+ dump. This tool can be found at : -+ -+ http://www.missioncriticallinux.com/download -+ - # - # m68k-specific kernel options - # Documented by Chris Lawrence et al. -Index: linux/arch/i386/config.in -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/config.in,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.2 -diff -u -r1.3.2.1 -r1.3.2.1.2.2 ---- linux/arch/i386/config.in 12 Mar 2003 19:49:05 -0000 1.3.2.1 -+++ linux/arch/i386/config.in 1 Apr 2003 19:35:12 -0000 1.3.2.1.2.2 -@@ -502,6 +502,12 @@ - bool ' Magic SysRq key' CONFIG_MAGIC_SYSRQ - bool ' Spinlock debugging' CONFIG_DEBUG_SPINLOCK - bool ' Compile the kernel with frame pointers' CONFIG_FRAME_POINTER -+ if [ "$CONFIG_FRAME_POINTER " != "n" ]; then -+ bool ' Kernel Core Dump Facility' CONFIG_MCL_COREDUMP -+ if [ "$CONFIG_MCL_COREDUMP" = "y" ]; then -+ bool ' Reboot using bootimg' CONFIG_BOOTIMG -+ fi -+ fi - fi - - endmenu -Index: linux/arch/i386/vmlinux.lds -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/vmlinux.lds,v -retrieving revision 1.1.1.1.4.1 -retrieving revision 1.1.1.1.4.1.2.1 -diff -u -r1.1.1.1.4.1 -r1.1.1.1.4.1.2.1 ---- linux/arch/i386/vmlinux.lds 12 Mar 2003 19:49:05 -0000 1.1.1.1.4.1 -+++ linux/arch/i386/vmlinux.lds 1 Apr 2003 12:17:40 -0000 1.1.1.1.4.1.2.1 -@@ -19,6 +19,13 
@@ - .rodata : { *(.rodata) *(.rodata.*) } - .kstrtab : { *(.kstrtab) } - -+ . = ALIGN(16); /* Relocatable bootimage code */ -+ __bootimg_start = .; -+ .bootimg : { -+ *(.bootimg) -+ } -+ __bootimg_end = .; -+ - . = ALIGN(16); /* Exception table */ - __start___ex_table = .; - __ex_table : { *(__ex_table) } -Index: linux/arch/i386/boot/setup.S -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/boot/setup.S,v -retrieving revision 1.2.2.1 -retrieving revision 1.2.2.1.2.1 -diff -u -r1.2.2.1 -r1.2.2.1.2.1 ---- linux/arch/i386/boot/setup.S 12 Mar 2003 19:49:05 -0000 1.2.2.1 -+++ linux/arch/i386/boot/setup.S 1 Apr 2003 12:17:40 -0000 1.2.2.1.2.1 -@@ -105,16 +105,22 @@ - # flags, unused bits must be zero (RFU) bit within loadflags - loadflags: - LOADED_HIGH = 1 # If set, the kernel is loaded high -+RELOADS_GDT = 2 # if set, kernel reloads GDT, such that -+ # boot loader does not have to provide -+ # GDT in a "safe" memory location - CAN_USE_HEAP = 0x80 # If set, the loader also has set - # heap_end_ptr to tell how much - # space behind setup.S can be used for - # heap purposes. - # Only the loader knows what is free --#ifndef __BIG_KERNEL__ -- .byte 0 --#else -- .byte LOADED_HIGH -+_FLAGS = 0 -+#ifdef __BIG_KERNEL__ -+ _FLAGS = _FLAGS | LOADED_HIGH - #endif -+#ifdef CONFIG_BOOTIMG -+ _FLAGS = _FLAGS | RELOADS_GDT -+#endif -+ .byte _FLAGS - - setup_move_size: .word 0x8000 # size to move, when setup is not - # loaded at 0x90000. 
We will move setup -Index: linux/arch/i386/kernel/Makefile -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/Makefile,v -retrieving revision 1.2.2.1 -retrieving revision 1.2.2.1.2.1 -diff -u -r1.2.2.1 -r1.2.2.1.2.1 ---- linux/arch/i386/kernel/Makefile 12 Mar 2003 19:49:05 -0000 1.2.2.1 -+++ linux/arch/i386/kernel/Makefile 1 Apr 2003 12:17:40 -0000 1.2.2.1.2.1 -@@ -49,6 +49,7 @@ - obj-$(CONFIG_X86_LONGRUN) += longrun.o - obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o - obj-$(CONFIG_PROFILING) += profile.o -+obj-$(CONFIG_MCL_COREDUMP) += crash.o - - - include $(TOPDIR)/Rules.make -Index: linux/arch/i386/kernel/crash.c -=================================================================== -RCS file: linux/arch/i386/kernel/crash.c -diff -N linux/arch/i386/kernel/crash.c ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux/arch/i386/kernel/crash.c 1 Apr 2003 12:17:40 -0000 1.1.6.1 -@@ -0,0 +1,82 @@ -+/* -+ * linux/arch/i386/crash.c -+ * -+ * Architecture dependant code for MCL in-memory core dump. -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+inline void crash_save_regs(void) { -+ static unsigned long regs[8]; -+ -+ __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs[0])); -+ __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs[1])); -+ __asm__ __volatile__("movl %%edx,%0" : "=m"(regs[2])); -+ __asm__ __volatile__("movl %%esi,%0" : "=m"(regs[3])); -+ __asm__ __volatile__("movl %%edi,%0" : "=m"(regs[4])); -+ __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs[5])); -+ __asm__ __volatile__("movl %%eax,%0" : "=m"(regs[6])); -+ __asm__ __volatile__("movl %%esp,%0" : "=m"(regs[7])); -+ -+ panic_regs = regs; -+} -+ -+/* -+ * Save the current stack pointer and EIP. 
-+ */ -+void crash_save_current_state(struct task_struct *tp) -+{ -+ /* -+ * Here we save ebp instead of esp just in case the compiler -+ * decides to put an extra push in before we execute this -+ * instruction (thus invalidating our frame pointer). -+ */ -+ asm volatile("movl %%ebp,%0":"=m" (*(u_long *)&tp->thread.esp)); -+ tp->thread.eip = (u_long)crash_save_current_state; -+ panic_ksp[smp_processor_id()] = tp->thread.esp; -+ mb(); -+ -+ save_core(); -+ -+ crash_halt_or_reboot(1); -+} -+ -+/* -+ * If we are not the panicking thread, we simply halt. Otherwise, -+ * we take care of calling the reboot code. -+ */ -+void crash_halt_or_reboot(int boot_cpu) -+{ -+#ifdef CONFIG_SMP -+ if (!boot_cpu) { -+ stop_this_cpu(NULL); -+ /* NOTREACHED */ -+ } -+#endif -+ machine_restart(NULL); -+} -+ -+void crash_cleanup_smp_state(void) -+{ -+ /* -+ * Here we duplicate smp_send_stop. Crash_halt_or_reboot() calls -+ * stop_this_cpu. We now know that we are the only one running, -+ * so we finish off the smp_send_stop function. 
-+ */ -+ __cli(); -+#ifdef CONFIG_SMP -+ disable_local_APIC(); -+#endif -+} -+ -+/* -+ * Core dump IPI -+ */ -+void smp_crash_funnel_cpu(void) -+{ -+ crash_save_current_state(current); -+} -Index: linux/arch/i386/kernel/nmi.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/nmi.c,v -retrieving revision 1.2.2.1 -retrieving revision 1.2.2.1.2.1 -diff -u -r1.2.2.1 -r1.2.2.1.2.1 ---- linux/arch/i386/kernel/nmi.c 12 Mar 2003 19:49:06 -0000 1.2.2.1 -+++ linux/arch/i386/kernel/nmi.c 1 Apr 2003 12:17:40 -0000 1.2.2.1.2.1 -@@ -374,11 +374,18 @@ - bust_spinlocks(1); - printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip); - show_registers(regs); -+#ifdef CONFIG_MCL_COREDUMP -+ spin_unlock(&nmi_print_lock); -+ bust_spinlocks(0); -+ panic("die"); -+ /* NOTREACHED */ -+#else - printk("console shuts up ...\n"); - console_silent(); - spin_unlock(&nmi_print_lock); - bust_spinlocks(0); - do_exit(SIGSEGV); -+#endif - } - } else { - last_irq_sums[cpu] = sum; -Index: linux/arch/i386/kernel/process.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/process.c,v -retrieving revision 1.2.2.2 -retrieving revision 1.2.2.2.2.1 -diff -u -r1.2.2.2 -r1.2.2.2.2.1 ---- linux/arch/i386/kernel/process.c 1 Apr 2003 02:11:17 -0000 1.2.2.2 -+++ linux/arch/i386/kernel/process.c 1 Apr 2003 12:17:40 -0000 1.2.2.2.2.1 -@@ -50,6 +50,9 @@ - #ifdef CONFIG_MATH_EMULATION - #include - #endif -+#ifdef CONFIG_BOOTIMG -+#include -+#endif - - #include - -@@ -377,7 +380,21 @@ - - void machine_restart(char * __unused) - { -+#ifdef CONFIG_MCL_COREDUMP -+ extern char *panicmsg; -+ /* -+ * Only call bootimg if we have a valid descriptor and -+ * we are in a panic() context. 
-+ */ -+ if (panicmsg) -+#endif -+#ifdef CONFIG_BOOTIMG -+ if (bootimg_dsc.page_dir) -+ boot_image(); -+#endif -+ - #if CONFIG_SMP -+{ - int cpuid; - - cpuid = GET_APIC_ID(apic_read(APIC_ID)); -@@ -413,6 +430,7 @@ - if (!netdump_func) - smp_send_stop(); - disable_IO_APIC(); -+} - #endif - - if(!reboot_thru_bios) { -Index: linux/arch/i386/kernel/setup.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/setup.c,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.2 -diff -u -r1.3.2.1 -r1.3.2.1.2.2 ---- linux/arch/i386/kernel/setup.c 12 Mar 2003 19:49:06 -0000 1.3.2.1 -+++ linux/arch/i386/kernel/setup.c 1 Apr 2003 17:55:35 -0000 1.3.2.1.2.2 -@@ -116,6 +116,9 @@ - #include - #include - #include -+#ifdef CONFIG_MCL_COREDUMP -+#include -+#endif - /* - * Machine setup.. - */ -@@ -973,6 +976,7 @@ - static unsigned long __init setup_memory(void) - { - unsigned long bootmap_size, start_pfn, max_low_pfn; -+ unsigned long bootmap_pages = 0UL, crash_pages = 0UL; - - /* - * partially used pages are not usable - thus -@@ -992,6 +996,21 @@ - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); - #endif -+ -+#ifdef CONFIG_MCL_COREDUMP -+ bootmap_pages = bootmem_bootmap_pages(max_low_pfn); -+ crash_pages = crash_pages_needed(); -+ -+ printk("start_pfn: %d, bootmap_pages: %d\n", start_pfn, bootmap_pages); -+ -+ crash_init((u_long)phys_to_virt(PFN_PHYS(start_pfn)), -+ (u_long)phys_to_virt(PFN_PHYS(LOW_OFFSET + start_pfn)), -+ (u_long)phys_to_virt(PFN_PHYS(LOW_OFFSET + start_pfn + -+ crash_pages))); -+ -+ printk("new start_pfn: %08lx\n", PFN_PHYS(start_pfn)); -+ printk("crash map starts at %lx\n",(start_pfn+bootmap_pages)*PAGE_SIZE); -+#endif - printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(max_low_pfn)); - /* -@@ -1007,8 +1026,8 @@ - * the (very unlikely) case of us accidentally initializing the - * bootmem allocator with an invalid RAM area. 
- */ -- reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) + -- bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); -+ reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) + bootmap_size + -+ ((1+crash_pages)*PAGE_SIZE) + PAGE_SIZE-1) - (HIGH_MEMORY)); - - /* - * reserve physical page 0 - it's a special BIOS page on many boxes, -@@ -1016,6 +1035,16 @@ - */ - reserve_bootmem(0, PAGE_SIZE); - -+#ifdef CONFIG_BOOTIMG -+ /* -+ * bootimg(8) reads the old parameter block. Note that the copy in -+ * empty_zero_page will vanish when mem_init runs. (Should we -+ * memcpy(phys_to_virt(0x90000), PARAM, PAGE_SIZE); -+ * now ?) -+ */ -+ reserve_bootmem(0x90000, PAGE_SIZE); -+#endif -+ - #ifdef CONFIG_SMP - /* - * But first pinch a few for the stack/trampoline stuff -@@ -1032,6 +1061,7 @@ - find_smp_config(); - #endif - #ifdef CONFIG_BLK_DEV_INITRD -+ printk("caution: initrd may overwrite dump\n"); /* phro */ - if (LOADER_TYPE && INITRD_START) { - if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { - reserve_bootmem(INITRD_START, INITRD_SIZE); -@@ -1172,6 +1202,12 @@ - smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ - #endif - paging_init(); -+#ifdef CONFIG_MCL_COREDUMP -+ /* -+ * Reserve crash pages -+ */ -+ crash_mark_dump_reserved(); -+#endif - #ifdef CONFIG_X86_LOCAL_APIC - /* - * get boot-time SMP configuration: -Index: linux/arch/i386/kernel/smp.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/smp.c,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.1 -diff -u -r1.3.2.1 -r1.3.2.1.2.1 ---- linux/arch/i386/kernel/smp.c 12 Mar 2003 19:49:06 -0000 1.3.2.1 -+++ linux/arch/i386/kernel/smp.c 1 Apr 2003 12:17:40 -0000 1.3.2.1.2.1 -@@ -23,6 +23,9 @@ - #include - #include - -+#ifdef CONFIG_MCL_COREDUMP -+#include -+#endif - /* - * Some notes on x86 processor bugs affecting SMP operation: - * -@@ -579,7 +582,7 @@ - return 0; - } - --static void stop_this_cpu (void * dummy) 
-+void stop_this_cpu (void * dummy) - { - /* - * Remove this CPU: -Index: linux/arch/i386/kernel/traps.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/traps.c,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.1 -diff -u -r1.3.2.1 -r1.3.2.1.2.1 ---- linux/arch/i386/kernel/traps.c 12 Mar 2003 19:49:06 -0000 1.3.2.1 -+++ linux/arch/i386/kernel/traps.c 1 Apr 2003 12:17:40 -0000 1.3.2.1.2.1 -@@ -52,6 +52,10 @@ - #include - #include - -+#ifdef CONFIG_MCL_COREDUMP -+#include -+#endif -+ - asmlinkage int system_call(void); - asmlinkage void lcall7(void); - asmlinkage void lcall27(void); -@@ -309,7 +313,11 @@ - netdump_func(regs); - bust_spinlocks(0); - spin_unlock_irq(&die_lock); -- do_exit(SIGSEGV); -+#ifdef CONFIG_MCL_COREDUMP -+ if(panic_on_oops) -+ panic("die"); -+#endif -+ do_exit(SIGSEGV);/* NOTREACHED */ - } - - static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) -Index: linux/drivers/char/misc.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/drivers/char/misc.c,v -retrieving revision 1.2 -retrieving revision 1.2.4.1 -diff -u -r1.2 -r1.2.4.1 ---- linux/drivers/char/misc.c 25 Sep 2002 17:11:05 -0000 1.2 -+++ linux/drivers/char/misc.c 1 Apr 2003 12:17:41 -0000 1.2.4.1 -@@ -78,6 +78,8 @@ - extern int i8k_init(void); - extern int lcd_init(void); - -+extern int crash_init_chrdev(void); -+ - static int misc_read_proc(char *buf, char **start, off_t offset, - int len, int *eof, void *private) - { -@@ -255,6 +257,9 @@ - int __init misc_init(void) - { - create_proc_read_entry("misc", 0, 0, misc_read_proc, NULL); -+#ifdef CONFIG_MCL_COREDUMP -+ crash_init_chrdev(); -+#endif - #ifdef CONFIG_MVME16x - rtc_MK48T08_init(); - #endif -Index: linux/drivers/char/sysrq.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/drivers/char/sysrq.c,v 
-retrieving revision 1.2.2.1 -retrieving revision 1.2.2.1.2.2 -diff -u -r1.2.2.1 -r1.2.2.1.2.2 ---- linux/drivers/char/sysrq.c 12 Mar 2003 19:49:47 -0000 1.2.2.1 -+++ linux/drivers/char/sysrq.c 1 Apr 2003 17:55:35 -0000 1.2.2.1.2.2 -@@ -97,7 +97,18 @@ - action_msg: "Resetting", - }; - -- -+#ifdef CONFIG_MCL_COREDUMP -+/* kernel core dump sysrq */ -+static void sysrq_handle_coredump(int key, struct pt_regs *pt_regs, -+ struct kbd_struct *kbd, struct tty_struct *ttty) { -+ panic("sysrq"); -+} -+static struct sysrq_key_op sysrq_coredump_op = { -+ handler: sysrq_handle_coredump, -+ help_msg: "Crash", -+ action_msg: "Dumping core", -+}; -+#endif - - /* SYNC SYSRQ HANDLERS BLOCK */ - -@@ -334,7 +345,11 @@ - it is handled specially on the spark - and will never arive */ - /* b */ &sysrq_reboot_op, -+#ifdef CONFIG_MCL_COREDUMP -+/* c */ &sysrq_coredump_op, -+#else - /* c */ NULL, -+#endif - /* d */ NULL, - /* e */ &sysrq_term_op, - /* f */ NULL, -Index: linux/include/asm-i386/bootimg.h -=================================================================== -RCS file: linux/include/asm-i386/bootimg.h -diff -N linux/include/asm-i386/bootimg.h ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux/include/asm-i386/bootimg.h 1 Apr 2003 12:17:41 -0000 1.1.6.1 -@@ -0,0 +1,141 @@ -+/* asm-i386/bootimg.h - Boot image, i386-specific code */ -+ -+/* Written 2000 by Werner Almesberger */ -+ -+/* -+ * When porting bootimg(2) to a new architcture, you need to adapt the -+ * functions and definitions in this file. -+ */ -+ -+ -+#ifndef _ASM_I386_BOOTIMG_H -+#define _ASM_I386_BOOTIMG_H -+ -+#include -+#include -+ -+#ifdef CONFIG_SMP -+#include -+#include -+#endif -+ -+ -+/* -+ * The memory page with the code currently executing has been copied from -+ * old_page to new_page. Jump there. -+ * -+ * Note: flush_icache_range has already been called on the new page. 
-+ */ -+ -+static inline void jump_relocated(unsigned long old_page,unsigned long new_page) -+{ -+ int tmp; -+ -+ __asm__ __volatile__( -+ "stc\n\t" -+ "call 1f\n" -+ "1:\tjnc 2f\n\t" -+ "popl %0\n\t" -+ "addl %1,%0\n\t" -+ "addl %1,%%esp\n\t" -+ "clc\n\t" -+ "jmp *%0\n" -+ "2:" -+ : "=&r" (tmp) : "r" (new_page-old_page)); -+} -+ -+ -+/* -+ * Stop paging, such that -+ * - page tables can be overwritten -+ * - all physical memory can be accessed -+ * - all physical memory is identity-mapped -+ * -+ * (Other rules are possible, but need to be encoded in bootimg(8).) -+ */ -+ -+static inline void stop_paging(void) -+{ -+ unsigned long msw; -+ -+ __asm__ __volatile__( -+ "movl %%cr0,%0\n\t" -+ "andl $0x7fffffff,%0\n\t" -+ "movl %0,%%cr0\n\t" -+ "jmp 1f\n\t" /* i486 and such */ -+ "1:" -+ -+/* Clear the PAE bit in register %cr4 if we were in PAE mode. The initial -+ * page table set up by the new kernel's bootstrap code is non-PAE regardless -+ * of whether the new kernel is a PAE kernel. By clearing the PAE bit here, -+ * we make sure the bootstrap code doesn't accidentally enable PAE mode when -+ * it turns on address translation. -+ */ -+#ifdef CONFIG_X86_PAE -+ "movl %%cr4,%0\n\t" -+ "andl $0xffffffdf,%0\n\t" -+ "movl %0,%%cr4\n\t" -+#endif -+ -+ : "=&r" (msw) : : "memory"); -+} -+ -+ -+/* -+ * Stop any remaining concurrency in the system. If become_only_thread fails -+ * but the system is still usable, become_only_thread should return an error -+ * code. If no recovery is possible, it may as well panic. -+ */ -+ -+static inline int become_only_thread(void) -+{ -+#ifdef CONFIG_SMP -+ smp_send_stop(); -+ disable_IO_APIC(); -+#endif -+ cli(); -+ return 0; -+} -+ -+ -+/* -+ * A conservative estimate of the number of bytes relocate_and_jump allocated -+ * on the stack. This is only used for sanity checking before running code, -+ * because we can't recover from failure in relocate_and_jump. 
-+ */ -+ -+#define RESERVE_MIN_RELOC_STACK 256 -+ -+ -+/* -+ * Change the stack pointer such that stack is at the end of the specified -+ * page. No data on the old stack will be accessed anymore, so no copying is -+ * required. -+ */ -+ -+static inline void stack_on_page(void *page) -+{ -+ __asm__ __volatile__( -+ "push %%ds\n\t" -+ "pop %%ss\n\t" -+ "movl %0,%%esp\n\t" -+ "addl $0x1000,%%esp\n\t" -+ : : "r" (page)); -+} -+ -+/* -+ * Set up things such that the kernel will be comfortable (e.g. some -+ * architectures expect the boot loader to set registers in certain ways), -+ * and then jump to the kernel's entry address. -+ */ -+ -+static inline void jump_to_kernel(void (*kernel_entry)(void)) -+{ -+ __asm__ __volatile__( -+ "mov $0x90000,%%esi\n\t" -+ : : ); -+ -+ kernel_entry(); -+} -+ -+#endif -Index: linux/include/asm-i386/crash.h -=================================================================== -RCS file: linux/include/asm-i386/crash.h -diff -N linux/include/asm-i386/crash.h ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux/include/asm-i386/crash.h 1 Apr 2003 12:17:41 -0000 1.1.6.1 -@@ -0,0 +1,15 @@ -+#ifndef __ASM_CRASH_H -+#define __ASM_CRASH_H -+ -+#define UPPER_MEM_BACKUP 0 -+#define LOWER_MEM_FORWARD 0 -+#define LOW_OFFSET 100 -+ -+/* -+ * These two functions are inlined on alpha. That's why they appear -+ * in the arch dependent include file. 
-+ */ -+void crash_save_current_state(struct task_struct *); -+void crash_halt_or_reboot(int); -+ -+#endif -Index: linux/include/linux/bootimg.h -=================================================================== -RCS file: linux/include/linux/bootimg.h -diff -N linux/include/linux/bootimg.h ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux/include/linux/bootimg.h 1 Apr 2003 12:17:41 -0000 1.1.6.1 -@@ -0,0 +1,84 @@ -+/* linux/bootimg.h - Boot image, general definitions */ -+ -+/* Written 2000 by Werner Almesberger */ -+ -+ -+#ifndef _LINUX_BOOTIMG_H -+#define _LINUX_BOOTIMG_H -+ -+ -+/* -+ * Constraints on image_map: -+ * - each image_map[n] is the virtual address of a page-sized memory region -+ * readable by the user -+ * - currently, image_map[n] is not required to be page-aligned, but this may -+ * change in the future if we want to map pages directly to lower memory -+ * pressure (NB: mapping works for ELF and plain binary images, but usually -+ * not for (b)zImages, because the prepended boot and setup sectors -+ * mis-align them) -+ * -+ * Constraints on load_map: -+ * - each load_map[] is the physical address of a page in RAM -+ */ -+ -+struct boot_image { -+ void **image_map; /* pointers to image pages in user memory */ -+ int pages; /* length in pages */ -+ unsigned long *load_map;/* list of destination pages (physical addr) */ -+ unsigned long start; /* jump to this physical address */ -+ int flags; /* for future use, must be zero for now */ -+}; -+ -+ -+#ifdef __KERNEL__ -+ -+#define __bootimg __attribute__ ((__section__ (".bootimg"))) -+ -+ -+struct bootimg_dsc { -+ unsigned long self; /* code page ALL ADDRESSES */ -+ unsigned long scratch; /* scratch page ARE PHYSICAL !*/ -+ unsigned long **page_dir; /* src & dst page tables */ -+ void (*jump_to)(void); /* start address */ -+ int pages; /* number of pages */ -+ unsigned long csum; /* Kernel Image checksum */ -+}; -+ -+/* -+ * page_dir contains pointers to pages containing pointers to pages. 
We call -+ * page_dir a "directory" and the page page_dir[n] points to a "table". The -+ * first PAGES_PER_TABLE/2 entries of page_dir are for source pages, and other -+ * half are for destination pages. -+ */ -+ -+/* -+ * Note that the definitions used here do not necessarily correspond to the -+ * architecture-specific PTRS_PER_PTE, __pte_offset, etc. -+ */ -+ -+#define PAGES_PER_TABLE (PAGE_SIZE/sizeof(void *)) -+#define FROM_TABLE(i) ((i)/PAGES_PER_TABLE) -+#define TO_TABLE(i) ((i)/PAGES_PER_TABLE+PAGES_PER_TABLE/2) -+#define PAGE_NR(i) ((i) % PAGES_PER_TABLE) -+ -+ -+extern char __bootimg_start,__bootimg_end; /* linker segment boundaries */ -+extern unsigned long *unity_page; /* unity-mapped page for i386 */ -+ -+/* -+ * relocate_and_jump runs in its own page with its own stack. This makes it -+ * difficult to pass parameters. The solution chosen here is to use the global -+ * variable bootimg_dsc, which is copied into an "auto" variable by -+ * relocate_and_jump before any copying or relocation takes place. 
-+ */ -+ -+extern struct bootimg_dsc bootimg_dsc; -+ -+typedef void (*relocate_and_jump_t)(void); -+ -+void relocate_and_jump(void); -+int boot_image(void); -+ -+#endif /* __KERNEL__ */ -+ -+#endif -Index: linux/include/linux/crash.h -=================================================================== -RCS file: linux/include/linux/crash.h -diff -N linux/include/linux/crash.h ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux/include/linux/crash.h 1 Apr 2003 12:17:41 -0000 1.1.6.1 -@@ -0,0 +1,119 @@ -+#ifndef __LINUX_CRASH_H -+#define __LINUX_CRASH_H -+ -+/* defines for interfacing with user-space (ioctls, etc) */ -+struct ioctl_getdump { -+ unsigned long kva; -+ unsigned long buf; -+}; -+ -+#define CRASH_IOC_MAGIC 'C' -+ -+#define CRASH_IOCFREEDUMP _IO(CRASH_IOC_MAGIC, 0) -+#define CRASH_IOCGETDUMP _IOWR(CRASH_IOC_MAGIC, 1, struct ioctl_getdump) -+#define CRASH_IOCBOOTIMG _IOWR(CRASH_IOC_MAGIC, 2, struct boot_image) -+#define CRASH_IOCVERSION _IO(CRASH_IOC_MAGIC, 3) -+ -+/* kernel-only part of crash.h */ -+#ifdef __KERNEL__ -+#include -+ -+#define CRASH_K_MINOR (1) -+#define CRASH_K_MAJOR (0) -+ -+/* -+ * Crash prototypes. 
-+ */ -+void save_core(void); -+void crash_mark_dump_reserved(void); -+void crash_init(u_long bootmap_va, u_long crash_va, u_long end_alloc_va); -+u_long crash_pages_needed(void); -+void smp_crash_funnel_cpu(void); -+void crash_cleanup_smp_state(void); -+ -+/* -+ * Arch dependant crash.c funcs -+ */ -+void crash_save_current_state(struct task_struct *); -+void crash_halt_or_reboot(int); -+inline void crash_save_regs(void); -+ -+/* -+ * Crash globals -+ */ -+extern u_long crash_dump_header; -+extern volatile u_long panic_ksp[]; -+extern volatile int crash_release; -+extern int panic_on_oops; -+extern char *panicmsg; -+extern int panic_processor; -+extern int crash_perform_sync; -+extern unsigned long *panic_regs; -+ -+/* -+ * symbols not exported by linux header files -+ */ -+extern void stop_this_cpu(void *); -+ -+/* struct crash_map_hdr located at byte offset 0 */ -+/* on-disk formats */ -+ -+#define trunc_page(x) ((void *)(((unsigned long)(x)) & ~((unsigned long)(PAGE_SIZE - 1)))) -+#define round_page(x) trunc_page(((unsigned long)(x)) + ((unsigned long)(PAGE_SIZE - 1))) -+ -+#define CRASH_MAGIC 0x9a8bccdd -+#define CRASH_SOURCE_PAGES 128 -+#define CRASH_SUB_MAP_BYTES ((u_long)round_page((CRASH_SOURCE_PAGES+1)*sizeof(u_long))) -+#define CRASH_SUB_MAP_PAGES (CRASH_SUB_MAP_BYTES / PAGE_SIZE) -+#define CRASH_UNCOMPR_BUF_PAGES (CRASH_SOURCE_PAGES + CRASH_SUB_MAP_PAGES) -+#define CRASH_COMPR_BUF_PAGES (CRASH_UNCOMPR_BUF_PAGES + (CRASH_UNCOMPR_BUF_PAGES/4)) -+#define CRASH_COMPESS_PRIME_PAGES (2*CRASH_COMPR_BUF_PAGES) -+#define CRASH_ZALLOC_PAGES 16*5*2 /* 2 to handle crash in crash */ -+#define CRASH_LOW_WATER_PAGES 100 -+ -+#define CRASH_CPU_TIMEOUT 5000 /* 5 sec wait for other cpus to stop */ -+ -+#define CRASH_MARK_RESERVED(addr) (set_bit(PG_reserved,&mem_map[MAP_NR(addr)].flags)) -+#define CRASH_CLEAR_RESERVED(addr) (clear_bit(PG_reserved,&mem_map[MAP_NR(addr)].flags)) -+#define CRASH_MARK_BOOT_RESERVED(addr) reserve_bootmem(virt_to_phys((void *)addr), PAGE_SIZE); 
-+ -+typedef int boolean_t; -+ -+#define TRUE 1 -+#define FALSE 0 -+ -+/* mem structure */ -+struct mem_crash_map_hdr { -+ long magic[4]; /* identify crash dump */ -+ u_long map; /* location of map */ -+ u_long map_pages; -+ u_long data_pages; -+ u_long compr_units; -+ u_long boot_reserved_start; -+ u_long boot_reserved_end; -+}; -+struct mem_crash_map_entry { -+ u_long src_va; /* source start of larger non-contig -+ * block. a src_va of -1 means that -+ * the dest_page_va is the location of -+ * the next map page */ -+ u_long dest_page_va; /* dest of this sub block */ -+ u_long check_sum; /* check_sum for dest data */ -+}; -+ -+/* file structure */ -+struct crash_map_hdr { -+ long magic[4]; /* identify crash dump */ -+ int blk_size; /* block size for this device */ -+ int map_block; /* location of map */ -+ int map_blocks; /* number of blocks for map */ -+}; -+struct crash_map_entry { -+ u_long start_va; /* virtual address */ -+ char *exp_data; /* expanded data in memory */ -+ int start_blk; /* device location */ -+ int num_blks; -+}; -+ -+#endif /* __KERNEL__ */ -+#endif /* __LINUX_CRASH_H */ -Index: linux/include/linux/mm.h -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/include/linux/mm.h,v -retrieving revision 1.2.2.1 -retrieving revision 1.2.2.1.2.2 -diff -u -r1.2.2.1 -r1.2.2.1.2.2 ---- linux/include/linux/mm.h 12 Mar 2003 19:51:27 -0000 1.2.2.1 -+++ linux/include/linux/mm.h 1 Apr 2003 17:55:35 -0000 1.2.2.1.2.2 -@@ -331,6 +331,11 @@ - #define PG_lru 18 - #define PG_active_cache 19 - #define PG_fs_1 20 /* Filesystem specific */ -+#ifdef CONFIG_MCL_COREDUMP -+#define PG_free 21 -+#define PG_shm 22 -+#define PG_anon 23 -+#endif - - /* Make it prettier to test the above... 
*/ - #define UnlockPage(page) unlock_page(page) -@@ -452,6 +457,11 @@ - #define PageSetSlab(page) set_bit(PG_slab, &(page)->flags) - #define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags) - #define PageReserved(page) test_bit(PG_reserved, &(page)->flags) -+#ifdef CONFIG_MCL_COREDUMP -+#define PageFree(page) (test_bit(PG_free, &(page)->flags)) -+#define PageAnon(page) (test_bit(PG_anon, &(page)->flags)) -+#define PageShm(page) (test_bit(PG_shm, &(page)->flags)) -+#endif - - #define PageActiveAnon(page) test_bit(PG_active_anon, &(page)->flags) - #define SetPageActiveAnon(page) set_bit(PG_active_anon, &(page)->flags) -Index: linux/include/linux/reboot.h -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/include/linux/reboot.h,v -retrieving revision 1.1.1.1 -retrieving revision 1.1.1.1.10.2 -diff -u -r1.1.1.1 -r1.1.1.1.10.2 ---- linux/include/linux/reboot.h 7 May 2002 21:53:47 -0000 1.1.1.1 -+++ linux/include/linux/reboot.h 1 Apr 2003 17:55:35 -0000 1.1.1.1.10.2 -@@ -20,6 +20,7 @@ - * CAD_OFF Ctrl-Alt-Del sequence sends SIGINT to init task. - * POWER_OFF Stop OS and remove all power from system, if possible. - * RESTART2 Restart system using given command string. -+ * COREDUMP We're taking a core dump, secondary cpus already stopped. 
- */ - - #define LINUX_REBOOT_CMD_RESTART 0x01234567 -@@ -28,7 +29,9 @@ - #define LINUX_REBOOT_CMD_CAD_OFF 0x00000000 - #define LINUX_REBOOT_CMD_POWER_OFF 0x4321FEDC - #define LINUX_REBOOT_CMD_RESTART2 0xA1B2C3D4 -- -+#ifdef CONFIG_MCL_COREDUMP -+#define LINUX_REBOOT_CMD_COREDUMP 0x9A8BCCDD -+#endif - - #ifdef __KERNEL__ - -Index: linux/include/linux/sysctl.h -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/include/linux/sysctl.h,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.1 -diff -u -r1.3.2.1 -r1.3.2.1.2.1 ---- linux/include/linux/sysctl.h 12 Mar 2003 19:51:30 -0000 1.3.2.1 -+++ linux/include/linux/sysctl.h 1 Apr 2003 12:17:41 -0000 1.3.2.1.2.1 -@@ -126,6 +126,7 @@ - KERN_CADPID=54, /* int: PID of the process to notify on CAD */ - KERN_CORE_PATTERN=56, /* string: pattern for core-files */ - KERN_PID_MAX=55, /* int: max PID value of processes */ -+ KERN_PANIC_ON_OOPS /* int: panic on oops enabled */ - }; - - -Index: linux/init/main.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/init/main.c,v -retrieving revision 1.2.2.1 -retrieving revision 1.2.2.1.2.1 -diff -u -r1.2.2.1 -r1.2.2.1.2.1 ---- linux/init/main.c 12 Mar 2003 19:51:35 -0000 1.2.2.1 -+++ linux/init/main.c 1 Apr 2003 12:17:41 -0000 1.2.2.1.2.1 -@@ -70,6 +70,10 @@ - #include - #endif - -+#ifdef CONFIG_BOOTIMG -+#include -+#endif -+ - /* - * Versions of gcc older than that listed below may actually compile - * and link okay, but the end product can have subtle run time bugs. -@@ -352,10 +356,14 @@ - { - char * command_line; - extern char saved_command_line[]; -+#if defined(CONFIG_BOOTIMG) && defined(CONFIG_X86_LOCAL_APIC) -+ unsigned long value; -+#endif - /* - * Interrupts are still disabled. Do necessary setups, then - * enable them - */ -+ printk("start_kernel\n"); - lock_kernel(); - printk(linux_banner); - setup_arch(&command_line); -@@ -373,12 +381,26 @@ - * this. 
But we do want output early, in case something goes wrong. - */ - console_init(); -+ -+#ifdef CONFIG_BOOTIMG -+ unity_page = alloc_bootmem_pages(PAGE_SIZE); -+ printk("unity_page addr: %p\n",unity_page); -+#endif - #ifdef CONFIG_MODULES - init_modules(); - #endif - profile_init(); - kmem_cache_init(); - sti(); -+#if defined(CONFIG_BOOTIMG) && defined(CONFIG_X86_LOCAL_APIC) -+ /* If we don't make sure the APIC is enabled, AND the LVT0 -+ * register is programmed properly, we won't get timer interrupts -+ */ -+ setup_local_APIC(); -+ -+ value = apic_read(APIC_LVT0); -+ apic_write_around(APIC_LVT0, value & ~APIC_LVT_MASKED); -+#endif - calibrate_delay(); - #ifdef CONFIG_BLK_DEV_INITRD - if (initrd_start && !initrd_below_start_ok && -Index: linux/kernel/Makefile -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/kernel/Makefile,v -retrieving revision 1.1.1.1.4.1 -retrieving revision 1.1.1.1.4.1.2.1 -diff -u -r1.1.1.1.4.1 -r1.1.1.1.4.1.2.1 ---- linux/kernel/Makefile 12 Mar 2003 19:51:36 -0000 1.1.1.1.4.1 -+++ linux/kernel/Makefile 1 Apr 2003 12:17:41 -0000 1.1.1.1.4.1.2.1 -@@ -22,7 +22,8 @@ - obj-$(CONFIG_PM) += pm.o - obj-$(CONFIG_KALLSYMS) += kallsyms.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o -- -+obj-$(CONFIG_BOOTIMG) += bootimg.o bootimg_pic.o -+obj-$(CONFIG_MCL_COREDUMP) += crash.o - - ifneq ($(CONFIG_IA64),y) - # According to Alan Modra , the -fno-omit-frame-pointer is -Index: linux/kernel/bootimg.c -=================================================================== -RCS file: linux/kernel/bootimg.c -diff -N linux/kernel/bootimg.c ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux/kernel/bootimg.c 1 Apr 2003 12:17:41 -0000 1.1.6.1 -@@ -0,0 +1,301 @@ -+/* bootimg.c - Boot another (kernel) image */ -+ -+/* Written 2000 by Werner Almesberger */ -+ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#if 0 -+#define 
DPRINTK_CONT(format,args...) printk(format,##args) -+#else -+#define DPRINTK_CONT(format,args...) -+#endif -+#define DPRINTK(format,args...) DPRINTK_CONT(KERN_DEBUG format,##args) -+ -+unsigned long **bootimg_page_dir; -+ -+struct bootimg_dsc bootimg_dsc; /* communication with PIC */ -+unsigned long *unity_page; /* unity-mapped page for i386 */ -+ -+static unsigned long bootimg_checksum(unsigned long **page_dir, int num_pages) -+{ -+ unsigned long checksum, *page; -+ int i, j; -+ -+ checksum = 0; -+ -+ for (i = 0; i < num_pages; i++) { -+ page = __va((unsigned long *) -+ page_dir[FROM_TABLE(i)][PAGE_NR(i)]); -+ -+ for (j = 0; j < PAGES_PER_TABLE; j++) -+ checksum ^= page[j]; -+ -+ checksum ^= page_dir[TO_TABLE(i)][PAGE_NR(i)]; -+ } -+ -+ return checksum; -+} -+ -+#ifdef CONFIG_X86_PAE -+ -+static unsigned long get_identity_mapped_page(void) -+{ -+ pgd_t *pgd; -+ pmd_t *pmd; -+ unsigned long phys_addr, page_base; -+ -+ /* Set up a 2 Mb identity-mapped page. */ -+ -+ phys_addr = virt_to_phys(unity_page); -+ pgd = pgd_offset(current->active_mm, phys_addr); -+ pmd = pmd_offset(pgd, phys_addr); -+ -+ /* We hardcode this rather than using PMD_MASK just in case the PAE -+ * mode setup ever changes so that 2 Mb pages are no longer used. -+ */ -+ page_base = phys_addr & ~((1 << 21) - 1); -+ -+ set_pmd(pmd, __pmd(page_base | _PAGE_PSE | _KERNPG_TABLE)); -+ __flush_tlb_one(phys_addr); -+ -+ return (unsigned long) unity_page; -+} -+ -+#else -+ -+static unsigned long get_identity_mapped_page(void) -+{ -+ set_pgd(pgd_offset(current->active_mm,virt_to_phys(unity_page)), -+ __pgd((_KERNPG_TABLE + _PAGE_PSE + (virt_to_phys(unity_page)&PGDIR_MASK)))); -+ __flush_tlb_one(virt_to_phys(unity_page)); -+ return (unsigned long)unity_page; -+} -+ -+#endif -+ -+#if 0 /* Perhaps we'll need this in the future? 
*/ -+static void unmap_identity_mapped_page(void) -+{ -+ set_pgd(pgd_offset(current->active_mm,virt_to_phys(unity_page)),__pgd(0)); -+ __flush_tlb(); -+} -+#endif -+ -+static int fill_page_dir(unsigned long **page_dir,struct boot_image *image) -+{ -+ int i, count=0; -+ -+ memset(page_dir,0,PAGE_SIZE); -+ for (i = 0; i < image->pages; i += PAGES_PER_TABLE) { -+ unsigned long **table; -+ int bytes_left; -+ -+ table = page_dir+FROM_TABLE(i); -+ *table = (unsigned long *) get_free_page(GFP_KERNEL); -+ if (!*table) return -ENOMEM; -+ -+ memset(*table,0,PAGE_SIZE); -+ DPRINTK("page %d: from table %p @ %p\n",i,*table,table); -+ table = page_dir+TO_TABLE(i); -+ *table = (unsigned long *) get_free_page(GFP_KERNEL); -+ if (!*table) return -ENOMEM; -+ -+ bytes_left = (image->pages-i)*sizeof(unsigned long); -+ if (copy_from_user(*table,image->load_map+i, -+ bytes_left > PAGE_SIZE ? PAGE_SIZE : bytes_left)) -+ return -EFAULT; -+ DPRINTK("page %d: to table %p @ %p\n",i,*table,table); -+ count+=2; /* 2 pages per loop */ -+ } -+ -+ for (i = 0; i < image->pages; i++) { -+ unsigned long page = get_free_page(GFP_KERNEL); -+ void *src; -+ -+ if (!page) return -ENOMEM; -+ count++; -+ -+ page_dir[FROM_TABLE(i)][PAGE_NR(i)] = -+ virt_to_phys((void *) page); -+ if (get_user(src,image->image_map+i) || -+ copy_from_user((void *) page,src,PAGE_SIZE)) -+ return -EFAULT; -+ -+ DPRINTK("page %d: %p->%p->%p @ %p\n",i,src,(void *) page, -+ (void *) page_dir[FROM_TABLE(i)][PAGE_NR(i)], -+ &page_dir[FROM_TABLE(i)][PAGE_NR(i)]); -+ } -+ -+ DPRINTK("fill_page_dir: %d pages allocated\n", count); -+ -+ return 0; -+} -+ -+ -+static void free_page_dir(unsigned long **page_dir) -+{ -+ int i,j,count=0; -+ -+ for (i = 0; i < PAGES_PER_TABLE/2; i++) -+ if (page_dir[i]) -+ for (j = 0; j < PAGES_PER_TABLE; j++) -+ if (page_dir[i][j]) { -+ free_page((unsigned long) -+ phys_to_virt(page_dir[i][j])); -+ count++; -+ } -+ for (i = 0; i < PAGES_PER_TABLE; i++) -+ if (page_dir[i]) { -+ free_page((unsigned long) 
*page_dir[i]); -+ count++; -+ } -+ DPRINTK("free_page_dir: %d pages freed\n", count); -+} -+ -+ -+static void convert_table_refs_to_phys(unsigned long **page_dir) -+{ -+ int i; -+ -+ DPRINTK("PAGES_PER_TABLE: %d\n",PAGES_PER_TABLE); -+ for (i = 0; i < PAGES_PER_TABLE; i++) -+ if (page_dir[i]) { -+ DPRINTK("table %i: mapped %p -> ",i,page_dir[i]); -+ page_dir[i] = (unsigned long *) -+ virt_to_phys(page_dir[i]); -+ DPRINTK_CONT("%p\n",page_dir[i]); -+ } -+} -+ -+ -+ -+static int fill_bootimg_dsc(struct boot_image *image) -+{ -+ unsigned long scratch; -+ int error = -ENOMEM; -+ -+ if(bootimg_page_dir) { -+ /* free previously allocated memory */ -+ free_page_dir(bootimg_page_dir); -+ free_page((unsigned long) bootimg_page_dir); -+ DPRINTK("free_page (bootimg_page_dir)\n"); -+ } -+ -+ bootimg_page_dir = (unsigned long **) get_free_page(GFP_KERNEL); -+ if (!bootimg_page_dir) goto out0; -+ DPRINTK("get_free_page (bootimg_page_dir)\n"); -+ -+ error = fill_page_dir(bootimg_page_dir,image); -+ if (error) goto out1; -+ -+ if(!bootimg_dsc.scratch) { -+ scratch = get_free_page(GFP_KERNEL); -+ DPRINTK("get_free_page (scratch)\n"); -+ } else -+ scratch = 1; /* already allocated */ -+ -+ if (!scratch) goto out1; -+ /* -+ * Not all architectures need the code to be identity-mapped, but it -+ * can't hurt ... 
-+ */ -+ DPRINTK("bootimg_page_dir: mapped %p -> ",bootimg_page_dir); -+ bootimg_dsc.page_dir = (unsigned long **) virt_to_phys(bootimg_page_dir); -+ DPRINTK_CONT("%p\n",bootimg_dsc.page_dir); -+ if(!bootimg_dsc.scratch) -+ bootimg_dsc.scratch = virt_to_phys((void *) scratch); -+ bootimg_dsc.jump_to = (void (*)(void)) image->start; -+ bootimg_dsc.pages = image->pages; -+ bootimg_dsc.csum = bootimg_checksum(bootimg_page_dir, image->pages); -+ -+ return 0; -+ -+out1: -+ free_page_dir(bootimg_page_dir); -+ free_page((unsigned long) bootimg_page_dir); -+ DPRINTK("free_page (bootimg_page_dir)\n"); -+ bootimg_page_dir = 0; -+out0: -+ return error; -+} -+ -+extern char *panicmsg; -+int boot_image() -+{ -+ relocate_and_jump_t code; -+ unsigned long code_page; -+ int error = -ENOMEM; -+ -+ if (bootimg_checksum(__va(bootimg_dsc.page_dir),bootimg_dsc.pages) -+ != bootimg_dsc.csum) -+ printk("Checksum of kernel image failed. Rebooting via BIOS\n"); -+ -+ code_page = get_identity_mapped_page(); -+ if (!code_page) goto out3; -+ code = (relocate_and_jump_t) virt_to_phys((void *) code_page); -+ memcpy(code,&__bootimg_start,&__bootimg_end-&__bootimg_start); -+ flush_icache_range(&__bootimg_start, &__bootimg_end-&__bootimg_start); -+ -+ bootimg_dsc.self = (unsigned long) code; -+ printk(KERN_INFO "Running boot code at 0x%p\n",code); -+ -+ /* -+ * The point of no return. Not even printk may work after a successful -+ * return from become_only_thread. 
-+ */ -+ -+ if (!panicmsg) { -+ error = become_only_thread(); -+ if (error) goto out3; -+ } else { -+#ifdef CONFIG_SMP -+ disable_IO_APIC(); -+#endif -+ __cli(); -+ } -+ -+ convert_table_refs_to_phys((unsigned long **)__va(bootimg_dsc.page_dir)); -+ stack_on_page(code); -+ -+ code(); -+ -+ panic("PIC code exec failed"); -+out3: -+ printk("boot_image() failed!\n"); -+ for(;;); -+} -+ -+/* changed from asmlinkage because we're called via an IOCTL on /dev/crash now */ -+int sys_bootimg(struct boot_image *user_dsc) -+{ -+ struct boot_image dsc; -+ -+ if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_MODULE)) return -EPERM; -+ if (&__bootimg_end-&__bootimg_start > PAGE_SIZE-RESERVE_MIN_RELOC_STACK) -+ { -+ printk(KERN_ERR "boot_image: PIC too large (%d bytes)\n", -+ &__bootimg_end-&__bootimg_start); -+ return -EIO; -+ } -+ if ((void *) relocate_and_jump != (void *) &__bootimg_start) { -+ printk(KERN_ERR "boot_image: relocate_and_jump is mis-placed" -+ "(0x%p != 0x%p)\n",relocate_and_jump,&__bootimg_start); -+ return -EIO; -+ } -+ -+ if (copy_from_user(&dsc,user_dsc,sizeof(dsc))) return -EFAULT; -+ if (dsc.pages >= PAGES_PER_TABLE*PAGES_PER_TABLE/2) return -EFBIG; -+ if (dsc.flags) return -EINVAL; /* for future use */ -+ return fill_bootimg_dsc(&dsc); -+} -Index: linux/kernel/bootimg_pic.c -=================================================================== -RCS file: linux/kernel/bootimg_pic.c -diff -N linux/kernel/bootimg_pic.c ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux/kernel/bootimg_pic.c 1 Apr 2003 12:17:41 -0000 1.1.6.1 -@@ -0,0 +1,91 @@ -+/* bootimg_pic.c - Boot image, position-independent code */ -+ -+/* Written 2000 by Werner Almesberger */ -+ -+/* -+ * Strongly inspired by FiPaBoL designed mainly by Otfried Cheong and Roger -+ * Gammans, and written by the latter. -+ */ -+ -+/* -+ * This code is position-independent and must fit in a single page ! -+ * Furthermore, everything (text+data+stack) has to go into the -+ * .bootimg segment. 
-+ */ -+ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#define copy_and_swap(from,to) \ -+ ( { my_copy_page(from,to); \ -+ tmp = from; \ -+ from = to; \ -+ to = tmp; } ) -+ -+ -+static inline void my_copy_page(unsigned long from,unsigned long to) -+{ -+ unsigned long end = from+PAGE_SIZE; -+ -+ do *((unsigned long *) to)++ = *((unsigned long *) from)++; -+ while (from != end); -+} -+ -+ -+void __bootimg relocate_and_jump(void) -+{ -+ struct bootimg_dsc dsc = bootimg_dsc; -+ int i; -+ -+ stop_paging(); -+ for (i = 0; i < dsc.pages; i++) { -+ unsigned long from,to,tmp; -+ -+ from = dsc.page_dir[FROM_TABLE(i)][PAGE_NR(i)]; -+ to = dsc.page_dir[TO_TABLE(i)][PAGE_NR(i)]; -+ if (from == to) continue; -+ if (to == dsc.self) { -+ copy_and_swap(dsc.self,dsc.scratch); -+ /* WARNING: flush_icache_range MUST BE INLINED !!! */ -+ flush_icache_range(dsc.self,dsc.self+PAGE_SIZE-1); -+ jump_relocated(dsc.scratch,dsc.self); -+ } -+ else if (to == (unsigned long) dsc.page_dir) -+ copy_and_swap((unsigned long) dsc.page_dir,dsc.scratch); -+ else { -+ /* -+ * O((n^2-n)/2), sigh ... 
-+ */ -+ unsigned long **table; -+ int j; -+ -+ for (j = i+1; j < dsc.pages; j++) { -+ table = dsc.page_dir+FROM_TABLE(j); -+ if (((unsigned long) *table) == to) { -+ copy_and_swap(*table,dsc.scratch); -+ break; -+ } -+ if ((*table)[PAGE_NR(j)] == to) { -+ copy_and_swap((*table)[PAGE_NR(j)], -+ dsc.scratch); -+ break; -+ } -+ table = dsc.page_dir+TO_TABLE(j); -+ if (((unsigned long) *table) == to) { -+ copy_and_swap(*table,dsc.scratch); -+ break; -+ } -+ } -+ } -+ my_copy_page(from,to); -+ dsc.scratch = from; -+ } -+ jump_to_kernel(dsc.jump_to); -+} -Index: linux/kernel/crash.c -=================================================================== -RCS file: linux/kernel/crash.c -diff -N linux/kernel/crash.c ---- /dev/null 1 Jan 1970 00:00:00 -0000 -+++ linux/kernel/crash.c 1 Apr 2003 12:17:41 -0000 1.1.6.1 -@@ -0,0 +1,886 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef CONFIG_BOOTIMG -+#include -+#endif -+ -+static void crash_print_data_around(u_long p); -+static void crash_free_page(u_long addr); -+static int crash_chksum_page(u_long pg_addr, u_long * sum_addr); -+static void *czalloc(void *arg, unsigned int items, unsigned int size); -+static void czfree(void *arg, void *ptr); -+static u_long crash_alloc_dest_page(void); -+static void crash_free_dest_page(u_long dest); -+static void init_dest_page_alloc(void); -+static int crash_audit_maps(void); -+static u_long crash_get_source_page(void); -+static u_long crash_update_map(u_long map, u_long src_base, u_long dest, u_long * pages); -+static int crash_reset_stream(z_stream * stream); -+static boolean_t crash_is_kseg(u_long addr); -+static u_long *crash_link(u_long p); -+static int crash_chksum(u_long limit, u_long * sum_addr); -+static int crash_audit_map_page(u_long map); -+static void crash_wait_cpus(void); -+static int crash_is_dir_page(struct page *page); -+ -+/* for the /dev/crash 
interface */ -+int crash_init_chrdev(void); -+static int crashdev_ioctl(struct inode *, struct file *, unsigned int, unsigned long); -+ -+#define CRASH_DEBUG 1 -+ -+#ifdef CONFIG_BOOTIMG -+extern int sys_bootimg(struct boot_image *); -+#endif -+ -+static u_long crash_compr_buf; -+static u_long crash_uncompr_buf; -+static u_long crash_dump_header = 0; -+static u_long crash_dest_free_list = 0; -+static u_long crash_debug = 0; -+ -+static u_long crash_cur_pfn; -+ -+static u_long src_pages_skipped = 0; -+static u_long src_pages_saved = 0; -+static u_long dest_pages_free = 0; -+ -+/* this information is saved from within panic() */ -+char *panicmsg = (char *)0; -+int panic_processor = 0; -+int crash_perform_sync = 0; -+ -+u_int console_crash = 0; /* should be moved to alpha branch */ -+ -+// typedef struct task_struct *task_t; -+ -+/* -+ * Threads active at time of panic: -+ */ -+volatile task_t *panic_threads[NR_CPUS]; -+volatile unsigned long panic_ksp[NR_CPUS]; -+unsigned long *panic_regs = NULL; -+ -+int panic_on_oops; /* for /proc/sys/kernel/panic_on_oops */ -+ -+extern unsigned long max_low_pfn; -+ -+u_long crash_zalloc_start; // , crash_zalloc_end, crash_zalloc_cur; -+ -+/* -+ * Crash Kernel API functions below -+ * crash_pages_needed, computes pages needed for header and compression temp -+ * crash_init, partitions out the allocated pages, sets defaults and -+ * initializes the character device. -+ * crash_mark_dump_reserved, marks pages reserved from a previous dump. -+ * save_core, called at panic time to save a dump to memory. 
-+ */ -+u_long crash_pages_needed(void) -+{ -+ /* one for the header */ -+ return (1 + CRASH_ZALLOC_PAGES + CRASH_UNCOMPR_BUF_PAGES + CRASH_COMPR_BUF_PAGES); -+} -+ -+void crash_init(u_long bootmap_va, u_long crash_va, u_long end_alloc_va) -+{ -+ struct mem_crash_map_hdr *header; -+ int i; -+ -+ /* the default behavior is not NOT panic on a kernel OOPS */ -+ panic_on_oops = 0; -+ -+ printk("crash_init (crash_va: %08lx)\n", crash_va); -+ for (i = 0; i < NR_CPUS; i++) -+ panic_threads[i] = 0; -+ crash_dump_header = crash_va; -+ crash_va += PAGE_SIZE; -+ crash_zalloc_start = crash_va; -+ crash_va += CRASH_ZALLOC_PAGES * PAGE_SIZE; -+ crash_uncompr_buf = crash_va; -+ crash_va += CRASH_UNCOMPR_BUF_PAGES * PAGE_SIZE; -+ crash_compr_buf = crash_va; -+ crash_va += CRASH_COMPR_BUF_PAGES * PAGE_SIZE; -+#if 0 -+ if (crash_va != end_alloc_va) -+ panic("crash_init inconsistency-1\n"); -+#endif -+ -+ header = (struct mem_crash_map_hdr *)crash_dump_header; -+#ifdef CRASH_DEBUG -+ printk("crash_dump_header %p {\n", header); -+ printk(" magic[0] = %lx\n", header->magic[0]); -+ printk(" map = %lx\n", header->map); -+ printk(" map_pages = %lx\n", header->map_pages); -+ printk(" data_pages = %lx\n", header->data_pages); -+ printk(" compr_units = %lx\n", header->compr_units); -+ printk(" boot_reserved_start = %lx\n", header->boot_reserved_start); -+ printk(" boot_reserved_end = %lx\n", header->boot_reserved_end); -+#endif -+ -+ if (header->magic[0] == CRASH_MAGIC) { -+ printk("crash found\n"); -+ if ((header->boot_reserved_start != bootmap_va) || -+ (header->boot_reserved_end != end_alloc_va)) { -+ /* crash audit will catch the corruption */ -+ printk("crash_init inconsistency, dump may be corrupted\n"); -+ } -+ } else { -+printk("memset..."); -+ memset(header, 0, sizeof(*header)); -+printk("done\n"); -+ } -+ -+ header->boot_reserved_start = bootmap_va; -+ header->boot_reserved_end = end_alloc_va; -+ -+} -+ -+void crash_mark_dump_reserved(void) -+{ -+ struct mem_crash_map_hdr *header; 
-+ struct mem_crash_map_entry *m; -+ -+ header = (struct mem_crash_map_hdr *)crash_dump_header; -+ if (header->magic[0] != CRASH_MAGIC) -+ return; -+ m = (struct mem_crash_map_entry *)header->map; -+#ifdef CRASH_DEBUG -+ printk("\n\n\ncrash_mark_dump_reserved\n\n"); -+ printk("crash_dump_header %p {\n", header); -+ printk(" magic[0] = %lx\n", header->magic[0]); -+ printk(" map = %lx\n", header->map); -+ printk(" map_pages = %lx\n", header->map_pages); -+ printk(" data_pages = %lx\n", header->data_pages); -+ printk(" compr_units = %lx\n", header->compr_units); -+ printk(" boot_reserved_start = %lx\n", header->boot_reserved_start); -+ printk(" boot_reserved_end = %lx\n", header->boot_reserved_end); -+ printk("mem_crash_map_entry %p {\n", m); -+ printk(" src_va = %lx\n", m->src_va); -+ printk(" dest_page_va = %lx\n", m->dest_page_va); -+ printk(" check_sum = %lx\n", m->check_sum); -+#endif -+ -+ if (crash_audit_maps()) { -+ header->magic[0] = 0; -+ return; -+ } -+ -+ m = (struct mem_crash_map_entry *)header->map; -+ again: -+ CRASH_MARK_BOOT_RESERVED(m); -+ for (; m->src_va; m++) { -+ if (m->src_va == -1) { -+ m = (struct mem_crash_map_entry *)m->dest_page_va; -+ goto again; -+ } -+ CRASH_MARK_BOOT_RESERVED(m->dest_page_va); -+ } -+ return; -+} -+ -+void save_core(void) -+{ -+ int i, j, k; -+ z_stream stream; -+ int err; -+ struct task_struct *tp; -+ struct mem_crash_map_hdr *header; -+ u_long *sub_map; -+ u_long map; -+ u_long src, dest, unc, cp, src_base, comp_pages; -+ -+ k = 0; -+ dest = 0; -+ __cli(); -+ tp = current; -+ mb(); -+ if (smp_processor_id() != 0) { /* boot_cpu_id is always 0, i think */ -+ panic_threads[smp_processor_id()] = tp; -+ crash_halt_or_reboot(0); -+ } else { -+ if (console_crash) -+ panic_threads[smp_processor_id()] = &init_task_union.task; -+ else -+ panic_threads[smp_processor_id()] = tp; -+ -+ crash_wait_cpus(); -+ } -+ -+ printk("save_core: started on CPU%d\n", smp_processor_id()); -+ if (!crash_dump_header) { -+ printk("save_core: not 
initialized\n"); -+ return; -+ } -+ -+ header = (struct mem_crash_map_hdr *)crash_dump_header; -+ header->magic[0] = 0; -+ header->map_pages = 0; -+ header->data_pages = 0; -+ header->compr_units = 0; -+ header->map = 0; -+ -+ stream.workspace=(void*)crash_zalloc_start; -+ // stream.zalloc = czalloc; -+ // stream.zfree = czfree; -+ // stream.opaque = (voidpf) 0; -+ stream.next_out = (Bytef *) crash_compr_buf; -+ stream.avail_out = (uInt) (CRASH_COMPR_BUF_PAGES * PAGE_SIZE); -+ stream.next_in = (Bytef *) crash_uncompr_buf; -+ stream.avail_in = (uInt) (CRASH_UNCOMPR_BUF_PAGES * PAGE_SIZE); -+ err = zlib_deflateInit(&stream, Z_BEST_SPEED); -+ if (err != Z_OK) { -+ printk("save_core: bad return %d from deflateInit\n", err); -+ return; -+ } -+ -+ init_dest_page_alloc(); -+ header->map = map = crash_update_map(0, 0, 0, &header->map_pages); -+ if (!map) { -+ printk("save_core: no dest pages\n"); -+ return; -+ } -+ crash_cur_pfn = 0; -+ src_base = 0; -+ src = 0; -+ for (;;) { -+ sub_map = (u_long *) crash_uncompr_buf; -+ unc = crash_uncompr_buf + CRASH_SUB_MAP_PAGES * PAGE_SIZE; -+ for (i = 0; i < CRASH_SOURCE_PAGES; i++) { -+ src = crash_get_source_page(); -+ if (!src) -+ break; -+ if (!i) -+ src_base = src; -+ if (!crash_is_kseg(unc) || !crash_is_kseg(src)) { -+ printk("unc = 0x%lx, src = 0x%lx, i = %d\n", unc, src, i); -+ i = src = 0; -+ break; -+ } -+ memcpy((void *)unc, (void *)src, PAGE_SIZE); -+ unc += PAGE_SIZE; -+ *sub_map++ = src; -+ } -+ *sub_map = 0; -+ if (!i && !src) -+ break; -+ err = zlib_deflate(&stream, Z_FINISH); -+ if (!(err == Z_STREAM_END)) { -+ zlib_deflateEnd(&stream); -+ printk("save_core: bad return %d from deflate, src_base = 0x%lx\n", err, -+ src_base); -+ return; -+ } -+ comp_pages = (u_long) round_page(stream.total_out) / PAGE_SIZE; -+ if (crash_debug) -+ printk("src_base = 0x%lx compressed data in 0x%lx pages\n", src_base, -+ comp_pages); -+ -+ cp = crash_compr_buf; -+ j = 0; -+ if (crash_debug) -+ printk("\nsrc = %lx\n", src_base); -+ else { 
-+ printk("."); -+ if (!(k++ % 64)) -+ printk("\n"); -+ } -+ for (i = 0; i < comp_pages; i++) { -+ dest = crash_alloc_dest_page(); -+ if (crash_debug) { -+ printk("%lx ", dest); -+ if (!(j++ % 8)) -+ printk("\n"); -+ } -+ header->data_pages++; -+ if (!dest) { -+ printk("save_core: no dest pages\n"); -+ return; -+ } -+ if (!crash_is_kseg(dest) || !crash_is_kseg(cp)) { -+ printk("dest = 0x%lx, cp = 0x%lx, i = %d, comp_pages = 0x%lx\n", -+ dest, cp, i, comp_pages); -+ src = 0; -+ break; -+ } -+ memcpy((void *)dest, (void *)cp, PAGE_SIZE); -+ cp += PAGE_SIZE; -+ map = crash_update_map(map, src_base, dest, &header->map_pages); /* links a new map page, if necessary */ -+ if (!map) { -+ printk("save_core: no map\n"); -+ return; -+ } -+ } -+ header->compr_units++; -+ if (!src) -+ break; -+ if (crash_reset_stream(&stream)) -+ return; -+ } -+ -+ map = crash_update_map(map, 0, 0, &header->map_pages); -+ header->magic[0] = CRASH_MAGIC; -+ -+ if (crash_audit_maps()) { -+ header->magic[0] = 0; -+ return; -+ } -+ -+ printk("\nsave_core: src pages skipped = 0x%lx src pages saved = 0x%lx\n", -+ src_pages_skipped, src_pages_saved); -+ printk("save_core: data_pages = 0x%lx map_pages = 0x%lx\n", header->data_pages, -+ header->map_pages); -+ printk("save_core: completed, crash_dump_header = 0x%lx\n", crash_dump_header); -+} -+ -+/* helper functions private to this file */ -+static int crash_reset_stream(z_stream * stream) -+{ -+ int err; -+ -+ stream->workspace=(void*)crash_zalloc_start; -+ // stream->zalloc = czalloc; -+ // stream->zfree = czfree; -+ // stream->opaque = (voidpf) 0; -+ stream->next_out = (Bytef *) crash_compr_buf; -+ stream->avail_out = (uInt) (CRASH_COMPR_BUF_PAGES * PAGE_SIZE); -+ stream->next_in = (Bytef *) crash_uncompr_buf; -+ stream->avail_in = (uInt) (CRASH_UNCOMPR_BUF_PAGES * PAGE_SIZE); -+ err = zlib_deflateReset(stream); -+ if (err != Z_OK) { -+ printk("crash_reset_stream: bad return %d from deflateReset\n", err); -+ return 1; -+ } -+ return 0; -+} -+ 
-+static u_long crash_alloc_dest_page(void) -+{ -+ u_long addr; -+ -+ addr = crash_dest_free_list; -+ if (addr) { -+ crash_dest_free_list = *(u_long *) addr; -+ dest_pages_free--; -+ } else -+ printk("crash_alloc_dest_page: free list empty\n"); -+ return addr; -+} -+ -+static void crash_free_dest_page(u_long dest) -+{ -+ if (!dest) { -+ printk("crash_free_dest_page: freeing addr 0\n"); -+ return; -+ } -+ dest_pages_free++; -+ dest = (u_long) trunc_page(dest); -+ *(u_long *) dest = crash_dest_free_list; -+ crash_dest_free_list = dest; -+} -+ -+/* -+ * Stolen from setup.c -+ */ -+#define PFN_PHYS(x) ((x) << PAGE_SHIFT) -+ -+static void init_dest_page_alloc(void) -+{ -+ u_long va; -+ long i; -+ struct page *page; -+ struct mem_crash_map_hdr *header; -+ -+ header = (struct mem_crash_map_hdr *)crash_dump_header; -+ for (i = ((1 << 24) >> PAGE_SHIFT) + LOWER_MEM_FORWARD; -+ i < (max_low_pfn - UPPER_MEM_BACKUP); i++) { -+ va = (u_long) phys_to_virt(PFN_PHYS(i)); -+ if ((va >= header->boot_reserved_start) && (va < header->boot_reserved_end)) -+ continue; -+ page = mem_map + i; -+ if (PageLocked(page) || PageReserved(page)) -+ continue; -+ if (PageFree(page) || PageAnon(page) || PageShm(page) || page->buffers) -+ crash_free_dest_page(va); -+ } -+ if (crash_debug) -+ printk("init_dest_page_alloc: dest_pages_free = 0x%lx\n", dest_pages_free); -+} -+ -+static int crash_is_dir_page(struct page *page) { -+ struct inode *tmp_inode; -+ -+ if(page->mapping && page->mapping->host) { -+ tmp_inode = (struct inode *)page->mapping->host; -+ if((tmp_inode->i_sb->s_magic == EXT2_SUPER_MAGIC) && -+ (S_ISDIR(tmp_inode->i_mode))) -+ return 1; -+ } -+ -+ return 0; -+} -+ -+static u_long crash_get_source_page(void) -+{ -+ struct page *page; -+ u_long va; -+ -+ while (crash_cur_pfn < max_low_pfn) { -+ page = mem_map + crash_cur_pfn; -+ if (!(PageFree(page) || PageAnon(page) || PageShm(page) || page->buffers)) -+ break; -+ src_pages_skipped++; -+ crash_cur_pfn++; -+ } -+ if (crash_cur_pfn == 
max_low_pfn) -+ return 0; -+ -+ va = (u_long) phys_to_virt(PFN_PHYS(crash_cur_pfn)); -+ src_pages_saved++; -+ crash_cur_pfn++; -+ return va; -+} -+ -+static u_long crash_update_map(u_long map, u_long src_base, u_long dest, u_long * pages) -+{ -+ struct mem_crash_map_entry *m; -+ -+ -+ if (!map) { -+ (*pages)++; -+ return crash_alloc_dest_page(); -+ } -+ m = (struct mem_crash_map_entry *)map; -+ m->src_va = src_base; -+ m->dest_page_va = dest; -+ if (dest) -+ if (crash_chksum_page(dest, &m->check_sum)) -+ return 0; -+ -+ map += sizeof(struct mem_crash_map_entry); -+ -+ m = (struct mem_crash_map_entry *)map; -+ if (!src_base) { /* end of list */ -+ if (crash_chksum((u_long) m, &m->src_va)) -+ return 0; -+ } else if ((map + 3 * sizeof(struct mem_crash_map_entry)) > (u_long) round_page(map)) { -+ m->src_va = -1; -+ map = m->dest_page_va = crash_alloc_dest_page(); -+ if (crash_debug) -+ printk("\nm = 0x%lx m->src_va = 0x%lx m->dest_page_va = 0x%lx\n", -+ (u_long) trunc_page(m), m->src_va, m->dest_page_va); -+ m++; -+ if (crash_chksum((u_long) m, &m->src_va)) -+ return 0; -+ if (crash_debug) -+ printk("m = 0x%lx chksum = m->src_va = 0x%lx\n", (u_long) trunc_page(m), -+ m->src_va); -+ if (crash_audit_map_page((u_long) m)) -+ return 0; -+ (*pages)++; -+ } -+ return map; -+} -+ -+static int crash_chksum(u_long limit, u_long * sum_addr) -+{ -+ u_long sum; -+ u_long *addr; -+ -+ if (!crash_is_kseg(limit)) { -+ printk("bad addr = 0x%lx to crash_chksum\n", limit); -+ return 1; -+ } -+ sum = 0; -+ addr = (u_long *) trunc_page(limit); -+ for (; (u_long) addr < limit; addr++) -+ sum += *addr; -+ *sum_addr = sum; -+ return 0; -+} -+ -+static int crash_chksum_page(u_long pg_addr, u_long * sum_addr) -+{ -+ u_long sum, limit; -+ u_long *addr; -+ -+ if (!crash_is_kseg(pg_addr)) { -+ printk("bad addr = 0x%lx to crash_chksum_page\n", pg_addr); -+ return 1; -+ } -+ -+ sum = 0; -+ addr = (u_long *) trunc_page(pg_addr); -+ limit = (u_long) addr + PAGE_SIZE; -+ for (; (u_long) addr < limit; 
addr++) -+ sum += *addr; -+ *sum_addr = sum; -+ return 0; -+} -+ -+static int crash_audit_maps(void) -+{ -+ u_long m, count; -+ u_long *link_addr; -+ struct mem_crash_map_hdr *header; -+ -+ header = (struct mem_crash_map_hdr *)crash_dump_header; -+ if (header->magic[0] != CRASH_MAGIC) -+ return 1; -+ -+ link_addr = &header->map; -+ m = header->map; -+ -+ count = 0; -+ for (;;) { -+ if (!crash_is_kseg(m)) { -+ printk("crash_audit_maps: bad link 0x%lx at 0x%lx\n", m, -+ (u_long) link_addr); -+ return 1; -+ } -+ if (crash_audit_map_page(m)) { -+ printk("audit failed while on map page %ld\n", count); -+ return 1; -+ } -+ if (!crash_link(m)) -+ break; -+ link_addr = crash_link(m); -+ m = *link_addr; -+ -+ count++; -+ } -+ return 0; -+} -+ -+static int crash_audit_map_page(u_long map) -+{ -+ struct mem_crash_map_entry *m; -+ u_long sum; -+ -+ if (!map || !crash_is_kseg(map)) { -+ printk("crash_audit_map_page: bad map = 0x%lx\n", map); -+ return 1; -+ } -+ map = (u_long) trunc_page((u_long) map); -+ m = (struct mem_crash_map_entry *)map; -+ for (;;) { -+ if ((m->src_va == -1) || (m->src_va == 0)) { -+ m++; -+ if (crash_chksum((u_long) m, &sum)) -+ return 1; -+ if (m->src_va != sum) { -+ printk("crash_audit_map_page: checksum failure1\n"); -+ printk("m = 0x%lx, sum = 0x%lx, m->src_va = 0x%lx\n", -+ (u_long) m, (u_long) sum, (u_long) m->src_va); -+ crash_print_data_around((u_long) & m->src_va); -+ return 1; -+ } else { -+ return 0; -+ } -+ } else { -+ if (crash_chksum_page((u_long) m->dest_page_va, &sum) -+ || (m->check_sum != sum)) { -+ printk("crash_audit_map_page: checksum failure2\n"); -+ printk -+ ("dest_page_va = 0x%lx, &dest_page_va = 0x%lx, sum = 0x%lx, m->check_sum = 0x%lx\n", -+ (u_long) m->dest_page_va, (u_long) (&m->check_sum), -+ (u_long) sum, (u_long) m->check_sum); -+ crash_print_data_around((u_long) & m->check_sum); -+ return 1; -+ } -+ } -+ m++; -+ } -+} -+ -+static void crash_print_data_around(u_long p) -+{ -+ u_long *a; -+ int i; -+ -+ if 
(!crash_is_kseg(p)) { -+ printk("crash_print_data_around: p = 0x%lx not kseg\n", p); -+ return; -+ } -+ a = (u_long *) p; -+ a -= 20; -+ for (i = 0; i < 40; i++) -+ printk("%lx\n", *a++); -+} -+ -+#ifdef CRASH_DEBUG -+static void crash_print_map_page(u_long map) -+{ -+ struct mem_crash_map_entry *m; -+ int j = 0; -+ u_long sum; -+ -+ map = (u_long) trunc_page((u_long) map); -+ m = (struct mem_crash_map_entry *)map; -+ for (;;) { -+ printk("%lx %lx %lx ", m->src_va, m->dest_page_va, m->check_sum); -+ if (!(j++ % 4)) -+ printk("\n"); -+ if ((m->src_va == -1) || (m->src_va == 0)) { -+ m++; -+ printk("%lx %lx ", m->src_va, m->dest_page_va); -+ if (crash_chksum((u_long) m, &sum)); -+ else -+ printk("\nchksum = 0x%lx\n", sum); -+ return; -+ } -+ m++; -+ } -+} -+#endif /* CRASH_DEBUG */ -+ -+static void crash_wait_cpus(void) -+{ -+ int i; -+ int msecs = 0; -+ -+ for (i = 0; i < smp_num_cpus; i++) { -+ if (i != smp_processor_id()) { -+ while (!panic_threads[i]) { -+ msecs++; -+ mdelay(1); -+ if (msecs > CRASH_CPU_TIMEOUT) { -+ /* if other cpus are still running -+ * we have to halt, otherwise we could -+ * risk using buffer cache pages which -+ * could subsequently get flushed to disk. 
-+ */ -+ printk("Unable to halt other CPUs, halting system.\n"); -+ crash_halt_or_reboot(0); -+ } -+ } -+ } -+ } -+ -+ crash_cleanup_smp_state(); -+} -+ -+ -+#if 0 -+static void *czalloc(void *arg, unsigned int items, unsigned int size) -+{ -+ u_long nbytes; -+ u_long addr; -+ -+ nbytes = (u_long) (items * size); -+ nbytes = (u_long) round_page(nbytes); -+ if ((crash_zalloc_cur + nbytes) > crash_zalloc_end) -+ return 0; -+ addr = crash_zalloc_cur; -+ crash_zalloc_cur += nbytes; -+ return ((void *)addr); -+} -+ -+static void czfree(void *arg, void *ptr) -+{ -+ printk("zfree: ptr = 0x%lx\n", (u_long) ptr); -+} -+#endif -+ -+static boolean_t crash_is_kseg(u_long addr) -+{ -+ u_long phys; -+ -+ phys = virt_to_phys((void *)addr); -+ if (phys < PFN_PHYS(max_low_pfn)) -+ return TRUE; -+ else -+ return FALSE; -+} -+ -+static u_long *crash_link(u_long p) -+{ -+ struct mem_crash_map_entry *m; -+ -+ p = (u_long) trunc_page(p); -+ m = (struct mem_crash_map_entry *)p; -+ for (; m->src_va; m++) -+ if (m->src_va == -1) -+ return &m->dest_page_va; -+ -+ return 0; -+} -+ -+/* Call this after data written to disk. 
*/ -+static int crash_free_crashmem(void) -+{ -+ struct mem_crash_map_hdr *header; -+ struct mem_crash_map_entry *m, *last_m; -+ -+ if (crash_debug) -+ printk("crash_free_crashmem: \n"); -+ -+ header = (struct mem_crash_map_hdr *)crash_dump_header; -+ if (crash_audit_maps()) { -+ header->magic[0] = 0; -+ return 1; -+ } -+ m = (struct mem_crash_map_entry *)header->map; -+ again: -+ for (; m->src_va; m++) { -+ if (m->src_va == -1) { -+ last_m = m; -+ m = (struct mem_crash_map_entry *)m->dest_page_va; -+ crash_free_page((unsigned long)last_m); -+ goto again; -+ } -+ crash_free_page(m->dest_page_va); -+ } -+ if (crash_debug) -+ printk("crash_free_crashmem: 0x%lx freed\n", -+ (header->data_pages + header->map_pages) * PAGE_SIZE); -+ header->magic[0] = 0; -+ return 0; -+} -+ -+static void crash_free_page(u_long addr) -+{ -+ struct page *page; -+ -+ page = virt_to_page(addr); -+ ClearPageReserved(page); -+ set_page_count(page, 1); -+ __free_page(page); -+} -+ -+static int get_dump_helper(u_long kva, u_long buf) -+{ -+ struct page *page; -+ struct mem_crash_map_hdr *header; -+ -+ header = (struct mem_crash_map_hdr *)crash_dump_header; -+ if (header->magic[0] != CRASH_MAGIC) -+ return 1; -+ -+ if (!kva) { -+ if (crash_audit_maps()) { -+ printk("get_dump_helper: audit failure\n"); -+ header->magic[0] = 0; -+ return 1; -+ } -+ page = virt_to_page((u_long) crash_dump_header); -+ if (!PageReserved(page)) { -+ printk("not reserved: crash_dump_header = 0x%lx\n", crash_dump_header); -+ return 1; -+ } -+ if (copy_to_user((char *)buf, (char *)crash_dump_header, -+ sizeof(struct mem_crash_map_hdr))) { -+ printk("get_dump_helper: copy_to_user failed1\n"); -+ return 1; -+ } -+ } else { -+ page = virt_to_page(kva); -+ if (!PageReserved(page)) { -+ printk("not reserved: kva = 0x%lx\n", kva); -+ return 1; -+ } -+ if (copy_to_user((char *)buf, (char *)trunc_page(kva), PAGE_SIZE)) { -+ printk("get_dump_helper: copy_to_user failed2\n"); -+ return 1; -+ } -+ } -+ return 0; -+} -+ -+static 
void free_dump_helper(void) -+{ -+ struct mem_crash_map_hdr *header; -+ -+ header = (struct mem_crash_map_hdr *)crash_dump_header; -+ if (header->magic[0] != CRASH_MAGIC) -+ return; -+ if (crash_debug) -+ printk("free_dump_helper\n"); -+ crash_free_crashmem(); -+} -+ -+static int crashdev_open(struct inode *inode, struct file *file) -+{ -+ /* always return success -- nothing to do here */ -+ return 0; -+} -+ -+/* character device implementation */ -+static struct file_operations crashdev_fops = { -+ ioctl:crashdev_ioctl, -+ open:crashdev_open, -+}; -+ -+static struct miscdevice crash_miscdev = { -+ 190, "crash", &crashdev_fops -+}; -+ -+int crash_init_chrdev(void) -+{ -+ int result; -+ -+ result = misc_register(&crash_miscdev); -+ -+ if (result < 0) -+ printk(KERN_WARNING "crash: can't register crash device (c 10 190)\n"); -+ -+ return result; -+} -+ -+/* call the original syscalls, just to get things going */ -+static int crashdev_ioctl(struct inode *inode, struct file *file, -+ unsigned int cmd, unsigned long arg) -+{ -+ int retval = 0; -+ -+ switch (cmd) { -+ case CRASH_IOCFREEDUMP: -+ free_dump_helper(); -+ break; -+ -+ case CRASH_IOCGETDUMP: -+ if (crash_debug) { -+ printk("crashdev_ioctl: get dump\n"); -+ printk("vals: %08lx %08lx\n", -+ ((struct ioctl_getdump *)arg)->kva, -+ ((struct ioctl_getdump *)arg)->buf); -+ } -+ -+ retval = get_dump_helper((u_long) ((struct ioctl_getdump *)arg)->kva, -+ (u_long) ((struct ioctl_getdump *)arg)->buf); -+ break; -+ -+#ifdef CONFIG_BOOTIMG -+ case CRASH_IOCBOOTIMG: -+ if (crash_debug) -+ printk("crashdev_ioctl: bootimg\n"); -+ -+ retval = sys_bootimg((struct boot_image *)arg); -+ break; -+#endif -+ -+ case CRASH_IOCVERSION: -+ if (crash_debug) -+ printk("crashdev_ioctl: version\n"); -+ retval = CRASH_K_MINOR | (CRASH_K_MAJOR << 16); -+ break; -+ -+ default: -+ return -EINVAL; -+ } -+ -+ return retval; -+} -Index: linux/kernel/module.c -=================================================================== -RCS file: 
/chaos/cvs/kernel-rh/linux/kernel/module.c,v -retrieving revision 1.1.1.1.4.1 -retrieving revision 1.1.1.1.4.1.2.1 -diff -u -r1.1.1.1.4.1 -r1.1.1.1.4.1.2.1 ---- linux/kernel/module.c 12 Mar 2003 19:51:36 -0000 1.1.1.1.4.1 -+++ linux/kernel/module.c 1 Apr 2003 12:17:41 -0000 1.1.1.1.4.1.2.1 -@@ -311,7 +311,14 @@ - error = -EEXIST; - goto err1; - } -+#if defined(CONFIG_MCL_COREDUMP) -+ /* Call vmalloc_32 instead of module_map (vmalloc for i386) -+ * to avoid being mapped in highmem where mcore can't see us. -+ */ -+ if ((mod = (struct module *)vmalloc_32(size)) == NULL) { -+#else - if ((mod = (struct module *)module_map(size)) == NULL) { -+#endif - error = -ENOMEM; - goto err1; - } -Index: linux/kernel/panic.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/kernel/panic.c,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.1 -diff -u -r1.3.2.1 -r1.3.2.1.2.1 ---- linux/kernel/panic.c 12 Mar 2003 19:51:36 -0000 1.3.2.1 -+++ linux/kernel/panic.c 1 Apr 2003 12:17:41 -0000 1.3.2.1.2.1 -@@ -19,6 +19,10 @@ - #include - #include - -+#ifdef CONFIG_MCL_COREDUMP -+#include -+#endif -+ - asmlinkage void sys_sync(void); /* it's really int */ - - int panic_timeout; -@@ -197,20 +201,43 @@ - unsigned long caller = (unsigned long) __builtin_return_address(0); - #endif - -+#ifdef CONFIG_MCL_COREDUMP -+ crash_save_regs(); -+#endif -+ - bust_spinlocks(1); - va_start(args, fmt); - vsprintf(buf, fmt, args); - va_end(args); - printk(KERN_EMERG "Kernel panic: %s\n",buf); -+ -+#ifdef CONFIG_MCL_COREDUMP -+ if (!panicmsg) { -+ panicmsg = buf; -+ panic_processor = smp_processor_id(); -+ mb(); -+ } -+#endif -+ - if (netdump_func) - BUG(); - if (in_interrupt()) - printk(KERN_EMERG "In interrupt handler - not syncing\n"); - else if (!current->pid) - printk(KERN_EMERG "In idle task - not syncing\n"); -+#ifdef CONFIG_MCL_COREDUMP -+ else if (crash_perform_sync) -+#else - else -+#endif - sys_sync(); -+ - bust_spinlocks(0); -+ 
-+#ifdef CONFIG_MCL_COREDUMP -+ smp_call_function((void *)smp_crash_funnel_cpu,0,0,0); -+ crash_save_current_state(current); -+#endif - - #ifdef CONFIG_SMP - smp_send_stop(); -Index: linux/kernel/sysctl.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/kernel/sysctl.c,v -retrieving revision 1.2.2.1 -retrieving revision 1.2.2.1.2.1 -diff -u -r1.2.2.1 -r1.2.2.1.2.1 ---- linux/kernel/sysctl.c 12 Mar 2003 19:51:36 -0000 1.2.2.1 -+++ linux/kernel/sysctl.c 1 Apr 2003 12:17:41 -0000 1.2.2.1.2.1 -@@ -37,6 +37,10 @@ - #include - #endif - -+#ifdef CONFIG_MCL_COREDUMP -+#include -+#endif -+ - #if defined(CONFIG_SYSCTL) - - /* External variables not in a header file. */ -@@ -247,6 +251,10 @@ - {KERN_SYSRQ, "sysrq", &sysrq_enabled, sizeof (int), - 0644, NULL, &proc_dointvec}, - #endif -+#ifdef CONFIG_MCL_COREDUMP -+ {KERN_PANIC_ON_OOPS, "panic_on_oops", &panic_on_oops, sizeof(int), -+ 0644, NULL, &proc_dointvec}, -+#endif - {KERN_CADPID, "cad_pid", &cad_pid, sizeof (int), - 0600, NULL, &proc_dointvec}, - {KERN_MAX_THREADS, "threads-max", &max_threads, sizeof(int), -Index: linux/lib/Config.in -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/lib/Config.in,v -retrieving revision 1.2 -retrieving revision 1.2.4.1 -diff -u -r1.2 -r1.2.4.1 ---- linux/lib/Config.in 14 Feb 2003 22:59:23 -0000 1.2 -+++ linux/lib/Config.in 1 Apr 2003 12:17:41 -0000 1.2.4.1 -@@ -23,12 +23,14 @@ - fi - fi - --if [ "$CONFIG_PPP_DEFLATE" = "y" -o \ -+if [ "$CONFIG_MCL_COREDUMP" = "y" -o \ -+ "$CONFIG_PPP_DEFLATE" = "y" -o \ - "$CONFIG_JFFS2_FS" = "y" ]; then - define_tristate CONFIG_ZLIB_DEFLATE y - else - if [ "$CONFIG_PPP_DEFLATE" = "m" -o \ -- "$CONFIG_JFFS2_FS" = "m" ]; then -+ "$CONFIG_JFFS2_FS" = "m" -o \ -+ "$CONFIG_MCL_COREDUMP" = "m" ]; then - define_tristate CONFIG_ZLIB_DEFLATE m - else - tristate 'zlib compression support' CONFIG_ZLIB_DEFLATE -Index: linux/mm/memory.c 
-=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/mm/memory.c,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.1 -diff -u -r1.3.2.1 -r1.3.2.1.2.1 ---- linux/mm/memory.c 12 Mar 2003 19:51:37 -0000 1.3.2.1 -+++ linux/mm/memory.c 1 Apr 2003 12:17:41 -0000 1.3.2.1.2.1 -@@ -1381,6 +1381,10 @@ - } - lock_page(page); - -+#ifdef CONFIG_MCL_COREDUMP -+ set_bit(PG_anon, &page->flags); -+#endif -+ - /* - * Back out if somebody else faulted in this pte while we - * released the page table lock. -@@ -1470,6 +1474,9 @@ - mm->rss++; - flush_page_to_ram(page); - entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); -+#ifdef CONFIG_MCL_COREDUMP -+ set_bit(PG_anon, &page->flags); -+#endif - lru_cache_add(page); - } - -Index: linux/mm/page_alloc.c -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/mm/page_alloc.c,v -retrieving revision 1.3.2.1 -retrieving revision 1.3.2.1.2.1 -diff -u -r1.3.2.1 -r1.3.2.1.2.1 ---- linux/mm/page_alloc.c 12 Mar 2003 19:51:37 -0000 1.3.2.1 -+++ linux/mm/page_alloc.c 1 Apr 2003 12:17:41 -0000 1.3.2.1.2.1 -@@ -95,6 +95,10 @@ - struct page *base; - per_cpu_t *per_cpu; - zone_t *zone; -+#ifdef CONFIG_MCL_COREDUMP -+ struct page *pagemap; -+ int count = 1<lock); - -+#ifdef CONFIG_MCL_COREDUMP -+ pagemap = page; -+ do { -+ pagemap->flags |= (1<flags &= ~((1<free_pages -= mask; - - while (mask + (1 << (MAX_ORDER-1))) { -@@ -268,6 +281,16 @@ - zone->free_pages -= 1UL << order; - - page = expand(zone, page, index, order, curr_order, area); -+#ifdef CONFIG_MCL_COREDUMP -+ { -+ struct page *pagemap = page; -+ int count = 1<flags &= ~(1<lock, flags); - - set_page_count(page, 1); -Index: linux/arch/i386//boot/compressed/head.S -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/boot/compressed/head.S,v -retrieving revision 1.1.1.1 -retrieving revision 1.1.1.1.12.6 
-diff -u -r1.1.1.1 -r1.1.1.1.12.6 ---- linux/arch/i386//boot/compressed/head.S 7 May 2002 21:53:54 -0000 1.1.1.1 -+++ linux/arch/i386//boot/compressed/head.S 5 Apr 2003 05:51:27 -0000 1.1.1.1.12.6 -@@ -23,6 +23,7 @@ - */ - .text - -+#include - #include - #include - -@@ -31,6 +32,55 @@ - startup_32: - cld - cli -+ -+#ifdef CONFIG_BOOTIMG -+/* -+ * GDT is invalid if we're booted by bootimg, so reload it now -+ */ -+ lgdt %cs:gdt_descr -+ ljmp $(__KERNEL_CS),$1f -+ -+gdt_table_limit = gdt_table_end - gdt_table - 1 -+gdt_descr: -+ .word gdt_table_limit -+ .long gdt_table -+ -+gdt_table: /* stolen from arch/i386/kernel/head.S */ -+ .quad 0x0000000000000000 /* NULL descriptor */ -+ .quad 0x0000000000000000 /* 0x0b reserved */ -+ .quad 0x0000000000000000 /* 0x13 reserved */ -+ .quad 0x0000000000000000 /* 0x1b reserved */ -+ .quad 0x00cffa000000ffff /* 0x23 user 4GB code at 0x00000000 */ -+ .quad 0x00cff2000000ffff /* 0x2b user 4GB data at 0x00000000 */ -+ .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ -+ .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ -+ .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ -+ .quad 0x0000000000000000 /* 0x4b reserved */ -+ .quad 0x0000000000000000 /* 0x53 reserved */ -+ .quad 0x0000000000000000 /* 0x5b reserved */ -+ -+ .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ -+ .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ -+ .quad 0x0000000000000000 /* 0x70 TSS descriptor */ -+ .quad 0x0000000000000000 /* 0x78 LDT descriptor */ -+ -+ /* Segments used for calling PnP BIOS */ -+ .quad 0x00c09a0000000000 /* 0x80 32-bit code */ -+ .quad 0x00809a0000000000 /* 0x88 16-bit code */ -+ .quad 0x0080920000000000 /* 0x90 16-bit data */ -+ .quad 0x0080920000000000 /* 0x98 16-bit data */ -+ .quad 0x0080920000000000 /* 0xa0 16-bit data */ -+ /* -+ * The APM segments have byte granularity and their bases -+ * and limits are set at run time. 
-+ */ -+ .quad 0x00409a0000000000 /* 0xa8 APM CS code */ -+ .quad 0x00009a0000000000 /* 0xb0 APM CS 16 code (16 bit) */ -+ .quad 0x0040920000000000 /* 0xb8 APM DS data */ -+gdt_table_end: -+ -+1: -+#endif - movl $(__KERNEL_DS),%eax - movl %eax,%ds - movl %eax,%es -@@ -92,7 +142,6 @@ - cld - rep - movsl -- - popl %esi # discard the address - popl %ebx # real mode pointer - popl %esi # low_buffer_start -@@ -124,5 +173,10 @@ - movsl - movl %ebx,%esi # Restore setup pointer - xorl %ebx,%ebx -+#ifdef CONFIG_BOOTIMG -+ movl $0x100000,%eax -+ jmpl *%eax -+#else - ljmp $(__KERNEL_CS), $0x100000 -+#endif - move_routine_end: -Index: linux/arch/i386//kernel/head.S -=================================================================== -RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/head.S,v -retrieving revision 1.2.2.1 -retrieving revision 1.2.2.1.2.5 -diff -u -r1.2.2.1 -r1.2.2.1.2.5 ---- linux/arch/i386//kernel/head.S 12 Mar 2003 19:49:06 -0000 1.2.2.1 -+++ linux/arch/i386//kernel/head.S 5 Apr 2003 05:51:27 -0000 1.2.2.1.2.5 -@@ -42,6 +42,21 @@ - * On entry, %esi points to the real-mode code as a 32-bit pointer. 
- */ - startup_32: -+#ifdef CONFIG_BOOTIMG -+/* -+ * GDT is invalid if we're booted by bootimg, so reload it now -+ */ -+ lgdt %cs:_gdt_descr-__PAGE_OFFSET -+ ljmp $(__KERNEL_CS),$1f-__PAGE_OFFSET -+ -+gdt_limit = SYMBOL_NAME(cpu_gdt_table_end) - SYMBOL_NAME(cpu_gdt_table) - 1 -+ -+_gdt_descr: -+ .word gdt_limit -+ .long SYMBOL_NAME(cpu_gdt_table)-__PAGE_OFFSET -+ -+1: -+#endif - /* - * Set segments to known values - */ -@@ -452,6 +467,7 @@ - .quad 0x00409a0000000000 /* 0xa8 APM CS code */ - .quad 0x00009a0000000000 /* 0xb0 APM CS 16 code (16 bit) */ - .quad 0x0040920000000000 /* 0xb8 APM DS data */ -+ENTRY(cpu_gdt_table_end) - - #if CONFIG_SMP - .fill (NR_CPUS-1)*GDT_ENTRIES,8,0 /* other CPU's GDT */ diff --git a/lustre/kernel_patches/patches/mkdep-revert-rh-2.4.patch b/lustre/kernel_patches/patches/mkdep-revert-rh-2.4.patch deleted file mode 100644 index 5cc34b8..0000000 --- a/lustre/kernel_patches/patches/mkdep-revert-rh-2.4.patch +++ /dev/null @@ -1,50 +0,0 @@ -Index: linux-2.4.20-30.9/scripts/mkdep.c -=================================================================== ---- linux-2.4.20-30.9.orig/scripts/mkdep.c 2004-02-19 19:40:51.000000000 -0500 -+++ linux-2.4.20-30.9/scripts/mkdep.c 2004-04-28 17:24:54.000000000 -0400 -@@ -48,8 +48,6 @@ - char __depname[512] = "\n\t@touch "; - #define depname (__depname+9) - int hasdep; --char cwd[PATH_MAX]; --int lcwd; - - struct path_struct { - int len; -@@ -204,22 +202,8 @@ - memcpy(path->buffer+path->len, name, len); - path->buffer[path->len+len] = '\0'; - if (access(path->buffer, F_OK) == 0) { -- int l = lcwd + strlen(path->buffer); -- char name2[l+2], *p; -- if (path->buffer[0] == '/') { -- memcpy(name2, path->buffer, l+1); -- } -- else { -- memcpy(name2, cwd, lcwd); -- name2[lcwd] = '/'; -- memcpy(name2+lcwd+1, path->buffer, path->len+len+1); -- } -- while ((p = strstr(name2, "/../"))) { -- *p = '\0'; -- strcpy(strrchr(name2, '/'), p+3); -- } - do_depname(); -- printf(" \\\n %s", name2); -+ printf(" \\\n %s", 
path->buffer); - return; - } - } -@@ -601,12 +585,6 @@ - return 1; - } - -- if (!getcwd(cwd, sizeof(cwd))) { -- fprintf(stderr, "mkdep: getcwd() failed %m\n"); -- return 1; -- } -- lcwd = strlen(cwd); -- - add_path("."); /* for #include "..." */ - - while (++argv, --argc > 0) { diff --git a/lustre/kernel_patches/patches/nfs-cifs-intent-2.6.12.patch b/lustre/kernel_patches/patches/nfs-cifs-intent-2.6.12.patch new file mode 100644 index 0000000..41e5ecb --- /dev/null +++ b/lustre/kernel_patches/patches/nfs-cifs-intent-2.6.12.patch @@ -0,0 +1,110 @@ +Index: linux-2.6.12-rc6/fs/nfs/dir.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/nfs/dir.c 2005-06-14 14:22:14.585699648 +0200 ++++ linux-2.6.12-rc6/fs/nfs/dir.c 2005-06-14 14:26:39.884524523 +0200 +@@ -783,7 +783,7 @@ + if (nd->flags & LOOKUP_DIRECTORY) + return 0; + /* Are we trying to write to a read only partition? */ +- if (IS_RDONLY(dir) && (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) ++ if (IS_RDONLY(dir) && (nd->intent.it_flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) + return 0; + return 1; + } +@@ -805,7 +805,7 @@ + dentry->d_op = NFS_PROTO(dir)->dentry_ops; + + /* Let vfs_create() deal with O_EXCL */ +- if (nd->intent.open.flags & O_EXCL) ++ if (nd->intent.it_flags & O_EXCL) + goto no_entry; + + /* Open the file on the server */ +@@ -817,7 +817,7 @@ + goto out; + } + +- if (nd->intent.open.flags & O_CREAT) { ++ if (nd->intent.it_flags & O_CREAT) { + nfs_begin_data_update(dir); + inode = nfs4_atomic_open(dir, dentry, nd); + nfs_end_data_update(dir); +@@ -833,7 +833,7 @@ + break; + /* This turned out not to be a regular file */ + case -ELOOP: +- if (!(nd->intent.open.flags & O_NOFOLLOW)) ++ if (!(nd->intent.it_flags & O_NOFOLLOW)) + goto no_open; + /* case -EISDIR: */ + /* case -EINVAL: */ +@@ -874,7 +874,7 @@ + /* NFS only supports OPEN on regular files */ + if (!S_ISREG(inode->i_mode)) + goto no_open; +- openflags = nd->intent.open.flags; ++ openflags 
= nd->intent.it_flags; + /* We cannot do exclusive creation on a positive dentry */ + if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) + goto no_open; +Index: linux-2.6.12-rc6/fs/nfs/nfs4proc.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/nfs/nfs4proc.c 2005-06-06 17:22:29.000000000 +0200 ++++ linux-2.6.12-rc6/fs/nfs/nfs4proc.c 2005-06-14 14:30:18.499756220 +0200 +@@ -877,19 +877,19 @@ + struct nfs4_state *state; + + if (nd->flags & LOOKUP_CREATE) { +- attr.ia_mode = nd->intent.open.create_mode; ++ attr.ia_mode = nd->intent.it_create_mode; + attr.ia_valid = ATTR_MODE; + if (!IS_POSIXACL(dir)) + attr.ia_mode &= ~current->fs->umask; + } else { + attr.ia_valid = 0; +- BUG_ON(nd->intent.open.flags & O_CREAT); ++ BUG_ON(nd->intent.it_flags & O_CREAT); + } + + cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); + if (IS_ERR(cred)) + return (struct inode *)cred; +- state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred); ++ state = nfs4_do_open(dir, dentry, nd->intent.it_flags, &attr, cred); + put_rpccred(cred); + if (IS_ERR(state)) + return (struct inode *)state; +Index: linux-2.6.12-rc6/fs/cifs/dir.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/cifs/dir.c 2005-06-06 17:22:29.000000000 +0200 ++++ linux-2.6.12-rc6/fs/cifs/dir.c 2005-06-14 14:26:39.915774522 +0200 +@@ -146,23 +146,23 @@ + } + + if(nd) { +- if ((nd->intent.open.flags & O_ACCMODE) == O_RDONLY) ++ if ((nd->intent.it_flags & O_ACCMODE) == O_RDONLY) + desiredAccess = GENERIC_READ; +- else if ((nd->intent.open.flags & O_ACCMODE) == O_WRONLY) { ++ else if ((nd->intent.it_flags & O_ACCMODE) == O_WRONLY) { + desiredAccess = GENERIC_WRITE; + write_only = TRUE; +- } else if ((nd->intent.open.flags & O_ACCMODE) == O_RDWR) { ++ } else if ((nd->intent.it_flags & O_ACCMODE) == O_RDWR) { + /* GENERIC_ALL is too much permission to request */ + /* can cause unnecessary access denied on 
create */ + /* desiredAccess = GENERIC_ALL; */ + desiredAccess = GENERIC_READ | GENERIC_WRITE; + } + +- if((nd->intent.open.flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) ++ if((nd->intent.it_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + disposition = FILE_CREATE; +- else if((nd->intent.open.flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC)) ++ else if((nd->intent.it_flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC)) + disposition = FILE_OVERWRITE_IF; +- else if((nd->intent.open.flags & O_CREAT) == O_CREAT) ++ else if((nd->intent.it_flags & O_CREAT) == O_CREAT) + disposition = FILE_OPEN_IF; + else { + cFYI(1,("Create flag not set in create function")); diff --git a/lustre/kernel_patches/patches/nfs_export_kernel-2.4.19-bgl.patch b/lustre/kernel_patches/patches/nfs_export_kernel-2.4.19-bgl.patch deleted file mode 100644 index 983da60..0000000 --- a/lustre/kernel_patches/patches/nfs_export_kernel-2.4.19-bgl.patch +++ /dev/null @@ -1,741 +0,0 @@ - fs/Makefile | 3 - fs/file_table.c | 11 ++ - fs/inode.c | 23 ++++- - fs/namei.c | 12 ++ - fs/nfsd/export.c | 5 + - fs/nfsd/nfsfh.c | 65 +++++++++++++- - fs/nfsd/vfs.c | 240 ++++++++++++++++++++++++++++++++++++++++++++++++----- - include/linux/fs.h | 10 ++ - kernel/ksyms.c | 2 - 9 files changed, 337 insertions(+), 34 deletions(-) - -Index: linux-bgl/fs/nfsd/vfs.c -=================================================================== ---- linux-bgl.orig/fs/nfsd/vfs.c 2003-07-02 08:44:33.000000000 -0700 -+++ linux-bgl/fs/nfsd/vfs.c 2004-12-28 17:13:59.940919832 -0800 -@@ -77,6 +77,128 @@ - static struct raparms * raparml; - static struct raparms * raparm_cache; - -+static int link_raw(struct dentry *dold, struct dentry *ddir, -+ struct dentry *dnew) -+{ -+ int err; -+ -+ struct nameidata old_nd = { .dentry = dold }; -+ struct nameidata nd = { .dentry = ddir, .last = dnew->d_name }; -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ err = op->link_raw(&old_nd, &nd); -+ d_instantiate(dnew, dold->d_inode); 
-+ if(dold->d_inode->i_op && dold->d_inode->i_op->revalidate_it) -+ dold->d_inode->i_op->revalidate_it(dnew, NULL); -+ -+ return err; -+} -+ -+static int unlink_raw(struct dentry *dentry, char *fname, int flen, -+ struct dentry *rdentry) -+{ -+ int err; -+ struct qstr last = { .name = fname, .len = flen }; -+ struct nameidata nd = { .dentry = dentry, .last = last }; -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ err = op->unlink_raw(&nd); -+ if (!err) -+ d_delete(rdentry); -+ -+ return err; -+} -+ -+static int rmdir_raw(struct dentry *dentry, char *fname, int flen, -+ struct dentry *rdentry) -+{ -+ int err; -+ struct qstr last = { .name = fname, .len = flen }; -+ struct nameidata nd = { .dentry = dentry, .last = last }; -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ err = op->rmdir_raw(&nd); -+ if(!err) { -+ rdentry->d_inode->i_flags |= S_DEAD; -+ d_delete(rdentry); -+ } -+ -+ return err; -+} -+ -+static int symlink_raw(struct dentry *dentry, char *fname, int flen, -+ char *path) -+{ -+ int err; -+ struct qstr last = { .name = fname, .len = flen }; -+ struct nameidata nd = { .dentry = dentry, .last = last }; -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ err = op->symlink_raw(&nd, path); -+ -+ return err; -+} -+ -+static int mkdir_raw(struct dentry *dentry, char *fname, int flen, int mode) -+{ -+ int err; -+ struct qstr last = { .name = fname, .len = flen }; -+ struct nameidata nd = { .dentry = dentry, .last = last }; -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ err = op->mkdir_raw(&nd, mode); -+ -+ return err; -+} -+ -+static int mknod_raw(struct dentry *dentry, char *fname, int flen, int mode, -+ dev_t dev) -+{ -+ int err; -+ struct qstr last = { .name = fname, .len = flen }; -+ struct nameidata nd = { .dentry = dentry, .last = last }; -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ err = op->mknod_raw(&nd, mode, dev); -+ -+ return err; -+} -+ -+static int rename_raw(struct dentry 
*fdentry, struct dentry *tdentry, -+ struct dentry *odentry, struct dentry *ndentry) -+{ -+ int err; -+ -+ struct nameidata old_nd = { .dentry = fdentry, .last = odentry->d_name}; -+ struct nameidata new_nd = { .dentry = tdentry, .last = ndentry->d_name}; -+ struct inode_operations *op = old_nd.dentry->d_inode->i_op; -+ err = op->rename_raw(&old_nd, &new_nd); -+ d_move(odentry, ndentry); -+ -+ return err; -+} -+ -+static int setattr_raw(struct inode *inode, struct iattr *iap) -+{ -+ int err; -+ -+ iap->ia_valid |= ATTR_RAW; -+ err = inode->i_op->setattr_raw(inode, iap); -+ -+ return err; -+} -+ -+int revalidate_it(struct dentry *dentry, struct lookup_intent *it) -+{ -+ int err = 0; -+ -+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { -+ if (!dentry->d_op->d_revalidate_it(dentry, 0, it) && -+ !d_invalidate(dentry)) { -+ dput(dentry); -+ err = -EINVAL; -+ dentry = NULL; -+ return err; -+ } -+ } -+ -+ return err; -+} -+ - /* - * Look up one component of a pathname. - * N.B. 
After this call _both_ fhp and resfh need an fh_put -@@ -304,7 +426,10 @@ - } - err = nfserr_notsync; - if (!check_guard || guardtime == inode->i_ctime) { -- err = notify_change(dentry, iap); -+ if ( dentry->d_inode->i_op && dentry->d_inode->i_op->setattr_raw) -+ err = setattr_raw(dentry->d_inode, iap); -+ else -+ err = notify_change(dentry, iap); - err = nfserrno(err); - } - if (size_change) { -@@ -431,6 +556,7 @@ - { - struct dentry *dentry; - struct inode *inode; -+ struct lookup_intent it; - int err; - - /* If we get here, then the client has already done an "open", and (hopefully) -@@ -477,6 +603,14 @@ - filp->f_mode = FMODE_READ; - } - -+ intent_init(&it, IT_OPEN, (filp->f_flags & ~O_ACCMODE) | filp->f_mode); -+ -+ err = revalidate_it(dentry, &it); -+ if (err) -+ goto out_nfserr; -+ -+ filp->f_it = ⁢ -+ - err = 0; - if (filp->f_op && filp->f_op->open) { - err = filp->f_op->open(inode, filp); -@@ -491,7 +625,11 @@ - atomic_dec(&filp->f_count); - } - } -+ - out_nfserr: -+ if (it.it_op_release) -+ intent_release(&it); -+ - if (err) - err = nfserrno(err); - out: -@@ -822,7 +960,7 @@ - { - struct dentry *dentry, *dchild; - struct inode *dirp; -- int err; -+ int err, error = -EOPNOTSUPP; - - err = nfserr_perm; - if (!flen) -@@ -838,20 +976,44 @@ - dentry = fhp->fh_dentry; - dirp = dentry->d_inode; - -+ switch (type) { -+ case S_IFDIR: -+ if (dirp->i_op->mkdir_raw) -+ error = mkdir_raw(dentry, fname, flen, iap->ia_mode); -+ break; -+ case S_IFCHR: -+ case S_IFBLK: -+ case S_IFIFO: -+ case S_IFSOCK: -+ case S_IFREG: -+ if (dirp->i_op->mknod_raw) { -+ if (type == S_IFREG) -+ rdev = 0; -+ error = mknod_raw(dentry, fname, flen, iap->ia_mode, rdev); -+ } -+ break; -+ default: -+ printk("nfsd: bad file type %o in nfsd_create\n", type); -+ } -+ - err = nfserr_notdir; -- if(!dirp->i_op || !dirp->i_op->lookup) -+ if(!dirp->i_op || !(dirp->i_op->lookup || dirp->i_op->lookup_it)) - goto out; - /* - * Check whether the response file handle has been verified yet. 
- * If it has, the parent directory should already be locked. - */ -- if (!resfhp->fh_dentry) { -- /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ -- fh_lock(fhp); -+ if (!resfhp->fh_dentry || dirp->i_op->lookup_it) { -+ /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create -+ and nfsd_proc_create in case of lustre -+ */ -+ if (!resfhp->fh_dentry) -+ fh_lock(fhp); - dchild = lookup_one_len(fname, dentry, flen); - err = PTR_ERR(dchild); - if (IS_ERR(dchild)) - goto out_nfserr; -+ resfhp->fh_dentry = NULL; - err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); - if (err) - goto out; -@@ -872,10 +1034,12 @@ - * Make sure the child dentry is still negative ... - */ - err = nfserr_exist; -- if (dchild->d_inode) { -- dprintk("nfsd_create: dentry %s/%s not negative!\n", -- dentry->d_name.name, dchild->d_name.name); -- goto out; -+ if ( error == -EOPNOTSUPP) { -+ if (dchild->d_inode) { -+ dprintk("nfsd_create: dentry %s/%s not negative!\n", -+ dentry->d_name.name, dchild->d_name.name); -+ goto out; -+ } - } - - if (!(iap->ia_valid & ATTR_MODE)) -@@ -888,16 +1052,19 @@ - err = nfserr_perm; - switch (type) { - case S_IFREG: -- err = vfs_create(dirp, dchild, iap->ia_mode); -+ if (error == -EOPNOTSUPP) -+ err = vfs_create(dirp, dchild, iap->ia_mode); - break; - case S_IFDIR: -- err = vfs_mkdir(dirp, dchild, iap->ia_mode); -+ if (error == -EOPNOTSUPP) -+ err = vfs_mkdir(dirp, dchild, iap->ia_mode); - break; - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: -- err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); -+ if (error == -EOPNOTSUPP) -+ err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); - break; - default: - printk("nfsd: bad file type %o in nfsd_create\n", type); -@@ -966,7 +1133,13 @@ - /* Get all the sanity checks out of the way before - * we lock the parent. 
*/ - err = nfserr_notdir; -- if(!dirp->i_op || !dirp->i_op->lookup) -+ if (dirp->i_op->mknod_raw) { -+ err = mknod_raw(dentry, fname, flen, iap->ia_mode, 0); -+ if (err && err != -EOPNOTSUPP) -+ goto out; -+ } -+ -+ if(!dirp->i_op || !(dirp->i_op->lookup || dirp->i_op->lookup_it)) - goto out; - fh_lock(fhp); - -@@ -1017,6 +1190,8 @@ - case NFS3_CREATE_GUARDED: - err = nfserr_exist; - } -+ if(dirp->i_op->mknod_raw) -+ err = 0; - goto out; - } - -@@ -1123,7 +1298,7 @@ - struct iattr *iap) - { - struct dentry *dentry, *dnew; -- int err, cerr; -+ int err, cerr, error = -EOPNOTSUPP; - - err = nfserr_noent; - if (!flen || !plen) -@@ -1137,12 +1312,18 @@ - goto out; - fh_lock(fhp); - dentry = fhp->fh_dentry; -+ -+ if (dentry->d_inode->i_op->symlink_raw) -+ error = symlink_raw(dentry, fname, flen, path); -+ - dnew = lookup_one_len(fname, dentry, flen); - err = PTR_ERR(dnew); - if (IS_ERR(dnew)) - goto out_nfserr; - -- err = vfs_symlink(dentry->d_inode, dnew, path); -+ err = error; -+ if (err == -EOPNOTSUPP || !dentry->d_inode->i_op->symlink_raw) -+ err = vfs_symlink(dentry->d_inode, dnew, path); - if (!err) { - if (EX_ISSYNC(fhp->fh_export)) - nfsd_sync_dir(dentry); -@@ -1152,7 +1333,10 @@ - iap->ia_valid |= ATTR_CTIME; - iap->ia_mode = (iap->ia_mode&S_IALLUGO) - | S_IFLNK; -- err = notify_change(dnew, iap); -+ if (dnew->d_inode->i_op && dnew->d_inode->i_op->setattr_raw) -+ err = setattr_raw(dnew->d_inode, iap); -+ else -+ err = notify_change(dnew, iap); - if (!err && EX_ISSYNC(fhp->fh_export)) - write_inode_now(dentry->d_inode, 1); - } -@@ -1210,7 +1394,10 @@ - dold = tfhp->fh_dentry; - dest = dold->d_inode; - -- err = vfs_link(dold, dirp, dnew); -+ if (dirp->i_op->link_raw) -+ err = link_raw(dold, ddir, dnew); -+ else -+ err = vfs_link(dold, dirp, dnew); - if (!err) { - if (EX_ISSYNC(ffhp->fh_export)) { - nfsd_sync_dir(ddir); -@@ -1295,7 +1482,10 @@ - err = nfserr_perm; - } else - #endif -- err = vfs_rename(fdir, odentry, tdir, ndentry); -+ if(fdir->i_op->rename_raw) -+ 
err = rename_raw(fdentry, tdentry, odentry, ndentry); -+ else -+ err = vfs_rename(fdir, odentry, tdir, ndentry); - if (!err && EX_ISSYNC(tfhp->fh_export)) { - nfsd_sync_dir(tdentry); - nfsd_sync_dir(fdentry); -@@ -1316,7 +1506,7 @@ - fill_post_wcc(tfhp); - double_up(&tdir->i_sem, &fdir->i_sem); - ffhp->fh_locked = tfhp->fh_locked = 0; -- -+ - out: - return err; - } -@@ -1362,9 +1552,15 @@ - err = nfserr_perm; - } else - #endif -- err = vfs_unlink(dirp, rdentry); -+ if (dirp->i_op->unlink_raw) -+ err = unlink_raw(dentry, fname, flen, rdentry); -+ else -+ err = vfs_unlink(dirp, rdentry); - } else { /* It's RMDIR */ -- err = vfs_rmdir(dirp, rdentry); -+ if (dirp->i_op->rmdir_raw) -+ err = rmdir_raw(dentry, fname, flen, rdentry); -+ else -+ err = vfs_rmdir(dirp, rdentry); - } - - dput(rdentry); -Index: linux-bgl/fs/nfsd/nfsfh.c -=================================================================== ---- linux-bgl.orig/fs/nfsd/nfsfh.c 2003-07-02 08:44:08.000000000 -0700 -+++ linux-bgl/fs/nfsd/nfsfh.c 2004-12-28 17:13:59.942919514 -0800 -@@ -36,6 +36,15 @@ - int sequence; /* sequence counter */ - }; - -+static struct dentry *lookup_it(struct inode *inode, struct dentry * dentry) -+{ -+ if (inode->i_op->lookup_it) -+ return inode->i_op->lookup_it(inode, dentry, NULL, 0); -+ else -+ return inode->i_op->lookup(inode, dentry); -+ -+} -+ - /* - * A rather strange filldir function to capture - * the name matching the specified inode number. -@@ -75,6 +84,8 @@ - int error; - struct file file; - struct nfsd_getdents_callback buffer; -+ struct lookup_intent it; -+ struct file *filp = NULL; - - error = -ENOTDIR; - if (!dir || !S_ISDIR(dir->i_mode)) -@@ -85,9 +96,37 @@ - /* - * Open the directory ... 
- */ -- error = init_private_file(&file, dentry, FMODE_READ); -- if (error) -+ if (dentry->d_op && dentry->d_op->d_revalidate_it) { -+ if ((dentry->d_flags & DCACHE_NFSD_DISCONNECTED) && -+ (dentry->d_parent == dentry) ) { -+ it.it_op_release = NULL; -+ /* -+ * XXX Temporary Hack: Simulating init_private_file without -+ * f_op->open for disconnected dentry Since we don't have actual -+ * dentry->d_name to revalidate in revalidate_it() -+ */ -+ filp = &file; -+ memset(filp, 0, sizeof(*filp)); -+ filp->f_mode = FMODE_READ; -+ atomic_set(&filp->f_count, 1); -+ filp->f_dentry = dentry; -+ filp->f_uid = current->fsuid; -+ filp->f_gid = current->fsgid; -+ filp->f_op = dentry->d_inode->i_fop; -+ error = 0; -+ } else { -+ intent_init(&it, IT_OPEN, 0); -+ error = revalidate_it(dentry, &it); -+ if (error) -+ goto out; -+ error = init_private_file_it(&file, dentry, FMODE_READ, &it); -+ } -+ } else { -+ error = init_private_file_it(&file, dentry, FMODE_READ, NULL); -+ } -+ if (error) - goto out; -+ - error = -EINVAL; - if (!file.f_op->readdir) - goto out_close; -@@ -113,9 +152,13 @@ - } - - out_close: -- if (file.f_op->release) -+ if (file.f_op->release && !filp) - file.f_op->release(dir, &file); - out: -+ if (dentry->d_op && -+ dentry->d_op->d_revalidate_it && -+ it.it_op_release && !filp) -+ intent_release(&it); - return error; - } - -@@ -273,7 +316,7 @@ - /* I'm going to assume that if the returned dentry is different, then - * it is well connected. But nobody returns different dentrys do they? - */ -- pdentry = child->d_inode->i_op->lookup(child->d_inode, tdentry); -+ pdentry = lookup_it(child->d_inode, tdentry); - d_drop(tdentry); /* we never want ".." hashed */ - if (!pdentry && tdentry->d_inode == NULL) { - /* File system cannot find ".." ... 
sad but possible */ -@@ -304,6 +347,8 @@ - igrab(tdentry->d_inode); - pdentry->d_flags |= DCACHE_NFSD_DISCONNECTED; - } -+ if (child->d_op && child->d_op->d_revalidate_it) -+ pdentry->d_op = child->d_op; - } - if (pdentry == NULL) - pdentry = ERR_PTR(-ENOMEM); -@@ -461,6 +506,8 @@ - struct dentry *pdentry; - struct inode *parent; - -+ if (result->d_op && result->d_op->d_revalidate_it) -+ dentry->d_op = result->d_op; - pdentry = nfsd_findparent(dentry); - err = PTR_ERR(pdentry); - if (IS_ERR(pdentry)) -@@ -648,6 +695,11 @@ - - inode = dentry->d_inode; - -+ /* cache coherency for non-device filesystems */ -+ if (inode->i_op && inode->i_op->revalidate_it) { -+ inode->i_op->revalidate_it(dentry, NULL); -+ } -+ - /* Type check. The correct error return for type mismatches - * does not seem to be generally agreed upon. SunOS seems to - * use EISDIR if file isn't S_IFREG; a comment in the NFSv3 -@@ -878,8 +930,9 @@ - dentry->d_parent->d_name.name, dentry->d_name.name); - goto out; - out_uptodate: -- printk(KERN_ERR "fh_update: %s/%s already up-to-date!\n", -- dentry->d_parent->d_name.name, dentry->d_name.name); -+ if(!dentry->d_parent->d_inode->i_op->mkdir_raw) -+ printk(KERN_ERR "fh_update: %s/%s already up-to-date!\n", -+ dentry->d_parent->d_name.name, dentry->d_name.name); - goto out; - } - -Index: linux-bgl/fs/Makefile -=================================================================== ---- linux-bgl.orig/fs/Makefile 2004-12-28 17:13:56.898868625 -0800 -+++ linux-bgl/fs/Makefile 2004-12-28 17:13:59.943919356 -0800 -@@ -7,7 +7,8 @@ - - O_TARGET := fs.o - --export-objs := filesystems.o open.o dcache.o buffer.o inode.o -+export-objs := filesystems.o open.o dcache.o buffer.o inode.o namei.o \ -+ file_table.o - mod-subdirs := nls - - obj-y := open.o read_write.o devices.o file_table.o buffer.o \ -Index: linux-bgl/fs/namei.c -=================================================================== ---- linux-bgl.orig/fs/namei.c 2004-12-28 17:13:56.265835195 -0800 -+++ 
linux-bgl/fs/namei.c 2004-12-28 17:13:59.947918720 -0800 -@@ -22,6 +22,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -100,6 +101,7 @@ - it->it_op_release(it); - - } -+EXPORT_SYMBOL(intent_release); - - /* In order to reduce some races, while at the same time doing additional - * checking and hopefully speeding things up, we copy filenames to the -@@ -889,7 +891,8 @@ - - - /* SMP-safe */ --struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) -+struct dentry * lookup_one_len_it(const char * name, struct dentry * base, -+ int len, struct lookup_intent *it) - { - unsigned long hash; - struct qstr this; -@@ -909,11 +912,16 @@ - } - this.hash = end_name_hash(hash); - -- return lookup_hash_it(&this, base, NULL); -+ return lookup_hash_it(&this, base, it); - access: - return ERR_PTR(-EACCES); - } - -+struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) -+{ -+ return lookup_one_len_it(name, base, len, NULL); -+} -+ - /* - * namei() - * -Index: linux-bgl/fs/file_table.c -=================================================================== ---- linux-bgl.orig/fs/file_table.c 2003-07-02 08:44:42.000000000 -0700 -+++ linux-bgl/fs/file_table.c 2004-12-28 17:13:59.948918562 -0800 -@@ -82,7 +82,8 @@ - * and call the open function (if any). The caller must verify that - * inode->i_fop is not NULL. 
- */ --int init_private_file(struct file *filp, struct dentry *dentry, int mode) -+int init_private_file_it(struct file *filp, struct dentry *dentry, int mode, -+ struct lookup_intent *it) - { - memset(filp, 0, sizeof(*filp)); - filp->f_mode = mode; -@@ -90,12 +91,20 @@ - filp->f_dentry = dentry; - filp->f_uid = current->fsuid; - filp->f_gid = current->fsgid; -+ if (it) -+ filp->f_it = it; - filp->f_op = dentry->d_inode->i_fop; - if (filp->f_op->open) - return filp->f_op->open(dentry->d_inode, filp); - else - return 0; - } -+EXPORT_SYMBOL(init_private_file_it); -+ -+int init_private_file(struct file *filp, struct dentry *dentry, int mode) -+{ -+ return init_private_file_it(filp, dentry, mode, NULL); -+} - - void fput(struct file * file) - { -Index: linux-bgl/fs/inode.c -=================================================================== ---- linux-bgl.orig/fs/inode.c 2004-12-28 17:13:56.635910389 -0800 -+++ linux-bgl/fs/inode.c 2004-12-28 17:13:59.950918244 -0800 -@@ -971,9 +971,10 @@ - } - - --struct inode *iget4(struct super_block *sb, unsigned long ino, find_inode_t find_actor, void *opaque) -+static inline struct inode *ifind(struct super_block *sb, unsigned long ino, -+ struct list_head *head, -+ find_inode_t find_actor, void *opaque) - { -- struct list_head * head = inode_hashtable + hash(sb,ino); - struct inode * inode; - - spin_lock(&inode_lock); -@@ -986,6 +987,24 @@ - } - spin_unlock(&inode_lock); - -+ return NULL; -+} -+ -+struct inode *ilookup4(struct super_block *sb, unsigned long ino, -+ find_inode_t find_actor, void *opaque) -+{ -+ struct list_head * head = inode_hashtable + hash(sb,ino); -+ return ifind(sb, ino, head, find_actor, opaque); -+} -+ -+struct inode *iget4(struct super_block *sb, unsigned long ino, -+ find_inode_t find_actor, void *opaque) -+{ -+ struct list_head * head = inode_hashtable + hash(sb,ino); -+ struct inode *inode = ifind(sb, ino, head, find_actor, opaque); -+ if (inode) -+ return inode; -+ - /* - * get_new_inode() will do the 
right thing, re-trying the search - * in case it had to block at any point. -Index: linux-bgl/kernel/ksyms.c -=================================================================== ---- linux-bgl.orig/kernel/ksyms.c 2004-12-28 17:13:56.978855920 -0800 -+++ linux-bgl/kernel/ksyms.c 2004-12-28 17:13:59.951918085 -0800 -@@ -142,6 +142,7 @@ - EXPORT_SYMBOL(igrab); - EXPORT_SYMBOL(iunique); - EXPORT_SYMBOL(iget4); -+EXPORT_SYMBOL(ilookup4); - EXPORT_SYMBOL(iput); - EXPORT_SYMBOL(force_delete); - EXPORT_SYMBOL(follow_up); -@@ -152,6 +153,7 @@ - EXPORT_SYMBOL(path_release); - EXPORT_SYMBOL(__user_walk); - EXPORT_SYMBOL(lookup_one_len); -+EXPORT_SYMBOL(lookup_one_len_it); - EXPORT_SYMBOL(lookup_hash); - EXPORT_SYMBOL(sys_close); - EXPORT_SYMBOL(dcache_lock); -Index: linux-bgl/include/linux/fs.h -=================================================================== ---- linux-bgl.orig/include/linux/fs.h 2004-12-28 17:13:59.471860200 -0800 -+++ linux-bgl/include/linux/fs.h 2004-12-28 17:13:59.955917450 -0800 -@@ -93,6 +93,9 @@ - #define FS_SINGLE 8 /* Filesystem that can have only one superblock */ - #define FS_NOMOUNT 16 /* Never mount from userland */ - #define FS_LITTER 32 /* Keeps the tree in dcache */ -+#define FS_NFSEXP_FSID 64 /* Use file system specific fsid for -+ * exporting non device filesystems. 
-+ */ - #define FS_ODD_RENAME 32768 /* Temporary stuff; will go away as soon - * as nfs_rename() will be cleaned up - */ -@@ -1149,6 +1152,9 @@ - struct nameidata *nd, struct lookup_intent *it); - extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, - int flags, struct lookup_intent *it); -+extern int revalidate_it(struct dentry *dentry, struct lookup_intent *it); -+extern int init_private_file_it(struct file *, struct dentry *dentry, int mode, -+ struct lookup_intent *it); - extern int filp_close(struct file *, fl_owner_t id); - extern char * getname(const char *); - -@@ -1418,6 +1424,8 @@ - extern int follow_down(struct vfsmount **, struct dentry **); - extern int follow_up(struct vfsmount **, struct dentry **); - extern struct dentry * lookup_one_len(const char *, struct dentry *, int); -+extern struct dentry * lookup_one_len_it(const char *, struct dentry *, int, -+ struct lookup_intent *); - extern struct dentry * lookup_hash(struct qstr *, struct dentry *); - #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) - #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) -@@ -1431,6 +1439,8 @@ - - typedef int (*find_inode_t)(struct inode *, unsigned long, void *); - extern struct inode * iget4(struct super_block *, unsigned long, find_inode_t, void *); -+extern struct inode * ilookup4(struct super_block *, unsigned long, -+ find_inode_t, void *); - static inline struct inode *iget(struct super_block *sb, unsigned long ino) - { - return iget4(sb, ino, NULL, NULL); diff --git a/lustre/kernel_patches/patches/nfs_export_kernel-2.4.20-hp.patch b/lustre/kernel_patches/patches/nfs_export_kernel-2.4.20-hp.patch index 13c7cd7..3768247 100644 --- a/lustre/kernel_patches/patches/nfs_export_kernel-2.4.20-hp.patch +++ b/lustre/kernel_patches/patches/nfs_export_kernel-2.4.20-hp.patch @@ -285,7 +285,7 @@ --- linux-2.4.20-hp4-pnnl13/fs/nfsd/vfs.c~nfs_export_kernel-2.4.20-hp 2002-11-29 
02:53:15.000000000 +0300 +++ linux-2.4.20-hp4-pnnl13-alexey/fs/nfsd/vfs.c 2003-10-08 10:54:08.000000000 +0400 -@@ -77,6 +77,129 @@ struct raparms { +@@ -77,6 +77,130 @@ struct raparms { static struct raparms * raparml; static struct raparms * raparm_cache; @@ -298,6 +298,7 @@ + struct nameidata nd = { .dentry = ddir, .last = dnew->d_name }; + struct inode_operations *op = nd.dentry->d_inode->i_op; + err = op->link_raw(&old_nd, &nd); ++ igrab(dold->d_inode); + d_instantiate(dnew, dold->d_inode); + if (dold->d_inode->i_op && dold->d_inode->i_op->revalidate_it) + dold->d_inode->i_op->revalidate_it(dnew, NULL); diff --git a/lustre/kernel_patches/patches/nfs_export_kernel-2.4.21-chaos.patch b/lustre/kernel_patches/patches/nfs_export_kernel-2.4.21-chaos.patch index 70c2e99..f1d4f5b 100644 --- a/lustre/kernel_patches/patches/nfs_export_kernel-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/nfs_export_kernel-2.4.21-chaos.patch @@ -299,7 +299,7 @@ Index: linux-2.4.21-chaos/fs/nfsd/vfs.c =================================================================== --- linux-2.4.21-chaos.orig/fs/nfsd/vfs.c 2003-09-19 03:49:54.000000000 +0400 +++ linux-2.4.21-chaos/fs/nfsd/vfs.c 2003-12-12 16:19:25.000000000 +0300 -@@ -78,6 +78,126 @@ +@@ -78,6 +78,127 @@ static struct raparms * raparml; static struct raparms * raparm_cache; @@ -312,6 +312,7 @@ Index: linux-2.4.21-chaos/fs/nfsd/vfs.c + struct nameidata nd = { .dentry = ddir, .last = dnew->d_name }; + struct inode_operations *op = nd.dentry->d_inode->i_op; + err = op->link_raw(&old_nd, &nd); ++ igrab(dold->d_inode); + d_instantiate(dnew, dold->d_inode); + if (dold->d_inode->i_op && dold->d_inode->i_op->revalidate_it) + dold->d_inode->i_op->revalidate_it(dnew, NULL); diff --git a/lustre/kernel_patches/patches/nfs_export_kernel-2.4.21-suse2.patch b/lustre/kernel_patches/patches/nfs_export_kernel-2.4.21-suse2.patch index 3da14fe..c430b8f 100644 --- a/lustre/kernel_patches/patches/nfs_export_kernel-2.4.21-suse2.patch +++ 
b/lustre/kernel_patches/patches/nfs_export_kernel-2.4.21-suse2.patch @@ -299,7 +299,7 @@ Index: linux-2.4.21-chaos/fs/nfsd/vfs.c =================================================================== --- linux-2.4.21-chaos.orig/fs/nfsd/vfs.c 2003-09-19 03:49:54.000000000 +0400 +++ linux-2.4.21-chaos/fs/nfsd/vfs.c 2003-12-12 16:19:25.000000000 +0300 -@@ -78,6 +78,126 @@ +@@ -78,6 +78,127 @@ static struct raparms * raparml; static struct raparms * raparm_cache; @@ -312,6 +312,7 @@ Index: linux-2.4.21-chaos/fs/nfsd/vfs.c + struct nameidata nd = { .dentry = ddir, .last = dnew->d_name }; + struct inode_operations *op = nd.dentry->d_inode->i_op; + err = op->link_raw(&old_nd, &nd); ++ igrab(dold->d_inode); + d_instantiate(dnew, dold->d_inode); + if (dold->d_inode->i_op && dold->d_inode->i_op->revalidate_it) + dold->d_inode->i_op->revalidate_it(dnew, NULL); diff --git a/lustre/kernel_patches/patches/nfs_export_kernel-2.4.22.patch b/lustre/kernel_patches/patches/nfs_export_kernel-2.4.22.patch index e573457..43a11b6 100644 --- a/lustre/kernel_patches/patches/nfs_export_kernel-2.4.22.patch +++ b/lustre/kernel_patches/patches/nfs_export_kernel-2.4.22.patch @@ -288,7 +288,7 @@ Index: linux-2.4.22-vanilla/fs/nfsd/vfs.c =================================================================== --- linux-2.4.22-vanilla.orig/fs/nfsd/vfs.c 2003-11-03 23:22:11.000000000 +0300 +++ linux-2.4.22-vanilla/fs/nfsd/vfs.c 2003-11-03 23:47:41.000000000 +0300 -@@ -77,6 +77,126 @@ +@@ -77,6 +77,127 @@ static struct raparms * raparml; static struct raparms * raparm_cache; @@ -301,6 +301,7 @@ Index: linux-2.4.22-vanilla/fs/nfsd/vfs.c + struct nameidata nd = { .dentry = ddir, .last = dnew->d_name }; + struct inode_operations *op = nd.dentry->d_inode->i_op; + err = op->link_raw(&old_nd, &nd); ++ igrab(dold->d_inode); + d_instantiate(dnew, dold->d_inode); + if (dold->d_inode->i_op && dold->d_inode->i_op->revalidate_it) + dold->d_inode->i_op->revalidate_it(dnew, NULL); diff --git 
a/lustre/kernel_patches/patches/nfs_export_kernel-2.4.29.patch b/lustre/kernel_patches/patches/nfs_export_kernel-2.4.29.patch index 4708b75..ed47b6a 100644 --- a/lustre/kernel_patches/patches/nfs_export_kernel-2.4.29.patch +++ b/lustre/kernel_patches/patches/nfs_export_kernel-2.4.29.patch @@ -287,7 +287,7 @@ Index: linux-2.4.29/fs/nfsd/vfs.c =================================================================== --- linux-2.4.29.orig/fs/nfsd/vfs.c 2005-05-03 16:28:21.000000000 +0300 +++ linux-2.4.29/fs/nfsd/vfs.c 2005-05-03 18:46:09.372133224 +0300 -@@ -77,6 +77,126 @@ +@@ -77,6 +77,127 @@ static struct raparms * raparml; static struct raparms * raparm_cache; @@ -300,6 +300,7 @@ Index: linux-2.4.29/fs/nfsd/vfs.c + struct nameidata nd = { .dentry = ddir, .last = dnew->d_name }; + struct inode_operations *op = nd.dentry->d_inode->i_op; + err = op->link_raw(&old_nd, &nd); ++ igrab(dold->d_inode); + d_instantiate(dnew, dold->d_inode); + if (dold->d_inode->i_op && dold->d_inode->i_op->revalidate_it) + dold->d_inode->i_op->revalidate_it(dnew, NULL); diff --git a/lustre/kernel_patches/patches/qsnet-rhel-2.4.patch b/lustre/kernel_patches/patches/qsnet-rhel-2.4.patch index 40b7c82..c0d43e8 100644 --- a/lustre/kernel_patches/patches/qsnet-rhel-2.4.patch +++ b/lustre/kernel_patches/patches/qsnet-rhel-2.4.patch @@ -20,9 +20,9 @@ Index: linux-2.4.21/arch/ia64/kernel/ia64_ksyms.c --- linux-2.4.21.orig/arch/ia64/kernel/ia64_ksyms.c 2005-06-01 22:51:59.000000000 -0400 +++ linux-2.4.21/arch/ia64/kernel/ia64_ksyms.c 2005-06-01 23:14:43.773842072 -0400 @@ -207,3 +207,13 @@ - EXPORT_SYMBOL_GPL(show_mem); EXPORT_SYMBOL_GPL(show_state); EXPORT_SYMBOL_GPL(show_regs); + EXPORT_SYMBOL(pm_power_off); + +#define __KERNEL_SYSCALLS__ 1 +#include @@ -93660,14 +93660,14 @@ Index: linux-2.4.21/mm/mmap.c #include #include #include -@@ -1459,6 +1460,7 @@ - mm->total_vm = 0; - mm->locked_vm = 0; - +@@ -1450,6 +1451,7 @@ + release_segments(mm); + + spin_lock(&mm->page_table_lock); + coproc_release(mm); - 
flush_cache_mm(mm); - while (mpnt) { - struct vm_area_struct * next = mpnt->vm_next; + mpnt = mm->mmap; + mm->mmap = mm->mmap_cache = NULL; + mm->mm_rb = RB_ROOT; Index: linux-2.4.21/mm/mprotect.c =================================================================== --- linux-2.4.21.orig/mm/mprotect.c 2005-06-01 22:51:50.000000000 -0400 diff --git a/lustre/kernel_patches/patches/qsnet-rhel4-2.6.patch b/lustre/kernel_patches/patches/qsnet-rhel4-2.6.patch index f0d32bf..a17f058 100644 --- a/lustre/kernel_patches/patches/qsnet-rhel4-2.6.patch +++ b/lustre/kernel_patches/patches/qsnet-rhel4-2.6.patch @@ -96982,9 +96982,9 @@ diff -urN clean/ipc/shm.c linux-2.6.9/ipc/shm.c #include #include +#include + #include #include - #include "util.h" @@ -850,6 +851,44 @@ return retval; } @@ -97514,22 +97514,22 @@ diff -urN clean/mm/memory.c linux-2.6.9/mm/memory.c spin_unlock(&mm->page_table_lock); out: return ret; -@@ -1552,6 +1562,7 @@ +@@ -1555,6 +1565,7 @@ /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); + ioproc_update_page(vma, addr); + lazy_mmu_prot_update(entry); spin_unlock(&mm->page_table_lock); out: - return VM_FAULT_MINOR; -@@ -1669,6 +1680,7 @@ +@@ -1673,6 +1684,7 @@ /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); + ioproc_update_page(vma, address); + lazy_mmu_prot_update(entry); spin_unlock(&mm->page_table_lock); out: - return ret; @@ -1853,6 +1865,7 @@ return ret; return ret == len ? 
0 : -1; diff --git a/lustre/kernel_patches/patches/removepage-2.4.19-suse.patch b/lustre/kernel_patches/patches/removepage-2.4.19-suse.patch deleted file mode 100644 index 4602f96..0000000 --- a/lustre/kernel_patches/patches/removepage-2.4.19-suse.patch +++ /dev/null @@ -1,30 +0,0 @@ - include/linux/fs.h | 1 + - mm/filemap.c | 3 +++ - 2 files changed, 4 insertions(+) - -Index: linux-2.4.19.SuSE/include/linux/fs.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/fs.h Sun Nov 16 00:40:59 2003 -+++ linux-2.4.19.SuSE/include/linux/fs.h Sun Nov 16 01:38:06 2003 -@@ -428,6 +428,7 @@ - int (*releasepage) (struct page *, int); - #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */ - int (*direct_IO)(int, struct file *, struct kiobuf *, unsigned long, int); -+ void (*removepage)(struct page *); /* called when page gets removed from the inode */ - }; - - struct address_space { -Index: linux-2.4.19.SuSE/mm/filemap.c -=================================================================== ---- linux-2.4.19.SuSE.orig/mm/filemap.c Sat Nov 15 18:02:15 2003 -+++ linux-2.4.19.SuSE/mm/filemap.c Sun Nov 16 01:37:11 2003 -@@ -97,6 +97,9 @@ - { - struct address_space * mapping = page->mapping; - -+ if (mapping->a_ops->removepage) -+ mapping->a_ops->removepage(page); -+ - mapping->nrpages--; - list_del(&page->list); - page->mapping = NULL; diff --git a/lustre/kernel_patches/patches/resched-2.4.19-pre1.patch b/lustre/kernel_patches/patches/resched-2.4.19-pre1.patch deleted file mode 100644 index 567e1e8..0000000 --- a/lustre/kernel_patches/patches/resched-2.4.19-pre1.patch +++ /dev/null @@ -1,16 +0,0 @@ -Index: linux-2.4.19-pre1/include/linux/sched.h -=================================================================== ---- linux-2.4.19-pre1.orig/include/linux/sched.h 2003-11-21 04:05:05.000000000 +0300 -+++ linux-2.4.19-pre1/include/linux/sched.h 2003-11-21 04:10:29.000000000 +0300 -@@ -927,6 +927,11 @@ - return 
res; - } - -+static inline int need_resched(void) -+{ -+ return (unlikely(current->need_resched)); -+} -+ - #endif /* __KERNEL__ */ - - #endif diff --git a/lustre/kernel_patches/patches/small_scatterlist-2.4.21-rhel.patch b/lustre/kernel_patches/patches/small_scatterlist-2.4.21-rhel.patch index 381d490..41d9d8a 100644 --- a/lustre/kernel_patches/patches/small_scatterlist-2.4.21-rhel.patch +++ b/lustre/kernel_patches/patches/small_scatterlist-2.4.21-rhel.patch @@ -3179,7 +3179,7 @@ Index: linux-2.4.21/drivers/scsi/libata-core.c sg->page = virt_to_page(buf); sg->offset = (unsigned long) buf & ~PAGE_MASK; +#endif /* !SMALL_SCATTERLIST */ - sg_dma_len(sg) = buflen; + sg->length = buflen; } @@ -2297,8 +2302,13 @@ @@ -3197,9 +3197,9 @@ Index: linux-2.4.21/drivers/scsi/libata-core.c /* get the current page and offset */ page = nth_page(page, (offset >> PAGE_SHIFT)); @@ -2339,8 +2349,13 @@ + next_sg: sg = &qc->sg[qc->cursg]; - next_page: +#if SMALL_SCATTERLIST + page = sg->u.page.page; + offset = sg->u.page.offset + qc->cursg_ofs; diff --git a/lustre/kernel_patches/patches/socket-exports-2.4.19-bgl.patch b/lustre/kernel_patches/patches/socket-exports-2.4.19-bgl.patch deleted file mode 100644 index e60f473..0000000 --- a/lustre/kernel_patches/patches/socket-exports-2.4.19-bgl.patch +++ /dev/null @@ -1,46 +0,0 @@ - include/linux/socket.h | 4 ++++ - net/netsyms.c | 2 ++ - net/socket.c | 2 +- - 3 files changed, 7 insertions(+), 1 deletion(-) - -Index: linux-DRV401/include/linux/socket.h -=================================================================== ---- linux-DRV401.orig/include/linux/socket.h 2004-10-15 10:26:20.000000000 -0700 -+++ linux-DRV401/include/linux/socket.h 2004-10-15 11:11:09.000000000 -0700 -@@ -260,6 +260,10 @@ - extern int move_addr_to_user(void *kaddr, int klen, void *uaddr, int *ulen); - extern int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr); - extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); -+struct socket; 
-+extern int sock_map_fd(struct socket *sock); -+extern struct socket *sockfd_lookup(int fd, int *err); -+ - #endif - #endif /* not kernel and not glibc */ - #endif /* _LINUX_SOCKET_H */ -Index: linux-DRV401/net/netsyms.c -=================================================================== ---- linux-DRV401.orig/net/netsyms.c 2004-10-15 11:10:52.000000000 -0700 -+++ linux-DRV401/net/netsyms.c 2004-10-15 11:11:09.000000000 -0700 -@@ -159,6 +159,8 @@ - EXPORT_SYMBOL(put_cmsg); - EXPORT_SYMBOL(sock_kmalloc); - EXPORT_SYMBOL(sock_kfree_s); -+EXPORT_SYMBOL(sockfd_lookup); -+EXPORT_SYMBOL(sock_map_fd); - - #ifdef CONFIG_FILTER - EXPORT_SYMBOL(sk_run_filter); -Index: linux-DRV401/net/socket.c -=================================================================== ---- linux-DRV401.orig/net/socket.c 2004-10-15 10:24:16.000000000 -0700 -+++ linux-DRV401/net/socket.c 2004-10-15 11:11:09.000000000 -0700 -@@ -326,7 +326,7 @@ - * but we take care of internal coherence yet. - */ - --static int sock_map_fd(struct socket *sock) -+int sock_map_fd(struct socket *sock) - { - int fd; - struct qstr this; diff --git a/lustre/kernel_patches/patches/tcp-zero-copy-2.4.19-pre1.patch b/lustre/kernel_patches/patches/tcp-zero-copy-2.4.19-pre1.patch deleted file mode 100644 index bcd3f73..0000000 --- a/lustre/kernel_patches/patches/tcp-zero-copy-2.4.19-pre1.patch +++ /dev/null @@ -1,461 +0,0 @@ -Index: linux-2.4.19-pre1/include/linux/skbuff.h -=================================================================== ---- linux-2.4.19-pre1.orig/include/linux/skbuff.h 2001-11-22 22:46:26.000000000 +0300 -+++ linux-2.4.19-pre1/include/linux/skbuff.h 2004-01-14 01:15:13.000000000 +0300 -@@ -116,6 +116,30 @@ - __u16 size; - }; - -+/* Support for callback when skb data has been released */ -+typedef struct zccd /* Zero Copy Callback Descriptor */ -+{ /* (embed as first member of custom struct) */ -+ atomic_t zccd_count; /* reference count */ -+ void (*zccd_destructor)(struct zccd *); /* callback when refcount 
reaches zero */ -+} zccd_t; -+ -+static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *)) -+{ -+ atomic_set (&d->zccd_count, 1); -+ d->zccd_destructor = callback; -+} -+ -+static inline void zccd_get (zccd_t *d) /* take a reference */ -+{ -+ atomic_inc (&d->zccd_count); -+} -+ -+static inline void zccd_put (zccd_t *d) /* release a reference */ -+{ -+ if (atomic_dec_and_test (&d->zccd_count)) -+ (d->zccd_destructor)(d); -+} -+ - /* This data is invariant across clones and lives at - * the end of the header data, ie. at skb->end. - */ -@@ -123,6 +147,12 @@ - atomic_t dataref; - unsigned int nr_frags; - struct sk_buff *frag_list; -+ zccd_t *zccd; /* zero copy descriptor */ -+ zccd_t *zccd2; /* 2nd zero copy descriptor */ -+ /* NB we expect zero-copy data to be at least 1 packet, so -+ * having 2 zccds means we don't unneccessarily split the packet -+ * where consecutive zero-copy sends abutt. -+ */ - skb_frag_t frags[MAX_SKB_FRAGS]; - }; - -Index: linux-2.4.19-pre1/include/net/tcp.h -=================================================================== ---- linux-2.4.19-pre1.orig/include/net/tcp.h 2001-11-22 22:47:22.000000000 +0300 -+++ linux-2.4.19-pre1/include/net/tcp.h 2004-01-14 01:15:13.000000000 +0300 -@@ -640,6 +640,8 @@ - - extern int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size); - extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); -+extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, -+ int flags, zccd_t *zccd); - - extern int tcp_ioctl(struct sock *sk, - int cmd, -@@ -733,6 +735,9 @@ - struct msghdr *msg, - int len, int nonblock, - int flags, int *addr_len); -+extern int tcp_recvpackets(struct sock *sk, -+ struct sk_buff_head *packets, -+ int len, int nonblock); - - extern int tcp_listen_start(struct sock *sk); - -Index: linux-2.4.19-pre1/net/netsyms.c -=================================================================== ---- 
linux-2.4.19-pre1.orig/net/netsyms.c 2004-01-14 01:10:37.000000000 +0300 -+++ linux-2.4.19-pre1/net/netsyms.c 2004-01-14 01:15:54.000000000 +0300 -@@ -409,6 +409,9 @@ - - #endif - -+EXPORT_SYMBOL(tcp_sendpage_zccd); -+EXPORT_SYMBOL(tcp_recvpackets); -+ - EXPORT_SYMBOL(netlink_set_err); - EXPORT_SYMBOL(netlink_broadcast); - EXPORT_SYMBOL(netlink_unicast); -Index: linux-2.4.19-pre1/net/core/skbuff.c -=================================================================== ---- linux-2.4.19-pre1.orig/net/core/skbuff.c 2001-12-21 20:42:05.000000000 +0300 -+++ linux-2.4.19-pre1/net/core/skbuff.c 2004-01-14 01:15:13.000000000 +0300 -@@ -208,6 +208,8 @@ - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->frag_list = NULL; -+ skb_shinfo(skb)->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */ -+ skb_shinfo(skb)->zccd2 = NULL; - return skb; - - nodata: -@@ -276,6 +278,10 @@ - { - if (!skb->cloned || - atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) { -+ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */ -+ zccd_put (skb_shinfo(skb)->zccd); /* release hold */ -+ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */ -+ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */ - if (skb_shinfo(skb)->nr_frags) { - int i; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) -@@ -532,6 +538,8 @@ - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->frag_list = NULL; -+ skb_shinfo(skb)->zccd = NULL; /* copied data => no user zero copy descriptor */ -+ skb_shinfo(skb)->zccd2 = NULL; - - /* We are no longer a clone, even if we were. */ - skb->cloned = 0; -@@ -578,6 +586,14 @@ - n->data_len = skb->data_len; - n->len = skb->len; - -+ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? 
*/ -+ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */ -+ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd; -+ -+ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? */ -+ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */ -+ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; -+ - if (skb_shinfo(skb)->nr_frags) { - int i; - -@@ -620,6 +636,8 @@ - u8 *data; - int size = nhead + (skb->end - skb->head) + ntail; - long off; -+ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */ -+ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */ - - if (skb_shared(skb)) - BUG(); -@@ -641,6 +659,11 @@ - if (skb_shinfo(skb)->frag_list) - skb_clone_fraglist(skb); - -+ if (zccd != NULL) /* user zero copy descriptor? */ -+ zccd_get (zccd); /* extra ref (pages are shared) */ -+ if (zccd2 != NULL) /* 2nd user zero copy descriptor? */ -+ zccd_get (zccd2); /* extra ref (pages are shared) */ -+ - skb_release_data(skb); - - off = (data+nhead) - skb->head; -@@ -655,6 +678,8 @@ - skb->nh.raw += off; - skb->cloned = 0; - atomic_set(&skb_shinfo(skb)->dataref, 1); -+ skb_shinfo(skb)->zccd = zccd; -+ skb_shinfo(skb)->zccd2 = zccd2; - return 0; - - nodata: -Index: linux-2.4.19-pre1/net/ipv4/tcp.c -=================================================================== ---- linux-2.4.19-pre1.orig/net/ipv4/tcp.c 2001-12-21 20:42:05.000000000 +0300 -+++ linux-2.4.19-pre1/net/ipv4/tcp.c 2004-01-14 01:15:13.000000000 +0300 -@@ -744,7 +744,7 @@ - goto out; - } - --ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags); -+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd); - - static inline int - can_coalesce(struct sk_buff *skb, int i, struct page *page, int off) -@@ -823,7 +823,8 @@ - return err; - } - --ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int 
flags) -+/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */ -+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd) - { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int mss_now; -@@ -871,6 +872,17 @@ - copy = size; - - i = skb_shinfo(skb)->nr_frags; -+ -+ if (zccd != NULL && /* this is a zcc I/O */ -+ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */ -+ skb_shinfo(skb)->zccd2 != NULL && -+ skb_shinfo(skb)->zccd != zccd && /* not the same one */ -+ skb_shinfo(skb)->zccd2 != zccd) -+ { -+ tcp_mark_push (tp, skb); -+ goto new_segment; -+ } -+ - if (can_coalesce(skb, i, page, offset)) { - skb_shinfo(skb)->frags[i-1].size += copy; - } else if (i < MAX_SKB_FRAGS) { -@@ -881,6 +893,20 @@ - goto new_segment; - } - -+ if (zccd != NULL && /* this is a zcc I/O */ -+ skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */ -+ skb_shinfo(skb)->zccd2 != zccd) -+ { -+ zccd_get (zccd); /* bump ref count */ -+ -+ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); -+ -+ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */ -+ skb_shinfo(skb)->zccd = zccd; -+ else -+ skb_shinfo(skb)->zccd2 = zccd; -+ } -+ - skb->len += copy; - skb->data_len += copy; - skb->ip_summed = CHECKSUM_HW; -@@ -944,7 +970,31 @@ - - lock_sock(sk); - TCP_CHECK_TIMER(sk); -- res = do_tcp_sendpages(sk, &page, offset, size, flags); -+ res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL); -+ TCP_CHECK_TIMER(sk); -+ release_sock(sk); -+ return res; -+} -+ -+ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, -+ int flags, zccd_t *zccd) -+{ -+ ssize_t res; -+ struct sock *sk = sock->sk; -+ -+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) -+ -+ if (!(sk->route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */ -+ !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */ -+ BUG (); -+ -+#undef TCP_ZC_CSUM_FLAGS 
-+ -+ lock_sock(sk); -+ TCP_CHECK_TIMER(sk); -+ -+ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd); -+ - TCP_CHECK_TIMER(sk); - release_sock(sk); - return res; -@@ -1683,6 +1733,202 @@ - goto out; - } - -+int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets, -+ int len, int nonblock) -+{ -+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); -+ int copied; -+ long timeo; -+ -+ BUG_TRAP (len > 0); -+ /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/ -+ -+ lock_sock(sk); -+ -+ TCP_CHECK_TIMER(sk); -+ -+ copied = -ENOTCONN; -+ if (sk->state == TCP_LISTEN) -+ goto out; -+ -+ copied = 0; -+ timeo = sock_rcvtimeo(sk, nonblock); -+ -+ do { -+ struct sk_buff * skb; -+ u32 offset; -+ unsigned long used; -+ int exhausted; -+ int eaten; -+ -+ /* Are we at urgent data? Stop if we have read anything. */ -+ if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq) -+ break; -+ -+ /* We need to check signals first, to get correct SIGURG -+ * handling. FIXME: Need to check this doesnt impact 1003.1g -+ * and move it down to the bottom of the loop -+ */ -+ if (signal_pending(current)) { -+ if (copied) -+ break; -+ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; -+ break; -+ } -+ -+ /* Next get a buffer. */ -+ -+ skb = skb_peek(&sk->receive_queue); -+ -+ if (skb == NULL) /* nothing ready */ -+ { -+ if (copied) { -+ if (sk->err || -+ sk->state == TCP_CLOSE || -+ (sk->shutdown & RCV_SHUTDOWN) || -+ !timeo || -+ (0)) -+ break; -+ } else { -+ if (sk->done) -+ break; -+ -+ if (sk->err) { -+ copied = sock_error(sk); -+ break; -+ } -+ -+ if (sk->shutdown & RCV_SHUTDOWN) -+ break; -+ -+ if (sk->state == TCP_CLOSE) { -+ if (!sk->done) { -+ /* This occurs when user tries to read -+ * from never connected socket. 
-+ */ -+ copied = -ENOTCONN; -+ break; -+ } -+ break; -+ } -+ -+ if (!timeo) { -+ copied = -EAGAIN; -+ break; -+ } -+ } -+ -+ cleanup_rbuf(sk, copied); -+ timeo = tcp_data_wait(sk, timeo); -+ continue; -+ } -+ -+ BUG_TRAP (atomic_read (&skb->users) == 1); -+ -+ exhausted = eaten = 0; -+ -+ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; -+ if (skb->h.th->syn) -+ offset--; -+ -+ used = skb->len - offset; -+ -+ if (tp->urg_data) { -+ u32 urg_offset = tp->urg_seq - tp->copied_seq; -+ if (urg_offset < used) { -+ if (!urg_offset) { /* at urgent date */ -+ if (!sk->urginline) { -+ tp->copied_seq++; /* discard the single byte of urgent data */ -+ offset++; -+ used--; -+ } -+ } else /* truncate read */ -+ used = urg_offset; -+ } -+ } -+ -+ BUG_TRAP (used >= 0); -+ if (len < used) -+ used = len; -+ -+ if (used == 0) -+ exhausted = 1; -+ else -+ { -+ if (skb_is_nonlinear (skb)) -+ { -+ int rc = skb_linearize (skb, GFP_KERNEL); -+ -+ printk ("tcp_recvpackets(): linearising: %d\n", rc); -+ -+ if (rc) -+ { -+ if (!copied) -+ copied = rc; -+ break; -+ } -+ } -+ -+ if ((offset + used) == skb->len) /* consuming the whole packet */ -+ { -+ __skb_unlink (skb, &sk->receive_queue); -+ dst_release (skb->dst); -+ skb_orphan (skb); -+ __skb_pull (skb, offset); -+ __skb_queue_tail (packets, skb); -+ exhausted = eaten = 1; -+ } -+ else /* consuming only part of the packet */ -+ { -+ struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL); -+ -+ if (skb2 == NULL) -+ { -+ if (!copied) -+ copied = -ENOMEM; -+ break; -+ } -+ -+ dst_release (skb2->dst); -+ __skb_pull (skb2, offset); -+ __skb_trim (skb2, used); -+ __skb_queue_tail (packets, skb2); -+ } -+ -+ tp->copied_seq += used; -+ copied += used; -+ len -= used; -+ } -+ -+ if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) { -+ tp->urg_data = 0; -+ tcp_fast_path_check(sk, tp); -+ } -+ -+ if (!exhausted) -+ continue; -+ -+ if (skb->h.th->fin) -+ { -+ tp->copied_seq++; -+ if (!eaten) -+ tcp_eat_skb (sk, skb); -+ break; -+ } -+ -+ if (!eaten) -+ 
tcp_eat_skb (sk, skb); -+ -+ } while (len > 0); -+ -+ out: -+ /* Clean up data we have read: This will do ACK frames. */ -+ cleanup_rbuf(sk, copied); -+ TCP_CHECK_TIMER(sk); -+ release_sock(sk); -+ return copied; -+} -+ - /* - * State processing on a close. This implements the state shift for - * sending our FIN frame. Note that we only send a FIN for some diff --git a/lustre/kernel_patches/patches/uml-exprt-clearuser-2.6.12.patch b/lustre/kernel_patches/patches/uml-exprt-clearuser-2.6.12.patch new file mode 100644 index 0000000..381b03f --- /dev/null +++ b/lustre/kernel_patches/patches/uml-exprt-clearuser-2.6.12.patch @@ -0,0 +1,11 @@ +--- uml-2.4.24/arch/um/kernel/tt/ksyms.c.orig 2005-05-04 13:59:58.806659456 +0300 ++++ uml-2.4.24/arch/um/kernel/tt/ksyms.c 2005-05-04 14:00:18.358687096 +0300 +@@ -12,6 +12,8 @@ + EXPORT_SYMBOL(__do_strncpy_from_user); + EXPORT_SYMBOL(__do_strnlen_user); + EXPORT_SYMBOL(__do_clear_user); ++EXPORT_SYMBOL(clear_user_tt); ++EXPORT_SYMBOL(clear_user_skas); + + EXPORT_SYMBOL(tracing_pid); + EXPORT_SYMBOL(honeypot); diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.19-bgl.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.19-bgl.patch deleted file mode 100644 index eec0362..0000000 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.19-bgl.patch +++ /dev/null @@ -1,1849 +0,0 @@ - fs/dcache.c | 19 ++ - fs/exec.c | 17 +- - fs/namei.c | 295 +++++++++++++++++++++++++++++++++++++++------- - fs/namespace.c | 28 +++- - fs/open.c | 172 +++++++++++++++++++------- - fs/stat.c | 52 +++++--- - include/linux/dcache.h | 60 +++++++++ - include/linux/fs.h | 32 ++++ - include/linux/fs_struct.h | 4 - kernel/exit.c | 3 - kernel/fork.c | 3 - kernel/ksyms.c | 1 - 12 files changed, 558 insertions(+), 128 deletions(-) - -Index: linux.mcp2/fs/dcache.c -=================================================================== ---- linux.mcp2.orig/fs/dcache.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/dcache.c 2004-05-05 14:19:59.000000000 -0700 
-@@ -181,6 +181,13 @@ - spin_unlock(&dcache_lock); - return 0; - } -+ -+ /* network invalidation by Lustre */ -+ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { -+ spin_unlock(&dcache_lock); -+ return 0; -+ } -+ - /* - * Check whether to do a partial shrink_dcache - * to get rid of unused child entries. -@@ -830,13 +837,19 @@ - * Adds a dentry to the hash according to its name. - */ - --void d_rehash(struct dentry * entry) -+void __d_rehash(struct dentry * entry, int lock) - { - struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); - if (!list_empty(&entry->d_hash)) BUG(); -- spin_lock(&dcache_lock); -+ if (lock) spin_lock(&dcache_lock); - list_add(&entry->d_hash, list); -- spin_unlock(&dcache_lock); -+ if (lock) spin_unlock(&dcache_lock); -+} -+EXPORT_SYMBOL(__d_rehash); -+ -+void d_rehash(struct dentry * entry) -+{ -+ __d_rehash(entry, 1); - } - - #define do_switch(x,y) do { \ -Index: linux.mcp2/fs/exec.c -=================================================================== ---- linux.mcp2.orig/fs/exec.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/exec.c 2004-05-05 14:19:59.000000000 -0700 -@@ -107,8 +107,10 @@ - struct file * file; - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_OPEN, -+ .it_flags = FMODE_READ|FMODE_EXEC }; - -- error = user_path_walk(library, &nd); -+ error = user_path_walk_it(library, &nd, &it); - if (error) - goto out; - -@@ -120,7 +122,8 @@ - if (error) - goto exit; - -- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); -+ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); -+ intent_release(&it); - error = PTR_ERR(file); - if (IS_ERR(file)) - goto out; -@@ -342,9 +345,11 @@ - struct inode *inode; - struct file *file; - int err = 0; -+ struct lookup_intent it = { .it_op = IT_OPEN, -+ .it_flags = FMODE_READ|FMODE_EXEC }; - - if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd)) -- err = path_walk(name, &nd); -+ err = path_walk_it(name, &nd, &it); - file = ERR_PTR(err); - if (!err) 
{ - inode = nd.dentry->d_inode; -@@ -356,7 +361,8 @@ - err = -EACCES; - file = ERR_PTR(err); - if (!err) { -- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); -+ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); -+ intent_release(&it); - if (!IS_ERR(file)) { - err = deny_write_access(file); - if (err) { -@@ -368,6 +374,7 @@ - return file; - } - } -+ intent_release(&it); - path_release(&nd); - } - goto out; -@@ -969,7 +976,7 @@ - goto close_fail; - if (!file->f_op->write) - goto close_fail; -- if (do_truncate(file->f_dentry, 0) != 0) -+ if (do_truncate(file->f_dentry, 0, 0) != 0) - goto close_fail; - - retval = binfmt->core_dump(signr, regs, file); -Index: linux.mcp2/fs/namei.c -=================================================================== ---- linux.mcp2.orig/fs/namei.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/namei.c 2004-05-05 14:28:26.000000000 -0700 -@@ -94,6 +94,13 @@ - * XEmacs seems to be relying on it... - */ - -+void intent_release(struct lookup_intent *it) -+{ -+ if (it && it->it_op_release) -+ it->it_op_release(it); -+ -+} -+ - /* In order to reduce some races, while at the same time doing additional - * checking and hopefully speeding things up, we copy filenames to the - * kernel data space before using them.. -@@ -260,10 +267,19 @@ - * Internal lookup() using the new generic dcache. 
- * SMP-safe - */ --static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags) -+static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name, -+ int flags, struct lookup_intent *it) - { - struct dentry * dentry = d_lookup(parent, name); - -+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { -+ if (!dentry->d_op->d_revalidate_it(dentry, flags, it) && -+ !d_invalidate(dentry)) { -+ dput(dentry); -+ dentry = NULL; -+ } -+ return dentry; -+ } else - if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { - if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { - dput(dentry); -@@ -281,11 +297,15 @@ - * make sure that nobody added the entry to the dcache in the meantime.. - * SMP-safe - */ --static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags) -+static struct dentry *real_lookup(struct dentry *parent, struct qstr *name, -+ int flags, struct lookup_intent *it) - { - struct dentry * result; - struct inode *dir = parent->d_inode; -+ int counter = 0; - -+again: -+ counter++; - down(&dir->i_sem); - /* - * First re-do the cached lookup just in case it was created -@@ -300,6 +320,9 @@ - result = ERR_PTR(-ENOMEM); - if (dentry) { - lock_kernel(); -+ if (dir->i_op->lookup_it) -+ result = dir->i_op->lookup_it(dir, dentry, it, flags); -+ else - result = dir->i_op->lookup(dir, dentry); - unlock_kernel(); - if (result) -@@ -321,6 +344,15 @@ - dput(result); - result = ERR_PTR(-ENOENT); - } -+ } else if (result->d_op && result->d_op->d_revalidate_it) { -+ if (!result->d_op->d_revalidate_it(result, flags, it) && -+ !d_invalidate(result)) { -+ dput(result); -+ if (counter > 10) -+ result = ERR_PTR(-ESTALE); -+ if (!IS_ERR(result)) -+ goto again; -+ } - } - return result; - } -@@ -332,7 +364,8 @@ - * Without that kind of total limit, nasty chains of consecutive - * symlinks can cause almost arbitrarily long lookups. 
- */ --static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd) -+static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, -+ struct lookup_intent *it) - { - int err; - if (current->link_count >= 5) -@@ -346,10 +379,12 @@ - current->link_count++; - current->total_link_count++; - UPDATE_ATIME(dentry->d_inode); -+ nd->intent = it; - err = dentry->d_inode->i_op->follow_link(dentry, nd); - current->link_count--; - return err; - loop: -+ intent_release(it); - path_release(nd); - return -ELOOP; - } -@@ -447,7 +482,8 @@ - * - * We expect 'base' to be positive and a directory. - */ --int link_path_walk(const char * name, struct nameidata *nd) -+int link_path_walk_it(const char *name, struct nameidata *nd, -+ struct lookup_intent *it) - { - struct dentry *dentry; - struct inode *inode; -@@ -520,9 +556,10 @@ - break; - } - /* This does the actual lookups.. */ -- dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); - if (!dentry) { -- dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, -+ NULL); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - break; -@@ -540,7 +577,7 @@ - goto out_dput; - - if (inode->i_op->follow_link) { -- err = do_follow_link(dentry, nd); -+ err = do_follow_link(dentry, nd, NULL); - dput(dentry); - if (err) - goto return_err; -@@ -556,7 +593,7 @@ - nd->dentry = dentry; - } - err = -ENOTDIR; -- if (!inode->i_op->lookup) -+ if (!inode->i_op->lookup && !inode->i_op->lookup_it) - break; - continue; - /* here ends the main loop */ -@@ -583,9 +620,9 @@ - if (err < 0) - break; - } -- dentry = cached_lookup(nd->dentry, &this, 0); -+ dentry = cached_lookup(nd->dentry, &this, 0, it); - if (!dentry) { -- dentry = real_lookup(nd->dentry, &this, 0); -+ dentry = real_lookup(nd->dentry, &this, 0, it); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - break; -@@ -595,7 +632,7 @@ - inode = 
dentry->d_inode; - if ((lookup_flags & LOOKUP_FOLLOW) - && inode && inode->i_op && inode->i_op->follow_link) { -- err = do_follow_link(dentry, nd); -+ err = do_follow_link(dentry, nd, it); - dput(dentry); - if (err) - goto return_err; -@@ -609,7 +646,8 @@ - goto no_inode; - if (lookup_flags & LOOKUP_DIRECTORY) { - err = -ENOTDIR; -- if (!inode->i_op || !inode->i_op->lookup) -+ if (!inode->i_op || -+ (!inode->i_op->lookup && !inode->i_op->lookup_it)) - break; - } - goto return_base; -@@ -633,6 +671,34 @@ - * Check the cached dentry for staleness. - */ - dentry = nd->dentry; -+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { -+ err = -ESTALE; -+ if (!dentry->d_op->d_revalidate_it(dentry, 0, it)) { -+ struct dentry *new; -+ err = permission(dentry->d_parent->d_inode, -+ MAY_EXEC); -+ if (err) -+ break; -+ new = real_lookup(dentry->d_parent, -+ &dentry->d_name, 0, NULL); -+ if (IS_ERR(new)) { -+ err = PTR_ERR(new); -+ break; -+ } -+ d_invalidate(dentry); -+ dput(dentry); -+ nd->dentry = new; -+ } -+ if (!nd->dentry->d_inode) -+ goto no_inode; -+ if (lookup_flags & LOOKUP_DIRECTORY) { -+ err = -ENOTDIR; -+ if (!nd->dentry->d_inode->i_op || -+ (!nd->dentry->d_inode->i_op->lookup && -+ !nd->dentry->d_inode->i_op->lookup_it)) -+ break; -+ } -+ } else - if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { - err = -ESTALE; - if (!dentry->d_op->d_revalidate(dentry, 0)) { -@@ -646,15 +703,28 @@ - dput(dentry); - break; - } -+ if (err) -+ intent_release(it); - path_release(nd); - return_err: - return err; - } - -+int link_path_walk(const char * name, struct nameidata *nd) -+{ -+ return link_path_walk_it(name, nd, NULL); -+} -+ -+int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it) -+{ -+ current->total_link_count = 0; -+ return link_path_walk_it(name, nd, it); -+} -+ - int path_walk(const char * name, struct nameidata *nd) - { - current->total_link_count = 0; -- return link_path_walk(name, nd); -+ return 
link_path_walk_it(name, nd, NULL); - } - - /* SMP-safe */ -@@ -743,6 +813,7 @@ - { - nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags; -+ nd->intent = NULL; - if (*name=='/') - return walk_init_root(name,nd); - read_lock(&current->fs->lock); -@@ -757,7 +828,8 @@ - * needs parent already locked. Doesn't follow mounts. - * SMP-safe. - */ --struct dentry * lookup_hash(struct qstr *name, struct dentry * base) -+struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base, -+ struct lookup_intent *it) - { - struct dentry * dentry; - struct inode *inode; -@@ -780,13 +852,16 @@ - goto out; - } - -- dentry = cached_lookup(base, name, 0); -+ dentry = cached_lookup(base, name, 0, it); - if (!dentry) { - struct dentry *new = d_alloc(base, name); - dentry = ERR_PTR(-ENOMEM); - if (!new) - goto out; - lock_kernel(); -+ if (inode->i_op->lookup_it) -+ dentry = inode->i_op->lookup_it(inode, new, it, 0); -+ else - dentry = inode->i_op->lookup(inode, new); - unlock_kernel(); - if (!dentry) -@@ -798,6 +873,12 @@ - return dentry; - } - -+struct dentry * lookup_hash(struct qstr *name, struct dentry * base) -+{ -+ return lookup_hash_it(name, base, NULL); -+} -+ -+ - /* SMP-safe */ - struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) - { -@@ -819,7 +900,7 @@ - } - this.hash = end_name_hash(hash); - -- return lookup_hash(&this, base); -+ return lookup_hash_it(&this, base, NULL); - access: - return ERR_PTR(-EACCES); - } -@@ -851,6 +932,23 @@ - return err; - } - -+int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd, -+ struct lookup_intent *it) -+{ -+ char *tmp; -+ int err; -+ -+ tmp = getname(name); -+ err = PTR_ERR(tmp); -+ if (!IS_ERR(tmp)) { -+ err = 0; -+ if (path_init(tmp, flags, nd)) -+ err = path_walk_it(tmp, nd, it); -+ putname(tmp); -+ } -+ return err; -+} -+ - /* - * It's inline, so penalty for filesystems that don't use sticky bit is - * minimal. 
-@@ -946,7 +1044,8 @@ - return retval; - } - --int vfs_create(struct inode *dir, struct dentry *dentry, int mode) -+static int vfs_create_it(struct inode *dir, struct dentry *dentry, int mode, -+ struct lookup_intent *it) - { - int error; - -@@ -959,12 +1058,15 @@ - goto exit_lock; - - error = -EACCES; /* shouldn't it be ENOSYS? */ -- if (!dir->i_op || !dir->i_op->create) -+ if (!dir->i_op || (!dir->i_op->create && !dir->i_op->create_it)) - goto exit_lock; - - DQUOT_INIT(dir); - lock_kernel(); -- error = dir->i_op->create(dir, dentry, mode); -+ if (dir->i_op->create_it) -+ error = dir->i_op->create_it(dir, dentry, mode, it); -+ else -+ error = dir->i_op->create(dir, dentry, mode); - unlock_kernel(); - exit_lock: - up(&dir->i_zombie); -@@ -973,6 +1075,11 @@ - return error; - } - -+int vfs_create(struct inode *dir, struct dentry *dentry, int mode) -+{ -+ return vfs_create_it(dir, dentry, mode, NULL); -+} -+ - /* - * open_namei() - * -@@ -987,7 +1094,8 @@ - * for symlinks (where the permissions are checked later). - * SMP-safe - */ --int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) -+int open_namei_it(const char *pathname, int flag, int mode, -+ struct nameidata *nd, struct lookup_intent *it) - { - int acc_mode, error = 0; - struct inode *inode; -@@ -997,12 +1105,14 @@ - - acc_mode = ACC_MODE(flag); - -+ if (it) -+ it->it_flags = flag; - /* - * The simplest case - just a plain lookup. - */ - if (!(flag & O_CREAT)) { - if (path_init(pathname, lookup_flags(flag), nd)) -- error = path_walk(pathname, nd); -+ error = path_walk_it(pathname, nd, it); - if (error) - return error; - dentry = nd->dentry; -@@ -1012,6 +1122,10 @@ - /* - * Create - we need to know the parent. 
- */ -+ if (it) { -+ it->it_create_mode = mode; -+ it->it_op |= IT_CREAT; -+ } - if (path_init(pathname, LOOKUP_PARENT, nd)) - error = path_walk(pathname, nd); - if (error) -@@ -1028,7 +1142,7 @@ - - dir = nd->dentry; - down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - - do_last: - error = PTR_ERR(dentry); -@@ -1037,10 +1151,11 @@ - goto exit; - } - -+ it->it_create_mode = mode; - /* Negative dentry, just create the file */ - if (!dentry->d_inode) { -- error = vfs_create(dir->d_inode, dentry, -- mode & ~current->fs->umask); -+ error = vfs_create_it(dir->d_inode, dentry, -+ mode & ~current->fs->umask, it); - up(&dir->d_inode->i_sem); - dput(nd->dentry); - nd->dentry = dentry; -@@ -1144,7 +1259,7 @@ - if (!error) { - DQUOT_INIT(inode); - -- error = do_truncate(dentry, 0); -+ error = do_truncate(dentry, 0, 1); - } - put_write_access(inode); - if (error) -@@ -1156,8 +1271,10 @@ - return 0; - - exit_dput: -+ intent_release(it); - dput(dentry); - exit: -+ intent_release(it); - path_release(nd); - return error; - -@@ -1176,7 +1293,10 @@ - * are done. Procfs-like symlinks just set LAST_BIND. 
- */ - UPDATE_ATIME(dentry->d_inode); -+ nd->intent = it; - error = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (error) -+ intent_release(it); - dput(dentry); - if (error) - return error; -@@ -1198,13 +1318,20 @@ - } - dir = nd->dentry; - down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - putname(nd->last.name); - goto do_last; - } - -+int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd) -+{ -+ return open_namei_it(pathname, flag, mode, nd, NULL); -+} -+ -+ - /* SMP-safe */ --static struct dentry *lookup_create(struct nameidata *nd, int is_dir) -+static struct dentry *lookup_create(struct nameidata *nd, int is_dir, -+ struct lookup_intent *it) - { - struct dentry *dentry; - -@@ -1212,7 +1339,7 @@ - dentry = ERR_PTR(-EEXIST); - if (nd->last_type != LAST_NORM) - goto fail; -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - if (IS_ERR(dentry)) - goto fail; - if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1269,7 +1396,20 @@ - error = path_walk(tmp, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 0); -+ -+ if (nd.last_type != LAST_NORM) { -+ error = -EEXIST; -+ goto out2; -+ } -+ if (nd.dentry->d_inode->i_op->mknod_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->mknod_raw(&nd, mode, dev); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } -+ -+ dentry = lookup_create(&nd, 0, NULL); - error = PTR_ERR(dentry); - - mode &= ~current->fs->umask; -@@ -1290,6 +1426,7 @@ - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -+out2: - path_release(&nd); - out: - putname(tmp); -@@ -1338,7 +1475,18 @@ - error = path_walk(tmp, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 1); -+ if (nd.last_type != LAST_NORM) { -+ error = -EEXIST; -+ goto out2; -+ } -+ if 
(nd.dentry->d_inode->i_op->mkdir_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->mkdir_raw(&nd, mode); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } -+ dentry = lookup_create(&nd, 1, NULL); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_mkdir(nd.dentry->d_inode, dentry, -@@ -1346,6 +1490,7 @@ - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -+out2: - path_release(&nd); - out: - putname(tmp); -@@ -1447,8 +1592,16 @@ - error = -EBUSY; - goto exit1; - } -+ if (nd.dentry->d_inode->i_op->rmdir_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ -+ error = op->rmdir_raw(&nd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit1; -+ } - down(&nd.dentry->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_rmdir(nd.dentry->d_inode, dentry); -@@ -1507,8 +1660,15 @@ - error = -EISDIR; - if (nd.last_type != LAST_NORM) - goto exit1; -+ if (nd.dentry->d_inode->i_op->unlink_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->unlink_raw(&nd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit1; -+ } - down(&nd.dentry->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - /* Why not before? 
Because we want correct error value */ -@@ -1576,15 +1736,27 @@ - error = path_walk(to, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 0); -+ if (nd.last_type != LAST_NORM) { -+ error = -EEXIST; -+ goto out2; -+ } -+ if (nd.dentry->d_inode->i_op->symlink_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->symlink_raw(&nd, from); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } -+ dentry = lookup_create(&nd, 0, NULL); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_symlink(nd.dentry->d_inode, dentry, from); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -+ out2: - path_release(&nd); --out: -+ out: - putname(to); - } - putname(from); -@@ -1667,7 +1835,18 @@ - error = -EXDEV; - if (old_nd.mnt != nd.mnt) - goto out_release; -- new_dentry = lookup_create(&nd, 0); -+ if (nd.last_type != LAST_NORM) { -+ error = -EEXIST; -+ goto out_release; -+ } -+ if (nd.dentry->d_inode->i_op->link_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->link_raw(&old_nd, &nd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out_release; -+ } -+ new_dentry = lookup_create(&nd, 0, NULL); - error = PTR_ERR(new_dentry); - if (!IS_ERR(new_dentry)) { - error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); -@@ -1713,7 +1888,7 @@ - * locking]. 
- */ - int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry) - { - int error; - struct inode *target; -@@ -1792,7 +1967,7 @@ - } - - int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry) - { - int error; - -@@ -1883,9 +2058,18 @@ - if (newnd.last_type != LAST_NORM) - goto exit2; - -+ if (old_dir->d_inode->i_op->rename_raw) { -+ lock_kernel(); -+ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd); -+ unlock_kernel(); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit2; -+ } -+ - double_lock(new_dir, old_dir); - -- old_dentry = lookup_hash(&oldnd.last, old_dir); -+ old_dentry = lookup_hash_it(&oldnd.last, old_dir, NULL); - error = PTR_ERR(old_dentry); - if (IS_ERR(old_dentry)) - goto exit3; -@@ -1901,16 +2085,16 @@ - if (newnd.last.name[newnd.last.len]) - goto exit4; - } -- new_dentry = lookup_hash(&newnd.last, new_dir); -+ new_dentry = lookup_hash_it(&newnd.last, new_dir, NULL); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; - -+ - lock_kernel(); - error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry); - unlock_kernel(); -- - dput(new_dentry); - exit4: - dput(old_dentry); -@@ -1961,20 +2145,26 @@ - } - - static inline int --__vfs_follow_link(struct nameidata *nd, const char *link) -+__vfs_follow_link(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) - { - int res = 0; - char *name; - if (IS_ERR(link)) - goto fail; - -+ if (it == NULL) -+ it = nd->intent; -+ else if (it != nd->intent) -+ printk("it != nd->intent: tell phil@clusterfs.com\n"); -+ - if (*link == '/') { - path_release(nd); - if (!walk_init_root(link, nd)) - /* weird __emul_prefix() stuff did it */ - goto out; - } -- res = 
link_path_walk(link, nd); -+ res = link_path_walk_it(link, nd, it); - out: - if (current->link_count || res || nd->last_type!=LAST_NORM) - return res; -@@ -1996,7 +2186,13 @@ - - int vfs_follow_link(struct nameidata *nd, const char *link) - { -- return __vfs_follow_link(nd, link); -+ return __vfs_follow_link(nd, link, NULL); -+} -+ -+int vfs_follow_link_it(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) -+{ -+ return __vfs_follow_link(nd, link, it); - } - - /* get the link contents into pagecache */ -@@ -2038,7 +2234,7 @@ - { - struct page *page = NULL; - char *s = page_getlink(dentry, &page); -- int res = __vfs_follow_link(nd, s); -+ int res = __vfs_follow_link(nd, s, NULL); - if (page) { - kunmap(page); - page_cache_release(page); -Index: linux.mcp2/fs/namespace.c -=================================================================== ---- linux.mcp2.orig/fs/namespace.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/namespace.c 2004-05-05 14:22:06.000000000 -0700 -@@ -97,6 +97,7 @@ - { - old_nd->dentry = mnt->mnt_mountpoint; - old_nd->mnt = mnt->mnt_parent; -+ UNPIN(old_nd->dentry, old_nd->mnt, 1); - mnt->mnt_parent = mnt; - mnt->mnt_mountpoint = mnt->mnt_root; - list_del_init(&mnt->mnt_child); -@@ -108,6 +109,7 @@ - { - mnt->mnt_parent = mntget(nd->mnt); - mnt->mnt_mountpoint = dget(nd->dentry); -+ PIN(nd->dentry, nd->mnt, 1); - list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); - list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts); - nd->dentry->d_mounted++; -@@ -491,15 +493,18 @@ - { - struct nameidata old_nd; - struct vfsmount *mnt = NULL; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int err = mount_is_safe(nd); - if (err) - return err; - if (!old_name || !*old_name) - return -EINVAL; - if (path_init(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd)) -- err = path_walk(old_name, &old_nd); -- if (err) -+ err = path_walk_it(old_name, &old_nd, &it); -+ if (err) { -+ intent_release(&it); - return err; -+ } - - 
down_write(&current->namespace->sem); - err = -EINVAL; -@@ -522,6 +527,7 @@ - } - - up_write(&current->namespace->sem); -+ intent_release(&it); - path_release(&old_nd); - return err; - } -@@ -706,6 +712,7 @@ - unsigned long flags, void *data_page) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int retval = 0; - int mnt_flags = 0; - -@@ -731,9 +738,11 @@ - - /* ... and get the mountpoint */ - if (path_init(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd)) -- retval = path_walk(dir_name, &nd); -- if (retval) -+ retval = path_walk_it(dir_name, &nd, &it); -+ if (retval) { -+ intent_release(&it); - return retval; -+ } - - if (flags & MS_REMOUNT) - retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, -@@ -745,6 +754,8 @@ - else - retval = do_add_mount(&nd, type_page, flags, mnt_flags, - dev_name, data_page); -+ -+ intent_release(&it); - path_release(&nd); - return retval; - } -@@ -910,6 +921,8 @@ - { - struct vfsmount *tmp; - struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; -+ struct lookup_intent new_it = { .it_op = IT_GETATTR }; -+ struct lookup_intent old_it = { .it_op = IT_GETATTR }; - char *name; - int error; - -@@ -924,7 +937,7 @@ - goto out0; - error = 0; - if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd)) -- error = path_walk(name, &new_nd); -+ error = path_walk_it(name, &new_nd, &new_it); - putname(name); - if (error) - goto out0; -@@ -938,7 +951,7 @@ - goto out1; - error = 0; - if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd)) -- error = path_walk(name, &old_nd); -+ error = path_walk_it(name, &old_nd, &old_it); - putname(name); - if (error) - goto out1; -@@ -994,8 +1007,10 @@ - up(&old_nd.dentry->d_inode->i_zombie); - up_write(&current->namespace->sem); - path_release(&user_nd); -+ intent_release(&old_it); - path_release(&old_nd); - out1: -+ intent_release(&new_it); - path_release(&new_nd); - out0: - unlock_kernel(); -Index: linux.mcp2/fs/open.c 
-=================================================================== ---- linux.mcp2.orig/fs/open.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/open.c 2004-05-05 14:30:34.000000000 -0700 -@@ -19,6 +19,8 @@ - #include <asm/uaccess.h> - - #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) -+extern int path_walk_it(const char *name, struct nameidata *nd, -+ struct lookup_intent *it); - - int vfs_statfs(struct super_block *sb, struct statfs *buf) - { -@@ -95,9 +97,10 @@ - write_unlock(&files->file_lock); - } - --int do_truncate(struct dentry *dentry, loff_t length) -+int do_truncate(struct dentry *dentry, loff_t length, int called_from_open) - { - struct inode *inode = dentry->d_inode; -+ struct inode_operations *op = dentry->d_inode->i_op; - int error; - struct iattr newattrs; - -@@ -108,7 +111,13 @@ - down(&inode->i_sem); - newattrs.ia_size = length; - newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; -- error = notify_change(dentry, &newattrs); -+ if (called_from_open) -+ newattrs.ia_valid |= ATTR_FROM_OPEN; -+ if (op->setattr_raw) { -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ } else -+ error = notify_change(dentry, &newattrs); - up(&inode->i_sem); - return error; - } -@@ -118,12 +127,13 @@ - struct nameidata nd; - struct inode * inode; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - error = -EINVAL; - if (length < 0) /* sorry, but loff_t says... 
*/ - goto out; - -- error = user_path_walk(path, &nd); -+ error = user_path_walk_it(path, &nd, &it); - if (error) - goto out; - inode = nd.dentry->d_inode; -@@ -163,11 +173,13 @@ - error = locks_verify_truncate(inode, NULL, length); - if (!error) { - DQUOT_INIT(inode); -- error = do_truncate(nd.dentry, length); -+ intent_release(&it); -+ error = do_truncate(nd.dentry, length, 0); - } - put_write_access(inode); - - dput_and_out: -+ intent_release(&it); - path_release(&nd); - out: - return error; -@@ -215,7 +227,7 @@ - - error = locks_verify_truncate(inode, file, length); - if (!error) -- error = do_truncate(dentry, length); -+ error = do_truncate(dentry, length, 0); - out_putf: - fput(file); - out: -@@ -260,11 +272,13 @@ - struct inode * inode; - struct iattr newattrs; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, NULL); - if (error) - goto out; - inode = nd.dentry->d_inode; - -+ /* this is safe without a Lustre lock because it only depends -+ on the super block */ - error = -EROFS; - if (IS_RDONLY(inode)) - goto dput_and_out; -@@ -279,11 +293,25 @@ - goto dput_and_out; - - newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; -- } else { -+ } -+ -+ if (inode->i_op->setattr_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto dput_and_out; -+ } -+ -+ error = -EPERM; -+ if (!times) { - if (current->fsuid != inode->i_uid && - (error = permission(inode,MAY_WRITE)) != 0) - goto dput_and_out; - } -+ - error = notify_change(nd.dentry, &newattrs); - dput_and_out: - path_release(&nd); -@@ -304,12 +332,14 @@ - struct inode * inode; - struct iattr newattrs; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, NULL); - - if (error) - goto out; - inode = nd.dentry->d_inode; - -+ /* this is safe without a Lustre 
lock because it only depends -+ on the super block */ - error = -EROFS; - if (IS_RDONLY(inode)) - goto dput_and_out; -@@ -324,7 +354,20 @@ - newattrs.ia_atime = times[0].tv_sec; - newattrs.ia_mtime = times[1].tv_sec; - newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; -- } else { -+ } -+ -+ if (inode->i_op->setattr_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto dput_and_out; -+ } -+ -+ error = -EPERM; -+ if (!utimes) { - if (current->fsuid != inode->i_uid && - (error = permission(inode,MAY_WRITE)) != 0) - goto dput_and_out; -@@ -347,6 +390,7 @@ - int old_fsuid, old_fsgid; - kernel_cap_t old_cap; - int res; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ - return -EINVAL; -@@ -364,13 +408,14 @@ - else - current->cap_effective = current->cap_permitted; - -- res = user_path_walk(filename, &nd); -+ res = user_path_walk_it(filename, &nd, &it); - if (!res) { - res = permission(nd.dentry->d_inode, mode); - /* SuS v2 requires we report a read only fs too */ - if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) - && !special_file(nd.dentry->d_inode->i_mode)) - res = -EROFS; -+ intent_release(&it); - path_release(&nd); - } - -@@ -386,6 +431,7 @@ - int error; - struct nameidata nd; - char *name; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - name = getname(filename); - error = PTR_ERR(name); -@@ -394,7 +440,7 @@ - - error = 0; - if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd)) -- error = path_walk(name, &nd); -+ error = path_walk_it(name, &nd, &it); - putname(name); - if (error) - goto out; -@@ -406,6 +452,7 @@ - set_fs_pwd(current->fs, nd.mnt, nd.dentry); - - dput_and_out: -+ intent_release(&it); - path_release(&nd); - out: - return error; -@@ -446,6 +493,7 @@ - int 
error; - struct nameidata nd; - char *name; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - name = getname(filename); - error = PTR_ERR(name); -@@ -454,7 +502,7 @@ - - path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW | - LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); -- error = path_walk(name, &nd); -+ error = path_walk_it(name, &nd, &it); - putname(name); - if (error) - goto out; -@@ -471,39 +519,56 @@ - set_fs_altroot(); - error = 0; - dput_and_out: -+ intent_release(&it); - path_release(&nd); - out: - return error; - } - --asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) -+int chmod_common(struct dentry *dentry, mode_t mode) - { -- struct inode * inode; -- struct dentry * dentry; -- struct file * file; -- int err = -EBADF; -+ struct inode *inode = dentry->d_inode; - struct iattr newattrs; -+ int err = -EROFS; - -- file = fget(fd); -- if (!file) -+ if (IS_RDONLY(inode)) - goto out; - -- dentry = file->f_dentry; -- inode = dentry->d_inode; -+ if (inode->i_op->setattr_raw) { -+ newattrs.ia_mode = mode; -+ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; -+ newattrs.ia_valid |= ATTR_RAW; -+ err = inode->i_op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (err != -EOPNOTSUPP) -+ goto out; -+ } - -- err = -EROFS; -- if (IS_RDONLY(inode)) -- goto out_putf; - err = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -- goto out_putf; -+ goto out; -+ - if (mode == (mode_t) -1) - mode = inode->i_mode; - newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); - newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - err = notify_change(dentry, &newattrs); - --out_putf: -+out: -+ return err; -+} -+ -+asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) -+{ -+ struct file * file; -+ int err = -EBADF; -+ -+ file = fget(fd); -+ if (!file) -+ goto out; -+ -+ err = chmod_common(file->f_dentry, mode); -+ - fput(file); - out: - return err; -@@ -512,30 +577,14 @@ - asmlinkage long sys_chmod(const char * filename, 
mode_t mode) - { - struct nameidata nd; -- struct inode * inode; - int error; -- struct iattr newattrs; - - error = user_path_walk(filename, &nd); - if (error) - goto out; -- inode = nd.dentry->d_inode; -- -- error = -EROFS; -- if (IS_RDONLY(inode)) -- goto dput_and_out; - -- error = -EPERM; -- if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -- goto dput_and_out; -+ error = chmod_common(nd.dentry, mode); - -- if (mode == (mode_t) -1) -- mode = inode->i_mode; -- newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); -- newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; -- error = notify_change(nd.dentry, &newattrs); -- --dput_and_out: - path_release(&nd); - out: - return error; -@@ -555,6 +604,20 @@ - error = -EROFS; - if (IS_RDONLY(inode)) - goto out; -+ -+ if (inode->i_op->setattr_raw) { -+ struct inode_operations *op = dentry->d_inode->i_op; -+ -+ newattrs.ia_uid = user; -+ newattrs.ia_gid = group; -+ newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_CTIME; -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ return error; -+ } -+ - error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out; -@@ -659,6 +722,7 @@ - { - int namei_flags, error; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_OPEN }; - - namei_flags = flags; - if ((namei_flags+1) & O_ACCMODE) -@@ -666,14 +730,15 @@ - if (namei_flags & O_TRUNC) - namei_flags |= 2; - -- error = open_namei(filename, namei_flags, mode, &nd); -- if (!error) -- return dentry_open(nd.dentry, nd.mnt, flags); -+ error = open_namei_it(filename, namei_flags, mode, &nd, &it); -+ if (error) -+ return ERR_PTR(error); - -- return ERR_PTR(error); -+ return dentry_open_it(nd.dentry, nd.mnt, flags, &it); - } - --struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) -+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct 
lookup_intent *it) - { - struct file * f; - struct inode *inode; -@@ -710,12 +775,15 @@ - } - - if (f->f_op && f->f_op->open) { -+ f->f_it = it; - error = f->f_op->open(inode,f); -+ f->f_it = NULL; - if (error) - goto cleanup_all; - } - f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); - -+ intent_release(it); - return f; - - cleanup_all: -@@ -730,11 +798,17 @@ - cleanup_file: - put_filp(f); - cleanup_dentry: -+ intent_release(it); - dput(dentry); - mntput(mnt); - return ERR_PTR(error); - } - -+struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) -+{ -+ return dentry_open_it(dentry, mnt, flags, NULL); -+} -+ - /* - * Find an empty file descriptor entry, and mark it busy. - */ -Index: linux.mcp2/fs/stat.c -=================================================================== ---- linux.mcp2.orig/fs/stat.c 2004-01-19 07:49:43.000000000 -0800 -+++ linux.mcp2/fs/stat.c 2004-05-05 14:19:59.000000000 -0700 -@@ -17,10 +17,12 @@ - * Revalidate the inode. This is required for proper NFS attribute caching. 
- */ - static __inline__ int --do_revalidate(struct dentry *dentry) -+do_revalidate(struct dentry *dentry, struct lookup_intent *it) - { - struct inode * inode = dentry->d_inode; -- if (inode->i_op && inode->i_op->revalidate) -+ if (inode->i_op && inode->i_op->revalidate_it) -+ return inode->i_op->revalidate_it(dentry, it); -+ else if (inode->i_op && inode->i_op->revalidate) - return inode->i_op->revalidate(dentry); - return 0; - } -@@ -135,13 +139,15 @@ - asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_old_stat(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -151,13 +157,15 @@ - asmlinkage long sys_newstat(char * filename, struct stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_new_stat(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -172,13 +180,15 @@ - asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_old_stat(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -189,13 
+199,15 @@ - asmlinkage long sys_newlstat(char * filename, struct stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_new_stat(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -216,7 +228,7 @@ - if (f) { - struct dentry * dentry = f->f_dentry; - -- err = do_revalidate(dentry); -+ err = do_revalidate(dentry, NULL); - if (!err) - err = cp_old_stat(dentry->d_inode, statbuf); - fput(f); -@@ -235,7 +247,7 @@ - if (f) { - struct dentry * dentry = f->f_dentry; - -- err = do_revalidate(dentry); -+ err = do_revalidate(dentry, NULL); - if (!err) - err = cp_new_stat(dentry->d_inode, statbuf); - fput(f); -@@ -257,7 +269,7 @@ - - error = -EINVAL; - if (inode->i_op && inode->i_op->readlink && -- !(error = do_revalidate(nd.dentry))) { -+ !(error = do_revalidate(nd.dentry, NULL))) { - UPDATE_ATIME(inode); - error = inode->i_op->readlink(nd.dentry, buf, bufsiz); - } -@@ -333,12 +345,14 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_new_stat64(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -348,12 +362,14 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = 
cp_new_stat64(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -368,7 +384,7 @@ - if (f) { - struct dentry * dentry = f->f_dentry; - -- err = do_revalidate(dentry); -+ err = do_revalidate(dentry, NULL); - if (!err) - err = cp_new_stat64(dentry->d_inode, statbuf); - fput(f); -Index: linux.mcp2/include/linux/dcache.h -=================================================================== ---- linux.mcp2.orig/include/linux/dcache.h 2004-04-23 16:52:28.000000000 -0700 -+++ linux.mcp2/include/linux/dcache.h 2004-05-05 14:19:59.000000000 -0700 -@@ -5,6 +5,51 @@ - - #include - #include -+#include -+ -+#define IT_OPEN 0x0001 -+#define IT_CREAT 0x0002 -+#define IT_READDIR 0x0004 -+#define IT_GETATTR 0x0008 -+#define IT_LOOKUP 0x0010 -+#define IT_UNLINK 0x0020 -+#define IT_GETXATTR 0x0040 -+#define IT_EXEC 0x0080 -+#define IT_PIN 0x0100 -+ -+#define IT_FL_LOCKED 0x0001 -+#define IT_FL_FOLLOWED 0x0002 /* set by vfs_follow_link */ -+ -+#define INTENT_MAGIC 0x19620323 -+ -+ -+struct lustre_intent_data { -+ int it_disposition; -+ int it_status; -+ __u64 it_lock_handle; -+ void *it_data; -+ int it_lock_mode; -+ int it_int_flags; -+}; -+struct lookup_intent { -+ int it_magic; -+ void (*it_op_release)(struct lookup_intent *); -+ int it_op; -+ int it_flags; -+ int it_create_mode; -+ union { -+ struct lustre_intent_data lustre; -+ } d; -+}; -+ -+static inline void intent_init(struct lookup_intent *it, int op, int flags) -+{ -+ memset(it, 0, sizeof(*it)); -+ it->it_magic = INTENT_MAGIC; -+ it->it_op = op; -+ it->it_flags = flags; -+} -+ - - /* - * linux/include/linux/dcache.h -@@ -90,8 +135,22 @@ - int (*d_delete)(struct dentry *); - void (*d_release)(struct dentry *); - void (*d_iput)(struct dentry *, struct inode *); -+ int (*d_revalidate_it)(struct dentry *, int, struct lookup_intent *); -+ void (*d_pin)(struct dentry *, struct vfsmount * , int); -+ void (*d_unpin)(struct dentry *, struct vfsmount *, int); - }; - -+#define 
PIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_pin) \ -+ de->d_op->d_pin(de, mnt, flag); -+#define UNPIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_unpin) \ -+ de->d_op->d_unpin(de, mnt, flag); -+ -+ -+/* defined in fs/namei.c */ -+extern void intent_release(struct lookup_intent *it); -+/* defined in fs/dcache.c */ -+extern void __d_rehash(struct dentry * entry, int lock); -+ - /* the dentry parameter passed to d_hash and d_compare is the parent - * directory of the entries to be compared. It is used in case these - * functions need any directory specific information for determining -@@ -123,6 +182,7 @@ - * s_nfsd_free_path semaphore will be down - */ - #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ -+#define DCACHE_LUSTRE_INVALID 0x0010 /* Lustre invalidated */ - - extern spinlock_t dcache_lock; - -Index: linux.mcp2/include/linux/fs.h -=================================================================== ---- linux.mcp2.orig/include/linux/fs.h 2004-05-05 14:12:28.000000000 -0700 -+++ linux.mcp2/include/linux/fs.h 2004-05-05 14:19:59.000000000 -0700 -@@ -73,6 +73,7 @@ - - #define FMODE_READ 1 - #define FMODE_WRITE 2 -+#define FMODE_EXEC 4 - - #define READ 0 - #define WRITE 1 -@@ -335,6 +336,9 @@ - #define ATTR_MTIME_SET 256 - #define ATTR_FORCE 512 /* Not a change, but a change it */ - #define ATTR_ATTR_FLAG 1024 -+#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ -+#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ -+#define ATTR_CTIME_SET 0x2000 - - /* - * This is the Inode Attributes structure, used for notify_change(). 
It -@@ -470,6 +474,7 @@ - struct pipe_inode_info *i_pipe; - struct block_device *i_bdev; - struct char_device *i_cdev; -+ void *i_filterdata; - - unsigned long i_dnotify_mask; /* Directory notify events */ - struct dnotify_struct *i_dnotify; /* for directory notifications */ -@@ -574,6 +579,7 @@ - - /* needed for tty driver, and maybe others */ - void *private_data; -+ struct lookup_intent *f_it; - - /* preallocated helper kiobuf to speedup O_DIRECT */ - struct kiobuf *f_iobuf; -@@ -692,6 +698,7 @@ - struct qstr last; - unsigned int flags; - int last_type; -+ struct lookup_intent *intent; - }; - - #define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */ -@@ -840,7 +847,8 @@ - extern int vfs_link(struct dentry *, struct inode *, struct dentry *); - extern int vfs_rmdir(struct inode *, struct dentry *); - extern int vfs_unlink(struct inode *, struct dentry *); --extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, -+ struct inode *new_dir, struct dentry *new_dentry); - - /* - * File types -@@ -900,21 +908,32 @@ - - struct inode_operations { - int (*create) (struct inode *,struct dentry *,int); -+ int (*create_it) (struct inode *,struct dentry *,int, struct lookup_intent *); - struct dentry * (*lookup) (struct inode *,struct dentry *); -+ struct dentry * (*lookup_it) (struct inode *,struct dentry *, struct lookup_intent *, int flags); - int (*link) (struct dentry *,struct inode *,struct dentry *); -+ int (*link_raw) (struct nameidata *,struct nameidata *); - int (*unlink) (struct inode *,struct dentry *); -+ int (*unlink_raw) (struct nameidata *); - int (*symlink) (struct inode *,struct dentry *,const char *); -+ int (*symlink_raw) (struct nameidata *,const char *); - int (*mkdir) (struct inode *,struct dentry *,int); -+ int (*mkdir_raw) (struct nameidata *,int); - int (*rmdir) (struct inode *,struct dentry *); -+ int (*rmdir_raw) (struct nameidata *); - int 
(*mknod) (struct inode *,struct dentry *,int,int); -+ int (*mknod_raw) (struct nameidata *,int,dev_t); - int (*rename) (struct inode *, struct dentry *, - struct inode *, struct dentry *); -+ int (*rename_raw) (struct nameidata *, struct nameidata *); - int (*readlink) (struct dentry *, char *,int); - int (*follow_link) (struct dentry *, struct nameidata *); - void (*truncate) (struct inode *); - int (*permission) (struct inode *, int); - int (*revalidate) (struct dentry *); -+ int (*revalidate_it) (struct dentry *, struct lookup_intent *); - int (*setattr) (struct dentry *, struct iattr *); -+ int (*setattr_raw) (struct inode *, struct iattr *); - int (*getattr) (struct dentry *, struct iattr *); - }; - -@@ -1115,10 +1134,14 @@ - - asmlinkage long sys_open(const char *, int, int); - asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */ --extern int do_truncate(struct dentry *, loff_t start); -+extern int do_truncate(struct dentry *, loff_t start, int called_from_open); - - extern struct file *filp_open(const char *, int, int); - extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); -+extern int open_namei_it(const char *filename, int namei_flags, int mode, -+ struct nameidata *nd, struct lookup_intent *it); -+extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct lookup_intent *it); - extern int filp_close(struct file *, fl_owner_t id); - extern char * getname(const char *); - -@@ -1380,6 +1403,7 @@ - extern loff_t default_llseek(struct file *file, loff_t offset, int origin); - - extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); -+extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it)); - extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); - extern int FASTCALL(path_walk(const char *, struct nameidata *)); - extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); 
-@@ -1390,6 +1414,8 @@ - extern struct dentry * lookup_hash(struct qstr *, struct dentry *); - #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) - #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) -+#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it) -+#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it) - - extern void iput(struct inode *); - extern void force_delete(struct inode *); -@@ -1499,6 +1525,8 @@ - - extern int vfs_readlink(struct dentry *, char *, int, const char *); - extern int vfs_follow_link(struct nameidata *, const char *); -+extern int vfs_follow_link_it(struct nameidata *, const char *, -+ struct lookup_intent *it); - extern int page_readlink(struct dentry *, char *, int); - extern int page_follow_link(struct dentry *, struct nameidata *); - extern struct inode_operations page_symlink_inode_operations; -Index: linux.mcp2/include/linux/fs_struct.h -=================================================================== ---- linux.mcp2.orig/include/linux/fs_struct.h 2004-01-19 07:49:42.000000000 -0800 -+++ linux.mcp2/include/linux/fs_struct.h 2004-05-05 14:19:59.000000000 -0700 -@@ -34,10 +34,12 @@ - write_lock(&fs->lock); - old_root = fs->root; - old_rootmnt = fs->rootmnt; -+ PIN(dentry, mnt, 1); - fs->rootmnt = mntget(mnt); - fs->root = dget(dentry); - write_unlock(&fs->lock); - if (old_root) { -+ UNPIN(old_root, old_rootmnt, 1); - dput(old_root); - mntput(old_rootmnt); - } -@@ -57,10 +59,12 @@ - write_lock(&fs->lock); - old_pwd = fs->pwd; - old_pwdmnt = fs->pwdmnt; -+ PIN(dentry, mnt, 0); - fs->pwdmnt = mntget(mnt); - fs->pwd = dget(dentry); - write_unlock(&fs->lock); - if (old_pwd) { -+ UNPIN(old_pwd, old_pwdmnt, 0); - dput(old_pwd); - mntput(old_pwdmnt); - } -Index: linux.mcp2/kernel/exit.c -=================================================================== ---- linux.mcp2.orig/kernel/exit.c 2004-01-19 
07:49:44.000000000 -0800 -+++ linux.mcp2/kernel/exit.c 2004-05-05 14:19:59.000000000 -0700 -@@ -252,11 +252,14 @@ - { - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { -+ UNPIN(fs->pwd, fs->pwdmnt, 0); -+ UNPIN(fs->root, fs->rootmnt, 1); - dput(fs->root); - mntput(fs->rootmnt); - dput(fs->pwd); - mntput(fs->pwdmnt); - if (fs->altroot) { -+ UNPIN(fs->altroot, fs->altrootmnt, 1); - dput(fs->altroot); - mntput(fs->altrootmnt); - } -Index: linux.mcp2/kernel/fork.c -=================================================================== ---- linux.mcp2.orig/kernel/fork.c 2004-01-19 07:49:44.000000000 -0800 -+++ linux.mcp2/kernel/fork.c 2004-05-05 14:19:59.000000000 -0700 -@@ -384,10 +384,13 @@ - fs->umask = old->umask; - read_lock(&old->lock); - fs->rootmnt = mntget(old->rootmnt); -+ PIN(old->pwd, old->pwdmnt, 0); -+ PIN(old->root, old->rootmnt, 1); - fs->root = dget(old->root); - fs->pwdmnt = mntget(old->pwdmnt); - fs->pwd = dget(old->pwd); - if (old->altroot) { -+ PIN(old->altroot, old->altrootmnt, 1); - fs->altrootmnt = mntget(old->altrootmnt); - fs->altroot = dget(old->altroot); - } else { -Index: linux.mcp2/kernel/ksyms.c -=================================================================== ---- linux.mcp2.orig/kernel/ksyms.c 2004-05-05 14:12:28.000000000 -0700 -+++ linux.mcp2/kernel/ksyms.c 2004-05-05 14:19:59.000000000 -0700 -@@ -264,6 +264,7 @@ - EXPORT_SYMBOL(set_page_dirty); - EXPORT_SYMBOL(vfs_readlink); - EXPORT_SYMBOL(vfs_follow_link); -+EXPORT_SYMBOL(vfs_follow_link_it); - EXPORT_SYMBOL(page_readlink); - EXPORT_SYMBOL(page_follow_link); - EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch deleted file mode 100644 index 340ce7c..0000000 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch +++ /dev/null @@ -1,1858 +0,0 @@ - fs/dcache.c | 19 ++ - fs/exec.c | 17 +- - fs/namei.c | 295 
+++++++++++++++++++++++++++++++++++++++------- - fs/namespace.c | 28 +++- - fs/open.c | 172 +++++++++++++++++++------- - fs/stat.c | 52 +++++--- - include/linux/dcache.h | 60 +++++++++ - include/linux/fs.h | 32 ++++ - include/linux/fs_struct.h | 4 - kernel/exit.c | 3 - kernel/fork.c | 3 - kernel/ksyms.c | 1 - 12 files changed, 558 insertions(+), 128 deletions(-) - -Index: linux-2.4.19.SuSE/fs/dcache.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/dcache.c Mon Jan 27 05:08:04 2003 -+++ linux-2.4.19.SuSE/fs/dcache.c Sat Nov 15 17:29:03 2003 -@@ -186,6 +186,13 @@ - spin_unlock(&dcache_lock); - return 0; - } -+ -+ /* network invalidation by Lustre */ -+ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { -+ spin_unlock(&dcache_lock); -+ return 0; -+ } -+ - /* - * Check whether to do a partial shrink_dcache - * to get rid of unused child entries. -@@ -838,13 +845,19 @@ - * Adds a dentry to the hash according to its name. - */ - --void d_rehash(struct dentry * entry) -+void __d_rehash(struct dentry * entry, int lock) - { - struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); - if (!list_empty(&entry->d_hash)) BUG(); -- spin_lock(&dcache_lock); -+ if (lock) spin_lock(&dcache_lock); - list_add(&entry->d_hash, list); -- spin_unlock(&dcache_lock); -+ if (lock) spin_unlock(&dcache_lock); -+} -+EXPORT_SYMBOL(__d_rehash); -+ -+void d_rehash(struct dentry * entry) -+{ -+ __d_rehash(entry, 1); - } - - #define do_switch(x,y) do { \ -Index: linux-2.4.19.SuSE/fs/exec.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/exec.c Mon Jan 27 05:08:35 2003 -+++ linux-2.4.19.SuSE/fs/exec.c Sat Nov 15 17:34:06 2003 -@@ -107,8 +107,10 @@ - struct file * file; - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_OPEN, -+ .it_flags = FMODE_READ|FMODE_EXEC }; - -- error = user_path_walk(library, &nd); -+ error = user_path_walk_it(library, &nd, &it); - if 
(error) - goto out; - -@@ -120,7 +122,8 @@ - if (error) - goto exit; - -- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); -+ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); -+ intent_release(&it); - error = PTR_ERR(file); - if (IS_ERR(file)) - goto out; -@@ -346,9 +349,11 @@ - struct inode *inode; - struct file *file; - int err = 0; -+ struct lookup_intent it = { .it_op = IT_OPEN, -+ .it_flags = FMODE_READ|FMODE_EXEC }; - - if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd)) -- err = path_walk(name, &nd); -+ err = path_walk_it(name, &nd, &it); - file = ERR_PTR(err); - if (!err) { - inode = nd.dentry->d_inode; -@@ -360,7 +365,8 @@ - err = -EACCES; - file = ERR_PTR(err); - if (!err) { -- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); -+ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); -+ intent_release(&it); - if (!IS_ERR(file)) { - err = deny_write_access(file); - if (err) { -@@ -372,6 +378,7 @@ - return file; - } - } -+ intent_release(&it); - path_release(&nd); - } - goto out; -@@ -981,7 +988,7 @@ - goto close_fail; - if (!file->f_op->write) - goto close_fail; -- if (do_truncate(file->f_dentry, 0) != 0) -+ if (do_truncate(file->f_dentry, 0, 0) != 0) - goto close_fail; - - retval = binfmt->core_dump(signr, regs, file); -Index: linux-2.4.19.SuSE/fs/namei.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/namei.c Mon Jan 27 05:08:07 2003 -+++ linux-2.4.19.SuSE/fs/namei.c Sat Nov 15 17:52:03 2003 -@@ -94,6 +94,13 @@ - * XEmacs seems to be relying on it... - */ - -+void intent_release(struct lookup_intent *it) -+{ -+ if (it && it->it_op_release) -+ it->it_op_release(it); -+ -+} -+ - /* In order to reduce some races, while at the same time doing additional - * checking and hopefully speeding things up, we copy filenames to the - * kernel data space before using them.. -@@ -260,10 +267,19 @@ - * Internal lookup() using the new generic dcache. 
- * SMP-safe - */ --static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags) -+static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name, -+ int flags, struct lookup_intent *it) - { - struct dentry * dentry = d_lookup(parent, name); - -+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { -+ if (!dentry->d_op->d_revalidate_it(dentry, flags, it) && -+ !d_invalidate(dentry)) { -+ dput(dentry); -+ dentry = NULL; -+ } -+ return dentry; -+ } else - if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { - if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { - dput(dentry); -@@ -281,11 +297,15 @@ - * make sure that nobody added the entry to the dcache in the meantime.. - * SMP-safe - */ --static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags) -+static struct dentry *real_lookup(struct dentry *parent, struct qstr *name, -+ int flags, struct lookup_intent *it) - { - struct dentry * result; - struct inode *dir = parent->d_inode; -+ int counter = 0; - -+again: -+ counter++; - down(&dir->i_sem); - /* - * First re-do the cached lookup just in case it was created -@@ -300,6 +320,9 @@ - result = ERR_PTR(-ENOMEM); - if (dentry) { - lock_kernel(); -+ if (dir->i_op->lookup_it) -+ result = dir->i_op->lookup_it(dir, dentry, it, flags); -+ else - result = dir->i_op->lookup(dir, dentry); - unlock_kernel(); - if (result) -@@ -321,6 +344,15 @@ - dput(result); - result = ERR_PTR(-ENOENT); - } -+ } else if (result->d_op && result->d_op->d_revalidate_it) { -+ if (!result->d_op->d_revalidate_it(result, flags, it) && -+ !d_invalidate(result)) { -+ dput(result); -+ if (counter > 10) -+ result = ERR_PTR(-ESTALE); -+ if (!IS_ERR(result)) -+ goto again; -+ } - } - return result; - } -@@ -332,7 +364,8 @@ - * Without that kind of total limit, nasty chains of consecutive - * symlinks can cause almost arbitrarily long lookups. 
- */ --static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd) -+static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, -+ struct lookup_intent *it) - { - int err; - if (current->link_count >= 8) -@@ -346,10 +379,12 @@ - current->link_count++; - current->total_link_count++; - UPDATE_ATIME(dentry->d_inode); -+ nd->intent = it; - err = dentry->d_inode->i_op->follow_link(dentry, nd); - current->link_count--; - return err; - loop: -+ intent_release(it); - path_release(nd); - return -ELOOP; - } -@@ -447,7 +482,8 @@ - * - * We expect 'base' to be positive and a directory. - */ --int link_path_walk(const char * name, struct nameidata *nd) -+int link_path_walk_it(const char *name, struct nameidata *nd, -+ struct lookup_intent *it) - { - struct dentry *dentry; - struct inode *inode; -@@ -524,12 +560,13 @@ - break; - } - /* This does the actual lookups.. */ -- dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); - if (!dentry) { - err = -EWOULDBLOCKIO; - if (atomic) - break; -- dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, -+ NULL); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - break; -@@ -547,7 +584,7 @@ - goto out_dput; - - if (inode->i_op->follow_link) { -- err = do_follow_link(dentry, nd); -+ err = do_follow_link(dentry, nd, NULL); - dput(dentry); - if (err) - goto return_err; -@@ -563,7 +600,7 @@ - nd->dentry = dentry; - } - err = -ENOTDIR; -- if (!inode->i_op->lookup) -+ if (!inode->i_op->lookup && !inode->i_op->lookup_it) - break; - continue; - /* here ends the main loop */ -@@ -590,12 +627,12 @@ - if (err < 0) - break; - } -- dentry = cached_lookup(nd->dentry, &this, 0); -+ dentry = cached_lookup(nd->dentry, &this, 0, it); - if (!dentry) { - err = -EWOULDBLOCKIO; - if (atomic) - break; -- dentry = real_lookup(nd->dentry, &this, 0); -+ dentry = real_lookup(nd->dentry, &this, 
0, it); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - break; -@@ -605,7 +642,7 @@ - inode = dentry->d_inode; - if ((lookup_flags & LOOKUP_FOLLOW) - && inode && inode->i_op && inode->i_op->follow_link) { -- err = do_follow_link(dentry, nd); -+ err = do_follow_link(dentry, nd, it); - dput(dentry); - if (err) - goto return_err; -@@ -619,7 +656,8 @@ - goto no_inode; - if (lookup_flags & LOOKUP_DIRECTORY) { - err = -ENOTDIR; -- if (!inode->i_op || !inode->i_op->lookup) -+ if (!inode->i_op || -+ (!inode->i_op->lookup && !inode->i_op->lookup_it)) - break; - } - goto return_base; -@@ -643,6 +681,32 @@ - * Check the cached dentry for staleness. - */ - dentry = nd->dentry; -+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) { -+ err = -ESTALE; -+ if (!dentry->d_op->d_revalidate_it(dentry, 0, it)) { -+ struct dentry *new; -+ err = permission(dentry->d_parent->d_inode, -+ MAY_EXEC); -+ if (err) -+ break; -+ new = real_lookup(dentry->d_parent, -+ &dentry->d_name, 0, it); -+ if (IS_ERR(new)) { -+ err = PTR_ERR(new); -+ break; -+ } -+ d_invalidate(dentry); -+ dput(dentry); -+ nd->dentry = new; -+ } -+ if (!nd->dentry->d_inode) -+ goto no_inode; -+ if (lookup_flags & LOOKUP_DIRECTORY) { -+ err = -ENOTDIR; -+ if (!nd->dentry->d_inode->i_op || -+ (!nd->dentry->d_inode->i_op->lookup && -+ !nd->dentry->d_inode->i_op->lookup_it)) -+ break; -+ } -+ } else - if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { - err = -ESTALE; - if (!dentry->d_op->d_revalidate(dentry, lookup_flags & LOOKUP_PARENT)) { -@@ -656,15 +713,28 @@ - dput(dentry); - break; - } -+ if (err) -+ intent_release(it); - path_release(nd); - return_err: - return err; - } - -+int link_path_walk(const char * name, struct nameidata *nd) -+{ -+ return link_path_walk_it(name, nd, NULL); -+} -+ -+int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it) -+{ -+ current->total_link_count = 0; -+ return link_path_walk_it(name, nd, it); -+} -+ - int path_walk(const char * name, struct 
nameidata *nd) - { - current->total_link_count = 0; -- return link_path_walk(name, nd); -+ return link_path_walk_it(name, nd, NULL); - } - - /* SMP-safe */ -@@ -753,6 +823,7 @@ - { - nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags; -+ nd->intent = NULL; - if (*name=='/') - return walk_init_root(name,nd); - read_lock(¤t->fs->lock); -@@ -767,7 +838,8 @@ - * needs parent already locked. Doesn't follow mounts. - * SMP-safe. - */ --struct dentry * lookup_hash(struct qstr *name, struct dentry * base) -+struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base, -+ struct lookup_intent *it) - { - struct dentry * dentry; - struct inode *inode; -@@ -790,13 +862,16 @@ - goto out; - } - -- dentry = cached_lookup(base, name, 0); -+ dentry = cached_lookup(base, name, 0, it); - if (!dentry) { - struct dentry *new = d_alloc(base, name); - dentry = ERR_PTR(-ENOMEM); - if (!new) - goto out; - lock_kernel(); -+ if (inode->i_op->lookup_it) -+ dentry = inode->i_op->lookup_it(inode, new, it, 0); -+ else - dentry = inode->i_op->lookup(inode, new); - unlock_kernel(); - if (!dentry) -@@ -808,6 +883,12 @@ - return dentry; - } - -+struct dentry * lookup_hash(struct qstr *name, struct dentry * base) -+{ -+ return lookup_hash_it(name, base, NULL); -+} -+ -+ - /* SMP-safe */ - struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) - { -@@ -829,7 +910,7 @@ - } - this.hash = end_name_hash(hash); - -- return lookup_hash(&this, base); -+ return lookup_hash_it(&this, base, NULL); - access: - return ERR_PTR(-EACCES); - } -@@ -861,6 +942,23 @@ - return err; - } - -+int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd, -+ struct lookup_intent *it) -+{ -+ char *tmp; -+ int err; -+ -+ tmp = getname(name); -+ err = PTR_ERR(tmp); -+ if (!IS_ERR(tmp)) { -+ err = 0; -+ if (path_init(tmp, flags, nd)) -+ err = path_walk_it(tmp, nd, it); -+ putname(tmp); -+ } -+ return err; -+} -+ - /* - * It's inline, so penalty for 
filesystems that don't use sticky bit is - * minimal. -@@ -958,7 +1056,8 @@ - return retval; - } - --int vfs_create(struct inode *dir, struct dentry *dentry, int mode) -+static int vfs_create_it(struct inode *dir, struct dentry *dentry, int mode, -+ struct lookup_intent *it) - { - int error; - -@@ -971,12 +1070,15 @@ - goto exit_lock; - - error = -EACCES; /* shouldn't it be ENOSYS? */ -- if (!dir->i_op || !dir->i_op->create) -+ if (!dir->i_op || (!dir->i_op->create && !dir->i_op->create_it)) - goto exit_lock; - - DQUOT_INIT(dir); - lock_kernel(); -- error = dir->i_op->create(dir, dentry, mode); -+ if (dir->i_op->create_it) -+ error = dir->i_op->create_it(dir, dentry, mode, it); -+ else -+ error = dir->i_op->create(dir, dentry, mode); - unlock_kernel(); - exit_lock: - up(&dir->i_zombie); -@@ -985,6 +1087,11 @@ - return error; - } - -+int vfs_create(struct inode *dir, struct dentry *dentry, int mode) -+{ -+ return vfs_create_it(dir, dentry, mode, NULL); -+} -+ - /* - * open_namei() - * -@@ -999,7 +1106,8 @@ - * for symlinks (where the permissions are checked later). - * SMP-safe - */ --int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) -+int open_namei_it(const char *pathname, int flag, int mode, -+ struct nameidata *nd, struct lookup_intent *it) - { - int acc_mode, error = 0; - struct inode *inode; -@@ -1009,12 +1117,14 @@ - - acc_mode = ACC_MODE(flag); - -+ if (it) -+ it->it_flags = flag; - /* - * The simplest case - just a plain lookup. - */ - if (!(flag & O_CREAT)) { - if (path_init(pathname, lookup_flags(flag), nd)) -- error = path_walk(pathname, nd); -+ error = path_walk_it(pathname, nd, it); - if (error) - return error; - dentry = nd->dentry; -@@ -1024,6 +1134,10 @@ - /* - * Create - we need to know the parent. 
- */ -+ if (it) { -+ it->it_create_mode = mode; -+ it->it_op |= IT_CREAT; -+ } - if (path_init(pathname, LOOKUP_PARENT, nd)) - error = path_walk(pathname, nd); - if (error) -@@ -1040,7 +1154,7 @@ - - dir = nd->dentry; - down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - - do_last: - error = PTR_ERR(dentry); -@@ -1049,11 +1163,13 @@ - goto exit; - } - -+ it->it_create_mode = mode; - /* Negative dentry, just create the file */ - if (!dentry->d_inode) { - if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current->fs->umask; -- error = vfs_create(dir->d_inode, dentry, mode); -+ error = vfs_create_it(dir->d_inode, dentry, -+ mode & ~current->fs->umask, it); - up(&dir->d_inode->i_sem); - #ifndef DENTRY_WASTE_RAM - if (error) -@@ -1161,7 +1277,7 @@ - if (!error) { - DQUOT_INIT(inode); - -- error = do_truncate(dentry, 0); -+ error = do_truncate(dentry, 0, 1); - } - put_write_access(inode); - if (error) -@@ -1173,8 +1289,10 @@ - return 0; - - exit_dput: -+ intent_release(it); - dput(dentry); - exit: -+ intent_release(it); - path_release(nd); - return error; - -@@ -1193,7 +1311,10 @@ - * are done. Procfs-like symlinks just set LAST_BIND. 
- */ - UPDATE_ATIME(dentry->d_inode); -+ nd->intent = it; - error = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (error) -+ intent_release(it); - dput(dentry); - if (error) - return error; -@@ -1215,13 +1336,20 @@ - } - dir = nd->dentry; - down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - putname(nd->last.name); - goto do_last; - } - -+int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd) -+{ -+ return open_namei_it(pathname, flag, mode, nd, NULL); -+} -+ -+ - /* SMP-safe */ --static struct dentry *lookup_create(struct nameidata *nd, int is_dir) -+static struct dentry *lookup_create(struct nameidata *nd, int is_dir, -+ struct lookup_intent *it) - { - struct dentry *dentry; - -@@ -1229,7 +1357,7 @@ - dentry = ERR_PTR(-EEXIST); - if (nd->last_type != LAST_NORM) - goto fail; -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - if (IS_ERR(dentry)) - goto fail; - if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1286,7 +1414,20 @@ - error = path_walk(tmp, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 0); -+ -+ if (nd.last_type != LAST_NORM) { -+ error = -EEXIST; -+ goto out2; -+ } -+ if (nd.dentry->d_inode->i_op->mknod_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->mknod_raw(&nd, mode, dev); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } -+ -+ dentry = lookup_create(&nd, 0, NULL); - error = PTR_ERR(dentry); - - if (!IS_POSIXACL(nd.dentry->d_inode)) -@@ -1308,6 +1445,7 @@ - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -+out2: - path_release(&nd); - out: - putname(tmp); -@@ -1356,7 +1494,18 @@ - error = path_walk(tmp, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 1); -+ if (nd.last_type != LAST_NORM) { -+ error = -EEXIST; -+ goto out2; -+ } -+ if 
(nd.dentry->d_inode->i_op->mkdir_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->mkdir_raw(&nd, mode); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } -+ dentry = lookup_create(&nd, 1, NULL); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - if (!IS_POSIXACL(nd.dentry->d_inode)) -@@ -1365,6 +1510,7 @@ - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -+out2: - path_release(&nd); - out: - putname(tmp); -@@ -1466,8 +1612,16 @@ - error = -EBUSY; - goto exit1; - } -+ if (nd.dentry->d_inode->i_op->rmdir_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ -+ error = op->rmdir_raw(&nd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit1; -+ } - down(&nd.dentry->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_rmdir(nd.dentry->d_inode, dentry); -@@ -1526,8 +1680,15 @@ - error = -EISDIR; - if (nd.last_type != LAST_NORM) - goto exit1; -+ if (nd.dentry->d_inode->i_op->unlink_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->unlink_raw(&nd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit1; -+ } - down(&nd.dentry->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - /* Why not before? 
Because we want correct error value */ -@@ -1595,15 +1756,27 @@ - error = path_walk(to, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 0); -+ if (nd.last_type != LAST_NORM) { -+ error = -EEXIST; -+ goto out2; -+ } -+ if (nd.dentry->d_inode->i_op->symlink_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->symlink_raw(&nd, from); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } -+ dentry = lookup_create(&nd, 0, NULL); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_symlink(nd.dentry->d_inode, dentry, from); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -+ out2: - path_release(&nd); --out: -+ out: - putname(to); - } - putname(from); -@@ -1686,7 +1855,14 @@ - error = -EXDEV; - if (old_nd.mnt != nd.mnt) - goto out_release; -- new_dentry = lookup_create(&nd, 0); -+ if (nd.last_type != LAST_NORM) { -+ error = -EEXIST; -+ goto out_release; -+ } -+ if (nd.dentry->d_inode->i_op->link_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->link_raw(&old_nd, &nd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out_release; -+ } -+ new_dentry = lookup_create(&nd, 0, NULL); - error = PTR_ERR(new_dentry); - if (!IS_ERR(new_dentry)) { - error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); -@@ -1732,7 +1908,7 @@ - * locking]. 
- */ - int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry) - { - int error; - struct inode *target; -@@ -1811,7 +1987,7 @@ - } - - int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry) - { - int error; - -@@ -1902,9 +2078,18 @@ - if (newnd.last_type != LAST_NORM) - goto exit2; - -+ if (old_dir->d_inode->i_op->rename_raw) { -+ lock_kernel(); -+ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd); -+ unlock_kernel(); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit2; -+ } -+ - double_lock(new_dir, old_dir); - -- old_dentry = lookup_hash(&oldnd.last, old_dir); -+ old_dentry = lookup_hash_it(&oldnd.last, old_dir, NULL); - error = PTR_ERR(old_dentry); - if (IS_ERR(old_dentry)) - goto exit3; -@@ -1920,16 +2105,16 @@ - if (newnd.last.name[newnd.last.len]) - goto exit4; - } -- new_dentry = lookup_hash(&newnd.last, new_dir); -+ new_dentry = lookup_hash_it(&newnd.last, new_dir, NULL); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; - -+ - lock_kernel(); - error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry); - unlock_kernel(); -- - dput(new_dentry); - exit4: - dput(old_dentry); -@@ -1980,20 +2165,26 @@ - } - - static inline int --__vfs_follow_link(struct nameidata *nd, const char *link) -+__vfs_follow_link(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) - { - int res = 0; - char *name; - if (IS_ERR(link)) - goto fail; - -+ if (it == NULL) -+ it = nd->intent; -+ else if (it != nd->intent) -+ printk("it != nd->intent: tell phil@clusterfs.com\n"); -+ - if (*link == '/') { - path_release(nd); - if (!walk_init_root(link, nd)) - /* weird __emul_prefix() stuff did it */ - goto out; - } -- res = 
link_path_walk(link, nd); -+ res = link_path_walk_it(link, nd, it); - out: - if (current->link_count || res || nd->last_type!=LAST_NORM) - return res; -@@ -2015,7 +2206,13 @@ - - int vfs_follow_link(struct nameidata *nd, const char *link) - { -- return __vfs_follow_link(nd, link); -+ return __vfs_follow_link(nd, link, NULL); -+} -+ -+int vfs_follow_link_it(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) -+{ -+ return __vfs_follow_link(nd, link, it); - } - - /* get the link contents into pagecache */ -@@ -2057,7 +2254,7 @@ - { - struct page *page = NULL; - char *s = page_getlink(dentry, &page); -- int res = __vfs_follow_link(nd, s); -+ int res = __vfs_follow_link(nd, s, NULL); - if (page) { - kunmap(page); - page_cache_release(page); -Index: linux-2.4.19.SuSE/fs/namespace.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/namespace.c Mon Jan 27 05:08:07 2003 -+++ linux-2.4.19.SuSE/fs/namespace.c Sat Nov 15 17:56:42 2003 -@@ -97,6 +97,7 @@ - { - old_nd->dentry = mnt->mnt_mountpoint; - old_nd->mnt = mnt->mnt_parent; -+ UNPIN(old_nd->dentry, old_nd->mnt, 1); - mnt->mnt_parent = mnt; - mnt->mnt_mountpoint = mnt->mnt_root; - list_del_init(&mnt->mnt_child); -@@ -108,6 +109,7 @@ - { - mnt->mnt_parent = mntget(nd->mnt); - mnt->mnt_mountpoint = dget(nd->dentry); -+ PIN(nd->dentry, nd->mnt, 1); - list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry)); - list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts); - nd->dentry->d_mounted++; -@@ -491,15 +493,18 @@ - { - struct nameidata old_nd; - struct vfsmount *mnt = NULL; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int err = mount_is_safe(nd); - if (err) - return err; - if (!old_name || !*old_name) - return -EINVAL; - if (path_init(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd)) -- err = path_walk(old_name, &old_nd); -- if (err) -+ err = path_walk_it(old_name, &old_nd, &it); -+ if (err) { -+ intent_release(&it); - return err; -+ } - - 
down_write(¤t->namespace->sem); - err = -EINVAL; -@@ -522,6 +527,7 @@ - } - - up_write(¤t->namespace->sem); -+ intent_release(&it); - path_release(&old_nd); - return err; - } -@@ -725,6 +731,7 @@ - unsigned long flags, void *data_page) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int retval = 0; - int mnt_flags = 0; - -@@ -750,9 +757,11 @@ - - /* ... and get the mountpoint */ - if (path_init(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd)) -- retval = path_walk(dir_name, &nd); -- if (retval) -+ retval = path_walk_it(dir_name, &nd, &it); -+ if (retval) { -+ intent_release(&it); - return retval; -+ } - - if (flags & MS_REMOUNT) - retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, -@@ -764,6 +773,8 @@ - else - retval = do_add_mount(&nd, type_page, flags, mnt_flags, - dev_name, data_page); -+ -+ intent_release(&it); - path_release(&nd); - return retval; - } -@@ -929,6 +940,8 @@ - { - struct vfsmount *tmp; - struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; -+ struct lookup_intent new_it = { .it_op = IT_GETATTR }; -+ struct lookup_intent old_it = { .it_op = IT_GETATTR }; - char *name; - int error; - -@@ -943,7 +956,7 @@ - goto out0; - error = 0; - if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd)) -- error = path_walk(name, &new_nd); -+ error = path_walk_it(name, &new_nd, &new_it); - putname(name); - if (error) - goto out0; -@@ -957,7 +970,7 @@ - goto out1; - error = 0; - if (path_init(name, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd)) -- error = path_walk(name, &old_nd); -+ error = path_walk_it(name, &old_nd, &old_it); - putname(name); - if (error) - goto out1; -@@ -1013,8 +1026,10 @@ - up(&old_nd.dentry->d_inode->i_zombie); - up_write(¤t->namespace->sem); - path_release(&user_nd); -+ intent_release(&old_it); - path_release(&old_nd); - out1: -+ intent_release(&new_it); - path_release(&new_nd); - out0: - unlock_kernel(); -Index: linux-2.4.19.SuSE/fs/open.c 
-=================================================================== ---- linux-2.4.19.SuSE.orig/fs/open.c Mon Jan 27 05:08:00 2003 -+++ linux-2.4.19.SuSE/fs/open.c Sat Nov 15 17:43:27 2003 -@@ -19,6 +19,8 @@ - #include - - #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) -+extern int path_walk_it(const char *name, struct nameidata *nd, -+ struct lookup_intent *it); - - int vfs_statfs(struct super_block *sb, struct statfs *buf) - { -@@ -95,9 +97,10 @@ - write_unlock(&files->file_lock); - } - --int do_truncate(struct dentry *dentry, loff_t length) -+int do_truncate(struct dentry *dentry, loff_t length, int called_from_open) - { - struct inode *inode = dentry->d_inode; -+ struct inode_operations *op = dentry->d_inode->i_op; - int error; - struct iattr newattrs; - -@@ -108,7 +111,13 @@ - down(&inode->i_sem); - newattrs.ia_size = length; - newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; -- error = notify_change(dentry, &newattrs); -+ if (called_from_open) -+ newattrs.ia_valid |= ATTR_FROM_OPEN; -+ if (op->setattr_raw) { -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ } else -+ error = notify_change(dentry, &newattrs); - up(&inode->i_sem); - return error; - } -@@ -118,12 +127,13 @@ - struct nameidata nd; - struct inode * inode; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - error = -EINVAL; - if (length < 0) /* sorry, but loff_t says... 
*/ - goto out; - -- error = user_path_walk(path, &nd); -+ error = user_path_walk_it(path, &nd, &it); - if (error) - goto out; - inode = nd.dentry->d_inode; -@@ -163,11 +173,13 @@ - error = locks_verify_truncate(inode, NULL, length); - if (!error) { - DQUOT_INIT(inode); -- error = do_truncate(nd.dentry, length); -+ intent_release(&it); -+ error = do_truncate(nd.dentry, length, 0); - } - put_write_access(inode); - - dput_and_out: -+ intent_release(&it); - path_release(&nd); - out: - return error; -@@ -215,7 +227,7 @@ - - error = locks_verify_truncate(inode, file, length); - if (!error) -- error = do_truncate(dentry, length); -+ error = do_truncate(dentry, length, 0); - out_putf: - fput(file); - out: -@@ -260,11 +272,13 @@ - struct inode * inode; - struct iattr newattrs; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, NULL); - if (error) - goto out; - inode = nd.dentry->d_inode; - -+ /* this is safe without a Lustre lock because it only depends -+ on the super block */ - error = -EROFS; - if (IS_RDONLY(inode)) - goto dput_and_out; -@@ -279,11 +293,25 @@ - goto dput_and_out; - - newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; -- } else { -+ } -+ -+ if (inode->i_op->setattr_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto dput_and_out; -+ } -+ -+ error = -EPERM; -+ if (!times) { - if (current->fsuid != inode->i_uid && - (error = permission(inode,MAY_WRITE)) != 0) - goto dput_and_out; - } -+ - error = notify_change(nd.dentry, &newattrs); - dput_and_out: - path_release(&nd); -@@ -304,12 +332,14 @@ - struct inode * inode; - struct iattr newattrs; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, NULL); - - if (error) - goto out; - inode = nd.dentry->d_inode; - -+ /* this is safe without a Lustre 
lock because it only depends -+ on the super block */ - error = -EROFS; - if (IS_RDONLY(inode)) - goto dput_and_out; -@@ -324,7 +354,20 @@ - newattrs.ia_atime = times[0].tv_sec; - newattrs.ia_mtime = times[1].tv_sec; - newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; -- } else { -+ } -+ -+ if (inode->i_op->setattr_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto dput_and_out; -+ } -+ -+ error = -EPERM; -+ if (!utimes) { - if (current->fsuid != inode->i_uid && - (error = permission(inode,MAY_WRITE)) != 0) - goto dput_and_out; -@@ -347,6 +390,7 @@ - int old_fsuid, old_fsgid; - kernel_cap_t old_cap; - int res; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ - return -EINVAL; -@@ -364,13 +408,14 @@ - else - current->cap_effective = current->cap_permitted; - -- res = user_path_walk(filename, &nd); -+ res = user_path_walk_it(filename, &nd, &it); - if (!res) { - res = permission(nd.dentry->d_inode, mode); - /* SuS v2 requires we report a read only fs too */ - if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) - && !special_file(nd.dentry->d_inode->i_mode)) - res = -EROFS; -+ intent_release(&it); - path_release(&nd); - } - -@@ -386,6 +431,7 @@ - int error; - struct nameidata nd; - char *name; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - name = getname(filename); - error = PTR_ERR(name); -@@ -394,7 +440,7 @@ - - error = 0; - if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd)) -- error = path_walk(name, &nd); -+ error = path_walk_it(name, &nd, &it); - putname(name); - if (error) - goto out; -@@ -406,6 +452,7 @@ - set_fs_pwd(current->fs, nd.mnt, nd.dentry); - - dput_and_out: -+ intent_release(&it); - path_release(&nd); - out: - return error; -@@ -446,6 +493,7 @@ - int 
error; - struct nameidata nd; - char *name; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - name = getname(filename); - error = PTR_ERR(name); -@@ -454,7 +502,7 @@ - - path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW | - LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); -- error = path_walk(name, &nd); -+ error = path_walk_it(name, &nd, &it); - putname(name); - if (error) - goto out; -@@ -471,39 +519,56 @@ - set_fs_altroot(); - error = 0; - dput_and_out: -+ intent_release(&it); - path_release(&nd); - out: - return error; - } - --asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) -+int chmod_common(struct dentry *dentry, mode_t mode) - { -- struct inode * inode; -- struct dentry * dentry; -- struct file * file; -- int err = -EBADF; -+ struct inode *inode = dentry->d_inode; - struct iattr newattrs; -+ int err = -EROFS; - -- file = fget(fd); -- if (!file) -+ if (IS_RDONLY(inode)) - goto out; - -- dentry = file->f_dentry; -- inode = dentry->d_inode; -+ if (inode->i_op->setattr_raw) { -+ newattrs.ia_mode = mode; -+ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; -+ newattrs.ia_valid |= ATTR_RAW; -+ err = inode->i_op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (err != -EOPNOTSUPP) -+ goto out; -+ } - -- err = -EROFS; -- if (IS_RDONLY(inode)) -- goto out_putf; - err = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -- goto out_putf; -+ goto out; -+ - if (mode == (mode_t) -1) - mode = inode->i_mode; - newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); - newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - err = notify_change(dentry, &newattrs); - --out_putf: -+out: -+ return err; -+} -+ -+asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) -+{ -+ struct file * file; -+ int err = -EBADF; -+ -+ file = fget(fd); -+ if (!file) -+ goto out; -+ -+ err = chmod_common(file->f_dentry, mode); -+ - fput(file); - out: - return err; -@@ -512,30 +577,14 @@ - asmlinkage long sys_chmod(const char * filename, 
mode_t mode) - { - struct nameidata nd; -- struct inode * inode; - int error; -- struct iattr newattrs; - - error = user_path_walk(filename, &nd); - if (error) - goto out; -- inode = nd.dentry->d_inode; -- -- error = -EROFS; -- if (IS_RDONLY(inode)) -- goto dput_and_out; - -- error = -EPERM; -- if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -- goto dput_and_out; -+ error = chmod_common(nd.dentry, mode); - -- if (mode == (mode_t) -1) -- mode = inode->i_mode; -- newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); -- newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; -- error = notify_change(nd.dentry, &newattrs); -- --dput_and_out: - path_release(&nd); - out: - return error; -@@ -555,6 +604,20 @@ - error = -EROFS; - if (IS_RDONLY(inode)) - goto out; -+ -+ if (inode->i_op->setattr_raw) { -+ struct inode_operations *op = dentry->d_inode->i_op; -+ -+ newattrs.ia_uid = user; -+ newattrs.ia_gid = group; -+ newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_CTIME; -+ newattrs.ia_valid |= ATTR_RAW; -+ error = op->setattr_raw(inode, &newattrs); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ return error; -+ } -+ - error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out; -@@ -659,6 +722,7 @@ - { - int namei_flags, error; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_OPEN }; - - namei_flags = flags; - if ((namei_flags+1) & O_ACCMODE) -@@ -666,14 +730,15 @@ - if (namei_flags & O_TRUNC) - namei_flags |= 2; - -- error = open_namei(filename, namei_flags, mode, &nd); -- if (!error) -- return dentry_open(nd.dentry, nd.mnt, flags); -+ error = open_namei_it(filename, namei_flags, mode, &nd, &it); -+ if (error) -+ return ERR_PTR(error); - -- return ERR_PTR(error); -+ return dentry_open_it(nd.dentry, nd.mnt, flags, &it); - } - --struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) -+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct 
lookup_intent *it) - { - struct file * f; - struct inode *inode; -@@ -710,7 +775,9 @@ - } - - if (f->f_op && f->f_op->open) { -+ f->f_it = it; - error = f->f_op->open(inode,f); -+ f->f_it = NULL; - if (error) - goto cleanup_all; - } -@@ -722,6 +789,7 @@ - !inode->i_mapping->a_ops->direct_IO)) - goto cleanup_all; - -+ intent_release(it); - return f; - - cleanup_all: -@@ -736,11 +804,17 @@ - cleanup_file: - put_filp(f); - cleanup_dentry: -+ intent_release(it); - dput(dentry); - mntput(mnt); - return ERR_PTR(error); - } - -+struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) -+{ -+ return dentry_open_it(dentry, mnt, flags, NULL); -+} -+ - /* - * Find an empty file descriptor entry, and mark it busy. - */ -Index: linux-2.4.19.SuSE/fs/stat.c -=================================================================== ---- linux-2.4.19.SuSE.orig/fs/stat.c Mon Jan 27 05:08:00 2003 -+++ linux-2.4.19.SuSE/fs/stat.c Sat Nov 15 17:29:03 2003 -@@ -17,10 +17,16 @@ - * Revalidate the inode. This is required for proper NFS attribute caching. 
- */ - static __inline__ int --do_revalidate(struct dentry *dentry) -+do_revalidate(struct dentry *dentry, struct lookup_intent *it) - { - struct inode * inode = dentry->d_inode; -- if (inode->i_op && inode->i_op->revalidate) -+ if (inode->i_op && inode->i_op->revalidate_it) -+ return inode->i_op->revalidate_it(dentry, it); -+ else if (inode->i_op && inode->i_op->revalidate) - return inode->i_op->revalidate(dentry); - return 0; - } -@@ -141,13 +145,15 @@ - asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_old_stat(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -157,13 +163,15 @@ - asmlinkage long sys_newstat(char * filename, struct stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_new_stat(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -178,13 +186,15 @@ - asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_old_stat(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -195,13 
+205,15 @@ - asmlinkage long sys_newlstat(char * filename, struct stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_new_stat(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -222,7 +234,7 @@ - if (f) { - struct dentry * dentry = f->f_dentry; - -- err = do_revalidate(dentry); -+ err = do_revalidate(dentry, NULL); - if (!err) - err = cp_old_stat(dentry->d_inode, statbuf); - fput(f); -@@ -241,7 +253,7 @@ - if (f) { - struct dentry * dentry = f->f_dentry; - -- err = do_revalidate(dentry); -+ err = do_revalidate(dentry, NULL); - if (!err) - err = cp_new_stat(dentry->d_inode, statbuf); - fput(f); -@@ -263,7 +275,7 @@ - - error = -EINVAL; - if (inode->i_op && inode->i_op->readlink && -- !(error = do_revalidate(nd.dentry))) { -+ !(error = do_revalidate(nd.dentry, NULL))) { - UPDATE_ATIME(inode); - error = inode->i_op->readlink(nd.dentry, buf, bufsiz); - } -@@ -339,12 +351,14 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = cp_new_stat64(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -354,12 +368,14 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { -- error = do_revalidate(nd.dentry); -+ error = do_revalidate(nd.dentry, &it); - if (!error) - error = 
cp_new_stat64(nd.dentry->d_inode, statbuf); -+ intent_release(&it); - path_release(&nd); - } - return error; -@@ -374,7 +390,7 @@ - if (f) { - struct dentry * dentry = f->f_dentry; - -- err = do_revalidate(dentry); -+ err = do_revalidate(dentry, NULL); - if (!err) - err = cp_new_stat64(dentry->d_inode, statbuf); - fput(f); -Index: linux-2.4.19.SuSE/include/linux/dcache.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/dcache.h Mon Jan 27 05:13:15 2003 -+++ linux-2.4.19.SuSE/include/linux/dcache.h Sat Nov 15 17:35:46 2003 -@@ -5,6 +5,51 @@ - - #include - #include -+#include -+ -+#define IT_OPEN 0x0001 -+#define IT_CREAT 0x0002 -+#define IT_READDIR 0x0004 -+#define IT_GETATTR 0x0008 -+#define IT_LOOKUP 0x0010 -+#define IT_UNLINK 0x0020 -+#define IT_GETXATTR 0x0040 -+#define IT_EXEC 0x0080 -+#define IT_PIN 0x0100 -+ -+#define IT_FL_LOCKED 0x0001 -+#define IT_FL_FOLLOWED 0x0002 /* set by vfs_follow_link */ -+ -+#define INTENT_MAGIC 0x19620323 -+ -+ -+struct lustre_intent_data { -+ int it_disposition; -+ int it_status; -+ __u64 it_lock_handle; -+ void *it_data; -+ int it_lock_mode; -+ int it_int_flags; -+}; -+struct lookup_intent { -+ int it_magic; -+ void (*it_op_release)(struct lookup_intent *); -+ int it_op; -+ int it_flags; -+ int it_create_mode; -+ union { -+ struct lustre_intent_data lustre; -+ } d; -+}; -+ -+static inline void intent_init(struct lookup_intent *it, int op, int flags) -+{ -+ memset(it, 0, sizeof(*it)); -+ it->it_magic = INTENT_MAGIC; -+ it->it_op = op; -+ it->it_flags = flags; -+} -+ - - /* - * linux/include/linux/dcache.h -@@ -92,8 +137,22 @@ - int (*d_delete)(struct dentry *); - void (*d_release)(struct dentry *); - void (*d_iput)(struct dentry *, struct inode *); -+ int (*d_revalidate_it)(struct dentry *, int, struct lookup_intent *); -+ void (*d_pin)(struct dentry *, struct vfsmount * , int); -+ void (*d_unpin)(struct dentry *, struct vfsmount *, int); - }; - -+#define 
PIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_pin) \ -+ de->d_op->d_pin(de, mnt, flag); -+#define UNPIN(de,mnt,flag) if (de && de->d_op && de->d_op->d_unpin) \ -+ de->d_op->d_unpin(de, mnt, flag); -+ -+ -+/* defined in fs/namei.c */ -+extern void intent_release(struct lookup_intent *it); -+/* defined in fs/dcache.c */ -+extern void __d_rehash(struct dentry * entry, int lock); -+ - /* the dentry parameter passed to d_hash and d_compare is the parent - * directory of the entries to be compared. It is used in case these - * functions need any directory specific information for determining -@@ -125,6 +184,7 @@ - * s_nfsd_free_path semaphore will be down - */ - #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ -+#define DCACHE_LUSTRE_INVALID 0x0010 /* Lustre invalidated */ - - extern spinlock_t dcache_lock; - -Index: linux-2.4.19.SuSE/include/linux/fs.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/fs.h Sat Nov 15 17:25:06 2003 -+++ linux-2.4.19.SuSE/include/linux/fs.h Sat Nov 15 17:29:03 2003 -@@ -73,6 +73,7 @@ - - #define FMODE_READ 1 - #define FMODE_WRITE 2 -+#define FMODE_EXEC 4 - - #define READ 0 - #define WRITE 1 -@@ -363,6 +364,9 @@ - #define ATTR_MTIME_SET 256 - #define ATTR_FORCE 512 /* Not a change, but a change it */ - #define ATTR_ATTR_FLAG 1024 -+#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ -+#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ -+#define ATTR_CTIME_SET 0x2000 - - /* - * This is the Inode Attributes structure, used for notify_change(). 
It -@@ -507,6 +511,7 @@ - struct pipe_inode_info *i_pipe; - struct block_device *i_bdev; - struct char_device *i_cdev; -+ void *i_filterdata; - - unsigned long i_dnotify_mask; /* Directory notify events */ - struct dnotify_struct *i_dnotify; /* for directory notifications */ -@@ -669,6 +674,7 @@ - - /* needed for tty driver, and maybe others */ - void *private_data; -+ struct lookup_intent *f_it; - - /* preallocated helper kiobuf to speedup O_DIRECT */ - struct kiobuf *f_iobuf; -@@ -799,6 +805,7 @@ - struct qstr last; - unsigned int flags; - int last_type; -+ struct lookup_intent *intent; - }; - - #define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */ -@@ -947,7 +954,8 @@ - extern int __vfs_rmdir(struct inode *, struct dentry *); - extern int vfs_rmdir(struct inode *, struct dentry *); - extern int vfs_unlink(struct inode *, struct dentry *); --extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, -+ struct inode *new_dir, struct dentry *new_dentry); - - /* - * File types -@@ -1020,21 +1028,32 @@ - - struct inode_operations { - int (*create) (struct inode *,struct dentry *,int); -+ int (*create_it) (struct inode *,struct dentry *,int, struct lookup_intent *); - struct dentry * (*lookup) (struct inode *,struct dentry *); -+ struct dentry * (*lookup_it) (struct inode *,struct dentry *, struct lookup_intent *, int flags); - int (*link) (struct dentry *,struct inode *,struct dentry *); -+ int (*link_raw) (struct nameidata *,struct nameidata *); - int (*unlink) (struct inode *,struct dentry *); -+ int (*unlink_raw) (struct nameidata *); - int (*symlink) (struct inode *,struct dentry *,const char *); -+ int (*symlink_raw) (struct nameidata *,const char *); - int (*mkdir) (struct inode *,struct dentry *,int); -+ int (*mkdir_raw) (struct nameidata *,int); - int (*rmdir) (struct inode *,struct dentry *); -+ int (*rmdir_raw) (struct nameidata *); - int (*mknod) (struct 
inode *,struct dentry *,int,int); -+ int (*mknod_raw) (struct nameidata *,int,dev_t); - int (*rename) (struct inode *, struct dentry *, - struct inode *, struct dentry *); -+ int (*rename_raw) (struct nameidata *, struct nameidata *); - int (*readlink) (struct dentry *, char *,int); - int (*follow_link) (struct dentry *, struct nameidata *); - void (*truncate) (struct inode *); - int (*permission) (struct inode *, int); - int (*revalidate) (struct dentry *); -+ int (*revalidate_it) (struct dentry *, struct lookup_intent *); - int (*setattr) (struct dentry *, struct iattr *); -+ int (*setattr_raw) (struct inode *, struct iattr *); - int (*getattr) (struct dentry *, struct iattr *); - int (*setxattr) (struct dentry *, const char *, const void *, size_t, int); - ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); -@@ -1244,10 +1263,14 @@ - - asmlinkage long sys_open(const char *, int, int); - asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */ --extern int do_truncate(struct dentry *, loff_t start); -+extern int do_truncate(struct dentry *, loff_t start, int called_from_open); - - extern struct file *filp_open(const char *, int, int); - extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); -+extern int open_namei_it(const char *filename, int namei_flags, int mode, -+ struct nameidata *nd, struct lookup_intent *it); -+extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct lookup_intent *it); - extern int filp_close(struct file *, fl_owner_t id); - extern char * getname(const char *); - -@@ -1515,6 +1538,7 @@ - extern loff_t default_llseek(struct file *file, loff_t offset, int origin); - - extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); -+extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it)); - extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); - extern int 
FASTCALL(path_walk(const char *, struct nameidata *)); - extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); -@@ -1526,6 +1550,8 @@ - extern struct dentry * lookup_hash(struct qstr *, struct dentry *); - #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) - #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) -+#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it) -+#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it) - - extern void iput(struct inode *); - extern void force_delete(struct inode *); -@@ -1646,6 +1672,8 @@ - - extern int vfs_readlink(struct dentry *, char *, int, const char *); - extern int vfs_follow_link(struct nameidata *, const char *); -+extern int vfs_follow_link_it(struct nameidata *, const char *, -+ struct lookup_intent *it); - extern int page_readlink(struct dentry *, char *, int); - extern int page_follow_link(struct dentry *, struct nameidata *); - extern struct inode_operations page_symlink_inode_operations; -Index: linux-2.4.19.SuSE/include/linux/fs_struct.h -=================================================================== ---- linux-2.4.19.SuSE.orig/include/linux/fs_struct.h Fri Jul 13 15:10:44 2001 -+++ linux-2.4.19.SuSE/include/linux/fs_struct.h Sat Nov 15 17:29:03 2003 -@@ -34,10 +34,12 @@ - write_lock(&fs->lock); - old_root = fs->root; - old_rootmnt = fs->rootmnt; -+ PIN(dentry, mnt, 1); - fs->rootmnt = mntget(mnt); - fs->root = dget(dentry); - write_unlock(&fs->lock); - if (old_root) { -+ UNPIN(old_root, old_rootmnt, 1); - dput(old_root); - mntput(old_rootmnt); - } -@@ -57,10 +59,12 @@ - write_lock(&fs->lock); - old_pwd = fs->pwd; - old_pwdmnt = fs->pwdmnt; -+ PIN(dentry, mnt, 0); - fs->pwdmnt = mntget(mnt); - fs->pwd = dget(dentry); - write_unlock(&fs->lock); - if (old_pwd) { -+ UNPIN(old_pwd, old_pwdmnt, 0); - dput(old_pwd); - mntput(old_pwdmnt); - } -Index: 
linux-2.4.19.SuSE/kernel/exit.c -=================================================================== ---- linux-2.4.19.SuSE.orig/kernel/exit.c Mon Jan 27 05:08:16 2003 -+++ linux-2.4.19.SuSE/kernel/exit.c Sat Nov 15 17:29:03 2003 -@@ -288,11 +288,14 @@ - { - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { -+ UNPIN(fs->pwd, fs->pwdmnt, 0); -+ UNPIN(fs->root, fs->rootmnt, 1); - dput(fs->root); - mntput(fs->rootmnt); - dput(fs->pwd); - mntput(fs->pwdmnt); - if (fs->altroot) { -+ UNPIN(fs->altroot, fs->altrootmnt, 1); - dput(fs->altroot); - mntput(fs->altrootmnt); - } -Index: linux-2.4.19.SuSE/kernel/fork.c -=================================================================== ---- linux-2.4.19.SuSE.orig/kernel/fork.c Mon Jan 27 05:08:56 2003 -+++ linux-2.4.19.SuSE/kernel/fork.c Sat Nov 15 17:29:03 2003 -@@ -454,10 +454,13 @@ - fs->umask = old->umask; - read_lock(&old->lock); - fs->rootmnt = mntget(old->rootmnt); -+ PIN(old->pwd, old->pwdmnt, 0); -+ PIN(old->root, old->rootmnt, 1); - fs->root = dget(old->root); - fs->pwdmnt = mntget(old->pwdmnt); - fs->pwd = dget(old->pwd); - if (old->altroot) { -+ PIN(old->altroot, old->altrootmnt, 1); - fs->altrootmnt = mntget(old->altrootmnt); - fs->altroot = dget(old->altroot); - } else { -Index: linux-2.4.19.SuSE/kernel/ksyms.c -=================================================================== ---- linux-2.4.19.SuSE.orig/kernel/ksyms.c Sat Nov 15 17:24:46 2003 -+++ linux-2.4.19.SuSE/kernel/ksyms.c Sat Nov 15 17:29:03 2003 -@@ -315,6 +315,7 @@ - EXPORT_SYMBOL(set_page_dirty); - EXPORT_SYMBOL(vfs_readlink); - EXPORT_SYMBOL(vfs_follow_link); -+EXPORT_SYMBOL(vfs_follow_link_it); - EXPORT_SYMBOL(page_readlink); - EXPORT_SYMBOL(page_follow_link); - EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/lustre/kernel_patches/patches/vfs_intent-2.6-rhel4.patch b/lustre/kernel_patches/patches/vfs_intent-2.6-rhel4.patch index 8ffead7..75d587e 100644 --- 
a/lustre/kernel_patches/patches/vfs_intent-2.6-rhel4.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.6-rhel4.patch @@ -1,8 +1,8 @@ -Index: linux-2.6.9-5.0.3.EL/fs/exec.c +Index: linux-2.6.9-22.0.2.EL/fs/exec.c =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/fs/exec.c 2005-02-26 14:28:01.000000000 +0200 -+++ linux-2.6.9-5.0.3.EL/fs/exec.c 2005-02-26 23:29:02.000000000 +0200 -@@ -124,9 +124,10 @@ +--- linux-2.6.9-22.0.2.EL.orig/fs/exec.c 2006-01-25 23:40:13.000000000 -0500 ++++ linux-2.6.9-22.0.2.EL/fs/exec.c 2006-01-26 18:46:33.000000000 -0500 +@@ -125,9 +125,10 @@ struct file * file; struct nameidata nd; int error; @@ -15,7 +15,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/exec.c if (error) goto out; -@@ -138,7 +139,7 @@ +@@ -139,7 +140,7 @@ if (error) goto exit; @@ -24,7 +24,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/exec.c error = PTR_ERR(file); if (IS_ERR(file)) goto out; -@@ -487,8 +488,9 @@ +@@ -488,8 +489,9 @@ int err; struct file *file; @@ -36,7 +36,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/exec.c file = ERR_PTR(err); if (!err) { -@@ -501,7 +503,7 @@ +@@ -502,7 +504,7 @@ err = -EACCES; file = ERR_PTR(err); if (!err) { @@ -45,11 +45,11 @@ Index: linux-2.6.9-5.0.3.EL/fs/exec.c if (!IS_ERR(file)) { err = deny_write_access(file); if (err) { -Index: linux-2.6.9-5.0.3.EL/fs/namei.c +Index: linux-2.6.9-22.0.2.EL/fs/namei.c =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/fs/namei.c 2005-02-26 14:28:01.000000000 +0200 -+++ linux-2.6.9-5.0.3.EL/fs/namei.c 2005-04-01 18:15:29.743029208 +0300 -@@ -272,8 +272,19 @@ +--- linux-2.6.9-22.0.2.EL.orig/fs/namei.c 2006-01-25 23:40:13.000000000 -0500 ++++ linux-2.6.9-22.0.2.EL/fs/namei.c 2006-01-26 18:46:33.000000000 -0500 +@@ -274,8 +274,19 @@ return 0; } @@ -69,7 +69,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c dput(nd->dentry); mntput(nd->mnt); } -@@ -363,7 +374,10 @@ +@@ -367,7 +378,10 @@ { struct dentry * result; struct inode *dir = 
parent->d_inode; @@ -80,7 +80,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c down(&dir->i_sem); /* * First re-do the cached lookup just in case it was created -@@ -402,7 +416,10 @@ +@@ -406,7 +420,10 @@ if (result->d_op && result->d_op->d_revalidate) { if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) { dput(result); @@ -92,7 +92,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c } } return result; -@@ -432,7 +449,9 @@ +@@ -436,7 +453,9 @@ static inline int __vfs_follow_link(struct nameidata *nd, const char *link) { int res = 0; @@ -102,7 +102,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c if (IS_ERR(link)) goto fail; -@@ -442,6 +461,9 @@ +@@ -446,6 +465,9 @@ /* weird __emul_prefix() stuff did it */ goto out; } @@ -112,7 +112,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c res = link_path_walk(link, nd); out: if (nd->depth || res || nd->last_type!=LAST_NORM) -@@ -650,6 +672,33 @@ +@@ -654,6 +676,33 @@ return PTR_ERR(dentry); } @@ -145,8 +145,8 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c + /* * Name resolution. 
- * -@@ -751,8 +800,12 @@ + * This is the basic name resolution function, turning a pathname into +@@ -755,8 +804,12 @@ goto out_dput; if (inode->i_op->follow_link) { @@ -159,7 +159,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c dput(next.dentry); mntput(next.mnt); if (err) -@@ -791,14 +844,34 @@ +@@ -795,14 +848,34 @@ inode = nd->dentry->d_inode; /* fallthrough */ case 1: @@ -174,8 +174,8 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c + } + if (lookup_flags & LOOKUP_DIRECTORY) { + err = -ENOTDIR; -+ if(!nd->dentry->d_inode->i_op || -+ !nd->dentry->d_inode->i_op->lookup) { ++ if (!nd->dentry->d_inode->i_op || ++ !nd->dentry->d_inode->i_op->lookup){ + path_release(nd); + goto return_err; + } @@ -194,7 +194,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c if (err) break; follow_mount(&next.mnt, &next.dentry); -@@ -1016,7 +1089,7 @@ +@@ -1053,7 +1126,7 @@ } /* SMP-safe */ @@ -203,7 +203,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c { unsigned long hash; struct qstr this; -@@ -1036,11 +1109,16 @@ +@@ -1073,11 +1146,16 @@ } this.hash = end_name_hash(hash); @@ -221,7 +221,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c /* * namei() * -@@ -1052,7 +1130,7 @@ +@@ -1089,7 +1167,7 @@ * that namei follows links, while lnamei does not. * SMP-safe */ @@ -230,7 +230,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c { char *tmp = getname(name); int err = PTR_ERR(tmp); -@@ -1064,6 +1142,12 @@ +@@ -1101,6 +1179,12 @@ return err; } @@ -243,7 +243,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c /* * It's inline, so penalty for filesystems that don't use sticky bit is * minimal. -@@ -1347,8 +1431,8 @@ +@@ -1387,8 +1471,8 @@ acc_mode |= MAY_APPEND; /* Fill in the open() intent data */ @@ -254,7 +254,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c /* * The simplest case - just a plain lookup. -@@ -1363,6 +1447,7 @@ +@@ -1403,6 +1487,7 @@ /* * Create - we need to know the parent. 
*/ @@ -262,7 +262,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd); if (error) return error; -@@ -1379,7 +1464,9 @@ +@@ -1419,7 +1504,9 @@ dir = nd->dentry; nd->flags &= ~LOOKUP_PARENT; down(&dir->d_inode->i_sem); @@ -272,20 +272,20 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c do_last: error = PTR_ERR(dentry); -@@ -1492,7 +1579,9 @@ +@@ -1532,7 +1619,9 @@ } dir = nd->dentry; down(&dir->d_inode->i_sem); + nd->flags |= LOOKUP_LAST; dentry = __lookup_hash(&nd->last, nd->dentry, nd); + nd->flags &= ~LOOKUP_LAST; - putname(nd->last.name); + __putname(nd->last.name); goto do_last; } -Index: linux-2.6.9-5.0.3.EL/fs/namespace.c +Index: linux-2.6.9-22.0.2.EL/fs/namespace.c =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/fs/namespace.c 2005-02-26 14:28:01.000000000 +0200 -+++ linux-2.6.9-5.0.3.EL/fs/namespace.c 2005-02-26 23:29:02.000000000 +0200 +--- linux-2.6.9-22.0.2.EL.orig/fs/namespace.c 2006-01-25 23:40:13.000000000 -0500 ++++ linux-2.6.9-22.0.2.EL/fs/namespace.c 2006-01-26 18:46:32.000000000 -0500 @@ -61,6 +61,7 @@ INIT_LIST_HEAD(&mnt->mnt_mounts); INIT_LIST_HEAD(&mnt->mnt_list); @@ -306,9 +306,9 @@ Index: linux-2.6.9-5.0.3.EL/fs/namespace.c { struct super_block *sb = mnt->mnt_sb; dput(mnt->mnt_root); -+ spin_lock(&dcache_lock); -+ list_del(&mnt->mnt_lustre_list); -+ spin_unlock(&dcache_lock); ++ spin_lock(&dcache_lock); ++ list_del(&mnt->mnt_lustre_list); ++ spin_unlock(&dcache_lock); free_vfsmnt(mnt); deactivate_super(sb); } @@ -317,7 +317,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namespace.c lock_kernel(); + if (sb->s_op->umount_lustre) -+ sb->s_op->umount_lustre(sb); ++ sb->s_op->umount_lustre(sb); if( (flags&MNT_FORCE) && sb->s_op->umount_begin) sb->s_op->umount_begin(sb); unlock_kernel(); @@ -345,11 +345,11 @@ Index: linux-2.6.9-5.0.3.EL/fs/namespace.c /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; -Index: 
linux-2.6.9-5.0.3.EL/fs/open.c +Index: linux-2.6.9-22.0.2.EL/fs/open.c =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/fs/open.c 2005-02-26 14:28:01.000000000 +0200 -+++ linux-2.6.9-5.0.3.EL/fs/open.c 2005-02-26 23:29:02.000000000 +0200 -@@ -215,12 +215,12 @@ +--- linux-2.6.9-22.0.2.EL.orig/fs/open.c 2006-01-25 23:40:13.000000000 -0500 ++++ linux-2.6.9-22.0.2.EL/fs/open.c 2006-01-26 18:46:33.000000000 -0500 +@@ -216,12 +216,12 @@ struct nameidata nd; struct inode * inode; int error; @@ -364,7 +364,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/open.c if (error) goto out; inode = nd.dentry->d_inode; -@@ -474,6 +474,7 @@ +@@ -475,6 +475,7 @@ int old_fsuid, old_fsgid; kernel_cap_t old_cap; int res; @@ -372,7 +372,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/open.c if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; -@@ -498,13 +499,14 @@ +@@ -499,13 +500,14 @@ else current->cap_effective = current->cap_permitted; @@ -388,7 +388,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/open.c path_release(&nd); } -@@ -519,8 +521,9 @@ +@@ -520,8 +522,9 @@ { struct nameidata nd; int error; @@ -399,7 +399,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/open.c if (error) goto out; -@@ -572,8 +575,9 @@ +@@ -573,8 +576,9 @@ { struct nameidata nd; int error; @@ -410,7 +410,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/open.c if (error) goto out; -@@ -754,27 +758,8 @@ +@@ -755,27 +759,8 @@ * for the internal routines (ie open_namei()/follow_link() etc). 00 is * used by symlinks. 
*/ @@ -440,7 +440,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/open.c { struct file * f; struct inode *inode; -@@ -786,6 +771,7 @@ +@@ -787,6 +772,7 @@ goto cleanup_dentry; f->f_flags = flags; f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; @@ -448,7 +448,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/open.c inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { error = get_write_access(inode); -@@ -804,6 +790,7 @@ +@@ -805,6 +791,7 @@ error = f->f_op->open(inode,f); if (error) goto cleanup_all; @@ -456,7 +456,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/open.c } f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); -@@ -829,6 +816,7 @@ +@@ -830,6 +817,7 @@ cleanup_file: put_filp(f); cleanup_dentry: @@ -464,7 +464,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/open.c dput(dentry); mntput(mnt); return ERR_PTR(error); -@@ -836,6 +824,36 @@ +@@ -837,6 +825,36 @@ EXPORT_SYMBOL(dentry_open); @@ -501,10 +501,10 @@ Index: linux-2.6.9-5.0.3.EL/fs/open.c /* * Find an empty file descriptor entry, and mark it busy. 
*/ -Index: linux-2.6.9-5.0.3.EL/fs/stat.c +Index: linux-2.6.9-22.0.2.EL/fs/stat.c =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/fs/stat.c 2005-02-26 14:28:01.000000000 +0200 -+++ linux-2.6.9-5.0.3.EL/fs/stat.c 2005-02-26 23:29:02.000000000 +0200 +--- linux-2.6.9-22.0.2.EL.orig/fs/stat.c 2006-01-25 23:40:13.000000000 -0500 ++++ linux-2.6.9-22.0.2.EL/fs/stat.c 2006-01-25 23:40:55.000000000 -0500 @@ -37,7 +37,7 @@ EXPORT_SYMBOL(generic_fillattr); @@ -574,11 +574,11 @@ Index: linux-2.6.9-5.0.3.EL/fs/stat.c fput(f); } return error; -Index: linux-2.6.9-5.0.3.EL/fs/nfs/dir.c +Index: linux-2.6.9-22.0.2.EL/fs/nfs/dir.c =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/fs/nfs/dir.c 2005-02-26 14:28:01.000000000 +0200 -+++ linux-2.6.9-5.0.3.EL/fs/nfs/dir.c 2005-04-01 18:10:28.924760536 +0300 -@@ -718,7 +718,7 @@ +--- linux-2.6.9-22.0.2.EL.orig/fs/nfs/dir.c 2006-01-25 23:40:13.000000000 -0500 ++++ linux-2.6.9-22.0.2.EL/fs/nfs/dir.c 2006-01-26 18:46:32.000000000 -0500 +@@ -726,7 +726,7 @@ return 0; if (!nd || (nd->flags & LOOKUP_CONTINUE) || !(nd->flags & LOOKUP_CREATE)) return 0; @@ -587,7 +587,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/nfs/dir.c } static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) -@@ -1043,7 +1043,7 @@ +@@ -1057,7 +1057,7 @@ attr.ia_valid = ATTR_MODE; if (nd && (nd->flags & LOOKUP_CREATE)) @@ -596,11 +596,11 @@ Index: linux-2.6.9-5.0.3.EL/fs/nfs/dir.c /* * The 0 argument passed into the create function should one day -Index: linux-2.6.9-5.0.3.EL/fs/inode.c +Index: linux-2.6.9-22.0.2.EL/fs/inode.c =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/fs/inode.c 2005-02-26 14:28:01.000000000 +0200 -+++ linux-2.6.9-5.0.3.EL/fs/inode.c 2005-02-26 23:29:02.000000000 +0200 -@@ -233,6 +233,7 @@ +--- linux-2.6.9-22.0.2.EL.orig/fs/inode.c 2006-01-25 23:40:13.000000000 -0500 ++++ 
linux-2.6.9-22.0.2.EL/fs/inode.c 2006-01-25 23:40:55.000000000 -0500 +@@ -235,6 +235,7 @@ inodes_stat.nr_unused--; } @@ -608,10 +608,10 @@ Index: linux-2.6.9-5.0.3.EL/fs/inode.c /** * clear_inode - clear an inode * @inode: inode to clear -Index: linux-2.6.9-5.0.3.EL/include/linux/dcache.h +Index: linux-2.6.9-22.0.2.EL/include/linux/dcache.h =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/include/linux/dcache.h 2005-02-26 14:28:01.000000000 +0200 -+++ linux-2.6.9-5.0.3.EL/include/linux/dcache.h 2005-02-26 23:29:02.000000000 +0200 +--- linux-2.6.9-22.0.2.EL.orig/include/linux/dcache.h 2006-01-25 23:40:13.000000000 -0500 ++++ linux-2.6.9-22.0.2.EL/include/linux/dcache.h 2006-01-26 18:46:32.000000000 -0500 @@ -4,6 +4,7 @@ #ifdef __KERNEL__ @@ -629,11 +629,11 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/dcache.h struct dentry_stat_t { int nr_dentry; int nr_unused; -Index: linux-2.6.9-5.0.3.EL/include/linux/fs.h +Index: linux-2.6.9-22.0.2.EL/include/linux/fs.h =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/include/linux/fs.h 2005-02-26 14:28:01.000000000 +0200 -+++ linux-2.6.9-5.0.3.EL/include/linux/fs.h 2005-02-26 23:29:02.000000000 +0200 -@@ -74,6 +74,7 @@ +--- linux-2.6.9-22.0.2.EL.orig/include/linux/fs.h 2006-01-25 23:40:13.000000000 -0500 ++++ linux-2.6.9-22.0.2.EL/include/linux/fs.h 2006-01-26 18:46:33.000000000 -0500 +@@ -75,6 +75,7 @@ #define FMODE_READ 1 #define FMODE_WRITE 2 @@ -641,7 +641,7 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/fs.h /* Internal kernel extensions */ #define FMODE_LSEEK 4 -@@ -258,6 +259,8 @@ +@@ -259,6 +260,8 @@ #define ATTR_ATTR_FLAG 1024 #define ATTR_KILL_SUID 2048 #define ATTR_KILL_SGID 4096 @@ -650,7 +650,7 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/fs.h /* * This is the Inode Attributes structure, used for notify_change(). 
It -@@ -456,6 +459,7 @@ +@@ -457,6 +460,7 @@ struct block_device *i_bdev; struct cdev *i_cdev; int i_cindex; @@ -658,7 +658,7 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/fs.h __u32 i_generation; -@@ -589,6 +593,7 @@ +@@ -590,6 +594,7 @@ spinlock_t f_ep_lock; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; @@ -666,7 +666,7 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/fs.h }; extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); -@@ -934,7 +939,9 @@ +@@ -960,7 +965,9 @@ void (*truncate) (struct inode *); int (*permission) (struct inode *, int, struct nameidata *); int (*setattr) (struct dentry *, struct iattr *); @@ -676,7 +676,7 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/fs.h int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); -@@ -974,6 +981,7 @@ +@@ -1000,6 +1007,7 @@ int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); void (*umount_begin) (struct super_block *); @@ -684,7 +684,7 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/fs.h int (*show_options)(struct seq_file *, struct vfsmount *); }; -@@ -1164,6 +1172,7 @@ +@@ -1192,6 +1200,7 @@ extern struct vfsmount *kern_mount(struct file_system_type *); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); @@ -692,7 +692,7 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/fs.h extern long do_mount(char *, char *, char *, unsigned long, void *); extern int vfs_statfs(struct super_block *, struct kstatfs *); -@@ -1228,6 +1237,7 @@ +@@ -1256,6 +1265,7 @@ extern int do_truncate(struct dentry *, loff_t start); extern struct file *filp_open(const char *, int, int); extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); @@ -700,10 +700,10 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/fs.h extern int filp_close(struct file *, fl_owner_t id); extern 
char * getname(const char __user *); -Index: linux-2.6.9-5.0.3.EL/include/linux/namei.h +Index: linux-2.6.9-22.0.2.EL/include/linux/namei.h =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/include/linux/namei.h 2005-02-26 14:28:01.000000000 +0200 -+++ linux-2.6.9-5.0.3.EL/include/linux/namei.h 2005-02-26 23:29:02.000000000 +0200 +--- linux-2.6.9-22.0.2.EL.orig/include/linux/namei.h 2006-01-25 23:40:13.000000000 -0500 ++++ linux-2.6.9-22.0.2.EL/include/linux/namei.h 2006-01-25 23:40:55.000000000 -0500 @@ -2,14 +2,48 @@ #define _LINUX_NAMEI_H @@ -768,16 +768,16 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/namei.h }; /* -@@ -46,6 +77,8 @@ - #define LOOKUP_PARENT 16 +@@ -47,6 +78,8 @@ #define LOOKUP_NOALT 32 #define LOOKUP_ATOMIC 64 -+#define LOOKUP_LAST (0x1000) -+#define LOOKUP_LINK_NOTLAST (0x2000) + #define LOOKUP_REVAL 128 ++#define LOOKUP_LAST (0x1000) ++#define LOOKUP_LINK_NOTLAST (0x2000) /* * Intent data -@@ -55,6 +88,12 @@ +@@ -56,6 +89,12 @@ #define LOOKUP_ACCESS (0x0400) extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *)); @@ -790,7 +790,7 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/namei.h #define user_path_walk(name,nd) \ __user_walk(name, LOOKUP_FOLLOW, nd) #define user_path_walk_link(name,nd) \ -@@ -67,7 +106,6 @@ +@@ -68,7 +107,6 @@ extern struct dentry * lookup_one_len(const char *, struct dentry *, int); extern struct dentry * lookup_hash(struct qstr *, struct dentry *); @@ -798,10 +798,10 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/namei.h extern int follow_down(struct vfsmount **, struct dentry **); extern int follow_up(struct vfsmount **, struct dentry **); -Index: linux-2.6.9-5.0.3.EL/include/linux/mount.h +Index: linux-2.6.9-22.0.2.EL/include/linux/mount.h =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/include/linux/mount.h 2005-02-26 14:28:01.000000000 +0200 -+++ linux-2.6.9-5.0.3.EL/include/linux/mount.h 
2005-02-26 23:29:02.000000000 +0200 +--- linux-2.6.9-22.0.2.EL.orig/include/linux/mount.h 2006-01-25 23:40:13.000000000 -0500 ++++ linux-2.6.9-22.0.2.EL/include/linux/mount.h 2006-01-25 23:40:55.000000000 -0500 @@ -34,6 +34,8 @@ struct list_head mnt_list; struct list_head mnt_fslink; /* link in fs-specific expiry list */ @@ -811,25 +811,3 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/mount.h }; static inline struct vfsmount *mntget(struct vfsmount *mnt) -Index: linux-2.6.9-5.0.3.EL/kernel/exit.c -=================================================================== ---- linux-2.6.9-5.0.3.EL.orig/kernel/exit.c 2005-02-26 14:28:01.000000000 +0200 -+++ linux-2.6.9-5.0.3.EL/kernel/exit.c 2005-02-26 23:29:02.000000000 +0200 -@@ -244,6 +244,8 @@ - write_unlock_irq(&tasklist_lock); - } - -+EXPORT_SYMBOL(reparent_to_init); -+ - void __set_special_pids(pid_t session, pid_t pgrp) - { - struct task_struct *curr = current; -@@ -428,6 +430,8 @@ - __exit_files(tsk); - } - -+EXPORT_SYMBOL(exit_files); -+ - static inline void __put_fs_struct(struct fs_struct *fs) - { - /* No need to hold fs->lock if we are killing it */ diff --git a/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch b/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch index e6e5392..695423b 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch @@ -682,7 +682,7 @@ Index: linux-2.6.5-12.1/include/linux/fs.h spinlock_t f_ep_lock; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; -+ struct lookup_intent *f_it; ++ struct lookup_intent *f_it; }; extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); @@ -853,25 +853,3 @@ Index: linux-2.6.5-12.1/include/linux/fshooks.h #define FSHOOK_END_USER_WALK(type, err, field) ((void)0);} -Index: linux-2.6.5-12.1/kernel/exit.c -=================================================================== ---- linux-2.6.5-12.1.orig/kernel/exit.c 2004-05-10 
12:21:56.000000000 -0400 -+++ linux-2.6.5-12.1/kernel/exit.c 2004-06-03 18:31:28.000000000 -0400 -@@ -260,6 +260,8 @@ - write_unlock_irq(&tasklist_lock); - } - -+EXPORT_SYMBOL(reparent_to_init); -+ - void __set_special_pids(pid_t session, pid_t pgrp) - { - struct task_struct *curr = current; -@@ -429,6 +431,8 @@ - __exit_files(tsk); - } - -+EXPORT_SYMBOL(exit_files); -+ - static inline void __put_fs_struct(struct fs_struct *fs) - { - /* No need to hold fs->lock if we are killing it */ diff --git a/lustre/kernel_patches/patches/vfs_intent-2.6.12.patch b/lustre/kernel_patches/patches/vfs_intent-2.6.12.patch new file mode 100644 index 0000000..80db906 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_intent-2.6.12.patch @@ -0,0 +1,819 @@ +Index: linux-2.6.12.5/fs/exec.c +=================================================================== +--- linux-2.6.12.5.orig/fs/exec.c 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/fs/exec.c 2005-08-17 17:51:44.000000000 +0200 +@@ -122,9 +122,10 @@ + struct file * file; + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_OPEN); + +- nd.intent.open.flags = FMODE_READ; +- error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); ++ nd.intent.it_flags = FMODE_READ|FMODE_EXEC; ++ error = __user_walk_it(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); + if (error) + goto out; + +@@ -136,7 +137,7 @@ + if (error) + goto exit; + +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; +@@ -492,8 +493,9 @@ + int err; + struct file *file; + +- nd.intent.open.flags = FMODE_READ; +- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); ++ intent_init(&nd.intent, IT_OPEN); ++ nd.intent.it_flags = FMODE_READ|FMODE_EXEC; ++ err = path_lookup(name, LOOKUP_FOLLOW, &nd); + file = ERR_PTR(err); + + if (!err) { +@@ -506,7 +508,7 @@ + err = -EACCES; + file = ERR_PTR(err); + if (!err) { +- file = 
dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent); + if (!IS_ERR(file)) { + err = deny_write_access(file); + if (err) { +Index: linux-2.6.12.5/fs/namei.c +=================================================================== +--- linux-2.6.12.5.orig/fs/namei.c 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/fs/namei.c 2005-08-17 17:52:57.000000000 +0200 +@@ -301,8 +301,19 @@ + return 0; + } + ++void intent_release(struct lookup_intent *it) ++{ ++ if (!it) ++ return; ++ if (it->it_magic != INTENT_MAGIC) ++ return; ++ if (it->it_op_release) ++ it->it_op_release(it); ++} ++ + void path_release(struct nameidata *nd) + { ++ intent_release(&nd->intent); + dput(nd->dentry); + mntput(nd->mnt); + } +@@ -392,8 +403,11 @@ + { + struct dentry * result; + struct inode *dir = parent->d_inode; ++ int counter = 0; + + down(&dir->i_sem); ++again: ++ counter++; + /* + * First re-do the cached lookup just in case it was created + * while we waited for the directory semaphore.. +@@ -427,13 +441,16 @@ + * Uhhuh! Nasty case: the cache was re-populated while + * we waited on the semaphore. Need to revalidate. 
+ */ +- up(&dir->i_sem); + if (result->d_op && result->d_op->d_revalidate) { + if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) { + dput(result); +- result = ERR_PTR(-ENOENT); ++ if (counter > 10) ++ result = ERR_PTR(-ESTALE); ++ if (!IS_ERR(result)) ++ goto again; + } + } ++ up(&dir->i_sem); + return result; + } + +@@ -461,7 +478,9 @@ + static inline int __vfs_follow_link(struct nameidata *nd, const char *link) + { + int res = 0; ++ struct lookup_intent it = nd->intent; + char *name; ++ + if (IS_ERR(link)) + goto fail; + +@@ -471,6 +490,9 @@ + /* weird __emul_prefix() stuff did it */ + goto out; + } ++ intent_init(&nd->intent, it.it_op); ++ nd->intent.it_flags = it.it_flags; ++ nd->intent.it_create_mode = it.it_create_mode; + res = link_path_walk(link, nd); + out: + if (nd->depth || res || nd->last_type!=LAST_NORM) +@@ -703,6 +725,33 @@ + return PTR_ERR(dentry); + } + ++static int revalidate_special(struct nameidata *nd) ++{ ++ struct dentry *dentry = nd->dentry; ++ int err, counter = 0; ++ ++ revalidate_again: ++ if (!dentry->d_op || !dentry->d_op->d_revalidate) ++ return 0; ++ if (!dentry->d_op->d_revalidate(dentry, nd)) { ++ struct dentry *new; ++ if ((err = permission(dentry->d_parent->d_inode, MAY_EXEC, nd))) ++ return err; ++ new = real_lookup(dentry->d_parent, &dentry->d_name, nd); ++ if (IS_ERR(new)) ++ return PTR_ERR(new); ++ d_invalidate(dentry); ++ dput(dentry); ++ nd->dentry = dentry = new; ++ counter++; ++ if (counter < 10) ++ goto revalidate_again; ++ printk("excessive revalidate_it loops\n"); ++ return -ESTALE; ++ } ++ return 0; ++} ++ + /* + * Name resolution. 
+ * This is the basic name resolution function, turning a pathname into +@@ -800,7 +849,11 @@ + goto out_dput; + + if (inode->i_op->follow_link) { ++ int save_flags = nd->flags; ++ nd->flags |= LOOKUP_LINK_NOTLAST; + err = do_follow_link(&next, nd); ++ if (!(save_flags & LOOKUP_LINK_NOTLAST)) ++ nd->flags &= ~LOOKUP_LINK_NOTLAST; + if (err) + goto return_err; + err = -ENOENT; +@@ -839,6 +892,23 @@ + inode = nd->dentry->d_inode; + /* fallthrough */ + case 1: ++ nd->flags |= LOOKUP_LAST; ++ err = revalidate_special(nd); ++ nd->flags &= ~LOOKUP_LAST; ++ if (!nd->dentry->d_inode) ++ err = -ENOENT; ++ if (err) { ++ path_release(nd); ++ goto return_err; ++ } ++ if (lookup_flags & LOOKUP_DIRECTORY) { ++ err = -ENOTDIR; ++ if(!nd->dentry->d_inode->i_op || ++ !nd->dentry->d_inode->i_op->lookup) { ++ path_release(nd); ++ goto return_err; ++ } ++ } + goto return_reval; + } + if (nd->dentry->d_op && nd->dentry->d_op->d_hash) { +@@ -846,7 +916,9 @@ + if (err < 0) + break; + } ++ nd->flags |= LOOKUP_LAST; + err = do_lookup(nd, &this, &next); ++ nd->flags &= ~LOOKUP_LAST; + if (err) + break; + inode = next.dentry->d_inode; +@@ -1097,7 +1169,7 @@ + } + + /* SMP-safe */ +-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct nameidata *nd) + { + unsigned long hash; + struct qstr this; +@@ -1117,11 +1189,16 @@ + } + this.hash = end_name_hash(hash); + +- return lookup_hash(&this, base); ++ return __lookup_hash(&this, base, nd); + access: + return ERR_PTR(-EACCES); + } + ++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++{ ++ return lookup_one_len_it(name, base, len, NULL); ++} ++ + /* + * namei() + * +@@ -1133,7 +1210,7 @@ + * that namei follows links, while lnamei does not. 
+ * SMP-safe + */ +-int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) ++int fastcall __user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd) + { + char *tmp = getname(name); + int err = PTR_ERR(tmp); +@@ -1145,6 +1222,12 @@ + return err; + } + ++int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) ++{ ++ intent_init(&nd->intent, IT_LOOKUP); ++ return __user_walk_it(name, flags, nd); ++} ++ + /* + * It's inline, so penalty for filesystems that don't use sticky bit is + * minimal. +@@ -1426,8 +1509,8 @@ + acc_mode |= MAY_APPEND; + + /* Fill in the open() intent data */ +- nd->intent.open.flags = flag; +- nd->intent.open.create_mode = mode; ++ nd->intent.it_flags = flag; ++ nd->intent.it_create_mode = mode; + + /* + * The simplest case - just a plain lookup. +@@ -1442,6 +1525,7 @@ + /* + * Create - we need to know the parent. + */ ++ nd->intent.it_op |= IT_CREAT; + error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd); + if (error) + return error; +@@ -1458,7 +1542,9 @@ + dir = nd->dentry; + nd->flags &= ~LOOKUP_PARENT; + down(&dir->d_inode->i_sem); ++ nd->flags |= LOOKUP_LAST; + path.dentry = __lookup_hash(&nd->last, nd->dentry, nd); ++ nd->flags &= ~LOOKUP_LAST; + path.mnt = nd->mnt; + + do_last: +@@ -1564,7 +1650,9 @@ + } + dir = nd->dentry; + down(&dir->d_inode->i_sem); ++ nd->flags |= LOOKUP_LAST; + path.dentry = __lookup_hash(&nd->last, nd->dentry, nd); ++ nd->flags &= ~LOOKUP_LAST; + path.mnt = nd->mnt; + putname(nd->last.name); + goto do_last; +Index: linux-2.6.12.5/fs/namespace.c +=================================================================== +--- linux-2.6.12.5.orig/fs/namespace.c 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/fs/namespace.c 2005-08-17 17:51:44.000000000 +0200 +@@ -62,6 +62,7 @@ + INIT_LIST_HEAD(&mnt->mnt_mounts); + INIT_LIST_HEAD(&mnt->mnt_list); + INIT_LIST_HEAD(&mnt->mnt_fslink); ++ 
INIT_LIST_HEAD(&mnt->mnt_lustre_list); + if (name) { + int size = strlen(name)+1; + char *newname = kmalloc(size, GFP_KERNEL); +@@ -113,6 +114,7 @@ + + static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd) + { ++ memset(old_nd, 0, sizeof(*old_nd)); + old_nd->dentry = mnt->mnt_mountpoint; + old_nd->mnt = mnt->mnt_parent; + mnt->mnt_parent = mnt; +@@ -176,6 +178,9 @@ + { + struct super_block *sb = mnt->mnt_sb; + dput(mnt->mnt_root); ++ spin_lock(&dcache_lock); ++ list_del(&mnt->mnt_lustre_list); ++ spin_unlock(&dcache_lock); + free_vfsmnt(mnt); + deactivate_super(sb); + } +@@ -402,6 +407,8 @@ + */ + + lock_kernel(); ++ if (sb->s_op->umount_lustre) ++ sb->s_op->umount_lustre(sb); + if( (flags&MNT_FORCE) && sb->s_op->umount_begin) + sb->s_op->umount_begin(sb); + unlock_kernel(); +@@ -627,6 +634,7 @@ + return err; + if (!old_name || !*old_name) + return -EINVAL; ++ intent_init(&old_nd.intent, IT_LOOKUP); + err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); + if (err) + return err; +@@ -701,6 +709,7 @@ + return -EPERM; + if (!old_name || !*old_name) + return -EINVAL; ++ intent_init(&old_nd.intent, IT_LOOKUP); + err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); + if (err) + return err; +@@ -1012,6 +1021,7 @@ + int retval = 0; + int mnt_flags = 0; + ++ intent_init(&nd.intent, IT_LOOKUP); + /* Discard magic */ + if ((flags & MS_MGC_MSK) == MS_MGC_VAL) + flags &= ~MS_MGC_MSK; +Index: linux-2.6.12.5/fs/open.c +=================================================================== +--- linux-2.6.12.5.orig/fs/open.c 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/fs/open.c 2005-08-17 17:51:44.000000000 +0200 +@@ -215,12 +215,12 @@ + struct nameidata nd; + struct inode * inode; + int error; +- ++ intent_init(&nd.intent, IT_GETATTR); + error = -EINVAL; + if (length < 0) /* sorry, but loff_t says... 
*/ + goto out; + +- error = user_path_walk(path, &nd); ++ error = user_path_walk_it(path, &nd); + if (error) + goto out; + inode = nd.dentry->d_inode; +@@ -474,6 +474,7 @@ + int old_fsuid, old_fsgid; + kernel_cap_t old_cap; + int res; ++ intent_init(&nd.intent, IT_GETATTR); + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ + return -EINVAL; +@@ -498,13 +499,14 @@ + else + current->cap_effective = current->cap_permitted; + +- res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); ++ res = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); + if (!res) { + res = permission(nd.dentry->d_inode, mode, &nd); + /* SuS v2 requires we report a read only fs too */ + if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) + && !special_file(nd.dentry->d_inode->i_mode)) + res = -EROFS; ++ + path_release(&nd); + } + +@@ -519,8 +521,9 @@ + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_GETATTR); + +- error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); ++ error = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); + if (error) + goto out; + +@@ -570,8 +573,9 @@ + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_GETATTR); + +- error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); ++ error = __user_walk_it(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + if (error) + goto out; + +@@ -750,27 +754,8 @@ + * for the internal routines (ie open_namei()/follow_link() etc). 00 is + * used by symlinks. 
+ */ +-struct file *filp_open(const char * filename, int flags, int mode) +-{ +- int namei_flags, error; +- struct nameidata nd; +- +- namei_flags = flags; +- if ((namei_flags+1) & O_ACCMODE) +- namei_flags++; +- if (namei_flags & O_TRUNC) +- namei_flags |= 2; +- +- error = open_namei(filename, namei_flags, mode, &nd); +- if (!error) +- return dentry_open(nd.dentry, nd.mnt, flags); +- +- return ERR_PTR(error); +-} +- +-EXPORT_SYMBOL(filp_open); +- +-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, int flags, ++ struct lookup_intent *it) + { + struct file * f; + struct inode *inode; +@@ -782,6 +767,7 @@ + goto cleanup_dentry; + f->f_flags = flags; + f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; ++ f->f_it = it; + inode = dentry->d_inode; + if (f->f_mode & FMODE_WRITE) { + error = get_write_access(inode); +@@ -800,6 +786,7 @@ + error = f->f_op->open(inode,f); + if (error) + goto cleanup_all; ++ intent_release(it); + } + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + +@@ -825,6 +812,7 @@ + cleanup_file: + put_filp(f); + cleanup_dentry: ++ intent_release(it); + dput(dentry); + mntput(mnt); + return ERR_PTR(error); +@@ -832,6 +820,36 @@ + + EXPORT_SYMBOL(dentry_open); + ++struct file *filp_open(const char * filename, int flags, int mode) ++{ ++ int namei_flags, error; ++ struct file * temp_filp; ++ struct nameidata nd; ++ intent_init(&nd.intent, IT_OPEN); ++ ++ namei_flags = flags; ++ if ((namei_flags+1) & O_ACCMODE) ++ namei_flags++; ++ if (namei_flags & O_TRUNC) ++ namei_flags |= 2; ++ ++ error = open_namei(filename, namei_flags, mode, &nd); ++ if (!error) { ++ temp_filp = dentry_open_it(nd.dentry, nd.mnt, flags, &nd.intent); ++ return temp_filp; ++ } ++ return ERR_PTR(error); ++} ++ ++ ++struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++{ ++ struct lookup_intent it; ++ intent_init(&it, 
IT_LOOKUP); ++ ++ return dentry_open_it(dentry, mnt, flags, &it); ++} ++ + /* + * Find an empty file descriptor entry, and mark it busy. + */ +Index: linux-2.6.12.5/fs/stat.c +=================================================================== +--- linux-2.6.12.5.orig/fs/stat.c 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/fs/stat.c 2005-08-17 17:51:44.000000000 +0200 +@@ -38,7 +38,7 @@ + + EXPORT_SYMBOL(generic_fillattr); + +-int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) ++int vfs_getattr_it(struct vfsmount *mnt, struct dentry *dentry, struct lookup_intent *it, struct kstat *stat) + { + struct inode *inode = dentry->d_inode; + int retval; +@@ -47,6 +47,8 @@ + if (retval) + return retval; + ++ if (inode->i_op->getattr_it) ++ return inode->i_op->getattr_it(mnt, dentry, it, stat); + if (inode->i_op->getattr) + return inode->i_op->getattr(mnt, dentry, stat); + +@@ -63,14 +65,20 @@ + + EXPORT_SYMBOL(vfs_getattr); + ++int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) ++{ ++ return vfs_getattr_it(mnt, dentry, NULL, stat); ++} ++ + int vfs_stat(char __user *name, struct kstat *stat) + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_GETATTR); + +- error = user_path_walk(name, &nd); ++ error = user_path_walk_it(name, &nd); + if (!error) { +- error = vfs_getattr(nd.mnt, nd.dentry, stat); ++ error = vfs_getattr_it(nd.mnt, nd.dentry, &nd.intent, stat); + path_release(&nd); + } + return error; +@@ -82,10 +90,11 @@ + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_GETATTR); + +- error = user_path_walk_link(name, &nd); ++ error = user_path_walk_link_it(name, &nd); + if (!error) { +- error = vfs_getattr(nd.mnt, nd.dentry, stat); ++ error = vfs_getattr_it(nd.mnt, nd.dentry, &nd.intent, stat); + path_release(&nd); + } + return error; +@@ -97,9 +106,12 @@ + { + struct file *f = fget(fd); + int error = -EBADF; ++ struct nameidata nd; ++ intent_init(&nd.intent, 
IT_GETATTR); + + if (f) { +- error = vfs_getattr(f->f_vfsmnt, f->f_dentry, stat); ++ error = vfs_getattr_it(f->f_vfsmnt, f->f_dentry, &nd.intent, stat); ++ intent_release(&nd.intent); + fput(f); + } + return error; +Index: linux-2.6.12.5/fs/nfs/dir.c +=================================================================== +--- linux-2.6.12.5.orig/fs/nfs/dir.c 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/fs/nfs/dir.c 2005-08-17 17:51:44.000000000 +0200 +@@ -727,7 +727,7 @@ + return 0; + if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0) + return 0; +- return (nd->intent.open.flags & O_EXCL) != 0; ++ return (nd->intent.it_flags & O_EXCL) != 0; + } + + static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +@@ -1028,7 +1028,7 @@ + attr.ia_valid = ATTR_MODE; + + if (nd && (nd->flags & LOOKUP_CREATE)) +- open_flags = nd->intent.open.flags; ++ open_flags = nd->intent.it_flags; + + lock_kernel(); + nfs_begin_data_update(dir); +Index: linux-2.6.12.5/fs/inode.c +=================================================================== +--- linux-2.6.12.5.orig/fs/inode.c 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/fs/inode.c 2005-08-17 17:51:44.000000000 +0200 +@@ -230,6 +230,7 @@ + inodes_stat.nr_unused--; + } + ++EXPORT_SYMBOL(__iget); + /** + * clear_inode - clear an inode + * @inode: inode to clear +Index: linux-2.6.12.5/include/linux/dcache.h +=================================================================== +--- linux-2.6.12.5.orig/include/linux/dcache.h 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/include/linux/dcache.h 2005-08-17 17:51:44.000000000 +0200 +@@ -4,6 +4,7 @@ + #ifdef __KERNEL__ + + #include ++#include + #include + #include + #include +@@ -37,6 +38,8 @@ + const unsigned char *name; + }; + ++#include ++ + struct dentry_stat_t { + int nr_dentry; + int nr_unused; +Index: linux-2.6.12.5/include/linux/fs.h +=================================================================== 
+--- linux-2.6.12.5.orig/include/linux/fs.h 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/include/linux/fs.h 2005-08-17 17:51:44.000000000 +0200 +@@ -58,6 +58,7 @@ + + #define FMODE_READ 1 + #define FMODE_WRITE 2 ++#define FMODE_EXEC 4 + + /* Internal kernel extensions */ + #define FMODE_LSEEK 4 +@@ -260,6 +261,8 @@ + #define ATTR_ATTR_FLAG 1024 + #define ATTR_KILL_SUID 2048 + #define ATTR_KILL_SGID 4096 ++#define ATTR_RAW 8192 /* file system, not vfs will massage attrs */ ++#define ATTR_FROM_OPEN 16384 /* called from open path, ie O_TRUNC */ + + /* + * This is the Inode Attributes structure, used for notify_change(). It +@@ -463,6 +466,7 @@ + struct block_device *i_bdev; + struct cdev *i_cdev; + int i_cindex; ++ void *i_filterdata; + + __u32 i_generation; + +@@ -600,6 +604,7 @@ + spinlock_t f_ep_lock; + #endif /* #ifdef CONFIG_EPOLL */ + struct address_space *f_mapping; ++ struct lookup_intent *f_it; + }; + extern spinlock_t files_lock; + #define file_list_lock() spin_lock(&files_lock); +@@ -968,7 +973,9 @@ + void (*truncate) (struct inode *); + int (*permission) (struct inode *, int, struct nameidata *); + int (*setattr) (struct dentry *, struct iattr *); ++ int (*setattr_raw) (struct inode *, struct iattr *); + int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); ++ int (*getattr_it) (struct vfsmount *, struct dentry *, struct lookup_intent *, struct kstat *); + int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); + ssize_t (*listxattr) (struct dentry *, char *, size_t); +@@ -1008,6 +1015,7 @@ + int (*remount_fs) (struct super_block *, int *, char *); + void (*clear_inode) (struct inode *); + void (*umount_begin) (struct super_block *); ++ void (*umount_lustre) (struct super_block *); + + int (*show_options)(struct seq_file *, struct vfsmount *); + +@@ -1210,6 +1218,7 @@ + extern struct vfsmount *kern_mount(struct file_system_type *); + extern 
int may_umount_tree(struct vfsmount *); + extern int may_umount(struct vfsmount *); ++struct vfsmount *do_kern_mount(const char *type, int flags, const char *name, void *data); + extern long do_mount(char *, char *, char *, unsigned long, void *); + + extern int vfs_statfs(struct super_block *, struct kstatfs *); +@@ -1262,6 +1271,7 @@ + extern int do_truncate(struct dentry *, loff_t start); + extern struct file *filp_open(const char *, int, int); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); ++extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *); + extern int filp_close(struct file *, fl_owner_t id); + extern char * getname(const char __user *); + +Index: linux-2.6.12.5/include/linux/namei.h +=================================================================== +--- linux-2.6.12.5.orig/include/linux/namei.h 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/include/linux/namei.h 2005-08-17 17:51:44.000000000 +0200 +@@ -2,14 +2,48 @@ + #define _LINUX_NAMEI_H + + #include ++#include + + struct vfsmount; ++struct nameidata; + +-struct open_intent { +- int flags; +- int create_mode; ++/* intent opcodes */ ++#define IT_OPEN (1) ++#define IT_CREAT (1<<1) ++#define IT_READDIR (1<<2) ++#define IT_GETATTR (1<<3) ++#define IT_LOOKUP (1<<4) ++#define IT_UNLINK (1<<5) ++#define IT_TRUNC (1<<6) ++#define IT_GETXATTR (1<<7) ++ ++struct lustre_intent_data { ++ int it_disposition; ++ int it_status; ++ __u64 it_lock_handle; ++ void *it_data; ++ int it_lock_mode; + }; + ++#define INTENT_MAGIC 0x19620323 ++struct lookup_intent { ++ int it_magic; ++ void (*it_op_release)(struct lookup_intent *); ++ int it_op; ++ int it_flags; ++ int it_create_mode; ++ union { ++ struct lustre_intent_data lustre; ++ } d; ++}; ++ ++static inline void intent_init(struct lookup_intent *it, int op) ++{ ++ memset(it, 0, sizeof(*it)); ++ it->it_magic = INTENT_MAGIC; ++ it->it_op = op; ++} ++ + enum { MAX_NESTED_LINKS = 5 }; + 
+ struct nameidata { +@@ -21,10 +55,7 @@ + unsigned depth; + char *saved_names[MAX_NESTED_LINKS + 1]; + +- /* Intent data */ +- union { +- struct open_intent open; +- } intent; ++ struct lookup_intent intent; + }; + + /* +@@ -47,6 +78,8 @@ + #define LOOKUP_PARENT 16 + #define LOOKUP_NOALT 32 + #define LOOKUP_REVAL 64 ++#define LOOKUP_LAST (0x1000) ++#define LOOKUP_LINK_NOTLAST (0x2000) + /* + * Intent data + */ +@@ -55,6 +88,12 @@ + #define LOOKUP_ACCESS (0x0400) + + extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *)); ++extern int FASTCALL(__user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd)); ++#define user_path_walk_it(name,nd) \ ++ __user_walk_it(name, LOOKUP_FOLLOW, nd) ++#define user_path_walk_link_it(name,nd) \ ++ __user_walk_it(name, 0, nd) ++extern void intent_release(struct lookup_intent *); + #define user_path_walk(name,nd) \ + __user_walk(name, LOOKUP_FOLLOW, nd) + #define user_path_walk_link(name,nd) \ +@@ -67,7 +106,6 @@ + + extern struct dentry * lookup_one_len(const char *, struct dentry *, int); + extern struct dentry * lookup_hash(struct qstr *, struct dentry *); +- + extern int follow_down(struct vfsmount **, struct dentry **); + extern int follow_up(struct vfsmount **, struct dentry **); + +Index: linux-2.6.12.5/include/linux/mount.h +=================================================================== +--- linux-2.6.12.5.orig/include/linux/mount.h 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/include/linux/mount.h 2005-08-17 17:51:44.000000000 +0200 +@@ -36,6 +36,8 @@ + struct list_head mnt_list; + struct list_head mnt_fslink; /* link in fs-specific expiry list */ + struct namespace *mnt_namespace; /* containing namespace */ ++ struct list_head mnt_lustre_list; /* GNS mount list */ ++ unsigned long mnt_last_used; /* for GNS auto-umount (jiffies) */ + }; + + static inline struct vfsmount *mntget(struct vfsmount *mnt) diff --git 
a/lustre/kernel_patches/patches/vfs_nointent-2.6-rhel4.patch b/lustre/kernel_patches/patches/vfs_nointent-2.6-rhel4.patch index 7f95eb3..173689a 100644 --- a/lustre/kernel_patches/patches/vfs_nointent-2.6-rhel4.patch +++ b/lustre/kernel_patches/patches/vfs_nointent-2.6-rhel4.patch @@ -47,18 +47,18 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c if (!IS_ERR(tmp)) { struct dentry *dentry; struct nameidata nd; -+ intent_init(&nd.intent, IT_LOOKUP); ++ intent_init(&nd.intent, IT_LOOKUP); error = path_lookup(tmp, LOOKUP_PARENT, &nd); if (error) goto out; -+ if (nd.dentry->d_inode->i_op->mkdir_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->mkdir_raw(&nd, mode); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } ++ if (nd.dentry->d_inode->i_op->mkdir_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mkdir_raw(&nd, mode); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } dentry = lookup_create(&nd, 1); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { @@ -74,7 +74,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c char * name; struct dentry *dentry; struct nameidata nd; -+ intent_init(&nd.intent, IT_LOOKUP); ++ intent_init(&nd.intent, IT_LOOKUP); name = getname(pathname); if(IS_ERR(name)) @@ -82,16 +82,16 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c error = -EBUSY; goto exit1; } -+ -+ if (nd.dentry->d_inode->i_op->rmdir_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ -+ error = op->rmdir_raw(&nd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit1; -+ } -+ ++ ++ if (nd.dentry->d_inode->i_op->rmdir_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ error = op->rmdir_raw(&nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } ++ down(&nd.dentry->d_inode->i_sem); dentry = 
lookup_hash(&nd.last, nd.dentry); error = PTR_ERR(dentry); @@ -99,7 +99,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c struct dentry *dentry; struct nameidata nd; struct inode *inode = NULL; -+ intent_init(&nd.intent, IT_LOOKUP); ++ intent_init(&nd.intent, IT_LOOKUP); name = getname(pathname); if(IS_ERR(name)) @@ -107,13 +107,13 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c error = -EISDIR; if (nd.last_type != LAST_NORM) goto exit1; -+ if (nd.dentry->d_inode->i_op->unlink_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->unlink_raw(&nd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit1; -+ } ++ if (nd.dentry->d_inode->i_op->unlink_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->unlink_raw(&nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } down(&nd.dentry->d_inode->i_sem); dentry = lookup_hash(&nd.last, nd.dentry); error = PTR_ERR(dentry); @@ -121,7 +121,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c if (!IS_ERR(to)) { struct dentry *dentry; struct nameidata nd; -+ intent_init(&nd.intent, IT_LOOKUP); ++ intent_init(&nd.intent, IT_LOOKUP); error = path_lookup(to, LOOKUP_PARENT, &nd); if (error) @@ -148,8 +148,8 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c struct nameidata nd, old_nd; int error; char * to; -+ intent_init(&nd.intent, IT_LOOKUP); -+ intent_init(&old_nd.intent, IT_LOOKUP); ++ intent_init(&nd.intent, IT_LOOKUP); ++ intent_init(&old_nd.intent, IT_LOOKUP); to = getname(newname); if (IS_ERR(to)) @@ -157,40 +157,22 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c error = -EXDEV; if (old_nd.mnt != nd.mnt) goto out_release; -+ if (nd.dentry->d_inode->i_op->link_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->link_raw(&old_nd, &nd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out_release; -+ } ++ if 
(nd.dentry->d_inode->i_op->link_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->link_raw(&old_nd, &nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out_release; ++ } new_dentry = lookup_create(&nd, 0); error = PTR_ERR(new_dentry); if (!IS_ERR(new_dentry)) { -@@ -2101,7 +2158,7 @@ - * locking]. - */ - int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry) - { - int error = 0; - struct inode *target; -@@ -2146,7 +2203,7 @@ - } - - int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry) - { - struct inode *target; - int error; @@ -2223,6 +2280,8 @@ struct dentry * old_dentry, *new_dentry; struct dentry * trap; struct nameidata oldnd, newnd; -+ intent_init(&oldnd.intent, IT_LOOKUP); -+ intent_init(&newnd.intent, IT_LOOKUP); ++ intent_init(&oldnd.intent, IT_LOOKUP); ++ intent_init(&newnd.intent, IT_LOOKUP); error = path_lookup(oldname, LOOKUP_PARENT, &oldnd); if (error) @@ -198,12 +180,12 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c if (newnd.last_type != LAST_NORM) goto exit2; -+ if (old_dir->d_inode->i_op->rename_raw) { -+ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit2; -+ } ++ if (old_dir->d_inode->i_op->rename_raw) { ++ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit2; ++ } + trap = lock_rename(new_dir, old_dir); @@ -284,10 +266,10 @@ Index: linux-2.6.9-5.0.3.EL/fs/open.c + if (error != -EOPNOTSUPP) + goto dput_and_out; + } else { -+ down(&inode->i_sem); -+ error = notify_change(nd.dentry, &newattrs); -+ up(&inode->i_sem); -+ } 
++ down(&inode->i_sem); ++ error = notify_change(nd.dentry, &newattrs); ++ up(&inode->i_sem); ++ } dput_and_out: path_release(&nd); out: @@ -307,10 +289,10 @@ Index: linux-2.6.9-5.0.3.EL/fs/open.c + if (error != -EOPNOTSUPP) + goto dput_and_out; + } else { -+ down(&inode->i_sem); -+ error = notify_change(nd.dentry, &newattrs); -+ up(&inode->i_sem); -+ } ++ down(&inode->i_sem); ++ error = notify_change(nd.dentry, &newattrs); ++ up(&inode->i_sem); ++ } dput_and_out: path_release(&nd); out: diff --git a/lustre/kernel_patches/patches/vfs_nointent-2.6.12.patch b/lustre/kernel_patches/patches/vfs_nointent-2.6.12.patch new file mode 100644 index 0000000..ce239c9 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_nointent-2.6.12.patch @@ -0,0 +1,490 @@ +Index: linux-2.6.12.2/fs/namei.c +=================================================================== +--- linux-2.6.12.2.orig/fs/namei.c 2005-07-23 12:25:12.241868120 +0200 ++++ linux-2.6.12.2/fs/namei.c 2005-07-23 12:25:14.440533872 +0200 +@@ -1466,7 +1466,7 @@ + if (!error) { + DQUOT_INIT(inode); + +- error = do_truncate(dentry, 0); ++ error = do_truncate(dentry, 0, 1); + } + put_write_access(inode); + if (error) +@@ -1719,6 +1719,7 @@ + char * tmp; + struct dentry * dentry; + struct nameidata nd; ++ intent_init(&nd.intent, IT_LOOKUP); + + if (S_ISDIR(mode)) + return -EPERM; +@@ -1729,6 +1730,15 @@ + error = path_lookup(tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ ++ if (nd.dentry->d_inode->i_op->mknod_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mknod_raw(&nd, mode, dev); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ + dentry = lookup_create(&nd, 0); + error = PTR_ERR(dentry); + +@@ -1755,6 +1765,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1796,10 +1807,18 @@ + if (!IS_ERR(tmp)) { + struct dentry *dentry; + struct nameidata nd; ++ 
intent_init(&nd.intent, IT_LOOKUP); + + error = path_lookup(tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.dentry->d_inode->i_op->mkdir_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mkdir_raw(&nd, mode); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } + dentry = lookup_create(&nd, 1); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -1809,6 +1828,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1885,6 +1905,7 @@ + char * name; + struct dentry *dentry; + struct nameidata nd; ++ intent_init(&nd.intent, IT_LOOKUP); + + name = getname(pathname); + if(IS_ERR(name)) +@@ -1905,6 +1926,16 @@ + error = -EBUSY; + goto exit1; + } ++ ++ if (nd.dentry->d_inode->i_op->rmdir_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ error = op->rmdir_raw(&nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } ++ + down(&nd.dentry->d_inode->i_sem); + dentry = lookup_hash(&nd.last, nd.dentry); + error = PTR_ERR(dentry); +@@ -1963,6 +1994,7 @@ + struct dentry *dentry; + struct nameidata nd; + struct inode *inode = NULL; ++ intent_init(&nd.intent, IT_LOOKUP); + + name = getname(pathname); + if(IS_ERR(name)) +@@ -1974,6 +2006,13 @@ + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; ++ if (nd.dentry->d_inode->i_op->unlink_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->unlink_raw(&nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } + down(&nd.dentry->d_inode->i_sem); + dentry = lookup_hash(&nd.last, nd.dentry); + error = PTR_ERR(dentry); +@@ -2040,10 +2079,18 @@ + if (!IS_ERR(to)) { + struct dentry *dentry; + struct nameidata nd; ++ intent_init(&nd.intent, IT_LOOKUP); + + error = path_lookup(to, LOOKUP_PARENT, &nd); + if (error) + 
goto out; ++ if (nd.dentry->d_inode->i_op->symlink_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->symlink_raw(&nd, from); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } + dentry = lookup_create(&nd, 0); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -2051,6 +2098,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(to); +@@ -2114,6 +2162,8 @@ + struct nameidata nd, old_nd; + int error; + char * to; ++ intent_init(&nd.intent, IT_LOOKUP); ++ intent_init(&old_nd.intent, IT_LOOKUP); + + to = getname(newname); + if (IS_ERR(to)) +@@ -2128,6 +2178,13 @@ + error = -EXDEV; + if (old_nd.mnt != nd.mnt) + goto out_release; ++ if (nd.dentry->d_inode->i_op->link_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->link_raw(&old_nd, &nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out_release; ++ } + new_dentry = lookup_create(&nd, 0); + error = PTR_ERR(new_dentry); + if (!IS_ERR(new_dentry)) { +@@ -2300,6 +2357,8 @@ + struct dentry * old_dentry, *new_dentry; + struct dentry * trap; + struct nameidata oldnd, newnd; ++ intent_init(&oldnd.intent, IT_LOOKUP); ++ intent_init(&newnd.intent, IT_LOOKUP); + + error = path_lookup(oldname, LOOKUP_PARENT, &oldnd); + if (error) +@@ -2322,6 +2381,13 @@ + if (newnd.last_type != LAST_NORM) + goto exit2; + ++ if (old_dir->d_inode->i_op->rename_raw) { ++ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit2; ++ } ++ + trap = lock_rename(new_dir, old_dir); + + old_dentry = lookup_hash(&oldnd.last, old_dir); +@@ -2353,8 +2419,7 @@ + if (new_dentry == trap) + goto exit5; + +- error = vfs_rename(old_dir->d_inode, old_dentry, +- new_dir->d_inode, new_dentry); ++ error = vfs_rename(old_dir->d_inode, old_dentry, 
new_dir->d_inode, new_dentry); + exit5: + dput(new_dentry); + exit4: +Index: linux-2.6.12.2/fs/open.c +=================================================================== +--- linux-2.6.12.2.orig/fs/open.c 2005-07-23 12:25:12.248867056 +0200 ++++ linux-2.6.12.2/fs/open.c 2005-07-23 12:28:13.221355056 +0200 +@@ -192,9 +192,10 @@ + return error; + } + +-int do_truncate(struct dentry *dentry, loff_t length) ++int do_truncate(struct dentry *dentry, loff_t length, int called_from_open) + { + int err; ++ struct inode_operations *op = dentry->d_inode->i_op; + struct iattr newattrs; + + /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ +@@ -205,7 +206,16 @@ + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + + down(&dentry->d_inode->i_sem); +- err = notify_change(dentry, &newattrs); ++ if (called_from_open) ++ newattrs.ia_valid |= ATTR_FROM_OPEN; ++ if (op->setattr_raw) { ++ newattrs.ia_valid |= ATTR_RAW; ++ newattrs.ia_ctime = CURRENT_TIME; ++ down_write(&dentry->d_inode->i_alloc_sem); ++ err = op->setattr_raw(dentry->d_inode, &newattrs); ++ up_write(&dentry->d_inode->i_alloc_sem); ++ } else ++ err = notify_change(dentry, &newattrs); + up(&dentry->d_inode->i_sem); + return err; + } +@@ -260,7 +270,7 @@ + error = locks_verify_truncate(inode, NULL, length); + if (!error) { + DQUOT_INIT(inode); +- error = do_truncate(nd.dentry, length); ++ error = do_truncate(nd.dentry, length, 0); + } + put_write_access(inode); + +@@ -312,7 +322,7 @@ + + error = locks_verify_truncate(inode, file, length); + if (!error) +- error = do_truncate(dentry, length); ++ error = do_truncate(dentry, length, 0); + out_putf: + fput(file); + out: +@@ -391,9 +401,19 @@ + (error = permission(inode,MAY_WRITE,&nd)) != 0) + goto dput_and_out; + } +- down(&inode->i_sem); +- error = notify_change(nd.dentry, &newattrs); +- up(&inode->i_sem); ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_valid |= ATTR_RAW; ++ error = 
op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } else { ++ down(&inode->i_sem); ++ error = notify_change(nd.dentry, &newattrs); ++ up(&inode->i_sem); ++ } + dput_and_out: + path_release(&nd); + out: +@@ -444,9 +464,19 @@ + (error = permission(inode,MAY_WRITE,&nd)) != 0) + goto dput_and_out; + } +- down(&inode->i_sem); +- error = notify_change(nd.dentry, &newattrs); +- up(&inode->i_sem); ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } else { ++ down(&inode->i_sem); ++ error = notify_change(nd.dentry, &newattrs); ++ up(&inode->i_sem); ++ } + dput_and_out: + path_release(&nd); + out: +@@ -596,36 +626,52 @@ + return error; + } + +-asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) ++int chmod_common(struct dentry *dentry, mode_t mode) + { +- struct inode * inode; +- struct dentry * dentry; +- struct file * file; +- int err = -EBADF; ++ struct inode * inode = dentry->d_inode; + struct iattr newattrs; ++ int error = -EROFS; + +- file = fget(fd); +- if (!file) ++ if (IS_RDONLY(inode)) + goto out; ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = dentry->d_inode->i_op; ++ ++ newattrs.ia_mode = mode; ++ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out; ++ } + +- dentry = file->f_dentry; +- inode = dentry->d_inode; +- +- err = -EROFS; +- if (IS_RDONLY(inode)) +- goto out_putf; +- err = -EPERM; ++ error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) +- goto out_putf; ++ goto out; ++ + down(&inode->i_sem); + if (mode == (mode_t) -1) 
+ mode = inode->i_mode; + newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; +- err = notify_change(dentry, &newattrs); ++ error = notify_change(dentry, &newattrs); + up(&inode->i_sem); ++out: ++ return error; ++} + +-out_putf: ++asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) ++{ ++ struct file * file; ++ int err = -EBADF; ++ ++ file = fget(fd); ++ if (!file) ++ goto out; ++ ++ err = chmod_common(file->f_dentry, mode); + fput(file); + out: + return err; +@@ -634,32 +680,13 @@ + asmlinkage long sys_chmod(const char __user * filename, mode_t mode) + { + struct nameidata nd; +- struct inode * inode; + int error; +- struct iattr newattrs; + + error = user_path_walk(filename, &nd); + if (error) + goto out; +- inode = nd.dentry->d_inode; +- +- error = -EROFS; +- if (IS_RDONLY(inode)) +- goto dput_and_out; +- +- error = -EPERM; +- if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) +- goto dput_and_out; +- +- down(&inode->i_sem); +- if (mode == (mode_t) -1) +- mode = inode->i_mode; +- newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); +- newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; +- error = notify_change(nd.dentry, &newattrs); +- up(&inode->i_sem); + +-dput_and_out: ++ error = chmod_common(nd.dentry, mode); + path_release(&nd); + out: + return error; +@@ -680,6 +707,18 @@ + if (IS_RDONLY(inode)) + goto out; + error = -EPERM; ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = dentry->d_inode->i_op; ++ ++ newattrs.ia_uid = user; ++ newattrs.ia_gid = group; ++ newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_CTIME; ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ return error; ++ } + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out; + newattrs.ia_valid = ATTR_CTIME; +@@ -693,6 +732,7 @@ + } + if (!S_ISDIR(inode->i_mode)) + newattrs.ia_valid |= 
ATTR_KILL_SUID|ATTR_KILL_SGID; ++ + down(&inode->i_sem); + error = notify_change(dentry, &newattrs); + up(&inode->i_sem); +Index: linux-2.6.12.2/fs/exec.c +=================================================================== +--- linux-2.6.12.2.orig/fs/exec.c 2005-07-23 12:25:12.229869944 +0200 ++++ linux-2.6.12.2/fs/exec.c 2005-07-23 12:25:14.442533568 +0200 +@@ -1488,7 +1488,7 @@ + goto close_fail; + if (!file->f_op->write) + goto close_fail; +- if (do_truncate(file->f_dentry, 0) != 0) ++ if (do_truncate(file->f_dentry, 0, 0) != 0) + goto close_fail; + + retval = binfmt->core_dump(signr, regs, file); +Index: linux-2.6.12.2/include/linux/fs.h +=================================================================== +--- linux-2.6.12.2.orig/include/linux/fs.h 2005-07-23 12:25:12.279862344 +0200 ++++ linux-2.6.12.2/include/linux/fs.h 2005-07-23 12:25:14.443533416 +0200 +@@ -960,13 +960,20 @@ + int (*create) (struct inode *,struct dentry *,int, struct nameidata *); + struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); + int (*link) (struct dentry *,struct inode *,struct dentry *); ++ int (*link_raw) (struct nameidata *,struct nameidata *); + int (*unlink) (struct inode *,struct dentry *); ++ int (*unlink_raw) (struct nameidata *); + int (*symlink) (struct inode *,struct dentry *,const char *); ++ int (*symlink_raw) (struct nameidata *,const char *); + int (*mkdir) (struct inode *,struct dentry *,int); ++ int (*mkdir_raw) (struct nameidata *,int); + int (*rmdir) (struct inode *,struct dentry *); ++ int (*rmdir_raw) (struct nameidata *); + int (*mknod) (struct inode *,struct dentry *,int,dev_t); ++ int (*mknod_raw) (struct nameidata *,int,dev_t); + int (*rename) (struct inode *, struct dentry *, + struct inode *, struct dentry *); ++ int (*rename_raw) (struct nameidata *, struct nameidata *); + int (*readlink) (struct dentry *, char __user *,int); + int (*follow_link) (struct dentry *, struct nameidata *); + void (*put_link) (struct dentry *, 
struct nameidata *); +@@ -1268,7 +1275,7 @@ + + /* fs/open.c */ + +-extern int do_truncate(struct dentry *, loff_t start); ++extern int do_truncate(struct dentry *, loff_t start, int called_from_open); + extern struct file *filp_open(const char *, int, int); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); + extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *); +Index: linux-2.6.12.2/net/unix/af_unix.c +=================================================================== +--- linux-2.6.12.2.orig/net/unix/af_unix.c 2005-06-30 01:00:53.000000000 +0200 ++++ linux-2.6.12.2/net/unix/af_unix.c 2005-07-23 12:25:14.445533112 +0200 +@@ -673,6 +673,7 @@ + int err = 0; + + if (sunname->sun_path[0]) { ++ intent_init(&nd.intent, IT_LOOKUP); + err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd); + if (err) + goto fail; diff --git a/lustre/kernel_patches/patches/vfs_races-2.6.12.patch b/lustre/kernel_patches/patches/vfs_races-2.6.12.patch new file mode 100644 index 0000000..011d87d --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_races-2.6.12.patch @@ -0,0 +1,61 @@ +Index: linux-2.6.7-vanilla/fs/dcache.c +=================================================================== +--- linux-2.6.7-vanilla.orig/fs/dcache.c 2004-07-01 12:09:19.000000000 +0300 ++++ linux-2.6.7-vanilla/fs/dcache.c 2004-07-01 12:29:12.510193264 +0300 +@@ -219,6 +219,13 @@ + spin_unlock(&dcache_lock); + return 0; + } ++ ++ /* network invalidation by Lustre */ ++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { ++ spin_unlock(&dcache_lock); ++ return 0; ++ } ++ + /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. +@@ -1199,16 +1199,25 @@ + * Adds a dentry to the hash according to its name. 
+ */ + +-void d_rehash(struct dentry * entry) ++void d_rehash_cond(struct dentry * entry, int lock) + { + struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash); + +- spin_lock(&dcache_lock); ++ if (lock) ++ spin_lock(&dcache_lock); + spin_lock(&entry->d_lock); + __d_rehash(entry, list); + spin_unlock(&entry->d_lock); +- spin_unlock(&dcache_lock); ++ if (lock) ++ spin_unlock(&dcache_lock); + } + ++EXPORT_SYMBOL(d_rehash_cond); ++ ++void d_rehash(struct dentry * entry) ++{ ++ d_rehash_cond(entry, 1); ++ } ++ + #define do_switch(x,y) do { \ + __typeof__ (x) __tmp = x; \ + x = y; y = __tmp; } while (0) +Index: linux-2.6.7-vanilla/include/linux/dcache.h +=================================================================== +--- linux-2.6.7-vanilla.orig/include/linux/dcache.h 2004-07-01 12:24:53.602553208 +0300 ++++ linux-2.6.7-vanilla/include/linux/dcache.h 2004-07-01 12:27:29.757814000 +0300 +@@ -159,6 +159,8 @@ + + #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. 
*/ + #define DCACHE_UNHASHED 0x0010 ++#define DCACHE_LUSTRE_INVALID 0x0020 /* Lustre invalidated */ ++ + + extern spinlock_t dcache_lock; + diff --git a/lustre/kernel_patches/patches/vm-tunables-rhel4.patch b/lustre/kernel_patches/patches/vm-tunables-rhel4.patch new file mode 100644 index 0000000..7e4a9d6 --- /dev/null +++ b/lustre/kernel_patches/patches/vm-tunables-rhel4.patch @@ -0,0 +1,73 @@ +Index: linux+rhel4+chaos/mm/page_alloc.c +=================================================================== +--- linux+rhel4+chaos.orig/mm/page_alloc.c ++++ linux+rhel4+chaos/mm/page_alloc.c +@@ -1972,8 +1972,12 @@ static void setup_per_zone_pages_min(voi + lowmem_pages; + } + +- zone->pages_low = zone->pages_min * 2; +- zone->pages_high = zone->pages_min * 3; ++ /* ++ * When interpreting these watermarks, just keep in mind that: ++ * zone->pages_min == (zone->pages_min * 4) / 4; ++ */ ++ zone->pages_low = (zone->pages_min * 5) / 4; ++ zone->pages_high = (zone->pages_min * 6) / 4; + spin_unlock_irqrestore(&zone->lru_lock, flags); + } + } +@@ -1982,24 +1986,25 @@ static void setup_per_zone_pages_min(voi + * Initialise min_free_kbytes. + * + * For small machines we want it small (128k min). For large machines +- * we want it large (16MB max). But it is not linear, because network ++ * we want it large (64MB max). But it is not linear, because network + * bandwidth does not increase linearly with machine size. 
We use + * +- * min_free_kbytes = sqrt(lowmem_kbytes) ++ * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy ++ * min_free_kbytes = sqrt(lowmem_kbytes * 16) + * + * which yields + * +- * 16MB: 128k +- * 32MB: 181k +- * 64MB: 256k +- * 128MB: 362k +- * 256MB: 512k +- * 512MB: 724k +- * 1024MB: 1024k +- * 2048MB: 1448k +- * 4096MB: 2048k +- * 8192MB: 2896k +- * 16384MB: 4096k ++ * 16MB: 512k ++ * 32MB: 724k ++ * 64MB: 1024k ++ * 128MB: 1448k ++ * 256MB: 2048k ++ * 512MB: 2896k ++ * 1024MB: 4096k ++ * 2048MB: 5792k ++ * 4096MB: 8192k ++ * 8192MB: 11584k ++ * 16384MB: 16384k + */ + static int __init init_per_zone_pages_min(void) + { +@@ -2007,11 +2012,11 @@ static int __init init_per_zone_pages_mi + + lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); + +- min_free_kbytes = int_sqrt(lowmem_kbytes); ++ min_free_kbytes = int_sqrt(lowmem_kbytes * 16); + if (min_free_kbytes < 128) + min_free_kbytes = 128; +- if (min_free_kbytes > 16384) +- min_free_kbytes = 16384; ++ if (min_free_kbytes > 65536) ++ min_free_kbytes = 65536; + setup_per_zone_pages_min(); + setup_per_zone_protection(); + return 0; diff --git a/lustre/kernel_patches/patches/vmalloc_to_page-2.4.19-bgl.patch b/lustre/kernel_patches/patches/vmalloc_to_page-2.4.19-bgl.patch deleted file mode 100644 index 1ff2f5d..0000000 --- a/lustre/kernel_patches/patches/vmalloc_to_page-2.4.19-bgl.patch +++ /dev/null @@ -1,12 +0,0 @@ -Index: linux.mcp2/kernel/ksyms.c -=================================================================== ---- linux.mcp2.orig/kernel/ksyms.c 2004-05-05 14:57:48.000000000 -0700 -+++ linux.mcp2/kernel/ksyms.c 2004-05-05 15:32:44.000000000 -0700 -@@ -108,6 +108,7 @@ - EXPORT_SYMBOL(kfree); - EXPORT_SYMBOL(vfree); - EXPORT_SYMBOL(__vmalloc); -+extern struct page * vmalloc_to_page(void *addr); - EXPORT_SYMBOL(vmalloc_to_page); - EXPORT_SYMBOL(mem_map); - EXPORT_SYMBOL(remap_page_range); diff --git a/lustre/kernel_patches/series/2.6-fc3.series 
b/lustre/kernel_patches/series/2.6-fc3.series index 4b6e21f..361da69 100644 --- a/lustre/kernel_patches/series/2.6-fc3.series +++ b/lustre/kernel_patches/series/2.6-fc3.series @@ -1,5 +1,6 @@ uml-2.6.10-fc3.patch lustre_version.patch +fc3_to_rhel4_updates.patch vfs_intent-2.6-rhel4.patch vfs_nointent-2.6-rhel4.patch vfs_races-2.6-fc3.patch @@ -19,3 +20,4 @@ hostfs_readdir_large.patch ext3-patch-fuzz-fixup-fc3.patch uml-exprt-clearuser.patch fsprivate-2.6.patch +linux-2.6.9-ext3-sub-second-timestamp.patch diff --git a/lustre/kernel_patches/series/2.6-rhel4.series b/lustre/kernel_patches/series/2.6-rhel4.series index 02d7375..1a923c1 100644 --- a/lustre/kernel_patches/series/2.6-rhel4.series +++ b/lustre/kernel_patches/series/2.6-rhel4.series @@ -10,10 +10,12 @@ export_symbols-2.6-rhel4.patch dev_read_only-2.6-suse.patch export-2.6-suse.patch lookup_bdev_init_intent.patch -8kstack-2.6-rhel4.patch remove-suid-2.6-suse.patch export-show_task-2.6-vanilla.patch sd_iostats-2.6-rhel4.patch fsprivate-2.6.patch export_symbol_numa.patch qsnet-rhel4-2.6.patch +linux-2.6-binutils-2.16.patch +compile-fixes-2.6.9-rhel4-22.patch +vm-tunables-rhel4.patch diff --git a/lustre/kernel_patches/series/2.6-suse-newer.series b/lustre/kernel_patches/series/2.6-suse-newer.series index c0de646..1c5d31f 100644 --- a/lustre/kernel_patches/series/2.6-suse-newer.series +++ b/lustre/kernel_patches/series/2.6-suse-newer.series @@ -6,3 +6,4 @@ blkdev_tunables-2.6-suse.patch uml-exprt-clearuser.patch qsnet-suse-2.6.patch fsprivate-2.6.patch +dcache-qstr-api-fix-2.6-suse.patch diff --git a/lustre/kernel_patches/series/2.6.12-vanilla.series b/lustre/kernel_patches/series/2.6.12-vanilla.series new file mode 100644 index 0000000..9ecb127 --- /dev/null +++ b/lustre/kernel_patches/series/2.6.12-vanilla.series @@ -0,0 +1,19 @@ +lustre_version.patch +vfs_intent-2.6.12.patch +vfs_nointent-2.6.12.patch +vfs_races-2.6.12.patch +ext3-wantedi-misc-2.6-suse.patch +jbd-2.6.10-jcberr.patch 
+nfs-cifs-intent-2.6.12.patch +iopen-misc-2.6.12.patch +export-truncate-2.6-suse.patch +export_symbols-2.6.12.patch +dev_read_only-2.6-suse.patch +export-2.6-suse.patch +lookup_bdev_init_intent.patch +8kstack-2.6.12.patch +remove-suid-2.6-suse.patch +export-show_task-2.6-vanilla.patch +sd_iostats-2.6-rhel4.patch +fsprivate-2.6.patch +export_symbol_numa.patch diff --git a/lustre/kernel_patches/series/bgl-2.4.19 b/lustre/kernel_patches/series/bgl-2.4.19 deleted file mode 100644 index bd67a30..0000000 --- a/lustre/kernel_patches/series/bgl-2.4.19 +++ /dev/null @@ -1,47 +0,0 @@ -dev_read_only_2.4.20-rh.patch -exports_2.4.19-bgl.patch -lustre_version.patch -vfs_intent-2.4.19-bgl.patch -invalidate_show-2.4.19-bgl.patch -export-truncate-bgl.patch -iod-stock-24-exports-2.4.19-bgl.patch -ext3-htree-2.4.19-bgl.patch -linux-2.4.19-bgl-xattr-0.8.54.patch -ext3-2.4.20-fixes.patch -ext3-2.4-ino_t.patch -ext3-largefile.patch -ext3-truncate_blocks.patch -ext3-unmount_sync.patch -ext3-use-after-free-2.4.19-pre1.patch -ext3-orphan_lock.patch -ext3-noread-2.4.20.patch -ext3-delete_thread-2.4.20.patch -extN-wantedi.patch -ext3-san-2.4.20.patch -ext3-map_inode_page.patch -ext3-error-export.patch -iopen-2.4.19-bgl.patch -tcp-zero-copy-2.4.19-pre1.patch -jbd-dont-account-blocks-twice.patch -jbd-commit-tricks.patch -ext3-no-write-super.patch -add_page_private-2.4.19-bgl.patch -socket-exports-2.4.19-bgl.patch -removepage-2.4.20.patch -jbd-ctx_switch.patch -jbd-flushtime-2.4.19-suse.patch -jbd-get_write_access.patch -nfs_export_kernel-2.4.19-bgl.patch -ext3-raw-lookup.patch -ext3-ea-in-inode-2.4.20.patch -listman-2.4.19-bgl.patch -ext3-trusted_ea-2.4.20.patch -jbd-2.4.19-pre1-jcberr.patch -resched-2.4.19-pre1.patch -ext3-xattr-ptr-arith-fix.patch -vmalloc_to_page-2.4.19-bgl.patch -procfs-ndynamic-2.4.patch -ext3-truncate-buffer-head.patch -kallsyms-2.4-bgl.patch -kksymoops-2.4-bgl.patch -export-show_task-2.4-bgl.patch diff --git a/lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series 
b/lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series index 8e76197..bab81b9 100644 --- a/lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series +++ b/lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series @@ -8,6 +8,5 @@ export-ext3-2.6-rhel4.patch ext3-include-fixes-2.6-rhel4.patch ext3-extents-2.6.9-rhel4.patch ext3-mballoc2-2.6.9-rhel4.patch -ext3-nlinks-2.6.7.patch -ext3-htree-dot-2.6.patch +ext3-nlinks-2.6.9.patch ext3-ialloc-2.6.patch diff --git a/lustre/kernel_patches/series/ldiskfs-2.6.12-vanilla.series b/lustre/kernel_patches/series/ldiskfs-2.6.12-vanilla.series new file mode 100644 index 0000000..7d0a383 --- /dev/null +++ b/lustre/kernel_patches/series/ldiskfs-2.6.12-vanilla.series @@ -0,0 +1,13 @@ +ext3-wantedi-2.6-rhel4.patch +ext3-san-jdike-2.6-suse.patch +iopen-2.6.12.patch +ext3-map_inode_page-2.6-suse.patch +export-ext3-2.6-rhel4.patch +ext3-include-fixes-2.6-rhel4.patch +ext3-extents-2.6.12.patch +ext3-mballoc2-2.6.12.patch +ext3-nlinks-2.6.9.patch +ext3-ialloc-2.6.patch +ext3-remove-cond_resched-calls-2.6.12.patch +ext3-htree-dot-2.6.patch +ext3-external-journal-2.6.12.patch diff --git a/lustre/kernel_patches/series/rhel-2.4.21 b/lustre/kernel_patches/series/rhel-2.4.21 index 2cc841f..bc6e9f6 100644 --- a/lustre/kernel_patches/series/rhel-2.4.21 +++ b/lustre/kernel_patches/series/rhel-2.4.21 @@ -21,7 +21,7 @@ ext3-error-export.patch iopen-2.4.21-chaos.patch tcp-zero-copy-2.4.21-chaos.patch jbd-dont-account-blocks-twice.patch -jbd-commit-tricks.patch +jbd-commit-tricks-rhel3.patch ext3-o_direct-2.4.21-chaos.patch ext3-no-write-super-chaos.patch add_page_private.patch @@ -50,4 +50,4 @@ nfs_statfs-toomanyfiles-rhel-2.4.patch statfs64-cast-unsigned-2.4-rhel.patch fsprivate-2.4.patch nfsd_iallocsem.patch -linux-2.4.24-jbd-handle-EIO.patch +linux-2.4.24-jbd-handle-EIO-rhel3.patch diff --git a/lustre/kernel_patches/series/suse-2.4.21-cray b/lustre/kernel_patches/series/suse-2.4.21-cray index 12b65ba..e0b9c23 100644 --- 
a/lustre/kernel_patches/series/suse-2.4.21-cray +++ b/lustre/kernel_patches/series/suse-2.4.21-cray @@ -16,7 +16,7 @@ extN-wantedi-2.4.21-suse2.patch ext3-san-2.4.20.patch ext3-map_inode_page-2.4.21-suse2.patch ext3-error-export.patch -iopen-2.4.21-sles8sp3.patch +iopen-2.4.21-chaos.patch tcp-zero-copy-2.4.21-suse2.patch jbd-dont-account-blocks-twice.patch jbd-commit-tricks.patch diff --git a/lustre/kernel_patches/series/suse-2.4.21-jvn b/lustre/kernel_patches/series/suse-2.4.21-jvn index ddcefe4..74e9445 100644 --- a/lustre/kernel_patches/series/suse-2.4.21-jvn +++ b/lustre/kernel_patches/series/suse-2.4.21-jvn @@ -16,7 +16,7 @@ extN-wantedi-2.4.21-suse2.patch ext3-san-2.4.20.patch ext3-map_inode_page-2.4.21-suse2.patch ext3-error-export.patch -iopen-2.4.19-suse.patch +iopen-2.4.21-chaos.patch jbd-dont-account-blocks-twice.patch jbd-commit-tricks.patch ext3-no-write-super-chaos.patch diff --git a/lustre/kernel_patches/targets/2.6-rhel4.target.in b/lustre/kernel_patches/targets/2.6-rhel4.target.in index 0f0dc17..db5c9fa 100644 --- a/lustre/kernel_patches/targets/2.6-rhel4.target.in +++ b/lustre/kernel_patches/targets/2.6-rhel4.target.in @@ -1,5 +1,5 @@ lnxmaj="2.6.9" -lnxrel="5.0.5.EL" +lnxrel="22.0.2.EL" KERNEL=linux-${lnxmaj}-${lnxrel}.tar.bz2 SERIES=2.6-rhel4.series diff --git a/lustre/kernel_patches/targets/2.6-suse.target.in b/lustre/kernel_patches/targets/2.6-suse.target.in index d604f6e..a0a2633 100644 --- a/lustre/kernel_patches/targets/2.6-suse.target.in +++ b/lustre/kernel_patches/targets/2.6-suse.target.in @@ -1,5 +1,5 @@ lnxmaj="2.6.5" -lnxrel="7.201" +lnxrel="7.244" KERNEL=linux-$lnxmaj-$lnxrel.tar.bz2 # they include our patches diff --git a/lustre/kernel_patches/targets/rhel-2.4.target.in b/lustre/kernel_patches/targets/rhel-2.4.target.in index 8982d8f..7af0f35 100644 --- a/lustre/kernel_patches/targets/rhel-2.4.target.in +++ b/lustre/kernel_patches/targets/rhel-2.4.target.in @@ -1,5 +1,5 @@ lnxmaj="2.4.21" -lnxrel="32.0.1.EL" +lnxrel="37.EL" 
KERNEL=linux-${lnxmaj}-${lnxrel}.tar.bz2 SERIES=rhel-2.4.21 diff --git a/lustre/kernel_patches/which_patch b/lustre/kernel_patches/which_patch index 7ee0629..bc48f94 100644 --- a/lustre/kernel_patches/which_patch +++ b/lustre/kernel_patches/which_patch @@ -2,15 +2,20 @@ SERIES MNEMONIC COMMENT ARCH SUPPORTED KERNELS: rhel-2.4.21 linux-2.4.21-20.3EL RHEL3 2.4.21 all -2.6-suse linux-2.6-suse SLES9 SP1 kernel all +2.6-suse linux-2.6-suse already in SLES9 SP1 kernel all 2.6-suse-newer linux-2.6-suse SLES9 SP1 kernel add-ons all 2.6-rhel4 linux-2.6-rhel4 RHEL4 2.6.9 kernel all +2.6.12-vanilla linux-2.6.12.6 kernel.org 2.6.12.6 all NB - The patches in the 2.6-suse series should already be in the SLES9 SP1 kernel. The patches in the 2.6-suse-newer series are patches that have been created since the SP1 kernel was released and should be applied to the already-patched SP1 kernel. +NB - The patches in the ldiskfs series should not be applied to the kernel. + They are instead applied by the lustre build process to create the + ldiskfs kernel module instead of modifying the core ext3 code. 
+ UNSUPPORTED KERNELS; BEING PHASED OUT; MAY BE MISSING CRITICAL BUG FIXES: hp-pnnl-2.4.20 linux-2.4.20-hp4_pnnl1 same as vanilla but no uml ia64 vanilla-2.4.24 linux-2.4.24 patch with uml-2.4.24-6 um diff --git a/lustre/ldiskfs/Makefile.in b/lustre/ldiskfs/Makefile.in index 92d9b6b..e52e62f 100644 --- a/lustre/ldiskfs/Makefile.in +++ b/lustre/ldiskfs/Makefile.in @@ -11,7 +11,7 @@ ext3_headers := $(wildcard @LINUX@/fs/ext3/*.h) linux_headers := $(wildcard @LINUX@/include/linux/ext3*.h) ext3_sources := $(filter-out %.mod.c,$(wildcard @LINUX@/fs/ext3/*.c)) -new_sources := iopen.c iopen.h extents.c mballoc.c proc.c +new_sources := iopen.c iopen.h extents.c mballoc.c new_headers := ext3_extents.h ldiskfs_patched_sources := $(notdir $(ext3_sources) $(ext3_headers)) $(new_sources) $(new_headers) ldiskfs_sources := $(ldiskfs_patched_sources) diff --git a/lustre/ldiskfs/autoMakefile.am b/lustre/ldiskfs/autoMakefile.am index 0eff073..7e378c2 100644 --- a/lustre/ldiskfs/autoMakefile.am +++ b/lustre/ldiskfs/autoMakefile.am @@ -38,7 +38,8 @@ sources: $(ext3_sources) $(ext3_headers) $(linux_headers) $(series) cp $(ext3_sources) $(ext3_headers) $(ext3_extra) linux-stage/fs/ext3 cp $(linux_headers) linux-stage/include/linux if USE_QUILT - cd linux-stage && quilt setup -d ../$(patches) ../$(series) + ln -s ../$(patches) linux-stage/patches + ln -s ../$(series) linux-stage/series cd linux-stage && quilt push -a -q else @echo -n "Applying ext3 patches:" diff --git a/lustre/ldiskfs/lustre_quota_fmt.c b/lustre/ldiskfs/lustre_quota_fmt.c index cf7dc08..9db3f3f 100644 --- a/lustre/ldiskfs/lustre_quota_fmt.c +++ b/lustre/ldiskfs/lustre_quota_fmt.c @@ -31,7 +31,7 @@ typedef char *dqbuf_t; #define GETIDINDEX(id, depth) (((id) >> ((LUSTRE_DQTREEDEPTH-(depth)-1)*8)) & 0xff) #define GETENTRIES(buf) ((struct lustre_disk_dqblk *)(((char *)buf)+sizeof(struct lustre_disk_dqdbheader))) -static int check_quota_file(struct file *f, int type) +static int check_quota_file(struct file *f, struct inode 
*inode, int type) { struct lustre_disk_dqheader dqhead; mm_segment_t fs; @@ -40,11 +40,22 @@ static int check_quota_file(struct file *f, int type) static const uint quota_magics[] = LUSTRE_INITQMAGICS; static const uint quota_versions[] = LUSTRE_INITQVERSIONS; - fs = get_fs(); - set_fs(KERNEL_DS); - size = f->f_op->read(f, (char *)&dqhead, - sizeof(struct lustre_disk_dqheader), &offset); - set_fs(fs); + if (f) { + fs = get_fs(); + set_fs(KERNEL_DS); + size = f->f_op->read(f, (char *)&dqhead, + sizeof(struct lustre_disk_dqheader), + &offset); + set_fs(fs); + } else { +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12) + size = 0; +#else + struct super_block *sb = inode->i_sb; + size = sb->s_op->quota_read(sb, type, (char *)&dqhead, + sizeof(struct lustre_disk_dqheader), 0); +#endif + } if (size != sizeof(struct lustre_disk_dqheader)) return 0; if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || @@ -57,7 +68,7 @@ static int check_quota_file(struct file *f, int type) int lustre_check_quota_file(struct lustre_quota_info *lqi, int type) { struct file *f = lqi->qi_files[type]; - return check_quota_file(f, type); + return check_quota_file(f, NULL, type); } /* Read information header from quota file */ @@ -801,8 +812,26 @@ struct dqblk { uint blk; }; -static int walk_block_dqentry(struct file *filp, uint blk, - struct list_head *list) +static ssize_t quota_read(struct file *file, struct inode *inode, int type, + uint blk, dqbuf_t buf) +{ + if (file) { + return read_blk(file, blk, buf); + } else { +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12) + return -ENOTSUPP; +#else + struct super_block *sb = inode->i_sb; + memset(buf, 0, LUSTRE_DQBLKSIZE); + return sb->s_op->quota_read(sb, type, (char *)buf, + LUSTRE_DQBLKSIZE, + blk << LUSTRE_DQBLKSIZE_BITS); +#endif + } +} + +static int walk_block_dqentry(struct file *filp, struct inode *inode, int type, + uint blk, struct list_head *list) { dqbuf_t buf = getdqbuf(); loff_t ret = 0; @@ -814,7 +843,7 @@ static int 
walk_block_dqentry(struct file *filp, uint blk, if (!buf) return -ENOMEM; - if ((ret = read_blk(filp, blk, buf)) < 0) { + if ((ret = quota_read(filp, inode, type, blk, buf)) < 0) { printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); goto out_buf; } @@ -852,8 +881,8 @@ out_buf: return ret; } -static int walk_tree_dqentry(struct file *filp, uint blk, int depth, - struct list_head *list) +static int walk_tree_dqentry(struct file *filp, struct inode *inode, int type, + uint blk, int depth, struct list_head *list) { dqbuf_t buf = getdqbuf(); loff_t ret = 0; @@ -862,7 +891,7 @@ static int walk_tree_dqentry(struct file *filp, uint blk, int depth, if (!buf) return -ENOMEM; - if ((ret = read_blk(filp, blk, buf)) < 0) { + if ((ret = quota_read(filp, inode, type, blk, buf)) < 0) { printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); goto out_buf; } @@ -874,9 +903,10 @@ static int walk_tree_dqentry(struct file *filp, uint blk, int depth, continue; if (depth < LUSTRE_DQTREEDEPTH - 1) - ret = walk_tree_dqentry(filp, blk, depth + 1, list); + ret = walk_tree_dqentry(filp, inode, type, blk, + depth + 1, list); else - ret = walk_block_dqentry(filp, blk, list); + ret = walk_block_dqentry(filp, inode, type, blk, list); } out_buf: freedqbuf(buf); @@ -884,17 +914,16 @@ out_buf: } /* Walk through the quota file (v2 format) to get all ids with quota limit */ -int lustre_get_qids(struct lustre_quota_info *lqi, int type, +int lustre_get_qids(struct file *fp, struct inode *inode, int type, struct list_head *list) { - struct file *fp = lqi->qi_files[type]; struct list_head blk_list; struct dqblk *blk_item, *tmp; dqbuf_t buf = NULL; struct lustre_disk_dqblk *ddquot; int rc; - if (!check_quota_file(fp, type)) { + if (!check_quota_file(fp, inode, type)) { printk(KERN_ERR "unknown quota file format!\n"); return -EINVAL; } @@ -904,7 +933,7 @@ int lustre_get_qids(struct lustre_quota_info *lqi, int type, } INIT_LIST_HEAD(&blk_list); - rc = walk_tree_dqentry(fp, 
LUSTRE_DQTREEOFF, 0, &blk_list); + rc = walk_tree_dqentry(fp, inode, type, LUSTRE_DQTREEOFF, 0, &blk_list); if (rc) { printk(KERN_ERR "walk through quota file failed!(%d)\n", rc); goto out_free; @@ -923,7 +952,7 @@ int lustre_get_qids(struct lustre_quota_info *lqi, int type, struct lustre_disk_dqblk fakedquot; memset(buf, 0, LUSTRE_DQBLKSIZE); - if ((ret = read_blk(fp, blk_item->blk, buf)) < 0) { + if ((ret = quota_read(fp, inode, type, blk_item->blk, buf))<0) { printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk_item->blk); diff --git a/lustre/ldiskfs/quotafmt_test.c b/lustre/ldiskfs/quotafmt_test.c index b7eea6a..5f6bc7c 100644 --- a/lustre/ldiskfs/quotafmt_test.c +++ b/lustre/ldiskfs/quotafmt_test.c @@ -344,6 +344,7 @@ static int quotfmt_test_4(struct lustre_quota_info *lqi) static int quotfmt_test_5(struct lustre_quota_info *lqi) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12) int i, rc = 0; for (i = USRQUOTA; i < MAXQUOTAS && !rc; i++) { @@ -351,7 +352,7 @@ static int quotfmt_test_5(struct lustre_quota_info *lqi) struct dquot_id *dqid, *tmp; INIT_LIST_HEAD(&list); - rc = lustre_get_qids(lqi, i, &list); + rc = lustre_get_qids(lqi->qi_files[i], NULL, i, &list); if (rc) { CERROR("%s get all %ss (rc:%d):\n", rc ? 
"error" : "success", @@ -366,6 +367,10 @@ static int quotfmt_test_5(struct lustre_quota_info *lqi) printk("\n"); } return rc; +#else + CWARN("kernel version >= 2.6.12, test skipped\n"); + return 0; +#endif } static int quotfmt_run_tests(struct obd_device *obd, struct obd_device *tgt) @@ -420,6 +425,7 @@ static int quotfmt_run_tests(struct obd_device *obd, struct obd_device *tgt) CERROR("walk through quota file failed\n"); GOTO(out, rc); } + out: CWARN("=== Finalize quotafile test\n"); rc = quotfmt_finalize(lqi, tgt, &saved); diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h index fa2db34..a2dcf4b 100644 --- a/lustre/ldlm/ldlm_internal.h +++ b/lustre/ldlm/ldlm_internal.h @@ -20,7 +20,7 @@ void ldlm_grant_lock(struct ldlm_lock *lock, void *data, int datalen, struct ldlm_lock * ldlm_lock_create(struct ldlm_namespace *ns, struct lustre_handle *parent_lock_handle, struct ldlm_res_id, - __u32 type, ldlm_mode_t, ldlm_blocking_callback, + ldlm_type_t type, ldlm_mode_t, ldlm_blocking_callback, ldlm_completion_callback, ldlm_glimpse_callback, void *data, __u32 lvb_len); ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **, diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index b6a0dff..0dab5b9 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -129,7 +129,7 @@ void ldlm_lock_put(struct ldlm_lock *lock) struct obd_export *export = NULL; l_lock(&ns->ns_lock); - LDLM_DEBUG(lock, "final lock_put on destroyed lock, freeing"); + LDLM_DEBUG(lock, "final lock_put on destroyed lock, freeing it."); LASSERT(lock->l_destroyed); LASSERT(list_empty(&lock->l_res_link)); @@ -380,11 +380,48 @@ struct ldlm_lock *ldlm_handle2lock_ns(struct ldlm_namespace *ns, void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc) { - ldlm_res2desc(lock->l_resource, &desc->l_resource); - desc->l_req_mode = lock->l_req_mode; - desc->l_granted_mode = lock->l_granted_mode; - memcpy(&desc->l_policy_data, 
&lock->l_policy_data, - sizeof(desc->l_policy_data)); + struct obd_export *exp = lock->l_export?:lock->l_conn_export; + /* INODEBITS_INTEROP: If the other side does not support + * inodebits, reply with a plain lock descriptor. + */ + if ((lock->l_resource->lr_type == LDLM_IBITS) && + (exp && !(exp->exp_connect_flags & OBD_CONNECT_IBITS))) { + struct ldlm_resource res = *lock->l_resource; + + /* Make sure all the right bits are set in this lock we + are going to pass to client */ + LASSERTF(lock->l_policy_data.l_inodebits.bits == + (MDS_INODELOCK_LOOKUP|MDS_INODELOCK_UPDATE), + "Inappropriate inode lock bits during " + "conversion " LPU64 "\n", + lock->l_policy_data.l_inodebits.bits); + res.lr_type = LDLM_PLAIN; + ldlm_res2desc(&res, &desc->l_resource); + /* Convert "new" lock mode to something old client can + understand */ + if ((lock->l_req_mode == LCK_CR) || + (lock->l_req_mode == LCK_CW)) + desc->l_req_mode = LCK_PR; + else + desc->l_req_mode = lock->l_req_mode; + if ((lock->l_granted_mode == LCK_CR) || + (lock->l_granted_mode == LCK_CW)) { + desc->l_granted_mode = LCK_PR; + } else { + /* We never grant PW/EX locks to clients */ + LASSERT((lock->l_granted_mode != LCK_PW) && + (lock->l_granted_mode != LCK_EX)); + desc->l_granted_mode = lock->l_granted_mode; + } + + /* We do not copy policy here, because there is no + policy for plain locks */ + } else { + ldlm_res2desc(lock->l_resource, &desc->l_resource); + desc->l_req_mode = lock->l_req_mode; + desc->l_granted_mode = lock->l_granted_mode; + desc->l_policy_data = lock->l_policy_data; + } } void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, @@ -512,7 +549,7 @@ void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode) void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode) { struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); - LASSERT(lock != NULL); + LASSERTF(lock != NULL, "Non-existing lock: "LPX64"\n", lockh->cookie); ldlm_lock_decref_internal(lock, mode); 
LDLM_LOCK_PUT(lock); } @@ -724,19 +761,19 @@ int ldlm_lock_match(struct ldlm_namespace *ns, int flags, if (rc) { l_lock(&ns->ns_lock); LDLM_DEBUG(lock, "matched ("LPU64" "LPU64")", - type == LDLM_PLAIN ? res_id->name[2] : - policy->l_extent.start, - type == LDLM_PLAIN ? res_id->name[3] : - policy->l_extent.end); + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); l_unlock(&ns->ns_lock); } else if (!(flags & LDLM_FL_TEST_LOCK)) {/*less verbose for test-only*/ LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res " LPU64"/"LPU64" ("LPU64" "LPU64")", ns, type, mode, res_id->name[0], res_id->name[1], - type == LDLM_PLAIN ? res_id->name[2] : - policy->l_extent.start, - type == LDLM_PLAIN ? res_id->name[3] : - policy->l_extent.end); + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] :policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); } if (old_lock) LDLM_LOCK_PUT(old_lock); @@ -808,6 +845,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, ldlm_error_t rc = ELDLM_OK; ENTRY; + do_gettimeofday(&lock->l_enqueued_time); /* policies are not executed on the client or during replay */ if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT && !local && ns->ns_policy) { @@ -843,7 +881,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, /* Some flags from the enqueue want to make it into the AST, via the * lock's l_flags. 
*/ - lock->l_flags |= (*flags & (LDLM_AST_DISCARD_DATA|LDLM_INHERIT_FLAGS)); + lock->l_flags |= *flags & LDLM_AST_DISCARD_DATA; /* This distinction between local lock trees is very important; a client * namespace only has information about locks taken by that client, and @@ -1191,15 +1229,15 @@ void ldlm_lock_dump(int level, struct ldlm_lock *lock, int pos) return; } - CDEBUG(level, " -- Lock dump: %p/"LPX64" (rc: %d) (pos: %d) (pid: %d)\n", + CDEBUG(level," -- Lock dump: %p/"LPX64" (rc: %d) (pos: %d) (pid: %d)\n", lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), pos, lock->l_pid); if (lock->l_conn_export != NULL) obd = lock->l_conn_export->exp_obd; if (lock->l_export && lock->l_export->exp_connection) { CDEBUG(level, " Node: NID %s (rhandle: "LPX64")\n", - libcfs_nid2str(lock->l_export->exp_connection->c_peer.nid), - lock->l_remote_handle.cookie); + libcfs_nid2str(lock->l_export->exp_connection->c_peer.nid), + lock->l_remote_handle.cookie); } else if (obd == NULL) { CDEBUG(level, " Node: local\n"); } else { @@ -1212,9 +1250,10 @@ void ldlm_lock_dump(int level, struct ldlm_lock *lock, int pos) lock->l_resource->lr_name.name[0], lock->l_resource->lr_name.name[1]); CDEBUG(level, " Req mode: %s, grant mode: %s, rc: %u, read: %d, " - "write: %d\n", ldlm_lockname[lock->l_req_mode], + "write: %d flags: %#x\n", ldlm_lockname[lock->l_req_mode], ldlm_lockname[lock->l_granted_mode], - atomic_read(&lock->l_refc), lock->l_readers, lock->l_writers); + atomic_read(&lock->l_refc), lock->l_readers, lock->l_writers, + lock->l_flags); if (lock->l_resource->lr_type == LDLM_EXTENT) CDEBUG(level, " Extent: "LPU64" -> "LPU64 " (req "LPU64"-"LPU64")\n", diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index b3a82c0..f23f530 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -257,6 +257,7 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock) cfs_time_t timeout_rounded; l_check_ns_lock(lock->l_resource->lr_namespace); + 
LASSERT(!(lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)); spin_lock_bh(&waiting_locks_spinlock); if (lock->l_destroyed) { @@ -342,6 +343,7 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock) static int ldlm_add_waiting_lock(struct ldlm_lock *lock) { + LASSERT(!(lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)); RETURN(1); } @@ -363,7 +365,7 @@ static void ldlm_failed_ast(struct ldlm_lock *lock, int rc, LDLM_ERROR(lock, "%s AST failed (%d): evicting client %s@%s NID %s" " (%s)", ast_type, rc, lock->l_export->exp_client_uuid.uuid, - conn->c_remote_uuid.uuid, libcfs_nid2str(conn->c_peer.nid), + conn->c_remote_uuid.uuid, libcfs_nid2str(conn->c_peer.nid), str); if (obd_dump_on_timeout) @@ -385,6 +387,12 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock, libcfs_nid2str(peer.nid)); ldlm_lock_cancel(lock); rc = -ERESTART; + } else if (lock->l_flags & LDLM_FL_CANCEL) { + LDLM_DEBUG(lock, "%s AST timeout from nid %s, but " + "cancel was received (AST reply lost?)", + ast_type, libcfs_nid2str(peer.nid)); + ldlm_lock_cancel(lock); + rc = -ERESTART; } else { l_lock(&lock->l_resource->lr_namespace->ns_lock); ldlm_del_waiting_lock(lock); @@ -457,32 +465,31 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, instant_cancel = 1; req = ptlrpc_prep_req(lock->l_export->exp_imp_reverse, - LDLM_BL_CALLBACK, 1, &size, NULL); + LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK, + 1, &size, NULL); if (req == NULL) { l_unlock(&lock->l_resource->lr_namespace->ns_lock); RETURN(-ENOMEM); } body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); - memcpy(&body->lock_handle1, &lock->l_remote_handle, - sizeof(body->lock_handle1)); - memcpy(&body->lock_desc, desc, sizeof(*desc)); + body->lock_handle1 = lock->l_remote_handle; + body->lock_desc = *desc; body->lock_flags |= (lock->l_flags & LDLM_AST_FLAGS); LDLM_DEBUG(lock, "server preparing blocking AST"); req->rq_replen = lustre_msg_size(0, NULL); - if (instant_cancel) { + if (instant_cancel) ldlm_lock_cancel(lock); -// ldlm_reprocess_all(lock->l_resource); 
- } else if (lock->l_granted_mode == lock->l_req_mode) { + else if (lock->l_granted_mode == lock->l_req_mode) ldlm_add_waiting_lock(lock); - } + l_unlock(&lock->l_resource->lr_namespace->ns_lock); req->rq_send_state = LUSTRE_IMP_FULL; req->rq_timeout = ldlm_timeout; /* timeout for initial AST reply */ if (unlikely(instant_cancel)) { - rc = ptl_send_rpc_nowait(req); + rc = ptl_send_rpc(req, 1); } else { rc = ptlrpc_queue_wait(req); } @@ -491,6 +498,10 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, ptlrpc_req_finished(req); + /* If we cancelled the lock, we need to restart ldlm_reprocess_queue */ + if (!rc && instant_cancel) + rc = -ERESTART; + RETURN(rc); } @@ -500,7 +511,7 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) struct ptlrpc_request *req; struct timeval granted_time; long total_enqueue_wait; - int rc = 0, size[2] = {sizeof(*body)}, buffers = 1; + int rc = 0, size[2] = {sizeof(*body)}, buffers = 1, instant_cancel = 0; ENTRY; LASSERT(lock != NULL); @@ -520,13 +531,13 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) mutex_up(&lock->l_resource->lr_lvb_sem); req = ptlrpc_prep_req(lock->l_export->exp_imp_reverse, - LDLM_CP_CALLBACK, buffers, size, NULL); + LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK, + buffers, size, NULL); if (req == NULL) RETURN(-ENOMEM); body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); - memcpy(&body->lock_handle1, &lock->l_remote_handle, - sizeof(body->lock_handle1)); + body->lock_handle1 = lock->l_remote_handle; body->lock_flags = flags; ldlm_lock2desc(lock, &body->lock_desc); @@ -552,8 +563,20 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) l_lock(&lock->l_resource->lr_namespace->ns_lock); if (lock->l_flags & LDLM_FL_AST_SENT) { body->lock_flags |= LDLM_FL_AST_SENT; - body->lock_flags &= ~LDLM_FL_CANCEL_ON_BLOCK; - ldlm_add_waiting_lock(lock); /* start the lock-timeout clock */ + + /* We might get here prior to ldlm_handle_enqueue 
setting + LDLM_FL_CANCEL_ON_BLOCK flag. Then we will put this lock into + waiting list, but this is safe and similar code in + ldlm_handle_enqueue will call ldlm_lock_cancel() still, that + would not only cancel the lock, but will also remove it from + waiting list */ + if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) { + ldlm_lock_cancel(lock); + instant_cancel = 1; + } else { + ldlm_add_waiting_lock(lock); /* start the lock-timeout + clock */ + } } l_unlock(&lock->l_resource->lr_namespace->ns_lock); @@ -563,6 +586,10 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) ptlrpc_req_finished(req); + /* If we cancelled the lock, we need to restart ldlm_reprocess_queue */ + if (!rc && instant_cancel) + rc = -ERESTART; + RETURN(rc); } @@ -577,13 +604,13 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) LASSERT(lock != NULL); req = ptlrpc_prep_req(lock->l_export->exp_imp_reverse, - LDLM_GL_CALLBACK, 1, &size, NULL); + LUSTRE_DLM_VERSION, LDLM_GL_CALLBACK, + 1, &size, NULL); if (req == NULL) RETURN(-ENOMEM); body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body)); - memcpy(&body->lock_handle1, &lock->l_remote_handle, - sizeof(body->lock_handle1)); + body->lock_handle1 = lock->l_remote_handle; ldlm_lock2desc(lock, &body->lock_desc); mutex_down(&lock->l_resource->lr_lvb_sem); @@ -627,6 +654,10 @@ find_existing_lock(struct obd_export *exp, struct lustre_handle *remote_hdl) } +/* + * Main server-side entry point into LDLM. This is called by ptlrpc service + * threads to carry out client lock enqueueing requests. 
+ */ int ldlm_handle_enqueue(struct ptlrpc_request *req, ldlm_completion_callback completion_callback, ldlm_blocking_callback blocking_callback, @@ -644,7 +675,8 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req, LDLM_DEBUG_NOLOCK("server-side enqueue handler START"); - dlm_req = lustre_swab_reqbuf (req, 0, sizeof (*dlm_req), + dlm_req = lustre_swab_reqbuf (req, MDS_REQ_INTENT_LOCKREQ_OFF, + sizeof (*dlm_req), lustre_swab_ldlm_request); if (dlm_req == NULL) { CERROR ("Can't unpack dlm_req\n"); @@ -655,16 +687,6 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req, LASSERT(req->rq_export); - if (flags & LDLM_FL_REPLAY) { - lock = find_existing_lock(req->rq_export, - &dlm_req->lock_handle1); - if (lock != NULL) { - DEBUG_REQ(D_HA, req, "found existing lock cookie "LPX64, - lock->l_handle.h_cookie); - GOTO(existing_lock, rc = 0); - } - } - if (dlm_req->lock_desc.l_resource.lr_type < LDLM_MIN_TYPE || dlm_req->lock_desc.l_resource.lr_type >= LDLM_MAX_TYPE) { DEBUG_REQ(D_ERROR, req, "invalid lock request type %d\n", @@ -680,6 +702,39 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req, GOTO(out, rc = -EFAULT); } + if (req->rq_export->exp_connect_flags & OBD_CONNECT_IBITS) { + if (dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN) { + DEBUG_REQ(D_ERROR, req, + "PLAIN lock request from IBITS client?\n"); + GOTO(out, rc = -EPROTO); + } + } else if (dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) { + DEBUG_REQ(D_ERROR, req, + "IBITS lock request from unaware client?\n"); + GOTO(out, rc = -EPROTO); + } + + /* INODEBITS_INTEROP: Perform conversion from plain lock to + * inodebits lock if client does not support them. 
*/ + if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_IBITS) && + (dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN)) { + dlm_req->lock_desc.l_resource.lr_type = LDLM_IBITS; + dlm_req->lock_desc.l_policy_data.l_inodebits.bits = + MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE; + if (dlm_req->lock_desc.l_req_mode == LCK_PR) + dlm_req->lock_desc.l_req_mode = LCK_CR; + } + + if (flags & LDLM_FL_REPLAY) { + lock = find_existing_lock(req->rq_export, + &dlm_req->lock_handle1); + if (lock != NULL) { + DEBUG_REQ(D_HA, req, "found existing lock cookie "LPX64, + lock->l_handle.h_cookie); + GOTO(existing_lock, rc = 0); + } + } + /* The lock's callback data might be set in the policy function */ lock = ldlm_lock_create(obddev->obd_namespace, &dlm_req->lock_handle2, dlm_req->lock_desc.l_resource.lr_name, @@ -691,8 +746,7 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req, GOTO(out, rc = -ENOMEM); do_gettimeofday(&lock->l_enqueued_time); - memcpy(&lock->l_remote_handle, &dlm_req->lock_handle1, - sizeof(lock->l_remote_handle)); + lock->l_remote_handle = dlm_req->lock_handle1; LDLM_DEBUG(lock, "server-side enqueue handler, new lock created"); OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_BLOCKED, obd_timeout * 2); @@ -734,11 +788,9 @@ existing_lock: } if (dlm_req->lock_desc.l_resource.lr_type != LDLM_PLAIN) - memcpy(&lock->l_policy_data, &dlm_req->lock_desc.l_policy_data, - sizeof(ldlm_policy_data_t)); + lock->l_policy_data = dlm_req->lock_desc.l_policy_data; if (dlm_req->lock_desc.l_resource.lr_type == LDLM_EXTENT) - memcpy(&lock->l_req_extent, &lock->l_policy_data.l_extent, - sizeof(lock->l_req_extent)); + lock->l_req_extent = lock->l_policy_data.l_extent; err = ldlm_lock_enqueue(obddev->obd_namespace, &lock, cookie, &flags); if (err) @@ -753,6 +805,12 @@ existing_lock: /* We never send a blocking AST until the lock is granted, but * we can tell it right now */ l_lock(&lock->l_resource->lr_namespace->ns_lock); + + /* Now take into account flags to be inherited from original 
lock + request both in reply to client and in our own lock flags. */ + dlm_rep->lock_flags |= dlm_req->lock_flags & LDLM_INHERIT_FLAGS; + lock->l_flags |= dlm_req->lock_flags & LDLM_INHERIT_FLAGS; + /* Don't move a pending lock onto the export if it has already * been evicted. Cancel it now instead. (bug 5683) */ if (req->rq_export->exp_failed || @@ -761,10 +819,36 @@ existing_lock: rc = -ENOTCONN; } else if (lock->l_flags & LDLM_FL_AST_SENT) { dlm_rep->lock_flags |= LDLM_FL_AST_SENT; - dlm_rep->lock_flags &= ~LDLM_FL_CANCEL_ON_BLOCK; - if (lock->l_granted_mode == lock->l_req_mode) + if (dlm_rep->lock_flags & LDLM_FL_CANCEL_ON_BLOCK) + ldlm_lock_cancel(lock); + else if (lock->l_granted_mode == lock->l_req_mode) ldlm_add_waiting_lock(lock); } + /* Make sure we never ever grant usual metadata locks to liblustre + clients */ + if ((dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN || + dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) && + req->rq_export->exp_libclient) { + if (!(lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) || + !(dlm_rep->lock_flags & LDLM_FL_CANCEL_ON_BLOCK)) { + CERROR("Granting sync lock to libclient. 
" + "req fl %d, rep fl %d, lock fl %d\n", + dlm_req->lock_flags, dlm_rep->lock_flags, + lock->l_flags); + LDLM_ERROR(lock, "sync lock"); + if (dlm_req->lock_flags & LDLM_FL_HAS_INTENT) { + struct ldlm_intent *it; + it = lustre_msg_buf(req->rq_reqmsg, 1, + sizeof(*it)); + if (it != NULL) { + CERROR("This is intent %s (" + LPU64 ")\n", + ldlm_it2str(it->opc), it->opc); + } + } + } + } + l_unlock(&lock->l_resource->lr_namespace->ns_lock); EXIT; @@ -805,8 +889,10 @@ existing_lock: if (!err && dlm_req->lock_desc.l_resource.lr_type != LDLM_FLOCK) ldlm_reprocess_all(lock->l_resource); + LDLM_LOCK_PUT(lock); } + LDLM_DEBUG_NOLOCK("server-side enqueue handler END (lock %p, rc %d)", lock, rc); @@ -994,8 +1080,7 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, } if (lock->l_resource->lr_type != LDLM_PLAIN) { - memcpy(&lock->l_policy_data, &dlm_req->lock_desc.l_policy_data, - sizeof(lock->l_policy_data)); + lock->l_policy_data = dlm_req->lock_desc.l_policy_data; LDLM_DEBUG(lock, "completion AST, new policy data"); } diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index 4679c7d..02e22fd 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -480,7 +480,7 @@ ldlm_resource_add(struct ldlm_namespace *ns, struct ldlm_resource *parent, RETURN(NULL); l_lock(&ns->ns_lock); - memcpy(&res->lr_name, &name, sizeof(res->lr_name)); + res->lr_name = name; res->lr_namespace = ns; atomic_inc(&ns->ns_refcount); @@ -689,7 +689,7 @@ EXPORT_SYMBOL(ldlm_resource_unlink_lock); void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc) { desc->lr_type = res->lr_type; - memcpy(&desc->lr_name, &res->lr_name, sizeof(desc->lr_name)); + desc->lr_name = res->lr_name; } void ldlm_dump_all_namespaces(int level) diff --git a/lustre/liblustre/Makefile.am b/lustre/liblustre/Makefile.am index 72b71e6..872960f 100644 --- a/lustre/liblustre/Makefile.am +++ b/lustre/liblustre/Makefile.am @@ -61,7 +61,7 @@ liblustre_a_SOURCES = 
llite_lib.c super.c namei.c rw.c file.c dir.c \ llite_lib.h liblustre.a : $(LUSTRE_LIBS) $(LND_LIBS) $(LNET_LIBS) $(SYSIO_LIBS) $(QUOTA_LIBS) - sh $(srcdir)/genlib.sh "$(SYSIO)" "$(LIBS)" "$(LND_LIBS)" "$(PTHREAD_LIBS)" "$(QUOTA_LIBS)" + sh $(srcdir)/genlib.sh "$(SYSIO)" "$(LIBS)" "$(LND_LIBS)" "$(PTHREAD_LIBS)" "$(QUOTA_LIBS)" "$(CAP_LIBS)" EXTRA_DIST = genlib.sh diff --git a/lustre/liblustre/dir.c b/lustre/liblustre/dir.c index 591b434..3a59da9 100644 --- a/lustre/liblustre/dir.c +++ b/lustre/liblustre/dir.c @@ -82,11 +82,11 @@ static int llu_dir_do_readpage(struct inode *inode, struct page *page) ENTRY; rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED, - &res_id, LDLM_IBITS, &policy, LCK_PR, &lockh); + &res_id, LDLM_IBITS, &policy, LCK_CR, &lockh); if (!rc) { llu_prepare_mdc_op_data(&data, inode, NULL, NULL, 0, 0); - rc = mdc_enqueue(sbi->ll_mdc_exp, LDLM_IBITS, &it, LCK_PR, + rc = mdc_enqueue(sbi->ll_mdc_exp, LDLM_IBITS, &it, LCK_CR, &data, &lockh, NULL, 0, ldlm_completion_ast, llu_mdc_blocking_ast, inode, LDLM_FL_CANCEL_ON_BLOCK); @@ -117,7 +117,7 @@ static int llu_dir_do_readpage(struct inode *inode, struct page *page) ptlrpc_req_finished(request); EXIT; - ldlm_lock_decref(&lockh, LCK_PR); + ldlm_lock_decref(&lockh, LCK_CR); return rc; } @@ -194,8 +194,8 @@ static int filldir(char *buf, int buflen, return 0; } -ssize_t llu_iop_getdirentries(struct inode *ino, char *buf, size_t nbytes, - _SYSIO_OFF_T *basep) +ssize_t llu_iop_filldirentries(struct inode *ino, _SYSIO_OFF_T *basep, + char *buf, size_t nbytes) { struct llu_inode_info *lli = llu_i2info(ino); struct intnl_stat *st = llu_i2stat(ino); diff --git a/lustre/liblustre/file.c b/lustre/liblustre/file.c index 04d0894..19ce42b 100644 --- a/lustre/liblustre/file.c +++ b/lustre/liblustre/file.c @@ -237,7 +237,6 @@ int llu_objects_destroy(struct ptlrpc_request *request, struct inode *dir) int rc; ENTRY; - oti.oti_thread = request->rq_svc_thread; /* req is swabbed so this is safe */ body = 
lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body)); @@ -260,7 +259,7 @@ int llu_objects_destroy(struct ptlrpc_request *request, struct inode *dir) GOTO(out, rc = -EPROTO); } - rc = obd_unpackmd(llu_i2obdexp(dir), &lsm, eadata, body->eadatasize); + rc = obd_unpackmd(llu_i2obdexp(dir), &lsm, eadata,body->eadatasize); if (rc < 0) { CERROR("obd_unpackmd: %d\n", rc); GOTO(out, rc); @@ -287,7 +286,7 @@ int llu_objects_destroy(struct ptlrpc_request *request, struct inode *dir) } } - rc = obd_destroy(llu_i2obdexp(dir), oa, lsm, &oti); + rc = obd_destroy(llu_i2obdexp(dir), oa, lsm, &oti, NULL); obdo_free(oa); if (rc) CERROR("obd destroy objid 0x"LPX64" error %d\n", @@ -418,7 +417,7 @@ _SYSIO_OFF_T llu_iop_pos(struct inode *ino, _SYSIO_OFF_T off) /* this isn't where truncate starts. roughly: * llu_iop_{open,setattr}->llu_setattr_raw->llu_vmtruncate->llu_truncate * we grab the lock back in setattr_raw to avoid races. */ -static void llu_truncate(struct inode *inode) +static void llu_truncate(struct inode *inode, obd_flag flags) { struct llu_inode_info *lli = llu_i2info(inode); struct intnl_stat *st = llu_i2stat(inode); @@ -438,9 +437,12 @@ static void llu_truncate(struct inode *inode) } oa.o_id = lsm->lsm_object_id; - oa.o_valid = OBD_MD_FLID; - obdo_from_inode(&oa, inode, OBD_MD_FLTYPE|OBD_MD_FLMODE|OBD_MD_FLATIME| - OBD_MD_FLMTIME | OBD_MD_FLCTIME); + oa.o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS; + oa.o_flags = flags; /* We don't actually want to copy inode flags */ + + obdo_from_inode(&oa, inode, + OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLATIME | + OBD_MD_FLMTIME | OBD_MD_FLCTIME); obd_adjust_kms(llu_i2obdexp(inode), lsm, st->st_size, 1); @@ -462,11 +464,17 @@ static void llu_truncate(struct inode *inode) return; } /* llu_truncate */ -int llu_vmtruncate(struct inode * inode, loff_t offset) +int llu_vmtruncate(struct inode * inode, loff_t offset, obd_flag flags) { llu_i2stat(inode)->st_size = offset; - llu_truncate(inode); + /* + * llu_truncate() is only called from this + * 
point. llu_vmtruncate/llu_truncate split exists to mimic the + * structure of Linux VFS truncate code path. + */ + + llu_truncate(inode, flags); return 0; } diff --git a/lustre/liblustre/genlib.sh b/lustre/liblustre/genlib.sh index c7e7e06..6d977b3 100755 --- a/lustre/liblustre/genlib.sh +++ b/lustre/liblustre/genlib.sh @@ -22,6 +22,7 @@ LIBS=$2 LND_LIBS=$3 PTHREAD_LIBS=$4 QUOTA_LIBS=$5 +CAP_LIBS=$6 if [ ! -f $SYSIO/lib/libsysio.a ]; then echo "ERROR: $SYSIO/lib/libsysio.a dosen't exist" @@ -103,7 +104,7 @@ if test x$OS = xAIX; then gcc -shared -o $CWD/liblustre.so $ALL_OBJS -lpthread -Xlinker -bnoipath ../../libsyscall.so else $LD -shared -o $CWD/liblustre.so -init __liblustre_setup_ -fini __liblustre_cleanup_ \ - $ALL_OBJS -lcap $PTHREAD_LIBS + $ALL_OBJS $CAP_LIBS $PTHREAD_LIBS fi rm -rf $sysio_tmp diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c index 10e3472..21c50ec 100644 --- a/lustre/liblustre/llite_lib.c +++ b/lustre/liblustre/llite_lib.c @@ -40,14 +40,6 @@ #include #endif -/* env variables */ -#define ENV_LUSTRE_MNTPNT "LIBLUSTRE_MOUNT_POINT" -#define ENV_LUSTRE_MNTTGT "LIBLUSTRE_MOUNT_TARGET" -#define ENV_LUSTRE_TIMEOUT "LIBLUSTRE_TIMEOUT" -#define ENV_LUSTRE_DUMPFILE "LIBLUSTRE_DUMPFILE" -#define ENV_LUSTRE_DEBUG_MASK "LIBLUSTRE_DEBUG_MASK" -#define ENV_LUSTRE_DEBUG_SUBSYS "LIBLUSTRE_DEBUG_SUBSYS" - /* both sys/queue.h (libsysio require it) and portals/lists.h have definition * of 'LIST_HEAD'. 
undef it to suppress warnings */ @@ -56,12 +48,13 @@ #include "lutil.h" #include "llite_lib.h" +#include static int lllib_init(void) { if (liblustre_init_current("liblustre") || - init_obdclass() || init_lib_portals() || + init_obdclass() || ptlrpc_init() || mdc_init() || lov_init() || @@ -87,6 +80,7 @@ int liblustre_process_log(struct config_llog_instance *cfg, struct llog_ctxt *ctxt; lnet_nid_t nid = 0; int err, rc = 0; + struct obd_connect_data *ocd = NULL; ENTRY; generate_random_uuid(uuid); @@ -129,12 +123,18 @@ int liblustre_process_log(struct config_llog_instance *cfg, if (obd == NULL) GOTO(out_cleanup, rc = -EINVAL); + OBD_ALLOC(ocd, sizeof(*ocd)); + if (ocd == NULL) + GOTO(out_cleanup, rc = -ENOMEM); + + ocd->ocd_version = LUSTRE_VERSION_CODE; + /* Disable initial recovery on this import */ rc = obd_set_info(obd->obd_self_export, strlen("initial_recov"), "initial_recov", sizeof(allow_recov), &allow_recov); - rc = obd_connect(&mdc_conn, obd, &mdc_uuid, NULL /*connect_flags*/); + rc = obd_connect(&mdc_conn, obd, &mdc_uuid, ocd); if (rc) { CERROR("cannot connect to %s: rc = %d\n", mdsname, rc); GOTO(out_cleanup, rc); @@ -155,6 +155,9 @@ int liblustre_process_log(struct config_llog_instance *cfg, CERROR("obd_disconnect failed: rc = %d\n", err); out_cleanup: + if (ocd) + OBD_FREE(ocd, sizeof(*ocd)); + lustre_cfg_bufs_reset(&bufs, name); lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); err = class_process_config(lcfg); @@ -257,7 +260,7 @@ int _sysio_lustre_init(void) perror("init llite driver"); return err; } - timeout = getenv(ENV_LUSTRE_TIMEOUT); + timeout = getenv("LIBLUSTRE_TIMEOUT"); if (timeout) { obd_timeout = (unsigned int) strtol(timeout, NULL, 0); printf("LibLustre: set obd timeout as %u seconds\n", @@ -265,11 +268,11 @@ int _sysio_lustre_init(void) } /* debug masks */ - debug_mask = getenv(ENV_LUSTRE_DEBUG_MASK); + debug_mask = getenv("LIBLUSTRE_DEBUG_MASK"); if (debug_mask) libcfs_debug = (unsigned int) strtol(debug_mask, NULL, 0); - debug_subsys = 
getenv(ENV_LUSTRE_DEBUG_SUBSYS); + debug_subsys = getenv("LIBLUSTRE_DEBUG_SUBSYS"); if (debug_subsys) libcfs_subsystem_debug = (unsigned int) strtol(debug_subsys, NULL, 0); @@ -294,20 +297,20 @@ void __liblustre_setup_(void) unsigned mntflgs = 0; int err; - lustre_path = getenv(ENV_LUSTRE_MNTPNT); + lustre_path = getenv("LIBLUSTRE_MOUNT_POINT"); if (!lustre_path) { lustre_path = "/mnt/lustre"; } /* mount target */ - target = getenv(ENV_LUSTRE_MNTTGT); + target = getenv("LIBLUSTRE_MOUNT_TARGET"); if (!target) { printf("LibLustre: no mount target specified\n"); exit(1); } - printf("LibLustre: mount point %s, target %s\n", - lustre_path, target); + CDEBUG(D_CONFIG, "LibLustre: mount point %s, target %s\n", + lustre_path, target); #ifdef INIT_SYSIO /* initialize libsysio & mount rootfs */ @@ -360,8 +363,8 @@ void __liblustre_cleanup_(void) * liblutre. this dilema lead to another hack in * libsysio/src/file_hack.c FIXME */ - _sysio_shutdown(); #ifdef INIT_SYSIO + _sysio_shutdown(); cleanup_lib_portals(); LNetFini(); #endif diff --git a/lustre/liblustre/llite_lib.h b/lustre/liblustre/llite_lib.h index 5e3fbce..fd8a92f 100644 --- a/lustre/liblustre/llite_lib.h +++ b/lustre/liblustre/llite_lib.h @@ -25,17 +25,17 @@ struct ll_file_data { struct llu_sb_info { - struct obd_uuid ll_sb_uuid; - struct obd_export *ll_mdc_exp; - struct obd_export *ll_osc_exp; - obd_id ll_rootino; - int ll_flags; - __u64 ll_connect_flags; - struct list_head ll_conn_chain; - - struct obd_uuid ll_mds_uuid; - struct obd_uuid ll_mds_peer_uuid; - char *ll_instance; + struct obd_uuid ll_sb_uuid; + struct obd_export *ll_mdc_exp; + struct obd_export *ll_osc_exp; + obd_id ll_rootino; + int ll_flags; + struct lustre_client_ocd ll_lco; + struct list_head ll_conn_chain; + + struct obd_uuid ll_mds_uuid; + struct obd_uuid ll_mds_peer_uuid; + char *ll_instance; }; #define LL_SBI_NOLCK 0x1 @@ -202,7 +202,7 @@ int llu_iop_open(struct pnode *pnode, int flags, mode_t mode); int llu_mdc_close(struct obd_export 
*mdc_exp, struct inode *inode); int llu_iop_close(struct inode *inode); _SYSIO_OFF_T llu_iop_pos(struct inode *ino, _SYSIO_OFF_T off); -int llu_vmtruncate(struct inode * inode, loff_t offset); +int llu_vmtruncate(struct inode * inode, loff_t offset, obd_flag obd_flags); void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid); int llu_objects_destroy(struct ptlrpc_request *request, struct inode *dir); @@ -231,8 +231,8 @@ int llu_mdc_blocking_ast(struct ldlm_lock *lock, void *data, int flag); /* dir.c */ -ssize_t llu_iop_getdirentries(struct inode *ino, char *buf, size_t nbytes, - _SYSIO_OFF_T *basep); +ssize_t llu_iop_filldirentries(struct inode *ino, _SYSIO_OFF_T *basep, + char *buf, size_t nbytes); /* ext2 related */ #define EXT2_NAME_LEN (255) @@ -255,4 +255,14 @@ static inline struct ext2_dirent *ext2_next_entry(struct ext2_dirent *p) return (struct ext2_dirent*)((char*) p + le16_to_cpu(p->rec_len)); } +static inline void inode_init_lvb(struct inode *inode, struct ost_lvb *lvb) +{ + struct intnl_stat *st = llu_i2stat(inode); + lvb->lvb_size = st->st_size; + lvb->lvb_blocks = st->st_blocks; + lvb->lvb_mtime = st->st_mtime; + lvb->lvb_atime = st->st_atime; + lvb->lvb_ctime = st->st_ctime; +} + #endif diff --git a/lustre/liblustre/lutil.c b/lustre/liblustre/lutil.c index bbb788b..b4689d5 100644 --- a/lustre/liblustre/lutil.c +++ b/lustre/liblustre/lutil.c @@ -77,9 +77,6 @@ void *inter_module_get(char *arg) /* * random number generator stuff */ -#ifdef LIBLUSTRE_USE_URANDOM -static int _rand_dev_fd = -1; -#endif #ifdef HAVE_GETHOSTBYNAME static int get_ipv4_addr() @@ -107,49 +104,57 @@ static int get_ipv4_addr() void liblustre_init_random() { - int seed; + int _rand_dev_fd; + int seed[2]; struct timeval tv; #ifdef LIBLUSTRE_USE_URANDOM _rand_dev_fd = syscall(SYS_open, "/dev/urandom", O_RDONLY); if (_rand_dev_fd >= 0) { if (syscall(SYS_read, _rand_dev_fd, - &seed, sizeof(int)) == sizeof(int)) { - srand(seed); + &seed, sizeof(seed)) == 
sizeof(seed)) { + ll_srand(seed[0], seed[1]); return; } syscall(SYS_close, _rand_dev_fd); - _rand_dev_fd = -1; } #endif /* LIBLUSTRE_USE_URANDOM */ #ifdef HAVE_GETHOSTBYNAME - seed = get_ipv4_addr(); + seed[0] = get_ipv4_addr(); #else - seed = _my_pnid; + seed[0] = _my_pnid; #endif gettimeofday(&tv, NULL); - srand(tv.tv_sec + tv.tv_usec + getpid() + __swab32(seed)); + ll_srand(tv.tv_sec ^ __swab32(seed[0]), tv.tv_usec ^__swab32(getpid())); } void get_random_bytes(void *buf, int size) { - char *p = buf; + int *p = buf; + int rem; LASSERT(size >= 0); -#ifdef LIBLUSTRE_USE_URANDOM - if (_rand_dev_fd >= 0) { - if (syscall(SYS_read, _rand_dev_fd, buf, size) == size) - return; - syscall(SYS_close, _rand_dev_fd); - _rand_dev_fd = -1; + rem = min((unsigned long)buf & (sizeof(int) - 1), size); + if (rem) { + int val = ll_rand(); + memcpy(buf, &val, rem); + p = buf + rem; + size -= rem; } -#endif - while (size--) - *p++ = rand(); + while (size >= sizeof(int)) { + *p = ll_rand(); + size -= sizeof(int); + p++; + } + buf = p; + if (size) { + int val = ll_rand(); + memcpy(buf, &val, size); + } } - + static void init_capability(int *res) { #ifdef HAVE_LIBCAP @@ -243,9 +248,15 @@ int init_lib_portals() int rc; ENTRY; + rc = libcfs_debug_init(5 * 1024 * 1024); + if (rc != 0) { + CERROR("libcfs_debug_init() failed: %d\n", rc); + RETURN (-ENXIO); + } + rc = LNetInit(); if (rc != 0) { - CERROR("LNetInit failed: %d\n", rc); + CERROR("LNetInit() failed: %d\n", rc); RETURN (-ENXIO); } RETURN(0); @@ -254,5 +265,6 @@ int init_lib_portals() extern void ptlrpc_exit_portals(void); void cleanup_lib_portals() { + libcfs_debug_cleanup(); ptlrpc_exit_portals(); } diff --git a/lustre/liblustre/rw.c b/lustre/liblustre/rw.c index 5bd5642..1f60bd5 100644 --- a/lustre/liblustre/rw.c +++ b/lustre/liblustre/rw.c @@ -226,6 +226,7 @@ int llu_glimpse_size(struct inode *inode) struct llu_sb_info *sbi = llu_i2sbi(inode); ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } }; struct lustre_handle 
lockh = { 0 }; + struct ost_lvb lvb; int rc, flags = LDLM_FL_HAS_INTENT; ENTRY; @@ -240,9 +241,13 @@ int llu_glimpse_size(struct inode *inode) RETURN(rc > 0 ? -EIO : rc); } - st->st_size = lov_merge_size(lli->lli_smd, 0); - st->st_blocks = lov_merge_blocks(lli->lli_smd); - st->st_mtime = lov_merge_mtime(lli->lli_smd, st->st_mtime); + inode_init_lvb(inode, &lvb); + obd_merge_lvb(sbi->ll_osc_exp, lli->lli_smd, &lvb, 0); + st->st_size = lvb.lvb_size; + st->st_blocks = lvb.lvb_blocks; + st->st_mtime = lvb.lvb_mtime; + st->st_atime = lvb.lvb_atime; + st->st_ctime = lvb.lvb_ctime; CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n", (long long)st->st_size, (long long)st->st_blocks); @@ -259,10 +264,11 @@ int llu_extent_lock(struct ll_file_data *fd, struct inode *inode, { struct llu_sb_info *sbi = llu_i2sbi(inode); struct intnl_stat *st = llu_i2stat(inode); + struct ost_lvb lvb; int rc; ENTRY; - LASSERT(lockh->cookie == 0); + LASSERT(!lustre_handle_is_used(lockh)); CLASSERT(ELDLM_OK == 0); /* XXX phil: can we do this? won't it screw the file size up? 
*/ @@ -281,12 +287,17 @@ int llu_extent_lock(struct ll_file_data *fd, struct inode *inode, if (rc > 0) rc = -EIO; + inode_init_lvb(inode, &lvb); + obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 1); if (policy->l_extent.start == 0 && policy->l_extent.end == OBD_OBJECT_EOF) - st->st_size = lov_merge_size(lsm, 1); + st->st_size = lvb.lvb_size; - if (rc == 0) - st->st_mtime = lov_merge_mtime(lsm, st->st_mtime); + if (rc == 0) { + st->st_mtime = lvb.lvb_mtime; + st->st_atime = lvb.lvb_atime; + st->st_ctime = lvb.lvb_ctime; + } RETURN(rc); } @@ -459,7 +470,7 @@ static int llu_queue_pio(int cmd, struct llu_io_group *group, * The root of the problem is that * * kms = lov_merge_size(lsm, 1); - * if (end > kms) + * if (end >= kms) * glimpse_size(inode); * else * st->st_size = kms; @@ -566,6 +577,7 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, struct obd_export *exp = NULL; struct llu_io_group *iogroup; struct lustre_rw_params p; + struct ost_lvb lvb; __u64 kms; int err, is_read, iovidx, ret; int local_lock; @@ -587,7 +599,7 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, RETURN(-ERANGE); lustre_build_lock_params(session->lis_cmd, lli->lli_open_flags, - lli->lli_sbi->ll_connect_flags, + lli->lli_sbi->ll_lco.lco_flags, pos, len, &p); iogroup = get_io_group(inode, max_io_pages(len, iovlen), &p); @@ -608,8 +620,11 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, * date, and, hence, cannot be used for short-read * detection. Rely in OST to handle short reads in that case. */ - kms = lov_merge_size(lsm, 1); - if (p.lrp_policy.l_extent.end > kms) { + inode_init_lvb(inode, &lvb); + obd_merge_lvb(exp, lsm, &lvb, 1); + kms = lvb.lvb_size; + /* extent.end is last byte of the range */ + if (p.lrp_policy.l_extent.end >= kms) { /* A glimpse is necessary to determine whether * we return a short read or some zeroes at * the end of the buffer @@ -620,14 +635,14 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, * comment. 
*/ if ((err = llu_glimpse_size(inode))) { - llu_extent_unlock(fd, inode, lsm, - p.lrp_lock_mode, &lockh); - GOTO(err_put, err); + GOTO(err_unlock, err); } - } else + } else { st->st_size = kms; - } else if (lli->lli_open_flags & O_APPEND) + } + } else if (lli->lli_open_flags & O_APPEND) { pos = st->st_size; + } for (iovidx = 0; iovidx < iovlen; iovidx++) { char *buf = (char *) iovec[iovidx].iov_base; @@ -638,9 +653,7 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, if (len < count) count = len; if (IS_BAD_PTR(buf) || IS_BAD_PTR(buf + count)) { - llu_extent_unlock(fd, inode, - lsm, p.lrp_lock_mode, &lockh); - GOTO(err_put, err = -EFAULT); + GOTO(err_unlock, err = -EFAULT); } if (is_read) { @@ -648,9 +661,7 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, break; } else { if (pos >= lli->lli_maxbytes) { - llu_extent_unlock(fd, inode, lsm, - p.lrp_lock_mode, &lockh); - GOTO(err_put, err = -EFBIG); + GOTO(err_unlock, err = -EFBIG); } if (pos + count >= lli->lli_maxbytes) count = lli->lli_maxbytes - pos; @@ -658,9 +669,7 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, ret = llu_queue_pio(session->lis_cmd, iogroup, buf, count, pos); if (ret < 0) { - llu_extent_unlock(fd, inode, - lsm, p.lrp_lock_mode, &lockh); - GOTO(err_put, err = ret); + GOTO(err_unlock, err = ret); } else { pos += ret; if (!is_read) { @@ -677,19 +686,25 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, } LASSERT(len == 0 || is_read); /* libsysio should guarantee this */ - /* - * BUG: lock is released too early. Fix is in bug 9296. 
- */ - err = llu_extent_unlock(fd, inode, lsm, p.lrp_lock_mode, &lockh); - if (err) - CERROR("extent unlock error %d\n", err); - err = obd_trigger_group_io(exp, lsm, NULL, iogroup->lig_oig); if (err) - GOTO(err_put, err); + GOTO(err_unlock, err); + + err = oig_wait(iogroup->lig_oig); + if (err) { + CERROR("sync error %d, data corruption possible\n", err); + GOTO(err_unlock, err); + } + + ret = llu_extent_unlock(fd, inode, lsm, p.lrp_lock_mode, &lockh); + if (ret) + CERROR("extent unlock error %d\n", ret); session->lis_groups[session->lis_ngroups++] = iogroup; RETURN(0); + +err_unlock: + llu_extent_unlock(fd, inode, lsm, p.lrp_lock_mode, &lockh); err_put: put_io_group(iogroup); RETURN((ssize_t)err); @@ -772,6 +787,10 @@ static int llu_file_rwx(struct inode *ino, int llu_iop_read(struct inode *ino, struct ioctx *ioctx) { + /* BUG: 5972 */ + struct intnl_stat *st = llu_i2stat(ino); + st->st_atime = CURRENT_TIME; + return llu_file_rwx(ino, ioctx, 1); } diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index 2ba2f36..f6d2f08 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -51,6 +51,7 @@ #undef LIST_HEAD #include "llite_lib.h" +#include #ifndef MAY_EXEC #define MAY_EXEC 1 @@ -148,11 +149,14 @@ void llu_update_inode(struct inode *inode, struct mds_body *body, if (body->valid & OBD_MD_FLID) st->st_ino = body->ino; - if (body->valid & OBD_MD_FLATIME) - LTIME_S(st->st_atime) = body->atime; - if (body->valid & OBD_MD_FLMTIME) + if (body->valid & OBD_MD_FLATIME && + body->mtime > LTIME_S(st->st_mtime)) LTIME_S(st->st_mtime) = body->mtime; - if (body->valid & OBD_MD_FLCTIME) + if (body->valid & OBD_MD_FLMTIME && + body->atime > LTIME_S(st->st_atime)) + LTIME_S(st->st_atime) = body->atime; + if (body->valid & OBD_MD_FLCTIME && + body->ctime > LTIME_S(st->st_ctime)) LTIME_S(st->st_ctime) = body->ctime; if (body->valid & OBD_MD_FLMODE) st->st_mode = (st->st_mode & S_IFMT)|(body->mode & ~S_IFMT); @@ -289,6 +293,10 @@ void 
obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid) dst->o_generation = lli->lli_st_generation; newvalid |= OBD_MD_FLGENER; } + if (valid & OBD_MD_FLFID) { + dst->o_fid = st->st_ino; + newvalid |= OBD_MD_FLFID; + } dst->o_valid |= newvalid; } @@ -343,14 +351,14 @@ static struct inode* llu_new_inode(struct filesys *fs, struct inode *inode; struct llu_inode_info *lli; struct intnl_stat st = { - st_dev: 0, + .st_dev = 0, #ifndef AUTOMOUNT_FILE_NAME - st_mode: fid->f_type & S_IFMT, + .st_mode = fid->f_type & S_IFMT, #else - st_mode: fid->f_type /* all of the bits! */ + .st_mode = fid->f_type /* all of the bits! */ #endif - st_uid: geteuid(), - st_gid: getegid(), + .st_uid = geteuid(), + .st_gid = getegid(), }; OBD_ALLOC(lli, sizeof(*lli)); @@ -367,8 +375,7 @@ static struct inode* llu_new_inode(struct filesys *fs, lli->lli_sysio_fid.fid_data = &lli->lli_fid; lli->lli_sysio_fid.fid_len = sizeof(lli->lli_fid); - - memcpy(&lli->lli_fid, fid, sizeof(*fid)); + lli->lli_fid = *fid; /* file identifier is needed by functions like _sysio_i_find() */ inode = _sysio_i_new(fs, &lli->lli_sysio_fid, @@ -580,12 +587,14 @@ static int inode_setattr(struct inode * inode, struct iattr * attr) struct intnl_stat *st = llu_i2stat(inode); int error = 0; - if (ia_valid & ATTR_SIZE) { - error = llu_vmtruncate(inode, attr->ia_size); - if (error) - goto out; - } + /* + * inode_setattr() is only ever invoked with ATTR_SIZE (by + * llu_setattr_raw()) when file has no bodies. Check this. 
+ */ + LASSERT(ergo(ia_valid & ATTR_SIZE, llu_i2info(inode)->lli_smd == NULL)); + if (ia_valid & ATTR_SIZE) + st->st_size = attr->ia_size; if (ia_valid & ATTR_UID) st->st_uid = attr->ia_uid; if (ia_valid & ATTR_GID) @@ -602,7 +611,6 @@ static int inode_setattr(struct inode * inode, struct iattr * attr) st->st_mode &= ~S_ISGID; } /* mark_inode_dirty(inode); */ -out: return error; } @@ -727,27 +735,46 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr) if (ia_valid & ATTR_SIZE) { ldlm_policy_data_t policy = { .l_extent = {attr->ia_size, OBD_OBJECT_EOF} }; - struct lustre_handle lockh = { 0 }; - int err, ast_flags = 0; + struct lustre_handle lockh = { 0, }; + struct lustre_handle match_lockh = { 0, }; + + int err; + int flags = LDLM_FL_TEST_LOCK; /* for assertion check below */ + int lock_mode; + obd_flag obd_flags; + + /* check that there are no matching locks */ + LASSERT(obd_match(sbi->ll_osc_exp, lsm, LDLM_EXTENT, &policy, + LCK_PW, &flags, inode, &match_lockh) <= 0); + /* XXX when we fix the AST intents to pass the discard-range * XXX extent, make ast_flags always LDLM_AST_DISCARD_DATA * XXX here. */ - if (attr->ia_size == 0) - ast_flags = LDLM_AST_DISCARD_DATA; + flags = (attr->ia_size == 0) ? LDLM_AST_DISCARD_DATA : 0; - rc = llu_extent_lock(NULL, inode, lsm, LCK_PW, &policy, - &lockh, ast_flags); + if (sbi->ll_lco.lco_flags & OBD_CONNECT_TRUNCLOCK) { + lock_mode = LCK_NL; + obd_flags = OBD_FL_TRUNCLOCK; + CDEBUG(D_INODE, "delegating locking to the OST"); + } else { + lock_mode = LCK_PW; + obd_flags = 0; + } + + /* with lock_mode == LCK_NL no lock is taken. */ + rc = llu_extent_lock(NULL, inode, lsm, lock_mode, &policy, + &lockh, flags); if (rc != ELDLM_OK) { if (rc > 0) RETURN(-ENOLCK); RETURN(rc); } - rc = llu_vmtruncate(inode, attr->ia_size); + rc = llu_vmtruncate(inode, attr->ia_size, obd_flags); /* unlock now as we don't mind others file lockers racing with * the mds updates below? 
*/ - err = llu_extent_unlock(NULL, inode, lsm, LCK_PW, &lockh); + err = llu_extent_unlock(NULL, inode, lsm, lock_mode, &lockh); if (err) { CERROR("llu_extent_unlock failed: %d\n", err); if (!rc) @@ -1741,8 +1768,12 @@ llu_fsswop_mount(const char *source, obd_set_info(obd->obd_self_export, strlen("async"), "async", sizeof(async), &async); + ocd.ocd_connect_flags = OBD_CONNECT_IBITS|OBD_CONNECT_VERSION; + ocd.ocd_ibits_known = MDS_INODELOCK_FULL; + ocd.ocd_version = LUSTRE_VERSION_CODE; + /* setup mdc */ - err = obd_connect(&mdc_conn, obd, &sbi->ll_sb_uuid, NULL /* ocd */); + err = obd_connect(&mdc_conn, obd, &sbi->ll_sb_uuid, &ocd); if (err) { CERROR("cannot connect to %s: rc = %d\n", mdc, err); GOTO(out_free, err); @@ -1766,14 +1797,19 @@ llu_fsswop_mount(const char *source, obd_set_info(obd->obd_self_export, strlen("async"), "async", sizeof(async), &async); - ocd.ocd_connect_flags |= OBD_CONNECT_SRVLOCK; + obd->obd_upcall.onu_owner = &sbi->ll_lco; + obd->obd_upcall.onu_upcall = ll_ocd_update; + + ocd.ocd_connect_flags = OBD_CONNECT_SRVLOCK|OBD_CONNECT_REQPORTAL| + OBD_CONNECT_VERSION|OBD_CONNECT_TRUNCLOCK; + ocd.ocd_version = LUSTRE_VERSION_CODE; err = obd_connect(&osc_conn, obd, &sbi->ll_sb_uuid, &ocd); if (err) { CERROR("cannot connect to %s: rc = %d\n", osc, err); GOTO(out_mdc, err); } sbi->ll_osc_exp = class_conn2export(&osc_conn); - sbi->ll_connect_flags = ocd.ocd_connect_flags; + sbi->ll_lco.lco_flags = ocd.ocd_connect_flags; mdc_init_ea_size(sbi->ll_mdc_exp, sbi->ll_osc_exp); @@ -1854,7 +1890,7 @@ static struct inode_ops llu_inode_ops = { inop_lookup: llu_iop_lookup, inop_getattr: llu_iop_getattr, inop_setattr: llu_iop_setattr, - inop_getdirentries: llu_iop_getdirentries, + inop_filldirentries: llu_iop_filldirentries, inop_mkdir: llu_iop_mkdir_raw, inop_rmdir: llu_iop_rmdir_raw, inop_symlink: llu_iop_symlink_raw, diff --git a/lustre/liblustre/tests/echo_test.c b/lustre/liblustre/tests/echo_test.c index dcb31a0..c47f052 100644 --- 
a/lustre/liblustre/tests/echo_test.c +++ b/lustre/liblustre/tests/echo_test.c @@ -32,7 +32,7 @@ #include "../lutil.h" -#ifdef CRAY_XT3 +#if CRAY_XT3 int _sysio_lustre_init(void) { /* diff --git a/lustre/liblustre/tests/sanity.c b/lustre/liblustre/tests/sanity.c index 5ee1d98..566a4c9 100644 --- a/lustre/liblustre/tests/sanity.c +++ b/lustre/liblustre/tests/sanity.c @@ -48,6 +48,7 @@ void *buf_alloc; int buf_size; +int opt_verbose; extern char *lustre_path; @@ -89,6 +90,9 @@ int t1(char *name) snprintf(path, MAX_PATH_LENGTH, "%s/test_t1", lustre_path); + if (opt_verbose) + printf("touch+unlink %s\n", path); + t_touch(path); t_unlink(path); LEAVE(); @@ -365,7 +369,11 @@ int t14(char *name) ENTRY(">1 block(4k) directory readdir"); snprintf(dir, MAX_PATH_LENGTH, "%s/test_t14_dir/", lustre_path); - t_mkdir(dir); + rc = mkdir(dir, 0755); + if (rc < 0 && errno != EEXIST) { + printf("mkdir(%s) error: %s\n", dir, strerror(errno)); + exit(1); + } printf("Creating %d files...\n", nfiles); for (i = 0; i < nfiles; i++) { sprintf(path, "%s%s%05d", dir, prefix, i); @@ -532,7 +540,7 @@ static int check_file_size(char *file, off_t size) { struct stat statbuf; - if(stat(file, &statbuf) != 0) { + if (stat(file, &statbuf) != 0) { printf("Error stat(%s)\n", file); return(1); } @@ -948,6 +956,7 @@ int t50(char *name) ENTRY("4k aligned i/o sanity"); while (np <= _npages) { printf("%3d per xfer(total %d)...\t", np, _npages); + fflush(stdout); pages_io(np, offset); np += np; } @@ -990,12 +999,12 @@ int t51(char *name) int result; ENTRY("truncate() should truncate file to proper length"); - snprintf(file, MAX_PATH_LENGTH, "%s/test_t19_file", lustre_path); + snprintf(file, MAX_PATH_LENGTH, "%s/test_t51_file", lustre_path); for (size = 0; size < T51_NR * T51_STEP; size += T51_STEP) { t_echo_create(file, ""); if (truncate(file, size) != 0) { - printf("error truncating file: %s\n", strerror(errno)); + printf("\nerror truncating file: %s\n",strerror(errno)); return(-1); } result = 
check_file_size(file, size); @@ -1006,11 +1015,11 @@ int t51(char *name) t_echo_create(file, ""); fd = open(file, O_RDWR|O_CREAT, (mode_t)0666); if (fd < 0) { - printf("error open file: %s\n", strerror(errno)); + printf("\nerror open file: %s\n", strerror(errno)); return(-1); } if (ftruncate(fd, size) != 0) { - printf("error ftruncating file: %s\n", strerror(errno)); + printf("\nerror ftruncating file:%s\n",strerror(errno)); return(-1); } close(fd); @@ -1018,7 +1027,54 @@ int t51(char *name) if (result != 0) return result; t_unlink(file); + if (size % (T51_STEP * (T51_NR / 75)) == 0) { + printf("."); + fflush(stdout); + } } + printf("\n"); + LEAVE(); +} +/* + * check atime update during read + */ +int t52(char *name) +{ + char file[MAX_PATH_LENGTH] = ""; + char buf[16]; + struct stat statbuf; + time_t atime; + time_t diff; + int fd, i; + + ENTRY("atime should be updated during read"); + snprintf(file, MAX_PATH_LENGTH, "%s/test_t52_file", lustre_path); + + t_echo_create(file, "check atime update during read"); + fd = open(file, O_RDONLY); + if (fd < 0) { + printf("\nerror open file: %s\n", strerror(errno)); + return(-1); + } + stat(file, &statbuf); + printf("st_atime=%s", ctime(&statbuf.st_atime)); + atime = statbuf.st_atime; + for (i = 0; i < 3; i++) { + sleep(2); + read(fd, buf, sizeof(buf)); + stat(file, &statbuf); + printf("st_atime=%s", ctime(&statbuf.st_atime)); + diff = statbuf.st_atime - atime; + if (diff <= 0) { + printf("atime doesn't updated! 
failed!\n"); + close(fd); + t_unlink(file); + return -1; + } + atime = statbuf.st_atime; + } + close(fd); + t_unlink(file); LEAVE(); } @@ -1077,10 +1133,11 @@ int main(int argc, char * const argv[]) {"dumpfile", 1, 0, 'd'}, {"only", 1, 0, 'o'}, {"target", 1, 0, 't'}, + {"verbose", 1, 0, 'v'}, {0, 0, 0, 0} }; - while ((c = getopt_long(argc, argv, "d:o:t:", long_opts, &opt_index)) != -1) { + while ((c = getopt_long(argc, argv, "d:o:t:v", long_opts, &opt_index)) != -1) { switch (c) { case 'd': setenv(ENV_LUSTRE_DUMPFILE, optarg, 1); @@ -1094,6 +1151,9 @@ int main(int argc, char * const argv[]) case 't': setenv(ENV_LUSTRE_MNTTGT, optarg, 1); break; + case 'v': + opt_verbose++; + break; default: usage(argv[0]); break; @@ -1112,7 +1172,13 @@ int main(int argc, char * const argv[]) __liblustre_setup_(); buf_size = _npages * PAGE_SIZE; - buf_alloc = malloc(buf_size); + if (opt_verbose) + printf("allocating %d bytes buffer\n", buf_size); + buf_alloc = calloc(1, buf_size); + if (buf_alloc == NULL) { + fprintf(stderr, "error allocating %d\n", buf_size); + exit(-ENOMEM); + } for (test = testlist; test->test != NULL; test++) { int run = 1, i; diff --git a/lustre/llite/dcache.c b/lustre/llite/dcache.c index 8d5310a..b8500d4 100644 --- a/lustre/llite/dcache.c +++ b/lustre/llite/dcache.c @@ -59,8 +59,7 @@ static void ll_release(struct dentry *de) * an AST before calling d_revalidate_it(). The dentry still exists (marked * INVALID) so d_lookup() matches it, but we have no lock on it (so * lock_match() fails) and we spin around real_lookup(). 
*/ -static int ll_dcompare(struct dentry *parent, struct qstr *d_name, - struct qstr *name) +int ll_dcompare(struct dentry *parent, struct qstr *d_name, struct qstr *name) { struct dentry *dchild; ENTRY; @@ -71,6 +70,7 @@ static int ll_dcompare(struct dentry *parent, struct qstr *d_name, if (memcmp(d_name->name, name->name, name->len)) RETURN(1); + /* XXX: d_name must be in-dentry structure */ dchild = container_of(d_name, struct dentry, d_name); /* ugh */ if (dchild->d_flags & DCACHE_LUSTRE_INVALID) { CDEBUG(D_DENTRY,"INVALID dentry %p not matched, was bug 3784\n", @@ -134,6 +134,10 @@ void ll_intent_release(struct lookup_intent *it) ll_intent_drop_lock(it); it->it_magic = 0; it->it_op_release = 0; + /* We are still holding extra reference on a request, need to free it */ + if (it_disposition(it, DISP_ENQ_COMPLETE)) + ptlrpc_req_finished(it->d.lustre.it_data); + it->d.lustre.it_disposition = 0; it->d.lustre.it_data = NULL; EXIT; @@ -179,6 +183,7 @@ restart: continue; } + lock_dentry(dentry); if (atomic_read(&dentry->d_count) == 0) { CDEBUG(D_DENTRY, "deleting dentry %.*s (%p) parent %p " "inode %p\n", dentry->d_name.len, @@ -186,9 +191,7 @@ restart: dentry->d_inode); dget_locked(dentry); __d_drop(dentry); -#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) - INIT_HLIST_NODE(&dentry->d_hash); -#endif + unlock_dentry(dentry); spin_unlock(&dcache_lock); dput(dentry); goto restart; @@ -197,11 +200,17 @@ restart: "inode %p refc %d\n", dentry->d_name.len, dentry->d_name.name, dentry, dentry->d_parent, dentry->d_inode, atomic_read(&dentry->d_count)); - hlist_del_init(&dentry->d_hash); + /* actually we don't unhash the dentry, rather just + * mark it inaccessible to __d_lookup().
otherwise + * sys_getcwd() could return -ENOENT -bzzz */ dentry->d_flags |= DCACHE_LUSTRE_INVALID; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + __d_drop(dentry); hlist_add_head(&dentry->d_hash, &sbi->ll_orphan_dentry_list); +#endif } + unlock_dentry(dentry); } spin_unlock(&dcache_lock); EXIT; @@ -327,16 +336,18 @@ int ll_revalidate_it(struct dentry *de, int lookup_flags, /* unfortunately ll_intent_lock may cause a callback and revoke our * dentry */ spin_lock(&dcache_lock); - hlist_del_init(&de->d_hash); + lock_dentry(de); + __d_drop(de); + unlock_dentry(de); __d_rehash(de, 0); spin_unlock(&dcache_lock); out: - /* If we had succesful it lookup on mds, but it happened to be negative, - we do not free request as it will be reused during lookup (see - comment in mdc/mdc_locks.c::mdc_intent_lock(). But if + /* We do not free request as it may be reused during following lookup + (see comment in mdc/mdc_locks.c::mdc_intent_lock()), request will + be freed in ll_lookup_it or in ll_intent_release. But if request was not completed, we need to free it. 
(bug 5154) */ - if (req != NULL && (rc == 1 || !it_disposition(it, DISP_ENQ_COMPLETE))) + if (req != NULL && !it_disposition(it, DISP_ENQ_COMPLETE)) ptlrpc_req_finished(req); if (rc == 0) { ll_unhash_aliases(de->d_inode); @@ -348,7 +359,9 @@ int ll_revalidate_it(struct dentry *de, int lookup_flags, de->d_name.name, de, de->d_parent, de->d_inode, atomic_read(&de->d_count)); ll_lookup_finish_locks(it, de); + lock_dentry(de); de->d_flags &= ~DCACHE_LUSTRE_INVALID; + unlock_dentry(de); } RETURN(rc); } diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index ab219ab..a0f8a13 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -488,9 +488,13 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, int rc, lmmsize; ll_inode2fid(&fid, inode); + + rc = ll_get_max_mdsize(sbi, &lmmsize); + if (rc) + RETURN(rc); + rc = mdc_getattr(sbi->ll_mdc_exp, &fid, OBD_MD_FLDIREA, - obd_size_diskmd(sbi->ll_osc_exp, NULL), - &request); + lmmsize, &request); if (rc < 0) { CDEBUG(D_INFO, "mdc_getattr failed: rc = %d\n", rc); RETURN(rc); @@ -527,6 +531,8 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, ptlrpc_req_finished(request); return rc; } + case LL_IOC_OBD_STATFS: + RETURN(ll_obd_statfs(inode, (void *)arg)); case IOC_MDC_GETFILEINFO: case IOC_MDC_GETSTRIPE: { struct ptlrpc_request *request = NULL; @@ -542,10 +548,14 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, RETURN(PTR_ERR(filename)); ll_inode2fid(&fid, inode); + + rc = ll_get_max_mdsize(sbi, &lmmsize); + if (rc) + RETURN(rc); + rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid, filename, strlen(filename) + 1, OBD_MD_FLEASIZE, - obd_size_diskmd(sbi->ll_osc_exp, NULL), - &request); + lmmsize, &request); if (rc < 0) { CDEBUG(D_INFO, "mdc_getattr_name failed on %s: rc %d\n", filename, rc); @@ -576,8 +586,65 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, if (lmm->lmm_magic == __swab32(LOV_MAGIC)) { lustre_swab_lov_user_md((struct lov_user_md *)lmm); 
lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm); + } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) { + lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm); + } + if (lmm->lmm_magic == LOV_MAGIC_JOIN) { + struct lov_stripe_md *lsm; + struct lov_user_md_join *lmj; + int lmj_size, i, aindex = 0, rc; + + rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize); + if (rc < 0) + GOTO(out_req, rc = -ENOMEM); + rc = obd_checkmd(sbi->ll_osc_exp, sbi->ll_mdc_exp, lsm); + if (rc) + GOTO(out_free_memmd, rc); + + lmj_size = sizeof(struct lov_user_md_join) + + lsm->lsm_stripe_count * + sizeof(struct lov_user_ost_data_join); + OBD_ALLOC(lmj, lmj_size); + if (!lmj) + GOTO(out_free_memmd, rc = -ENOMEM); + + memcpy(lmj, lmm, sizeof(struct lov_user_md_join)); + for(i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_array_info *lai = lsm->lsm_array; + if ((lai->lai_ext_array[aindex].le_loi_idx + + lai->lai_ext_array[aindex].le_stripe_count)<=i){ + aindex ++; + } + CDEBUG(D_INFO, "aindex %d i %d l_extent_start"LPU64"" + "len %d \n", aindex, i, + lai->lai_ext_array[aindex].le_start, + (int)lai->lai_ext_array[aindex].le_len); + lmj->lmm_objects[i].l_extent_start = + lai->lai_ext_array[aindex].le_start; + + if ((int)lai->lai_ext_array[aindex].le_len == -1) { + lmj->lmm_objects[i].l_extent_end = -1; + } else { + lmj->lmm_objects[i].l_extent_end = + lai->lai_ext_array[aindex].le_start + + lai->lai_ext_array[aindex].le_len; + } + lmj->lmm_objects[i].l_object_id = + lsm->lsm_oinfo[i].loi_id; + lmj->lmm_objects[i].l_object_gr = + lsm->lsm_oinfo[i].loi_gr; + lmj->lmm_objects[i].l_ost_gen = + lsm->lsm_oinfo[i].loi_ost_gen; + lmj->lmm_objects[i].l_ost_idx = + lsm->lsm_oinfo[i].loi_ost_idx; + } + lmm = (struct lov_mds_md *)lmj; + lmmsize = lmj_size; +out_free_memmd: + obd_free_memmd(sbi->ll_osc_exp, &lsm); + if (rc) + GOTO(out_req, rc); } - if (cmd == IOC_MDC_GETFILEINFO) { struct lov_user_mds_data *lmdp; lstat_t st = { 0 }; @@ -606,6 +673,8 @@ static int 
ll_dir_ioctl(struct inode *inode, struct file *file, } rc = copy_to_user(lump, lmm, lmmsize); + if (lmm->lmm_magic == LOV_MAGIC_JOIN) + OBD_FREE(lmm, lmmsize); if (rc) GOTO(out_req, rc = -EFAULT); @@ -643,7 +712,8 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, bufs[1] = NULL; } size = data->ioc_plen1; - req = ptlrpc_prep_req(sbi2mdc(sbi)->cl_import, LLOG_CATINFO, + req = ptlrpc_prep_req(sbi2mdc(sbi)->cl_import, + LUSTRE_LOG_VERSION, LLOG_CATINFO, 2, lens, bufs); if (!req) GOTO(out_catinfo, rc = -ENOMEM); diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 1ec5480..1d8c736 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -671,7 +671,7 @@ static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp) return rc; } -/* NB: lov_merge_size will prefer locally cached writes if they extend the +/* NB: obd_merge_lvb will prefer locally cached writes if they extend the * file (because it prefers KMS over RSS when larger) */ int ll_glimpse_size(struct inode *inode, int ast_flags) { @@ -679,6 +679,7 @@ int ll_glimpse_size(struct inode *inode, int ast_flags) struct ll_sb_info *sbi = ll_i2sbi(inode); ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } }; struct lustre_handle lockh = { 0 }; + struct ost_lvb lvb; int rc; ENTRY; @@ -705,11 +706,14 @@ int ll_glimpse_size(struct inode *inode, int ast_flags) } ll_inode_size_lock(inode, 1); - inode->i_size = lov_merge_size(lli->lli_smd, 0); - inode->i_blocks = lov_merge_blocks(lli->lli_smd); + inode_init_lvb(inode, &lvb); + obd_merge_lvb(sbi->ll_osc_exp, lli->lli_smd, &lvb, 0); + inode->i_size = lvb.lvb_size; + inode->i_blocks = lvb.lvb_blocks; + LTIME_S(inode->i_mtime) = lvb.lvb_mtime; + LTIME_S(inode->i_atime) = lvb.lvb_atime; + LTIME_S(inode->i_ctime) = lvb.lvb_ctime; ll_inode_size_unlock(inode, 1); - LTIME_S(inode->i_mtime) = - lov_merge_mtime(lli->lli_smd, LTIME_S(inode->i_mtime)); CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %lu\n", inode->i_size, inode->i_blocks); @@ 
-725,10 +729,11 @@ int ll_extent_lock(struct ll_file_data *fd, struct inode *inode, int ast_flags) { struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ost_lvb lvb; int rc; ENTRY; - LASSERT(lockh->cookie == 0); + LASSERT(!lustre_handle_is_used(lockh)); LASSERT(lsm != NULL); /* don't drop the mmapped file to LRU */ @@ -750,6 +755,10 @@ int ll_extent_lock(struct ll_file_data *fd, struct inode *inode, if (rc > 0) rc = -EIO; + ll_inode_size_lock(inode, 1); + inode_init_lvb(inode, &lvb); + obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 0); + if (policy->l_extent.start == 0 && policy->l_extent.end == OBD_OBJECT_EOF) { /* vmtruncate()->ll_truncate() first sets the i_size and then @@ -762,14 +771,16 @@ int ll_extent_lock(struct ll_file_data *fd, struct inode *inode, * cancel the result of the truncate. Getting the * ll_inode_size_lock() after the enqueue maintains the DLM * -> ll_inode_size_lock() acquiring order. */ - ll_inode_size_lock(inode, 1); - inode->i_size = lov_merge_size(lsm, 1); - ll_inode_size_unlock(inode, 1); + inode->i_size = lvb.lvb_size; } - if (rc == 0) - LTIME_S(inode->i_mtime) = - lov_merge_mtime(lsm, LTIME_S(inode->i_mtime)); + if (rc == 0) { + LTIME_S(inode->i_mtime) = lvb.lvb_mtime; + LTIME_S(inode->i_atime) = lvb.lvb_atime; + LTIME_S(inode->i_ctime) = lvb.lvb_ctime; + } + ll_inode_size_unlock(inode, 1); + RETURN(rc); } @@ -799,6 +810,7 @@ static ssize_t ll_file_read(struct file *file, char *buf, size_t count, struct lov_stripe_md *lsm = lli->lli_smd; struct ll_lock_tree tree; struct ll_lock_tree_node *node; + struct ost_lvb lvb; struct ll_ra_read bead; int rc; ssize_t retval; @@ -868,7 +880,9 @@ static ssize_t ll_file_read(struct file *file, char *buf, size_t count, * ll_inode_size_lock(). This guarantees that short reads are handled * correctly in the face of concurrent writes and truncates. 
*/ - kms = lov_merge_size(lsm, 1); + inode_init_lvb(inode, &lvb); + obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1); + kms = lvb.lvb_size; if (*ppos + count - 1 > kms) { /* A glimpse is necessary to determine whether we return a * short read (B) or some zeroes at the end of the buffer (C) */ @@ -894,6 +908,8 @@ static ssize_t ll_file_read(struct file *file, char *buf, size_t count, bead.lrr_start = *ppos >> CFS_PAGE_SHIFT; bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT; ll_ra_read_in(file, &bead); + /* BUG: 5972 */ + file_accessed(file); retval = generic_file_read(file, buf, count, ppos); ll_ra_read_ex(file, &bead); @@ -1163,7 +1179,7 @@ static int ll_lov_getstripe(struct inode *inode, unsigned long arg) } static int ll_get_grouplock(struct inode *inode, struct file *file, - unsigned long arg) + unsigned long arg) { struct ll_file_data *fd = LUSTRE_FPRIVATE(file); ldlm_policy_data_t policy = { .l_extent = { .start = 0, @@ -1194,7 +1210,7 @@ static int ll_get_grouplock(struct inode *inode, struct file *file, } static int ll_put_grouplock(struct inode *inode, struct file *file, - unsigned long arg) + unsigned long arg) { struct ll_file_data *fd = LUSTRE_FPRIVATE(file); struct ll_inode_info *lli = ll_i2info(inode); @@ -1222,6 +1238,193 @@ static int ll_put_grouplock(struct inode *inode, struct file *file, RETURN(0); } +static int join_sanity_check(struct inode *head, struct inode *tail) +{ + ENTRY; + if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) { + CERROR("server do not support join \n"); + RETURN(-EINVAL); + } + if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) { + CERROR("tail ino %lu and ino head %lu must be regular\n", + head->i_ino, tail->i_ino); + RETURN(-EINVAL); + } + if (head->i_ino == tail->i_ino) { + CERROR("file %lu can not be joined to itself \n", head->i_ino); + RETURN(-EINVAL); + } + if (head->i_size % JOIN_FILE_ALIGN) { + CERROR("hsize" LPU64 " must be times of 64K\n", + head->i_size); + RETURN(-EINVAL); + } + 
RETURN(0); +} + +static int join_file(struct inode *head_inode, struct file *head_filp, + struct file *tail_filp) +{ + struct inode *tail_inode, *tail_parent; + struct dentry *tail_dentry = tail_filp->f_dentry; + struct lookup_intent oit = {.it_op = IT_OPEN, + .it_flags = head_filp->f_flags|O_JOIN_FILE}; + struct ptlrpc_request *req = NULL; + struct ll_file_data *fd; + struct lustre_handle lockh; + struct mdc_op_data *op_data; + __u32 hsize = head_inode->i_size >> 32; + __u32 tsize = head_inode->i_size; + struct file *f; + int rc; + ENTRY; + + tail_dentry = tail_filp->f_dentry; + tail_inode = tail_dentry->d_inode; + tail_parent = tail_dentry->d_parent->d_inode; + + fd = ll_file_data_get(); + if (fd == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) { + ll_file_data_put(fd); + RETURN(-ENOMEM); + } + + f = get_empty_filp(); + if (f == NULL) + GOTO(out, rc = -ENOMEM); + + f->f_dentry = head_filp->f_dentry; + f->f_vfsmnt = head_filp->f_vfsmnt; + + ll_prepare_mdc_op_data(op_data, head_inode, tail_parent, + tail_dentry->d_name.name, + tail_dentry->d_name.len, 0); + rc = mdc_enqueue(ll_i2mdcexp(head_inode), LDLM_IBITS, &oit, LCK_PW, + op_data, &lockh, &tsize, 0, ldlm_completion_ast, + ll_mdc_blocking_ast, &hsize, 0); + + if (rc < 0) + GOTO(out, rc); + + req = oit.d.lustre.it_data; + rc = oit.d.lustre.it_status; + + if (rc < 0) + GOTO(out, rc); + + rc = ll_local_open(f, &oit, fd); + LASSERTF(rc == 0, "rc = %d\n", rc); + + fd = NULL; + ll_intent_release(&oit); + + rc = ll_file_release(f->f_dentry->d_inode, f); +out: + if (op_data) + OBD_FREE_PTR(op_data); + if (f) + put_filp(f); + ll_file_data_put(fd); + ptlrpc_req_finished(req); + RETURN(rc); +} + +static int ll_file_join(struct inode *head, struct file *filp, + char *filename_tail) +{ + struct inode *tail = NULL, *first, *second; + struct dentry *tail_dentry; + struct file *tail_filp, *first_filp, *second_filp; + struct ll_lock_tree first_tree, second_tree; + struct ll_lock_tree_node 
*first_node, *second_node; + struct ll_inode_info *hlli = ll_i2info(head), *tlli; + int rc = 0, cleanup_phase = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n", + head->i_ino, head->i_generation, head, filename_tail); + + tail_filp = filp_open(filename_tail, O_WRONLY, 0644); + if (IS_ERR(tail_filp)) { + CERROR("Can not open tail file %s", filename_tail); + rc = PTR_ERR(tail_filp); + GOTO(cleanup, rc); + } + tail = igrab(tail_filp->f_dentry->d_inode); + + tlli = ll_i2info(tail); + tail_dentry = tail_filp->f_dentry; + LASSERT(tail_dentry); + cleanup_phase = 1; + + /*reorder the inode for lock sequence*/ + first = head->i_ino > tail->i_ino ? head : tail; + second = head->i_ino > tail->i_ino ? tail : head; + first_filp = head->i_ino > tail->i_ino ? filp : tail_filp; + second_filp = head->i_ino > tail->i_ino ? tail_filp : filp; + + CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n", + head->i_ino, tail->i_ino, first->i_ino, second->i_ino); + first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX); + if (IS_ERR(first_node)){ + rc = PTR_ERR(first_node); + GOTO(cleanup, rc); + } + first_tree.lt_fd = first_filp->private_data; + rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0); + if (rc != 0) + GOTO(cleanup, rc); + cleanup_phase = 2; + + second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX); + if (IS_ERR(second_node)){ + rc = PTR_ERR(second_node); + GOTO(cleanup, rc); + } + second_tree.lt_fd = second_filp->private_data; + rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0); + if (rc != 0) + GOTO(cleanup, rc); + cleanup_phase = 3; + + rc = join_sanity_check(head, tail); + if (rc) + GOTO(cleanup, rc); + + rc = join_file(head, filp, tail_filp); + if (rc) + GOTO(cleanup, rc); +cleanup: + switch (cleanup_phase) { + case 3: + ll_tree_unlock(&second_tree); + obd_cancel_unused(ll_i2obdexp(second), + ll_i2info(second)->lli_smd, 0, NULL); + case 2: + ll_tree_unlock(&first_tree); + obd_cancel_unused(ll_i2obdexp(first), + 
ll_i2info(first)->lli_smd, 0, NULL); + case 1: + filp_close(tail_filp, 0); + if (tail) + iput(tail); + if (head && rc == 0) { + obd_free_memmd(ll_i2sbi(head)->ll_osc_exp, + &hlli->lli_smd); + hlli->lli_smd = NULL; + } + case 0: + break; + default: + CERROR("invalid cleanup_phase %d\n", cleanup_phase); + LBUG(); + } + RETURN(rc); +} int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { @@ -1268,11 +1471,24 @@ int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd, RETURN(ll_iocontrol(inode, file, cmd, arg)); case EXT3_IOC_GETVERSION_OLD: case EXT3_IOC_GETVERSION: - RETURN(put_user(inode->i_generation, (int *) arg)); + RETURN(put_user(inode->i_generation, (int *)arg)); + case LL_IOC_JOIN: { + char *ftail; + int rc; + + ftail = getname((const char *)arg); + if (IS_ERR(ftail)) + RETURN(PTR_ERR(ftail)); + rc = ll_file_join(inode, file, ftail); + putname(ftail); + RETURN(rc); + } case LL_IOC_GROUP_LOCK: RETURN(ll_get_grouplock(inode, file, arg)); case LL_IOC_GROUP_UNLOCK: RETURN(ll_put_grouplock(inode, file, arg)); + case LL_IOC_OBD_STATFS: + RETURN(ll_obd_statfs(inode, (void *)arg)); /* We need to special case any other ioctls we want to handle, * to send them to the MDS/OST as appropriate and to properly @@ -1530,8 +1746,10 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it) int ealen = 0; if (S_ISREG(inode->i_mode)) { - ealen = obd_size_diskmd(sbi->ll_osc_exp, NULL); - valid |= OBD_MD_FLEASIZE; + rc = ll_get_max_mdsize(sbi, &ealen); + if (rc) + RETURN(rc); + valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE; } ll_inode2fid(&fid, inode); rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req); @@ -1594,6 +1812,7 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de, static int lustre_check_acl(struct inode *inode, int mask) { +#ifdef CONFIG_FS_POSIX_ACL struct ll_inode_info *lli = ll_i2info(inode); struct posix_acl *acl; int rc; @@ -1610,6 +1829,9 @@ int 
lustre_check_acl(struct inode *inode, int mask) posix_acl_release(acl); RETURN(rc); +#else + return -EAGAIN; +#endif } #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)) diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c index a12dae6..b5d47fa 100644 --- a/lustre/llite/llite_mmap.c +++ b/lustre/llite/llite_mmap.c @@ -368,6 +368,7 @@ struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, struct page *page = NULL; struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm; + struct ost_lvb lvb; __u64 kms, old_mtime; unsigned long pgoff, size, rand_read, seq_read; int rc = 0; @@ -397,7 +398,9 @@ struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, CWARN("binary changed. inode %lu\n", inode->i_ino); lov_stripe_lock(lsm); - kms = lov_merge_size(lsm, 1); + inode_init_lvb(inode, &lvb); + obd_merge_lvb(ll_i2obdexp(inode), lsm, &lvb, 1); + kms = lvb.lvb_size; pgoff = ((address - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff; size = (kms + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; diff --git a/lustre/llite/llite_nfs.c b/lustre/llite/llite_nfs.c index 47ad136..b70ce8c 100644 --- a/lustre/llite/llite_nfs.c +++ b/lustre/llite/llite_nfs.c @@ -69,7 +69,9 @@ static struct inode * search_inode_for_lustre(struct super_block *sb, if (inode) return inode; if (S_ISREG(mode)) { - eadatalen = obd_size_diskmd(sbi->ll_osc_exp, NULL); + rc = ll_get_max_mdsize(sbi, &eadatalen); + if (rc) + return ERR_PTR(rc); valid |= OBD_MD_FLEASIZE; } fid.id = (__u64)ino; @@ -125,13 +127,16 @@ static struct dentry *ll_iget_for_nfs(struct super_block *sb, unsigned long ino, spin_lock(&dcache_lock); for (lp = inode->i_dentry.next; lp != &inode->i_dentry ; lp=lp->next) { result = list_entry(lp,struct dentry, d_alias); + lock_dentry(result); if (!(result->d_flags & DCACHE_DISCONNECTED)) { dget_locked(result); ll_set_dflags(result, DCACHE_REFERENCED); + unlock_dentry(result); spin_unlock(&dcache_lock); iput(inode); return result; } + 
unlock_dentry(result); } spin_unlock(&dcache_lock); result = d_alloc_root(inode); diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index c8d6233..19df9d9 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -37,6 +37,7 @@ /* methods */ +/* called from iget{4,5_locked}->find_inode() under inode_lock spinlock */ #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) static int ll_test_inode(struct inode *inode, unsigned long ino, void *opaque) #else @@ -71,8 +72,27 @@ static int ll_test_inode(struct inode *inode, void *opaque) if (inode->i_ino != md->body->ino) return 0; #endif - if (inode->i_generation != md->body->generation) + if (inode->i_generation != md->body->generation) { + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + + if (inode->i_state & (I_FREEING | I_CLEAR)) + return 0; + + atomic_inc(&inode->i_count); + inode->i_nlink = 0; + inode->i_state |= I_FREEING; + LASSERT(list_empty(&lli->lli_dead_list)); + /* add "duplicate" inode into deathrow for destroy */ + spin_lock(&sbi->ll_deathrow_lock); + list_add(&lli->lli_dead_list, &sbi->ll_deathrow); + spin_unlock(&sbi->ll_deathrow_lock); + + /* remove inode from dirty/io lists */ + list_del_init(&inode->i_list); + return 0; + } /* Apply the attributes in 'opaque' to this inode */ if (!(inode->i_state & (I_FREEING | I_CLEAR))) @@ -311,13 +331,12 @@ struct dentry *ll_find_alias(struct inode *inode, struct dentry *de) de->d_name.len) != 0) continue; - if (!list_empty(&dentry->d_lru)) - list_del_init(&dentry->d_lru); - - hlist_del_init(&dentry->d_hash); - __d_rehash(dentry, 0); /* avoid taking dcache_lock inside */ + dget_locked(dentry); + lock_dentry(dentry); + __d_drop(dentry); dentry->d_flags &= ~DCACHE_LUSTRE_INVALID; - atomic_inc(&dentry->d_count); + unlock_dentry(dentry); + __d_rehash(dentry, 0); /* avoid taking dcache_lock inside */ spin_unlock(&dcache_lock); iput(inode); CDEBUG(D_DENTRY, "alias dentry %.*s (%p) parent %p inode %p " @@ -514,7 +533,7 @@ 
static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode, if (rc) RETURN(rc); - mdc_store_inode_generation(request, 2, 1); + mdc_store_inode_generation(request, MDS_REQ_INTENT_REC_OFF, 1); inode = ll_create_node(dir, dentry->d_name.name, dentry->d_name.len, NULL, 0, mode, 0, it); if (IS_ERR(inode)) { @@ -625,7 +644,7 @@ static int ll_mknod(struct inode *dir, struct dentry *dchild, int mode, ll_update_times(request, 0, dir); - err = ll_prep_inode(sbi->ll_osc_exp, &inode, request, 0, + err = ll_prep_inode(sbi->ll_osc_exp, &inode, request, 0, dchild->d_sb); if (err) GOTO(out_err, err); @@ -760,8 +779,6 @@ int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir) int rc; ENTRY; - oti.oti_thread = request->rq_svc_thread; - /* req is swabbed so this is safe */ body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body)); @@ -791,6 +808,10 @@ int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir) } LASSERT(rc >= sizeof(*lsm)); + rc = obd_checkmd(ll_i2obdexp(dir), ll_i2mdcexp(dir), lsm); + if (rc) + GOTO(out_free_memmd, rc); + oa = obdo_alloc(); if (oa == NULL) GOTO(out_free_memmd, rc = -ENOMEM); @@ -811,7 +832,7 @@ int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir) } } - rc = obd_destroy(ll_i2obdexp(dir), oa, lsm, &oti); + rc = obd_destroy(ll_i2obdexp(dir), oa, lsm, &oti, ll_i2mdcexp(dir)); obdo_free(oa); if (rc) CERROR("obd destroy objid "LPX64" error %d\n", diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 16c0169..22cd48e 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -103,7 +103,7 @@ static int ll_brw(int cmd, struct inode *inode, struct obdo *oa, /* this isn't where truncate starts. roughly: * sys_truncate->ll_setattr_raw->vmtruncate->ll_truncate. setattr_raw grabs - * DLM lock on [0, EOF], i_sem, ->lli_size_sem, and WRITE_I_ALLOC_SEM to + * DLM lock on [size, EOF], i_sem, ->lli_size_sem, and WRITE_I_ALLOC_SEM to * avoid races. 
* * must be called under ->lli_size_sem */ @@ -111,6 +111,7 @@ void ll_truncate(struct inode *inode) { struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm = lli->lli_smd; + struct ost_lvb lvb; struct obdo oa; int rc; ENTRY; @@ -133,7 +134,9 @@ void ll_truncate(struct inode *inode) /* XXX I'm pretty sure this is a hack to paper over a more fundamental * race condition. */ lov_stripe_lock(lsm); - if (lov_merge_size(lsm, 0) == inode->i_size) { + inode_init_lvb(inode, &lvb); + obd_merge_lvb(ll_i2obdexp(inode), lsm, &lvb, 0); + if (lvb.lvb_size == inode->i_size) { CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64", %Lu=%#Lx\n", lsm->lsm_object_id, inode->i_size, inode->i_size); lov_stripe_unlock(lsm); @@ -195,7 +198,7 @@ int ll_prepare_write(struct file *file, struct page *page, unsigned from, obd_off offset = ((obd_off)page->index) << PAGE_SHIFT; struct brw_page pga; struct obdo oa; - __u64 kms; + struct ost_lvb lvb; int rc = 0; ENTRY; @@ -235,11 +238,12 @@ int ll_prepare_write(struct file *file, struct page *page, unsigned from, * locking will have updated the KMS, and for our purposes here we can * treat it like i_size. */ lov_stripe_lock(lsm); - kms = lov_merge_size(lsm, 1); + inode_init_lvb(inode, &lvb); + obd_merge_lvb(ll_i2obdexp(inode), lsm, &lvb, 0); lov_stripe_unlock(lsm); - if (kms <= offset) { + if (lvb.lvb_size <= offset) { LL_CDEBUG_PAGE(D_PAGE, page, "kms "LPU64" <= offset "LPU64"\n", - kms, offset); + lvb.lvb_size, offset); memset(kmap(page), 0, PAGE_SIZE); kunmap(page); GOTO(prepare_done, rc = 0); @@ -270,7 +274,8 @@ static int ll_ap_make_ready(void *data, int cmd) llap = LLAP_FROM_COOKIE(data); page = llap->llap_page; - LASSERT(!(cmd & OBD_BRW_READ)); + LASSERTF(!(cmd & OBD_BRW_READ), "cmd %x page %p ino %lu index %lu\n", cmd, page, + page->mapping->host->i_ino, page->index); /* we're trying to write, but the page is locked.. 
come back later */ if (TryLockPage(page)) @@ -312,6 +317,7 @@ static int ll_ap_refresh_count(void *data, int cmd) struct lov_stripe_md *lsm; struct page *page; struct inode *inode; + struct ost_lvb lvb; __u64 kms; ENTRY; @@ -325,7 +331,9 @@ static int ll_ap_refresh_count(void *data, int cmd) lsm = lli->lli_smd; lov_stripe_lock(lsm); - kms = lov_merge_size(lsm, 1); + inode_init_lvb(inode, &lvb); + obd_merge_lvb(ll_i2obdexp(inode), lsm, &lvb, 1); + kms = lvb.lvb_size; lov_stripe_unlock(lsm); /* catch race with truncate */ @@ -1037,13 +1045,16 @@ static int ll_readahead(struct ll_readahead_state *ras, struct inode *inode; struct lov_stripe_md *lsm; struct ll_ra_read *bead; + struct ost_lvb lvb; ENTRY; inode = mapping->host; lsm = ll_i2info(inode)->lli_smd; lov_stripe_lock(lsm); - kms = lov_merge_size(lsm, 1); + inode_init_lvb(inode, &lvb); + obd_merge_lvb(ll_i2obdexp(inode), lsm, &lvb, 1); + kms = lvb.lvb_size; lov_stripe_unlock(lsm); if (kms == 0) { ll_ra_stats_inc(mapping, RA_STAT_ZERO_LEN); diff --git a/lustre/llite/rw24.c b/lustre/llite/rw24.c index 32a7e05..eb8f71a 100644 --- a/lustre/llite/rw24.c +++ b/lustre/llite/rw24.c @@ -91,6 +91,7 @@ static int ll_direct_IO_24(int rw, flags = 0 /* | OBD_BRW_DIRECTIO */; offset = ((obd_off)blocknr << inode->i_blkbits); length = iobuf->length; + rw = rw ? OBD_BRW_WRITE : OBD_BRW_READ; for (i = 0, length = iobuf->length; length > 0; length -= pga[i].count, offset += pga[i].count, i++) { /*i last!*/ @@ -100,21 +101,20 @@ static int ll_direct_IO_24(int rw, pga[i].count = min_t(int, PAGE_SIZE - (offset & ~PAGE_MASK), length); pga[i].flag = flags; - if (rw == READ) + if (rw == OBD_BRW_READ) POISON_PAGE(iobuf->maplist[i], 0x0d); } ll_inode_fill_obdo(inode, rw, &oa); - if (rw == WRITE) + if (rw == OBD_BRW_WRITE) lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_DIRECT_WRITE, iobuf->length); else lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_DIRECT_READ, iobuf->length); - rc = obd_brw_async(rw == WRITE ? 
OBD_BRW_WRITE : OBD_BRW_READ, - ll_i2obdexp(inode), &oa, lsm, iobuf->nr_pages, pga, - set, NULL); + rc = obd_brw_async(rw, ll_i2obdexp(inode), &oa, lsm, iobuf->nr_pages, + pga, set, NULL); if (rc) { CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR, "error from obd_brw_async: rc = %d\n", rc); @@ -126,7 +126,7 @@ static int ll_direct_IO_24(int rw, ptlrpc_set_destroy(set); if (rc == 0) { rc = iobuf->length; - if (rw == WRITE) { + if (rw == OBD_BRW_WRITE) { lov_stripe_lock(lsm); obd_adjust_kms(ll_i2obdexp(inode), lsm, offset, 0); lov_stripe_unlock(lsm); diff --git a/lustre/llite/super.c b/lustre/llite/super.c index 0405b19..610c215 100644 --- a/lustre/llite/super.c +++ b/lustre/llite/super.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include "llite_internal.h" @@ -91,7 +92,7 @@ static struct file_system_type lustre_fs_type = { static int __init init_lustre_lite(void) { - int rc; + int rc, seed[2]; printk(KERN_INFO "Lustre: Lustre Lite Client File System; " "info@clusterfs.com\n"); @@ -115,6 +116,9 @@ static int __init init_lustre_lite(void) ll_unregister_cache(&ll_cache_definition); } + get_random_bytes(seed, sizeof(seed)); + ll_srand(seed[0], seed[1]); + return rc; } diff --git a/lustre/llite/super25.c b/lustre/llite/super25.c index 25d306d..e2b9bb0 100644 --- a/lustre/llite/super25.c +++ b/lustre/llite/super25.c @@ -128,7 +128,7 @@ struct file_system_type lustre_fs_type = { static int __init init_lustre_lite(void) { - int rc; + int rc, seed[2]; printk(KERN_INFO "Lustre: Lustre Lite Client File System; " "info@clusterfs.com\n"); rc = ll_init_inodecache(); @@ -156,6 +156,9 @@ static int __init init_lustre_lite(void) ll_unregister_cache(&ll_cache_definition); } + get_random_bytes(seed, sizeof(seed)); + ll_srand(seed[0], seed[1]); + return rc; } diff --git a/lustre/llite/xattr.c b/lustre/llite/xattr.c index 194b627..9441a2e 100644 --- a/lustre/llite/xattr.c +++ b/lustre/llite/xattr.c @@ -204,7 +204,7 @@ do_getxattr: if (size < body->eadatasize) { 
CERROR("server bug: replied size %u > %u\n", - body->eadatasize, size); + body->eadatasize, (int)size); GOTO(out, rc = -ERANGE); } diff --git a/lustre/lov/Makefile.in b/lustre/lov/Makefile.in index aebee3e..f714192 100644 --- a/lustre/lov/Makefile.in +++ b/lustre/lov/Makefile.in @@ -1,4 +1,4 @@ MODULES := lov -lov-objs := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o lov_request.o lov_qos.o +lov-objs := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o lov_request.o lov_qos.o lov_ea.o @INCLUDE_RULES@ diff --git a/lustre/lov/autoMakefile.am b/lustre/lov/autoMakefile.am index 0ade64c..583a425 100644 --- a/lustre/lov/autoMakefile.am +++ b/lustre/lov/autoMakefile.am @@ -5,7 +5,7 @@ if LIBLUSTRE noinst_LIBRARIES = liblov.a -liblov_a_SOURCES = lov_log.c lov_obd.c lov_pack.c lov_request.c lov_offset.c lov_qos.c lov_merge.c lov_internal.h +liblov_a_SOURCES = lov_log.c lov_obd.c lov_pack.c lov_request.c lov_offset.c lov_qos.c lov_merge.c lov_ea.c lov_internal.h liblov_a_CPPFLAGS = $(LLCPPFLAGS) liblov_a_CFLAGS = $(LLCFLAGS) endif diff --git a/lustre/lov/lov_ea.c b/lustre/lov/lov_ea.c new file mode 100755 index 0000000..c08020d --- /dev/null +++ b/lustre/lov/lov_ea.c @@ -0,0 +1,546 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2001-2005 Cluster File Systems, Inc. + * Author: Wang Di + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * You may have signed or agreed to another license before downloading + * this software. If so, you are bound by the terms and conditions + * of that agreement, and the following does not apply to you. See the + * LICENSE file included with this distribution for more information. 
+ *
+ * If you did not agree to a different license, then this copy of Lustre
+ * is open source software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * In either case, Lustre is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * license text for more details.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LOV
+
+#ifdef __KERNEL__
+#include
+#else
+#include
+#endif
+
+#include
+#include
+#include
+#include
+
+#include "lov_internal.h"
+
+/* Argument block handed to lovea_unpack_array() via llog_process():
+ * @lsm is the stripe md being filled, @cursor is the index of the next
+ * extent slot and is advanced once per processed record. */
+struct lovea_unpack_args {
+        struct lov_stripe_md *lsm;
+        int cursor;
+};
+
+/* Checks shared by the plain and join verifiers.  Rejects a zero stripe
+ * count, a zero object id, any pattern other than LOV_PATTERN_RAID0, and a
+ * stripe size that is zero or whose product with @stripe_count exceeds ~0UL.
+ * Each failure logs an error, dumps @lmm at D_WARNING and returns -EINVAL;
+ * otherwise 0 is returned.  @lmm_bytes is accepted but not examined here. */
+static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes,
+                                 int stripe_count)
+{
+
+        if (stripe_count == 0) {
+                CERROR("bad stripe count %d\n", stripe_count);
+                lov_dump_lmm_v1(D_WARNING, lmm);
+                return -EINVAL;
+        }
+
+        if (lmm->lmm_object_id == 0) {
+                CERROR("zero object id\n");
+                lov_dump_lmm_v1(D_WARNING, lmm);
+                return -EINVAL;
+        }
+
+        /* lmm fields are little-endian, so compare against the LE constant */
+        if (lmm->lmm_pattern != cpu_to_le32(LOV_PATTERN_RAID0)) {
+                CERROR("bad striping pattern\n");
+                lov_dump_lmm_v1(D_WARNING, lmm);
+                return -EINVAL;
+        }
+
+        /* widen to 64 bits so the stripe-width product cannot wrap before
+         * it is compared with the unsigned long limit */
+        if (lmm->lmm_stripe_size == 0 ||
+            (__u64)le32_to_cpu(lmm->lmm_stripe_size) * stripe_count > ~0UL) {
+                CERROR("bad stripe size %u\n",
+                       le32_to_cpu(lmm->lmm_stripe_size));
+                lov_dump_lmm_v1(D_WARNING, lmm);
+                return -EINVAL;
+        }
+        return 0;
+}
+
+/* Copy the layout-independent fields from the little-endian @lmm into @lsm
+ * and derive lsm_xfersize.  lsm_stripe_count is read from @lsm, not from
+ * @lmm, so it must already be set by the caller. */
+static void lsm_unpackmd_common(struct lov_stripe_md *lsm,
+                                struct lov_mds_md *lmm)
+{
+        lsm->lsm_object_id = le64_to_cpu(lmm->lmm_object_id);
+        lsm->lsm_object_gr = le64_to_cpu(lmm->lmm_object_gr);
+        lsm->lsm_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
+        lsm->lsm_pattern = le32_to_cpu(lmm->lmm_pattern);
+        lsm->lsm_xfersize = lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+}
+
+static void
+lsm_stripe_by_index_plain(struct lov_stripe_md *lsm, int *stripeno,
+                          obd_off *lov_off, unsigned long *swidth)
+{
+        /* plain layout: only the stripe width is reported; @stripeno and
+         * @lov_off are left untouched */
+        if (swidth)
+                *swidth = lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+}
+
+/* Plain-layout counterpart of the by-offset lookup: reports the stripe
+ * width through @swidth when it is non-NULL and changes nothing else. */
+static void
+lsm_stripe_by_offset_plain(struct lov_stripe_md *lsm, int *stripeno,
+                           obd_off *lov_off, unsigned long *swidth)
+{
+        if (swidth)
+                *swidth = lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+}
+
+/* Plain layout: always returns a zero offset. */
+static obd_off
+lsm_stripe_offset_by_index_plain(struct lov_stripe_md *lsm,
+                                 int stripe_index)
+{
+        return 0;
+}
+
+/* Plain layout: always returns a zero index. */
+static int
+lsm_stripe_index_by_offset_plain(struct lov_stripe_md *lsm,
+                                 obd_off lov_off)
+{
+        return 0;
+}
+
+/* Release a plain lsm; the size is recomputed from its stripe count. */
+static void lsm_free_plain(struct lov_stripe_md *lsm)
+{
+        OBD_FREE(lsm, lov_stripe_md_size(lsm->lsm_stripe_count));
+}
+
+/* Plain layout has nothing to revalidate; always succeeds. */
+static int lsm_revalidate_plain(struct lov_stripe_md *lsm,
+                                struct obd_device *obd)
+{
+        return 0;
+}
+
+/* Plain layout has no extra state to destroy; always succeeds. */
+static int lsm_destroy_plain(struct lov_stripe_md *lsm, struct obdo *oa,
+                             struct obd_export *md_exp)
+{
+        return 0;
+}
+
+/* Verify a plain on-disk lmm of @lmm_bytes bytes.  Checks that the buffer
+ * holds at least the fixed header, stores the decoded stripe count in
+ * *@stripe_count, checks the buffer against lov_mds_md_v1_size() for that
+ * count, then applies lsm_lmm_verify_common().  Returns 0 or -EINVAL. */
+static int lsm_lmm_verify_plain(struct lov_mds_md *lmm, int lmm_bytes,
+                                int *stripe_count)
+{
+        if (lmm_bytes < sizeof(*lmm)) {
+                CERROR("lov_mds_md too small: %d, need at least %d\n",
+                       lmm_bytes, (int)sizeof(*lmm));
+                return -EINVAL;
+        }
+
+        *stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
+
+        if (lmm_bytes < lov_mds_md_v1_size(*stripe_count)) {
+                CERROR("LOV EA too small: %d, need %d\n",
+                       lmm_bytes, lov_mds_md_v1_size(*stripe_count));
+                lov_dump_lmm_v1(D_WARNING, lmm);
+                return -EINVAL;
+        }
+
+        return lsm_lmm_verify_common(lmm, lmm_bytes, *stripe_count);
+}
+
+/* Fill @lsm from a plain on-disk @lmm: the common fields first, then one
+ * lov_oinfo per stripe decoded from the little-endian lmm_objects[]. */
+int lsm_unpackmd_plain(struct lov_obd *lov, struct lov_stripe_md *lsm,
+                       struct lov_mds_md_v1 *lmm)
+{
+        struct lov_oinfo *loi;
+        int i;
+
+        lsm_unpackmd_common(lsm, lmm);
+
+        for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++) {
+                /* XXX LOV STACKING call down to osc_unpackmd() */
+                loi->loi_id = le64_to_cpu(lmm->lmm_objects[i].l_object_id);
+                loi->loi_gr = le64_to_cpu(lmm->lmm_objects[i].l_object_gr);
+                loi->loi_ost_idx =
le32_to_cpu(lmm->lmm_objects[i].l_ost_idx);
+                loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen);
+                /* valid OST indices run from 0 to ld_tgt_count - 1, so an
+                 * index equal to the count is already out of range */
+                if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) {
+                        CERROR("OST index %d more than OST count %d\n",
+                               loi->loi_ost_idx, lov->desc.ld_tgt_count);
+                        lov_dump_lmm_v1(D_WARNING, lmm);
+                        return -EINVAL;
+                }
+                loi++;
+        }
+
+        return 0;
+}
+
+/* Method table for plain lsm layouts. */
+struct lsm_operations lsm_plain_ops = {
+        .lsm_free            = lsm_free_plain,
+        .lsm_destroy         = lsm_destroy_plain,
+        .lsm_stripe_by_index    = lsm_stripe_by_index_plain,
+        .lsm_stripe_by_offset   = lsm_stripe_by_offset_plain,
+        .lsm_revalidate         = lsm_revalidate_plain,
+        .lsm_stripe_offset_by_index  = lsm_stripe_offset_by_index_plain,
+        .lsm_stripe_index_by_offset  = lsm_stripe_index_by_offset_plain,
+        .lsm_lmm_verify         = lsm_lmm_verify_plain,
+        .lsm_unpackmd           = lsm_unpackmd_plain,
+};
+
+/*
+ * Find the extent of a joined lsm that covers file offset @lov_off.
+ *
+ * Walks lai_ext_array from the start and stops at the first extent whose
+ * end lies beyond @lov_off or whose length is the -1 sentinel.  The walk
+ * never advances past the last array element, so the returned pointer is
+ * always a valid entry of lai_ext_array.
+ *
+ * Asserts that lsm->lsm_array is set and holds more than one extent.
+ */
+struct lov_extent *lovea_off2le(struct lov_stripe_md *lsm, obd_off lov_off)
+{
+        struct lov_array_info *lai;
+        struct lov_extent *le;
+        int i = 0;
+
+        LASSERT(lsm->lsm_array != NULL);
+        lai = lsm->lsm_array;
+        LASSERT(lai->lai_ext_count > 1);
+
+        /* Stop on the last extent at the latest: stepping beyond it would
+         * leave le pointing past lai_ext_array, and le is dereferenced
+         * below and by the callers. */
+        for (le = lai->lai_ext_array, i = 0;
+             i < lai->lai_ext_count - 1 && le->le_start + le->le_len <= lov_off
+             && le->le_len != -1;
+             i ++, le ++) {
+                ; /* empty loop */
+        }
+
+        CDEBUG(D_INFO, "off "LPU64" idx%d, ext"LPU64":"LPU64"idx%d sc%d\n",
+               lov_off, i, le->le_start, le->le_len, le->le_loi_idx,
+               le->le_stripe_count);
+
+        RETURN(le);
+}
+
+struct lov_extent *lovea_idx2le(struct lov_stripe_md *lsm, int stripe_no)
+{
+        struct lov_extent *le;
+        struct lov_array_info *lai;
+        int i, stripe_index;
+
+        LASSERT(lsm->lsm_array != NULL);
+        LASSERT(stripe_no >= 0 && stripe_no <= lsm->lsm_stripe_count);
+        lai = lsm->lsm_array;
+        LASSERT(lai->lai_ext_count > 1);
+
+        for (le = lai->lai_ext_array, i = 0, stripe_index = le->le_stripe_count;
+             i < lai->lai_ext_count && stripe_index <= stripe_no &&
+             le->le_len != -1; i ++, le ++,
+             stripe_index += le->le_stripe_count) {
+                ; /* empty loop */
+        }
+
+        CDEBUG(D_INFO, "stripe %d idx%d,
ext"LPU64":"LPU64"idx %d scount%d\n",
+               stripe_no, i, le->le_start, le->le_len, le->le_loi_idx,
+               le->le_stripe_count);
+        RETURN(le);
+}
+
+
+/*
+ * Release the extent bookkeeping attached to a joined lsm: the extent array
+ * (when present) and the lov_array_info itself.
+ *
+ * lsm->lsm_array is reset to NULL afterwards so that a second call, or any
+ * later "is lsm_array set" test, sees an empty state instead of a pointer
+ * to released memory.  Accepts a NULL @lsm or an lsm without array info.
+ */
+static void lovea_free_array_info(struct lov_stripe_md *lsm)
+{
+        struct lov_array_info *lai;
+
+        if (!lsm || !lsm->lsm_array)
+                return;
+
+        /* work on a local copy so the final reset below is unambiguous */
+        lai = lsm->lsm_array;
+
+        if (lai->lai_ext_array)
+                OBD_FREE(lai->lai_ext_array,
+                         lai->lai_ext_count *
+                         sizeof(struct lov_extent));
+
+        OBD_FREE_PTR(lai);
+        lsm->lsm_array = NULL;
+}
+
+/* Release a joined lsm together with its extent bookkeeping. */
+static void lsm_free_join(struct lov_stripe_md *lsm)
+{
+        lovea_free_array_info(lsm);
+        OBD_FREE(lsm, lov_stripe_md_size(lsm->lsm_stripe_count));
+}
+
+/*
+ * Translate a file-wide stripe number (and optionally a file offset) into
+ * values relative to the extent that owns the stripe.
+ *
+ * *@stripeno is rebased by the extent's first loi index; *@swidth, when
+ * requested, becomes the stripe width of that extent.  *@lov_off, when
+ * supplied, is rebased to the extent start if the offset falls in the same
+ * extent; otherwise it becomes the extent length (offset past the extent
+ * start) or 0.
+ */
+static void
+lsm_stripe_by_index_join(struct lov_stripe_md *lsm, int *stripeno,
+                           obd_off *lov_off, unsigned long *swidth)
+{
+        struct lov_extent *le;
+
+        LASSERT(stripeno != NULL);
+
+        le = lovea_idx2le(lsm, *stripeno);
+
+        LASSERT(le != NULL && le->le_stripe_count != 0);
+
+        *stripeno -= le->le_loi_idx;
+
+        if (swidth)
+                *swidth = lsm->lsm_stripe_size * le->le_stripe_count;
+
+        if (lov_off) {
+                struct lov_extent *lov_le = lovea_off2le(lsm, *lov_off);
+                if (lov_le == le) {
+                        *lov_off = (*lov_off > le->le_start) ?
+                                   (*lov_off - le->le_start) : 0;
+                } else {
+                        *lov_off = (*lov_off > le->le_start) ?
+                                   le->le_len : 0;
+                        LASSERT(*lov_off != -1);
+                }
+        }
+}
+
+/*
+ * Translate a file offset into an offset relative to the extent covering
+ * it.  *@stripeno (optional) is rebased by the extent's first loi index and
+ * *@swidth (optional) receives the stripe width of that extent.
+ */
+static void
+lsm_stripe_by_offset_join(struct lov_stripe_md *lsm, int *stripeno,
+                           obd_off *lov_off, unsigned long *swidth)
+{
+        struct lov_extent *le;
+
+        LASSERT(lov_off != NULL);
+
+        le = lovea_off2le(lsm, *lov_off);
+
+        LASSERT(le != NULL && le->le_stripe_count != 0);
+
+        *lov_off = (*lov_off > le->le_start) ? (*lov_off - le->le_start) : 0;
+
+        if (stripeno)
+                *stripeno -= le->le_loi_idx;
+
+        if (swidth)
+                *swidth = lsm->lsm_stripe_size * le->le_stripe_count;
+}
+
+static obd_off
+lsm_stripe_offset_by_index_join(struct lov_stripe_md *lsm,
+                                int stripe_index)
+{
+        struct lov_extent *le;
+
+        le = lovea_idx2le(lsm, stripe_index);
+
+        return le ?
le->le_start : 0;
+}
+
+/* Return the first loi index of the extent covering @lov_off (0 if the
+ * lookup yields no extent). */
+static int
+lsm_stripe_index_by_offset_join(struct lov_stripe_md *lsm,
+                                obd_off lov_off)
+{
+        struct lov_extent *le = NULL;
+
+        le = lovea_off2le(lsm, lov_off);
+
+        return le ? le->le_loi_idx : 0;
+}
+
+/*
+ * llog_process() callback: unpack one extent record into the joined lsm.
+ *
+ * @rec is treated as a llog_array_rec whose lmr_med describes one extent;
+ * @data is a struct lovea_unpack_args.  Each call consumes one slot of
+ * lai_ext_array (args->cursor) and a run of lsm_oinfo entries that starts
+ * right after the run used by the previous extent.
+ *
+ * Returns 0 on success, or -EINVAL when the record would not fit into the
+ * extent array or the lsm_oinfo array (presumably this makes llog_process()
+ * stop and report the error -- confirm against the llog code).
+ */
+static int lovea_unpack_array(struct llog_handle *handle,
+                              struct llog_rec_hdr *rec, void *data)
+{
+        struct lovea_unpack_args *args = (struct lovea_unpack_args *)data;
+        struct llog_array_rec *la_rec = (struct llog_array_rec*)rec;
+        struct mds_extent_desc *med = &la_rec->lmr_med;
+        struct lov_stripe_md *lsm = args->lsm;
+        int cursor = args->cursor++;
+        struct lov_mds_md *lmm;
+        struct lov_array_info *lai;
+        struct lov_oinfo * loi;
+        int i, loi_index, stripe_count;
+        ENTRY;
+
+        /* sanity check */
+        LASSERT(lsm->lsm_stripe_count != 0);
+        lmm = &med->med_lmm;
+        LASSERT(lsm->lsm_array != NULL);
+
+        lai = lsm->lsm_array;
+
+        /* lai_ext_array is sized for lai_ext_count entries; a log holding
+         * more records than that must not write past the allocation */
+        if (cursor >= lai->lai_ext_count) {
+                CERROR("extent record %d exceeds extent count %d\n",
+                       cursor, (int)lai->lai_ext_count);
+                RETURN(-EINVAL);
+        }
+
+        /* the stripe count is stored little-endian like every other lmm
+         * field decoded below */
+        stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
+
+        if (cursor == 0) {
+                lai->lai_ext_array[cursor].le_loi_idx = 0;
+        } else {
+                int next_loi_index = lai->lai_ext_array[cursor - 1].le_loi_idx +
+                                 lai->lai_ext_array[cursor - 1].le_stripe_count;
+                lai->lai_ext_array[cursor].le_loi_idx = next_loi_index;
+        }
+        /* insert extent desc into lsm extent array */
+        lai->lai_ext_array[cursor].le_start = le64_to_cpu(med->med_start);
+        lai->lai_ext_array[cursor].le_len   = le64_to_cpu(med->med_len);
+        lai->lai_ext_array[cursor].le_stripe_count = stripe_count;
+
+        /* unpack extent's lmm to lov_oinfo array */
+        loi_index = lai->lai_ext_array[cursor].le_loi_idx;
+        /* the run [loi_index, loi_index + stripe_count) must stay inside
+         * lsm_oinfo, which holds lsm_stripe_count entries */
+        if (stripe_count < 0 || loi_index < 0 ||
+            loi_index + stripe_count > lsm->lsm_stripe_count) {
+                CERROR("extent %d: stripes %d+%d exceed stripe count %d\n",
+                       cursor, loi_index, stripe_count,
+                       (int)lsm->lsm_stripe_count);
+                RETURN(-EINVAL);
+        }
+        loi = &lsm->lsm_oinfo[loi_index];
+        CDEBUG(D_INFO, "lovea upackmd cursor %d, loi_index %d extent "
+                        LPU64":"LPU64"\n", cursor, loi_index,
+                        le64_to_cpu(med->med_start),
+                        le64_to_cpu(med->med_len));
+
+        for (i = 0; i < stripe_count; i ++) {
+                /* XXX LOV STACKING call down to osc_unpackmd() */
+                loi->loi_id = le64_to_cpu(lmm->lmm_objects[i].l_object_id);
+                loi->loi_gr = le64_to_cpu(lmm->lmm_objects[i].l_object_gr);
+                loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx);
+                loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen);
+                loi++;
+        }
+
+        RETURN(0);
+}
+
+/*
+ * Load the extent array of a joined lsm from its llog, if not yet loaded.
+ *
+ * The llog context is looked up on @obd, first as LLOG_LOVEA_REPL_CTXT and
+ * then as LLOG_LOVEA_ORIG_CTXT.  When lai_ext_array is already present the
+ * call is a no-op.  Otherwise the array is allocated for lai_ext_count
+ * entries and filled by running lovea_unpack_array() over the log named by
+ * lai_array_id.
+ *
+ * Returns 0 on success, -ENOMEM if the array cannot be allocated, or the
+ * first error from llog_create/llog_init_handle/llog_process/llog_close.
+ * On failure only the partially filled extent array is released and its
+ * pointer cleared; lsm->lsm_array itself stays valid so the lsm can still
+ * be revalidated again or freed through lsm_free_join().
+ */
+static int lsm_revalidate_join(struct lov_stripe_md *lsm,
+                               struct obd_device *obd)
+{
+        struct llog_handle *llh;
+        struct llog_ctxt *ctxt;
+        struct lovea_unpack_args args;
+        struct lov_array_info *lai;
+        int rc, rc2;
+        ENTRY;
+
+        LASSERT(lsm->lsm_array != NULL);
+        lai = lsm->lsm_array;
+
+        /*Revalidate lsm might be called from client or MDS server.
+         *So the ctxt might be in different position
+         */
+        ctxt = llog_get_context(obd, LLOG_LOVEA_REPL_CTXT);
+        if (!ctxt)
+                ctxt = llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT);
+
+        LASSERT(ctxt);
+
+        /* extent array already loaded: nothing to do */
+        if (lai->lai_ext_array)
+                RETURN(0);
+
+        CDEBUG(D_INFO, "get lsm logid: "LPU64":"LPU64"\n",
+               lai->lai_array_id.lgl_oid,
+               lai->lai_array_id.lgl_ogr);
+        OBD_ALLOC(lai->lai_ext_array, lai->lai_ext_count *
+                  sizeof (struct lov_extent));
+        if (!lai->lai_ext_array)
+                RETURN(-ENOMEM);
+
+        rc = llog_create(ctxt, &llh, &lai->lai_array_id, NULL);
+        if (rc)
+                GOTO(out, rc);
+
+        args.lsm = lsm;
+        args.cursor = 0;
+        rc = llog_init_handle(llh, LLOG_F_IS_PLAIN, NULL);
+        if (rc == 0)
+                rc = llog_process(llh, lovea_unpack_array, &args, NULL);
+        /* always close the handle, but keep the first error */
+        rc2 = llog_close(llh);
+        if (rc == 0)
+                rc = rc2;
+out:
+        if (rc) {
+                /* Release just the half-built extent array.  Freeing the
+                 * whole lsm_array here would leave lsm->lsm_array pointing
+                 * at released memory for every later user, including
+                 * lsm_free_join(). */
+                OBD_FREE(lai->lai_ext_array, lai->lai_ext_count *
+                         sizeof(struct lov_extent));
+                lai->lai_ext_array = NULL;
+        }
+        RETURN(rc);
+}
+
+int lsm_destroy_join(struct lov_stripe_md *lsm, struct obdo *oa,
+                      struct obd_export *md_exp)
+{
+        struct llog_ctxt *ctxt;
+        struct llog_handle *llh;
+        int rc = 0;
+        ENTRY;
+
+        LASSERT(md_exp != NULL);
+        ctxt = llog_get_context(md_exp->exp_obd, LLOG_LOVEA_REPL_CTXT);
+        if (!ctxt)
+                GOTO(out, rc = -EINVAL);
+
+        LASSERT(lsm->lsm_array != NULL);
+        /*for those orphan inode, we should keep array id*/
+        if (!(oa->o_valid & OBD_MD_FLCOOKIE))
+                RETURN(0);
+
+        LASSERT(ctxt != NULL);
+        rc = llog_create(ctxt, &llh, &lsm->lsm_array->lai_array_id,
+                         NULL);
+        if (rc)
+                GOTO(out, rc);
+
+        rc =
llog_init_handle(llh, LLOG_F_IS_PLAIN, NULL);
+        if (rc == 0) {
+                rc = llog_destroy(llh);
+        }
+        llog_free_handle(llh);
+out:
+        RETURN(rc);
+}
+
+/*
+ * Verify a joined-file on-disk lmm of @lmm_bytes bytes.
+ *
+ * Checks that the buffer holds a full lov_mds_md_join and that the array
+ * log id is non-zero, stores the decoded stripe count in *@stripe_count and
+ * then applies lsm_lmm_verify_common() to the embedded lmmj_md.
+ * Returns 0 or -EINVAL.
+ */
+static int lsm_lmm_verify_join(struct lov_mds_md *lmm, int lmm_bytes,
+                               int *stripe_count)
+{
+        struct lov_mds_md_join *lmmj = (struct lov_mds_md_join *)lmm;
+
+        if (lmm_bytes < sizeof(*lmmj)) {
+                CERROR("lov_mds_md too small: %d, need at least %d\n",
+                       lmm_bytes, (int)sizeof(*lmmj));
+                return -EINVAL;
+        }
+
+        if (lmmj->lmmj_array_id.lgl_oid == 0) {
+                CERROR("zero array object id\n");
+                return -EINVAL;
+        }
+
+        *stripe_count = le32_to_cpu(lmmj->lmmj_md.lmm_stripe_count);
+
+        return lsm_lmm_verify_common(&lmmj->lmmj_md, lmm_bytes, *stripe_count);
+}
+
+/*
+ * Allocate the lov_array_info of a joined lsm and record the llog id and
+ * the number of extents (@extent_count, in CPU byte order).  The extent
+ * array itself is not allocated here.  Returns 0 or -ENOMEM.
+ */
+static int lovea_init_array_info(struct lov_stripe_md *lsm,
+                                 struct llog_logid *logid,
+                                 __u32 extent_count)
+{
+        struct lov_array_info *lai;
+        ENTRY;
+
+        OBD_ALLOC_PTR(lai);
+        if (!lai)
+                RETURN(-ENOMEM);
+
+        lai->lai_array_id = *logid;
+        lai->lai_ext_count = extent_count;
+        lsm->lsm_array = lai;
+        RETURN(0);
+}
+
+/*
+ * Fill @lsm from a joined-file on-disk @lmm: the common fields come from
+ * the embedded lmmj_md, then the array info is set up from the log id and
+ * extent count.  Returns 0 or the error from lovea_init_array_info().
+ */
+static int lsm_unpackmd_join(struct lov_obd *lov, struct lov_stripe_md *lsm,
+                      struct lov_mds_md *lmm)
+{
+        struct lov_mds_md_join *lmmj = (struct lov_mds_md_join*)lmm;
+        int rc;
+        ENTRY;
+
+        lsm_unpackmd_common(lsm, &lmmj->lmmj_md);
+
+        /* lmmj_extent_count is an on-disk little-endian field (it is decoded
+         * with le32_to_cpu() wherever else it is read), so convert it before
+         * it is used as an element count */
+        rc = lovea_init_array_info(lsm, &lmmj->lmmj_array_id,
+                                   le32_to_cpu(lmmj->lmmj_extent_count));
+        if (rc) {
+                /* terminate the message so it does not run into the next
+                 * console line */
+                CERROR("Init joined lsm id"LPU64" arrary error %d\n",
+                        lsm->lsm_object_id, rc);
+                GOTO(out, rc);
+        }
+out:
+        RETURN(rc);
+}
+
+/* Method table for joined-file lsm layouts. */
+struct lsm_operations lsm_join_ops = {
+        .lsm_free             = lsm_free_join,
+        .lsm_destroy          = lsm_destroy_join,
+        .lsm_stripe_by_index  = lsm_stripe_by_index_join,
+        .lsm_stripe_by_offset = lsm_stripe_by_offset_join,
+        .lsm_revalidate       = lsm_revalidate_join,
+        .lsm_stripe_offset_by_index  = lsm_stripe_offset_by_index_join,
+        .lsm_stripe_index_by_offset  = lsm_stripe_index_by_offset_join,
+        .lsm_lmm_verify         = lsm_lmm_verify_join,
+        .lsm_unpackmd           = lsm_unpackmd_join,
+};
+
+
diff --git a/lustre/lov/lov_internal.h
b/lustre/lov/lov_internal.h index 41e9295..5829fa9 100644 --- a/lustre/lov/lov_internal.h +++ b/lustre/lov/lov_internal.h @@ -113,6 +113,8 @@ static inline void lov_llh_put(struct lov_lock_handles *llh) /* lov_merge.c */ void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_flag valid, struct lov_stripe_md *lsm, int stripeno, int *set); +int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm, + struct ost_lvb *lvb, int kms_only); int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm, obd_off size, int shrink); @@ -198,8 +200,6 @@ int lov_fini_cancel_set(struct lov_request_set *set); /* lov_obd.c */ int lov_get_stripecnt(struct lov_obd *lov, int stripe_count); -int lov_alloc_memmd(struct lov_stripe_md **lsmp, int stripe_count, int pattern); -void lov_free_memmd(struct lov_stripe_md **lsmp); /* lov_log.c */ int lov_llog_init(struct obd_device *obd, struct obd_device *tgt, @@ -217,7 +217,20 @@ int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp, struct lov_user_md *lump); int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm, struct lov_user_md *lump); +int lov_alloc_memmd(struct lov_stripe_md **lsmp, int stripe_count, + int pattern, int magic); +void lov_free_memmd(struct lov_stripe_md **lsmp); + +void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm); +void lov_dump_lmm_join(int level, struct lov_mds_md_join *lmmj); +/* lov_ea.c */ +int lov_unpackmd_join(struct lov_obd *lov, struct lov_stripe_md *lsm, + struct lov_mds_md *lmm); +struct lov_extent *lovea_idx2le(struct lov_stripe_md *lsm, int stripe_no); +struct lov_extent *lovea_off2le(struct lov_stripe_md *lsm, obd_off lov_off); +int lovea_destroy_object(struct lov_obd *lov, struct lov_stripe_md *lsm, + struct obdo *oa, void *data); /* lproc_lov.c */ extern struct file_operations lov_proc_target_fops; diff --git a/lustre/lov/lov_log.c b/lustre/lov/lov_log.c index d93aa98..72703ef 100644 --- a/lustre/lov/lov_log.c +++ b/lustre/lov/lov_log.c @@ -63,7 
+63,9 @@ static int lov_llog_origin_add(struct llog_ctxt *ctxt, int i, rc = 0; ENTRY; - LASSERT(logcookies && numcookies >= lsm->lsm_stripe_count); + LASSERTF(logcookies && numcookies >= lsm->lsm_stripe_count, + "logcookies %p, numcookies %d lsm->lsm_stripe_count %d \n", + logcookies, numcookies, lsm->lsm_stripe_count); for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { struct obd_device *child = lov->tgts[loi->loi_ost_idx].ltd_exp->exp_obd; diff --git a/lustre/lov/lov_merge.c b/lustre/lov/lov_merge.c index b52fa24..62e1956 100644 --- a/lustre/lov/lov_merge.c +++ b/lustre/lov/lov_merge.c @@ -38,15 +38,24 @@ #include "lov_internal.h" -/* Merge rss if kms == 0 +/* Merge the lock value block(&lvb) attributes from each of the stripes in a + * file into a single lvb. It is expected that the caller initializes the + * current atime, mtime, ctime to avoid regressing a more uptodate time on + * the local client. * - * Even when merging RSS, we will take the KMS value if it's larger. - * This prevents getattr from stomping on dirty cached pages which - * extend the file size. */ -__u64 lov_merge_size(struct lov_stripe_md *lsm, int kms) + * If @kms_only is set then we do not consider the recently seen size (rss) + * when updating the known minimum size (kms). Even when merging RSS, we will + * take the KMS value if it's larger. This prevents getattr from stomping on + * dirty cached pages which extend the file size. 
*/ +int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm, + struct ost_lvb *lvb, int kms_only) { struct lov_oinfo *loi; __u64 size = 0; + __u64 blocks = 0; + __u64 current_mtime = lvb->lvb_mtime; + __u64 current_atime = lvb->lvb_atime; + __u64 current_ctime = lvb->lvb_ctime; int i; LASSERT_SPIN_LOCKED(&lsm->lsm_lock); @@ -59,42 +68,29 @@ __u64 lov_merge_size(struct lov_stripe_md *lsm, int kms) obd_size lov_size, tmpsize; tmpsize = loi->loi_kms; - if (kms == 0 && loi->loi_rss > tmpsize) - tmpsize = loi->loi_rss; + if (kms_only == 0 && loi->loi_lvb.lvb_size > tmpsize) + tmpsize = loi->loi_lvb.lvb_size; lov_size = lov_stripe_size(lsm, tmpsize, i); if (lov_size > size) size = lov_size; + /* merge blocks, mtime, atime */ + blocks += loi->loi_lvb.lvb_blocks; + if (loi->loi_lvb.lvb_mtime > current_mtime) + current_mtime = loi->loi_lvb.lvb_mtime; + if (loi->loi_lvb.lvb_atime > current_atime) + current_atime = loi->loi_lvb.lvb_atime; + if (loi->loi_lvb.lvb_ctime > current_ctime) + current_ctime = loi->loi_lvb.lvb_ctime; } - return size; -} -EXPORT_SYMBOL(lov_merge_size); - -/* Merge blocks */ -__u64 lov_merge_blocks(struct lov_stripe_md *lsm) -{ - struct lov_oinfo *loi; - __u64 blocks = 0; - int i; - - for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++, loi++) - blocks += loi->loi_blocks; - return blocks; -} -EXPORT_SYMBOL(lov_merge_blocks); - -__u64 lov_merge_mtime(struct lov_stripe_md *lsm, __u64 current_time) -{ - struct lov_oinfo *loi; - int i; - - for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++, loi++) - if (loi->loi_mtime > current_time) - current_time = loi->loi_mtime; - return current_time; + lvb->lvb_size = size; + lvb->lvb_blocks = blocks; + lvb->lvb_mtime = current_mtime; + lvb->lvb_atime = current_atime; + lvb->lvb_ctime = current_ctime; + RETURN(0); } -EXPORT_SYMBOL(lov_merge_mtime); /* Must be called under the lov_stripe_lock() */ int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm, @@ -115,7 
+111,7 @@ int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm, for (loi = lsm->lsm_oinfo; stripe < lsm->lsm_stripe_count; stripe++, loi++) { kms = lov_size_to_stripe(lsm, size, stripe); - loi->loi_kms = loi->loi_rss = kms; + loi->loi_kms = loi->loi_lvb.lvb_size = kms; CDEBUG(D_INODE, "stripe %d KMS %sing "LPU64"->"LPU64"\n", stripe, kms > loi->loi_kms ? "increas":"shrink", diff --git a/lustre/lov/lov_offset.c b/lustre/lov/lov_offset.c index cea91d7..22af87e 100644 --- a/lustre/lov/lov_offset.c +++ b/lustre/lov/lov_offset.c @@ -43,14 +43,18 @@ obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size, int stripeno) { unsigned long ssize = lsm->lsm_stripe_size; - unsigned long swidth = ssize * lsm->lsm_stripe_count; - unsigned long stripe_size; + unsigned long swidth, stripe_size; + int sindex = stripeno; obd_size lov_size; + int magic = lsm->lsm_magic; ENTRY; if (ost_size == 0) RETURN(0); + LASSERT(lsm_op_find(magic) != NULL); + lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, NULL, &swidth); + /* do_div(a, b) returns a % b, and a = a / b */ stripe_size = do_div(ost_size, ssize); if (stripe_size) @@ -58,6 +62,7 @@ obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size, else lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize; + lov_size += lsm_op_find(magic)->lsm_stripe_offset_by_index(lsm, sindex); RETURN(lov_size); } @@ -113,8 +118,8 @@ int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off, int stripeno, obd_off *obd_off) { unsigned long ssize = lsm->lsm_stripe_size; - unsigned long swidth = ssize * lsm->lsm_stripe_count; - unsigned long stripe_off, this_stripe; + unsigned long swidth, stripe_off, this_stripe; + int magic = lsm->lsm_magic; int ret = 0; if (lov_off == OBD_OBJECT_EOF) { @@ -122,6 +127,10 @@ int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off, return 0; } + LASSERT(lsm_op_find(magic) != NULL); + lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &lov_off, + 
&swidth); + /* do_div(a, b) returns a % b, and a = a / b */ stripe_off = do_div(lov_off, swidth); @@ -165,12 +174,16 @@ obd_off lov_size_to_stripe(struct lov_stripe_md *lsm, obd_off file_size, int stripeno) { unsigned long ssize = lsm->lsm_stripe_size; - unsigned long swidth = ssize * lsm->lsm_stripe_count; - unsigned long stripe_off, this_stripe; + unsigned long swidth, stripe_off, this_stripe; + int magic = lsm->lsm_magic; if (file_size == OBD_OBJECT_EOF) return OBD_OBJECT_EOF; + LASSERT(lsm_op_find(magic) != NULL); + lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &file_size, + &swidth); + /* do_div(a, b) returns a % b, and a = a / b */ stripe_off = do_div(file_size, swidth); @@ -234,10 +247,15 @@ int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno, int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off) { unsigned long ssize = lsm->lsm_stripe_size; - unsigned long swidth = ssize * lsm->lsm_stripe_count; - unsigned long stripe_off; + unsigned long swidth, stripe_off; + obd_off offset = lov_off; + int magic = lsm->lsm_magic; + + LASSERT(lsm_op_find(magic) != NULL); + lsm_op_find(magic)->lsm_stripe_by_offset(lsm, NULL, &lov_off, &swidth); stripe_off = do_div(lov_off, swidth); - return stripe_off / ssize; + return (stripe_off/ssize + + lsm_op_find(magic)->lsm_stripe_index_by_offset(lsm, offset)); } diff --git a/lustre/lov/lov_pack.c b/lustre/lov/lov_pack.c index d930d30..ab41749 100644 --- a/lustre/lov/lov_pack.c +++ b/lustre/lov/lov_pack.c @@ -56,6 +56,19 @@ void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm) le64_to_cpu(lod->l_object_id)); } +void lov_dump_lmm_join(int level, struct lov_mds_md_join *lmmj) +{ + + CDEBUG(level, "objid "LPX64", magic 0x%08X, pattern %#X\n", + le64_to_cpu(lmmj->lmmj_md.lmm_object_id), + le32_to_cpu(lmmj->lmmj_md.lmm_magic), + le32_to_cpu(lmmj->lmmj_md.lmm_pattern)); + CDEBUG(level,"stripe_size %u, stripe_count %u extent_count %u \n", + le32_to_cpu(lmmj->lmmj_md.lmm_stripe_size), + 
le32_to_cpu(lmmj->lmmj_md.lmm_stripe_count), + le32_to_cpu(lmmj->lmmj_extent_count)); +} + #define LMM_ASSERT(test) \ do { \ if (!(test)) lov_dump_lmm(D_ERROR, lmm); \ @@ -139,10 +152,10 @@ int lov_get_stripecnt(struct lov_obd *lov, int stripe_count) { if (!stripe_count) stripe_count = lov->desc.ld_default_stripe_count; - if (!stripe_count) - stripe_count = 1; if (stripe_count > lov->desc.ld_active_tgt_count) stripe_count = lov->desc.ld_active_tgt_count; + if (!stripe_count) + stripe_count = 1; /* for now, we limit the stripe count directly, when bug 4424 is * fixed this needs to be somewhat dynamic based on whether ext3 * can handle larger EA sizes. */ @@ -152,141 +165,75 @@ int lov_get_stripecnt(struct lov_obd *lov, int stripe_count) return stripe_count; } -static int lov_verify_lmm_v1(struct lov_mds_md_v1 *lmm, int lmm_bytes, - int *stripe_count) -{ - if (lmm_bytes < sizeof(*lmm)) { - CERROR("lov_mds_md too small: %d, need at least %d\n", - lmm_bytes, (int)sizeof(*lmm)); - return -EINVAL; - } - - if (lmm->lmm_magic != le32_to_cpu(LOV_MAGIC_V1)) { - CERROR("bad disk LOV MAGIC: 0x%08X\n", - le32_to_cpu(*(__u32 *)lmm)); - return -EINVAL; - } - - *stripe_count = le32_to_cpu(lmm->lmm_stripe_count); - - if (*stripe_count == 0) { - CERROR("bad stripe count %d\n", *stripe_count); - lov_dump_lmm_v1(D_WARNING, lmm); - return -EINVAL; - } - - if (lmm_bytes < lov_mds_md_v1_size(*stripe_count)) { - CERROR("LOV EA too small: %d, need %d\n", - lmm_bytes, lov_mds_md_v1_size(*stripe_count)); - lov_dump_lmm_v1(D_WARNING, lmm); - return -EINVAL; - } - - if (lmm->lmm_object_id == 0) { - CERROR("zero object id\n"); - lov_dump_lmm_v1(D_WARNING, lmm); - return -EINVAL; - } - - if (lmm->lmm_pattern != cpu_to_le32(LOV_PATTERN_RAID0)) { - CERROR("bad striping pattern\n"); - lov_dump_lmm_v1(D_WARNING, lmm); - return -EINVAL; - } - - if (lmm->lmm_stripe_size == 0 || - (__u64)le32_to_cpu(lmm->lmm_stripe_size) * *stripe_count > ~0UL) { - CERROR("bad stripe size %u\n", - 
le32_to_cpu(lmm->lmm_stripe_size)); - lov_dump_lmm_v1(D_WARNING, lmm); - return -EINVAL; - } - - return 0; -} static int lov_verify_lmm(void *lmm, int lmm_bytes, int *stripe_count) { - switch (le32_to_cpu(*(__u32 *)lmm)) { - case LOV_MAGIC_V1: - return lov_verify_lmm_v1(lmm, lmm_bytes, stripe_count); - default: + int rc; + + if (lsm_op_find(le32_to_cpu(*(__u32 *)lmm)) == NULL) { CERROR("bad disk LOV MAGIC: 0x%08X; dumping V1 LMM:\n", le32_to_cpu(*(__u32 *)lmm)); lov_dump_lmm_v1(D_WARNING, lmm); return -EINVAL; } + rc = lsm_op_find(le32_to_cpu(*(__u32 *)lmm))->lsm_lmm_verify(lmm, + lmm_bytes, stripe_count); + return rc; } -int lov_alloc_memmd(struct lov_stripe_md **lsmp, int stripe_count, int pattern) +int lov_alloc_memmd(struct lov_stripe_md **lsmp, int stripe_count, + int pattern, int magic) { int lsm_size = lov_stripe_md_size(stripe_count); struct lov_oinfo *loi; int i; + ENTRY; + + CDEBUG(D_INFO, "alloc lsm, stripe_count %d, lsm_size %d\n", + stripe_count, lsm_size); OBD_ALLOC(*lsmp, lsm_size); - if (!*lsmp) - return -ENOMEM; + if (!*lsmp) { + CERROR("can not allocate lsmp lsm_size %d stripe_count %d\n", + lsm_size, stripe_count); + RETURN(-ENOMEM); + } spin_lock_init(&(*lsmp)->lsm_lock); - (*lsmp)->lsm_magic = LOV_MAGIC; + (*lsmp)->lsm_magic = magic; (*lsmp)->lsm_stripe_count = stripe_count; (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count; (*lsmp)->lsm_xfersize = PTLRPC_MAX_BRW_SIZE * stripe_count; (*lsmp)->lsm_pattern = pattern; (*lsmp)->lsm_oinfo[0].loi_ost_idx = ~0; - + for (i = 0, loi = (*lsmp)->lsm_oinfo; i < stripe_count; i++, loi++) loi_init(loi); - return lsm_size; + RETURN(lsm_size); } void lov_free_memmd(struct lov_stripe_md **lsmp) { - OBD_FREE(*lsmp, lov_stripe_md_size((*lsmp)->lsm_stripe_count)); + struct lov_stripe_md *lsm = *lsmp; + + LASSERT(lsm_op_find(lsm->lsm_magic) != NULL); + lsm_op_find(lsm->lsm_magic)->lsm_free(lsm); + *lsmp = NULL; } -int lov_unpackmd_v1(struct lov_obd *lov, struct lov_stripe_md *lsm, - struct lov_mds_md_v1 
*lmm) -{ - struct lov_oinfo *loi; - int i; - - lsm->lsm_object_id = le64_to_cpu(lmm->lmm_object_id); - lsm->lsm_object_gr = le64_to_cpu(lmm->lmm_object_gr); - lsm->lsm_stripe_size = le32_to_cpu(lmm->lmm_stripe_size); - lsm->lsm_pattern = le32_to_cpu(lmm->lmm_pattern); - lsm->lsm_xfersize = lsm->lsm_stripe_size * lsm->lsm_stripe_count; - - for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++) { - /* XXX LOV STACKING call down to osc_unpackmd() */ - loi->loi_id = le64_to_cpu(lmm->lmm_objects[i].l_object_id); - loi->loi_gr = le64_to_cpu(lmm->lmm_objects[i].l_object_gr); - loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx); - loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen); - if (loi->loi_ost_idx > lov->desc.ld_tgt_count) { - CERROR("OST index %d more than OST count %d\n", - loi->loi_ost_idx, lov->desc.ld_tgt_count); - lov_dump_lmm_v1(D_WARNING, lmm); - return -EINVAL; - } - loi++; - } - - return 0; -} /* Unpack LOV object metadata from disk storage. It is packed in LE byte * order and is opaque to the networking layer. 
*/ -int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, +int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, struct lov_mds_md *lmm, int lmm_bytes) { struct obd_device *obd = class_exp2obd(exp); struct lov_obd *lov = &obd->u.lov; int rc = 0, stripe_count, lsm_size; + __u32 magic; ENTRY; /* If passed an MDS struct use values from there, otherwise defaults */ @@ -294,8 +241,10 @@ int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, rc = lov_verify_lmm(lmm, lmm_bytes, &stripe_count); if (rc) RETURN(rc); + magic = le32_to_cpu(lmm->lmm_magic); } else { stripe_count = lov_get_stripecnt(lov, 0); + magic = LOV_MAGIC; } /* If we aren't passed an lsmp struct, we just want the size */ @@ -309,7 +258,8 @@ int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, RETURN(0); } - lsm_size = lov_alloc_memmd(lsmp, stripe_count, LOV_PATTERN_RAID0); + lsm_size = lov_alloc_memmd(lsmp, stripe_count, LOV_PATTERN_RAID0, + magic); if (lsm_size < 0) RETURN(lsm_size); @@ -317,12 +267,8 @@ int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, if (!lmm) RETURN(lsm_size); - switch (le32_to_cpu(lmm->lmm_magic)) { - case LOV_MAGIC_V1: - rc = lov_unpackmd_v1(lov, *lsmp, lmm); - break; - } - + LASSERT(lsm_op_find(magic) != NULL); + rc = lsm_op_find(magic)->lsm_unpackmd(lov, *lsmp, lmm); if (rc) { lov_free_memmd(lsmp); RETURN(rc); @@ -396,7 +342,7 @@ int lov_setstripe(struct obd_export *exp, struct lov_stripe_md **lsmp, RETURN(-EINVAL); } - rc = lov_alloc_memmd(lsmp, stripe_count, lum.lmm_pattern); + rc = lov_alloc_memmd(lsmp, stripe_count, lum.lmm_pattern, LOV_MAGIC); if (rc < 0) RETURN(rc); diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index 6d7f3b0..869accf 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -53,6 +53,7 @@ #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) #include #endif +#include #ifdef EXT3_MULTIBLOCK_ALLOCATOR #include @@ -72,6 +73,16 @@ struct 
fsfilt_cb_data { #define EXT3_XATTR_INDEX_TRUSTED 4 #endif +static char *fsfilt_ext3_label(struct super_block *sb) +{ + return EXT3_SB(sb)->s_es->s_volume_name; +} + +static char *fsfilt_ext3_uuid(struct super_block *sb) +{ + return EXT3_SB(sb)->s_es->s_uuid; +} + /* * We don't currently need any additional blocks for rmdir and * unlink transactions because we are storing the OST oa_id inside @@ -153,6 +164,19 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private, nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) + EXT3_DELETE_TRANS_BLOCKS * logs; break; + case FSFILT_OP_JOIN: + /* delete 2 file(file + array id) + create 1 file (array id) + * create/update logs for each stripe */ + nblocks += 2 * EXT3_DELETE_TRANS_BLOCKS; + + /*create array log for head file*/ + nblocks += 3; + nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS + + EXT3_SINGLEDATA_TRANS_BLOCKS); + /*update head file array */ + nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS + + EXT3_DATA_TRANS_BLOCKS; + break; default: CERROR("unknown transaction start op %d\n", op); LBUG(); } @@ -414,13 +438,15 @@ static int fsfilt_ext3_setattr(struct dentry *dentry, void *handle, /* make sure _something_ gets set - so new inode * goes to disk (probably won't work over XFS */ if (!(iattr->ia_valid & (ATTR_MODE | ATTR_MTIME | ATTR_CTIME))){ - iattr->ia_valid |= ATTR_MODE; - iattr->ia_mode = inode->i_mode; + iattr->ia_valid |= ATTR_MTIME; + iattr->ia_mtime = inode->i_mtime; } } /* Don't allow setattr to change file type */ - iattr->ia_mode = (inode->i_mode & S_IFMT)|(iattr->ia_mode & ~S_IFMT); + if (iattr->ia_valid & ATTR_MODE) + iattr->ia_mode = (inode->i_mode & S_IFMT) | + (iattr->ia_mode & ~S_IFMT); /* We set these flags on the client, but have already checked perms * so don't confuse inode_change_ok. 
*/ @@ -435,7 +461,6 @@ static int fsfilt_ext3_setattr(struct dentry *dentry, void *handle, } unlock_kernel(); - return rc; } @@ -445,6 +470,12 @@ static int fsfilt_ext3_iocontrol(struct inode * inode, struct file *file, int rc = 0; ENTRY; + /* FIXME: Can't do this because of nested transaction deadlock */ + if (cmd == EXT3_IOC_SETFLAGS && (*(int *)arg) & EXT3_JOURNAL_DATA_FL) { + CERROR("can't set data journal flag on file\n"); + RETURN(-EPERM); + } + if (inode->i_fop->ioctl) rc = inode->i_fop->ioctl(inode, file, cmd, arg); else @@ -454,11 +485,11 @@ static int fsfilt_ext3_iocontrol(struct inode * inode, struct file *file, } static int fsfilt_ext3_set_md(struct inode *inode, void *handle, - void *lmm, int lmm_size) + void *lmm, int lmm_size, const char *name) { int rc; - LASSERT(down_trylock(&inode->i_sem) != 0); + LASSERT_SEM_LOCKED(&inode->i_sem); if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */) CWARN("setting EA on %lu/%u again... interesting\n", @@ -466,7 +497,7 @@ static int fsfilt_ext3_set_md(struct inode *inode, void *handle, lock_24kernel(); rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_TRUSTED, - XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size, 0); + name, lmm, lmm_size, 0); unlock_24kernel(); @@ -477,15 +508,16 @@ static int fsfilt_ext3_set_md(struct inode *inode, void *handle, } /* Must be called with i_sem held */ -static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size) +static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size, + const char *name) { int rc; - LASSERT(down_trylock(&inode->i_sem) != 0); + LASSERT_SEM_LOCKED(&inode->i_sem); lock_24kernel(); rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, - XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size); + name, lmm, lmm_size); unlock_24kernel(); /* This gives us the MD size */ @@ -494,7 +526,7 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size) if (rc < 0) { CDEBUG(D_INFO, "error getting EA %d/%s from inode %lu: rc %d\n", - 
EXT3_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA, + EXT3_XATTR_INDEX_TRUSTED, name, inode->i_ino, rc); memset(lmm, 0, lmm_size); return (rc == -ENODATA) ? 0 : rc; @@ -737,6 +769,26 @@ static int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path, return bg_start + colour + block; } +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#include +static void ll_unmap_underlying_metadata(struct super_block *sb, + unsigned long blocknr) +{ + struct buffer_head *old_bh; + + old_bh = get_hash_table(sb->s_dev, blocknr, sb->s_blocksize); + if (old_bh) { + mark_buffer_clean(old_bh); + wait_on_buffer(old_bh); + clear_bit(BH_Req, &old_bh->b_state); + __brelse(old_bh); + } +} +#else +#define ll_unmap_underlying_metadata(sb, blocknr) \ + unmap_underlying_metadata((sb)->s_bdev, blocknr) +#endif + static int ext3_ext_new_extent_cb(struct ext3_extents_tree *tree, struct ext3_ext_path *path, struct ext3_ext_cache *cex) @@ -844,8 +896,6 @@ out: unlock_24kernel(); map: if (err >= 0) { - struct block_device *bdev = inode->i_sb->s_bdev; - /* map blocks */ if (bp->num == 0) { CERROR("hmm. why do we find this extent?\n"); @@ -868,10 +918,9 @@ map: } else { *(bp->created) = 1; /* unmap any possible underlying metadata from - * the block device mapping. bug 6998. - * This only compiles on 2.6, but there are - * no users of mballoc on 2.4. */ - unmap_underlying_metadata(bdev, *(bp->blocks)); + * the block device mapping. bug 6998. 
*/ + ll_unmap_underlying_metadata(inode->i_sb, + *(bp->blocks)); } bp->created++; bp->blocks++; @@ -958,7 +1007,7 @@ int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page, cleanup: return rc; } -#endif +#endif /* EXT3_MULTIBLOCK_ALLOCATOR */ extern int ext3_map_inode_page(struct inode *inode, struct page *page, unsigned long *blocks, int *created, int create); @@ -1161,6 +1210,8 @@ static int fsfilt_ext3_setup(struct super_block *sb) set_opt(EXT3_SB(sb)->s_mount_opt, PDIROPS); sb->s_flags |= S_PDIROPS; #endif + if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) + CWARN("filesystem doesn't have dir_index feature enabled\n"); return 0; } @@ -1739,6 +1790,41 @@ static int fsfilt_ext3_quotacheck(struct super_block *sb, brelse(bitmap_bh); } + /* read old quota limits from old quota file. (only for the user + * has limits but hasn't file) */ +#ifdef HAVE_QUOTA_SUPPORT + for (i = 0; i < MAXQUOTAS; i++) { + struct list_head id_list; + struct dquot_id *dqid, *tmp; + + if (!Q_TYPESET(oqc, i)) + continue; + + if (qctxt->qckt_first_check[i]) + continue; + + + LASSERT(sb_dqopt(sb)->files[i] != NULL); + INIT_LIST_HEAD(&id_list); +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12) + rc = lustre_get_qids(sb_dqopt(sb)->files[i], NULL, i, &id_list); +#else + rc = lustre_get_qids(NULL, sb_dqopt(sb)->files[i], i, &id_list); +#endif + if (rc) + CERROR("read old limits failed. 
(rc:%d)\n", rc); + + list_for_each_entry_safe(dqid, tmp, &id_list, di_link) { + list_del_init(&dqid->di_link); + + if (!rc) + cqget(sb, qctxt->qckt_hash, &qctxt->qckt_list, + dqid->di_id, i, + qctxt->qckt_first_check[i]); + kfree(dqid); + } + } +#endif /* turn off quota cause we are to dump chk_dqblk to files */ quota_onoff(sb, Q_QUOTAOFF, oqc->qc_type); @@ -1766,7 +1852,7 @@ out: #ifdef HAVE_QUOTA_SUPPORT static int fsfilt_ext3_quotainfo(struct lustre_quota_info *lqi, int type, - int cmd, struct list_head *list) + int cmd) { int rc = 0; ENTRY; @@ -1789,9 +1875,6 @@ static int fsfilt_ext3_quotainfo(struct lustre_quota_info *lqi, int type, case QFILE_INIT_INFO: rc = lustre_init_quota_info(lqi, type); break; - case QFILE_GET_QIDS: - rc = lustre_get_qids(lqi, type, list); - break; default: CERROR("Unsupported admin quota file cmd %d\n", cmd); LBUG(); @@ -1800,6 +1883,12 @@ static int fsfilt_ext3_quotainfo(struct lustre_quota_info *lqi, int type, RETURN(rc); } +static int fsfilt_ext3_qids(struct file *file, struct inode *inode, int type, + struct list_head *list) +{ + return lustre_get_qids(file, inode, type, list); +} + static int fsfilt_ext3_dquot(struct lustre_dquot *dquot, int cmd) { int rc = 0; @@ -1839,6 +1928,8 @@ static int fsfilt_ext3_dquot(struct lustre_dquot *dquot, int cmd) static struct fsfilt_operations fsfilt_ext3_ops = { .fs_type = "ext3", .fs_owner = THIS_MODULE, + .fs_label = fsfilt_ext3_label, + .fs_uuid = fsfilt_ext3_uuid, .fs_start = fsfilt_ext3_start, .fs_brw_start = fsfilt_ext3_brw_start, .fs_commit = fsfilt_ext3_commit, @@ -1863,6 +1954,7 @@ static struct fsfilt_operations fsfilt_ext3_ops = { .fs_quotacheck = fsfilt_ext3_quotacheck, #ifdef HAVE_QUOTA_SUPPORT .fs_quotainfo = fsfilt_ext3_quotainfo, + .fs_qids = fsfilt_ext3_qids, .fs_dquot = fsfilt_ext3_dquot, #endif }; diff --git a/lustre/lvfs/fsfilt_reiserfs.c b/lustre/lvfs/fsfilt_reiserfs.c index e5d93f4..20cbb3f 100644 --- a/lustre/lvfs/fsfilt_reiserfs.c +++ b/lustre/lvfs/fsfilt_reiserfs.c @@ 
-125,7 +125,7 @@ static int fsfilt_reiserfs_setattr(struct dentry *dentry, void *handle, } static int fsfilt_reiserfs_set_md(struct inode *inode, void *handle, - void *lmm, int lmm_size) + void *lmm, int lmm_size, const char *name) { /* XXX write stripe data into MDS file itself */ CERROR("not implemented yet\n"); @@ -133,7 +133,8 @@ static int fsfilt_reiserfs_set_md(struct inode *inode, void *handle, return -ENOSYS; } -static int fsfilt_reiserfs_get_md(struct inode *inode, void *lmm, int lmm_size) +static int fsfilt_reiserfs_get_md(struct inode *inode, void *lmm, int lmm_size, + const char *name) { if (lmm == NULL) return inode->i_size; diff --git a/lustre/lvfs/lvfs_linux.c b/lustre/lvfs/lvfs_linux.c index 3f9e289..1aec9f9 100644 --- a/lustre/lvfs/lvfs_linux.c +++ b/lustre/lvfs/lvfs_linux.c @@ -194,8 +194,10 @@ void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx, atomic_read(¤t->fs->pwdmnt->mnt_count)); */ - LASSERT(current->fs->pwd == new_ctx->pwd); - LASSERT(current->fs->pwdmnt == new_ctx->pwdmnt); + LASSERTF(current->fs->pwd == new_ctx->pwd, "%p != %p\n", + current->fs->pwd, new_ctx->pwd); + LASSERTF(current->fs->pwdmnt == new_ctx->pwdmnt, "%p != %p\n", + current->fs->pwdmnt, new_ctx->pwdmnt); set_fs(saved->fs); set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd); @@ -469,6 +471,26 @@ EXPORT_SYMBOL(lvfs_set_rdonly); EXPORT_SYMBOL(lvfs_check_rdonly); EXPORT_SYMBOL(lvfs_clear_rdonly); +int lvfs_check_io_health(struct obd_device *obd, struct file *file) +{ + char *write_page = NULL; + loff_t offset = 0; + int rc = 0; + ENTRY; + + OBD_ALLOC(write_page, PAGE_SIZE); + if (!write_page) + RETURN(-ENOMEM); + + rc = fsfilt_write_record(obd, file, write_page, PAGE_SIZE, &offset, 1); + + OBD_FREE(write_page, PAGE_SIZE); + + CDEBUG(D_INFO, "write 1 page synchronously for checking io rc %d\n",rc); + RETURN(rc); +} +EXPORT_SYMBOL(lvfs_check_io_health); + static int __init lvfs_linux_init(void) { RETURN(0); diff --git a/lustre/mdc/lproc_mdc.c 
b/lustre/mdc/lproc_mdc.c index b78f317..0092084 100644 --- a/lustre/mdc/lproc_mdc.c +++ b/lustre/mdc/lproc_mdc.c @@ -33,6 +33,7 @@ static struct lprocfs_vars lprocfs_obd_vars[] = { { "uuid", lprocfs_rd_uuid, 0, 0 }, { "ping", 0, lprocfs_wr_ping, 0 }, + { "connect_flags", lprocfs_rd_connect_flags, 0, 0 }, { "blocksize", lprocfs_rd_blksize, 0, 0 }, { "kbytestotal", lprocfs_rd_kbytestotal, 0, 0 }, { "kbytesfree", lprocfs_rd_kbytesfree, 0, 0 }, diff --git a/lustre/mdc/mdc_lib.c b/lustre/mdc/mdc_lib.c index d459deb..dddfca5 100644 --- a/lustre/mdc/mdc_lib.c +++ b/lustre/mdc/mdc_lib.c @@ -39,12 +39,12 @@ #endif #endif -void mdc_readdir_pack(struct ptlrpc_request *req, __u64 offset, __u32 size, - struct ll_fid *mdc_fid) +void mdc_readdir_pack(struct ptlrpc_request *req, int pos, __u64 offset, + __u32 size, struct ll_fid *mdc_fid) { struct mds_body *b; - b = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*b)); + b = lustre_msg_buf(req->rq_reqmsg, pos, sizeof (*b)); b->fsuid = current->fsuid; b->fsgid = current->fsgid; b->capability = current->cap_effective; @@ -63,9 +63,15 @@ static void mdc_pack_body(struct mds_body *b) b->capability = current->cap_effective; } -void mdc_pack_req_body(struct ptlrpc_request *req) +void mdc_pack_req_body(struct ptlrpc_request *req, int offset, + __u64 valid, struct ll_fid *fid, int ea_size) { - struct mds_body *b = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*b)); + struct mds_body *b = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*b)); + + if (fid) + b->fid1 = *fid; + b->valid = valid; + b->eadatasize = ea_size; mdc_pack_body(b); } @@ -111,10 +117,22 @@ static __u32 mds_pack_open_flags(__u32 flags) ((flags & O_APPEND) ? MDS_OPEN_APPEND : 0) | ((flags & O_SYNC) ? MDS_OPEN_SYNC : 0) | ((flags & O_DIRECTORY) ? MDS_OPEN_DIRECTORY : 0) | + ((flags & O_JOIN_FILE) ? 
MDS_OPEN_JOIN_FILE : 0) | 0; } /* packing of MDS records */ +void mdc_join_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *op_data, __u64 head_size) +{ + struct mds_rec_join *rec; + + rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*rec)); + LASSERT(rec != NULL); + rec->jr_fid = op_data->fid2; + rec->jr_headsize = head_size; +} + void mdc_open_pack(struct ptlrpc_request *req, int offset, struct mdc_op_data *op_data, __u32 mode, __u64 rdev, __u32 flags, const void *lmm, int lmmlen) @@ -150,11 +168,11 @@ void mdc_open_pack(struct ptlrpc_request *req, int offset, } } -void mdc_setattr_pack(struct ptlrpc_request *req, struct mdc_op_data *data, - struct iattr *iattr, void *ea, int ealen, - void *ea2, int ea2len) +void mdc_setattr_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data, struct iattr *iattr, + void *ea, int ealen, void *ea2, int ea2len) { - struct mds_rec_setattr *rec = lustre_msg_buf(req->rq_reqmsg, 0, + struct mds_rec_setattr *rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); rec->sa_opcode = REINT_SETATTR; rec->sa_fsuid = current->fsuid; @@ -182,12 +200,12 @@ void mdc_setattr_pack(struct ptlrpc_request *req, struct mdc_op_data *data, if (ealen == 0) return; - memcpy(lustre_msg_buf(req->rq_reqmsg, 1, ealen), ea, ealen); + memcpy(lustre_msg_buf(req->rq_reqmsg, offset + 1, ealen), ea, ealen); if (ea2len == 0) return; - memcpy(lustre_msg_buf(req->rq_reqmsg, 2, ea2len), ea2, ea2len); + memcpy(lustre_msg_buf(req->rq_reqmsg, offset + 2, ea2len), ea2, ea2len); } void mdc_unlink_pack(struct ptlrpc_request *req, int offset, @@ -265,7 +283,7 @@ void mdc_rename_pack(struct ptlrpc_request *req, int offset, } } -void mdc_getattr_pack(struct ptlrpc_request *req, int valid, int offset, +void mdc_getattr_pack(struct ptlrpc_request *req, int offset, int valid, int flags, struct mdc_op_data *data) { struct mds_body *b; diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index cdc09a5..f4f23c4 100644 --- 
a/lustre/mdc/mdc_locks.c +++ b/lustre/mdc/mdc_locks.c @@ -54,6 +54,11 @@ void it_set_disposition(struct lookup_intent *it, int flag) } EXPORT_SYMBOL(it_set_disposition); +void it_clear_disposition(struct lookup_intent *it, int flag) +{ + it->d.lustre.it_disposition &= ~flag; +} + static int it_to_lock_mode(struct lookup_intent *it) { /* CREAT needs to be tested before open (both could be set) */ @@ -242,39 +247,63 @@ int mdc_enqueue(struct obd_export *exp, struct ldlm_res_id res_id = { .name = {data->fid1.id, data->fid1.generation} }; ldlm_policy_data_t policy = { .l_inodebits = { MDS_INODELOCK_LOOKUP } }; - int size[5] = {sizeof(struct ldlm_request), sizeof(struct ldlm_intent)}; - int rc, flags = extra_lock_flags | LDLM_FL_HAS_INTENT; - int repbufcnt = 3, repsize[4] = {sizeof(struct ldlm_reply), - sizeof(struct mds_body), - obddev->u.cli.cl_max_mds_easize}; - struct ldlm_reply *dlm_rep; - struct ldlm_intent *lit; struct ldlm_request *lockreq; + struct ldlm_intent *lit; + int size[6] = {[MDS_REQ_INTENT_LOCKREQ_OFF] = sizeof(*lockreq), + [MDS_REQ_INTENT_IT_OFF] = sizeof(*lit) }; + struct ldlm_reply *dlm_rep; + int repsize[4] = {sizeof(*dlm_rep), + sizeof(struct mds_body), + obddev->u.cli.cl_max_mds_easize}; void *eadata; unsigned long irqflags; + int repbufcnt = 3, req_buffers = 2; + int rc, flags = extra_lock_flags | LDLM_FL_HAS_INTENT; ENTRY; + LASSERTF(lock_type == LDLM_IBITS, "lock type %d\n", lock_type); // LDLM_DEBUG_NOLOCK("mdsintent=%s,name=%s,dir=%lu", // ldlm_it2str(it->it_op), it_name, it_inode->i_ino); if (it->it_op & IT_OPEN) { it->it_create_mode |= S_IFREG; - size[2] = sizeof(struct mds_rec_create); - size[3] = data->namelen + 1; + size[req_buffers++] = sizeof(struct mds_rec_create); + size[req_buffers++] = data->namelen + 1; /* As an optimization, we allocate an RPC request buffer for * at least a default-sized LOV EA even if we aren't sending * one. 
We grow the whole request to the next power-of-two * size since we get that much from a slab allocation anyways. * This avoids an allocation below in the common case where * we need to save a default-sized LOV EA for open replay. */ - size[4] = max(lmmsize, obddev->u.cli.cl_default_mds_easize); - rc = lustre_msg_size(5, size); + size[req_buffers++] = max(lmmsize, + obddev->u.cli.cl_default_mds_easize); + rc = lustre_msg_size(req_buffers, size); if (rc & (rc - 1)) - size[4] = min(size[4] + round_up(rc) - rc, - obddev->u.cli.cl_max_mds_easize); - req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, - 5, size, NULL); + size[req_buffers - 1] = min(size[req_buffers - 1] + + round_up(rc) - rc, + obddev->u.cli.cl_max_mds_easize); + + if (it->it_flags & O_JOIN_FILE) { + __u64 head_size = *(__u32*)cb_data; + __u32 tsize = *(__u32*)lmm; + + /* join is like an unlink of the tail */ + policy.l_inodebits.bits = MDS_INODELOCK_UPDATE; + size[req_buffers++] = sizeof(struct mds_rec_join); + req = ptlrpc_prep_req(class_exp2cliimp(exp), + LUSTRE_DLM_VERSION, LDLM_ENQUEUE, + req_buffers, size, NULL); + /* when joining file, cb_data and lmm args together + * indicate the head file size*/ + mdc_join_pack(req, req_buffers - 1, data, + (head_size << 32) | tsize); + cb_data = NULL; + lmm = NULL; + } else + req = ptlrpc_prep_req(class_exp2cliimp(exp), + LUSTRE_DLM_VERSION, LDLM_ENQUEUE, + req_buffers, size, NULL); if (!req) RETURN(-ENOMEM); @@ -283,60 +312,66 @@ int mdc_enqueue(struct obd_export *exp, spin_unlock_irqrestore (&req->rq_lock, irqflags); /* pack the intent */ - lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit)); + lit = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_INTENT_IT_OFF, + sizeof (*lit)); lit->opc = (__u64)it->it_op; /* pack the intended request */ - mdc_open_pack(req, 2, data, it->it_create_mode, 0, + mdc_open_pack(req, MDS_REQ_INTENT_REC_OFF, data, + it->it_create_mode, 0, it->it_flags, lmm, lmmsize); repsize[repbufcnt++] = LUSTRE_POSIX_ACL_MAX_SIZE; } else if 
(it->it_op & IT_UNLINK) { - size[2] = sizeof(struct mds_rec_unlink); - size[3] = data->namelen + 1; + size[req_buffers++] = sizeof(struct mds_rec_unlink); + size[req_buffers++] = data->namelen + 1; policy.l_inodebits.bits = MDS_INODELOCK_UPDATE; - req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 4, - size, NULL); + req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION, + LDLM_ENQUEUE, req_buffers, size, NULL); if (!req) RETURN(-ENOMEM); /* pack the intent */ - lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit)); + lit = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_INTENT_IT_OFF, + sizeof (*lit)); lit->opc = (__u64)it->it_op; /* pack the intended request */ - mdc_unlink_pack(req, 2, data); - + mdc_unlink_pack(req, MDS_REQ_INTENT_REC_OFF, data); + /* get ready for the reply */ repsize[repbufcnt++] = obddev->u.cli.cl_max_mds_cookiesize; } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) { obd_valid valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE | - OBD_MD_FLACL; - size[2] = sizeof(struct mds_body); - size[3] = data->namelen + 1; + OBD_MD_FLACL | OBD_MD_FLMODEASIZE; + size[req_buffers++] = sizeof(struct mds_body); + size[req_buffers++] = data->namelen + 1; if (it->it_op & IT_GETATTR) policy.l_inodebits.bits = MDS_INODELOCK_UPDATE; - req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 4, - size, NULL); + req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION, + LDLM_ENQUEUE, req_buffers, size, NULL); if (!req) RETURN(-ENOMEM); /* pack the intent */ - lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit)); + lit = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_INTENT_IT_OFF, + sizeof (*lit)); lit->opc = (__u64)it->it_op; /* pack the intended request */ - mdc_getattr_pack(req, valid, 2, it->it_flags, data); - + mdc_getattr_pack(req, MDS_REQ_INTENT_REC_OFF, valid, + it->it_flags, data); + /* get ready for the reply */ repsize[repbufcnt++] = LUSTRE_POSIX_ACL_MAX_SIZE; } else if (it->it_op == IT_READDIR) { policy.l_inodebits.bits = MDS_INODELOCK_UPDATE; - 
req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 1, - size, NULL); + req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION, + LDLM_ENQUEUE, 1, size, NULL); if (!req) RETURN(-ENOMEM); + /* get ready for the reply */ repbufcnt = 1; } else { LBUG(); @@ -348,7 +383,7 @@ int mdc_enqueue(struct obd_export *exp, mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it); rc = ldlm_cli_enqueue(exp, req, obddev->obd_namespace, res_id, - lock_type,&policy,lock_mode, &flags,cb_blocking, + lock_type, &policy,lock_mode, &flags, cb_blocking, cb_completion, NULL, cb_data, NULL, 0, NULL, lockh); mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it); @@ -356,7 +391,9 @@ int mdc_enqueue(struct obd_export *exp, /* Similarly, if we're going to replay this request, we don't want to * actually get a lock, just perform the intent. */ if (req->rq_transno || req->rq_replay) { - lockreq = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*lockreq)); + lockreq = lustre_msg_buf(req->rq_reqmsg, + MDS_REQ_INTENT_LOCKREQ_OFF, + sizeof(*lockreq)); lockreq->lock_flags |= LDLM_FL_INTENT_ONLY; } @@ -424,16 +461,34 @@ int mdc_enqueue(struct obd_export *exp, CERROR ("Missing/short eadata\n"); RETURN (-EPROTO); } + if (body->valid & OBD_MD_FLMODEASIZE) { + if (obddev->u.cli.cl_max_mds_easize < + body->max_mdsize) { + obddev->u.cli.cl_max_mds_easize = + body->max_mdsize; + CDEBUG(D_INFO, "maxeasize become %d\n", + body->max_mdsize); + } + if (obddev->u.cli.cl_max_mds_cookiesize < + body->max_cookiesize) { + obddev->u.cli.cl_max_mds_cookiesize = + body->max_cookiesize; + CDEBUG(D_INFO, "cookiesize become %d\n", + body->max_cookiesize); + } + } /* We save the reply LOV EA in case we have to replay * a create for recovery. If we didn't allocate a * large enough request buffer above we need to * reallocate it here to hold the actual LOV EA. 
*/ if (it->it_op & IT_OPEN) { - if (req->rq_reqmsg->buflens[4] < + int pos = MDS_REQ_INTENT_REC_OFF + 2; + + if (req->rq_reqmsg->buflens[pos] < body->eadatasize) mdc_realloc_openmsg(req, body, size); - lmm = lustre_msg_buf(req->rq_reqmsg, 4, + lmm = lustre_msg_buf(req->rq_reqmsg, pos, body->eadatasize); if (lmm) memcpy(lmm, eadata, body->eadatasize); @@ -543,10 +598,17 @@ int mdc_intent_lock(struct obd_export *exp, struct mdc_op_data *op_data, if (rc < 0) RETURN(rc); memcpy(&it->d.lustre.it_lock_handle, &lockh, sizeof(lockh)); + } else if (!op_data->fid2.id) { + /* DISP_ENQ_COMPLETE set means there is extra reference on + * request referenced from this intent, saved for subsequent + * lookup. This path is executed when we proceed to this + * lookup, so we clear DISP_ENQ_COMPLETE */ + it_clear_disposition(it, DISP_ENQ_COMPLETE); } request = *reqp = it->d.lustre.it_data; LASSERT(request != NULL); LASSERT(request != LP_POISON); + LASSERT(request->rq_repmsg != LP_POISON); /* If we're doing an IT_OPEN which did not result in an actual * successful open, then we need to remove the bit which saves diff --git a/lustre/mdc/mdc_reint.c b/lustre/mdc/mdc_reint.c index 15194fb..3ecafc9 100644 --- a/lustre/mdc/mdc_reint.c +++ b/lustre/mdc/mdc_reint.c @@ -74,19 +74,20 @@ int mdc_setattr(struct obd_export *exp, struct mdc_op_data *data, struct mds_rec_setattr *rec; struct mdc_rpc_lock *rpc_lock; struct obd_device *obd = exp->exp_obd; - int rc, bufcount = 1, size[3] = {sizeof(*rec), ealen, ea2len}; + int size[] = { sizeof(*rec), ealen, ea2len}; + int rc, bufcount = 1; ENTRY; LASSERT(iattr != NULL); if (ealen > 0) { - bufcount = 2; + bufcount++; if (ea2len > 0) - bufcount = 3; + bufcount++; } - req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_REINT, bufcount, - size, NULL); + req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION, + MDS_REINT, bufcount, size, NULL); if (req == NULL) RETURN(-ENOMEM); @@ -100,7 +101,7 @@ int mdc_setattr(struct obd_export *exp, struct 
mdc_op_data *data, if (iattr->ia_valid & (ATTR_MTIME | ATTR_CTIME)) CDEBUG(D_INODE, "setting mtime %lu, ctime %lu\n", LTIME_S(iattr->ia_mtime), LTIME_S(iattr->ia_ctime)); - mdc_setattr_pack(req, data, iattr, ea, ealen, ea2, ea2len); + mdc_setattr_pack(req, MDS_REQ_REC_OFF, data, iattr, ea, ealen, ea2, ea2len); size[0] = sizeof(struct mds_body); req->rq_replen = lustre_msg_size(1, size); @@ -119,8 +120,8 @@ int mdc_create(struct obd_export *exp, struct mdc_op_data *op_data, { struct obd_device *obd = exp->exp_obd; struct ptlrpc_request *req; - int rc, size[3] = {sizeof(struct mds_rec_create), op_data->namelen + 1}; - int level, bufcount = 2; + int size[] = { sizeof(struct mds_rec_create), op_data->namelen + 1, 0}; + int rc, level, bufcount = 2; ENTRY; if (data && datalen) { @@ -128,14 +129,14 @@ int mdc_create(struct obd_export *exp, struct mdc_op_data *op_data, bufcount++; } - req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_REINT, bufcount, - size, NULL); + req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION, + MDS_REINT, bufcount, size, NULL); if (req == NULL) RETURN(-ENOMEM); /* mdc_create_pack fills msg->bufs[1] with name * and msg->bufs[2] with tgt, for symlinks or lov MD data */ - mdc_create_pack(req, 0, op_data, data, datalen, mode, + mdc_create_pack(req, MDS_REQ_REC_OFF, op_data, data, datalen, mode, uid, gid, cap_effective, rdev); size[0] = sizeof(struct mds_body); @@ -162,12 +163,12 @@ int mdc_unlink(struct obd_export *exp, struct mdc_op_data *data, { struct obd_device *obd = class_exp2obd(exp); struct ptlrpc_request *req = *request; - int rc, size[2] = {sizeof(struct mds_rec_unlink), data->namelen + 1}; + int rc, size[] = { sizeof(struct mds_rec_unlink), data->namelen + 1}; ENTRY; LASSERT(req == NULL); - req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_REINT, 2, size, - NULL); + req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION, + MDS_REINT, 2, size, NULL); if (req == NULL) RETURN(-ENOMEM); *request = req; @@ -177,7 +178,7 @@ 
int mdc_unlink(struct obd_export *exp, struct mdc_op_data *data, size[2] = obd->u.cli.cl_max_mds_cookiesize; req->rq_replen = lustre_msg_size(3, size); - mdc_unlink_pack(req, 0, data); + mdc_unlink_pack(req, MDS_REQ_REC_OFF, data); rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL); if (rc == -ERESTARTSYS) @@ -190,15 +191,15 @@ int mdc_link(struct obd_export *exp, struct mdc_op_data *data, { struct obd_device *obd = exp->exp_obd; struct ptlrpc_request *req; - int rc, size[2] = {sizeof(struct mds_rec_link), data->namelen + 1}; + int rc, size[] = { sizeof(struct mds_rec_link), data->namelen + 1}; ENTRY; - req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_REINT, 2, size, - NULL); + req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION, + MDS_REINT, 2, size, NULL); if (req == NULL) RETURN(-ENOMEM); - mdc_link_pack(req, 0, data); + mdc_link_pack(req, MDS_REQ_REC_OFF, data); size[0] = sizeof(struct mds_body); req->rq_replen = lustre_msg_size(1, size); @@ -217,16 +218,15 @@ int mdc_rename(struct obd_export *exp, struct mdc_op_data *data, { struct obd_device *obd = exp->exp_obd; struct ptlrpc_request *req; - int rc, size[3] = {sizeof(struct mds_rec_rename), oldlen + 1, - newlen + 1}; + int rc, size[] = { sizeof(struct mds_rec_rename), oldlen +1, newlen +1}; ENTRY; - req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_REINT, 3, size, - NULL); + req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION, + MDS_REINT, 3, size, NULL); if (req == NULL) RETURN(-ENOMEM); - mdc_rename_pack(req, 0, data, old, oldlen, new, newlen); + mdc_rename_pack(req, MDS_REQ_REC_OFF, data, old, oldlen, new, newlen); size[0] = sizeof(struct mds_body); size[1] = obd->u.cli.cl_max_mds_easize; diff --git a/lustre/mds/Makefile.in b/lustre/mds/Makefile.in index 919c9aa..96d7ec7 100644 --- a/lustre/mds/Makefile.in +++ b/lustre/mds/Makefile.in @@ -1,5 +1,5 @@ MODULES := mds mds-objs := mds_log.o mds_unlink_open.o mds_lov.o handler.o mds_reint.o -mds-objs += mds_fs.o lproc_mds.o 
mds_open.o mds_lib.o mds_xattr.o +mds-objs += mds_fs.o lproc_mds.o mds_open.o mds_lib.o mds_xattr.o mds_join.o @INCLUDE_RULES@ diff --git a/lustre/mds/lproc_mds.c b/lustre/mds/lproc_mds.c index 12a4cae..c9958edf 100644 --- a/lustre/mds/lproc_mds.c +++ b/lustre/mds/lproc_mds.c @@ -366,8 +366,8 @@ struct lprocfs_vars lprocfs_mds_obd_vars[] = { lprocfs_wr_group_acquire_expire, 0}, { "group_upcall", lprocfs_rd_group_upcall, lprocfs_wr_group_upcall, 0}, - { "group_flush", 0, lprocfs_wr_group_flush, 0}, - { "group_info", 0, lprocfs_wr_group_info, 0 }, + { "group_flush", 0, lprocfs_wr_group_flush, 0}, + { "group_info", 0, lprocfs_wr_group_info, 0 }, { 0 } }; diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index 2bb066a..1fe60dc 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -48,12 +48,7 @@ #include "mds_internal.h" -/* This limit is arbitrary (32k clients on x86), but it is convenient to use - * 2^n * PAGE_SIZE * 8 for the number of bits that fit an order-n allocation. */ -#define MDS_MAX_CLIENTS (PAGE_SIZE * 8) - -#define LAST_RCVD "last_rcvd" -#define LOV_OBJID "lov_objid" +#define HEALTH_CHECK "health_check" /* Add client data to the MDS. We use a bitmap to locate a free space * in the last_rcvd file if cl_off is -1 (i.e. a new client). 
@@ -81,15 +76,15 @@ int mds_client_add(struct obd_device *obd, struct mds_obd *mds, * there's no need for extra complication here */ if (new_client) { - cl_idx = find_first_zero_bit(bitmap, MDS_MAX_CLIENTS); + cl_idx = find_first_zero_bit(bitmap, LR_MAX_CLIENTS); repeat: - if (cl_idx >= MDS_MAX_CLIENTS || + if (cl_idx >= LR_MAX_CLIENTS || OBD_FAIL_CHECK_ONCE(OBD_FAIL_MDS_CLIENT_ADD)) { - CERROR("no room for clients - fix MDS_MAX_CLIENTS\n"); + CERROR("no room for clients - fix LR_MAX_CLIENTS\n"); return -EOVERFLOW; } if (test_and_set_bit(cl_idx, bitmap)) { - cl_idx = find_next_zero_bit(bitmap, MDS_MAX_CLIENTS, + cl_idx = find_next_zero_bit(bitmap, LR_MAX_CLIENTS, cl_idx); goto repeat; } @@ -204,7 +199,7 @@ int mds_client_free(struct obd_export *exp) static int mds_server_free_data(struct mds_obd *mds) { - OBD_FREE(mds->mds_client_bitmap, MDS_MAX_CLIENTS / 8); + OBD_FREE(mds->mds_client_bitmap, LR_MAX_CLIENTS / 8); OBD_FREE(mds->mds_server_data, sizeof(*mds->mds_server_data)); mds->mds_server_data = NULL; @@ -224,15 +219,15 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file) /* ensure padding in the struct is the correct size */ LASSERT(offsetof(struct mds_server_data, msd_padding) + - sizeof(msd->msd_padding) == MDS_LR_SERVER_SIZE); + sizeof(msd->msd_padding) == LR_SERVER_SIZE); LASSERT(offsetof(struct mds_client_data, mcd_padding) + - sizeof(mcd->mcd_padding) == MDS_LR_CLIENT_SIZE); + sizeof(mcd->mcd_padding) == LR_CLIENT_SIZE); OBD_ALLOC_WAIT(msd, sizeof(*msd)); if (!msd) RETURN(-ENOMEM); - OBD_ALLOC_WAIT(mds->mds_client_bitmap, MDS_MAX_CLIENTS / 8); + OBD_ALLOC_WAIT(mds->mds_client_bitmap, LR_MAX_CLIENTS / 8); if (!mds->mds_client_bitmap) { OBD_FREE(msd, sizeof(*msd)); RETURN(-ENOMEM); @@ -246,14 +241,14 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file) memcpy(msd->msd_uuid, obd->obd_uuid.uuid,sizeof(msd->msd_uuid)); msd->msd_last_transno = 0; mount_count = msd->msd_mount_count = 0; - msd->msd_server_size = 
cpu_to_le32(MDS_LR_SERVER_SIZE); - msd->msd_client_start = cpu_to_le32(MDS_LR_CLIENT_START); - msd->msd_client_size = cpu_to_le16(MDS_LR_CLIENT_SIZE); - msd->msd_feature_rocompat = cpu_to_le32(MDS_ROCOMPAT_LOVOBJID); + msd->msd_server_size = cpu_to_le32(LR_SERVER_SIZE); + msd->msd_client_start = cpu_to_le32(LR_CLIENT_START); + msd->msd_client_size = cpu_to_le16(LR_CLIENT_SIZE); + msd->msd_feature_rocompat = cpu_to_le32(OBD_ROCOMPAT_LOVOBJID); } else { rc = fsfilt_read_record(obd, file, msd, sizeof(*msd), &off); if (rc) { - CERROR("error reading MDS %s: rc = %d\n", LAST_RCVD, rc); + CERROR("error reading MDS %s: rc %d\n", LAST_RCVD, rc); GOTO(err_msd, rc); } if (strcmp(msd->msd_uuid, obd->obd_uuid.uuid) != 0) { @@ -263,23 +258,24 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file) } mount_count = le64_to_cpu(msd->msd_mount_count); } - if (msd->msd_feature_incompat & ~cpu_to_le32(MDS_INCOMPAT_SUPP)) { - CERROR("unsupported incompat feature %x\n", - le32_to_cpu(msd->msd_feature_incompat) & - ~MDS_INCOMPAT_SUPP); + if (msd->msd_feature_incompat & ~cpu_to_le32(MDT_INCOMPAT_SUPP)) { + CERROR("%s: unsupported incompat filesystem feature(s) %x\n", + obd->obd_name, le32_to_cpu(msd->msd_feature_incompat) & + ~MDT_INCOMPAT_SUPP); GOTO(err_msd, rc = -EINVAL); } - if (msd->msd_feature_rocompat & ~cpu_to_le32(MDS_ROCOMPAT_SUPP)) { - CERROR("unsupported read-only feature %x\n", - le32_to_cpu(msd->msd_feature_rocompat) & - ~MDS_ROCOMPAT_SUPP); + if (msd->msd_feature_rocompat & ~cpu_to_le32(MDT_ROCOMPAT_SUPP)) { + CERROR("%s: unsupported read-only filesystem feature(s) %x\n", + obd->obd_name, le32_to_cpu(msd->msd_feature_rocompat) & + ~MDT_ROCOMPAT_SUPP); /* Do something like remount filesystem read-only */ GOTO(err_msd, rc = -EINVAL); } mds->mds_last_transno = le64_to_cpu(msd->msd_last_transno); + msd->msd_feature_compat = cpu_to_le32(OBD_COMPAT_MDT); CDEBUG(D_INODE, "%s: server last_transno: "LPU64"\n", obd->obd_name, mds->mds_last_transno); 
CDEBUG(D_INODE, "%s: server mount_count: "LPU64"\n", @@ -513,11 +509,32 @@ int mds_fs_setup(struct obd_device *obd, struct vfsmount *mnt) file->f_dentry->d_inode->i_mode); GOTO(err_lov_objid, rc = -ENOENT); } + + /* open and test the check io file junk */ + file = filp_open(HEALTH_CHECK, O_RDWR | O_CREAT, 0644); + if (IS_ERR(file)) { + rc = PTR_ERR(file); + CERROR("cannot open/create %s file: rc = %d\n", HEALTH_CHECK, rc); + GOTO(err_lov_objid, rc = PTR_ERR(file)); + } + mds->mds_health_check_filp = file; + if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { + CERROR("%s is not a regular file!: mode = %o\n", HEALTH_CHECK, + file->f_dentry->d_inode->i_mode); + GOTO(err_health_check, rc = -ENOENT); + } + rc = lvfs_check_io_health(obd, file); + if (rc) + GOTO(err_health_check, rc); err_pop: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); return rc; +err_health_check: + if (mds->mds_health_check_filp && + filp_close(mds->mds_health_check_filp, 0)) + CERROR("can't close %s after error\n", HEALTH_CHECK); err_lov_objid: if (mds->mds_lov_objid_filp && filp_close(mds->mds_lov_objid_filp, 0)) CERROR("can't close %s after error\n", LOV_OBJID); @@ -545,8 +562,8 @@ int mds_fs_cleanup(struct obd_device *obd) int rc = 0; if (obd->obd_fail) - CERROR("%s: shutting down for failover; client state will" - " be preserved.\n", obd->obd_name); + CWARN("%s: shutting down for failover; client state will " + "be preserved.\n", obd->obd_name); class_disconnect_exports(obd); /* cleans up client info too */ mds_server_free_data(mds); @@ -564,6 +581,12 @@ int mds_fs_cleanup(struct obd_device *obd) if (rc) CERROR("%s file won't close, rc=%d\n", LOV_OBJID, rc); } + if (mds->mds_health_check_filp) { + rc = filp_close(mds->mds_health_check_filp, 0); + mds->mds_health_check_filp = NULL; + if (rc) + CERROR("%s file won't close, rc=%d\n", HEALTH_CHECK, rc); + } if (mds->mds_objects_dir != NULL) { l_dput(mds->mds_objects_dir); mds->mds_objects_dir = NULL; @@ -595,7 +618,7 @@ int mds_obd_create(struct 
obd_export *exp, struct obdo *oa, { struct mds_obd *mds = &exp->exp_obd->u.mds; struct inode *parent_inode = mds->mds_objects_dir->d_inode; - unsigned int tmpname = ll_insecure_random_int(); + unsigned int tmpname = ll_rand(); struct file *filp; struct dentry *new_child; struct lvfs_run_ctxt saved; @@ -677,7 +700,8 @@ out_pop: } int mds_obd_destroy(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *ea, struct obd_trans_info *oti) + struct lov_stripe_md *ea, struct obd_trans_info *oti, + struct obd_export *md_exp) { struct mds_obd *mds = &exp->exp_obd->u.mds; struct inode *parent_inode = mds->mds_objects_dir->d_inode; diff --git a/lustre/mds/mds_join.c b/lustre/mds/mds_join.c new file mode 100644 index 0000000..fdc3189 --- /dev/null +++ b/lustre/mds/mds_join.c @@ -0,0 +1,503 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * linux/mds/mds_join.c + * Lustre Metadata join handler file + * + * Copyright (c) 2001-2005 Cluster File Systems, Inc. + * Author: Wang Di + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_MDS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mds_internal.h" +#include + +struct mdsea_cb_data { + struct llog_handle *mc_llh; + struct lov_mds_md *mc_lmm; + struct lov_mds_md_join *mc_lmm_join; + __u64 mc_offset; + __u64 mc_headfile_sz; +}; + +static int mdsea_iterate(struct llog_handle *llh_tail, llog_cb_t cb, + void *cbdata) +{ + return llog_process(llh_tail, cb, cbdata, NULL); +} + +static int mds_insert_join_lmm(struct llog_handle *llh, + struct lov_mds_md *lmm, + __u64 start, __u64 len, + struct lov_mds_md_join *lmmj) +{ + struct llog_rec_hdr rec; + struct mds_extent_desc *med; + int sz_med, rc; + ENTRY; + + + sz_med = lov_mds_md_size(le32_to_cpu(lmm->lmm_stripe_count)); + sz_med += 2 * sizeof(__u64); + sz_med = size_round(sz_med); + + rec.lrh_len = cpu_to_le32(sz_med); + rec.lrh_type = cpu_to_le32(LLOG_JOIN_REC); + + CDEBUG(D_INFO, "insert extent "LPU64":"LPU64" lmm \n", start, len); + + OBD_ALLOC(med, sz_med); + if (med == NULL) + RETURN(-ENOMEM); + + med->med_start = start; + med->med_len = len; + memcpy(&med->med_lmm, lmm, + lov_mds_md_size(le32_to_cpu(lmm->lmm_stripe_count))); + + rc = llog_write_rec(llh, &rec, NULL, 0, med, -1); + OBD_FREE(med, sz_med); + + if (lmmj) { + /*modify lmmj for join stripe info*/ + lmmj->lmmj_md.lmm_stripe_count += lmm->lmm_stripe_count; + lmmj->lmmj_extent_count ++; + } + + RETURN(rc); +} + +static int mdsea_append_extent(struct llog_handle *llh_tail, + struct llog_rec_hdr *rec_in_tail, + struct mdsea_cb_data *cbdata) +{ + struct mds_extent_desc *med = + &((struct llog_array_rec *)rec_in_tail)->lmr_med; + int rc; + ENTRY; + + CDEBUG(D_INODE, "insert lmm extent: "LPU64":"LPU64" \n", + med->med_start, med->med_len); + rc = mds_insert_join_lmm(cbdata->mc_llh, &med->med_lmm, + med->med_start + cbdata->mc_headfile_sz, + med->med_len, 
cbdata->mc_lmm_join); + if (rc) { + CERROR("error %d insert the lmm \n", rc); + RETURN(rc); + } + RETURN(LLOG_DEL_RECORD); +} + +static void mds_init_stripe_join(struct lov_mds_md_join *lmmj, + struct lov_mds_md *lmm, + struct llog_logid *logid) +{ + lmmj->lmmj_md.lmm_magic = cpu_to_le32(LOV_MAGIC_JOIN); + lmmj->lmmj_md.lmm_object_id = lmm->lmm_object_id; + lmmj->lmmj_md.lmm_object_gr = lmm->lmm_object_gr; + lmmj->lmmj_md.lmm_pattern = lmm->lmm_pattern; + lmmj->lmmj_md.lmm_stripe_size = lmm->lmm_stripe_size; + lmmj->lmmj_md.lmm_stripe_count = 0; + lmmj->lmmj_extent_count = 0; + lmmj->lmmj_array_id = *logid; +} + +static int mdsea_cancel_last_extent(struct llog_handle *llh_tail, + struct llog_rec_hdr *rec_in_tail, + struct mdsea_cb_data *cbdata) +{ + struct mds_extent_desc *med = + &((struct llog_array_rec *)rec_in_tail)->lmr_med; + + CDEBUG(D_INODE, "extent: "LPU64":"LPU64" \n", med->med_start, + med->med_len); + + LASSERTF(cbdata->mc_offset == med->med_start, + "A hole in the extent "LPU64"--"LPU64"\n", + cbdata->mc_offset, med->med_start); + + if (med->med_len != -1) + cbdata->mc_offset = med->med_start + med->med_len; + + if (med->med_start > cbdata->mc_headfile_sz || (med->med_len == -1)) { + CDEBUG(D_INFO, "del rec offset"LPU64", head size "LPU64" \n", + med->med_start, cbdata->mc_headfile_sz); + if (!cbdata->mc_lmm) { + int stripe = le32_to_cpu(med->med_lmm.lmm_stripe_count); + OBD_ALLOC(cbdata->mc_lmm, lov_mds_md_size(stripe)); + if (!cbdata->mc_lmm) + RETURN(-ENOMEM); + memcpy(cbdata->mc_lmm, &med->med_lmm, + lov_mds_md_size(stripe)); + } + RETURN(LLOG_DEL_RECORD); + } + RETURN(0); +} + +static int mds_adjust_last_extent(struct llog_handle *llh_head, + __u64 head_size) +{ + struct mdsea_cb_data *cbdata; + int rc; + ENTRY; + + OBD_ALLOC_PTR(cbdata); + + if (!cbdata) + RETURN(-ENOMEM); + + cbdata->mc_headfile_sz = head_size; + /*Find the last extent and cancel the record in the lmm*/ + rc = mdsea_iterate(llh_head, (llog_cb_t)mdsea_cancel_last_extent, + 
cbdata); + + if (rc) { + CERROR("can not find the last extent rc=%d\n", rc); + GOTO(exit, rc); + } + + LASSERT(cbdata->mc_lmm); + + CDEBUG(D_INODE, "insert lmm extent: "LPU64":"LPU64" \n", + cbdata->mc_offset, (head_size - cbdata->mc_offset)); + + rc = mds_insert_join_lmm(llh_head, cbdata->mc_lmm, + cbdata->mc_offset, + (head_size - cbdata->mc_offset), + NULL); + if (rc) + CERROR("error insert the lmm rc %d \n", rc); +exit: + if (cbdata && cbdata->mc_lmm) + OBD_FREE(cbdata->mc_lmm, + lov_mds_md_size(cbdata->mc_lmm->lmm_stripe_count)); + if (cbdata) + OBD_FREE_PTR(cbdata); + + RETURN(rc); +} + +static void mds_finish_join(struct mds_obd *mds, struct ptlrpc_request *req, + struct inode *inode, struct lov_mds_md_join *lmmj) +{ + struct mds_body *body = (struct mds_body *) + lustre_msg_buf(req->rq_repmsg, 1, 0); + int max_cookiesize = lmmj->lmmj_md.lmm_stripe_count * + sizeof(struct llog_cookie); + int max_easize = sizeof(*lmmj); + + CDEBUG(D_INFO, "change the max md size from %d to %d \n", + mds->mds_max_mdsize, sizeof(*lmmj)); + + if (mds->mds_max_mdsize < max_easize || + mds->mds_max_cookiesize < max_cookiesize) { + body->max_mdsize = mds->mds_max_mdsize > max_easize ? + mds->mds_max_mdsize : max_easize; + mds->mds_max_mdsize = body->max_mdsize; + body->max_cookiesize = mds->mds_max_cookiesize > max_cookiesize? 
+ mds->mds_max_cookiesize : max_cookiesize; + mds->mds_max_cookiesize = body->max_cookiesize; + body->valid |= OBD_MD_FLMODEASIZE; + } + + if (body->valid & OBD_MD_FLMODEASIZE) + CDEBUG(D_HA, "updating max_mdsize/max_cookiesize: %d/%d\n", + mds->mds_max_mdsize, mds->mds_max_cookiesize); + + mds_pack_inode2fid(&body->fid1, inode); + mds_pack_inode2body(body, inode); +} + +static int mds_join_unlink_tail_inode(struct mds_update_record *rec, + struct ptlrpc_request *req, + struct mds_rec_join *join_rec, + struct lov_mds_md *tail_lmm, + int lmm_size, struct dentry *dchild, + void **handle,struct lustre_handle *lockh) +{ + struct mds_obd *mds = mds_req2mds(req); + struct obd_device *obd = req->rq_export->exp_obd; + struct inode *tail_inode, *head_inode; + struct dentry *de_tailparent = NULL, *de_tail = NULL, *de_head = NULL; + struct lustre_handle dlm_handles[4] = {{0}, {0}, {0}, {0}}; + struct ll_fid head_fid; + int rc; + ENTRY; + + if (lockh) + ldlm_lock_decref(lockh, LCK_EX); + + head_inode = dchild->d_inode; + mdc_pack_fid(&head_fid, head_inode->i_ino, head_inode->i_generation, + head_inode->i_mode & S_IFMT); + + rc = mds_get_parents_children_locked(obd, mds, &join_rec->jr_fid, + &de_tailparent, &head_fid, + &de_head, LCK_PW, rec->ur_name, + rec->ur_namelen, &de_tail, + NULL, 0, NULL, dlm_handles, + LCK_EX); + if (rc) + GOTO(cleanup, rc); + + *lockh = dlm_handles[1]; + LASSERT(de_tailparent); + tail_inode = de_tail->d_inode; + if (tail_inode == NULL) { + CERROR("tail inode doesn't exist(dir %lu,name %s)!\n", + de_tailparent? de_tailparent->d_inode->i_ino : 0, + rec->ur_name); + GOTO(cleanup, rc = -ENOENT); + } + + if (!S_ISREG(tail_inode->i_mode)) { + CERROR("tail file is not a regular file (dir %lu, name %s)!\n", + de_tailparent? 
de_tailparent->d_inode->i_ino : 0, + rec->ur_name); + GOTO(cleanup, rc = -EINVAL); + } + + *handle = fsfilt_start(obd, head_inode, FSFILT_OP_JOIN, NULL); + if (IS_ERR(*handle)) { + rc = PTR_ERR(*handle); + GOTO(cleanup, rc); + } + + rc = mds_get_md(obd, tail_inode, tail_lmm, &lmm_size, 1); + if (rc < 0) /* get md fails */ + GOTO(cleanup, rc); + + LASSERT(le32_to_cpu(tail_lmm->lmm_magic) == LOV_MAGIC_JOIN || + le32_to_cpu(tail_lmm->lmm_magic) == LOV_MAGIC); + + LASSERT(de_tailparent); + rc = vfs_unlink(de_tailparent->d_inode, de_tail); + + if (rc == 0) { + CDEBUG(D_INODE, "delete the tail inode %lu/%u \n", + tail_inode->i_ino, tail_inode->i_generation); + } +cleanup: + if (dlm_handles[2].cookie != 0) + ldlm_lock_decref(&dlm_handles[2], LCK_EX); + + if (dlm_handles[0].cookie != 0) { + if (rc) + ldlm_lock_decref(&dlm_handles[0], LCK_PW); + else + ptlrpc_save_lock(req, &dlm_handles[0], LCK_PW); + } + if (de_tail) + l_dput(de_tail); + + if (de_tailparent) + l_dput(de_tailparent); + + if (de_head) + l_dput(de_head); + + RETURN(rc); +} + +int mds_join_file(struct mds_update_record *rec, struct ptlrpc_request *req, + struct dentry *de_head, struct lustre_handle *lockh) +{ + struct mds_obd *mds = mds_req2mds(req); + struct obd_device *obd = req->rq_export->exp_obd; + struct inode *head_inode = NULL; + struct lvfs_run_ctxt saved; + void *handle = NULL; + struct lov_mds_md *head_lmm, *tail_lmm; + struct lov_mds_md_join *head_lmmj = NULL, *tail_lmmj = NULL; + int lmm_size, rc = 0, cleanup_phase = 0, size; + struct llog_handle *llh_head = NULL, *llh_tail = NULL; + struct llog_ctxt *ctxt; + struct mds_rec_join *join_rec; + ENTRY; + + join_rec = lustre_swab_reqbuf (req, 5, sizeof (*join_rec), + lustre_swab_mds_rec_join); + if (join_rec == NULL) + RETURN (-EFAULT); + + DEBUG_REQ(D_INODE, req,"head "LPU64"/%u, ptail ino "LPU64"/%u, tail %s", + rec->ur_fid1->id, rec->ur_fid1->generation, + join_rec->jr_fid.id, join_rec->jr_fid.generation, + rec->ur_name); + + size = 
mds->mds_max_mdsize; + lmm_size = mds->mds_max_mdsize; + OBD_ALLOC(head_lmm, lmm_size); + OBD_ALLOC(tail_lmm, lmm_size); + if (!head_lmm || !tail_lmm) + GOTO(cleanup, rc = -ENOMEM); + + /* acquire head's dentry */ + LASSERT(de_head); + head_inode = de_head->d_inode; + if (head_inode == NULL) { + CERROR("head inode doesn't exist!\n"); + GOTO(cleanup, rc = -ENOENT); + } + + /*Unlink tail inode and get the lmm back*/ + rc = mds_join_unlink_tail_inode(rec, req, join_rec, tail_lmm, lmm_size, + de_head, &handle, lockh); + if (rc) { + CERROR("unlink tail_inode error %d\n", rc); + GOTO(cleanup, rc); + } + + down(&head_inode->i_sem); + cleanup_phase = 1; + rc = mds_get_md(obd, head_inode, head_lmm, &size, 0); + if (rc < 0) + GOTO(cleanup, rc); + + LASSERTF(le32_to_cpu(head_lmm->lmm_magic) == LOV_MAGIC_JOIN || + le32_to_cpu(head_lmm->lmm_magic) == LOV_MAGIC); + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + ctxt = llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT); + cleanup_phase = 2; + if (le32_to_cpu(head_lmm->lmm_magic) == LOV_MAGIC) { /*simple file */ + struct llog_logid *llog_array; + + rc = llog_create(ctxt, &llh_head, NULL, NULL); + if (rc) { + CERROR("cannot create new log, error = %d\n", rc); + GOTO(cleanup, rc); + } + cleanup_phase = 3; + llog_array = &llh_head->lgh_id; + CDEBUG(D_INFO,"create arrary for %lu with id "LPU64":"LPU64"\n", + head_inode->i_ino, llog_array->lgl_oid, + llog_array->lgl_ogr); + rc = llog_init_handle(llh_head, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(cleanup, rc); + OBD_ALLOC_PTR(head_lmmj); + if (head_lmmj == NULL) + GOTO(cleanup, rc = -ENOMEM); + mds_init_stripe_join(head_lmmj, head_lmm, llog_array); + mds_insert_join_lmm(llh_head, head_lmm, 0,join_rec->jr_headsize, + head_lmmj); + } else { /*head lmm is join file */ + head_lmmj = (struct lov_mds_md_join *)head_lmm; + /* construct and fill extent llog object */ + rc = llog_create(ctxt, &llh_head, + &head_lmmj->lmmj_array_id, NULL); + if (rc) { + CERROR("cannot open existing log, error = 
%d\n", rc); + GOTO(cleanup, rc); + } + cleanup_phase = 3; + rc = llog_init_handle(llh_head, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(cleanup, rc); + rc = mds_adjust_last_extent(llh_head, join_rec->jr_headsize); + if (rc) { + CERROR("can't adjust last extent of obj rc=%d\n", rc); + GOTO(cleanup, rc); + } + } + + if (le32_to_cpu(tail_lmm->lmm_magic) != LOV_MAGIC_JOIN) { + mds_insert_join_lmm(llh_head, tail_lmm, join_rec->jr_headsize, + -1, head_lmmj); + } else { + struct mdsea_cb_data cbdata; + tail_lmmj = (struct lov_mds_md_join *)tail_lmm; + + rc = llog_create(ctxt,&llh_tail,&tail_lmmj->lmmj_array_id,NULL); + if (rc) { + CERROR("cannot open existing log, error = %d\n", rc); + GOTO(cleanup, rc); + } + rc = llog_init_handle(llh_tail, LLOG_F_IS_PLAIN, NULL); + if (rc) { + llog_close(llh_tail); + GOTO(cleanup, rc); + } + cbdata.mc_llh = llh_head; + cbdata.mc_headfile_sz = join_rec->jr_headsize; + cbdata.mc_lmm_join = head_lmmj; + rc = mdsea_iterate(llh_tail, (llog_cb_t)mdsea_append_extent, + &cbdata); + if (rc) { + llog_close(llh_tail); + CERROR("can not append extent log error %d \n", rc); + GOTO(cleanup, rc); + } + rc = llog_destroy(llh_tail); + if (rc) { + llog_close(llh_tail); + CERROR("can not destroy log error %d \n", rc); + GOTO(cleanup, rc); + } + llog_free_handle(llh_tail); + } + LASSERT(head_inode); + CDEBUG(D_INODE, "join finish, set lmm V2 to inode %lu \n", + head_inode->i_ino); + fsfilt_set_md(obd, head_inode, handle, head_lmmj, + sizeof(struct lov_mds_md_join), "lov"); + mds_finish_join(mds, req, head_inode, head_lmmj); +cleanup: + rc = mds_finish_transno(mds, head_inode, handle, req, rc, 0); + switch(cleanup_phase){ + case 3: + llog_close(llh_head); + case 2: + if (head_lmmj && ((void*)head_lmmj != (void*)head_lmm)) + OBD_FREE_PTR(head_lmmj); + + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + case 1: + up(&head_inode->i_sem); + case 0: + if (tail_lmm != NULL) + OBD_FREE(tail_lmm, lmm_size); + if (head_lmm != NULL) + OBD_FREE(head_lmm, lmm_size); + break; 
+ default: + CERROR("invalid cleanup_phase %d\n", cleanup_phase); + LBUG(); + } + req->rq_status = rc; + RETURN(rc); +} + diff --git a/lustre/mds/mds_lib.c b/lustre/mds/mds_lib.c index a81f737..c136494 100644 --- a/lustre/mds/mds_lib.c +++ b/lustre/mds/mds_lib.c @@ -365,12 +365,17 @@ int mds_init_ucred(struct lvfs_ucred *ucred, struct ptlrpc_request *req, LASSERT(body != NULL); /* previously verified & swabbed by caller */ #if CRAY_XT3 - ucred->luc_fsuid = req->rq_uid; -#else - ucred->luc_fsuid = body->fsuid; - ucred->luc_fsgid = body->fsgid; - ucred->luc_cap = body->capability; + if (req->rq_uid != LNET_UID_ANY) { + /* Non-root local cluster client */ + LASSERT (req->rq_uid != 0); + ucred->luc_fsuid = req->rq_uid; + } else #endif + { + ucred->luc_fsuid = body->fsuid; + ucred->luc_fsgid = body->fsgid; + ucred->luc_cap = body->capability; + } ucred->luc_uce = upcall_cache_get_entry(mds->mds_group_hash, ucred->luc_fsuid, diff --git a/lustre/mds/mds_log.c b/lustre/mds/mds_log.c index 2a9e6f3..b14ad93 100644 --- a/lustre/mds/mds_log.c +++ b/lustre/mds/mds_log.c @@ -92,23 +92,24 @@ int mds_log_op_unlink(struct obd_device *obd, struct inode *inode, { struct mds_obd *mds = &obd->u.mds; struct lov_stripe_md *lsm = NULL; - struct llog_ctxt *ctxt; struct llog_unlink_rec *lur; + struct llog_ctxt *ctxt; int rc; ENTRY; if (IS_ERR(mds->mds_osc_obd)) RETURN(PTR_ERR(mds->mds_osc_obd)); - rc = obd_unpackmd(mds->mds_osc_exp, &lsm, - lmm, lmm_size); + rc = obd_unpackmd(mds->mds_osc_exp, &lsm, lmm, lmm_size); if (rc < 0) RETURN(rc); - + rc = obd_checkmd(mds->mds_osc_exp, obd->obd_self_export, lsm); + if (rc) + GOTO(out, rc); /* first prepare unlink log record */ OBD_ALLOC(lur, sizeof(*lur)); if (!lur) - RETURN(-ENOMEM); + GOTO(out, rc = -ENOMEM); lur->lur_hdr.lrh_len = lur->lur_tail.lrt_len = sizeof(*lur); lur->lur_hdr.lrh_type = MDS_UNLINK_REC; @@ -116,9 +117,9 @@ int mds_log_op_unlink(struct obd_device *obd, struct inode *inode, rc = llog_add(ctxt, &lur->lur_hdr, lsm, logcookies, 
cookies_size / sizeof(struct llog_cookie)); - obd_free_memmd(mds->mds_osc_exp, &lsm); OBD_FREE(lur, sizeof(*lur)); - +out: + obd_free_memmd(mds->mds_osc_exp, &lsm); RETURN(rc); } @@ -128,8 +129,8 @@ int mds_log_op_setattr(struct obd_device *obd, struct inode *inode, { struct mds_obd *mds = &obd->u.mds; struct lov_stripe_md *lsm = NULL; - struct llog_ctxt *ctxt; struct llog_setattr_rec *lsr; + struct llog_ctxt *ctxt; int rc; ENTRY; @@ -140,6 +141,10 @@ int mds_log_op_setattr(struct obd_device *obd, struct inode *inode, if (rc < 0) RETURN(rc); + rc = obd_checkmd(mds->mds_osc_exp, obd->obd_self_export, lsm); + if (rc) + GOTO(out, rc); + OBD_ALLOC(lsr, sizeof(*lsr)); if (!lsr) GOTO(out, rc = -ENOMEM); diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index e80e707..fea41d9 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -320,22 +320,22 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, void *lmm_buf; ENTRY; + if (!S_ISREG(inode->i_mode)) + RETURN(0); if (rec->ur_flags & MDS_OPEN_DELAY_CREATE || !(rec->ur_flags & FMODE_WRITE)) RETURN(0); body = lustre_msg_buf(req->rq_repmsg, 1, sizeof(*body)); - if (!S_ISREG(inode->i_mode)) - RETURN(0); if (body->valid & OBD_MD_FLEASIZE) RETURN(0); OBD_ALLOC(*ids, mds->mds_lov_desc.ld_tgt_count * sizeof(**ids)); if (*ids == NULL) RETURN(-ENOMEM); + oti_init(&oti, req); oti.oti_objid = *ids; - oti.oti_thread = req->rq_svc_thread; /* replay case */ if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) { @@ -364,7 +364,7 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, LASSERT(lmm_buf); LASSERT(lmm_bufsize >= lmm_size); memcpy(lmm_buf, lmm, lmm_size); - rc = fsfilt_set_md(obd, inode, *handle, lmm, lmm_size); + rc = fsfilt_set_md(obd, inode, *handle, lmm, lmm_size, "lov"); if (rc) CERROR("open replay failed to set md:%d\n", rc); RETURN(0); @@ -380,7 +380,6 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, oa->o_gid = 0; oa->o_mode = S_IFREG | 
0600; oa->o_id = inode->i_ino; - oa->o_flags = OBD_FL_CREATE_CROW; oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLFLAGS | OBD_MD_FLMODE | OBD_MD_FLUID | OBD_MD_FLGID; oa->o_size = 0; @@ -445,12 +444,6 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, oa->o_generation = body->fid1.generation; oa->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER; - /* do not set CROW flag in setattr path as it is not needed - * there and only confuses setattr code in filter. */ - oa->o_flags &= ~OBD_FL_CREATE_CROW; - if (!oa->o_flags) - oa->o_valid &= ~OBD_MD_FLFLAGS; - rc = obd_setattr(mds->mds_osc_exp, oa, lsm, &oti); if (rc) { CERROR("error setting attrs for inode %lu: rc %d\n", @@ -484,7 +477,7 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, GOTO(out_oa, rc); } - rc = fsfilt_set_md(obd, inode, *handle, lmm, lmm_size); + rc = fsfilt_set_md(obd, inode, *handle, lmm, lmm_size, "lov"); lmm_buf = lustre_msg_buf(req->rq_repmsg, offset, 0); lmm_bufsize = req->rq_repmsg->buflens[offset]; LASSERT(lmm_buf); @@ -569,9 +562,11 @@ static void reconstruct_open(struct mds_update_record *rec, int offset, OBD_MD_FLATIME | OBD_MD_FLMTIME); } - lustre_shrink_reply(req, 2, body->eadatasize, 0); + if (!(rec->ur_flags & MDS_OPEN_JOIN_FILE)) + lustre_shrink_reply(req, 2, body->eadatasize, 0); - if (req->rq_export->exp_connect_flags & OBD_CONNECT_ACL) { + if (req->rq_export->exp_connect_flags & OBD_CONNECT_ACL && + !(rec->ur_flags & MDS_OPEN_JOIN_FILE)) { int acl_off = body->eadatasize ? 
3 : 2; rc = mds_pack_acl(med, dchild->d_inode, req->rq_repmsg, @@ -619,6 +614,11 @@ static void reconstruct_open(struct mds_update_record *rec, int offset, * Now that exp_outstanding_reply is a list, it's just using mfd != NULL * to detect a re-open */ if (mfd == NULL) { + if (rec->ur_flags & MDS_OPEN_JOIN_FILE) { + rc = mds_join_file(rec, req, dchild, NULL); + if (rc) + GOTO(out_dput, rc); + } mntget(mds->mds_vfsmnt); CERROR("Re-opened file \n"); mfd = mds_dentry_open(dchild, mds->mds_vfsmnt, @@ -668,8 +668,8 @@ static int accmode(struct inode *inode, int flags) /* Handles object creation, actual opening, and I/O epoch */ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild, struct mds_body *body, int flags, void **handle, - struct mds_update_record *rec, - struct ldlm_reply *rep) + struct mds_update_record *rec,struct ldlm_reply *rep, + struct lustre_handle *lockh) { struct mds_obd *mds = mds_req2mds(req); struct obd_device *obd = req->rq_export->exp_obd; @@ -680,6 +680,7 @@ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild, /* atomically create objects if necessary */ down(&dchild->d_inode->i_sem); + if (S_ISREG(dchild->d_inode->i_mode) && !(body->valid & OBD_MD_FLEASIZE)) { rc = mds_pack_md(obd, req->rq_repmsg, 2, body, @@ -695,8 +696,15 @@ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild, up(&dchild->d_inode->i_sem); RETURN(-EEXIST); } - - if (!(body->valid & OBD_MD_FLEASIZE)) { + if (rec->ur_flags & MDS_OPEN_JOIN_FILE) { + up(&dchild->d_inode->i_sem); + rc = mds_join_file(rec, req, dchild, lockh); + if (rc) + RETURN(rc); + down(&dchild->d_inode->i_sem); + } + if (!(body->valid & OBD_MD_FLEASIZE) && + !(body->valid & OBD_MD_FLMODEASIZE)) { /* no EA: create objects */ rc = mds_create_objects(req, 2, rec, mds, obd, dchild, handle, &ids); @@ -715,9 +723,11 @@ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild, } up(&dchild->d_inode->i_sem); - 
lustre_shrink_reply(req, 2, body->eadatasize, 0); + if (!(rec->ur_flags & MDS_OPEN_JOIN_FILE)) + lustre_shrink_reply(req, 2, body->eadatasize, 0); - if (req->rq_export->exp_connect_flags & OBD_CONNECT_ACL) { + if (req->rq_export->exp_connect_flags & OBD_CONNECT_ACL && + !(rec->ur_flags & MDS_OPEN_JOIN_FILE)) { int acl_off = body->eadatasize ? 3 : 2; rc = mds_pack_acl(&req->rq_export->exp_mds_data, @@ -767,30 +777,25 @@ static int mds_open_by_fid(struct ptlrpc_request *req, struct ll_fid *fid, if (dchild->d_inode != NULL) { mds_inode_set_orphan(dchild->d_inode); - mds_pack_inode2fid(&body->fid1, dchild->d_inode); - mds_pack_inode2body(body, dchild->d_inode); - intent_set_disposition(rep, DISP_LOOKUP_EXECD); - intent_set_disposition(rep, DISP_LOOKUP_POS); CWARN("Orphan %s found and opened in PENDING directory\n", fidname); - goto open; - } - l_dput(dchild); + } else { + l_dput(dchild); - /* We didn't find it in PENDING so it isn't an orphan. See - * if it was a regular inode that was previously created. */ - dchild = mds_fid2dentry(mds, fid, NULL); - if (IS_ERR(dchild)) - RETURN(PTR_ERR(dchild)); + /* We didn't find it in PENDING so it isn't an orphan. See + * if it was a regular inode that was previously created. */ + dchild = mds_fid2dentry(mds, fid, NULL); + if (IS_ERR(dchild)) + RETURN(PTR_ERR(dchild)); + } mds_pack_inode2fid(&body->fid1, dchild->d_inode); mds_pack_inode2body(body, dchild->d_inode); intent_set_disposition(rep, DISP_LOOKUP_EXECD); intent_set_disposition(rep, DISP_LOOKUP_POS); - open: - rc = mds_finish_open(req, dchild, body, flags, &handle, rec, rep); - rc = mds_finish_transno(mds, dchild ? dchild->d_inode : NULL, handle, + rc = mds_finish_open(req, dchild, body, flags, &handle, rec, rep, NULL); + rc = mds_finish_transno(mds, dchild->d_inode, handle, req, rc, rep ? rep->lock_policy_res1 : 0); /* XXX what do we do here if mds_finish_transno itself failed? 
*/ @@ -798,7 +803,7 @@ static int mds_open_by_fid(struct ptlrpc_request *req, struct ll_fid *fid, RETURN(rc); } -int mds_pin(struct ptlrpc_request *req) +int mds_pin(struct ptlrpc_request *req, int offset) { struct obd_device *obd = req->rq_export->exp_obd; struct mds_body *request_body, *reply_body; @@ -806,7 +811,8 @@ int mds_pin(struct ptlrpc_request *req) int rc, size = sizeof(*reply_body); ENTRY; - request_body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*request_body)); + request_body = lustre_msg_buf(req->rq_reqmsg, offset, + sizeof(*request_body)); rc = lustre_pack_reply(req, 1, &size, NULL); if (rc) @@ -821,9 +827,10 @@ int mds_pin(struct ptlrpc_request *req) RETURN(rc); } -/* Get a lock on the ino to sync with creation WRT inode reuse (bug 2029). - * If child_lockh is NULL we just get the lock as a barrier to wait for - * other holders of this lock, and drop it right away again. */ +/* Get an internal lock on the inode number (but not generation) to sync + * new inode creation with inode unlink (bug 2029). If child_lockh is NULL + * we just get the lock as a barrier to wait for other holders of this lock, + * and drop it right away again. */ int mds_lock_new_child(struct obd_device *obd, struct inode *inode, struct lustre_handle *child_lockh) { @@ -870,7 +877,7 @@ int mds_open(struct mds_update_record *rec, int offset, if (offset == 2) { /* intent */ rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*rep)); body = lustre_msg_buf(req->rq_repmsg, 1, sizeof (*body)); - } else if (offset == 0) { /* non-intent reint */ + } else if (offset == MDS_REQ_REC_OFF) { /* non-intent reint */ body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body)); } else { body = NULL; @@ -882,7 +889,8 @@ int mds_open(struct mds_update_record *rec, int offset, /* Step 0: If we are passed a fid, then we assume the client already * opened this file and is only replaying the RPC, so we open the * inode by fid (at some large expense in security). 
*/ - if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) { + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY && + !(rec->ur_flags & MDS_OPEN_JOIN_FILE)) { if (rec->ur_fid2->id == 0) { struct ldlm_lock *lock = ldlm_handle2lock(child_lockh); if (lock) { @@ -919,7 +927,7 @@ int mds_open(struct mds_update_record *rec, int offset, } /* Step 1: Find and lock the parent */ - if (rec->ur_flags & MDS_OPEN_CREAT) + if (rec->ur_flags & (MDS_OPEN_CREAT | MDS_OPEN_JOIN_FILE)) parent_mode = LCK_EX; dparent = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, parent_mode, &parent_lockh, rec->ur_name, @@ -942,6 +950,13 @@ int mds_open(struct mds_update_record *rec, int offset, cleanup_phase = 1; /* parent dentry and lock */ + if (rec->ur_flags & MDS_OPEN_JOIN_FILE) { + dchild = dget(dparent); + cleanup_phase = 2; /* child dentry */ + acc_mode = accmode(dchild->d_inode, rec->ur_flags); + GOTO(found_child, rc); + } + /* Step 2: Lookup the child */ dchild = ll_lookup_one_len(rec->ur_name, dparent, rec->ur_namelen - 1); if (IS_ERR(dchild)) { @@ -1032,12 +1047,12 @@ int mds_open(struct mds_update_record *rec, int offset, acc_mode = accmode(dchild->d_inode, rec->ur_flags); } - LASSERTF(!mds_inode_is_orphan(dchild->d_inode), "dchild %.*s (%p) inode %p/%lu/%u\n", dchild->d_name.len, dchild->d_name.name, dchild, dchild->d_inode, dchild->d_inode->i_ino, dchild->d_inode->i_generation); +found_child: mds_pack_inode2fid(&body->fid1, dchild->d_inode); mds_pack_inode2body(body, dchild->d_inode); @@ -1051,10 +1066,6 @@ int mds_open(struct mds_update_record *rec, int offset, (acc_mode & MAY_WRITE)) GOTO(cleanup, rc = -EROFS); - /* Can't write to a read-only file */ - if (IS_RDONLY(dchild->d_inode) && (acc_mode & MAY_WRITE) != 0) - GOTO(cleanup, rc = -EPERM); - /* An append-only file must be opened in append mode for * writing */ if (IS_APPEND(dchild->d_inode) && (acc_mode & MAY_WRITE) != 0 && @@ -1095,13 +1106,13 @@ int mds_open(struct mds_update_record *rec, int offset, /* Step 5: mds_open 
it */ rc = mds_finish_open(req, dchild, body, rec->ur_flags, &handle, rec, - rep); + rep, &parent_lockh); GOTO(cleanup, rc); cleanup: rc = mds_finish_transno(mds, dchild ? dchild->d_inode : NULL, handle, req, rc, rep ? rep->lock_policy_res1 : 0); - + cleanup_no_trans: switch (cleanup_phase) { case 2: @@ -1129,7 +1140,7 @@ int mds_open(struct mds_update_record *rec, int offset, else ptlrpc_save_lock (req, &parent_lockh, parent_mode); } - + /* trigger dqacq on the owner of child and parent */ lquota_adjust(quota_interface, obd, qcids, qpids, rc, FSFILT_OP_CREATE); RETURN(rc); @@ -1144,7 +1155,7 @@ int mds_open(struct mds_update_record *rec, int offset, * (it will not even _have_ an entry in last_rcvd anymore). * * Returns EAGAIN if the client needs to get more data and re-close. */ -int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd, +int mds_mfd_close(struct ptlrpc_request *req, int offset,struct obd_device *obd, struct mds_file_data *mfd, int unlink_orphan) { struct inode *inode = mfd->mfd_dentry->d_inode; @@ -1160,7 +1171,7 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd, ENTRY; if (req && req->rq_reqmsg != NULL) - request_body = lustre_msg_buf(req->rq_reqmsg, 0, + request_body = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*request_body)); if (req && req->rq_repmsg != NULL) reply_body = lustre_msg_buf(req->rq_repmsg, 0, @@ -1325,7 +1336,7 @@ out: RETURN(rc); } -int mds_close(struct ptlrpc_request *req) +int mds_close(struct ptlrpc_request *req, int offset) { struct mds_export_data *med = &req->rq_export->exp_mds_data; struct obd_device *obd = req->rq_export->exp_obd; @@ -1342,12 +1353,17 @@ int mds_close(struct ptlrpc_request *req) if (rc) { CERROR("lustre_pack_reply: rc = %d\n", rc); req->rq_status = rc; - /* Continue on to drop local open count even if we can't send the reply */ + /* continue on to drop local open even if we can't send reply */ } else { MDS_CHECK_RESENT(req, mds_reconstruct_generic(req)); } - body = 
lustre_swab_reqbuf(req, 0, sizeof(*body), lustre_swab_mds_body); + CDEBUG(D_HA, "close req->rep_len %d mdsize %d cookiesize %d\n", + req->rq_replen, + obd->u.mds.mds_max_mdsize, obd->u.mds.mds_max_cookiesize); + + body = lustre_swab_reqbuf(req, offset, sizeof(*body), + lustre_swab_mds_body); if (body == NULL) { CERROR("Can't unpack body\n"); req->rq_status = -EFAULT; @@ -1384,9 +1400,10 @@ int mds_close(struct ptlrpc_request *req) } push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - req->rq_status = mds_mfd_close(req, obd, mfd, 1); + req->rq_status = mds_mfd_close(req, offset, obd, mfd, 1); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + mds_shrink_reply(obd, req, body); if (OBD_FAIL_CHECK(OBD_FAIL_MDS_CLOSE_PACK)) { CERROR("test case OBD_FAIL_MDS_CLOSE_PACK\n"); req->rq_status = -ENOMEM; @@ -1396,7 +1413,7 @@ int mds_close(struct ptlrpc_request *req) RETURN(rc); } -int mds_done_writing(struct ptlrpc_request *req) +int mds_done_writing(struct ptlrpc_request *req, int offset) { struct mds_body *body; int rc, size = sizeof(struct mds_body); @@ -1404,7 +1421,8 @@ int mds_done_writing(struct ptlrpc_request *req) MDS_CHECK_RESENT(req, mds_reconstruct_generic(req)); - body = lustre_swab_reqbuf(req, 0, sizeof(*body), lustre_swab_mds_body); + body = lustre_swab_reqbuf(req, offset, sizeof(*body), + lustre_swab_mds_body); if (body == NULL) { CERROR("Can't unpack body\n"); req->rq_status = -EFAULT; diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index 05ac449..8cd37e8 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -74,7 +74,7 @@ static void mds_cancel_cookies_cb(struct obd_device *obd, __u64 transno, CDEBUG(D_HA, "cancelling %d cookies\n", (int)(mlcd->mlcd_cookielen / sizeof(*mlcd->mlcd_cookies))); - rc = obd_unpackmd(obd->u.mds.mds_osc_exp, &lsm, mlcd->mlcd_lmm, + rc = obd_unpackmd(obd->u.mds.mds_osc_exp, &lsm, mlcd->mlcd_lmm, mlcd->mlcd_eadatalen); if (rc < 0) { CERROR("bad LSM cancelling %d log cookies: rc %d\n", @@ -82,6 +82,11 @@ static void 
mds_cancel_cookies_cb(struct obd_device *obd, __u64 transno, rc); } else { ///* XXX 0 normally, SENDNOW for debug */); + rc = obd_checkmd(obd->u.mds.mds_osc_exp, obd->obd_self_export, + lsm); + if (rc) + CERROR("Can not revalidate lsm %p \n", lsm); + ctxt = llog_get_context(obd,mlcd->mlcd_cookies[0].lgc_subsys+1); rc = llog_cancel(ctxt, lsm, mlcd->mlcd_cookielen / sizeof(*mlcd->mlcd_cookies), @@ -110,8 +115,10 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle, /* if the export has already been failed, we have no last_rcvd slot */ if (req->rq_export->exp_failed) { - CWARN("committing transaction for disconnected client %s\n", - req->rq_export->exp_client_uuid.uuid); + CWARN("commit transaction for disconnected client %s: rc %d\n", + req->rq_export->exp_client_uuid.uuid, rc); + if (rc == 0) + rc = -ENOTCONN; if (handle) GOTO(commit, rc); RETURN(rc); @@ -134,7 +141,13 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle, transno = req->rq_reqmsg->transno; if (rc != 0) { - LASSERT(transno == 0); + if (transno != 0) { + CERROR("%s: replay %s transno "LPU64" failed: rc %d\n", + obd->obd_name, + libcfs_nid2str(req->rq_export->exp_connection->c_peer.nid), + transno, rc); + transno = 0; + } } else if (transno == 0) { spin_lock(&mds->mds_transno_lock); transno = ++mds->mds_last_transno; @@ -220,7 +233,7 @@ int mds_fix_attr(struct inode *inode, struct mds_update_record *rec) /* times */ if ((ia_valid & (ATTR_MTIME|ATTR_ATIME)) == (ATTR_MTIME|ATTR_ATIME)) { - if (rec->ur_uc.luc_fsuid != inode->i_uid && + if (current->fsuid != inode->i_uid && (error = ll_permission(inode, MAY_WRITE, NULL)) != 0) RETURN(error); } @@ -329,8 +342,7 @@ void mds_steal_ack_locks(struct ptlrpc_request *req) ptlrpc_schedule_difficult_reply (oldrep); spin_unlock (&svc->srv_lock); - spin_unlock_irqrestore (&exp->exp_lock, flags); - return; + break; } spin_unlock_irqrestore (&exp->exp_lock, flags); } @@ -403,6 +415,12 @@ int 
mds_osc_setattr_async(struct obd_device *obd, struct inode *inode, GOTO(out, rc); } + rc = obd_checkmd(mds->mds_osc_exp, obd->obd_self_export, lsm); + if (rc) { + CERROR("Error revalidate lsm %p \n", lsm); + GOTO(out, rc); + } + /* then fill oa */ oa->o_id = lsm->lsm_object_id; oa->o_uid = inode->i_uid; @@ -413,19 +431,19 @@ int mds_osc_setattr_async(struct obd_device *obd, struct inode *inode, oti.oti_logcookies = logcookies; } - LASSERT(fid != NULL); + LASSERT(fid != NULL); oa->o_fid = fid->id; oa->o_generation = fid->generation; - oa->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER; + oa->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER; /* do setattr from mds to ost asynchronously */ rc = obd_setattr_async(mds->mds_osc_exp, oa, lsm, &oti); if (rc) CDEBUG(D_INODE, "mds to ost setattr objid 0x"LPX64 " on ost error %d\n", lsm->lsm_object_id, rc); - - obd_free_memmd(mds->mds_osc_exp, &lsm); out: + if (lsm) + obd_free_memmd(mds->mds_osc_exp, &lsm); obdo_free(oa); RETURN(rc); } @@ -439,9 +457,9 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, struct ptlrpc_request *req, struct lustre_handle *lh) { + unsigned int ia_valid = rec->ur_iattr.ia_valid; struct mds_obd *mds = mds_req2mds(req); struct obd_device *obd = req->rq_export->exp_obd; - unsigned int ia_valid = rec->ur_iattr.ia_valid; struct mds_body *body; struct dentry *de; struct inode *inode = NULL; @@ -450,14 +468,14 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, struct mds_logcancel_data *mlcd = NULL; struct lov_mds_md *lmm = NULL; struct llog_cookie *logcookies = NULL; - int lmm_size = 0, need_lock = 1; + int lmm_size = 0, need_lock = 1, cookie_size = 0; int rc = 0, cleanup_phase = 0, err, locked = 0; unsigned int qcids[MAXQUOTAS] = {0, 0}; - unsigned int qpids[MAXQUOTAS] = {rec->ur_iattr.ia_uid, + unsigned int qpids[MAXQUOTAS] = {rec->ur_iattr.ia_uid, rec->ur_iattr.ia_gid}; ENTRY; - LASSERT(offset == 0); + LASSERT(offset == MDS_REQ_REC_OFF); DEBUG_REQ(D_INODE, req, 
"setattr "LPU64"/%u %x", rec->ur_fid1->id, rec->ur_fid1->generation, rec->ur_iattr.ia_valid); @@ -472,8 +490,8 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, if (req->rq_export->exp_connect_flags & OBD_CONNECT_RDONLY) GOTO(cleanup, rc = -EROFS); } else { - __u64 lockpart = MDS_INODELOCK_UPDATE; - if (rec->ur_iattr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID) ) + __u64 lockpart = MDS_INODELOCK_UPDATE; + if (rec->ur_iattr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) lockpart |= MDS_INODELOCK_LOOKUP; de = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, LCK_EX, @@ -535,14 +553,14 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, rc = fsfilt_setattr(obd, de, handle, &rec->ur_iattr, 0); /* journal chown/chgrp in llog, just like unlink */ if (rc == 0 && lmm_size){ - OBD_ALLOC(logcookies, mds->mds_max_cookiesize); + cookie_size = mds_get_cookie_size(obd, lmm); + OBD_ALLOC(logcookies, cookie_size); if (logcookies == NULL) GOTO(cleanup, rc = -ENOMEM); if (mds_log_op_setattr(obd, inode, lmm, lmm_size, - logcookies, - mds->mds_max_cookiesize) <= 0) { - OBD_FREE(logcookies, mds->mds_max_cookiesize); + logcookies, cookie_size) <= 0) { + OBD_FREE(logcookies, cookie_size); logcookies = NULL; } } @@ -566,7 +584,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, lum->lmm_stripe_count == 0) || /* lmm_stripe_size == -1 is deprecated in 1.4.6 */ lum->lmm_stripe_size == (typeof(lum->lmm_stripe_size))(-1))){ - rc = fsfilt_set_md(obd, inode, handle, NULL, 0); + rc = fsfilt_set_md(obd, inode, handle, NULL, 0, "lov"); if (rc) GOTO(cleanup, rc); } else { @@ -579,7 +597,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, obd_free_memmd(mds->mds_osc_exp, &lsm); rc = fsfilt_set_md(obd, inode, handle, rec->ur_eadata, - rec->ur_eadatalen); + rec->ur_eadatalen, "lov"); if (rc) GOTO(cleanup, rc); } @@ -589,10 +607,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, 
mds_pack_inode2fid(&body->fid1, inode); mds_pack_inode2body(body, inode); - /* don't return OST-specific attributes if we didn't just set them. Use - * saved ->ia_valid here, as rec->ur_iattr.ia_valid gets rewritten by - * fsfilt_setattr() what breaks case of truncating file with no object - * on OST and no lsm (test_34c from sanity.sh). --umka */ + /* don't return OST-specific attributes if we didn't just set them. */ if (ia_valid & ATTR_SIZE) body->valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) @@ -626,14 +641,14 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, err = mds_finish_transno(mds, inode, handle, req, rc, 0); /* do mds to ost setattr if needed */ if (!rc && !err && lmm_size) - mds_osc_setattr_async(obd, inode, lmm, lmm_size, - logcookies, rec->ur_fid1); + mds_osc_setattr_async(obd, inode, lmm, lmm_size, + logcookies, rec->ur_fid1); switch (cleanup_phase) { case 2: OBD_FREE(lmm, mds->mds_max_mdsize); if (logcookies) - OBD_FREE(logcookies, mds->mds_max_cookiesize); + OBD_FREE(logcookies, cookie_size); case 1: if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) && rec->ur_eadata != NULL) @@ -658,7 +673,8 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, /* trigger dqrel/dqacq for original owner and new owner */ if (ia_valid & (ATTR_UID | ATTR_GID)) - lquota_adjust(quota_interface, obd, qcids, qpids, rc, FSFILT_OP_SETATTR); + lquota_adjust(quota_interface, obd, qcids, qpids, rc, + FSFILT_OP_SETATTR); return 0; } @@ -705,8 +721,9 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, struct dentry_params dp; ENTRY; - LASSERT(offset == 0); - LASSERT(!strcmp(req->rq_export->exp_obd->obd_type->typ_name, "mds")); + LASSERT(offset == MDS_REQ_REC_OFF); + LASSERT(!strcmp(req->rq_export->exp_obd->obd_type->typ_name, + LUSTRE_MDS_NAME)); DEBUG_REQ(D_INODE, req, "parent "LPU64"/%u name %s mode %o", rec->ur_fid1->id, rec->ur_fid1->generation, @@ -855,7 
+872,7 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, if (rc > 0) { down(&inode->i_sem); rc = fsfilt_set_md(obd, inode, handle, - &lmm, lmm_size); + &lmm, lmm_size, "lov"); up(&inode->i_sem); } if (rc) @@ -1001,7 +1018,7 @@ int enqueue_ordered_locks(struct obd_device *obd, struct ldlm_res_id *p1_res_id, RETURN(-EIO); ldlm_lock_dump_handle(D_OTHER, handles[0]); - if (!memcmp(res_id[0], res_id[1], sizeof(*res_id[0])) && + if (memcmp(res_id[0], res_id[1], sizeof(*res_id[0])) == 0 && (policies[0]->l_inodebits.bits & policies[1]->l_inodebits.bits)) { memcpy(handles[1], handles[0], sizeof(*(handles[1]))); ldlm_lock_addref(handles[1], lock_modes[1]); @@ -1088,7 +1105,7 @@ int enqueue_4ordered_locks(struct obd_device *obd,struct ldlm_res_id *p1_res_id, if (res_id[i]->name[0] == 0) break; if (i != 0 && - !memcmp(res_id[i], res_id[i-1], sizeof(*res_id[i])) && + memcmp(res_id[i], res_id[i-1], sizeof(*res_id[i])) == 0 && (policies[i]->l_inodebits.bits & policies[i-1]->l_inodebits.bits)) { memcpy(dlm_handles[i], dlm_handles[i-1], @@ -1399,6 +1416,35 @@ out_dput: RETURN(rc); } +int mds_get_cookie_size(struct obd_device *obd, struct lov_mds_md *lmm) +{ + int count = le32_to_cpu(lmm->lmm_stripe_count); + int real_csize = count * sizeof(struct llog_cookie); + return real_csize; +} + +void mds_shrink_reply(struct obd_device *obd, struct ptlrpc_request *req, + struct mds_body *body) +{ + int cookie_size = 0, md_size = 0; + + if (body && body->valid & OBD_MD_FLEASIZE) { + md_size = body->eadatasize; + } + if (body && body->valid & OBD_MD_FLCOOKIE) { + LASSERT(body->valid & OBD_MD_FLEASIZE); + cookie_size = mds_get_cookie_size(obd, lustre_msg_buf( + req->rq_repmsg, 1, 0)); + } + + CDEBUG(D_INFO, "Shrink to md_size %d cookie_size %d \n", md_size, + cookie_size); + + lustre_shrink_reply(req, 1, md_size, 1); + + lustre_shrink_reply(req, md_size? 
2:1, cookie_size, 0); +} + static int mds_reint_unlink(struct mds_update_record *rec, int offset, struct ptlrpc_request *req, struct lustre_handle *lh) @@ -1415,7 +1461,7 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset, unsigned int qpids [MAXQUOTAS] = {0, 0}; ENTRY; - LASSERT(offset == 0 || offset == 2); + LASSERT(offset == MDS_REQ_REC_OFF || offset == 2); DEBUG_REQ(D_INODE, req, "parent ino "LPU64"/%u, child %s", rec->ur_fid1->id, rec->ur_fid1->generation, rec->ur_name); @@ -1621,6 +1667,8 @@ cleanup: } req->rq_status = rc; + mds_shrink_reply(obd, req, body); + /* trigger dqrel on the owner of child and parent */ lquota_adjust(quota_interface, obd, qcids, qpids, rc, FSFILT_OP_UNLINK); return 0; @@ -1644,7 +1692,7 @@ static int mds_reint_link(struct mds_update_record *rec, int offset, int rc = 0, cleanup_phase = 0; ENTRY; - LASSERT(offset == 0); + LASSERT(offset == MDS_REQ_REC_OFF); DEBUG_REQ(D_INODE, req, "original "LPU64"/%u to "LPU64"/%u %s", rec->ur_fid1->id, rec->ur_fid1->generation, @@ -1782,19 +1830,19 @@ cleanup: * lock on the parent after the lookup is done, so dentry->d_inode may change * at any time, and igrab() itself doesn't like getting passed a NULL argument. 
*/ -static int mds_get_parents_children_locked(struct obd_device *obd, - struct mds_obd *mds, - struct ll_fid *p1_fid, - struct dentry **de_srcdirp, - struct ll_fid *p2_fid, - struct dentry **de_tgtdirp, - int parent_mode, - const char *old_name, int old_len, - struct dentry **de_oldp, - const char *new_name, int new_len, - struct dentry **de_newp, - struct lustre_handle *dlm_handles, - int child_mode) +int mds_get_parents_children_locked(struct obd_device *obd, + struct mds_obd *mds, + struct ll_fid *p1_fid, + struct dentry **de_srcdirp, + struct ll_fid *p2_fid, + struct dentry **de_tgtdirp, + int parent_mode, + const char *old_name, int old_len, + struct dentry **de_oldp, + const char *new_name, int new_len, + struct dentry **de_newp, + struct lustre_handle *dlm_handles, + int child_mode) { struct ldlm_res_id p1_res_id = { .name = {0} }; struct ldlm_res_id p2_res_id = { .name = {0} }; @@ -1861,6 +1909,8 @@ static int mds_get_parents_children_locked(struct obd_device *obd, iput(inode); /* Step 4: Lookup the target child entry */ + if (!new_name) + GOTO(retry_locks, rc); *de_newp = ll_lookup_one_len(new_name, *de_tgtdirp, new_len - 1); if (IS_ERR(*de_newp)) { rc = PTR_ERR(*de_newp); @@ -1924,6 +1974,8 @@ retry_locks: if ((*de_oldp)->d_inode == NULL) GOTO(cleanup, rc = -ENOENT); + if (!new_name) + GOTO(cleanup, rc); /* Step 6b: Re-lookup target child to verify it hasn't changed */ rc = mds_verify_child(obd, &p2_res_id, &dlm_handles[1], *de_tgtdirp, parent_mode, &c2_res_id, &dlm_handles[3], de_newp, @@ -1984,7 +2036,7 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset, unsigned int qpids[4] = {0, 0, 0, 0}; ENTRY; - LASSERT(offset == 0); + LASSERT(offset == MDS_REQ_REC_OFF); DEBUG_REQ(D_INODE, req, "parent "LPU64"/%u %s to "LPU64"/%u %s", rec->ur_fid1->id, rec->ur_fid1->generation, rec->ur_name, @@ -2174,7 +2226,13 @@ int mds_reint_rec(struct mds_update_record *rec, int offset, ENTRY; #if CRAY_XT3 - rec->ur_uc.luc_fsuid = req->rq_uid; + if 
(req->rq_uid != LNET_UID_ANY) { + /* non-root local cluster client + * NB root's creds are believed... */ + LASSERT (req->rq_uid != 0); + rec->ur_uc.luc_fsuid = req->rq_uid; + rec->ur_uc.luc_cap = 0; + } #endif /* get group info of this user */ diff --git a/lustre/mds/mds_unlink_open.c b/lustre/mds/mds_unlink_open.c index 97df695..9e15740 100644 --- a/lustre/mds/mds_unlink_open.c +++ b/lustre/mds/mds_unlink_open.c @@ -43,13 +43,14 @@ #include "mds_internal.h" -static int mds_osc_destroy_orphan(struct mds_obd *mds, +static int mds_osc_destroy_orphan(struct obd_device *obd, struct inode *inode, struct lov_mds_md *lmm, int lmm_size, struct llog_cookie *logcookies, int log_unlink) { + struct mds_obd *mds = &obd->u.mds; struct lov_stripe_md *lsm = NULL; struct obd_trans_info oti = { 0 }; struct obdo *oa; @@ -68,6 +69,10 @@ static int mds_osc_destroy_orphan(struct mds_obd *mds, rc = 0; } + rc = obd_checkmd(mds->mds_osc_exp, obd->obd_self_export, lsm); + if (rc) + GOTO(out_free_memmd, rc); + oa = obdo_alloc(); if (oa == NULL) GOTO(out_free_memmd, rc = -ENOMEM); @@ -79,8 +84,7 @@ static int mds_osc_destroy_orphan(struct mds_obd *mds, oa->o_valid |= OBD_MD_FLCOOKIE; oti.oti_logcookies = logcookies; } - - rc = obd_destroy(mds->mds_osc_exp, oa, lsm, &oti); + rc = obd_destroy(mds->mds_osc_exp, oa, lsm, &oti, obd->obd_self_export); obdo_free(oa); if (rc) CDEBUG(D_INODE, "destroy orphan objid 0x"LPX64" on ost error " @@ -96,7 +100,7 @@ static int mds_unlink_orphan(struct obd_device *obd, struct dentry *dchild, struct mds_obd *mds = &obd->u.mds; struct lov_mds_md *lmm = NULL; struct llog_cookie *logcookies = NULL; - int lmm_size, log_unlink = 0; + int lmm_size, log_unlink = 0, cookie_size = 0; void *handle = NULL; int rc, err; ENTRY; @@ -136,11 +140,12 @@ static int mds_unlink_orphan(struct obd_device *obd, struct dentry *dchild, CERROR("error %d unlinking orphan %.*s from PENDING\n", rc, dchild->d_name.len, dchild->d_name.name); } else if (lmm_size) { - OBD_ALLOC(logcookies, 
mds->mds_max_cookiesize); + cookie_size = mds_get_cookie_size(obd, lmm); + OBD_ALLOC(logcookies, cookie_size); if (logcookies == NULL) rc = -ENOMEM; else if (mds_log_op_unlink(obd, inode, lmm,lmm_size,logcookies, - mds->mds_max_cookiesize) > 0) + cookie_size) > 0) log_unlink = 1; } @@ -150,12 +155,12 @@ static int mds_unlink_orphan(struct obd_device *obd, struct dentry *dchild, if (!rc) rc = err; } else if (!rc) { - rc = mds_osc_destroy_orphan(mds, inode, lmm, lmm_size, + rc = mds_osc_destroy_orphan(obd, inode, lmm, lmm_size, logcookies, log_unlink); } if (logcookies != NULL) - OBD_FREE(logcookies, mds->mds_max_cookiesize); + OBD_FREE(logcookies, cookie_size); out_free_lmm: OBD_FREE(lmm, mds->mds_max_mdsize); RETURN(rc); @@ -223,13 +228,14 @@ int mds_cleanup_pending(struct obd_device *obd) GOTO(err_out, rc = PTR_ERR(dchild)); } if (!dchild->d_inode) { - CERROR("orphan %s has been removed\n", d_name); + CWARN("%s: orphan %s has already been removed\n", + obd->obd_name, d_name); GOTO(next, rc = 0); } if (is_bad_inode(dchild->d_inode)) { - CERROR("bad orphan inode found %lu/%u\n", - dchild->d_inode->i_ino, + CERROR("%s: bad orphan inode found %lu/%u\n", + obd->obd_name, dchild->d_inode->i_ino, dchild->d_inode->i_generation); GOTO(next, rc = -ENOENT); } @@ -239,7 +245,8 @@ int mds_cleanup_pending(struct obd_device *obd) if (mds_inode_is_orphan(child_inode) && mds_orphan_open_count(child_inode)) { MDS_UP_READ_ORPHAN_SEM(child_inode); - CWARN("orphan %s re-opened during recovery\n", d_name); + CWARN("%s: orphan %s re-opened during recovery\n", + obd->obd_name, d_name); GOTO(next, rc = 0); } MDS_UP_READ_ORPHAN_SEM(child_inode); @@ -247,16 +254,18 @@ int mds_cleanup_pending(struct obd_device *obd) rc = mds_unlink_orphan(obd, dchild, child_inode, pending_dir); if (rc == 0) { item ++; - CWARN("removed orphan %s from MDS and OST\n", d_name); + CDEBUG(D_HA, "%s: removed orphan %s\n", + obd->obd_name, d_name); } else { - CDEBUG(D_INODE, "removed orphan %s from MDS/OST failed," 
- " rc = %d\n", d_name, rc); + CDEBUG(D_INODE, "%s: removed orphan %s failed," + " rc = %d\n", obd->obd_name, d_name, rc); rc = 0; } next: l_dput(dchild); up(&pending_dir->i_sem); } + rc = 0; err_out: list_for_each_entry_safe(dirent, n, &dentry_list, lld_list) { list_del(&dirent->lld_list); @@ -264,8 +273,9 @@ err_out: } err_pop: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - if (rc == 0) - rc = item; + if (item > 0) + CWARN("%s: removed %d pending open-unlinked files\n", + obd->obd_name, item); RETURN(rc); err_mntget: diff --git a/lustre/mds/mds_xattr.c b/lustre/mds/mds_xattr.c index 1fe8ed8..f35aacc 100644 --- a/lustre/mds/mds_xattr.c +++ b/lustre/mds/mds_xattr.c @@ -62,8 +62,7 @@ static int mds_getxattr_pack_msg(struct ptlrpc_request *req, return -EFAULT; } - if (!(req->rq_export->exp_connect_flags & - OBD_CONNECT_USER_XATTR) && + if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_XATTR) && (strncmp(xattr_name, "user.", 5) == 0)) return -EOPNOTSUPP; @@ -226,10 +225,6 @@ int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body) lockpart = MDS_INODELOCK_UPDATE; -/* - de = mds_fid2locked_dentry(obd, &body->fid1, NULL, LCK_EX, - &lockh, NULL, 0); -*/ de = mds_fid2locked_dentry(obd, &body->fid1, NULL, LCK_EX, &lockh, NULL, 0, lockpart); if (IS_ERR(de)) @@ -251,11 +246,11 @@ int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body) xattr_name); if (strncmp(xattr_name, "trusted.", 8) == 0) { - if (!strcmp(xattr_name, "trusted."XATTR_LUSTRE_MDS_LOV_EA)) + if (strcmp(xattr_name + 8, XATTR_LUSTRE_MDS_LOV_EA) == 0) GOTO(out_dput, rc = -EACCES); } - if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_USER_XATTR) && + if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_XATTR) && (strncmp(xattr_name, "user.", 5) == 0)) { GOTO(out_dput, rc = -EOPNOTSUPP); } diff --git a/lustre/obdclass/autoMakefile.am b/lustre/obdclass/autoMakefile.am index b31e2c5..f9850f9 100644 --- a/lustre/obdclass/autoMakefile.am +++ 
b/lustre/obdclass/autoMakefile.am @@ -10,7 +10,8 @@ noinst_LIBRARIES = liblustreclass.a liblustreclass_a_SOURCES = class_obd.c debug.c genops.c statfs_pack.c uuid.c liblustreclass_a_SOURCES += lustre_handles.c lustre_peer.c lprocfs_status.c liblustreclass_a_SOURCES += obdo.c obd_config.c llog.c llog_obd.c llog_cat.c -liblustreclass_a_SOURCES += llog_lvfs.c llog_swab.c #llog_ioctl.c rbtree.c +liblustreclass_a_SOURCES += llog_lvfs.c llog_swab.c +liblustreclass_a_SOURCES += prng.c #llog_ioctl.c rbtree.c liblustreclass_a_CPPFLAGS = $(LLCPPFLAGS) -DLUSTRE_VERSION=\"32\" -DBUILD_VERSION=\"1\" liblustreclass_a_CFLAGS = $(LLCFLAGS) diff --git a/lustre/obdclass/debug.c b/lustre/obdclass/debug.c index 24cb8e2..7d3d313 100644 --- a/lustre/obdclass/debug.c +++ b/lustre/obdclass/debug.c @@ -151,24 +151,24 @@ int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id) ne_off = le64_to_cpu (off); id = le64_to_cpu (id); if (memcmp(addr, (char *)&ne_off, LPDS)) { - CERROR("%s: id "LPX64" offset "LPU64" off: "LPX64" != " + CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" off: "LPX64" != " LPX64"\n", who, id, off, *(__u64 *)addr, ne_off); err = -EINVAL; } if (memcmp(addr + LPDS, (char *)&id, LPDS)) { - CERROR("%s: id "LPX64" offset "LPU64" id: "LPX64" != "LPX64"\n", + CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" id: "LPX64" != "LPX64"\n", who, id, off, *(__u64 *)(addr + LPDS), id); err = -EINVAL; } addr += end - LPDS - LPDS; if (memcmp(addr, (char *)&ne_off, LPDS)) { - CERROR("%s: id "LPX64" offset "LPU64" end off: "LPX64" != " + CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end off: "LPX64" != " LPX64"\n", who, id, off, *(__u64 *)addr, ne_off); err = -EINVAL; } if (memcmp(addr + LPDS, (char *)&id, LPDS)) { - CERROR("%s: id "LPX64" offset "LPU64" end id: "LPX64" != " + CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end id: "LPX64" != " LPX64"\n", who, id, off, *(__u64 *)(addr + LPDS), id); err = -EINVAL; } diff --git a/lustre/obdclass/llog.c b/lustre/obdclass/llog.c 
index 2b46115..a6edbb7 100644 --- a/lustre/obdclass/llog.c +++ b/lustre/obdclass/llog.c @@ -291,6 +291,9 @@ int llog_process(struct llog_handle *loghandle, llog_cb_t cb, loghandle->lgh_id.lgl_oid, loghandle->lgh_id.lgl_ogen); GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + llog_cancel_rec(loghandle, rec->lrh_index); + rc = 0; } if (rc) GOTO(out, rc); @@ -311,3 +314,89 @@ int llog_process(struct llog_handle *loghandle, llog_cb_t cb, RETURN(rc); } EXPORT_SYMBOL(llog_process); + +int llog_reverse_process(struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata) +{ + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = catdata; + void *buf; + int rc = 0, first_index = 1, index, idx; + ENTRY; + + OBD_ALLOC(buf, LLOG_CHUNK_SIZE); + if (!buf) + RETURN(-ENOMEM); + + if (cd != NULL) + first_index = cd->first_idx + 1; + if (cd != NULL && cd->last_idx) + index = cd->last_idx; + else + index = LLOG_BITMAP_BYTES * 8 - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + struct llog_rec_tail *tail; + + /* skip records not set in bitmap */ + while (index >= first_index && + !ext2_test_bit(index, llh->llh_bitmap)) + --index; + + LASSERT(index >= first_index - 1); + if (index == first_index - 1) + break; + + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, LLOG_CHUNK_SIZE); + rc = llog_prev_block(loghandle, index, buf, LLOG_CHUNK_SIZE); + if (rc) + GOTO(out, rc); + + rec = buf; + idx = le32_to_cpu(rec->lrh_index); + if (idx < index) + CDEBUG(D_HA, "index %u : idx %u\n", index, idx); + while (idx < index) { + rec = ((void *)rec + le32_to_cpu(rec->lrh_len)); + idx ++; + } + tail = (void *)rec + le32_to_cpu(rec->lrh_len) - sizeof(*tail); + + /* process records in buffer, starting where we found one */ + while ((void *)tail > buf) { + rec = (void *)tail - le32_to_cpu(tail->lrt_len) + + sizeof(*tail); + + if (rec->lrh_index == 0) + GOTO(out, 0); /* no more records */ + + /* if set, process the callback 
on this record */ + if (ext2_test_bit(index, llh->llh_bitmap)) { + rc = cb(loghandle, rec, data); + if (rc == LLOG_PROC_BREAK) { + CWARN("recovery from log: "LPX64":%x" + " stopped\n", + loghandle->lgh_id.lgl_oid, + loghandle->lgh_id.lgl_ogen); + GOTO(out, rc); + } + if (rc) + GOTO(out, rc); + } + + /* previous record, still in buffer? */ + --index; + if (index < first_index) + GOTO(out, rc = 0); + tail = (void *)rec - sizeof(*tail); + } + } + +out: + if (buf) + OBD_FREE(buf, LLOG_CHUNK_SIZE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_reverse_process); diff --git a/lustre/obdclass/llog_cat.c b/lustre/obdclass/llog_cat.c index aafffad..55039cc 100644 --- a/lustre/obdclass/llog_cat.c +++ b/lustre/obdclass/llog_cat.c @@ -394,6 +394,70 @@ int llog_cat_process(struct llog_handle *cat_llh, llog_cb_t cb, void *data) } EXPORT_SYMBOL(llog_cat_process); +static int llog_cat_reverse_process_cb(struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *llh; + int rc; + + if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + CWARN("processing log "LPX64":%x at index %u of catalog "LPX64"\n", + lir->lid_id.lgl_oid, lir->lid_id.lgl_ogen, + le32_to_cpu(rec->lrh_index), cat_llh->lgh_id.lgl_oid); + + rc = llog_cat_id2handle(cat_llh, &llh, &lir->lid_id); + if (rc) { + CERROR("Cannot find handle for log "LPX64"\n", + lir->lid_id.lgl_oid); + RETURN(rc); + } + + rc = llog_reverse_process(llh, d->lpd_cb, d->lpd_data, NULL); + RETURN(rc); +} + +int llog_cat_reverse_process(struct llog_handle *cat_llh, + llog_cb_t cb, void *data) +{ + struct llog_process_data d; + struct llog_process_cat_data cd; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + ENTRY; + + LASSERT(llh->llh_flags &cpu_to_le32(LLOG_F_IS_CAT)); + d.lpd_data = data; + d.lpd_cb = cb; + + if (llh->llh_cat_idx > 
cat_llh->lgh_last_idx) { + CWARN("catalog "LPX64" crosses index zero\n", + cat_llh->lgh_id.lgl_oid); + + cd.first_idx = 0; + cd.last_idx = cat_llh->lgh_last_idx; + rc = llog_reverse_process(cat_llh, llog_cat_reverse_process_cb, + &d, &cd); + if (rc != 0) + RETURN(rc); + + cd.first_idx = le32_to_cpu(llh->llh_cat_idx); + cd.last_idx = 0; + rc = llog_reverse_process(cat_llh, llog_cat_reverse_process_cb, + &d, &cd); + } else { + rc = llog_reverse_process(cat_llh, llog_cat_reverse_process_cb, + &d, NULL); + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_reverse_process); + int llog_cat_set_first_idx(struct llog_handle *cathandle, int index) { struct llog_log_hdr *llh = cathandle->lgh_hdr; diff --git a/lustre/obdclass/llog_lvfs.c b/lustre/obdclass/llog_lvfs.c index 1841205..594a00f 100644 --- a/lustre/obdclass/llog_lvfs.c +++ b/lustre/obdclass/llog_lvfs.c @@ -366,7 +366,6 @@ static int llog_lvfs_next_block(struct llog_handle *loghandle, int *cur_idx, rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd, loghandle->lgh_file, buf, len, &ppos); - if (rc) { CERROR("Cant read llog block at log id "LPU64 "/%u offset "LPU64"\n", @@ -422,6 +421,79 @@ static int llog_lvfs_next_block(struct llog_handle *loghandle, int *cur_idx, RETURN(-EIO); } +static int llog_lvfs_prev_block(struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + __u64 cur_offset; + int rc; + ENTRY; + + if (len == 0 || len & (LLOG_CHUNK_SIZE - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u n", prev_idx); + + cur_offset = LLOG_CHUNK_SIZE; + llog_skip_over(&cur_offset, 0, prev_idx); + + while (cur_offset < loghandle->lgh_file->f_dentry->d_inode->i_size) { + struct llog_rec_hdr *rec; + struct llog_rec_tail *tail; + loff_t ppos; + + ppos = cur_offset; + + rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd, + loghandle->lgh_file, buf, len, + &ppos); + if (rc) { + CERROR("Cant read llog block at log id "LPU64 + "/%u offset "LPU64"\n", + loghandle->lgh_id.lgl_oid, 
+ loghandle->lgh_id.lgl_ogen, + cur_offset); + RETURN(rc); + } + + /* put number of bytes read into rc to make code simpler */ + rc = ppos - cur_offset; + cur_offset = ppos; + + if (rc == 0) /* end of file, nothing to do */ + RETURN(0); + + if (rc < sizeof(*tail)) { + CERROR("Invalid llog block at log id "LPU64"/%u offset " + LPU64"\n", loghandle->lgh_id.lgl_oid, + loghandle->lgh_id.lgl_ogen, cur_offset); + RETURN(-EINVAL); + } + + tail = buf + rc - sizeof(struct llog_rec_tail); + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("Invalid llog tail at log id "LPU64"/%u offset " + LPU64"\n", loghandle->lgh_id.lgl_oid, + loghandle->lgh_id.lgl_ogen, cur_offset); + RETURN(-EINVAL); + } + if (le32_to_cpu(tail->lrt_index) < prev_idx) + continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + rec = buf; + if (le32_to_cpu(rec->lrh_index) > prev_idx) { + CERROR("missed desired record? %u > %u\n", + le32_to_cpu(rec->lrh_index), prev_idx); + RETURN(-ENOENT); + } + RETURN(0); + } + RETURN(-EIO); +} + static struct file *llog_filp_open(char *name, int flags, int mode) { char *logname; @@ -604,7 +676,7 @@ static int llog_lvfs_destroy(struct llog_handle *handle) if (rc) GOTO(out, rc); - rc = obd_destroy(handle->lgh_ctxt->loc_exp, oa, NULL, NULL); + rc = obd_destroy(handle->lgh_ctxt->loc_exp, oa, NULL, NULL, NULL); out: obdo_free(oa); RETURN(rc); @@ -696,6 +768,7 @@ int llog_put_cat_list(struct obd_device *obd, struct obd_device *disk_obd, struct llog_operations llog_lvfs_ops = { lop_write_rec: llog_lvfs_write_rec, lop_next_block: llog_lvfs_next_block, + lop_prev_block: llog_lvfs_prev_block, lop_read_header: llog_lvfs_read_header, lop_create: llog_lvfs_create, lop_destroy: llog_lvfs_destroy, @@ -730,6 +803,13 @@ static int llog_lvfs_next_block(struct llog_handle *loghandle, int *cur_idx, return 0; } +static int llog_lvfs_prev_block(struct llog_handle *loghandle, + int prev_idx, 
void *buf, int len) +{ + LBUG(); + return 0; +} + static int llog_lvfs_create(struct llog_ctxt *ctxt, struct llog_handle **res, struct llog_logid *logid, char *name) { @@ -766,6 +846,7 @@ int llog_put_cat_list(struct obd_device *obd, struct obd_device *disk_obd, struct llog_operations llog_lvfs_ops = { lop_write_rec: llog_lvfs_write_rec, lop_next_block: llog_lvfs_next_block, + lop_prev_block: llog_lvfs_prev_block, lop_read_header: llog_lvfs_read_header, lop_create: llog_lvfs_create, lop_destroy: llog_lvfs_destroy, diff --git a/lustre/obdclass/llog_test.c b/lustre/obdclass/llog_test.c index dddd5ba..89dac0a 100644 --- a/lustre/obdclass/llog_test.c +++ b/lustre/obdclass/llog_test.c @@ -390,7 +390,7 @@ static int llog_test_5(struct obd_device *obd) llog_init_handle(llh, LLOG_F_IS_CAT, &uuid); CWARN("5b: print the catalog entries.. we expect 2\n"); - rc = llog_process(llh, (llog_cb_t)cat_print_cb, "test 5", NULL); + rc = llog_process(llh, cat_print_cb, "test 5", NULL); if (rc) { CERROR("5b: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); @@ -412,7 +412,7 @@ static int llog_test_5(struct obd_device *obd) } CWARN("5b: print the catalog entries.. we expect 1\n"); - rc = llog_process(llh, (llog_cb_t)cat_print_cb, "test 5", NULL); + rc = llog_process(llh, cat_print_cb, "test 5", NULL); if (rc) { CERROR("5b: process with cat_print_cb failed: %d\n", rc); GOTO(out, rc); @@ -425,6 +425,13 @@ static int llog_test_5(struct obd_device *obd) GOTO(out, rc); } + CWARN("5f: print plain log entries reversely.. 
expect 6\n"); + rc = llog_cat_reverse_process(llh, plain_print_cb, "foobar"); + if (rc) { + CERROR("5f: reversely process with plain_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + out: CWARN("5: close re-opened catalog\n"); if (llh) @@ -475,10 +482,14 @@ static int llog_test_6(struct obd_device *obd, char *name) GOTO(parse_out, rc); } - rc = llog_process(llh, (llog_cb_t)plain_print_cb, NULL, NULL); + rc = llog_process(llh, plain_print_cb, NULL, NULL); if (rc) CERROR("6: llog_process failed %d\n", rc); + rc = llog_reverse_process(llh, plain_print_cb, NULL, NULL); + if (rc) + CERROR("6: llog_reverse_process failed %d\n", rc); + parse_out: rc = llog_close(llh); if (rc) { @@ -645,7 +656,7 @@ static int llog_test_setup(struct obd_device *obd, obd_count len, void *buf) if (rc) RETURN(rc); - llog_test_rand = ll_insecure_random_int(); + llog_test_rand = ll_rand(); rc = llog_run_tests(obd); if (rc) diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 79e828d..c7ea976 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -346,6 +346,49 @@ int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, int count, return snprintf(page, count, "%s\n", conn->c_remote_uuid.uuid); } +static const char *obd_connect_names[] = { + "read_only", + "lov_index", + "unused", + "write_grant", + "server_lock", + "version", + "request_portal", + "acl", + "xattr", + "create_on_write", + "truncate_lock", + "initial_transno", + "inode_bit_locks", + "join_file", + NULL +}; + +int lprocfs_rd_connect_flags(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + __u64 mask = 1, flags; + int i, ret; + + if (obd == NULL) + return 0; + + flags = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags; + ret = snprintf(page, count, "flags="LPX64"\n", flags); + for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags & mask) + ret += snprintf(page + ret, count - 
ret, "%s\n", + obd_connect_names[i]); + } + if (flags & ~(mask - 1)) + ret += snprintf(page + ret, count - ret, + "unknown flags "LPX64"\n", flags & ~(mask - 1)); + + return ret; +} +EXPORT_SYMBOL(lprocfs_rd_connect_flags); + int lprocfs_rd_num_exports(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -620,10 +663,12 @@ int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn); LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn); LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect); LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect); LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs); LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd); LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, checkmd); LPROCFS_OBD_OP_INIT(num_private_stats, stats, preallocate); LPROCFS_OBD_OP_INIT(num_private_stats, stats, create); LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy); @@ -639,6 +684,7 @@ int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) LPROCFS_OBD_OP_INIT(num_private_stats, stats, trigger_group_io); LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_async_flags); LPROCFS_OBD_OP_INIT(num_private_stats, stats, teardown_async_page); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, merge_lvb); LPROCFS_OBD_OP_INIT(num_private_stats, stats, adjust_kms); LPROCFS_OBD_OP_INIT(num_private_stats, stats, punch); LPROCFS_OBD_OP_INIT(num_private_stats, stats, sync); @@ -662,9 +708,9 @@ int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpin); LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event); LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check); 
LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck); LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check); for (i = num_private_stats; i < num_stats; i++) { /* If this LBUGs, it is likely that an obd diff --git a/lustre/obdclass/lustre_peer.c b/lustre/obdclass/lustre_peer.c index 617ef98..2a42138 100644 --- a/lustre/obdclass/lustre_peer.c +++ b/lustre/obdclass/lustre_peer.c @@ -84,7 +84,6 @@ int lustre_uuid_to_peer(char *uuid, lnet_nid_t *peer_nid, int index) int class_add_uuid(char *uuid, __u64 nid) { struct uuid_nid_data *data; - int rc; int nob = strnlen (uuid, PAGE_SIZE) + 1; LASSERT(nid != 0); /* valid newconfig NID is never zero */ @@ -92,7 +91,6 @@ int class_add_uuid(char *uuid, __u64 nid) if (nob > PAGE_SIZE) return -EINVAL; - rc = -ENOMEM; OBD_ALLOC(data, sizeof(*data)); if (data == NULL) return -ENOMEM; @@ -151,6 +149,7 @@ int class_del_uuid (char *uuid) data = list_entry(deathrow.next, struct uuid_nid_data, un_list); list_del (&data->un_list); + CDEBUG(D_INFO, "del uuid %s\n", data->un_uuid); OBD_FREE(data->un_uuid, strlen(data->un_uuid) + 1); OBD_FREE(data, sizeof(*data)); diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index f7bc3f7..ca38953 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -69,6 +69,26 @@ int class_attach(struct lustre_cfg *lcfg) CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n", MKSTR(typename), MKSTR(name), MKSTR(uuid)); + + /* Mountconf transitional hack, should go away after 1.6. + 1.4.7 uses the old names, so translate back if the + mountconf flag is set. + 1.6 should set this flag, and translate the other way here + if not set. 
*/ + if (lcfg->lcfg_flags & LCFG_FLG_MOUNTCONF){ + char *tmp = NULL; + if (strcmp(typename, "mds") == 0) + tmp = "mdt"; + if (strcmp(typename, "mdt") == 0) + tmp = "mds"; + if (strcmp(typename, "osd") == 0) + tmp = "obdfilter"; + if (tmp) { + LCONSOLE_WARN("Using type %s for %s %s\n", tmp, + MKSTR(typename), MKSTR(name)); + typename = tmp; + } + } /* find the type */ type = class_get_type(typename); @@ -355,7 +375,7 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) /* Precleanup stage 1, we must make sure all exports (other than the self-export) get destroyed. */ - err = obd_precleanup(obd, 1); + err = obd_precleanup(obd, OBD_CLEANUP_EXPORTS); if (err) CERROR("Precleanup %s returned %d\n", obd->obd_name, err); @@ -390,7 +410,7 @@ void class_decref(struct obd_device *obd) /* if we're not stopping, we didn't finish setup */ /* Precleanup stage 2, do other type-specific cleanup requiring the self-export. */ - err = obd_precleanup(obd, 2); + err = obd_precleanup(obd, OBD_CLEANUP_SELF_EXP); if (err) CERROR("Precleanup %s returned %d\n", obd->obd_name, err); @@ -431,8 +451,8 @@ int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg) CERROR("invalid conn_uuid\n"); RETURN(-EINVAL); } - if (strcmp(obd->obd_type->typ_name, "mdc") && - strcmp(obd->obd_type->typ_name, "osc")) { + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { CERROR("can't add connection on non-client dev\n"); RETURN(-EINVAL); } @@ -461,8 +481,8 @@ int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg) CERROR("invalid conn_uuid\n"); RETURN(-EINVAL); } - if (strcmp(obd->obd_type->typ_name, "mdc") && - strcmp(obd->obd_type->typ_name, "osc")) { + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { CERROR("can't del connection on non-client dev\n"); RETURN(-EINVAL); } @@ -617,6 +637,11 @@ int class_process_config(struct lustre_cfg *lcfg) sizeof 
(obd_lustre_upcall)); GOTO(out, err = 0); } + case LCFG_PARAM: + case LCFG_MARKER: { + LCONSOLE_WARN("LCFG_MARKER not yet implemented.\n"); + GOTO(out, err = 0); + } } /* Commands that require a device */ @@ -849,7 +874,7 @@ void class_manual_cleanup(struct obd_device *obd) int err; char flags[3]=""; ENTRY; - + if (!obd) { CERROR("empty cleanup\n"); EXIT; @@ -861,22 +886,22 @@ void class_manual_cleanup(struct obd_device *obd) if (obd->obd_fail) strcat(flags, "A"); - CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n", + CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n", obd->obd_name, flags); lustre_cfg_bufs_reset(&bufs, obd->obd_name); lustre_cfg_bufs_set_string(&bufs, 1, flags); lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); - + err = class_process_config(lcfg); - if (err) + if (err) CERROR("cleanup failed %d: %s\n", err, obd->obd_name); - + /* the lcfg is almost the same for both ops */ lcfg->lcfg_command = LCFG_DETACH; err = class_process_config(lcfg); lustre_cfg_free(lcfg); - if (err) + if (err) CERROR("detach failed %d: %s\n", err, obd->obd_name); EXIT; } diff --git a/lustre/obdclass/prng.c b/lustre/obdclass/prng.c new file mode 100644 index 0000000..909e311 --- /dev/null +++ b/lustre/obdclass/prng.c @@ -0,0 +1,69 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * concatenation of following two 16-bit multiply with carry generators + * x(n)=a*x(n-1)+carry mod 2^16 and y(n)=b*y(n-1)+carry mod 2^16, + * number and carry packed within the same 32 bit integer. 
+ * algorithm recommended by Marsaglia + ******************************************************************/ +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +/* +From: George Marsaglia +Newsgroups: sci.math +Subject: Re: A RANDOM NUMBER GENERATOR FOR C +Date: Tue, 30 Sep 1997 05:29:35 -0700 + + * You may replace the two constants 36969 and 18000 by any + * pair of distinct constants from this list: + * 18000 18030 18273 18513 18879 19074 19098 19164 19215 19584 + * 19599 19950 20088 20508 20544 20664 20814 20970 21153 21243 + * 21423 21723 21954 22125 22188 22293 22860 22938 22965 22974 + * 23109 23124 23163 23208 23508 23520 23553 23658 23865 24114 + * 24219 24660 24699 24864 24948 25023 25308 25443 26004 26088 + * 26154 26550 26679 26838 27183 27258 27753 27795 27810 27834 + * 27960 28320 28380 28689 28710 28794 28854 28959 28980 29013 + * 29379 29889 30135 30345 30459 30714 30903 30963 31059 31083 + * (or any other 16-bit constants k for which both k*2^16-1 + * and k*2^15-1 are prime) */ + +#define RANDOM_CONST_A 18030 +#define RANDOM_CONST_B 29013 + +static unsigned int seed_x = 521288629; +static unsigned int seed_y = 362436069; +unsigned int ll_rand(void) +{ + + seed_x = RANDOM_CONST_A * (seed_x & 65535) + (seed_x >> 16); + seed_y = RANDOM_CONST_B * (seed_y & 65535) + (seed_y >> 16); + + return ((seed_x << 16) + (seed_y & 65535)); +} +EXPORT_SYMBOL(ll_rand); + +/* Note that if the input seeds are not completely random, then there is + * a preferred location for the entropy in the two seeds, in order to avoid + * the initial values from the PRNG to be the same each time. 
+ * + * seed1 (seed_x) should have the most entropy in the low bits of the word + * seed2 (seed_y) should have the most entropy in the high bits of the word */ +void ll_srand(unsigned int seed1, unsigned int seed2) +{ + if (seed1) + seed_x = seed1; /* use default seeds if parameter is 0 */ + if (seed2) + seed_y = seed2; +} +EXPORT_SYMBOL(ll_srand); diff --git a/lustre/obdecho/echo.c b/lustre/obdecho/echo.c index 954143f..9a07b6e 100644 --- a/lustre/obdecho/echo.c +++ b/lustre/obdecho/echo.c @@ -130,7 +130,8 @@ int echo_create(struct obd_export *exp, struct obdo *oa, } int echo_destroy(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *ea, struct obd_trans_info *oti) + struct lov_stripe_md *ea, struct obd_trans_info *oti, + struct obd_export *md_exp) { struct obd_device *obd = class_exp2obd(exp); diff --git a/lustre/obdfilter/filter_io.c b/lustre/obdfilter/filter_io.c index e09988e..0987611 100644 --- a/lustre/obdfilter/filter_io.c +++ b/lustre/obdfilter/filter_io.c @@ -115,10 +115,14 @@ static void filter_grant_incoming(struct obd_export *exp, struct obdo *oa) /* Update our accounting now so that statfs takes it into account. * Note that fed_dirty is only approximate and can become incorrect * if RPCs arrive out-of-order. No important calculations depend - * on fed_dirty however. */ + * on fed_dirty however, but we must check sanity to not assert. 
*/ + if ((long long)oa->o_dirty < 0) + oa->o_dirty = 0; + else if (oa->o_dirty > fed->fed_grant + 4 * FILTER_GRANT_CHUNK) + oa->o_dirty = fed->fed_grant + 4 * FILTER_GRANT_CHUNK; obd->u.filter.fo_tot_dirty += oa->o_dirty - fed->fed_dirty; if (fed->fed_grant < oa->o_dropped) { - CERROR("%s: cli %s/%p reports %u dropped > fed_grant %lu\n", + CDEBUG(D_HA,"%s: cli %s/%p reports %u dropped > fedgrant %lu\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, oa->o_dropped, fed->fed_grant); oa->o_dropped = 0; @@ -142,8 +146,6 @@ static void filter_grant_incoming(struct obd_export *exp, struct obdo *oa) EXIT; } -#define GRANT_FOR_LLOG(obd) 16 - /* Figure out how much space is available between what we've granted * and what remains in the filesystem. Compensate for ext3 indirect * block overhead when computing how much free space is left ungranted. @@ -182,7 +184,7 @@ restat: if (left >= tot_granted) { left -= tot_granted; } else { - if (left < tot_granted - obd->u.filter.fo_tot_pending + 65536) { + if (left < tot_granted - obd->u.filter.fo_tot_pending) { CERROR("%s: cli %s/%p grant "LPU64" > available " LPU64" and pending "LPU64"\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, tot_granted, @@ -228,12 +230,16 @@ long filter_grant(struct obd_export *exp, obd_size current_grant, obd->obd_name, exp->exp_client_uuid.uuid, exp, want); } else if (current_grant < want && current_grant < fed->fed_grant + FILTER_GRANT_CHUNK) { - grant = min((want >> blockbits) / 2, + grant = min((want >> blockbits), (fs_space_left >> blockbits) / 8); grant <<= blockbits; if (grant) { - if (grant > FILTER_GRANT_CHUNK) + /* Allow >FILTER_GRANT_CHUNK size when clients + * reconnect due to a server reboot. 
+ */ + if ((grant > FILTER_GRANT_CHUNK) && + (!obd->obd_recovering)) grant = FILTER_GRANT_CHUNK; obd->u.filter.fo_tot_granted += grant; @@ -290,24 +296,21 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, spin_unlock(&obd->obd_osfs_lock); } - push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL); - - iobuf = filter_iobuf_get(oti->oti_thread, &exp->exp_obd->u.filter); + iobuf = filter_iobuf_get(&obd->u.filter, oti); + if (IS_ERR(iobuf)) + RETURN(PTR_ERR(iobuf)); - dentry = filter_oa2dentry_quiet(obd, oa); + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + dentry = filter_oa2dentry(obd, oa); if (IS_ERR(dentry)) { - if (PTR_ERR(dentry) == -ENOENT) { - dentry = NULL; - inode = NULL; - } else { - dentry = NULL; - GOTO(cleanup, rc = PTR_ERR(dentry)); - } - } else { - inode = dentry->d_inode; + rc = PTR_ERR(dentry); + dentry = NULL; + GOTO(cleanup, rc); } - if (oa && inode != NULL) + inode = dentry->d_inode; + + if (oa) obdo_to_inode(inode, oa, OBD_MD_FLATIME); fsfilt_check_slow(now, obd_timeout, "preprw_read setup"); @@ -326,10 +329,9 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, */ LASSERT(lnb->page != NULL); - if (inode == NULL || inode->i_size <= rnb->offset) - /* If there's no more data, or inode is not yet - * allocated by CROW abort early. lnb->rc == 0, so it's - * easy to detect later. */ + if (inode->i_size <= rnb->offset) + /* If there's no more data, abort early. lnb->rc == 0, + * so it's easy to detect later. 
*/ break; else filter_alloc_dio_page(obd, inode, lnb); @@ -346,12 +348,10 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, fsfilt_check_slow(now, obd_timeout, "start_page_read"); - if (inode != NULL) { - rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf, - exp, NULL, NULL, NULL); - if (rc) - GOTO(cleanup, rc); - } + rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf, + exp, NULL, NULL, NULL); + if (rc) + GOTO(cleanup, rc); lprocfs_counter_add(obd->obd_stats, LPROC_FILTER_READ_BYTES, tot_bytes); @@ -367,9 +367,9 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, f_dput(dentry); } - filter_iobuf_put(iobuf); + filter_iobuf_put(&obd->u.filter, iobuf, oti); - pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL); + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); if (rc) CERROR("io error %d\n", rc); @@ -442,7 +442,7 @@ static int filter_grant_check(struct obd_export *exp, int objcount, * marked BRW_GRANTED are already mapped and we can * ignore this error. 
*/ lnb[n].rc = -ENOSPC; - rnb[n].flags &= OBD_BRW_GRANTED; + rnb[n].flags &= ~OBD_BRW_GRANTED; CDEBUG(D_CACHE,"%s: cli %s/%p idx %d no space for %d\n", exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, n, bytes); @@ -519,19 +519,24 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, LASSERT(objcount == 1); LASSERT(obj->ioo_bufcnt > 0); - OBD_RACE(OBD_FAIL_OST_CLEAR_ORPHANS_RACE); - - iobuf = filter_iobuf_get(oti->oti_thread, &exp->exp_obd->u.filter); - cleanup_phase = 1; - push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL); + iobuf = filter_iobuf_get(&exp->exp_obd->u.filter, oti); + if (IS_ERR(iobuf)) + GOTO(cleanup, rc = PTR_ERR(iobuf)); + cleanup_phase = 1; - /* make sure that object is already allocated */ - dentry = filter_crow_object(exp->exp_obd, oa); + dentry = filter_fid2dentry(exp->exp_obd, NULL, obj->ioo_gr, + obj->ioo_id); if (IS_ERR(dentry)) GOTO(cleanup, rc = PTR_ERR(dentry)); cleanup_phase = 2; + if (dentry->d_inode == NULL) { + CERROR("%s: trying to BRW to non-existent file "LPU64"\n", + exp->exp_obd->obd_name, obj->ioo_id); + GOTO(cleanup, rc = -ENOENT); + } + fso.fso_dentry = dentry; fso.fso_bufcnt = obj->ioo_bufcnt; @@ -550,13 +555,12 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, rc = filter_grant_check(exp, objcount, &fso, niocount, nb, res, &left, dentry->d_inode); - /* We're finishing using body->oa as an input variable, so reset - * o_valid here. */ + /* do not zero out oa->o_valid as it is used in filter_commitrw_write() + * for setting UID/GID and fid EA in first write time. 
*/ if (oa && oa->o_valid & OBD_MD_FLGRANT) { oa->o_grant = filter_grant(exp,oa->o_grant,oa->o_undirty,left); - oa->o_valid = OBD_MD_FLGRANT; - } else if (oa) - oa->o_valid = 0; + oa->o_valid |= OBD_MD_FLGRANT; + } spin_unlock(&exp->exp_obd->obd_osfs_lock); @@ -636,19 +640,20 @@ cleanup: switch(cleanup_phase) { case 4: case 3: - filter_iobuf_put(iobuf); + filter_iobuf_put(&exp->exp_obd->u.filter, iobuf, oti); case 2: pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL); - if (rc && dentry && !IS_ERR(dentry)) + if (rc) f_dput(dentry); break; case 1: + filter_iobuf_put(&exp->exp_obd->u.filter, iobuf, oti); + case 0: spin_lock(&exp->exp_obd->obd_osfs_lock); if (oa) filter_grant_incoming(exp, oa); spin_unlock(&exp->exp_obd->obd_osfs_lock); pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL); - filter_iobuf_put(iobuf); break; default:; } @@ -694,8 +699,23 @@ static int filter_commitrw_read(struct obd_export *exp, struct obdo *oa, struct obd_trans_info *oti, int rc) { struct inode *inode = NULL; + struct ldlm_res_id res_id = { .name = { obj->ioo_id } }; + struct ldlm_resource *resource = NULL; + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; ENTRY; + /* If oa != NULL then filter_preprw_read updated the inode atime + * and we should update the lvb so that other glimpses will also + * get the updated value. 
bug 5972 */ + if (oa && ns && ns->ns_lvbo && ns->ns_lvbo->lvbo_update) { + resource = ldlm_resource_get(ns, NULL, res_id, LDLM_EXTENT, 0); + + if (resource != NULL) { + ns->ns_lvbo->lvbo_update(resource, NULL, 0, 1); + ldlm_resource_putref(resource); + } + } + if (res->dentry != NULL) inode = res->dentry->d_inode; @@ -809,6 +829,7 @@ int filter_brw(int cmd, struct obd_export *exp, struct obdo *oa, GOTO(out, ret = -ENOMEM); for (i = 0; i < oa_bufs; i++) { + lnb[i].page = pga[i].pg; rnb[i].offset = pga[i].off; rnb[i].len = pga[i].count; } @@ -820,29 +841,6 @@ int filter_brw(int cmd, struct obd_export *exp, struct obdo *oa, if (ret != 0) GOTO(out, ret); - for (i = 0; i < oa_bufs; i++) { - void *virt; - obd_off off; - void *addr; - - if (lnb[i].page == NULL) - break; - - off = pga[i].off & ~PAGE_MASK; - virt = kmap(pga[i].pg); - addr = kmap(lnb[i].page); - - /* 2 kmaps == vanishingly small deadlock opportunity */ - - if (cmd & OBD_BRW_WRITE) - memcpy(addr + off, virt + off, pga[i].count); - else - memcpy(virt + off, addr + off, pga[i].count); - - kunmap(lnb[i].page); - kunmap(pga[i].pg); - } - ret = filter_commitrw(cmd, exp, oa, 1, &ioo, oa_bufs, lnb, oti, ret); out: diff --git a/lustre/obdfilter/filter_io_24.c b/lustre/obdfilter/filter_io_24.c index 54a2960..d369be3 100644 --- a/lustre/obdfilter/filter_io_24.c +++ b/lustre/obdfilter/filter_io_24.c @@ -149,13 +149,13 @@ static int filter_clear_page_cache(struct inode *inode, struct kiobuf *iobuf) } /* Must be called with i_sem taken for writes; this will drop it */ -int filter_direct_io(int rw, struct dentry *dchild, void *buf, +int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *buf, struct obd_export *exp, struct iattr *attr, struct obd_trans_info *oti, void **wait_handle) { struct obd_device *obd = exp->exp_obd; struct inode *inode = dchild->d_inode; - struct kiobuf *iobuf = buf; + struct kiobuf *iobuf = (void *)buf; int rc, create = (rw == OBD_BRW_WRITE), committed = 0; int blocks_per_page = 
PAGE_SIZE >> inode->i_blkbits, cleanup_phase = 0; struct semaphore *sem = NULL; @@ -296,50 +296,60 @@ static void clear_kiobuf(struct kiobuf *iobuf) iobuf->length = 0; } -void filter_iobuf_put(void *iobuf) +struct filter_iobuf *filter_alloc_iobuf(struct filter_obd *filter, + int rw, int num_pages) { - clear_kiobuf(iobuf); -} - -int filter_alloc_iobuf(struct filter_obd *filter, int rw, int num_pages, - void **ret) -{ - int rc; struct kiobuf *iobuf; + int rc; ENTRY; LASSERTF(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ, "%x\n", rw); rc = alloc_kiovec(1, &iobuf); if (rc) - RETURN(rc); + RETURN(ERR_PTR(rc)); rc = expand_kiobuf(iobuf, num_pages); if (rc) { free_kiovec(1, &iobuf); - RETURN(rc); + RETURN(ERR_PTR(rc)); } #ifdef HAVE_KIOBUF_DOVARY iobuf->dovary = 0; /* this prevents corruption, not present in 2.4.20 */ #endif clear_kiobuf(iobuf); - *ret = iobuf; - RETURN(0); + RETURN((void *)iobuf); } -void filter_free_iobuf(void *buf) +void filter_free_iobuf(struct filter_iobuf *buf) { - struct kiobuf *iobuf = buf; + struct kiobuf *iobuf = (void *)buf; clear_kiobuf(iobuf); free_kiovec(1, &iobuf); } -int filter_iobuf_add_page(struct obd_device *obd, void *buf, +void filter_iobuf_put(struct filter_obd *filter, struct filter_iobuf *iobuf, + struct obd_trans_info *oti) +{ + int thread_id = oti ? 
oti->oti_thread_id : -1; + + if (unlikely(thread_id < 0)) { + filter_free_iobuf(iobuf); + return; + } + + LASSERTF(filter->fo_iobuf_pool[thread_id] == iobuf, + "iobuf mismatch for thread %d: pool %p iobuf %p\n", + thread_id, filter->fo_iobuf_pool[thread_id], iobuf); + clear_kiobuf((void *)iobuf); +} + +int filter_iobuf_add_page(struct obd_device *obd, struct filter_iobuf *buf, struct inode *inode, struct page *page) { - struct kiobuf *iobuf = buf; + struct kiobuf *iobuf = (void *)buf; iobuf->maplist[iobuf->nr_pages++] = page; iobuf->length += PAGE_SIZE; @@ -370,7 +380,9 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount, if (rc != 0) GOTO(cleanup, rc); - iobuf = filter_iobuf_get(oti->oti_thread, &exp->exp_obd->u.filter); + iobuf = filter_iobuf_get(&obd->u.filter, oti); + if (IS_ERR(iobuf)) + GOTO(cleanup, rc = PTR_ERR(iobuf)); cleanup_phase = 1; fso.fso_dentry = res->dentry; @@ -414,7 +426,34 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount, fsfilt_check_slow(now, obd_timeout, "brw_start"); - iattr_from_obdo(&iattr,oa,OBD_MD_FLATIME|OBD_MD_FLMTIME|OBD_MD_FLCTIME); + i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME; + + /* If the inode still has SUID+SGID bits set (see filter_precreate()) + * then we will accept the UID+GID if sent by the client for + * initializing the ownership of this inode. We only allow this to + * happen once (so clear these bits) and later only allow setattr. 
*/ + if (inode->i_mode & S_ISUID) + i |= OBD_MD_FLUID; + if (inode->i_mode & S_ISGID) + i |= OBD_MD_FLGID; + + iattr_from_obdo(&iattr, oa, i); + if (iattr.ia_valid & (ATTR_UID | ATTR_GID)) { + CDEBUG(D_INODE, "update UID/GID to %lu/%lu\n", + (unsigned long)oa->o_uid, (unsigned long)oa->o_gid); + + cap_raise(current->cap_effective, CAP_SYS_RESOURCE); + + iattr.ia_valid |= ATTR_MODE; + iattr.ia_mode = inode->i_mode; + if (iattr.ia_valid & ATTR_UID) + iattr.ia_mode &= ~S_ISUID; + if (iattr.ia_valid & ATTR_GID) + iattr.ia_mode &= ~S_ISGID; + + rc = filter_update_fidea(exp, inode, oti->oti_handle, oa); + } + /* filter_direct_io drops i_sem */ rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, iobuf, exp, &iattr, oti, &wait_handle); @@ -442,7 +481,7 @@ cleanup: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); LASSERT(current->journal_info == NULL); case 1: - filter_iobuf_put(iobuf); + filter_iobuf_put(&obd->u.filter, iobuf, oti); case 0: /* * lnb->page automatically returns back into per-thread page diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c index 679e06d..b96eebb 100644 --- a/lustre/obdfilter/filter_io_26.c +++ b/lustre/obdfilter/filter_io_26.c @@ -42,9 +42,8 @@ /* 512byte block min */ #define MAX_BLOCKS_PER_PAGE (PAGE_SIZE / 512) -struct dio_request { +struct filter_iobuf { atomic_t dr_numreqs; /* number of reqs being processed */ - struct bio *dr_bios; /* list of completed bios */ wait_queue_head_t dr_wait; int dr_max_pages; int dr_npages; @@ -57,12 +56,12 @@ struct dio_request { struct filter_obd *dr_filter; }; -static void record_start_io(struct dio_request *dreq, int rw, int size) +static void record_start_io(struct filter_iobuf *iobuf, int rw, int size) { - struct filter_obd *filter = dreq->dr_filter; + struct filter_obd *filter = iobuf->dr_filter; unsigned long flags; - atomic_inc(&dreq->dr_numreqs); + atomic_inc(&iobuf->dr_numreqs); if (rw == OBD_BRW_READ) { lprocfs_oh_tally(&filter->fo_read_rpc_hist, @@ -79,12 +78,12 @@ static 
void record_start_io(struct dio_request *dreq, int rw, int size) else filter->fo_w_in_flight++; spin_unlock_irqrestore(&filter->fo_stats_lock, flags); - dreq->dr_start_time = jiffies; + iobuf->dr_start_time = jiffies; } -static void record_finish_io(struct dio_request *dreq, int rw, int rc) +static void record_finish_io(struct filter_iobuf *iobuf, int rw, int rc) { - struct filter_obd *filter = dreq->dr_filter; + struct filter_obd *filter = iobuf->dr_filter; unsigned long flags, stop_time = jiffies; spin_lock_irqsave(&filter->fo_stats_lock, flags); @@ -94,24 +93,24 @@ static void record_finish_io(struct dio_request *dreq, int rw, int rc) filter->fo_w_in_flight--; spin_unlock_irqrestore(&filter->fo_stats_lock, flags); - if (atomic_dec_and_test(&dreq->dr_numreqs)) - wake_up(&dreq->dr_wait); + if (atomic_dec_and_test(&iobuf->dr_numreqs)) + wake_up(&iobuf->dr_wait); if (rc != 0) return; if (rw == OBD_BRW_READ) { lprocfs_oh_tally_log2(&filter->fo_r_io_time, - stop_time - dreq->dr_start_time); + stop_time - iobuf->dr_start_time); } else { lprocfs_oh_tally_log2(&filter->fo_w_io_time, - stop_time - dreq->dr_start_time); + stop_time - iobuf->dr_start_time); } } static int dio_complete_routine(struct bio *bio, unsigned int done, int error) { - struct dio_request *dreq = bio->bi_private; + struct filter_iobuf *iobuf = bio->bi_private; unsigned long flags; if (bio->bi_size) { @@ -120,7 +119,7 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error) return 1; } - if (dreq == NULL) { + if (iobuf == NULL) { CERROR("***** bio->bi_private is NULL! This should never " "happen. Normally, I would crash here, but instead I " "will dump the bio contents to the console. 
Please " @@ -138,16 +137,20 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error) return 0; } - spin_lock_irqsave(&dreq->dr_lock, flags); - bio->bi_private = dreq->dr_bios; - dreq->dr_bios = bio; - if (dreq->dr_error == 0) - dreq->dr_error = error; - spin_unlock_irqrestore(&dreq->dr_lock, flags); + spin_lock_irqsave(&iobuf->dr_lock, flags); + if (iobuf->dr_error == 0) + iobuf->dr_error = error; + spin_unlock_irqrestore(&iobuf->dr_lock, flags); - record_finish_io(dreq, test_bit(BIO_RW, &bio->bi_rw) ? + record_finish_io(iobuf, test_bit(BIO_RW, &bio->bi_rw) ? OBD_BRW_WRITE : OBD_BRW_READ, error); + /* Completed bios used to be chained off iobuf->dr_bios and freed in + * filter_clear_dreq(). It was then possible to exhaust the biovec-256 + * mempool when serious on-disk fragmentation was encountered, + * deadlocking the OST. The bios are now released as soon as complete + * so the pool cannot be exhausted while IOs are competing. bug 10076 */ + bio_put(bio); return 0; } @@ -162,92 +165,95 @@ static int can_be_merged(struct bio *bio, sector_t sector) return bio->bi_sector + size == sector ? 
1 : 0; } -int filter_alloc_iobuf(struct filter_obd *filter, int rw, int num_pages, - void **ret) +struct filter_iobuf *filter_alloc_iobuf(struct filter_obd *filter, + int rw, int num_pages) { - struct dio_request *dreq; + struct filter_iobuf *iobuf; LASSERTF(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ, "%x\n", rw); - OBD_ALLOC(dreq, sizeof(*dreq)); - if (dreq == NULL) + OBD_ALLOC(iobuf, sizeof(*iobuf)); + if (iobuf == NULL) goto failed_0; - OBD_ALLOC(dreq->dr_pages, num_pages * sizeof(*dreq->dr_pages)); - if (dreq->dr_pages == NULL) + OBD_ALLOC(iobuf->dr_pages, num_pages * sizeof(*iobuf->dr_pages)); + if (iobuf->dr_pages == NULL) goto failed_1; - OBD_ALLOC(dreq->dr_blocks, - MAX_BLOCKS_PER_PAGE * num_pages * sizeof(*dreq->dr_blocks)); - if (dreq->dr_blocks == NULL) + OBD_ALLOC(iobuf->dr_blocks, + MAX_BLOCKS_PER_PAGE * num_pages * sizeof(*iobuf->dr_blocks)); + if (iobuf->dr_blocks == NULL) goto failed_2; - dreq->dr_filter = filter; - dreq->dr_bios = NULL; - init_waitqueue_head(&dreq->dr_wait); - atomic_set(&dreq->dr_numreqs, 0); - spin_lock_init(&dreq->dr_lock); - dreq->dr_max_pages = num_pages; - dreq->dr_npages = 0; - - *ret = dreq; - RETURN(0); - + iobuf->dr_filter = filter; + init_waitqueue_head(&iobuf->dr_wait); + atomic_set(&iobuf->dr_numreqs, 0); + spin_lock_init(&iobuf->dr_lock); + iobuf->dr_max_pages = num_pages; + iobuf->dr_npages = 0; + + RETURN(iobuf); + failed_2: - OBD_FREE(dreq->dr_pages, - num_pages * sizeof(*dreq->dr_pages)); + OBD_FREE(iobuf->dr_pages, + num_pages * sizeof(*iobuf->dr_pages)); failed_1: - OBD_FREE(dreq, sizeof(*dreq)); + OBD_FREE(iobuf, sizeof(*iobuf)); failed_0: - RETURN(-ENOMEM); + RETURN(ERR_PTR(-ENOMEM)); } -void filter_iobuf_put(void *iobuf) +static void filter_clear_iobuf(struct filter_iobuf *iobuf) { - struct dio_request *dreq = iobuf; + iobuf->dr_npages = 0; + atomic_set(&iobuf->dr_numreqs, 0); +} - /* free all bios */ - while (dreq->dr_bios) { - struct bio *bio = dreq->dr_bios; - dreq->dr_bios = bio->bi_private; - 
bio_put(bio); - } - dreq->dr_npages = 0; - atomic_set(&dreq->dr_numreqs, 0); +void filter_free_iobuf(struct filter_iobuf *iobuf) +{ + int num_pages = iobuf->dr_max_pages; + + filter_clear_iobuf(iobuf); + + OBD_FREE(iobuf->dr_blocks, + MAX_BLOCKS_PER_PAGE * num_pages * sizeof(*iobuf->dr_blocks)); + OBD_FREE(iobuf->dr_pages, + num_pages * sizeof(*iobuf->dr_pages)); + OBD_FREE_PTR(iobuf); } -void filter_free_iobuf(void *iobuf) +void filter_iobuf_put(struct filter_obd *filter, struct filter_iobuf *iobuf, + struct obd_trans_info *oti) { - struct dio_request *dreq = iobuf; - int num_pages = dreq->dr_max_pages; + int thread_id = oti ? oti->oti_thread_id : -1; - filter_iobuf_put(dreq); + if (unlikely(thread_id < 0)) { + filter_free_iobuf(iobuf); + return; + } - OBD_FREE(dreq->dr_blocks, - MAX_BLOCKS_PER_PAGE * num_pages * sizeof(*dreq->dr_blocks)); - OBD_FREE(dreq->dr_pages, - num_pages * sizeof(*dreq->dr_pages)); - OBD_FREE_PTR(dreq); + LASSERTF(filter->fo_iobuf_pool[thread_id] == iobuf, + "iobuf mismatch for thread %d: pool %p iobuf %p\n", + thread_id, filter->fo_iobuf_pool[thread_id], iobuf); + filter_clear_iobuf(iobuf); } -int filter_iobuf_add_page(struct obd_device *obd, void *iobuf, +int filter_iobuf_add_page(struct obd_device *obd, struct filter_iobuf *iobuf, struct inode *inode, struct page *page) { - struct dio_request *dreq = iobuf; - - LASSERT (dreq->dr_npages < dreq->dr_max_pages); - dreq->dr_pages[dreq->dr_npages++] = page; + LASSERT(iobuf->dr_npages < iobuf->dr_max_pages); + iobuf->dr_pages[iobuf->dr_npages++] = page; return 0; } int filter_do_bio(struct obd_device *obd, struct inode *inode, - struct dio_request *dreq, int rw) + struct filter_iobuf *iobuf, int rw) { int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; - struct page **pages = dreq->dr_pages; - int npages = dreq->dr_npages; - unsigned long *blocks = dreq->dr_blocks; + struct page **pages = iobuf->dr_pages; + int npages = iobuf->dr_npages; + unsigned long *blocks = iobuf->dr_blocks; int 
total_blocks = npages * blocks_per_page; int sector_bits = inode->i_sb->s_blocksize_bits - 9; unsigned int blocksize = inode->i_sb->s_blocksize; @@ -262,17 +268,17 @@ int filter_do_bio(struct obd_device *obd, struct inode *inode, int rc = 0; ENTRY; - LASSERT(dreq->dr_npages == npages); + LASSERT(iobuf->dr_npages == npages); LASSERT(total_blocks <= OBDFILTER_CREATED_SCRATCHPAD_ENTRIES); - for (page_idx = 0, block_idx = 0; - page_idx < npages; + for (page_idx = 0, block_idx = 0; + page_idx < npages; page_idx++, block_idx += blocks_per_page) { - + page = pages[page_idx]; LASSERT (block_idx + blocks_per_page <= total_blocks); - for (i = 0, page_offset = 0; + for (i = 0, page_offset = 0; i < blocks_per_page; i += nblocks, page_offset += blocksize * nblocks) { @@ -295,7 +301,7 @@ int filter_do_bio(struct obd_device *obd, struct inode *inode, if (bio != NULL && can_be_merged(bio, sector) && - bio_add_page(bio, page, + bio_add_page(bio, page, blocksize * nblocks, page_offset) != 0) continue; /* added this frag OK */ @@ -306,25 +312,25 @@ int filter_do_bio(struct obd_device *obd, struct inode *inode, /* Dang! 
I have to fragment this I/O */ CDEBUG(D_INODE, "bio++ sz %d vcnt %d(%d) " "sectors %d(%d) psg %d(%d) hsg %d(%d)\n", - bio->bi_size, + bio->bi_size, bio->bi_vcnt, bio->bi_max_vecs, bio->bi_size >> 9, q->max_sectors, - bio_phys_segments(q, bio), + bio_phys_segments(q, bio), q->max_phys_segments, - bio_hw_segments(q, bio), + bio_hw_segments(q, bio), q->max_hw_segments); - record_start_io(dreq, rw, bio->bi_size); + record_start_io(iobuf, rw, bio->bi_size); rc = fsfilt_send_bio(rw, obd, inode, bio); if (rc < 0) { CERROR("Can't send bio: %d\n", rc); - record_finish_io(dreq, rw, rc); + record_finish_io(iobuf, rw, rc); goto out; } } /* allocate new bio */ - bio = bio_alloc(GFP_NOIO, + bio = bio_alloc(GFP_NOIO, (npages - page_idx) * blocks_per_page); if (bio == NULL) { CERROR ("Can't allocate bio\n"); @@ -335,30 +341,30 @@ int filter_do_bio(struct obd_device *obd, struct inode *inode, bio->bi_bdev = inode->i_sb->s_bdev; bio->bi_sector = sector; bio->bi_end_io = dio_complete_routine; - bio->bi_private = dreq; + bio->bi_private = iobuf; - rc = bio_add_page(bio, page, + rc = bio_add_page(bio, page, blocksize * nblocks, page_offset); LASSERT (rc != 0); } } if (bio != NULL) { - record_start_io(dreq, rw, bio->bi_size); + record_start_io(iobuf, rw, bio->bi_size); rc = fsfilt_send_bio(rw, obd, inode, bio); if (rc >= 0) { rc = 0; } else { CERROR("Can't send bio: %d\n", rc); - record_finish_io(dreq, rw, rc); + record_finish_io(iobuf, rw, rc); } } out: - wait_event(dreq->dr_wait, atomic_read(&dreq->dr_numreqs) == 0); + wait_event(iobuf->dr_wait, atomic_read(&iobuf->dr_numreqs) == 0); if (rc == 0) - rc = dreq->dr_error; + rc = iobuf->dr_error; RETURN(rc); } @@ -375,7 +381,7 @@ int filter_do_bio(struct obd_device *obd, struct inode *inode, * not be dirty, because we already called fdatasync/fdatawait on them. 
*/ static int filter_clear_page_cache(struct inode *inode, - struct dio_request *iobuf) + struct filter_iobuf *iobuf) { struct page *page; int i, rc, rc2; @@ -414,38 +420,37 @@ static int filter_clear_page_cache(struct inode *inode, } /* Must be called with i_sem taken for writes; this will drop it */ -int filter_direct_io(int rw, struct dentry *dchild, void *iobuf, +int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf, struct obd_export *exp, struct iattr *attr, struct obd_trans_info *oti, void **wait_handle) { struct obd_device *obd = exp->exp_obd; - struct dio_request *dreq = iobuf; struct inode *inode = dchild->d_inode; int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; int rc, rc2, create; struct semaphore *sem; ENTRY; - LASSERTF(dreq->dr_npages <= dreq->dr_max_pages, "%d,%d\n", - dreq->dr_npages, dreq->dr_max_pages); - LASSERT(dreq->dr_npages <= OBDFILTER_CREATED_SCRATCHPAD_ENTRIES); + LASSERTF(iobuf->dr_npages <= iobuf->dr_max_pages, "%d,%d\n", + iobuf->dr_npages, iobuf->dr_max_pages); + LASSERT(iobuf->dr_npages <= OBDFILTER_CREATED_SCRATCHPAD_ENTRIES); if (rw == OBD_BRW_READ) { - if (dreq->dr_npages == 0) + if (iobuf->dr_npages == 0) RETURN(0); create = 0; sem = NULL; } else { LASSERTF(rw == OBD_BRW_WRITE, "%x\n", rw); - LASSERT(dreq->dr_npages > 0); + LASSERT(iobuf->dr_npages > 0); create = 1; sem = &obd->u.filter.fo_alloc_lock; - - lquota_enforce(quota_interface, obd, dreq->dr_ignore_quota); + + lquota_enforce(quota_interface, obd, iobuf->dr_ignore_quota); } remap: - rc = fsfilt_map_inode_pages(obd, inode, dreq->dr_pages, - dreq->dr_npages, dreq->dr_blocks, + rc = fsfilt_map_inode_pages(obd, inode, iobuf->dr_pages, + iobuf->dr_npages, iobuf->dr_blocks, obdfilter_created_scratchpad, create, sem); if (rc == -EDQUOT) { @@ -464,9 +469,9 @@ remap: if (rw == OBD_BRW_WRITE) { if (rc == 0) { filter_tally_write(&obd->u.filter, - dreq->dr_pages, - dreq->dr_npages, - dreq->dr_blocks, + iobuf->dr_pages, + iobuf->dr_npages, + 
iobuf->dr_blocks, blocks_per_page); if (attr->ia_size > inode->i_size) attr->ia_valid |= ATTR_SIZE; @@ -490,11 +495,11 @@ remap: RETURN(rc); } - rc = filter_clear_page_cache(inode, dreq); + rc = filter_clear_page_cache(inode, iobuf); if (rc != 0) RETURN(rc); - RETURN(filter_do_bio(obd, inode, dreq, rw)); + RETURN(filter_do_bio(obd, inode, iobuf, rw)); } /* See if there are unallocated parts in given file region */ @@ -524,7 +529,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int rc) { struct niobuf_local *lnb; - struct dio_request *dreq = NULL; + struct filter_iobuf *iobuf = NULL; struct lvfs_run_ctxt saved; struct fsfilt_objinfo fso; struct iattr iattr = { 0 }; @@ -544,14 +549,16 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, if (rc != 0) GOTO(cleanup, rc); - dreq = filter_iobuf_get(oti->oti_thread, &exp->exp_obd->u.filter); + iobuf = filter_iobuf_get(&obd->u.filter, oti); + if (IS_ERR(iobuf)) + GOTO(cleanup, rc = PTR_ERR(iobuf)); cleanup_phase = 1; fso.fso_dentry = res->dentry; fso.fso_bufcnt = obj->ioo_bufcnt; inode = res->dentry->d_inode; - dreq->dr_ignore_quota = 0; + iobuf->dr_ignore_quota = 0; for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) { loff_t this_size; @@ -565,7 +572,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, continue; } - err = filter_iobuf_add_page(obd, dreq, inode, lnb->page); + err = filter_iobuf_add_page(obd, iobuf, inode, lnb->page); LASSERT (err == 0); total_size += lnb->len; @@ -575,12 +582,12 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, this_size = lnb->offset + lnb->len; if (this_size > iattr.ia_size) iattr.ia_size = this_size; - + /* if one page is a write-back page from client cache, or it's - * written by root, then mark the whole io request as ignore + * written by root, then mark the whole io request as ignore * quota request */ if (lnb->flags & (OBD_BRW_FROM_GRANT | OBD_BRW_NOQUOTA)) - dreq->dr_ignore_quota = 1; + 
iobuf->dr_ignore_quota = 1; } push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); @@ -602,9 +609,36 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, fsfilt_check_slow(now, obd_timeout, "brw_start"); - iattr_from_obdo(&iattr,oa,OBD_MD_FLATIME|OBD_MD_FLMTIME|OBD_MD_FLCTIME); + i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME; + + /* If the inode still has SUID+SGID bits set (see filter_precreate()) + * then we will accept the UID+GID if sent by the client for + * initializing the ownership of this inode. We only allow this to + * happen once (so clear these bits) and later only allow setattr. */ + if (inode->i_mode & S_ISUID) + i |= OBD_MD_FLUID; + if (inode->i_mode & S_ISGID) + i |= OBD_MD_FLGID; + + iattr_from_obdo(&iattr, oa, i); + if (iattr.ia_valid & (ATTR_UID | ATTR_GID)) { + CDEBUG(D_INODE, "update UID/GID to %lu/%lu\n", + (unsigned long)oa->o_uid, (unsigned long)oa->o_gid); + + cap_raise(current->cap_effective, CAP_SYS_RESOURCE); + + iattr.ia_valid |= ATTR_MODE; + iattr.ia_mode = inode->i_mode; + if (iattr.ia_valid & ATTR_UID) + iattr.ia_mode &= ~S_ISUID; + if (iattr.ia_valid & ATTR_GID) + iattr.ia_mode &= ~S_ISGID; + + rc = filter_update_fidea(exp, inode, oti->oti_handle, oa); + } + /* filter_direct_io drops i_sem */ - rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, dreq, exp, &iattr, + rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, iobuf, exp, &iattr, oti, &wait_handle); if (rc == 0) obdo_from_inode(oa, inode, @@ -635,7 +669,7 @@ cleanup: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); LASSERT(current->journal_info == NULL); case 1: - filter_iobuf_put(dreq); + filter_iobuf_put(&obd->u.filter, iobuf, oti); case 0: /* * lnb->page automatically returns back into per-thread page diff --git a/lustre/obdfilter/filter_log.c b/lustre/obdfilter/filter_log.c index 1eaa1ce..4a797c9 100644 --- a/lustre/obdfilter/filter_log.c +++ b/lustre/obdfilter/filter_log.c @@ -40,10 +40,10 @@ #include "filter_internal.h" -int filter_log_sz_change(struct 
llog_handle *cathandle, +int filter_log_sz_change(struct llog_handle *cathandle, struct ll_fid *mds_fid, __u32 io_epoch, - struct llog_cookie *logcookie, + struct llog_cookie *logcookie, struct inode *inode) { struct llog_size_change_rec *lsc; @@ -53,10 +53,10 @@ int filter_log_sz_change(struct llog_handle *cathandle, down(&inode->i_sem); ofd = inode->i_filterdata; - + if (ofd && ofd->ofd_epoch >= io_epoch) { if (ofd->ofd_epoch > io_epoch) - CERROR("client sent old epoch %d for obj ino %ld\n", + CERROR("client sent old epoch %d for obj ino %ld\n", io_epoch, inode->i_ino); up(&inode->i_sem); RETURN(0); @@ -139,7 +139,7 @@ static int filter_recov_log_unlink_cb(struct llog_ctxt *ctxt, memcpy(obdo_logcookie(oa), cookie, sizeof(*cookie)); oid = oa->o_id; - rc = filter_destroy(exp, oa, NULL, NULL); + rc = filter_destroy(exp, oa, NULL, NULL, NULL); obdo_free(oa); if (rc == -ENOENT) { CDEBUG(D_HA, "object already removed, send cookie\n"); @@ -206,12 +206,6 @@ int filter_recov_log_mds_ost_cb(struct llog_handle *llh, CERROR("log is not plain\n"); RETURN(-EINVAL); } - if (rec->lrh_type != MDS_UNLINK_REC && - rec->lrh_type != MDS_SETATTR_REC && - rec->lrh_type != LLOG_GEN_REC) { - CERROR("log record type error\n"); - RETURN(-EINVAL); - } cookie.lgc_lgl = llh->lgh_id; cookie.lgc_subsys = LLOG_MDS_OST_ORIG_CTXT; @@ -234,7 +228,10 @@ int filter_recov_log_mds_ost_cb(struct llog_handle *llh, llog_cancel(ctxt, NULL, 1, &cookie, 0); RETURN(rc); } + break; default: + CERROR("log record type %08x unknown\n", rec->lrh_type); + RETURN(-EINVAL); break; } diff --git a/lustre/obdfilter/filter_lvb.c b/lustre/obdfilter/filter_lvb.c index 86b38f5..c95d295 100644 --- a/lustre/obdfilter/filter_lvb.c +++ b/lustre/obdfilter/filter_lvb.c @@ -43,7 +43,6 @@ static int filter_lvbo_init(struct ldlm_resource *res) { struct ost_lvb *lvb = NULL; - struct filter_obd *filter; struct obd_device *obd; struct dentry *dentry; int rc = 0; @@ -68,35 +67,32 @@ static int filter_lvbo_init(struct ldlm_resource *res) 
res->lr_lvb_len = sizeof(*lvb); obd = res->lr_namespace->ns_lvbp; - filter = &obd->u.filter; LASSERT(obd != NULL); dentry = filter_fid2dentry(obd, NULL, 0, res->lr_name.name[0]); - if (IS_ERR(dentry)) - RETURN(PTR_ERR(dentry)); + if (IS_ERR(dentry)) { + rc = PTR_ERR(dentry); + CERROR("%s: bad object "LPU64"/"LPU64": rc %d\n", obd->obd_name, + res->lr_name.name[0], res->lr_name.name[1], rc); + RETURN(rc); + } - if (dentry->d_inode == NULL) { - lvb->lvb_size = 0; - lvb->lvb_blocks = 0; + if (dentry->d_inode == NULL) + GOTO(out_dentry, rc = -ENOENT); - /* making client use MDS mtime as this one is zero, bigger one - * will be taken and this does not break POSIX */ - lvb->lvb_mtime = 0; - } else { - lvb->lvb_size = dentry->d_inode->i_size; - lvb->lvb_mtime = LTIME_S(dentry->d_inode->i_mtime); - lvb->lvb_blocks = dentry->d_inode->i_blocks; - } + inode_init_lvb(dentry->d_inode, lvb); CDEBUG(D_DLMTRACE, "res: "LPU64" initial lvb size: "LPU64", " "mtime: "LPU64", blocks: "LPU64"\n", res->lr_name.name[0], lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_blocks); + EXIT; +out_dentry: f_dput(dentry); /* Don't free lvb data on lookup error */ - RETURN(rc); + return rc; } /* This will be called in two ways: diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c index 81f04f7..7a5df98 100644 --- a/lustre/osc/lproc_osc.c +++ b/lustre/osc/lproc_osc.c @@ -169,6 +169,65 @@ static int osc_rd_cur_grant_bytes(char *page, char **start, off_t off, return rc; } +static int osc_rd_create_count(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = data; + + if (obd == NULL) + return 0; + + return snprintf(page, count, "%d\n", + obd->u.cli.cl_oscc.oscc_grow_count); +} + +static int osc_wr_create_count(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + int val, rc; + + if (obd == NULL) + return 0; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < 0) 
+ return -ERANGE; + if (val > OST_MAX_PRECREATE) + return -ERANGE; + + obd->u.cli.cl_oscc.oscc_grow_count = val; + + return count; +} + +static int osc_rd_prealloc_next_id(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + + if (obd == NULL) + return 0; + + return snprintf(page, count, LPU64"\n", + obd->u.cli.cl_oscc.oscc_next_id); +} + +static int osc_rd_prealloc_last_id(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + + if (obd == NULL) + return 0; + + return snprintf(page, count, LPU64"\n", + obd->u.cli.cl_oscc.oscc_last_id); +} + static int osc_rd_checksum(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -199,23 +258,10 @@ static int osc_wr_checksum(struct file *file, const char *buffer, return count; } -static int osc_rd_last_id(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct obd_device *obd = (struct obd_device *)data; - struct osc_creator *oscc = &obd->u.cli.cl_oscc; - int rc; - - *eof = 1; - spin_lock(&oscc->oscc_lock); - rc = snprintf(page, count, LPU64"\n", oscc->oscc_next_id); - spin_unlock(&oscc->oscc_lock); - return rc; -} - static struct lprocfs_vars lprocfs_obd_vars[] = { { "uuid", lprocfs_rd_uuid, 0, 0 }, { "ping", 0, lprocfs_wr_ping, 0 }, + { "connect_flags", lprocfs_rd_connect_flags, 0, 0 }, { "blocksize", lprocfs_rd_blksize, 0, 0 }, { "kbytestotal", lprocfs_rd_kbytestotal, 0, 0 }, { "kbytesfree", lprocfs_rd_kbytesfree, 0, 0 }, @@ -229,11 +275,13 @@ static struct lprocfs_vars lprocfs_obd_vars[] = { osc_wr_max_pages_per_rpc, 0 }, { "max_rpcs_in_flight", osc_rd_max_rpcs_in_flight, osc_wr_max_rpcs_in_flight, 0 }, - { "max_dirty_mb", osc_rd_max_dirty_mb, osc_wr_max_dirty_mb, 0 }, + { "max_dirty_mb", osc_rd_max_dirty_mb, osc_wr_max_dirty_mb, 0 }, { "cur_dirty_bytes", osc_rd_cur_dirty_bytes, 0, 0 }, { "cur_grant_bytes", osc_rd_cur_grant_bytes, 0, 0 }, - { "checksums", 
osc_rd_checksum, osc_wr_checksum, 0 }, - { "last_id", osc_rd_last_id, 0, 0 }, + { "create_count", osc_rd_create_count, osc_wr_create_count, 0 }, + { "prealloc_next_id", osc_rd_prealloc_next_id, 0, 0 }, + { "prealloc_last_id", osc_rd_prealloc_last_id, 0, 0 }, + { "checksums", osc_rd_checksum, osc_wr_checksum, 0 }, { 0 } }; diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 9e46442..562eb09 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -51,16 +51,9 @@ #include #include "ost_internal.h" -void oti_init(struct obd_trans_info *oti, struct ptlrpc_request *req) -{ - if (oti == NULL) - return; - memset(oti, 0, sizeof *oti); - - if (req->rq_repmsg && req->rq_reqmsg != 0) - oti->oti_transno = req->rq_repmsg->transno; - oti->oti_thread = req->rq_svc_thread; -} +static int ost_num_threads; +CFS_MODULE_PARM(ost_num_threads, "i", int, 0444, + "number of OST service threads to start"); void oti_to_request(struct obd_trans_info *oti, struct ptlrpc_request *req) { @@ -72,6 +65,7 @@ void oti_to_request(struct obd_trans_info *oti, struct ptlrpc_request *req) if (req->rq_repmsg) req->rq_repmsg->transno = oti->oti_transno; + req->rq_transno = oti->oti_transno; /* XXX 4 == entries in oti_ack_locks??? 
*/ for (ack_lock = oti->oti_ack_locks, i = 0; i < 4; i++, ack_lock++) { @@ -101,7 +95,7 @@ static int ost_destroy(struct obd_export *exp, struct ptlrpc_request *req, oti->oti_logcookies = obdo_logcookie(&body->oa); repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*repbody)); memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); - req->rq_status = obd_destroy(exp, &body->oa, NULL, oti); + req->rq_status = obd_destroy(exp, &body->oa, NULL, oti, NULL); RETURN(0); } @@ -138,6 +132,8 @@ static int ost_statfs(struct ptlrpc_request *req) osfs = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*osfs)); req->rq_status = obd_statfs(req->rq_export->exp_obd, osfs, jiffies-HZ); + if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_OST_ENOSPC)) + osfs->os_bfree = osfs->os_bavail = 64; if (req->rq_status != 0) CERROR("ost: statfs failed: rc %d\n", req->rq_status); @@ -167,18 +163,91 @@ static int ost_create(struct obd_export *exp, struct ptlrpc_request *req, RETURN(0); } +/* + * Helper function for ost_punch(): if asked by client, acquire [size, EOF] + * lock on the file being truncated. + */ +static int ost_punch_lock_get(struct obd_export *exp, struct obdo *oa, + struct lustre_handle *lh) +{ + int flags; + struct ldlm_res_id res_id = { .name = { oa->o_id } }; + ldlm_policy_data_t policy; + __u64 start; + __u64 finis; + + ENTRY; + + LASSERT(!lustre_handle_is_used(lh)); + + if (!(oa->o_valid & OBD_MD_FLFLAGS) || + !(oa->o_flags & OBD_FL_TRUNCLOCK)) + RETURN(0); + + CDEBUG(D_INODE, "OST-side truncate lock.\n"); + + start = oa->o_size; + finis = start + oa->o_blocks; + + /* + * standard truncate optimization: if file body is completely + * destroyed, don't send data back to the server. + */ + flags = (start == 0) ? LDLM_AST_DISCARD_DATA : 0; + + policy.l_extent.start = start & CFS_PAGE_MASK; + + /* + * If ->o_blocks is EOF it means "lock till the end of the + * file". 
Otherwise, it's size of a hole being punched (in bytes) + */ + if (oa->o_blocks == OBD_OBJECT_EOF || finis < start) + policy.l_extent.end = OBD_OBJECT_EOF; + else + policy.l_extent.end = finis | ~CFS_PAGE_MASK; + + RETURN(ldlm_cli_enqueue(NULL, NULL, exp->exp_obd->obd_namespace, + res_id, LDLM_EXTENT, &policy, LCK_PW, &flags, + ldlm_blocking_ast, ldlm_completion_ast, + ldlm_glimpse_ast, + NULL, NULL, 0, NULL, lh)); +} + +/* + * Helper function for ost_punch(): release lock acquired by + * ost_punch_lock_get(), if any. + */ +static void ost_punch_lock_put(struct obd_export *exp, struct obdo *oa, + struct lustre_handle *lh) +{ + ENTRY; + if (lustre_handle_is_used(lh)) + ldlm_lock_decref(lh, LCK_PW); + EXIT; +} + static int ost_punch(struct obd_export *exp, struct ptlrpc_request *req, struct obd_trans_info *oti) { + struct obdo *oa; struct ost_body *body, *repbody; + struct lustre_handle lh = {0,}; + int rc, size = sizeof(*repbody); + ENTRY; - body = lustre_swab_reqbuf(req, 0, sizeof(*body), lustre_swab_ost_body); + /* + * check that we do support OBD_CONNECT_TRUNCLOCK. 
+ */ + CLASSERT(OST_CONNECT_SUPPORTED & OBD_CONNECT_TRUNCLOCK); + + body = lustre_swab_reqbuf(req, 0, sizeof *body, lustre_swab_ost_body); if (body == NULL) RETURN(-EFAULT); - if ((body->oa.o_valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) != + oa = &body->oa; + if ((oa->o_valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) != (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) RETURN(-EINVAL); @@ -187,10 +256,23 @@ static int ost_punch(struct obd_export *exp, struct ptlrpc_request *req, RETURN(rc); repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*repbody)); - memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); - req->rq_status = obd_punch(exp, &repbody->oa, NULL, repbody->oa.o_size, - repbody->oa.o_blocks, oti); - RETURN(0); + repbody->oa = *oa; + rc = ost_punch_lock_get(exp, oa, &lh); + if (rc == 0) { + if (oa->o_valid & OBD_MD_FLFLAGS && + oa->o_flags == OBD_FL_TRUNCLOCK) + /* + * If OBD_FL_TRUNCLOCK is the only bit set in + * ->o_flags, clear OBD_MD_FLFLAGS to avoid falling + * through filter_setattr() to filter_iocontrol(). 
+ */ + oa->o_valid &= ~OBD_MD_FLFLAGS; + + req->rq_status = obd_punch(exp, oa, NULL, + oa->o_size, oa->o_blocks, oti); + ost_punch_lock_put(exp, oa, &lh); + } + RETURN(rc); } static int ost_sync(struct obd_export *exp, struct ptlrpc_request *req) @@ -461,15 +543,16 @@ static int ost_brw_lock_get(int mode, struct obd_export *exp, ENTRY; LASSERT(mode == LCK_PR || mode == LCK_PW); + LASSERT(!lustre_handle_is_used(lh)); + + if (nrbufs == 0 || !(nb[0].flags & OBD_BRW_SRVLOCK)) + RETURN(0); /* EXPENSIVE ASSERTION */ for (i = 1; i < nrbufs; i ++) LASSERT((nb[0].flags & OBD_BRW_SRVLOCK) == (nb[i].flags & OBD_BRW_SRVLOCK)); - if (nrbufs == 0 || !(nb[0].flags & OBD_BRW_SRVLOCK)) - RETURN(0); - policy.l_extent.start = nb[0].offset & CFS_PAGE_MASK; policy.l_extent.end = (nb[nrbufs - 1].offset + nb[nrbufs - 1].len - 1) | ~CFS_PAGE_MASK; @@ -487,7 +570,9 @@ static void ost_brw_lock_put(int mode, { ENTRY; LASSERT(mode == LCK_PR || mode == LCK_PW); - if (obj->ioo_bufcnt > 0 && niob[0].flags & OBD_BRW_SRVLOCK) + LASSERT((obj->ioo_bufcnt > 0 && (niob[0].flags & OBD_BRW_SRVLOCK)) == + lustre_handle_is_used(lh)); + if (lustre_handle_is_used(lh)) ldlm_lock_decref(lh, mode); EXIT; } @@ -501,7 +586,7 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) struct obd_ioobj *ioo; struct ost_body *body, *repbody; struct l_wait_info lwi; - struct lustre_handle lockh; + struct lustre_handle lockh = {0}; int size[1] = { sizeof(*body) }; int comms_error = 0; int niocount; @@ -613,17 +698,27 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) } } + /* Check if client was evicted while we were doing i/o before touching + network */ if (rc == 0) { - rc = ptlrpc_start_bulk_transfer(desc); + if (desc->bd_export->exp_failed) + rc = -ENOTCONN; + else + rc = ptlrpc_start_bulk_transfer(desc); if (rc == 0) { - lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, - ost_bulk_timeout, desc); + lwi = LWI_TIMEOUT_INTERVAL(obd_timeout * HZ / 4, HZ, + 
ost_bulk_timeout, desc); rc = l_wait_event(desc->bd_waitq, - !ptlrpc_bulk_active(desc), &lwi); + !ptlrpc_bulk_active(desc) || + desc->bd_export->exp_failed, &lwi); LASSERT(rc == 0 || rc == -ETIMEDOUT); if (rc == -ETIMEDOUT) { DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT"); ptlrpc_abort_bulk(desc); + } else if (desc->bd_export->exp_failed) { + DEBUG_REQ(D_ERROR, req, "Eviction on bulk PUT"); + rc = -ENOTCONN; + ptlrpc_abort_bulk(desc); } else if (!desc->bd_success || desc->bd_nob_transferred != desc->bd_nob) { DEBUG_REQ(D_ERROR, req, "%s bulk PUT %d(%d)", @@ -669,7 +764,7 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) target_committed_to_req(req); ptlrpc_reply(req); } else if (!comms_error) { - /* only reply if comms OK */ + /* Only reply if there was no comms problem with bulk */ target_committed_to_req(req); req->rq_status = rc; ptlrpc_error(req); @@ -680,8 +775,8 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) req->rq_reply_state = NULL; } if (req->rq_reqmsg->conn_cnt == req->rq_export->exp_conn_cnt) { - CERROR("bulk IO comms error: " - "evicting %s@%s id %s\n", + CERROR("%s: bulk IO comm error evicting %s@%s id %s\n", + req->rq_export->exp_obd->obd_name, req->rq_export->exp_client_uuid.uuid, req->rq_export->exp_connection->c_remote_uuid.uuid, libcfs_id2str(req->rq_peer)); @@ -707,7 +802,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) struct obd_ioobj *ioo; struct ost_body *body, *repbody; struct l_wait_info lwi; - struct lustre_handle lockh; + struct lustre_handle lockh = {0}; __u32 *rcs; int size[2] = { sizeof(*body) }; int objcount, niocount, npages; @@ -816,16 +911,25 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) pp_rnb[i].offset & (PAGE_SIZE - 1), pp_rnb[i].len); - rc = ptlrpc_start_bulk_transfer (desc); + /* Check if client was evicted while we were doing i/o before touching + network */ + if 
(desc->bd_export->exp_failed) + rc = -ENOTCONN; + else + rc = ptlrpc_start_bulk_transfer (desc); if (rc == 0) { - lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, - ost_bulk_timeout, desc); - rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), - &lwi); + lwi = LWI_TIMEOUT_INTERVAL(obd_timeout * HZ / 4, HZ, + ost_bulk_timeout, desc); + rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc) || + desc->bd_export->exp_failed, &lwi); LASSERT(rc == 0 || rc == -ETIMEDOUT); if (rc == -ETIMEDOUT) { DEBUG_REQ(D_ERROR, req, "timeout on bulk GET"); ptlrpc_abort_bulk(desc); + } else if (desc->bd_export->exp_failed) { + DEBUG_REQ(D_ERROR, req, "Eviction on bulk GET"); + rc = -ENOTCONN; + ptlrpc_abort_bulk(desc); } else if (!desc->bd_success || desc->bd_nob_transferred != desc->bd_nob) { DEBUG_REQ(D_ERROR, req, "%s bulk GET %d(%d)", @@ -858,7 +962,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) repbody->oa.o_valid |= OBD_MD_FLCKSUM; } else if ((cksum_counter & (-cksum_counter)) == cksum_counter) { - CWARN("Checksum %u from %s: %x OK\n", cksum_counter, + CWARN("Checksum %u from %s: %x OK\n", cksum_counter, libcfs_id2str(req->rq_peer), cksum); } else { cksum_counter++; @@ -1149,6 +1253,66 @@ static int ost_filter_recovery_request(struct ptlrpc_request *req, } } +int ost_msg_check_version(struct lustre_msg *msg) +{ + int rc; + + /* TODO: enable the below check while really introducing msg version. + * it's disabled because it will break compatibility with b1_4. 
+ */ + return (0); + switch(msg->opc) { + case OST_CONNECT: + case OST_DISCONNECT: + case OBD_PING: + rc = lustre_msg_check_version(msg, LUSTRE_OBD_VERSION); + if (rc) + CERROR("bad opc %u version %08x, expecting %08x\n", + msg->opc, msg->version, LUSTRE_OBD_VERSION); + break; + case OST_CREATE: + case OST_DESTROY: + case OST_GETATTR: + case OST_SETATTR: + case OST_WRITE: + case OST_READ: + case OST_SAN_READ: + case OST_SAN_WRITE: + case OST_PUNCH: + case OST_STATFS: + case OST_SYNC: + case OST_SET_INFO: + case OST_GET_INFO: + case OST_QUOTACHECK: + case OST_QUOTACTL: + rc = lustre_msg_check_version(msg, LUSTRE_OST_VERSION); + if (rc) + CERROR("bad opc %u version %08x, expecting %08x\n", + msg->opc, msg->version, LUSTRE_OST_VERSION); + break; + case LDLM_ENQUEUE: + case LDLM_CONVERT: + case LDLM_CANCEL: + case LDLM_BL_CALLBACK: + case LDLM_CP_CALLBACK: + rc = lustre_msg_check_version(msg, LUSTRE_DLM_VERSION); + if (rc) + CERROR("bad opc %u version %08x, expecting %08x\n", + msg->opc, msg->version, LUSTRE_DLM_VERSION); + break; + case LLOG_ORIGIN_CONNECT: + case OBD_LOG_CANCEL: + rc = lustre_msg_check_version(msg, LUSTRE_LOG_VERSION); + if (rc) + CERROR("bad opc %u version %08x, expecting %08x\n", + msg->opc, msg->version, LUSTRE_LOG_VERSION); + default: + CERROR("Unexpected opcode %d\n", msg->opc); + rc = -ENOTSUPP; + } + return rc; +} + static int ost_handle(struct ptlrpc_request *req) { struct obd_trans_info trans_info = { 0, }; @@ -1187,6 +1351,9 @@ static int ost_handle(struct ptlrpc_request *req) } oti_init(oti, req); + rc = ost_msg_check_version(req->rq_reqmsg); + if (rc) + RETURN(rc); switch (req->rq_reqmsg->opc) { case OST_CONNECT: { @@ -1381,7 +1548,6 @@ static void ost_thread_done(struct ptlrpc_thread *thread) ENTRY; LASSERT(thread != NULL); - LASSERT(thread->t_data != NULL); /* * be prepared to handle partially-initialized pools (because this is @@ -1412,7 +1578,7 @@ static int ost_thread_init(struct ptlrpc_thread *thread) LASSERT(thread != NULL); 
LASSERT(thread->t_data == NULL); - LASSERT(thread->t_id < OST_NUM_THREADS); + LASSERT(thread->t_id < OST_MAX_THREADS); OBD_ALLOC_PTR(tls); if (tls != NULL) { @@ -1454,21 +1620,23 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf) sema_init(&ost->ost_health_sem, 1); + if (ost_num_threads < 2) + ost_num_threads = OST_DEF_THREADS; + if (ost_num_threads > OST_MAX_THREADS) + ost_num_threads = OST_MAX_THREADS; + ost->ost_service = ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, OST_MAXREPSIZE, OST_REQUEST_PORTAL, OSC_REPLY_PORTAL, - obd_timeout * 1000, ost_handle, "ost", + obd_timeout * 1000, ost_handle, LUSTRE_OST_NAME, obd->obd_proc_entry, ost_print_req, - OST_NUM_THREADS); + ost_num_threads); if (ost->ost_service == NULL) { CERROR("failed to start service\n"); GOTO(out_lprocfs, rc = -ENOMEM); } - ost->ost_service->srv_init = ost_thread_init; - ost->ost_service->srv_done = ost_thread_done; - ost->ost_service->srv_cpu_affinity = 1; rc = ptlrpc_start_threads(obd, ost->ost_service, "ll_ost"); if (rc) GOTO(out_service, rc = -EINVAL); @@ -1489,8 +1657,31 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf) if (rc) GOTO(out_create, rc = -EINVAL); + ost->ost_io_service = + ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, + OST_MAXREPSIZE, OST_IO_PORTAL, + OSC_REPLY_PORTAL, + obd_timeout * 1000, ost_handle, "ost_io", + obd->obd_proc_entry, ost_print_req, + ost_num_threads); + if (ost->ost_io_service == NULL) { + CERROR("failed to start OST I/O service\n"); + GOTO(out_create, rc = -ENOMEM); + } + + ost->ost_io_service->srv_init = ost_thread_init; + ost->ost_io_service->srv_done = ost_thread_done; + ost->ost_io_service->srv_cpu_affinity = 1; + rc = ptlrpc_start_threads(obd, ost->ost_io_service, + "ll_ost_io"); + if (rc) + GOTO(out_io, rc = -EINVAL); + RETURN(0); +out_io: + ptlrpc_unregister_service(ost->ost_io_service); + ost->ost_io_service = NULL; out_create: ptlrpc_unregister_service(ost->ost_create_service); 
ost->ost_create_service = NULL; @@ -1518,6 +1709,7 @@ static int ost_cleanup(struct obd_device *obd) down(&ost->ost_health_sem); ptlrpc_unregister_service(ost->ost_service); ptlrpc_unregister_service(ost->ost_create_service); + ptlrpc_unregister_service(ost->ost_io_service); ost->ost_service = NULL; ost->ost_create_service = NULL; up(&ost->ost_health_sem); @@ -1535,6 +1727,7 @@ static int ost_health_check(struct obd_device *obd) down(&ost->ost_health_sem); rc |= ptlrpc_service_health_check(ost->ost_service); rc |= ptlrpc_service_health_check(ost->ost_create_service); + rc |= ptlrpc_service_health_check(ost->ost_io_service); up(&ost->ost_health_sem); /* diff --git a/lustre/ost/ost_internal.h b/lustre/ost/ost_internal.h index 51ae8c9..3407a96 100644 --- a/lustre/ost/ost_internal.h +++ b/lustre/ost/ost_internal.h @@ -14,16 +14,8 @@ extern void ost_print_req(void *seq_file, struct ptlrpc_request *req); /* * tunables for per-thread page pool (bug 5137) */ -enum { - /* - * pool size in pages - */ - OST_THREAD_POOL_SIZE = PTLRPC_MAX_BRW_PAGES, - /* - * GFP mask used to allocate pages for pool - */ - OST_THREAD_POOL_GFP = GFP_HIGHUSER -}; +#define OST_THREAD_POOL_SIZE PTLRPC_MAX_BRW_PAGES /* pool size in pages */ +#define OST_THREAD_POOL_GFP GFP_HIGHUSER /* GFP mask for pool pages */ struct page; struct niobuf_local; diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index eb45b24..feb77f2 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -305,10 +305,10 @@ static struct ptlrpc_request *ptlrpc_prep_req_from_pool(struct ptlrpc_request_po return request; } -struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp, int opcode, - int count, int *lengths, - char **bufs, - struct ptlrpc_request_pool *pool) +struct ptlrpc_request * +ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version, int opcode, + int count, int *lengths, char **bufs, + struct ptlrpc_request_pool *pool) { struct ptlrpc_request *request = NULL; int rc; @@ -337,6 
+337,11 @@ struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp, int opcode, RETURN(NULL); } +#if 0 /* TODO: enable this while really introducing msg version. + * it's disabled because it will break compatibility with b1_4. + */ + request->rq_reqmsg->version |= version; +#endif if (imp->imp_server_timeout) request->rq_timeout = obd_timeout / 2; else @@ -372,13 +377,14 @@ struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp, int opcode, RETURN(request); } -struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode, - int count, int *lengths, char **bufs) +struct ptlrpc_request * +ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, + int count, int *lengths, char **bufs) { - return ptlrpc_prep_req_pool(imp, opcode, count, lengths, bufs, NULL); + return ptlrpc_prep_req_pool(imp, version, opcode, count, lengths, + bufs, NULL); } - struct ptlrpc_request_set *ptlrpc_prep_set(void) { struct ptlrpc_request_set *set; @@ -712,7 +718,7 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) libcfs_nid2str(imp->imp_connection->c_peer.nid), req->rq_reqmsg->opc); - rc = ptl_send_rpc(req); + rc = ptl_send_rpc(req, 0); if (rc) { DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc); req->rq_net_err = 1; @@ -721,6 +727,7 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) RETURN(0); } +/* this sends any unsent RPCs in @set and returns TRUE if all are sent */ int ptlrpc_check_set(struct ptlrpc_request_set *set) { unsigned long flags; @@ -842,7 +849,7 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) } } - rc = ptl_send_rpc(req); + rc = ptl_send_rpc(req, 0); if (rc) { DEBUG_REQ(D_HA, req, "send failed (%d)", rc); @@ -1559,7 +1566,7 @@ restart: list_add_tail(&req->rq_list, &imp->imp_sending_list); spin_unlock_irqrestore(&imp->imp_lock, flags); - rc = ptl_send_rpc(req); + rc = ptl_send_rpc(req, 0); if (rc) { DEBUG_REQ(D_HA, req, "send failed (%d); recovering", rc); timeout = CFS_TICK; @@ -1806,7 
+1813,7 @@ void ptlrpc_abort_inflight(struct obd_import *imp) } /* Last chance to free reqs left on the replay list, but we - * will still leak reqs that haven't comitted. */ + * will still leak reqs that haven't committed. */ if (imp->imp_replayable) ptlrpc_free_committed(imp); diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 95bb0a5..e12523e 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -390,7 +390,7 @@ int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, } } - CDEBUG(D_WARNING,"%s->%s\n", uuid->uuid, libcfs_id2str(*peer)); + CDEBUG(D_NET,"%s->%s\n", uuid->uuid, libcfs_id2str(*peer)); if (rc != 0) CERROR("No NID found for %s\n", uuid->uuid); return rc; @@ -474,7 +474,7 @@ int ptlrpc_ni_init(void) #endif if (rc == 0) return 0; - + CERROR ("Failed to allocate event queue: %d\n", rc); LNetNIFini(); diff --git a/lustre/ptlrpc/llog_client.c b/lustre/ptlrpc/llog_client.c index 68c1a4a..d714a84 100644 --- a/lustre/ptlrpc/llog_client.c +++ b/lustre/ptlrpc/llog_client.c @@ -85,8 +85,8 @@ static int llog_client_create(struct llog_ctxt *ctxt, struct llog_handle **res, bufcount++; } - req = ptlrpc_prep_req(imp, LLOG_ORIGIN_HANDLE_CREATE, - bufcount, size, tmp); + req = ptlrpc_prep_req(imp, LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_CREATE, bufcount, size, tmp); if (!req) GOTO(err_free, rc = -ENOMEM); @@ -115,6 +115,32 @@ err_free: goto out; } +static int llog_client_destroy(struct llog_handle *loghandle) +{ + struct obd_import *imp = loghandle->lgh_ctxt->loc_imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + int size = sizeof(*body); + int repsize[2] = {sizeof (*body)}; + int rc; + ENTRY; + + req = ptlrpc_prep_req(imp, LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_DESTROY, 1, &size, NULL); + if (!req) + RETURN(-ENOMEM); + + body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); + body->lgd_logid = loghandle->lgh_id; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + + req->rq_replen = lustre_msg_size(1, repsize); + rc 
= ptlrpc_queue_wait(req); + + ptlrpc_req_finished(req); + RETURN(rc); +} + static int llog_client_next_block(struct llog_handle *loghandle, int *cur_idx, int next_idx, @@ -129,7 +155,8 @@ static int llog_client_next_block(struct llog_handle *loghandle, int rc; ENTRY; - req = ptlrpc_prep_req(imp, LLOG_ORIGIN_HANDLE_NEXT_BLOCK, 1,&size,NULL); + req = ptlrpc_prep_req(imp, LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK, 1,&size,NULL); if (!req) GOTO(out, rc = -ENOMEM); @@ -173,6 +200,56 @@ out: RETURN(rc); } +static int llog_client_prev_block(struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct obd_import *imp = loghandle->lgh_ctxt->loc_imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + void * ptr; + int size = sizeof(*body); + int repsize[2] = {sizeof (*body)}; + int rc; + ENTRY; + + req = ptlrpc_prep_req(imp, LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_PREV_BLOCK, 1,&size,NULL); + if (!req) + GOTO(out, rc = -ENOMEM); + + body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); + body->lgd_logid = loghandle->lgh_id; + body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + body->lgd_index = prev_idx; + body->lgd_len = len; + repsize[1] = len; + + req->rq_replen = lustre_msg_size(2, repsize); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = lustre_swab_repbuf(req, 0, sizeof(*body), + lustre_swab_llogd_body); + if (body == NULL) { + CERROR ("Can't unpack llogd_body\n"); + GOTO(out, rc =-EFAULT); + } + + ptr = lustre_msg_buf(req->rq_repmsg, 1, len); + if (ptr == NULL) { + CERROR ("Can't unpack bitmap\n"); + GOTO(out, rc =-EFAULT); + } + + memcpy(buf, ptr, len); + +out: + if (req) + ptlrpc_req_finished(req); + RETURN(rc); +} static int llog_client_read_header(struct llog_handle *handle) { @@ -186,8 +263,8 @@ static int llog_client_read_header(struct llog_handle *handle) int rc; ENTRY; - req = ptlrpc_prep_req(imp, LLOG_ORIGIN_HANDLE_READ_HEADER, - 1, 
&size, NULL); + req = ptlrpc_prep_req(imp, LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_READ_HEADER, 1, &size, NULL); if (!req) GOTO(out, rc = -ENOMEM); @@ -241,7 +318,9 @@ static int llog_client_close(struct llog_handle *handle) struct llog_operations llog_client_ops = { lop_next_block: llog_client_next_block, + lop_prev_block: llog_client_prev_block, lop_read_header: llog_client_read_header, lop_create: llog_client_create, + lop_destroy: llog_client_destroy, lop_close: llog_client_close, }; diff --git a/lustre/ptlrpc/llog_net.c b/lustre/ptlrpc/llog_net.c index 14eeefb..735ed31 100644 --- a/lustre/ptlrpc/llog_net.c +++ b/lustre/ptlrpc/llog_net.c @@ -82,7 +82,8 @@ int llog_origin_connect(struct llog_ctxt *ctxt, int count, LASSERT(ctxt->loc_imp); imp = ctxt->loc_imp; - request = ptlrpc_prep_req(imp, LLOG_ORIGIN_CONNECT, 1, &size, NULL); + request = ptlrpc_prep_req(imp, LUSTRE_LOG_VERSION, + LLOG_ORIGIN_CONNECT, 1, &size, NULL); if (!request) RETURN(-ENOMEM); diff --git a/lustre/ptlrpc/llog_server.c b/lustre/ptlrpc/llog_server.c index 1572ff8..cf588d3 100644 --- a/lustre/ptlrpc/llog_server.c +++ b/lustre/ptlrpc/llog_server.c @@ -105,6 +105,65 @@ out: RETURN(rc); } +int llog_origin_handle_destroy(struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + struct obd_device *obd = exp->exp_obd; + struct obd_device *disk_obd; + struct llog_handle *loghandle; + struct llogd_body *body; + struct lvfs_run_ctxt saved; + struct llog_logid *logid = NULL; + struct llog_ctxt *ctxt; + int size = sizeof (*body); + int rc; + __u32 flags; + ENTRY; + + body = lustre_swab_reqbuf(req, 0, sizeof(*body), + lustre_swab_llogd_body); + if (body == NULL) { + CERROR ("Can't unpack llogd_body\n"); + GOTO(out, rc =-EFAULT); + } + + if (body->lgd_logid.lgl_oid > 0) + logid = &body->lgd_logid; + + ctxt = llog_get_context(obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + GOTO(out, rc = -EINVAL); + disk_obd = ctxt->loc_exp->exp_obd; + push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); + 
+ rc = llog_create(ctxt, &loghandle, logid, NULL); + if (rc) + GOTO(out_pop, rc); + + rc = lustre_pack_reply(req, 1, &size, NULL); + if (rc) + GOTO(out_close, rc = -ENOMEM); + + body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body)); + body->lgd_logid = loghandle->lgh_id; + flags = body->lgd_llh_flags; + rc = llog_init_handle(loghandle, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_close, rc); + rc = llog_destroy(loghandle); + if (rc) + GOTO(out_close, rc); + llog_free_handle(loghandle); + +out_close: + if (rc) + llog_close(loghandle); +out_pop: + pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); +out: + RETURN(rc); +} + int llog_origin_handle_next_block(struct ptlrpc_request *req) { struct obd_export *exp = req->rq_export; @@ -178,6 +237,77 @@ out: RETURN(rc); } +int llog_origin_handle_prev_block(struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + struct obd_device *obd = exp->exp_obd; + struct llog_handle *loghandle; + struct llogd_body *body; + struct obd_device *disk_obd; + struct lvfs_run_ctxt saved; + struct llog_ctxt *ctxt; + __u32 flags; + __u8 *buf; + void * ptr; + int size[] = {sizeof (*body), + LLOG_CHUNK_SIZE}; + int rc, rc2; + ENTRY; + + body = lustre_swab_reqbuf(req, 0, sizeof(*body), + lustre_swab_llogd_body); + if (body == NULL) { + CERROR ("Can't unpack llogd_body\n"); + GOTO(out, rc =-EFAULT); + } + + OBD_ALLOC(buf, LLOG_CHUNK_SIZE); + if (!buf) + GOTO(out, rc = -ENOMEM); + + ctxt = llog_get_context(obd, body->lgd_ctxt_idx); + LASSERT(ctxt != NULL); + disk_obd = ctxt->loc_exp->exp_obd; + push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); + + rc = llog_create(ctxt, &loghandle, &body->lgd_logid, NULL); + if (rc) + GOTO(out_pop, rc); + + flags = body->lgd_llh_flags; + rc = llog_init_handle(loghandle, flags, NULL); + if (rc) + GOTO(out_close, rc); + + memset(buf, 0, LLOG_CHUNK_SIZE); + rc = llog_prev_block(loghandle, body->lgd_index, + buf, LLOG_CHUNK_SIZE); + if (rc) + GOTO(out_close, rc); + + + rc = lustre_pack_reply(req, 2, 
size, NULL); + if (rc) + GOTO(out_close, rc = -ENOMEM); + + ptr = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body)); + memcpy(ptr, body, sizeof(*body)); + + ptr = lustre_msg_buf(req->rq_repmsg, 1, LLOG_CHUNK_SIZE); + memcpy(ptr, buf, LLOG_CHUNK_SIZE); + +out_close: + rc2 = llog_close(loghandle); + if (!rc) + rc = rc2; + +out_pop: + pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); + OBD_FREE(buf, LLOG_CHUNK_SIZE); +out: + RETURN(rc); +} + int llog_origin_handle_read_header(struct ptlrpc_request *req) { struct obd_export *exp = req->rq_export; @@ -538,11 +668,23 @@ int llog_origin_handle_create(struct ptlrpc_request *req) LBUG(); return 0; } + +int llog_origin_handle_destroy(struct ptlrpc_request *req) +{ + LBUG(); + return 0; +} + int llog_origin_handle_next_block(struct ptlrpc_request *req) { LBUG(); return 0; } +int llog_origin_handle_prev_block(struct ptlrpc_request *req) +{ + LBUG(); + return 0; +} int llog_origin_handle_read_header(struct ptlrpc_request *req) { LBUG(); diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c index 8280722..71e7c6f 100644 --- a/lustre/ptlrpc/lproc_ptlrpc.c +++ b/lustre/ptlrpc/lproc_ptlrpc.c @@ -81,12 +81,6 @@ struct ll_rpc_opcode { { LDLM_BL_CALLBACK, "ldlm_bl_callback" }, { LDLM_CP_CALLBACK, "ldlm_cp_callback" }, { LDLM_GL_CALLBACK, "ldlm_gl_callback" }, - { PTLBD_QUERY, "ptlbd_query" }, - { PTLBD_READ, "ptlbd_read" }, - { PTLBD_WRITE, "ptlbd_write" }, - { PTLBD_FLUSH, "ptlbd_flush" }, - { PTLBD_CONNECT, "ptlbd_connect" }, - { PTLBD_DISCONNECT, "ptlbd_disconnect" }, { OBD_PING, "obd_ping" }, { OBD_LOG_CANCEL, "llog_origin_handle_cancel"}, }; @@ -482,7 +476,8 @@ int lprocfs_wr_ping(struct file *file, const char *buffer, int rc; ENTRY; - req = ptlrpc_prep_req(obd->u.cli.cl_import, OBD_PING, 0, NULL, NULL); + req = ptlrpc_prep_req(obd->u.cli.cl_import, LUSTRE_OBD_VERSION, + OBD_PING, 0, NULL, NULL); if (req == NULL) RETURN(-ENOMEM); diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 71c839f..3e9d76c
100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -163,6 +163,10 @@ void ptlrpc_abort_bulk (struct ptlrpc_bulk_desc *desc) if (!ptlrpc_bulk_active(desc)) /* completed or */ return; /* never started */ + /* Do not send any meaningful data over the wire for evicted clients */ + if (desc->bd_export && desc->bd_export->exp_failed) + ptl_rpc_wipe_bulk_pages(desc); + /* The unlink ensures the callback happens ASAP and is the last * one. If it fails, it must be because completion just happened, * but we must still l_wait_event() in this case, to give liblustre @@ -335,11 +339,15 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) req->rq_repmsg->status = req->rq_status; req->rq_repmsg->opc = req->rq_reqmsg->opc; - if (req->rq_export == NULL) + if (req->rq_export == NULL || req->rq_export->exp_connection == NULL) conn = ptlrpc_get_connection(req->rq_peer, req->rq_self, NULL); else conn = ptlrpc_connection_addref(req->rq_export->exp_connection); + if (conn == NULL) { + CERROR("not replying on NULL connection\n"); /* bug 9635 */ + return -ENOTCONN; + } atomic_inc (&svc->srv_outstanding_replies); ptlrpc_rs_addref(rs); /* +1 ref for the network */ @@ -377,63 +385,7 @@ int ptlrpc_error(struct ptlrpc_request *req) RETURN(rc); } -int ptl_send_rpc_nowait(struct ptlrpc_request *request) -{ - int rc; - struct ptlrpc_connection *connection; - unsigned long flags; - ENTRY; - - LASSERT (request->rq_type == PTL_RPC_MSG_REQUEST); - - if (request->rq_import->imp_obd && - request->rq_import->imp_obd->obd_fail) { - CDEBUG(D_HA, "muting rpc for failed imp obd %s\n", - request->rq_import->imp_obd->obd_name); - /* this prevents us from waiting in ptlrpc_queue_wait */ - request->rq_err = 1; - RETURN(-ENODEV); - } - - connection = request->rq_import->imp_connection; - - request->rq_reqmsg->handle = request->rq_import->imp_remote_handle; - request->rq_reqmsg->type = PTL_RPC_MSG_REQUEST; - request->rq_reqmsg->conn_cnt = request->rq_import->imp_conn_cnt; - 
- spin_lock_irqsave (&request->rq_lock, flags); - /* If the MD attach succeeds, there _will_ be a reply_in callback */ - request->rq_receiving_reply = 0; - /* Clear any flags that may be present from previous sends. */ - request->rq_replied = 0; - request->rq_err = 0; - request->rq_timedout = 0; - request->rq_net_err = 0; - request->rq_resend = 0; - request->rq_restart = 0; - spin_unlock_irqrestore (&request->rq_lock, flags); - - ptlrpc_request_addref(request); /* +1 ref for the SENT callback */ - - request->rq_sent = CURRENT_SECONDS; - ptlrpc_pinger_sending_on_import(request->rq_import); - rc = ptl_send_buf(&request->rq_req_md_h, - request->rq_reqmsg, request->rq_reqlen, - LNET_NOACK_REQ, &request->rq_req_cbid, - connection, - request->rq_request_portal, - request->rq_xid); - if (rc == 0) { - ptlrpc_lprocfs_rpc_sent(request); - } else { - ptlrpc_req_finished (request); /* drop callback ref */ - } - - return rc; -} - - -int ptl_send_rpc(struct ptlrpc_request *request) +int ptl_send_rpc(struct ptlrpc_request *request, int noreply) { int rc; int rc2; @@ -472,24 +424,26 @@ int ptl_send_rpc(struct ptlrpc_request *request) request->rq_reqmsg->type = PTL_RPC_MSG_REQUEST; request->rq_reqmsg->conn_cnt = request->rq_import->imp_conn_cnt; - LASSERT (request->rq_replen != 0); - if (request->rq_repmsg == NULL) - OBD_ALLOC(request->rq_repmsg, request->rq_replen); - if (request->rq_repmsg == NULL) - GOTO(cleanup_bulk, rc = -ENOMEM); - - rc = LNetMEAttach(request->rq_reply_portal, /* XXX FIXME bug 249 */ - connection->c_peer, request->rq_xid, 0, - LNET_UNLINK, LNET_INS_AFTER, &reply_me_h); - if (rc != 0) { - CERROR("LNetMEAttach failed: %d\n", rc); - LASSERT (rc == -ENOMEM); - GOTO(cleanup_repmsg, rc = -ENOMEM); + if (!noreply) { + LASSERT (request->rq_replen != 0); + if (request->rq_repmsg == NULL) + OBD_ALLOC(request->rq_repmsg, request->rq_replen); + if (request->rq_repmsg == NULL) + GOTO(cleanup_bulk, rc = -ENOMEM); + + rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME 
bug 249*/ + connection->c_peer, request->rq_xid, 0, + LNET_UNLINK, LNET_INS_AFTER, &reply_me_h); + if (rc != 0) { + CERROR("LNetMEAttach failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + GOTO(cleanup_repmsg, rc = -ENOMEM); + } } spin_lock_irqsave (&request->rq_lock, flags); /* If the MD attach succeeds, there _will_ be a reply_in callback */ - request->rq_receiving_reply = 1; + request->rq_receiving_reply = !noreply; /* Clear any flags that may be present from previous sends. */ request->rq_replied = 0; request->rq_err = 0; @@ -499,30 +453,32 @@ int ptl_send_rpc(struct ptlrpc_request *request) request->rq_restart = 0; spin_unlock_irqrestore (&request->rq_lock, flags); - reply_md.start = request->rq_repmsg; - reply_md.length = request->rq_replen; - reply_md.threshold = 1; - reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT; - reply_md.user_ptr = &request->rq_reply_cbid; - reply_md.eq_handle = ptlrpc_eq_h; - - rc = LNetMDAttach(reply_me_h, reply_md, LNET_UNLINK, - &request->rq_reply_md_h); - if (rc != 0) { - CERROR("LNetMDAttach failed: %d\n", rc); - LASSERT (rc == -ENOMEM); - spin_lock_irqsave (&request->rq_lock, flags); - /* ...but the MD attach didn't succeed... */ - request->rq_receiving_reply = 0; - spin_unlock_irqrestore (&request->rq_lock, flags); - GOTO(cleanup_me, rc -ENOMEM); + if (!noreply) { + reply_md.start = request->rq_repmsg; + reply_md.length = request->rq_replen; + reply_md.threshold = 1; + reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT; + reply_md.user_ptr = &request->rq_reply_cbid; + reply_md.eq_handle = ptlrpc_eq_h; + + rc = LNetMDAttach(reply_me_h, reply_md, LNET_UNLINK, + &request->rq_reply_md_h); + if (rc != 0) { + CERROR("LNetMDAttach failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + spin_lock_irqsave (&request->rq_lock, flags); + /* ...but the MD attach didn't succeed... 
*/ + request->rq_receiving_reply = 0; + spin_unlock_irqrestore (&request->rq_lock, flags); + GOTO(cleanup_me, rc = -ENOMEM); + } + + CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64 + ", portal %u\n", + request->rq_replen, request->rq_xid, + request->rq_reply_portal); } - CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64 - ", portal %u\n", - request->rq_replen, request->rq_xid, - request->rq_reply_portal); - ptlrpc_request_addref(request); /* +1 ref for the SENT callback */ request->rq_sent = CURRENT_SECONDS; @@ -540,6 +496,10 @@ int ptl_send_rpc(struct ptlrpc_request *request) ptlrpc_req_finished (request); /* drop callback ref */ + if (noreply) + RETURN(rc); + else + GOTO(cleanup_me, rc); cleanup_me: /* MEUnlink is safe; the PUT didn't even get off the ground, and * nobody apart from the PUT's target has the right nid+XID to diff --git a/lustre/ptlrpc/pers.c b/lustre/ptlrpc/pers.c index 2942032..29913ff 100644 --- a/lustre/ptlrpc/pers.c +++ b/lustre/ptlrpc/pers.c @@ -61,6 +61,18 @@ void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, cfs_page_t *page, desc->bd_iov_count++; } +void ptl_rpc_wipe_bulk_pages(struct ptlrpc_bulk_desc *desc) +{ + int i; + + for (i = 0; i < desc->bd_iov_count ; i++) { + lnet_kiov_t *kiov = &desc->bd_iov[i]; + memset(kmap(kiov->kiov_page)+kiov->kiov_offset, 0xab, + kiov->kiov_len); + kunmap(kiov->kiov_page); + } +} + #else /* !__KERNEL__ */ void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc) @@ -105,4 +117,14 @@ void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, cfs_page_t *page, } } +void ptl_rpc_wipe_bulk_pages(struct ptlrpc_bulk_desc *desc) +{ + int i; + + for(i = 0; i < desc->bd_iov_count; i++) { + lnet_md_iovec_t *iov = &desc->bd_iov[i]; + + memset(iov->iov_base, 0xab, iov->iov_len); + } +} #endif /* !__KERNEL__ */ diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h index b158953..2cd63ab 100644 --- a/lustre/ptlrpc/ptlrpc_internal.h +++
b/lustre/ptlrpc/ptlrpc_internal.h @@ -75,16 +75,9 @@ static inline int opcode_offset(__u32 opc) { return (opc - LDLM_FIRST_OPC + (MDS_LAST_OPC - MDS_FIRST_OPC) + (OST_LAST_OPC - OST_FIRST_OPC)); - } else if (opc < PTLBD_LAST_OPC) { - /* Portals Block Device */ - return (opc - PTLBD_FIRST_OPC + - (LDLM_LAST_OPC - LDLM_FIRST_OPC) + - (MDS_LAST_OPC - MDS_FIRST_OPC) + - (OST_LAST_OPC - OST_FIRST_OPC)); } else if (opc < OBD_LAST_OPC) { /* OBD Ping */ return (opc - OBD_FIRST_OPC + - (PTLBD_LAST_OPC - PTLBD_FIRST_OPC) + (LDLM_LAST_OPC - LDLM_FIRST_OPC) + (MDS_LAST_OPC - MDS_FIRST_OPC) + (OST_LAST_OPC - OST_FIRST_OPC)); @@ -94,8 +87,7 @@ static inline int opcode_offset(__u32 opc) { } } -#define LUSTRE_MAX_OPCODES ((PTLBD_LAST_OPC - PTLBD_FIRST_OPC) + \ - (LDLM_LAST_OPC - LDLM_FIRST_OPC) + \ +#define LUSTRE_MAX_OPCODES ((LDLM_LAST_OPC - LDLM_FIRST_OPC) + \ (MDS_LAST_OPC - MDS_FIRST_OPC) + \ (OST_LAST_OPC - OST_FIRST_OPC) + \ (OBD_LAST_OPC - OBD_FIRST_OPC)) @@ -114,6 +106,7 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req); void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc); void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, cfs_page_t *page, int pageoffset, int len); +void ptl_rpc_wipe_bulk_pages(struct ptlrpc_bulk_desc *desc); /* pinger.c */ int ptlrpc_start_pinger(void); diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index 4c43419..ae50b2f 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -48,7 +48,7 @@ extern void ptlrpc_exit_portals(void); __init int ptlrpc_init(void) { - int rc; + int rc, cleanup_phase = 0; ENTRY; lustre_assert_wire_constants(); @@ -61,16 +61,40 @@ __init int ptlrpc_init(void) rc = ptlrpc_init_portals(); if (rc) RETURN(rc); + cleanup_phase = 1; ptlrpc_init_connection(); - llog_init_commit_master(); + rc = llog_init_commit_master(); + if (rc) + GOTO(cleanup, rc); + cleanup_phase = 2; ptlrpc_put_connection_superhack = ptlrpc_put_connection; 
ptlrpc_abort_inflight_superhack = ptlrpc_abort_inflight; - ptlrpc_start_pinger(); - ldlm_init(); + rc = ptlrpc_start_pinger(); + if (rc) + GOTO(cleanup, rc); + cleanup_phase = 3; + + rc = ldlm_init(); + if (rc) + GOTO(cleanup, rc); RETURN(0); + +cleanup: + switch(cleanup_phase) { + case 3: + ptlrpc_stop_pinger(); + case 2: + llog_cleanup_commit_master(1); + ptlrpc_cleanup_connection(); + case 1: + ptlrpc_exit_portals(); + default: ; + } + + return rc; } #ifdef __KERNEL__ @@ -102,7 +126,6 @@ EXPORT_SYMBOL(ptlrpc_reply); EXPORT_SYMBOL(ptlrpc_error); EXPORT_SYMBOL(ptlrpc_resend_req); EXPORT_SYMBOL(ptl_send_rpc); -EXPORT_SYMBOL(ptl_send_rpc_nowait); /* client.c */ EXPORT_SYMBOL(ptlrpc_init_client); @@ -154,6 +177,7 @@ EXPORT_SYMBOL(ptlrpc_service_health_check); /* pack_generic.c */ EXPORT_SYMBOL(lustre_msg_swabbed); +EXPORT_SYMBOL(lustre_msg_check_version); EXPORT_SYMBOL(lustre_pack_request); EXPORT_SYMBOL(lustre_pack_reply); EXPORT_SYMBOL(lustre_shrink_reply); @@ -177,12 +201,14 @@ EXPORT_SYMBOL(lustre_swab_mds_body); EXPORT_SYMBOL(lustre_swab_obd_quotactl); EXPORT_SYMBOL(lustre_swab_mds_rec_setattr); EXPORT_SYMBOL(lustre_swab_mds_rec_create); +EXPORT_SYMBOL(lustre_swab_mds_rec_join); EXPORT_SYMBOL(lustre_swab_mds_rec_link); EXPORT_SYMBOL(lustre_swab_mds_rec_unlink); EXPORT_SYMBOL(lustre_swab_mds_rec_rename); EXPORT_SYMBOL(lustre_swab_lov_desc); EXPORT_SYMBOL(lustre_swab_lov_user_md); EXPORT_SYMBOL(lustre_swab_lov_user_md_objects); +EXPORT_SYMBOL(lustre_swab_lov_user_md_join); EXPORT_SYMBOL(lustre_swab_ldlm_res_id); EXPORT_SYMBOL(lustre_swab_ldlm_policy_data); EXPORT_SYMBOL(lustre_swab_ldlm_intent); @@ -190,9 +216,6 @@ EXPORT_SYMBOL(lustre_swab_ldlm_resource_desc); EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc); EXPORT_SYMBOL(lustre_swab_ldlm_request); EXPORT_SYMBOL(lustre_swab_ldlm_reply); -EXPORT_SYMBOL(lustre_swab_ptlbd_op); -EXPORT_SYMBOL(lustre_swab_ptlbd_niob); -EXPORT_SYMBOL(lustre_swab_ptlbd_rsp); EXPORT_SYMBOL(lustre_swab_qdata); /* recover.c */ @@ -221,7 +244,9 
@@ EXPORT_SYMBOL(ptlrpcd_wake); /* llogd.c */ EXPORT_SYMBOL(llog_origin_handle_create); +EXPORT_SYMBOL(llog_origin_handle_destroy); EXPORT_SYMBOL(llog_origin_handle_next_block); +EXPORT_SYMBOL(llog_origin_handle_prev_block); EXPORT_SYMBOL(llog_origin_handle_read_header); EXPORT_SYMBOL(llog_origin_handle_close); EXPORT_SYMBOL(llog_client_ops); diff --git a/lustre/ptlrpc/recov_thread.c b/lustre/ptlrpc/recov_thread.c index 22c10b9..2a1164c 100644 --- a/lustre/ptlrpc/recov_thread.c +++ b/lustre/ptlrpc/recov_thread.c @@ -351,7 +351,8 @@ static int log_commit_thread(void *arg) continue; } - request = ptlrpc_prep_req(import, OBD_LOG_CANCEL, 1, + request = ptlrpc_prep_req(import, LUSTRE_LOG_VERSION, + OBD_LOG_CANCEL, 1, &llcd->llcd_cookiebytes, bufs); if (request == NULL) { diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c index 05dc496..5f6edfa 100644 --- a/lustre/ptlrpc/recover.c +++ b/lustre/ptlrpc/recover.c @@ -268,7 +268,7 @@ void ptlrpc_wake_delayed(struct obd_import *imp) void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) { - struct obd_import *imp= failed_req->rq_import; + struct obd_import *imp = failed_req->rq_import; unsigned long flags; ENTRY; @@ -286,7 +286,9 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) imp->imp_obd->obd_name); ptlrpc_deactivate_import(imp); } - ptlrpc_connect_import(imp, NULL); + /* to control recovery via lctl {disable|enable}_recovery */ + if (imp->imp_deactive == 0) + ptlrpc_connect_import(imp, NULL); } /* Wait for recovery to complete and resend. 
If evicted, then diff --git a/lustre/quota/Makefile.in b/lustre/quota/Makefile.in deleted file mode 100644 index 19a37ca..0000000 --- a/lustre/quota/Makefile.in +++ /dev/null @@ -1,10 +0,0 @@ -MODULES := lquota -MODULES += quotactl_test quotacheck_test - -lquota-objs := quota_check.o quota_context.o quota_ctl.o quota_interface.o -lquota-objs += quota_master.o -quotactl-objs := quotactl_test.o -quotaccheck-objs := quotacheck_test.o - -@INCLUDE_RULES@ - diff --git a/lustre/quota/quota_check.c b/lustre/quota/quota_check.c index 485c6bd..e47bd39 100644 --- a/lustre/quota/quota_check.c +++ b/lustre/quota/quota_check.c @@ -52,8 +52,8 @@ static int target_quotacheck_callback(struct obd_export *exp, int rc, size = sizeof(*oqctl); ENTRY; - req = ptlrpc_prep_req(exp->exp_imp_reverse, OBD_QC_CALLBACK, - 1, &size, NULL); + req = ptlrpc_prep_req(exp->exp_imp_reverse, LUSTRE_OBD_VERSION, + OBD_QC_CALLBACK, 1, &size, NULL); if (!req) RETURN(-ENOMEM); @@ -164,18 +164,21 @@ int client_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl) struct client_obd *cli = &exp->exp_obd->u.cli; struct ptlrpc_request *req; struct obd_quotactl *body; - int size = sizeof(*body), opc; + int size = sizeof(*body), opc, version; int rc; ENTRY; - if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDC_NAME)) + if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDC_NAME)) { + version = LUSTRE_MDS_VERSION; opc = MDS_QUOTACHECK; - else if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_OSC_NAME)) + } else if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { + version = LUSTRE_OST_VERSION; opc = OST_QUOTACHECK; - else + } else { RETURN(-EINVAL); + } - req = ptlrpc_prep_req(class_exp2cliimp(exp), opc, 1, &size, + req = ptlrpc_prep_req(class_exp2cliimp(exp), version, opc, 1, &size, NULL); if (!req) GOTO(out, rc = -ENOMEM); diff --git a/lustre/quota/quota_context.c b/lustre/quota/quota_context.c index 087672f..20bbb04 100644 --- a/lustre/quota/quota_context.c +++ 
b/lustre/quota/quota_context.c @@ -512,7 +512,8 @@ schedule_dqacq(struct obd_device *obd, /* build dqacq/dqrel request */ LASSERT(qctxt->lqc_import); - req = ptlrpc_prep_req(qctxt->lqc_import, opc, 1, &size, NULL); + req = ptlrpc_prep_req(qctxt->lqc_import, LUSTRE_MDS_VERSION, opc, 1, + &size, NULL); if (!req) { dqacq_completion(obd, qctxt, qdata, -ENOMEM, opc); RETURN(-ENOMEM); @@ -708,32 +709,24 @@ static int qslave_recovery_main(void *arg) for (type = USRQUOTA; type < MAXQUOTAS; type++) { struct qunit_data qdata; struct quota_info *dqopt = sb_dqopt(qctxt->lqc_sb); - struct lustre_quota_info *dummy; struct list_head id_list; struct dquot_id *dqid, *tmp; int ret; - OBD_ALLOC_PTR(dummy); - if (!dummy) { - CERROR("Not enough memory\n"); - rc = -ENOMEM; - break; - } - down(&dqopt->dqonoff_sem); if (!sb_has_quota_enabled(qctxt->lqc_sb, type)) { up(&dqopt->dqonoff_sem); - OBD_FREE_PTR(dummy); break; } - dummy->qi_files[type] = dqopt->files[type]; - LASSERT(dummy->qi_files[type] != NULL); - INIT_LIST_HEAD(&id_list); - rc = fsfilt_quotainfo(obd, dummy, type, QFILE_GET_QIDS, &id_list); + LASSERT(dqopt->files[type] != NULL); + INIT_LIST_HEAD(&id_list); +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12) + rc = fsfilt_qids(obd, dqopt->files[type], NULL, type, &id_list); +#else + rc = fsfilt_qids(obd, NULL, dqopt->files[type], type, &id_list); +#endif up(&dqopt->dqonoff_sem); - - OBD_FREE_PTR(dummy); if (rc) CERROR("Get ids from quota file failed. 
(rc:%d)\n", rc); diff --git a/lustre/quota/quota_ctl.c b/lustre/quota/quota_ctl.c index 9dba40a..fc6ab9d 100644 --- a/lustre/quota/quota_ctl.c +++ b/lustre/quota/quota_ctl.c @@ -168,18 +168,22 @@ int client_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) { struct ptlrpc_request *req; struct obd_quotactl *oqc; - int size = sizeof(*oqctl), opc; + int size = sizeof(*oqctl), opc, version; int rc; ENTRY; - if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDC_NAME)) + if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDC_NAME)) { opc = MDS_QUOTACTL; - else if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_OSC_NAME)) + version = LUSTRE_MDS_VERSION; + } else if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { opc = OST_QUOTACTL; - else + version = LUSTRE_OST_VERSION; + } else { RETURN(-EINVAL); + } - req = ptlrpc_prep_req(class_exp2cliimp(exp), opc, 1, &size, NULL); + req = ptlrpc_prep_req(class_exp2cliimp(exp), version, opc, 1, &size, + NULL); if (!req) GOTO(out, rc = -ENOMEM); diff --git a/lustre/quota/quota_master.c b/lustre/quota/quota_master.c index b16715b..2efa215 100644 --- a/lustre/quota/quota_master.c +++ b/lustre/quota/quota_master.c @@ -301,29 +301,27 @@ int mds_quota_adjust(struct obd_device *obd, unsigned int qcids[], switch (opc) { case FSFILT_OP_RENAME: /* acquire/release block quota on owner of original parent */ - rc = qctxt_adjust_qunit(obd, qctxt, qpids[2], qpids[3], 1, 0); + rc2 = qctxt_adjust_qunit(obd, qctxt, qpids[2], qpids[3], 1, 0); + /* fall-through */ + case FSFILT_OP_SETATTR: + /* acquire/release file quota on original owner */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 0, 0); /* fall-through */ case FSFILT_OP_CREATE: case FSFILT_OP_UNLINK: - /* acquire/release file quota on owner of child, acquire/release - * block quota on owner of parent */ - rc = qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0); - rc2 = qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0); - break; - case 
FSFILT_OP_SETATTR: - /* acquire/release file quota on original & current owner - * of child*/ - rc = qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0); - rc2 = qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 0, 0); + /* acquire/release file/block quota on owner of child (or current owner) */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0); + rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0); + /* acquire/release block quota on owner of parent (or original owner) */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0); break; default: LBUG(); break; } - if (rc || rc2) - CERROR("mds adjust qunit failed! (opc:%d rc:%d)\n", - opc, rc ?: rc2); + if (rc2) + CERROR("mds adjust qunit failed! (opc:%d rc:%d)\n", opc, rc2); RETURN(0); } @@ -430,7 +428,7 @@ int init_admin_quotafiles(struct obd_device *obd, struct obd_quotactl *oqctl) } qinfo->qi_files[i] = fp; - rc = fsfilt_quotainfo(obd, qinfo, i, QFILE_INIT_INFO, NULL); + rc = fsfilt_quotainfo(obd, qinfo, i, QFILE_INIT_INFO); filp_close(fp, 0); qinfo->qi_files[i] = NULL; @@ -500,7 +498,7 @@ int mds_admin_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl) } qinfo->qi_files[i] = fp; - rc = fsfilt_quotainfo(obd, qinfo, i, QFILE_RD_INFO, NULL); + rc = fsfilt_quotainfo(obd, qinfo, i, QFILE_RD_INFO); if (rc) { CERROR("error read quotainfo of %s! 
(rc:%d)\n", name, rc); @@ -589,7 +587,7 @@ int mds_set_dqinfo(struct obd_device *obd, struct obd_quotactl *oqctl) qinfo->qi_info[oqctl->qc_type].dqi_igrace = dqinfo->dqi_igrace; qinfo->qi_info[oqctl->qc_type].dqi_flags = dqinfo->dqi_flags; - rc = fsfilt_quotainfo(obd, qinfo, oqctl->qc_type, QFILE_WR_INFO, NULL); + rc = fsfilt_quotainfo(obd, qinfo, oqctl->qc_type, QFILE_WR_INFO); out: up(&mds->mds_qonoff_sem); @@ -1042,8 +1040,8 @@ static int qmaster_recovery_main(void *arg) continue; } INIT_LIST_HEAD(&id_list); - rc = fsfilt_quotainfo(obd, qinfo, type, QFILE_GET_QIDS, - &id_list); + rc = fsfilt_qids(obd, qinfo->qi_files[type], NULL, type, + &id_list); up(&mds->mds_qonoff_sem); if (rc) @@ -1072,13 +1070,13 @@ int mds_quota_recovery(struct obd_device *obd) int rc = 0; ENTRY; - spin_lock(&lov->lov_lock); + down(&lov->lov_lock); if (lov->desc.ld_tgt_count != lov->desc.ld_active_tgt_count) { CWARN("Not all osts are active, abort quota recovery\n"); - spin_unlock(&lov->lov_lock); + up(&lov->lov_lock); RETURN(rc); } - spin_unlock(&lov->lov_lock); + up(&lov->lov_lock); data.obd = obd; init_completion(&data.comp); diff --git a/lustre/scripts/lustre b/lustre/scripts/lustre index 2444c65..89edc5b 100755 --- a/lustre/scripts/lustre +++ b/lustre/scripts/lustre @@ -135,33 +135,38 @@ restart() { status() { STATE="stopped" + RETVAL=1 egrep -q "libcfs|lvfs|portals" /proc/modules && STATE="loaded" # check for any routes - on a portals router this is the only thing - [ "`cat /proc/sys/lnet/routes 2> /dev/null`" ] && STATE="running" + [ "`cat /proc/sys/lnet/routes 2> /dev/null`" ] && STATE="running" && RETVAL=0 # check for any configured devices (may indicate partial startup) - [ "`cat /proc/fs/lustre/devices 2> /dev/null`" ] && STATE="partial" + [ "`cat /proc/fs/lustre/devices 2> /dev/null`" ] && STATE="partial" && RETVAL=1 # check for either a server or a client filesystem MDS="`ls /proc/fs/lustre/mds/*/recovery_status 2> /dev/null`" OST="`ls 
/proc/fs/lustre/obdfilter/*/recovery_status 2> /dev/null`" LLITE="`ls /proc/fs/lustre/llite/fs* 2> /dev/null`" - [ "$MDS" -o "$OST" -o "$LLITE" ] && STATE="running" + [ "$MDS" -o "$OST" -o "$LLITE" ] && STATE="running" && RETVAL=0 # check for server disconnections DISCON="`grep -v FULL /proc/fs/lustre/*c/*/*server_uuid 2> /dev/null`" - [ "$DISCON" ] && STATE="disconnected" + [ "$DISCON" ] && STATE="disconnected" && RETVAL=0 # check for servers in recovery - [ "$MDS$OST" ] && grep -q RECOV $MDS $OST && STATE="recovery" + [ "$MDS$OST" ] && grep -q RECOV $MDS $OST && STATE="recovery" && RETVAL=0 - # check for error in health_check - HEALTH="/proc/fs/lustre/health_check" - [ -f "$HEALTH" ] && grep -q "NOT HEALTHY" $HEALTH && STATE="unhealthy" + # check for error in health_check + HEALTH="/proc/fs/lustre/health_check" + [ -f "$HEALTH" ] && grep -q "NOT HEALTHY" $HEALTH && STATE="unhealthy" && RETVAL=2 - # check for LBUG - [ -f "$HEALTH" ] && grep -q "LBUG" $HEALTH && STATE="LBUG" + # check for LBUG + [ -f "$HEALTH" ] && grep -q "LBUG" $HEALTH && STATE="LBUG" && RETVAL=3 + + # Check if the service really exists + DUMMY=`lctl dl | grep $SERVICE` + [ $? 
-ne 0 ] && STATE="not_found" && RETVAL=5 echo $STATE } diff --git a/lustre/tests/.cvsignore b/lustre/tests/.cvsignore index 20157d8..97728eb 100644 --- a/lustre/tests/.cvsignore +++ b/lustre/tests/.cvsignore @@ -71,3 +71,4 @@ rmdirmany flock_test writemany random-reads +chownmany diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index 982a6d8..c04cf4f 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -7,13 +7,17 @@ AM_CFLAGS = $(LLCFLAGS) pkgexample_scripts = llmount.sh llmountcleanup.sh llecho.sh llechocleanup.sh pkgexample_scripts += local.sh echo.sh uml.sh lov.sh noinst_DATA = -noinst_SCRIPTS = leak_finder.pl llecho.sh llmount.sh llmountcleanup.sh tbox.sh +noinst_SCRIPTS = leak_finder.pl llecho.sh llmount.sh llmountcleanup.sh noinst_SCRIPTS += llrmount.sh runfailure-mds runvmstat runfailure-net noinst_SCRIPTS += runfailure-ost runiozone runregression-net.sh runtests noinst_SCRIPTS += sanity.sh rundbench EXTRA_DIST = $(pkgexample_scripts) $(noinst_SCRIPTS) $(noinst_DATA) \ - sanity.sh rundbench + sanity.sh rundbench \ + acl/run acl/make-tree acl/getfacl-noacl.test acl/cp.test \ + acl/setfacl.test acl/permissions.test acl/misc.test \ + acl/inheritance.test + if TESTS pkgexample_SCRIPTS = $(pkgexample_scripts) noinst_PROGRAMS = openunlink testreq truncate directio openme writeme diff --git a/lustre/tests/acl/README b/lustre/tests/acl/README index cb98f79..2d8c219 100644 --- a/lustre/tests/acl/README +++ b/lustre/tests/acl/README @@ -1 +1,4 @@ -copied from acl-2.2.23/test/ +- copied from acl-2.2.23/test/ +- add inheritance.test from HP +- some tests are depend on enviroment. e.g. some succeed on FC2 but fail on + FC3 etc. We comment out those items, maybe more will be commented out. diff --git a/lustre/tests/acl/inheritance.test b/lustre/tests/acl/inheritance.test index bef89b7..41bce66 100644 --- a/lustre/tests/acl/inheritance.test +++ b/lustre/tests/acl/inheritance.test @@ -88,10 +88,12 @@ might be distributed around MDS's. 
$ echo i > tree/dir1/f $ ls -l tree/dir1/f | awk -- '{ print $1 }' > -rw-r--r--+ - $ su bin - $ echo i > tree/dir6/dir2/f - > tree/dir6/dir2/f: No such file or directory - $ su +in following item, the error message is dependant on distributions. +success on FC3, but not on FC2 and SLES3 etc. comment out by CFS. +# $ su bin +# $ echo i > tree/dir6/dir2/f +# > tree/dir6/dir2/f: No such file or directory +# $ su $ rm -rf tree diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 0674610..660c4fc 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -11,11 +11,9 @@ set -e ONLY=${ONLY:-"$*"} # bug number for skipped test: -ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-""} +ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT" # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! -[ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT" - SRCDIR=`dirname $0` PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH @@ -54,6 +52,7 @@ start_mds() { echo "start mds service on `facet_active_host mds`" start mds --reformat $MDSLCONFARGS || return 94 } + stop_mds() { echo "stop mds service on `facet_active_host mds`" stop mds $@ || return 97 @@ -268,11 +267,9 @@ test_5d() { [ -d $MOUNT ] || mkdir -p $MOUNT $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null - llmount -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST://mds_svc/client_facet $MOUNT || return 1 + llmount -o nettype=$NETTYPE,$MOUNTOPT `facet_nid mds`://mds_svc/client_facet $MOUNT || return 1 - umount $MOUNT || return 2 - # cleanup client modules - $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null + umount_client $MOUNT || return 2 stop_mds || return 3 @@ -620,8 +617,8 @@ test_15() { do_node `hostname` mkdir -p $MOUNT 2> /dev/null # load llite module on the client if it isn't in /lib/modules do_node `hostname` lconf --nosetup --node client_facet $XMLCONFIG - do_node `hostname` mount -t lustre -o nettype=$NETTYPE \ - `facet_active_host 
mds`:/mds_svc/client_facet $MOUNT ||return $? + do_node `hostname` mount -t lustre -o nettype=$NETTYPE,$MOUNTOPT \ + `facet_nid mds`:/mds_svc/client_facet $MOUNT ||return $? echo "mount lustre on $MOUNT with $MOUNTLUSTRE: success" [ -d /r ] && $LCTL modules > /r/tmp/ogdb-`hostname` check_mount || return 41 @@ -629,8 +626,8 @@ test_15() { [ -f "$MOUNTLUSTRE" ] && rm -f $MOUNTLUSTRE echo "mount lustre on ${MOUNT} without $MOUNTLUSTRE....." - do_node `hostname` mount -t lustre -o nettype=$NETTYPE \ - `facet_active_host mds`:/mds_svc/client_facet $MOUNT &&return $? + do_node `hostname` mount -t lustre -o nettype=$NETTYPE,$MOUNTOPT \ + `facet_nid mds`:/mds_svc/client_facet $MOUNT &&return $? echo "mount lustre on $MOUNT without $MOUNTLUSTRE failed as expected" cleanup || return $? cleanup_15 @@ -746,4 +743,15 @@ test_18() { } run_test 18 "check lconf creates large journals" +test_19() { + # first format the ost/mdt + start_ost + start_mds + stop_mds + stop_ost + start mds $MDSLCONFARGS || return 1 + stop mds --force || return 2 +} +run_test 19 "start/stop MDS without OSTs" + equals_msg "Done" diff --git a/lustre/tests/echo.sh b/lustre/tests/echo.sh index 0c2ab30..10a0bd9 100755 --- a/lustre/tests/echo.sh +++ b/lustre/tests/echo.sh @@ -17,6 +17,7 @@ HOSTNAME=`hostname` SERVER=${SERVER:-$HOSTNAME} CLIENT=${CLIENT:-$HOSTNAME} NET=${NET:-tcp} +[ "$ACCEPTOR_PORT" ] && PORT_OPT="--port $ACCEPTOR_PORT" h2tcp () { case $1 in @@ -60,7 +61,7 @@ STRIPES_PER_OBJ=2 # 0 means stripe over all OSTs rm -f $config # create nodes $LMC --add node --node $SERVER || exit 1 -$LMC --add net --node $SERVER --nid `h2$NET $SERVER` --nettype $NET || exit 2 +$LMC --add net --node $SERVER --nid `h2$NET $SERVER` --nettype $NET $PORT_OPT|| exit 2 if (($LOV)); then $LMC --add mds --node $SERVER --mds mds1 --fstype $FSTYPE --dev $MDSDEV --size $MDSSIZE || exit 10 @@ -75,7 +76,7 @@ fi if [ "$SERVER" != "$CLIENT" ]; then $LMC --add node --node $CLIENT || exit 1 - $LMC --add net --node $CLIENT --nid 
`h2$NET $CLIENT` --nettype $NET || exit 2 + $LMC --add net --node $CLIENT --nid `h2$NET $CLIENT` --nettype $NET $PORT_OPT || exit 2 fi $LMC --add echo_client --node $CLIENT --ost ${OBD_NAME} || exit 3 diff --git a/lustre/tests/fsx.c b/lustre/tests/fsx.c index a0f4c39..6cf43ee 100644 --- a/lustre/tests/fsx.c +++ b/lustre/tests/fsx.c @@ -531,7 +531,10 @@ check_trunc_hack(void) ftruncate(fd, (off_t)0); ftruncate(fd, (off_t)100000); - fstat(fd, &statbuf); + if (fstat(fd, &statbuf)) { + prterr("trunc_hack: fstat"); + statbuf.st_size = -1; + } if (statbuf.st_size != (off_t)100000) { prt("no extend on truncate! not posix!\n"); exit(130); diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh index c6d0a1b..03a8f7d 100755 --- a/lustre/tests/insanity.sh +++ b/lustre/tests/insanity.sh @@ -10,7 +10,7 @@ init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh} -ALWAYS_EXCEPT="10" +ALWAYS_EXCEPT="10 $INSANITY_EXCEPT" SETUP=${SETUP:-"setup"} CLEANUP=${CLEANUP:-"cleanup"} @@ -152,7 +152,7 @@ cleanup() { stop mds ${FORCE} $MDSLCONFARGS || : for i in `seq $NUMOST`; do - stop ost$i ${REFORMAT} ${FORCE} $OSTLCONFARGS || : + stop ost$i ${FORCE} $OSTLCONFARGS || : done } diff --git a/lustre/tests/llmount.sh b/lustre/tests/llmount.sh index 24e5521..ca26b2a 100755 --- a/lustre/tests/llmount.sh +++ b/lustre/tests/llmount.sh @@ -27,9 +27,14 @@ fi [ "$NODE" ] && node_opt="--node $NODE" [ "$DEBUG" ] && debug_opt="--ptldebug=$DEBUG" +[ "$PTLDEBUG" ] && debug_opt="--ptldebug=$PTLDEBUG" ${LCONF} $NOMOD $portals_opt $lustre_opt $debug_opt $node_opt ${REFORMAT:---reformat} $@ \ - $conf_opt || exit 2 + $conf_opt || { + # maybe acceptor error, dump tcp port usage + netstat -tpn + exit 2 +} if [ "$MOUNT2" ]; then $LLMOUNT -v -o user_xattr,acl `hostname`:/mds1/client $MOUNT2 || exit 3 diff --git a/lustre/tests/llog-test.sh b/lustre/tests/llog-test.sh index 4044610..98b8d12 100644 --- a/lustre/tests/llog-test.sh +++ b/lustre/tests/llog-test.sh @@ -79,6 +79,7 @@ run_test 0 "Prepare 
fileset" test_1() { ./chownmany 1000 $DIR/llog-%d $LCOUNT sleep 5 + $CHECKSTAT -u \#1000 $DIR/llog-* || return 4 } run_test 1 "Do chowns" @@ -88,13 +89,14 @@ test_2() { fail ost ./chownmany 500 $DIR/llog-%d $HALFCOUNT $LCOUNT sleep 5 + $CHECKSTAT -u \#500 $DIR/llog-* || return 5 } -#run_test 2 "Fail OST during chown" +run_test 2 "Fail OST during chown" test_3() { ./unlinkmany $DIR/llog-%d $LCOUNT sleep 2 - $CHECKSTAT -t file $DIR/llog-* && return 1 || true + $CHECKSTAT -t file $DIR/llog-* && return 10 || true } run_test 3 "Remove testset" diff --git a/lustre/tests/llrmount.sh b/lustre/tests/llrmount.sh index 5e49d89..434ef44 100755 --- a/lustre/tests/llrmount.sh +++ b/lustre/tests/llrmount.sh @@ -1,4 +1,5 @@ #!/bin/sh +# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4: export PATH=`dirname $0`/../utils:$PATH @@ -27,10 +28,15 @@ else fi [ "$NODE" ] && node_opt="--node $NODE" +[ "$DEBUG" ] && portals_opt="$portals_opt --ptldebug=$DEBUG" +[ "$PTLDEBUG" ] && portals_opt="$portals_opt --ptldebug=$PTLDEBUG" -${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt $@ $conf_opt || exit 2 +${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt $@ $conf_opt || { + # maybe acceptor error, dump tcp port usage + netstat -tpn + exit 2 +} -[ $DEBUG ] && sysctl -w lnet.debug=$DEBUG if [ "$MOUNT2" ]; then $LLMOUNT -v -o user_xattr,acl `hostname`:/mds1/client $MOUNT2 || exit 3 diff --git a/lustre/tests/local.sh b/lustre/tests/local.sh index ced41d2..0a8cc71 100755 --- a/lustre/tests/local.sh +++ b/lustre/tests/local.sh @@ -14,6 +14,7 @@ FSTYPE=${FSTYPE:-ext3} MOUNT=${MOUNT:-/mnt/lustre} MOUNT2=${MOUNT2:-${MOUNT}2} NETTYPE=${NETTYPE:-tcp} +[ "$ACCEPTOR_PORT" ] && PORT_OPT="--port $ACCEPTOR_PORT" OSTDEV=${OSTDEV:-$TMP/ost1-`hostname`} OSTSIZE=${OSTSIZE:-400000} @@ -59,8 +60,9 @@ h2iib () { # create nodes ${LMC} --add node --node $HOSTNAME || exit 10 -${LMC} --add net --node $HOSTNAME --nid `h2$NETTYPE $HOSTNAME` --nettype $NETTYPE || exit 11 -${LMC} --add net --node client --nid '*' 
--nettype $NETTYPE || exit 12 +${LMC} --add net --node $HOSTNAME --nid `h2$NETTYPE $HOSTNAME` \ + --nettype $NETTYPE $PORT_OPT || exit 11 +${LMC} --add net --node client --nid '*' --nettype $NETTYPE $PORT_OPT|| exit 12 # configure mds server [ "x$MDS_MOUNT_OPTS" != "x" ] && diff --git a/lustre/tests/lockorder.sh b/lustre/tests/lockorder.sh index 2997518..162df9a 100644 --- a/lustre/tests/lockorder.sh +++ b/lustre/tests/lockorder.sh @@ -15,9 +15,10 @@ COUNT=${COUNT:-100} cleanup() { [ $CR_PID ] && kill -9 $CR_PID + [ $ST_PID ] && kill -9 $ST_PID } -trap cleanup 0 +trap cleanup EXIT LOCKDIR=$DIR/lockdir LOCKFILE=$LOCKDIR/lockfile diff --git a/lustre/tests/lov.sh b/lustre/tests/lov.sh index 7130ad0..352c2b9 100755 --- a/lustre/tests/lov.sh +++ b/lustre/tests/lov.sh @@ -16,6 +16,7 @@ FSTYPE=${FSTYPE:-ext3} MOUNT=${MOUNT:-/mnt/lustre} MOUNT2=${MOUNT2:-${MOUNT}2} NETTYPE=${NETTYPE:-tcp} +[ "$ACCEPTOR_PORT" ] && PORT_OPT="--port $ACCEPTOR_PORT" OSTCOUNT=${OSTCOUNT:-5} # OSTDEVN will still override the device for OST N @@ -39,8 +40,9 @@ rm -f $config # create nodes ${LMC} --add node --node $HOSTNAME || exit 10 -${LMC} --add net --node $HOSTNAME --nid $HOSTNAME --nettype $NETTYPE || exit 11 -${LMC} --add net --node client --nid '*' --nettype $NETTYPE || exit 12 +${LMC} --add net --node $HOSTNAME --nid $HOSTNAME \ + --nettype $NETTYPE $PORT_OPT || exit 11 +${LMC} --add net --node client --nid '*' --nettype $NETTYPE $PORT_OPT || exit 12 [ "x$QUOTA_OPTS" != "x" ] && QUOTA_OPTS="--quota $QUOTA_OPTS" diff --git a/lustre/tests/memhog.c b/lustre/tests/memhog.c index 098e787..11cb734 100644 --- a/lustre/tests/memhog.c +++ b/lustre/tests/memhog.c @@ -50,30 +50,36 @@ int main(int argc, char *argv[]) mem = calloc(numchunk, sizeof(*mem)); if (mem == NULL) { fprintf(stderr, "error allocating initial chunk array\n"); - exit(1); + exit(-1); } alloc = CHUNK; printf("[%d] allocating %lld kbytes in %u kbyte chunks\n", getpid(), kbtotal, alloc); - for (i = kballoc = 0; i < numchunk; i++, 
kballoc += alloc) { + for (i = kballoc = 0; i < numchunk && alloc > 0; i++, kballoc += alloc){ if (kbtotal - kballoc < alloc) alloc = kbtotal - kballoc; - tmp = mem[i] = malloc(alloc * 1024); - if (tmp == NULL) { + while (alloc > 0 && (mem[i] = malloc(alloc * 1024)) == NULL) { fprintf(stderr, "malloc(%u) failed (%lld/%lld)\n", alloc * 1024, kballoc, kbtotal); - } else { - printf("touching %p (%lld/%lld)\n", - tmp, kballoc, kbtotal); - for (j = 0; j < alloc; j += 4) { - for (k = 0, sum = 0; k < 4095; k++, tmp++) - sum += *tmp; - *tmp = sum; - } + alloc /= 2; + } + if (alloc == 0) + break; + + printf("touching %p ([%lld-%lld]/%lld)\n", mem[i], kballoc, + kballoc + alloc - 1, kbtotal); + for (j = 0, tmp = mem[i]; j < alloc; j += 4) { + for (k = 0, sum = 0; k < 4095; k++, tmp++) + sum += *tmp; + *tmp = sum; } } + if (kballoc == 0) + exit(-2); + + kbtotal = kballoc; printf("touched %lld kbytes\n", kballoc); alloc = CHUNK; @@ -92,7 +98,7 @@ int main(int argc, char *argv[]) if (*tmp != sum) { fprintf(stderr, "sum %x != %x at %p\n", *tmp, sum, tmp - 4092); - rc = 1; + rc++; } } } diff --git a/lustre/tests/oos.sh b/lustre/tests/oos.sh index 28d1b2b..0d12568 100755 --- a/lustre/tests/oos.sh +++ b/lustre/tests/oos.sh @@ -13,9 +13,9 @@ LOG=$TMP/ooslog SUCCESS=1 -rm -f $OOS +rm -f $OOS $LOG -sleep 1 # to ensure we get up-to-date statfs info +sync; sleep 1; sync # to ensure we get up-to-date statfs info #echo -1 > /proc/sys/lnet/debug #echo 0x40a8 > /proc/sys/lnet/subsystem_debug @@ -34,7 +34,6 @@ fi export LANG=C LC_LANG=C # for "No space left on device" message -rm -f $LOG >/dev/null 2>&1 [ -f $LOG ] && echo "ERROR: log file wasn't removed?" 
&& exit 1 # make sure we stripe over all OSTs to avoid OOS on only a subset of OSTs @@ -45,7 +44,8 @@ if dd if=/dev/zero of=$OOS count=$(($ORIGFREE + 100)) bs=1k 2> $LOG; then fi if [ "`grep -c 'No space left on device' $LOG`" -ne 1 ]; then - echo "ERROR: dd not return ENOSPC" + echo "ERROR: dd not return ENOSPC" + sed "s/^/LOG: /" $LOG SUCCESS=0 fi @@ -65,11 +65,14 @@ if [ -z "$OSCFULL" ]; then fi RECORDSOUT=`grep "records out" $LOG | cut -d + -f1` - FILESIZE=`ls -l $OOS | awk '{ print $5 }'` -if [ "$RECORDSOUT" -ne $((FILESIZE / 1024)) ]; then - echo "ERROR: blocks written by dd not equal to the size of file" - SUCCESS=0 +if [ -z "$RECORDSOUT" ]; then + echo "ERROR: no blocks written by dd?" + sed "s/^/LOG: /" $LOG + SUCCESS=0 +elif [ "$RECORDSOUT" -ne $((FILESIZE / 1024)) ]; then + echo "ERROR: blocks written by dd not equal to the size of file" + SUCCESS=0 fi #lctl debug_daemon stop diff --git a/lustre/tests/oos2.sh b/lustre/tests/oos2.sh index b028760..f7682bb 100644 --- a/lustre/tests/oos2.sh +++ b/lustre/tests/oos2.sh @@ -18,7 +18,7 @@ SUCCESS=1 rm -f $OOS $OOS2 $LOG $LOG2 -sleep 1 # to ensure we get up-to-date statfs info +sync; sleep 1; sync # to ensure we get up-to-date statfs info STRIPECOUNT=`cat /proc/fs/lustre/lov/*/activeobd | head -n 1` ORIGFREE=`cat /proc/fs/lustre/llite/*/kbytesavail | head -n 1` diff --git a/lustre/tests/random-reads.c b/lustre/tests/random-reads.c index c02ef0f..1722afb 100644 --- a/lustre/tests/random-reads.c +++ b/lustre/tests/random-reads.c @@ -148,13 +148,15 @@ int main(int argc, char **argv) nblocks = size / bsize; buf = malloc(bsize); if (buf == NULL) { - LOG(LOG_CRIT, "malloc(%i) failure: %m\n", bsize); + LOG(LOG_CRIT, "malloc(%lu) failure: %s\n", (long)bsize, + strerror(errno)); return RR_MALLOC; } fd = open(fname, (preclean ? 
O_RDWR : O_RDONLY) | O_CREAT, 0700); if (fd == -1) { - LOG(LOG_CRIT, "malloc(\"%s\") failure: %m\n", fname); + LOG(LOG_CRIT, "malloc(\"%s\") failure: %s\n", fname, + strerror(errno)); return RR_OPEN; } if (preclean) { @@ -167,7 +169,8 @@ int main(int argc, char **argv) memset(buf, bsize, seed + i++); ret = write(fd, buf, count); if (ret < 0) { - LOG(LOG_CRIT, "write() failure: %m\n"); + LOG(LOG_CRIT, "write() failure: %s\n", + strerror(errno)); return RR_PRECLEAN; } } @@ -188,8 +191,8 @@ int main(int argc, char **argv) ret = pread(fd, buf, bsize, (block_nr + j) * bsize); if (ret != bsize) { LOG(LOG_CRIT, - "pread(...%zi, %li) got: %zi, %m\n", - bsize, block_nr * bsize, ret); + "pread(...%zi, %li) got: %zi, %s\n", bsize, + block_nr * bsize, ret, strerror(errno)); return RR_READ; } } diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index c784c50..6256185 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -3,8 +3,7 @@ set -e # bug 2986 5494 7288 -ALWAYS_EXCEPT="20b 24 27" - +ALWAYS_EXCEPT="20b 24 27 $RECOVERY_SMALL_EXCEPT" LUSTRE=${LUSTRE:-`dirname $0`/..} @@ -137,7 +136,7 @@ run_test 9 "pause bulk on OST (bug 1420)" #bug 1521 test_10() { do_facet client mcreate $MOUNT/$tfile || return 1 - drop_bl_callback "chmod 0777 $MOUNT/$tfile" || return 2 + drop_bl_callback "chmod 0777 $MOUNT/$tfile" || echo "evicted as expected" # wait for the mds to evict the client #echo "sleep $(($TIMEOUT*2))" #sleep $(($TIMEOUT*2)) @@ -156,8 +155,7 @@ test_11(){ cancel_lru_locks OSC do_facet client multiop $MOUNT/$tfile or || return 3 - drop_bl_callback multiop $MOUNT/$tfile Ow || - echo "client evicted, as expected" + drop_bl_callback multiop $MOUNT/$tfile Ow || echo "evicted as expected" do_facet client munlink $MOUNT/$tfile || return 4 } @@ -317,7 +315,7 @@ run_test 18b "eviction and reconnect clears page cache (2766)" test_19a() { f=$MOUNT/$tfile do_facet client mcreate $f || return 1 - drop_ldlm_cancel "chmod 0777 $f" || echo 
evicted + drop_ldlm_cancel "chmod 0777 $f" || echo "evicted as expected" do_facet client checkstat -v -p 0777 $f || echo evicted # let the client reconnect @@ -447,8 +445,8 @@ test_27() { run_test 27 "fail LOV while using OSC's" test_28() { # bug 6086 - error adding new clients - do_facet client mcreate $MOUNT/$tfile || return 1 - drop_bl_callback "chmod 0777 $MOUNT/$tfile" || return 2 + do_facet client mcreate $MOUNT/$tfile || return 1 + drop_bl_callback "chmod 0777 $MOUNT/$tfile" ||echo "evicted as expected" #define OBD_FAIL_MDS_ADD_CLIENT 0x12f do_facet mds sysctl -w lustre.fail_loc=0x8000012f # fail once (evicted), reconnect fail (fail_loc), ok diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index a5f461b..5fe9d3a 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -2,8 +2,8 @@ set -e -# bug 6088 -ALWAYS_EXCEPT="8" +# bug number: 6088 10124 +ALWAYS_EXCEPT="8 15c $REPLAY_DUAL_EXCEPT" LUSTRE=${LUSTRE:-`dirname $0`/..} . $LUSTRE/tests/test-framework.sh @@ -253,9 +253,9 @@ test_12() { sysctl -w lustre.fail_loc=0 ls $DIR/$tfile - $CHECKSTAT -t file $DIR/$tfile || return 2 kill -USR1 $MULTIPID || return 3 wait $MULTIPID || return 4 + $CHECKSTAT -t file $DIR/$tfile || return 2 rm $DIR/$tfile return 0 @@ -316,6 +316,7 @@ test_15() { df $MOUNT || return 1 unlinkmany $MOUNT1/$tfile- 25 || return 2 + [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists" zconf_mount `hostname` $MOUNT2 return 0 @@ -369,7 +370,7 @@ test_15a() { zconf_mount `hostname` $MOUNT2 return 0 } -run_test 15a "OST clear orphans - synchronize ids on MDS and OST" +#CROW run_test 15a "OST clear orphans - synchronize ids on MDS and OST" test_15b() { replay_barrier mds @@ -385,15 +386,12 @@ test_15b() { zconf_mount `hostname` $MOUNT2 return 0 } -run_test 15b "multiple delayed OST clear orphans" +#CROW run_test 15b "multiple delayed OST clear orphans" test_15c() { - local ost_last_id="" - local osc_last_id="" - replay_barrier mds - for ((i=0;i<20000;i++)); 
do - echo "data" > "$MOUNT2/${tfile}-$i" + for ((i = 0; i < 2000; i++)); do + echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed" done umount $MOUNT2 diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh index cbe7da1..c9ae901 100755 --- a/lustre/tests/replay-ost-single.sh +++ b/lustre/tests/replay-ost-single.sh @@ -13,7 +13,7 @@ ostfailover_HOST=${ostfailover_HOST:-$ost_HOST} # Skip these tests # BUG NUMBER: 2766? -ALWAYS_EXCEPT="5" +ALWAYS_EXCEPT="5 $REPLAY_OST_SINGLE_EXCEPT" gen_config() { rm -f $XMLCONFIG @@ -154,6 +154,7 @@ test_6() { sync && sleep 2 && sync # wait for delete thread before=`kbytesfree` dd if=/dev/urandom bs=4096 count=1280 of=$f + lfs getstripe $f #define OBD_FAIL_MDS_REINT_NET_REP 0x119 do_facet mds "sysctl -w lustre.fail_loc=0x80000119" sync @@ -166,7 +167,7 @@ test_6() { $CHECKSTAT -t file $f && return 2 || true sync # let the delete happen - sleep 2 + sleep 5 after=`kbytesfree` log "before: $before after: $after" (( $before <= $after + 40 )) || return 3 # take OST logs into account @@ -180,6 +181,7 @@ test_7() { before=`kbytesfree` dd if=/dev/urandom bs=4096 count=1280 of=$f sync + sleep 1 # ensure we have a fresh statfs after_dd=`kbytesfree` log "before: $before after_dd: $after_dd" (( $before > $after_dd )) || return 1 diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 4fbeaf3..075861c 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -14,8 +14,8 @@ init_test_env $@ . 
${CONFIG:=$LUSTRE/tests/cfg/local.sh} # Skip these tests -# bug number: 2766 4176 -ALWAYS_EXCEPT="0b 39 48" +# bug number: 2766 9930 +ALWAYS_EXCEPT="0b 39 $REPLAY_SINGLE_EXCEPT" gen_config() { rm -f $XMLCONFIG @@ -100,9 +100,9 @@ run_test 1 "simple create" test_1a() { do_facet ost "sysctl -w lustre.fail_loc=0" - rm -fr $DIR/1a0 + rm -fr $DIR/$tfile local old_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id` - createmany -o /mnt/lustre/1a 1 + touch -o $DIR/$tfile 1 sync local new_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id` @@ -112,7 +112,7 @@ test_1a() { } old_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id` - echo "data" > $DIR/1a0 + echo "data" > $DIR/$tfile sync new_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id` test "$old_last_id" = "$new_last_id "&& { @@ -120,7 +120,7 @@ test_1a() { return 1 } - rm -fr $DIR/1a0 + rm -fr $DIR/$tfile #define OBD_FAIL_OST_CROW_EIO | OBD_FAIL_ONCE do_facet ost "sysctl -w lustre.fail_loc=0x80000801" @@ -139,7 +139,7 @@ test_1a() { do_facet ost "sysctl -w lustre.fail_loc=0" } -run_test 1a "CROW object create (check OST last_id)" +#CROW run_test 1a "CROW object create (check OST last_id)" test_2a() { replay_barrier mds diff --git a/lustre/tests/run-llog.sh b/lustre/tests/run-llog.sh index b7201f2..515a347 100644 --- a/lustre/tests/run-llog.sh +++ b/lustre/tests/run-llog.sh @@ -5,7 +5,11 @@ TMP=${TMP:-/tmp} MDS=`ls /proc/fs/lustre/mds | grep -v num_refs | head -n 1` [ -z "$MDS" ] && echo "no MDS available, skipping llog test" && exit 0 -insmod ../obdclass/llog_test.o || exit 1 +case `uname -r` in +2.4.*) insmod ../obdclass/llog_test.o || exit 1 ;; +2.6.*) insmod ../obdclass/llog_test.ko || exit 1 ;; +*) echo "unknown kernel version `uname -r`" && exit 99 ;; +esac lctl modules > $TMP/ogdb-`hostname` echo "NOW reload debugging syms.." 
diff --git a/lustre/tests/rundbench b/lustre/tests/rundbench index b23ea12..09a0549 100755 --- a/lustre/tests/rundbench +++ b/lustre/tests/rundbench @@ -5,9 +5,10 @@ DIR=${DIR:-$MNT/`hostname`} mkdir -p $DIR TGT=$DIR/client.txt SRC=${SRC:-/usr/lib/dbench/client.txt} -[ ! -e $TGT -a -e $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT +[ ! -s $TGT -a -s $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT SRC=/usr/lib/dbench/client_plain.txt -[ ! -e $TGT -a -e $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT +[ ! -s $TGT -a -s $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT +[ ! -s $TGT ] && echo "$TGT doesn't exist" && exit 1 cd $DIR echo "running 'dbench $@' on $PWD at `date`" dbench -c client.txt $@ diff --git a/lustre/tests/runregression-mds.sh b/lustre/tests/runregression-mds.sh index d403bb4..1b05df8 100755 --- a/lustre/tests/runregression-mds.sh +++ b/lustre/tests/runregression-mds.sh @@ -26,7 +26,7 @@ cleanup() { OSCMT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`" if [ -z "$OSCMT" ]; then $LCONF $@ || exit 1 - trap cleanup 0 + trap cleanup EXIT OSCMT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`" [ -z "$OSCMT" ] && fail "no lustre filesystem mounted" 1 fi diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index 234b12c..a1df23a 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -3,12 +3,10 @@ set -e ONLY=${ONLY:-"$*"} -# bug number for skipped test: 1768 3192 4035 -ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"4 14b 14c"} +# bug number for skipped test: 3192 4035 +ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"14b 14c"} # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! -[ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT" - SRCDIR=`dirname $0` PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH @@ -66,36 +64,68 @@ run_one() { if ! 
mount | grep -q $DIR1; then $START fi + testnum=$1 + message=$2 BEFORE=`date +%s` - log "== test $1: $2= `date +%H:%M:%S` ($BEFORE)" - export TESTNAME=test_$1 - test_$1 || error "test_$1: exit with rc=$?" + log "== test $testnum: $message= `date +%H:%M:%S` ($BEFORE)" + export TESTNAME=test_$testnum + export tfile=f${testnum} + export tdir=d${base} + test_$1 || error "exit with rc=$?" unset TESTNAME pass "($((`date +%s` - $BEFORE))s)" cd $SAVE_PWD $CLEAN } +build_test_filter() { + [ "$ALWAYS_EXCEPT$EXCEPT$SANITYN_EXCEPT" ] && \ + echo "Skipping tests: `echo $ALWAYS_EXCEPT $EXCEPT $SANITYN_EXCEPT`" + + for O in $ONLY; do + eval ONLY_${O}=true + done + for E in $EXCEPT $ALWAYS_EXCEPT $SANITY_EXCEPT; do + eval EXCEPT_${E}=true + done +} + +_basetest() { + echo $* +} + +basetest() { + IFS=abcdefghijklmnopqrstuvwxyz _basetest $1 +} + run_test() { - for O in $ONLY; do - if [ "`echo $1 | grep '\<'$O'[a-z]*\>'`" ]; then - echo "" - run_one $1 "$2" - return $? - else - echo -n "." - fi - done - for X in $EXCEPT $ALWAYS_EXCEPT; do - if [ "`echo $1 | grep '\<'$X'[a-z]*\>'`" ]; then - echo "skipping excluded test $1" - return 0 - fi - done - if [ -z "$ONLY" ]; then - run_one $1 "$2" - return $? - fi + export base=`basetest $1` + if [ "$ONLY" ]; then + testname=ONLY_$1 + if [ ${!testname}x != x ]; then + run_one $1 "$2" + return $? + fi + testname=ONLY_$base + if [ ${!testname}x != x ]; then + run_one $1 "$2" + return $? + fi + echo -n "." + return 0 + fi + testname=EXCEPT_$1 + if [ ${!testname}x != x ]; then + echo "skipping excluded test $1" + return 0 + fi + testname=EXCEPT_$base + if [ ${!testname}x != x ]; then + echo "skipping excluded test $1 (base $base)" + return 0 + fi + run_one $1 "$2" + return $? 
} [ "$SANITYLOG" ] && rm -f $SANITYLOG || true @@ -127,6 +157,8 @@ export DIR2=${DIR2:-$MOUNT2} rm -rf $DIR1/[df][0-9]* $DIR1/lnk +build_test_filter + test_1a() { touch $DIR1/f1 [ -f $DIR2/f1 ] || error @@ -344,13 +376,13 @@ test_14c() { # bug 3430 run_test 14c "open(O_TRUNC) of executing file return -ETXTBSY ==" test_15() { # bug 974 - ENOSPC - echo $PATH + echo "PATH=$PATH" sh oos2.sh $MOUNT1 $MOUNT2 } run_test 15 "test out-of-space with multiple writers ===========" test_16() { - fsx -c 50 -p 100 -N 2500 $MOUNT1/fsxfile $MOUNT2/fsxfile + fsx -c 50 -p 100 -N 2500 -S 0 $MOUNT1/fsxfile $MOUNT2/fsxfile } run_test 16 "2500 iterations of dual-mount fsx =================" @@ -421,23 +453,71 @@ test_20() { [ $CNTD -gt 0 ] && \ error $CNTD" page left in cache after lock cancel" || true } - run_test 20 "test extra readahead page left in cache ====" +cleanup_21() { + trap 0 + umount $DIR1/d21 +} + test_21() { # Bug 5907 mkdir $DIR1/d21 - mount /etc $DIR1/d21 --bind # Poor man's mount. - rmdir $DIR1/d21 && error "Removed mounted directory" - rmdir $DIR2/d21 && echo "Removed mounted directory from another mountpoint, needs to be fixed" - test -d $DIR1/d21 || error "Monted directory disappeared" - umount $DIR1/d21 + mount /etc $DIR1/d21 --bind || error "mount failed" # Poor man's mount. 
+ trap cleanup_21 EXIT + rmdir -v $DIR1/d21 && error "Removed mounted directory" + rmdir -v $DIR2/d21 && echo "Removed mounted directory from another mountpoint, needs to be fixed" + test -d $DIR1/d21 || error "Mounted directory disappeared" + cleanup_21 test -d $DIR2/d21 || test -d $DIR1/d21 && error "Removed dir still visible after umount" true } - run_test 21 " Try to remove mountpoint on another dir ====" +JOIN=${JOIN:-"lfs join"} + +test_22() { # Bug 9926 + mkdir $DIR1/d21 + dd if=/dev/urandom of=$DIR1/d21/128k bs=1024 count=128 + cp -p $DIR1/d21/128k $DIR1/d21/f_head + for ((i=0;i<10;i++)); do + cp -p $DIR1/d21/128k $DIR1/d21/f_tail + $JOIN $DIR1/d21/f_head $DIR1/d21/f_tail || error "join error" + $CHECKSTAT -a $DIR1/d21/f_tail || error "tail file exist after join" + done + echo aaaaaaaaaaa >> $DIR1/d21/no_joined + + mv $DIR2/d21/f_head $DIR2/ + munlink $DIR2/f_head || error "unlink joined file error" + cat $DIR2/d21/no_joined || error "cat error" + rm -rf $DIR2/d21/no_joined || error "unlink normal file error" +} +run_test 22 " After joining in one dir, open/close unlink file in anther dir" +test_23() { # Bug 5972 + echo "others should see updated atime while another read" > $DIR1/f23 + + # clear the lock(mode: LCK_PW) gotten from creating operation + cancel_lru_locks OSC + + time1=`date +%s` + sleep 2 + + multiop $DIR1/f23 or20_c & + MULTIPID=$! 
+ + sleep 2 + time2=`stat -c "%X" $DIR2/f23` + + if (( $time2 <= $time1 )); then + kill -USR1 $MULTIPID + error "atime doesn't update among nodes" + fi + + kill -USR1 $MULTIPID || return 1 + rm -f $DIR1/f23 || error "rm -f $DIR1/f23 failed" + true +} +run_test 23 " others should see updated atime while another read====" log "cleanup: ======================================================" rm -rf $DIR1/[df][0-9]* $DIR1/lnk || true diff --git a/lustre/tests/tbox.sh b/lustre/tests/tbox.sh deleted file mode 100644 index 337e1b8..0000000 --- a/lustre/tests/tbox.sh +++ /dev/null @@ -1,116 +0,0 @@ -# tbox.sh - Shell functions to manage tinderbox build reporting -# Copyright (C) 2002 Cluster File Systems, Inc. -# Gord Eagle , 2002-08-22 - -HOSTNAME=`hostname` -PROGNAME=`echo "$0" | sed -e 's%^.*/%%'` -MAILPROG="${MAILPROG-mail}" - -TBOX_PHASE=build # or test -TBOX_STARTTIME=`date +%s` -TBOX_LOG="${TBOX_LOG-/tmp/tbox.$$.$TBOX_STARTTIME.log}" -TBOX_BUILDMAIL=tinderbox_builds@lustre.org -TBOX_BUILDNAME="${TBOX_BUILDNAME-$PROGNAME-$HOSTNAME}" - -# Send a status message to the list. -tbox_status() { - [ -n "$TBOX_BUILDNAME" -a -n "$TBOX_BUILDMAIL" ] || return 0 - [ "$#" -ge 4 ] || return 1 - if [ "$#" -gt 4 ]; then - log="$5" - echo >> $log - else - log= - fi - - TREE="$1" - SUBJECT="$2" - STATUS="$3" - TIMENOW="$4" - - echo "sending tinderbox mail to $TBOX_BUILDMAIL: $TREE $SUBJECT $STATUS" - - TMPFILE="/tmp/tinderbox.boilerplate.$$.$TIMENOW" - - cat > $TMPFILE <<-EOF - tinderbox: tree: $TREE - tinderbox: starttime: $TBOX_STARTTIME - tinderbox: timenow: $TIMENOW - tinderbox: builddate: $TBOX_STARTTIME - tinderbox: status: $STATUS - tinderbox: buildname: $TBOX_BUILDNAME - tinderbox: errorparser: unix - tinderbox: END - -EOF - - cat $TMPFILE $log | $MAILPROG -s "build $SUBJECT ($TBOX_BUILDNAME)" $TBOX_BUILDMAIL - rm -f $TMPFILE -} - -# Send out the failure or success message based on exit status. 
-tbox_exit() { - TREE="$1" - TAILPID="$2" - CODE=${3-$?} - if [ $CODE -eq 0 ]; then - SUBJECT=successful - STATUS=success - else - SUBJECT=failed - STATUS="${TBOX_PHASE}_failed" - fi - - # Send off the status message. - trap 0 - tbox_status "$TREE" "$SUBJECT" "$STATUS" - rm -f $TBOX_LOG - - # Wait for tail to display all output, then finish it. - sleep 1 - kill $TAILPID - exit $CODE -} - -# Run a subprogram, but stop it from sending its own tinderbox -# messages. -tbox_absorb_log() { - # This probably doesn't do what you think it does... it only prepends - # TBOX_LOG= to our arguments. - set TBOX_LOG= "$@" - - # Now evaluate the command. - eval "$@" -} - -# Start the log for a given tree. -tbox_start_log() { - TREE="$1" - - # Send status messages to stdout, stderr. - exec 6>&1 7>&2 - - [ -n "$TBOX_LOG" ] || return 0 - - # Initialize the output log file. - : > $TBOX_LOG - - # Send all our output to the log. - exec >>$TBOX_LOG 2>&1 - - # Monitor it on the old stdout. - tail -f $TBOX_LOG 1>&6 & - - # Allow tail to print our last output before exiting. - trap "tbox_exit \"$TREE\" $! 1" 1 2 10 15 - trap "tbox_exit \"$TREE\" $!" 0 -} - - -# Begin writing to the log and send out the initial status. 
-# tbox_start TREE -tbox_start() { - TREE="$1" - tbox_start_log "$TREE" - tbox_status "$TREE" starting building "$TBOX_STARTTIME" -} diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 339cbb5..e4c18ad 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -1,4 +1,5 @@ #!/bin/bash +# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4: set -e @@ -31,7 +32,7 @@ init_test_env() { export XMLCONFIG=${XMLCONFIG:-${TESTSUITE}.xml} export LTESTDIR=${LTESTDIR:-$LUSTRE/../ltest} - [ -d /r ] && export ROOT=/r + [ -d /r ] && export ROOT=${ROOT:-/r} export TMP=${TMP:-$ROOT/tmp} export PATH=:$PATH:$LUSTRE/utils:$LUSTRE/tests @@ -42,6 +43,10 @@ init_test_env() { export CHECKSTAT="${CHECKSTAT:-checkstat} " export FSYTPE=${FSTYPE:-"ext3"} + if [ "$ACCEPTOR_PORT" ]; then + export PORT_OPT="--port $ACCEPTOR_PORT" + fi + # Paths on remote nodes, if different export RLUSTRE=${RLUSTRE:-$LUSTRE} export RPWD=${RPWD:-$PWD} @@ -73,6 +78,12 @@ start() { do_facet $facet $LCONF --select ${facet}_svc=${active}_facet \ --node ${active}_facet --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM \ $@ $XMLCONFIG + RC=${PIPESTATUS[0]} + if [ $RC -ne 0 ]; then + # maybe acceptor error, dump tcp port usage + netstat -tpn + fi + return $RC } stop() { @@ -268,12 +279,12 @@ facet_nid() { facet=$1 HOST=`facet_host $facet` if [ -z "$HOST" ]; then - echo "The env variable ${facet}_HOST must be set." - exit 1 + echo "The env variable ${facet}_HOST must be set." + exit 1 fi if [ -z "$NETTYPE" ]; then - echo "The env variable NETTYPE must be set." - exit 1 + echo "The env variable NETTYPE must be set." 
+ exit 1 fi echo `h2$NETTYPE $HOST` } @@ -343,7 +354,8 @@ add_facet() { echo "add facet $facet: `facet_host $facet`" do_lmc --add node --node ${facet}_facet $@ --timeout $TIMEOUT \ --lustre_upcall $UPCALL --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM - do_lmc --add net --node ${facet}_facet --nid `facet_nid $facet` --nettype lnet + do_lmc --add net --node ${facet}_facet --nid `facet_nid $facet` \ + --nettype lnet $PORT_OPT } add_mds() { @@ -549,7 +561,7 @@ build_test_filter() { eval ONLY_${O}=true done [ "$EXCEPT$ALWAYS_EXCEPT" ] && \ - log "skipping test `echo $EXCEPT $ALWAYS_EXCEPT`" + log "skipping tests: `echo $EXCEPT $ALWAYS_EXCEPT`" for E in $EXCEPT $ALWAYS_EXCEPT; do eval EXCEPT_${E}=true done diff --git a/lustre/tests/test-lwizard.sh b/lustre/tests/test-lwizard.sh deleted file mode 100755 index 4f86411..0000000 --- a/lustre/tests/test-lwizard.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/expect - -spawn lwizard $argv -HOSTNAME=`hostname` -set timeout 3 -expect { - "overwrite existing" { - interact - } -} -expect "HOSTNAME for mds" -send -- "$HOSTNAME\n" -expect "network INTERFACE" -send -- "192.168.1.29/24 10.0.0.29/24\n" -expect "enter the device or loop file name for mds" -send -- "/tmp/mds\n" -expect "device SIZE" -send -- "10000\n" -expect "configure FAILOVER" -send -- "n\n" -expect "HOSTNAME for ost" -send -- "$HOSTNAME\n" -expect "network INTERFACE" -send -- "192.168.1.29/24 10.0.0.29/24\n" -expect "device or loop file name for ost" -send -- "/tmp/ost\n" -expect "device SIZE" -send -- "10000\n" -expect "configure FAILOVER" -send -- "n\n" -expect "HOSTNAME for ost" -send -- "\n" -expect "clients' mountpoint" -send -- "\n" -expect "configure another client with multiple network interfaces" -send -- "y\n" -expect "HOSTNAME" -send -- "node\n" -expect "network interface address" -send -- "192.168.1.29/24 10.0.0.29/24\n" -expect "configure another client with multiple network interfaces" -send -- "n\n" -expect "Lustre configuration has been written" -send -- 
"\n" -close diff --git a/lustre/tests/uml.sh b/lustre/tests/uml.sh index 9c8955d..e887a9f 100644 --- a/lustre/tests/uml.sh +++ b/lustre/tests/uml.sh @@ -27,6 +27,7 @@ CLIENTOPT="user_xattr,acl,${CLIENTOPT:-""}" NETTYPE=${NETTYPE:-tcp} NIDTYPE=${NIDTYPE:-$NETTYPE} +[ "$ACCEPTOR_PORT" ] && PORT_OPT="--port $ACCEPTOR_PORT" # NOTE - You can't have different MDS/OST nodes and also have clients on the # MDS/OST nodes without using --endlevel and --startlevel during lconf. @@ -87,7 +88,8 @@ h2iib () { echo -n "adding NET for:" for NODE in `echo $MDSNODE $OSTNODES $CLIENTS | tr -s " " "\n" | sort -u`; do echo -n " $NODE" - ${LMC} -m $config --add net --node $NODE --nid `h2$NIDTYPE $NODE` --nettype $NETTYPE || exit 1 + ${LMC} -m $config --add net --node $NODE --nid `h2$NIDTYPE $NODE` \ + --nettype $NETTYPE $PORT_OPT || exit 1 done # configure mds server diff --git a/lustre/utils/Makefile.am b/lustre/utils/Makefile.am index 3c17b11..efc7547 100644 --- a/lustre/utils/Makefile.am +++ b/lustre/utils/Makefile.am @@ -9,7 +9,7 @@ AM_LDFLAGS := -L$(top_builddir)/lnet/utils LIBPTLCTL := $(top_builddir)/lnet/utils/libptlctl.a sbin_scripts = lconf lmc llanalyze llstat.pl llobdstat.pl lactive \ - load_ldap.sh lrun lwizard + load_ldap.sh lrun bin_scripts = lfind lstripe if UTILS diff --git a/lustre/utils/lctl.c b/lustre/utils/lctl.c index 9156745..6561ed5 100644 --- a/lustre/utils/lctl.c +++ b/lustre/utils/lctl.c @@ -120,6 +120,8 @@ command_t cmdlist[] = { "Omitting the count means indefinitely, 0 means restore, " "otherwise fail 'count' messages.\n" "usage: fail nid|_all_ [count]"}, + {"ping", jt_ptl_ping, 0, "Check LNET connectivity\n" + "usage: ping nid [timeout] [pid]"}, /* Device selection commands */ {"=== device selection ===", jt_noop, 0, "device selection"}, diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c index 0aeb9f1..4a3874a 100644 --- a/lustre/utils/lfs.c +++ b/lustre/utils/lfs.c @@ -55,6 +55,7 @@ static int lfs_setstripe(int argc, char **argv); static int lfs_find(int 
argc, char **argv); static int lfs_getstripe(int argc, char **argv); static int lfs_osts(int argc, char **argv); +static int lfs_df(int argc, char **argv); static int lfs_check(int argc, char **argv); static int lfs_catinfo(int argc, char **argv); #ifdef HAVE_QUOTA_SUPPORT @@ -65,6 +66,7 @@ static int lfs_quotaoff(int argc, char **argv); static int lfs_setquota(int argc, char **argv); static int lfs_quota(int argc, char **argv); #endif +static int lfs_join(int argc, char **argv); /* all avaialable commands */ command_t cmdlist[] = { @@ -75,8 +77,8 @@ command_t cmdlist[] = { "usage: setstripe \n" " or \n" " setstripe -d (to delete default striping)\n" - "\tstripe size: Number of bytes in each stripe (0 default)\n" - "\tstripe start: OST index of first stripe (-1 default)\n" + "\tstripe size: Number of bytes on each OST (0 filesystem default)\n" + "\tstripe start: OST index of first stripe (-1 filesystem default)\n" "\tstripe count: Number of OSTs to stripe over (0 default, -1 all)"}, {"find", lfs_find, 0, "To list the extended attributes for a given filename or files in a\n" @@ -94,7 +96,14 @@ command_t cmdlist[] = { "usage: catinfo {keyword} [node name]\n" "\tkeywords are one of followings: config, deletions.\n" "\tnode name must be provided when use keyword config."}, + {"join", lfs_join, 0, + "join two lustre files into one - join A, B, will be like cat B >> A & del B\n" + "usage: join \n"}, {"osts", lfs_osts, 0, "osts"}, + {"df", lfs_df, 0, + "report filesystem disk space usage or inodes usage" + "of each MDS/OSD.\n" + "Usage: df [-i] [-h] [path]"}, #ifdef HAVE_QUOTA_SUPPORT {"quotachown",lfs_quotachown, 0, "Change files' owner or group on the specified filesystem.\n" @@ -308,7 +317,7 @@ static int lfs_osts(int argc, char **argv) fp = setmntent(MOUNTED, "r"); if (fp == NULL) { - fprintf(stderr, "setmntent(%s): %s:", MOUNTED, + fprintf(stderr, "%s: setmntent(%s): %s:", argv[0], MOUNTED, strerror (errno)); } else { mnt = getmntent(fp); @@ -317,8 +326,262 @@ static 
int lfs_osts(int argc, char **argv) rc = llapi_find(mnt->mnt_dir, obduuid, 0, 0, 0); if (rc) fprintf(stderr, - "error: lfs osts failed on %s\n", - mnt->mnt_dir); + "error: %s: failed on %s\n", + argv[0], mnt->mnt_dir); + } + mnt = getmntent(fp); + } + endmntent(fp); + } + + return rc; +} + +#define COOK(value) \ +({ \ + int radix = 0; \ + while (value > 1024) { \ + value /= 1024; \ + radix++; \ + } \ + radix; \ +}) +#define UUF "%-20s" +#define CSF "%9s" +#define CDF "%9llu" +#define HSF "%8s" +#define HDF "%8llu" +#define RSF "%5s" +#define RDF "%5d" + +static int path2mnt(char *path, FILE *fp, char *mntdir, int dir_len) +{ + char rpath[PATH_MAX] = {'\0'}; + struct mntent *mnt, out_mnt = {0}; + int rc, len, out_len = 0; + + if (!realpath(path, rpath)) { + rc = -errno; + fprintf(stderr, "error: lfs df: invalid path '%s': %s\n", + path, strerror(-rc)); + return rc; + } + + len = 0; + mnt = getmntent(fp); + while (feof(fp) == 0 && ferror(fp) == 0) { + if (llapi_is_lustre_mnttype(mnt->mnt_type)) { + len = strlen(mnt->mnt_dir); + if (len > out_len && + !strncmp(rpath, mnt->mnt_dir, len)) { + out_len = len; + memcpy(&out_mnt, mnt, sizeof(out_mnt)); + } + } + mnt = getmntent(fp); + } + + if (out_len > 0) { + strncpy(mntdir, out_mnt.mnt_dir, dir_len); + return 0; + } + + return -EINVAL; +} + +static int showdf(char *mntdir, struct obd_statfs *stat, + struct obd_uuid *uuid, int ishow, int cooked, + char *type, int index, int rc) +{ + __u64 avail, used, total; + double ratio = 0; + int obd_type; + char *suffix = "KMGTPEZY"; + char tbuf[10], ubuf[10], abuf[10], rbuf[10]; + + if (!uuid || !stat || !type) + return -EINVAL; + if (!strncmp(type, "MDT", 3)) { + obd_type = 0; + } else if(!strncmp(type, "OST", 3)){ + obd_type = 1; + } else { + fprintf(stderr, "error: lfs df: invalid type '%s'\n", type); + return -EINVAL; + } + + if (rc == 0) { + if (ishow) { + avail = stat->os_ffree; + used = stat->os_files - stat->os_ffree; + total = stat->os_files; + } else { + avail = 
stat->os_bavail * stat->os_bsize / 1024; + used = stat->os_blocks - stat->os_bavail; + used = used * stat->os_bsize / 1024; + total = stat->os_blocks * stat->os_bsize / 1024; + } + + if (total > 0) + ratio = (double)used / (double)total; + + if (cooked) { + int i; + i = COOK(total); + if (i > 0) + sprintf(tbuf, HDF"%c", total, suffix[i - 1]); + else + sprintf(tbuf, CDF, total); + + i = COOK(used); + if (i > 0) + sprintf(ubuf, HDF"%c", used, suffix[i - 1]); + else + sprintf(ubuf, CDF, used); + + i = COOK(avail); + if (i > 0) + sprintf(abuf, HDF"%c", avail, suffix[i - 1]); + else + sprintf(abuf, CDF, avail); + } else { + sprintf(tbuf, CDF, total); + sprintf(ubuf, CDF, used); + sprintf(abuf, CDF, avail); + } + + sprintf(rbuf, RDF, (int)(ratio * 100)); + if (obd_type == 0) + printf(UUF" "CSF" "CSF" "CSF" "RSF" %-s[MDT:%d]\n", + (char *)uuid, tbuf, ubuf, abuf, rbuf, + mntdir, index); + else + printf(UUF" "CSF" "CSF" "CSF" "RSF" %-s[OST:%d]\n", + (char *)uuid, tbuf, ubuf, abuf, rbuf, + mntdir, index); + + return 0; + } + switch (rc) { + case -ENODATA: + printf(UUF": inactive OST\n", (char *)uuid); + break; + default: + printf(UUF": %s\n", (char *)uuid, strerror(-rc)); + break; + } + + return 0; +} + +static int mntdf(char *mntdir, int ishow, int cooked) +{ + struct obd_statfs stat_buf; + struct obd_uuid uuid_buf; + __u32 index; + int rc; + + if (ishow) + printf(UUF" "CSF" "CSF" "CSF" "RSF" %-s\n", + "UUID", "Inodes", "IUsed", "IFree", + "IUse%", "Mounted on"); + else + printf(UUF" "CSF" "CSF" "CSF" "RSF" %-s\n", + "UUID", "1K-blocks", "Used", "Available", + "Use%", "Mounted on"); + + for (index = 0; ; index++) { + memset(&stat_buf, 0, sizeof(struct obd_statfs)); + memset(&uuid_buf, 0, sizeof(struct obd_uuid)); + rc = llapi_obd_statfs(mntdir, LL_STATFS_MDC, index, + &stat_buf, &uuid_buf); + if (rc == -ENODEV) + break; + + if (rc == -ENOTCONN || rc == -ETIMEDOUT || rc == -EIO || + rc == -ENODATA || rc == 0) { + showdf(mntdir, &stat_buf, &uuid_buf, ishow, cooked, + "MDT", 
index, rc); + } else { + fprintf(stderr, + "error: llapi_obd_statfs(%s): %s (%d)\n", + uuid_buf.uuid, strerror(-rc), rc); + return rc; + } + } + + for (index = 0;;index++) { + memset(&stat_buf, 0, sizeof(struct obd_statfs)); + memset(&uuid_buf, 0, sizeof(struct obd_uuid)); + rc = llapi_obd_statfs(mntdir, LL_STATFS_LOV, index, + &stat_buf, &uuid_buf); + if (rc == -ENODEV) + break; + + if (rc == -ENOTCONN || rc == -ETIMEDOUT || rc == -EIO || + rc == -ENODATA || rc == 0) { + showdf(mntdir, &stat_buf, &uuid_buf, ishow, cooked, + "OST", index, rc); + } else { + fprintf(stderr, + "error: llapi_obd_statfs failed: %s (%d)\n", + strerror(-rc), rc); + return rc; + } + } + return 0; +} + +static int lfs_df(int argc, char **argv) +{ + FILE *fp; + char *path = NULL; + struct mntent *mnt = NULL; + char mntdir[PATH_MAX] = {'\0'}; + int ishow = 0, cooked = 0; + int c, rc = 0; + + optind = 0; + while ((c = getopt(argc, argv, "ih")) != -1) { + switch (c) { + case 'i': + ishow = 1; + break; + case 'h': + cooked = 1; + break; + default: + return CMD_HELP; + } + } + if (optind < argc ) + path = argv[optind]; + + fp = setmntent(MOUNTED, "r"); + if (fp == NULL) { + rc = -errno; + fprintf(stderr, "error: %s: open %s failed( %s )\n", + argv[0], MOUNTED, strerror(errno)); + return rc; + } + if (path) { + rc = path2mnt(path, fp, mntdir, sizeof(mntdir)); + if (rc) { + endmntent(fp); + return rc; + } + + rc = mntdf(mntdir, ishow, cooked); + printf("\n"); + endmntent(fp); + } else { + mnt = getmntent(fp); + while (feof(fp) == 0 && ferror(fp) == 0) { + if (llapi_is_lustre_mnttype(mnt->mnt_type)) { + rc = mntdf(mnt->mnt_dir, ishow, cooked); + if (rc) + break; + printf("\n"); } mnt = getmntent(fp); } @@ -372,6 +635,11 @@ static int lfs_check(int argc, char **argv) endmntent(fp); } + if (!mnt) { + fprintf(stderr, "No suitable Lustre mount found\n"); + return -1; + } + rc = llapi_target_check(num_types, obd_types, mnt->mnt_dir); if (rc) @@ -421,6 +689,39 @@ static int lfs_catinfo(int argc, char 
**argv) return rc; } +int lfs_join(int argc, char **argv) +{ + char *name_head, *name_tail; + int fd, rc; + loff_t size; + + if (argc != 3) + return CMD_HELP; + name_head = argv[1]; + fd = open(name_head, O_WRONLY); + if (fd < 0) { + fprintf(stderr, "Can not open name_head %s rc=%d\n", + name_head, fd); + return fd; + } + size = lseek(fd, 0, SEEK_END); + if (size % JOIN_FILE_ALIGN) { + fprintf(stderr,"head file %s size %llu must be mutiple of %d\n", + name_head, size, JOIN_FILE_ALIGN); + rc = -EINVAL; + goto out; + } + name_tail = argv[2]; + rc = ioctl(fd, LL_IOC_JOIN, name_tail); +out: + close(fd); + if (rc) { + fprintf(stderr, "Lustre joining files: %s, %s, failed\n", + argv[1], argv[2]); + } + return rc; +} + #ifdef HAVE_QUOTA_SUPPORT static int lfs_quotachown(int argc, char **argv) { @@ -488,7 +789,11 @@ static int lfs_quotacheck(int argc, char **argv) qctl.qc_cmd = LUSTRE_Q_QUOTAOFF; qctl.qc_id = QFMT_LDISKFS; qctl.qc_type = check_type; - llapi_quotactl(mnt, &qctl); + rc = llapi_quotactl(mnt, &qctl); + if (rc) { + fprintf(stderr, "quota off failed: %s\n", strerror(errno)); + return rc; + } rc = llapi_quotacheck(mnt, check_type); if (rc) { @@ -779,7 +1084,7 @@ static inline char *type2name(int check_type) static void grace2str(time_t seconds,char *buf) { uint minutes, hours, days; - + minutes = (seconds + 30) / 60; hours = minutes / 60; minutes %= 60; @@ -861,20 +1166,20 @@ static void print_quota(char *mnt, struct if_quotactl *qctl, int ost_only) printf("%s\n%15s", mnt, ""); else printf("%15s", mnt); - + if (bover) diff2str(dqb->dqb_btime, timebuf, now); - + sprintf(numbuf[0], "%llu", toqb(dqb->dqb_curspace)); sprintf(numbuf[1], "%llu", dqb->dqb_bsoftlimit); sprintf(numbuf[2], "%llu", dqb->dqb_bhardlimit); printf(" %7s%c %6s %7s %7s", numbuf[0], bover ? '*' : ' ', numbuf[1], numbuf[2], bover > 1 ? 
timebuf : ""); - + if (iover) diff2str(dqb->dqb_itime, timebuf, now); - + sprintf(numbuf[0], "%llu", dqb->dqb_curinodes); sprintf(numbuf[1], "%llu", dqb->dqb_isoftlimit); sprintf(numbuf[2], "%llu", dqb->dqb_ihardlimit); @@ -1028,7 +1333,7 @@ static int lfs_quota(int argc, char **argv) print_quota(mnt, &qctl, 0); - if (!*obd_uuid) { + if (!*obd_uuid && qctl.qc_cmd != LUSTRE_Q_GETINFO) { print_mds_quota(mnt, &qctl); print_lov_quota(mnt, &qctl); } diff --git a/lustre/utils/liblustreapi.c b/lustre/utils/liblustreapi.c index 645b38a..bf88a0c 100644 --- a/lustre/utils/liblustreapi.c +++ b/lustre/utils/liblustreapi.c @@ -99,8 +99,8 @@ int llapi_file_create(char *name, long stripe_size, int stripe_offset, } if (stripe_size < 0 || (stripe_size & (LOV_MIN_STRIPE_SIZE - 1))) { errno = rc = -EINVAL; - err_msg("error: stripe_size must be an even " - "multiple of %d bytes", page_size); + err_msg("error: bad stripe_size %lu, must be an even " + "multiple of %d bytes", stripe_size, page_size); goto out; } if (stripe_offset < -1 || stripe_offset > 2048) { @@ -174,7 +174,7 @@ static int prepare_find(struct find_param *param) { param->lumlen = lov_mds_md_size(MAX_LOV_UUID_COUNT); if ((param->lmd = malloc(sizeof(lstat_t) + param->lumlen)) == NULL) { - err_msg("unable to allocate %d bytes of memory for ioctl", + err_msg("error: allocation of %d bytes for ioctl", sizeof(lstat_t) + param->lumlen); return ENOMEM; } @@ -209,7 +209,7 @@ int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count) obdgens = malloc(size_round(max_ost_count * sizeof(*obdgens))); if (!obdgens) { - err_msg("no memory for %d generation #'s", max_ost_count); + err_msg("error: %d generation #'s", max_ost_count); return(-ENOMEM); } @@ -223,21 +223,22 @@ int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count) desc.ld_tgt_count = max_ost_count; if (obd_ioctl_pack(&data, &buf, OBD_MAX_IOCTL_BUFFER)) { - fprintf(stderr, "internal buffer error packing\n"); + fprintf(stderr, "error: %s: internal 
packing error\n", + __FUNCTION__); rc = EINVAL; goto out; } rc = ioctl(fd, OBD_IOC_LOV_GET_CONFIG, buf); if (rc) { - err_msg("error getting LOV config"); + err_msg("error: %s: getting LOV config", __FUNCTION__); rc = errno; goto out; } if (obd_ioctl_unpack(&data, buf, OBD_MAX_IOCTL_BUFFER)) { - fprintf(stderr, "invalid reply from ioctl"); - rc = EINVAL; + rc = errno = EINVAL; + err_msg("error: %s: internal ioctl unpack", __FUNCTION__); goto out; } @@ -261,8 +262,12 @@ static int setup_obd_uuids(DIR *dir, char *dname, struct find_param *param) /* Get the lov name */ rc = ioctl(dirfd(dir), OBD_IOC_GETNAME, (void *)uuid); if (rc) { - fprintf(stderr, "error: can't get lov name: %s\n", - strerror(rc = errno)); + rc = errno; + if (rc == -ENOTTY) + fprintf(stderr, "error: %s does not appear to be in " + "a Lustre filesystem\n", dname); + else + err_msg("error: can't get lov name: %s"); return rc; } @@ -271,8 +276,7 @@ static int setup_obd_uuids(DIR *dir, char *dname, struct find_param *param) uuid); fp = fopen(buf, "r"); if (fp == NULL) { - fprintf(stderr, "error: %s opening %s\n", - strerror(rc = errno), buf); + err_msg("error: opening '%s'", buf); return rc; } @@ -298,8 +302,9 @@ static int setup_obd_uuids(DIR *dir, char *dname, struct find_param *param) fclose(fp); if (param->obduuid && (param->obdindex == OBD_NOT_FOUND)) { - printf("unknown obduuid: %s\n", param->obduuid->uuid); - rc = EINVAL; + fprintf(stderr, "error: %s: unknown obduuid: %s\n", + __FUNCTION__, param->obduuid->uuid); + rc = EINVAL; } return (rc); @@ -363,13 +368,80 @@ void lov_dump_user_lmm_v1(struct lov_user_md_v1 *lum, char *dname, char *fname, } } +void lov_dump_user_lmm_join(struct lov_user_md_v1 *lum, char *dname, + char *fname, int obdindex, int quiet, + int header, int body) +{ + struct lov_user_md_join *lumj = (struct lov_user_md_join *)lum; + int i, obdstripe = 0; + + if (obdindex != OBD_NOT_FOUND) { + for (i = 0; i < lumj->lmm_stripe_count; i++) { + if (obdindex == 
lumj->lmm_objects[i].l_ost_idx) { + printf("%s/%s\n", dname, fname); + obdstripe = 1; + break; + } + } + } else if (!quiet) { + printf("%s/%s\n", dname, fname); + obdstripe = 1; + } + + if (header && obdstripe == 1) { + printf("lmm_magic: 0x%08X\n", lumj->lmm_magic); + printf("lmm_object_gr: "LPX64"\n", lumj->lmm_object_gr); + printf("lmm_object_id: "LPX64"\n", lumj->lmm_object_id); + printf("lmm_stripe_count: %u\n", (int)lumj->lmm_stripe_count); + printf("lmm_stripe_size: %u\n", lumj->lmm_stripe_size); + printf("lmm_stripe_pattern: %x\n", lumj->lmm_pattern); + printf("lmm_extent_count: %x\n", lumj->lmm_extent_count); + } + + if (body) { + unsigned long long start = -1, end = 0; + if (!quiet && obdstripe == 1) + printf("joined\tobdidx\t\t objid\t\tobjid\t\t group" + "\t\tstart\t\tend\n"); + for (i = 0; i < lumj->lmm_stripe_count; i++) { + int idx = lumj->lmm_objects[i].l_ost_idx; + long long oid = lumj->lmm_objects[i].l_object_id; + long long gr = lumj->lmm_objects[i].l_object_gr; + if (obdindex == OBD_NOT_FOUND || obdindex == idx) + printf("\t%6u\t%14llu\t%#13llx\t%14llu%s", + idx, oid, oid, gr, + obdindex == idx ? 
" *" : ""); + if (start != lumj->lmm_objects[i].l_extent_start || + end != lumj->lmm_objects[i].l_extent_end) { + start = lumj->lmm_objects[i].l_extent_start; + printf("\t%14llu", start); + end = lumj->lmm_objects[i].l_extent_end; + if (end == (unsigned long long)-1) + printf("\t\tEOF\n"); + else + printf("\t\t%llu\n", end); + } else { + printf("\t\t\t\t\n"); + } + } + printf("\n"); + } +} + void llapi_lov_dump_user_lmm(struct find_param *param, char *dname, char *fname) { switch(*(__u32 *)¶m->lmd->lmd_lmm) { /* lum->lmm_magic */ case LOV_USER_MAGIC_V1: - lov_dump_user_lmm_v1(¶m->lmd->lmd_lmm, dname, fname, param->obdindex, - param->quiet, param->verbose, - (param->verbose || !param->obduuid)); + lov_dump_user_lmm_v1(¶m->lmd->lmd_lmm, dname, fname, + param->obdindex, param->quiet, + param->verbose, + (param->verbose || !param->obduuid)); + break; + case LOV_USER_MAGIC_JOIN: + lov_dump_user_lmm_join(¶m->lmd->lmd_lmm, dname, fname, + param->obdindex, param->quiet, + param->verbose, + (param->verbose || !param->obduuid)); break; default: printf("unknown lmm_magic: %#x (expecting %#x)\n", @@ -438,17 +510,19 @@ static int find_process_file(DIR *dir, char *dname, char *fname, if (rc) { if (errno == ENODATA) { if (!param->obduuid && !param->quiet) - fprintf(stderr, - "%s/%s has no stripe info\n", + fprintf(stderr, "%s/%s has no stripe info\n", dname, fname); rc = 0; - } else if (errno == EISDIR) { - fprintf(stderr, "process_file on directory %s/%s!\n", + } else if (errno == ENOTTY) { + fprintf(stderr, "error: %s/%s is not a Lustre fs?\n", dname, fname); + } else if (errno == EISDIR) { + err_msg("error: %s: directory %s/%s", + __FUNCTION__, dname, fname); /* add fname to directory list; */ rc = errno; } else { - err_msg("IOC_MDC_GETSTRIPE ioctl failed for '%s/%s'", + err_msg("error: IOC_MDC_GETSTRIPE failed for '%s/%s'", dname, fname); rc = errno; } @@ -500,10 +574,14 @@ static int process_dir(DIR *dir, char *dname, struct find_param *param) if (rc) { if (errno == ENODATA) 
{ if (!param->obduuid && param->verbose) - printf("%s/%s has no stripe info\n", dname, ""); + printf("%s has no stripe info\n", dname); rc = 0; + } else if (errno == ENOTTY) { + fprintf(stderr, "error: %s: %s not on a Lustre fs?\n", + __FUNCTION__, dname); } else { - err_msg("GETSTRIPE failed for %s", dname); + err_msg("error: %s: LL_IOC_LOV_GETSTRIPE failed for %s", + __FUNCTION__, dname); } } else { llapi_lov_dump_user_lmm(param, dname, ""); @@ -519,8 +597,8 @@ static int process_dir(DIR *dir, char *dname, struct find_param *param) switch (dirp->d_type) { case DT_UNKNOWN: - err_msg("\"%s\" is UNKNOWN type %d", dirp->d_name, - dirp->d_type); + fprintf(stderr, "error: %s: '%s' is UNKNOWN type %d", + __FUNCTION__, dirp->d_name, dirp->d_type); /* If we cared we could stat the file to determine * type and continue on here, but we don't since we * know d_type should be valid for lustre and this @@ -534,7 +612,8 @@ static int process_dir(DIR *dir, char *dname, struct find_param *param) strcat(path, dirp->d_name); subdir = opendir(path); if (subdir == NULL) { - err_msg("\"%.40s\" opendir failed", path); + err_msg("error: %s: opendir '%.40s'", + __FUNCTION__, path); return errno; } rc = process_dir(subdir, path, param); @@ -560,10 +639,12 @@ static int process_path(char *path, struct find_param *param) fname = strrchr(path, '/'); if (fname != NULL && fname[1] == '\0') { /* Trailing '/', it must be a dir */ - *fname = '\0'; + if (strlen(path) > 1) + *fname = '\0'; + dir = opendir(path); if (dir == NULL) { - err_msg("\"%.40s\" opendir failed", path); + err_msg("error: %s: '%.40s' opendir",__FUNCTION__,path); rc = errno; } else { rc = process_dir(dir, path, param); @@ -582,10 +663,13 @@ static int process_path(char *path, struct find_param *param) *fname = '\0'; fname++; dname = path; + if (dname[0] == '\0') + dname = "/"; } dir = opendir(dname); if (dir == NULL) { - err_msg("\"%.40s\" opendir failed", dname); + err_msg("error: %s: '%.40s' open failed", + __FUNCTION__, 
dname); rc = errno; } else { if (!param->got_uuids) @@ -629,6 +713,47 @@ out: return ret; } +int llapi_obd_statfs(char *path, __u32 type, __u32 index, + struct obd_statfs *stat_buf, + struct obd_uuid *uuid_buf) +{ + int fd; + char raw[OBD_MAX_IOCTL_BUFFER] = {'\0'}; + char *rawbuf = raw; + struct obd_ioctl_data data; + int rc = 0; + + data.ioc_inlbuf1 = (char *)&type; + data.ioc_inllen1 = sizeof(__u32); + data.ioc_inlbuf2 = (char *)&index; + data.ioc_inllen2 = sizeof(__u32); + data.ioc_pbuf1 = (char *)stat_buf; + data.ioc_plen1 = sizeof(struct obd_statfs); + data.ioc_pbuf2 = (char *)uuid_buf; + data.ioc_plen2 = sizeof(struct obd_uuid); + + if (obd_ioctl_pack(&data, &rawbuf, sizeof(raw))) { + fprintf(stderr, "llapi_obd_statfs: error packing ioctl data\n"); + return rc; + } + + fd = open(path, O_RDONLY); + if (errno == EISDIR) + fd = open(path, O_DIRECTORY | O_RDONLY); + + if (fd < 0) { + rc = -errno; + err_msg("error: %s: opening '%s'", __FUNCTION__, path); + return rc; + } + rc = ioctl(fd, LL_IOC_OBD_STATFS, (void *)rawbuf); + if (rc) + rc = -errno; + + close(fd); + return rc; +} + #define MAX_STRING_SIZE 128 #define DEVICES_LIST "/proc/fs/lustre/devices" @@ -710,8 +835,7 @@ static void do_target_check(char *obd_type_name, char *obd_name, rc = llapi_ping(obd_type_name, obd_name); if (rc) { - fprintf(stderr, "error: check %s: %s\n", - obd_name, strerror(rc = errno)); + err_msg("error: check '%s'", obd_name); } else { printf("%s active.\n", obd_name); } @@ -861,8 +985,8 @@ static int quotachown_process_file(DIR *dir, char *dname, char *fname, * invoke syscall directly. 
*/ rc = syscall(SYS_chown, pathname, st->st_uid, st->st_gid); if (rc) - fprintf(stderr, "chown %s (%u,%u) fail: %s\n", - pathname, st->st_uid, st->st_gid, strerror(errno)); + err_msg("error: chown %s (%u,%u)", + pathname, st->st_uid, st->st_gid); return rc; } diff --git a/lustre/utils/llmount.c b/lustre/utils/llmount.c index 7e8e41c..fc75f21 100644 --- a/lustre/utils/llmount.c +++ b/lustre/utils/llmount.c @@ -46,8 +46,8 @@ static char *progname = NULL; void usage(FILE *out) { fprintf(out, "%s v1.%d\n", progname, LMD_MAGIC & 0xFF); - fprintf(out, "usage: %s :// " - "[-fhnv] [-o mntopt]\n", progname); + fprintf(out, "usage: %s [,]://" + " [-fhnv] [-o mntopt]\n", progname); fprintf(out, "\t: nid of MDS (config) node\n" "\t: name of MDS service (e.g. mds1)\n" "\t: name of client config (e.g. client)\n" @@ -59,8 +59,8 @@ void usage(FILE *out) "\t-v|--verbose: print verbose config settings\n" "\t-o: filesystem mount options:\n" "\t\tflock/noflock: enable/disable flock support\n" - "\t\tuser_xattr/nouser_xattr: enable/disable user extended attributes\n" - "\t\t{no}acl: enable/disable ACL support\n" + "\t\tuser_xattr/nouser_xattr: enable/disable user extended " + "attributes\n" ); exit(out != stdout); } @@ -127,21 +127,49 @@ init_options(struct lustre_mount_data *lmd) { memset(lmd, 0, sizeof(*lmd)); lmd->lmd_magic = LMD_MAGIC; - lmd->lmd_nid = LNET_NID_ANY; return 0; } int -print_options(struct lustre_mount_data *lmd, const char *options) +print_options(FILE *out, struct lustre_mount_data *lmd, const char *options) { - printf("nid: %s\n", libcfs_nid2str(lmd->lmd_nid)); - printf("mds: %s\n", lmd->lmd_mds); - printf("profile: %s\n", lmd->lmd_profile); - printf("options: %s\n", options); + int i; + for (i = 0; i < lmd->lmd_nid_count; i++) { + fprintf(out, "mds nid %d: %s\n", i, + libcfs_nid2str(lmd->lmd_nid[i])); + } + fprintf(out, "mds name: %s\n", lmd->lmd_mds); + fprintf(out, "profile: %s\n", lmd->lmd_profile); + fprintf(out, "options: %s\n", options); return 0; } +static 
int parse_nids(struct lustre_mount_data *lmd, char *nids) +{ + int i = 0; + char *tmp = 0; + lnet_nid_t nid; + + while ((tmp = strsep(&nids, ",:"))) { + nid = libcfs_str2nid(tmp); + if (nid == LNET_NID_ANY) { + fprintf(stderr, "%s: Can't parse NID '%s'\n", + progname, tmp); + continue; + } + lmd->lmd_nid[lmd->lmd_nid_count++] = nid; + if (lmd->lmd_nid_count >= MAX_FAILOVER_NIDS) { + fprintf(stderr, "%s: Too many target NIDs: " + "ignoring nids after %s\n", + progname, tmp); + break; + } + } + return (lmd->lmd_nid_count); +} + + /***************************************************************************** * * This part was cribbed from util-linux/mount/mount.c. There was no clear @@ -283,10 +311,9 @@ build_data(char *source, char *options, struct lustre_mount_data *lmd, if (rc) return rc; - lmd->lmd_nid = libcfs_str2nid(nid); - if (lmd->lmd_nid == LNET_NID_ANY) { - fprintf(stderr, "%s: can't parse nid '%s'\n", progname, nid); - return 1; + if (parse_nids(lmd, nid) == 0) { + fprintf(stderr, "%s: Can't parse any mds nids\n", progname); + return(1); } if (strlen(mds) + 1 > sizeof(lmd->lmd_mds)) { @@ -391,7 +418,7 @@ int main(int argc, char *const argv[]) } if (verbose) - print_options(&lmd, options); + print_options(stdout, &lmd, options); rc = access(target, F_OK); if (rc) { @@ -404,8 +431,9 @@ int main(int argc, char *const argv[]) if (!fake) rc = mount(source, target, "lustre", flags, (void *)&lmd); if (rc) { - fprintf(stderr, "%s: mount(%s, %s) failed: %s\n", source, - target, progname, strerror(errno)); + fprintf(stderr, "%s: mount(%s, %s) failed: %s\n", progname, + source, target, strerror(errno)); + print_options(stderr, &lmd, options); if (errno == ENODEV) fprintf(stderr, "Are the lustre modules loaded?\n" "Check /etc/modules.conf and /proc/filesystems\n"); diff --git a/lustre/utils/llog_reader.c b/lustre/utils/llog_reader.c index 4676bce..169dc8a 100644 --- a/lustre/utils/llog_reader.c +++ b/lustre/utils/llog_reader.c @@ -213,6 +213,11 @@ void 
print_llog_header(struct llog_log_hdr* llog_buf) static void print_1_cfg(struct lustre_cfg *lcfg) { int i; + if (lcfg->lcfg_nid) + printf("nid=%s("LPX64") ", libcfs_nid2str(lcfg->lcfg_nid), + lcfg->lcfg_nid); + if (lcfg->lcfg_nal) + printf("nal=%d ", lcfg->lcfg_nal); for (i = 0; i < lcfg->lcfg_bufcount; i++) printf("%d:%.*s ", i, lcfg->lcfg_buflens[i], (char*)lustre_cfg_buf(lcfg, i)); @@ -226,16 +231,16 @@ static void print_setup_cfg(struct lustre_cfg *lcfg) if ((lcfg->lcfg_bufcount == 2) && (lcfg->lcfg_buflens[1] == sizeof(*desc))) { printf("lov_setup "); - printf("0:%s ", lustre_cfg_string(lcfg, 0)); + printf("0:%s ", lustre_cfg_string(lcfg, 0)); printf("1:(struct lov_desc)\n"); desc = (struct lov_desc*)(lustre_cfg_string(lcfg, 1)); - printf(" uuid=%s, ", (char*)desc->ld_uuid.uuid); - printf("stripe count=%d, ", desc->ld_default_stripe_count); - printf("size=%lld, ", desc->ld_default_stripe_size); - printf("offset=%lld, ", desc->ld_default_stripe_offset); + printf("\t\tuuid=%s ", (char*)desc->ld_uuid.uuid); + printf("stripe:cnt=%d ", desc->ld_default_stripe_count); + printf("size=%lld ", desc->ld_default_stripe_size); + printf("offset=%lld ", desc->ld_default_stripe_offset); printf("pattern=%d", desc->ld_pattern); } else { - printf("setup "); + printf("setup "); print_1_cfg(lcfg); } return; @@ -247,7 +252,7 @@ void print_lustre_cfg(struct lustre_cfg *lcfg) switch(cmd){ case(LCFG_ATTACH):{ - printf("attach "); + printf("attach "); print_1_cfg(lcfg); break; } @@ -256,37 +261,32 @@ void print_lustre_cfg(struct lustre_cfg *lcfg) break; } case(LCFG_DETACH):{ - printf("detach "); + printf("detach "); print_1_cfg(lcfg); break; } case(LCFG_CLEANUP):{ - printf("cleanup "); + printf("cleanup "); print_1_cfg(lcfg); break; } case(LCFG_ADD_UUID):{ - printf("add_uuid "); - printf("nid=%s("LPX64") ", - libcfs_nid2str(lcfg->lcfg_nid), lcfg->lcfg_nid); - /* obsolete */ - if (lcfg->lcfg_nal) - printf("nal=%d ", lcfg->lcfg_nal); + printf("add_uuid "); print_1_cfg(lcfg); break; } 
case(LCFG_DEL_UUID):{ - printf("del_uuid "); + printf("del_uuid "); print_1_cfg(lcfg); break; } case(LCFG_ADD_CONN):{ - printf("add_conn "); + printf("add_conn "); print_1_cfg(lcfg); break; } case(LCFG_DEL_CONN):{ - printf("del_conn "); + printf("del_conn "); print_1_cfg(lcfg); break; } @@ -320,6 +320,16 @@ void print_lustre_cfg(struct lustre_cfg *lcfg) print_1_cfg(lcfg); break; } + case(LCFG_PARAM):{ + printf("param "); + print_1_cfg(lcfg); + break; + } + case(LCFG_MARKER):{ + printf("marker "); + print_1_cfg(lcfg); + break; + } default: printf("unsupported cmd_code = %x\n",cmd); } diff --git a/lustre/utils/lmc b/lustre/utils/lmc index d584d29..ef0c7e1 100755 --- a/lustre/utils/lmc +++ b/lustre/utils/lmc @@ -693,12 +693,12 @@ def add_net(gen, lustre, options): # testing network if options.nonet: if options.verbose: - print "Skip the remote host networking test." - elif (node_name != 'client') and (real_net_type == 'tcp'): + print "Skipping the remote host networking test." + elif (real_net_type == 'tcp') and (nid != '*'): if options.verbose: print "Testing network on", node_name target = string.split(nid,'@')[0] - out = runcmd("ping -c 1 -w 10 %s" %target) + out = runcmd("ping -c 1 -w 5 %s" %target) if out != 0: print "Could not connect to", node_name,", Please check network." 
diff --git a/lustre/utils/lustre_cfg.c b/lustre/utils/lustre_cfg.c index 797275b..cb717b5 100644 --- a/lustre/utils/lustre_cfg.c +++ b/lustre/utils/lustre_cfg.c @@ -266,6 +266,9 @@ int do_add_uuid(char * func, char *uuid, lnet_nid_t nid) lcfg = lustre_cfg_new(LCFG_ADD_UUID, &bufs); lcfg->lcfg_nid = nid; + /* Poison NAL -- pre 1.4.6 will LASSERT on 0 NAL, this way it + doesn't work without crashing (bz 10130) */ + lcfg->lcfg_nal = 0x5a; #if 0 fprintf(stderr, "adding\tnid: %d\tuuid: %s\n", @@ -604,7 +607,7 @@ int jt_lcfg_add_conn(int argc, char **argv) fprintf(stderr, "%s: please use 'cfg_device name' to set the " "device name for config commands.\n", jt_cmdname(argv[0])); - return -EINVAL; + return -EINVAL; } lustre_cfg_bufs_reset(&bufs, lcfg_devname); @@ -637,7 +640,7 @@ int jt_lcfg_del_conn(int argc, char **argv) fprintf(stderr, "%s: please use 'cfg_device name' to set the " "device name for config commands.\n", jt_cmdname(argv[0])); - return -EINVAL; + return -EINVAL; } lustre_cfg_bufs_reset(&bufs, lcfg_devname); diff --git a/lustre/utils/lwizard b/lustre/utils/lwizard deleted file mode 100755 index fa85240..0000000 --- a/lustre/utils/lwizard +++ /dev/null @@ -1,530 +0,0 @@ -#!/bin/sh -# Copyright (C) 2003 Cluster File Systems, Inc. 
-# Create a Lustre configuration file -# -# Usage: lwizard -# -# Jerrifer -# wangdi - -# fatal error to exit -fatal() -{ - if [ "$#" -gt "1" ]; then - echo - echo "$2" - exit 1 - fi - - exit 1 -} - -#print usage and exit -usage() -{ - cat <> "$LMC_BATCH_FILE" -} - -# following user input to create xml config file -create_config() -{ - local extraopt="" - - for device in $DEVICE_LIST ; do - get_name_in_list $device - echo -n " $DEVICE_NAME" - - case $DEVICE_NAME in - mds*) - add_node "$HOST_NAME" "$INTERFACES" - extraopt="" - if [ "$FAILOVER_HOST" != "" ] ; then - extraopt=" --failover --group $HOST_NAME" - fi - - run_lmc --add mds \ - --node "$HOST_NAME" \ - --mds "$DEVICE_NAME" \ - --dev "$DEVICE" \ - --size "$DEVICE_SIZE" \ - --fstype "$DEFAULT_FSTYPE" \ - $extraopt - if [ "$FAILOVER_HOST" != "" ] ; then - add_node "$FAILOVER_HOST" - run_lmc --add mds \ - --node "$FAILOVER_HOST" \ - --mds "$DEVICE_NAME" \ - --dev "$FAILOVER_DEVICE" \ - --size "$DEVICE_SIZE" \ - --fstype "$DEFAULT_FSTYPE" \ - --failover \ - --group "$HOST_NAME" - fi - ;; - lov*) - run_lmc --add lov \ - --lov "$DEVICE_NAME" \ - --mds "$DEVICE_MDS" \ - --stripe_sz "$STRIPE_SIZE" \ - --stripe_cnt "$STRIPE_CNT" \ - --stripe_pattern "$STRIPE_PATTERN" - ;; - ost*) - add_node "$HOST_NAME" "$INTERFACES" - extraopt="" - if [ "$FAILOVER_HOST" != "" ] ; then - extraopt=" --failover --group $HOST_NAME" - fi - run_lmc --add ost \ - --node "$HOST_NAME" \ - --ost "$DEVICE_NAME" \ - --lov "$DEVICE_LOV" \ - --dev "$DEVICE" \ - --size "$DEVICE_SIZE" \ - --fstype "$DEFAULT_FSTYPE" \ - $extraopt - if [ "$FAILOVER_HOST" != "" ] ; then - add_node "$FAILOVER_HOST" - run_lmc --add ost \ - --node "$FAILOVER_HOST" \ - --ost "$DEVICE_NAME" \ - --lov "$DEVICE_LOV" \ - --dev "$FAILOVER_DEVICE" \ - --size "$DEVICE_SIZE" \ - --fstype "$DEFAULT_FSTYPE" \ - --failover \ - --group "$HOST_NAME" - fi - ;; - client*) - if [ "$INTERFACES" ] ; then - add_node "$HOST_NAME" "$INTERFACES" - run_lmc --add mtpt \ - --node "$HOST_NAME" \ - 
--mds "$DEVICE_MDS" \ - --lov "$DEVICE_LOV" \ - --path "$DEVICE" \ - # --clientoptions "async" - - else - add_client_node "$DEVICE_NAME" - run_lmc --add mtpt \ - --node "$DEVICE_NAME" \ - --mds "$DEVICE_MDS" \ - --lov "$DEVICE_LOV" \ - --path "$DEVICE" \ - # --clientoptions "async" - fi - ;; - esac - done - - echo - return 0 -} - -maybe_clean() -{ - [ -f "$1" ] || return 0 - if ! (( $FORCE )) ; then - echo -n "${0##*/}: overwrite existing $2 \"$1\"? " - read answer - if ! [ "${answer:0:1}" = "y" -o "${answer:0:1}" = "Y" ] ; then - echo "(${0##*/}: (Exiting.)" - exit 0 - fi - fi - rm -f "$1" -} - -# parse options -get_option "$@" - -# some default definitions -LMC=${LMC:-"/usr/sbin/lmc"} - -CONFIG_FILE=${CONFIG_FILE:-"lwizard.xml"} - -# Remove exiting files. - -maybe_clean "$CONFIG_FILE" "Lustre configuration file" -if [ "$LMC_BATCH_FILE" ] ; then - maybe_clean "$LMC_BATCH_FILE" "lmc batch file" -else - LMC_BATCH_FILE=$(mktemp -q "/tmp/${CONFIG_FILE##*/}.XXXXXX") - [ $? -eq 0 ] || fatal 1 "Couldn't create temporary batch file." -fi - -DEFAULT_FSTYPE=${DEFAULT_FSTYPE:-"ext3"} -DEFAULT_NETTYPE=${DEFAULT_NETTYPE:-"tcp"} -DEFAULT_MNTPT=${DEFAULT_MNTPT:-"/mnt/lustre"} - -STRIPE_SIZE=${STRIPE_SIZE:-$((1 * 1024 * 1024))} -STRIPE_CNT=${STRIPE_CNT:-1} -STRIPE_PATTERN=${STRIPE_PATTERN:-0} - -ANSWER="yes no" - -CURRENT_LOV= -MDS_LIST= -OST_LIST= -CLIENT_LIST= - -# print program information -cat <