From 113303973ec9f8484eb2355a1a6ef3c4c7fd6a56 Mon Sep 17 00:00:00 2001 From: nathan Date: Sat, 10 Feb 2007 06:33:41 +0000 Subject: [PATCH] land b1_5 onto HEAD --- .../patches/export-ext3-2.6-rhel4.patch | 33 + .../patches/export_symbols-ext3-2.6-suse.patch | 8 +- .../patches/ext3-check-jbd-errors-2.6.5.patch | 113 + .../patches/ext3-check-jbd-errors-2.6.9.patch | 113 + .../patches/ext3-ea-in-inode-2.6-rhel4.patch | 840 + .../patches/ext3-ea-in-inode-2.6-suse.patch | 105 +- .../patches/ext3-extents-2.6.12.patch | 2940 + .../patches/ext3-extents-2.6.15.patch | 2947 + .../patches/ext3-extents-2.6.18-vanilla.patch | 2945 + .../patches/ext3-extents-2.6.5.patch | 673 +- .../patches/ext3-extents-2.6.9-rhel4.patch | 2925 + .../patches/ext3-external-journal-2.6.12.patch | 148 + .../patches/ext3-filterdata-2.6.15.patch | 25 + .../patches/ext3-htree-dot-2.6.patch | 23 + .../kernel_patches/patches/ext3-ialloc-2.6.patch | 128 + .../patches/ext3-include-fixes-2.6-rhel4.patch | 20 + .../patches/ext3-include-fixes-2.6-suse.patch | 2 +- .../patches/ext3-lookup-dotdot-2.6.9.patch | 63 + .../patches/ext3-map_inode_page-2.6-suse.patch | 16 +- .../patches/ext3-mballoc2-2.6-fc5.patch | 3105 + .../patches/ext3-mballoc2-2.6-suse.patch | 3219 +- .../patches/ext3-mballoc2-2.6.12.patch | 3105 + .../patches/ext3-mballoc2-2.6.18-vanilla.patch | 2810 + .../patches/ext3-mballoc2-2.6.9-rhel4.patch | 3124 + .../kernel_patches/patches/ext3-nlinks-2.6.7.patch | 42 +- .../kernel_patches/patches/ext3-nlinks-2.6.9.patch | 142 + .../ext3-remove-cond_resched-calls-2.6.12.patch | 29 + .../patches/ext3-rename-reserve-2.6-suse.patch | 263 + .../patches/ext3-sector_t-overflow-2.6.12.patch | 64 + .../ext3-sector_t-overflow-2.6.5-suse.patch | 44 + .../ext3-sector_t-overflow-2.6.9-rhel4.patch | 64 + .../patches/ext3-wantedi-2.6-rhel4.patch | 180 + .../patches/ext3-wantedi-2.6-suse.patch | 57 +- ldiskfs/kernel_patches/patches/iopen-2.6-fc5.patch | 448 + .../kernel_patches/patches/iopen-2.6-rhel4.patch | 471 + 
.../kernel_patches/patches/iopen-2.6-suse.patch | 101 +- ldiskfs/kernel_patches/patches/iopen-2.6.12.patch | 471 + .../kernel_patches/series/ldiskfs-2.6-fc3.series | 28 +- .../kernel_patches/series/ldiskfs-2.6-fc5.series | 12 + .../kernel_patches/series/ldiskfs-2.6-rhel4.series | 15 + .../series/ldiskfs-2.6-sles10.series | 13 + .../kernel_patches/series/ldiskfs-2.6-suse.series | 12 +- .../series/ldiskfs-2.6.12-vanilla.series | 15 + .../series/ldiskfs-2.6.18-vanilla.series | 13 + ldiskfs/ldiskfs/Makefile.in | 17 +- ldiskfs/ldiskfs/autoMakefile.am | 18 +- lustre/BUGS | 16 +- lustre/COPYING | 352 - lustre/ChangeLog | 2322 +- lustre/LICENSE | 372 + lustre/Makefile.in | 19 +- lustre/Rules.in | 46 - lustre/autoMakefile.am | 51 +- lustre/autoconf/.cvsignore | 2 + lustre/autoconf/lustre-core.m4 | 942 +- lustre/autoconf/lustre-version.ac | 31 +- lustre/cmobd/Makefile.in | 4 - lustre/cmobd/autoMakefile.am | 11 - lustre/cmobd/cm_internal.h | 40 - lustre/cmobd/cm_mds_reint.c | 336 - lustre/cmobd/cm_obd.c | 340 - lustre/cmobd/cm_oss_reint.c | 312 - lustre/cmobd/cm_reint.c | 125 - lustre/cmobd/cm_write.c | 743 - lustre/cmobd/lproc_cm.c | 34 - lustre/cobd/.cvsignore | 10 - lustre/cobd/Makefile.in | 4 - lustre/cobd/autoMakefile.am | 11 - lustre/cobd/cache_obd.c | 1723 - lustre/cobd/lproc_cache.c | 83 - lustre/conf/lustre.dtd | 17 +- lustre/conf/lustre2ldif.xsl | 9 + lustre/conf/modules.conf | 2 + lustre/conf/slapd-lustre.conf | 2 +- lustre/contrib/.cvsignore | 2 + lustre/contrib/Makefile.am | 5 + lustre/contrib/README | 2 + lustre/contrib/mpich-1.2.6-lustre.patch | 1829 + lustre/doc/Makefile.am | 76 +- lustre/doc/blank_template.lyx | 138 - lustre/doc/lconf.8 | 97 +- lustre/doc/lconf.lyx | 6 + lustre/doc/lctl.8 | 431 +- lustre/doc/lctl.lyx | 53 +- lustre/doc/lfs.1 | 109 +- lustre/doc/lfs.lyx | 349 +- lustre/doc/llverdev.txt | 48 + lustre/doc/llverfs.txt | 48 + lustre/doc/lmc.1 | 76 +- lustre/doc/lmc.lyx | 161 +- lustre/doc/lustre.7 | 76 + lustre/doc/lwizard.1 | 84 - 
lustre/doc/mkfs.lustre.8 | 132 + lustre/doc/mount.lustre.8 | 105 + lustre/doc/tunefs.lustre.8 | 95 + lustre/include/.cvsignore | 1 + lustre/include/Makefile.am | 10 +- lustre/include/darwin/lprocfs_status.h | 57 + lustre/include/darwin/lustre_compat.h | 75 + lustre/include/darwin/lustre_debug.h | 36 + lustre/include/darwin/lustre_dlm.h | 25 + lustre/include/darwin/lustre_fsfilt.h | 32 + lustre/include/darwin/lustre_handles.h | 12 + lustre/include/darwin/lustre_lib.h | 76 + lustre/include/darwin/lustre_lite.h | 86 + lustre/include/darwin/lustre_log.h | 11 + lustre/include/darwin/lustre_mds.h | 32 + lustre/include/darwin/lustre_net.h | 34 + lustre/include/darwin/lustre_quota.h | 16 + lustre/include/darwin/lustre_types.h | 7 + lustre/include/darwin/lustre_user.h | 47 + lustre/include/darwin/lvfs.h | 24 + lustre/include/darwin/obd.h | 39 + lustre/include/darwin/obd_class.h | 34 + lustre/include/darwin/obd_support.h | 58 + lustre/include/liblustre.h | 496 +- lustre/include/linux/Makefile.am | 15 +- lustre/include/linux/lprocfs_status.h | 357 +- lustre/include/linux/lustre_acl.h | 42 - lustre/include/linux/lustre_audit.h | 108 - lustre/include/linux/lustre_cfg.h | 259 - lustre/include/linux/lustre_cmobd.h | 82 - lustre/include/linux/lustre_commit_confd.h | 74 - lustre/include/linux/lustre_compat25.h | 444 +- lustre/include/linux/lustre_debug.h | 46 +- lustre/include/linux/lustre_dlm.h | 707 +- lustre/include/linux/lustre_export.h | 100 - lustre/include/linux/lustre_fsfilt.h | 708 +- lustre/include/linux/lustre_gs.h | 162 - lustre/include/linux/lustre_ha.h | 32 - lustre/include/linux/lustre_handles.h | 38 +- lustre/include/linux/lustre_idl.h | 1350 - lustre/include/linux/lustre_import.h | 140 - lustre/include/linux/lustre_intent.h | 35 + lustre/include/linux/lustre_lib.h | 693 +- lustre/include/linux/lustre_lite.h | 154 +- lustre/include/linux/lustre_log.h | 487 +- lustre/include/linux/lustre_mds.h | 406 +- lustre/include/linux/lustre_mgmt.h | 32 - 
lustre/include/linux/lustre_net.h | 878 +- lustre/include/linux/lustre_patchless_compat.h | 82 + lustre/include/linux/lustre_quota.h | 18 + lustre/include/linux/lustre_sec.h | 689 - lustre/include/linux/lustre_smfs.h | 532 - lustre/include/linux/lustre_snap.h | 214 - lustre/include/linux/lustre_types.h | 47 + lustre/include/linux/lustre_ucache.h | 79 - lustre/include/linux/lustre_user.h | 82 + lustre/include/linux/lvfs.h | 125 +- lustre/include/linux/lvfs_linux.h | 29 +- lustre/include/linux/obd.h | 1079 +- lustre/include/linux/obd_class.h | 1509 +- lustre/include/linux/obd_echo.h | 38 - lustre/include/linux/obd_lmv.h | 8 - lustre/include/linux/obd_lov.h | 31 - lustre/include/linux/obd_ost.h | 49 - lustre/include/linux/obd_ptlbd.h | 30 - lustre/include/linux/obd_support.h | 553 +- lustre/include/linux/obd_trace.h | 20 - lustre/include/lprocfs_status.h | 505 + lustre/include/lustre/Makefile.am | 5 +- lustre/include/lustre/liblustreapi.h | 86 +- lustre/include/lustre/lustre_idl.h | 1419 + lustre/include/lustre/lustre_user.h | 248 +- lustre/include/lustre/types.h | 14 + lustre/include/lustre_cfg.h | 256 + lustre/include/lustre_commit_confd.h | 58 + lustre/include/lustre_debug.h | 64 + lustre/include/lustre_disk.h | 283 + lustre/include/lustre_dlm.h | 665 + lustre/include/lustre_export.h | 103 + lustre/include/lustre_fsfilt.h | 38 + lustre/include/lustre_ha.h | 25 + lustre/include/lustre_handles.h | 43 + lustre/include/lustre_import.h | 133 + lustre/include/lustre_lib.h | 761 + lustre/include/lustre_lite.h | 140 + lustre/include/lustre_log.h | 428 + lustre/include/lustre_mds.h | 212 + lustre/include/lustre_net.h | 902 + lustre/include/lustre_param.h | 64 + lustre/include/lustre_quota.h | 422 + lustre/include/lustre_ucache.h | 68 + lustre/include/lustre_ver.h.in | 21 + lustre/include/lvfs.h | 61 + lustre/include/obd.h | 1052 + lustre/include/{linux => }/obd_cache.h | 0 lustre/include/obd_class.h | 1418 + lustre/include/obd_echo.h | 33 + lustre/include/obd_lov.h | 28 + 
lustre/include/obd_ost.h | 37 + lustre/include/obd_support.h | 420 + lustre/include/types.h | 27 - lustre/kernel_patches/LICENSE | 359 + lustre/kernel_patches/README | 720 +- .../config-linux-2.4.18-p4smp-61chaos | 1035 + .../kernel_configs/config-linux-2.4.20-i386-rh | 1849 + .../kernel_configs/config-linux-2.6.7-uml | 493 - .../kernel-2.4.20-hp_pnnl-2.4-ia64-smp.config | 1047 + .../kernel-2.4.20-hp_pnnl-2.4-ia64.config | 1047 + .../kernel-2.4.20-rh-2.4-i686-smp.config | 1866 + .../kernel-2.4.20-rh-2.4-i686.config | 1866 + .../kernel-2.4.21-rhel-2.4-i686-smp.config | 2139 + .../kernel-2.4.21-rhel-2.4-i686.config | 2139 + .../kernel-2.4.21-rhel-2.4-ia64-smp.config | 37 +- .../kernel-2.4.21-rhel-2.4-ia64.config | 37 +- .../kernel-2.4.21-rhel-2.4-x86_64-smp.config | 1787 + .../kernel-2.4.21-rhel-2.4-x86_64.config | 1787 + .../kernel-2.4.21-sles-2.4-i686-smp.config | 2383 + .../kernel-2.4.21-sles-2.4-i686.config | 2383 + .../kernel-2.4.21-suse-2.4.21-2-x86_64.config | 2042 + .../kernel-2.6.10-2.6-fc3-i686-smp.config | 2035 - .../kernel-2.6.10-2.6-fc3-i686.config | 2031 - .../kernel_configs/kernel-2.6.10-smp.config | 2032 - .../kernel-2.6.10-suse-opteron.config | 2433 - .../kernel_configs/kernel-2.6.10-uml.config | 659 - .../kernel_configs/kernel-2.6.10-vmware.config | 1364 - .../kernel-2.6.15-2.6-fc5-i686-smp.config | 1598 + .../kernel-2.6.15-2.6-fc5-i686.config | 1591 + .../kernel_configs/kernel-2.6.15-fc5-i686.config | 1598 + .../kernel-2.6.16-2.6-patchless-i686-smp.config | 1620 + .../kernel-2.6.16-2.6-patchless-i686.config | 1616 + .../kernel-2.6.16-2.6-patchless-ia64-smp.config | 1422 + .../kernel-2.6.16-2.6-patchless-ia64.config | 1419 + .../kernel-2.6.16-2.6-patchless-x86_64-smp.config | 1463 + .../kernel-2.6.16-2.6-patchless-x86_64.config | 1462 + .../kernel-2.6.5-2.6-suse-i686-bigsmp.config | 87 +- .../kernel-2.6.5-2.6-suse-i686-smp.config | 2888 + .../kernel-2.6.5-2.6-suse-i686.config | 87 +- .../kernel-2.6.5-2.6-suse-ia64-smp.config | 2411 + 
.../kernel-2.6.5-2.6-suse-ia64.config | 2411 + .../kernel-2.6.5-2.6-suse-ppc-pseries64.config | 1454 + .../kernel-2.6.5-2.6-suse-ppc.config | 1453 + .../kernel-2.6.5-2.6-suse-x86_64-smp.config | 2501 + .../kernel-2.6.5-2.6-suse-x86_64.config | 2501 + .../kernel-2.6.9-2.6-rhel4-i686-smp.config | 2468 + .../kernel-2.6.9-2.6-rhel4-i686.config | 2472 + .../kernel-2.6.9-2.6-rhel4-ia64-smp.config | 2030 + .../kernel-2.6.9-2.6-rhel4-ia64.config | 2030 + .../kernel-2.6.9-2.6-rhel4-x86_64-smp.config | 2219 + .../kernel-2.6.9-2.6-rhel4-x86_64.config | 2219 + .../kernel_configs/uml-2.6.10-fc3.config | 662 + .../kernel_configs/uml-vanilla-2.4.24.config | 413 + .../kernel_configs/uml-vanilla-2.6.6.config | 491 + .../kernel_configs/uml_2.6.0_test3.config | 325 + .../kernel_patches/patches/2.6-rhel4-kgdb-ga.patch | 6371 ++ lustre/kernel_patches/patches/2.6.5-quotafix.patch | 2110 + .../3.5G-address-space-2.4.22-vanilla.patch | 352 + .../kernel_patches/patches/8kstack-2.6-rhel4.patch | 13 + lustre/kernel_patches/patches/8kstack-2.6.12.patch | 13 + .../kernel_patches/patches/add_page_private.patch | 23 + .../patches/bitops_ext2_find_next_le_bit-2.6.patch | 153 + .../patches/blkdev_tunables-2.4.21-chaos.patch | 52 + .../patches/blkdev_tunables-2.6-suse.patch | 28 + .../patches/bluesmoke-2.6-suse-lnxi.patch | 5485 ++ .../patches/brk-locked-2.6-suse-lnxi.patch | 219 + .../patches/compile-fixes-2.4.21-rhel.patch | 90 + .../patches/compile-fixes-2.6.9-rhel4-22.patch | 76 + .../patches/configurable-x86-stack-2.4.20.patch | 318 + .../configurable-x86-stack-2.4.21-chaos.patch | 468 + .../configurable-x86-stack-2.4.21-suse-171.patch | 317 + .../configurable-x86-stack-2.4.21-suse2.patch | 318 + .../patches/configurable-x86_64-2.4.21.patch | 122 + .../kernel_patches/patches/dcache-fid-2.6.7.patch | 11 - .../patches/dcache-mds-num-2.6.7.patch | 23 - .../patches/dcache-qstr-api-fix-2.6-suse.patch | 32 + .../patches/dcache_refcount_debug.patch | 24 + .../patches/dev_read_only-2.6-fc5.patch | 145 
+ .../patches/dev_read_only-2.6-lnxi.patch | 167 + .../patches/dev_read_only-2.6-suse.patch | 153 +- .../patches/dev_read_only-2.6.10-fc3.patch | 107 - .../patches/dev_read_only-2.6.18-vanilla.patch | 145 + .../patches/dev_read_only_2.4.20-rh.patch | 125 + .../patches/dev_read_only_2.4.21-chaos.patch | 122 + .../patches/dynamic-locks-2.6.10-fc3.patch | 278 - .../patches/dynamic-locks-2.6.7.patch | 278 - lustre/kernel_patches/patches/elevator-cfq.patch | 20 + lustre/kernel_patches/patches/export-2.6-fc3.patch | 203 - lustre/kernel_patches/patches/export-2.6-fc5.patch | 12 + .../patches/export-2.6.18-vanilla.patch | 24 + .../patches/export-do_kern_mount.patch | 13 + .../patches/export-ext3-2.6-rhel4.patch | 33 + .../patches/export-ext3-2.6.10-fc3.patch | 33 - .../patches/export-log-2.6-rhel4.patch | 12 + .../patches/export-show_task-2.4-cray.patch | 33 + .../patches/export-show_task-2.4-rh.patch | 171 + .../patches/export-show_task-2.4-rhel.patch | 20 + .../patches/export-show_task-2.4-vanilla.patch | 34 + .../patches/export-show_task-2.6-fc5.patch | 25 + .../patches/export-show_task-2.6-vanilla.patch | 10 +- .../patches/export-show_task-2.6.18-vanilla.patch | 25 + .../patches/export-truncate-2.6.18-vanilla.patch | 39 + .../kernel_patches/patches/export-truncate.patch | 35 + .../patches/export-vanilla-2.6.patch | 94 - .../patches/export-zap-page-range.patch | 12 + .../patches/export_num_siblings.patch | 10 + .../patches/export_symbol_numa-2.6-fc5.patch | 12 + .../patches/export_symbol_numa.patch | 24 + .../patches/export_symbols-2.6-rhel4.patch | 81 + .../patches/export_symbols-2.6-suse.patch | 72 +- .../patches/export_symbols-2.6.12.patch | 64 + .../patches/export_symbols-2.6.18-vanilla.patch | 64 + .../patches/export_symbols-ext3-2.6-suse.patch | 8 +- .../patches/export_symbols-ext3-2.6.10-fc3.patch | 17 - .../patches/exports-2.4.21-chaos.patch | 59 + .../patches/exports_2.4.19-suse.patch | 53 + .../patches/exports_2.4.19-suse2.patch | 59 + 
.../patches/exports_2.4.20-rh-hp.patch | 53 + .../kernel_patches/patches/ext-2.4-patch-1.patch | 2536 + .../kernel_patches/patches/ext-2.4-patch-2.patch | 34 + .../kernel_patches/patches/ext-2.4-patch-3.patch | 96 + .../kernel_patches/patches/ext-2.4-patch-4.patch | 52 + lustre/kernel_patches/patches/ext3-2.4-ino_t.patch | 144 + .../kernel_patches/patches/ext3-2.4.20-fixes.patch | 118 + .../patches/ext3-check-jbd-errors-2.6.5.patch | 113 + .../patches/ext3-check-jbd-errors-2.6.9.patch | 113 + .../patches/ext3-delete_thread-2.4.20-hp.patch | 499 + .../patches/ext3-delete_thread-2.4.21-chaos.patch | 449 + .../ext3-delete_thread-2.4.21-suse-171.patch | 496 + .../patches/ext3-delete_thread-2.4.24.patch | 449 + .../patches/ext3-delete_thread-2.4.29.patch | 442 + .../ext3-disable-reservation-2.6.10-fc3.patch | 14 - ...sable-write-barrier-by-default-2.6-sles10.patch | 15 + .../patches/ext3-ea-in-inode-2.4.20.patch | 747 + .../patches/ext3-ea-in-inode-2.4.21-chaos.patch | 758 + .../patches/ext3-ea-in-inode-2.4.21-sles.patch | 758 + .../patches/ext3-ea-in-inode-2.4.21-suse2.patch | 758 + .../patches/ext3-ea-in-inode-2.4.22-rh.patch | 755 + .../patches/ext3-ea-in-inode-2.4.29.patch | 731 + .../patches/ext3-ea-in-inode-2.6-fc3.patch | 862 - .../patches/ext3-ea-in-inode-2.6-rhel4.patch | 840 + .../patches/ext3-ea-in-inode-2.6-suse.patch | 105 +- .../kernel_patches/patches/ext3-error-export.patch | 16 + .../patches/ext3-extents-2.4.21-chaos.patch | 2877 + .../patches/ext3-extents-2.4.21-suse2.patch | 2875 + .../patches/ext3-extents-2.4.24.patch | 2863 + .../patches/ext3-extents-2.4.29.patch | 2858 + .../patches/ext3-extents-2.6.10-fc3.patch | 2935 - .../patches/ext3-extents-2.6.12.patch | 2940 + .../patches/ext3-extents-2.6.15.patch | 2947 + .../patches/ext3-extents-2.6.18-vanilla.patch | 2945 + .../patches/ext3-extents-2.6.5.patch | 673 +- .../patches/ext3-extents-2.6.7.patch | 2844 - .../patches/ext3-extents-2.6.9-rhel4.patch | 2925 + 
.../ext3-extents-asyncdel-2.4.21-chaos.patch | 31 + .../patches/ext3-extents-asyncdel-2.4.24.patch | 31 + .../patches/ext3-extents-in-ea-2.6.10-fc3.patch | 362 - .../patches/ext3-extents-in-ea-2.6.7.patch | 361 - .../ext3-extents-in-ea-exports-symbol-2.6.7.patch | 93 - .../ext3-extents-in-ea-ioctl-2.6.10-fc3.patch | 230 - .../patches/ext3-extents-in-ea-ioctl-2.6.7.patch | 228 - .../patches/ext3-external-journal-2.6.12.patch | 148 + .../patches/ext3-external-journal-2.6.9.patch | 150 + lustre/kernel_patches/patches/ext3-fid-2.6.7.patch | 40 - .../patches/ext3-filterdata-2.6.15.patch | 25 + .../patches/ext3-htree-2.4.21-chaos.patch | 2593 + .../patches/ext3-htree-2.4.21-rhel.patch | 2531 + .../patches/ext3-htree-2.4.22-rh.patch | 2581 + .../kernel_patches/patches/ext3-htree-2.4.29.patch | 2496 + .../patches/ext3-htree-dot-2.6.5-suse.patch | 23 + .../patches/ext3-htree-dot-2.6.patch | 23 + .../patches/ext3-htree-path-ops.patch | 894 + .../patches/ext3-ialloc-2.4.21-suse2.patch | 237 + .../patches/ext3-ialloc-2.4.24.patch | 238 + .../kernel_patches/patches/ext3-ialloc-2.6.patch | 128 + .../patches/ext3-include-fixes-2.6-rhel4.patch | 20 + .../patches/ext3-include-fixes-2.6-suse.patch | 2 +- .../patches/ext3-init-generation-2.6-suse.patch | 12 - .../patches/ext3-ino_sb_macro-2.4.21-chaos.patch | 1514 + .../patches/ext3-inode-reuse-2.6.7.patch | 120 - lustre/kernel_patches/patches/ext3-largefile.patch | 16 + .../patches/ext3-lookup-dotdot-2.4.20.patch | 63 + .../patches/ext3-lookup-dotdot-2.6.9.patch | 63 + .../patches/ext3-map_inode_page-2.4.21-suse2.patch | 119 + .../patches/ext3-map_inode_page-2.6-suse.patch | 16 +- .../patches/ext3-map_inode_page.patch | 110 + .../patches/ext3-map_inode_page_2.4.18.patch | 110 + .../patches/ext3-mballoc2-2.6-fc5.patch | 3105 + .../patches/ext3-mballoc2-2.6-suse.patch | 3219 +- .../patches/ext3-mballoc2-2.6.10-fc3.patch | 2250 - .../patches/ext3-mballoc2-2.6.12.patch | 3105 + .../patches/ext3-mballoc2-2.6.18-vanilla.patch | 2810 + 
.../patches/ext3-mballoc2-2.6.7.patch | 1750 - .../patches/ext3-mballoc2-2.6.9-rhel4.patch | 3124 + .../patches/ext3-mds-num-2.6.10-fc3.patch | 281 - .../patches/ext3-mds-num-2.6.7.patch | 281 - .../ext3-multi-mount-protection-2.6-fc5.patch | 381 + ...xt3-multi-mount-protection-2.6.18-vanilla.patch | 381 + .../patches/ext3-nlinks-2.4.20-hp_pnnl.patch | 156 + .../patches/ext3-nlinks-2.4.21-chaos.patch | 156 + .../patches/ext3-nlinks-2.4.24.patch | 152 + .../kernel_patches/patches/ext3-nlinks-2.6.7.patch | 42 +- .../kernel_patches/patches/ext3-nlinks-2.6.9.patch | 142 + .../patches/ext3-no-write-super-chaos.patch | 15 + .../patches/ext3-noread-2.4.20.patch | 218 + .../patches/ext3-noread-2.4.21-chaos.patch | 223 + .../patches/ext3-noread-2.4.21-suse2.patch | 218 + .../patches/ext3-o_direct-2.4.21-chaos.patch | 23 + .../patches/ext3-orphan_lock-2.4.22-rh.patch | 82 + .../kernel_patches/patches/ext3-orphan_lock.patch | 79 + .../patches/ext3-patch-fuzz-fixup-fc3.patch | 15 + .../patches/ext3-pdirops-2.6.10-fc3.patch | 1222 - .../patches/ext3-pdirops-2.6.7.patch | 1211 - .../patches/ext3-raw-lookup-2.6.10.patch | 49 - .../kernel_patches/patches/ext3-raw-lookup.patch | 61 + .../ext3-remove-cond_resched-calls-2.6.12.patch | 29 + .../patches/ext3-rename-reserve-2.6-suse.patch | 263 + .../patches/ext3-reserve-inode-space-2.6.7.patch | 286 - .../kernel_patches/patches/ext3-san-2.4.20.patch | 117 + .../patches/ext3-sector_t-overflow-2.4.patch | 41 + .../patches/ext3-sector_t-overflow-2.6.12.patch | 64 + .../ext3-sector_t-overflow-2.6.5-suse.patch | 44 + .../ext3-sector_t-overflow-2.6.9-rhel4.patch | 64 + .../patches/ext3-statfs-2.6.12.patch | 177 + .../patches/ext3-truncate-buffer-head.patch | 10 + .../patches/ext3-truncate_blocks.patch | 92 + .../patches/ext3-trusted_ea-2.4.20.patch | 180 + .../patches/ext3-use-after-free.patch | 53 + .../patches/ext3-wantedi-2.6-rhel4.patch | 180 + .../patches/ext3-wantedi-2.6-suse.patch | 57 +- .../patches/ext3-wantedi-2.6.10-fc3.patch | 
192 - .../patches/ext3-wantedi-2.6.15.patch | 174 + .../patches/ext3-wantedi-misc-2.6.18-vanilla.patch | 16 + .../patches/ext3-xattr-ptr-arith-fix.patch | 18 + .../patches/extN-2.4.18-ino_sb_fixup.patch | 33 + .../kernel_patches/patches/extN-misc-fixup.patch | 20 + .../patches/extN-wantedi-2.4.21-chaos.patch | 226 + .../patches/extN-wantedi-2.4.21-suse2.patch | 226 + lustre/kernel_patches/patches/extN-wantedi.patch | 216 + .../patches/fc3_to_rhel4_updates.patch | 12 + .../patches/fsprivate-2.4-suse.patch | 10 + lustre/kernel_patches/patches/fsprivate-2.4.patch | 10 + lustre/kernel_patches/patches/fsprivate-2.6.patch | 10 + .../patches/gfp_debug-2.4.21-rhel.patch | 77 + .../grab_cache_page_nowait_gfp-2.4.21-suse2.patch | 85 + .../grab_cache_page_nowait_gfp-2.6-suse.patch | 57 + .../grab_cache_page_nowait_gfp-rh-2.4.patch | 65 + .../patches/header_guards-vanilla-2.6.patch | 45 - .../patches/highmem-split-2.6.10-fc3.patch | 86 - .../patches/iallocsem_consistency.patch | 48 + .../patches/inode-max-readahead-2.4.24.patch | 22 + .../patches/invalidate_show-2.4.20-hp.patch | 123 + .../patches/invalidate_show-2.4.20-rh.patch | 114 + .../patches/invalidate_show-2.4.29.patch | 107 + .../kernel_patches/patches/invalidate_show.patch | 112 + .../patches/iod-rmap-exports-2.4.21-chaos.patch | 94 + .../patches/iod-stock-24-exports_hp.patch | 48 + .../patches/iod-stock-exports-2.4.22.patch | 52 + lustre/kernel_patches/patches/iopen-2.4.20.patch | 495 + .../patches/iopen-2.4.21-chaos.patch | 497 + lustre/kernel_patches/patches/iopen-2.6-fc5.patch | 448 + .../kernel_patches/patches/iopen-2.6-rhel4.patch | 471 + lustre/kernel_patches/patches/iopen-2.6-suse.patch | 101 +- .../kernel_patches/patches/iopen-2.6-vanilla.patch | 476 - .../kernel_patches/patches/iopen-2.6.10-fc3.patch | 476 - lustre/kernel_patches/patches/iopen-2.6.12.patch | 471 + .../patches/iopen-misc-2.6-fc3.patch | 82 + .../kernel_patches/patches/iopen-misc-2.6.12.patch | 82 + .../patches/iopen-misc-2.6.18-vanilla.patch 
| 82 + .../patches/jbd-buffer-release-2.6.10-fc3.patch | 399 - .../patches/jbd-buffer-release-2.6.7.patch | 399 - .../patches/jbd-commit-tricks-rhel3.patch | 132 + .../kernel_patches/patches/jbd-commit-tricks.patch | 132 + lustre/kernel_patches/patches/jbd-ctx_switch.patch | 13 + .../patches/jbd-dont-account-blocks-twice.patch | 17 + lustre/kernel_patches/patches/jbd-flushtime.patch | 34 + .../patches/jbd-get_write_access.patch | 56 + .../patches/jbd-jcberr-2.6.18-vanilla.patch | 228 + .../patches/jbd-static-wbuf-2.6.7.patch | 281 - .../patches/jbd-stats-2.6.13.4.patch | 735 + .../kernel_patches/patches/jbd-stats-2.6.5.patch | 772 + .../kernel_patches/patches/jbd-stats-2.6.9.patch | 735 + .../kernel_patches/patches/kallsyms-2.4.29.patch | 689 + .../patches/kexec-2.6-suse-lnxi.patch | 1603 + lustre/kernel_patches/patches/kgdb-ga.patch | 6358 -- .../patches/kjournald_affinity.patch | 52 + .../patches/link_notlast-susefix.patch | 16 + .../patches/linux-2.4.20-xattr-0.8.54-hp.patch | 4875 + .../patches/linux-2.4.21-xattr-0.8.54-chaos.patch | 2172 + .../linux-2.4.21-xattr-0.8.54-suse-171.patch | 276 + .../patches/linux-2.4.21-xattr-0.8.54-suse2.patch | 258 + .../linux-2.4.24-jbd-handle-EIO-rhel3.patch | 23 + .../patches/linux-2.4.24-jbd-handle-EIO.patch | 51 + .../patches/linux-2.4.24-xattr-0.8.54.patch | 5474 ++ .../patches/linux-2.4.29-xattr-0.8.54.patch | 5362 + .../patches/linux-2.6-binutils-2.16.patch | 102 + .../patches/linux-2.6.10-CITI_NFS4_ALL-1.patch | 10703 -- .../patches/linux-2.6.10-fc3-left.patch | 1477 - .../patches/linux-2.6.10-fc3-lkcd.patch | 10687 -- .../patches/linux-2.6.10-fc3-sunrpc_cacheput.patch | 22 - .../patches/linux-2.6.10-flock.patch | 35 - .../patches/linux-2.6.7-CITI_NFS4_ALL-7-lsec.patch | 16246 --- .../linux-2.6.9-ext3-sub-second-timestamp.patch | 631 + lustre/kernel_patches/patches/listman-2.4.20.patch | 22 + .../patches/listman-2.4.21-chaos.patch | 26 + .../llnl-frame-pointer-walk-2.4.21-rhel.patch | 120 + 
.../llnl-frame-pointer-walk-fix-2.4.21-rhel.patch | 249 + .../lookup-stack-symbols-2.4.21-suse-171.patch | 234 + .../lookup_bdev_init_intent-2.6.18-vanilla.patch | 12 + .../patches/loop-sync-2.4.21-suse.patch | 11 + .../patches/lustre-version-revert_suse.patch | 4 + lustre/kernel_patches/patches/lustre_build.patch | 33 + lustre/kernel_patches/patches/lustre_version.patch | 24 +- .../kernel_patches/patches/md_path_lookup-2.6-suse | 25 + .../kernel_patches/patches/mtd-2.6-suse-lnxi.patch | 35414 +++++++ .../patches/netconsole-2.4.24-ppc.patch | 489 + .../patches/new-tcp-zero-copy-2.4.21-rhel3.patch | 330 + .../patches/new-tcp-zero-copy-2.4.29-vanilla.patch | 317 + .../patches/new-tcp-zero-copy-2.6.5-sles9.patch | 329 + .../new-tcp-zero-copy-2.6.9-41.2chaos.patch | 318 + .../patches/nfs-cifs-intent-2.6-fc3.patch | 127 + .../patches/nfs-cifs-intent-2.6-fc5.patch | 116 + .../patches/nfs-cifs-intent-2.6-rhel4.patch | 123 + .../patches/nfs-cifs-intent-2.6-suse.patch | 18 + .../patches/nfs-cifs-intent-2.6-vanilla.patch | 117 - .../patches/nfs-cifs-intent-2.6.12.patch | 128 + .../patches/nfs_export_kernel-2.4.20-hp.patch | 740 + .../patches/nfs_export_kernel-2.4.21-chaos.patch | 756 + .../patches/nfs_export_kernel-2.4.21-suse2.patch | 756 + .../patches/nfs_export_kernel-2.4.22.patch | 745 + .../patches/nfs_export_kernel-2.4.29.patch | 744 + .../patches/nfs_export_kernel-2.4.29.patch-1 | 730 + .../patches/nfs_statfs-toomanyfiles-rhel-2.4.patch | 30 + lustre/kernel_patches/patches/nfsd_iallocsem.patch | 19 + lustre/kernel_patches/patches/nid-2.6-fc3.patch | 12 - .../patches/pag-basic-2.6.10-fc3.patch | 85 - .../patches/pagecache-lock-2.4.21-chaos.patch | 21 + .../patches/perfctr-2.6-suse-lnxi.patch | 10070 ++ .../patches/procfs-ndynamic-2.4.21-suse2.patch | 16 + .../patches/procfs-ndynamic-2.4.patch | 13 + lustre/kernel_patches/patches/qsnet-rhel-2.4.patch | 93733 ++++++++++++++++++ .../kernel_patches/patches/qsnet-rhel4-2.6.patch | 97652 +++++++++++++++++++ 
lustre/kernel_patches/patches/qsnet-suse-2.6.patch | 94821 ++++++++++++++++++ .../patches/quota-deadlock-on-pagelock-core.patch | 1264 + .../patches/quota-deadlock-on-pagelock-ext3.patch | 273 + .../patches/quota-umount-race-fix.patch | 139 + .../patches/raid5-configurable-cachesize.patch | 50 + lustre/kernel_patches/patches/raid5-large-io.patch | 20 + .../kernel_patches/patches/raid5-merge-ios.patch | 129 + .../patches/raid5-optimize-memcpy.patch | 227 + .../patches/raid5-serialize-ovelapping-reqs.patch | 140 + lustre/kernel_patches/patches/raid5-stats.patch | 200 + .../patches/raid5-stripe-by-stripe-handling.patch | 104 + .../patches/remove-suid-2.4-rhel.patch | 23 + .../patches/remove-suid-2.6-suse.patch | 22 + .../kernel_patches/patches/removepage-2.4.20.patch | 28 + .../patches/revalide-special-oops-2.6.4.suse.patch | 22 - .../patches/scsi-max-phys-segments-256.patch | 17 - .../patches/sd_iostats-2.4.21-chaos.patch | 442 + .../patches/sd_iostats-2.6-suse.patch | 456 + .../patches/slab-use-after-free-debug-2.4.24.patch | 748 + .../patches/socket-exports-vanilla.patch | 42 + .../patches/statfs64-cast-unsigned-2.4-rhel.patch | 28 + .../patches/tcp-rto_proc-2.6.9.patch | 130 + .../patches/tcp-zero-copy-2.6-fc5.patch | 475 + .../patches/tcp-zero-copy-2.6-sles10.patch | 450 + .../patches/tcp-zero-copy-2.6.18-vanilla.patch | 450 + .../patches/uml-2.4.20-do_mmap_pgoff-fix.patch | 16 + .../kernel_patches/patches/uml-2.6.7-01-bb2.patch | 20390 ---- .../patches/uml-export-end_iomem.patch | 12 + .../patches/uml-exprt-clearuser-2.6.12.patch | 11 + .../patches/uml-exprt-clearuser.patch | 24 + .../patches/uml-patch-2.4.24-1.patch | 41972 ++++++++ .../patches/uml-patch-2.4.29-1.patch | 46719 +++++++++ .../patches/uml-sigusr1-2.4-vanilla.patch | 22 + .../vfs-dcache_locking-vanilla-2.6.10-fc3.patch | 113 - .../patches/vfs-dcache_locking-vanilla-2.6.patch | 85 - .../vfs-dcache_lustre_invalid-vanilla-2.6.patch | 37 - .../kernel_patches/patches/vfs-do_truncate.patch | 87 - 
.../vfs-gns_export_doumount-2.6.10-fc3.patch | 34 - .../patches/vfs-gns_export_doumount.patch | 34 - .../vfs-intent_api-vanilla-2.6.10-fc3.patch | 557 - .../patches/vfs-intent_api-vanilla-2.6.patch | 555 - ...-intent_release_umount-vanilla-2.6.10-fc3.patch | 10 - .../vfs-lookup_last-vanilla-2.6.10-fc3.patch | 78 - .../patches/vfs-lookup_last-vanilla-2.6.patch | 77 - .../patches/vfs-pdirops-2.6.10-fc3.patch | 274 - .../kernel_patches/patches/vfs-pdirops-2.6.7.patch | 262 - .../patches/vfs-raw_ops-vanilla-2.6.10-fc3.patch | 243 - .../patches/vfs-raw_ops-vanilla-2.6.patch | 235 - .../vfs-umount_lustre-vanilla-2.6.10-fc3.patch | 23 - .../patches/vfs-wantedi-misc-2.6-suse.patch | 18 - .../patches/vfs_fmode_exec-2.6.patch | 34 - .../patches/vfs_gns-2.6-vanilla.patch | 35 - .../patches/vfs_gns-2.6.10-fc3.patch | 38 - .../patches/vfs_intent-2.4.20-hp.patch | 1948 + .../patches/vfs_intent-2.4.20-vanilla.patch | 1854 + .../patches/vfs_intent-2.4.21-rhel.patch | 1891 + .../patches/vfs_intent-2.4.21-suse-171.patch | 1877 + .../patches/vfs_intent-2.4.21-suse2.patch | 1878 + .../patches/vfs_intent-2.4.29-vanilla.patch | 1833 + .../patches/vfs_intent-2.6-fc3.patch | 769 + .../patches/vfs_intent-2.6-fc5-fix.patch | 20 + .../patches/vfs_intent-2.6-fc5.patch | 827 + .../patches/vfs_intent-2.6-rhel4.patch | 794 + .../patches/vfs_intent-2.6-sles10.patch | 848 + .../patches/vfs_intent-2.6-suse.patch | 190 +- .../patches/vfs_intent-2.6-vanilla.patch | 845 - .../kernel_patches/patches/vfs_intent-2.6.12.patch | 793 + .../patches/vfs_intent-2.6.18-vanilla.patch | 727 + ..._intent-reduce-stack-usage-2.6-suse-newer.patch | 42 + .../patches/vfs_lookup_in_file-2.6.patch | 16 - .../patches/vfs_nointent-2.6-fc5.patch | 472 + .../patches/vfs_nointent-2.6-rhel4.patch | 487 + .../patches/vfs_nointent-2.6-sles10.patch | 453 + .../patches/vfs_nointent-2.6-suse.patch | 4 +- .../patches/vfs_nointent-2.6-vanilla.patch | 509 - .../patches/vfs_nointent-2.6.12.patch | 490 + 
.../patches/vfs_nointent-2.6.18-vanilla.patch | 451 + .../kernel_patches/patches/vfs_races-2.6-fc3.patch | 64 + ...2.6-vanilla.patch => vfs_races-2.6-rhel4.patch} | 0 .../kernel_patches/patches/vfs_races-2.6.12.patch | 61 + .../patches/vfs_races-2.6.18-vanilla.patch | 60 + .../kernel_patches/patches/vm-tunables-rhel4.patch | 73 + lustre/kernel_patches/prepare_tree.sh | 88 - lustre/kernel_patches/scripts/added-by-patch | 14 - lustre/kernel_patches/scripts/apatch | 97 - lustre/kernel_patches/scripts/cat-series | 17 - lustre/kernel_patches/scripts/combine-applied | 45 - lustre/kernel_patches/scripts/combine-series | 43 - lustre/kernel_patches/scripts/cvs-take-patch | 78 - lustre/kernel_patches/scripts/export_patch | 55 - lustre/kernel_patches/scripts/extract_description | 87 - lustre/kernel_patches/scripts/forkpatch | 76 - lustre/kernel_patches/scripts/fpatch | 53 - lustre/kernel_patches/scripts/import_patch | 102 - lustre/kernel_patches/scripts/inpatch | 27 - lustre/kernel_patches/scripts/join-patch | 28 - lustre/kernel_patches/scripts/linus-patch | 26 - lustre/kernel_patches/scripts/mpatch | 101 - lustre/kernel_patches/scripts/new-kernel | 82 - lustre/kernel_patches/scripts/p0-2-p1 | 10 - lustre/kernel_patches/scripts/p_diff | 60 - lustre/kernel_patches/scripts/patchdesc | 21 - lustre/kernel_patches/scripts/patchfns | 256 - lustre/kernel_patches/scripts/pcpatch | 45 - lustre/kernel_patches/scripts/poppatch | 72 - lustre/kernel_patches/scripts/prep-patch | 18 - lustre/kernel_patches/scripts/pstatus | 156 - lustre/kernel_patches/scripts/ptkdiff | 46 - lustre/kernel_patches/scripts/pushpatch | 86 - lustre/kernel_patches/scripts/refpatch | 32 - lustre/kernel_patches/scripts/removed-by-patch | 14 - lustre/kernel_patches/scripts/rename-patch | 20 - lustre/kernel_patches/scripts/rolled-up-patch | 30 - lustre/kernel_patches/scripts/rpatch | 90 - lustre/kernel_patches/scripts/split-patch | 29 - lustre/kernel_patches/scripts/sum-series | 41 - 
lustre/kernel_patches/scripts/tag-series | 41 - lustre/kernel_patches/scripts/toppatch | 27 - lustre/kernel_patches/scripts/touched-by-patch | 32 - lustre/kernel_patches/scripts/trypatch | 72 - lustre/kernel_patches/scripts/unitdiff.py | 223 - lustre/kernel_patches/scripts/unused-patches | 39 - lustre/kernel_patches/series/2.6-fc3-uml.series | 30 - lustre/kernel_patches/series/2.6-fc3.series | 54 +- lustre/kernel_patches/series/2.6-fc5.series | 20 + .../kernel_patches/series/2.6-rhel4-titech.series | 36 + lustre/kernel_patches/series/2.6-rhel4.series | 35 + lustre/kernel_patches/series/2.6-sles10.series | 18 + lustre/kernel_patches/series/2.6-suse-newer.series | 15 + lustre/kernel_patches/series/2.6-suse.series | 19 +- lustre/kernel_patches/series/2.6-vanilla.series | 23 - lustre/kernel_patches/series/2.6.12-vanilla.series | 20 + lustre/kernel_patches/series/2.6.18-vanilla.series | 19 + lustre/kernel_patches/series/hp-pnnl-2.4.20 | 49 + .../kernel_patches/series/ldiskfs-2.6-fc3.series | 28 +- .../kernel_patches/series/ldiskfs-2.6-fc5.series | 12 + .../kernel_patches/series/ldiskfs-2.6-rhel4.series | 15 + .../series/ldiskfs-2.6-sles10.series | 13 + .../kernel_patches/series/ldiskfs-2.6-suse.series | 12 +- .../series/ldiskfs-2.6-vanilla.series | 16 - .../series/ldiskfs-2.6.12-vanilla.series | 15 + .../series/ldiskfs-2.6.18-vanilla.series | 13 + lustre/kernel_patches/series/rhel-2.4.21 | 54 + lustre/kernel_patches/series/suse-2.4.21-cray | 43 + lustre/kernel_patches/series/vanilla-2.4.24 | 49 + lustre/kernel_patches/series/vanilla-2.4.29 | 46 + lustre/kernel_patches/series/vanilla-2.4.29-uml | 48 + lustre/kernel_patches/targets/2.6-fc3.target.in | 18 - lustre/kernel_patches/targets/2.6-fc5.target.in | 18 + .../kernel_patches/targets/2.6-patchless.target.in | 25 + lustre/kernel_patches/targets/2.6-rhel4.target.in | 25 + lustre/kernel_patches/targets/2.6-suse.target.in | 29 + .../kernel_patches/targets/2.6-vanilla.target.in | 16 + 
.../kernel_patches/targets/hp_pnnl-2.4.target.in | 17 + lustre/kernel_patches/targets/rh-2.4.target.in | 24 + lustre/kernel_patches/targets/rhel-2.4.target.in | 24 + lustre/kernel_patches/targets/sles-2.4.target.in | 26 + .../kernel_patches/targets/suse-2.4.21-2.target.in | 15 + lustre/kernel_patches/txt/dev_read_only.txt | 3 - lustre/kernel_patches/txt/exports.txt | 3 - lustre/kernel_patches/txt/exports_hp.txt | 3 - lustre/kernel_patches/txt/ext3-2.4.20-fixes.txt | 3 - lustre/kernel_patches/txt/ext3-map_inode_page.txt | 3 - .../txt/ext3-map_inode_page_2.4.18.txt | 3 - lustre/kernel_patches/txt/invalidate_show.txt | 3 - lustre/kernel_patches/txt/kmem_cache_validate.txt | 3 - lustre/kernel_patches/txt/lustre_version.txt | 3 - lustre/kernel_patches/txt/uml_check_get_page.txt | 3 - lustre/kernel_patches/txt/uml_no_panic.txt | 3 - lustre/kernel_patches/which_patch | 29 +- lustre/ldiskfs/Makefile.in | 17 +- lustre/ldiskfs/autoMakefile.am | 18 +- lustre/ldiskfs/lustre_quota_fmt.c | 998 + lustre/ldiskfs/lustre_quota_fmt.h | 84 + lustre/ldiskfs/quotafmt_test.c | 504 + lustre/ldlm/Makefile.am | 2 +- lustre/ldlm/doc/dld.lyx | 343 - lustre/ldlm/l_lock.c | 49 +- lustre/ldlm/ldlm_extent.c | 421 +- lustre/ldlm/ldlm_flock.c | 590 +- lustre/ldlm/ldlm_inodebits.c | 76 +- lustre/ldlm/ldlm_internal.h | 34 +- lustre/ldlm/ldlm_lib.c | 1640 +- lustre/ldlm/ldlm_lock.c | 753 +- lustre/ldlm/ldlm_lockd.c | 1037 +- lustre/ldlm/ldlm_plain.c | 57 +- lustre/ldlm/ldlm_request.c | 704 +- lustre/ldlm/ldlm_resource.c | 204 +- lustre/ldlm/ldlm_test.c | 646 - lustre/liblustre/Makefile.am | 37 +- lustre/liblustre/dir.c | 80 +- lustre/liblustre/doc/dld.lyx | 343 - lustre/liblustre/file.c | 294 +- lustre/liblustre/genlib.sh | 60 +- lustre/liblustre/llite_lib.c | 288 +- lustre/liblustre/llite_lib.h | 187 +- lustre/liblustre/lutil.c | 246 +- lustre/liblustre/lutil.h | 5 +- lustre/liblustre/namei.c | 134 +- lustre/liblustre/rw.c | 434 +- lustre/liblustre/super.c | 1293 +- lustre/liblustre/tests/.cvsignore 
| 1 - lustre/liblustre/tests/Makefile.am | 17 +- lustre/liblustre/tests/echo_test.c | 187 +- lustre/liblustre/tests/recovery_small.c | 6 +- lustre/liblustre/tests/replay_ost_single.c | 4 +- lustre/liblustre/tests/replay_single.c | 6 +- lustre/liblustre/tests/sanity.c | 1013 +- lustre/liblustre/tests/test_common.c | 110 +- lustre/liblustre/tests/test_common.h | 5 + lustre/llite/.cvsignore | 1 + lustre/llite/Makefile.in | 10 +- lustre/llite/autoMakefile.am | 6 +- lustre/llite/dcache.c | 635 +- lustre/llite/dir.c | 954 +- lustre/llite/doc/dld.lyx | 343 - lustre/llite/file.c | 2534 +- lustre/llite/llite_audit.c | 112 - lustre/llite/llite_capa.c | 363 - lustre/llite/llite_close.c | 130 +- lustre/llite/llite_gns.c | 567 - lustre/llite/llite_gs.c | 843 - lustre/llite/llite_internal.h | 897 +- lustre/llite/llite_lib.c | 3068 +- lustre/llite/llite_mmap.c | 384 +- lustre/llite/llite_nfs.c | 112 +- lustre/llite/lproc_llite.c | 1075 +- lustre/llite/namei.c | 1284 +- lustre/llite/rw.c | 1025 +- lustre/llite/rw24.c | 88 +- lustre/llite/rw26.c | 256 +- lustre/llite/special.c | 458 - lustre/llite/super.c | 149 +- lustre/llite/super25.c | 158 +- lustre/llite/symlink.c | 108 +- lustre/llite/xattr.c | 399 + lustre/lmv/.cvsignore | 7 - lustre/lmv/Makefile.in | 4 - lustre/lmv/autoMakefile.am | 18 - lustre/lmv/doc/dld.lyx | 343 - lustre/lmv/lmv_intent.c | 801 - lustre/lmv/lmv_internal.h | 151 - lustre/lmv/lmv_obd.c | 2457 - lustre/lmv/lmv_objmgr.c | 414 - lustre/lmv/lproc_lmv.c | 151 - lustre/lov/.cvsignore | 1 + lustre/lov/Info.plist | 41 + lustre/lov/Makefile.in | 2 +- lustre/lov/autoMakefile.am | 30 +- lustre/lov/doc/dld.lyx | 343 - lustre/lov/lov_ea.c | 570 + lustre/lov/lov_internal.h | 238 +- lustre/lov/lov_log.c | 279 +- lustre/lov/lov_merge.c | 130 +- lustre/lov/lov_obd.c | 2612 +- lustre/lov/lov_offset.c | 85 +- lustre/lov/lov_pack.c | 344 +- lustre/lov/lov_qos.c | 882 +- lustre/lov/lov_request.c | 1171 +- lustre/lov/lproc_lov.c | 213 +- lustre/lvfs/.cvsignore | 2 + 
lustre/lvfs/Info.plist | 37 + lustre/lvfs/Makefile.in | 11 +- lustre/lvfs/autoMakefile.am | 46 +- lustre/lvfs/doc/dld.lyx | 343 - lustre/lvfs/fsfilt.c | 18 +- lustre/lvfs/fsfilt_ext3.c | 1475 +- lustre/lvfs/fsfilt_reiserfs.c | 17 +- lustre/lvfs/fsfilt_smfs.c | 63 - lustre/lvfs/fsfilt_snap_ext3.c | 1768 - lustre/lvfs/fsfilt_snap_smfs.c | 444 - lustre/lvfs/llog.c | 370 - lustre/lvfs/llog_cat.c | 652 - lustre/lvfs/llog_lvfs.c | 1152 - lustre/lvfs/lvfs_common.c | 126 +- lustre/lvfs/lvfs_darwin.c | 45 + lustre/lvfs/lvfs_internal.h | 11 - lustre/lvfs/lvfs_linux.c | 313 +- lustre/lvfs/lvfs_reint.c | 553 - lustre/lvfs/lvfs_undo.c | 419 - lustre/lvfs/lvfs_userfs.c | 8 +- lustre/lvfs/upcall_cache.c | 519 + lustre/mdc/autoMakefile.am | 4 +- lustre/mdc/doc/dld.lyx | 343 - lustre/mdc/lproc_mdc.c | 74 +- lustre/mdc/mdc_internal.h | 92 +- lustre/mdc/mdc_lib.c | 410 +- lustre/mdc/mdc_locks.c | 711 +- lustre/mdc/mdc_reint.c | 187 +- lustre/mdc/mdc_request.c | 1642 +- lustre/mds/Makefile.in | 3 +- lustre/mds/autoMakefile.am | 2 +- lustre/mds/commit_confd.c | 24 +- lustre/mds/doc/dld.lyx | 343 - lustre/mds/handler.c | 4389 +- lustre/mds/lproc_mds.c | 550 +- lustre/mds/mds_acl.c | 317 - lustre/mds/mds_audit.c | 288 - lustre/mds/mds_audit_path.c | 608 - lustre/mds/mds_capa.c | 569 - lustre/mds/mds_fs.c | 1155 +- lustre/mds/mds_internal.h | 441 +- lustre/mds/mds_join.c | 504 + lustre/mds/mds_lib.c | 1222 +- lustre/mds/mds_lmv.c | 1313 - lustre/mds/mds_log.c | 205 +- lustre/mds/mds_lov.c | 1080 +- lustre/mds/mds_lsd.c | 301 - lustre/mds/mds_open.c | 1566 +- lustre/mds/mds_reint.c | 3262 +- lustre/mds/mds_unlink_open.c | 163 +- lustre/mds/mds_xattr.c | 366 + lustre/{cmobd => mgc}/.cvsignore | 0 lustre/mgc/Makefile.in | 4 + lustre/mgc/autoMakefile.am | 18 + lustre/mgc/libmgc.c | 147 + lustre/mgc/mgc_request.c | 1196 + lustre/mgmt/.cvsignore | 12 - lustre/mgmt/Makefile.in | 9 - lustre/mgmt/autoMakefile.am | 9 - lustre/mgmt/mgmt_cli.c | 284 - lustre/mgmt/mgmt_svc.c | 171 - lustre/{sec => 
mgs}/.cvsignore | 0 lustre/mgs/Makefile.in | 4 + lustre/mgs/autoMakefile.am | 11 + lustre/mgs/lproc_mgs.c | 153 + lustre/mgs/mgs_fs.c | 200 + lustre/mgs/mgs_handler.c | 694 + lustre/mgs/mgs_internal.h | 77 + lustre/mgs/mgs_llog.c | 1979 + lustre/nodist | 12 - lustre/obdclass/.cvsignore | 2 + lustre/obdclass/Info.plist | 39 + lustre/obdclass/Makefile.in | 34 +- lustre/obdclass/autoMakefile.am | 46 +- lustre/obdclass/capa.c | 492 - lustre/obdclass/class_obd.c | 566 +- lustre/obdclass/confobd.c | 436 - lustre/obdclass/darwin/.cvsignore | 1 + lustre/obdclass/darwin/Makefile.am | 3 + lustre/obdclass/darwin/darwin-module.c | 181 + lustre/obdclass/darwin/darwin-sysctl.c | 150 + lustre/obdclass/debug.c | 58 +- lustre/obdclass/doc/dld.lyx | 343 - lustre/obdclass/genops.c | 892 +- lustre/obdclass/linux/.cvsignore | 5 + lustre/obdclass/linux/Makefile.am | 4 + lustre/obdclass/linux/linux-module.c | 452 + lustre/obdclass/linux/linux-obdo.c | 288 + lustre/obdclass/linux/linux-sysctl.c | 120 + lustre/obdclass/llog.c | 424 + lustre/obdclass/llog_cat.c | 537 + lustre/obdclass/llog_internal.h | 10 + lustre/obdclass/llog_ioctl.c | 55 +- lustre/obdclass/llog_lvfs.c | 923 + lustre/obdclass/llog_obd.c | 300 +- lustre/obdclass/llog_swab.c | 209 +- lustre/obdclass/llog_test.c | 174 +- lustre/obdclass/lprocfs_status.c | 995 +- lustre/obdclass/lustre_handles.c | 70 +- lustre/obdclass/lustre_peer.c | 151 +- lustre/obdclass/mea.c | 91 - lustre/obdclass/obd_config.c | 1062 +- lustre/obdclass/obd_mount.c | 2007 + lustre/obdclass/obdo.c | 295 +- lustre/obdclass/prng.c | 68 + lustre/obdclass/statfs_pack.c | 38 +- lustre/obdclass/sysctl.c | 126 - lustre/obdclass/uuid.c | 235 +- lustre/obdecho/Info.plist | 45 + lustre/obdecho/autoMakefile.am | 22 + lustre/obdecho/doc/dld.lyx | 343 - lustre/obdecho/echo.c | 263 +- lustre/obdecho/echo_client.c | 403 +- lustre/obdecho/lproc_echo.c | 37 +- lustre/obdfilter/Makefile.in | 5 +- lustre/obdfilter/doc/dld.lyx | 343 - lustre/obdfilter/filter.c | 3672 +- 
lustre/obdfilter/filter_capa.c | 352 - lustre/obdfilter/filter_internal.h | 213 +- lustre/obdfilter/filter_io.c | 626 +- lustre/obdfilter/filter_io_24.c | 232 +- lustre/obdfilter/filter_io_26.c | 629 +- lustre/obdfilter/filter_log.c | 225 +- lustre/obdfilter/filter_lvb.c | 146 +- lustre/obdfilter/filter_san.c | 133 - lustre/obdfilter/lproc_obdfilter.c | 403 +- lustre/osc/Info.plist | 43 + lustre/osc/Makefile.in | 2 +- lustre/osc/autoMakefile.am | 25 +- lustre/osc/doc/dld.lyx | 343 - lustre/osc/lproc_osc.c | 507 +- lustre/osc/osc_create.c | 320 +- lustre/osc/osc_internal.h | 39 +- lustre/osc/osc_lib.c | 78 - lustre/osc/osc_request.c | 3486 +- lustre/ost/autoMakefile.am | 2 +- lustre/ost/doc/dld.lyx | 343 - lustre/ost/lproc_ost.c | 160 +- lustre/ost/ost_handler.c | 1528 +- lustre/ost/ost_internal.h | 43 + lustre/ptlbd/.cvsignore | 9 - lustre/ptlbd/Makefile.in | 4 - lustre/ptlbd/autoMakefile.am | 13 - lustre/ptlbd/blk.c | 300 - lustre/ptlbd/client.c | 255 - lustre/ptlbd/main.c | 71 - lustre/ptlbd/rpc.c | 386 - lustre/ptlbd/server.c | 138 - lustre/ptlrpc/Info.plist | 33 + lustre/ptlrpc/Makefile.in | 3 +- lustre/ptlrpc/autoMakefile.am | 48 +- lustre/ptlrpc/client.c | 834 +- lustre/ptlrpc/connection.c | 150 +- lustre/ptlrpc/doc/dld.lyx | 343 - lustre/ptlrpc/events.c | 518 +- lustre/ptlrpc/import.c | 823 +- lustre/ptlrpc/llog_client.c | 173 +- lustre/ptlrpc/llog_net.c | 53 +- lustre/ptlrpc/llog_server.c | 274 +- lustre/ptlrpc/lproc_ptlrpc.c | 496 +- lustre/ptlrpc/niobuf.c | 695 +- lustre/ptlrpc/pack_generic.c | 2390 +- lustre/ptlrpc/pers.c | 107 +- lustre/ptlrpc/pinger.c | 630 +- lustre/ptlrpc/ptlrpc_internal.h | 67 +- lustre/ptlrpc/ptlrpc_module.c | 180 +- lustre/ptlrpc/ptlrpcd.c | 139 +- lustre/ptlrpc/recov_thread.c | 242 +- lustre/ptlrpc/recover.c | 291 +- lustre/ptlrpc/service.c | 1111 +- lustre/ptlrpc/wirehdr.c | 10 + lustre/ptlrpc/wiretest.c | 2107 + lustre/{sec/gss => quota}/.cvsignore | 0 lustre/quota/Makefile.in | 10 + lustre/quota/autoMakefile.am | 19 + 
lustre/quota/quota_check.c | 237 + lustre/quota/quota_context.c | 869 + lustre/quota/quota_ctl.c | 275 + lustre/quota/quota_interface.c | 744 + lustre/quota/quota_internal.h | 98 + lustre/quota/quota_master.c | 1120 + lustre/quota/quotacheck_test.c | 217 + lustre/quota/quotactl_test.c | 358 + lustre/scripts/.cvsignore | 8 + lustre/scripts/Makefile.am | 29 +- lustre/scripts/bdev-io-survey.sh | 11 +- lustre/scripts/collect-stats.sh | 180 - lustre/scripts/cvs-modified-files.pl | 47 - lustre/scripts/cvsdiffclient | 45 - lustre/scripts/cvsrc | 5 - lustre/scripts/graph-rpcs.sh | 359 - lustre/scripts/lc_cluman.sh.in | 524 + lustre/scripts/lc_common.sh | 524 + lustre/scripts/lc_hb.sh.in | 644 + lustre/scripts/lc_lvm.sh.in | 593 + lustre/scripts/lc_md.sh.in | 511 + lustre/scripts/lc_modprobe.sh.in | 66 + lustre/scripts/lc_mon.sh | 139 + lustre/scripts/lc_net.sh.in | 226 + lustre/scripts/lc_servip.sh | 250 + lustre/scripts/linux-merge-config.awk | 317 - lustre/scripts/linux-merge-modules.awk | 125 - lustre/scripts/linux-rhconfig.h | 229 - lustre/scripts/lmc2csv.pl | 228 + lustre/scripts/lustre | 226 +- lustre/scripts/lustre_config.sh.in | 1222 + lustre/scripts/lustre_createcsv.sh.in | 2100 + lustre/scripts/lustre_req_history.sh | 163 + lustre/scripts/lustre_rmmod.sh | 18 + lustre/scripts/lustre_up14.sh | 66 + lustre/scripts/lustrefs | 60 +- lustre/scripts/suse-functions.sh | 22 - lustre/scripts/suse-post.sh | 46 - lustre/scripts/suse-postun.sh | 43 - lustre/scripts/suse-trigger-script.sh.in | 9 - lustre/scripts/version_tag.pl.in | 4 +- lustre/sec/Makefile.in | 6 - lustre/sec/Makefile.mk | 10 - lustre/sec/autoMakefile.am | 22 - lustre/sec/doc/oss_gss_HLD.lyx | 258 - lustre/sec/doc/remote_ugid_HLD.lyx | 884 - lustre/sec/doc/revoke_user_HLD.lyx | 244 - lustre/sec/gks/Makefile.in | 6 - lustre/sec/gks/Makefile.mk | 11 - lustre/sec/gks/autoMakefile.am | 13 - lustre/sec/gks/gks_client.c | 202 - lustre/sec/gks/gks_internal.h | 46 - lustre/sec/gks/gks_server.c | 508 - 
lustre/sec/gks/lproc_gks.c | 38 - lustre/sec/gss/Makefile.in | 9 - lustre/sec/gss/Makefile.mk | 14 - lustre/sec/gss/autoMakefile.am | 23 - lustre/sec/gss/gss_api.h | 131 - lustre/sec/gss/gss_asn1.h | 87 - lustre/sec/gss/gss_err.h | 181 - lustre/sec/gss/gss_generic_token.c | 295 - lustre/sec/gss/gss_internal.h | 243 - lustre/sec/gss/gss_krb5.h | 183 - lustre/sec/gss/gss_krb5_crypto.c | 264 - lustre/sec/gss/gss_krb5_mech.c | 316 - lustre/sec/gss/gss_krb5_seal.c | 178 - lustre/sec/gss/gss_krb5_seqnum.c | 116 - lustre/sec/gss/gss_krb5_unseal.c | 212 - lustre/sec/gss/gss_krb5_wrap.c | 381 - lustre/sec/gss/gss_mech_switch.c | 302 - lustre/sec/gss/rawobj.c | 179 - lustre/sec/gss/sec_gss.c | 1940 - lustre/sec/gss/svcsec_gss.c | 1680 - lustre/sec/sec.c | 1167 - lustre/sec/sec_null.c | 187 - lustre/sec/svcsec.c | 275 - lustre/sec/svcsec_null.c | 112 - lustre/sec/upcall_cache.c | 458 - lustre/smfs/.cvsignore | 15 - lustre/smfs/Makefile.in | 7 - lustre/smfs/audit.c | 643 - lustre/smfs/audit_mds.c | 299 - lustre/smfs/audit_ost.c | 169 - lustre/smfs/audit_transfer.c | 385 - lustre/smfs/autoMakefile.am | 11 - lustre/smfs/cache.c | 299 - lustre/smfs/cache_space.c | 788 - lustre/smfs/dir.c | 939 - lustre/smfs/doc/dld.lyx | 1866 - lustre/smfs/doc/hld.lyx | 1119 - lustre/smfs/file.c | 564 - lustre/smfs/fsfilt.c | 1276 - lustre/smfs/inode.c | 334 - lustre/smfs/ioctl.c | 192 - lustre/smfs/kml.c | 909 - lustre/smfs/mds_kml.c | 368 - lustre/smfs/options.c | 179 - lustre/smfs/ost_kml.c | 191 - lustre/smfs/smfs_api.h | 234 - lustre/smfs/smfs_cow.c | 1541 - lustre/smfs/smfs_internal.h | 288 - lustre/smfs/smfs_lib.c | 640 - lustre/smfs/smfs_llog.c | 101 - lustre/smfs/super.c | 140 - lustre/smfs/symlink.c | 101 - lustre/smfs/sysctl.c | 111 - lustre/snapfs/.cvsignore | 14 - lustre/snapfs/Makefile.in | 6 - lustre/snapfs/autoMakefile.am | 14 - lustre/snapfs/cache.c | 106 - lustre/snapfs/clonefs.c | 592 - lustre/snapfs/dcache.c | 56 - lustre/snapfs/dir.c | 699 - lustre/snapfs/dotsnap.c | 190 - 
lustre/snapfs/file.c | 344 - lustre/snapfs/filter.c | 413 - lustre/snapfs/inode.c | 317 - lustre/snapfs/journal_ext3.c | 78 - lustre/snapfs/options.c | 97 - lustre/snapfs/psdev.c | 145 - lustre/snapfs/snap.c | 220 - lustre/snapfs/snapfs_internal.h | 440 - lustre/snapfs/snapfs_support.h | 156 - lustre/snapfs/snaptable.c | 968 - lustre/snapfs/super.c | 512 - lustre/snapfs/symlink.c | 206 - lustre/snapfs/sysctl.c | 91 - lustre/snapfs/utils/.cvsignore | 9 - lustre/snapfs/utils/Makefile.am | 8 - lustre/snapfs/utils/parser.c | 748 - lustre/snapfs/utils/parser.h | 74 - lustre/snapfs/utils/snapconf.c | 74 - lustre/snapfs/utils/snapctl.c | 408 - lustre/snapfs/utils/snapctl.h | 23 - lustre/tests/.cvsignore | 9 +- lustre/tests/2ost.sh | 54 + lustre/tests/Makefile.am | 38 +- lustre/tests/acceptance-metadata-double.sh | 4 +- lustre/tests/acceptance-metadata-parallel.sh | 2 +- lustre/tests/acceptance-metadata-single.sh | 4 +- lustre/tests/acceptance-small.sh | 233 +- lustre/tests/acl/README | 5 +- lustre/tests/acl/inheritance.test | 10 +- lustre/tests/cfg/insanity-adev.sh | 2 +- lustre/tests/cfg/insanity-lmv.sh | 38 - lustre/tests/cfg/insanity-local.sh | 73 +- lustre/tests/cfg/insanity-ltest.sh | 3 +- lustre/tests/cfg/insanity-mdev.sh | 4 +- lustre/tests/cfg/lmv.sh | 41 - lustre/tests/cfg/local.sh | 92 +- lustre/tests/cfg/lov.sh | 69 + lustre/tests/cfg/mdev.sh | 31 - lustre/tests/cfg/smfs.sh | 50 - lustre/tests/checkstack.pl | 83 - lustre/tests/checkstat.c | 33 +- lustre/tests/chownmany.c | 79 + lustre/tests/cmknod.c | 37 +- lustre/tests/cmobd.sh | 89 - lustre/tests/cobd.sh | 85 +- lustre/tests/cobd_test.sh | 96 - lustre/tests/conf-sanity.sh | 938 +- lustre/tests/copy_attr.c | 56 - lustre/tests/createdestroy.c | 6 +- lustre/tests/createmany.c | 6 +- lustre/tests/directio.c | 213 +- lustre/tests/echo.sh | 28 +- lustre/tests/flock.c | 196 + lustre/tests/flock_test.c | 86 + lustre/tests/flocks_test.c | 62 + lustre/tests/fsx.c | 9 +- lustre/tests/gns-upcall.sh | 13 - 
lustre/tests/insanity.sh | 198 +- lustre/tests/kbuild | 311 + lustre/tests/krb5_env.sh | 149 - lustre/tests/krb5_refresh_cache.sh | 58 - lustre/tests/lfsck_config.sh | 47 - lustre/tests/lfscktest.sh | 286 +- lustre/tests/lfscktest_config.sh | 23 - lustre/tests/liblustre_sanity_uml.sh | 7 +- lustre/tests/ll_dirstripe_verify.c | 212 +- lustre/tests/llecho.sh | 2 +- lustre/tests/llmount-upcall.sh | 6 - lustre/tests/llmount.sh | 46 +- lustre/tests/llmountcleanup.sh | 62 +- lustre/tests/llog-test.sh | 106 + lustre/tests/llrmount.sh | 44 - lustre/tests/lmv.sh | 75 - lustre/tests/local-large-inode.sh | 3 - lustre/tests/local.sh | 83 - lustre/tests/lockorder.sh | 5 +- lustre/tests/lov.sh | 68 - lustre/tests/lsmfs.sh | 5 - lustre/tests/lsnap.sh | 5 - lustre/tests/mcr-individual-ost-nogw-config.sh | 46 - lustre/tests/mcr-mds-failover-config.sh | 50 - lustre/tests/mcr-routed-config.sh | 93 - lustre/tests/mcrlov.sh | 52 - lustre/tests/mdsadd.sh | 81 - lustre/tests/memhog.c | 32 +- lustre/tests/mkdirdeep.c | 4 +- lustre/tests/mmap_sanity.c | 415 +- lustre/tests/mount2fs.sh | 28 +- lustre/tests/multiop.c | 83 +- lustre/tests/oos.sh | 32 +- lustre/tests/oos2.sh | 11 +- lustre/tests/open_delay.c | 23 - lustre/tests/openclose.c | 3 + lustre/tests/opendevunlink.c | 4 + lustre/tests/openfile.c | 11 +- lustre/tests/random-reads.c | 208 + lustre/tests/recovery-cleanup.sh | 14 +- lustre/tests/recovery-small-upcall.sh | 4 - lustre/tests/recovery-small.sh | 511 +- lustre/tests/rename_many.c | 8 +- lustre/tests/replay-dual.sh | 433 +- lustre/tests/replay-ost-single.sh | 129 +- lustre/tests/replay-ost-upcall.sh | 38 - lustre/tests/replay-sanity.sh | 252 - lustre/tests/replay-single-lmv.sh | 169 - lustre/tests/replay-single-upcall.sh | 56 - lustre/tests/replay-single.sh | 841 +- lustre/tests/routed.sh | 156 + lustre/tests/run-llog.sh | 17 +- lustre/tests/run-quotacheck.sh | 30 + lustre/tests/run-quotactl.sh | 30 + lustre/tests/run-quotafmt.sh | 29 + lustre/tests/run_lfscktest.sh | 22 - 
lustre/tests/runacltest | 160 - lustre/tests/runas.c | 38 +- lustre/tests/rundbench | 11 +- lustre/tests/runfailure-net | 2 +- lustre/tests/runiozone | 2 +- lustre/tests/runregression-mds.sh | 67 - lustre/tests/runregression-net.sh | 2 +- lustre/tests/runtests | 57 +- lustre/tests/runvmstat | 18 +- lustre/tests/sanity-cmobd.sh | 188 - lustre/tests/sanity-crypto.sh | 231 - lustre/tests/sanity-fid.sh | 250 - lustre/tests/sanity-gns.sh | 1339 - lustre/tests/sanity-ldlm.sh | 62 - lustre/tests/sanity-lmv.sh | 352 - lustre/tests/sanity-ost_add_del.sh | 144 - lustre/tests/sanity-quota.sh | 828 + lustre/tests/sanity-sec.sh | 342 - lustre/tests/sanity.sh | 2459 +- lustre/tests/sanityN.sh | 543 +- lustre/tests/setfacl.test | 123 - lustre/tests/small_write.c | 56 +- lustre/tests/smfs.sh | 53 - lustre/tests/stat.c | 1 + lustre/tests/statmany.c | 17 +- lustre/tests/statone.c | 7 +- lustre/tests/tbox.sh | 116 - lustre/tests/test-framework.sh | 848 +- lustre/tests/test.c | 101 - lustre/tests/test45-mountain.sh | 136 - lustre/tests/test45.sh | 174 - lustre/tests/test_brw.c | 38 +- lustre/tests/testreq.c | 2 +- lustre/tests/uml.sh | 124 - lustre/tests/uml_clone.sh | 80 - lustre/tests/upcall | 12 - lustre/tests/utime.c | 88 +- lustre/tests/wantedi.c | 10 +- lustre/tests/write_append_truncate.c | 2 +- lustre/tests/write_disjoint.c | 219 +- lustre/tests/writemany.c | 276 + lustre/utils/.cvsignore | 23 +- lustre/utils/Lustre/.cvsignore | 4 - lustre/utils/Lustre/Makefile.am | 5 - lustre/utils/Lustre/__init__.py | 7 - lustre/utils/Lustre/cmdline.py | 194 - lustre/utils/Lustre/error.py | 10 - lustre/utils/Lustre/lustredb.py | 464 - lustre/utils/Makefile.am | 96 +- lustre/utils/automatic-reconnect-sample | 34 - lustre/utils/doc/dld.lyx | 343 - lustre/utils/ha_assist.sh | 5 - lustre/utils/ha_assist2.sh | 35 - lustre/utils/l_getgroups.c | 249 + lustre/utils/lacl_upcall.c | 347 - lustre/utils/lactive | 120 - lustre/utils/lconf | 4058 - lustre/utils/lctl.c | 649 +- lustre/utils/lfind | 9 - 
lustre/utils/lfs.c | 1397 +- lustre/utils/liblustreapi.c | 1341 +- lustre/utils/lkinit.c | 114 - lustre/utils/llanalyze | 43 +- lustre/utils/llmount.c | 821 - lustre/utils/llobdstat.pl | 76 +- lustre/utils/llog_reader.c | 407 + lustre/utils/llstat.pl | 284 +- lustre/utils/llverdev.c | 553 + lustre/utils/llverfs.c | 650 + lustre/utils/lmc | 1921 - lustre/utils/load_ldap.sh | 50 - lustre/utils/loadgen.c | 929 + lustre/utils/lr_reader.c | 209 + lustre/utils/lrun | 2 - lustre/utils/lsd_upcall.c | 435 - lustre/utils/lstripe | 9 - lustre/utils/lustre_cfg.c | 521 +- lustre/utils/lwizard | 414 - lustre/utils/mds-failover-sample | 20 - lustre/utils/mkfs_lustre.c | 1404 + lustre/utils/module_cleanup.sh | 22 + lustre/utils/module_setup.sh | 62 + lustre/utils/mount_lustre.c | 443 + lustre/utils/obd.c | 1192 +- lustre/utils/obdctl.c | 1 - lustre/utils/obdctl.h | 70 +- lustre/utils/obdio.c | 4 +- lustre/utils/obdiolib.c | 168 +- lustre/utils/obdiolib.h | 34 +- lustre/utils/parser.c | 66 +- lustre/utils/platform.h | 248 + lustre/utils/plot-llstat.pl | 182 + lustre/utils/wirecheck.c | 785 +- lustre/utils/wirehdr.c | 14 +- lustre/utils/wiretest.c | 3373 +- 1290 files changed, 771169 insertions(+), 241731 deletions(-) create mode 100644 ldiskfs/kernel_patches/patches/export-ext3-2.6-rhel4.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-check-jbd-errors-2.6.5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-check-jbd-errors-2.6.9.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-extents-2.6.15.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-external-journal-2.6.12.patch create mode 100644 
ldiskfs/kernel_patches/patches/ext3-filterdata-2.6.15.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-htree-dot-2.6.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-ialloc-2.6.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-lookup-dotdot-2.6.9.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-fc5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-rename-reserve-2.6-suse.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-rhel4.patch create mode 100644 ldiskfs/kernel_patches/patches/iopen-2.6-fc5.patch create mode 100644 ldiskfs/kernel_patches/patches/iopen-2.6-rhel4.patch create mode 100644 ldiskfs/kernel_patches/patches/iopen-2.6.12.patch create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6-fc5.series create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6-sles10.series create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6.18-vanilla.series delete mode 100644 lustre/COPYING create mode 100644 lustre/LICENSE delete mode 100644 lustre/Rules.in create mode 
100644 lustre/autoconf/.cvsignore delete mode 100644 lustre/cmobd/Makefile.in delete mode 100644 lustre/cmobd/autoMakefile.am delete mode 100644 lustre/cmobd/cm_internal.h delete mode 100644 lustre/cmobd/cm_mds_reint.c delete mode 100644 lustre/cmobd/cm_obd.c delete mode 100644 lustre/cmobd/cm_oss_reint.c delete mode 100644 lustre/cmobd/cm_reint.c delete mode 100644 lustre/cmobd/cm_write.c delete mode 100644 lustre/cmobd/lproc_cm.c delete mode 100644 lustre/cobd/.cvsignore delete mode 100644 lustre/cobd/Makefile.in delete mode 100644 lustre/cobd/autoMakefile.am delete mode 100644 lustre/cobd/cache_obd.c delete mode 100644 lustre/cobd/lproc_cache.c create mode 100644 lustre/contrib/.cvsignore create mode 100644 lustre/contrib/Makefile.am create mode 100644 lustre/contrib/README create mode 100644 lustre/contrib/mpich-1.2.6-lustre.patch delete mode 100644 lustre/doc/blank_template.lyx create mode 100644 lustre/doc/llverdev.txt create mode 100644 lustre/doc/llverfs.txt create mode 100644 lustre/doc/lustre.7 delete mode 100644 lustre/doc/lwizard.1 create mode 100644 lustre/doc/mkfs.lustre.8 create mode 100644 lustre/doc/mount.lustre.8 create mode 100644 lustre/doc/tunefs.lustre.8 create mode 100644 lustre/include/darwin/lprocfs_status.h create mode 100644 lustre/include/darwin/lustre_compat.h create mode 100644 lustre/include/darwin/lustre_debug.h create mode 100644 lustre/include/darwin/lustre_dlm.h create mode 100644 lustre/include/darwin/lustre_fsfilt.h create mode 100644 lustre/include/darwin/lustre_handles.h create mode 100644 lustre/include/darwin/lustre_lib.h create mode 100644 lustre/include/darwin/lustre_lite.h create mode 100644 lustre/include/darwin/lustre_log.h create mode 100644 lustre/include/darwin/lustre_mds.h create mode 100644 lustre/include/darwin/lustre_net.h create mode 100644 lustre/include/darwin/lustre_quota.h create mode 100644 lustre/include/darwin/lustre_types.h create mode 100644 lustre/include/darwin/lustre_user.h create mode 100644 
lustre/include/darwin/lvfs.h create mode 100644 lustre/include/darwin/obd.h create mode 100644 lustre/include/darwin/obd_class.h create mode 100644 lustre/include/darwin/obd_support.h delete mode 100644 lustre/include/linux/lustre_acl.h delete mode 100644 lustre/include/linux/lustre_audit.h delete mode 100644 lustre/include/linux/lustre_cfg.h delete mode 100644 lustre/include/linux/lustre_cmobd.h delete mode 100644 lustre/include/linux/lustre_commit_confd.h delete mode 100644 lustre/include/linux/lustre_export.h delete mode 100644 lustre/include/linux/lustre_gs.h delete mode 100644 lustre/include/linux/lustre_ha.h delete mode 100644 lustre/include/linux/lustre_idl.h delete mode 100644 lustre/include/linux/lustre_import.h create mode 100644 lustre/include/linux/lustre_intent.h delete mode 100644 lustre/include/linux/lustre_mgmt.h create mode 100644 lustre/include/linux/lustre_patchless_compat.h create mode 100644 lustre/include/linux/lustre_quota.h delete mode 100644 lustre/include/linux/lustre_sec.h delete mode 100644 lustre/include/linux/lustre_smfs.h delete mode 100644 lustre/include/linux/lustre_snap.h create mode 100644 lustre/include/linux/lustre_types.h delete mode 100644 lustre/include/linux/lustre_ucache.h create mode 100644 lustre/include/linux/lustre_user.h delete mode 100644 lustre/include/linux/obd_echo.h delete mode 100644 lustre/include/linux/obd_lmv.h delete mode 100644 lustre/include/linux/obd_lov.h delete mode 100644 lustre/include/linux/obd_ost.h delete mode 100644 lustre/include/linux/obd_ptlbd.h delete mode 100644 lustre/include/linux/obd_trace.h create mode 100644 lustre/include/lprocfs_status.h create mode 100644 lustre/include/lustre/lustre_idl.h create mode 100644 lustre/include/lustre/types.h create mode 100644 lustre/include/lustre_cfg.h create mode 100644 lustre/include/lustre_commit_confd.h create mode 100644 lustre/include/lustre_debug.h create mode 100644 lustre/include/lustre_disk.h create mode 100644 lustre/include/lustre_dlm.h 
create mode 100644 lustre/include/lustre_export.h create mode 100644 lustre/include/lustre_fsfilt.h create mode 100644 lustre/include/lustre_ha.h create mode 100644 lustre/include/lustre_handles.h create mode 100644 lustre/include/lustre_import.h create mode 100644 lustre/include/lustre_lib.h create mode 100644 lustre/include/lustre_lite.h create mode 100644 lustre/include/lustre_log.h create mode 100644 lustre/include/lustre_mds.h create mode 100644 lustre/include/lustre_net.h create mode 100644 lustre/include/lustre_param.h create mode 100644 lustre/include/lustre_quota.h create mode 100644 lustre/include/lustre_ucache.h create mode 100644 lustre/include/lustre_ver.h.in create mode 100644 lustre/include/lvfs.h create mode 100644 lustre/include/obd.h rename lustre/include/{linux => }/obd_cache.h (100%) create mode 100644 lustre/include/obd_class.h create mode 100644 lustre/include/obd_echo.h create mode 100644 lustre/include/obd_lov.h create mode 100644 lustre/include/obd_ost.h create mode 100644 lustre/include/obd_support.h delete mode 100644 lustre/include/types.h create mode 100644 lustre/kernel_patches/LICENSE create mode 100644 lustre/kernel_patches/kernel_configs/config-linux-2.4.18-p4smp-61chaos create mode 100644 lustre/kernel_patches/kernel_configs/config-linux-2.4.20-i386-rh delete mode 100644 lustre/kernel_patches/kernel_configs/config-linux-2.6.7-uml create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.4.20-hp_pnnl-2.4-ia64-smp.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.4.20-hp_pnnl-2.4-ia64.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.4.20-rh-2.4-i686-smp.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.4.20-rh-2.4-i686.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-i686-smp.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-i686.config create mode 100644 
lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64-smp.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.4.21-sles-2.4-i686-smp.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.4.21-sles-2.4-i686.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.4.21-suse-2.4.21-2-x86_64.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.10-2.6-fc3-i686-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.10-2.6-fc3-i686.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.10-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.10-suse-opteron.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.10-uml.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.10-vmware.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.15-2.6-fc5-i686-smp.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.15-2.6-fc5-i686.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.15-fc5-i686.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-patchless-i686-smp.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-patchless-i686.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-patchless-ia64-smp.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-patchless-ia64.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-patchless-x86_64-smp.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-patchless-x86_64.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config create mode 100644 
lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64-smp.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-x86_64-smp.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-x86_64.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config create mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64.config create mode 100644 lustre/kernel_patches/kernel_configs/uml-2.6.10-fc3.config create mode 100644 lustre/kernel_patches/kernel_configs/uml-vanilla-2.4.24.config create mode 100644 lustre/kernel_patches/kernel_configs/uml-vanilla-2.6.6.config create mode 100644 lustre/kernel_patches/kernel_configs/uml_2.6.0_test3.config create mode 100644 lustre/kernel_patches/patches/2.6-rhel4-kgdb-ga.patch create mode 100644 lustre/kernel_patches/patches/2.6.5-quotafix.patch create mode 100644 lustre/kernel_patches/patches/3.5G-address-space-2.4.22-vanilla.patch create mode 100644 lustre/kernel_patches/patches/8kstack-2.6-rhel4.patch create mode 100644 lustre/kernel_patches/patches/8kstack-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/add_page_private.patch create mode 100644 lustre/kernel_patches/patches/bitops_ext2_find_next_le_bit-2.6.patch create mode 100644 
lustre/kernel_patches/patches/blkdev_tunables-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/blkdev_tunables-2.6-suse.patch create mode 100644 lustre/kernel_patches/patches/bluesmoke-2.6-suse-lnxi.patch create mode 100644 lustre/kernel_patches/patches/brk-locked-2.6-suse-lnxi.patch create mode 100644 lustre/kernel_patches/patches/compile-fixes-2.4.21-rhel.patch create mode 100644 lustre/kernel_patches/patches/compile-fixes-2.6.9-rhel4-22.patch create mode 100644 lustre/kernel_patches/patches/configurable-x86-stack-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/configurable-x86-stack-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/configurable-x86-stack-2.4.21-suse-171.patch create mode 100644 lustre/kernel_patches/patches/configurable-x86-stack-2.4.21-suse2.patch create mode 100644 lustre/kernel_patches/patches/configurable-x86_64-2.4.21.patch delete mode 100644 lustre/kernel_patches/patches/dcache-fid-2.6.7.patch delete mode 100644 lustre/kernel_patches/patches/dcache-mds-num-2.6.7.patch create mode 100644 lustre/kernel_patches/patches/dcache-qstr-api-fix-2.6-suse.patch create mode 100644 lustre/kernel_patches/patches/dcache_refcount_debug.patch create mode 100644 lustre/kernel_patches/patches/dev_read_only-2.6-fc5.patch create mode 100644 lustre/kernel_patches/patches/dev_read_only-2.6-lnxi.patch delete mode 100644 lustre/kernel_patches/patches/dev_read_only-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/dev_read_only-2.6.18-vanilla.patch create mode 100644 lustre/kernel_patches/patches/dev_read_only_2.4.20-rh.patch create mode 100644 lustre/kernel_patches/patches/dev_read_only_2.4.21-chaos.patch delete mode 100644 lustre/kernel_patches/patches/dynamic-locks-2.6.10-fc3.patch delete mode 100644 lustre/kernel_patches/patches/dynamic-locks-2.6.7.patch create mode 100644 lustre/kernel_patches/patches/elevator-cfq.patch delete mode 100644 lustre/kernel_patches/patches/export-2.6-fc3.patch 
create mode 100644 lustre/kernel_patches/patches/export-2.6-fc5.patch create mode 100644 lustre/kernel_patches/patches/export-2.6.18-vanilla.patch create mode 100644 lustre/kernel_patches/patches/export-do_kern_mount.patch create mode 100644 lustre/kernel_patches/patches/export-ext3-2.6-rhel4.patch delete mode 100644 lustre/kernel_patches/patches/export-ext3-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/export-log-2.6-rhel4.patch create mode 100644 lustre/kernel_patches/patches/export-show_task-2.4-cray.patch create mode 100644 lustre/kernel_patches/patches/export-show_task-2.4-rh.patch create mode 100644 lustre/kernel_patches/patches/export-show_task-2.4-rhel.patch create mode 100644 lustre/kernel_patches/patches/export-show_task-2.4-vanilla.patch create mode 100644 lustre/kernel_patches/patches/export-show_task-2.6-fc5.patch create mode 100644 lustre/kernel_patches/patches/export-show_task-2.6.18-vanilla.patch create mode 100644 lustre/kernel_patches/patches/export-truncate-2.6.18-vanilla.patch create mode 100644 lustre/kernel_patches/patches/export-truncate.patch delete mode 100644 lustre/kernel_patches/patches/export-vanilla-2.6.patch create mode 100644 lustre/kernel_patches/patches/export-zap-page-range.patch create mode 100644 lustre/kernel_patches/patches/export_num_siblings.patch create mode 100644 lustre/kernel_patches/patches/export_symbol_numa-2.6-fc5.patch create mode 100644 lustre/kernel_patches/patches/export_symbol_numa.patch create mode 100644 lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch create mode 100644 lustre/kernel_patches/patches/export_symbols-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/export_symbols-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/export_symbols-ext3-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/exports-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/exports_2.4.19-suse.patch create mode 100644 
lustre/kernel_patches/patches/exports_2.4.19-suse2.patch create mode 100644 lustre/kernel_patches/patches/exports_2.4.20-rh-hp.patch create mode 100644 lustre/kernel_patches/patches/ext-2.4-patch-1.patch create mode 100644 lustre/kernel_patches/patches/ext-2.4-patch-2.patch create mode 100644 lustre/kernel_patches/patches/ext-2.4-patch-3.patch create mode 100644 lustre/kernel_patches/patches/ext-2.4-patch-4.patch create mode 100644 lustre/kernel_patches/patches/ext3-2.4-ino_t.patch create mode 100644 lustre/kernel_patches/patches/ext3-2.4.20-fixes.patch create mode 100644 lustre/kernel_patches/patches/ext3-check-jbd-errors-2.6.5.patch create mode 100644 lustre/kernel_patches/patches/ext3-check-jbd-errors-2.6.9.patch create mode 100644 lustre/kernel_patches/patches/ext3-delete_thread-2.4.20-hp.patch create mode 100644 lustre/kernel_patches/patches/ext3-delete_thread-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/ext3-delete_thread-2.4.21-suse-171.patch create mode 100644 lustre/kernel_patches/patches/ext3-delete_thread-2.4.24.patch create mode 100644 lustre/kernel_patches/patches/ext3-delete_thread-2.4.29.patch delete mode 100644 lustre/kernel_patches/patches/ext3-disable-reservation-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/ext3-disable-write-barrier-by-default-2.6-sles10.patch create mode 100644 lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.21-sles.patch create mode 100644 lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.21-suse2.patch create mode 100644 lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.22-rh.patch create mode 100644 lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.29.patch delete mode 100644 lustre/kernel_patches/patches/ext3-ea-in-inode-2.6-fc3.patch create mode 100644 lustre/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch 
create mode 100644 lustre/kernel_patches/patches/ext3-error-export.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.4.21-suse2.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.4.24.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.4.29.patch delete mode 100644 lustre/kernel_patches/patches/ext3-extents-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.6.15.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/ext3-extents-2.6.7.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-asyncdel-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-asyncdel-2.4.24.patch delete mode 100755 lustre/kernel_patches/patches/ext3-extents-in-ea-2.6.10-fc3.patch delete mode 100755 lustre/kernel_patches/patches/ext3-extents-in-ea-2.6.7.patch delete mode 100755 lustre/kernel_patches/patches/ext3-extents-in-ea-exports-symbol-2.6.7.patch delete mode 100755 lustre/kernel_patches/patches/ext3-extents-in-ea-ioctl-2.6.10-fc3.patch delete mode 100755 lustre/kernel_patches/patches/ext3-extents-in-ea-ioctl-2.6.7.patch create mode 100644 lustre/kernel_patches/patches/ext3-external-journal-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/ext3-external-journal-2.6.9.patch delete mode 100644 lustre/kernel_patches/patches/ext3-fid-2.6.7.patch create mode 100644 lustre/kernel_patches/patches/ext3-filterdata-2.6.15.patch create mode 100644 lustre/kernel_patches/patches/ext3-htree-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/ext3-htree-2.4.21-rhel.patch create mode 100644 
lustre/kernel_patches/patches/ext3-htree-2.4.22-rh.patch create mode 100644 lustre/kernel_patches/patches/ext3-htree-2.4.29.patch create mode 100644 lustre/kernel_patches/patches/ext3-htree-dot-2.6.5-suse.patch create mode 100644 lustre/kernel_patches/patches/ext3-htree-dot-2.6.patch create mode 100644 lustre/kernel_patches/patches/ext3-htree-path-ops.patch create mode 100644 lustre/kernel_patches/patches/ext3-ialloc-2.4.21-suse2.patch create mode 100644 lustre/kernel_patches/patches/ext3-ialloc-2.4.24.patch create mode 100644 lustre/kernel_patches/patches/ext3-ialloc-2.6.patch create mode 100644 lustre/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch delete mode 100644 lustre/kernel_patches/patches/ext3-init-generation-2.6-suse.patch create mode 100644 lustre/kernel_patches/patches/ext3-ino_sb_macro-2.4.21-chaos.patch delete mode 100755 lustre/kernel_patches/patches/ext3-inode-reuse-2.6.7.patch create mode 100644 lustre/kernel_patches/patches/ext3-largefile.patch create mode 100644 lustre/kernel_patches/patches/ext3-lookup-dotdot-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/ext3-lookup-dotdot-2.6.9.patch create mode 100644 lustre/kernel_patches/patches/ext3-map_inode_page-2.4.21-suse2.patch create mode 100644 lustre/kernel_patches/patches/ext3-map_inode_page.patch create mode 100644 lustre/kernel_patches/patches/ext3-map_inode_page_2.4.18.patch create mode 100644 lustre/kernel_patches/patches/ext3-mballoc2-2.6-fc5.patch delete mode 100644 lustre/kernel_patches/patches/ext3-mballoc2-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/ext3-mballoc2-2.6.7.patch create mode 100644 lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch delete mode 100755 lustre/kernel_patches/patches/ext3-mds-num-2.6.10-fc3.patch delete mode 100755 
lustre/kernel_patches/patches/ext3-mds-num-2.6.7.patch create mode 100644 lustre/kernel_patches/patches/ext3-multi-mount-protection-2.6-fc5.patch create mode 100644 lustre/kernel_patches/patches/ext3-multi-mount-protection-2.6.18-vanilla.patch create mode 100644 lustre/kernel_patches/patches/ext3-nlinks-2.4.20-hp_pnnl.patch create mode 100644 lustre/kernel_patches/patches/ext3-nlinks-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/ext3-nlinks-2.4.24.patch create mode 100644 lustre/kernel_patches/patches/ext3-nlinks-2.6.9.patch create mode 100644 lustre/kernel_patches/patches/ext3-no-write-super-chaos.patch create mode 100644 lustre/kernel_patches/patches/ext3-noread-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/ext3-noread-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/ext3-noread-2.4.21-suse2.patch create mode 100644 lustre/kernel_patches/patches/ext3-o_direct-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/ext3-orphan_lock-2.4.22-rh.patch create mode 100644 lustre/kernel_patches/patches/ext3-orphan_lock.patch create mode 100644 lustre/kernel_patches/patches/ext3-patch-fuzz-fixup-fc3.patch delete mode 100644 lustre/kernel_patches/patches/ext3-pdirops-2.6.10-fc3.patch delete mode 100644 lustre/kernel_patches/patches/ext3-pdirops-2.6.7.patch delete mode 100644 lustre/kernel_patches/patches/ext3-raw-lookup-2.6.10.patch create mode 100644 lustre/kernel_patches/patches/ext3-raw-lookup.patch create mode 100644 lustre/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/ext3-rename-reserve-2.6-suse.patch delete mode 100755 lustre/kernel_patches/patches/ext3-reserve-inode-space-2.6.7.patch create mode 100644 lustre/kernel_patches/patches/ext3-san-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/ext3-sector_t-overflow-2.4.patch create mode 100644 lustre/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch create mode 
100644 lustre/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch create mode 100644 lustre/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch create mode 100644 lustre/kernel_patches/patches/ext3-statfs-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/ext3-truncate-buffer-head.patch create mode 100644 lustre/kernel_patches/patches/ext3-truncate_blocks.patch create mode 100644 lustre/kernel_patches/patches/ext3-trusted_ea-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/ext3-use-after-free.patch create mode 100644 lustre/kernel_patches/patches/ext3-wantedi-2.6-rhel4.patch delete mode 100644 lustre/kernel_patches/patches/ext3-wantedi-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/ext3-wantedi-2.6.15.patch create mode 100644 lustre/kernel_patches/patches/ext3-wantedi-misc-2.6.18-vanilla.patch create mode 100644 lustre/kernel_patches/patches/ext3-xattr-ptr-arith-fix.patch create mode 100644 lustre/kernel_patches/patches/extN-2.4.18-ino_sb_fixup.patch create mode 100644 lustre/kernel_patches/patches/extN-misc-fixup.patch create mode 100644 lustre/kernel_patches/patches/extN-wantedi-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/extN-wantedi-2.4.21-suse2.patch create mode 100644 lustre/kernel_patches/patches/extN-wantedi.patch create mode 100644 lustre/kernel_patches/patches/fc3_to_rhel4_updates.patch create mode 100644 lustre/kernel_patches/patches/fsprivate-2.4-suse.patch create mode 100644 lustre/kernel_patches/patches/fsprivate-2.4.patch create mode 100644 lustre/kernel_patches/patches/fsprivate-2.6.patch create mode 100644 lustre/kernel_patches/patches/gfp_debug-2.4.21-rhel.patch create mode 100644 lustre/kernel_patches/patches/grab_cache_page_nowait_gfp-2.4.21-suse2.patch create mode 100644 lustre/kernel_patches/patches/grab_cache_page_nowait_gfp-2.6-suse.patch create mode 100644 lustre/kernel_patches/patches/grab_cache_page_nowait_gfp-rh-2.4.patch delete mode 100644 
lustre/kernel_patches/patches/header_guards-vanilla-2.6.patch delete mode 100644 lustre/kernel_patches/patches/highmem-split-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/iallocsem_consistency.patch create mode 100644 lustre/kernel_patches/patches/inode-max-readahead-2.4.24.patch create mode 100644 lustre/kernel_patches/patches/invalidate_show-2.4.20-hp.patch create mode 100644 lustre/kernel_patches/patches/invalidate_show-2.4.20-rh.patch create mode 100644 lustre/kernel_patches/patches/invalidate_show-2.4.29.patch create mode 100644 lustre/kernel_patches/patches/invalidate_show.patch create mode 100644 lustre/kernel_patches/patches/iod-rmap-exports-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/iod-stock-24-exports_hp.patch create mode 100644 lustre/kernel_patches/patches/iod-stock-exports-2.4.22.patch create mode 100644 lustre/kernel_patches/patches/iopen-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/iopen-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/iopen-2.6-fc5.patch create mode 100644 lustre/kernel_patches/patches/iopen-2.6-rhel4.patch delete mode 100644 lustre/kernel_patches/patches/iopen-2.6-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/iopen-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/iopen-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/iopen-misc-2.6-fc3.patch create mode 100644 lustre/kernel_patches/patches/iopen-misc-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/iopen-misc-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/jbd-buffer-release-2.6.10-fc3.patch delete mode 100644 lustre/kernel_patches/patches/jbd-buffer-release-2.6.7.patch create mode 100644 lustre/kernel_patches/patches/jbd-commit-tricks-rhel3.patch create mode 100644 lustre/kernel_patches/patches/jbd-commit-tricks.patch create mode 100644 lustre/kernel_patches/patches/jbd-ctx_switch.patch create mode 100644 
lustre/kernel_patches/patches/jbd-dont-account-blocks-twice.patch create mode 100644 lustre/kernel_patches/patches/jbd-flushtime.patch create mode 100644 lustre/kernel_patches/patches/jbd-get_write_access.patch create mode 100644 lustre/kernel_patches/patches/jbd-jcberr-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/jbd-static-wbuf-2.6.7.patch create mode 100644 lustre/kernel_patches/patches/jbd-stats-2.6.13.4.patch create mode 100644 lustre/kernel_patches/patches/jbd-stats-2.6.5.patch create mode 100644 lustre/kernel_patches/patches/jbd-stats-2.6.9.patch create mode 100644 lustre/kernel_patches/patches/kallsyms-2.4.29.patch create mode 100644 lustre/kernel_patches/patches/kexec-2.6-suse-lnxi.patch delete mode 100644 lustre/kernel_patches/patches/kgdb-ga.patch create mode 100644 lustre/kernel_patches/patches/kjournald_affinity.patch create mode 100644 lustre/kernel_patches/patches/link_notlast-susefix.patch create mode 100644 lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-hp.patch create mode 100644 lustre/kernel_patches/patches/linux-2.4.21-xattr-0.8.54-chaos.patch create mode 100644 lustre/kernel_patches/patches/linux-2.4.21-xattr-0.8.54-suse-171.patch create mode 100644 lustre/kernel_patches/patches/linux-2.4.21-xattr-0.8.54-suse2.patch create mode 100644 lustre/kernel_patches/patches/linux-2.4.24-jbd-handle-EIO-rhel3.patch create mode 100644 lustre/kernel_patches/patches/linux-2.4.24-jbd-handle-EIO.patch create mode 100644 lustre/kernel_patches/patches/linux-2.4.24-xattr-0.8.54.patch create mode 100644 lustre/kernel_patches/patches/linux-2.4.29-xattr-0.8.54.patch create mode 100644 lustre/kernel_patches/patches/linux-2.6-binutils-2.16.patch delete mode 100644 lustre/kernel_patches/patches/linux-2.6.10-CITI_NFS4_ALL-1.patch delete mode 100644 lustre/kernel_patches/patches/linux-2.6.10-fc3-left.patch delete mode 100644 lustre/kernel_patches/patches/linux-2.6.10-fc3-lkcd.patch delete mode 100644 
lustre/kernel_patches/patches/linux-2.6.10-fc3-sunrpc_cacheput.patch delete mode 100644 lustre/kernel_patches/patches/linux-2.6.10-flock.patch delete mode 100644 lustre/kernel_patches/patches/linux-2.6.7-CITI_NFS4_ALL-7-lsec.patch create mode 100644 lustre/kernel_patches/patches/linux-2.6.9-ext3-sub-second-timestamp.patch create mode 100644 lustre/kernel_patches/patches/listman-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/listman-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/llnl-frame-pointer-walk-2.4.21-rhel.patch create mode 100644 lustre/kernel_patches/patches/llnl-frame-pointer-walk-fix-2.4.21-rhel.patch create mode 100644 lustre/kernel_patches/patches/lookup-stack-symbols-2.4.21-suse-171.patch create mode 100644 lustre/kernel_patches/patches/lookup_bdev_init_intent-2.6.18-vanilla.patch create mode 100644 lustre/kernel_patches/patches/loop-sync-2.4.21-suse.patch create mode 100644 lustre/kernel_patches/patches/lustre-version-revert_suse.patch create mode 100644 lustre/kernel_patches/patches/lustre_build.patch create mode 100644 lustre/kernel_patches/patches/md_path_lookup-2.6-suse create mode 100644 lustre/kernel_patches/patches/mtd-2.6-suse-lnxi.patch create mode 100644 lustre/kernel_patches/patches/netconsole-2.4.24-ppc.patch create mode 100644 lustre/kernel_patches/patches/new-tcp-zero-copy-2.4.21-rhel3.patch create mode 100644 lustre/kernel_patches/patches/new-tcp-zero-copy-2.4.29-vanilla.patch create mode 100644 lustre/kernel_patches/patches/new-tcp-zero-copy-2.6.5-sles9.patch create mode 100644 lustre/kernel_patches/patches/new-tcp-zero-copy-2.6.9-41.2chaos.patch create mode 100644 lustre/kernel_patches/patches/nfs-cifs-intent-2.6-fc3.patch create mode 100644 lustre/kernel_patches/patches/nfs-cifs-intent-2.6-fc5.patch create mode 100644 lustre/kernel_patches/patches/nfs-cifs-intent-2.6-rhel4.patch delete mode 100644 lustre/kernel_patches/patches/nfs-cifs-intent-2.6-vanilla.patch create mode 100644 
lustre/kernel_patches/patches/nfs-cifs-intent-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/nfs_export_kernel-2.4.20-hp.patch create mode 100644 lustre/kernel_patches/patches/nfs_export_kernel-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/nfs_export_kernel-2.4.21-suse2.patch create mode 100644 lustre/kernel_patches/patches/nfs_export_kernel-2.4.22.patch create mode 100644 lustre/kernel_patches/patches/nfs_export_kernel-2.4.29.patch create mode 100644 lustre/kernel_patches/patches/nfs_export_kernel-2.4.29.patch-1 create mode 100644 lustre/kernel_patches/patches/nfs_statfs-toomanyfiles-rhel-2.4.patch create mode 100644 lustre/kernel_patches/patches/nfsd_iallocsem.patch delete mode 100644 lustre/kernel_patches/patches/nid-2.6-fc3.patch delete mode 100644 lustre/kernel_patches/patches/pag-basic-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/pagecache-lock-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/perfctr-2.6-suse-lnxi.patch create mode 100644 lustre/kernel_patches/patches/procfs-ndynamic-2.4.21-suse2.patch create mode 100644 lustre/kernel_patches/patches/procfs-ndynamic-2.4.patch create mode 100644 lustre/kernel_patches/patches/qsnet-rhel-2.4.patch create mode 100644 lustre/kernel_patches/patches/qsnet-rhel4-2.6.patch create mode 100644 lustre/kernel_patches/patches/qsnet-suse-2.6.patch create mode 100644 lustre/kernel_patches/patches/quota-deadlock-on-pagelock-core.patch create mode 100644 lustre/kernel_patches/patches/quota-deadlock-on-pagelock-ext3.patch create mode 100644 lustre/kernel_patches/patches/quota-umount-race-fix.patch create mode 100644 lustre/kernel_patches/patches/raid5-configurable-cachesize.patch create mode 100644 lustre/kernel_patches/patches/raid5-large-io.patch create mode 100644 lustre/kernel_patches/patches/raid5-merge-ios.patch create mode 100644 lustre/kernel_patches/patches/raid5-optimize-memcpy.patch create mode 100644 
lustre/kernel_patches/patches/raid5-serialize-ovelapping-reqs.patch create mode 100644 lustre/kernel_patches/patches/raid5-stats.patch create mode 100644 lustre/kernel_patches/patches/raid5-stripe-by-stripe-handling.patch create mode 100644 lustre/kernel_patches/patches/remove-suid-2.4-rhel.patch create mode 100644 lustre/kernel_patches/patches/remove-suid-2.6-suse.patch create mode 100644 lustre/kernel_patches/patches/removepage-2.4.20.patch delete mode 100644 lustre/kernel_patches/patches/revalide-special-oops-2.6.4.suse.patch delete mode 100644 lustre/kernel_patches/patches/scsi-max-phys-segments-256.patch create mode 100644 lustre/kernel_patches/patches/sd_iostats-2.4.21-chaos.patch create mode 100644 lustre/kernel_patches/patches/sd_iostats-2.6-suse.patch create mode 100644 lustre/kernel_patches/patches/slab-use-after-free-debug-2.4.24.patch create mode 100644 lustre/kernel_patches/patches/socket-exports-vanilla.patch create mode 100644 lustre/kernel_patches/patches/statfs64-cast-unsigned-2.4-rhel.patch create mode 100644 lustre/kernel_patches/patches/tcp-rto_proc-2.6.9.patch create mode 100644 lustre/kernel_patches/patches/tcp-zero-copy-2.6-fc5.patch create mode 100644 lustre/kernel_patches/patches/tcp-zero-copy-2.6-sles10.patch create mode 100644 lustre/kernel_patches/patches/tcp-zero-copy-2.6.18-vanilla.patch create mode 100644 lustre/kernel_patches/patches/uml-2.4.20-do_mmap_pgoff-fix.patch delete mode 100644 lustre/kernel_patches/patches/uml-2.6.7-01-bb2.patch create mode 100644 lustre/kernel_patches/patches/uml-export-end_iomem.patch create mode 100644 lustre/kernel_patches/patches/uml-exprt-clearuser-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/uml-exprt-clearuser.patch create mode 100644 lustre/kernel_patches/patches/uml-patch-2.4.24-1.patch create mode 100644 lustre/kernel_patches/patches/uml-patch-2.4.29-1.patch create mode 100644 lustre/kernel_patches/patches/uml-sigusr1-2.4-vanilla.patch delete mode 100644 
lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.10-fc3.patch delete mode 100644 lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.patch delete mode 100644 lustre/kernel_patches/patches/vfs-dcache_lustre_invalid-vanilla-2.6.patch delete mode 100644 lustre/kernel_patches/patches/vfs-do_truncate.patch delete mode 100644 lustre/kernel_patches/patches/vfs-gns_export_doumount-2.6.10-fc3.patch delete mode 100644 lustre/kernel_patches/patches/vfs-gns_export_doumount.patch delete mode 100644 lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.10-fc3.patch delete mode 100644 lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.patch delete mode 100644 lustre/kernel_patches/patches/vfs-intent_release_umount-vanilla-2.6.10-fc3.patch delete mode 100644 lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.10-fc3.patch delete mode 100644 lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.patch delete mode 100644 lustre/kernel_patches/patches/vfs-pdirops-2.6.10-fc3.patch delete mode 100644 lustre/kernel_patches/patches/vfs-pdirops-2.6.7.patch delete mode 100644 lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.10-fc3.patch delete mode 100644 lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.patch delete mode 100644 lustre/kernel_patches/patches/vfs-umount_lustre-vanilla-2.6.10-fc3.patch delete mode 100644 lustre/kernel_patches/patches/vfs-wantedi-misc-2.6-suse.patch delete mode 100644 lustre/kernel_patches/patches/vfs_fmode_exec-2.6.patch delete mode 100644 lustre/kernel_patches/patches/vfs_gns-2.6-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/vfs_gns-2.6.10-fc3.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.4.20-hp.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.4.20-vanilla.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.4.21-rhel.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.4.21-suse-171.patch create mode 100644 
lustre/kernel_patches/patches/vfs_intent-2.4.21-suse2.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.4.29-vanilla.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6-fc3.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6-fc5-fix.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6-fc5.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6-rhel4.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6-sles10.patch delete mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6-vanilla.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6.18-vanilla.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-reduce-stack-usage-2.6-suse-newer.patch delete mode 100644 lustre/kernel_patches/patches/vfs_lookup_in_file-2.6.patch create mode 100644 lustre/kernel_patches/patches/vfs_nointent-2.6-fc5.patch create mode 100644 lustre/kernel_patches/patches/vfs_nointent-2.6-rhel4.patch create mode 100644 lustre/kernel_patches/patches/vfs_nointent-2.6-sles10.patch delete mode 100644 lustre/kernel_patches/patches/vfs_nointent-2.6-vanilla.patch create mode 100644 lustre/kernel_patches/patches/vfs_nointent-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/vfs_nointent-2.6.18-vanilla.patch create mode 100644 lustre/kernel_patches/patches/vfs_races-2.6-fc3.patch rename lustre/kernel_patches/patches/{vfs_races-2.6-vanilla.patch => vfs_races-2.6-rhel4.patch} (100%) create mode 100644 lustre/kernel_patches/patches/vfs_races-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/vfs_races-2.6.18-vanilla.patch create mode 100644 lustre/kernel_patches/patches/vm-tunables-rhel4.patch delete mode 100755 lustre/kernel_patches/prepare_tree.sh delete mode 100755 lustre/kernel_patches/scripts/added-by-patch delete mode 100755 lustre/kernel_patches/scripts/apatch delete mode 100755 
lustre/kernel_patches/scripts/cat-series delete mode 100755 lustre/kernel_patches/scripts/combine-applied delete mode 100755 lustre/kernel_patches/scripts/combine-series delete mode 100755 lustre/kernel_patches/scripts/cvs-take-patch delete mode 100755 lustre/kernel_patches/scripts/export_patch delete mode 100755 lustre/kernel_patches/scripts/extract_description delete mode 100755 lustre/kernel_patches/scripts/forkpatch delete mode 100755 lustre/kernel_patches/scripts/fpatch delete mode 100755 lustre/kernel_patches/scripts/import_patch delete mode 100755 lustre/kernel_patches/scripts/inpatch delete mode 100755 lustre/kernel_patches/scripts/join-patch delete mode 100755 lustre/kernel_patches/scripts/linus-patch delete mode 100755 lustre/kernel_patches/scripts/mpatch delete mode 100755 lustre/kernel_patches/scripts/new-kernel delete mode 100755 lustre/kernel_patches/scripts/p0-2-p1 delete mode 100755 lustre/kernel_patches/scripts/p_diff delete mode 100755 lustre/kernel_patches/scripts/patchdesc delete mode 100644 lustre/kernel_patches/scripts/patchfns delete mode 100755 lustre/kernel_patches/scripts/pcpatch delete mode 100755 lustre/kernel_patches/scripts/poppatch delete mode 100755 lustre/kernel_patches/scripts/prep-patch delete mode 100755 lustre/kernel_patches/scripts/pstatus delete mode 100755 lustre/kernel_patches/scripts/ptkdiff delete mode 100755 lustre/kernel_patches/scripts/pushpatch delete mode 100755 lustre/kernel_patches/scripts/refpatch delete mode 100755 lustre/kernel_patches/scripts/removed-by-patch delete mode 100755 lustre/kernel_patches/scripts/rename-patch delete mode 100755 lustre/kernel_patches/scripts/rolled-up-patch delete mode 100755 lustre/kernel_patches/scripts/rpatch delete mode 100755 lustre/kernel_patches/scripts/split-patch delete mode 100755 lustre/kernel_patches/scripts/sum-series delete mode 100755 lustre/kernel_patches/scripts/tag-series delete mode 100755 lustre/kernel_patches/scripts/toppatch delete mode 100755 
lustre/kernel_patches/scripts/touched-by-patch delete mode 100755 lustre/kernel_patches/scripts/trypatch delete mode 100755 lustre/kernel_patches/scripts/unitdiff.py delete mode 100755 lustre/kernel_patches/scripts/unused-patches delete mode 100644 lustre/kernel_patches/series/2.6-fc3-uml.series create mode 100644 lustre/kernel_patches/series/2.6-fc5.series create mode 100644 lustre/kernel_patches/series/2.6-rhel4-titech.series create mode 100644 lustre/kernel_patches/series/2.6-rhel4.series create mode 100644 lustre/kernel_patches/series/2.6-sles10.series create mode 100644 lustre/kernel_patches/series/2.6-suse-newer.series delete mode 100644 lustre/kernel_patches/series/2.6-vanilla.series create mode 100644 lustre/kernel_patches/series/2.6.12-vanilla.series create mode 100644 lustre/kernel_patches/series/2.6.18-vanilla.series create mode 100644 lustre/kernel_patches/series/hp-pnnl-2.4.20 create mode 100644 lustre/kernel_patches/series/ldiskfs-2.6-fc5.series create mode 100644 lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series create mode 100644 lustre/kernel_patches/series/ldiskfs-2.6-sles10.series delete mode 100644 lustre/kernel_patches/series/ldiskfs-2.6-vanilla.series create mode 100644 lustre/kernel_patches/series/ldiskfs-2.6.12-vanilla.series create mode 100644 lustre/kernel_patches/series/ldiskfs-2.6.18-vanilla.series create mode 100644 lustre/kernel_patches/series/rhel-2.4.21 create mode 100644 lustre/kernel_patches/series/suse-2.4.21-cray create mode 100644 lustre/kernel_patches/series/vanilla-2.4.24 create mode 100644 lustre/kernel_patches/series/vanilla-2.4.29 create mode 100644 lustre/kernel_patches/series/vanilla-2.4.29-uml delete mode 100644 lustre/kernel_patches/targets/2.6-fc3.target.in create mode 100644 lustre/kernel_patches/targets/2.6-fc5.target.in create mode 100644 lustre/kernel_patches/targets/2.6-patchless.target.in create mode 100644 lustre/kernel_patches/targets/2.6-rhel4.target.in create mode 100644 
lustre/kernel_patches/targets/2.6-suse.target.in create mode 100644 lustre/kernel_patches/targets/2.6-vanilla.target.in create mode 100644 lustre/kernel_patches/targets/hp_pnnl-2.4.target.in create mode 100644 lustre/kernel_patches/targets/rh-2.4.target.in create mode 100644 lustre/kernel_patches/targets/rhel-2.4.target.in create mode 100644 lustre/kernel_patches/targets/sles-2.4.target.in create mode 100644 lustre/kernel_patches/targets/suse-2.4.21-2.target.in delete mode 100644 lustre/kernel_patches/txt/dev_read_only.txt delete mode 100644 lustre/kernel_patches/txt/exports.txt delete mode 100644 lustre/kernel_patches/txt/exports_hp.txt delete mode 100644 lustre/kernel_patches/txt/ext3-2.4.20-fixes.txt delete mode 100644 lustre/kernel_patches/txt/ext3-map_inode_page.txt delete mode 100644 lustre/kernel_patches/txt/ext3-map_inode_page_2.4.18.txt delete mode 100644 lustre/kernel_patches/txt/invalidate_show.txt delete mode 100644 lustre/kernel_patches/txt/kmem_cache_validate.txt delete mode 100644 lustre/kernel_patches/txt/lustre_version.txt delete mode 100644 lustre/kernel_patches/txt/uml_check_get_page.txt delete mode 100644 lustre/kernel_patches/txt/uml_no_panic.txt create mode 100644 lustre/ldiskfs/lustre_quota_fmt.c create mode 100644 lustre/ldiskfs/lustre_quota_fmt.h create mode 100644 lustre/ldiskfs/quotafmt_test.c delete mode 100644 lustre/ldlm/doc/dld.lyx delete mode 100644 lustre/ldlm/ldlm_test.c delete mode 100644 lustre/liblustre/doc/dld.lyx delete mode 100644 lustre/llite/doc/dld.lyx delete mode 100644 lustre/llite/llite_audit.c delete mode 100644 lustre/llite/llite_capa.c delete mode 100644 lustre/llite/llite_gns.c delete mode 100644 lustre/llite/llite_gs.c delete mode 100644 lustre/llite/special.c create mode 100644 lustre/llite/xattr.c delete mode 100644 lustre/lmv/.cvsignore delete mode 100644 lustre/lmv/Makefile.in delete mode 100644 lustre/lmv/autoMakefile.am delete mode 100644 lustre/lmv/doc/dld.lyx delete mode 100644 lustre/lmv/lmv_intent.c 
delete mode 100644 lustre/lmv/lmv_internal.h delete mode 100644 lustre/lmv/lmv_obd.c delete mode 100644 lustre/lmv/lmv_objmgr.c delete mode 100644 lustre/lmv/lproc_lmv.c create mode 100644 lustre/lov/Info.plist delete mode 100644 lustre/lov/doc/dld.lyx create mode 100755 lustre/lov/lov_ea.c create mode 100644 lustre/lvfs/Info.plist delete mode 100644 lustre/lvfs/doc/dld.lyx delete mode 100644 lustre/lvfs/fsfilt_smfs.c delete mode 100644 lustre/lvfs/fsfilt_snap_ext3.c delete mode 100644 lustre/lvfs/fsfilt_snap_smfs.c delete mode 100644 lustre/lvfs/llog.c delete mode 100644 lustre/lvfs/llog_cat.c delete mode 100644 lustre/lvfs/llog_lvfs.c create mode 100644 lustre/lvfs/lvfs_darwin.c delete mode 100644 lustre/lvfs/lvfs_reint.c delete mode 100644 lustre/lvfs/lvfs_undo.c create mode 100644 lustre/lvfs/upcall_cache.c delete mode 100644 lustre/mdc/doc/dld.lyx delete mode 100644 lustre/mds/doc/dld.lyx delete mode 100644 lustre/mds/mds_acl.c delete mode 100644 lustre/mds/mds_audit.c delete mode 100644 lustre/mds/mds_audit_path.c delete mode 100644 lustre/mds/mds_capa.c create mode 100644 lustre/mds/mds_join.c delete mode 100644 lustre/mds/mds_lmv.c delete mode 100644 lustre/mds/mds_lsd.c create mode 100644 lustre/mds/mds_xattr.c rename lustre/{cmobd => mgc}/.cvsignore (100%) create mode 100644 lustre/mgc/Makefile.in create mode 100644 lustre/mgc/autoMakefile.am create mode 100644 lustre/mgc/libmgc.c create mode 100644 lustre/mgc/mgc_request.c delete mode 100644 lustre/mgmt/.cvsignore delete mode 100644 lustre/mgmt/Makefile.in delete mode 100644 lustre/mgmt/autoMakefile.am delete mode 100644 lustre/mgmt/mgmt_cli.c delete mode 100644 lustre/mgmt/mgmt_svc.c rename lustre/{sec => mgs}/.cvsignore (100%) create mode 100644 lustre/mgs/Makefile.in create mode 100644 lustre/mgs/autoMakefile.am create mode 100644 lustre/mgs/lproc_mgs.c create mode 100644 lustre/mgs/mgs_fs.c create mode 100644 lustre/mgs/mgs_handler.c create mode 100644 lustre/mgs/mgs_internal.h create mode 100644 
lustre/mgs/mgs_llog.c create mode 100644 lustre/obdclass/Info.plist delete mode 100644 lustre/obdclass/capa.c delete mode 100644 lustre/obdclass/confobd.c create mode 100644 lustre/obdclass/darwin/.cvsignore create mode 100644 lustre/obdclass/darwin/Makefile.am create mode 100644 lustre/obdclass/darwin/darwin-module.c create mode 100644 lustre/obdclass/darwin/darwin-sysctl.c delete mode 100644 lustre/obdclass/doc/dld.lyx create mode 100644 lustre/obdclass/linux/.cvsignore create mode 100644 lustre/obdclass/linux/Makefile.am create mode 100644 lustre/obdclass/linux/linux-module.c create mode 100644 lustre/obdclass/linux/linux-obdo.c create mode 100644 lustre/obdclass/linux/linux-sysctl.c create mode 100644 lustre/obdclass/llog.c create mode 100644 lustre/obdclass/llog_cat.c create mode 100644 lustre/obdclass/llog_internal.h create mode 100644 lustre/obdclass/llog_lvfs.c delete mode 100644 lustre/obdclass/mea.c create mode 100644 lustre/obdclass/obd_mount.c create mode 100644 lustre/obdclass/prng.c delete mode 100644 lustre/obdclass/sysctl.c create mode 100644 lustre/obdecho/Info.plist delete mode 100644 lustre/obdecho/doc/dld.lyx delete mode 100644 lustre/obdfilter/doc/dld.lyx delete mode 100644 lustre/obdfilter/filter_capa.c delete mode 100644 lustre/obdfilter/filter_san.c create mode 100644 lustre/osc/Info.plist delete mode 100644 lustre/osc/doc/dld.lyx delete mode 100644 lustre/osc/osc_lib.c delete mode 100644 lustre/ost/doc/dld.lyx create mode 100644 lustre/ost/ost_internal.h delete mode 100644 lustre/ptlbd/.cvsignore delete mode 100644 lustre/ptlbd/Makefile.in delete mode 100644 lustre/ptlbd/autoMakefile.am delete mode 100644 lustre/ptlbd/blk.c delete mode 100644 lustre/ptlbd/client.c delete mode 100644 lustre/ptlbd/main.c delete mode 100644 lustre/ptlbd/rpc.c delete mode 100644 lustre/ptlbd/server.c create mode 100644 lustre/ptlrpc/Info.plist delete mode 100644 lustre/ptlrpc/doc/dld.lyx create mode 100644 lustre/ptlrpc/wirehdr.c create mode 100644 
lustre/ptlrpc/wiretest.c rename lustre/{sec/gss => quota}/.cvsignore (100%) create mode 100644 lustre/quota/Makefile.in create mode 100644 lustre/quota/autoMakefile.am create mode 100644 lustre/quota/quota_check.c create mode 100644 lustre/quota/quota_context.c create mode 100644 lustre/quota/quota_ctl.c create mode 100644 lustre/quota/quota_interface.c create mode 100644 lustre/quota/quota_internal.h create mode 100644 lustre/quota/quota_master.c create mode 100644 lustre/quota/quotacheck_test.c create mode 100644 lustre/quota/quotactl_test.c delete mode 100644 lustre/scripts/collect-stats.sh delete mode 100755 lustre/scripts/cvs-modified-files.pl delete mode 100755 lustre/scripts/cvsdiffclient delete mode 100644 lustre/scripts/cvsrc delete mode 100755 lustre/scripts/graph-rpcs.sh create mode 100644 lustre/scripts/lc_cluman.sh.in create mode 100644 lustre/scripts/lc_common.sh create mode 100644 lustre/scripts/lc_hb.sh.in create mode 100644 lustre/scripts/lc_lvm.sh.in create mode 100644 lustre/scripts/lc_md.sh.in create mode 100644 lustre/scripts/lc_modprobe.sh.in create mode 100755 lustre/scripts/lc_mon.sh create mode 100644 lustre/scripts/lc_net.sh.in create mode 100755 lustre/scripts/lc_servip.sh delete mode 100644 lustre/scripts/linux-merge-config.awk delete mode 100644 lustre/scripts/linux-merge-modules.awk delete mode 100644 lustre/scripts/linux-rhconfig.h create mode 100644 lustre/scripts/lmc2csv.pl create mode 100644 lustre/scripts/lustre_config.sh.in create mode 100644 lustre/scripts/lustre_createcsv.sh.in create mode 100755 lustre/scripts/lustre_req_history.sh create mode 100755 lustre/scripts/lustre_rmmod.sh create mode 100755 lustre/scripts/lustre_up14.sh delete mode 100644 lustre/scripts/suse-functions.sh delete mode 100644 lustre/scripts/suse-post.sh delete mode 100644 lustre/scripts/suse-postun.sh delete mode 100644 lustre/scripts/suse-trigger-script.sh.in delete mode 100644 lustre/sec/Makefile.in delete mode 100644 lustre/sec/Makefile.mk delete mode 
100644 lustre/sec/autoMakefile.am delete mode 100644 lustre/sec/doc/oss_gss_HLD.lyx delete mode 100644 lustre/sec/doc/remote_ugid_HLD.lyx delete mode 100644 lustre/sec/doc/revoke_user_HLD.lyx delete mode 100644 lustre/sec/gks/Makefile.in delete mode 100644 lustre/sec/gks/Makefile.mk delete mode 100644 lustre/sec/gks/autoMakefile.am delete mode 100644 lustre/sec/gks/gks_client.c delete mode 100644 lustre/sec/gks/gks_internal.h delete mode 100644 lustre/sec/gks/gks_server.c delete mode 100644 lustre/sec/gks/lproc_gks.c delete mode 100644 lustre/sec/gss/Makefile.in delete mode 100644 lustre/sec/gss/Makefile.mk delete mode 100644 lustre/sec/gss/autoMakefile.am delete mode 100644 lustre/sec/gss/gss_api.h delete mode 100644 lustre/sec/gss/gss_asn1.h delete mode 100644 lustre/sec/gss/gss_err.h delete mode 100644 lustre/sec/gss/gss_generic_token.c delete mode 100644 lustre/sec/gss/gss_internal.h delete mode 100644 lustre/sec/gss/gss_krb5.h delete mode 100644 lustre/sec/gss/gss_krb5_crypto.c delete mode 100644 lustre/sec/gss/gss_krb5_mech.c delete mode 100644 lustre/sec/gss/gss_krb5_seal.c delete mode 100644 lustre/sec/gss/gss_krb5_seqnum.c delete mode 100644 lustre/sec/gss/gss_krb5_unseal.c delete mode 100644 lustre/sec/gss/gss_krb5_wrap.c delete mode 100644 lustre/sec/gss/gss_mech_switch.c delete mode 100644 lustre/sec/gss/rawobj.c delete mode 100644 lustre/sec/gss/sec_gss.c delete mode 100644 lustre/sec/gss/svcsec_gss.c delete mode 100644 lustre/sec/sec.c delete mode 100644 lustre/sec/sec_null.c delete mode 100644 lustre/sec/svcsec.c delete mode 100644 lustre/sec/svcsec_null.c delete mode 100644 lustre/sec/upcall_cache.c delete mode 100644 lustre/smfs/.cvsignore delete mode 100644 lustre/smfs/Makefile.in delete mode 100644 lustre/smfs/audit.c delete mode 100644 lustre/smfs/audit_mds.c delete mode 100644 lustre/smfs/audit_ost.c delete mode 100644 lustre/smfs/audit_transfer.c delete mode 100644 lustre/smfs/autoMakefile.am delete mode 100644 lustre/smfs/cache.c delete mode 
100644 lustre/smfs/cache_space.c delete mode 100644 lustre/smfs/dir.c delete mode 100644 lustre/smfs/doc/dld.lyx delete mode 100644 lustre/smfs/doc/hld.lyx delete mode 100644 lustre/smfs/file.c delete mode 100644 lustre/smfs/fsfilt.c delete mode 100644 lustre/smfs/inode.c delete mode 100644 lustre/smfs/ioctl.c delete mode 100644 lustre/smfs/kml.c delete mode 100644 lustre/smfs/mds_kml.c delete mode 100644 lustre/smfs/options.c delete mode 100644 lustre/smfs/ost_kml.c delete mode 100644 lustre/smfs/smfs_api.h delete mode 100644 lustre/smfs/smfs_cow.c delete mode 100644 lustre/smfs/smfs_internal.h delete mode 100644 lustre/smfs/smfs_lib.c delete mode 100644 lustre/smfs/smfs_llog.c delete mode 100644 lustre/smfs/super.c delete mode 100644 lustre/smfs/symlink.c delete mode 100644 lustre/smfs/sysctl.c delete mode 100644 lustre/snapfs/.cvsignore delete mode 100644 lustre/snapfs/Makefile.in delete mode 100644 lustre/snapfs/autoMakefile.am delete mode 100644 lustre/snapfs/cache.c delete mode 100644 lustre/snapfs/clonefs.c delete mode 100644 lustre/snapfs/dcache.c delete mode 100644 lustre/snapfs/dir.c delete mode 100644 lustre/snapfs/dotsnap.c delete mode 100644 lustre/snapfs/file.c delete mode 100644 lustre/snapfs/filter.c delete mode 100644 lustre/snapfs/inode.c delete mode 100644 lustre/snapfs/journal_ext3.c delete mode 100644 lustre/snapfs/options.c delete mode 100644 lustre/snapfs/psdev.c delete mode 100644 lustre/snapfs/snap.c delete mode 100644 lustre/snapfs/snapfs_internal.h delete mode 100644 lustre/snapfs/snapfs_support.h delete mode 100644 lustre/snapfs/snaptable.c delete mode 100644 lustre/snapfs/super.c delete mode 100644 lustre/snapfs/symlink.c delete mode 100644 lustre/snapfs/sysctl.c delete mode 100644 lustre/snapfs/utils/.cvsignore delete mode 100644 lustre/snapfs/utils/Makefile.am delete mode 100644 lustre/snapfs/utils/parser.c delete mode 100644 lustre/snapfs/utils/parser.h delete mode 100644 lustre/snapfs/utils/snapconf.c delete mode 100644 
lustre/snapfs/utils/snapctl.c delete mode 100644 lustre/snapfs/utils/snapctl.h create mode 100644 lustre/tests/2ost.sh delete mode 100644 lustre/tests/cfg/insanity-lmv.sh delete mode 100644 lustre/tests/cfg/lmv.sh create mode 100644 lustre/tests/cfg/lov.sh delete mode 100644 lustre/tests/cfg/mdev.sh delete mode 100644 lustre/tests/cfg/smfs.sh delete mode 100644 lustre/tests/checkstack.pl create mode 100644 lustre/tests/chownmany.c delete mode 100644 lustre/tests/cmobd.sh delete mode 100644 lustre/tests/cobd_test.sh delete mode 100644 lustre/tests/copy_attr.c create mode 100644 lustre/tests/flock.c create mode 100644 lustre/tests/flock_test.c create mode 100644 lustre/tests/flocks_test.c delete mode 100755 lustre/tests/gns-upcall.sh create mode 100755 lustre/tests/kbuild delete mode 100755 lustre/tests/krb5_env.sh delete mode 100755 lustre/tests/krb5_refresh_cache.sh delete mode 100755 lustre/tests/lfsck_config.sh delete mode 100644 lustre/tests/lfscktest_config.sh delete mode 100755 lustre/tests/llmount-upcall.sh create mode 100644 lustre/tests/llog-test.sh delete mode 100755 lustre/tests/llrmount.sh delete mode 100755 lustre/tests/lmv.sh delete mode 100755 lustre/tests/local-large-inode.sh delete mode 100755 lustre/tests/local.sh delete mode 100755 lustre/tests/lov.sh delete mode 100755 lustre/tests/lsmfs.sh delete mode 100755 lustre/tests/lsnap.sh delete mode 100755 lustre/tests/mcr-individual-ost-nogw-config.sh delete mode 100755 lustre/tests/mcr-mds-failover-config.sh delete mode 100755 lustre/tests/mcr-routed-config.sh delete mode 100755 lustre/tests/mcrlov.sh delete mode 100755 lustre/tests/mdsadd.sh delete mode 100644 lustre/tests/open_delay.c create mode 100644 lustre/tests/random-reads.c delete mode 100755 lustre/tests/recovery-small-upcall.sh delete mode 100755 lustre/tests/replay-ost-upcall.sh delete mode 100755 lustre/tests/replay-sanity.sh delete mode 100755 lustre/tests/replay-single-lmv.sh delete mode 100755 lustre/tests/replay-single-upcall.sh 
create mode 100644 lustre/tests/routed.sh create mode 100644 lustre/tests/run-quotacheck.sh create mode 100644 lustre/tests/run-quotactl.sh create mode 100644 lustre/tests/run-quotafmt.sh delete mode 100755 lustre/tests/run_lfscktest.sh delete mode 100755 lustre/tests/runacltest delete mode 100755 lustre/tests/runregression-mds.sh delete mode 100644 lustre/tests/sanity-cmobd.sh delete mode 100755 lustre/tests/sanity-crypto.sh delete mode 100644 lustre/tests/sanity-fid.sh delete mode 100644 lustre/tests/sanity-gns.sh delete mode 100644 lustre/tests/sanity-ldlm.sh delete mode 100644 lustre/tests/sanity-lmv.sh delete mode 100755 lustre/tests/sanity-ost_add_del.sh create mode 100644 lustre/tests/sanity-quota.sh delete mode 100644 lustre/tests/sanity-sec.sh delete mode 100644 lustre/tests/setfacl.test delete mode 100755 lustre/tests/smfs.sh delete mode 100644 lustre/tests/tbox.sh delete mode 100755 lustre/tests/test.c delete mode 100644 lustre/tests/test45-mountain.sh delete mode 100644 lustre/tests/test45.sh delete mode 100644 lustre/tests/uml.sh delete mode 100644 lustre/tests/uml_clone.sh delete mode 100755 lustre/tests/upcall create mode 100644 lustre/tests/writemany.c delete mode 100644 lustre/utils/Lustre/.cvsignore delete mode 100644 lustre/utils/Lustre/Makefile.am delete mode 100644 lustre/utils/Lustre/__init__.py delete mode 100644 lustre/utils/Lustre/cmdline.py delete mode 100644 lustre/utils/Lustre/error.py delete mode 100644 lustre/utils/Lustre/lustredb.py delete mode 100755 lustre/utils/automatic-reconnect-sample delete mode 100644 lustre/utils/doc/dld.lyx delete mode 100755 lustre/utils/ha_assist.sh delete mode 100755 lustre/utils/ha_assist2.sh create mode 100644 lustre/utils/l_getgroups.c delete mode 100644 lustre/utils/lacl_upcall.c delete mode 100644 lustre/utils/lactive delete mode 100755 lustre/utils/lconf delete mode 100755 lustre/utils/lfind delete mode 100644 lustre/utils/lkinit.c delete mode 100644 lustre/utils/llmount.c create mode 100644 
lustre/utils/llog_reader.c create mode 100644 lustre/utils/llverdev.c create mode 100644 lustre/utils/llverfs.c delete mode 100755 lustre/utils/lmc delete mode 100755 lustre/utils/load_ldap.sh create mode 100644 lustre/utils/loadgen.c create mode 100644 lustre/utils/lr_reader.c delete mode 100644 lustre/utils/lsd_upcall.c delete mode 100755 lustre/utils/lstripe delete mode 100755 lustre/utils/lwizard delete mode 100755 lustre/utils/mds-failover-sample create mode 100644 lustre/utils/mkfs_lustre.c create mode 100755 lustre/utils/module_cleanup.sh create mode 100755 lustre/utils/module_setup.sh create mode 100644 lustre/utils/mount_lustre.c create mode 100644 lustre/utils/platform.h create mode 100755 lustre/utils/plot-llstat.pl diff --git a/ldiskfs/kernel_patches/patches/export-ext3-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/export-ext3-2.6-rhel4.patch new file mode 100644 index 0000000..7b4f0c8 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/export-ext3-2.6-rhel4.patch @@ -0,0 +1,33 @@ +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2005-02-25 14:50:46.077845320 +0200 ++++ linux-stage/fs/ext3/super.c 2005-02-25 14:51:32.241827328 +0200 +@@ -123,6 +123,8 @@ + journal_abort_handle(handle); + } + ++EXPORT_SYMBOL(ext3_journal_abort_handle); ++ + /* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. 
+ * +@@ -2002,6 +2004,8 @@ + return ret; + } + ++EXPORT_SYMBOL(ext3_force_commit); ++ + /* + * Ext3 always journals updates to the superblock itself, so we don't + * have to propagate any other updates to the superblock on disk at this +@@ -2433,6 +2437,10 @@ + unsigned long *blocks, int *created, int create); + EXPORT_SYMBOL(ext3_map_inode_page); + ++EXPORT_SYMBOL(ext3_xattr_get); ++EXPORT_SYMBOL(ext3_xattr_set_handle); ++EXPORT_SYMBOL(ext3_bread); ++ + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); + MODULE_LICENSE("GPL"); diff --git a/ldiskfs/kernel_patches/patches/export_symbols-ext3-2.6-suse.patch b/ldiskfs/kernel_patches/patches/export_symbols-ext3-2.6-suse.patch index 2327263..294a9cd 100644 --- a/ldiskfs/kernel_patches/patches/export_symbols-ext3-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/export_symbols-ext3-2.6-suse.patch @@ -1,7 +1,7 @@ -Index: linux-2.6.0/include/linux/ext3_fs_sb.h +Index: linux-2.6.5-sles9/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.0.orig/include/linux/ext3_fs_sb.h 2003-06-24 18:05:26.000000000 +0400 -+++ linux-2.6.0/include/linux/ext3_fs_sb.h 2003-12-31 01:09:26.000000000 +0300 +--- linux-2.6.5-sles9.orig/include/linux/ext3_fs_sb.h 2004-11-03 08:36:51.000000000 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_fs_sb.h 2004-11-09 02:20:51.598024096 +0300 @@ -19,9 +19,12 @@ #ifdef __KERNEL__ #include @@ -12,6 +12,6 @@ Index: linux-2.6.0/include/linux/ext3_fs_sb.h #include #endif +#endif + #include /* - * third extended-fs super-block data in memory diff --git a/ldiskfs/kernel_patches/patches/ext3-check-jbd-errors-2.6.5.patch b/ldiskfs/kernel_patches/patches/ext3-check-jbd-errors-2.6.5.patch new file mode 100644 index 0000000..e54774f --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-check-jbd-errors-2.6.5.patch @@ -0,0 +1,113 
@@ +Index: linux-2.6.5-7.201-full/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.5-7.201-full.orig/include/linux/ext3_fs.h 2006-08-09 17:59:34.000000000 +0400 ++++ linux-2.6.5-7.201-full/include/linux/ext3_fs.h 2006-08-22 12:35:55.000000000 +0400 +@@ -793,6 +793,7 @@ extern void ext3_put_super (struct super + extern void ext3_write_super (struct super_block *); + extern void ext3_write_super_lockfs (struct super_block *); + extern void ext3_unlockfs (struct super_block *); ++extern void ext3_commit_super (struct super_block *, struct ext3_super_block *, int); + extern int ext3_remount (struct super_block *, int *, char *); + extern int ext3_statfs (struct super_block *, struct kstatfs *); + +Index: linux-2.6.5-7.201-full/fs/ext3/super.c +=================================================================== +--- linux-2.6.5-7.201-full.orig/fs/ext3/super.c 2006-08-09 17:59:37.000000000 +0400 ++++ linux-2.6.5-7.201-full/fs/ext3/super.c 2006-08-09 17:59:37.000000000 +0400 +@@ -39,7 +39,7 @@ + static int ext3_load_journal(struct super_block *, struct ext3_super_block *); + static int ext3_create_journal(struct super_block *, struct ext3_super_block *, + int); +-static void ext3_commit_super (struct super_block * sb, ++void ext3_commit_super (struct super_block * sb, + struct ext3_super_block * es, + int sync); + static void ext3_mark_recovery_complete(struct super_block * sb, +@@ -1781,7 +1781,7 @@ static int ext3_create_journal(struct su + return 0; + } + +-static void ext3_commit_super (struct super_block * sb, ++void ext3_commit_super (struct super_block * sb, + struct ext3_super_block * es, + int sync) + { +Index: linux-2.6.5-7.201-full/fs/ext3/namei.c +=================================================================== +--- linux-2.6.5-7.201-full.orig/fs/ext3/namei.c 2006-08-09 17:59:37.000000000 +0400 ++++ linux-2.6.5-7.201-full/fs/ext3/namei.c 2006-08-09 17:59:37.000000000 +0400 +@@ -1598,7 +1598,7 @@ 
static int ext3_delete_entry (handle_t * + struct buffer_head * bh) + { + struct ext3_dir_entry_2 * de, * pde; +- int i; ++ int i, err; + + i = 0; + pde = NULL; +@@ -1608,7 +1608,9 @@ static int ext3_delete_entry (handle_t * + return -EIO; + if (de == de_del) { + BUFFER_TRACE(bh, "get_write_access"); +- ext3_journal_get_write_access(handle, bh); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ return err; + if (pde) + pde->rec_len = + cpu_to_le16(le16_to_cpu(pde->rec_len) + +Index: linux-2.6.5-7.201-full/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.5-7.201-full.orig/fs/ext3/xattr.c 2006-07-14 01:53:23.000000000 +0400 ++++ linux-2.6.5-7.201-full/fs/ext3/xattr.c 2006-08-09 17:59:37.000000000 +0400 +@@ -107,7 +107,7 @@ ext3_xattr_register(int name_index, stru + { + int error = -EINVAL; + +- if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { ++ if (name_index > 0 && name_index < EXT3_XATTR_INDEX_MAX) { + write_lock(&ext3_handler_lock); + if (!ext3_xattr_handlers[name_index-1]) { + ext3_xattr_handlers[name_index-1] = handler; +Index: linux-2.6.5-7.201-full/fs/ext3/inode.c +=================================================================== +--- linux-2.6.5-7.201-full.orig/fs/ext3/inode.c 2006-07-14 01:53:22.000000000 +0400 ++++ linux-2.6.5-7.201-full/fs/ext3/inode.c 2006-08-22 12:35:28.000000000 +0400 +@@ -1517,9 +1517,14 @@ out_stop: + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); +- err = ext3_mark_inode_dirty(handle, inode); +- if (!ret) +- ret = err; ++ /* ++ * We're going to return a positive `ret' ++ * here due to non-zero-length I/O, so there's ++ * no way of reporting error returns from ++ * ext3_mark_inode_dirty() to userspace. So ++ * ignore it. 
++ */ ++ ext3_mark_inode_dirty(handle, inode); + } + } + err = ext3_journal_stop(handle); +@@ -1811,8 +1816,18 @@ ext3_clear_blocks(handle_t *handle, stru + ext3_mark_inode_dirty(handle, inode); + ext3_journal_test_restart(handle, inode); + if (bh) { ++ int err; + BUFFER_TRACE(bh, "retaking write access"); +- ext3_journal_get_write_access(handle, bh); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) { ++ struct super_block *sb = inode->i_sb; ++ struct ext3_super_block *es = EXT3_SB(sb)->s_es; ++ printk (KERN_CRIT"EXT3-fs: can't continue truncate\n"); ++ EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; ++ es->s_state |= cpu_to_le16(EXT3_ERROR_FS); ++ ext3_commit_super(sb, es, 1); ++ return; ++ } + } + } + diff --git a/ldiskfs/kernel_patches/patches/ext3-check-jbd-errors-2.6.9.patch b/ldiskfs/kernel_patches/patches/ext3-check-jbd-errors-2.6.9.patch new file mode 100644 index 0000000..f6904f2 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-check-jbd-errors-2.6.9.patch @@ -0,0 +1,113 @@ +Index: linux-2.6.9-full/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2006-08-09 17:56:39.000000000 +0400 ++++ linux-2.6.9-full/include/linux/ext3_fs.h 2006-08-22 12:36:22.000000000 +0400 +@@ -826,6 +826,7 @@ extern void ext3_put_super (struct super + extern void ext3_write_super (struct super_block *); + extern void ext3_write_super_lockfs (struct super_block *); + extern void ext3_unlockfs (struct super_block *); ++extern void ext3_commit_super (struct super_block *, struct ext3_super_block *, int); + extern int ext3_remount (struct super_block *, int *, char *); + extern int ext3_statfs (struct super_block *, struct kstatfs *); + +Index: linux-2.6.9-full/fs/ext3/super.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/super.c 2006-08-09 17:56:40.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/super.c 2006-08-09 
17:56:40.000000000 +0400 +@@ -43,7 +43,7 @@ static int ext3_load_journal(struct supe + unsigned long journal_devnum); + static int ext3_create_journal(struct super_block *, struct ext3_super_block *, + int); +-static void ext3_commit_super (struct super_block * sb, ++void ext3_commit_super (struct super_block * sb, + struct ext3_super_block * es, + int sync); + static void ext3_mark_recovery_complete(struct super_block * sb, +@@ -1991,7 +1991,7 @@ static int ext3_create_journal(struct su + return 0; + } + +-static void ext3_commit_super (struct super_block * sb, ++void ext3_commit_super (struct super_block * sb, + struct ext3_super_block * es, + int sync) + { +Index: linux-2.6.9-full/fs/ext3/namei.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/namei.c 2006-08-09 17:56:40.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/namei.c 2006-08-09 17:56:40.000000000 +0400 +@@ -1599,7 +1599,7 @@ static int ext3_delete_entry (handle_t * + struct buffer_head * bh) + { + struct ext3_dir_entry_2 * de, * pde; +- int i; ++ int i, err; + + i = 0; + pde = NULL; +@@ -1609,7 +1609,9 @@ static int ext3_delete_entry (handle_t * + return -EIO; + if (de == de_del) { + BUFFER_TRACE(bh, "get_write_access"); +- ext3_journal_get_write_access(handle, bh); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ return err; + if (pde) + pde->rec_len = + cpu_to_le16(le16_to_cpu(pde->rec_len) + +Index: linux-2.6.9-full/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/xattr.c 2006-06-01 14:58:48.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/xattr.c 2006-08-09 17:56:40.000000000 +0400 +@@ -132,7 +132,7 @@ ext3_xattr_handler(int name_index) + { + struct xattr_handler *handler = NULL; + +- if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) ++ if (name_index > 0 && name_index < EXT3_XATTR_INDEX_MAX) + handler = ext3_xattr_handler_map[name_index]; + return 
handler; + } +Index: linux-2.6.9-full/fs/ext3/inode.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/inode.c 2006-06-02 23:37:38.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/inode.c 2006-08-22 12:34:28.000000000 +0400 +@@ -1513,9 +1513,14 @@ out_stop: + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); +- err = ext3_mark_inode_dirty(handle, inode); +- if (!ret) +- ret = err; ++ /* ++ * We're going to return a positive `ret' ++ * here due to non-zero-length I/O, so there's ++ * no way of reporting error returns from ++ * ext3_mark_inode_dirty() to userspace. So ++ * ignore it. ++ */ ++ ext3_mark_inode_dirty(handle, inode); + } + } + err = ext3_journal_stop(handle); +@@ -1807,8 +1812,18 @@ ext3_clear_blocks(handle_t *handle, stru + ext3_mark_inode_dirty(handle, inode); + ext3_journal_test_restart(handle, inode); + if (bh) { ++ int err; + BUFFER_TRACE(bh, "retaking write access"); +- ext3_journal_get_write_access(handle, bh); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) { ++ struct super_block *sb = inode->i_sb; ++ struct ext3_super_block *es = EXT3_SB(sb)->s_es; ++ printk (KERN_CRIT"EXT3-fs: can't continue truncate\n"); ++ EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; ++ es->s_state |= cpu_to_le16(EXT3_ERROR_FS); ++ ext3_commit_super(sb, es, 1); ++ return; ++ } + } + } + diff --git a/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch new file mode 100644 index 0000000..89cc1b5 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch @@ -0,0 +1,840 @@ +Index: linux-stage/fs/ext3/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/ialloc.c 2005-10-04 16:53:24.000000000 -0600 ++++ linux-stage/fs/ext3/ialloc.c 2005-10-04 17:07:25.000000000 -0600 +@@ -629,6 +629,9 @@ + spin_unlock(&sbi->s_next_gen_lock); + + 
ei->i_state = EXT3_STATE_NEW; ++ ei->i_extra_isize = ++ (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? ++ sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; + + ret = inode; + if(DQUOT_ALLOC_INODE(inode)) { +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2005-10-04 17:00:22.000000000 -0600 ++++ linux-stage/fs/ext3/inode.c 2005-10-04 17:07:25.000000000 -0600 +@@ -2274,7 +2274,7 @@ + * trying to determine the inode's location on-disk and no read need be + * performed. + */ +-static int ext3_get_inode_loc(struct inode *inode, ++int ext3_get_inode_loc(struct inode *inode, + struct ext3_iloc *iloc, int in_mem) + { + unsigned long block; +@@ -2484,6 +2484,11 @@ void ext3_read_inode(struct inode * inod + ei->i_data[block] = raw_inode->i_block[block]; + INIT_LIST_HEAD(&ei->i_orphan); + ++ if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ++ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); ++ else ++ ei->i_extra_isize = 0; ++ + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; +@@ -2619,6 +2624,9 @@ static int ext3_do_update_inode(handle_t + } else for (block = 0; block < EXT3_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + ++ if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ++ raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); ++ + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + rc = ext3_journal_dirty_metadata(handle, bh); + if (!err) +@@ -2849,7 +2857,8 @@ ext3_reserve_inode_write(handle_t *handl + { + int err = 0; + if (handle) { +- err = ext3_get_inode_loc(inode, iloc, 1); ++ err = ext3_get_inode_loc(inode, iloc, EXT3_I(inode)->i_state & ++ EXT3_STATE_NEW); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, iloc->bh); +Index: linux-stage/fs/ext3/xattr.c 
+=================================================================== +--- linux-stage.orig/fs/ext3/xattr.c 2005-10-04 16:50:11.000000000 -0600 ++++ linux-stage/fs/ext3/xattr.c 2005-10-04 17:19:43.000000000 -0600 +@@ -149,17 +149,12 @@ + } + + /* +- * ext3_xattr_get() +- * +- * Copy an extended attribute into the buffer +- * provided, or compute the buffer size required. +- * Buffer is NULL to compute the size of the buffer required. ++ * ext3_xattr_block_get() + * +- * Returns a negative error number on failure, or the number of bytes +- * used / required on success. ++ * routine looks for attribute in EA block and returns it's value and size + */ + int +-ext3_xattr_get(struct inode *inode, int name_index, const char *name, ++ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) + { + struct buffer_head *bh = NULL; +@@ -173,7 +168,6 @@ + + if (name == NULL) + return -EINVAL; +- down_read(&EXT3_I(inode)->xattr_sem); + error = -ENODATA; + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; +@@ -246,15 +240,87 @@ + + cleanup: + brelse(bh); +- up_read(&EXT3_I(inode)->xattr_sem); + + return error; + } + + /* +- * ext3_xattr_list() ++ * ext3_xattr_ibody_get() + * +- * Copy a list of attribute names into the buffer ++ * routine looks for attribute in inode body and returns it's value and size ++ */ ++int ++ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t buffer_size) ++{ ++ int size, name_len = strlen(name), storage_size; ++ struct ext3_xattr_entry *last; ++ struct ext3_inode *raw_inode; ++ struct ext3_iloc iloc; ++ char *start, *end; ++ int ret = -ENOENT; ++ ++ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) ++ return -ENOENT; ++ ++ ret = ext3_get_inode_loc(inode, &iloc, 0); ++ if (ret) ++ return ret; ++ raw_inode = ext3_raw_inode(&iloc); ++ ++ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - ++ EXT3_GOOD_OLD_INODE_SIZE - ++ 
EXT3_I(inode)->i_extra_isize - ++ sizeof(__u32); ++ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + ++ EXT3_I(inode)->i_extra_isize; ++ if (le32_to_cpu((*(__u32*) start)) != EXT3_XATTR_MAGIC) { ++ brelse(iloc.bh); ++ return -ENOENT; ++ } ++ start += sizeof(__u32); ++ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; ++ ++ last = (struct ext3_xattr_entry *) start; ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ if (le32_to_cpu(last->e_value_size) > storage_size || ++ (char *) next >= end) { ++ ext3_error(inode->i_sb, "ext3_xattr_ibody_get", ++ "inode %ld", inode->i_ino); ++ brelse(iloc.bh); ++ return -EIO; ++ } ++ if (name_index == last->e_name_index && ++ name_len == last->e_name_len && ++ !memcmp(name, last->e_name, name_len)) ++ goto found; ++ last = next; ++ } ++ ++ /* can't find EA */ ++ brelse(iloc.bh); ++ return -ENOENT; ++ ++found: ++ size = le32_to_cpu(last->e_value_size); ++ if (buffer) { ++ ret = -ERANGE; ++ if (buffer_size >= size) { ++ memcpy(buffer, start + le16_to_cpu(last->e_value_offs), ++ size); ++ ret = size; ++ } ++ } else ++ ret = size; ++ brelse(iloc.bh); ++ return ret; ++} ++ ++/* ++ * ext3_xattr_get() ++ * ++ * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * +@@ -262,7 +328,31 @@ + * used / required on success. 
+ */ + int +-ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ++ext3_xattr_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t buffer_size) ++{ ++ int err; ++ ++ down_read(&EXT3_I(inode)->xattr_sem); ++ ++ /* try to find attribute in inode body */ ++ err = ext3_xattr_ibody_get(inode, name_index, name, ++ buffer, buffer_size); ++ if (err < 0) ++ /* search was unsuccessful, try to find EA in dedicated block */ ++ err = ext3_xattr_block_get(inode, name_index, name, ++ buffer, buffer_size); ++ up_read(&EXT3_I(inode)->xattr_sem); ++ ++ return err; ++} ++ ++/* ext3_xattr_ibody_list() ++ * ++ * generate list of attributes stored in EA block ++ */ ++int ++ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) + { + struct buffer_head *bh = NULL; + struct ext3_xattr_entry *entry; +@@ -273,7 +363,6 @@ + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + +- down_read(&EXT3_I(inode)->xattr_sem); + error = 0; + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; +@@ -330,11 +419,149 @@ + + cleanup: + brelse(bh); +- up_read(&EXT3_I(inode)->xattr_sem); + + return error; + } + ++/* ext3_xattr_ibody_list() ++ * ++ * generate list of attributes stored in inode body ++ */ ++int ++ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) ++{ ++ struct ext3_xattr_entry *last; ++ struct ext3_inode *raw_inode; ++ char *start, *end, *buf; ++ struct ext3_iloc iloc; ++ int storage_size; ++ size_t rest = buffer_size; ++ int ret; ++ int size = 0; ++ ++ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) ++ return 0; ++ ++ ret = ext3_get_inode_loc(inode, &iloc, 0); ++ if (ret) ++ return ret; ++ raw_inode = ext3_raw_inode(&iloc); ++ ++ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - ++ EXT3_GOOD_OLD_INODE_SIZE - ++ EXT3_I(inode)->i_extra_isize - ++ sizeof(__u32); ++ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + ++ EXT3_I(inode)->i_extra_isize; ++ if 
(le32_to_cpu((*(__u32*) start)) != EXT3_XATTR_MAGIC) { ++ brelse(iloc.bh); ++ return 0; ++ } ++ start += sizeof(__u32); ++ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; ++ ++ last = (struct ext3_xattr_entry *) start; ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ struct xattr_handler *handler; ++ if (le32_to_cpu(last->e_value_size) > storage_size || ++ (char *) next >= end) { ++ ext3_error(inode->i_sb, "ext3_xattr_ibody_list", ++ "inode %ld", inode->i_ino); ++ brelse(iloc.bh); ++ return -EIO; ++ } ++ handler = ext3_xattr_handler(last->e_name_index); ++ if (handler) ++ size += handler->list(inode, NULL, 0, last->e_name, ++ last->e_name_len); ++ last = next; ++ } ++ ++ if (!buffer) { ++ ret = size; ++ goto cleanup; ++ } else { ++ ret = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ } ++ ++ last = (struct ext3_xattr_entry *) start; ++ buf = buffer; ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ struct xattr_handler *handler; ++ handler = ext3_xattr_handler(last->e_name_index); ++ if (handler) { ++ size_t size = handler->list(inode, buffer, rest, ++ last->e_name, ++ last->e_name_len); ++ if (buffer) { ++ if (size > rest) { ++ ret = -ERANGE; ++ goto cleanup; ++ } ++ buffer += size; ++ } ++ rest -= size; ++ } ++ last = next; ++ } ++ ret = size; ++cleanup: ++ brelse(iloc.bh); ++ return ret; ++} ++ ++/* ++ * ext3_xattr_list() ++ * ++ * Copy a list of attribute names into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. 
++ */ ++int ++ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ++{ ++ int error; ++ int size = buffer_size; ++ ++ down_read(&EXT3_I(inode)->xattr_sem); ++ ++ /* get list of attributes stored in inode body */ ++ error = ext3_xattr_ibody_list(inode, buffer, buffer_size); ++ if (error < 0) { ++ /* some error occured while collecting ++ * attributes in inode body */ ++ size = 0; ++ goto cleanup; ++ } ++ size = error; ++ ++ /* get list of attributes stored in dedicated block */ ++ if (buffer) { ++ buffer_size -= error; ++ if (buffer_size <= 0) { ++ buffer = NULL; ++ buffer_size = 0; ++ } else ++ buffer += error; ++ } ++ ++ error = ext3_xattr_block_list(inode, buffer, buffer_size); ++ if (error < 0) ++ /* listing was successful, so we return len */ ++ size = 0; ++ ++cleanup: ++ up_read(&EXT3_I(inode)->xattr_sem); ++ return error + size; ++} ++ + /* + * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. +@@ -356,6 +583,279 @@ + } + + /* ++ * ext3_xattr_ibody_find() ++ * ++ * search attribute and calculate free space in inode body ++ * NOTE: free space includes space our attribute hold ++ */ ++int ++ext3_xattr_ibody_find(struct inode *inode, int name_index, ++ const char *name, struct ext3_xattr_entry *rentry, int *free) ++{ ++ struct ext3_xattr_entry *last; ++ struct ext3_inode *raw_inode; ++ int name_len = strlen(name); ++ int err, storage_size; ++ struct ext3_iloc iloc; ++ char *start, *end; ++ int ret = -ENOENT; ++ ++ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) ++ return ret; ++ ++ err = ext3_get_inode_loc(inode, &iloc, 0); ++ if (err) ++ return -EIO; ++ raw_inode = ext3_raw_inode(&iloc); ++ ++ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - ++ EXT3_GOOD_OLD_INODE_SIZE - ++ EXT3_I(inode)->i_extra_isize - ++ sizeof(__u32); ++ *free = storage_size - sizeof(__u32); ++ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + ++ EXT3_I(inode)->i_extra_isize; ++ if (le32_to_cpu((*(__u32*) 
start)) != EXT3_XATTR_MAGIC) { ++ brelse(iloc.bh); ++ return -ENOENT; ++ } ++ start += sizeof(__u32); ++ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; ++ ++ last = (struct ext3_xattr_entry *) start; ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ if (le32_to_cpu(last->e_value_size) > storage_size || ++ (char *) next >= end) { ++ ext3_error(inode->i_sb, "ext3_xattr_ibody_find", ++ "inode %ld", inode->i_ino); ++ brelse(iloc.bh); ++ return -EIO; ++ } ++ ++ if (name_index == last->e_name_index && ++ name_len == last->e_name_len && ++ !memcmp(name, last->e_name, name_len)) { ++ memcpy(rentry, last, sizeof(struct ext3_xattr_entry)); ++ ret = 0; ++ } else { ++ *free -= EXT3_XATTR_LEN(last->e_name_len); ++ *free -= le32_to_cpu(last->e_value_size); ++ } ++ last = next; ++ } ++ ++ brelse(iloc.bh); ++ return ret; ++} ++ ++/* ++ * ext3_xattr_block_find() ++ * ++ * search attribute and calculate free space in EA block (if it allocated) ++ * NOTE: free space includes space our attribute hold ++ */ ++int ++ext3_xattr_block_find(struct inode *inode, int name_index, const char *name, ++ struct ext3_xattr_entry *rentry, int *free) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_entry *entry; ++ char *end; ++ int name_len, error = -ENOENT; ++ ++ if (!EXT3_I(inode)->i_file_acl) { ++ *free = inode->i_sb->s_blocksize - ++ sizeof(struct ext3_xattr_header) - ++ sizeof(__u32); ++ return -ENOENT; ++ } ++ ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl); ++ bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(inode->i_sb, "ext3_xattr_get", ++ "inode %ld: bad block %d", inode->i_ino, ++ 
EXT3_I(inode)->i_file_acl); ++ brelse(bh); ++ return -EIO; ++ } ++ /* find named attribute */ ++ name_len = strlen(name); ++ *free = bh->b_size - sizeof(__u32); ++ ++ entry = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (name_index == entry->e_name_index && ++ name_len == entry->e_name_len && ++ memcmp(name, entry->e_name, name_len) == 0) { ++ memcpy(rentry, entry, sizeof(struct ext3_xattr_entry)); ++ error = 0; ++ } else { ++ *free -= EXT3_XATTR_LEN(entry->e_name_len); ++ *free -= le32_to_cpu(entry->e_value_size); ++ } ++ entry = next; ++ } ++ brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * ext3_xattr_inode_set() ++ * ++ * this routine add/remove/replace attribute in inode body ++ */ ++int ++ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, int name_index, ++ const char *name, const void *value, size_t value_len, ++ int flags) ++{ ++ struct ext3_xattr_entry *last, *next, *here = NULL; ++ struct ext3_inode *raw_inode; ++ int name_len = strlen(name); ++ int esize = EXT3_XATTR_LEN(name_len); ++ struct buffer_head *bh; ++ int err, storage_size; ++ struct ext3_iloc iloc; ++ int free, min_offs; ++ char *start, *end; ++ ++ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) ++ return -ENOSPC; ++ ++ err = ext3_get_inode_loc(inode, &iloc, 0); ++ if (err) ++ return err; ++ raw_inode = ext3_raw_inode(&iloc); ++ bh = iloc.bh; ++ ++ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - ++ EXT3_GOOD_OLD_INODE_SIZE - ++ EXT3_I(inode)->i_extra_isize - ++ sizeof(__u32); ++ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + ++ EXT3_I(inode)->i_extra_isize; ++ if ((*(__u32*) start) != EXT3_XATTR_MAGIC) { ++ /* inode had no attributes before */ ++ *((__u32*) start) = cpu_to_le32(EXT3_XATTR_MAGIC); ++ } ++ start += sizeof(__u32); ++ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; ++ min_offs = storage_size; ++ free = 
storage_size - sizeof(__u32); ++ ++ last = (struct ext3_xattr_entry *) start; ++ while (!IS_LAST_ENTRY(last)) { ++ next = EXT3_XATTR_NEXT(last); ++ if (le32_to_cpu(last->e_value_size) > storage_size || ++ (char *) next >= end) { ++ ext3_error(inode->i_sb, "ext3_xattr_ibody_set", ++ "inode %ld", inode->i_ino); ++ brelse(bh); ++ return -EIO; ++ } ++ ++ if (last->e_value_size) { ++ int offs = le16_to_cpu(last->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ if (name_index == last->e_name_index && ++ name_len == last->e_name_len && ++ !memcmp(name, last->e_name, name_len)) ++ here = last; ++ else { ++ /* we calculate all but our attribute ++ * because it will be removed before changing */ ++ free -= EXT3_XATTR_LEN(last->e_name_len); ++ free -= le32_to_cpu(last->e_value_size); ++ } ++ last = next; ++ } ++ ++ if (value && (esize + value_len > free)) { ++ brelse(bh); ++ return -ENOSPC; ++ } ++ ++ err = ext3_reserve_inode_write(handle, inode, &iloc); ++ if (err) { ++ brelse(bh); ++ return err; ++ } ++ ++ if (here) { ++ /* time to remove old value */ ++ struct ext3_xattr_entry *e; ++ int size = le32_to_cpu(here->e_value_size); ++ int border = le16_to_cpu(here->e_value_offs); ++ char *src; ++ ++ /* move tail */ ++ memmove(start + min_offs + size, start + min_offs, ++ border - min_offs); ++ ++ /* recalculate offsets */ ++ e = (struct ext3_xattr_entry *) start; ++ while (!IS_LAST_ENTRY(e)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(e); ++ int offs = le16_to_cpu(e->e_value_offs); ++ if (offs < border) ++ e->e_value_offs = ++ cpu_to_le16(offs + size); ++ e = next; ++ } ++ min_offs += size; ++ ++ /* remove entry */ ++ border = EXT3_XATTR_LEN(here->e_name_len); ++ src = (char *) here + EXT3_XATTR_LEN(here->e_name_len); ++ size = (char *) last - src; ++ if ((char *) here + size > end) ++ printk("ALERT at %s:%d: 0x%p + %d > 0x%p\n", ++ __FILE__, __LINE__, here, size, end); ++ memmove(here, src, size); ++ last = (struct ext3_xattr_entry *) ((char *) last 
- border); ++ *((__u32 *) last) = 0; ++ } ++ ++ if (value) { ++ int offs = min_offs - value_len; ++ /* use last to create new entry */ ++ last->e_name_len = strlen(name); ++ last->e_name_index = name_index; ++ last->e_value_offs = cpu_to_le16(offs); ++ last->e_value_size = cpu_to_le32(value_len); ++ last->e_hash = last->e_value_block = 0; ++ memset(last->e_name, 0, esize); ++ memcpy(last->e_name, name, last->e_name_len); ++ if (start + offs + value_len > end) ++ printk("ALERT at %s:%d: 0x%p + %d + %zd > 0x%p\n", ++ __FILE__, __LINE__, start, offs, ++ value_len, end); ++ memcpy(start + offs, value, value_len); ++ last = EXT3_XATTR_NEXT(last); ++ *((__u32 *) last) = 0; ++ } ++ ++ ext3_mark_iloc_dirty(handle, inode, &iloc); ++ brelse(bh); ++ ++ return 0; ++} ++ ++/* + * ext3_xattr_set_handle() + * + * Create, replace or remove an extended attribute for this inode. Buffer +@@ -369,6 +869,104 @@ + */ + int + ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, ++ const char *name, const void *value, size_t value_len, ++ int flags) ++{ ++ struct ext3_xattr_entry entry; ++ int err, where = 0, found = 0, total; ++ int free1 = -1, free2 = -1; ++ int name_len; ++ ++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", ++ name_index, name, value, (long)value_len); ++ ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) ++ return -EPERM; ++ if (value == NULL) ++ value_len = 0; ++ if (name == NULL) ++ return -EINVAL; ++ name_len = strlen(name); ++ if (name_len > 255 || value_len > inode->i_sb->s_blocksize) ++ return -ERANGE; ++ down_write(&EXT3_I(inode)->xattr_sem); ++ ++ /* try to find attribute in inode body */ ++ err = ext3_xattr_ibody_find(inode, name_index, name, &entry, &free1); ++ if (err == 0) { ++ /* found EA in inode */ ++ found = 1; ++ where = 0; ++ } else if (err == -ENOENT) { ++ /* there is no such attribute in inode body */ ++ /* try to find attribute in dedicated block */ ++ err = 
ext3_xattr_block_find(inode, name_index, name, ++ &entry, &free2); ++ if (err != 0 && err != -ENOENT) { ++ /* not found EA in block */ ++ goto finish; ++ } else if (err == 0) { ++ /* found EA in block */ ++ where = 1; ++ found = 1; ++ } ++ } else ++ goto finish; ++ ++ /* check flags: may replace? may create ? */ ++ if (found && (flags & XATTR_CREATE)) { ++ err = -EEXIST; ++ goto finish; ++ } else if (!found && (flags & XATTR_REPLACE)) { ++ err = -ENODATA; ++ goto finish; ++ } ++ ++ /* check if we have enough space to store attribute */ ++ total = EXT3_XATTR_LEN(strlen(name)) + value_len; ++ if (free1 >= 0 && total > free1 && free2 >= 0 && total > free2) { ++ /* have no enough space */ ++ err = -ENOSPC; ++ goto finish; ++ } ++ ++ /* time to remove attribute */ ++ if (found) { ++ if (where == 0) { ++ /* EA is stored in inode body */ ++ ext3_xattr_ibody_set(handle, inode, name_index, name, ++ NULL, 0, flags); ++ } else { ++ /* EA is stored in separated block */ ++ ext3_xattr_block_set(handle, inode, name_index, name, ++ NULL, 0, flags); ++ } ++ } ++ ++ /* try to store EA in inode body */ ++ err = ext3_xattr_ibody_set(handle, inode, name_index, name, ++ value, value_len, flags); ++ if (err) { ++ /* can't store EA in inode body */ ++ /* try to store in block */ ++ err = ext3_xattr_block_set(handle, inode, name_index, ++ name, value, value_len, flags); ++ } ++ ++finish: ++ up_write(&EXT3_I(inode)->xattr_sem); ++ return err; ++} ++ ++/* ++ * ext3_xattr_block_set() ++ * ++ * this routine add/remove/replace attribute in EA block ++ */ ++int ++ext3_xattr_block_set(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len, + int flags) + { +@@ -391,22 +989,7 @@ + * towards the end of the block). + * end -- Points right after the block pointed to by header. 
+ */ +- +- ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", +- name_index, name, value, (long)value_len); +- +- if (IS_RDONLY(inode)) +- return -EROFS; +- if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) +- return -EPERM; +- if (value == NULL) +- value_len = 0; +- if (name == NULL) +- return -EINVAL; + name_len = strlen(name); +- if (name_len > 255 || value_len > sb->s_blocksize) +- return -ERANGE; +- down_write(&EXT3_I(inode)->xattr_sem); + if (EXT3_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bh = sb_bread(sb, EXT3_I(inode)->i_file_acl); +@@ -638,7 +1221,6 @@ + brelse(bh); + if (!(bh && header == HDR(bh))) + kfree(header); +- up_write(&EXT3_I(inode)->xattr_sem); + + return error; + } +Index: linux-stage/fs/ext3/xattr.h +=================================================================== +--- linux-stage.orig/fs/ext3/xattr.h 2005-10-04 16:50:11.000000000 -0600 ++++ linux-stage/fs/ext3/xattr.h 2005-10-04 17:07:25.000000000 -0600 +@@ -67,7 +67,8 @@ + extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); + extern int ext3_xattr_list(struct inode *, char *, size_t); + extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int); +-extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); ++extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *,const void *,size_t,int); ++extern int ext3_xattr_block_set(handle_t *, struct inode *, int, const char *,const void *,size_t,int); + + extern void ext3_xattr_delete_inode(handle_t *, struct inode *); + extern void ext3_xattr_put_super(struct super_block *); +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2005-10-04 16:53:29.000000000 -0600 ++++ linux-stage/include/linux/ext3_fs.h 2005-10-04 17:07:25.000000000 -0600 +@@ -293,6 +293,8 @@ + __u32 
m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ ++ __u16 i_extra_isize; ++ __u16 i_pad1; + }; + + #define i_size_high i_dir_acl +@@ -757,6 +759,7 @@ + extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); ++int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc, int in_mem); + + extern void ext3_read_inode (struct inode *); + extern int ext3_write_inode (struct inode *, int); +Index: linux-stage/include/linux/ext3_fs_i.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_i.h 2005-10-04 16:50:11.000000000 -0600 ++++ linux-stage/include/linux/ext3_fs_i.h 2005-10-04 17:07:25.000000000 -0600 +@@ -113,6 +113,9 @@ + */ + loff_t i_disksize; + ++ /* on-disk additional length */ ++ __u16 i_extra_isize; ++ + /* + * truncate_sem is for serialising ext3_truncate() against + * ext3_getblock(). 
In the 2.4 ext2 design, great chunks of inode's diff --git a/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch index 92897b6..72c25a4 100644 --- a/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch @@ -1,25 +1,23 @@ %patch -Index: linux-2.6.7/fs/ext3/ialloc.c +Index: linux-2.6.0/fs/ext3/ialloc.c =================================================================== ---- linux-2.6.7.orig/fs/ext3/ialloc.c 2004-09-06 20:01:18.000000000 +0800 -+++ linux-2.6.7/fs/ext3/ialloc.c 2004-09-06 20:04:42.000000000 +0800 -@@ -629,6 +629,11 @@ - spin_unlock(&sbi->s_next_gen_lock); +--- linux-2.6.0.orig/fs/ext3/ialloc.c 2004-01-14 18:54:11.000000000 +0300 ++++ linux-2.6.0/fs/ext3/ialloc.c 2004-01-14 18:54:12.000000000 +0300 +@@ -627,6 +627,9 @@ + inode->i_generation = EXT3_SB(sb)->s_next_generation++; ei->i_state = EXT3_STATE_NEW; -+ if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { -+ ei->i_extra_isize = sizeof(__u16) /* i_extra_isize */ -+ + sizeof(__u16); /* i_pad1 */ -+ } else -+ ei->i_extra_isize = 0; ++ ei->i_extra_isize = ++ (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? ++ sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; ret = inode; if(DQUOT_ALLOC_INODE(inode)) { -Index: linux-2.6.7/fs/ext3/inode.c +Index: linux-2.6.0/fs/ext3/inode.c =================================================================== ---- linux-2.6.7.orig/fs/ext3/inode.c 2004-09-06 20:01:20.000000000 +0800 -+++ linux-2.6.7/fs/ext3/inode.c 2004-09-06 20:04:42.000000000 +0800 -@@ -2349,7 +2349,7 @@ +--- linux-2.6.0.orig/fs/ext3/inode.c 2004-01-14 18:54:12.000000000 +0300 ++++ linux-2.6.0/fs/ext3/inode.c 2004-01-14 19:09:46.000000000 +0300 +@@ -2339,7 +2339,7 @@ * trying to determine the inode's location on-disk and no read need be * performed. 
*/ @@ -28,7 +26,7 @@ Index: linux-2.6.7/fs/ext3/inode.c struct ext3_iloc *iloc, int in_mem) { unsigned long block; -@@ -2558,6 +2558,11 @@ +@@ -2547,6 +2547,11 @@ ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); @@ -40,7 +38,7 @@ Index: linux-2.6.7/fs/ext3/inode.c if (S_ISREG(inode->i_mode)) { inode->i_op = &ext3_file_inode_operations; inode->i_fop = &ext3_file_operations; -@@ -2693,6 +2698,9 @@ +@@ -2682,6 +2687,9 @@ } else for (block = 0; block < EXT3_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; @@ -50,10 +48,20 @@ Index: linux-2.6.7/fs/ext3/inode.c BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); rc = ext3_journal_dirty_metadata(handle, bh); if (!err) -Index: linux-2.6.7/fs/ext3/xattr.c +@@ -2849,7 +2857,8 @@ ext3_reserve_inode_write(handle_t *handl + { + int err = 0; + if (handle) { +- err = ext3_get_inode_loc(inode, iloc, 1); ++ err = ext3_get_inode_loc(inode, iloc, EXT3_I(inode)->i_state & ++ EXT3_STATE_NEW); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, iloc->bh); +Index: linux-2.6.0/fs/ext3/xattr.c =================================================================== ---- linux-2.6.7.orig/fs/ext3/xattr.c 2004-06-16 13:19:36.000000000 +0800 -+++ linux-2.6.7/fs/ext3/xattr.c 2004-09-06 20:05:40.000000000 +0800 +--- linux-2.6.0.orig/fs/ext3/xattr.c 2003-12-30 08:33:13.000000000 +0300 ++++ linux-2.6.0/fs/ext3/xattr.c 2004-01-14 18:54:12.000000000 +0300 @@ -246,17 +246,12 @@ } @@ -94,7 +102,7 @@ Index: linux-2.6.7/fs/ext3/xattr.c /* - * ext3_xattr_list() -+ * ext3_xattr_ibode_get() ++ * ext3_xattr_ibody_get() * - * Copy a list of attribute names into the buffer + * routine looks for attribute in inode body and returns it's value and size @@ -113,7 +121,7 @@ Index: linux-2.6.7/fs/ext3/xattr.c + if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) + return -ENOENT; + -+ ret = ext3_get_inode_loc(inode, &iloc, 1); ++ ret = 
ext3_get_inode_loc(inode, &iloc, 0); + if (ret) + return ret; + raw_inode = ext3_raw_inode(&iloc); @@ -215,15 +223,6 @@ Index: linux-2.6.7/fs/ext3/xattr.c error = 0; if (!EXT3_I(inode)->i_file_acl) goto cleanup; -@@ -380,7 +469,7 @@ - if (!bh) - goto cleanup; - ea_bdebug(bh, "b_count=%d, refcount=%d", -- atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); -+ (int) atomic_read(&(bh->b_count)), (int) le32_to_cpu(HDR(bh)->h_refcount)); - end = bh->b_data + bh->b_size; - if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || - HDR(bh)->h_blocks != cpu_to_le32(1)) { @@ -431,11 +520,138 @@ cleanup: @@ -251,7 +250,7 @@ Index: linux-2.6.7/fs/ext3/xattr.c + if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) + return 0; + -+ ret = ext3_get_inode_loc(inode, &iloc, 1); ++ ret = ext3_get_inode_loc(inode, &iloc, 0); + if (ret) + return ret; + raw_inode = ext3_raw_inode(&iloc); @@ -388,7 +387,7 @@ Index: linux-2.6.7/fs/ext3/xattr.c + if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) + return ret; + -+ err = ext3_get_inode_loc(inode, &iloc, 1); ++ err = ext3_get_inode_loc(inode, &iloc, 0); + if (err) + return -EIO; + raw_inode = ext3_raw_inode(&iloc); @@ -519,7 +518,7 @@ Index: linux-2.6.7/fs/ext3/xattr.c + if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) + return -ENOSPC; + -+ err = ext3_get_inode_loc(inode, &iloc, 1); ++ err = ext3_get_inode_loc(inode, &iloc, 0); + if (err) + return err; + raw_inode = ext3_raw_inode(&iloc); @@ -626,9 +625,9 @@ Index: linux-2.6.7/fs/ext3/xattr.c + memset(last->e_name, 0, esize); + memcpy(last->e_name, name, last->e_name_len); + if (start + offs + value_len > end) -+ printk("ALERT at %s:%d: 0x%p + %d + %d > 0x%p\n", -+ __FILE__, __LINE__, start, offs, -+ value_len, end); ++ printk("ALERT at %s:%d: 0x%p + %d + %zd > 0x%p\n", ++ __FILE__, __LINE__, start, offs, ++ value_len, end); + memcpy(start + offs, value, value_len); + last = EXT3_XATTR_NEXT(last); + *((__u32 *) last) = 0; @@ 
-749,7 +748,7 @@ Index: linux-2.6.7/fs/ext3/xattr.c const char *name, const void *value, size_t value_len, int flags) { -@@ -492,22 +1079,7 @@ +@@ -492,22 +1078,7 @@ * towards the end of the block). * end -- Points right after the block pointed to by header. */ @@ -772,7 +771,7 @@ Index: linux-2.6.7/fs/ext3/xattr.c if (EXT3_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. */ bh = sb_bread(sb, EXT3_I(inode)->i_file_acl); -@@ -733,7 +1305,6 @@ +@@ -733,7 +1304,6 @@ brelse(bh); if (!(bh && header == HDR(bh))) kfree(header); @@ -780,10 +779,10 @@ Index: linux-2.6.7/fs/ext3/xattr.c return error; } -Index: linux-2.6.7/fs/ext3/xattr.h +Index: linux-2.6.0/fs/ext3/xattr.h =================================================================== ---- linux-2.6.7.orig/fs/ext3/xattr.h 2004-06-16 13:20:04.000000000 +0800 -+++ linux-2.6.7/fs/ext3/xattr.h 2004-09-06 20:04:42.000000000 +0800 +--- linux-2.6.0.orig/fs/ext3/xattr.h 2003-06-24 18:04:43.000000000 +0400 ++++ linux-2.6.0/fs/ext3/xattr.h 2004-01-14 18:54:12.000000000 +0300 @@ -77,7 +77,8 @@ extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext3_xattr_list(struct inode *, char *, size_t); @@ -794,10 +793,10 @@ Index: linux-2.6.7/fs/ext3/xattr.h extern void ext3_xattr_delete_inode(handle_t *, struct inode *); extern void ext3_xattr_put_super(struct super_block *); -Index: linux-2.6.7/include/linux/ext3_fs.h +Index: linux-2.6.0/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.7.orig/include/linux/ext3_fs.h 2004-09-06 20:01:19.000000000 +0800 -+++ linux-2.6.7/include/linux/ext3_fs.h 2004-09-06 20:04:42.000000000 +0800 +--- linux-2.6.0.orig/include/linux/ext3_fs.h 2004-01-14 18:54:11.000000000 +0300 ++++ linux-2.6.0/include/linux/ext3_fs.h 2004-01-14 18:54:12.000000000 +0300 @@ -265,6 +265,8 @@ __u32 m_i_reserved2[2]; } masix2; @@ -807,18 +806,18 @@ Index: linux-2.6.7/include/linux/ext3_fs.h }; #define 
i_size_high i_dir_acl -@@ -725,6 +727,7 @@ +@@ -721,6 +723,7 @@ + extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); ++int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc, int in_mem); -+extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *, int); extern void ext3_read_inode (struct inode *); extern void ext3_write_inode (struct inode *, int); - extern int ext3_setattr (struct dentry *, struct iattr *); -Index: linux-2.6.7/include/linux/ext3_fs_i.h +Index: linux-2.6.0/include/linux/ext3_fs_i.h =================================================================== ---- linux-2.6.7.orig/include/linux/ext3_fs_i.h 2004-06-16 13:18:52.000000000 +0800 -+++ linux-2.6.7/include/linux/ext3_fs_i.h 2004-09-06 20:04:42.000000000 +0800 +--- linux-2.6.0.orig/include/linux/ext3_fs_i.h 2003-12-30 08:32:44.000000000 +0300 ++++ linux-2.6.0/include/linux/ext3_fs_i.h 2004-01-14 18:54:12.000000000 +0300 @@ -96,6 +96,9 @@ */ loff_t i_disksize; @@ -833,9 +832,9 @@ Index: linux-2.6.7/include/linux/ext3_fs_i.h %diffstat fs/ext3/ialloc.c | 5 fs/ext3/inode.c | 10 - fs/ext3/xattr.c | 635 +++++++++++++++++++++++++++++++++++++++++++--- + fs/ext3/xattr.c | 634 +++++++++++++++++++++++++++++++++++++++++++--- fs/ext3/xattr.h | 3 - include/linux/ext3_fs.h | 3 + include/linux/ext3_fs.h | 2 include/linux/ext3_fs_i.h | 3 - 6 files changed, 625 insertions(+), 34 deletions(-) + 6 files changed, 623 insertions(+), 34 deletions(-) diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch new file mode 100644 index 0000000..f421f88 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch @@ -0,0 +1,2940 @@ +Index: linux-2.6.12-rc6/fs/ext3/extents.c 
+=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/extents.c 2005-06-14 16:31:25.756503133 +0200 ++++ linux-2.6.12-rc6/fs/ext3/extents.c 2005-06-14 16:31:25.836581257 +0200 +@@ -0,0 +1,2359 @@ ++/* ++ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++/* ++ * Extents support for EXT3 ++ * ++ * TODO: ++ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() ++ * - ext3_ext_calc_credits() could take 'mergable' into account ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - smart tree reduction ++ * - arch-independence ++ * common on-disk format for big/little-endian arch ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++static inline int ext3_ext_check_header(struct ext3_extent_header *eh) ++{ ++ if (eh->eh_magic != EXT3_EXT_MAGIC) { ++ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", ++ (unsigned)eh->eh_magic); ++ return -EIO; ++ } ++ if (eh->eh_max == 0) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n", ++ (unsigned)eh->eh_max); ++ return -EIO; ++ } ++ if (eh->eh_entries > eh->eh_max) { ++ printk(KERN_ERR "EXT3-fs: 
invalid eh_entries = %u\n", ++ (unsigned)eh->eh_entries); ++ return -EIO; ++ } ++ return 0; ++} ++ ++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} ++ ++static int inline ++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->get_write_access) ++ return tree->ops->get_write_access(h,tree->buffer); ++ else ++ return 0; ++} ++ ++static int inline ++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->mark_buffer_dirty) ++ return tree->ops->mark_buffer_dirty(h,tree->buffer); ++ else ++ return 0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ ++ if (path->p_bh) { ++ /* path points to block */ ++ err = ext3_journal_get_write_access(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_get_access_for_root(handle, tree); ++ } ++ return err; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ if (path->p_bh) { ++ /* path points to block */ ++ err =ext3_journal_dirty_metadata(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_mark_root_dirty(handle, tree); ++ } ++ return err; ++} ++ ++static int inline ++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, struct ext3_extent *ex, ++ int *err) ++{ ++ int goal, depth, newblock; ++ struct inode *inode; ++ ++ EXT_ASSERT(tree); ++ if (tree->ops->new_block) ++ return 
tree->ops->new_block(handle, tree, path, ex, err); ++ ++ inode = tree->inode; ++ depth = EXT_DEPTH(tree); ++ if (path && depth > 0) { ++ goal = path[depth-1].p_block; ++ } else { ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ ++ bg_start = (ei->i_block_group * ++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ goal = bg_start + colour; ++ } ++ ++ newblock = ext3_new_block(handle, inode, goal, err); ++ return newblock; ++} ++ ++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) | ++ (EXT_HDR_GEN(neh) + 1); ++} ++ ++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 5; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct 
ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int k, l = path->p_depth; ++ ++ ext_debug(tree, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(tree, " %d->%d", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ } else if (path->p_ext) { ++ ext_debug(tree, " %d:%d:%d", ++ path->p_ext->ee_block, ++ path->p_ext->ee_len, ++ path->p_ext->ee_start); ++ } else ++ ext_debug(tree, " []"); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; ++ int i; ++ ++ if (!path) ++ return; ++ ++ eh = path[depth].p_hdr; ++ ex = EXT_FIRST_EXTENT(eh); ++ ++ for (i = 0; i < eh->eh_entries; i++, ex++) { ++ ext_debug(tree, "%d:%d:%d ", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_drop_refs(struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) { ++ if (path->p_bh) { ++ brelse(path->p_bh); ++ path->p_bh = NULL; ++ } ++ } ++} ++ ++/* ++ * binary search for closest index by given block ++ */ ++static inline void ++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent_idx *ix; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_entries > 0); ++ ++ ext_debug(tree, "binsearch for %d(idx): ", block); ++ ++ path->p_idx = ix = EXT_FIRST_INDEX(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ix[l + k].ei_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ix += l; ++ path->p_idx = ix; ++ ext_debug(tree," -> %d->%d 
",path->p_idx->ei_block,path->p_idx->ei_leaf); ++ ++ while (l++ < r) { ++ if (block < ix->ei_block) ++ break; ++ path->p_idx = ix++; ++ } ++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent_idx *chix; ++ ++ chix = ix = EXT_FIRST_INDEX(eh); ++ for (k = 0; k < eh->eh_entries; k++, ix++) { ++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { ++ printk("k=%d, ix=0x%p, first=0x%p\n", k, ++ ix, EXT_FIRST_INDEX(eh)); ++ printk("%u <= %u\n", ++ ix->ei_block,ix[-1].ei_block); ++ } ++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); ++ if (block < ix->ei_block) ++ break; ++ chix = ix; ++ } ++ EXT_ASSERT(chix == path->p_idx); ++ } ++#endif ++} ++ ++/* ++ * binary search for closest extent by given block ++ */ ++static inline void ++ext3_ext_binsearch(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent *ex; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ ++ if (eh->eh_entries == 0) { ++ /* ++ * this leaf is empty yet: ++ * we get such a leaf in split/add case ++ */ ++ return; ++ } ++ ++ ext_debug(tree, "binsearch for %d: ", block); ++ ++ path->p_ext = ex = EXT_FIRST_EXTENT(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ex[l + k].ee_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ex += l; ++ path->p_ext = ex; ++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++ while (l++ < r) { ++ if (block < ex->ee_block) ++ break; ++ path->p_ext = ex++; ++ } ++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent *chex; ++ ++ chex = ex = EXT_FIRST_EXTENT(eh); ++ for (k = 0; k < eh->eh_entries; 
k++, ex++) { ++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); ++ if (block < ex->ee_block) ++ break; ++ chex = ex; ++ } ++ EXT_ASSERT(chex == path->p_ext); ++ } ++#endif ++} ++ ++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *eh; ++ ++ BUG_ON(tree->buffer_len == 0); ++ ext3_ext_get_access_for_root(handle, tree); ++ eh = EXT_ROOT_HDR(tree); ++ eh->eh_depth = 0; ++ eh->eh_entries = 0; ++ eh->eh_magic = EXT3_EXT_MAGIC; ++ eh->eh_max = ext3_ext_space_root(tree); ++ ext3_ext_mark_root_dirty(handle, tree); ++ ext3_ext_invalidate_cache(tree); ++ return 0; ++} ++ ++struct ext3_ext_path * ++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ struct buffer_head *bh; ++ int depth, i, ppos = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ eh = EXT_ROOT_HDR(tree); ++ EXT_ASSERT(eh); ++ if (ext3_ext_check_header(eh)) { ++ /* don't free previously allocated path ++ * -- caller should take care */ ++ path = NULL; ++ goto err; ++ } ++ ++ i = depth = EXT_DEPTH(tree); ++ EXT_ASSERT(eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* account possible depth increase */ ++ if (!path) { ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), ++ GFP_NOFS); ++ if (!path) ++ return ERR_PTR(-ENOMEM); ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[0].p_hdr = eh; ++ ++ /* walk through the tree */ ++ while (i) { ++ ext_debug(tree, "depth %d: num %d, max %d\n", ++ ppos, eh->eh_entries, eh->eh_max); ++ ext3_ext_binsearch_idx(tree, path + ppos, block); ++ path[ppos].p_block = path[ppos].p_idx->ei_leaf; ++ path[ppos].p_depth = i; ++ path[ppos].p_ext = NULL; ++ ++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); ++ if (!bh) ++ goto err; ++ ++ eh = EXT_BLOCK_HDR(bh); ++ ppos++; ++ EXT_ASSERT(ppos <= depth); ++ path[ppos].p_bh = bh; ++ path[ppos].p_hdr 
= eh; ++ i--; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ } ++ ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ path[ppos].p_idx = NULL; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ ++ /* find extent */ ++ ext3_ext_binsearch(tree, path + ppos, block); ++ ++ ext3_ext_show_path(tree, path); ++ ++ return path; ++ ++err: ++ printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ return ERR_PTR(-EIO); ++} ++ ++/* ++ * insert new index [logical;ptr] into the block at cupr ++ * it check where to insert: before curp or after curp ++ */ ++static int ext3_ext_insert_index(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *curp, ++ int logical, int ptr) ++{ ++ struct ext3_extent_idx *ix; ++ int len, err; ++ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ return err; ++ ++ EXT_ASSERT(logical != curp->p_idx->ei_block); ++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; ++ if (logical > curp->p_idx->ei_block) { ++ /* insert after */ ++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d after: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ } ++ ix = curp->p_idx + 1; ++ } else { ++ /* insert before */ ++ len = len * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d before: %d. 
" ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->ei_block = logical; ++ ix->ei_leaf = ptr; ++ ix->ei_leaf_hi = ix->ei_unused = 0; ++ curp->p_hdr->eh_entries++; ++ ++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); ++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); ++ ++ err = ext3_ext_dirty(handle, tree, curp); ++ ext3_std_error(tree->inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at: ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extens and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ int i = at, k, m, a; ++ unsigned long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? */ ++ /* FIXME: now desicion is simplest: at current extent */ ++ ++ /* if current leaf will be splitted, then we should use ++ * border from split point */ ++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].ee_block; ++ ext_debug(tree, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->ee_block; ++ ext_debug(tree, "leaf will be added." 
++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. so, index won't ++ * be inserted and tree will be in consistent ++ * state. next mount will repair buffers too ++ */ ++ ++ /* ++ * get array to track all allocated blocks ++ * we need this to handle errors and free blocks ++ * upon them ++ */ ++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); ++ if (!ablocks) ++ return -ENOMEM; ++ memset(ablocks, 0, sizeof(unsigned long) * depth); ++ ++ /* allocate all needed blocks */ ++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); ++ for (a = 0; a < depth - at; a++) { ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ goto cleanup; ++ ablocks[a] = newblock; ++ } ++ ++ /* initialize new leaf */ ++ newblock = ablocks[--a]; ++ EXT_ASSERT(newblock); ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 0; ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_depth = 0; ++ ex = EXT_FIRST_EXTENT(neh); ++ ++ /* move remain of path[depth] to the new leaf */ ++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); ++ /* start copy from next extent */ ++ /* TODO: we could do it by single memmove */ ++ m = 0; ++ path[depth].p_ext++; ++ while (path[depth].p_ext <= ++ EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", ++ path[depth].p_ext->ee_block, ++ path[depth].p_ext->ee_start, ++ path[depth].p_ext->ee_len, ++ newblock); ++ memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent)); ++ neh->eh_entries++; ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ 
goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old leaf */ ++ if (m) { ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ path[depth].p_hdr->eh_entries -= m; ++ if ((err = ext3_ext_dirty(handle, tree, path + depth))) ++ goto cleanup; ++ ++ } ++ ++ /* create intermediate indexes */ ++ k = depth - at - 1; ++ EXT_ASSERT(k >= 0); ++ if (k) ++ ext_debug(tree, "create %d intermediate indices\n", k); ++ /* insert new index into current index block */ ++ /* current depth stored in i var */ ++ i = depth - 1; ++ while (k--) { ++ oldblock = newblock; ++ newblock = ablocks[--a]; ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 1; ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ neh->eh_depth = depth - i; ++ fidx = EXT_FIRST_INDEX(neh); ++ fidx->ei_block = border; ++ fidx->ei_leaf = oldblock; ++ fidx->ei_leaf_hi = fidx->ei_unused = 0; ++ ++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", ++ i, newblock, border, oldblock); ++ /* copy indexes */ ++ m = 0; ++ path[i].p_idx++; ++ ++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); ++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == ++ EXT_LAST_INDEX(path[i].p_hdr)); ++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ++ ext_debug(tree, "%d: move %d:%d in new index %lu\n", ++ i, path[i].p_idx->ei_block, ++ path[i].p_idx->ei_leaf, newblock); ++ memmove(++fidx, path[i].p_idx++, ++ sizeof(struct ext3_extent_idx)); ++ neh->eh_entries++; ++ EXT_ASSERT(neh->eh_entries <= neh->eh_max); ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old index */ ++ if (m) { ++ err = 
ext3_ext_get_access(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ path[i].p_hdr->eh_entries -= m; ++ err = ext3_ext_dirty(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ } ++ ++ i--; ++ } ++ ++ /* insert new index */ ++ if (!err) ++ err = ext3_ext_insert_index(handle, tree, path + at, ++ border, newblock); ++ ++cleanup: ++ if (bh) { ++ if (buffer_locked(bh)) ++ unlock_buffer(bh); ++ brelse(bh); ++ } ++ ++ if (err) { ++ /* free all allocated blocks in error case */ ++ for (i = 0; i < depth; i++) { ++ if (!ablocks[i]) ++ continue; ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ } ++ } ++ kfree(ablocks); ++ ++ return err; ++} ++ ++/* ++ * routine implements tree growing procedure: ++ * - allocates new block ++ * - moves top-level data (index block or leaf) into the new block ++ * - initialize new top-level, creating index that points to the ++ * just created block ++ */ ++static int ext3_ext_grow_indepth(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp = path; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct buffer_head *bh; ++ unsigned long newblock; ++ int err = 0; ++ ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ return err; ++ ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ ext3_std_error(tree->inode->i_sb, err); ++ return err; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) { ++ unlock_buffer(bh); ++ goto out; ++ } ++ ++ /* move top-level index/leaf into new block */ ++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); ++ ++ /* set size of new block */ ++ neh = EXT_BLOCK_HDR(bh); ++ /* old root could have indexes or leaves ++ * so calculate eh_max right way */ ++ if (EXT_DEPTH(tree)) ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ else ++ neh->eh_max = ext3_ext_space_block(tree); ++ 
neh->eh_magic = EXT3_EXT_MAGIC; ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto out; ++ ++ /* create index in new top-level index: num,max,pointer */ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ goto out; ++ ++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; ++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); ++ curp->p_hdr->eh_entries = 1; ++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); ++ /* FIXME: it works, but actually path[0] can be index */ ++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; ++ curp->p_idx->ei_leaf = newblock; ++ curp->p_idx->ei_leaf_hi = curp->p_idx->ei_unused = 0; ++ ++ neh = EXT_ROOT_HDR(tree); ++ fidx = EXT_FIRST_INDEX(neh); ++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", ++ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); ++ ++ neh->eh_depth = path->p_depth + 1; ++ err = ext3_ext_dirty(handle, tree, curp); ++out: ++ brelse(bh); ++ ++ return err; ++} ++ ++/* ++ * routine finds empty index and adds new leaf. 
if no free index found ++ * then it requests in-depth growing ++ */ ++static int ext3_ext_create_new_leaf(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp; ++ int depth, i, err = 0; ++ ++repeat: ++ i = depth = EXT_DEPTH(tree); ++ ++ /* walk up to the tree and look for free index entry */ ++ curp = path + depth; ++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { ++ i--; ++ curp--; ++ } ++ ++ /* we use already allocated block for index block ++ * so, subsequent data blocks should be contigoues */ ++ if (EXT_HAS_FREE_INDEX(curp)) { ++ /* if we found index with free entry, then use that ++ * entry: create all needed subtree and add new leaf */ ++ err = ext3_ext_split(handle, tree, path, newext, i); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ } else { ++ /* tree is full, time to grow in depth */ ++ err = ext3_ext_grow_indepth(handle, tree, path, newext); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ ++ /* ++ * only first (depth 0 -> 1) produces free space ++ * in all other cases we have to split growed tree ++ */ ++ depth = EXT_DEPTH(tree); ++ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { ++ /* now we need split */ ++ goto repeat; ++ } ++ } ++ ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++/* ++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK ++ * NOTE: it consider block number from index entry as ++ * allocated block. 
thus, index entries have to be consistent ++ * with leafs ++ */ ++static unsigned long ++ext3_ext_next_allocated_block(struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ if (depth == 0 && path->p_ext == NULL) ++ return EXT_MAX_BLOCK; ++ ++ /* FIXME: what if index isn't full ?! */ ++ while (depth >= 0) { ++ if (depth == path->p_depth) { ++ /* leaf */ ++ if (path[depth].p_ext != ++ EXT_LAST_EXTENT(path[depth].p_hdr)) ++ return path[depth].p_ext[1].ee_block; ++ } else { ++ /* index */ ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ } ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * returns first allocated block from next leaf or EXT_MAX_BLOCK ++ */ ++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ /* zero-tree has no leaf blocks at all */ ++ if (depth == 0) ++ return EXT_MAX_BLOCK; ++ ++ /* go to index block */ ++ depth--; ++ ++ while (depth >= 0) { ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * if leaf gets modified and modified extent is first in the leaf ++ * then we have to correct all indexes above ++ * TODO: do we need to correct tree in all cases? 
++ */ ++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent *ex; ++ unsigned long border; ++ int k, err = 0; ++ ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(eh); ++ ++ if (depth == 0) { ++ /* there is no tree at all */ ++ return 0; ++ } ++ ++ if (ex != EXT_FIRST_EXTENT(eh)) { ++ /* we correct tree if first leaf got modified only */ ++ return 0; ++ } ++ ++ /* ++ * TODO: we need correction if border is smaller then current one ++ */ ++ k = depth - 1; ++ border = path[depth].p_ext->ee_block; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ return err; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ return err; ++ ++ while (k--) { ++ /* change all left-side indexes */ ++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) ++ break; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ break; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ break; ++ } ++ ++ return err; ++} ++ ++static int inline ++ext3_can_extents_be_merged(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block) ++ return 0; ++ ++#ifdef AGRESSIVE_TEST ++ if (ex1->ee_len >= 4) ++ return 0; ++#endif ++ ++ if (!tree->ops->mergable) ++ return 1; ++ ++ return tree->ops->mergable(ex1, ex2); ++} ++ ++/* ++ * this routine tries to merge requsted extent into the existing ++ * extent or inserts requested extent as new one into the tree, ++ * creating new leaf in no-space case ++ */ ++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_extent_header * eh; ++ struct ext3_extent *ex, *fex; ++ struct ext3_extent 
*nearex; /* nearest extent */ ++ struct ext3_ext_path *npath = NULL; ++ int depth, len, err, next; ++ ++ EXT_ASSERT(newext->ee_len > 0); ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(path[depth].p_hdr); ++ ++ /* try to insert block into found extent and return */ ++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { ++ ext_debug(tree, "append %d block to %d:%d (from %d)\n", ++ newext->ee_len, ex->ee_block, ex->ee_len, ++ ex->ee_start); ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ return err; ++ ex->ee_len += newext->ee_len; ++ eh = path[depth].p_hdr; ++ nearex = ex; ++ goto merge; ++ } ++ ++repeat: ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) ++ goto has_space; ++ ++ /* probably next leaf has space for us? */ ++ fex = EXT_LAST_EXTENT(eh); ++ next = ext3_ext_next_leaf_block(tree, path); ++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { ++ ext_debug(tree, "next leaf block - %d\n", next); ++ EXT_ASSERT(!npath); ++ npath = ext3_ext_find_extent(tree, next, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ EXT_ASSERT(npath->p_depth == path->p_depth); ++ eh = npath[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) { ++ ext_debug(tree, "next leaf isnt full(%d)\n", ++ eh->eh_entries); ++ path = npath; ++ goto repeat; ++ } ++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", ++ eh->eh_entries, eh->eh_max); ++ } ++ ++ /* ++ * there is no free space in found leaf ++ * we're gonna add new leaf in the tree ++ */ ++ err = ext3_ext_create_new_leaf(handle, tree, path, newext); ++ if (err) ++ goto cleanup; ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ ++has_space: ++ nearex = path[depth].p_ext; ++ ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ ++ if (!nearex) { ++ /* there is no extent in this leaf, create first one */ ++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", ++ newext->ee_block, 
newext->ee_start, ++ newext->ee_len); ++ path[depth].p_ext = EXT_FIRST_EXTENT(eh); ++ } else if (newext->ee_block > nearex->ee_block) { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ if (nearex != EXT_LAST_EXTENT(eh)) { ++ len = EXT_MAX_EXTENT(eh) - nearex; ++ len = (len - 1) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 2, nearex + 1, len); ++ } ++ path[depth].p_ext = nearex + 1; ++ } else { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 1, nearex, len); ++ path[depth].p_ext = nearex; ++ } ++ ++ eh->eh_entries++; ++ nearex = path[depth].p_ext; ++ nearex->ee_block = newext->ee_block; ++ nearex->ee_start = newext->ee_start; ++ nearex->ee_len = newext->ee_len; ++ /* FIXME: support for large fs */ ++ nearex->ee_start_hi = 0; ++ ++merge: ++ /* try to merge extents to the right */ ++ while (nearex < EXT_LAST_EXTENT(eh)) { ++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) ++ break; ++ /* merge with next extent! 
*/ ++ nearex->ee_len += nearex[1].ee_len; ++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { ++ len = (EXT_LAST_EXTENT(eh) - nearex - 1) * ++ sizeof(struct ext3_extent); ++ memmove(nearex + 1, nearex + 2, len); ++ } ++ eh->eh_entries--; ++ EXT_ASSERT(eh->eh_entries > 0); ++ } ++ ++ /* try to merge extents to the left */ ++ ++ /* time to correct all indexes above */ ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ if (err) ++ goto cleanup; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ ++cleanup: ++ if (npath) { ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ } ++ ext3_ext_tree_changed(tree); ++ ext3_ext_invalidate_cache(tree); ++ return err; ++} ++ ++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, ++ unsigned long num, ext_prepare_callback func) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_ext_cache cbex; ++ struct ext3_extent *ex; ++ unsigned long next, start = 0, end = 0; ++ unsigned long last = block + num; ++ int depth, exists, err = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(func); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ while (block < last && block != EXT_MAX_BLOCK) { ++ num = last - block; ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(tree, block, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ break; ++ } ++ ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(path[depth].p_hdr); ++ ex = path[depth].p_ext; ++ next = ext3_ext_next_allocated_block(path); ++ ++ exists = 0; ++ if (!ex) { ++ /* there is no extent yet, so try to allocate ++ * all requested space */ ++ start = block; ++ end = block + num; ++ } else if (ex->ee_block > block) { ++ /* need to allocate space before found extent */ ++ start = block; ++ end = ex->ee_block; ++ if (block + num < end) ++ end = block + num; ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ /* need to allocate space after found extent */ ++ start = block; ++ end = block + num; ++ if (end >= next) ++ end = next; 
++ } else if (block >= ex->ee_block) { ++ /* ++ * some part of requested space is covered ++ * by found extent ++ */ ++ start = block; ++ end = ex->ee_block + ex->ee_len; ++ if (block + num < end) ++ end = block + num; ++ exists = 1; ++ } else { ++ BUG(); ++ } ++ EXT_ASSERT(end > start); ++ ++ if (!exists) { ++ cbex.ec_block = start; ++ cbex.ec_len = end - start; ++ cbex.ec_start = 0; ++ cbex.ec_type = EXT3_EXT_CACHE_GAP; ++ } else { ++ cbex.ec_block = ex->ee_block; ++ cbex.ec_len = ex->ee_len; ++ cbex.ec_start = ex->ee_start; ++ cbex.ec_type = EXT3_EXT_CACHE_EXTENT; ++ } ++ ++ EXT_ASSERT(cbex.ec_len > 0); ++ EXT_ASSERT(path[depth].p_hdr); ++ err = func(tree, path, &cbex); ++ ext3_ext_drop_refs(path); ++ ++ if (err < 0) ++ break; ++ if (err == EXT_REPEAT) ++ continue; ++ else if (err == EXT_BREAK) { ++ err = 0; ++ break; ++ } ++ ++ if (EXT_DEPTH(tree) != depth) { ++ /* depth was changed. we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ block = cbex.ec_block + cbex.ec_len; ++ } ++ ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ return err; ++} ++ ++static inline void ++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, ++ __u32 len, __u32 start, int type) ++{ ++ EXT_ASSERT(len > 0); ++ if (tree->cex) { ++ tree->cex->ec_type = type; ++ tree->cex->ec_block = block; ++ tree->cex->ec_len = len; ++ tree->cex->ec_start = start; ++ } ++} ++ ++/* ++ * this routine calculate boundaries of the gap requested block fits into ++ * and cache this gap ++ */ ++static inline void ++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ unsigned long block) ++{ ++ int depth = EXT_DEPTH(tree); ++ unsigned long lblock, len; ++ struct ext3_extent *ex; ++ ++ if (!tree->cex) ++ return; ++ ++ ex = path[depth].p_ext; ++ if (ex == NULL) { ++ /* there is no extent yet, so gap is [0;-] */ ++ lblock = 0; ++ len = EXT_MAX_BLOCK; ++ ext_debug(tree, "cache gap(whole file):"); ++ } else if (block < 
ex->ee_block) { ++ lblock = block; ++ len = ex->ee_block - block; ++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len); ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ lblock = ex->ee_block + ex->ee_len; ++ len = ext3_ext_next_allocated_block(path); ++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) block); ++ EXT_ASSERT(len > lblock); ++ len = len - lblock; ++ } else { ++ lblock = len = 0; ++ BUG(); ++ } ++ ++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len); ++ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP); ++} ++ ++static inline int ++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, ++ struct ext3_extent *ex) ++{ ++ struct ext3_ext_cache *cex = tree->cex; ++ ++ /* is there cache storage at all? */ ++ if (!cex) ++ return EXT3_EXT_CACHE_NO; ++ ++ /* has cache valid data? */ ++ if (cex->ec_type == EXT3_EXT_CACHE_NO) ++ return EXT3_EXT_CACHE_NO; ++ ++ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || ++ cex->ec_type == EXT3_EXT_CACHE_EXTENT); ++ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { ++ ex->ee_block = cex->ec_block; ++ ex->ee_start = cex->ec_start; ++ ex->ee_start_hi = 0; ++ ex->ee_len = cex->ec_len; ++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) ex->ee_start); ++ return cex->ec_type; ++ } ++ ++ /* not in cache */ ++ return EXT3_EXT_CACHE_NO; ++} ++ ++/* ++ * routine removes index from the index block ++ * it's used in truncate case only. 
thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->eh_entries); ++ if ((err = ext3_ext_get_access(handle, tree, path))) ++ return err; ++ path->p_hdr->eh_entries--; ++ if ((err = ext3_ext_dirty(handle, tree, path))) ++ return err; ++ ext_debug(tree, "index is empty, remove it, free block %d\n", ++ path->p_idx->ei_leaf); ++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ return err; ++} ++ ++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth = EXT_DEPTH(tree); ++ int needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) ++ return 1; ++ } ++ ++ /* ++ * the worste case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ depth = depth + 1; ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. 
we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ return needed; ++} ++ ++static int ++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, tex; ++ struct ext3_ext_path *npath; ++ int depth, creds, err; ++ ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); ++ EXT_ASSERT(ex->ee_block < start); ++ ++ /* calculate tail extent */ ++ tex.ee_block = end + 1; ++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); ++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; ++ ++ creds = ext3_ext_calc_credits_for_insert(tree, path); ++ handle = ext3_ext_journal_restart(handle, creds); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ /* calculate head extent. use primary extent */ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ return err; ++ ex->ee_len = start - ex->ee_block; ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ return err; ++ ++ /* FIXME: some callback to free underlying resource ++ * and correct ee_start? 
*/ ++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", ++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); ++ ++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); ++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); ++ ++ err = ext3_ext_insert_extent(handle, tree, npath, &tex); ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ ++ return err; ++} ++ ++static int ++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, *fu = NULL, *lu, *le; ++ int err = 0, correct_index = 0; ++ int depth = EXT_DEPTH(tree), credits; ++ struct ext3_extent_header *eh; ++ unsigned a, b, block, num; ++ ++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); ++ if (!path[depth].p_hdr) ++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); ++ eh = path[depth].p_hdr; ++ EXT_ASSERT(eh); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* find where to start removing */ ++ le = ex = EXT_LAST_EXTENT(eh); ++ while (ex != EXT_FIRST_EXTENT(eh)) { ++ if (ex->ee_block <= end) ++ break; ++ ex--; ++ } ++ ++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { ++ /* removal of internal part of the extent requested ++ * tail and head must be placed in different extent ++ * so, we have to insert one more extent */ ++ path[depth].p_ext = ex; ++ return ext3_ext_split_for_rm(handle, tree, path, start, end); ++ } ++ ++ lu = ex; ++ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) { ++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); ++ path[depth].p_ext = ex; ++ ++ a = ex->ee_block > start ? ex->ee_block : start; ++ b = ex->ee_block + ex->ee_len - 1 < end ? 
++ ex->ee_block + ex->ee_len - 1 : end; ++ ++ ext_debug(tree, " border %u:%u\n", a, b); ++ ++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { ++ block = 0; ++ num = 0; ++ BUG(); ++ } else if (a != ex->ee_block) { ++ /* remove tail of the extent */ ++ block = ex->ee_block; ++ num = a - block; ++ } else if (b != ex->ee_block + ex->ee_len - 1) { ++ /* remove head of the extent */ ++ block = a; ++ num = b - a; ++ } else { ++ /* remove whole extent: excelent! */ ++ block = ex->ee_block; ++ num = 0; ++ EXT_ASSERT(a == ex->ee_block && ++ b == ex->ee_block + ex->ee_len - 1); ++ } ++ ++ if (ex == EXT_FIRST_EXTENT(eh)) ++ correct_index = 1; ++ ++ credits = 1; ++ if (correct_index) ++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; ++ if (tree->ops->remove_extent_credits) ++ credits+=tree->ops->remove_extent_credits(tree,ex,a,b); ++ ++ handle = ext3_ext_journal_restart(handle, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ goto out; ++ } ++ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ if (tree->ops->remove_extent) ++ err = tree->ops->remove_extent(tree, ex, a, b); ++ if (err) ++ goto out; ++ ++ if (num == 0) { ++ /* this extent is removed entirely mark slot unused */ ++ ex->ee_start = ex->ee_start_hi = 0; ++ eh->eh_entries--; ++ fu = ex; ++ } ++ ++ ex->ee_block = block; ++ ex->ee_len = num; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ ext_debug(tree, "new extent: %u:%u:%u\n", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ ex--; ++ } ++ ++ if (fu) { ++ /* reuse unused slots */ ++ while (lu < le) { ++ if (lu->ee_start) { ++ *fu = *lu; ++ lu->ee_start = lu->ee_start_hi = 0; ++ fu++; ++ } ++ lu++; ++ } ++ } ++ ++ if (correct_index && eh->eh_entries) ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ ++ /* if this leaf is free, then we should ++ * remove it from index block above */ ++ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) ++ 
err = ext3_ext_rm_idx(handle, tree, path + depth); ++ ++out: ++ return err; ++} ++ ++ ++static struct ext3_extent_idx * ++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) ++{ ++ struct ext3_extent_idx *ix; ++ ++ ix = EXT_LAST_INDEX(hdr); ++ while (ix != EXT_FIRST_INDEX(hdr)) { ++ if (ix->ei_block <= block) ++ break; ++ ix--; ++ } ++ return ix; ++} ++ ++/* ++ * returns 1 if current index have to be freed (even partial) ++ */ ++static int inline ++ext3_ext_more_to_rm(struct ext3_ext_path *path) ++{ ++ EXT_ASSERT(path->p_idx); ++ ++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) ++ return 0; ++ ++ /* ++ * if truncate on deeper level happened it it wasn't partial ++ * so we have to consider current index for truncation ++ */ ++ if (path->p_hdr->eh_entries == path->p_block) ++ return 0; ++ return 1; ++} ++ ++int ext3_ext_remove_space(struct ext3_extents_tree *tree, ++ unsigned long start, unsigned long end) ++{ ++ struct inode *inode = tree->inode; ++ struct super_block *sb = inode->i_sb; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_ext_path *path; ++ handle_t *handle; ++ int i = 0, err = 0; ++ ++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); ++ ++ /* probably first extent we're gonna free will be last in block */ ++ handle = ext3_journal_start(inode, depth + 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ ext3_ext_invalidate_cache(tree); ++ ++ /* ++ * we start scanning from right side freeing all the blocks ++ * after i_size and walking into the deep ++ */ ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); ++ if (IS_ERR(path)) { ++ ext3_error(sb, __FUNCTION__, "Can't allocate path array"); ++ ext3_journal_stop(handle); ++ return -ENOMEM; ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[i].p_hdr = EXT_ROOT_HDR(tree); ++ ++ while (i >= 0 && err == 0) { ++ if (i == depth) { ++ /* this is leaf block */ ++ err = ext3_ext_rm_leaf(handle, tree, path, start, end); ++ 
/* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ continue; ++ } ++ ++ /* this is index block */ ++ if (!path[i].p_hdr) { ++ ext_debug(tree, "initialize header\n"); ++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); ++ } ++ ++ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); ++ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); ++ ++ if (!path[i].p_idx) { ++ /* this level hasn't touched yet */ ++ path[i].p_idx = ++ ext3_ext_last_covered(path[i].p_hdr, end); ++ path[i].p_block = path[i].p_hdr->eh_entries + 1; ++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", ++ path[i].p_hdr, path[i].p_hdr->eh_entries); ++ } else { ++ /* we've already was here, see at next index */ ++ path[i].p_idx--; ++ } ++ ++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", ++ i, EXT_FIRST_INDEX(path[i].p_hdr), ++ path[i].p_idx); ++ if (ext3_ext_more_to_rm(path + i)) { ++ /* go to the next level */ ++ ext_debug(tree, "move to level %d (block %d)\n", ++ i + 1, path[i].p_idx->ei_leaf); ++ memset(path + i + 1, 0, sizeof(*path)); ++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); ++ if (!path[i+1].p_bh) { ++ /* should we reset i_size? 
*/ ++ err = -EIO; ++ break; ++ } ++ /* put actual number of indexes to know is this ++ * number got changed at the next iteration */ ++ path[i].p_block = path[i].p_hdr->eh_entries; ++ i++; ++ } else { ++ /* we finish processing this index, go up */ ++ if (path[i].p_hdr->eh_entries == 0 && i > 0) { ++ /* index is empty, remove it ++ * handle must be already prepared by the ++ * truncatei_leaf() */ ++ err = ext3_ext_rm_idx(handle, tree, path + i); ++ } ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ ext_debug(tree, "return to level %d\n", i); ++ } ++ } ++ ++ /* TODO: flexible tree reduction should be here */ ++ if (path->p_hdr->eh_entries == 0) { ++ /* ++ * truncate to zero freed all the tree ++ * so, we need to correct eh_depth ++ */ ++ err = ext3_ext_get_access(handle, tree, path); ++ if (err == 0) { ++ EXT_ROOT_HDR(tree)->eh_depth = 0; ++ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree); ++ err = ext3_ext_dirty(handle, tree, path); ++ } ++ } ++ ext3_ext_tree_changed(tree); ++ ++ kfree(path); ++ ext3_journal_stop(handle); ++ ++ return err; ++} ++ ++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks) ++{ ++ int lcap, icap, rcap, leafs, idxs, num; ++ ++ rcap = ext3_ext_space_root(tree); ++ if (blocks <= rcap) { ++ /* all extents fit to the root */ ++ return 0; ++ } ++ ++ rcap = ext3_ext_space_root_idx(tree); ++ lcap = ext3_ext_space_block(tree); ++ icap = ext3_ext_space_block_idx(tree); ++ ++ num = leafs = (blocks + lcap - 1) / lcap; ++ if (leafs <= rcap) { ++ /* all pointers to leafs fit to the root */ ++ return leafs; ++ } ++ ++ /* ok. 
we need separate index block(s) to link all leaf blocks */ ++ idxs = (leafs + icap - 1) / icap; ++ do { ++ num += idxs; ++ idxs = (idxs + icap - 1) / icap; ++ } while (idxs > rcap); ++ ++ return num; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) { ++ printk("EXT3-fs: file extents enabled"); ++#ifdef AGRESSIVE_TEST ++ printk(", agressive tests"); ++#endif ++#ifdef CHECK_BINSEARCH ++ printk(", check binsearch"); ++#endif ++ printk("\n"); ++ } ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++} ++ ++/************************************************************************ ++ * VFS related routines ++ ************************************************************************/ ++ ++static int ext3_get_inode_write_access(handle_t *handle, void *buffer) ++{ ++ /* we use in-core data, not bh */ ++ return 0; ++} ++ ++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct inode *inode = buffer; ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static int ext3_ext_mergable(struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ /* FIXME: support for large fs */ ++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) ++ return 1; ++ return 0; ++} ++ ++static int ++ext3_remove_blocks_credits(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed; ++ ++ /* at present, extent can't cross block group */; ++ needed = 4; /* bitmap + group desc + sb + inode */ ++ ++#ifdef CONFIG_QUOTA ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ return needed; ++} ++ ++static int ++ext3_remove_blocks(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed = ext3_remove_blocks_credits(tree, ex, from, to); ++ handle_t *handle = 
ext3_journal_start(tree->inode, needed); ++ struct buffer_head *bh; ++ int i; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { ++ /* tail removal */ ++ unsigned long num, start; ++ num = ex->ee_block + ex->ee_len - from; ++ start = ex->ee_start + ex->ee_len - num; ++ ext_debug(tree, "free last %lu blocks starting %lu\n", ++ num, start); ++ for (i = 0; i < num; i++) { ++ bh = sb_find_get_block(tree->inode->i_sb, start + i); ++ ext3_forget(handle, 0, tree->inode, bh, start + i); ++ } ++ ext3_free_blocks(handle, tree->inode, start, num); ++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { ++ printk("strange request: removal %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } else { ++ printk("strange request: removal(2) %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } ++ ext3_journal_stop(handle); ++ return 0; ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, ++ struct ext3_ext_path *path, unsigned long block) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ struct ext3_extent *ex; ++ depth = path->p_depth; ++ ++ /* try to predict block placement */ ++ if ((ex = path[depth].p_ext)) ++ return ex->ee_start + (block - ex->ee_block); ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. 
use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour + block; ++} ++ ++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int *err) ++{ ++ struct inode *inode = tree->inode; ++ int newblock, goal; ++ ++ EXT_ASSERT(path); ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->ee_start); ++ EXT_ASSERT(ex->ee_len); ++ ++ /* reuse block from the extent to order data/metadata */ ++ newblock = ex->ee_start++; ++ ex->ee_len--; ++ if (ex->ee_len == 0) { ++ ex->ee_len = 1; ++ /* allocate new block for the extent */ ++ goal = ext3_ext_find_goal(inode, path, ex->ee_block); ++ ex->ee_start = ext3_new_block(handle, inode, goal, err); ++ ex->ee_start_hi = 0; ++ if (ex->ee_start == 0) { ++ /* error occured: restore old extent */ ++ ex->ee_start = newblock; ++ return 0; ++ } ++ } ++ return newblock; ++} ++ ++static struct ext3_extents_helpers ext3_blockmap_helpers = { ++ .get_write_access = ext3_get_inode_write_access, ++ .mark_buffer_dirty = ext3_mark_buffer_dirty, ++ .mergable = ext3_ext_mergable, ++ .new_block = ext3_new_block_cb, ++ .remove_extent = ext3_remove_blocks, ++ .remove_extent_credits = ext3_remove_blocks_credits, ++}; ++ ++void ext3_init_tree_desc(struct ext3_extents_tree *tree, ++ struct inode *inode) ++{ ++ tree->inode = inode; ++ tree->root = (void *) EXT3_I(inode)->i_data; ++ tree->buffer = (void *) inode; ++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); ++ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; ++ tree->ops = &ext3_blockmap_helpers; ++} ++ ++int ext3_ext_get_block(handle_t *handle, struct inode *inode, ++ long iblock, struct buffer_head *bh_result, ++ int create, int extend_disksize) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent 
newex; ++ struct ext3_extent *ex; ++ int goal, newblock, err = 0, depth; ++ struct ext3_extents_tree tree; ++ ++ clear_buffer_new(bh_result); ++ ext3_init_tree_desc(&tree, inode); ++ ext_debug(&tree, "block %d requested for inode %u\n", ++ (int) iblock, (unsigned) inode->i_ino); ++ down(&EXT3_I(inode)->truncate_sem); ++ ++ /* check in cache */ ++ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { ++ if (goal == EXT3_EXT_CACHE_GAP) { ++ if (!create) { ++ /* block isn't allocated yet and ++ * user don't want to allocate it */ ++ goto out2; ++ } ++ /* we should allocate requested block */ ++ } else if (goal == EXT3_EXT_CACHE_EXTENT) { ++ /* block is already allocated */ ++ newblock = iblock - newex.ee_block + newex.ee_start; ++ goto out; ++ } else { ++ EXT_ASSERT(0); ++ } ++ } ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(&tree, iblock, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ goto out2; ++ } ++ ++ depth = EXT_DEPTH(&tree); ++ ++ /* ++ * consistent leaf must not be empty ++ * this situations is possible, though, _during_ tree modification ++ * this is why assert can't be put in ext3_ext_find_extent() ++ */ ++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); ++ ++ if ((ex = path[depth].p_ext)) { ++ /* if found exent covers block, simple return it */ ++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { ++ newblock = iblock - ex->ee_block + ex->ee_start; ++ ext_debug(&tree, "%d fit into %d:%d -> %d\n", ++ (int) iblock, ex->ee_block, ex->ee_len, ++ newblock); ++ ext3_ext_put_in_cache(&tree, ex->ee_block, ++ ex->ee_len, ex->ee_start, ++ EXT3_EXT_CACHE_EXTENT); ++ goto out; ++ } ++ } ++ ++ /* ++ * requested block isn't allocated yet ++ * we couldn't try to create block if create flag is zero ++ */ ++ if (!create) { ++ /* put just found gap into cache to speedup subsequest reqs */ ++ ext3_ext_put_gap_in_cache(&tree, path, iblock); ++ goto out2; ++ } ++ ++ /* allocate new block */ ++ goal = 
ext3_ext_find_goal(inode, path, iblock); ++ newblock = ext3_new_block(handle, inode, goal, &err); ++ if (!newblock) ++ goto out2; ++ ext_debug(&tree, "allocate new block: goal %d, found %d\n", ++ goal, newblock); ++ ++ /* try to insert new extent into found leaf and return */ ++ newex.ee_block = iblock; ++ newex.ee_start = newblock; ++ newex.ee_start_hi = 0; ++ newex.ee_len = 1; ++ err = ext3_ext_insert_extent(handle, &tree, path, &newex); ++ if (err) ++ goto out2; ++ ++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ++ /* previous routine could use block we allocated */ ++ newblock = newex.ee_start; ++ set_buffer_new(bh_result); ++ ++ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, ++ newex.ee_start, EXT3_EXT_CACHE_EXTENT); ++out: ++ ext3_ext_show_leaf(&tree, path); ++ map_bh(bh_result, inode->i_sb, newblock); ++out2: ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ up(&EXT3_I(inode)->truncate_sem); ++ ++ return err; ++} ++ ++void ext3_ext_truncate(struct inode * inode, struct page *page) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct super_block *sb = inode->i_sb; ++ struct ext3_extents_tree tree; ++ unsigned long last_block; ++ handle_t *handle; ++ int err = 0; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ err = ext3_writepage_trans_blocks(inode) + 3; ++ handle = ext3_journal_start(inode, err); ++ if (IS_ERR(handle)) { ++ if (page) { ++ clear_highpage(page); ++ flush_dcache_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ return; ++ } ++ ++ if (page) ++ ext3_block_truncate_page(handle, page, mapping, inode->i_size); ++ ++ down(&EXT3_I(inode)->truncate_sem); ++ ext3_ext_invalidate_cache(&tree); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need not scaning at all, ++ * because page truncation is enough ++ */ ++ if 
(ext3_orphan_add(handle, inode)) ++ goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ last_block = (inode->i_size + sb->s_blocksize - 1) >> ++ EXT3_BLOCK_SIZE_BITS(sb); ++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. ++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ up(&EXT3_I(inode)->truncate_sem); ++ ext3_journal_stop(handle); ++} ++ ++/* ++ * this routine calculate max number of blocks we could modify ++ * in order to allocate new block for an inode ++ */ ++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) ++{ ++ struct ext3_extents_tree tree; ++ int needed; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); ++ ++ /* caller want to allocate num blocks */ ++ needed *= num; ++ ++#ifdef CONFIG_QUOTA ++ /* ++ * FIXME: real calculation should be here ++ * it depends on blockmap format of qouta file ++ */ ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return needed; ++} ++ ++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ext3_extent_tree_init(handle, &tree); ++} ++ ++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ return ext3_ext_calc_metadata_amount(&tree, 
blocks); ++} ++ ++static int ++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *newex) ++{ ++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; ++ ++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ if (buf->err < 0) ++ return EXT_BREAK; ++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) ++ return EXT_BREAK; ++ ++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { ++ buf->err++; ++ buf->cur += sizeof(*newex); ++ } else { ++ buf->err = -EFAULT; ++ return EXT_BREAK; ++ } ++ return EXT_CONTINUE; ++} ++ ++static int ++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *ex) ++{ ++ struct ext3_extent_tree_stats *buf = ++ (struct ext3_extent_tree_stats *) tree->private; ++ int depth; ++ ++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ depth = EXT_DEPTH(tree); ++ buf->extents_num++; ++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) ++ buf->leaf_num++; ++ return EXT_CONTINUE; ++} ++ ++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err = 0; ++ ++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) ++ return -EINVAL; ++ ++ if (cmd == EXT3_IOC_GET_EXTENTS) { ++ struct ext3_extent_buf buf; ++ struct ext3_extents_tree tree; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ ++ ext3_init_tree_desc(&tree, inode); ++ buf.cur = buf.buffer; ++ buf.err = 0; ++ tree.private = &buf; ++ down(&EXT3_I(inode)->truncate_sem); ++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, ++ ext3_ext_store_extent_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (err == 0) ++ err = buf.err; ++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { ++ struct ext3_extent_tree_stats buf; ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ 
down(&EXT3_I(inode)->truncate_sem); ++ buf.depth = EXT_DEPTH(&tree); ++ buf.extents_num = 0; ++ buf.leaf_num = 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, ++ ext3_ext_collect_stats_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (!err) ++ err = copy_to_user((void *) arg, &buf, sizeof(buf)); ++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { ++ struct ext3_extents_tree tree; ++ ext3_init_tree_desc(&tree, inode); ++ down(&EXT3_I(inode)->truncate_sem); ++ err = EXT_DEPTH(&tree); ++ up(&EXT3_I(inode)->truncate_sem); ++ } ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(ext3_init_tree_desc); ++EXPORT_SYMBOL(ext3_mark_inode_dirty); ++EXPORT_SYMBOL(ext3_ext_invalidate_cache); ++EXPORT_SYMBOL(ext3_ext_insert_extent); ++EXPORT_SYMBOL(ext3_ext_walk_space); ++EXPORT_SYMBOL(ext3_ext_find_goal); ++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); +Index: linux-2.6.12-rc6/fs/ext3/ialloc.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/ialloc.c 2005-06-14 16:31:08.634433030 +0200 ++++ linux-2.6.12-rc6/fs/ext3/ialloc.c 2005-06-14 16:31:25.846346882 +0200 +@@ -598,7 +598,7 @@ + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + +- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; ++ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); + if (S_ISLNK(mode)) + ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); + /* dirsync only applies to directories */ +@@ -639,6 +639,18 @@ + DQUOT_FREE_INODE(inode); + goto fail2; + } ++ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++ if (err) goto fail; ++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); ++ err = 
ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); ++ } ++ } ++ + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); +Index: linux-2.6.12-rc6/fs/ext3/inode.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/inode.c 2005-06-14 16:31:09.701815830 +0200 ++++ linux-2.6.12-rc6/fs/ext3/inode.c 2005-06-14 16:31:25.861971882 +0200 +@@ -40,7 +40,7 @@ + #include "iopen.h" + #include "acl.h" + +-static int ext3_writepage_trans_blocks(struct inode *inode); ++int ext3_writepage_trans_blocks(struct inode *inode); + + /* + * Test whether an inode is a fast symlink. +@@ -784,6 +784,17 @@ + return err; + } + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create, int extend_disksize) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create, ++ extend_disksize); ++ return ext3_get_block_handle(handle, inode, block, bh, create, ++ extend_disksize); ++} ++ + static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) + { +@@ -794,8 +805,8 @@ + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, +- bh_result, create, 1); ++ ret = ext3_get_block_wrap(handle, inode, iblock, ++ bh_result, create, 1); + return ret; + } + +@@ -839,7 +850,7 @@ + + get_block: + if (ret == 0) +- ret = ext3_get_block_handle(handle, inode, iblock, ++ ret = ext3_get_block_wrap(handle, inode, iblock, + bh_result, create, 0); + bh_result->b_size = (1 << inode->i_blkbits); + return ret; +@@ -859,7 +870,7 @@ + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); ++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); + if (!*errp && 
buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1593,7 +1604,7 @@ + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, struct page *page, ++int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) + { + unsigned long index = from >> PAGE_CACHE_SHIFT; +@@ -2104,6 +2115,9 @@ + return; + } + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode, page); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { +@@ -2850,12 +2864,15 @@ + * block and work out the exact number of indirects which are touched. Pah. + */ + +-static int ext3_writepage_trans_blocks(struct inode *inode) ++int ext3_writepage_trans_blocks(struct inode *inode) + { + int bpp = ext3_journal_blocks_per_page(inode); + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 
5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +Index: linux-2.6.12-rc6/fs/ext3/Makefile +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/Makefile 2005-06-14 16:31:09.179354899 +0200 ++++ linux-2.6.12-rc6/fs/ext3/Makefile 2005-06-14 16:31:25.872714069 +0200 +@@ -5,7 +5,8 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ +- ioctl.o namei.o super.o symlink.o hash.o resize.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o \ ++ extents.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-2.6.12-rc6/fs/ext3/super.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/super.c 2005-06-14 16:31:09.950839264 +0200 ++++ linux-2.6.12-rc6/fs/ext3/super.c 2005-06-14 16:31:25.886385944 +0200 +@@ -387,6 +387,7 @@ + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -451,6 +452,8 @@ + #endif + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; ++ ++ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); + return &ei->vfs_inode; + } + +@@ -593,6 +596,7 @@ + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, ++ Opt_extents, Opt_noextents, Opt_extdebug, + }; + + static match_table_t tokens = { +@@ -644,6 +647,9 @@ + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, ++ {Opt_extents, "extents"}, ++ {Opt_noextents, "noextents"}, ++ {Opt_extdebug, "extdebug"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + 
{Opt_resize, "resize"}, +@@ -953,6 +958,15 @@ + case Opt_nobh: + set_opt(sbi->s_mount_opt, NOBH); + break; ++ case Opt_extents: ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_noextents: ++ clear_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_extdebug: ++ set_opt (sbi->s_mount_opt, EXTDEBUG); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1668,6 +1681,7 @@ + percpu_counter_mod(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + ++ ext3_ext_init(sb); + lock_kernel(); + return 0; + +Index: linux-2.6.12-rc6/fs/ext3/ioctl.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/ioctl.c 2005-06-14 16:31:08.646151780 +0200 ++++ linux-2.6.12-rc6/fs/ext3/ioctl.c 2005-06-14 16:31:25.897128131 +0200 +@@ -124,6 +124,10 @@ + err = ext3_change_inode_journal_flag(inode, jflag); + return err; + } ++ case EXT3_IOC_GET_EXTENTS: ++ case EXT3_IOC_GET_TREE_STATS: ++ case EXT3_IOC_GET_TREE_DEPTH: ++ return ext3_ext_ioctl(inode, filp, cmd, arg); + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); +Index: linux-2.6.12-rc6/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_fs.h 2005-06-14 16:31:10.185214261 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_fs.h 2005-06-14 16:31:52.859041864 +0200 +@@ -186,8 +186,9 @@ + #define EXT3_NOTAIL_FL 0x00008000 /* don't merge file tail */ + #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ + #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +-#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ ++#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 
0x000380FF /* User modifiable flags */ + +@@ -237,6 +238,9 @@ + #endif + #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) + #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) ++#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) ++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) ++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) + + /* + * Structure of an inode on the disk +@@ -360,6 +364,8 @@ + #define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */ + #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -548,11 +554,13 @@ + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 ++#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ + + #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ +- EXT3_FEATURE_INCOMPAT_META_BG) ++ EXT3_FEATURE_INCOMPAT_META_BG| \ ++ EXT3_FEATURE_INCOMPAT_EXTENTS) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) +@@ -759,6 +767,9 @@ + + + /* inode.c */ ++extern int ext3_block_truncate_page(handle_t *, struct page *, ++ struct address_space *, loff_t); ++extern int ext3_writepage_trans_blocks(struct inode *inode); + extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +@@ -828,6 +837,16 @@ 
+ extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int, int); ++extern void ext3_ext_truncate(struct inode *, struct page *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); ++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); ++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); + + #endif /* __KERNEL__ */ + +Index: linux-2.6.12-rc6/include/linux/ext3_extents.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_extents.h 2005-06-14 16:31:25.780917195 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_extents.h 2005-06-14 16:31:25.932284381 +0200 +@@ -0,0 +1,262 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++#ifndef _LINUX_EXT3_EXTENTS ++#define _LINUX_EXT3_EXTENTS ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if CHECK_BINSEARCH defined, then results of binary search ++ * will be checked by linear search ++ */ ++#define CHECK_BINSEARCH_ ++ ++/* ++ * if EXT_DEBUG is defined you can use 'extdebug' mount option ++ * to get lots of info what's going on ++ */ ++#define EXT_DEBUG_ ++#ifdef EXT_DEBUG ++#define ext_debug(tree,fmt,a...) \ ++do { \ ++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0); ++#else ++#define ext_debug(tree,fmt,a...) ++#endif ++ ++/* ++ * if EXT_STATS is defined then stats numbers are collected ++ * these number will be displayed at umount time ++ */ ++#define EXT_STATS_ ++ ++ ++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 mean there is no tree yet. 
all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 ee_block; /* first logical block extent covers */ ++ __u16 ee_len; /* number of blocks covered by extent */ ++ __u16 ee_start_hi; /* high 16 bits of physical block */ ++ __u32 ee_start; /* low 32 bigs of physical block */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 ei_block; /* index covers logical blocks from 'block' */ ++ __u32 ei_leaf; /* pointer to the physical block of the next * ++ * level. leaf or next index could bet here */ ++ __u16 ei_leaf_hi; /* high 16 bits of physical block */ ++ __u16 ei_unused; ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 eh_magic; /* probably will support different formats */ ++ __u16 eh_entries; /* number of valid entries */ ++ __u16 eh_max; /* capacity of store in entries */ ++ __u16 eh_depth; /* has tree real underlaying blocks? 
*/ ++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ ++}; ++ ++#define EXT3_EXT_MAGIC 0xf30a ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++/* ++ * structure for external API ++ */ ++ ++/* ++ * storage for cached extent ++ */ ++struct ext3_ext_cache { ++ __u32 ec_start; ++ __u32 ec_block; ++ __u32 ec_len; ++ __u32 ec_type; ++}; ++ ++#define EXT3_EXT_CACHE_NO 0 ++#define EXT3_EXT_CACHE_GAP 1 ++#define EXT3_EXT_CACHE_EXTENT 2 ++ ++/* ++ * ext3_extents_tree is used to pass initial information ++ * to top-level extents API ++ */ ++struct ext3_extents_helpers; ++struct ext3_extents_tree { ++ struct inode *inode; /* inode which tree belongs to */ ++ void *root; /* ptr to data top of tree resides at */ ++ void *buffer; /* will be passed as arg to ^^ routines */ ++ int buffer_len; ++ void *private; ++ struct ext3_ext_cache *cex;/* last found extent */ ++ struct ext3_extents_helpers *ops; ++}; ++ ++struct ext3_extents_helpers { ++ int (*get_write_access)(handle_t *h, void *buffer); ++ int (*mark_buffer_dirty)(handle_t *h, void *buffer); ++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); ++ int (*remove_extent_credits)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*remove_extent)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*new_block)(handle_t *, struct ext3_extents_tree *, ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); ++}; ++ ++/* ++ * to be called by ext3_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext3_ext_walk_space(), see below ++ * callback must 
return valid extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, ++ struct ext3_ext_path *, ++ struct ext3_ext_cache *); ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++ ++#define EXT_MAX_BLOCK 0xffffffff ++ ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_HDR_GEN(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) ++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) ++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ ++ ++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) ++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) ++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) ++#define EXT_GENERATION(__tree__) EXT_HDR_GEN(EXT_ROOT_HDR(__tree__)) ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++#define EXT_CHECK_PATH(tree,path) \ ++{ \ ++ int depth = EXT_DEPTH(tree); \ ++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_idx < \ ++ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_ext < \ ++ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ 
++ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ ++ && depth != 0); \ ++ BUG_ON((path)[0].p_depth != depth); \ ++} ++ ++ ++/* ++ * this structure is used to gather extents from the tree via ioctl ++ */ ++struct ext3_extent_buf { ++ unsigned long start; ++ int buflen; ++ void *buffer; ++ void *cur; ++ int err; ++}; ++ ++/* ++ * this structure is used to collect stats info about the tree ++ */ ++struct ext3_extent_tree_stats { ++ int depth; ++ int extents_num; ++ int leaf_num; ++}; ++ ++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); ++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); ++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); ++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); ++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); ++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); ++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); ++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); ++ ++static inline void ++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) ++{ ++ if (tree->cex) ++ tree->cex->ec_type = EXT3_EXT_CACHE_NO; ++} ++ ++ ++#endif /* _LINUX_EXT3_EXTENTS */ +Index: linux-2.6.12-rc6/include/linux/ext3_fs_i.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_fs_i.h 2005-06-06 17:22:29.000000000 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_fs_i.h 2005-06-14 16:31:25.941073443 +0200 +@@ -133,6 +133,8 @@ + */ + struct semaphore truncate_sem; + struct inode vfs_inode; ++ ++ __u32 i_cached_extent[4]; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.15.patch 
b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.15.patch new file mode 100644 index 0000000..ea69e84 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.15.patch @@ -0,0 +1,2947 @@ +Index: linux-2.6.16.21-0.8/fs/ext3/extents.c +=================================================================== +--- /dev/null ++++ linux-2.6.16.21-0.8/fs/ext3/extents.c +@@ -0,0 +1,2359 @@ ++/* ++ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++/* ++ * Extents support for EXT3 ++ * ++ * TODO: ++ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() ++ * - ext3_ext_calc_credits() could take 'mergable' into account ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - smart tree reduction ++ * - arch-independence ++ * common on-disk format for big/little-endian arch ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++static inline int ext3_ext_check_header(struct ext3_extent_header *eh) ++{ ++ if (eh->eh_magic != EXT3_EXT_MAGIC) { ++ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", ++ (unsigned)eh->eh_magic); ++ return -EIO; ++ } ++ if (eh->eh_max == 0) { ++ printk(KERN_ERR 
"EXT3-fs: invalid eh_max = %u\n", ++ (unsigned)eh->eh_max); ++ return -EIO; ++ } ++ if (eh->eh_entries > eh->eh_max) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", ++ (unsigned)eh->eh_entries); ++ return -EIO; ++ } ++ return 0; ++} ++ ++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} ++ ++static int inline ++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->get_write_access) ++ return tree->ops->get_write_access(h,tree->buffer); ++ else ++ return 0; ++} ++ ++static int inline ++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->mark_buffer_dirty) ++ return tree->ops->mark_buffer_dirty(h,tree->buffer); ++ else ++ return 0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ ++ if (path->p_bh) { ++ /* path points to block */ ++ err = ext3_journal_get_write_access(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_get_access_for_root(handle, tree); ++ } ++ return err; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ if (path->p_bh) { ++ /* path points to block */ ++ err =ext3_journal_dirty_metadata(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_mark_root_dirty(handle, tree); ++ } ++ return err; ++} ++ ++static int inline ++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, struct ext3_extent 
*ex, ++ int *err) ++{ ++ int goal, depth, newblock; ++ struct inode *inode; ++ ++ EXT_ASSERT(tree); ++ if (tree->ops->new_block) ++ return tree->ops->new_block(handle, tree, path, ex, err); ++ ++ inode = tree->inode; ++ depth = EXT_DEPTH(tree); ++ if (path && depth > 0) { ++ goal = path[depth-1].p_block; ++ } else { ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ ++ bg_start = (ei->i_block_group * ++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ goal = bg_start + colour; ++ } ++ ++ newblock = ext3_new_block(handle, inode, goal, err); ++ return newblock; ++} ++ ++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) | ++ (EXT_HDR_GEN(neh) + 1); ++} ++ ++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 5; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ 
sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int k, l = path->p_depth; ++ ++ ext_debug(tree, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(tree, " %d->%d", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ } else if (path->p_ext) { ++ ext_debug(tree, " %d:%d:%d", ++ path->p_ext->ee_block, ++ path->p_ext->ee_len, ++ path->p_ext->ee_start); ++ } else ++ ext_debug(tree, " []"); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; ++ int i; ++ ++ if (!path) ++ return; ++ ++ eh = path[depth].p_hdr; ++ ex = EXT_FIRST_EXTENT(eh); ++ ++ for (i = 0; i < eh->eh_entries; i++, ex++) { ++ ext_debug(tree, "%d:%d:%d ", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_drop_refs(struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) { ++ if (path->p_bh) { ++ brelse(path->p_bh); ++ path->p_bh = NULL; ++ } ++ } ++} ++ ++/* ++ * binary search for closest index by given block ++ */ ++static inline void ++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent_idx *ix; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_entries > 0); ++ ++ ext_debug(tree, "binsearch for %d(idx): ", block); ++ ++ path->p_idx = ix = EXT_FIRST_INDEX(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ix[l + k].ei_block) ++ r -= k; ++ 
else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ix += l; ++ path->p_idx = ix; ++ ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf); ++ ++ while (l++ < r) { ++ if (block < ix->ei_block) ++ break; ++ path->p_idx = ix++; ++ } ++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent_idx *chix; ++ ++ chix = ix = EXT_FIRST_INDEX(eh); ++ for (k = 0; k < eh->eh_entries; k++, ix++) { ++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { ++ printk("k=%d, ix=0x%p, first=0x%p\n", k, ++ ix, EXT_FIRST_INDEX(eh)); ++ printk("%u <= %u\n", ++ ix->ei_block,ix[-1].ei_block); ++ } ++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); ++ if (block < ix->ei_block) ++ break; ++ chix = ix; ++ } ++ EXT_ASSERT(chix == path->p_idx); ++ } ++#endif ++} ++ ++/* ++ * binary search for closest extent by given block ++ */ ++static inline void ++ext3_ext_binsearch(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent *ex; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ ++ if (eh->eh_entries == 0) { ++ /* ++ * this leaf is empty yet: ++ * we get such a leaf in split/add case ++ */ ++ return; ++ } ++ ++ ext_debug(tree, "binsearch for %d: ", block); ++ ++ path->p_ext = ex = EXT_FIRST_EXTENT(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ex[l + k].ee_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ex += l; ++ path->p_ext = ex; ++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++ while (l++ < r) { ++ if (block < ex->ee_block) ++ break; ++ path->p_ext = ex++; ++ } ++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ 
++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent *chex; ++ ++ chex = ex = EXT_FIRST_EXTENT(eh); ++ for (k = 0; k < eh->eh_entries; k++, ex++) { ++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); ++ if (block < ex->ee_block) ++ break; ++ chex = ex; ++ } ++ EXT_ASSERT(chex == path->p_ext); ++ } ++#endif ++} ++ ++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *eh; ++ ++ BUG_ON(tree->buffer_len == 0); ++ ext3_ext_get_access_for_root(handle, tree); ++ eh = EXT_ROOT_HDR(tree); ++ eh->eh_depth = 0; ++ eh->eh_entries = 0; ++ eh->eh_magic = EXT3_EXT_MAGIC; ++ eh->eh_max = ext3_ext_space_root(tree); ++ ext3_ext_mark_root_dirty(handle, tree); ++ ext3_ext_invalidate_cache(tree); ++ return 0; ++} ++ ++struct ext3_ext_path * ++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ struct buffer_head *bh; ++ int depth, i, ppos = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ eh = EXT_ROOT_HDR(tree); ++ EXT_ASSERT(eh); ++ if (ext3_ext_check_header(eh)) { ++ /* don't free previously allocated path ++ * -- caller should take care */ ++ path = NULL; ++ goto err; ++ } ++ ++ i = depth = EXT_DEPTH(tree); ++ EXT_ASSERT(eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* account possible depth increase */ ++ if (!path) { ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), ++ GFP_NOFS); ++ if (!path) ++ return ERR_PTR(-ENOMEM); ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[0].p_hdr = eh; ++ ++ /* walk through the tree */ ++ while (i) { ++ ext_debug(tree, "depth %d: num %d, max %d\n", ++ ppos, eh->eh_entries, eh->eh_max); ++ ext3_ext_binsearch_idx(tree, path + ppos, block); ++ path[ppos].p_block = path[ppos].p_idx->ei_leaf; ++ path[ppos].p_depth = i; ++ path[ppos].p_ext = NULL; ++ ++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); ++ if 
(!bh) ++ goto err; ++ ++ eh = EXT_BLOCK_HDR(bh); ++ ppos++; ++ EXT_ASSERT(ppos <= depth); ++ path[ppos].p_bh = bh; ++ path[ppos].p_hdr = eh; ++ i--; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ } ++ ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ path[ppos].p_idx = NULL; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ ++ /* find extent */ ++ ext3_ext_binsearch(tree, path + ppos, block); ++ ++ ext3_ext_show_path(tree, path); ++ ++ return path; ++ ++err: ++ printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ return ERR_PTR(-EIO); ++} ++ ++/* ++ * insert new index [logical;ptr] into the block at cupr ++ * it check where to insert: before curp or after curp ++ */ ++static int ext3_ext_insert_index(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *curp, ++ int logical, int ptr) ++{ ++ struct ext3_extent_idx *ix; ++ int len, err; ++ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ return err; ++ ++ EXT_ASSERT(logical != curp->p_idx->ei_block); ++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; ++ if (logical > curp->p_idx->ei_block) { ++ /* insert after */ ++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d after: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ } ++ ix = curp->p_idx + 1; ++ } else { ++ /* insert before */ ++ len = len * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d before: %d. 
" ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->ei_block = logical; ++ ix->ei_leaf = ptr; ++ ix->ei_leaf_hi = ix->ei_unused = 0; ++ curp->p_hdr->eh_entries++; ++ ++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); ++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); ++ ++ err = ext3_ext_dirty(handle, tree, curp); ++ ext3_std_error(tree->inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at: ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extens and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ int i = at, k, m, a; ++ unsigned long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? */ ++ /* FIXME: now desicion is simplest: at current extent */ ++ ++ /* if current leaf will be splitted, then we should use ++ * border from split point */ ++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].ee_block; ++ ext_debug(tree, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->ee_block; ++ ext_debug(tree, "leaf will be added." 
++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. so, index won't ++ * be inserted and tree will be in consistent ++ * state. next mount will repair buffers too ++ */ ++ ++ /* ++ * get array to track all allocated blocks ++ * we need this to handle errors and free blocks ++ * upon them ++ */ ++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); ++ if (!ablocks) ++ return -ENOMEM; ++ memset(ablocks, 0, sizeof(unsigned long) * depth); ++ ++ /* allocate all needed blocks */ ++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); ++ for (a = 0; a < depth - at; a++) { ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ goto cleanup; ++ ablocks[a] = newblock; ++ } ++ ++ /* initialize new leaf */ ++ newblock = ablocks[--a]; ++ EXT_ASSERT(newblock); ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 0; ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_depth = 0; ++ ex = EXT_FIRST_EXTENT(neh); ++ ++ /* move remain of path[depth] to the new leaf */ ++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); ++ /* start copy from next extent */ ++ /* TODO: we could do it by single memmove */ ++ m = 0; ++ path[depth].p_ext++; ++ while (path[depth].p_ext <= ++ EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", ++ path[depth].p_ext->ee_block, ++ path[depth].p_ext->ee_start, ++ path[depth].p_ext->ee_len, ++ newblock); ++ memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent)); ++ neh->eh_entries++; ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ 
goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old leaf */ ++ if (m) { ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ path[depth].p_hdr->eh_entries -= m; ++ if ((err = ext3_ext_dirty(handle, tree, path + depth))) ++ goto cleanup; ++ ++ } ++ ++ /* create intermediate indexes */ ++ k = depth - at - 1; ++ EXT_ASSERT(k >= 0); ++ if (k) ++ ext_debug(tree, "create %d intermediate indices\n", k); ++ /* insert new index into current index block */ ++ /* current depth stored in i var */ ++ i = depth - 1; ++ while (k--) { ++ oldblock = newblock; ++ newblock = ablocks[--a]; ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 1; ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ neh->eh_depth = depth - i; ++ fidx = EXT_FIRST_INDEX(neh); ++ fidx->ei_block = border; ++ fidx->ei_leaf = oldblock; ++ fidx->ei_leaf_hi = fidx->ei_unused = 0; ++ ++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", ++ i, newblock, border, oldblock); ++ /* copy indexes */ ++ m = 0; ++ path[i].p_idx++; ++ ++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); ++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == ++ EXT_LAST_INDEX(path[i].p_hdr)); ++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ++ ext_debug(tree, "%d: move %d:%d in new index %lu\n", ++ i, path[i].p_idx->ei_block, ++ path[i].p_idx->ei_leaf, newblock); ++ memmove(++fidx, path[i].p_idx++, ++ sizeof(struct ext3_extent_idx)); ++ neh->eh_entries++; ++ EXT_ASSERT(neh->eh_entries <= neh->eh_max); ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old index */ ++ if (m) { ++ err = 
ext3_ext_get_access(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ path[i].p_hdr->eh_entries -= m; ++ err = ext3_ext_dirty(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ } ++ ++ i--; ++ } ++ ++ /* insert new index */ ++ if (!err) ++ err = ext3_ext_insert_index(handle, tree, path + at, ++ border, newblock); ++ ++cleanup: ++ if (bh) { ++ if (buffer_locked(bh)) ++ unlock_buffer(bh); ++ brelse(bh); ++ } ++ ++ if (err) { ++ /* free all allocated blocks in error case */ ++ for (i = 0; i < depth; i++) { ++ if (!ablocks[i]) ++ continue; ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ } ++ } ++ kfree(ablocks); ++ ++ return err; ++} ++ ++/* ++ * routine implements tree growing procedure: ++ * - allocates new block ++ * - moves top-level data (index block or leaf) into the new block ++ * - initialize new top-level, creating index that points to the ++ * just created block ++ */ ++static int ext3_ext_grow_indepth(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp = path; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct buffer_head *bh; ++ unsigned long newblock; ++ int err = 0; ++ ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ return err; ++ ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ ext3_std_error(tree->inode->i_sb, err); ++ return err; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) { ++ unlock_buffer(bh); ++ goto out; ++ } ++ ++ /* move top-level index/leaf into new block */ ++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); ++ ++ /* set size of new block */ ++ neh = EXT_BLOCK_HDR(bh); ++ /* old root could have indexes or leaves ++ * so calculate eh_max right way */ ++ if (EXT_DEPTH(tree)) ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ else ++ neh->eh_max = ext3_ext_space_block(tree); ++ 
neh->eh_magic = EXT3_EXT_MAGIC; ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto out; ++ ++ /* create index in new top-level index: num,max,pointer */ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ goto out; ++ ++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; ++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); ++ curp->p_hdr->eh_entries = 1; ++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); ++ /* FIXME: it works, but actually path[0] can be index */ ++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; ++ curp->p_idx->ei_leaf = newblock; ++ curp->p_idx->ei_leaf_hi = curp->p_idx->ei_unused = 0; ++ ++ neh = EXT_ROOT_HDR(tree); ++ fidx = EXT_FIRST_INDEX(neh); ++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", ++ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); ++ ++ neh->eh_depth = path->p_depth + 1; ++ err = ext3_ext_dirty(handle, tree, curp); ++out: ++ brelse(bh); ++ ++ return err; ++} ++ ++/* ++ * routine finds empty index and adds new leaf. 
if no free index found ++ * then it requests in-depth growing ++ */ ++static int ext3_ext_create_new_leaf(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp; ++ int depth, i, err = 0; ++ ++repeat: ++ i = depth = EXT_DEPTH(tree); ++ ++ /* walk up to the tree and look for free index entry */ ++ curp = path + depth; ++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { ++ i--; ++ curp--; ++ } ++ ++ /* we use already allocated block for index block ++ * so, subsequent data blocks should be contigoues */ ++ if (EXT_HAS_FREE_INDEX(curp)) { ++ /* if we found index with free entry, then use that ++ * entry: create all needed subtree and add new leaf */ ++ err = ext3_ext_split(handle, tree, path, newext, i); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ } else { ++ /* tree is full, time to grow in depth */ ++ err = ext3_ext_grow_indepth(handle, tree, path, newext); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ ++ /* ++ * only first (depth 0 -> 1) produces free space ++ * in all other cases we have to split growed tree ++ */ ++ depth = EXT_DEPTH(tree); ++ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { ++ /* now we need split */ ++ goto repeat; ++ } ++ } ++ ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++/* ++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK ++ * NOTE: it consider block number from index entry as ++ * allocated block. 
thus, index entries have to be consistent ++ * with leafs ++ */ ++static unsigned long ++ext3_ext_next_allocated_block(struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ if (depth == 0 && path->p_ext == NULL) ++ return EXT_MAX_BLOCK; ++ ++ /* FIXME: what if index isn't full ?! */ ++ while (depth >= 0) { ++ if (depth == path->p_depth) { ++ /* leaf */ ++ if (path[depth].p_ext != ++ EXT_LAST_EXTENT(path[depth].p_hdr)) ++ return path[depth].p_ext[1].ee_block; ++ } else { ++ /* index */ ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ } ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * returns first allocated block from next leaf or EXT_MAX_BLOCK ++ */ ++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ /* zero-tree has no leaf blocks at all */ ++ if (depth == 0) ++ return EXT_MAX_BLOCK; ++ ++ /* go to index block */ ++ depth--; ++ ++ while (depth >= 0) { ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * if leaf gets modified and modified extent is first in the leaf ++ * then we have to correct all indexes above ++ * TODO: do we need to correct tree in all cases? 
++ */ ++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent *ex; ++ unsigned long border; ++ int k, err = 0; ++ ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(eh); ++ ++ if (depth == 0) { ++ /* there is no tree at all */ ++ return 0; ++ } ++ ++ if (ex != EXT_FIRST_EXTENT(eh)) { ++ /* we correct tree if first leaf got modified only */ ++ return 0; ++ } ++ ++ /* ++ * TODO: we need correction if border is smaller then current one ++ */ ++ k = depth - 1; ++ border = path[depth].p_ext->ee_block; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ return err; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ return err; ++ ++ while (k--) { ++ /* change all left-side indexes */ ++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) ++ break; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ break; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ break; ++ } ++ ++ return err; ++} ++ ++static int inline ++ext3_can_extents_be_merged(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block) ++ return 0; ++ ++#ifdef AGRESSIVE_TEST ++ if (ex1->ee_len >= 4) ++ return 0; ++#endif ++ ++ if (!tree->ops->mergable) ++ return 1; ++ ++ return tree->ops->mergable(ex1, ex2); ++} ++ ++/* ++ * this routine tries to merge requsted extent into the existing ++ * extent or inserts requested extent as new one into the tree, ++ * creating new leaf in no-space case ++ */ ++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_extent_header * eh; ++ struct ext3_extent *ex, *fex; ++ struct ext3_extent 
*nearex; /* nearest extent */ ++ struct ext3_ext_path *npath = NULL; ++ int depth, len, err, next; ++ ++ EXT_ASSERT(newext->ee_len > 0); ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(path[depth].p_hdr); ++ ++ /* try to insert block into found extent and return */ ++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { ++ ext_debug(tree, "append %d block to %d:%d (from %d)\n", ++ newext->ee_len, ex->ee_block, ex->ee_len, ++ ex->ee_start); ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ return err; ++ ex->ee_len += newext->ee_len; ++ eh = path[depth].p_hdr; ++ nearex = ex; ++ goto merge; ++ } ++ ++repeat: ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) ++ goto has_space; ++ ++ /* probably next leaf has space for us? */ ++ fex = EXT_LAST_EXTENT(eh); ++ next = ext3_ext_next_leaf_block(tree, path); ++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { ++ ext_debug(tree, "next leaf block - %d\n", next); ++ EXT_ASSERT(!npath); ++ npath = ext3_ext_find_extent(tree, next, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ EXT_ASSERT(npath->p_depth == path->p_depth); ++ eh = npath[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) { ++ ext_debug(tree, "next leaf isnt full(%d)\n", ++ eh->eh_entries); ++ path = npath; ++ goto repeat; ++ } ++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", ++ eh->eh_entries, eh->eh_max); ++ } ++ ++ /* ++ * there is no free space in found leaf ++ * we're gonna add new leaf in the tree ++ */ ++ err = ext3_ext_create_new_leaf(handle, tree, path, newext); ++ if (err) ++ goto cleanup; ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ ++has_space: ++ nearex = path[depth].p_ext; ++ ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ ++ if (!nearex) { ++ /* there is no extent in this leaf, create first one */ ++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", ++ newext->ee_block, 
newext->ee_start, ++ newext->ee_len); ++ path[depth].p_ext = EXT_FIRST_EXTENT(eh); ++ } else if (newext->ee_block > nearex->ee_block) { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ if (nearex != EXT_LAST_EXTENT(eh)) { ++ len = EXT_MAX_EXTENT(eh) - nearex; ++ len = (len - 1) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 2, nearex + 1, len); ++ } ++ path[depth].p_ext = nearex + 1; ++ } else { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 1, nearex, len); ++ path[depth].p_ext = nearex; ++ } ++ ++ eh->eh_entries++; ++ nearex = path[depth].p_ext; ++ nearex->ee_block = newext->ee_block; ++ nearex->ee_start = newext->ee_start; ++ nearex->ee_len = newext->ee_len; ++ /* FIXME: support for large fs */ ++ nearex->ee_start_hi = 0; ++ ++merge: ++ /* try to merge extents to the right */ ++ while (nearex < EXT_LAST_EXTENT(eh)) { ++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) ++ break; ++ /* merge with next extent! 
*/ ++ nearex->ee_len += nearex[1].ee_len; ++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { ++ len = (EXT_LAST_EXTENT(eh) - nearex - 1) * ++ sizeof(struct ext3_extent); ++ memmove(nearex + 1, nearex + 2, len); ++ } ++ eh->eh_entries--; ++ EXT_ASSERT(eh->eh_entries > 0); ++ } ++ ++ /* try to merge extents to the left */ ++ ++ /* time to correct all indexes above */ ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ if (err) ++ goto cleanup; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ ++cleanup: ++ if (npath) { ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ } ++ ext3_ext_tree_changed(tree); ++ ext3_ext_invalidate_cache(tree); ++ return err; ++} ++ ++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, ++ unsigned long num, ext_prepare_callback func) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_ext_cache cbex; ++ struct ext3_extent *ex; ++ unsigned long next, start = 0, end = 0; ++ unsigned long last = block + num; ++ int depth, exists, err = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(func); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ while (block < last && block != EXT_MAX_BLOCK) { ++ num = last - block; ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(tree, block, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ break; ++ } ++ ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(path[depth].p_hdr); ++ ex = path[depth].p_ext; ++ next = ext3_ext_next_allocated_block(path); ++ ++ exists = 0; ++ if (!ex) { ++ /* there is no extent yet, so try to allocate ++ * all requested space */ ++ start = block; ++ end = block + num; ++ } else if (ex->ee_block > block) { ++ /* need to allocate space before found extent */ ++ start = block; ++ end = ex->ee_block; ++ if (block + num < end) ++ end = block + num; ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ /* need to allocate space after found extent */ ++ start = block; ++ end = block + num; ++ if (end >= next) ++ end = next; 
++ } else if (block >= ex->ee_block) { ++ /* ++ * some part of requested space is covered ++ * by found extent ++ */ ++ start = block; ++ end = ex->ee_block + ex->ee_len; ++ if (block + num < end) ++ end = block + num; ++ exists = 1; ++ } else { ++ BUG(); ++ } ++ EXT_ASSERT(end > start); ++ ++ if (!exists) { ++ cbex.ec_block = start; ++ cbex.ec_len = end - start; ++ cbex.ec_start = 0; ++ cbex.ec_type = EXT3_EXT_CACHE_GAP; ++ } else { ++ cbex.ec_block = ex->ee_block; ++ cbex.ec_len = ex->ee_len; ++ cbex.ec_start = ex->ee_start; ++ cbex.ec_type = EXT3_EXT_CACHE_EXTENT; ++ } ++ ++ EXT_ASSERT(cbex.ec_len > 0); ++ EXT_ASSERT(path[depth].p_hdr); ++ err = func(tree, path, &cbex); ++ ext3_ext_drop_refs(path); ++ ++ if (err < 0) ++ break; ++ if (err == EXT_REPEAT) ++ continue; ++ else if (err == EXT_BREAK) { ++ err = 0; ++ break; ++ } ++ ++ if (EXT_DEPTH(tree) != depth) { ++ /* depth was changed. we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ block = cbex.ec_block + cbex.ec_len; ++ } ++ ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ return err; ++} ++ ++static inline void ++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, ++ __u32 len, __u32 start, int type) ++{ ++ EXT_ASSERT(len > 0); ++ if (tree->cex) { ++ tree->cex->ec_type = type; ++ tree->cex->ec_block = block; ++ tree->cex->ec_len = len; ++ tree->cex->ec_start = start; ++ } ++} ++ ++/* ++ * this routine calculate boundaries of the gap requested block fits into ++ * and cache this gap ++ */ ++static inline void ++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ unsigned long block) ++{ ++ int depth = EXT_DEPTH(tree); ++ unsigned long lblock, len; ++ struct ext3_extent *ex; ++ ++ if (!tree->cex) ++ return; ++ ++ ex = path[depth].p_ext; ++ if (ex == NULL) { ++ /* there is no extent yet, so gap is [0;-] */ ++ lblock = 0; ++ len = EXT_MAX_BLOCK; ++ ext_debug(tree, "cache gap(whole file):"); ++ } else if (block < 
ex->ee_block) { ++ lblock = block; ++ len = ex->ee_block - block; ++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len); ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ lblock = ex->ee_block + ex->ee_len; ++ len = ext3_ext_next_allocated_block(path); ++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) block); ++ EXT_ASSERT(len > lblock); ++ len = len - lblock; ++ } else { ++ lblock = len = 0; ++ BUG(); ++ } ++ ++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len); ++ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP); ++} ++ ++static inline int ++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, ++ struct ext3_extent *ex) ++{ ++ struct ext3_ext_cache *cex = tree->cex; ++ ++ /* is there cache storage at all? */ ++ if (!cex) ++ return EXT3_EXT_CACHE_NO; ++ ++ /* has cache valid data? */ ++ if (cex->ec_type == EXT3_EXT_CACHE_NO) ++ return EXT3_EXT_CACHE_NO; ++ ++ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || ++ cex->ec_type == EXT3_EXT_CACHE_EXTENT); ++ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { ++ ex->ee_block = cex->ec_block; ++ ex->ee_start = cex->ec_start; ++ ex->ee_start_hi = 0; ++ ex->ee_len = cex->ec_len; ++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) ex->ee_start); ++ return cex->ec_type; ++ } ++ ++ /* not in cache */ ++ return EXT3_EXT_CACHE_NO; ++} ++ ++/* ++ * routine removes index from the index block ++ * it's used in truncate case only. 
thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->eh_entries); ++ if ((err = ext3_ext_get_access(handle, tree, path))) ++ return err; ++ path->p_hdr->eh_entries--; ++ if ((err = ext3_ext_dirty(handle, tree, path))) ++ return err; ++ ext_debug(tree, "index is empty, remove it, free block %d\n", ++ path->p_idx->ei_leaf); ++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ return err; ++} ++ ++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth = EXT_DEPTH(tree); ++ int needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) ++ return 1; ++ } ++ ++ /* ++ * the worste case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ depth = depth + 1; ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. 
we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ return needed; ++} ++ ++static int ++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, tex; ++ struct ext3_ext_path *npath; ++ int depth, creds, err; ++ ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); ++ EXT_ASSERT(ex->ee_block < start); ++ ++ /* calculate tail extent */ ++ tex.ee_block = end + 1; ++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); ++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; ++ ++ creds = ext3_ext_calc_credits_for_insert(tree, path); ++ handle = ext3_ext_journal_restart(handle, creds); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ /* calculate head extent. use primary extent */ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ return err; ++ ex->ee_len = start - ex->ee_block; ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ return err; ++ ++ /* FIXME: some callback to free underlying resource ++ * and correct ee_start? 
*/ ++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", ++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); ++ ++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); ++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); ++ ++ err = ext3_ext_insert_extent(handle, tree, npath, &tex); ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ ++ return err; ++} ++ ++static int ++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, *fu = NULL, *lu, *le; ++ int err = 0, correct_index = 0; ++ int depth = EXT_DEPTH(tree), credits; ++ struct ext3_extent_header *eh; ++ unsigned a, b, block, num; ++ ++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); ++ if (!path[depth].p_hdr) ++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); ++ eh = path[depth].p_hdr; ++ EXT_ASSERT(eh); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* find where to start removing */ ++ le = ex = EXT_LAST_EXTENT(eh); ++ while (ex != EXT_FIRST_EXTENT(eh)) { ++ if (ex->ee_block <= end) ++ break; ++ ex--; ++ } ++ ++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { ++ /* removal of internal part of the extent requested ++ * tail and head must be placed in different extent ++ * so, we have to insert one more extent */ ++ path[depth].p_ext = ex; ++ return ext3_ext_split_for_rm(handle, tree, path, start, end); ++ } ++ ++ lu = ex; ++ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) { ++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); ++ path[depth].p_ext = ex; ++ ++ a = ex->ee_block > start ? ex->ee_block : start; ++ b = ex->ee_block + ex->ee_len - 1 < end ? 
++ ex->ee_block + ex->ee_len - 1 : end; ++ ++ ext_debug(tree, " border %u:%u\n", a, b); ++ ++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { ++ block = 0; ++ num = 0; ++ BUG(); ++ } else if (a != ex->ee_block) { ++ /* remove tail of the extent */ ++ block = ex->ee_block; ++ num = a - block; ++ } else if (b != ex->ee_block + ex->ee_len - 1) { ++ /* remove head of the extent */ ++ block = a; ++ num = b - a; ++ } else { ++ /* remove whole extent: excelent! */ ++ block = ex->ee_block; ++ num = 0; ++ EXT_ASSERT(a == ex->ee_block && ++ b == ex->ee_block + ex->ee_len - 1); ++ } ++ ++ if (ex == EXT_FIRST_EXTENT(eh)) ++ correct_index = 1; ++ ++ credits = 1; ++ if (correct_index) ++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; ++ if (tree->ops->remove_extent_credits) ++ credits+=tree->ops->remove_extent_credits(tree,ex,a,b); ++ ++ handle = ext3_ext_journal_restart(handle, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ goto out; ++ } ++ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ if (tree->ops->remove_extent) ++ err = tree->ops->remove_extent(tree, ex, a, b); ++ if (err) ++ goto out; ++ ++ if (num == 0) { ++ /* this extent is removed entirely mark slot unused */ ++ ex->ee_start = ex->ee_start_hi = 0; ++ eh->eh_entries--; ++ fu = ex; ++ } ++ ++ ex->ee_block = block; ++ ex->ee_len = num; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ ext_debug(tree, "new extent: %u:%u:%u\n", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ ex--; ++ } ++ ++ if (fu) { ++ /* reuse unused slots */ ++ while (lu < le) { ++ if (lu->ee_start) { ++ *fu = *lu; ++ lu->ee_start = lu->ee_start_hi = 0; ++ fu++; ++ } ++ lu++; ++ } ++ } ++ ++ if (correct_index && eh->eh_entries) ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ ++ /* if this leaf is free, then we should ++ * remove it from index block above */ ++ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) ++ 
err = ext3_ext_rm_idx(handle, tree, path + depth); ++ ++out: ++ return err; ++} ++ ++ ++static struct ext3_extent_idx * ++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) ++{ ++ struct ext3_extent_idx *ix; ++ ++ ix = EXT_LAST_INDEX(hdr); ++ while (ix != EXT_FIRST_INDEX(hdr)) { ++ if (ix->ei_block <= block) ++ break; ++ ix--; ++ } ++ return ix; ++} ++ ++/* ++ * returns 1 if current index have to be freed (even partial) ++ */ ++static int inline ++ext3_ext_more_to_rm(struct ext3_ext_path *path) ++{ ++ EXT_ASSERT(path->p_idx); ++ ++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) ++ return 0; ++ ++ /* ++ * if truncate on deeper level happened it it wasn't partial ++ * so we have to consider current index for truncation ++ */ ++ if (path->p_hdr->eh_entries == path->p_block) ++ return 0; ++ return 1; ++} ++ ++int ext3_ext_remove_space(struct ext3_extents_tree *tree, ++ unsigned long start, unsigned long end) ++{ ++ struct inode *inode = tree->inode; ++ struct super_block *sb = inode->i_sb; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_ext_path *path; ++ handle_t *handle; ++ int i = 0, err = 0; ++ ++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); ++ ++ /* probably first extent we're gonna free will be last in block */ ++ handle = ext3_journal_start(inode, depth + 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ ext3_ext_invalidate_cache(tree); ++ ++ /* ++ * we start scanning from right side freeing all the blocks ++ * after i_size and walking into the deep ++ */ ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); ++ if (IS_ERR(path)) { ++ ext3_error(sb, __FUNCTION__, "Can't allocate path array"); ++ ext3_journal_stop(handle); ++ return -ENOMEM; ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[i].p_hdr = EXT_ROOT_HDR(tree); ++ ++ while (i >= 0 && err == 0) { ++ if (i == depth) { ++ /* this is leaf block */ ++ err = ext3_ext_rm_leaf(handle, tree, path, start, end); ++ 
/* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ continue; ++ } ++ ++ /* this is index block */ ++ if (!path[i].p_hdr) { ++ ext_debug(tree, "initialize header\n"); ++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); ++ } ++ ++ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); ++ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); ++ ++ if (!path[i].p_idx) { ++ /* this level hasn't touched yet */ ++ path[i].p_idx = ++ ext3_ext_last_covered(path[i].p_hdr, end); ++ path[i].p_block = path[i].p_hdr->eh_entries + 1; ++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", ++ path[i].p_hdr, path[i].p_hdr->eh_entries); ++ } else { ++ /* we've already was here, see at next index */ ++ path[i].p_idx--; ++ } ++ ++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", ++ i, EXT_FIRST_INDEX(path[i].p_hdr), ++ path[i].p_idx); ++ if (ext3_ext_more_to_rm(path + i)) { ++ /* go to the next level */ ++ ext_debug(tree, "move to level %d (block %d)\n", ++ i + 1, path[i].p_idx->ei_leaf); ++ memset(path + i + 1, 0, sizeof(*path)); ++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); ++ if (!path[i+1].p_bh) { ++ /* should we reset i_size? 
*/ ++ err = -EIO; ++ break; ++ } ++ /* put actual number of indexes to know is this ++ * number got changed at the next iteration */ ++ path[i].p_block = path[i].p_hdr->eh_entries; ++ i++; ++ } else { ++ /* we finish processing this index, go up */ ++ if (path[i].p_hdr->eh_entries == 0 && i > 0) { ++ /* index is empty, remove it ++ * handle must be already prepared by the ++ * truncatei_leaf() */ ++ err = ext3_ext_rm_idx(handle, tree, path + i); ++ } ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ ext_debug(tree, "return to level %d\n", i); ++ } ++ } ++ ++ /* TODO: flexible tree reduction should be here */ ++ if (path->p_hdr->eh_entries == 0) { ++ /* ++ * truncate to zero freed all the tree ++ * so, we need to correct eh_depth ++ */ ++ err = ext3_ext_get_access(handle, tree, path); ++ if (err == 0) { ++ EXT_ROOT_HDR(tree)->eh_depth = 0; ++ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree); ++ err = ext3_ext_dirty(handle, tree, path); ++ } ++ } ++ ext3_ext_tree_changed(tree); ++ ++ kfree(path); ++ ext3_journal_stop(handle); ++ ++ return err; ++} ++ ++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks) ++{ ++ int lcap, icap, rcap, leafs, idxs, num; ++ ++ rcap = ext3_ext_space_root(tree); ++ if (blocks <= rcap) { ++ /* all extents fit to the root */ ++ return 0; ++ } ++ ++ rcap = ext3_ext_space_root_idx(tree); ++ lcap = ext3_ext_space_block(tree); ++ icap = ext3_ext_space_block_idx(tree); ++ ++ num = leafs = (blocks + lcap - 1) / lcap; ++ if (leafs <= rcap) { ++ /* all pointers to leafs fit to the root */ ++ return leafs; ++ } ++ ++ /* ok. 
we need separate index block(s) to link all leaf blocks */ ++ idxs = (leafs + icap - 1) / icap; ++ do { ++ num += idxs; ++ idxs = (idxs + icap - 1) / icap; ++ } while (idxs > rcap); ++ ++ return num; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) { ++ printk("EXT3-fs: file extents enabled"); ++#ifdef AGRESSIVE_TEST ++ printk(", agressive tests"); ++#endif ++#ifdef CHECK_BINSEARCH ++ printk(", check binsearch"); ++#endif ++ printk("\n"); ++ } ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++} ++ ++/************************************************************************ ++ * VFS related routines ++ ************************************************************************/ ++ ++static int ext3_get_inode_write_access(handle_t *handle, void *buffer) ++{ ++ /* we use in-core data, not bh */ ++ return 0; ++} ++ ++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct inode *inode = buffer; ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static int ext3_ext_mergable(struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ /* FIXME: support for large fs */ ++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) ++ return 1; ++ return 0; ++} ++ ++static int ++ext3_remove_blocks_credits(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed; ++ ++ /* at present, extent can't cross block group */; ++ needed = 4; /* bitmap + group desc + sb + inode */ ++ ++#ifdef CONFIG_QUOTA ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ return needed; ++} ++ ++static int ++ext3_remove_blocks(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed = ext3_remove_blocks_credits(tree, ex, from, to); ++ handle_t *handle = 
ext3_journal_start(tree->inode, needed); ++ struct buffer_head *bh; ++ int i; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { ++ /* tail removal */ ++ unsigned long num, start; ++ num = ex->ee_block + ex->ee_len - from; ++ start = ex->ee_start + ex->ee_len - num; ++ ext_debug(tree, "free last %lu blocks starting %lu\n", ++ num, start); ++ for (i = 0; i < num; i++) { ++ bh = sb_find_get_block(tree->inode->i_sb, start + i); ++ ext3_forget(handle, 0, tree->inode, bh, start + i); ++ } ++ ext3_free_blocks(handle, tree->inode, start, num); ++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { ++ printk("strange request: removal %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } else { ++ printk("strange request: removal(2) %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } ++ ext3_journal_stop(handle); ++ return 0; ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, ++ struct ext3_ext_path *path, unsigned long block) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ struct ext3_extent *ex; ++ depth = path->p_depth; ++ ++ /* try to predict block placement */ ++ if ((ex = path[depth].p_ext)) ++ return ex->ee_start + (block - ex->ee_block); ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. 
use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour + block; ++} ++ ++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int *err) ++{ ++ struct inode *inode = tree->inode; ++ int newblock, goal; ++ ++ EXT_ASSERT(path); ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->ee_start); ++ EXT_ASSERT(ex->ee_len); ++ ++ /* reuse block from the extent to order data/metadata */ ++ newblock = ex->ee_start++; ++ ex->ee_len--; ++ if (ex->ee_len == 0) { ++ ex->ee_len = 1; ++ /* allocate new block for the extent */ ++ goal = ext3_ext_find_goal(inode, path, ex->ee_block); ++ ex->ee_start = ext3_new_block(handle, inode, goal, err); ++ ex->ee_start_hi = 0; ++ if (ex->ee_start == 0) { ++ /* error occured: restore old extent */ ++ ex->ee_start = newblock; ++ return 0; ++ } ++ } ++ return newblock; ++} ++ ++static struct ext3_extents_helpers ext3_blockmap_helpers = { ++ .get_write_access = ext3_get_inode_write_access, ++ .mark_buffer_dirty = ext3_mark_buffer_dirty, ++ .mergable = ext3_ext_mergable, ++ .new_block = ext3_new_block_cb, ++ .remove_extent = ext3_remove_blocks, ++ .remove_extent_credits = ext3_remove_blocks_credits, ++}; ++ ++void ext3_init_tree_desc(struct ext3_extents_tree *tree, ++ struct inode *inode) ++{ ++ tree->inode = inode; ++ tree->root = (void *) EXT3_I(inode)->i_data; ++ tree->buffer = (void *) inode; ++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); ++ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; ++ tree->ops = &ext3_blockmap_helpers; ++} ++ ++int ext3_ext_get_block(handle_t *handle, struct inode *inode, ++ long iblock, struct buffer_head *bh_result, ++ int create, int extend_disksize) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent 
newex; ++ struct ext3_extent *ex; ++ int goal, newblock, err = 0, depth; ++ struct ext3_extents_tree tree; ++ ++ clear_buffer_new(bh_result); ++ ext3_init_tree_desc(&tree, inode); ++ ext_debug(&tree, "block %d requested for inode %u\n", ++ (int) iblock, (unsigned) inode->i_ino); ++ down(&EXT3_I(inode)->truncate_sem); ++ ++ /* check in cache */ ++ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { ++ if (goal == EXT3_EXT_CACHE_GAP) { ++ if (!create) { ++ /* block isn't allocated yet and ++ * user don't want to allocate it */ ++ goto out2; ++ } ++ /* we should allocate requested block */ ++ } else if (goal == EXT3_EXT_CACHE_EXTENT) { ++ /* block is already allocated */ ++ newblock = iblock - newex.ee_block + newex.ee_start; ++ goto out; ++ } else { ++ EXT_ASSERT(0); ++ } ++ } ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(&tree, iblock, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ goto out2; ++ } ++ ++ depth = EXT_DEPTH(&tree); ++ ++ /* ++ * consistent leaf must not be empty ++ * this situations is possible, though, _during_ tree modification ++ * this is why assert can't be put in ext3_ext_find_extent() ++ */ ++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); ++ ++ if ((ex = path[depth].p_ext)) { ++ /* if found exent covers block, simple return it */ ++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { ++ newblock = iblock - ex->ee_block + ex->ee_start; ++ ext_debug(&tree, "%d fit into %d:%d -> %d\n", ++ (int) iblock, ex->ee_block, ex->ee_len, ++ newblock); ++ ext3_ext_put_in_cache(&tree, ex->ee_block, ++ ex->ee_len, ex->ee_start, ++ EXT3_EXT_CACHE_EXTENT); ++ goto out; ++ } ++ } ++ ++ /* ++ * requested block isn't allocated yet ++ * we couldn't try to create block if create flag is zero ++ */ ++ if (!create) { ++ /* put just found gap into cache to speedup subsequest reqs */ ++ ext3_ext_put_gap_in_cache(&tree, path, iblock); ++ goto out2; ++ } ++ ++ /* allocate new block */ ++ goal = 
ext3_ext_find_goal(inode, path, iblock); ++ newblock = ext3_new_block(handle, inode, goal, &err); ++ if (!newblock) ++ goto out2; ++ ext_debug(&tree, "allocate new block: goal %d, found %d\n", ++ goal, newblock); ++ ++ /* try to insert new extent into found leaf and return */ ++ newex.ee_block = iblock; ++ newex.ee_start = newblock; ++ newex.ee_start_hi = 0; ++ newex.ee_len = 1; ++ err = ext3_ext_insert_extent(handle, &tree, path, &newex); ++ if (err) ++ goto out2; ++ ++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ++ /* previous routine could use block we allocated */ ++ newblock = newex.ee_start; ++ set_buffer_new(bh_result); ++ ++ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, ++ newex.ee_start, EXT3_EXT_CACHE_EXTENT); ++out: ++ ext3_ext_show_leaf(&tree, path); ++ map_bh(bh_result, inode->i_sb, newblock); ++out2: ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ up(&EXT3_I(inode)->truncate_sem); ++ ++ return err; ++} ++ ++void ext3_ext_truncate(struct inode * inode, struct page *page) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct super_block *sb = inode->i_sb; ++ struct ext3_extents_tree tree; ++ unsigned long last_block; ++ handle_t *handle; ++ int err = 0; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ err = ext3_writepage_trans_blocks(inode) + 3; ++ handle = ext3_journal_start(inode, err); ++ if (IS_ERR(handle)) { ++ if (page) { ++ clear_highpage(page); ++ flush_dcache_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ return; ++ } ++ ++ if (page) ++ ext3_block_truncate_page(handle, page, mapping, inode->i_size); ++ ++ down(&EXT3_I(inode)->truncate_sem); ++ ext3_ext_invalidate_cache(&tree); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need not scaning at all, ++ * because page truncation is enough ++ */ ++ if 
(ext3_orphan_add(handle, inode)) ++ goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ last_block = (inode->i_size + sb->s_blocksize - 1) >> ++ EXT3_BLOCK_SIZE_BITS(sb); ++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. ++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ up(&EXT3_I(inode)->truncate_sem); ++ ext3_journal_stop(handle); ++} ++ ++/* ++ * this routine calculate max number of blocks we could modify ++ * in order to allocate new block for an inode ++ */ ++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) ++{ ++ struct ext3_extents_tree tree; ++ int needed; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); ++ ++ /* caller want to allocate num blocks */ ++ needed *= num; ++ ++#ifdef CONFIG_QUOTA ++ /* ++ * FIXME: real calculation should be here ++ * it depends on blockmap format of qouta file ++ */ ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return needed; ++} ++ ++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ext3_extent_tree_init(handle, &tree); ++} ++ ++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ return ext3_ext_calc_metadata_amount(&tree, 
blocks); ++} ++ ++static int ++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *newex) ++{ ++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; ++ ++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ if (buf->err < 0) ++ return EXT_BREAK; ++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) ++ return EXT_BREAK; ++ ++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { ++ buf->err++; ++ buf->cur += sizeof(*newex); ++ } else { ++ buf->err = -EFAULT; ++ return EXT_BREAK; ++ } ++ return EXT_CONTINUE; ++} ++ ++static int ++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *ex) ++{ ++ struct ext3_extent_tree_stats *buf = ++ (struct ext3_extent_tree_stats *) tree->private; ++ int depth; ++ ++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ depth = EXT_DEPTH(tree); ++ buf->extents_num++; ++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) ++ buf->leaf_num++; ++ return EXT_CONTINUE; ++} ++ ++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err = 0; ++ ++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) ++ return -EINVAL; ++ ++ if (cmd == EXT3_IOC_GET_EXTENTS) { ++ struct ext3_extent_buf buf; ++ struct ext3_extents_tree tree; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ ++ ext3_init_tree_desc(&tree, inode); ++ buf.cur = buf.buffer; ++ buf.err = 0; ++ tree.private = &buf; ++ down(&EXT3_I(inode)->truncate_sem); ++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, ++ ext3_ext_store_extent_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (err == 0) ++ err = buf.err; ++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { ++ struct ext3_extent_tree_stats buf; ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ 
down(&EXT3_I(inode)->truncate_sem); ++ buf.depth = EXT_DEPTH(&tree); ++ buf.extents_num = 0; ++ buf.leaf_num = 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, ++ ext3_ext_collect_stats_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (!err) ++ err = copy_to_user((void *) arg, &buf, sizeof(buf)); ++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { ++ struct ext3_extents_tree tree; ++ ext3_init_tree_desc(&tree, inode); ++ down(&EXT3_I(inode)->truncate_sem); ++ err = EXT_DEPTH(&tree); ++ up(&EXT3_I(inode)->truncate_sem); ++ } ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(ext3_init_tree_desc); ++EXPORT_SYMBOL(ext3_mark_inode_dirty); ++EXPORT_SYMBOL(ext3_ext_invalidate_cache); ++EXPORT_SYMBOL(ext3_ext_insert_extent); ++EXPORT_SYMBOL(ext3_ext_walk_space); ++EXPORT_SYMBOL(ext3_ext_find_goal); ++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); +Index: linux-2.6.16.21-0.8/fs/ext3/ialloc.c +=================================================================== +--- linux-2.6.16.21-0.8.orig/fs/ext3/ialloc.c ++++ linux-2.6.16.21-0.8/fs/ext3/ialloc.c +@@ -598,7 +598,7 @@ got: + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + +- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; ++ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); + if (S_ISLNK(mode)) + ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); + /* dirsync only applies to directories */ +@@ -642,6 +642,18 @@ got: + if (err) + goto fail_free_drop; + ++ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++ if (err) goto fail; ++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); ++ } ++ } 
++ + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); +Index: linux-2.6.16.21-0.8/fs/ext3/inode.c +=================================================================== +--- linux-2.6.16.21-0.8.orig/fs/ext3/inode.c ++++ linux-2.6.16.21-0.8/fs/ext3/inode.c +@@ -40,7 +40,7 @@ + #include "iopen.h" + #include "acl.h" + +-static int ext3_writepage_trans_blocks(struct inode *inode); ++int ext3_writepage_trans_blocks(struct inode *inode); + + /* + * Test whether an inode is a fast symlink. +@@ -788,6 +788,17 @@ out: + return err; + } + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create, int extend_disksize) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create, ++ extend_disksize); ++ return ext3_get_block_handle(handle, inode, block, bh, create, ++ extend_disksize); ++} ++ + static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) + { +@@ -798,8 +809,8 @@ static int ext3_get_block(struct inode * + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, +- bh_result, create, 1); ++ ret = ext3_get_block_wrap(handle, inode, iblock, ++ bh_result, create, 1); + return ret; + } + +@@ -843,7 +854,7 @@ ext3_direct_io_get_blocks(struct inode * + + get_block: + if (ret == 0) +- ret = ext3_get_block_handle(handle, inode, iblock, ++ ret = ext3_get_block_wrap(handle, inode, iblock, + bh_result, create, 0); + bh_result->b_size = (1 << inode->i_blkbits); + return ret; +@@ -863,7 +874,7 @@ struct buffer_head *ext3_getblk(handle_t + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); ++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); + if (!*errp && buffer_mapped(&dummy)) 
{ + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1606,7 +1617,7 @@ void ext3_set_aops(struct inode *inode) + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, struct page *page, ++int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) + { + unsigned long index = from >> PAGE_CACHE_SHIFT; +@@ -2116,6 +2127,9 @@ void ext3_truncate(struct inode * inode) + return; + } + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode, page); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { +@@ -2863,12 +2877,15 @@ err_out: + * block and work out the exact number of indirects which are touched. Pah. + */ + +-static int ext3_writepage_trans_blocks(struct inode *inode) ++int ext3_writepage_trans_blocks(struct inode *inode) + { + int bpp = ext3_journal_blocks_per_page(inode); + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 
5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +Index: linux-2.6.16.21-0.8/fs/ext3/Makefile +=================================================================== +--- linux-2.6.16.21-0.8.orig/fs/ext3/Makefile ++++ linux-2.6.16.21-0.8/fs/ext3/Makefile +@@ -5,7 +5,8 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ +- ioctl.o namei.o super.o symlink.o hash.o resize.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o \ ++ extents.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-2.6.16.21-0.8/fs/ext3/super.c +=================================================================== +--- linux-2.6.16.21-0.8.orig/fs/ext3/super.c ++++ linux-2.6.16.21-0.8/fs/ext3/super.c +@@ -392,6 +392,7 @@ static void ext3_put_super (struct super + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -456,6 +457,8 @@ static struct inode *ext3_alloc_inode(st + #endif + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; ++ ++ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); + return &ei->vfs_inode; + } + +@@ -638,6 +641,7 @@ enum { + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, ++ Opt_extents, Opt_noextents, Opt_extdebug, + Opt_grpquota + }; + +@@ -689,6 +693,9 @@ static match_table_t tokens = { + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, ++ {Opt_extents, "extents"}, ++ {Opt_noextents, "noextents"}, ++ {Opt_extdebug, "extdebug"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, 
NULL}, + {Opt_resize, "resize"}, +@@ -1030,6 +1036,15 @@ clear_qf_name: + case Opt_nobh: + set_opt(sbi->s_mount_opt, NOBH); + break; ++ case Opt_extents: ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_noextents: ++ clear_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_extdebug: ++ set_opt (sbi->s_mount_opt, EXTDEBUG); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1756,6 +1768,7 @@ static int ext3_fill_super (struct super + percpu_counter_mod(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + ++ ext3_ext_init(sb); + lock_kernel(); + return 0; + +Index: linux-2.6.16.21-0.8/fs/ext3/ioctl.c +=================================================================== +--- linux-2.6.16.21-0.8.orig/fs/ext3/ioctl.c ++++ linux-2.6.16.21-0.8/fs/ext3/ioctl.c +@@ -125,6 +125,10 @@ flags_err: + err = ext3_change_inode_journal_flag(inode, jflag); + return err; + } ++ case EXT3_IOC_GET_EXTENTS: ++ case EXT3_IOC_GET_TREE_STATS: ++ case EXT3_IOC_GET_TREE_DEPTH: ++ return ext3_ext_ioctl(inode, filp, cmd, arg); + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); +Index: linux-2.6.16.21-0.8/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.16.21-0.8.orig/include/linux/ext3_fs.h ++++ linux-2.6.16.21-0.8/include/linux/ext3_fs.h +@@ -185,9 +185,10 @@ struct ext3_group_desc + #define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */ + #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ + #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +-#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ ++#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User 
modifiable flags */ + + /* +@@ -237,6 +238,9 @@ struct ext3_new_group_data { + #endif + #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) + #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) ++#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) ++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) ++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) + + /* + * Mount options +@@ -377,6 +381,8 @@ struct ext3_inode { + #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ + #define EXT3_MOUNT_IOPEN 0x400000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -565,11 +571,13 @@ static inline struct ext3_inode_info *EX + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 ++#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ + + #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ +- EXT3_FEATURE_INCOMPAT_META_BG) ++ EXT3_FEATURE_INCOMPAT_META_BG| \ ++ EXT3_FEATURE_INCOMPAT_EXTENTS) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) +@@ -776,6 +784,7 @@ extern unsigned long ext3_count_free (st + + + /* inode.c */ ++extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); + extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode 
*, int, int, int *); +@@ -792,6 +801,7 @@ extern int ext3_get_inode_loc(struct ino + extern void ext3_truncate (struct inode *); + extern void ext3_set_inode_flags(struct inode *); + extern void ext3_set_aops(struct inode *inode); ++extern int ext3_writepage_trans_blocks(struct inode *inode); + + /* ioctl.c */ + extern int ext3_ioctl (struct inode *, struct file *, unsigned int, +@@ -845,6 +855,16 @@ extern struct inode_operations ext3_spec + extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int, int); ++extern void ext3_ext_truncate(struct inode *, struct page *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); ++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); ++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); + + #endif /* __KERNEL__ */ + +Index: linux-2.6.16.21-0.8/include/linux/ext3_extents.h +=================================================================== +--- /dev/null ++++ linux-2.6.16.21-0.8/include/linux/ext3_extents.h +@@ -0,0 +1,262 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++#ifndef _LINUX_EXT3_EXTENTS ++#define _LINUX_EXT3_EXTENTS ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if CHECK_BINSEARCH defined, then results of binary search ++ * will be checked by linear search ++ */ ++#define CHECK_BINSEARCH_ ++ ++/* ++ * if EXT_DEBUG is defined you can use 'extdebug' mount option ++ * to get lots of info what's going on ++ */ ++#define EXT_DEBUG_ ++#ifdef EXT_DEBUG ++#define ext_debug(tree,fmt,a...) \ ++do { \ ++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0); ++#else ++#define ext_debug(tree,fmt,a...) ++#endif ++ ++/* ++ * if EXT_STATS is defined then stats numbers are collected ++ * these number will be displayed at umount time ++ */ ++#define EXT_STATS_ ++ ++ ++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 mean there is no tree yet.
all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 ee_block; /* first logical block extent covers */ ++ __u16 ee_len; /* number of blocks covered by extent */ ++ __u16 ee_start_hi; /* high 16 bits of physical block */ ++ __u32 ee_start; /* low 32 bigs of physical block */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 ei_block; /* index covers logical blocks from 'block' */ ++ __u32 ei_leaf; /* pointer to the physical block of the next * ++ * level. leaf or next index could bet here */ ++ __u16 ei_leaf_hi; /* high 16 bits of physical block */ ++ __u16 ei_unused; ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 eh_magic; /* probably will support different formats */ ++ __u16 eh_entries; /* number of valid entries */ ++ __u16 eh_max; /* capacity of store in entries */ ++ __u16 eh_depth; /* has tree real underlaying blocks? 
*/ ++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ ++}; ++ ++#define EXT3_EXT_MAGIC 0xf30a ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++/* ++ * structure for external API ++ */ ++ ++/* ++ * storage for cached extent ++ */ ++struct ext3_ext_cache { ++ __u32 ec_start; ++ __u32 ec_block; ++ __u32 ec_len; ++ __u32 ec_type; ++}; ++ ++#define EXT3_EXT_CACHE_NO 0 ++#define EXT3_EXT_CACHE_GAP 1 ++#define EXT3_EXT_CACHE_EXTENT 2 ++ ++/* ++ * ext3_extents_tree is used to pass initial information ++ * to top-level extents API ++ */ ++struct ext3_extents_helpers; ++struct ext3_extents_tree { ++ struct inode *inode; /* inode which tree belongs to */ ++ void *root; /* ptr to data top of tree resides at */ ++ void *buffer; /* will be passed as arg to ^^ routines */ ++ int buffer_len; ++ void *private; ++ struct ext3_ext_cache *cex;/* last found extent */ ++ struct ext3_extents_helpers *ops; ++}; ++ ++struct ext3_extents_helpers { ++ int (*get_write_access)(handle_t *h, void *buffer); ++ int (*mark_buffer_dirty)(handle_t *h, void *buffer); ++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); ++ int (*remove_extent_credits)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*remove_extent)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*new_block)(handle_t *, struct ext3_extents_tree *, ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); ++}; ++ ++/* ++ * to be called by ext3_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext3_ext_walk_space(), see below ++ * callback must 
return valid extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, ++ struct ext3_ext_path *, ++ struct ext3_ext_cache *); ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++ ++#define EXT_MAX_BLOCK 0xffffffff ++ ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_HDR_GEN(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) ++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) ++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ ++ ++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) ++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) ++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) ++#define EXT_GENERATION(__tree__) EXT_HDR_GEN(EXT_ROOT_HDR(__tree__)) ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++#define EXT_CHECK_PATH(tree,path) \ ++{ \ ++ int depth = EXT_DEPTH(tree); \ ++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_idx < \ ++ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_ext < \ ++ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ 
++ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ ++ && depth != 0); \ ++ BUG_ON((path)[0].p_depth != depth); \ ++} ++ ++ ++/* ++ * this structure is used to gather extents from the tree via ioctl ++ */ ++struct ext3_extent_buf { ++ unsigned long start; ++ int buflen; ++ void *buffer; ++ void *cur; ++ int err; ++}; ++ ++/* ++ * this structure is used to collect stats info about the tree ++ */ ++struct ext3_extent_tree_stats { ++ int depth; ++ int extents_num; ++ int leaf_num; ++}; ++ ++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); ++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); ++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); ++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); ++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); ++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); ++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); ++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); ++ ++static inline void ++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) ++{ ++ if (tree->cex) ++ tree->cex->ec_type = EXT3_EXT_CACHE_NO; ++} ++ ++ ++#endif /* _LINUX_EXT3_EXTENTS */ +Index: linux-2.6.16.21-0.8/include/linux/ext3_fs_i.h +=================================================================== +--- linux-2.6.16.21-0.8.orig/include/linux/ext3_fs_i.h ++++ linux-2.6.16.21-0.8/include/linux/ext3_fs_i.h +@@ -133,6 +133,8 @@ struct ext3_inode_info { + */ + struct semaphore truncate_sem; + struct inode vfs_inode; ++ ++ __u32 i_cached_extent[4]; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch 
b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch new file mode 100644 index 0000000..f2988a2 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch @@ -0,0 +1,2945 @@ +Index: linux-stage/fs/ext3/extents.c +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-stage/fs/ext3/extents.c 2006-07-16 14:10:21.000000000 +0800 +@@ -0,0 +1,2359 @@ ++/* ++ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++/* ++ * Extents support for EXT3 ++ * ++ * TODO: ++ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() ++ * - ext3_ext_calc_credits() could take 'mergable' into account ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - smart tree reduction ++ * - arch-independence ++ * common on-disk format for big/little-endian arch ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++static inline int ext3_ext_check_header(struct ext3_extent_header *eh) ++{ ++ if (eh->eh_magic != EXT3_EXT_MAGIC) { ++ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", ++ (unsigned)eh->eh_magic); ++ return -EIO; ++ } ++ if (eh->eh_max == 0) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n", ++ (unsigned)eh->eh_max); ++ return -EIO; ++ } ++ if (eh->eh_entries > eh->eh_max) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", ++ (unsigned)eh->eh_entries); ++ return -EIO; ++ } ++ return 0; ++} ++ ++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} ++ ++static int inline ++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->get_write_access) ++ return tree->ops->get_write_access(h,tree->buffer); ++ else ++ return 0; ++} ++ ++static int inline ++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->mark_buffer_dirty) ++ return tree->ops->mark_buffer_dirty(h,tree->buffer); ++ else ++ return 
0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ ++ if (path->p_bh) { ++ /* path points to block */ ++ err = ext3_journal_get_write_access(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_get_access_for_root(handle, tree); ++ } ++ return err; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ if (path->p_bh) { ++ /* path points to block */ ++ err =ext3_journal_dirty_metadata(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_mark_root_dirty(handle, tree); ++ } ++ return err; ++} ++ ++static int inline ++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, struct ext3_extent *ex, ++ int *err) ++{ ++ int goal, depth, newblock; ++ struct inode *inode; ++ ++ EXT_ASSERT(tree); ++ if (tree->ops->new_block) ++ return tree->ops->new_block(handle, tree, path, ex, err); ++ ++ inode = tree->inode; ++ depth = EXT_DEPTH(tree); ++ if (path && depth > 0) { ++ goal = path[depth-1].p_block; ++ } else { ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ ++ bg_start = (ei->i_block_group * ++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ goal = bg_start + colour; ++ } ++ ++ newblock = ext3_new_block(handle, inode, goal, err); ++ return newblock; ++} ++ ++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 
24) | ++ (EXT_HDR_GEN(neh) + 1); ++} ++ ++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 5; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int k, l = path->p_depth; ++ ++ ext_debug(tree, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(tree, " %d->%d", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ } else if (path->p_ext) { ++ ext_debug(tree, " %d:%d:%d", ++ path->p_ext->ee_block, ++ path->p_ext->ee_len, ++ path->p_ext->ee_start); ++ } else ++ ext_debug(tree, " []"); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; ++ int i; ++ ++ if (!path) ++ return; ++ ++ eh = path[depth].p_hdr; ++ ex = EXT_FIRST_EXTENT(eh); ++ ++ for (i = 0; i 
< eh->eh_entries; i++, ex++) { ++ ext_debug(tree, "%d:%d:%d ", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_drop_refs(struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) { ++ if (path->p_bh) { ++ brelse(path->p_bh); ++ path->p_bh = NULL; ++ } ++ } ++} ++ ++/* ++ * binary search for closest index by given block ++ */ ++static inline void ++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent_idx *ix; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_entries > 0); ++ ++ ext_debug(tree, "binsearch for %d(idx): ", block); ++ ++ path->p_idx = ix = EXT_FIRST_INDEX(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ix[l + k].ei_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ix += l; ++ path->p_idx = ix; ++ ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf); ++ ++ while (l++ < r) { ++ if (block < ix->ei_block) ++ break; ++ path->p_idx = ix++; ++ } ++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent_idx *chix; ++ ++ chix = ix = EXT_FIRST_INDEX(eh); ++ for (k = 0; k < eh->eh_entries; k++, ix++) { ++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { ++ printk("k=%d, ix=0x%p, first=0x%p\n", k, ++ ix, EXT_FIRST_INDEX(eh)); ++ printk("%u <= %u\n", ++ ix->ei_block,ix[-1].ei_block); ++ } ++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); ++ if (block < ix->ei_block) ++ break; ++ chix = ix; ++ } ++ EXT_ASSERT(chix == path->p_idx); ++ } ++#endif ++} ++ ++/* ++ * binary search for closest extent by given block ++ */ ++static inline void 
++ext3_ext_binsearch(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent *ex; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ ++ if (eh->eh_entries == 0) { ++ /* ++ * this leaf is empty yet: ++ * we get such a leaf in split/add case ++ */ ++ return; ++ } ++ ++ ext_debug(tree, "binsearch for %d: ", block); ++ ++ path->p_ext = ex = EXT_FIRST_EXTENT(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ex[l + k].ee_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ex += l; ++ path->p_ext = ex; ++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++ while (l++ < r) { ++ if (block < ex->ee_block) ++ break; ++ path->p_ext = ex++; ++ } ++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent *chex; ++ ++ chex = ex = EXT_FIRST_EXTENT(eh); ++ for (k = 0; k < eh->eh_entries; k++, ex++) { ++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); ++ if (block < ex->ee_block) ++ break; ++ chex = ex; ++ } ++ EXT_ASSERT(chex == path->p_ext); ++ } ++#endif ++} ++ ++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *eh; ++ ++ BUG_ON(tree->buffer_len == 0); ++ ext3_ext_get_access_for_root(handle, tree); ++ eh = EXT_ROOT_HDR(tree); ++ eh->eh_depth = 0; ++ eh->eh_entries = 0; ++ eh->eh_magic = EXT3_EXT_MAGIC; ++ eh->eh_max = ext3_ext_space_root(tree); ++ ext3_ext_mark_root_dirty(handle, tree); ++ ext3_ext_invalidate_cache(tree); ++ return 0; ++} ++ ++struct ext3_ext_path * ++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ struct buffer_head *bh; 
++ int depth, i, ppos = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ eh = EXT_ROOT_HDR(tree); ++ EXT_ASSERT(eh); ++ if (ext3_ext_check_header(eh)) { ++ /* don't free previously allocated path ++ * -- caller should take care */ ++ path = NULL; ++ goto err; ++ } ++ ++ i = depth = EXT_DEPTH(tree); ++ EXT_ASSERT(eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* account possible depth increase */ ++ if (!path) { ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), ++ GFP_NOFS); ++ if (!path) ++ return ERR_PTR(-ENOMEM); ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[0].p_hdr = eh; ++ ++ /* walk through the tree */ ++ while (i) { ++ ext_debug(tree, "depth %d: num %d, max %d\n", ++ ppos, eh->eh_entries, eh->eh_max); ++ ext3_ext_binsearch_idx(tree, path + ppos, block); ++ path[ppos].p_block = path[ppos].p_idx->ei_leaf; ++ path[ppos].p_depth = i; ++ path[ppos].p_ext = NULL; ++ ++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); ++ if (!bh) ++ goto err; ++ ++ eh = EXT_BLOCK_HDR(bh); ++ ppos++; ++ EXT_ASSERT(ppos <= depth); ++ path[ppos].p_bh = bh; ++ path[ppos].p_hdr = eh; ++ i--; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ } ++ ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ path[ppos].p_idx = NULL; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ ++ /* find extent */ ++ ext3_ext_binsearch(tree, path + ppos, block); ++ ++ ext3_ext_show_path(tree, path); ++ ++ return path; ++ ++err: ++ printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ return ERR_PTR(-EIO); ++} ++ ++/* ++ * insert new index [logical;ptr] into the block at cupr ++ * it check where to insert: before curp or after curp ++ */ ++static int ext3_ext_insert_index(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *curp, ++ int logical, int ptr) ++{ ++ struct 
ext3_extent_idx *ix; ++ int len, err; ++ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ return err; ++ ++ EXT_ASSERT(logical != curp->p_idx->ei_block); ++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; ++ if (logical > curp->p_idx->ei_block) { ++ /* insert after */ ++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d after: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ } ++ ix = curp->p_idx + 1; ++ } else { ++ /* insert before */ ++ len = len * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d before: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->ei_block = logical; ++ ix->ei_leaf = ptr; ++ ix->ei_leaf_hi = ix->ei_unused = 0; ++ curp->p_hdr->eh_entries++; ++ ++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); ++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); ++ ++ err = ext3_ext_dirty(handle, tree, curp); ++ ext3_std_error(tree->inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at: ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extens and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ 
int i = at, k, m, a; ++ unsigned long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? */ ++ /* FIXME: now desicion is simplest: at current extent */ ++ ++ /* if current leaf will be splitted, then we should use ++ * border from split point */ ++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].ee_block; ++ ext_debug(tree, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->ee_block; ++ ext_debug(tree, "leaf will be added." ++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. so, index won't ++ * be inserted and tree will be in consistent ++ * state. next mount will repair buffers too ++ */ ++ ++ /* ++ * get array to track all allocated blocks ++ * we need this to handle errors and free blocks ++ * upon them ++ */ ++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); ++ if (!ablocks) ++ return -ENOMEM; ++ memset(ablocks, 0, sizeof(unsigned long) * depth); ++ ++ /* allocate all needed blocks */ ++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); ++ for (a = 0; a < depth - at; a++) { ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ goto cleanup; ++ ablocks[a] = newblock; ++ } ++ ++ /* initialize new leaf */ ++ newblock = ablocks[--a]; ++ EXT_ASSERT(newblock); ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 0; ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_depth = 0; ++ ex = EXT_FIRST_EXTENT(neh); ++ ++ /* 
move remain of path[depth] to the new leaf */ ++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); ++ /* start copy from next extent */ ++ /* TODO: we could do it by single memmove */ ++ m = 0; ++ path[depth].p_ext++; ++ while (path[depth].p_ext <= ++ EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", ++ path[depth].p_ext->ee_block, ++ path[depth].p_ext->ee_start, ++ path[depth].p_ext->ee_len, ++ newblock); ++ memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent)); ++ neh->eh_entries++; ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old leaf */ ++ if (m) { ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ path[depth].p_hdr->eh_entries -= m; ++ if ((err = ext3_ext_dirty(handle, tree, path + depth))) ++ goto cleanup; ++ ++ } ++ ++ /* create intermediate indexes */ ++ k = depth - at - 1; ++ EXT_ASSERT(k >= 0); ++ if (k) ++ ext_debug(tree, "create %d intermediate indices\n", k); ++ /* insert new index into current index block */ ++ /* current depth stored in i var */ ++ i = depth - 1; ++ while (k--) { ++ oldblock = newblock; ++ newblock = ablocks[--a]; ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 1; ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ neh->eh_depth = depth - i; ++ fidx = EXT_FIRST_INDEX(neh); ++ fidx->ei_block = border; ++ fidx->ei_leaf = oldblock; ++ fidx->ei_leaf_hi = fidx->ei_unused = 0; ++ ++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", ++ i, newblock, border, oldblock); ++ /* copy indexes */ ++ m = 0; ++ path[i].p_idx++; ++ ++ ext_debug(tree, "cur 0x%p, last 
0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); ++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == ++ EXT_LAST_INDEX(path[i].p_hdr)); ++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ++ ext_debug(tree, "%d: move %d:%d in new index %lu\n", ++ i, path[i].p_idx->ei_block, ++ path[i].p_idx->ei_leaf, newblock); ++ memmove(++fidx, path[i].p_idx++, ++ sizeof(struct ext3_extent_idx)); ++ neh->eh_entries++; ++ EXT_ASSERT(neh->eh_entries <= neh->eh_max); ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old index */ ++ if (m) { ++ err = ext3_ext_get_access(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ path[i].p_hdr->eh_entries -= m; ++ err = ext3_ext_dirty(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ } ++ ++ i--; ++ } ++ ++ /* insert new index */ ++ if (!err) ++ err = ext3_ext_insert_index(handle, tree, path + at, ++ border, newblock); ++ ++cleanup: ++ if (bh) { ++ if (buffer_locked(bh)) ++ unlock_buffer(bh); ++ brelse(bh); ++ } ++ ++ if (err) { ++ /* free all allocated blocks in error case */ ++ for (i = 0; i < depth; i++) { ++ if (!ablocks[i]) ++ continue; ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ } ++ } ++ kfree(ablocks); ++ ++ return err; ++} ++ ++/* ++ * routine implements tree growing procedure: ++ * - allocates new block ++ * - moves top-level data (index block or leaf) into the new block ++ * - initialize new top-level, creating index that points to the ++ * just created block ++ */ ++static int ext3_ext_grow_indepth(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp = path; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct buffer_head *bh; ++ unsigned long newblock; ++ int err = 0; ++ ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ 
if (newblock == 0) ++ return err; ++ ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ ext3_std_error(tree->inode->i_sb, err); ++ return err; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) { ++ unlock_buffer(bh); ++ goto out; ++ } ++ ++ /* move top-level index/leaf into new block */ ++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); ++ ++ /* set size of new block */ ++ neh = EXT_BLOCK_HDR(bh); ++ /* old root could have indexes or leaves ++ * so calculate eh_max right way */ ++ if (EXT_DEPTH(tree)) ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ else ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto out; ++ ++ /* create index in new top-level index: num,max,pointer */ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ goto out; ++ ++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; ++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); ++ curp->p_hdr->eh_entries = 1; ++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); ++ /* FIXME: it works, but actually path[0] can be index */ ++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; ++ curp->p_idx->ei_leaf = newblock; ++ curp->p_idx->ei_leaf_hi = curp->p_idx->ei_unused = 0; ++ ++ neh = EXT_ROOT_HDR(tree); ++ fidx = EXT_FIRST_INDEX(neh); ++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", ++ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); ++ ++ neh->eh_depth = path->p_depth + 1; ++ err = ext3_ext_dirty(handle, tree, curp); ++out: ++ brelse(bh); ++ ++ return err; ++} ++ ++/* ++ * routine finds empty index and adds new leaf. 
if no free index found ++ * then it requests in-depth growing ++ */ ++static int ext3_ext_create_new_leaf(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp; ++ int depth, i, err = 0; ++ ++repeat: ++ i = depth = EXT_DEPTH(tree); ++ ++ /* walk up to the tree and look for free index entry */ ++ curp = path + depth; ++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { ++ i--; ++ curp--; ++ } ++ ++ /* we use already allocated block for index block ++ * so, subsequent data blocks should be contigoues */ ++ if (EXT_HAS_FREE_INDEX(curp)) { ++ /* if we found index with free entry, then use that ++ * entry: create all needed subtree and add new leaf */ ++ err = ext3_ext_split(handle, tree, path, newext, i); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ } else { ++ /* tree is full, time to grow in depth */ ++ err = ext3_ext_grow_indepth(handle, tree, path, newext); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ ++ /* ++ * only first (depth 0 -> 1) produces free space ++ * in all other cases we have to split growed tree ++ */ ++ depth = EXT_DEPTH(tree); ++ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { ++ /* now we need split */ ++ goto repeat; ++ } ++ } ++ ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++/* ++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK ++ * NOTE: it consider block number from index entry as ++ * allocated block. 
thus, index entries have to be consistent ++ * with leafs ++ */ ++static unsigned long ++ext3_ext_next_allocated_block(struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ if (depth == 0 && path->p_ext == NULL) ++ return EXT_MAX_BLOCK; ++ ++ /* FIXME: what if index isn't full ?! */ ++ while (depth >= 0) { ++ if (depth == path->p_depth) { ++ /* leaf */ ++ if (path[depth].p_ext != ++ EXT_LAST_EXTENT(path[depth].p_hdr)) ++ return path[depth].p_ext[1].ee_block; ++ } else { ++ /* index */ ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ } ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * returns first allocated block from next leaf or EXT_MAX_BLOCK ++ */ ++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ /* zero-tree has no leaf blocks at all */ ++ if (depth == 0) ++ return EXT_MAX_BLOCK; ++ ++ /* go to index block */ ++ depth--; ++ ++ while (depth >= 0) { ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * if leaf gets modified and modified extent is first in the leaf ++ * then we have to correct all indexes above ++ * TODO: do we need to correct tree in all cases? 
++ */ ++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent *ex; ++ unsigned long border; ++ int k, err = 0; ++ ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(eh); ++ ++ if (depth == 0) { ++ /* there is no tree at all */ ++ return 0; ++ } ++ ++ if (ex != EXT_FIRST_EXTENT(eh)) { ++ /* we correct tree if first leaf got modified only */ ++ return 0; ++ } ++ ++ /* ++ * TODO: we need correction if border is smaller then current one ++ */ ++ k = depth - 1; ++ border = path[depth].p_ext->ee_block; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ return err; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ return err; ++ ++ while (k--) { ++ /* change all left-side indexes */ ++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) ++ break; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ break; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ break; ++ } ++ ++ return err; ++} ++ ++static int inline ++ext3_can_extents_be_merged(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block) ++ return 0; ++ ++#ifdef AGRESSIVE_TEST ++ if (ex1->ee_len >= 4) ++ return 0; ++#endif ++ ++ if (!tree->ops->mergable) ++ return 1; ++ ++ return tree->ops->mergable(ex1, ex2); ++} ++ ++/* ++ * this routine tries to merge requsted extent into the existing ++ * extent or inserts requested extent as new one into the tree, ++ * creating new leaf in no-space case ++ */ ++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_extent_header * eh; ++ struct ext3_extent *ex, *fex; ++ struct ext3_extent 
*nearex; /* nearest extent */ ++ struct ext3_ext_path *npath = NULL; ++ int depth, len, err, next; ++ ++ EXT_ASSERT(newext->ee_len > 0); ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(path[depth].p_hdr); ++ ++ /* try to insert block into found extent and return */ ++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { ++ ext_debug(tree, "append %d block to %d:%d (from %d)\n", ++ newext->ee_len, ex->ee_block, ex->ee_len, ++ ex->ee_start); ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ return err; ++ ex->ee_len += newext->ee_len; ++ eh = path[depth].p_hdr; ++ nearex = ex; ++ goto merge; ++ } ++ ++repeat: ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) ++ goto has_space; ++ ++ /* probably next leaf has space for us? */ ++ fex = EXT_LAST_EXTENT(eh); ++ next = ext3_ext_next_leaf_block(tree, path); ++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { ++ ext_debug(tree, "next leaf block - %d\n", next); ++ EXT_ASSERT(!npath); ++ npath = ext3_ext_find_extent(tree, next, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ EXT_ASSERT(npath->p_depth == path->p_depth); ++ eh = npath[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) { ++ ext_debug(tree, "next leaf isnt full(%d)\n", ++ eh->eh_entries); ++ path = npath; ++ goto repeat; ++ } ++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", ++ eh->eh_entries, eh->eh_max); ++ } ++ ++ /* ++ * there is no free space in found leaf ++ * we're gonna add new leaf in the tree ++ */ ++ err = ext3_ext_create_new_leaf(handle, tree, path, newext); ++ if (err) ++ goto cleanup; ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ ++has_space: ++ nearex = path[depth].p_ext; ++ ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ ++ if (!nearex) { ++ /* there is no extent in this leaf, create first one */ ++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", ++ newext->ee_block, 
newext->ee_start, ++ newext->ee_len); ++ path[depth].p_ext = EXT_FIRST_EXTENT(eh); ++ } else if (newext->ee_block > nearex->ee_block) { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ if (nearex != EXT_LAST_EXTENT(eh)) { ++ len = EXT_MAX_EXTENT(eh) - nearex; ++ len = (len - 1) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 2, nearex + 1, len); ++ } ++ path[depth].p_ext = nearex + 1; ++ } else { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 1, nearex, len); ++ path[depth].p_ext = nearex; ++ } ++ ++ eh->eh_entries++; ++ nearex = path[depth].p_ext; ++ nearex->ee_block = newext->ee_block; ++ nearex->ee_start = newext->ee_start; ++ nearex->ee_len = newext->ee_len; ++ /* FIXME: support for large fs */ ++ nearex->ee_start_hi = 0; ++ ++merge: ++ /* try to merge extents to the right */ ++ while (nearex < EXT_LAST_EXTENT(eh)) { ++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) ++ break; ++ /* merge with next extent! 
*/ ++ nearex->ee_len += nearex[1].ee_len; ++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { ++ len = (EXT_LAST_EXTENT(eh) - nearex - 1) * ++ sizeof(struct ext3_extent); ++ memmove(nearex + 1, nearex + 2, len); ++ } ++ eh->eh_entries--; ++ EXT_ASSERT(eh->eh_entries > 0); ++ } ++ ++ /* try to merge extents to the left */ ++ ++ /* time to correct all indexes above */ ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ if (err) ++ goto cleanup; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ ++cleanup: ++ if (npath) { ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ } ++ ext3_ext_tree_changed(tree); ++ ext3_ext_invalidate_cache(tree); ++ return err; ++} ++ ++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, ++ unsigned long num, ext_prepare_callback func) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_ext_cache cbex; ++ struct ext3_extent *ex; ++ unsigned long next, start = 0, end = 0; ++ unsigned long last = block + num; ++ int depth, exists, err = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(func); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ while (block < last && block != EXT_MAX_BLOCK) { ++ num = last - block; ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(tree, block, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ break; ++ } ++ ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(path[depth].p_hdr); ++ ex = path[depth].p_ext; ++ next = ext3_ext_next_allocated_block(path); ++ ++ exists = 0; ++ if (!ex) { ++ /* there is no extent yet, so try to allocate ++ * all requested space */ ++ start = block; ++ end = block + num; ++ } else if (ex->ee_block > block) { ++ /* need to allocate space before found extent */ ++ start = block; ++ end = ex->ee_block; ++ if (block + num < end) ++ end = block + num; ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ /* need to allocate space after found extent */ ++ start = block; ++ end = block + num; ++ if (end >= next) ++ end = next; 
++ } else if (block >= ex->ee_block) { ++ /* ++ * some part of requested space is covered ++ * by found extent ++ */ ++ start = block; ++ end = ex->ee_block + ex->ee_len; ++ if (block + num < end) ++ end = block + num; ++ exists = 1; ++ } else { ++ BUG(); ++ } ++ EXT_ASSERT(end > start); ++ ++ if (!exists) { ++ cbex.ec_block = start; ++ cbex.ec_len = end - start; ++ cbex.ec_start = 0; ++ cbex.ec_type = EXT3_EXT_CACHE_GAP; ++ } else { ++ cbex.ec_block = ex->ee_block; ++ cbex.ec_len = ex->ee_len; ++ cbex.ec_start = ex->ee_start; ++ cbex.ec_type = EXT3_EXT_CACHE_EXTENT; ++ } ++ ++ EXT_ASSERT(cbex.ec_len > 0); ++ EXT_ASSERT(path[depth].p_hdr); ++ err = func(tree, path, &cbex); ++ ext3_ext_drop_refs(path); ++ ++ if (err < 0) ++ break; ++ if (err == EXT_REPEAT) ++ continue; ++ else if (err == EXT_BREAK) { ++ err = 0; ++ break; ++ } ++ ++ if (EXT_DEPTH(tree) != depth) { ++ /* depth was changed. we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ block = cbex.ec_block + cbex.ec_len; ++ } ++ ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ return err; ++} ++ ++static inline void ++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, ++ __u32 len, __u32 start, int type) ++{ ++ EXT_ASSERT(len > 0); ++ if (tree->cex) { ++ tree->cex->ec_type = type; ++ tree->cex->ec_block = block; ++ tree->cex->ec_len = len; ++ tree->cex->ec_start = start; ++ } ++} ++ ++/* ++ * this routine calculate boundaries of the gap requested block fits into ++ * and cache this gap ++ */ ++static inline void ++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ unsigned long block) ++{ ++ int depth = EXT_DEPTH(tree); ++ unsigned long lblock, len; ++ struct ext3_extent *ex; ++ ++ if (!tree->cex) ++ return; ++ ++ ex = path[depth].p_ext; ++ if (ex == NULL) { ++ /* there is no extent yet, so gap is [0;-] */ ++ lblock = 0; ++ len = EXT_MAX_BLOCK; ++ ext_debug(tree, "cache gap(whole file):"); ++ } else if (block < 
ex->ee_block) { ++ lblock = block; ++ len = ex->ee_block - block; ++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len); ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ lblock = ex->ee_block + ex->ee_len; ++ len = ext3_ext_next_allocated_block(path); ++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) block); ++ EXT_ASSERT(len > lblock); ++ len = len - lblock; ++ } else { ++ lblock = len = 0; ++ BUG(); ++ } ++ ++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len); ++ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP); ++} ++ ++static inline int ++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, ++ struct ext3_extent *ex) ++{ ++ struct ext3_ext_cache *cex = tree->cex; ++ ++ /* is there cache storage at all? */ ++ if (!cex) ++ return EXT3_EXT_CACHE_NO; ++ ++ /* has cache valid data? */ ++ if (cex->ec_type == EXT3_EXT_CACHE_NO) ++ return EXT3_EXT_CACHE_NO; ++ ++ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || ++ cex->ec_type == EXT3_EXT_CACHE_EXTENT); ++ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { ++ ex->ee_block = cex->ec_block; ++ ex->ee_start = cex->ec_start; ++ ex->ee_start_hi = 0; ++ ex->ee_len = cex->ec_len; ++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) ex->ee_start); ++ return cex->ec_type; ++ } ++ ++ /* not in cache */ ++ return EXT3_EXT_CACHE_NO; ++} ++ ++/* ++ * routine removes index from the index block ++ * it's used in truncate case only. 
thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->eh_entries); ++ if ((err = ext3_ext_get_access(handle, tree, path))) ++ return err; ++ path->p_hdr->eh_entries--; ++ if ((err = ext3_ext_dirty(handle, tree, path))) ++ return err; ++ ext_debug(tree, "index is empty, remove it, free block %d\n", ++ path->p_idx->ei_leaf); ++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ return err; ++} ++ ++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth = EXT_DEPTH(tree); ++ int needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) ++ return 1; ++ } ++ ++ /* ++ * the worste case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ depth = depth + 1; ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. 
we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ return needed; ++} ++ ++static int ++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, tex; ++ struct ext3_ext_path *npath; ++ int depth, creds, err; ++ ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); ++ EXT_ASSERT(ex->ee_block < start); ++ ++ /* calculate tail extent */ ++ tex.ee_block = end + 1; ++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); ++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; ++ ++ creds = ext3_ext_calc_credits_for_insert(tree, path); ++ handle = ext3_ext_journal_restart(handle, creds); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ /* calculate head extent. use primary extent */ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ return err; ++ ex->ee_len = start - ex->ee_block; ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ return err; ++ ++ /* FIXME: some callback to free underlying resource ++ * and correct ee_start? 
*/ ++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", ++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); ++ ++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); ++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); ++ ++ err = ext3_ext_insert_extent(handle, tree, npath, &tex); ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ ++ return err; ++} ++ ++static int ++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, *fu = NULL, *lu, *le; ++ int err = 0, correct_index = 0; ++ int depth = EXT_DEPTH(tree), credits; ++ struct ext3_extent_header *eh; ++ unsigned a, b, block, num; ++ ++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); ++ if (!path[depth].p_hdr) ++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); ++ eh = path[depth].p_hdr; ++ EXT_ASSERT(eh); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* find where to start removing */ ++ le = ex = EXT_LAST_EXTENT(eh); ++ while (ex != EXT_FIRST_EXTENT(eh)) { ++ if (ex->ee_block <= end) ++ break; ++ ex--; ++ } ++ ++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { ++ /* removal of internal part of the extent requested ++ * tail and head must be placed in different extent ++ * so, we have to insert one more extent */ ++ path[depth].p_ext = ex; ++ return ext3_ext_split_for_rm(handle, tree, path, start, end); ++ } ++ ++ lu = ex; ++ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) { ++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); ++ path[depth].p_ext = ex; ++ ++ a = ex->ee_block > start ? ex->ee_block : start; ++ b = ex->ee_block + ex->ee_len - 1 < end ? 
++ ex->ee_block + ex->ee_len - 1 : end; ++ ++ ext_debug(tree, " border %u:%u\n", a, b); ++ ++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { ++ block = 0; ++ num = 0; ++ BUG(); ++ } else if (a != ex->ee_block) { ++ /* remove tail of the extent */ ++ block = ex->ee_block; ++ num = a - block; ++ } else if (b != ex->ee_block + ex->ee_len - 1) { ++ /* remove head of the extent */ ++ block = a; ++ num = b - a; ++ } else { ++ /* remove whole extent: excelent! */ ++ block = ex->ee_block; ++ num = 0; ++ EXT_ASSERT(a == ex->ee_block && ++ b == ex->ee_block + ex->ee_len - 1); ++ } ++ ++ if (ex == EXT_FIRST_EXTENT(eh)) ++ correct_index = 1; ++ ++ credits = 1; ++ if (correct_index) ++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; ++ if (tree->ops->remove_extent_credits) ++ credits+=tree->ops->remove_extent_credits(tree,ex,a,b); ++ ++ handle = ext3_ext_journal_restart(handle, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ goto out; ++ } ++ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ if (tree->ops->remove_extent) ++ err = tree->ops->remove_extent(tree, ex, a, b); ++ if (err) ++ goto out; ++ ++ if (num == 0) { ++ /* this extent is removed entirely mark slot unused */ ++ ex->ee_start = ex->ee_start_hi = 0; ++ eh->eh_entries--; ++ fu = ex; ++ } ++ ++ ex->ee_block = block; ++ ex->ee_len = num; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ ext_debug(tree, "new extent: %u:%u:%u\n", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ ex--; ++ } ++ ++ if (fu) { ++ /* reuse unused slots */ ++ while (lu < le) { ++ if (lu->ee_start) { ++ *fu = *lu; ++ lu->ee_start = lu->ee_start_hi = 0; ++ fu++; ++ } ++ lu++; ++ } ++ } ++ ++ if (correct_index && eh->eh_entries) ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ ++ /* if this leaf is free, then we should ++ * remove it from index block above */ ++ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) ++ 
err = ext3_ext_rm_idx(handle, tree, path + depth); ++ ++out: ++ return err; ++} ++ ++ ++static struct ext3_extent_idx * ++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) ++{ ++ struct ext3_extent_idx *ix; ++ ++ ix = EXT_LAST_INDEX(hdr); ++ while (ix != EXT_FIRST_INDEX(hdr)) { ++ if (ix->ei_block <= block) ++ break; ++ ix--; ++ } ++ return ix; ++} ++ ++/* ++ * returns 1 if current index have to be freed (even partial) ++ */ ++static int inline ++ext3_ext_more_to_rm(struct ext3_ext_path *path) ++{ ++ EXT_ASSERT(path->p_idx); ++ ++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) ++ return 0; ++ ++ /* ++ * if truncate on deeper level happened it it wasn't partial ++ * so we have to consider current index for truncation ++ */ ++ if (path->p_hdr->eh_entries == path->p_block) ++ return 0; ++ return 1; ++} ++ ++int ext3_ext_remove_space(struct ext3_extents_tree *tree, ++ unsigned long start, unsigned long end) ++{ ++ struct inode *inode = tree->inode; ++ struct super_block *sb = inode->i_sb; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_ext_path *path; ++ handle_t *handle; ++ int i = 0, err = 0; ++ ++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); ++ ++ /* probably first extent we're gonna free will be last in block */ ++ handle = ext3_journal_start(inode, depth + 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ ext3_ext_invalidate_cache(tree); ++ ++ /* ++ * we start scanning from right side freeing all the blocks ++ * after i_size and walking into the deep ++ */ ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); ++ if (IS_ERR(path)) { ++ ext3_error(sb, __FUNCTION__, "Can't allocate path array"); ++ ext3_journal_stop(handle); ++ return -ENOMEM; ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[i].p_hdr = EXT_ROOT_HDR(tree); ++ ++ while (i >= 0 && err == 0) { ++ if (i == depth) { ++ /* this is leaf block */ ++ err = ext3_ext_rm_leaf(handle, tree, path, start, end); ++ 
/* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ continue; ++ } ++ ++ /* this is index block */ ++ if (!path[i].p_hdr) { ++ ext_debug(tree, "initialize header\n"); ++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); ++ } ++ ++ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); ++ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); ++ ++ if (!path[i].p_idx) { ++ /* this level hasn't touched yet */ ++ path[i].p_idx = ++ ext3_ext_last_covered(path[i].p_hdr, end); ++ path[i].p_block = path[i].p_hdr->eh_entries + 1; ++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", ++ path[i].p_hdr, path[i].p_hdr->eh_entries); ++ } else { ++ /* we've already was here, see at next index */ ++ path[i].p_idx--; ++ } ++ ++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", ++ i, EXT_FIRST_INDEX(path[i].p_hdr), ++ path[i].p_idx); ++ if (ext3_ext_more_to_rm(path + i)) { ++ /* go to the next level */ ++ ext_debug(tree, "move to level %d (block %d)\n", ++ i + 1, path[i].p_idx->ei_leaf); ++ memset(path + i + 1, 0, sizeof(*path)); ++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); ++ if (!path[i+1].p_bh) { ++ /* should we reset i_size? 
*/ ++ err = -EIO; ++ break; ++ } ++ /* put actual number of indexes to know is this ++ * number got changed at the next iteration */ ++ path[i].p_block = path[i].p_hdr->eh_entries; ++ i++; ++ } else { ++ /* we finish processing this index, go up */ ++ if (path[i].p_hdr->eh_entries == 0 && i > 0) { ++ /* index is empty, remove it ++ * handle must be already prepared by the ++ * truncatei_leaf() */ ++ err = ext3_ext_rm_idx(handle, tree, path + i); ++ } ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ ext_debug(tree, "return to level %d\n", i); ++ } ++ } ++ ++ /* TODO: flexible tree reduction should be here */ ++ if (path->p_hdr->eh_entries == 0) { ++ /* ++ * truncate to zero freed all the tree ++ * so, we need to correct eh_depth ++ */ ++ err = ext3_ext_get_access(handle, tree, path); ++ if (err == 0) { ++ EXT_ROOT_HDR(tree)->eh_depth = 0; ++ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree); ++ err = ext3_ext_dirty(handle, tree, path); ++ } ++ } ++ ext3_ext_tree_changed(tree); ++ ++ kfree(path); ++ ext3_journal_stop(handle); ++ ++ return err; ++} ++ ++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks) ++{ ++ int lcap, icap, rcap, leafs, idxs, num; ++ ++ rcap = ext3_ext_space_root(tree); ++ if (blocks <= rcap) { ++ /* all extents fit to the root */ ++ return 0; ++ } ++ ++ rcap = ext3_ext_space_root_idx(tree); ++ lcap = ext3_ext_space_block(tree); ++ icap = ext3_ext_space_block_idx(tree); ++ ++ num = leafs = (blocks + lcap - 1) / lcap; ++ if (leafs <= rcap) { ++ /* all pointers to leafs fit to the root */ ++ return leafs; ++ } ++ ++ /* ok. 
we need separate index block(s) to link all leaf blocks */ ++ idxs = (leafs + icap - 1) / icap; ++ do { ++ num += idxs; ++ idxs = (idxs + icap - 1) / icap; ++ } while (idxs > rcap); ++ ++ return num; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) { ++ printk("EXT3-fs: file extents enabled"); ++#ifdef AGRESSIVE_TEST ++ printk(", agressive tests"); ++#endif ++#ifdef CHECK_BINSEARCH ++ printk(", check binsearch"); ++#endif ++ printk("\n"); ++ } ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++} ++ ++/************************************************************************ ++ * VFS related routines ++ ************************************************************************/ ++ ++static int ext3_get_inode_write_access(handle_t *handle, void *buffer) ++{ ++ /* we use in-core data, not bh */ ++ return 0; ++} ++ ++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct inode *inode = buffer; ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static int ext3_ext_mergable(struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ /* FIXME: support for large fs */ ++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) ++ return 1; ++ return 0; ++} ++ ++static int ++ext3_remove_blocks_credits(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed; ++ ++ /* at present, extent can't cross block group */; ++ needed = 4; /* bitmap + group desc + sb + inode */ ++ ++#ifdef CONFIG_QUOTA ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ return needed; ++} ++ ++static int ++ext3_remove_blocks(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed = ext3_remove_blocks_credits(tree, ex, from, to); ++ handle_t *handle = 
ext3_journal_start(tree->inode, needed); ++ struct buffer_head *bh; ++ int i; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { ++ /* tail removal */ ++ unsigned long num, start; ++ num = ex->ee_block + ex->ee_len - from; ++ start = ex->ee_start + ex->ee_len - num; ++ ext_debug(tree, "free last %lu blocks starting %lu\n", ++ num, start); ++ for (i = 0; i < num; i++) { ++ bh = sb_find_get_block(tree->inode->i_sb, start + i); ++ ext3_forget(handle, 0, tree->inode, bh, start + i); ++ } ++ ext3_free_blocks(handle, tree->inode, start, num); ++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { ++ printk("strange request: removal %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } else { ++ printk("strange request: removal(2) %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } ++ ext3_journal_stop(handle); ++ return 0; ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, ++ struct ext3_ext_path *path, unsigned long block) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ struct ext3_extent *ex; ++ depth = path->p_depth; ++ ++ /* try to predict block placement */ ++ if ((ex = path[depth].p_ext)) ++ return ex->ee_start + (block - ex->ee_block); ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. 
use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour + block; ++} ++ ++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int *err) ++{ ++ struct inode *inode = tree->inode; ++ int newblock, goal; ++ ++ EXT_ASSERT(path); ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->ee_start); ++ EXT_ASSERT(ex->ee_len); ++ ++ /* reuse block from the extent to order data/metadata */ ++ newblock = ex->ee_start++; ++ ex->ee_len--; ++ if (ex->ee_len == 0) { ++ ex->ee_len = 1; ++ /* allocate new block for the extent */ ++ goal = ext3_ext_find_goal(inode, path, ex->ee_block); ++ ex->ee_start = ext3_new_block(handle, inode, goal, err); ++ ex->ee_start_hi = 0; ++ if (ex->ee_start == 0) { ++ /* error occured: restore old extent */ ++ ex->ee_start = newblock; ++ return 0; ++ } ++ } ++ return newblock; ++} ++ ++static struct ext3_extents_helpers ext3_blockmap_helpers = { ++ .get_write_access = ext3_get_inode_write_access, ++ .mark_buffer_dirty = ext3_mark_buffer_dirty, ++ .mergable = ext3_ext_mergable, ++ .new_block = ext3_new_block_cb, ++ .remove_extent = ext3_remove_blocks, ++ .remove_extent_credits = ext3_remove_blocks_credits, ++}; ++ ++void ext3_init_tree_desc(struct ext3_extents_tree *tree, ++ struct inode *inode) ++{ ++ tree->inode = inode; ++ tree->root = (void *) EXT3_I(inode)->i_data; ++ tree->buffer = (void *) inode; ++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); ++ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; ++ tree->ops = &ext3_blockmap_helpers; ++} ++ ++int ext3_ext_get_block(handle_t *handle, struct inode *inode, ++ long iblock, struct buffer_head *bh_result, ++ int create, int extend_disksize) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent 
newex; ++ struct ext3_extent *ex; ++ int goal, newblock, err = 0, depth; ++ struct ext3_extents_tree tree; ++ ++ clear_buffer_new(bh_result); ++ ext3_init_tree_desc(&tree, inode); ++ ext_debug(&tree, "block %d requested for inode %u\n", ++ (int) iblock, (unsigned) inode->i_ino); ++ mutex_lock(&EXT3_I(inode)->truncate_mutex); ++ ++ /* check in cache */ ++ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { ++ if (goal == EXT3_EXT_CACHE_GAP) { ++ if (!create) { ++ /* block isn't allocated yet and ++ * user don't want to allocate it */ ++ goto out2; ++ } ++ /* we should allocate requested block */ ++ } else if (goal == EXT3_EXT_CACHE_EXTENT) { ++ /* block is already allocated */ ++ newblock = iblock - newex.ee_block + newex.ee_start; ++ goto out; ++ } else { ++ EXT_ASSERT(0); ++ } ++ } ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(&tree, iblock, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ goto out2; ++ } ++ ++ depth = EXT_DEPTH(&tree); ++ ++ /* ++ * consistent leaf must not be empty ++ * this situations is possible, though, _during_ tree modification ++ * this is why assert can't be put in ext3_ext_find_extent() ++ */ ++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); ++ ++ if ((ex = path[depth].p_ext)) { ++ /* if found exent covers block, simple return it */ ++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { ++ newblock = iblock - ex->ee_block + ex->ee_start; ++ ext_debug(&tree, "%d fit into %d:%d -> %d\n", ++ (int) iblock, ex->ee_block, ex->ee_len, ++ newblock); ++ ext3_ext_put_in_cache(&tree, ex->ee_block, ++ ex->ee_len, ex->ee_start, ++ EXT3_EXT_CACHE_EXTENT); ++ goto out; ++ } ++ } ++ ++ /* ++ * requested block isn't allocated yet ++ * we couldn't try to create block if create flag is zero ++ */ ++ if (!create) { ++ /* put just found gap into cache to speedup subsequest reqs */ ++ ext3_ext_put_gap_in_cache(&tree, path, iblock); ++ goto out2; ++ } ++ ++ /* allocate new block */ ++ 
goal = ext3_ext_find_goal(inode, path, iblock); ++ newblock = ext3_new_block(handle, inode, goal, &err); ++ if (!newblock) ++ goto out2; ++ ext_debug(&tree, "allocate new block: goal %d, found %d\n", ++ goal, newblock); ++ ++ /* try to insert new extent into found leaf and return */ ++ newex.ee_block = iblock; ++ newex.ee_start = newblock; ++ newex.ee_start_hi = 0; ++ newex.ee_len = 1; ++ err = ext3_ext_insert_extent(handle, &tree, path, &newex); ++ if (err) ++ goto out2; ++ ++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ++ /* previous routine could use block we allocated */ ++ newblock = newex.ee_start; ++ set_buffer_new(bh_result); ++ ++ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, ++ newex.ee_start, EXT3_EXT_CACHE_EXTENT); ++out: ++ ext3_ext_show_leaf(&tree, path); ++ map_bh(bh_result, inode->i_sb, newblock); ++out2: ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ mutex_unlock(&EXT3_I(inode)->truncate_mutex); ++ ++ return err; ++} ++ ++void ext3_ext_truncate(struct inode * inode, struct page *page) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct super_block *sb = inode->i_sb; ++ struct ext3_extents_tree tree; ++ unsigned long last_block; ++ handle_t *handle; ++ int err = 0; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ err = ext3_writepage_trans_blocks(inode) + 3; ++ handle = ext3_journal_start(inode, err); ++ if (IS_ERR(handle)) { ++ if (page) { ++ clear_highpage(page); ++ flush_dcache_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ return; ++ } ++ ++ if (page) ++ ext3_block_truncate_page(handle, page, mapping, inode->i_size); ++ ++ mutex_lock(&EXT3_I(inode)->truncate_mutex); ++ ext3_ext_invalidate_cache(&tree); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need not scaning at all, ++ * because page truncation is 
enough ++ */ ++ if (ext3_orphan_add(handle, inode)) ++ goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ last_block = (inode->i_size + sb->s_blocksize - 1) >> ++ EXT3_BLOCK_SIZE_BITS(sb); ++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. ++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ mutex_unlock(&EXT3_I(inode)->truncate_mutex); ++ ext3_journal_stop(handle); ++} ++ ++/* ++ * this routine calculate max number of blocks we could modify ++ * in order to allocate new block for an inode ++ */ ++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) ++{ ++ struct ext3_extents_tree tree; ++ int needed; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); ++ ++ /* caller want to allocate num blocks */ ++ needed *= num; ++ ++#ifdef CONFIG_QUOTA ++ /* ++ * FIXME: real calculation should be here ++ * it depends on blockmap format of qouta file ++ */ ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return needed; ++} ++ ++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ext3_extent_tree_init(handle, &tree); ++} ++ ++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ return 
ext3_ext_calc_metadata_amount(&tree, blocks); ++} ++ ++static int ++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *newex) ++{ ++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; ++ ++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ if (buf->err < 0) ++ return EXT_BREAK; ++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) ++ return EXT_BREAK; ++ ++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { ++ buf->err++; ++ buf->cur += sizeof(*newex); ++ } else { ++ buf->err = -EFAULT; ++ return EXT_BREAK; ++ } ++ return EXT_CONTINUE; ++} ++ ++static int ++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *ex) ++{ ++ struct ext3_extent_tree_stats *buf = ++ (struct ext3_extent_tree_stats *) tree->private; ++ int depth; ++ ++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ depth = EXT_DEPTH(tree); ++ buf->extents_num++; ++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) ++ buf->leaf_num++; ++ return EXT_CONTINUE; ++} ++ ++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err = 0; ++ ++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) ++ return -EINVAL; ++ ++ if (cmd == EXT3_IOC_GET_EXTENTS) { ++ struct ext3_extent_buf buf; ++ struct ext3_extents_tree tree; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ ++ ext3_init_tree_desc(&tree, inode); ++ buf.cur = buf.buffer; ++ buf.err = 0; ++ tree.private = &buf; ++ mutex_lock(&EXT3_I(inode)->truncate_mutex); ++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, ++ ext3_ext_store_extent_cb); ++ mutex_unlock(&EXT3_I(inode)->truncate_mutex); ++ if (err == 0) ++ err = buf.err; ++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { ++ struct ext3_extent_tree_stats buf; ++ struct ext3_extents_tree tree; ++ ++ 
ext3_init_tree_desc(&tree, inode); ++ mutex_lock(&EXT3_I(inode)->truncate_mutex); ++ buf.depth = EXT_DEPTH(&tree); ++ buf.extents_num = 0; ++ buf.leaf_num = 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, ++ ext3_ext_collect_stats_cb); ++ mutex_unlock(&EXT3_I(inode)->truncate_mutex); ++ if (!err) ++ err = copy_to_user((void *) arg, &buf, sizeof(buf)); ++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { ++ struct ext3_extents_tree tree; ++ ext3_init_tree_desc(&tree, inode); ++ mutex_lock(&EXT3_I(inode)->truncate_mutex); ++ err = EXT_DEPTH(&tree); ++ mutex_unlock(&EXT3_I(inode)->truncate_mutex); ++ } ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(ext3_init_tree_desc); ++EXPORT_SYMBOL(ext3_mark_inode_dirty); ++EXPORT_SYMBOL(ext3_ext_invalidate_cache); ++EXPORT_SYMBOL(ext3_ext_insert_extent); ++EXPORT_SYMBOL(ext3_ext_walk_space); ++EXPORT_SYMBOL(ext3_ext_find_goal); ++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); +Index: linux-stage/fs/ext3/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/ialloc.c 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/fs/ext3/ialloc.c 2006-07-16 14:10:20.000000000 +0800 +@@ -600,7 +600,7 @@ got: + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + +- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; ++ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); + if (S_ISLNK(mode)) + ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); + /* dirsync only applies to directories */ +@@ -644,6 +644,18 @@ got: + if (err) + goto fail_free_drop; + ++ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++ if (err) goto fail; ++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); ++ 
BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); ++ } ++ } ++ + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/fs/ext3/inode.c 2006-07-16 14:11:28.000000000 +0800 +@@ -40,7 +40,7 @@ + #include "iopen.h" + #include "acl.h" + +-static int ext3_writepage_trans_blocks(struct inode *inode); ++int ext3_writepage_trans_blocks(struct inode *inode); + + /* + * Test whether an inode is a fast symlink. +@@ -944,6 +944,17 @@ out: + + #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32) + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create, int extend_disksize) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create, ++ extend_disksize); ++ return ext3_get_blocks_handle(handle, inode, block, 1, bh, create, ++ extend_disksize); ++} ++ + static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) + { +@@ -984,8 +995,8 @@ static int ext3_get_block(struct inode * + + get_block: + if (ret == 0) { +- ret = ext3_get_blocks_handle(handle, inode, iblock, +- max_blocks, bh_result, create, 0); ++ ret = ext3_get_block_wrap(handle, inode, iblock, ++ bh_result, create, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; +@@ -1008,7 +1019,7 @@ struct buffer_head *ext3_getblk(handle_t + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- err = ext3_get_blocks_handle(handle, inode, block, 1, ++ err = ext3_get_block_wrap(handle, inode, block, + &dummy, create, 1); + if (err == 1) { + err = 0; +@@ -1756,7 +1767,7 @@ void 
ext3_set_aops(struct inode *inode) + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, struct page *page, ++int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) + { + ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; +@@ -2260,6 +2271,9 @@ void ext3_truncate(struct inode *inode) + return; + } + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode, page); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { +@@ -3004,12 +3018,15 @@ err_out: + * block and work out the exact number of indirects which are touched. Pah. + */ + +-static int ext3_writepage_trans_blocks(struct inode *inode) ++int ext3_writepage_trans_blocks(struct inode *inode) + { + int bpp = ext3_journal_blocks_per_page(inode); + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 
5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +@@ -3277,7 +3294,7 @@ int ext3_prep_san_write(struct inode *in + + /* alloc blocks one by one */ + for (i = 0; i < nblocks; i++) { +- ret = ext3_get_block_handle(handle, inode, blocks[i], ++ ret = ext3_get_blocks_handle(handle, inode, blocks[i], 1, + &bh_tmp, 1, 1); + if (ret) + break; +@@ -3337,7 +3354,7 @@ int ext3_map_inode_page(struct inode *in + if (blocks[i] != 0) + continue; + +- rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1, 1); ++ rc = ext3_get_blocks_handle(handle, inode, iblock, 1, &dummy, 1, 1); + if (rc) { + printk(KERN_INFO "ext3_map_inode_page: error reading " + "block %ld\n", iblock); +Index: linux-stage/fs/ext3/Makefile +=================================================================== +--- linux-stage.orig/fs/ext3/Makefile 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/fs/ext3/Makefile 2006-07-16 14:10:21.000000000 +0800 +@@ -5,7 +5,8 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ +- ioctl.o namei.o super.o symlink.o hash.o resize.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o \ ++ extents.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/fs/ext3/super.c 2006-07-16 14:10:21.000000000 +0800 +@@ -391,6 +391,7 @@ static void ext3_put_super (struct super + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -455,6 +456,8 @@ static struct inode 
*ext3_alloc_inode(st + #endif + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; ++ ++ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); + return &ei->vfs_inode; + } + +@@ -638,6 +641,7 @@ enum { + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, ++ Opt_extents, Opt_extdebug, + Opt_grpquota + }; + +@@ -690,6 +694,8 @@ static match_table_t tokens = { + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, ++ {Opt_extents, "extents"}, ++ {Opt_extdebug, "extdebug"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -1035,6 +1041,12 @@ clear_qf_name: + case Opt_bh: + clear_opt(sbi->s_mount_opt, NOBH); + break; ++ case Opt_extents: ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_extdebug: ++ set_opt (sbi->s_mount_opt, EXTDEBUG); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1760,6 +1772,7 @@ static int ext3_fill_super (struct super + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? 
"ordered": + "writeback"); + ++ ext3_ext_init(sb); + lock_kernel(); + return 0; + +Index: linux-stage/fs/ext3/ioctl.c +=================================================================== +--- linux-stage.orig/fs/ext3/ioctl.c 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/fs/ext3/ioctl.c 2006-07-16 13:55:31.000000000 +0800 +@@ -135,6 +135,10 @@ flags_err: + mutex_unlock(&inode->i_mutex); + return err; + } ++ case EXT3_IOC_GET_EXTENTS: ++ case EXT3_IOC_GET_TREE_STATS: ++ case EXT3_IOC_GET_TREE_DEPTH: ++ return ext3_ext_ioctl(inode, filp, cmd, arg); + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs.h 2006-07-16 14:10:21.000000000 +0800 +@@ -181,9 +181,10 @@ struct ext3_group_desc + #define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */ + #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ + #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +-#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ ++#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + + /* +@@ -233,6 +234,9 @@ struct ext3_new_group_data { + #endif + #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) + #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) ++#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) ++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) ++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) + + /* + * Mount options +@@ -373,6 +377,8 @@ struct ext3_inode { + #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota 
*/ + #define EXT3_MOUNT_IOPEN 0x400000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -563,11 +569,13 @@ static inline struct ext3_inode_info *EX + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 ++#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ + + #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ +- EXT3_FEATURE_INCOMPAT_META_BG) ++ EXT3_FEATURE_INCOMPAT_META_BG| \ ++ EXT3_FEATURE_INCOMPAT_EXTENTS) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) +@@ -787,6 +795,8 @@ extern unsigned long ext3_count_free (st + + + /* inode.c */ ++extern int ext3_block_truncate_page(handle_t *, struct page *, ++ struct address_space *, loff_t); + int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t blocknr); + struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); +@@ -860,6 +870,16 @@ extern struct inode_operations ext3_spec + extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int, int); ++extern void ext3_ext_truncate(struct inode *, struct page *); ++extern void ext3_ext_init(struct super_block 
*); ++extern void ext3_ext_release(struct super_block *); ++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); ++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); + + #endif /* __KERNEL__ */ + +Index: linux-stage/include/linux/ext3_extents.h +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-stage/include/linux/ext3_extents.h 2006-07-16 13:55:31.000000000 +0800 +@@ -0,0 +1,262 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++#ifndef _LINUX_EXT3_EXTENTS ++#define _LINUX_EXT3_EXTENTS ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if CHECK_BINSEARCH defined, then results of binary search ++ * will be checked by linear search ++ */ ++#define CHECK_BINSEARCH_ ++ ++/* ++ * if EXT_DEBUG is defined you can use 'extdebug' mount option ++ * to get lots of info what's going on ++ */ ++#define EXT_DEBUG_ ++#ifdef EXT_DEBUG ++#define ext_debug(tree,fmt,a...) 
\ ++do { \ ++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0); ++#else ++#define ext_debug(tree,fmt,a...) ++#endif ++ ++/* ++ * if EXT_STATS is defined then stats numbers are collected ++ * these number will be displayed at umount time ++ */ ++#define EXT_STATS_ ++ ++ ++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 mean there is no tree yet. all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 ee_block; /* first logical block extent covers */ ++ __u16 ee_len; /* number of blocks covered by extent */ ++ __u16 ee_start_hi; /* high 16 bits of physical block */ ++ __u32 ee_start; /* low 32 bigs of physical block */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 ei_block; /* index covers logical blocks from 'block' */ ++ __u32 ei_leaf; /* pointer to the physical block of the next * ++ * level. leaf or next index could bet here */ ++ __u16 ei_leaf_hi; /* high 16 bits of physical block */ ++ __u16 ei_unused; ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 eh_magic; /* probably will support different formats */ ++ __u16 eh_entries; /* number of valid entries */ ++ __u16 eh_max; /* capacity of store in entries */ ++ __u16 eh_depth; /* has tree real underlaying blocks? 
*/ ++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ ++}; ++ ++#define EXT3_EXT_MAGIC 0xf30a ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++/* ++ * structure for external API ++ */ ++ ++/* ++ * storage for cached extent ++ */ ++struct ext3_ext_cache { ++ __u32 ec_start; ++ __u32 ec_block; ++ __u32 ec_len; ++ __u32 ec_type; ++}; ++ ++#define EXT3_EXT_CACHE_NO 0 ++#define EXT3_EXT_CACHE_GAP 1 ++#define EXT3_EXT_CACHE_EXTENT 2 ++ ++/* ++ * ext3_extents_tree is used to pass initial information ++ * to top-level extents API ++ */ ++struct ext3_extents_helpers; ++struct ext3_extents_tree { ++ struct inode *inode; /* inode which tree belongs to */ ++ void *root; /* ptr to data top of tree resides at */ ++ void *buffer; /* will be passed as arg to ^^ routines */ ++ int buffer_len; ++ void *private; ++ struct ext3_ext_cache *cex;/* last found extent */ ++ struct ext3_extents_helpers *ops; ++}; ++ ++struct ext3_extents_helpers { ++ int (*get_write_access)(handle_t *h, void *buffer); ++ int (*mark_buffer_dirty)(handle_t *h, void *buffer); ++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); ++ int (*remove_extent_credits)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*remove_extent)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*new_block)(handle_t *, struct ext3_extents_tree *, ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); ++}; ++ ++/* ++ * to be called by ext3_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext3_ext_walk_space(), see below ++ * callback must 
return valid extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, ++ struct ext3_ext_path *, ++ struct ext3_ext_cache *); ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++ ++#define EXT_MAX_BLOCK 0xffffffff ++ ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_HDR_GEN(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) ++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) ++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ ++ ++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) ++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) ++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) ++#define EXT_GENERATION(__tree__) EXT_HDR_GEN(EXT_ROOT_HDR(__tree__)) ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++#define EXT_CHECK_PATH(tree,path) \ ++{ \ ++ int depth = EXT_DEPTH(tree); \ ++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_idx < \ ++ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_ext < \ ++ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ 
++ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ ++ && depth != 0); \ ++ BUG_ON((path)[0].p_depth != depth); \ ++} ++ ++ ++/* ++ * this structure is used to gather extents from the tree via ioctl ++ */ ++struct ext3_extent_buf { ++ unsigned long start; ++ int buflen; ++ void *buffer; ++ void *cur; ++ int err; ++}; ++ ++/* ++ * this structure is used to collect stats info about the tree ++ */ ++struct ext3_extent_tree_stats { ++ int depth; ++ int extents_num; ++ int leaf_num; ++}; ++ ++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); ++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); ++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); ++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); ++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); ++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); ++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); ++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); ++ ++static inline void ++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) ++{ ++ if (tree->cex) ++ tree->cex->ec_type = EXT3_EXT_CACHE_NO; ++} ++ ++ ++#endif /* _LINUX_EXT3_EXTENTS */ +Index: linux-stage/include/linux/ext3_fs_i.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_i.h 2006-07-16 13:55:30.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs_i.h 2006-07-16 14:10:20.000000000 +0800 +@@ -142,6 +142,8 @@ struct ext3_inode_info { + */ + struct mutex truncate_mutex; + struct inode vfs_inode; ++ ++ __u32 i_cached_extent[4]; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch 
b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch index cad7b54..b6c37c1 100644 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch @@ -1,11 +1,11 @@ %patch Index: linux-2.6.5-sles9/fs/ext3/extents.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2003-01-30 13:24:37.000000000 +0300 -+++ linux-2.6.5-sles9/fs/ext3/extents.c 2004-11-09 02:25:56.143726112 +0300 -@@ -0,0 +1,2313 @@ +--- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2005-02-17 22:07:57.023609040 +0300 ++++ linux-2.6.5-sles9/fs/ext3/extents.c 2005-02-23 01:02:37.396435640 +0300 +@@ -0,0 +1,2361 @@ +/* -+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify @@ -49,6 +49,27 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +#include +#include + ++ ++static inline int ext3_ext_check_header(struct ext3_extent_header *eh) ++{ ++ if (eh->eh_magic != EXT3_EXT_MAGIC) { ++ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", ++ (unsigned)eh->eh_magic); ++ return -EIO; ++ } ++ if (eh->eh_max == 0) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n", ++ (unsigned)eh->eh_max); ++ return -EIO; ++ } ++ if (eh->eh_entries > eh->eh_max) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", ++ (unsigned)eh->eh_entries); ++ return -EIO; ++ } ++ return 0; ++} ++ +static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) +{ + int err; @@ -86,8 +107,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + * - ENOMEM + */ +static int ext3_ext_get_access(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) +{ + int err; + @@ -108,7 +129,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + * - 
EIO + */ +static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) ++ struct ext3_ext_path *path) +{ + int err; + if (path->p_bh) { @@ -123,8 +144,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + +static int inline +ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, struct ext3_extent *ex, -+ int *err) ++ struct ext3_ext_path *path, struct ext3_extent *ex, ++ int *err) +{ + int goal, depth, newblock; + struct inode *inode; @@ -143,7 +164,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + unsigned long colour; + + bg_start = (ei->i_block_group * -+ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + + le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); + colour = (current->pid % 16) * + (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); @@ -156,9 +177,9 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) +{ -+ struct ext3_extent_header *neh; -+ neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation++; ++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) | ++ (EXT_HDR_GEN(neh) + 1); +} + +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) @@ -166,8 +187,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + int size; + + size = (tree->inode->i_sb->s_blocksize - -+ sizeof(struct ext3_extent_header)) -+ / sizeof(struct ext3_extent); ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); +#ifdef AGRESSIVE_TEST + size = 6; +#endif @@ -179,8 +200,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + int size; + + size = (tree->inode->i_sb->s_blocksize - -+ sizeof(struct ext3_extent_header)) -+ / sizeof(struct ext3_extent_idx); ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); +#ifdef AGRESSIVE_TEST + size = 5; +#endif @@ -191,8 +212,8 @@ Index: 
linux-2.6.5-sles9/fs/ext3/extents.c +{ + int size; + -+ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) -+ / sizeof(struct ext3_extent); ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); +#ifdef AGRESSIVE_TEST + size = 3; +#endif @@ -203,9 +224,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +{ + int size; + -+ size = (tree->buffer_len - -+ sizeof(struct ext3_extent_header)) -+ / sizeof(struct ext3_extent_idx); ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); +#ifdef AGRESSIVE_TEST + size = 4; +#endif @@ -213,7 +233,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +} + +static void ext3_ext_show_path(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) ++ struct ext3_ext_path *path) +{ +#ifdef EXT_DEBUG + int k, l = path->p_depth; @@ -222,12 +242,12 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + for (k = 0; k <= l; k++, path++) { + if (path->p_idx) { + ext_debug(tree, " %d->%d", path->p_idx->ei_block, -+ path->p_idx->ei_leaf); ++ path->p_idx->ei_leaf); + } else if (path->p_ext) { + ext_debug(tree, " %d:%d:%d", -+ path->p_ext->ee_block, -+ path->p_ext->ee_len, -+ path->p_ext->ee_start); ++ path->p_ext->ee_block, ++ path->p_ext->ee_len, ++ path->p_ext->ee_start); + } else + ext_debug(tree, " []"); + } @@ -236,7 +256,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +} + +static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) ++ struct ext3_ext_path *path) +{ +#ifdef EXT_DEBUG + int depth = EXT_DEPTH(tree); @@ -252,7 +272,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + + for (i = 0; i < eh->eh_entries; i++, ex++) { + ext_debug(tree, "%d:%d:%d ", -+ ex->ee_block, ex->ee_len, ex->ee_start); ++ ex->ee_block, ex->ee_len, ex->ee_start); + } + ext_debug(tree, "\n"); +#endif @@ -263,11 +283,12 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + int depth = path->p_depth; + int i; + -+ for (i = 0; i <= depth; i++, 
path++) ++ for (i = 0; i <= depth; i++, path++) { + if (path->p_bh) { + brelse(path->p_bh); + path->p_bh = NULL; + } ++ } +} + +/* @@ -275,7 +296,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + */ +static inline void +ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, int block) ++ struct ext3_ext_path *path, int block) +{ + struct ext3_extent_header *eh = path->p_hdr; + struct ext3_extent_idx *ix; @@ -301,7 +322,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + + ix += l; + path->p_idx = ix; -+ ext_debug(tree, " -> %d->%d ", path->p_idx->ei_block, path->p_idx->ei_leaf); ++ ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf); + + while (l++ < r) { + if (block < ix->ei_block) @@ -309,7 +330,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + path->p_idx = ix++; + } + ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, -+ path->p_idx->ei_leaf); ++ path->p_idx->ei_leaf); + +#ifdef CHECK_BINSEARCH + { @@ -319,9 +340,9 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + for (k = 0; k < eh->eh_entries; k++, ix++) { + if (k != 0 && ix->ei_block <= ix[-1].ei_block) { + printk("k=%d, ix=0x%p, first=0x%p\n", k, -+ ix, EXT_FIRST_INDEX(eh)); ++ ix, EXT_FIRST_INDEX(eh)); + printk("%u <= %u\n", -+ ix->ei_block,ix[-1].ei_block); ++ ix->ei_block,ix[-1].ei_block); + } + EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); + if (block < ix->ei_block) @@ -331,7 +352,6 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + EXT_ASSERT(chix == path->p_idx); + } +#endif -+ +} + +/* @@ -339,7 +359,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + */ +static inline void +ext3_ext_binsearch(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, int block) ++ struct ext3_ext_path *path, int block) +{ + struct ext3_extent_header *eh = path->p_hdr; + struct ext3_extent *ex; @@ -373,7 +393,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + ex += l; + path->p_ext = ex; + ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, -+ 
path->p_ext->ee_start, path->p_ext->ee_len); ++ path->p_ext->ee_start, path->p_ext->ee_len); + + while (l++ < r) { + if (block < ex->ee_block) @@ -381,7 +401,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + path->p_ext = ex++; + } + ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, -+ path->p_ext->ee_start, path->p_ext->ee_len); ++ path->p_ext->ee_start, path->p_ext->ee_len); + +#ifdef CHECK_BINSEARCH + { @@ -397,7 +417,6 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + EXT_ASSERT(chex == path->p_ext); + } +#endif -+ +} + +int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) @@ -418,7 +437,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + +struct ext3_ext_path * +ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, -+ struct ext3_ext_path *path) ++ struct ext3_ext_path *path) +{ + struct ext3_extent_header *eh; + struct buffer_head *bh; @@ -430,15 +449,21 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + + eh = EXT_ROOT_HDR(tree); + EXT_ASSERT(eh); ++ if (ext3_ext_check_header(eh)) { ++ /* don't free previously allocated path ++ * -- caller should take care */ ++ path = NULL; ++ goto err; ++ } ++ + i = depth = EXT_DEPTH(tree); + EXT_ASSERT(eh->eh_max); + EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ EXT_ASSERT(i == 0 || eh->eh_entries > 0); + + /* account possible depth increase */ + if (!path) { + path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), -+ GFP_NOFS); ++ GFP_NOFS); + if (!path) + return ERR_PTR(-ENOMEM); + } @@ -448,29 +473,34 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + /* walk through the tree */ + while (i) { + ext_debug(tree, "depth %d: num %d, max %d\n", -+ ppos, eh->eh_entries, eh->eh_max); ++ ppos, eh->eh_entries, eh->eh_max); + ext3_ext_binsearch_idx(tree, path + ppos, block); + path[ppos].p_block = path[ppos].p_idx->ei_leaf; + path[ppos].p_depth = i; + path[ppos].p_ext = NULL; + + bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); -+ if (!bh) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ 
return ERR_PTR(-EIO); -+ } ++ if (!bh) ++ goto err; ++ + eh = EXT_BLOCK_HDR(bh); + ppos++; + EXT_ASSERT(ppos <= depth); + path[ppos].p_bh = bh; + path[ppos].p_hdr = eh; + i--; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; + } + + path[ppos].p_depth = i; + path[ppos].p_hdr = eh; + path[ppos].p_ext = NULL; ++ path[ppos].p_idx = NULL; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; + + /* find extent */ + ext3_ext_binsearch(tree, path + ppos, block); @@ -478,6 +508,14 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + ext3_ext_show_path(tree, path); + + return path; ++ ++err: ++ printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ return ERR_PTR(-EIO); +} + +/* @@ -485,9 +523,9 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + * it check where to insert: before curp or after curp + */ +static int ext3_ext_insert_index(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *curp, -+ int logical, int ptr) ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *curp, ++ int logical, int ptr) +{ + struct ext3_extent_idx *ix; + int len, err; @@ -503,9 +541,9 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + len = (len - 1) * sizeof(struct ext3_extent_idx); + len = len < 0 ? 0 : len; + ext_debug(tree, "insert new index %d after: %d. " -+ "move %d from 0x%p to 0x%p\n", -+ logical, ptr, len, -+ (curp->p_idx + 1), (curp->p_idx + 2)); ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); + memmove(curp->p_idx + 2, curp->p_idx + 1, len); + } + ix = curp->p_idx + 1; @@ -514,15 +552,16 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + len = len * sizeof(struct ext3_extent_idx); + len = len < 0 ? 0 : len; + ext_debug(tree, "insert new index %d before: %d. 
" -+ "move %d from 0x%p to 0x%p\n", -+ logical, ptr, len, -+ curp->p_idx, (curp->p_idx + 1)); ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); + memmove(curp->p_idx + 1, curp->p_idx, len); + ix = curp->p_idx; + } + + ix->ei_block = logical; + ix->ei_leaf = ptr; ++ ix->ei_leaf_hi = ix->ei_unused = 0; + curp->p_hdr->eh_entries++; + + EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); @@ -544,8 +583,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + * - initialize subtree + */ +static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext, int at) ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) +{ + struct buffer_head *bh = NULL; + int depth = EXT_DEPTH(tree); @@ -566,13 +605,13 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { + border = path[depth].p_ext[1].ee_block; + ext_debug(tree, "leaf will be splitted." -+ " next leaf starts at %d\n", -+ (int)border); ++ " next leaf starts at %d\n", ++ (int)border); + } else { + border = newext->ee_block; + ext_debug(tree, "leaf will be added." 
-+ " next leaf starts at %d\n", -+ (int)border); ++ " next leaf starts at %d\n", ++ (int)border); + } + + /* @@ -630,12 +669,11 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + while (path[depth].p_ext <= + EXT_MAX_EXTENT(path[depth].p_hdr)) { + ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", -+ path[depth].p_ext->ee_block, -+ path[depth].p_ext->ee_start, -+ path[depth].p_ext->ee_len, -+ newblock); -+ memmove(ex++, path[depth].p_ext++, -+ sizeof(struct ext3_extent)); ++ path[depth].p_ext->ee_block, ++ path[depth].p_ext->ee_start, ++ path[depth].p_ext->ee_len, ++ newblock); ++ memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent)); + neh->eh_entries++; + m++; + } @@ -686,23 +724,24 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + fidx = EXT_FIRST_INDEX(neh); + fidx->ei_block = border; + fidx->ei_leaf = oldblock; ++ fidx->ei_leaf_hi = fidx->ei_unused = 0; + + ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", -+ i, newblock, border, oldblock); ++ i, newblock, border, oldblock); + /* copy indexes */ + m = 0; + path[i].p_idx++; + + ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx, -+ EXT_MAX_INDEX(path[i].p_hdr)); ++ EXT_MAX_INDEX(path[i].p_hdr)); + EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == -+ EXT_LAST_INDEX(path[i].p_hdr)); ++ EXT_LAST_INDEX(path[i].p_hdr)); + while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { + ext_debug(tree, "%d: move %d:%d in new index %lu\n", -+ i, path[i].p_idx->ei_block, -+ path[i].p_idx->ei_leaf, newblock); ++ i, path[i].p_idx->ei_block, ++ path[i].p_idx->ei_leaf, newblock); + memmove(++fidx, path[i].p_idx++, -+ sizeof(struct ext3_extent_idx)); ++ sizeof(struct ext3_extent_idx)); + neh->eh_entries++; + EXT_ASSERT(neh->eh_entries <= neh->eh_max); + m++; @@ -732,7 +771,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + /* insert new index */ + if (!err) + err = ext3_ext_insert_index(handle, tree, path + at, -+ border, newblock); ++ border, newblock); + +cleanup: + if (bh) { @@ -762,9 +801,9 @@ Index: 
linux-2.6.5-sles9/fs/ext3/extents.c + * just created block + */ +static int ext3_ext_grow_indepth(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) +{ + struct ext3_ext_path *curp = path; + struct ext3_extent_header *neh; @@ -796,7 +835,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + /* set size of new block */ + neh = EXT_BLOCK_HDR(bh); + /* old root could have indexes or leaves -+ * so calculate e_max right way */ ++ * so calculate eh_max right way */ + if (EXT_DEPTH(tree)) + neh->eh_max = ext3_ext_space_block_idx(tree); + else @@ -819,11 +858,12 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + /* FIXME: it works, but actually path[0] can be index */ + curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; + curp->p_idx->ei_leaf = newblock; ++ curp->p_idx->ei_leaf_hi = curp->p_idx->ei_unused = 0; + + neh = EXT_ROOT_HDR(tree); + fidx = EXT_FIRST_INDEX(neh); + ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", -+ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); ++ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); + + neh->eh_depth = path->p_depth + 1; + err = ext3_ext_dirty(handle, tree, curp); @@ -838,9 +878,9 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + * then it requests in-depth growing + */ +static int ext3_ext_create_new_leaf(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) +{ + struct ext3_ext_path *curp; + int depth, i, err = 0; @@ -916,12 +956,12 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + if (depth == path->p_depth) { + /* leaf */ + if (path[depth].p_ext != -+ EXT_LAST_EXTENT(path[depth].p_hdr)) ++ EXT_LAST_EXTENT(path[depth].p_hdr)) + return path[depth].p_ext[1].ee_block; + } else 
{ + /* index */ + if (path[depth].p_idx != -+ EXT_LAST_INDEX(path[depth].p_hdr)) ++ EXT_LAST_INDEX(path[depth].p_hdr)) + return path[depth].p_idx[1].ei_block; + } + depth--; @@ -934,7 +974,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + * returns first allocated block from next leaf or EXT_MAX_BLOCK + */ +static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) ++ struct ext3_ext_path *path) +{ + int depth; + @@ -950,7 +990,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + + while (depth >= 0) { + if (path[depth].p_idx != -+ EXT_LAST_INDEX(path[depth].p_hdr)) ++ EXT_LAST_INDEX(path[depth].p_hdr)) + return path[depth].p_idx[1].ei_block; + depth--; + } @@ -964,7 +1004,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + * TODO: do we need to correct tree in all cases? + */ +int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) ++ struct ext3_ext_path *path) +{ + struct ext3_extent_header *eh; + int depth = EXT_DEPTH(tree); @@ -1014,8 +1054,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + +static int inline +ext3_can_extents_be_merged(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex1, -+ struct ext3_extent *ex2) ++ struct ext3_extent *ex1, ++ struct ext3_extent *ex2) +{ + if (ex1->ee_block + ex1->ee_len != ex2->ee_block) + return 0; @@ -1037,8 +1077,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + * creating new leaf in no-space case + */ +int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) +{ + struct ext3_extent_header * eh; + struct ext3_extent *ex, *fex; @@ -1047,7 +1087,6 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + int depth, len, err, next; + + EXT_ASSERT(newext->ee_len > 0); -+ EXT_ASSERT(newext->ee_len < EXT_CACHE_MARK); + depth = EXT_DEPTH(tree); + ex = path[depth].p_ext; + 
EXT_ASSERT(path[depth].p_hdr); @@ -1055,8 +1094,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + /* try to insert block into found extent and return */ + if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { + ext_debug(tree, "append %d block to %d:%d (from %d)\n", -+ newext->ee_len, ex->ee_block, ex->ee_len, -+ ex->ee_start); ++ newext->ee_len, ex->ee_block, ex->ee_len, ++ ex->ee_start); + if ((err = ext3_ext_get_access(handle, tree, path + depth))) + return err; + ex->ee_len += newext->ee_len; @@ -1084,12 +1123,12 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + eh = npath[depth].p_hdr; + if (eh->eh_entries < eh->eh_max) { + ext_debug(tree, "next leaf isnt full(%d)\n", -+ eh->eh_entries); ++ eh->eh_entries); + path = npath; + goto repeat; + } + ext_debug(tree, "next leaf hasno free space(%d,%d)\n", -+ eh->eh_entries, eh->eh_max); ++ eh->eh_entries, eh->eh_max); + } + + /* @@ -1111,8 +1150,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + if (!nearex) { + /* there is no extent in this leaf, create first one */ + ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", -+ newext->ee_block, newext->ee_start, -+ newext->ee_len); ++ newext->ee_block, newext->ee_start, ++ newext->ee_len); + path[depth].p_ext = EXT_FIRST_EXTENT(eh); + } else if (newext->ee_block > nearex->ee_block) { + EXT_ASSERT(newext->ee_block != nearex->ee_block); @@ -1121,10 +1160,10 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + len = (len - 1) * sizeof(struct ext3_extent); + len = len < 0 ? 
0 : len; + ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " -+ "move %d from 0x%p to 0x%p\n", -+ newext->ee_block, newext->ee_start, -+ newext->ee_len, -+ nearex, len, nearex + 1, nearex + 2); ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); + memmove(nearex + 2, nearex + 1, len); + } + path[depth].p_ext = nearex + 1; @@ -1133,9 +1172,9 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); + len = len < 0 ? 0 : len; + ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " -+ "move %d from 0x%p to 0x%p\n", -+ newext->ee_block, newext->ee_start, newext->ee_len, -+ nearex, len, nearex + 1, nearex + 2); ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); + memmove(nearex + 1, nearex, len); + path[depth].p_ext = nearex; + } @@ -1156,8 +1195,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + /* merge with next extent! 
*/ + nearex->ee_len += nearex[1].ee_len; + if (nearex + 1 < EXT_LAST_EXTENT(eh)) { -+ len = (EXT_LAST_EXTENT(eh) - nearex - 1) -+ * sizeof(struct ext3_extent); ++ len = (EXT_LAST_EXTENT(eh) - nearex - 1) * ++ sizeof(struct ext3_extent); + memmove(nearex + 1, nearex + 2, len); + } + eh->eh_entries--; @@ -1187,7 +1226,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + unsigned long num, ext_prepare_callback func) +{ + struct ext3_ext_path *path = NULL; -+ struct ext3_extent *ex, cbex; ++ struct ext3_ext_cache cbex; ++ struct ext3_extent *ex; + unsigned long next, start = 0, end = 0; + unsigned long last = block + num; + int depth, exists, err = 0; @@ -1246,14 +1286,20 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + EXT_ASSERT(end > start); + + if (!exists) { -+ cbex.ee_block = start; -+ cbex.ee_len = end - start; -+ cbex.ee_start = 0; -+ } else -+ cbex = *ex; ++ cbex.ec_block = start; ++ cbex.ec_len = end - start; ++ cbex.ec_start = 0; ++ cbex.ec_type = EXT3_EXT_CACHE_GAP; ++ } else { ++ cbex.ec_block = ex->ee_block; ++ cbex.ec_len = ex->ee_len; ++ cbex.ec_start = ex->ee_start; ++ cbex.ec_type = EXT3_EXT_CACHE_EXTENT; ++ } + ++ EXT_ASSERT(cbex.ec_len > 0); + EXT_ASSERT(path[depth].p_hdr); -+ err = func(tree, path, &cbex, exists); ++ err = func(tree, path, &cbex); + ext3_ext_drop_refs(path); + + if (err < 0) @@ -1271,7 +1317,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + path = NULL; + } + -+ block = cbex.ee_block + cbex.ee_len; ++ block = cbex.ec_block + cbex.ec_len; + } + + if (path) { @@ -1284,7 +1330,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + +static inline void +ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, -+ __u32 len, __u32 start, int type) ++ __u32 len, __u32 start, int type) +{ + EXT_ASSERT(len > 0); + if (tree->cex) { @@ -1301,8 +1347,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + */ +static inline void +ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ unsigned long block) ++ struct 
ext3_ext_path *path, ++ unsigned long block) +{ + int depth = EXT_DEPTH(tree); + unsigned long lblock, len; @@ -1321,16 +1367,16 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + lblock = block; + len = ex->ee_block - block; + ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", -+ (unsigned long) block, -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len); ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len); + } else if (block >= ex->ee_block + ex->ee_len) { + lblock = ex->ee_block + ex->ee_len; + len = ext3_ext_next_allocated_block(path); + ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len, -+ (unsigned long) block); ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) block); + EXT_ASSERT(len > lblock); + len = len - lblock; + } else { @@ -1344,7 +1390,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + +static inline int +ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, -+ struct ext3_extent *ex) ++ struct ext3_extent *ex) +{ + struct ext3_ext_cache *cex = tree->cex; + @@ -1357,16 +1403,17 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + return EXT3_EXT_CACHE_NO; + + EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || -+ cex->ec_type == EXT3_EXT_CACHE_EXTENT); ++ cex->ec_type == EXT3_EXT_CACHE_EXTENT); + if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { + ex->ee_block = cex->ec_block; + ex->ee_start = cex->ec_start; ++ ex->ee_start_hi = 0; + ex->ee_len = cex->ec_len; + ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", -+ (unsigned long) block, -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len, -+ (unsigned long) ex->ee_start); ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) ex->ee_start); + return cex->ec_type; + } + @@ -1380,7 +1427,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + * last index in the block only + */ 
+int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) ++ struct ext3_ext_path *path) +{ + struct buffer_head *bh; + int err; @@ -1394,7 +1441,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + if ((err = ext3_ext_dirty(handle, tree, path))) + return err; + ext_debug(tree, "index is empty, remove it, free block %d\n", -+ path->p_idx->ei_leaf); ++ path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); + ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); @@ -1402,7 +1449,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +} + +int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) ++ struct ext3_ext_path *path) +{ + int depth = EXT_DEPTH(tree); + int needed; @@ -1439,8 +1486,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + +static int +ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, unsigned long start, -+ unsigned long end) ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) +{ + struct ext3_extent *ex, tex; + struct ext3_ext_path *npath; @@ -1474,7 +1521,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + /* FIXME: some callback to free underlying resource + * and correct ee_start? 
*/ + ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", -+ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); ++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); + + npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); + if (IS_ERR(npath)) @@ -1488,13 +1535,12 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + kfree(npath); + + return err; -+ +} + +static int +ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, unsigned long start, -+ unsigned long end) ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) +{ + struct ext3_extent *ex, *fu = NULL, *lu, *le; + int err = 0, correct_index = 0; @@ -1527,8 +1573,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + } + + lu = ex; -+ while (ex >= EXT_FIRST_EXTENT(eh) && -+ ex->ee_block + ex->ee_len > start) { ++ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) { + ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); + path[depth].p_ext = ex; + @@ -1555,7 +1600,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + block = ex->ee_block; + num = 0; + EXT_ASSERT(a == ex->ee_block && -+ b == ex->ee_block + ex->ee_len - 1); ++ b == ex->ee_block + ex->ee_len - 1); + } + + if (ex == EXT_FIRST_EXTENT(eh)) @@ -1584,7 +1629,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + + if (num == 0) { + /* this extent is removed entirely mark slot unused */ -+ ex->ee_start = 0; ++ ex->ee_start = ex->ee_start_hi = 0; + eh->eh_entries--; + fu = ex; + } @@ -1597,7 +1642,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + goto out; + + ext_debug(tree, "new extent: %u:%u:%u\n", -+ ex->ee_block, ex->ee_len, ex->ee_start); ++ ex->ee_block, ex->ee_len, ex->ee_start); + ex--; + } + @@ -1606,7 +1651,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + while (lu < le) { + if (lu->ee_start) { + *fu = *lu; -+ lu->ee_start = 0; ++ lu->ee_start = lu->ee_start_hi = 0; + fu++; + } + lu++; @@ -1661,7 +1706,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +} + 
+int ext3_ext_remove_space(struct ext3_extents_tree *tree, -+ unsigned long start, unsigned long end) ++ unsigned long start, unsigned long end) +{ + struct inode *inode = tree->inode; + struct super_block *sb = inode->i_sb; @@ -1685,8 +1730,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + */ + path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); + if (IS_ERR(path)) { -+ ext3_error(sb, "ext3_ext_remove_space", -+ "Can't allocate path array"); ++ ext3_error(sb, __FUNCTION__, "Can't allocate path array"); + ext3_journal_stop(handle); + return -ENOMEM; + } @@ -1718,19 +1762,19 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + ext3_ext_last_covered(path[i].p_hdr, end); + path[i].p_block = path[i].p_hdr->eh_entries + 1; + ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", -+ path[i].p_hdr, path[i].p_hdr->eh_entries); ++ path[i].p_hdr, path[i].p_hdr->eh_entries); + } else { + /* we've already was here, see at next index */ + path[i].p_idx--; + } + + ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", -+ i, EXT_FIRST_INDEX(path[i].p_hdr), -+ path[i].p_idx); ++ i, EXT_FIRST_INDEX(path[i].p_hdr), ++ path[i].p_idx); + if (ext3_ext_more_to_rm(path + i)) { + /* go to the next level */ + ext_debug(tree, "move to level %d (block %d)\n", -+ i + 1, path[i].p_idx->ei_leaf); ++ i + 1, path[i].p_idx->ei_leaf); + memset(path + i + 1, 0, sizeof(*path)); + path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); + if (!path[i+1].p_bh) { @@ -1853,7 +1897,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +} + +static int ext3_ext_mergable(struct ext3_extent *ex1, -+ struct ext3_extent *ex2) ++ struct ext3_extent *ex2) +{ + /* FIXME: support for large fs */ + if (ex1->ee_start + ex1->ee_len == ex2->ee_start) @@ -1863,8 +1907,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + +static int +ext3_remove_blocks_credits(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex, -+ unsigned long from, unsigned long to) ++ struct ext3_extent *ex, ++ unsigned long from, 
unsigned long to) +{ + int needed; + @@ -1879,8 +1923,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + +static int +ext3_remove_blocks(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex, -+ unsigned long from, unsigned long to) ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) +{ + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); @@ -1895,7 +1939,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + num = ex->ee_block + ex->ee_len - from; + start = ex->ee_start + ex->ee_len - num; + ext_debug(tree, "free last %lu blocks starting %lu\n", -+ num, start); ++ num, start); + for (i = 0; i < num; i++) { + bh = sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); @@ -1903,17 +1947,17 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + ext3_free_blocks(handle, tree->inode, start, num); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", -+ from, to, ex->ee_block, ex->ee_len); ++ from, to, ex->ee_block, ex->ee_len); + } else { + printk("strange request: removal(2) %lu-%lu from %u:%u\n", -+ from, to, ex->ee_block, ex->ee_len); ++ from, to, ex->ee_block, ex->ee_len); + } + ext3_journal_stop(handle); + return 0; +} + +static int ext3_ext_find_goal(struct inode *inode, -+ struct ext3_ext_path *path, unsigned long block) ++ struct ext3_ext_path *path, unsigned long block) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + unsigned long bg_start; @@ -1943,8 +1987,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +} + +static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *ex, int *err) ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int *err) +{ + struct inode *inode = tree->inode; + int newblock, goal; @@ -1962,6 +2006,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + /* 
allocate new block for the extent */ + goal = ext3_ext_find_goal(inode, path, ex->ee_block); + ex->ee_start = ext3_new_block(handle, inode, goal, err); ++ ex->ee_start_hi = 0; + if (ex->ee_start == 0) { + /* error occured: restore old extent */ + ex->ee_start = newblock; @@ -1981,19 +2026,19 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +}; + +void ext3_init_tree_desc(struct ext3_extents_tree *tree, -+ struct inode *inode) ++ struct inode *inode) +{ + tree->inode = inode; + tree->root = (void *) EXT3_I(inode)->i_data; + tree->buffer = (void *) inode; + tree->buffer_len = sizeof(EXT3_I(inode)->i_data); -+ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; ++ tree->cex = &EXT3_I(inode)->i_cached_extent; + tree->ops = &ext3_blockmap_helpers; +} + +int ext3_ext_get_block(handle_t *handle, struct inode *inode, -+ long iblock, struct buffer_head *bh_result, -+ int create, int extend_disksize) ++ long iblock, struct buffer_head *bh_result, ++ int create, int extend_disksize) +{ + struct ext3_ext_path *path = NULL; + struct ext3_extent newex; @@ -2001,10 +2046,10 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + int goal, newblock, err = 0, depth; + struct ext3_extents_tree tree; + -+ clear_buffer_new(bh_result); ++ __clear_bit(BH_New, &bh_result->b_state); + ext3_init_tree_desc(&tree, inode); + ext_debug(&tree, "block %d requested for inode %u\n", -+ (int) iblock, (unsigned) inode->i_ino); ++ (int) iblock, (unsigned) inode->i_ino); + down(&EXT3_I(inode)->truncate_sem); + + /* check in cache */ @@ -2047,11 +2092,11 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { + newblock = iblock - ex->ee_block + ex->ee_start; + ext_debug(&tree, "%d fit into %d:%d -> %d\n", -+ (int) iblock, ex->ee_block, ex->ee_len, -+ newblock); ++ (int) iblock, ex->ee_block, ex->ee_len, ++ newblock); + ext3_ext_put_in_cache(&tree, ex->ee_block, -+ ex->ee_len, ex->ee_start, -+ EXT3_EXT_CACHE_EXTENT); ++ ex->ee_len, 
ex->ee_start, ++ EXT3_EXT_CACHE_EXTENT); + goto out; + } + } @@ -2072,11 +2117,12 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + if (!newblock) + goto out2; + ext_debug(&tree, "allocate new block: goal %d, found %d\n", -+ goal, newblock); ++ goal, newblock); + + /* try to insert new extent into found leaf and return */ + newex.ee_block = iblock; + newex.ee_start = newblock; ++ newex.ee_start_hi = 0; + newex.ee_len = 1; + err = ext3_ext_insert_extent(handle, &tree, path, &newex); + if (err) @@ -2087,13 +2133,15 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + + /* previous routine could use block we allocated */ + newblock = newex.ee_start; -+ set_buffer_new(bh_result); ++ __set_bit(BH_New, &bh_result->b_state); + + ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, -+ newex.ee_start, EXT3_EXT_CACHE_EXTENT); ++ newex.ee_start, EXT3_EXT_CACHE_EXTENT); +out: + ext3_ext_show_leaf(&tree, path); -+ map_bh(bh_result, inode->i_sb, newblock); ++ __set_bit(BH_Mapped, &bh_result->b_state); ++ bh_result->b_bdev = inode->i_sb->s_bdev; ++ bh_result->b_blocknr = newblock; +out2: + if (path) { + ext3_ext_drop_refs(path); @@ -2148,8 +2196,8 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + EXT3_I(inode)->i_disksize = inode->i_size; + ext3_mark_inode_dirty(handle, inode); + -+ last_block = (inode->i_size + sb->s_blocksize - 1) -+ >> EXT3_BLOCK_SIZE_BITS(sb); ++ last_block = (inode->i_size + sb->s_blocksize - 1) >> ++ EXT3_BLOCK_SIZE_BITS(sb); + err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); + + /* In a multi-transaction truncate, we only make the final @@ -2217,13 +2265,14 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + +static int +ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newex, int exist) ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *newex) +{ + struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; + -+ if (!exist) ++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) 
+ return EXT_CONTINUE; ++ + if (buf->err < 0) + return EXT_BREAK; + if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) @@ -2241,14 +2290,14 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + +static int +ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *ex, int exist) ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *ex) +{ + struct ext3_extent_tree_stats *buf = + (struct ext3_extent_tree_stats *) tree->private; + int depth; + -+ if (!exist) ++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) + return EXT_CONTINUE; + + depth = EXT_DEPTH(tree); @@ -2259,7 +2308,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +} + +int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, -+ unsigned long arg) ++ unsigned long arg) +{ + int err = 0; + @@ -2279,7 +2328,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + tree.private = &buf; + down(&EXT3_I(inode)->truncate_sem); + err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, -+ ext3_ext_store_extent_cb); ++ ext3_ext_store_extent_cb); + up(&EXT3_I(inode)->truncate_sem); + if (err == 0) + err = buf.err; @@ -2294,7 +2343,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c + buf.leaf_num = 0; + tree.private = &buf; + err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, -+ ext3_ext_collect_stats_cb); ++ ext3_ext_collect_stats_cb); + up(&EXT3_I(inode)->truncate_sem); + if (!err) + err = copy_to_user((void *) arg, &buf, sizeof(buf)); @@ -2316,39 +2365,55 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c +EXPORT_SYMBOL(ext3_ext_walk_space); +EXPORT_SYMBOL(ext3_ext_find_goal); +EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); -+ Index: linux-2.6.5-sles9/fs/ext3/ialloc.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/ialloc.c 2004-11-09 02:22:55.763148128 +0300 -+++ linux-2.6.5-sles9/fs/ext3/ialloc.c 2004-11-09 02:23:21.587222272 +0300 -@@ -647,6 +647,10 @@ +--- linux-2.6.5-sles9.orig/fs/ext3/ialloc.c 
2005-02-23 01:01:52.366281264 +0300 ++++ linux-2.6.5-sles9/fs/ext3/ialloc.c 2005-02-23 01:02:37.398435336 +0300 +@@ -566,7 +566,7 @@ repeat: + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + +- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; ++ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); + if (S_ISLNK(mode)) + ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); + /* dirsync only applies to directories */ +@@ -647,6 +647,18 @@ DQUOT_FREE_INODE(inode); goto fail2; } -+ if (test_opt(sb, EXTENTS)) { -+ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; -+ ext3_extents_initialize_blockmap(handle, inode); -+ } ++ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++ if (err) goto fail; ++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); ++ } ++ } ++ err = ext3_mark_inode_dirty(handle, inode); if (err) { ext3_std_error(sb, err); Index: linux-2.6.5-sles9/fs/ext3/inode.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2004-11-09 02:22:55.767147520 +0300 -+++ linux-2.6.5-sles9/fs/ext3/inode.c 2004-11-09 02:23:21.592221512 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2005-02-23 01:01:52.373280200 +0300 ++++ linux-2.6.5-sles9/fs/ext3/inode.c 2005-02-23 01:02:37.404434424 +0300 @@ -796,6 +796,17 @@ goto reread; } +static inline int +ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, -+ struct buffer_head *bh, int create, int extend_disksize) ++ struct buffer_head *bh, int create, int extend_disksize) +{ + if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) + return 
ext3_ext_get_block(handle, inode, block, bh, create, -+ extend_disksize); ++ extend_disksize); + return ext3_get_block_handle(handle, inode, block, bh, create, -+ extend_disksize); ++ extend_disksize); +} + static int ext3_get_block(struct inode *inode, sector_t iblock, @@ -2361,7 +2426,7 @@ Index: linux-2.6.5-sles9/fs/ext3/inode.c - ret = ext3_get_block_handle(handle, inode, iblock, - bh_result, create, 1); + ret = ext3_get_block_wrap(handle, inode, iblock, -+ bh_result, create, 1); ++ bh_result, create, 1); return ret; } @@ -2372,7 +2437,7 @@ Index: linux-2.6.5-sles9/fs/ext3/inode.c - ret = ext3_get_block_handle(handle, inode, iblock, - bh_result, create, 0); + ret = ext3_get_block_wrap(handle, inode, iblock, -+ bh_result, create, 0); ++ bh_result, create, 0); if (ret == 0) bh_result->b_size = (1 << inode->i_blkbits); return ret; @@ -2408,92 +2473,94 @@ Index: linux-2.6.5-sles9/fs/ext3/inode.c int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; int ret; -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_writepage_trans_blocks(inode, bpp); -+ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ if (ext3_should_journal_data(inode)) ret = 3 * (bpp + indirects) + 2; else Index: linux-2.6.5-sles9/fs/ext3/Makefile =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2004-11-09 02:18:27.604914376 +0300 -+++ linux-2.6.5-sles9/fs/ext3/Makefile 2004-11-09 02:23:21.593221360 +0300 -@@ -5,7 +5,7 @@ +--- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2005-02-23 01:01:46.501172896 +0300 ++++ linux-2.6.5-sles9/fs/ext3/Makefile 2005-02-23 01:02:37.405434272 +0300 +@@ -5,7 +5,8 @@ obj-$(CONFIG_EXT3_FS) += ext3.o ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o -+ ioctl.o namei.o super.o symlink.o hash.o extents.o ++ ioctl.o namei.o super.o symlink.o hash.o \ ++ extents.o 
ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o Index: linux-2.6.5-sles9/fs/ext3/super.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/super.c 2004-11-09 02:22:56.450043704 +0300 -+++ linux-2.6.5-sles9/fs/ext3/super.c 2004-11-09 02:23:21.597220752 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/super.c 2005-02-23 01:02:34.072940888 +0300 ++++ linux-2.6.5-sles9/fs/ext3/super.c 2005-02-23 01:47:15.291333736 +0300 @@ -389,6 +389,7 @@ struct ext3_super_block *es = sbi->s_es; int i; -+ ext3_ext_release(sb); ++ ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); if (!(sb->s_flags & MS_RDONLY)) { -@@ -447,6 +448,10 @@ +@@ -447,6 +448,8 @@ #endif ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; ei->vfs_inode.i_version = 1; -+ ei->i_cached_extent[0] = 0; -+ ei->i_cached_extent[1] = 0; -+ ei->i_cached_extent[2] = 0; -+ ei->i_cached_extent[3] = 0; ++ ++ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); return &ei->vfs_inode; } -@@ -537,7 +542,7 @@ - Opt_commit, Opt_journal_update, Opt_journal_inum, - Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -- Opt_err, -+ Opt_err, Opt_extents, Opt_extdebug +@@ -537,6 +540,7 @@ + Opt_ignore, Opt_barrier, + Opt_err, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, ++ Opt_extents, Opt_noextents, Opt_extdebug, }; static match_table_t tokens = { -@@ -582,6 +587,8 @@ +@@ -582,6 +585,9 @@ {Opt_iopen, "iopen"}, {Opt_noiopen, "noiopen"}, {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, ++ {Opt_noextents, "noextents"}, + {Opt_extdebug, "extdebug"}, + {Opt_barrier, "barrier=%u"}, {Opt_err, NULL} }; - -@@ -797,6 +804,12 @@ +@@ -797,6 +802,15 @@ break; case Opt_ignore: break; + case Opt_extents: + set_opt (sbi->s_mount_opt, EXTENTS); + break; ++ case Opt_noextents: ++ clear_opt 
(sbi->s_mount_opt, EXTENTS); ++ break; + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1449,6 +1462,8 @@ +@@ -1449,6 +1460,8 @@ percpu_counter_mod(&sbi->s_dirs_counter, ext3_count_dirs(sb)); -+ ext3_ext_init(sb); -+ ++ ext3_ext_init(sb); ++ return 0; failed_mount3: Index: linux-2.6.5-sles9/fs/ext3/ioctl.c =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/ioctl.c 2004-11-09 02:15:44.610693264 +0300 -+++ linux-2.6.5-sles9/fs/ext3/ioctl.c 2004-11-09 02:23:52.991448104 +0300 +--- linux-2.6.5-sles9.orig/fs/ext3/ioctl.c 2005-02-23 01:01:42.887722224 +0300 ++++ linux-2.6.5-sles9/fs/ext3/ioctl.c 2005-02-23 01:02:37.412433208 +0300 @@ -124,6 +124,10 @@ err = ext3_change_inode_journal_flag(inode, jflag); return err; @@ -2507,16 +2574,19 @@ Index: linux-2.6.5-sles9/fs/ext3/ioctl.c return put_user(inode->i_generation, (int *) arg); Index: linux-2.6.5-sles9/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:22:58.767691368 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:25:17.238640584 +0300 -@@ -186,6 +186,7 @@ +--- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2005-02-23 01:02:35.823674736 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2005-02-23 01:02:37.414432904 +0300 +@@ -186,8 +186,9 @@ + #define EXT3_NOTAIL_FL 0x00008000 /* don't merge file tail */ #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ - #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ +#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ - #define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +-#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* 
User visible flags */ ++#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + @@ -211,6 +212,9 @@ #endif #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) @@ -2529,41 +2599,60 @@ Index: linux-2.6.5-sles9/include/linux/ext3_fs.h * Structure of an inode on the disk @@ -333,6 +337,8 @@ #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ - #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ -+#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ -+#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ + #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef clear_opt -@@ -729,6 +735,7 @@ +@@ -503,11 +509,13 @@ + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 ++#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ + + #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ +- EXT3_FEATURE_INCOMPAT_META_BG) ++ EXT3_FEATURE_INCOMPAT_META_BG| \ ++ EXT3_FEATURE_INCOMPAT_EXTENTS) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) +@@ -729,6 +735,9 @@ /* inode.c */ -+extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); ++extern int ext3_block_truncate_page(handle_t *, struct page *, ++ struct address_space *, loff_t); 
++extern int ext3_writepage_trans_blocks(struct inode *inode); extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); -@@ -802,6 +809,14 @@ +@@ -802,6 +809,16 @@ extern struct inode_operations ext3_symlink_inode_operations; extern struct inode_operations ext3_fast_symlink_inode_operations; +/* extents.c */ +extern int ext3_ext_writepage_trans_blocks(struct inode *, int); +extern int ext3_ext_get_block(handle_t *, struct inode *, long, -+ struct buffer_head *, int, int); ++ struct buffer_head *, int, int); +extern void ext3_ext_truncate(struct inode *, struct page *); +extern void ext3_ext_init(struct super_block *); +extern void ext3_ext_release(struct super_block *); +extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); ++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); #endif /* __KERNEL__ */ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h =================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h 2003-01-30 13:24:37.000000000 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_extents.h 2004-11-09 02:23:21.606219384 +0300 -@@ -0,0 +1,252 @@ +--- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h 2005-02-17 22:07:57.023609040 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_extents.h 2005-02-23 01:02:37.416432600 +0300 +@@ -0,0 +1,262 @@ +/* + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -2605,7 +2694,7 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + */ +#define EXT_DEBUG_ +#ifdef EXT_DEBUG -+#define ext_debug(tree,fmt,a...) \ ++#define ext_debug(tree,fmt,a...) 
\ +do { \ + if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ + printk(fmt, ##a); \ @@ -2661,7 +2750,7 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + __u16 eh_entries; /* number of valid entries */ + __u16 eh_max; /* capacity of store in entries */ + __u16 eh_depth; /* has tree real underlaying blocks? */ -+ __u32 eh_generation; /* generation of the tree */ ++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ +}; + +#define EXT3_EXT_MAGIC 0xf30a @@ -2718,14 +2807,14 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + int (*mark_buffer_dirty)(handle_t *h, void *buffer); + int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); + int (*remove_extent_credits)(struct ext3_extents_tree *, -+ struct ext3_extent *, unsigned long, -+ unsigned long); ++ struct ext3_extent *, unsigned long, ++ unsigned long); + int (*remove_extent)(struct ext3_extents_tree *, -+ struct ext3_extent *, unsigned long, -+ unsigned long); ++ struct ext3_extent *, unsigned long, ++ unsigned long); + int (*new_block)(handle_t *, struct ext3_extents_tree *, -+ struct ext3_ext_path *, struct ext3_extent *, -+ int *); ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); +}; + +/* @@ -2735,8 +2824,8 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + * callback must return valid extent (passed or newly created) + */ +typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, -+ struct ext3_ext_path *, -+ struct ext3_extent *, int); ++ struct ext3_ext_path *, ++ struct ext3_ext_cache *); + +#define EXT_CONTINUE 0 +#define EXT_BREAK 1 @@ -2744,7 +2833,6 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + + +#define EXT_MAX_BLOCK 0xffffffff -+#define EXT_CACHE_MARK 0xffff + + +#define EXT_FIRST_EXTENT(__hdr__) \ @@ -2763,19 +2851,31 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) 
++#define EXT_HDR_GEN(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) ++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) ++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ + -+#define EXT_ROOT_HDR(tree) \ -+ ((struct ext3_extent_header *) (tree)->root) -+#define EXT_BLOCK_HDR(bh) \ -+ ((struct ext3_extent_header *) (bh)->b_data) -+#define EXT_DEPTH(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) -+#define EXT_GENERATION(_t_) \ -+ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) -+ ++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) ++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) ++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) ++#define EXT_GENERATION(__tree__) EXT_HDR_GEN(EXT_ROOT_HDR(__tree__)) + +#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); + ++#define EXT_CHECK_PATH(tree,path) \ ++{ \ ++ int depth = EXT_DEPTH(tree); \ ++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_idx < \ ++ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_ext < \ ++ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ ++ && depth != 0); \ ++ BUG_ON((path)[0].p_depth != depth); \ ++} ++ + +/* + * this structure is used to gather extents from the tree via ioctl @@ -2797,13 +2897,13 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + int leaf_num; +}; + ++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); +extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); +extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); +extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); +extern int 
ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); +extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); +extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); -+extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); +extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); + +static inline void @@ -2815,30 +2915,37 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h + + +#endif /* _LINUX_EXT3_EXTENTS */ -+ Index: linux-2.6.5-sles9/include/linux/ext3_fs_i.h =================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs_i.h 2004-11-09 02:22:55.780145544 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs_i.h 2004-11-09 02:23:21.606219384 +0300 -@@ -128,6 +128,8 @@ +--- linux-2.6.5-sles9.orig/include/linux/ext3_fs_i.h 2005-02-23 01:01:52.425272296 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_fs_i.h 2005-02-23 01:45:55.611446920 +0300 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + struct reserve_window { + __u32 _rsv_start; /* First byte reserved */ +@@ -128,6 +129,8 @@ */ struct semaphore truncate_sem; struct inode vfs_inode; + -+ __u32 i_cached_extent[4]; ++ struct ext3_ext_cache i_cached_extent; }; #endif /* _LINUX_EXT3_FS_I */ %diffstat fs/ext3/Makefile | 2 - fs/ext3/extents.c | 2313 +++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/extents.c | 2356 +++++++++++++++++++++++++++++++++++++++++++ fs/ext3/ialloc.c | 4 fs/ext3/inode.c | 29 fs/ext3/ioctl.c | 4 - fs/ext3/super.c | 17 - include/linux/ext3_extents.h | 252 ++++ - include/linux/ext3_fs.h | 15 - include/linux/ext3_fs_i.h | 2 - 9 files changed, 2630 insertions(+), 8 deletions(-) + fs/ext3/super.c | 15 + include/linux/ext3_extents.h | 265 ++++ + include/linux/ext3_fs.h | 17 + include/linux/ext3_fs_i.h | 3 + 9 files changed, 2687 insertions(+), 8 
deletions(-) diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch new file mode 100644 index 0000000..5b5558c --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch @@ -0,0 +1,2925 @@ +Index: linux-stage/fs/ext3/extents.c +=================================================================== +--- linux-stage.orig/fs/ext3/extents.c 2005-02-25 15:33:48.890198160 +0200 ++++ linux-stage/fs/ext3/extents.c 2005-02-25 15:33:48.917194056 +0200 +@@ -0,0 +1,2359 @@ ++/* ++ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++/* ++ * Extents support for EXT3 ++ * ++ * TODO: ++ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() ++ * - ext3_ext_calc_credits() could take 'mergable' into account ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - smart tree reduction ++ * - arch-independence ++ * common on-disk format for big/little-endian arch ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++static inline int ext3_ext_check_header(struct ext3_extent_header *eh) ++{ ++ if (eh->eh_magic != EXT3_EXT_MAGIC) { ++ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", ++ (unsigned)eh->eh_magic); ++ return -EIO; ++ } ++ if (eh->eh_max == 0) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n", ++ (unsigned)eh->eh_max); ++ return -EIO; ++ } ++ if (eh->eh_entries > eh->eh_max) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", ++ (unsigned)eh->eh_entries); ++ return -EIO; ++ } ++ return 0; ++} ++ ++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} ++ ++static int inline ++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->get_write_access) ++ return tree->ops->get_write_access(h,tree->buffer); ++ else ++ return 0; ++} ++ ++static int inline ++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->mark_buffer_dirty) ++ return tree->ops->mark_buffer_dirty(h,tree->buffer); ++ else ++ return 
0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ ++ if (path->p_bh) { ++ /* path points to block */ ++ err = ext3_journal_get_write_access(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_get_access_for_root(handle, tree); ++ } ++ return err; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ if (path->p_bh) { ++ /* path points to block */ ++ err =ext3_journal_dirty_metadata(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_mark_root_dirty(handle, tree); ++ } ++ return err; ++} ++ ++static int inline ++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, struct ext3_extent *ex, ++ int *err) ++{ ++ int goal, depth, newblock; ++ struct inode *inode; ++ ++ EXT_ASSERT(tree); ++ if (tree->ops->new_block) ++ return tree->ops->new_block(handle, tree, path, ex, err); ++ ++ inode = tree->inode; ++ depth = EXT_DEPTH(tree); ++ if (path && depth > 0) { ++ goal = path[depth-1].p_block; ++ } else { ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ ++ bg_start = (ei->i_block_group * ++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ goal = bg_start + colour; ++ } ++ ++ newblock = ext3_new_block(handle, inode, goal, err); ++ return newblock; ++} ++ ++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 
24) | ++ (EXT_HDR_GEN(neh) + 1); ++} ++ ++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 5; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int k, l = path->p_depth; ++ ++ ext_debug(tree, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(tree, " %d->%d", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ } else if (path->p_ext) { ++ ext_debug(tree, " %d:%d:%d", ++ path->p_ext->ee_block, ++ path->p_ext->ee_len, ++ path->p_ext->ee_start); ++ } else ++ ext_debug(tree, " []"); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; ++ int i; ++ ++ if (!path) ++ return; ++ ++ eh = path[depth].p_hdr; ++ ex = EXT_FIRST_EXTENT(eh); ++ ++ for (i = 0; i 
< eh->eh_entries; i++, ex++) { ++ ext_debug(tree, "%d:%d:%d ", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_drop_refs(struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) { ++ if (path->p_bh) { ++ brelse(path->p_bh); ++ path->p_bh = NULL; ++ } ++ } ++} ++ ++/* ++ * binary search for closest index by given block ++ */ ++static inline void ++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent_idx *ix; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_entries > 0); ++ ++ ext_debug(tree, "binsearch for %d(idx): ", block); ++ ++ path->p_idx = ix = EXT_FIRST_INDEX(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ix[l + k].ei_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ix += l; ++ path->p_idx = ix; ++ ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf); ++ ++ while (l++ < r) { ++ if (block < ix->ei_block) ++ break; ++ path->p_idx = ix++; ++ } ++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent_idx *chix; ++ ++ chix = ix = EXT_FIRST_INDEX(eh); ++ for (k = 0; k < eh->eh_entries; k++, ix++) { ++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { ++ printk("k=%d, ix=0x%p, first=0x%p\n", k, ++ ix, EXT_FIRST_INDEX(eh)); ++ printk("%u <= %u\n", ++ ix->ei_block,ix[-1].ei_block); ++ } ++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); ++ if (block < ix->ei_block) ++ break; ++ chix = ix; ++ } ++ EXT_ASSERT(chix == path->p_idx); ++ } ++#endif ++} ++ ++/* ++ * binary search for closest extent by given block ++ */ ++static inline void 
++ext3_ext_binsearch(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent *ex; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ ++ if (eh->eh_entries == 0) { ++ /* ++ * this leaf is empty yet: ++ * we get such a leaf in split/add case ++ */ ++ return; ++ } ++ ++ ext_debug(tree, "binsearch for %d: ", block); ++ ++ path->p_ext = ex = EXT_FIRST_EXTENT(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ex[l + k].ee_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ex += l; ++ path->p_ext = ex; ++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++ while (l++ < r) { ++ if (block < ex->ee_block) ++ break; ++ path->p_ext = ex++; ++ } ++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent *chex; ++ ++ chex = ex = EXT_FIRST_EXTENT(eh); ++ for (k = 0; k < eh->eh_entries; k++, ex++) { ++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); ++ if (block < ex->ee_block) ++ break; ++ chex = ex; ++ } ++ EXT_ASSERT(chex == path->p_ext); ++ } ++#endif ++} ++ ++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *eh; ++ ++ BUG_ON(tree->buffer_len == 0); ++ ext3_ext_get_access_for_root(handle, tree); ++ eh = EXT_ROOT_HDR(tree); ++ eh->eh_depth = 0; ++ eh->eh_entries = 0; ++ eh->eh_magic = EXT3_EXT_MAGIC; ++ eh->eh_max = ext3_ext_space_root(tree); ++ ext3_ext_mark_root_dirty(handle, tree); ++ ext3_ext_invalidate_cache(tree); ++ return 0; ++} ++ ++struct ext3_ext_path * ++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ struct buffer_head *bh; 
++ int depth, i, ppos = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ eh = EXT_ROOT_HDR(tree); ++ EXT_ASSERT(eh); ++ if (ext3_ext_check_header(eh)) { ++ /* don't free previously allocated path ++ * -- caller should take care */ ++ path = NULL; ++ goto err; ++ } ++ ++ i = depth = EXT_DEPTH(tree); ++ EXT_ASSERT(eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* account possible depth increase */ ++ if (!path) { ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), ++ GFP_NOFS); ++ if (!path) ++ return ERR_PTR(-ENOMEM); ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[0].p_hdr = eh; ++ ++ /* walk through the tree */ ++ while (i) { ++ ext_debug(tree, "depth %d: num %d, max %d\n", ++ ppos, eh->eh_entries, eh->eh_max); ++ ext3_ext_binsearch_idx(tree, path + ppos, block); ++ path[ppos].p_block = path[ppos].p_idx->ei_leaf; ++ path[ppos].p_depth = i; ++ path[ppos].p_ext = NULL; ++ ++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); ++ if (!bh) ++ goto err; ++ ++ eh = EXT_BLOCK_HDR(bh); ++ ppos++; ++ EXT_ASSERT(ppos <= depth); ++ path[ppos].p_bh = bh; ++ path[ppos].p_hdr = eh; ++ i--; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ } ++ ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ path[ppos].p_idx = NULL; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ ++ /* find extent */ ++ ext3_ext_binsearch(tree, path + ppos, block); ++ ++ ext3_ext_show_path(tree, path); ++ ++ return path; ++ ++err: ++ printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ return ERR_PTR(-EIO); ++} ++ ++/* ++ * insert new index [logical;ptr] into the block at cupr ++ * it check where to insert: before curp or after curp ++ */ ++static int ext3_ext_insert_index(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *curp, ++ int logical, int ptr) ++{ ++ struct 
ext3_extent_idx *ix; ++ int len, err; ++ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ return err; ++ ++ EXT_ASSERT(logical != curp->p_idx->ei_block); ++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; ++ if (logical > curp->p_idx->ei_block) { ++ /* insert after */ ++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d after: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ } ++ ix = curp->p_idx + 1; ++ } else { ++ /* insert before */ ++ len = len * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d before: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->ei_block = logical; ++ ix->ei_leaf = ptr; ++ ix->ei_leaf_hi = ix->ei_unused = 0; ++ curp->p_hdr->eh_entries++; ++ ++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); ++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); ++ ++ err = ext3_ext_dirty(handle, tree, curp); ++ ext3_std_error(tree->inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at: ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extens and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ 
int i = at, k, m, a; ++ unsigned long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? */ ++ /* FIXME: now desicion is simplest: at current extent */ ++ ++ /* if current leaf will be splitted, then we should use ++ * border from split point */ ++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].ee_block; ++ ext_debug(tree, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->ee_block; ++ ext_debug(tree, "leaf will be added." ++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. so, index won't ++ * be inserted and tree will be in consistent ++ * state. next mount will repair buffers too ++ */ ++ ++ /* ++ * get array to track all allocated blocks ++ * we need this to handle errors and free blocks ++ * upon them ++ */ ++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); ++ if (!ablocks) ++ return -ENOMEM; ++ memset(ablocks, 0, sizeof(unsigned long) * depth); ++ ++ /* allocate all needed blocks */ ++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); ++ for (a = 0; a < depth - at; a++) { ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ goto cleanup; ++ ablocks[a] = newblock; ++ } ++ ++ /* initialize new leaf */ ++ newblock = ablocks[--a]; ++ EXT_ASSERT(newblock); ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 0; ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_depth = 0; ++ ex = EXT_FIRST_EXTENT(neh); ++ ++ /* 
move remain of path[depth] to the new leaf */ ++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); ++ /* start copy from next extent */ ++ /* TODO: we could do it by single memmove */ ++ m = 0; ++ path[depth].p_ext++; ++ while (path[depth].p_ext <= ++ EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", ++ path[depth].p_ext->ee_block, ++ path[depth].p_ext->ee_start, ++ path[depth].p_ext->ee_len, ++ newblock); ++ memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent)); ++ neh->eh_entries++; ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old leaf */ ++ if (m) { ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ path[depth].p_hdr->eh_entries -= m; ++ if ((err = ext3_ext_dirty(handle, tree, path + depth))) ++ goto cleanup; ++ ++ } ++ ++ /* create intermediate indexes */ ++ k = depth - at - 1; ++ EXT_ASSERT(k >= 0); ++ if (k) ++ ext_debug(tree, "create %d intermediate indices\n", k); ++ /* insert new index into current index block */ ++ /* current depth stored in i var */ ++ i = depth - 1; ++ while (k--) { ++ oldblock = newblock; ++ newblock = ablocks[--a]; ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 1; ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ neh->eh_depth = depth - i; ++ fidx = EXT_FIRST_INDEX(neh); ++ fidx->ei_block = border; ++ fidx->ei_leaf = oldblock; ++ fidx->ei_leaf_hi = fidx->ei_unused = 0; ++ ++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", ++ i, newblock, border, oldblock); ++ /* copy indexes */ ++ m = 0; ++ path[i].p_idx++; ++ ++ ext_debug(tree, "cur 0x%p, last 
0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); ++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == ++ EXT_LAST_INDEX(path[i].p_hdr)); ++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ++ ext_debug(tree, "%d: move %d:%d in new index %lu\n", ++ i, path[i].p_idx->ei_block, ++ path[i].p_idx->ei_leaf, newblock); ++ memmove(++fidx, path[i].p_idx++, ++ sizeof(struct ext3_extent_idx)); ++ neh->eh_entries++; ++ EXT_ASSERT(neh->eh_entries <= neh->eh_max); ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old index */ ++ if (m) { ++ err = ext3_ext_get_access(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ path[i].p_hdr->eh_entries -= m; ++ err = ext3_ext_dirty(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ } ++ ++ i--; ++ } ++ ++ /* insert new index */ ++ if (!err) ++ err = ext3_ext_insert_index(handle, tree, path + at, ++ border, newblock); ++ ++cleanup: ++ if (bh) { ++ if (buffer_locked(bh)) ++ unlock_buffer(bh); ++ brelse(bh); ++ } ++ ++ if (err) { ++ /* free all allocated blocks in error case */ ++ for (i = 0; i < depth; i++) { ++ if (!ablocks[i]) ++ continue; ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ } ++ } ++ kfree(ablocks); ++ ++ return err; ++} ++ ++/* ++ * routine implements tree growing procedure: ++ * - allocates new block ++ * - moves top-level data (index block or leaf) into the new block ++ * - initialize new top-level, creating index that points to the ++ * just created block ++ */ ++static int ext3_ext_grow_indepth(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp = path; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct buffer_head *bh; ++ unsigned long newblock; ++ int err = 0; ++ ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ 
if (newblock == 0) ++ return err; ++ ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ ext3_std_error(tree->inode->i_sb, err); ++ return err; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) { ++ unlock_buffer(bh); ++ goto out; ++ } ++ ++ /* move top-level index/leaf into new block */ ++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); ++ ++ /* set size of new block */ ++ neh = EXT_BLOCK_HDR(bh); ++ /* old root could have indexes or leaves ++ * so calculate eh_max right way */ ++ if (EXT_DEPTH(tree)) ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ else ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto out; ++ ++ /* create index in new top-level index: num,max,pointer */ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ goto out; ++ ++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; ++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); ++ curp->p_hdr->eh_entries = 1; ++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); ++ /* FIXME: it works, but actually path[0] can be index */ ++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; ++ curp->p_idx->ei_leaf = newblock; ++ curp->p_idx->ei_leaf_hi = curp->p_idx->ei_unused = 0; ++ ++ neh = EXT_ROOT_HDR(tree); ++ fidx = EXT_FIRST_INDEX(neh); ++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", ++ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); ++ ++ neh->eh_depth = path->p_depth + 1; ++ err = ext3_ext_dirty(handle, tree, curp); ++out: ++ brelse(bh); ++ ++ return err; ++} ++ ++/* ++ * routine finds empty index and adds new leaf. 
if no free index found ++ * then it requests in-depth growing ++ */ ++static int ext3_ext_create_new_leaf(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp; ++ int depth, i, err = 0; ++ ++repeat: ++ i = depth = EXT_DEPTH(tree); ++ ++ /* walk up to the tree and look for free index entry */ ++ curp = path + depth; ++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { ++ i--; ++ curp--; ++ } ++ ++ /* we use already allocated block for index block ++ * so, subsequent data blocks should be contigoues */ ++ if (EXT_HAS_FREE_INDEX(curp)) { ++ /* if we found index with free entry, then use that ++ * entry: create all needed subtree and add new leaf */ ++ err = ext3_ext_split(handle, tree, path, newext, i); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ } else { ++ /* tree is full, time to grow in depth */ ++ err = ext3_ext_grow_indepth(handle, tree, path, newext); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ ++ /* ++ * only first (depth 0 -> 1) produces free space ++ * in all other cases we have to split growed tree ++ */ ++ depth = EXT_DEPTH(tree); ++ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { ++ /* now we need split */ ++ goto repeat; ++ } ++ } ++ ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++/* ++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK ++ * NOTE: it consider block number from index entry as ++ * allocated block. 
thus, index entries have to be consistent ++ * with leafs ++ */ ++static unsigned long ++ext3_ext_next_allocated_block(struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ if (depth == 0 && path->p_ext == NULL) ++ return EXT_MAX_BLOCK; ++ ++ /* FIXME: what if index isn't full ?! */ ++ while (depth >= 0) { ++ if (depth == path->p_depth) { ++ /* leaf */ ++ if (path[depth].p_ext != ++ EXT_LAST_EXTENT(path[depth].p_hdr)) ++ return path[depth].p_ext[1].ee_block; ++ } else { ++ /* index */ ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ } ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * returns first allocated block from next leaf or EXT_MAX_BLOCK ++ */ ++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ /* zero-tree has no leaf blocks at all */ ++ if (depth == 0) ++ return EXT_MAX_BLOCK; ++ ++ /* go to index block */ ++ depth--; ++ ++ while (depth >= 0) { ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * if leaf gets modified and modified extent is first in the leaf ++ * then we have to correct all indexes above ++ * TODO: do we need to correct tree in all cases? 
++ */ ++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent *ex; ++ unsigned long border; ++ int k, err = 0; ++ ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(eh); ++ ++ if (depth == 0) { ++ /* there is no tree at all */ ++ return 0; ++ } ++ ++ if (ex != EXT_FIRST_EXTENT(eh)) { ++ /* we correct tree if first leaf got modified only */ ++ return 0; ++ } ++ ++ /* ++ * TODO: we need correction if border is smaller then current one ++ */ ++ k = depth - 1; ++ border = path[depth].p_ext->ee_block; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ return err; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ return err; ++ ++ while (k--) { ++ /* change all left-side indexes */ ++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) ++ break; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ break; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ break; ++ } ++ ++ return err; ++} ++ ++static int inline ++ext3_can_extents_be_merged(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block) ++ return 0; ++ ++#ifdef AGRESSIVE_TEST ++ if (ex1->ee_len >= 4) ++ return 0; ++#endif ++ ++ if (!tree->ops->mergable) ++ return 1; ++ ++ return tree->ops->mergable(ex1, ex2); ++} ++ ++/* ++ * this routine tries to merge requsted extent into the existing ++ * extent or inserts requested extent as new one into the tree, ++ * creating new leaf in no-space case ++ */ ++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_extent_header * eh; ++ struct ext3_extent *ex, *fex; ++ struct ext3_extent 
*nearex; /* nearest extent */ ++ struct ext3_ext_path *npath = NULL; ++ int depth, len, err, next; ++ ++ EXT_ASSERT(newext->ee_len > 0); ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(path[depth].p_hdr); ++ ++ /* try to insert block into found extent and return */ ++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { ++ ext_debug(tree, "append %d block to %d:%d (from %d)\n", ++ newext->ee_len, ex->ee_block, ex->ee_len, ++ ex->ee_start); ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ return err; ++ ex->ee_len += newext->ee_len; ++ eh = path[depth].p_hdr; ++ nearex = ex; ++ goto merge; ++ } ++ ++repeat: ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) ++ goto has_space; ++ ++ /* probably next leaf has space for us? */ ++ fex = EXT_LAST_EXTENT(eh); ++ next = ext3_ext_next_leaf_block(tree, path); ++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { ++ ext_debug(tree, "next leaf block - %d\n", next); ++ EXT_ASSERT(!npath); ++ npath = ext3_ext_find_extent(tree, next, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ EXT_ASSERT(npath->p_depth == path->p_depth); ++ eh = npath[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) { ++ ext_debug(tree, "next leaf isnt full(%d)\n", ++ eh->eh_entries); ++ path = npath; ++ goto repeat; ++ } ++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", ++ eh->eh_entries, eh->eh_max); ++ } ++ ++ /* ++ * there is no free space in found leaf ++ * we're gonna add new leaf in the tree ++ */ ++ err = ext3_ext_create_new_leaf(handle, tree, path, newext); ++ if (err) ++ goto cleanup; ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ ++has_space: ++ nearex = path[depth].p_ext; ++ ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ ++ if (!nearex) { ++ /* there is no extent in this leaf, create first one */ ++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", ++ newext->ee_block, 
newext->ee_start, ++ newext->ee_len); ++ path[depth].p_ext = EXT_FIRST_EXTENT(eh); ++ } else if (newext->ee_block > nearex->ee_block) { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ if (nearex != EXT_LAST_EXTENT(eh)) { ++ len = EXT_MAX_EXTENT(eh) - nearex; ++ len = (len - 1) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 2, nearex + 1, len); ++ } ++ path[depth].p_ext = nearex + 1; ++ } else { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 1, nearex, len); ++ path[depth].p_ext = nearex; ++ } ++ ++ eh->eh_entries++; ++ nearex = path[depth].p_ext; ++ nearex->ee_block = newext->ee_block; ++ nearex->ee_start = newext->ee_start; ++ nearex->ee_len = newext->ee_len; ++ /* FIXME: support for large fs */ ++ nearex->ee_start_hi = 0; ++ ++merge: ++ /* try to merge extents to the right */ ++ while (nearex < EXT_LAST_EXTENT(eh)) { ++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) ++ break; ++ /* merge with next extent! 
*/ ++ nearex->ee_len += nearex[1].ee_len; ++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { ++ len = (EXT_LAST_EXTENT(eh) - nearex - 1) * ++ sizeof(struct ext3_extent); ++ memmove(nearex + 1, nearex + 2, len); ++ } ++ eh->eh_entries--; ++ EXT_ASSERT(eh->eh_entries > 0); ++ } ++ ++ /* try to merge extents to the left */ ++ ++ /* time to correct all indexes above */ ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ if (err) ++ goto cleanup; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ ++cleanup: ++ if (npath) { ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ } ++ ext3_ext_tree_changed(tree); ++ ext3_ext_invalidate_cache(tree); ++ return err; ++} ++ ++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, ++ unsigned long num, ext_prepare_callback func) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_ext_cache cbex; ++ struct ext3_extent *ex; ++ unsigned long next, start = 0, end = 0; ++ unsigned long last = block + num; ++ int depth, exists, err = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(func); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ while (block < last && block != EXT_MAX_BLOCK) { ++ num = last - block; ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(tree, block, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ break; ++ } ++ ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(path[depth].p_hdr); ++ ex = path[depth].p_ext; ++ next = ext3_ext_next_allocated_block(path); ++ ++ exists = 0; ++ if (!ex) { ++ /* there is no extent yet, so try to allocate ++ * all requested space */ ++ start = block; ++ end = block + num; ++ } else if (ex->ee_block > block) { ++ /* need to allocate space before found extent */ ++ start = block; ++ end = ex->ee_block; ++ if (block + num < end) ++ end = block + num; ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ /* need to allocate space after found extent */ ++ start = block; ++ end = block + num; ++ if (end >= next) ++ end = next; 
++ } else if (block >= ex->ee_block) { ++ /* ++ * some part of requested space is covered ++ * by found extent ++ */ ++ start = block; ++ end = ex->ee_block + ex->ee_len; ++ if (block + num < end) ++ end = block + num; ++ exists = 1; ++ } else { ++ BUG(); ++ } ++ EXT_ASSERT(end > start); ++ ++ if (!exists) { ++ cbex.ec_block = start; ++ cbex.ec_len = end - start; ++ cbex.ec_start = 0; ++ cbex.ec_type = EXT3_EXT_CACHE_GAP; ++ } else { ++ cbex.ec_block = ex->ee_block; ++ cbex.ec_len = ex->ee_len; ++ cbex.ec_start = ex->ee_start; ++ cbex.ec_type = EXT3_EXT_CACHE_EXTENT; ++ } ++ ++ EXT_ASSERT(cbex.ec_len > 0); ++ EXT_ASSERT(path[depth].p_hdr); ++ err = func(tree, path, &cbex); ++ ext3_ext_drop_refs(path); ++ ++ if (err < 0) ++ break; ++ if (err == EXT_REPEAT) ++ continue; ++ else if (err == EXT_BREAK) { ++ err = 0; ++ break; ++ } ++ ++ if (EXT_DEPTH(tree) != depth) { ++ /* depth was changed. we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ block = cbex.ec_block + cbex.ec_len; ++ } ++ ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ return err; ++} ++ ++static inline void ++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, ++ __u32 len, __u32 start, int type) ++{ ++ EXT_ASSERT(len > 0); ++ if (tree->cex) { ++ tree->cex->ec_type = type; ++ tree->cex->ec_block = block; ++ tree->cex->ec_len = len; ++ tree->cex->ec_start = start; ++ } ++} ++ ++/* ++ * this routine calculate boundaries of the gap requested block fits into ++ * and cache this gap ++ */ ++static inline void ++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ unsigned long block) ++{ ++ int depth = EXT_DEPTH(tree); ++ unsigned long lblock, len; ++ struct ext3_extent *ex; ++ ++ if (!tree->cex) ++ return; ++ ++ ex = path[depth].p_ext; ++ if (ex == NULL) { ++ /* there is no extent yet, so gap is [0;-] */ ++ lblock = 0; ++ len = EXT_MAX_BLOCK; ++ ext_debug(tree, "cache gap(whole file):"); ++ } else if (block < 
ex->ee_block) { ++ lblock = block; ++ len = ex->ee_block - block; ++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len); ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ lblock = ex->ee_block + ex->ee_len; ++ len = ext3_ext_next_allocated_block(path); ++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) block); ++ EXT_ASSERT(len > lblock); ++ len = len - lblock; ++ } else { ++ lblock = len = 0; ++ BUG(); ++ } ++ ++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len); ++ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP); ++} ++ ++static inline int ++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, ++ struct ext3_extent *ex) ++{ ++ struct ext3_ext_cache *cex = tree->cex; ++ ++ /* is there cache storage at all? */ ++ if (!cex) ++ return EXT3_EXT_CACHE_NO; ++ ++ /* has cache valid data? */ ++ if (cex->ec_type == EXT3_EXT_CACHE_NO) ++ return EXT3_EXT_CACHE_NO; ++ ++ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || ++ cex->ec_type == EXT3_EXT_CACHE_EXTENT); ++ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { ++ ex->ee_block = cex->ec_block; ++ ex->ee_start = cex->ec_start; ++ ex->ee_start_hi = 0; ++ ex->ee_len = cex->ec_len; ++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) ex->ee_start); ++ return cex->ec_type; ++ } ++ ++ /* not in cache */ ++ return EXT3_EXT_CACHE_NO; ++} ++ ++/* ++ * routine removes index from the index block ++ * it's used in truncate case only. 
thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->eh_entries); ++ if ((err = ext3_ext_get_access(handle, tree, path))) ++ return err; ++ path->p_hdr->eh_entries--; ++ if ((err = ext3_ext_dirty(handle, tree, path))) ++ return err; ++ ext_debug(tree, "index is empty, remove it, free block %d\n", ++ path->p_idx->ei_leaf); ++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ return err; ++} ++ ++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth = EXT_DEPTH(tree); ++ int needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) ++ return 1; ++ } ++ ++ /* ++ * the worste case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ depth = depth + 1; ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. 
we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ return needed; ++} ++ ++static int ++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, tex; ++ struct ext3_ext_path *npath; ++ int depth, creds, err; ++ ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); ++ EXT_ASSERT(ex->ee_block < start); ++ ++ /* calculate tail extent */ ++ tex.ee_block = end + 1; ++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); ++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; ++ ++ creds = ext3_ext_calc_credits_for_insert(tree, path); ++ handle = ext3_ext_journal_restart(handle, creds); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ /* calculate head extent. use primary extent */ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ return err; ++ ex->ee_len = start - ex->ee_block; ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ return err; ++ ++ /* FIXME: some callback to free underlying resource ++ * and correct ee_start? 
*/ ++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", ++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); ++ ++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); ++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); ++ ++ err = ext3_ext_insert_extent(handle, tree, npath, &tex); ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ ++ return err; ++} ++ ++static int ++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, *fu = NULL, *lu, *le; ++ int err = 0, correct_index = 0; ++ int depth = EXT_DEPTH(tree), credits; ++ struct ext3_extent_header *eh; ++ unsigned a, b, block, num; ++ ++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); ++ if (!path[depth].p_hdr) ++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); ++ eh = path[depth].p_hdr; ++ EXT_ASSERT(eh); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* find where to start removing */ ++ le = ex = EXT_LAST_EXTENT(eh); ++ while (ex != EXT_FIRST_EXTENT(eh)) { ++ if (ex->ee_block <= end) ++ break; ++ ex--; ++ } ++ ++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { ++ /* removal of internal part of the extent requested ++ * tail and head must be placed in different extent ++ * so, we have to insert one more extent */ ++ path[depth].p_ext = ex; ++ return ext3_ext_split_for_rm(handle, tree, path, start, end); ++ } ++ ++ lu = ex; ++ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) { ++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); ++ path[depth].p_ext = ex; ++ ++ a = ex->ee_block > start ? ex->ee_block : start; ++ b = ex->ee_block + ex->ee_len - 1 < end ? 
++ ex->ee_block + ex->ee_len - 1 : end; ++ ++ ext_debug(tree, " border %u:%u\n", a, b); ++ ++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { ++ block = 0; ++ num = 0; ++ BUG(); ++ } else if (a != ex->ee_block) { ++ /* remove tail of the extent */ ++ block = ex->ee_block; ++ num = a - block; ++ } else if (b != ex->ee_block + ex->ee_len - 1) { ++ /* remove head of the extent */ ++ block = a; ++ num = b - a; ++ } else { ++ /* remove whole extent: excelent! */ ++ block = ex->ee_block; ++ num = 0; ++ EXT_ASSERT(a == ex->ee_block && ++ b == ex->ee_block + ex->ee_len - 1); ++ } ++ ++ if (ex == EXT_FIRST_EXTENT(eh)) ++ correct_index = 1; ++ ++ credits = 1; ++ if (correct_index) ++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; ++ if (tree->ops->remove_extent_credits) ++ credits+=tree->ops->remove_extent_credits(tree,ex,a,b); ++ ++ handle = ext3_ext_journal_restart(handle, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ goto out; ++ } ++ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ if (tree->ops->remove_extent) ++ err = tree->ops->remove_extent(tree, ex, a, b); ++ if (err) ++ goto out; ++ ++ if (num == 0) { ++ /* this extent is removed entirely mark slot unused */ ++ ex->ee_start = ex->ee_start_hi = 0; ++ eh->eh_entries--; ++ fu = ex; ++ } ++ ++ ex->ee_block = block; ++ ex->ee_len = num; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ ext_debug(tree, "new extent: %u:%u:%u\n", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ ex--; ++ } ++ ++ if (fu) { ++ /* reuse unused slots */ ++ while (lu < le) { ++ if (lu->ee_start) { ++ *fu = *lu; ++ lu->ee_start = lu->ee_start_hi = 0; ++ fu++; ++ } ++ lu++; ++ } ++ } ++ ++ if (correct_index && eh->eh_entries) ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ ++ /* if this leaf is free, then we should ++ * remove it from index block above */ ++ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) ++ 
err = ext3_ext_rm_idx(handle, tree, path + depth); ++ ++out: ++ return err; ++} ++ ++ ++static struct ext3_extent_idx * ++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) ++{ ++ struct ext3_extent_idx *ix; ++ ++ ix = EXT_LAST_INDEX(hdr); ++ while (ix != EXT_FIRST_INDEX(hdr)) { ++ if (ix->ei_block <= block) ++ break; ++ ix--; ++ } ++ return ix; ++} ++ ++/* ++ * returns 1 if current index have to be freed (even partial) ++ */ ++static int inline ++ext3_ext_more_to_rm(struct ext3_ext_path *path) ++{ ++ EXT_ASSERT(path->p_idx); ++ ++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) ++ return 0; ++ ++ /* ++ * if truncate on deeper level happened it it wasn't partial ++ * so we have to consider current index for truncation ++ */ ++ if (path->p_hdr->eh_entries == path->p_block) ++ return 0; ++ return 1; ++} ++ ++int ext3_ext_remove_space(struct ext3_extents_tree *tree, ++ unsigned long start, unsigned long end) ++{ ++ struct inode *inode = tree->inode; ++ struct super_block *sb = inode->i_sb; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_ext_path *path; ++ handle_t *handle; ++ int i = 0, err = 0; ++ ++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); ++ ++ /* probably first extent we're gonna free will be last in block */ ++ handle = ext3_journal_start(inode, depth + 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ ext3_ext_invalidate_cache(tree); ++ ++ /* ++ * we start scanning from right side freeing all the blocks ++ * after i_size and walking into the deep ++ */ ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); ++ if (IS_ERR(path)) { ++ ext3_error(sb, __FUNCTION__, "Can't allocate path array"); ++ ext3_journal_stop(handle); ++ return -ENOMEM; ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[i].p_hdr = EXT_ROOT_HDR(tree); ++ ++ while (i >= 0 && err == 0) { ++ if (i == depth) { ++ /* this is leaf block */ ++ err = ext3_ext_rm_leaf(handle, tree, path, start, end); ++ 
/* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ continue; ++ } ++ ++ /* this is index block */ ++ if (!path[i].p_hdr) { ++ ext_debug(tree, "initialize header\n"); ++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); ++ } ++ ++ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); ++ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); ++ ++ if (!path[i].p_idx) { ++ /* this level hasn't touched yet */ ++ path[i].p_idx = ++ ext3_ext_last_covered(path[i].p_hdr, end); ++ path[i].p_block = path[i].p_hdr->eh_entries + 1; ++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", ++ path[i].p_hdr, path[i].p_hdr->eh_entries); ++ } else { ++ /* we've already was here, see at next index */ ++ path[i].p_idx--; ++ } ++ ++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", ++ i, EXT_FIRST_INDEX(path[i].p_hdr), ++ path[i].p_idx); ++ if (ext3_ext_more_to_rm(path + i)) { ++ /* go to the next level */ ++ ext_debug(tree, "move to level %d (block %d)\n", ++ i + 1, path[i].p_idx->ei_leaf); ++ memset(path + i + 1, 0, sizeof(*path)); ++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); ++ if (!path[i+1].p_bh) { ++ /* should we reset i_size? 
*/ ++ err = -EIO; ++ break; ++ } ++ /* put actual number of indexes to know is this ++ * number got changed at the next iteration */ ++ path[i].p_block = path[i].p_hdr->eh_entries; ++ i++; ++ } else { ++ /* we finish processing this index, go up */ ++ if (path[i].p_hdr->eh_entries == 0 && i > 0) { ++ /* index is empty, remove it ++ * handle must be already prepared by the ++ * truncatei_leaf() */ ++ err = ext3_ext_rm_idx(handle, tree, path + i); ++ } ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ ext_debug(tree, "return to level %d\n", i); ++ } ++ } ++ ++ /* TODO: flexible tree reduction should be here */ ++ if (path->p_hdr->eh_entries == 0) { ++ /* ++ * truncate to zero freed all the tree ++ * so, we need to correct eh_depth ++ */ ++ err = ext3_ext_get_access(handle, tree, path); ++ if (err == 0) { ++ EXT_ROOT_HDR(tree)->eh_depth = 0; ++ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree); ++ err = ext3_ext_dirty(handle, tree, path); ++ } ++ } ++ ext3_ext_tree_changed(tree); ++ ++ kfree(path); ++ ext3_journal_stop(handle); ++ ++ return err; ++} ++ ++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks) ++{ ++ int lcap, icap, rcap, leafs, idxs, num; ++ ++ rcap = ext3_ext_space_root(tree); ++ if (blocks <= rcap) { ++ /* all extents fit to the root */ ++ return 0; ++ } ++ ++ rcap = ext3_ext_space_root_idx(tree); ++ lcap = ext3_ext_space_block(tree); ++ icap = ext3_ext_space_block_idx(tree); ++ ++ num = leafs = (blocks + lcap - 1) / lcap; ++ if (leafs <= rcap) { ++ /* all pointers to leafs fit to the root */ ++ return leafs; ++ } ++ ++ /* ok. 
we need separate index block(s) to link all leaf blocks */ ++ idxs = (leafs + icap - 1) / icap; ++ do { ++ num += idxs; ++ idxs = (idxs + icap - 1) / icap; ++ } while (idxs > rcap); ++ ++ return num; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) { ++ printk("EXT3-fs: file extents enabled"); ++#ifdef AGRESSIVE_TEST ++ printk(", agressive tests"); ++#endif ++#ifdef CHECK_BINSEARCH ++ printk(", check binsearch"); ++#endif ++ printk("\n"); ++ } ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++} ++ ++/************************************************************************ ++ * VFS related routines ++ ************************************************************************/ ++ ++static int ext3_get_inode_write_access(handle_t *handle, void *buffer) ++{ ++ /* we use in-core data, not bh */ ++ return 0; ++} ++ ++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct inode *inode = buffer; ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static int ext3_ext_mergable(struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ /* FIXME: support for large fs */ ++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) ++ return 1; ++ return 0; ++} ++ ++static int ++ext3_remove_blocks_credits(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed; ++ ++ /* at present, extent can't cross block group */; ++ needed = 4; /* bitmap + group desc + sb + inode */ ++ ++#ifdef CONFIG_QUOTA ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ return needed; ++} ++ ++static int ++ext3_remove_blocks(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed = ext3_remove_blocks_credits(tree, ex, from, to); ++ handle_t *handle = 
ext3_journal_start(tree->inode, needed); ++ struct buffer_head *bh; ++ int i; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { ++ /* tail removal */ ++ unsigned long num, start; ++ num = ex->ee_block + ex->ee_len - from; ++ start = ex->ee_start + ex->ee_len - num; ++ ext_debug(tree, "free last %lu blocks starting %lu\n", ++ num, start); ++ for (i = 0; i < num; i++) { ++ bh = sb_find_get_block(tree->inode->i_sb, start + i); ++ ext3_forget(handle, 0, tree->inode, bh, start + i); ++ } ++ ext3_free_blocks(handle, tree->inode, start, num); ++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { ++ printk("strange request: removal %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } else { ++ printk("strange request: removal(2) %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } ++ ext3_journal_stop(handle); ++ return 0; ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, ++ struct ext3_ext_path *path, unsigned long block) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ struct ext3_extent *ex; ++ depth = path->p_depth; ++ ++ /* try to predict block placement */ ++ if ((ex = path[depth].p_ext)) ++ return ex->ee_start + (block - ex->ee_block); ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. 
use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour + block; ++} ++ ++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int *err) ++{ ++ struct inode *inode = tree->inode; ++ int newblock, goal; ++ ++ EXT_ASSERT(path); ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->ee_start); ++ EXT_ASSERT(ex->ee_len); ++ ++ /* reuse block from the extent to order data/metadata */ ++ newblock = ex->ee_start++; ++ ex->ee_len--; ++ if (ex->ee_len == 0) { ++ ex->ee_len = 1; ++ /* allocate new block for the extent */ ++ goal = ext3_ext_find_goal(inode, path, ex->ee_block); ++ ex->ee_start = ext3_new_block(handle, inode, goal, err); ++ ex->ee_start_hi = 0; ++ if (ex->ee_start == 0) { ++ /* error occured: restore old extent */ ++ ex->ee_start = newblock; ++ return 0; ++ } ++ } ++ return newblock; ++} ++ ++static struct ext3_extents_helpers ext3_blockmap_helpers = { ++ .get_write_access = ext3_get_inode_write_access, ++ .mark_buffer_dirty = ext3_mark_buffer_dirty, ++ .mergable = ext3_ext_mergable, ++ .new_block = ext3_new_block_cb, ++ .remove_extent = ext3_remove_blocks, ++ .remove_extent_credits = ext3_remove_blocks_credits, ++}; ++ ++void ext3_init_tree_desc(struct ext3_extents_tree *tree, ++ struct inode *inode) ++{ ++ tree->inode = inode; ++ tree->root = (void *) EXT3_I(inode)->i_data; ++ tree->buffer = (void *) inode; ++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); ++ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; ++ tree->ops = &ext3_blockmap_helpers; ++} ++ ++int ext3_ext_get_block(handle_t *handle, struct inode *inode, ++ long iblock, struct buffer_head *bh_result, ++ int create, int extend_disksize) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent 
newex; ++ struct ext3_extent *ex; ++ int goal, newblock, err = 0, depth; ++ struct ext3_extents_tree tree; ++ ++ clear_buffer_new(bh_result); ++ ext3_init_tree_desc(&tree, inode); ++ ext_debug(&tree, "block %d requested for inode %u\n", ++ (int) iblock, (unsigned) inode->i_ino); ++ down(&EXT3_I(inode)->truncate_sem); ++ ++ /* check in cache */ ++ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { ++ if (goal == EXT3_EXT_CACHE_GAP) { ++ if (!create) { ++ /* block isn't allocated yet and ++ * user don't want to allocate it */ ++ goto out2; ++ } ++ /* we should allocate requested block */ ++ } else if (goal == EXT3_EXT_CACHE_EXTENT) { ++ /* block is already allocated */ ++ newblock = iblock - newex.ee_block + newex.ee_start; ++ goto out; ++ } else { ++ EXT_ASSERT(0); ++ } ++ } ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(&tree, iblock, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ goto out2; ++ } ++ ++ depth = EXT_DEPTH(&tree); ++ ++ /* ++ * consistent leaf must not be empty ++ * this situations is possible, though, _during_ tree modification ++ * this is why assert can't be put in ext3_ext_find_extent() ++ */ ++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); ++ ++ if ((ex = path[depth].p_ext)) { ++ /* if found exent covers block, simple return it */ ++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { ++ newblock = iblock - ex->ee_block + ex->ee_start; ++ ext_debug(&tree, "%d fit into %d:%d -> %d\n", ++ (int) iblock, ex->ee_block, ex->ee_len, ++ newblock); ++ ext3_ext_put_in_cache(&tree, ex->ee_block, ++ ex->ee_len, ex->ee_start, ++ EXT3_EXT_CACHE_EXTENT); ++ goto out; ++ } ++ } ++ ++ /* ++ * requested block isn't allocated yet ++ * we couldn't try to create block if create flag is zero ++ */ ++ if (!create) { ++ /* put just found gap into cache to speedup subsequest reqs */ ++ ext3_ext_put_gap_in_cache(&tree, path, iblock); ++ goto out2; ++ } ++ ++ /* allocate new block */ ++ goal = 
ext3_ext_find_goal(inode, path, iblock); ++ newblock = ext3_new_block(handle, inode, goal, &err); ++ if (!newblock) ++ goto out2; ++ ext_debug(&tree, "allocate new block: goal %d, found %d\n", ++ goal, newblock); ++ ++ /* try to insert new extent into found leaf and return */ ++ newex.ee_block = iblock; ++ newex.ee_start = newblock; ++ newex.ee_start_hi = 0; ++ newex.ee_len = 1; ++ err = ext3_ext_insert_extent(handle, &tree, path, &newex); ++ if (err) ++ goto out2; ++ ++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ++ /* previous routine could use block we allocated */ ++ newblock = newex.ee_start; ++ set_buffer_new(bh_result); ++ ++ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, ++ newex.ee_start, EXT3_EXT_CACHE_EXTENT); ++out: ++ ext3_ext_show_leaf(&tree, path); ++ map_bh(bh_result, inode->i_sb, newblock); ++out2: ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ up(&EXT3_I(inode)->truncate_sem); ++ ++ return err; ++} ++ ++void ext3_ext_truncate(struct inode * inode, struct page *page) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct super_block *sb = inode->i_sb; ++ struct ext3_extents_tree tree; ++ unsigned long last_block; ++ handle_t *handle; ++ int err = 0; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ err = ext3_writepage_trans_blocks(inode) + 3; ++ handle = ext3_journal_start(inode, err); ++ if (IS_ERR(handle)) { ++ if (page) { ++ clear_highpage(page); ++ flush_dcache_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ return; ++ } ++ ++ if (page) ++ ext3_block_truncate_page(handle, page, mapping, inode->i_size); ++ ++ down(&EXT3_I(inode)->truncate_sem); ++ ext3_ext_invalidate_cache(&tree); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need not scaning at all, ++ * because page truncation is enough ++ */ ++ if 
(ext3_orphan_add(handle, inode)) ++ goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ last_block = (inode->i_size + sb->s_blocksize - 1) >> ++ EXT3_BLOCK_SIZE_BITS(sb); ++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. ++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ up(&EXT3_I(inode)->truncate_sem); ++ ext3_journal_stop(handle); ++} ++ ++/* ++ * this routine calculate max number of blocks we could modify ++ * in order to allocate new block for an inode ++ */ ++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) ++{ ++ struct ext3_extents_tree tree; ++ int needed; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); ++ ++ /* caller want to allocate num blocks */ ++ needed *= num; ++ ++#ifdef CONFIG_QUOTA ++ /* ++ * FIXME: real calculation should be here ++ * it depends on blockmap format of qouta file ++ */ ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return needed; ++} ++ ++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ext3_extent_tree_init(handle, &tree); ++} ++ ++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ return ext3_ext_calc_metadata_amount(&tree, 
blocks); ++} ++ ++static int ++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *newex) ++{ ++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; ++ ++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ if (buf->err < 0) ++ return EXT_BREAK; ++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) ++ return EXT_BREAK; ++ ++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { ++ buf->err++; ++ buf->cur += sizeof(*newex); ++ } else { ++ buf->err = -EFAULT; ++ return EXT_BREAK; ++ } ++ return EXT_CONTINUE; ++} ++ ++static int ++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *ex) ++{ ++ struct ext3_extent_tree_stats *buf = ++ (struct ext3_extent_tree_stats *) tree->private; ++ int depth; ++ ++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ depth = EXT_DEPTH(tree); ++ buf->extents_num++; ++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) ++ buf->leaf_num++; ++ return EXT_CONTINUE; ++} ++ ++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err = 0; ++ ++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) ++ return -EINVAL; ++ ++ if (cmd == EXT3_IOC_GET_EXTENTS) { ++ struct ext3_extent_buf buf; ++ struct ext3_extents_tree tree; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ ++ ext3_init_tree_desc(&tree, inode); ++ buf.cur = buf.buffer; ++ buf.err = 0; ++ tree.private = &buf; ++ down(&EXT3_I(inode)->truncate_sem); ++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, ++ ext3_ext_store_extent_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (err == 0) ++ err = buf.err; ++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { ++ struct ext3_extent_tree_stats buf; ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ 
down(&EXT3_I(inode)->truncate_sem); ++ buf.depth = EXT_DEPTH(&tree); ++ buf.extents_num = 0; ++ buf.leaf_num = 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, ++ ext3_ext_collect_stats_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (!err) ++ err = copy_to_user((void *) arg, &buf, sizeof(buf)); ++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { ++ struct ext3_extents_tree tree; ++ ext3_init_tree_desc(&tree, inode); ++ down(&EXT3_I(inode)->truncate_sem); ++ err = EXT_DEPTH(&tree); ++ up(&EXT3_I(inode)->truncate_sem); ++ } ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(ext3_init_tree_desc); ++EXPORT_SYMBOL(ext3_mark_inode_dirty); ++EXPORT_SYMBOL(ext3_ext_invalidate_cache); ++EXPORT_SYMBOL(ext3_ext_insert_extent); ++EXPORT_SYMBOL(ext3_ext_walk_space); ++EXPORT_SYMBOL(ext3_ext_find_goal); ++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); +Index: linux-stage/fs/ext3/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/ialloc.c 2005-02-25 14:50:50.304202816 +0200 ++++ linux-stage/fs/ext3/ialloc.c 2005-02-25 15:33:48.920193600 +0200 +@@ -566,7 +566,7 @@ repeat: + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + +- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; ++ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); + if (S_ISLNK(mode)) + ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); + /* dirsync only applies to directories */ +@@ -646,6 +646,18 @@ + DQUOT_FREE_INODE(inode); + goto fail2; + } ++ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++ if (err) goto fail; ++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); ++ err = 
ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); ++ } ++ } ++ + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2005-02-25 14:50:50.309202056 +0200 ++++ linux-stage/fs/ext3/inode.c 2005-02-25 15:36:51.846384592 +0200 +@@ -796,6 +796,17 @@ + goto reread; + } + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create, int extend_disksize) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create, ++ extend_disksize); ++ return ext3_get_block_handle(handle, inode, block, bh, create, ++ extend_disksize); ++} ++ + static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) + { +@@ -806,8 +817,8 @@ + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, +- bh_result, create, 1); ++ ret = ext3_get_block_wrap(handle, inode, iblock, ++ bh_result, create, 1); + return ret; + } + +@@ -851,7 +862,7 @@ + + get_block: + if (ret == 0) +- ret = ext3_get_block_handle(handle, inode, iblock, ++ ret = ext3_get_block_wrap(handle, inode, iblock, + bh_result, create, 0); + bh_result->b_size = (1 << inode->i_blkbits); + return ret; +@@ -871,7 +882,7 @@ + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); ++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1589,7 +1600,7 @@ + * This required during truncate. 
We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, struct page *page, ++int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) + { + unsigned long index = from >> PAGE_CACHE_SHIFT; +@@ -2087,6 +2098,9 @@ + return; + } + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode, page); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { +@@ -2814,6 +2828,9 @@ + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +Index: linux-stage/fs/ext3/Makefile +=================================================================== +--- linux-stage.orig/fs/ext3/Makefile 2005-02-25 14:49:42.168561008 +0200 ++++ linux-stage/fs/ext3/Makefile 2005-02-25 15:39:28.384587168 +0200 +@@ -5,7 +5,8 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ +- ioctl.o namei.o super.o symlink.o hash.o resize.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o \ ++ extents.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2005-02-25 14:52:33.550506992 +0200 ++++ linux-stage/fs/ext3/super.c 2005-02-25 15:38:10.474431312 +0200 +@@ -394,6 +394,7 @@ + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -457,6 +458,8 @@ + #endif + ei->i_rsv_window.rsv_end = 
EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + ei->vfs_inode.i_version = 1; ++ ++ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); + return &ei->vfs_inode; + } + +@@ -589,6 +594,7 @@ + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, ++ Opt_extents, Opt_noextents, Opt_extdebug, + }; + + static match_table_t tokens = { +@@ -639,6 +644,9 @@ + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, ++ {Opt_extents, "extents"}, ++ {Opt_noextents, "noextents"}, ++ {Opt_extdebug, "extdebug"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -943,6 +950,15 @@ + match_int(&args[0], &option); + *n_blocks_count = option; + break; ++ case Opt_extents: ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_noextents: ++ clear_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_extdebug: ++ set_opt (sbi->s_mount_opt, EXTDEBUG); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1625,6 +1638,8 @@ + percpu_counter_mod(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + ++ ext3_ext_init(sb); ++ + return 0; + + failed_mount3: +Index: linux-stage/fs/ext3/ioctl.c +=================================================================== +--- linux-stage.orig/fs/ext3/ioctl.c 2005-02-25 14:37:28.971023976 +0200 ++++ linux-stage/fs/ext3/ioctl.c 2005-02-25 15:33:48.938190864 +0200 +@@ -124,6 +124,10 @@ + err = ext3_change_inode_journal_flag(inode, jflag); + return err; + } ++ case EXT3_IOC_GET_EXTENTS: ++ case EXT3_IOC_GET_TREE_STATS: ++ case EXT3_IOC_GET_TREE_DEPTH: ++ return ext3_ext_ioctl(inode, filp, cmd, arg); + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 
14:53:56.424908168 +0200 ++++ linux-stage/include/linux/ext3_fs.h 2005-02-25 15:39:12.841950008 +0200 +@@ -186,8 +186,9 @@ + #define EXT3_NOTAIL_FL 0x00008000 /* don't merge file tail */ + #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ + #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +-#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ ++#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + +@@ -237,6 +238,9 @@ + #endif + #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) + #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) ++#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) ++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) ++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) + + /* + * Structure of an inode on the disk +@@ -359,6 +363,8 @@ + #define EXT3_MOUNT_RESERVATION 0x20000 /* Preallocation */ + #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -503,11 +509,13 @@ + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 ++#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ + + #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ +- EXT3_FEATURE_INCOMPAT_META_BG) ++ EXT3_FEATURE_INCOMPAT_META_BG| \ ++ 
EXT3_FEATURE_INCOMPAT_EXTENTS) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) +@@ -756,6 +763,9 @@ + + + /* inode.c */ ++extern int ext3_block_truncate_page(handle_t *, struct page *, ++ struct address_space *, loff_t); ++extern int ext3_writepage_trans_blocks(struct inode *inode); + extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +@@ -836,6 +844,16 @@ + extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int, int); ++extern void ext3_ext_truncate(struct inode *, struct page *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); ++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); ++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); + + #endif /* __KERNEL__ */ + +Index: linux-stage/include/linux/ext3_extents.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_extents.h 2005-02-25 15:33:48.891198008 +0200 ++++ linux-stage/include/linux/ext3_extents.h 2005-02-25 15:33:48.944189952 +0200 +@@ -0,0 +1,262 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++#ifndef _LINUX_EXT3_EXTENTS ++#define _LINUX_EXT3_EXTENTS ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if CHECK_BINSEARCH defined, then results of binary search ++ * will be checked by linear search ++ */ ++#define CHECK_BINSEARCH_ ++ ++/* ++ * if EXT_DEBUG is defined you can use 'extdebug' mount option ++ * to get lots of info what's going on ++ */ ++#define EXT_DEBUG_ ++#ifdef EXT_DEBUG ++#define ext_debug(tree,fmt,a...) \ ++do { \ ++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0); ++#else ++#define ext_debug(tree,fmt,a...) ++#endif ++ ++/* ++ * if EXT_STATS is defined then stats numbers are collected ++ * these number will be displayed at umount time ++ */ ++#define EXT_STATS_ ++ ++ ++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 mean there is no tree yet. 
all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 ee_block; /* first logical block extent covers */ ++ __u16 ee_len; /* number of blocks covered by extent */ ++ __u16 ee_start_hi; /* high 16 bits of physical block */ ++ __u32 ee_start; /* low 32 bigs of physical block */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 ei_block; /* index covers logical blocks from 'block' */ ++ __u32 ei_leaf; /* pointer to the physical block of the next * ++ * level. leaf or next index could bet here */ ++ __u16 ei_leaf_hi; /* high 16 bits of physical block */ ++ __u16 ei_unused; ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 eh_magic; /* probably will support different formats */ ++ __u16 eh_entries; /* number of valid entries */ ++ __u16 eh_max; /* capacity of store in entries */ ++ __u16 eh_depth; /* has tree real underlaying blocks? 
*/ ++ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ ++}; ++ ++#define EXT3_EXT_MAGIC 0xf30a ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++/* ++ * structure for external API ++ */ ++ ++/* ++ * storage for cached extent ++ */ ++struct ext3_ext_cache { ++ __u32 ec_start; ++ __u32 ec_block; ++ __u32 ec_len; ++ __u32 ec_type; ++}; ++ ++#define EXT3_EXT_CACHE_NO 0 ++#define EXT3_EXT_CACHE_GAP 1 ++#define EXT3_EXT_CACHE_EXTENT 2 ++ ++/* ++ * ext3_extents_tree is used to pass initial information ++ * to top-level extents API ++ */ ++struct ext3_extents_helpers; ++struct ext3_extents_tree { ++ struct inode *inode; /* inode which tree belongs to */ ++ void *root; /* ptr to data top of tree resides at */ ++ void *buffer; /* will be passed as arg to ^^ routines */ ++ int buffer_len; ++ void *private; ++ struct ext3_ext_cache *cex;/* last found extent */ ++ struct ext3_extents_helpers *ops; ++}; ++ ++struct ext3_extents_helpers { ++ int (*get_write_access)(handle_t *h, void *buffer); ++ int (*mark_buffer_dirty)(handle_t *h, void *buffer); ++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); ++ int (*remove_extent_credits)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*remove_extent)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*new_block)(handle_t *, struct ext3_extents_tree *, ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); ++}; ++ ++/* ++ * to be called by ext3_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext3_ext_walk_space(), see below ++ * callback must 
return valid extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, ++ struct ext3_ext_path *, ++ struct ext3_ext_cache *); ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++ ++#define EXT_MAX_BLOCK 0xffffffff ++ ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_HDR_GEN(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) ++#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) ++#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ ++ ++#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) ++#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) ++#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) ++#define EXT_GENERATION(__tree__) EXT_HDR_GEN(EXT_ROOT_HDR(__tree__)) ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++#define EXT_CHECK_PATH(tree,path) \ ++{ \ ++ int depth = EXT_DEPTH(tree); \ ++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_idx < \ ++ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_ext < \ ++ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ 
++ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ ++ && depth != 0); \ ++ BUG_ON((path)[0].p_depth != depth); \ ++} ++ ++ ++/* ++ * this structure is used to gather extents from the tree via ioctl ++ */ ++struct ext3_extent_buf { ++ unsigned long start; ++ int buflen; ++ void *buffer; ++ void *cur; ++ int err; ++}; ++ ++/* ++ * this structure is used to collect stats info about the tree ++ */ ++struct ext3_extent_tree_stats { ++ int depth; ++ int extents_num; ++ int leaf_num; ++}; ++ ++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); ++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); ++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); ++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); ++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); ++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); ++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); ++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); ++ ++static inline void ++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) ++{ ++ if (tree->cex) ++ tree->cex->ec_type = EXT3_EXT_CACHE_NO; ++} ++ ++ ++#endif /* _LINUX_EXT3_EXTENTS */ +Index: linux-stage/include/linux/ext3_fs_i.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_i.h 2005-02-25 14:50:50.320200384 +0200 ++++ linux-stage/include/linux/ext3_fs_i.h 2005-02-25 15:33:48.945189800 +0200 +@@ -128,6 +128,8 @@ + */ + struct semaphore truncate_sem; + struct inode vfs_inode; ++ ++ __u32 i_cached_extent[4]; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff --git a/ldiskfs/kernel_patches/patches/ext3-external-journal-2.6.12.patch 
b/ldiskfs/kernel_patches/patches/ext3-external-journal-2.6.12.patch new file mode 100644 index 0000000..bcfdae2 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-external-journal-2.6.12.patch @@ -0,0 +1,148 @@ +Signed-off-by: Johann Lombardi + +--- linux-2.6.12.orig/fs/ext3/super.c 2005-06-17 21:48:29.000000000 +0200 ++++ linux-2.6.12/fs/ext3/super.c 2005-11-07 13:37:30.000000000 +0100 +@@ -39,7 +39,8 @@ + #include "xattr.h" + #include "acl.h" + +-static int ext3_load_journal(struct super_block *, struct ext3_super_block *); ++static int ext3_load_journal(struct super_block *, struct ext3_super_block *, ++ unsigned long journal_devnum); + static int ext3_create_journal(struct super_block *, struct ext3_super_block *, + int); + static void ext3_commit_super (struct super_block * sb, +@@ -586,7 +587,7 @@ enum { + Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, +- Opt_commit, Opt_journal_update, Opt_journal_inum, ++ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, +@@ -624,6 +625,7 @@ static match_table_t tokens = { + {Opt_commit, "commit=%u"}, + {Opt_journal_update, "journal=update"}, + {Opt_journal_inum, "journal=%u"}, ++ {Opt_journal_dev, "journal_dev=%u"}, + {Opt_abort, "abort"}, + {Opt_data_journal, "data=journal"}, + {Opt_data_ordered, "data=ordered"}, +@@ -663,8 +665,9 @@ static unsigned long get_sb_block(void * + return sb_block; + } + +-static int parse_options (char * options, struct super_block *sb, +- unsigned long * inum, unsigned long *n_blocks_count, int is_remount) ++static int parse_options (char *options, struct super_block *sb, ++ unsigned long *inum, unsigned long *journal_devnum, ++ unsigned long *n_blocks_count, 
int is_remount) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); + char * p; +@@ -805,6 +808,16 @@ static int parse_options (char * options + return 0; + *inum = option; + break; ++ case Opt_journal_dev: ++ if (is_remount) { ++ printk(KERN_ERR "EXT3-fs: cannot specify " ++ "journal on remount\n"); ++ return 0; ++ } ++ if (match_int(&args[0], &option)) ++ return 0; ++ *journal_devnum = option; ++ break; + case Opt_noload: + set_opt (sbi->s_mount_opt, NOLOAD); + break; +@@ -1250,6 +1263,7 @@ static int ext3_fill_super (struct super + unsigned long logic_sb_block; + unsigned long offset = 0; + unsigned long journal_inum = 0; ++ unsigned long journal_devnum = 0; + unsigned long def_mount_opts; + struct inode *root; + int blocksize; +@@ -1330,7 +1344,8 @@ static int ext3_fill_super (struct super + + set_opt(sbi->s_mount_opt, RESERVATION); + +- if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0)) ++ if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, ++ NULL, 0)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | +@@ -1541,7 +1556,7 @@ static int ext3_fill_super (struct super + */ + if (!test_opt(sb, NOLOAD) && + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { +- if (ext3_load_journal(sb, es)) ++ if (ext3_load_journal(sb, es, journal_devnum)) + goto failed_mount2; + } else if (journal_inum) { + if (ext3_create_journal(sb, es, journal_inum)) +@@ -1821,15 +1836,24 @@ out_bdev: + return NULL; + } + +-static int ext3_load_journal(struct super_block * sb, +- struct ext3_super_block * es) ++static int ext3_load_journal(struct super_block *sb, ++ struct ext3_super_block *es, ++ unsigned long journal_devnum) + { + journal_t *journal; + int journal_inum = le32_to_cpu(es->s_journal_inum); +- dev_t journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); ++ dev_t journal_dev; + int err = 0; + int really_read_only; + ++ if (journal_devnum && ++ journal_devnum != le32_to_cpu(es->s_journal_dev)) { ++ printk(KERN_INFO 
"EXT3-fs: external journal device major/minor " ++ "numbers have changed\n"); ++ journal_dev = new_decode_dev(journal_devnum); ++ } else ++ journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); ++ + really_read_only = bdev_read_only(sb->s_bdev); + + /* +@@ -1888,6 +1912,16 @@ static int ext3_load_journal(struct supe + + EXT3_SB(sb)->s_journal = journal; + ext3_clear_journal_err(sb, es); ++ ++ if (journal_devnum && ++ journal_devnum != le32_to_cpu(es->s_journal_dev)) { ++ es->s_journal_dev = cpu_to_le32(journal_devnum); ++ sb->s_dirt = 1; ++ ++ /* Make sure we flush the recovery flag to disk. */ ++ ext3_commit_super(sb, es, 1); ++ } ++ + return 0; + } + +@@ -2093,13 +2127,13 @@ static int ext3_remount (struct super_bl + { + struct ext3_super_block * es; + struct ext3_sb_info *sbi = EXT3_SB(sb); +- unsigned long tmp; ++ unsigned long tmp1, tmp2; + unsigned long n_blocks_count = 0; + + /* + * Allow the "check" option to be passed as a remount option. + */ +- if (!parse_options(data, sb, &tmp, &n_blocks_count, 1)) ++ if (!parse_options(data, sb, &tmp1, &tmp2, &n_blocks_count, 1)) + return -EINVAL; + + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) diff --git a/ldiskfs/kernel_patches/patches/ext3-filterdata-2.6.15.patch b/ldiskfs/kernel_patches/patches/ext3-filterdata-2.6.15.patch new file mode 100644 index 0000000..e6d431f --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-filterdata-2.6.15.patch @@ -0,0 +1,25 @@ +Index: linux-2.6.15/include/linux/ext3_fs_i.h +=================================================================== +--- linux-2.6.15.orig/include/linux/ext3_fs_i.h 2006-02-24 15:41:30.000000000 +0300 ++++ linux-2.6.15/include/linux/ext3_fs_i.h 2006-02-24 15:41:31.000000000 +0300 +@@ -135,6 +135,8 @@ struct ext3_inode_info { + struct inode vfs_inode; + + __u32 i_cached_extent[4]; ++ ++ void *i_filterdata; + }; + + #endif /* _LINUX_EXT3_FS_I */ +Index: linux-2.6.15/fs/ext3/super.c +=================================================================== +--- 
linux-2.6.15.orig/fs/ext3/super.c 2006-02-24 15:41:30.000000000 +0300 ++++ linux-2.6.15/fs/ext3/super.c 2006-02-24 15:42:02.000000000 +0300 +@@ -459,6 +459,7 @@ static struct inode *ext3_alloc_inode(st + ei->vfs_inode.i_version = 1; + + memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); ++ ei->i_filterdata = NULL; + return &ei->vfs_inode; + } + diff --git a/ldiskfs/kernel_patches/patches/ext3-htree-dot-2.6.patch b/ldiskfs/kernel_patches/patches/ext3-htree-dot-2.6.patch new file mode 100644 index 0000000..9192112 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-htree-dot-2.6.patch @@ -0,0 +1,23 @@ +Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/fs/ext3/namei.c +=================================================================== +--- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/fs/ext3/namei.c 2005-04-04 05:06:46.000000000 -0600 ++++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/fs/ext3/namei.c 2005-04-04 05:09:18.000000000 -0600 +@@ -926,8 +926,16 @@ + struct inode *dir = dentry->d_parent->d_inode; + + sb = dir->i_sb; +- if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) +- return NULL; ++ /* NFS may look up ".." - look at dx_root directory block */ ++ if (namelen > 2 || name[0] != '.'||(name[1] != '.' 
&& name[1] != '\0')){ ++ if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) ++ return NULL; ++ } else { ++ frame = frames; ++ frame->bh = NULL; /* for dx_release() */ ++ frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ ++ dx_set_block(frame->at, 0); /* dx_root block is 0 */ ++ } + hash = hinfo.hash; + do { + block = dx_get_block(frame->at); diff --git a/ldiskfs/kernel_patches/patches/ext3-ialloc-2.6.patch b/ldiskfs/kernel_patches/patches/ext3-ialloc-2.6.patch new file mode 100644 index 0000000..15d37a9 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-ialloc-2.6.patch @@ -0,0 +1,128 @@ +Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/fs/ext3/ialloc.c +=================================================================== +--- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/fs/ext3/ialloc.c 2005-05-16 14:10:54.000000000 -0600 ++++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/fs/ext3/ialloc.c 2005-05-16 14:18:29.000000000 -0600 +@@ -352,13 +352,17 @@ + return -1; + } + +-static int find_group_other(struct super_block *sb, struct inode *parent) ++static int find_group_other(struct super_block *sb, struct inode *parent, ++ int mode) + { + int parent_group = EXT3_I(parent)->i_block_group; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); + int ngroups = EXT3_SB(sb)->s_groups_count; + struct ext3_group_desc *desc; + struct buffer_head *bh; + int group, i; ++ int best_group = -1; ++ int avefreeb, freeb, best_group_freeb = 0; + + /* + * Try to place the inode in its parent directory +@@ -366,9 +370,9 @@ + group = parent_group; + desc = ext3_get_group_desc (sb, group, &bh); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && +- le16_to_cpu(desc->bg_free_blocks_count)) ++ (!S_ISREG(mode) || le16_to_cpu(desc->bg_free_blocks_count))) + return group; +- ++ avefreeb = le32_to_cpu(sbi->s_es->s_free_blocks_count) / ngroups; + /* + * We're going to place this inode in a different blockgroup from its + * parent. 
We want to cause files in a common directory to all land in +@@ -381,33 +385,47 @@ + group = (group + parent->i_ino) % ngroups; + + /* +- * Use a quadratic hash to find a group with a free inode and some free +- * blocks. ++ * Use a quadratic hash to find a group with a free inode and ++ * average number of free blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + group += i; + if (group >= ngroups) + group -= ngroups; + desc = ext3_get_group_desc (sb, group, &bh); +- if (desc && le16_to_cpu(desc->bg_free_inodes_count) && +- le16_to_cpu(desc->bg_free_blocks_count)) ++ if (!desc || !desc->bg_free_inodes_count) ++ continue; ++ if (!S_ISREG(mode)) ++ return group; ++ if (le16_to_cpu(desc->bg_free_blocks_count) >= avefreeb) + return group; + } + + /* +- * That failed: try linear search for a free inode, even if that group +- * has no free blocks. ++ * That failed: start from last group used to allocate inode ++ * try linear search for a free inode and prefereably ++ * free blocks. + */ +- group = parent_group; ++ group = sbi->s_last_alloc_group; ++ if (group == -1) ++ group = parent_group; ++ + for (i = 0; i < ngroups; i++) { + if (++group >= ngroups) + group = 0; + desc = ext3_get_group_desc (sb, group, &bh); +- if (desc && le16_to_cpu(desc->bg_free_inodes_count)) +- return group; ++ if (!desc || !desc->bg_free_inodes_count) ++ continue; ++ freeb = le16_to_cpu(desc->bg_free_blocks_count); ++ if (freeb > best_group_freeb) { ++ best_group_freeb = freeb; ++ best_group = group; ++ if (freeb >= avefreeb || !S_ISREG(mode)) ++ break; ++ } + } +- +- return -1; ++ sbi->s_last_alloc_group = best_group; ++ return best_group; + } + + /* +@@ -454,7 +472,7 @@ + else + group = find_group_orlov(sb, dir); + } else +- group = find_group_other(sb, dir); ++ group = find_group_other(sb, dir, mode); + + err = -ENOSPC; + if (group == -1) +Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/fs/ext3/super.c +=================================================================== +--- 
linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/fs/ext3/super.c 2005-05-16 14:10:54.000000000 -0600 ++++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/fs/ext3/super.c 2005-05-16 14:17:14.000000000 -0600 +@@ -1297,6 +1297,7 @@ + percpu_counter_init(&sbi->s_dirs_counter); + bgl_lock_init(&sbi->s_blockgroup_lock); + ++ sbi->s_last_alloc_group = -1; + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logic_sb_block, i); + sbi->s_group_desc[i] = sb_bread(sb, block); +Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/include/linux/ext3_fs_sb.h 2005-05-16 14:10:54.000000000 -0600 ++++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/linux/ext3_fs_sb.h 2005-05-16 14:17:14.000000000 -0600 +@@ -59,6 +59,8 @@ + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct blockgroup_lock s_blockgroup_lock; ++ /* Last group used to allocate inode */ ++ int s_last_alloc_group; + + /* root of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; diff --git a/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch new file mode 100644 index 0000000..52e5521 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch @@ -0,0 +1,20 @@ +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 14:53:56.424908168 +0200 ++++ linux-stage/include/linux/ext3_fs.h 2005-02-25 14:53:59.376459464 +0200 +@@ -361,12 +361,13 @@ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ +-#ifndef _LINUX_EXT2_FS_H ++#ifndef clear_opt + #define clear_opt(o, opt) o &= 
~EXT3_MOUNT_##opt + #define set_opt(o, opt) o |= EXT3_MOUNT_##opt + #define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \ + EXT3_MOUNT_##opt) +-#else ++#endif ++#ifndef EXT2_MOUNT_NOLOAD + #define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD + #define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT + #define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS diff --git a/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-suse.patch index 28e3587..1ac944b 100644 --- a/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-suse.patch @@ -3,7 +3,7 @@ Index: linux-stage/include/linux/ext3_fs.h --- linux-stage.orig/include/linux/ext3_fs.h 2004-04-02 16:43:37.000000000 -0500 +++ linux-stage/include/linux/ext3_fs.h 2004-04-02 16:43:37.000000000 -0500 @@ -331,12 +331,13 @@ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x20000 /* Make iopen world-readable */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ -#ifndef _LINUX_EXT2_FS_H diff --git a/ldiskfs/kernel_patches/patches/ext3-lookup-dotdot-2.6.9.patch b/ldiskfs/kernel_patches/patches/ext3-lookup-dotdot-2.6.9.patch new file mode 100644 index 0000000..a05256b --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-lookup-dotdot-2.6.9.patch @@ -0,0 +1,63 @@ +Index: linux-2.6.9-full/fs/ext3/iopen.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/iopen.c 2006-04-25 08:51:11.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/iopen.c 2006-05-06 01:21:11.000000000 +0400 +@@ -94,9 +94,12 @@ static struct dentry *iopen_lookup(struc + assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); + } + +- if (!list_empty(&inode->i_dentry)) { +- alternate = list_entry(inode->i_dentry.next, +- struct dentry, d_alias); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, 
struct dentry, d_alias); ++ /* ignore dentries created for ".." to preserve ++ * proper dcache hierarchy -- bug 10458 */ ++ if (alternate->d_flags & DCACHE_NFSFS_RENAMED) ++ continue; + dget_locked(alternate); + spin_lock(&alternate->d_lock); + alternate->d_flags |= DCACHE_REFERENCED; +Index: linux-2.6.9-full/fs/ext3/namei.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/namei.c 2006-05-06 01:21:10.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/namei.c 2006-05-06 01:29:30.000000000 +0400 +@@ -1003,6 +1003,38 @@ static struct dentry *ext3_lookup(struct + return ERR_PTR(-EACCES); + } + ++ /* ".." shouldn't go into dcache to preserve dcache hierarchy ++ * otherwise we'll get parent being a child of actual child. ++ * see bug 10458 for details -bzzz */ ++ if (inode && (dentry->d_name.name[0] == '.' && (dentry->d_name.len == 1 || ++ (dentry->d_name.len == 2 && dentry->d_name.name[1] == '.')))) { ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* first, look for an existing dentry - any one is good */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ if (goal == NULL) { ++ /* there is no alias, we need to make current dentry: ++ * a) inaccessible for __d_lookup() ++ * b) inaccessible for iopen */ ++ J_ASSERT(list_empty(&dentry->d_alias)); ++ dentry->d_flags |= DCACHE_NFSFS_RENAMED; ++ /* this is d_instantiate() ... 
*/ ++ list_add(&dentry->d_alias, &inode->i_dentry); ++ dentry->d_inode = inode; ++ } ++ spin_unlock(&dcache_lock); ++ if (goal) ++ iput(inode); ++ return goal; ++ } ++ + return iopen_connect_dentry(dentry, inode, 1); + } + diff --git a/ldiskfs/kernel_patches/patches/ext3-map_inode_page-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-map_inode_page-2.6-suse.patch index 91063b3..d2af494 100644 --- a/ldiskfs/kernel_patches/patches/ext3-map_inode_page-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-map_inode_page-2.6-suse.patch @@ -6,7 +6,7 @@ Index: linux-2.6.0/fs/ext3/inode.c =================================================================== --- linux-2.6.0.orig/fs/ext3/inode.c 2003-12-31 00:33:49.000000000 +0300 +++ linux-2.6.0/fs/ext3/inode.c 2003-12-31 01:14:17.000000000 +0300 -@@ -3136,3 +3136,58 @@ +@@ -3136,3 +3136,62 @@ ret = ret2; return ret; } @@ -28,8 +28,9 @@ Index: linux-2.6.0/fs/ext3/inode.c + blocks[i] = ext3_bmap(inode->i_mapping, iblock); + if (blocks[i] == 0) { + failed++; -+ created[i] = -1; -+ } else { ++ if (created) ++ created[i] = -1; ++ } else if (created) { + created[i] = 0; + } + } @@ -53,18 +54,21 @@ Index: linux-2.6.0/fs/ext3/inode.c + "block %ld\n", iblock); + goto out; + } ++ /* Unmap any metadata buffers from the block mapping, to avoid ++ * data corruption due to direct-write from Lustre being ++ * clobbered by a later flush of the blockdev metadata buffer.*/ + if (buffer_new(&dummy)) + unmap_underlying_metadata(dummy.b_bdev, -+ dummy.b_blocknr); ++ dummy.b_blocknr); + blocks[i] = dummy.b_blocknr; -+ created[i] = 1; ++ if (created) ++ created[i] = 1; + } + + out: + ext3_journal_stop(handle); + return rc; +} -+ Index: linux-2.6.0/fs/ext3/super.c =================================================================== --- linux-2.6.0.orig/fs/ext3/super.c 2003-12-31 00:33:49.000000000 +0300 diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-fc5.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-fc5.patch new file 
mode 100644 index 0000000..6c1a3eb --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-fc5.patch @@ -0,0 +1,3105 @@ +Index: linux-2.6.16.i686/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.16.i686.orig/include/linux/ext3_fs.h 2006-05-30 22:55:32.000000000 +0800 ++++ linux-2.6.16.i686/include/linux/ext3_fs.h 2006-05-30 23:02:59.000000000 +0800 +@@ -57,6 +57,14 @@ + #define ext3_debug(f, a...) do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -383,6 +391,7 @@ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x4000000/* Buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -404,6 +413,14 @@ + #define ext3_find_first_zero_bit ext2_find_first_zero_bit + #define ext3_find_next_zero_bit ext2_find_next_zero_bit + ++#ifndef ext2_find_next_le_bit ++#ifdef __LITTLE_ENDIAN ++#define ext2_find_next_le_bit(addr, size, off) find_next_bit((addr), (size), (off)) ++#else ++#error "mballoc needs a patch for big-endian systems - CFS bug 10634" ++#endif /* __LITTLE_ENDIAN */ ++#endif /* !ext2_find_next_le_bit */ ++ + /* + * Maximal mount counts between two filesystem checks + */ +@@ -744,7 +753,7 @@ + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, +- unsigned long); ++ unsigned long, int); + extern void ext3_free_blocks_sb (handle_t *, struct super_block *, + 
unsigned long, unsigned long, int *); + extern unsigned long ext3_count_free_blocks (struct super_block *); +@@ -865,6 +874,17 @@ + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern long ext3_mb_stats; ++extern long ext3_mb_max_to_scan; ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct super_block *, int); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); ++ + #endif /* __KERNEL__ */ + + /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ +Index: linux-2.6.16.i686/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.6.16.i686.orig/include/linux/ext3_fs_sb.h 2006-03-20 13:53:29.000000000 +0800 ++++ linux-2.6.16.i686/include/linux/ext3_fs_sb.h 2006-05-30 23:02:59.000000000 +0800 +@@ -21,8 +21,14 @@ + #include + #include + #include ++#include + #endif + #include ++#include ++ ++struct ext3_buddy_group_blocks; ++struct ext3_mb_history; ++#define EXT3_BB_MAX_BLOCKS + + /* + * third extended-fs super-block data in memory +@@ -78,6 +84,43 @@ + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_group_info ***s_group_info; ++ struct inode *s_buddy_cache; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ unsigned short *s_mb_offsets, *s_mb_maxs; ++ unsigned long s_stripe; ++ ++ /* history to debug policy */ 
++ struct ext3_mb_history *s_mb_history; ++ int s_mb_history_cur; ++ int s_mb_history_max; ++ struct proc_dir_entry *s_mb_proc; ++ spinlock_t s_mb_history_lock; ++ ++ /* stats for buddy allocator */ ++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ++ atomic_t s_bal_success; /* we found long enough chunks */ ++ atomic_t s_bal_allocated; /* in blocks */ ++ atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_goals; /* goal hits */ ++ atomic_t s_bal_breaks; /* too long searches */ ++ atomic_t s_bal_2orders; /* 2^order hits */ ++ spinlock_t s_bal_lock; ++ unsigned long s_mb_buddies_generated; ++ unsigned long long s_mb_generation_time; + }; ++ ++#define EXT3_GROUP_INFO(sb, group) \ ++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ ++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] + + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-2.6.16.i686/fs/ext3/super.c +=================================================================== +--- linux-2.6.16.i686.orig/fs/ext3/super.c 2006-05-30 22:55:32.000000000 +0800 ++++ linux-2.6.16.i686/fs/ext3/super.c 2006-05-30 23:02:59.000000000 +0800 +@@ -392,6 +392,7 @@ + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -640,6 +641,7 @@ + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_extents, Opt_noextents, Opt_extdebug, ++ Opt_mballoc, Opt_nomballoc, Opt_stripe, + Opt_grpquota + }; + +@@ -694,6 +695,9 @@ + {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, ++ {Opt_nomballoc, "nomballoc"}, ++ {Opt_stripe, "stripe=%u"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -1041,6 +1043,19 @@ + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt(sbi->s_mount_opt, MBALLOC); ++ break; 
++ case Opt_nomballoc: ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_stripe: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_stripe = option; ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1766,6 +1771,7 @@ + ext3_count_dirs(sb)); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + lock_kernel(); + return 0; + +@@ -2699,7 +2705,13 @@ + + static int __init init_ext3_fs(void) + { +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2721,6 +2733,7 @@ + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); + } + + int ext3_prep_san_write(struct inode *inode, long *blocks, +Index: linux-2.6.16.i686/fs/ext3/extents.c +=================================================================== +--- linux-2.6.16.i686.orig/fs/ext3/extents.c 2006-05-30 22:55:32.000000000 +0800 ++++ linux-2.6.16.i686/fs/ext3/extents.c 2006-05-30 23:02:59.000000000 +0800 +@@ -771,7 +771,7 @@ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1428,7 +1428,7 @@ + path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + +@@ -1913,10 +1913,12 @@ + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if 
(S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1928,7 +1930,7 @@ + bh = sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-2.6.16.i686/fs/ext3/inode.c +=================================================================== +--- linux-2.6.16.i686.orig/fs/ext3/inode.c 2006-05-30 22:55:32.000000000 +0800 ++++ linux-2.6.16.i686/fs/ext3/inode.c 2006-05-30 23:02:59.000000000 +0800 +@@ -568,7 +568,7 @@ + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -1862,7 +1862,7 @@ + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2035,7 +2035,7 @@ + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-2.6.16.i686/fs/ext3/balloc.c +=================================================================== +--- linux-2.6.16.i686.orig/fs/ext3/balloc.c 2006-03-20 13:53:29.000000000 +0800 ++++ linux-2.6.16.i686/fs/ext3/balloc.c 2006-05-30 23:02:59.000000000 +0800 +@@ -80,7 +80,7 @@ + * + * Return buffer_head on success or NULL in case of failure. 
+ */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -491,24 +491,6 @@ + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- unsigned long block, unsigned long count) +-{ +- struct super_block * sb; +- int dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1154,7 +1136,7 @@ + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. + */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-2.6.16.i686/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.16.i686.orig/fs/ext3/xattr.c 2006-03-20 13:53:29.000000000 +0800 ++++ linux-2.6.16.i686/fs/ext3/xattr.c 2006-05-30 23:02:59.000000000 +0800 +@@ -484,7 +484,7 @@ + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); +- ext3_free_blocks(handle, inode, bh->b_blocknr, 1); ++ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { +@@ -804,7 +804,7 @@ + new_bh = sb_getblk(sb, block); + if (!new_bh) { + getblk_failed: +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + error = -EIO; + goto cleanup; + } +Index: linux-2.6.16.i686/fs/ext3/mballoc.c 
+=================================================================== +--- linux-2.6.16.i686.orig/fs/ext3/mballoc.c 2006-05-31 04:14:15.752410384 +0800 ++++ linux-2.6.16.i686/fs/ext3/mballoc.c 2006-05-30 23:03:38.000000000 +0800 +@@ -0,0 +1,2729 @@ ++/* ++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++ ++/* ++ * mballoc.c contains the multiblocks allocation routines ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * TODO: ++ * - bitmap read-ahead (proposed by Oleg Drokin aka green) ++ * - track min/max extents in each group for better group selection ++ * - mb_mark_used() may allocate chunk right after splitting buddy ++ * - special flag to advice allocator to look for requested + N blocks ++ * this may improve interaction between extents and mballoc ++ * - tree of groups sorted by number of free blocks ++ * - percpu reservation code (hotpath) ++ * - error handling ++ */ ++ ++/* ++ * with AGRESSIVE_CHECK allocator runs consistency checks over ++ * structures. these checks slow things down a lot ++ */ ++#define AGGRESSIVE_CHECK__ ++ ++/* ++ */ ++#define MB_DEBUG__ ++#ifdef MB_DEBUG ++#define mb_debug(fmt,a...) 
printk(fmt, ##a) ++#else ++#define mb_debug(fmt,a...) ++#endif ++ ++/* ++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory ++ * and you can monitor it in /proc/fs/ext3//mb_history ++ */ ++#define EXT3_MB_HISTORY ++ ++/* ++ * How long mballoc can look for a best extent (in found extents) ++ */ ++long ext3_mb_max_to_scan = 500; ++ ++/* ++ * How long mballoc must look for a best extent ++ */ ++long ext3_mb_min_to_scan = 30; ++ ++/* ++ * with 'ext3_mb_stats' allocator will collect stats that will be ++ * shown at umount. The collecting costs though! ++ */ ++ ++long ext3_mb_stats = 1; ++ ++/* ++ * for which requests use 2^N search using buddies ++ */ ++long ext3_mb_order2_reqs = 8; ++ ++#ifdef EXT3_BB_MAX_BLOCKS ++#undef EXT3_BB_MAX_BLOCKS ++#endif ++#define EXT3_BB_MAX_BLOCKS 30 ++ ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++struct ext3_group_info { ++ unsigned long bb_state; ++ unsigned long bb_tid; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned short bb_fragments; ++ unsigned short bb_counters[]; ++}; ++ ++ ++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 ++#define EXT3_GROUP_INFO_LOCKED_BIT 1 ++ ++#define EXT3_MB_GRP_NEED_INIT(grp) \ ++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) ++ ++struct ext3_free_extent { ++ __u16 fe_start; ++ __u16 fe_len; ++ __u16 fe_group; ++}; ++ ++struct ext3_allocation_context { ++ struct super_block *ac_sb; ++ ++ /* search goals */ ++ struct ext3_free_extent ac_g_ex; ++ ++ /* the best found extent */ ++ struct ext3_free_extent ac_b_ex; ++ ++ /* number of iterations done. 
we have to track to limit searching */ ++ unsigned long ac_ex_scanned; ++ __u16 ac_groups_scanned; ++ __u16 ac_found; ++ __u16 ac_tail; ++ __u16 ac_buddy; ++ __u8 ac_status; ++ __u8 ac_flags; /* allocation hints */ ++ __u8 ac_criteria; ++ __u8 ac_repeats; ++ __u8 ac_2order; /* if request is to allocate 2^N blocks and ++ * N > 0, the field stores N, otherwise 0 */ ++ ++ struct page *ac_buddy_page; ++ struct page *ac_bitmap_page; ++}; ++ ++#define AC_STATUS_CONTINUE 1 ++#define AC_STATUS_FOUND 2 ++#define AC_STATUS_BREAK 3 ++ ++struct ext3_mb_history { ++ struct ext3_free_extent goal; /* goal allocation */ ++ struct ext3_free_extent result; /* result allocation */ ++ unsigned pid; ++ unsigned ino; ++ __u16 found; /* how many extents have been found */ ++ __u16 groups; /* how many groups have been scanned */ ++ __u16 tail; /* what tail broke some buddy */ ++ __u16 buddy; /* buddy the tail ^^^ broke */ ++ __u8 cr; /* which phase the result extent was found at */ ++ __u8 merged; ++}; ++ ++struct ext3_buddy { ++ struct page *bd_buddy_page; ++ void *bd_buddy; ++ struct page *bd_bitmap_page; ++ void *bd_bitmap; ++ struct ext3_group_info *bd_info; ++ struct super_block *bd_sb; ++ __u16 bd_blkbits; ++ __u16 bd_group; ++}; ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) ++ ++#ifndef EXT3_MB_HISTORY ++#define ext3_mb_store_history(sb,ino,ac) ++#else ++static void ext3_mb_store_history(struct super_block *, unsigned ino, ++ struct ext3_allocation_context *ac); ++#endif ++ ++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) ++ ++static struct proc_dir_entry *proc_root_ext3; ++ ++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); ++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); ++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); ++int ext3_mb_reserve_blocks(struct super_block *, int); ++void ext3_mb_release_blocks(struct 
super_block *, int); ++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); ++void ext3_mb_free_committed_blocks(struct super_block *); ++ ++#if BITS_PER_LONG == 64 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 7UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~7UL); \ ++} ++#elif BITS_PER_LONG == 32 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 3UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~3UL); \ ++} ++#else ++#error "how many bits you are?!" ++#endif ++ ++static inline int mb_test_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ return ext2_test_bit(bit, addr); ++} ++ ++static inline void mb_set_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit(bit, addr); ++} ++ ++static inline void mb_set_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit_atomic(NULL, bit, addr); ++} ++ ++static inline void mb_clear_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit(bit, addr); ++} ++ ++static inline void mb_clear_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit_atomic(NULL, bit, addr); ++} ++ ++static inline int mb_find_next_zero_bit(void *addr, int max, int start) ++{ ++ int fix; ++#if BITS_PER_LONG == 64 ++ fix = ((unsigned long) addr & 7UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~7UL); ++#elif BITS_PER_LONG == 32 ++ fix = ((unsigned long) addr & 3UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~3UL); ++#else ++#error "how many bits you are?!" 
++#endif ++ max += fix; ++ start += fix; ++ return ext2_find_next_zero_bit(addr, max, start) - fix; ++} ++ ++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) ++{ ++ char *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(max != NULL); ++ ++ if (order > e3b->bd_blkbits + 1) { ++ *max = 0; ++ return NULL; ++ } ++ ++ /* at order 0 we see each particular block */ ++ *max = 1 << (e3b->bd_blkbits + 3); ++ if (order == 0) ++ return EXT3_MB_BITMAP(e3b); ++ ++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; ++ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; ++ ++ return bb; ++} ++ ++#ifdef AGGRESSIVE_CHECK ++ ++static void mb_check_buddy(struct ext3_buddy *e3b) ++{ ++ int order = e3b->bd_blkbits + 1; ++ int max, max2, i, j, k, count; ++ int fragments = 0, fstart; ++ void *buddy, *buddy2; ++ ++ if (!test_opt(e3b->bd_sb, MBALLOC)) ++ return; ++ ++ { ++ static int mb_check_counter = 0; ++ if (mb_check_counter++ % 300 != 0) ++ return; ++ } ++ ++ while (order > 1) { ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ buddy2 = mb_find_buddy(e3b, order - 1, &max2); ++ J_ASSERT(buddy2); ++ J_ASSERT(buddy != buddy2); ++ J_ASSERT(max * 2 == max2); ++ ++ count = 0; ++ for (i = 0; i < max; i++) { ++ ++ if (mb_test_bit(i, buddy)) { ++ /* only single bit in buddy2 may be 1 */ ++ if (!mb_test_bit(i << 1, buddy2)) ++ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); ++ else if (!mb_test_bit((i << 1) + 1, buddy2)) ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ continue; ++ } ++ ++ /* both bits in buddy2 must be 0 */ ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); ++ ++ for (j = 0; j < (1 << order); j++) { ++ k = (i * (1 << order)) + j; ++ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); ++ } ++ count++; ++ } ++ J_ASSERT(e3b->bd_info->bb_counters[order] == count); ++ order--; ++ } ++ ++ fstart = -1; ++ buddy = mb_find_buddy(e3b, 0, &max); ++ for (i = 0; i < max; i++) { ++ if 
(!mb_test_bit(i, buddy)) { ++ J_ASSERT(i >= e3b->bd_info->bb_first_free); ++ if (fstart == -1) { ++ fragments++; ++ fstart = i; ++ } ++ continue; ++ } ++ fstart = -1; ++ /* check used bits only */ ++ for (j = 0; j < e3b->bd_blkbits + 1; j++) { ++ buddy2 = mb_find_buddy(e3b, j, &max2); ++ k = i >> j; ++ J_ASSERT(k < max2); ++ J_ASSERT(mb_test_bit(k, buddy2)); ++ } ++ } ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); ++ J_ASSERT(e3b->bd_info->bb_fragments == fragments); ++} ++ ++#else ++#define mb_check_buddy(e3b) ++#endif ++ ++/* find most significant bit */ ++static int inline fmsb(unsigned short word) ++{ ++ int order; ++ ++ if (word > 255) { ++ order = 7; ++ word >>= 8; ++ } else { ++ order = -1; ++ } ++ ++ do { ++ order++; ++ word >>= 1; ++ } while (word != 0); ++ ++ return order; ++} ++ ++static void inline ++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, ++ int len, struct ext3_group_info *grp) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned short min, max, chunk, border; ++ ++ mb_debug("mark %u/%u free\n", first, len); ++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ border = 2 << sb->s_blocksize_bits; ++ ++ while (len > 0) { ++ /* find how many blocks can be covered since this position */ ++ max = ffs(first | border) - 1; ++ ++ /* find how many blocks of power 2 we need to mark */ ++ min = fmsb(len); ++ ++ mb_debug(" %u/%u -> max %u, min %u\n", ++ first & ((2 << sb->s_blocksize_bits) - 1), ++ len, max, min); ++ ++ if (max < min) ++ min = max; ++ chunk = 1 << min; ++ ++ /* mark multiblock chunks only */ ++ grp->bb_counters[min]++; ++ if (min > 0) { ++ mb_debug(" set %u at %u \n", first >> min, ++ sbi->s_mb_offsets[min]); ++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); ++ } ++ ++ len -= chunk; ++ first += chunk; ++ } ++} ++ ++static void ++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ++ int group) ++{ ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); ++ 
unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); ++ unsigned short i = 0, first, len; ++ unsigned free = 0, fragments = 0; ++ unsigned long long period = get_cycles(); ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ grp->bb_first_free = i; ++ while (i < max) { ++ fragments++; ++ first = i; ++ i = ext2_find_next_le_bit(bitmap, max, i); ++ len = i - first; ++ free += len; ++ if (len > 1) ++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); ++ else ++ grp->bb_counters[0]++; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ grp->bb_fragments = fragments; ++ ++ /* bb_state shouldn't being modified because all ++ * others waits for init completion on page lock */ ++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); ++ if (free != grp->bb_free) { ++ printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n", ++ group, free, grp->bb_free); ++ grp->bb_free = free; ++ } ++ ++ period = get_cycles() - period; ++ spin_lock(&EXT3_SB(sb)->s_bal_lock); ++ EXT3_SB(sb)->s_mb_buddies_generated++; ++ EXT3_SB(sb)->s_mb_generation_time += period; ++ spin_unlock(&EXT3_SB(sb)->s_bal_lock); ++} ++ ++static int ext3_mb_init_cache(struct page *page) ++{ ++ int blocksize, blocks_per_page, groups_per_page; ++ int err = 0, i, first_group, first_block; ++ struct super_block *sb; ++ struct buffer_head *bhs; ++ struct buffer_head **bh; ++ struct inode *inode; ++ char *data, *bitmap; ++ ++ mb_debug("init page %lu\n", page->index); ++ ++ inode = page->mapping->host; ++ sb = inode->i_sb; ++ blocksize = 1 << inode->i_blkbits; ++ blocks_per_page = PAGE_CACHE_SIZE / blocksize; ++ ++ groups_per_page = blocks_per_page >> 1; ++ if (groups_per_page == 0) ++ groups_per_page = 1; ++ ++ /* allocate buffer_heads to read bitmaps */ ++ if (groups_per_page > 1) { ++ err = -ENOMEM; ++ i = sizeof(struct buffer_head *) * groups_per_page; ++ bh = kmalloc(i, GFP_NOFS); ++ if (bh == NULL) ++ goto out; ++ memset(bh, 0, i); ++ } else ++ bh = &bhs; ++ ++ first_group = page->index * 
blocks_per_page / 2; ++ ++ /* read all groups the page covers into the cache */ ++ for (i = 0; i < groups_per_page; i++) { ++ struct ext3_group_desc * desc; ++ ++ if (first_group + i >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ err = -EIO; ++ desc = ext3_get_group_desc(sb, first_group + i, NULL); ++ if (desc == NULL) ++ goto out; ++ ++ err = -ENOMEM; ++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); ++ if (bh[i] == NULL) ++ goto out; ++ ++ if (buffer_uptodate(bh[i])) ++ continue; ++ ++ lock_buffer(bh[i]); ++ if (buffer_uptodate(bh[i])) { ++ unlock_buffer(bh[i]); ++ continue; ++ } ++ ++ get_bh(bh[i]); ++ bh[i]->b_end_io = end_buffer_read_sync; ++ submit_bh(READ, bh[i]); ++ mb_debug("read bitmap for group %u\n", first_group + i); ++ } ++ ++ /* wait for I/O completion */ ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ wait_on_buffer(bh[i]); ++ ++ err = -EIO; ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ if (!buffer_uptodate(bh[i])) ++ goto out; ++ ++ first_block = page->index * blocks_per_page; ++ for (i = 0; i < blocks_per_page; i++) { ++ int group; ++ ++ group = (first_block + i) >> 1; ++ if (group >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ data = page_address(page) + (i * blocksize); ++ bitmap = bh[group - first_group]->b_data; ++ ++ if ((first_block + i) & 1) { ++ /* this is block of buddy */ ++ mb_debug("put buddy for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memset(data, 0xff, blocksize); ++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; ++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, ++ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); ++ ext3_mb_generate_buddy(sb, data, bitmap, group); ++ } else { ++ /* this is block of bitmap */ ++ mb_debug("put bitmap for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memcpy(data, bitmap, blocksize); ++ } ++ } ++ SetPageUptodate(page); ++ ++out: ++ if (bh) { ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ brelse(bh[i]); ++ if (bh != 
&bhs) ++ kfree(bh); ++ } ++ return err; ++} ++ ++static int ext3_mb_load_buddy(struct super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *inode = sbi->s_buddy_cache; ++ int blocks_per_page, block, pnum, poff; ++ struct page *page; ++ ++ mb_debug("load group %u\n", group); ++ ++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_info = EXT3_GROUP_INFO(sb, group); ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ e3b->bd_buddy_page = NULL; ++ e3b->bd_bitmap_page = NULL; ++ ++ block = group * 2; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ /* we could use find_or_create_page(), but it locks page ++ * what we'd like to avoid in fast path ... */ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ BUG_ON(page->mapping != inode->i_mapping); ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_bitmap_page = page; ++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ block++; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ BUG_ON(page->mapping != inode->i_mapping); ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_buddy_page = page; ++ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ J_ASSERT(e3b->bd_bitmap_page 
!= NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ return 0; ++ ++err: ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++ e3b->bd_buddy = NULL; ++ e3b->bd_bitmap = NULL; ++ return -EIO; ++} ++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++} ++ ++ ++static inline void ++ext3_lock_group(struct super_block *sb, int group) ++{ ++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_GROUP_INFO(sb, group)->bb_state); ++} ++ ++static inline void ++ext3_unlock_group(struct super_block *sb, int group) ++{ ++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_GROUP_INFO(sb, group)->bb_state); ++} ++ ++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) ++{ ++ int order = 1; ++ void *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ while (order <= e3b->bd_blkbits + 1) { ++ block = block >> 1; ++ if (!mb_test_bit(block, bb)) { ++ /* this block is part of buddy of order 'order' */ ++ return order; ++ } ++ bb += 1 << (e3b->bd_blkbits - order); ++ order++; ++ } ++ return 0; ++} ++ ++static inline void mb_clear_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0; ++ cur += 32; ++ continue; ++ } ++ mb_clear_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static inline void mb_set_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0xffffffff; ++ cur += 32; ++ 
continue; ++ } ++ mb_set_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) ++{ ++ int block = 0, max = 0, order; ++ void *buddy, *buddy2; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_info->bb_free += count; ++ if (first < e3b->bd_info->bb_first_free) ++ e3b->bd_info->bb_first_free = first; ++ ++ /* let's maintain fragments counter */ ++ if (first != 0) ++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); ++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); ++ if (block && max) ++ e3b->bd_info->bb_fragments--; ++ else if (!block && !max) ++ e3b->bd_info->bb_fragments++; ++ ++ /* let's maintain buddy itself */ ++ while (count-- > 0) { ++ block = first++; ++ order = 0; ++ ++ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); ++ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); ++ e3b->bd_info->bb_counters[order]++; ++ ++ /* start of the buddy */ ++ buddy = mb_find_buddy(e3b, order, &max); ++ ++ do { ++ block &= ~1UL; ++ if (mb_test_bit(block, buddy) || ++ mb_test_bit(block + 1, buddy)) ++ break; ++ ++ /* both the buddies are free, try to coalesce them */ ++ buddy2 = mb_find_buddy(e3b, order + 1, &max); ++ ++ if (!buddy2) ++ break; ++ ++ if (order > 0) { ++ /* for special purposes, we don't set ++ * free bits in bitmap */ ++ mb_set_bit(block, buddy); ++ mb_set_bit(block + 1, buddy); ++ } ++ e3b->bd_info->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; ++ ++ block = block >> 1; ++ order++; ++ e3b->bd_info->bb_counters[order]++; ++ ++ mb_clear_bit(block, buddy2); ++ buddy = buddy2; ++ } while (1); ++ } ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, ++ int needed, struct ext3_free_extent *ex) ++{ ++ int next = block, max, ord; ++ void *buddy; ++ ++ J_ASSERT(ex != NULL); ++ ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ J_ASSERT(block < max); ++ 
if (mb_test_bit(block, buddy)) { ++ ex->fe_len = 0; ++ ex->fe_start = 0; ++ ex->fe_group = 0; ++ return 0; ++ } ++ ++ if (likely(order == 0)) { ++ /* find actual order */ ++ order = mb_find_order_for_block(e3b, block); ++ block = block >> order; ++ } ++ ++ ex->fe_len = 1 << order; ++ ex->fe_start = block << order; ++ ex->fe_group = e3b->bd_group; ++ ++ /* calc difference from given start */ ++ next = next - ex->fe_start; ++ ex->fe_len -= next; ++ ex->fe_start += next; ++ ++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { ++ ++ if (block + 1 >= max) ++ break; ++ ++ next = (block + 1) * (1 << order); ++ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) ++ break; ++ ++ ord = mb_find_order_for_block(e3b, next); ++ ++ order = ord; ++ block = next >> order; ++ ex->fe_len += 1 << order; ++ } ++ ++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); ++ return ex->fe_len; ++} ++ ++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) ++{ ++ int ord, mlen = 0, max = 0, cur; ++ int start = ex->fe_start; ++ int len = ex->fe_len; ++ unsigned ret = 0; ++ int len0 = len; ++ void *buddy; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_info->bb_free -= len; ++ if (e3b->bd_info->bb_first_free == start) ++ e3b->bd_info->bb_first_free += len; ++ ++ /* let's maintain fragments counter */ ++ if (start != 0) ++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); ++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); ++ if (mlen && max) ++ e3b->bd_info->bb_fragments++; ++ else if (!mlen && !max) ++ e3b->bd_info->bb_fragments--; ++ ++ /* let's maintain buddy itself */ ++ while (len) { ++ ord = mb_find_order_for_block(e3b, start); ++ ++ if (((start >> ord) << ord) == start && len >= (1 << ord)) { ++ /* the whole chunk may be allocated at once! 
*/ ++ mlen = 1 << ord; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ J_ASSERT((start >> ord) < max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_info->bb_counters[ord]--; ++ start += mlen; ++ len -= mlen; ++ J_ASSERT(len >= 0); ++ continue; ++ } ++ ++ /* store for history */ ++ if (ret == 0) ++ ret = len | (ord << 16); ++ ++ /* we have to split large buddy */ ++ J_ASSERT(ord > 0); ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_info->bb_counters[ord]--; ++ ++ ord--; ++ cur = (start >> ord) & ~1U; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_clear_bit(cur, buddy); ++ mb_clear_bit(cur + 1, buddy); ++ e3b->bd_info->bb_counters[ord]++; ++ e3b->bd_info->bb_counters[ord]++; ++ } ++ ++ /* now drop all the bits in bitmap */ ++ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); ++ ++ mb_check_buddy(e3b); ++ ++ return ret; ++} ++ ++/* ++ * Must be called under group lock! ++ */ ++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ unsigned long ret; ++ ++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ++ ret = mb_mark_used(e3b, &ac->ac_b_ex); ++ ++ ac->ac_status = AC_STATUS_FOUND; ++ ac->ac_tail = ret & 0xffff; ++ ac->ac_buddy = ret >> 16; ++ ++ /* hold in-core structures until allocated ++ * blocks are marked non-free in on-disk bitmap */ ++ ac->ac_buddy_page = e3b->bd_buddy_page; ++ page_cache_get(e3b->bd_buddy_page); ++ ac->ac_bitmap_page = e3b->bd_bitmap_page; ++ page_cache_get(e3b->bd_bitmap_page); ++} ++ ++/* ++ * The routine checks whether found extent is good enough. If it is, ++ * then the extent gets marked used and flag is set to the context ++ * to stop scanning. Otherwise, the extent is compared with the ++ * previous found extent and if new one is better, then it's stored ++ * in the context. Later, the best found extent will be used, if ++ * mballoc can't find good enough extent. ++ * ++ * FIXME: real allocation policy is to be designed yet! 
++ */ ++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, ++ struct ext3_free_extent *ex, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent *bex = &ac->ac_b_ex; ++ struct ext3_free_extent *gex = &ac->ac_g_ex; ++ ++ J_ASSERT(ex->fe_len > 0); ++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ ++ ac->ac_found++; ++ ++ /* ++ * The special case - take what you catch first ++ */ ++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Let's check whether the chunk is good enough ++ */ ++ if (ex->fe_len == gex->fe_len) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If this is first found extent, just store it in the context ++ */ ++ if (bex->fe_len == 0) { ++ *bex = *ex; ++ return; ++ } ++ ++ /* ++ * If new found extent is better, store it in the context ++ */ ++ if (bex->fe_len < gex->fe_len) { ++ /* if the request isn't satisfied, any found extent ++ * larger than previous best one is better */ ++ if (ex->fe_len > bex->fe_len) ++ *bex = *ex; ++ } else if (ex->fe_len > gex->fe_len) { ++ /* if the request is satisfied, then we try to find ++ * an extent that still satisfy the request, but is ++ * smaller than previous one */ ++ *bex = *ex; ++ } ++ ++ /* ++ * Let's scan at least few extents and don't pick up a first one ++ */ ++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++ ++ /* ++ * We don't want to scan for a whole year ++ */ ++ if (ac->ac_found > ext3_mb_max_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++} ++ ++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent ex = ac->ac_b_ex; ++ int group = ex.fe_group, max, err; ++ ++ J_ASSERT(ex.fe_len > 0); ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if 
(err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); ++ ++ if (max > 0) { ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_super_block *es = sbi->s_es; ++ struct ext3_free_extent ex; ++ ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ++ unsigned long start; ++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + ++ ex.fe_start + le32_to_cpu(es->s_first_data_block)); ++ if (start % sbi->s_stripe == 0) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ } else if (max >= ac->ac_g_ex.fe_len) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++/* ++ * The routine scans buddy structures (not bitmap!) 
from given order ++ * to max order and tries to find big enough chunk to satisfy the req ++ */ ++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_group_info *grp = e3b->bd_info; ++ void *buddy; ++ int i, k, max; ++ ++ J_ASSERT(ac->ac_2order > 0); ++ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { ++ if (grp->bb_counters[i] == 0) ++ continue; ++ ++ buddy = mb_find_buddy(e3b, i, &max); ++ if (buddy == NULL) { ++ printk(KERN_ALERT "looking for wrong order?\n"); ++ break; ++ } ++ ++ k = mb_find_next_zero_bit(buddy, max, 0); ++ J_ASSERT(k < max); ++ ++ ac->ac_found++; ++ ++ ac->ac_b_ex.fe_len = 1 << i; ++ ac->ac_b_ex.fe_start = k << i; ++ ac->ac_b_ex.fe_group = e3b->bd_group; ++ ++ ext3_mb_use_best_found(ac, e3b); ++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); ++ ++ if (unlikely(ext3_mb_stats)) ++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); ++ ++ break; ++ } ++} ++ ++/* ++ * The routine scans the group and measures all found extents. ++ * In order to optimize scanning, caller must pass number of ++ * free blocks in the group, so the routine can know upper limit. 
++ */ ++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ int i, free; ++ ++ free = e3b->bd_info->bb_free; ++ J_ASSERT(free > 0); ++ ++ i = e3b->bd_info->bb_first_free; ++ ++ while (free && ac->ac_status == AC_STATUS_CONTINUE) { ++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ if (i >= sb->s_blocksize * 8) { ++ J_ASSERT(free == 0); ++ break; ++ } ++ ++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(free >= ex.fe_len); ++ ++ ext3_mb_measure_extent(ac, &ex, e3b); ++ ++ i += ex.fe_len; ++ free -= ex.fe_len; ++ } ++} ++ ++/* ++ * This is a special case for storages like raid5 ++ * we try to find stripe-aligned chunks for stripe-size requests ++ */ ++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ unsigned long i, max; ++ ++ J_ASSERT(sbi->s_stripe != 0); ++ ++ /* find first stripe-aligned block */ ++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + le32_to_cpu(sbi->s_es->s_first_data_block); ++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; ++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) ++ % EXT3_BLOCKS_PER_GROUP(sb); ++ ++ while (i < sb->s_blocksize * 8) { ++ if (!mb_test_bit(i, bitmap)) { ++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); ++ if (max >= sbi->s_stripe) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ break; ++ } ++ } ++ i += sbi->s_stripe; ++ } ++} ++ ++static int ext3_mb_good_group(struct ext3_allocation_context *ac, ++ int group, int cr) ++{ ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); ++ unsigned free, fragments, i, bits; ++ ++ 
J_ASSERT(cr >= 0 && cr < 4); ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); ++ ++ free = grp->bb_free; ++ fragments = grp->bb_fragments; ++ if (free == 0) ++ return 0; ++ if (fragments == 0) ++ return 0; ++ ++ switch (cr) { ++ case 0: ++ J_ASSERT(ac->ac_2order != 0); ++ bits = ac->ac_sb->s_blocksize_bits + 1; ++ for (i = ac->ac_2order; i <= bits; i++) ++ if (grp->bb_counters[i] > 0) ++ return 1; ++ break; ++ case 1: ++ if ((free / fragments) >= ac->ac_g_ex.fe_len) ++ return 1; ++ break; ++ case 2: ++ if (free >= ac->ac_g_ex.fe_len) ++ return 1; ++ break; ++ case 3: ++ return 1; ++ default: ++ BUG(); ++ } ++ ++ return 0; ++} ++ ++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *len, int flags, int *errp) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_allocation_context ac; ++ int i, group, block, cr, err = 0; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ struct buffer_head *gdp_bh; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ ++ J_ASSERT(len != NULL); ++ J_ASSERT(*len > 0); ++ ++ sb = inode->i_sb; ++ if (!sb) { ++ printk("ext3_mb_new_nblocks: nonexistent device"); ++ return 0; ++ } ++ ++ if (!test_opt(sb, MBALLOC)) { ++ static int ext3_mballoc_warning = 0; ++ if (ext3_mballoc_warning == 0) { ++ printk(KERN_ERR "EXT3-fs: multiblock request with " ++ "mballoc disabled!\n"); ++ ext3_mballoc_warning++; ++ } ++ *len = 1; ++ err = ext3_new_block_old(handle, inode, goal, errp); ++ return err; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ ++ /* ++ * We can't allocate > group size ++ */ ++ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) ++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* someone asks for non-reserved blocks */ ++ BUG_ON(*len > 1); ++ err = ext3_mb_reserve_blocks(sb, 1); ++ if (err) { ++ *errp = err; ++ return 0; ++ } ++ } ++ ++ ac.ac_buddy_page = NULL; 
++ ac.ac_bitmap_page = NULL; ++ ++ /* ++ * Check quota for allocation of this blocks. ++ */ ++ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) ++ *len -= 1; ++ if (*len == 0) { ++ *errp = -EDQUOT; ++ block = 0; ++ goto out; ++ } ++ ++ /* start searching from the goal */ ++ if (goal < le32_to_cpu(es->s_first_data_block) || ++ goal >= le32_to_cpu(es->s_blocks_count)) ++ goal = le32_to_cpu(es->s_first_data_block); ++ group = (goal - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ block = ((goal - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ /* set up allocation goals */ ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_groups_scanned = 0; ++ ac.ac_ex_scanned = 0; ++ ac.ac_found = 0; ++ ac.ac_sb = inode->i_sb; ++ ac.ac_g_ex.fe_group = group; ++ ac.ac_g_ex.fe_start = block; ++ ac.ac_g_ex.fe_len = *len; ++ ac.ac_flags = flags; ++ ac.ac_2order = 0; ++ ac.ac_criteria = 0; ++ ++ if (*len == 1 && sbi->s_stripe) { ++ /* looks like a metadata, let's use a dirty hack for raid5 ++ * move all metadata in first groups in hope to hit cached ++ * sectors and thus avoid read-modify cycles in raid5 */ ++ ac.ac_g_ex.fe_group = group = 0; ++ } ++ ++ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ ++ i = ffs(*len); ++ if (i >= ext3_mb_order2_reqs) { ++ i--; ++ if ((*len & (~(1 << i))) == 0) ++ ac.ac_2order = i; ++ } ++ ++ /* first, try the goal */ ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; ++ ++ /* Let's just scan groups to find more-less suitable blocks */ ++ cr = ac.ac_2order ? 
0 : 1; ++repeat: ++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ ac.ac_criteria = cr; ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { ++ if (group == EXT3_SB(sb)->s_groups_count) ++ group = 0; ++ ++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { ++ /* we need full data about the group ++ * to make a good selection */ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ext3_mb_release_desc(&e3b); ++ } ++ ++ /* check is group good for our criteries */ ++ if (!ext3_mb_good_group(&ac, group, cr)) ++ continue; ++ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ++ ext3_lock_group(sb, group); ++ if (!ext3_mb_good_group(&ac, group, cr)) { ++ /* someone did allocation from this group */ ++ ext3_unlock_group(sb, group); ++ ext3_mb_release_desc(&e3b); ++ continue; ++ } ++ ++ ac.ac_groups_scanned++; ++ if (cr == 0) ++ ext3_mb_simple_scan_group(&ac, &e3b); ++ else if (cr == 1 && *len == sbi->s_stripe) ++ ext3_mb_scan_aligned(&ac, &e3b); ++ else ++ ext3_mb_complex_scan_group(&ac, &e3b); ++ ++ ext3_unlock_group(sb, group); ++ ++ ext3_mb_release_desc(&e3b); ++ ++ if (ac.ac_status != AC_STATUS_CONTINUE) ++ break; ++ } ++ } ++ ++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ /* ++ * We've been searching too long. Let's try to allocate ++ * the best chunk we've found so far ++ */ ++ ++ /*if (ac.ac_found > ext3_mb_max_to_scan) ++ printk(KERN_DEBUG "EXT3-fs: too long searching at " ++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, ++ ac.ac_g_ex.fe_len);*/ ++ ext3_mb_try_best_found(&ac, &e3b); ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * Someone more lucky has already allocated it. 
++ * The only thing we can do is just take first ++ * found block(s) ++ printk(KERN_DEBUG "EXT3-fs: someone won our chunk\n"); ++ */ ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_flags |= EXT3_MB_HINT_FIRST; ++ cr = 3; ++ goto repeat; ++ } ++ } ++ ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * We aren't lucky definitely ++ */ ++ DQUOT_FREE_BLOCK(inode, *len); ++ *errp = -ENOSPC; ++ block = 0; ++#if 1 ++ printk(KERN_ERR "EXT3-fs: can't allocate: status %d flags %d\n", ++ ac.ac_status, ac.ac_flags); ++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d cr %d\n", ++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, ++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); ++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", ++ sbi->s_blocks_reserved, ac.ac_found); ++ printk("EXT3-fs: groups: "); ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) ++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); ++ printk("\n"); ++#endif ++ goto out; ++ } ++ ++found: ++ J_ASSERT(ac.ac_b_ex.fe_len > 0); ++ ++ /* good news - free block(s) have been found. 
now it's time ++ * to mark block(s) in good old journaled bitmap */ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ /* we made a desicion, now mark found blocks in good old ++ * bitmap to be journaled */ ++ ++ ext3_debug("using block group %d(%d)\n", ++ ac.ac_b_group.group, gdp->bg_free_blocks_count); ++ ++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); ++ if (!bitmap_bh) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) { ++ *errp = err; ++ goto out_err; ++ } ++ ++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); ++ if (!gdp) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ if (block == le32_to_cpu(gdp->bg_block_bitmap) || ++ block == le32_to_cpu(gdp->bg_inode_bitmap) || ++ in_range(block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error(sb, "ext3_new_block", ++ "Allocating block in system zone - " ++ "block = %u", block); ++#ifdef AGGRESSIVE_CHECK ++ for (i = 0; i < ac.ac_b_ex.fe_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); ++#endif ++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); ++ ++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) ++ - ac.ac_b_ex.fe_len); ++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); ++ ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ if (err) ++ goto out_err; ++ err = ext3_journal_dirty_metadata(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ sb->s_dirt = 1; ++ *errp = 0; ++ 
brelse(bitmap_bh); ++ ++ /* drop non-allocated, but dquote'd blocks */ ++ J_ASSERT(*len >= ac.ac_b_ex.fe_len); ++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); ++ ++ *len = ac.ac_b_ex.fe_len; ++ J_ASSERT(*len > 0); ++ J_ASSERT(block != 0); ++ goto out; ++ ++out_err: ++ /* if we've already allocated something, roll it back */ ++ if (ac.ac_status == AC_STATUS_FOUND) { ++ /* FIXME: free blocks here */ ++ } ++ ++ DQUOT_FREE_BLOCK(inode, *len); ++ brelse(bitmap_bh); ++ *errp = err; ++ block = 0; ++out: ++ if (ac.ac_buddy_page) ++ page_cache_release(ac.ac_buddy_page); ++ if (ac.ac_bitmap_page) ++ page_cache_release(ac.ac_bitmap_page); ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* block wasn't reserved before and we reserved it ++ * at the beginning of allocation. it doesn't matter ++ * whether we allocated anything or we failed: time ++ * to release reservation. NOTE: because I expect ++ * any multiblock request from delayed allocation ++ * path only, here is single block always */ ++ ext3_mb_release_blocks(sb, 1); ++ } ++ ++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { ++ atomic_inc(&sbi->s_bal_reqs); ++ atomic_add(*len, &sbi->s_bal_allocated); ++ if (*len >= ac.ac_g_ex.fe_len) ++ atomic_inc(&sbi->s_bal_success); ++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); ++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && ++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) ++ atomic_inc(&sbi->s_bal_goals); ++ if (ac.ac_found > ext3_mb_max_to_scan) ++ atomic_inc(&sbi->s_bal_breaks); ++ } ++ ++ ext3_mb_store_history(sb, inode->i_ino, &ac); ++ ++ return block; ++} ++EXPORT_SYMBOL(ext3_mb_new_blocks); ++ ++#ifdef EXT3_MB_HISTORY ++struct ext3_mb_proc_session { ++ struct ext3_mb_history *history; ++ struct super_block *sb; ++ int start; ++ int max; ++}; ++ ++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, ++ struct ext3_mb_history *hs, ++ int first) ++{ ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (!first && hs == s->history 
+ s->start) ++ return NULL; ++ while (hs->goal.fe_len == 0) { ++ hs++; ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (hs == s->history + s->start) ++ return NULL; ++ } ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs; ++ int l = *pos; ++ ++ if (l == 0) ++ return SEQ_START_TOKEN; ++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ if (!hs) ++ return NULL; ++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs = v; ++ ++ ++*pos; ++ if (v == SEQ_START_TOKEN) ++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ else ++ return ext3_mb_history_skip_empty(s, ++hs, 0); ++} ++ ++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) ++{ ++ struct ext3_mb_history *hs = v; ++ char buf[20], buf2[20]; ++ ++ if (v == SEQ_START_TOKEN) { ++ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "pid", "inode", "goal", "result", "found", "grps", "cr", ++ "merge", "tail", "broken"); ++ return 0; ++ } ++ ++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, ++ hs->goal.fe_start, hs->goal.fe_len); ++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, ++ hs->result.fe_start, hs->result.fe_len); ++ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", ++ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, ++ hs->cr, hs->merged ? "M" : "", hs->tail, ++ hs->buddy ? 
1 << hs->buddy : 0); ++ return 0; ++} ++ ++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_history_ops = { ++ .start = ext3_mb_seq_history_start, ++ .next = ext3_mb_seq_history_next, ++ .stop = ext3_mb_seq_history_stop, ++ .show = ext3_mb_seq_history_show, ++}; ++ ++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_proc_session *s; ++ int rc, size; ++ ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (s == NULL) ++ return -EIO; ++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; ++ s->history = kmalloc(size, GFP_KERNEL); ++ if (s == NULL) { ++ kfree(s); ++ return -EIO; ++ } ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(s->history, sbi->s_mb_history, size); ++ s->max = sbi->s_mb_history_max; ++ s->start = sbi->s_mb_history_cur % s->max; ++ spin_unlock(&sbi->s_mb_history_lock); ++ ++ rc = seq_open(file, &ext3_mb_seq_history_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = s; ++ } else { ++ kfree(s->history); ++ kfree(s); ++ } ++ return rc; ++ ++} ++ ++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) ++{ ++ struct seq_file *seq = (struct seq_file *)file->private_data; ++ struct ext3_mb_proc_session *s = seq->private; ++ kfree(s->history); ++ kfree(s); ++ return seq_release(inode, file); ++} ++ ++static struct file_operations ext3_mb_seq_history_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, ++}; ++ ++static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ long group; ++ ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ ++ group = 
*pos + 1; ++ return (void *) group; ++} ++ ++static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ long group; ++ ++ ++*pos; ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ group = *pos + 1; ++ return (void *) group;; ++} ++ ++static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) ++{ ++ struct super_block *sb = seq->private; ++ long group = (long) v, i; ++ struct sg { ++ struct ext3_group_info info; ++ unsigned short counters[16]; ++ } sg; ++ ++ group--; ++ if (group == 0) ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", ++ "group", "free", "frags", "first", "2^0", "2^1", "2^2", ++ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", ++ "2^11", "2^12", "2^13"); ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + ++ sizeof(struct ext3_group_info); ++ ext3_lock_group(sb, group); ++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); ++ ext3_unlock_group(sb, group); ++ ++ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) ++ return 0; ++ ++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, ++ sg.info.bb_fragments, sg.info.bb_first_free); ++ for (i = 0; i <= 13; i++) ++ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
++ sg.info.bb_counters[i] : 0); ++ seq_printf(seq, " ]\n"); ++ ++ return 0; ++} ++ ++static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_groups_ops = { ++ .start = ext3_mb_seq_groups_start, ++ .next = ext3_mb_seq_groups_next, ++ .stop = ext3_mb_seq_groups_stop, ++ .show = ext3_mb_seq_groups_show, ++}; ++ ++static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ int rc; ++ ++ rc = seq_open(file, &ext3_mb_seq_groups_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = sb; ++ } ++ return rc; ++ ++} ++ ++static struct file_operations ext3_mb_seq_groups_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static void ext3_mb_history_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_groups", sbi->s_mb_proc); ++ remove_proc_entry("mb_history", sbi->s_mb_proc); ++ remove_proc_entry(name, proc_root_ext3); ++ ++ if (sbi->s_mb_history) ++ kfree(sbi->s_mb_history); ++} ++ ++static void ext3_mb_history_init(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ int i; ++ ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); ++ if (sbi->s_mb_proc != NULL) { ++ struct proc_dir_entry *p; ++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_history_fops; ++ p->data = sb; ++ } ++ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_groups_fops; ++ p->data = sb; ++ } ++ } ++ ++ sbi->s_mb_history_max = 1000; ++ sbi->s_mb_history_cur = 0; ++ 
spin_lock_init(&sbi->s_mb_history_lock); ++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); ++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); ++ memset(sbi->s_mb_history, 0, i); ++ /* if we can't allocate history, then we simple won't use it */ ++} ++ ++static void ++ext3_mb_store_history(struct super_block *sb, unsigned ino, ++ struct ext3_allocation_context *ac) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_history h; ++ ++ if (likely(sbi->s_mb_history == NULL)) ++ return; ++ ++ h.pid = current->pid; ++ h.ino = ino; ++ h.goal = ac->ac_g_ex; ++ h.result = ac->ac_b_ex; ++ h.found = ac->ac_found; ++ h.cr = ac->ac_criteria; ++ h.groups = ac->ac_groups_scanned; ++ h.tail = ac->ac_tail; ++ h.buddy = ac->ac_buddy; ++ h.merged = 0; ++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) ++ h.merged = 1; ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); ++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) ++ sbi->s_mb_history_cur = 0; ++ spin_unlock(&sbi->s_mb_history_lock); ++} ++ ++#else ++#define ext3_mb_history_release(sb) ++#define ext3_mb_history_init(sb) ++#endif ++ ++int ext3_mb_init_backend(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i, j, len, metalen; ++ int num_meta_group_infos = ++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ struct ext3_group_info **meta_group_info; ++ ++ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte ++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. ++ * So a two level scheme suffices for now. 
*/ ++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * ++ num_meta_group_infos, GFP_KERNEL); ++ if (sbi->s_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); ++ return -ENOMEM; ++ } ++ sbi->s_buddy_cache = new_inode(sb); ++ if (sbi->s_buddy_cache == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); ++ goto err_freesgi; ++ } ++ ++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) { ++ if ((i + 1) == num_meta_group_infos) ++ metalen = sizeof(*meta_group_info) * ++ (sbi->s_groups_count - ++ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); ++ meta_group_info = kmalloc(metalen, GFP_KERNEL); ++ if (meta_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " ++ "buddy group\n"); ++ goto err_freemeta; ++ } ++ sbi->s_group_info[i] = meta_group_info; ++ } ++ ++ /* ++ * calculate needed size. if change bb_counters size, ++ * don't forget about ext3_mb_generate_buddy() ++ */ ++ len = sizeof(struct ext3_group_info); ++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ struct ext3_group_desc * desc; ++ ++ meta_group_info = ++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; ++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); ++ ++ meta_group_info[j] = kmalloc(len, GFP_KERNEL); ++ if (meta_group_info[j] == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); ++ i--; ++ goto err_freebuddy; ++ } ++ desc = ext3_get_group_desc(sb, i, NULL); ++ if (desc == NULL) { ++ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); ++ goto err_freebuddy; ++ } ++ memset(meta_group_info[j], 0, len); ++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, ++ &meta_group_info[j]->bb_state); ++ meta_group_info[j]->bb_free = ++ le16_to_cpu(desc->bg_free_blocks_count); ++ } ++ ++ return 0; ++ ++err_freebuddy: ++ while (i >= 0) { ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ i--; ++ } ++ i = num_meta_group_infos; 
++err_freemeta: ++ while (--i >= 0) ++ kfree(sbi->s_group_info[i]); ++ iput(sbi->s_buddy_cache); ++err_freesgi: ++ kfree(sbi->s_group_info); ++ return -ENOMEM; ++} ++ ++int ext3_mb_init(struct super_block *sb, int needs_recovery) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *root = sb->s_root->d_inode; ++ unsigned i, offset, max; ++ struct dentry *dentry; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); ++ ++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_offsets == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ return -ENOMEM; ++ } ++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_maxs == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ ++ /* order 0 is regular bitmap */ ++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; ++ sbi->s_mb_offsets[0] = 0; ++ ++ i = 1; ++ offset = 0; ++ max = sb->s_blocksize << 2; ++ do { ++ sbi->s_mb_offsets[i] = offset; ++ sbi->s_mb_maxs[i] = max; ++ offset += 1 << (sb->s_blocksize_bits - i); ++ max = max >> 1; ++ i++; ++ } while (i <= sb->s_blocksize_bits + 1); ++ ++ /* init file for buddy data */ ++ if ((i = ext3_mb_init_backend(sb))) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return i; ++ } ++ ++ spin_lock_init(&sbi->s_reserve_lock); ++ spin_lock_init(&sbi->s_md_lock); ++ INIT_LIST_HEAD(&sbi->s_active_transaction); ++ INIT_LIST_HEAD(&sbi->s_closed_transaction); ++ INIT_LIST_HEAD(&sbi->s_committed_transaction); ++ spin_lock_init(&sbi->s_bal_lock); ++ ++ /* remove old on-disk buddy file */ ++ mutex_lock(&root->i_mutex); ++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy")); ++ if (dentry->d_inode != NULL) { ++ i = vfs_unlink(root, dentry); ++ if (i != 0) ++ printk("EXT3-fs: can't remove .buddy file: %d\n", i); ++ } ++ dput(dentry); ++ mutex_unlock(&root->i_mutex); ++ ++ ext3_mb_history_init(sb); ++ ++ 
printk("EXT3-fs: mballoc enabled\n"); ++ return 0; ++} ++ ++int ext3_mb_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i, num_meta_group_infos; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* release freed, non-committed blocks */ ++ spin_lock(&sbi->s_md_lock); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_committed_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ ext3_mb_free_committed_blocks(sb); ++ ++ if (sbi->s_group_info) { ++ for (i = 0; i < sbi->s_groups_count; i++) ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ num_meta_group_infos = (sbi->s_groups_count + ++ EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) ++ kfree(sbi->s_group_info[i]); ++ kfree(sbi->s_group_info); ++ } ++ if (sbi->s_mb_offsets) ++ kfree(sbi->s_mb_offsets); ++ if (sbi->s_mb_maxs) ++ kfree(sbi->s_mb_maxs); ++ if (sbi->s_buddy_cache) ++ iput(sbi->s_buddy_cache); ++ if (sbi->s_blocks_reserved) ++ printk("ext3-fs: %ld blocks being reserved at umount!\n", ++ sbi->s_blocks_reserved); ++ if (ext3_mb_stats) { ++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", ++ atomic_read(&sbi->s_bal_allocated), ++ atomic_read(&sbi->s_bal_reqs), ++ atomic_read(&sbi->s_bal_success)); ++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " ++ "%u 2^N hits, %u breaks\n", ++ atomic_read(&sbi->s_bal_ex_scanned), ++ atomic_read(&sbi->s_bal_goals), ++ atomic_read(&sbi->s_bal_2orders), ++ atomic_read(&sbi->s_bal_breaks)); ++ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", ++ sbi->s_mb_buddies_generated++, ++ sbi->s_mb_generation_time); ++ } ++ ++ ext3_mb_history_release(sb); ++ ++ return 0; ++} ++ ++void ext3_mb_free_committed_blocks(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int err, i, count = 0, count2 = 0; ++ struct ext3_free_metadata *md; ++ struct 
ext3_buddy e3b; ++ ++ if (list_empty(&sbi->s_committed_transaction)) ++ return; ++ ++ /* there is committed blocks to be freed yet */ ++ do { ++ /* get next array of blocks */ ++ md = NULL; ++ spin_lock(&sbi->s_md_lock); ++ if (!list_empty(&sbi->s_committed_transaction)) { ++ md = list_entry(sbi->s_committed_transaction.next, ++ struct ext3_free_metadata, list); ++ list_del(&md->list); ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ if (md == NULL) ++ break; ++ ++ mb_debug("gonna free %u blocks in group %u (0x%p):", ++ md->num, md->group, md); ++ ++ err = ext3_mb_load_buddy(sb, md->group, &e3b); ++ /* we expect to find existing buddy because it's pinned */ ++ BUG_ON(err != 0); ++ ++ /* there are blocks to put in buddy to make them really free */ ++ count += md->num; ++ count2++; ++ ext3_lock_group(sb, md->group); ++ for (i = 0; i < md->num; i++) { ++ mb_debug(" %u", md->blocks[i]); ++ mb_free_blocks(&e3b, md->blocks[i], 1); ++ } ++ mb_debug("\n"); ++ ext3_unlock_group(sb, md->group); ++ ++ /* balance refcounts from ext3_mb_free_metadata() */ ++ page_cache_release(e3b.bd_buddy_page); ++ page_cache_release(e3b.bd_bitmap_page); ++ ++ kfree(md); ++ ext3_mb_release_desc(&e3b); ++ ++ } while (md); ++ mb_debug("freed %u blocks in %u structures\n", count, count2); ++} ++ ++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ if (sbi->s_last_transaction == handle->h_transaction->t_tid) ++ return; ++ ++ /* new transaction! time to close last one and free blocks for ++ * committed transaction. we know that only transaction can be ++ * active, so previos transaction can be being logged and we ++ * know that transaction before previous is known to be already ++ * logged. this means that now we may free blocks freed in all ++ * transactions before previous one. hope I'm clear enough ... 
*/ ++ ++ spin_lock(&sbi->s_md_lock); ++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { ++ mb_debug("new transaction %lu, old %lu\n", ++ (unsigned long) handle->h_transaction->t_tid, ++ (unsigned long) sbi->s_last_transaction); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_closed_transaction); ++ sbi->s_last_transaction = handle->h_transaction->t_tid; ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ ext3_mb_free_committed_blocks(sb); ++} ++ ++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, ++ int group, int block, int count) ++{ ++ struct ext3_group_info *db = e3b->bd_info; ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_free_metadata *md; ++ int i; ++ ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ ext3_lock_group(sb, group); ++ for (i = 0; i < count; i++) { ++ md = db->bb_md_cur; ++ if (md && db->bb_tid != handle->h_transaction->t_tid) { ++ db->bb_md_cur = NULL; ++ md = NULL; ++ } ++ ++ if (md == NULL) { ++ ext3_unlock_group(sb, group); ++ md = kmalloc(sizeof(*md), GFP_KERNEL); ++ if (md == NULL) ++ return -ENOMEM; ++ md->num = 0; ++ md->group = group; ++ ++ ext3_lock_group(sb, group); ++ if (db->bb_md_cur == NULL) { ++ spin_lock(&sbi->s_md_lock); ++ list_add(&md->list, &sbi->s_active_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ /* protect buddy cache from being freed, ++ * otherwise we'll refresh it from ++ * on-disk bitmap and lose not-yet-available ++ * blocks */ ++ page_cache_get(e3b->bd_buddy_page); ++ page_cache_get(e3b->bd_bitmap_page); ++ db->bb_md_cur = md; ++ db->bb_tid = handle->h_transaction->t_tid; ++ mb_debug("new md 0x%p for group %u\n", ++ md, md->group); ++ } else { ++ kfree(md); ++ md = db->bb_md_cur; ++ } ++ } ++ ++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); ++ md->blocks[md->num] = block + i; ++ md->num++; ++ if (md->num == 
EXT3_BB_MAX_BLOCKS) { ++ /* no more space, put full container on a sb's list */ ++ db->bb_md_cur = NULL; ++ } ++ } ++ ext3_unlock_group(sb, group); ++ return 0; ++} ++ ++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, ++ unsigned long block, unsigned long count, ++ int metadata, int *freed) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ unsigned long bit, overflow; ++ struct buffer_head *gd_bh; ++ unsigned long block_group; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ int err = 0, ret; ++ ++ *freed = 0; ++ sb = inode->i_sb; ++ if (!sb) { ++ printk ("ext3_free_blocks: nonexistent device"); ++ return; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ if (block < le32_to_cpu(es->s_first_data_block) || ++ block + count < block || ++ block + count > le32_to_cpu(es->s_blocks_count)) { ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks not in datazone - " ++ "block = %lu, count = %lu", block, count); ++ goto error_return; ++ } ++ ++ ext3_debug("freeing block %lu\n", block); ++ ++do_more: ++ overflow = 0; ++ block_group = (block - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ bit = (block - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb); ++ /* ++ * Check to see if we are freeing blocks across a group ++ * boundary. 
++ */ ++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { ++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); ++ count -= overflow; ++ } ++ brelse(bitmap_bh); ++ bitmap_bh = read_block_bitmap(sb, block_group); ++ if (!bitmap_bh) ++ goto error_return; ++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); ++ if (!gdp) ++ goto error_return; ++ ++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || ++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || ++ in_range (block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group) || ++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks in system zones - " ++ "Block = %lu, count = %lu", ++ block, count); ++ ++ BUFFER_TRACE(bitmap_bh, "getting write access"); ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) ++ goto error_return; ++ ++ /* ++ * We are about to modify some metadata. Call the journal APIs ++ * to unshare ->b_data if a currently-committing transaction is ++ * using it ++ */ ++ BUFFER_TRACE(gd_bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, gd_bh); ++ if (err) ++ goto error_return; ++ ++ err = ext3_mb_load_buddy(sb, block_group, &e3b); ++ if (err) ++ goto error_return; ++ ++#ifdef AGGRESSIVE_CHECK ++ { ++ int i; ++ for (i = 0; i < count; i++) ++ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); ++ } ++#endif ++ mb_clear_bits(bitmap_bh->b_data, bit, count); ++ ++ /* We dirtied the bitmap block */ ++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ ++ if (metadata) { ++ /* blocks being freed are metadata. 
these blocks shouldn't ++ * be used until this transaction is committed */ ++ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); ++ } else { ++ ext3_lock_group(sb, block_group); ++ mb_free_blocks(&e3b, bit, count); ++ ext3_unlock_group(sb, block_group); ++ } ++ ++ spin_lock(sb_bgl_lock(sbi, block_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); ++ spin_unlock(sb_bgl_lock(sbi, block_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, count); ++ ++ ext3_mb_release_desc(&e3b); ++ ++ *freed = count; ++ ++ /* And the group descriptor block */ ++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ++ ret = ext3_journal_dirty_metadata(handle, gd_bh); ++ if (!err) err = ret; ++ ++ if (overflow && !err) { ++ block += count; ++ count = overflow; ++ goto do_more; ++ } ++ sb->s_dirt = 1; ++error_return: ++ brelse(bitmap_bh); ++ ext3_std_error(sb, err); ++ return; ++} ++ ++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int free, ret = -ENOSPC; ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); ++ if (blocks <= free - sbi->s_blocks_reserved) { ++ sbi->s_blocks_reserved += blocks; ++ ret = 0; ++ } ++ spin_unlock(&sbi->s_reserve_lock); ++ return ret; ++} ++ ++void ext3_mb_release_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ sbi->s_blocks_reserved -= blocks; ++ WARN_ON(sbi->s_blocks_reserved < 0); ++ if (sbi->s_blocks_reserved < 0) ++ sbi->s_blocks_reserved = 0; ++ spin_unlock(&sbi->s_reserve_lock); ++} ++ ++int ext3_new_block(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *errp) ++{ ++ int ret, len; ++ ++ if (!test_opt(inode->i_sb, MBALLOC)) { ++ ret = ext3_new_block_old(handle, inode, goal, errp); ++ goto out; ++ } ++ len = 1; ++ ret = 
ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); ++out: ++ return ret; ++} ++ ++ ++void ext3_free_blocks(handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count, int metadata) ++{ ++ struct super_block *sb; ++ int freed; ++ ++ sb = inode->i_sb; ++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) ++ ext3_free_blocks_sb(handle, sb, block, count, &freed); ++ else ++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); ++ if (freed) ++ DQUOT_FREE_BLOCK(inode, freed); ++ return; ++} ++ ++#define EXT3_ROOT "ext3" ++#define EXT3_MB_STATS_NAME "mb_stats" ++#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" ++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" ++ ++static int ext3_mb_stats_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_stats); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_stats_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_STATS_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); ++ return count; ++} ++ ++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ 
printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_max_to_scan = value; ++ ++ return count; ++} ++ ++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_min_to_scan = value; ++ ++ return count; ++} ++ ++static int ext3_mb_order2_req_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_order2_req_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ 
++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_order2_reqs = value; ++ ++ return count; ++} ++ ++int __init init_ext3_proc(void) ++{ ++ struct proc_dir_entry *proc_ext3_mb_stats; ++ struct proc_dir_entry *proc_ext3_mb_max_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_min_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_order2_req; ++ ++ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); ++ if (proc_root_ext3 == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); ++ return -EIO; ++ } ++ ++ /* Initialize EXT3_MB_STATS_NAME */ ++ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_stats == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_STATS_NAME); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_stats->data = NULL; ++ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; ++ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; ++ ++ /* Initialize EXT3_MAX_TO_SCAN_NAME */ ++ proc_ext3_mb_max_to_scan = create_proc_entry( ++ EXT3_MB_MAX_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_max_to_scan == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_MAX_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_max_to_scan->data = NULL; ++ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; ++ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; ++ ++ /* Initialize EXT3_MIN_TO_SCAN_NAME */ ++ proc_ext3_mb_min_to_scan = create_proc_entry( ++ EXT3_MB_MIN_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_min_to_scan == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_MIN_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, 
proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_min_to_scan->data = NULL; ++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; ++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; ++ ++ /* Initialize EXT3_ORDER2_REQ */ ++ proc_ext3_mb_order2_req = create_proc_entry( ++ EXT3_MB_ORDER2_REQ, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_order2_req == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_ORDER2_REQ); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_order2_req->data = NULL; ++ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; ++ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; ++ ++ return 0; ++} ++ ++void exit_ext3_proc(void) ++{ ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++} +Index: linux-2.6.16.i686/fs/ext3/Makefile +=================================================================== +--- linux-2.6.16.i686.orig/fs/ext3/Makefile 2006-05-30 22:55:32.000000000 +0800 ++++ linux-2.6.16.i686/fs/ext3/Makefile 2006-05-30 23:02:59.000000000 +0800 +@@ -6,7 +6,7 @@ + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o ++ extents.o mballoc.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff --git 
a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch index 2408cc7..1fb1b60 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch @@ -1,10 +1,373 @@ -Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +Index: linux-2.6.5-7.282-full/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/mballoc.c 2003-01-30 13:24:37.000000000 +0300 -+++ linux-2.6.5-sles9/fs/ext3/mballoc.c 2004-11-09 02:34:25.181340632 +0300 -@@ -0,0 +1,1428 @@ +--- linux-2.6.5-7.282-full.orig/include/linux/ext3_fs.h 2006-10-24 22:18:28.000000000 +0400 ++++ linux-2.6.5-7.282-full/include/linux/ext3_fs.h 2006-10-24 22:18:28.000000000 +0400 +@@ -57,6 +57,14 @@ struct statfs; + #define ext3_debug(f, a...) do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -339,6 +347,7 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -361,6 +370,14 @@ struct ext3_inode { + #define ext3_find_first_zero_bit ext2_find_first_zero_bit + #define ext3_find_next_zero_bit ext2_find_next_zero_bit + ++#ifndef ext2_find_next_le_bit ++#ifdef __LITTLE_ENDIAN ++#define ext2_find_next_le_bit(addr, size, off) find_next_bit((addr), (size), (off)) ++#else ++#error "mballoc needs a patch for big-endian systems - CFS bug 10634" ++#endif /* __LITTLE_ENDIAN */ ++#endif /* 
!ext2_find_next_le_bit */ ++ + /* + * Maximal mount counts between two filesystem checks + */ +@@ -700,7 +717,9 @@ extern int ext3_bg_has_super(struct supe + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, +- unsigned long); ++ unsigned long, int); ++extern void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long, ++ unsigned long); + extern unsigned long ext3_count_free_blocks (struct super_block *); + extern void ext3_check_blocks_bitmap (struct super_block *); + extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, +@@ -824,6 +843,17 @@ extern void ext3_extents_initialize_bloc + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern long ext3_mb_stats; ++extern long ext3_mb_max_to_scan; ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct super_block *, int); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); ++ + #endif /* __KERNEL__ */ + + #define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) +Index: linux-2.6.5-7.282-full/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.6.5-7.282-full.orig/include/linux/ext3_fs_sb.h 2006-10-24 22:18:28.000000000 +0400 ++++ linux-2.6.5-7.282-full/include/linux/ext3_fs_sb.h 2006-10-24 22:18:28.000000000 +0400 +@@ -23,9 +23,15 @@ + #define EXT_INCLUDE + #include + #include ++#include + #endif + #endif + #include ++#include ++ ++struct ext3_buddy_group_blocks; ++struct ext3_mb_history; ++#define EXT3_BB_MAX_BLOCKS + + /* + * 
third extended-fs super-block data in memory +@@ -78,6 +84,43 @@ struct ext3_sb_info { + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_group_info ***s_group_info; ++ struct inode *s_buddy_cache; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ unsigned short *s_mb_offsets, *s_mb_maxs; ++ unsigned long s_stripe; ++ ++ /* history to debug policy */ ++ struct ext3_mb_history *s_mb_history; ++ int s_mb_history_cur; ++ int s_mb_history_max; ++ struct proc_dir_entry *s_mb_proc; ++ spinlock_t s_mb_history_lock; ++ ++ /* stats for buddy allocator */ ++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ++ atomic_t s_bal_success; /* we found long enough chunks */ ++ atomic_t s_bal_allocated; /* in blocks */ ++ atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_goals; /* goal hits */ ++ atomic_t s_bal_breaks; /* too long searches */ ++ atomic_t s_bal_2orders; /* 2^order hits */ ++ spinlock_t s_bal_lock; ++ unsigned long s_mb_buddies_generated; ++ unsigned long long s_mb_generation_time; + }; + ++#define EXT3_GROUP_INFO(sb, group) \ ++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ ++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] ++ + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-2.6.5-7.282-full/fs/ext3/super.c +=================================================================== +--- linux-2.6.5-7.282-full.orig/fs/ext3/super.c 2006-10-24 22:18:28.000000000 +0400 ++++ linux-2.6.5-7.282-full/fs/ext3/super.c 2006-10-24 22:18:28.000000000 +0400 +@@ -389,6 +389,7 @@ void ext3_put_super (struct super_block + struct ext3_super_block *es = sbi->s_es; + int 
i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -588,6 +589,7 @@ enum { + Opt_err, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_extents, Opt_noextents, Opt_extdebug, ++ Opt_mballoc, Opt_nomballoc, Opt_stripe, + }; + + static match_table_t tokens = { +@@ -634,6 +636,9 @@ static match_table_t tokens = { + {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, ++ {Opt_nomballoc, "nomballoc"}, ++ {Opt_stripe, "stripe=%u"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL} + }; +@@ -859,6 +864,19 @@ static int parse_options (char * options + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_nomballoc: ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_stripe: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_stripe = option; ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1512,6 +1530,7 @@ static int ext3_fill_super (struct super + ext3_count_dirs(sb)); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + + return 0; + +@@ -2160,7 +2179,13 @@ static struct file_system_type ext3_fs_t + + static int __init init_ext3_fs(void) + { +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2189,6 +2214,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); + } + + int ext3_prep_san_write(struct inode *inode, long *blocks, +Index: linux-2.6.5-7.282-full/fs/ext3/extents.c +=================================================================== +--- linux-2.6.5-7.282-full.orig/fs/ext3/extents.c 2006-10-24 22:18:28.000000000 
+0400 ++++ linux-2.6.5-7.282-full/fs/ext3/extents.c 2006-10-24 22:18:28.000000000 +0400 +@@ -779,7 +779,7 @@ cleanup: + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1438,7 +1438,7 @@ int ext3_ext_rm_idx(handle_t *handle, st + path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + +@@ -1923,10 +1923,12 @@ ext3_remove_blocks(struct ext3_extents_t + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1938,7 +1940,7 @@ ext3_remove_blocks(struct ext3_extents_t + bh = sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-2.6.5-7.282-full/fs/ext3/inode.c +=================================================================== +--- linux-2.6.5-7.282-full.orig/fs/ext3/inode.c 2006-10-24 22:18:28.000000000 +0400 ++++ linux-2.6.5-7.282-full/fs/ext3/inode.c 2006-10-24 22:18:28.000000000 +0400 +@@ -574,7 +574,7 @@ static int 
ext3_alloc_branch(handle_t *h + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -675,7 +675,7 @@ err_out: + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, +- le32_to_cpu(where[i].key), 1); ++ le32_to_cpu(where[i].key), 1, 1); + return err; + } + +@@ -1837,7 +1837,7 @@ ext3_clear_blocks(handle_t *handle, stru + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2008,7 +2008,7 @@ static void ext3_free_branches(handle_t + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-2.6.5-7.282-full/fs/ext3/balloc.c +=================================================================== +--- linux-2.6.5-7.282-full.orig/fs/ext3/balloc.c 2006-08-30 18:12:13.000000000 +0400 ++++ linux-2.6.5-7.282-full/fs/ext3/balloc.c 2006-10-24 22:18:28.000000000 +0400 +@@ -78,7 +78,7 @@ struct ext3_group_desc * ext3_get_group_ + * + * Return buffer_head on success or NULL in case of failure. + */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -274,7 +274,7 @@ void ext3_discard_reservation(struct ino + } + + /* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, ++void ext3_free_blocks_old(handle_t *handle, struct inode *inode, + unsigned long block, unsigned long count) + { + struct buffer_head *bitmap_bh = NULL; +@@ -1142,7 +1142,7 @@ int ext3_should_retry_alloc(struct super + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. 
+ */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-2.6.5-7.282-full/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.5-7.282-full.orig/fs/ext3/xattr.c 2006-10-24 22:18:28.000000000 +0400 ++++ linux-2.6.5-7.282-full/fs/ext3/xattr.c 2006-10-24 22:18:28.000000000 +0400 +@@ -1371,7 +1371,7 @@ ext3_xattr_set_handle2(handle_t *handle, + new_bh = sb_getblk(sb, block); + if (!new_bh) { + getblk_failed: +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + error = -EIO; + goto cleanup; + } +@@ -1411,7 +1411,7 @@ getblk_failed: + if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { + /* Free the old block. */ + ea_bdebug(old_bh, "freeing"); +- ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1); ++ ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1, 1); + + /* ext3_forget() calls bforget() for us, but we + let our caller release old_bh, so we need to +@@ -1519,7 +1519,7 @@ ext3_xattr_delete_inode(handle_t *handle + mb_cache_entry_free(ce); + ce = NULL; + } +- ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1); ++ ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); + } else { +Index: linux-2.6.5-7.282-full/fs/ext3/mballoc.c +=================================================================== +--- linux-2.6.5-7.282-full.orig/fs/ext3/mballoc.c 2006-10-23 18:07:54.821533176 +0400 ++++ linux-2.6.5-7.282-full/fs/ext3/mballoc.c 2006-10-24 22:20:45.000000000 +0400 +@@ -0,0 +1,2730 @@ +/* -+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it 
and/or modify @@ -36,18 +399,26 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +#include +#include +#include ++#include ++#include ++#include ++#include + +/* + * TODO: -+ * - do not scan from the beginning, try to remember first free block -+ * - mb_mark_used_* may allocate chunk right after splitting buddy ++ * - bitmap read-ahead (proposed by Oleg Drokin aka green) ++ * - track min/max extents in each group for better group selection ++ * - mb_mark_used() may allocate chunk right after splitting buddy + * - special flag to advice allocator to look for requested + N blocks + * this may improve interaction between extents and mballoc ++ * - tree of groups sorted by number of free blocks ++ * - percpu reservation code (hotpath) ++ * - error handling + */ + +/* + * with AGRESSIVE_CHECK allocator runs consistency checks over -+ * structures. this checks slow things down a lot ++ * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + @@ -61,197 +432,247 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +#endif + +/* -+ * where to save buddies structures beetween umount/mount (clean case only) ++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory ++ * and you can monitor it in /proc/fs/ext3//mb_history ++ */ ++#define EXT3_MB_HISTORY ++ ++/* ++ * How long mballoc can look for a best extent (in found extents) ++ */ ++long ext3_mb_max_to_scan = 500; ++ ++/* ++ * How long mballoc must look for a best extent ++ */ ++long ext3_mb_min_to_scan = 30; ++ ++/* ++ * with 'ext3_mb_stats' allocator will collect stats that will be ++ * shown at umount. The collecting costs though! + */ -+#define EXT3_BUDDY_FILE ".buddy" ++ ++long ext3_mb_stats = 1; + +/* -+ * max. 
number of chunks to be tracked in ext3_free_extent struct ++ * for which requests use 2^N search using buddies + */ -+#define MB_ARR_SIZE 32 ++long ext3_mb_order2_reqs = 8; ++ ++#ifdef EXT3_BB_MAX_BLOCKS ++#undef EXT3_BB_MAX_BLOCKS ++#endif ++#define EXT3_BB_MAX_BLOCKS 30 ++ ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++struct ext3_group_info { ++ unsigned long bb_state; ++ unsigned long bb_tid; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned short bb_fragments; ++ unsigned short bb_counters[]; ++}; ++ ++ ++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 ++#define EXT3_GROUP_INFO_LOCKED_BIT 1 ++ ++#define EXT3_MB_GRP_NEED_INIT(grp) \ ++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) ++ ++struct ext3_free_extent { ++ __u16 fe_start; ++ __u16 fe_len; ++ __u16 fe_group; ++}; + +struct ext3_allocation_context { + struct super_block *ac_sb; + + /* search goals */ -+ int ac_g_group; -+ int ac_g_start; -+ int ac_g_len; -+ int ac_g_flags; -+ ++ struct ext3_free_extent ac_g_ex; ++ + /* the best found extent */ -+ int ac_b_group; -+ int ac_b_start; -+ int ac_b_len; -+ ++ struct ext3_free_extent ac_b_ex; ++ + /* number of iterations done. 
we have to track to limit searching */ -+ int ac_repeats; -+ int ac_groups_scanned; -+ int ac_status; ++ unsigned long ac_ex_scanned; ++ __u16 ac_groups_scanned; ++ __u16 ac_found; ++ __u16 ac_tail; ++ __u16 ac_buddy; ++ __u8 ac_status; ++ __u8 ac_flags; /* allocation hints */ ++ __u8 ac_criteria; ++ __u8 ac_repeats; ++ __u8 ac_2order; /* if request is to allocate 2^N blocks and ++ * N > 0, the field stores N, otherwise 0 */ ++ ++ struct page *ac_buddy_page; ++ struct page *ac_bitmap_page; +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 -+ ++#define AC_STATUS_BREAK 3 ++ ++struct ext3_mb_history { ++ struct ext3_free_extent goal; /* goal allocation */ ++ struct ext3_free_extent result; /* result allocation */ ++ unsigned pid; ++ unsigned ino; ++ __u16 found; /* how many extents have been found */ ++ __u16 groups; /* how many groups have been scanned */ ++ __u16 tail; /* what tail broke some buddy */ ++ __u16 buddy; /* buddy the tail ^^^ broke */ ++ __u8 cr; /* which phase the result extent was found at */ ++ __u8 merged; ++}; + +struct ext3_buddy { -+ void *bd_bitmap; ++ struct page *bd_buddy_page; + void *bd_buddy; -+ int bd_blkbits; -+ struct buffer_head *bd_bh; -+ struct buffer_head *bd_bh2; -+ struct ext3_buddy_group_blocks *bd_bd; ++ struct page *bd_bitmap_page; ++ void *bd_bitmap; ++ struct ext3_group_info *bd_info; + struct super_block *bd_sb; ++ __u16 bd_blkbits; ++ __u16 bd_group; +}; ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) + -+struct ext3_free_extent { -+ int fe_start; -+ int fe_len; -+ unsigned char fe_orders[MB_ARR_SIZE]; -+ unsigned char fe_nums; -+ unsigned char fe_back; -+}; ++#ifndef EXT3_MB_HISTORY ++#define ext3_mb_store_history(sb,ino,ac) ++#else ++static void ext3_mb_store_history(struct super_block *, unsigned ino, ++ struct ext3_allocation_context *ac); ++#endif + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + ++static struct proc_dir_entry 
*proc_root_ext3; + +int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); +struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); -+void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long, unsigned long); +int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); +int ext3_mb_reserve_blocks(struct super_block *, int); +void ext3_mb_release_blocks(struct super_block *, int); +void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); +void ext3_mb_free_committed_blocks(struct super_block *); + -+#define mb_correct_addr_and_bit(bit,addr) \ -+{ \ -+ if ((unsigned) addr & 1) { \ -+ bit += 8; \ -+ addr--; \ -+ } \ -+ if ((unsigned) addr & 2) { \ -+ bit += 16; \ -+ addr--; \ -+ addr--; \ -+ } \ ++#if BITS_PER_LONG == 64 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 7UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~7UL); \ +} ++#elif BITS_PER_LONG == 32 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 3UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~3UL); \ ++} ++#else ++#error "how many bits you are?!" 
++#endif + +static inline int mb_test_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ return test_bit(bit, addr); ++ return ext2_test_bit(bit, addr); +} + +static inline void mb_set_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ set_bit(bit, addr); ++ ext2_set_bit(bit, addr); +} + -+static inline void mb_clear_bit(int bit, void *addr) ++static inline void mb_set_bit_atomic(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit,addr); -+ clear_bit(bit, addr); ++ ext2_set_bit_atomic(NULL, bit, addr); +} + -+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) ++static inline void mb_clear_bit(int bit, void *addr) +{ -+ int i = 1; -+ void *bb; -+ -+ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy); -+ J_ASSERT(max != NULL); -+ -+ if (order > e3b->bd_blkbits + 1) -+ return NULL; -+ -+ /* at order 0 we see each particular block */ -+ *max = 1 << (e3b->bd_blkbits + 3); -+ if (order == 0) -+ return e3b->bd_bitmap; ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit(bit, addr); ++} + -+ bb = e3b->bd_buddy; -+ *max = *max >> 1; -+ while (i < order) { -+ bb += 1 << (e3b->bd_blkbits - i); -+ i++; -+ *max = *max >> 1; -+ } -+ return bb; ++static inline void mb_clear_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit_atomic(NULL, bit, addr); +} + -+static int ext3_mb_load_desc(struct super_block *sb, int group, -+ struct ext3_buddy *e3b) ++static inline int mb_find_next_zero_bit(void *addr, int max, int start) +{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int fix; ++#if BITS_PER_LONG == 64 ++ fix = ((unsigned long) addr & 7UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~7UL); ++#elif BITS_PER_LONG == 32 ++ fix = ((unsigned long) addr & 3UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~3UL); ++#else ++#error "how many bits you are?!" 
++#endif ++ max += fix; ++ start += fix; ++ return ext2_find_next_zero_bit(addr, max, start) - fix; ++} + -+ J_ASSERT(sbi->s_buddy_blocks[group].bb_bitmap); -+ J_ASSERT(sbi->s_buddy_blocks[group].bb_buddy); ++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) ++{ ++ char *bb; + -+ /* load bitmap */ -+ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group].bb_bitmap); -+ if (e3b->bd_bh == NULL) { -+ ext3_error(sb, "ext3_mb_load_desc", -+ "can't get block for buddy bitmap\n"); -+ goto out; -+ } -+ if (!buffer_uptodate(e3b->bd_bh)) { -+ ll_rw_block(READ, 1, &e3b->bd_bh); -+ wait_on_buffer(e3b->bd_bh); -+ } -+ J_ASSERT(buffer_uptodate(e3b->bd_bh)); ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(max != NULL); + -+ /* load buddy */ -+ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group].bb_buddy); -+ if (e3b->bd_bh2 == NULL) { -+ ext3_error(sb, "ext3_mb_load_desc", -+ "can't get block for buddy bitmap\n"); -+ goto out; -+ } -+ if (!buffer_uptodate(e3b->bd_bh2)) { -+ ll_rw_block(READ, 1, &e3b->bd_bh2); -+ wait_on_buffer(e3b->bd_bh2); ++ if (order > e3b->bd_blkbits + 1) { ++ *max = 0; ++ return NULL; + } -+ J_ASSERT(buffer_uptodate(e3b->bd_bh2)); -+ -+ e3b->bd_bitmap = e3b->bd_bh->b_data; -+ e3b->bd_buddy = e3b->bd_bh2->b_data; -+ e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_bd = sbi->s_buddy_blocks + group; -+ e3b->bd_sb = sb; + -+ return 0; -+out: -+ brelse(e3b->bd_bh); -+ brelse(e3b->bd_bh2); -+ e3b->bd_bh = NULL; -+ e3b->bd_bh2 = NULL; -+ return -EIO; -+} ++ /* at order 0 we see each particular block */ ++ *max = 1 << (e3b->bd_blkbits + 3); ++ if (order == 0) ++ return EXT3_MB_BITMAP(e3b); + -+static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b) -+{ -+ mark_buffer_dirty(e3b->bd_bh); -+ mark_buffer_dirty(e3b->bd_bh2); -+} ++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; ++ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; + -+static void ext3_mb_release_desc(struct ext3_buddy *e3b) -+{ -+ 
brelse(e3b->bd_bh); -+ brelse(e3b->bd_bh2); ++ return bb; +} + +#ifdef AGGRESSIVE_CHECK ++ +static void mb_check_buddy(struct ext3_buddy *e3b) +{ + int order = e3b->bd_blkbits + 1; + int max, max2, i, j, k, count; ++ int fragments = 0, fstart; + void *buddy, *buddy2; + + if (!test_opt(e3b->bd_sb, MBALLOC)) + return; + ++ { ++ static int mb_check_counter = 0; ++ if (mb_check_counter++ % 300 != 0) ++ return; ++ } ++ + while (order > 1) { + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); @@ -263,56 +684,375 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + count = 0; + for (i = 0; i < max; i++) { + -+ if (!mb_test_bit(i, buddy)) { ++ if (mb_test_bit(i, buddy)) { + /* only single bit in buddy2 may be 1 */ -+ if (mb_test_bit(i << 1, buddy2)) -+ J_ASSERT(!mb_test_bit((i<<1)+1, buddy2)); -+ else if (mb_test_bit((i << 1) + 1, buddy2)) -+ J_ASSERT(!mb_test_bit(i << 1, buddy2)); ++ if (!mb_test_bit(i << 1, buddy2)) ++ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); ++ else if (!mb_test_bit((i << 1) + 1, buddy2)) ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); + continue; + } + + /* both bits in buddy2 must be 0 */ -+ J_ASSERT(!mb_test_bit(i << 1, buddy2)); -+ J_ASSERT(!mb_test_bit((i << 1) + 1, buddy2)); ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); + + for (j = 0; j < (1 << order); j++) { + k = (i * (1 << order)) + j; -+ J_ASSERT(mb_test_bit(k, e3b->bd_bitmap)); ++ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); + } + count++; + } -+ J_ASSERT(e3b->bd_bd->bb_counters[order] == count); ++ J_ASSERT(e3b->bd_info->bb_counters[order] == count); + order--; + } + ++ fstart = -1; + buddy = mb_find_buddy(e3b, 0, &max); + for (i = 0; i < max; i++) { -+ if (mb_test_bit(i, buddy)) ++ if (!mb_test_bit(i, buddy)) { ++ J_ASSERT(i >= e3b->bd_info->bb_first_free); ++ if (fstart == -1) { ++ fragments++; ++ fstart = i; ++ } + continue; ++ } ++ fstart = -1; + /* check used bits only */ + for (j = 0; j < e3b->bd_blkbits + 1; j++) { + buddy2 = 
mb_find_buddy(e3b, j, &max2); + k = i >> j; + J_ASSERT(k < max2); -+ J_ASSERT(!mb_test_bit(k, buddy2)); ++ J_ASSERT(mb_test_bit(k, buddy2)); + } + } ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); ++ J_ASSERT(e3b->bd_info->bb_fragments == fragments); +} ++ +#else +#define mb_check_buddy(e3b) +#endif + ++/* find most significant bit */ ++static int inline fmsb(unsigned short word) ++{ ++ int order; ++ ++ if (word > 255) { ++ order = 7; ++ word >>= 8; ++ } else { ++ order = -1; ++ } ++ ++ do { ++ order++; ++ word >>= 1; ++ } while (word != 0); ++ ++ return order; ++} ++ ++static void inline ++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, ++ int len, struct ext3_group_info *grp) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned short min, max, chunk, border; ++ ++ mb_debug("mark %u/%u free\n", first, len); ++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ border = 2 << sb->s_blocksize_bits; ++ ++ while (len > 0) { ++ /* find how many blocks can be covered since this position */ ++ max = ffs(first | border) - 1; ++ ++ /* find how many blocks of power 2 we need to mark */ ++ min = fmsb(len); ++ ++ mb_debug(" %u/%u -> max %u, min %u\n", ++ first & ((2 << sb->s_blocksize_bits) - 1), ++ len, max, min); ++ ++ if (max < min) ++ min = max; ++ chunk = 1 << min; ++ ++ /* mark multiblock chunks only */ ++ grp->bb_counters[min]++; ++ if (min > 0) { ++ mb_debug(" set %u at %u \n", first >> min, ++ sbi->s_mb_offsets[min]); ++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); ++ } ++ ++ len -= chunk; ++ first += chunk; ++ } ++} ++ ++static void ++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ++ int group) ++{ ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); ++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); ++ unsigned short i = 0, first, len; ++ unsigned free = 0, fragments = 0; ++ unsigned long long period = get_cycles(); ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ 
grp->bb_first_free = i; ++ while (i < max) { ++ fragments++; ++ first = i; ++ i = ext2_find_next_le_bit(bitmap, max, i); ++ len = i - first; ++ free += len; ++ if (len > 1) ++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); ++ else ++ grp->bb_counters[0]++; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ grp->bb_fragments = fragments; ++ ++ /* bb_state shouldn't being modified because all ++ * others waits for init completion on page lock */ ++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); ++ if (free != grp->bb_free) { ++ printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n", ++ group, free, grp->bb_free); ++ grp->bb_free = free; ++ } ++ ++ period = get_cycles() - period; ++ spin_lock(&EXT3_SB(sb)->s_bal_lock); ++ EXT3_SB(sb)->s_mb_buddies_generated++; ++ EXT3_SB(sb)->s_mb_generation_time += period; ++ spin_unlock(&EXT3_SB(sb)->s_bal_lock); ++} ++ ++static int ext3_mb_init_cache(struct page *page) ++{ ++ int blocksize, blocks_per_page, groups_per_page; ++ int err = 0, i, first_group, first_block; ++ struct super_block *sb; ++ struct buffer_head *bhs; ++ struct buffer_head **bh; ++ struct inode *inode; ++ char *data, *bitmap; ++ ++ mb_debug("init page %lu\n", page->index); ++ ++ inode = page->mapping->host; ++ sb = inode->i_sb; ++ blocksize = 1 << inode->i_blkbits; ++ blocks_per_page = PAGE_CACHE_SIZE / blocksize; ++ ++ groups_per_page = blocks_per_page >> 1; ++ if (groups_per_page == 0) ++ groups_per_page = 1; ++ ++ /* allocate buffer_heads to read bitmaps */ ++ if (groups_per_page > 1) { ++ err = -ENOMEM; ++ i = sizeof(struct buffer_head *) * groups_per_page; ++ bh = kmalloc(i, GFP_NOFS); ++ if (bh == NULL) ++ goto out; ++ memset(bh, 0, i); ++ } else ++ bh = &bhs; ++ ++ first_group = page->index * blocks_per_page / 2; ++ ++ /* read all groups the page covers into the cache */ ++ for (i = 0; i < groups_per_page; i++) { ++ struct ext3_group_desc * desc; ++ ++ if (first_group + i >= EXT3_SB(sb)->s_groups_count) ++ break; 
++ ++ err = -EIO; ++ desc = ext3_get_group_desc(sb, first_group + i, NULL); ++ if (desc == NULL) ++ goto out; ++ ++ err = -ENOMEM; ++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); ++ if (bh[i] == NULL) ++ goto out; ++ ++ if (buffer_uptodate(bh[i])) ++ continue; ++ ++ lock_buffer(bh[i]); ++ if (buffer_uptodate(bh[i])) { ++ unlock_buffer(bh[i]); ++ continue; ++ } ++ ++ get_bh(bh[i]); ++ bh[i]->b_end_io = end_buffer_read_sync; ++ submit_bh(READ, bh[i]); ++ mb_debug("read bitmap for group %u\n", first_group + i); ++ } ++ ++ /* wait for I/O completion */ ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ wait_on_buffer(bh[i]); ++ ++ err = -EIO; ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ if (!buffer_uptodate(bh[i])) ++ goto out; ++ ++ first_block = page->index * blocks_per_page; ++ for (i = 0; i < blocks_per_page; i++) { ++ int group; ++ ++ group = (first_block + i) >> 1; ++ if (group >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ data = page_address(page) + (i * blocksize); ++ bitmap = bh[group - first_group]->b_data; ++ ++ if ((first_block + i) & 1) { ++ /* this is block of buddy */ ++ mb_debug("put buddy for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memset(data, 0xff, blocksize); ++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; ++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, ++ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); ++ ext3_mb_generate_buddy(sb, data, bitmap, group); ++ } else { ++ /* this is block of bitmap */ ++ mb_debug("put bitmap for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memcpy(data, bitmap, blocksize); ++ } ++ } ++ SetPageUptodate(page); ++ ++out: ++ if (bh) { ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ brelse(bh[i]); ++ if (bh != &bhs) ++ kfree(bh); ++ } ++ return err; ++} ++ ++static int ext3_mb_load_buddy(struct super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *inode = 
sbi->s_buddy_cache; ++ int blocks_per_page, block, pnum, poff; ++ struct page *page; ++ ++ mb_debug("load group %u\n", group); ++ ++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_info = EXT3_GROUP_INFO(sb, group); ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ e3b->bd_buddy_page = NULL; ++ e3b->bd_bitmap_page = NULL; ++ ++ block = group * 2; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ /* we could use find_or_create_page(), but it locks page ++ * what we'd like to avoid in fast path ... */ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ BUG_ON(page->mapping != inode->i_mapping); ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_bitmap_page = page; ++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ block++; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ BUG_ON(page->mapping != inode->i_mapping); ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_buddy_page = page; ++ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ return 0; ++ ++err: ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ 
page_cache_release(e3b->bd_buddy_page); ++ e3b->bd_buddy = NULL; ++ e3b->bd_bitmap = NULL; ++ return -EIO; ++} ++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++} ++ ++ +static inline void +ext3_lock_group(struct super_block *sb, int group) +{ -+ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group].bb_lock); ++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static inline void +ext3_unlock_group(struct super_block *sb, int group) +{ -+ spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group].bb_lock); ++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_GROUP_INFO(sb, group)->bb_state); +} + +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) @@ -320,13 +1060,13 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + int order = 1; + void *bb; + -+ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy); ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); + J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); + -+ bb = e3b->bd_buddy; ++ bb = EXT3_MB_BUDDY(e3b); + while (order <= e3b->bd_blkbits + 1) { + block = block >> 1; -+ if (mb_test_bit(block, bb)) { ++ if (!mb_test_bit(block, bb)) { + /* this block is part of buddy of order 'order' */ + return order; + } @@ -349,7 +1089,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + cur += 32; + continue; + } -+ mb_clear_bit(cur, bm); ++ mb_clear_bit_atomic(cur, bm); + cur++; + } +} @@ -367,32 +1107,48 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + cur += 32; + continue; + } -+ mb_set_bit(cur, bm); ++ mb_set_bit_atomic(cur, bm); + cur++; + } +} + +static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) +{ -+ int block, max, order; ++ int block = 0, max = 0, order; + void *buddy, *buddy2; + + mb_check_buddy(e3b); ++ ++ e3b->bd_info->bb_free += count; ++ if (first < e3b->bd_info->bb_first_free) ++ e3b->bd_info->bb_first_free 
= first; ++ ++ /* let's maintain fragments counter */ ++ if (first != 0) ++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); ++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); ++ if (block && max) ++ e3b->bd_info->bb_fragments--; ++ else if (!block && !max) ++ e3b->bd_info->bb_fragments++; ++ ++ /* let's maintain buddy itself */ + while (count-- > 0) { + block = first++; + order = 0; + -+ J_ASSERT(!mb_test_bit(block, e3b->bd_bitmap)); -+ mb_set_bit(block, e3b->bd_bitmap); -+ e3b->bd_bd->bb_counters[order]++; ++ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); ++ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); ++ e3b->bd_info->bb_counters[order]++; + + /* start of the buddy */ + buddy = mb_find_buddy(e3b, order, &max); + + do { + block &= ~1UL; -+ if (!mb_test_bit(block, buddy) || -+ !mb_test_bit(block + 1, buddy)) ++ if (mb_test_bit(block, buddy) || ++ mb_test_bit(block + 1, buddy)) + break; + + /* both the buddies are free, try to coalesce them */ @@ -402,19 +1158,19 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + break; + + if (order > 0) { -+ /* for special purposes, we don't clear ++ /* for special purposes, we don't set + * free bits in bitmap */ -+ mb_clear_bit(block, buddy); -+ mb_clear_bit(block + 1, buddy); ++ mb_set_bit(block, buddy); ++ mb_set_bit(block + 1, buddy); + } -+ e3b->bd_bd->bb_counters[order]--; -+ e3b->bd_bd->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; + + block = block >> 1; + order++; -+ e3b->bd_bd->bb_counters[order]++; ++ e3b->bd_info->bb_counters[order]++; + -+ mb_set_bit(block, buddy2); ++ mb_clear_bit(block, buddy2); + buddy = buddy2; + } while (1); + } @@ -423,162 +1179,85 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + return 0; +} + -+/* -+ * returns 1 if out extent is enough to fill needed space -+ */ -+int mb_make_backward_extent(struct ext3_free_extent *in, -+ struct ext3_free_extent *out, int needed) -+{ -+ 
int i; -+ -+ J_ASSERT(in); -+ J_ASSERT(out); -+ J_ASSERT(in->fe_nums < MB_ARR_SIZE); -+ -+ out->fe_len = 0; -+ out->fe_start = in->fe_start + in->fe_len; -+ out->fe_nums = 0; -+ -+ /* for single-chunk extent we need not back order -+ * also, if an extent doesn't fill needed space -+ * then it makes no sense to try back order becase -+ * if we select this extent then it'll be use as is */ -+ if (in->fe_nums < 2 || in->fe_len < needed) -+ return 0; -+ -+ i = in->fe_nums - 1; -+ while (i >= 0 && out->fe_len < needed) { -+ out->fe_len += (1 << in->fe_orders[i]); -+ out->fe_start -= (1 << in->fe_orders[i]); -+ i--; -+ } -+ /* FIXME: in some situation fe_orders may be too small to hold -+ * all the buddies */ -+ J_ASSERT(out->fe_len >= needed); -+ -+ for (i++; i < in->fe_nums; i++) -+ out->fe_orders[out->fe_nums++] = in->fe_orders[i]; -+ J_ASSERT(out->fe_nums < MB_ARR_SIZE); -+ out->fe_back = 1; -+ -+ return 1; -+} -+ -+int mb_find_extent(struct ext3_buddy *e3b, int order, int block, -+ int needed, struct ext3_free_extent *ex) ++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, ++ int needed, struct ext3_free_extent *ex) +{ -+ int space = needed; -+ int next, max, ord; ++ int next = block, max, ord; + void *buddy; + + J_ASSERT(ex != NULL); + -+ ex->fe_nums = 0; -+ ex->fe_len = 0; -+ + buddy = mb_find_buddy(e3b, order, &max); + J_ASSERT(buddy); + J_ASSERT(block < max); -+ if (!mb_test_bit(block, buddy)) -+ goto nofree; ++ if (mb_test_bit(block, buddy)) { ++ ex->fe_len = 0; ++ ex->fe_start = 0; ++ ex->fe_group = 0; ++ return 0; ++ } + -+ if (order == 0) { ++ if (likely(order == 0)) { + /* find actual order */ + order = mb_find_order_for_block(e3b, block); + block = block >> order; + } + -+ ex->fe_orders[ex->fe_nums++] = order; + ex->fe_len = 1 << order; + ex->fe_start = block << order; -+ ex->fe_back = 0; ++ ex->fe_group = e3b->bd_group; + -+ while ((space = space - (1 << order)) > 0) { ++ /* calc difference from given start */ ++ next = next - 
ex->fe_start; ++ ex->fe_len -= next; ++ ex->fe_start += next; + -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); ++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { + + if (block + 1 >= max) + break; + + next = (block + 1) * (1 << order); -+ if (!mb_test_bit(next, e3b->bd_bitmap)) ++ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) + break; + + ord = mb_find_order_for_block(e3b, next); + -+ if ((1 << ord) >= needed) { -+ /* we dont want to coalesce with self-enough buddies */ -+ break; -+ } + order = ord; + block = next >> order; + ex->fe_len += 1 << order; -+ -+ if (ex->fe_nums < MB_ARR_SIZE) -+ ex->fe_orders[ex->fe_nums++] = order; + } + -+nofree: + J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); + return ex->fe_len; +} + -+static int mb_mark_used_backward(struct ext3_buddy *e3b, -+ struct ext3_free_extent *ex, int len) ++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) +{ -+ int start = ex->fe_start, len0 = len; -+ int ord, mlen, max, cur; ++ int ord, mlen = 0, max = 0, cur; ++ int start = ex->fe_start; ++ int len = ex->fe_len; ++ unsigned ret = 0; ++ int len0 = len; + void *buddy; + -+ start = ex->fe_start + ex->fe_len - 1; -+ while (len) { -+ ord = mb_find_order_for_block(e3b, start); -+ if (((start >> ord) << ord) == (start - (1 << ord) + 1) && -+ len >= (1 << ord)) { -+ /* the whole chunk may be allocated at once! 
*/ -+ mlen = 1 << ord; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ J_ASSERT((start >> ord) < max); -+ mb_clear_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; -+ start -= mlen; -+ len -= mlen; -+ J_ASSERT(len >= 0); -+ J_ASSERT(start >= 0); -+ continue; -+ } -+ -+ /* we have to split large buddy */ -+ J_ASSERT(ord > 0); -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_clear_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; -+ -+ ord--; -+ cur = (start >> ord) & ~1U; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_set_bit(cur, buddy); -+ mb_set_bit(cur + 1, buddy); -+ e3b->bd_bd->bb_counters[ord]++; -+ e3b->bd_bd->bb_counters[ord]++; -+ } -+ -+ /* now drop all the bits in bitmap */ -+ mb_clear_bits(e3b->bd_bitmap, ex->fe_start + ex->fe_len - len0, len0); -+ + mb_check_buddy(e3b); + -+ return 0; -+} -+ -+static int mb_mark_used_forward(struct ext3_buddy *e3b, -+ struct ext3_free_extent *ex, int len) -+{ -+ int start = ex->fe_start, len0 = len; -+ int ord, mlen, max, cur; -+ void *buddy; -+ ++ e3b->bd_info->bb_free -= len; ++ if (e3b->bd_info->bb_first_free == start) ++ e3b->bd_info->bb_first_free += len; ++ ++ /* let's maintain fragments counter */ ++ if (start != 0) ++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); ++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); ++ if (mlen && max) ++ e3b->bd_info->bb_fragments++; ++ else if (!mlen && !max) ++ e3b->bd_info->bb_fragments--; ++ ++ /* let's maintain buddy itself */ + while (len) { + ord = mb_find_order_for_block(e3b, start); + @@ -587,152 +1266,383 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + mlen = 1 << ord; + buddy = mb_find_buddy(e3b, ord, &max); + J_ASSERT((start >> ord) < max); -+ mb_clear_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_info->bb_counters[ord]--; + start += mlen; + len -= mlen; + J_ASSERT(len >= 0); + continue; + } + ++ /* store for history */ 
++ if (ret == 0) ++ ret = len | (ord << 16); ++ + /* we have to split large buddy */ + J_ASSERT(ord > 0); + buddy = mb_find_buddy(e3b, ord, &max); -+ mb_clear_bit(start >> ord, buddy); -+ e3b->bd_bd->bb_counters[ord]--; ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_info->bb_counters[ord]--; + + ord--; + cur = (start >> ord) & ~1U; + buddy = mb_find_buddy(e3b, ord, &max); -+ mb_set_bit(cur, buddy); -+ mb_set_bit(cur + 1, buddy); -+ e3b->bd_bd->bb_counters[ord]++; -+ e3b->bd_bd->bb_counters[ord]++; ++ mb_clear_bit(cur, buddy); ++ mb_clear_bit(cur + 1, buddy); ++ e3b->bd_info->bb_counters[ord]++; ++ e3b->bd_info->bb_counters[ord]++; + } + + /* now drop all the bits in bitmap */ -+ mb_clear_bits(e3b->bd_bitmap, ex->fe_start, len0); ++ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); + + mb_check_buddy(e3b); + -+ return 0; ++ return ret; +} + -+int inline mb_mark_used(struct ext3_buddy *e3b, -+ struct ext3_free_extent *ex, int len) ++/* ++ * Must be called under group lock! ++ */ ++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) +{ -+ int err; ++ unsigned long ret; + -+ J_ASSERT(ex); -+ if (ex->fe_back == 0) -+ err = mb_mark_used_forward(e3b, ex, len); -+ else -+ err = mb_mark_used_backward(e3b, ex, len); -+ return err; ++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ++ ret = mb_mark_used(e3b, &ac->ac_b_ex); ++ ++ ac->ac_status = AC_STATUS_FOUND; ++ ac->ac_tail = ret & 0xffff; ++ ac->ac_buddy = ret >> 16; ++ ++ /* hold in-core structures until allocated ++ * blocks are marked non-free in on-disk bitmap */ ++ ac->ac_buddy_page = e3b->bd_buddy_page; ++ page_cache_get(e3b->bd_buddy_page); ++ ac->ac_bitmap_page = e3b->bd_bitmap_page; ++ page_cache_get(e3b->bd_bitmap_page); +} + -+int ext3_mb_new_in_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b, int group) ++/* ++ * The routine checks whether found extent is good enough. 
If it is, ++ * then the extent gets marked used and flag is set to the context ++ * to stop scanning. Otherwise, the extent is compared with the ++ * previous found extent and if new one is better, then it's stored ++ * in the context. Later, the best found extent will be used, if ++ * mballoc can't find good enough extent. ++ * ++ * FIXME: real allocation policy is to be designed yet! ++ */ ++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, ++ struct ext3_free_extent *ex, ++ struct ext3_buddy *e3b) +{ -+ struct super_block *sb = ac->ac_sb; -+ int err, gorder, max, i; -+ struct ext3_free_extent curex; -+ -+ /* let's know order of allocation */ -+ gorder = 0; -+ while (ac->ac_g_len > (1 << gorder)) -+ gorder++; -+ -+ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) { -+ /* someone asks for space at this specified block -+ * probably he wants to merge it into existing extent */ -+ if (mb_test_bit(ac->ac_g_start, e3b->bd_bitmap)) { -+ /* good. at least one block is free */ -+ max = mb_find_extent(e3b, 0, ac->ac_g_start, -+ ac->ac_g_len, &curex); -+ max = min(curex.fe_len, ac->ac_g_len); -+ mb_mark_used(e3b, &curex, max); -+ -+ ac->ac_b_group = group; -+ ac->ac_b_start = curex.fe_start; -+ ac->ac_b_len = max; -+ ac->ac_status = AC_STATUS_FOUND; -+ err = 0; -+ goto out; -+ } -+ /* don't try to find goal anymore */ -+ ac->ac_g_flags &= ~1; ++ struct ext3_free_extent *bex = &ac->ac_b_ex; ++ struct ext3_free_extent *gex = &ac->ac_g_ex; ++ ++ J_ASSERT(ex->fe_len > 0); ++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ ++ ac->ac_found++; ++ ++ /* ++ * The special case - take what you catch first ++ */ ++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; + } + -+ i = 0; -+ while (1) { -+ i = find_next_bit(e3b->bd_bitmap, sb->s_blocksize * 8, i); -+ if (i >= sb->s_blocksize * 8) -+ break; ++ /* ++ * Let's 
check whether the chunk is good enough ++ */ ++ if (ex->fe_len == gex->fe_len) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } + -+ max = mb_find_extent(e3b, 0, i, ac->ac_g_len, &curex); -+ if (max >= ac->ac_g_len) { -+ max = min(curex.fe_len, ac->ac_g_len); -+ mb_mark_used(e3b, &curex, max); -+ -+ ac->ac_b_group = group; -+ ac->ac_b_start = curex.fe_start; -+ ac->ac_b_len = max; -+ ac->ac_status = AC_STATUS_FOUND; -+ break; -+ } -+ i += max; ++ /* ++ * If this is first found extent, just store it in the context ++ */ ++ if (bex->fe_len == 0) { ++ *bex = *ex; ++ return; + } + -+ return 0; ++ /* ++ * If new found extent is better, store it in the context ++ */ ++ if (bex->fe_len < gex->fe_len) { ++ /* if the request isn't satisfied, any found extent ++ * larger than previous best one is better */ ++ if (ex->fe_len > bex->fe_len) ++ *bex = *ex; ++ } else if (ex->fe_len > gex->fe_len) { ++ /* if the request is satisfied, then we try to find ++ * an extent that still satisfy the request, but is ++ * smaller than previous one */ ++ *bex = *ex; ++ } + -+out: -+ return err; ++ /* ++ * Let's scan at least few extents and don't pick up a first one ++ */ ++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++ ++ /* ++ * We don't want to scan for a whole year ++ */ ++ if (ac->ac_found > ext3_mb_max_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; +} + -+int mb_good_group(struct ext3_allocation_context *ac, int group, int cr) ++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) +{ -+ struct ext3_group_desc *gdp; -+ int free_blocks; ++ struct ext3_free_extent ex = ac->ac_b_ex; ++ int group = ex.fe_group, max, err; + -+ gdp = ext3_get_group_desc(ac->ac_sb, group, NULL); -+ if (!gdp) -+ return 0; -+ free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); -+ if (free_blocks == 0) -+ return 0; ++ J_ASSERT(ex.fe_len > 0); ++ err = ext3_mb_load_buddy(ac->ac_sb, group, 
e3b); ++ if (err) ++ return err; + -+ /* someone wants this block very much */ -+ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) -+ return 1; ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); + -+ /* FIXME: I'd like to take fragmentation into account here */ -+ if (cr == 0) { -+ if (free_blocks >= ac->ac_g_len >> 1) -+ return 1; -+ } else if (cr == 1) { -+ if (free_blocks >= ac->ac_g_len >> 2) -+ return 1; -+ } else if (cr == 2) { -+ return 1; -+ } else { -+ BUG(); ++ if (max > 0) { ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); + } ++ ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ ext3_mb_release_desc(e3b); ++ + return 0; +} + -+int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, -+ unsigned long goal, int *len, int flags, int *errp) ++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) +{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_allocation_context ac; -+ int i, group, block, cr, err = 0; -+ struct ext3_group_desc *gdp; -+ struct ext3_super_block *es; -+ struct buffer_head *gdp_bh; -+ struct ext3_sb_info *sbi; -+ struct super_block *sb; -+ struct ext3_buddy e3b; ++ int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_super_block *es = sbi->s_es; ++ struct ext3_free_extent ex; ++ ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ++ unsigned long start; ++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + ++ ex.fe_start + le32_to_cpu(es->s_first_data_block)); ++ if (start % sbi->s_stripe == 0) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ } else if (max >= ac->ac_g_ex.fe_len) { ++ J_ASSERT(ex.fe_len > 0); 
++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++/* ++ * The routine scans buddy structures (not bitmap!) from given order ++ * to max order and tries to find big enough chunk to satisfy the req ++ */ ++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_group_info *grp = e3b->bd_info; ++ void *buddy; ++ int i, k, max; ++ ++ J_ASSERT(ac->ac_2order > 0); ++ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { ++ if (grp->bb_counters[i] == 0) ++ continue; ++ ++ buddy = mb_find_buddy(e3b, i, &max); ++ if (buddy == NULL) { ++ printk(KERN_ALERT "looking for wrong order?\n"); ++ break; ++ } ++ ++ k = mb_find_next_zero_bit(buddy, max, 0); ++ J_ASSERT(k < max); ++ ++ ac->ac_found++; ++ ++ ac->ac_b_ex.fe_len = 1 << i; ++ ac->ac_b_ex.fe_start = k << i; ++ ac->ac_b_ex.fe_group = e3b->bd_group; ++ ++ ext3_mb_use_best_found(ac, e3b); ++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); ++ ++ if (unlikely(ext3_mb_stats)) ++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); ++ ++ break; ++ } ++} ++ ++/* ++ * The routine scans the group and measures all found extents. ++ * In order to optimize scanning, caller must pass number of ++ * free blocks in the group, so the routine can know upper limit. 
++ */ ++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ int i, free; ++ ++ free = e3b->bd_info->bb_free; ++ J_ASSERT(free > 0); ++ ++ i = e3b->bd_info->bb_first_free; ++ ++ while (free && ac->ac_status == AC_STATUS_CONTINUE) { ++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ if (i >= sb->s_blocksize * 8) { ++ J_ASSERT(free == 0); ++ break; ++ } ++ ++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(free >= ex.fe_len); ++ ++ ext3_mb_measure_extent(ac, &ex, e3b); ++ ++ i += ex.fe_len; ++ free -= ex.fe_len; ++ } ++} ++ ++/* ++ * This is a special case for storages like raid5 ++ * we try to find stripe-aligned chunks for stripe-size requests ++ */ ++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ unsigned long i, max; ++ ++ J_ASSERT(sbi->s_stripe != 0); ++ ++ /* find first stripe-aligned block */ ++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + le32_to_cpu(sbi->s_es->s_first_data_block); ++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; ++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) ++ % EXT3_BLOCKS_PER_GROUP(sb); ++ ++ while (i < sb->s_blocksize * 8) { ++ if (!mb_test_bit(i, bitmap)) { ++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); ++ if (max >= sbi->s_stripe) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ break; ++ } ++ } ++ i += sbi->s_stripe; ++ } ++} ++ ++static int ext3_mb_good_group(struct ext3_allocation_context *ac, ++ int group, int cr) ++{ ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); ++ unsigned free, fragments, i, bits; ++ ++ 
J_ASSERT(cr >= 0 && cr < 4); ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); ++ ++ free = grp->bb_free; ++ fragments = grp->bb_fragments; ++ if (free == 0) ++ return 0; ++ if (fragments == 0) ++ return 0; ++ ++ switch (cr) { ++ case 0: ++ J_ASSERT(ac->ac_2order != 0); ++ bits = ac->ac_sb->s_blocksize_bits + 1; ++ for (i = ac->ac_2order; i <= bits; i++) ++ if (grp->bb_counters[i] > 0) ++ return 1; ++ break; ++ case 1: ++ if ((free / fragments) >= ac->ac_g_ex.fe_len) ++ return 1; ++ break; ++ case 2: ++ if (free >= ac->ac_g_ex.fe_len) ++ return 1; ++ break; ++ case 3: ++ return 1; ++ default: ++ BUG(); ++ } ++ ++ return 0; ++} ++ ++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *len, int flags, int *errp) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_allocation_context ac; ++ int i, group, block, cr, err = 0; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ struct buffer_head *gdp_bh; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; + + J_ASSERT(len != NULL); + J_ASSERT(*len > 0); @@ -760,7 +1670,13 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + sbi = EXT3_SB(sb); + es = EXT3_SB(sb)->s_es; + -+ if (!(flags & 2)) { ++ /* ++ * We can't allocate > group size ++ */ ++ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) ++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { + /* someone asks for non-reserved blocks */ + BUG_ON(*len > 1); + err = ext3_mb_reserve_blocks(sb, 1); @@ -770,6 +1686,9 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + } + } + ++ ac.ac_buddy_page = NULL; ++ ac.ac_bitmap_page = NULL; ++ + /* + * Check quota for allocation of this blocks. 
+ */ @@ -791,62 +1710,154 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + EXT3_BLOCKS_PER_GROUP(sb)); + + /* set up allocation goals */ -+ ac.ac_b_group = ac.ac_b_start = ac.ac_b_len = 0; -+ ac.ac_status = 0; ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; + ac.ac_groups_scanned = 0; ++ ac.ac_ex_scanned = 0; ++ ac.ac_found = 0; + ac.ac_sb = inode->i_sb; -+ ac.ac_g_group = group; -+ ac.ac_g_start = block; -+ ac.ac_g_len = *len; -+ ac.ac_g_flags = flags; ++ ac.ac_g_ex.fe_group = group; ++ ac.ac_g_ex.fe_start = block; ++ ac.ac_g_ex.fe_len = *len; ++ ac.ac_flags = flags; ++ ac.ac_2order = 0; ++ ac.ac_criteria = 0; ++ ++ if (*len == 1 && sbi->s_stripe) { ++ /* looks like a metadata, let's use a dirty hack for raid5 ++ * move all metadata in first groups in hope to hit cached ++ * sectors and thus avoid read-modify cycles in raid5 */ ++ ac.ac_g_ex.fe_group = group = 0; ++ } ++ ++ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ ++ i = ffs(*len); ++ if (i >= ext3_mb_order2_reqs) { ++ i--; ++ if ((*len & (~(1 << i))) == 0) ++ ac.ac_2order = i; ++ } + -+ /* loop over the groups */ -+ for (cr = 0; cr < 3 && ac.ac_status != AC_STATUS_FOUND; cr++) { ++ /* first, try the goal */ ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; ++ ++ /* Let's just scan groups to find more-less suitable blocks */ ++ cr = ac.ac_2order ? 
0 : 1; ++repeat: ++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ ac.ac_criteria = cr; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { + if (group == EXT3_SB(sb)->s_groups_count) + group = 0; + ++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { ++ /* we need full data about the group ++ * to make a good selection */ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ext3_mb_release_desc(&e3b); ++ } ++ + /* check is group good for our criteries */ -+ if (!mb_good_group(&ac, group, cr)) ++ if (!ext3_mb_good_group(&ac, group, cr)) + continue; + -+ err = ext3_mb_load_desc(ac.ac_sb, group, &e3b); ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); + if (err) + goto out_err; + + ext3_lock_group(sb, group); -+ if (!mb_good_group(&ac, group, cr)) { ++ if (!ext3_mb_good_group(&ac, group, cr)) { + /* someone did allocation from this group */ + ext3_unlock_group(sb, group); + ext3_mb_release_desc(&e3b); + continue; + } + -+ err = ext3_mb_new_in_group(&ac, &e3b, group); ++ ac.ac_groups_scanned++; ++ if (cr == 0) ++ ext3_mb_simple_scan_group(&ac, &e3b); ++ else if (cr == 1 && *len == sbi->s_stripe) ++ ext3_mb_scan_aligned(&ac, &e3b); ++ else ++ ext3_mb_complex_scan_group(&ac, &e3b); ++ + ext3_unlock_group(sb, group); -+ if (ac.ac_status == AC_STATUS_FOUND) -+ ext3_mb_dirty_buddy(&e3b); ++ + ext3_mb_release_desc(&e3b); -+ if (err) -+ goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) ++ ++ if (ac.ac_status != AC_STATUS_CONTINUE) + break; + } + } + ++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ /* ++ * We've been searching too long. 
Let's try to allocate ++ * the best chunk we've found so far ++ */ ++ ++ /*if (ac.ac_found > ext3_mb_max_to_scan) ++ printk(KERN_DEBUG "EXT3-fs: too long searching at " ++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, ++ ac.ac_g_ex.fe_len);*/ ++ ext3_mb_try_best_found(&ac, &e3b); ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * Someone more lucky has already allocated it. ++ * The only thing we can do is just take first ++ * found block(s) ++ printk(KERN_DEBUG "EXT3-fs: someone won our chunk\n"); ++ */ ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_flags |= EXT3_MB_HINT_FIRST; ++ cr = 3; ++ goto repeat; ++ } ++ } ++ + if (ac.ac_status != AC_STATUS_FOUND) { -+ /* unfortunately, we can't satisfy this request */ -+ J_ASSERT(ac.ac_b_len == 0); ++ /* ++ * We aren't lucky definitely ++ */ + DQUOT_FREE_BLOCK(inode, *len); + *errp = -ENOSPC; + block = 0; ++#if 1 ++ printk(KERN_ERR "EXT3-fs: can't allocate: status %d flags %d\n", ++ ac.ac_status, ac.ac_flags); ++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d cr %d\n", ++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, ++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); ++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", ++ sbi->s_blocks_reserved, ac.ac_found); ++ printk("EXT3-fs: groups: "); ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) ++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); ++ printk("\n"); ++#endif + goto out; + } + ++found: ++ J_ASSERT(ac.ac_b_ex.fe_len > 0); ++ + /* good news - free block(s) have been found. 
now it's time + * to mark block(s) in good old journaled bitmap */ -+ block = ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + ac.ac_b_start + le32_to_cpu(es->s_first_data_block); ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); + + /* we made a desicion, now mark found blocks in good old + * bitmap to be journaled */ @@ -854,7 +1865,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + ext3_debug("using block group %d(%d)\n", + ac.ac_b_group.group, gdp->bg_free_blocks_count); + -+ bitmap_bh = read_block_bitmap(sb, ac.ac_b_group); ++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); + if (!bitmap_bh) { + *errp = -EIO; + goto out_err; @@ -866,18 +1877,19 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + goto out_err; + } + -+ gdp = ext3_get_group_desc(sb, ac.ac_b_group, &gdp_bh); ++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); + if (!gdp) { + *errp = -EIO; + goto out_err; + } -+ ++ + err = ext3_journal_get_write_access(handle, gdp_bh); + if (err) + goto out_err; + -+ block = ac.ac_b_start + ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + le32_to_cpu(es->s_first_data_block); ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); + + if (block == le32_to_cpu(gdp->bg_block_bitmap) || + block == le32_to_cpu(gdp->bg_inode_bitmap) || @@ -886,18 +1898,18 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + ext3_error(sb, "ext3_new_block", + "Allocating block in system zone - " + "block = %u", block); -+#if 0 -+ for (i = 0; i < ac.ac_b_len; i++) -+ J_ASSERT(!mb_test_bit(ac.ac_b_start + i, bitmap_bh->b_data)); ++#ifdef AGGRESSIVE_CHECK ++ for (i = 0; i < ac.ac_b_ex.fe_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); +#endif -+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_start, ac.ac_b_len); ++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); + -+ ext3_lock_group(sb, 
ac.ac_b_group); ++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); + gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - -+ ac.ac_b_len); -+ ext3_unlock_group(sb, ac.ac_b_group); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, -ac.ac_b_len); ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) ++ - ac.ac_b_ex.fe_len); ++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); + + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (err) @@ -911,10 +1923,11 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + brelse(bitmap_bh); + + /* drop non-allocated, but dquote'd blocks */ -+ J_ASSERT(*len >= ac.ac_b_len); -+ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_len); ++ J_ASSERT(*len >= ac.ac_b_ex.fe_len); ++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); + -+ *len = ac.ac_b_len; ++ *len = ac.ac_b_ex.fe_len; ++ J_ASSERT(*len > 0); + J_ASSERT(block != 0); + goto out; + @@ -929,7 +1942,12 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + *errp = err; + block = 0; +out: -+ if (!(flags & 2)) { ++ if (ac.ac_buddy_page) ++ page_cache_release(ac.ac_buddy_page); ++ if (ac.ac_bitmap_page) ++ page_cache_release(ac.ac_bitmap_page); ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { + /* block wasn't reserved before and we reserved it + * at the beginning of allocation. 
it doesn't matter + * whether we allocated anything or we failed: time @@ -938,250 +1956,625 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + * path only, here is single block always */ + ext3_mb_release_blocks(sb, 1); + } ++ ++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { ++ atomic_inc(&sbi->s_bal_reqs); ++ atomic_add(*len, &sbi->s_bal_allocated); ++ if (*len >= ac.ac_g_ex.fe_len) ++ atomic_inc(&sbi->s_bal_success); ++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); ++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && ++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) ++ atomic_inc(&sbi->s_bal_goals); ++ if (ac.ac_found > ext3_mb_max_to_scan) ++ atomic_inc(&sbi->s_bal_breaks); ++ } ++ ++ ext3_mb_store_history(sb, inode->i_ino, &ac); ++ + return block; +} ++EXPORT_SYMBOL(ext3_mb_new_blocks); + -+int ext3_mb_generate_buddy(struct super_block *sb, int group) -+{ -+ struct buffer_head *bh; -+ int i, err, count = 0; -+ struct ext3_buddy e3b; -+ -+ err = ext3_mb_load_desc(sb, group, &e3b); -+ if (err) -+ goto out; -+ memset(e3b.bd_bh->b_data, 0, sb->s_blocksize); -+ memset(e3b.bd_bh2->b_data, 0, sb->s_blocksize); ++#ifdef EXT3_MB_HISTORY ++struct ext3_mb_proc_session { ++ struct ext3_mb_history *history; ++ struct super_block *sb; ++ int start; ++ int max; ++}; + -+ bh = read_block_bitmap(sb, group); -+ if (bh == NULL) { -+ err = -EIO; -+ goto out2; ++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, ++ struct ext3_mb_history *hs, ++ int first) ++{ ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (!first && hs == s->history + s->start) ++ return NULL; ++ while (hs->goal.fe_len == 0) { ++ hs++; ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (hs == s->history + s->start) ++ return NULL; + } ++ return hs; ++} + -+ /* loop over the blocks, nad create buddies for free ones */ -+ for (i = 0; i < sb->s_blocksize * 8; i++) { -+ if (!mb_test_bit(i, (void *) bh->b_data)) { -+ mb_free_blocks(&e3b, i, 1); -+ count++; -+ } 
++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs; ++ int l = *pos; ++ ++ if (l == 0) ++ return SEQ_START_TOKEN; ++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ if (!hs) ++ return NULL; ++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs = v; ++ ++ ++*pos; ++ if (v == SEQ_START_TOKEN) ++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ else ++ return ext3_mb_history_skip_empty(s, ++hs, 0); ++} ++ ++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) ++{ ++ struct ext3_mb_history *hs = v; ++ char buf[20], buf2[20]; ++ ++ if (v == SEQ_START_TOKEN) { ++ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "pid", "inode", "goal", "result", "found", "grps", "cr", ++ "merge", "tail", "broken"); ++ return 0; + } -+ brelse(bh); -+ mb_check_buddy(&e3b); -+ ext3_mb_dirty_buddy(&e3b); + -+out2: -+ ext3_mb_release_desc(&e3b); -+out: -+ return err; ++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, ++ hs->goal.fe_start, hs->goal.fe_len); ++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, ++ hs->result.fe_start, hs->result.fe_len); ++ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", ++ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, ++ hs->cr, hs->merged ? "M" : "", hs->tail, ++ hs->buddy ? 
1 << hs->buddy : 0); ++ return 0; +} + -+EXPORT_SYMBOL(ext3_mb_new_blocks); ++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) ++{ ++} + -+#define MB_CREDITS \ -+ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \ -+ + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS) ++static struct seq_operations ext3_mb_seq_history_ops = { ++ .start = ext3_mb_seq_history_start, ++ .next = ext3_mb_seq_history_next, ++ .stop = ext3_mb_seq_history_stop, ++ .show = ext3_mb_seq_history_show, ++}; + -+int ext3_mb_init_backend(struct super_block *sb) ++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) +{ -+ struct inode *root = sb->s_root->d_inode; ++ struct super_block *sb = PDE(inode)->data; + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct dentry *db; -+ tid_t target; -+ int err, i; -+ -+ sbi->s_buddy_blocks = kmalloc(sizeof(struct ext3_buddy_group_blocks) * -+ sbi->s_groups_count, GFP_KERNEL); -+ if (sbi->s_buddy_blocks == NULL) { -+ printk("can't allocate mem for buddy maps\n"); -+ return -ENOMEM; -+ } -+ memset(sbi->s_buddy_blocks, 0, -+ sizeof(struct ext3_buddy_group_blocks) * sbi->s_groups_count); -+ sbi->s_buddy = NULL; -+ -+ down(&root->i_sem); -+ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, -+ strlen(EXT3_BUDDY_FILE)); -+ if (IS_ERR(db)) { -+ err = PTR_ERR(db); -+ printk("can't lookup buddy file: %d\n", err); -+ goto out; ++ struct ext3_mb_proc_session *s; ++ int rc, size; ++ ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (s == NULL) ++ return -EIO; ++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; ++ s->history = kmalloc(size, GFP_KERNEL); ++ if (s == NULL) { ++ kfree(s); ++ return -EIO; + } + -+ if (db->d_inode != NULL) { -+ sbi->s_buddy = igrab(db->d_inode); -+ goto map; -+ } ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(s->history, sbi->s_mb_history, size); ++ s->max = sbi->s_mb_history_max; ++ s->start = sbi->s_mb_history_cur % s->max; ++ spin_unlock(&sbi->s_mb_history_lock); + -+ err = ext3_create(root, 
db, S_IFREG, NULL); -+ if (err) { -+ printk("error while creation buddy file: %d\n", err); ++ rc = seq_open(file, &ext3_mb_seq_history_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = s; + } else { -+ sbi->s_buddy = igrab(db->d_inode); ++ kfree(s->history); ++ kfree(s); + } ++ return rc; + -+map: -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ struct buffer_head *bh = NULL; -+ handle_t *handle; ++} + -+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); -+ if (IS_ERR(handle)) { -+ err = PTR_ERR(handle); -+ goto out2; -+ } -+ -+ /* allocate block for bitmap */ -+ bh = ext3_getblk(handle, sbi->s_buddy, i * 2, 1, &err); -+ if (bh == NULL) { -+ printk("can't get block for buddy bitmap: %d\n", err); -+ goto out2; -+ } -+ sbi->s_buddy_blocks[i].bb_bitmap = bh->b_blocknr; -+ brelse(bh); -+ -+ /* allocate block for buddy */ -+ bh = ext3_getblk(handle, sbi->s_buddy, i * 2 + 1, 1, &err); -+ if (bh == NULL) { -+ printk("can't get block for buddy: %d\n", err); -+ goto out2; -+ } -+ sbi->s_buddy_blocks[i].bb_buddy = bh->b_blocknr; -+ brelse(bh); -+ ext3_journal_stop(handle); -+ spin_lock_init(&sbi->s_buddy_blocks[i].bb_lock); -+ sbi->s_buddy_blocks[i].bb_md_cur = NULL; -+ sbi->s_buddy_blocks[i].bb_tid = 0; -+ } ++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) ++{ ++ struct seq_file *seq = (struct seq_file *)file->private_data; ++ struct ext3_mb_proc_session *s = seq->private; ++ kfree(s->history); ++ kfree(s); ++ return seq_release(inode, file); ++} + -+ if (journal_start_commit(sbi->s_journal, &target)) -+ log_wait_commit(sbi->s_journal, target); ++static struct file_operations ext3_mb_seq_history_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, ++}; + -+out2: -+ dput(db); -+out: -+ up(&root->i_sem); -+ return err; ++static void *ext3_mb_seq_groups_start(struct seq_file *seq, 
loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ long group; ++ ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ ++ group = *pos + 1; ++ return (void *) group; +} + -+int ext3_mb_release(struct super_block *sb) ++static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) +{ ++ struct super_block *sb = seq->private; + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ /* release freed, non-committed blocks */ -+ spin_lock(&sbi->s_md_lock); -+ list_splice_init(&sbi->s_closed_transaction, -+ &sbi->s_committed_transaction); -+ list_splice_init(&sbi->s_active_transaction, -+ &sbi->s_committed_transaction); -+ spin_unlock(&sbi->s_md_lock); -+ ext3_mb_free_committed_blocks(sb); ++ long group; + -+ if (sbi->s_buddy_blocks) -+ kfree(sbi->s_buddy_blocks); -+ if (sbi->s_buddy) -+ iput(sbi->s_buddy); -+ if (sbi->s_blocks_reserved) -+ printk("ext3-fs: %ld blocks being reserved at umount!\n", -+ sbi->s_blocks_reserved); -+ return 0; ++ ++*pos; ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ group = *pos + 1; ++ return (void *) group;; +} + -+int ext3_mb_init(struct super_block *sb) ++static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) +{ -+ struct ext3_super_block *es; -+ int i; ++ struct super_block *sb = seq->private; ++ long group = (long) v, i; ++ struct sg { ++ struct ext3_group_info info; ++ unsigned short counters[16]; ++ } sg; ++ ++ group--; ++ if (group == 0) ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", ++ "group", "free", "frags", "first", "2^0", "2^1", "2^2", ++ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", ++ "2^11", "2^12", "2^13"); ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + ++ sizeof(struct ext3_group_info); ++ ext3_lock_group(sb, group); ++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); 
++ ext3_unlock_group(sb, group); + -+ if (!test_opt(sb, MBALLOC)) ++ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) + return 0; + -+ /* init file for buddy data */ -+ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ ext3_mb_init_backend(sb); ++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, ++ sg.info.bb_fragments, sg.info.bb_first_free); ++ for (i = 0; i <= 13; i++) ++ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? ++ sg.info.bb_counters[i] : 0); ++ seq_printf(seq, " ]\n"); + -+ es = EXT3_SB(sb)->s_es; -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ ext3_mb_generate_buddy(sb, i); -+ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); -+ spin_lock_init(&EXT3_SB(sb)->s_md_lock); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); -+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); -+ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); -+ printk("EXT3-fs: mballoc enabled\n"); + return 0; +} + -+void ext3_mb_free_committed_blocks(struct super_block *sb) ++static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) +{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int err, i, count = 0, count2 = 0; -+ struct ext3_free_metadata *md; -+ struct ext3_buddy e3b; ++} + -+ if (list_empty(&sbi->s_committed_transaction)) -+ return; ++static struct seq_operations ext3_mb_seq_groups_ops = { ++ .start = ext3_mb_seq_groups_start, ++ .next = ext3_mb_seq_groups_next, ++ .stop = ext3_mb_seq_groups_stop, ++ .show = ext3_mb_seq_groups_show, ++}; + -+ /* there is committed blocks to be freed yet */ -+ do { -+ /* get next array of blocks */ -+ md = NULL; -+ spin_lock(&sbi->s_md_lock); -+ if (!list_empty(&sbi->s_committed_transaction)) { -+ md = list_entry(sbi->s_committed_transaction.next, -+ struct ext3_free_metadata, list); -+ list_del(&md->list); -+ } -+ spin_unlock(&sbi->s_md_lock); ++static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = 
PDE(inode)->data; ++ int rc; + -+ if (md == NULL) -+ break; ++ rc = seq_open(file, &ext3_mb_seq_groups_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = sb; ++ } ++ return rc; + -+ mb_debug("gonna free %u blocks in group %u (0x%p):", -+ md->num, md->group, md); ++} + -+ err = ext3_mb_load_desc(sb, md->group, &e3b); -+ BUG_ON(err != 0); ++static struct file_operations ext3_mb_seq_groups_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; + -+ /* there are blocks to put in buddy to make them really free */ -+ count += md->num; -+ count2++; -+ ext3_lock_group(sb, md->group); -+ for (i = 0; i < md->num; i++) { -+ mb_debug(" %u", md->blocks[i]); -+ mb_free_blocks(&e3b, md->blocks[i], 1); -+ } -+ mb_debug("\n"); -+ ext3_unlock_group(sb, md->group); ++static void ext3_mb_history_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; + -+ kfree(md); -+ ext3_mb_dirty_buddy(&e3b); -+ ext3_mb_release_desc(&e3b); ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_groups", sbi->s_mb_proc); ++ remove_proc_entry("mb_history", sbi->s_mb_proc); ++ remove_proc_entry(name, proc_root_ext3); + -+ } while (md); -+ mb_debug("freed %u blocks in %u structures\n", count, count2); ++ if (sbi->s_mb_history) ++ kfree(sbi->s_mb_history); +} + -+void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) ++static void ext3_mb_history_init(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ int i; + -+ if (sbi->s_last_transaction == handle->h_transaction->t_tid) -+ return; ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); ++ if (sbi->s_mb_proc != NULL) { ++ struct proc_dir_entry *p; ++ p = create_proc_entry("mb_history", S_IRUGO, 
sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_history_fops; ++ p->data = sb; ++ } ++ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_groups_fops; ++ p->data = sb; ++ } ++ } + -+ /* new transaction! time to close last one and free blocks for -+ * committed transaction. we know that only transaction can be -+ * active, so previos transaction can be being logged and we -+ * know that transaction before previous is known to be alreade -+ * logged. this means that now we may free blocks freed in all -+ * transactions before previous one. hope I'm clear enough ... */ ++ sbi->s_mb_history_max = 1000; ++ sbi->s_mb_history_cur = 0; ++ spin_lock_init(&sbi->s_mb_history_lock); ++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); ++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); ++ memset(sbi->s_mb_history, 0, i); ++ /* if we can't allocate history, then we simple won't use it */ ++} + -+ spin_lock(&sbi->s_md_lock); -+ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { ++static void ++ext3_mb_store_history(struct super_block *sb, unsigned ino, ++ struct ext3_allocation_context *ac) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_history h; ++ ++ if (likely(sbi->s_mb_history == NULL)) ++ return; ++ ++ h.pid = current->pid; ++ h.ino = ino; ++ h.goal = ac->ac_g_ex; ++ h.result = ac->ac_b_ex; ++ h.found = ac->ac_found; ++ h.cr = ac->ac_criteria; ++ h.groups = ac->ac_groups_scanned; ++ h.tail = ac->ac_tail; ++ h.buddy = ac->ac_buddy; ++ h.merged = 0; ++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) ++ h.merged = 1; ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); ++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) ++ sbi->s_mb_history_cur = 0; ++ spin_unlock(&sbi->s_mb_history_lock); ++} ++ ++#else ++#define ext3_mb_history_release(sb) ++#define 
ext3_mb_history_init(sb) ++#endif ++ ++int ext3_mb_init_backend(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i, j, len, metalen; ++ int num_meta_group_infos = ++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ struct ext3_group_info **meta_group_info; ++ ++ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte ++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. ++ * So a two level scheme suffices for now. */ ++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * ++ num_meta_group_infos, GFP_KERNEL); ++ if (sbi->s_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); ++ return -ENOMEM; ++ } ++ sbi->s_buddy_cache = new_inode(sb); ++ if (sbi->s_buddy_cache == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); ++ goto err_freesgi; ++ } ++ ++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) { ++ if ((i + 1) == num_meta_group_infos) ++ metalen = sizeof(*meta_group_info) * ++ (sbi->s_groups_count - ++ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); ++ meta_group_info = kmalloc(metalen, GFP_KERNEL); ++ if (meta_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " ++ "buddy group\n"); ++ goto err_freemeta; ++ } ++ sbi->s_group_info[i] = meta_group_info; ++ } ++ ++ /* ++ * calculate needed size. 
if change bb_counters size, ++ * don't forget about ext3_mb_generate_buddy() ++ */ ++ len = sizeof(struct ext3_group_info); ++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ struct ext3_group_desc * desc; ++ ++ meta_group_info = ++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; ++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); ++ ++ meta_group_info[j] = kmalloc(len, GFP_KERNEL); ++ if (meta_group_info[j] == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); ++ i--; ++ goto err_freebuddy; ++ } ++ desc = ext3_get_group_desc(sb, i, NULL); ++ if (desc == NULL) { ++ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); ++ goto err_freebuddy; ++ } ++ memset(meta_group_info[j], 0, len); ++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, ++ &meta_group_info[j]->bb_state); ++ meta_group_info[j]->bb_free = ++ le16_to_cpu(desc->bg_free_blocks_count); ++ } ++ ++ return 0; ++ ++err_freebuddy: ++ while (i >= 0) { ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ i--; ++ } ++ i = num_meta_group_infos; ++err_freemeta: ++ while (--i >= 0) ++ kfree(sbi->s_group_info[i]); ++ iput(sbi->s_buddy_cache); ++err_freesgi: ++ kfree(sbi->s_group_info); ++ return -ENOMEM; ++} ++ ++int ext3_mb_init(struct super_block *sb, int needs_recovery) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *root = sb->s_root->d_inode; ++ unsigned i, offset, max; ++ struct dentry *dentry; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); ++ ++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_offsets == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ return -ENOMEM; ++ } ++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_maxs == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ ++ /* order 0 is regular bitmap */ ++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; ++ sbi->s_mb_offsets[0] = 0; ++ ++ i = 1; ++ 
offset = 0; ++ max = sb->s_blocksize << 2; ++ do { ++ sbi->s_mb_offsets[i] = offset; ++ sbi->s_mb_maxs[i] = max; ++ offset += 1 << (sb->s_blocksize_bits - i); ++ max = max >> 1; ++ i++; ++ } while (i <= sb->s_blocksize_bits + 1); ++ ++ /* init file for buddy data */ ++ if ((i = ext3_mb_init_backend(sb))) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return i; ++ } ++ ++ spin_lock_init(&sbi->s_reserve_lock); ++ spin_lock_init(&sbi->s_md_lock); ++ INIT_LIST_HEAD(&sbi->s_active_transaction); ++ INIT_LIST_HEAD(&sbi->s_closed_transaction); ++ INIT_LIST_HEAD(&sbi->s_committed_transaction); ++ spin_lock_init(&sbi->s_bal_lock); ++ ++ /* remove old on-disk buddy file */ ++ down(&root->i_sem); ++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy")); ++ if (dentry->d_inode != NULL) { ++ i = vfs_unlink(root, dentry); ++ if (i != 0) ++ printk("EXT3-fs: can't remove .buddy file: %d\n", i); ++ } ++ dput(dentry); ++ up(&root->i_sem); ++ ++ ext3_mb_history_init(sb); ++ ++ printk("EXT3-fs: mballoc enabled\n"); ++ return 0; ++} ++ ++int ext3_mb_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i, num_meta_group_infos; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* release freed, non-committed blocks */ ++ spin_lock(&sbi->s_md_lock); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_committed_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ ext3_mb_free_committed_blocks(sb); ++ ++ if (sbi->s_group_info) { ++ for (i = 0; i < sbi->s_groups_count; i++) ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ num_meta_group_infos = (sbi->s_groups_count + ++ EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) ++ kfree(sbi->s_group_info[i]); ++ kfree(sbi->s_group_info); ++ } ++ if (sbi->s_mb_offsets) ++ kfree(sbi->s_mb_offsets); ++ if (sbi->s_mb_maxs) 
++ kfree(sbi->s_mb_maxs); ++ if (sbi->s_buddy_cache) ++ iput(sbi->s_buddy_cache); ++ if (sbi->s_blocks_reserved) ++ printk("ext3-fs: %ld blocks being reserved at umount!\n", ++ sbi->s_blocks_reserved); ++ if (ext3_mb_stats) { ++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", ++ atomic_read(&sbi->s_bal_allocated), ++ atomic_read(&sbi->s_bal_reqs), ++ atomic_read(&sbi->s_bal_success)); ++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " ++ "%u 2^N hits, %u breaks\n", ++ atomic_read(&sbi->s_bal_ex_scanned), ++ atomic_read(&sbi->s_bal_goals), ++ atomic_read(&sbi->s_bal_2orders), ++ atomic_read(&sbi->s_bal_breaks)); ++ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", ++ sbi->s_mb_buddies_generated++, ++ sbi->s_mb_generation_time); ++ } ++ ++ ext3_mb_history_release(sb); ++ ++ return 0; ++} ++ ++void ext3_mb_free_committed_blocks(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int err, i, count = 0, count2 = 0; ++ struct ext3_free_metadata *md; ++ struct ext3_buddy e3b; ++ ++ if (list_empty(&sbi->s_committed_transaction)) ++ return; ++ ++ /* there is committed blocks to be freed yet */ ++ do { ++ /* get next array of blocks */ ++ md = NULL; ++ spin_lock(&sbi->s_md_lock); ++ if (!list_empty(&sbi->s_committed_transaction)) { ++ md = list_entry(sbi->s_committed_transaction.next, ++ struct ext3_free_metadata, list); ++ list_del(&md->list); ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ if (md == NULL) ++ break; ++ ++ mb_debug("gonna free %u blocks in group %u (0x%p):", ++ md->num, md->group, md); ++ ++ err = ext3_mb_load_buddy(sb, md->group, &e3b); ++ /* we expect to find existing buddy because it's pinned */ ++ BUG_ON(err != 0); ++ ++ /* there are blocks to put in buddy to make them really free */ ++ count += md->num; ++ count2++; ++ ext3_lock_group(sb, md->group); ++ for (i = 0; i < md->num; i++) { ++ mb_debug(" %u", md->blocks[i]); ++ mb_free_blocks(&e3b, md->blocks[i], 1); ++ } ++ mb_debug("\n"); ++ 
ext3_unlock_group(sb, md->group); ++ ++ /* balance refcounts from ext3_mb_free_metadata() */ ++ page_cache_release(e3b.bd_buddy_page); ++ page_cache_release(e3b.bd_bitmap_page); ++ ++ kfree(md); ++ ext3_mb_release_desc(&e3b); ++ ++ } while (md); ++ mb_debug("freed %u blocks in %u structures\n", count, count2); ++} ++ ++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ if (sbi->s_last_transaction == handle->h_transaction->t_tid) ++ return; ++ ++ /* new transaction! time to close last one and free blocks for ++ * committed transaction. we know that only transaction can be ++ * active, so previos transaction can be being logged and we ++ * know that transaction before previous is known to be already ++ * logged. this means that now we may free blocks freed in all ++ * transactions before previous one. hope I'm clear enough ... */ ++ ++ spin_lock(&sbi->s_md_lock); ++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { + mb_debug("new transaction %lu, old %lu\n", + (unsigned long) handle->h_transaction->t_tid, + (unsigned long) sbi->s_last_transaction); @@ -1199,12 +2592,15 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, + int group, int block, int count) +{ -+ struct ext3_buddy_group_blocks *db = e3b->bd_bd; ++ struct ext3_group_info *db = e3b->bd_info; + struct super_block *sb = e3b->bd_sb; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_free_metadata *md; + int i; + ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ + ext3_lock_group(sb, group); + for (i = 0; i < count; i++) { + md = db->bb_md_cur; @@ -1226,6 +2622,12 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + spin_lock(&sbi->s_md_lock); + list_add(&md->list, &sbi->s_active_transaction); + spin_unlock(&sbi->s_md_lock); ++ /* protect buddy cache from being freed, ++ * otherwise we'll refresh it from ++ * on-disk 
bitmap and lose not-yet-available ++ * blocks */ ++ page_cache_get(e3b->bd_buddy_page); ++ page_cache_get(e3b->bd_bitmap_page); + db->bb_md_cur = md; + db->bb_tid = handle->h_transaction->t_tid; + mb_debug("new md 0x%p for group %u\n", @@ -1249,7 +2651,8 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +} + +void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, -+ unsigned long block, unsigned long count, int metadata) ++ unsigned long block, unsigned long count, ++ int metadata, int *freed) +{ + struct buffer_head *bitmap_bh = NULL; + struct ext3_group_desc *gdp; @@ -1262,6 +2665,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + struct ext3_buddy e3b; + int err = 0, ret; + ++ *freed = 0; + sb = inode->i_sb; + if (!sb) { + printk ("ext3_free_blocks: nonexistent device"); @@ -1331,33 +2735,42 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + if (err) + goto error_return; + -+ err = ext3_mb_load_desc(sb, block_group, &e3b); ++ err = ext3_mb_load_buddy(sb, block_group, &e3b); + if (err) + goto error_return; + ++#ifdef AGGRESSIVE_CHECK ++ { ++ int i; ++ for (i = 0; i < count; i++) ++ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); ++ } ++#endif ++ mb_clear_bits(bitmap_bh->b_data, bit, count); ++ ++ /* We dirtied the bitmap block */ ++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ + if (metadata) { + /* blocks being freed are metadata. 
these blocks shouldn't + * be used until this transaction is committed */ + ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); -+ } else { ++ } else { + ext3_lock_group(sb, block_group); + mb_free_blocks(&e3b, bit, count); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); + ext3_unlock_group(sb, block_group); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, count); + } -+ -+ ext3_mb_dirty_buddy(&e3b); -+ ext3_mb_release_desc(&e3b); + -+ /* FIXME: undo logic will be implemented later and another way */ -+ mb_clear_bits(bitmap_bh->b_data, bit, count); -+ DQUOT_FREE_BLOCK(inode, count); ++ spin_lock(sb_bgl_lock(sbi, block_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); ++ spin_unlock(sb_bgl_lock(sbi, block_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, count); + -+ /* We dirtied the bitmap block */ -+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ ext3_mb_release_desc(&e3b); ++ ++ *freed = count; + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); @@ -1406,7 +2819,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +} + +int ext3_new_block(handle_t *handle, struct inode *inode, -+ unsigned long goal, int *errp) ++ unsigned long goal, int *errp) +{ + int ret, len; + @@ -1424,315 +2837,275 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c +void ext3_free_blocks(handle_t *handle, struct inode * inode, + unsigned long block, unsigned long count, int metadata) +{ -+ if (!test_opt(inode->i_sb, MBALLOC)) ++ struct super_block *sb; ++ int freed; ++ ++ sb = inode->i_sb; ++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) + ext3_free_blocks_old(handle, inode, block, count); -+ else -+ ext3_mb_free_blocks(handle, inode, block, count, metadata); ++ else { ++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); ++ if (freed) ++ 
DQUOT_FREE_BLOCK(inode, freed); ++ } + return; +} + -Index: linux-2.6.5-sles9/fs/ext3/super.c -=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/super.c 2004-11-09 02:23:21.597220752 +0300 -+++ linux-2.6.5-sles9/fs/ext3/super.c 2004-11-09 02:26:12.572228600 +0300 -@@ -389,6 +389,7 @@ - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_mb_release(sb); - ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); -@@ -542,7 +543,7 @@ - Opt_commit, Opt_journal_update, Opt_journal_inum, - Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -- Opt_err, Opt_extents, Opt_extdebug -+ Opt_err, Opt_extents, Opt_extdebug, Opt_mballoc, - }; - - static match_table_t tokens = { -@@ -589,6 +590,7 @@ - {Opt_iopen_nopriv, "iopen_nopriv"}, - {Opt_extents, "extents"}, - {Opt_extdebug, "extdebug"}, -+ {Opt_mballoc, "mballoc"}, - {Opt_err, NULL} - }; - -@@ -810,6 +812,9 @@ - case Opt_extdebug: - set_opt (sbi->s_mount_opt, EXTDEBUG); - break; -+ case Opt_mballoc: -+ set_opt (sbi->s_mount_opt, MBALLOC); -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1463,7 +1468,8 @@ - ext3_count_dirs(sb)); - - ext3_ext_init(sb); -- -+ ext3_mb_init(sb); ++#define EXT3_ROOT "ext3" ++#define EXT3_MB_STATS_NAME "mb_stats" ++#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" ++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" + - return 0; - - failed_mount3: -Index: linux-2.6.5-sles9/fs/ext3/Makefile -=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2004-11-09 02:23:21.593221360 +0300 -+++ linux-2.6.5-sles9/fs/ext3/Makefile 2004-11-09 02:26:12.572228600 +0300 -@@ -5,7 +5,7 @@ - obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ 
-- ioctl.o namei.o super.o symlink.o hash.o extents.o -+ ioctl.o namei.o super.o symlink.o hash.o extents.o mballoc.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-2.6.5-sles9/fs/ext3/balloc.c -=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/balloc.c 2004-11-03 08:36:51.000000000 +0300 -+++ linux-2.6.5-sles9/fs/ext3/balloc.c 2004-11-09 02:26:53.078070776 +0300 -@@ -78,7 +78,7 @@ - * - * Return buffer_head on success or NULL in case of failure. - */ --static struct buffer_head * -+struct buffer_head * - read_block_bitmap(struct super_block *sb, unsigned int block_group) - { - struct ext3_group_desc * desc; -@@ -274,7 +274,7 @@ - } - - /* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks(handle_t *handle, struct inode *inode, -+void ext3_free_blocks_old(handle_t *handle, struct inode *inode, - unsigned long block, unsigned long count) - { - struct buffer_head *bitmap_bh = NULL; -@@ -1142,7 +1142,7 @@ - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. - */ --int ext3_new_block(handle_t *handle, struct inode *inode, -+int ext3_new_block_old(handle_t *handle, struct inode *inode, - unsigned long goal, int *errp) - { - struct buffer_head *bitmap_bh = NULL; -Index: linux-2.6.5-sles9/fs/ext3/namei.c -=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/namei.c 2004-11-09 02:18:27.616912552 +0300 -+++ linux-2.6.5-sles9/fs/ext3/namei.c 2004-11-09 02:26:12.580227384 +0300 -@@ -1640,7 +1640,7 @@ - * If the create succeeds, we fill in the inode information - * with d_instantiate(). 
- */ --static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, -+int ext3_create (struct inode * dir, struct dentry * dentry, int mode, - struct nameidata *nd) - { - handle_t *handle; -Index: linux-2.6.5-sles9/fs/ext3/inode.c -=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2004-11-09 02:23:21.592221512 +0300 -+++ linux-2.6.5-sles9/fs/ext3/inode.c 2004-11-09 02:26:12.587226320 +0300 -@@ -572,7 +572,7 @@ - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < keys; i++) -- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); - return err; - } - -@@ -673,7 +673,7 @@ - if (err == -EAGAIN) - for (i = 0; i < num; i++) - ext3_free_blocks(handle, inode, -- le32_to_cpu(where[i].key), 1); -+ le32_to_cpu(where[i].key), 1, 1); - return err; - } - -@@ -1829,7 +1829,7 @@ - } - } - -- ext3_free_blocks(handle, inode, block_to_free, count); -+ ext3_free_blocks(handle, inode, block_to_free, count, 1); - } - - /** -@@ -2000,7 +2000,7 @@ - ext3_journal_test_restart(handle, inode); - } - -- ext3_free_blocks(handle, inode, nr, 1); -+ ext3_free_blocks(handle, inode, nr, 1, 1); - - if (parent_bh) { - /* -Index: linux-2.6.5-sles9/fs/ext3/extents.c -=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2004-11-09 02:25:56.143726112 +0300 -+++ linux-2.6.5-sles9/fs/ext3/extents.c 2004-11-09 02:26:12.591225712 +0300 -@@ -740,7 +740,7 @@ - for (i = 0; i < depth; i++) { - if (!ablocks[i]) - continue; -- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); -+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); - } - } - kfree(ablocks); -@@ -1391,7 +1391,7 @@ - path->p_idx->ei_leaf); - bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); - ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); -- ext3_free_blocks(handle, tree->inode, 
path->p_idx->ei_leaf, 1); -+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); - return err; - } - -@@ -1879,10 +1879,12 @@ - int needed = ext3_remove_blocks_credits(tree, ex, from, to); - handle_t *handle = ext3_journal_start(tree->inode, needed); - struct buffer_head *bh; -- int i; -+ int i, metadata = 0; - - if (IS_ERR(handle)) - return PTR_ERR(handle); -+ if (S_ISDIR(tree->inode->i_mode)) -+ metadata = 1; - if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { - /* tail removal */ - unsigned long num, start; -@@ -1894,7 +1896,7 @@ - bh = sb_find_get_block(tree->inode->i_sb, start + i); - ext3_forget(handle, 0, tree->inode, bh, start + i); - } -- ext3_free_blocks(handle, tree->inode, start, num); -+ ext3_free_blocks(handle, tree->inode, start, num, metadata); - } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { - printk("strange request: removal %lu-%lu from %u:%u\n", - from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.5-sles9/fs/ext3/xattr.c -=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/xattr.c 2004-11-09 02:22:55.777146000 +0300 -+++ linux-2.6.5-sles9/fs/ext3/xattr.c 2004-11-09 02:26:12.593225408 +0300 -@@ -1366,7 +1366,7 @@ - new_bh = sb_getblk(sb, block); - if (!new_bh) { - getblk_failed: -- ext3_free_blocks(handle, inode, block, 1); -+ ext3_free_blocks(handle, inode, block, 1, 1); - error = -EIO; - goto cleanup; - } -@@ -1408,7 +1408,7 @@ - if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { - /* Free the old block. 
*/ - ea_bdebug(old_bh, "freeing"); -- ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1); -+ ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1, 1); - - /* ext3_forget() calls bforget() for us, but we - let our caller release old_bh, so we need to -@@ -1504,7 +1504,7 @@ - lock_buffer(bh); - if (HDR(bh)->h_refcount == cpu_to_le32(1)) { - ext3_xattr_cache_remove(bh); -- ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1); -+ ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1, 1); - get_bh(bh); - ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); - } else { -Index: linux-2.6.5-sles9/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:25:17.238640584 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:26:12.596224952 +0300 -@@ -57,6 +57,8 @@ - #define ext3_debug(f, a...) do {} while (0) - #endif - -+#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++static int ext3_mb_stats_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; + - /* - * Special inodes numbers - */ -@@ -339,6 +341,7 @@ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -698,7 +701,7 @@ - extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); - extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); - extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, -- unsigned long); -+ unsigned long, int); - extern unsigned long ext3_count_free_blocks (struct super_block *); - extern void ext3_check_blocks_bitmap (struct super_block *); - extern struct 
ext3_group_desc * ext3_get_group_desc(struct super_block * sb, -Index: linux-2.6.5-sles9/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs_sb.h 2004-11-09 02:20:51.598024096 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs_sb.h 2004-11-09 02:28:18.753046200 +0300 -@@ -23,10 +23,30 @@ - #define EXT_INCLUDE - #include - #include -+#include - #endif - #endif - #include - -+#define EXT3_BB_MAX_BLOCKS 30 -+struct ext3_free_metadata { -+ unsigned short group; -+ unsigned short num; -+ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; -+ struct list_head list; -+}; ++ *eof = 1; ++ if (off != 0) ++ return 0; + -+#define EXT3_BB_MAX_ORDER 14 ++ len = sprintf(page, "%ld\n", ext3_mb_stats); ++ *start = page; ++ return len; ++} + -+struct ext3_buddy_group_blocks { -+ sector_t bb_bitmap; -+ sector_t bb_buddy; -+ spinlock_t bb_lock; -+ unsigned bb_counters[EXT3_BB_MAX_ORDER]; -+ struct ext3_free_metadata *bb_md_cur; -+ unsigned long bb_tid; -+}; ++static int ext3_mb_stats_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; + - /* - * third extended-fs super-block data in memory - */ -@@ -78,6 +98,17 @@ - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ - #endif ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_STATS_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } + -+ /* for buddy allocator */ -+ struct ext3_buddy_group_blocks *s_buddy_blocks; -+ struct inode *s_buddy; -+ long s_blocks_reserved; -+ spinlock_t s_reserve_lock; -+ struct list_head s_active_transaction; -+ struct list_head s_closed_transaction; -+ struct list_head s_committed_transaction; -+ spinlock_t s_md_lock; -+ tid_t s_last_transaction; - }; ++ if (copy_from_user(str, buffer, count)) ++ return 
-EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); ++ return count; ++} ++ ++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_max_to_scan = value; ++ ++ return count; ++} ++ ++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_min_to_scan = value; ++ ++ return count; ++} ++ ++static int ext3_mb_order2_req_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ 
++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_order2_req_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_order2_reqs = value; ++ ++ return count; ++} ++ ++int __init init_ext3_proc(void) ++{ ++ struct proc_dir_entry *proc_ext3_mb_stats; ++ struct proc_dir_entry *proc_ext3_mb_max_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_min_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_order2_req; ++ ++ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); ++ if (proc_root_ext3 == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); ++ return -EIO; ++ } ++ ++ /* Initialize EXT3_MB_STATS_NAME */ ++ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_stats == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_STATS_NAME); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_stats->data = NULL; ++ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; ++ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; ++ ++ /* Initialize EXT3_MAX_TO_SCAN_NAME */ ++ proc_ext3_mb_max_to_scan = create_proc_entry( ++ EXT3_MB_MAX_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_max_to_scan == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_MAX_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_STATS_NAME, 
proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_max_to_scan->data = NULL; ++ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; ++ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; ++ ++ /* Initialize EXT3_MIN_TO_SCAN_NAME */ ++ proc_ext3_mb_min_to_scan = create_proc_entry( ++ EXT3_MB_MIN_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_min_to_scan == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_MIN_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_min_to_scan->data = NULL; ++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; ++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; ++ ++ /* Initialize EXT3_ORDER2_REQ */ ++ proc_ext3_mb_order2_req = create_proc_entry( ++ EXT3_MB_ORDER2_REQ, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_order2_req == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_ORDER2_REQ); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_order2_req->data = NULL; ++ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; ++ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; ++ ++ return 0; ++} ++ ++void exit_ext3_proc(void) ++{ ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++} 
+Index: linux-2.6.5-7.282-full/fs/ext3/Makefile +=================================================================== +--- linux-2.6.5-7.282-full.orig/fs/ext3/Makefile 2006-10-24 22:18:28.000000000 +0400 ++++ linux-2.6.5-7.282-full/fs/ext3/Makefile 2006-10-24 22:18:28.000000000 +0400 +@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o - #endif /* _LINUX_EXT3_FS_SB */ + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o \ +- extents.o ++ extents.o mballoc.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch new file mode 100644 index 0000000..d4d0a05 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch @@ -0,0 +1,3105 @@ +Index: linux-2.6.12.6-bull/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.12.6-bull.orig/include/linux/ext3_fs.h 2006-04-29 20:39:09.000000000 +0400 ++++ linux-2.6.12.6-bull/include/linux/ext3_fs.h 2006-04-29 20:39:10.000000000 +0400 +@@ -57,6 +57,14 @@ struct statfs; + #define ext3_debug(f, a...) 
do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -366,6 +374,7 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -387,6 +396,14 @@ + #define ext3_find_first_zero_bit ext2_find_first_zero_bit + #define ext3_find_next_zero_bit ext2_find_next_zero_bit + ++#ifndef ext2_find_next_le_bit ++#ifdef __LITTLE_ENDIAN ++#define ext2_find_next_le_bit(addr, size, off) find_next_bit((addr), (size), (off)) ++#else ++#error "mballoc needs a patch for big-endian systems - CFS bug 10634" ++#endif /* __LITTLE_ENDIAN */ ++#endif /* !ext2_find_next_le_bit */ ++ + /* + * Maximal mount counts between two filesystem checks + */ +@@ -727,7 +736,7 @@ extern int ext3_bg_has_super(struct supe + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, +- unsigned long); ++ unsigned long, int); + extern void ext3_free_blocks_sb (handle_t *, struct super_block *, + unsigned long, unsigned long, int *); + extern unsigned long ext3_count_free_blocks (struct super_block *); +@@ -848,6 +857,17 @@ extern void ext3_extents_initialize_bloc + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern long ext3_mb_stats; ++extern long ext3_mb_max_to_scan; ++extern int ext3_mb_init(struct super_block *, int); 
++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct super_block *, int); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); ++ + #endif /* __KERNEL__ */ + + /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ +Index: linux-2.6.12.6-bull/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.6.12.6-bull.orig/include/linux/ext3_fs_sb.h 2005-08-29 20:55:27.000000000 +0400 ++++ linux-2.6.12.6-bull/include/linux/ext3_fs_sb.h 2006-04-29 20:39:10.000000000 +0400 +@@ -21,8 +21,14 @@ + #include + #include + #include ++#include + #endif + #include ++#include ++ ++struct ext3_buddy_group_blocks; ++struct ext3_mb_history; ++#define EXT3_BB_MAX_BLOCKS + + /* + * third extended-fs super-block data in memory +@@ -78,6 +84,43 @@ struct ext3_sb_info { + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_group_info ***s_group_info; ++ struct inode *s_buddy_cache; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ unsigned short *s_mb_offsets, *s_mb_maxs; ++ unsigned long s_stripe; ++ ++ /* history to debug policy */ ++ struct ext3_mb_history *s_mb_history; ++ int s_mb_history_cur; ++ int s_mb_history_max; ++ struct proc_dir_entry *s_mb_proc; ++ spinlock_t s_mb_history_lock; ++ ++ /* stats for buddy allocator */ ++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ++ atomic_t s_bal_success; /* we found long enough chunks */ ++ atomic_t s_bal_allocated; 
/* in blocks */ ++ atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_goals; /* goal hits */ ++ atomic_t s_bal_breaks; /* too long searches */ ++ atomic_t s_bal_2orders; /* 2^order hits */ ++ spinlock_t s_bal_lock; ++ unsigned long s_mb_buddies_generated; ++ unsigned long long s_mb_generation_time; + }; ++ ++#define EXT3_GROUP_INFO(sb, group) \ ++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ ++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] + + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-2.6.12.6-bull/fs/ext3/super.c +=================================================================== +--- linux-2.6.12.6-bull.orig/fs/ext3/super.c 2006-04-29 20:39:09.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/super.c 2006-04-29 20:39:10.000000000 +0400 +@@ -387,6 +387,7 @@ static void ext3_put_super (struct super + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -597,6 +598,7 @@ enum { + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_extents, Opt_noextents, Opt_extdebug, ++ Opt_mballoc, Opt_nomballoc, Opt_stripe, + }; + + static match_table_t tokens = { +@@ -650,6 +651,9 @@ static match_table_t tokens = { + {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, ++ {Opt_nomballoc, "nomballoc"}, ++ {Opt_stripe, "stripe=%u"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -965,6 +967,19 @@ clear_qf_name: + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_nomballoc: ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_stripe: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_stripe = option; ++ break; + default: + printk (KERN_ERR 
+ "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1670,6 +1675,7 @@ static int ext3_fill_super (struct super + ext3_count_dirs(sb)); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + lock_kernel(); + return 0; + +@@ -2549,7 +2555,13 @@ static struct file_system_type ext3_fs_t + + static int __init init_ext3_fs(void) + { +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2571,6 +2583,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); + } + + int ext3_prep_san_write(struct inode *inode, long *blocks, +Index: linux-2.6.12.6-bull/fs/ext3/extents.c +=================================================================== +--- linux-2.6.12.6-bull.orig/fs/ext3/extents.c 2006-04-29 20:39:09.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/extents.c 2006-04-29 20:39:10.000000000 +0400 +@@ -777,7 +777,7 @@ cleanup: + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1434,7 +1434,7 @@ int ext3_ext_rm_idx(handle_t *handle, st + path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + +@@ -1919,10 +1919,12 @@ ext3_remove_blocks(struct ext3_extents_t + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode) || 
S_ISLNK(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1934,7 +1936,7 @@ ext3_remove_blocks(struct ext3_extents_t + bh = sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-2.6.12.6-bull/fs/ext3/inode.c +=================================================================== +--- linux-2.6.12.6-bull.orig/fs/ext3/inode.c 2006-04-29 20:39:09.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/inode.c 2006-04-29 20:39:10.000000000 +0400 +@@ -564,7 +564,7 @@ static int ext3_alloc_branch(handle_t *h + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -1850,7 +1850,7 @@ ext3_clear_blocks(handle_t *handle, stru + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2023,7 +2023,7 @@ static void ext3_free_branches(handle_t + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-2.6.12.6-bull/fs/ext3/balloc.c +=================================================================== +--- linux-2.6.12.6-bull.orig/fs/ext3/balloc.c 2005-08-29 20:55:27.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/balloc.c 2006-04-29 20:39:10.000000000 +0400 +@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ + * + * Return buffer_head 
on success or NULL in case of failure. + */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -490,24 +490,6 @@ error_return: + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- unsigned long block, unsigned long count) +-{ +- struct super_block * sb; +- int dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1162,7 +1144,7 @@ int ext3_should_retry_alloc(struct super + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. 
+ */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-2.6.12.6-bull/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.12.6-bull.orig/fs/ext3/xattr.c 2005-08-29 20:55:27.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/xattr.c 2006-04-29 20:39:10.000000000 +0400 +@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); +- ext3_free_blocks(handle, inode, bh->b_blocknr, 1); ++ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { +@@ -804,7 +804,7 @@ inserted: + new_bh = sb_getblk(sb, block); + if (!new_bh) { + getblk_failed: +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + error = -EIO; + goto cleanup; + } +Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c +=================================================================== +--- linux-2.6.12.6-bull.orig/fs/ext3/mballoc.c 2006-04-22 17:31:47.543334750 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/mballoc.c 2006-04-30 01:24:11.000000000 +0400 +@@ -0,0 +1,2729 @@ ++/* ++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ *
++ * You should have received a copy of the GNU General Public Licens
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ */
++
++
++/*
++ * mballoc.c contains the multiblocks allocation routines
++ */
++
++#include <linux/config.h>
++#include <linux/time.h>
++#include <linux/fs.h>
++#include <linux/namei.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/quotaops.h>
++#include <linux/buffer_head.h>
++#include <linux/module.h>
++#include <linux/swap.h>
++#include <linux/proc_fs.h>
++#include <linux/pagemap.h>
++#include <linux/seq_file.h>
++
++/*
++ * TODO:
++ * - bitmap read-ahead (proposed by Oleg Drokin aka green)
++ * - track min/max extents in each group for better group selection
++ * - mb_mark_used() may allocate chunk right after splitting buddy
++ * - special flag to advice allocator to look for requested + N blocks
++ * this may improve interaction between extents and mballoc
++ * - tree of groups sorted by number of free blocks
++ * - percpu reservation code (hotpath)
++ * - error handling
++ */
++
++/*
++ * with AGRESSIVE_CHECK allocator runs consistency checks over
++ * structures. these checks slow things down a lot
++ */
++#define AGGRESSIVE_CHECK__
++
++/*
++ */
++#define MB_DEBUG__
++#ifdef MB_DEBUG
++#define mb_debug(fmt,a...) printk(fmt, ##a)
++#else
++#define mb_debug(fmt,a...)
++#endif
++
++/*
++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory
++ * and you can monitor it in /proc/fs/ext3/<dev>/mb_history
++ */
++#define EXT3_MB_HISTORY
++
++/*
++ * How long mballoc can look for a best extent (in found extents)
++ */
++long ext3_mb_max_to_scan = 500;
++
++/*
++ * How long mballoc must look for a best extent
++ */
++long ext3_mb_min_to_scan = 30;
++
++/*
++ * with 'ext3_mb_stats' allocator will collect stats that will be
++ * shown at umount. The collecting costs though!
++ */ ++ ++long ext3_mb_stats = 1; ++ ++/* ++ * for which requests use 2^N search using buddies ++ */ ++long ext3_mb_order2_reqs = 8; ++ ++#ifdef EXT3_BB_MAX_BLOCKS ++#undef EXT3_BB_MAX_BLOCKS ++#endif ++#define EXT3_BB_MAX_BLOCKS 30 ++ ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++struct ext3_group_info { ++ unsigned long bb_state; ++ unsigned long bb_tid; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned short bb_fragments; ++ unsigned short bb_counters[]; ++}; ++ ++ ++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 ++#define EXT3_GROUP_INFO_LOCKED_BIT 1 ++ ++#define EXT3_MB_GRP_NEED_INIT(grp) \ ++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) ++ ++struct ext3_free_extent { ++ __u16 fe_start; ++ __u16 fe_len; ++ __u16 fe_group; ++}; ++ ++struct ext3_allocation_context { ++ struct super_block *ac_sb; ++ ++ /* search goals */ ++ struct ext3_free_extent ac_g_ex; ++ ++ /* the best found extent */ ++ struct ext3_free_extent ac_b_ex; ++ ++ /* number of iterations done. 
we have to track to limit searching */ ++ unsigned long ac_ex_scanned; ++ __u16 ac_groups_scanned; ++ __u16 ac_found; ++ __u16 ac_tail; ++ __u16 ac_buddy; ++ __u8 ac_status; ++ __u8 ac_flags; /* allocation hints */ ++ __u8 ac_criteria; ++ __u8 ac_repeats; ++ __u8 ac_2order; /* if request is to allocate 2^N blocks and ++ * N > 0, the field stores N, otherwise 0 */ ++ ++ struct page *ac_buddy_page; ++ struct page *ac_bitmap_page; ++}; ++ ++#define AC_STATUS_CONTINUE 1 ++#define AC_STATUS_FOUND 2 ++#define AC_STATUS_BREAK 3 ++ ++struct ext3_mb_history { ++ struct ext3_free_extent goal; /* goal allocation */ ++ struct ext3_free_extent result; /* result allocation */ ++ unsigned pid; ++ unsigned ino; ++ __u16 found; /* how many extents have been found */ ++ __u16 groups; /* how many groups have been scanned */ ++ __u16 tail; /* what tail broke some buddy */ ++ __u16 buddy; /* buddy the tail ^^^ broke */ ++ __u8 cr; /* which phase the result extent was found at */ ++ __u8 merged; ++}; ++ ++struct ext3_buddy { ++ struct page *bd_buddy_page; ++ void *bd_buddy; ++ struct page *bd_bitmap_page; ++ void *bd_bitmap; ++ struct ext3_group_info *bd_info; ++ struct super_block *bd_sb; ++ __u16 bd_blkbits; ++ __u16 bd_group; ++}; ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) ++ ++#ifndef EXT3_MB_HISTORY ++#define ext3_mb_store_history(sb,ino,ac) ++#else ++static void ext3_mb_store_history(struct super_block *, unsigned ino, ++ struct ext3_allocation_context *ac); ++#endif ++ ++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) ++ ++static struct proc_dir_entry *proc_root_ext3; ++ ++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); ++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); ++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); ++int ext3_mb_reserve_blocks(struct super_block *, int); ++void ext3_mb_release_blocks(struct 
super_block *, int); ++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); ++void ext3_mb_free_committed_blocks(struct super_block *); ++ ++#if BITS_PER_LONG == 64 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 7UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~7UL); \ ++} ++#elif BITS_PER_LONG == 32 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 3UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~3UL); \ ++} ++#else ++#error "how many bits you are?!" ++#endif ++ ++static inline int mb_test_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ return ext2_test_bit(bit, addr); ++} ++ ++static inline void mb_set_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit(bit, addr); ++} ++ ++static inline void mb_set_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit_atomic(NULL, bit, addr); ++} ++ ++static inline void mb_clear_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit(bit, addr); ++} ++ ++static inline void mb_clear_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit_atomic(NULL, bit, addr); ++} ++ ++static inline int mb_find_next_zero_bit(void *addr, int max, int start) ++{ ++ int fix; ++#if BITS_PER_LONG == 64 ++ fix = ((unsigned long) addr & 7UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~7UL); ++#elif BITS_PER_LONG == 32 ++ fix = ((unsigned long) addr & 3UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~3UL); ++#else ++#error "how many bits you are?!" 
++#endif ++ max += fix; ++ start += fix; ++ return ext2_find_next_zero_bit(addr, max, start) - fix; ++} ++ ++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) ++{ ++ char *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(max != NULL); ++ ++ if (order > e3b->bd_blkbits + 1) { ++ *max = 0; ++ return NULL; ++ } ++ ++ /* at order 0 we see each particular block */ ++ *max = 1 << (e3b->bd_blkbits + 3); ++ if (order == 0) ++ return EXT3_MB_BITMAP(e3b); ++ ++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; ++ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; ++ ++ return bb; ++} ++ ++#ifdef AGGRESSIVE_CHECK ++ ++static void mb_check_buddy(struct ext3_buddy *e3b) ++{ ++ int order = e3b->bd_blkbits + 1; ++ int max, max2, i, j, k, count; ++ int fragments = 0, fstart; ++ void *buddy, *buddy2; ++ ++ if (!test_opt(e3b->bd_sb, MBALLOC)) ++ return; ++ ++ { ++ static int mb_check_counter = 0; ++ if (mb_check_counter++ % 300 != 0) ++ return; ++ } ++ ++ while (order > 1) { ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ buddy2 = mb_find_buddy(e3b, order - 1, &max2); ++ J_ASSERT(buddy2); ++ J_ASSERT(buddy != buddy2); ++ J_ASSERT(max * 2 == max2); ++ ++ count = 0; ++ for (i = 0; i < max; i++) { ++ ++ if (mb_test_bit(i, buddy)) { ++ /* only single bit in buddy2 may be 1 */ ++ if (!mb_test_bit(i << 1, buddy2)) ++ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); ++ else if (!mb_test_bit((i << 1) + 1, buddy2)) ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ continue; ++ } ++ ++ /* both bits in buddy2 must be 0 */ ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); ++ ++ for (j = 0; j < (1 << order); j++) { ++ k = (i * (1 << order)) + j; ++ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); ++ } ++ count++; ++ } ++ J_ASSERT(e3b->bd_info->bb_counters[order] == count); ++ order--; ++ } ++ ++ fstart = -1; ++ buddy = mb_find_buddy(e3b, 0, &max); ++ for (i = 0; i < max; i++) { ++ if 
(!mb_test_bit(i, buddy)) { ++ J_ASSERT(i >= e3b->bd_info->bb_first_free); ++ if (fstart == -1) { ++ fragments++; ++ fstart = i; ++ } ++ continue; ++ } ++ fstart = -1; ++ /* check used bits only */ ++ for (j = 0; j < e3b->bd_blkbits + 1; j++) { ++ buddy2 = mb_find_buddy(e3b, j, &max2); ++ k = i >> j; ++ J_ASSERT(k < max2); ++ J_ASSERT(mb_test_bit(k, buddy2)); ++ } ++ } ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); ++ J_ASSERT(e3b->bd_info->bb_fragments == fragments); ++} ++ ++#else ++#define mb_check_buddy(e3b) ++#endif ++ ++/* find most significant bit */ ++static int inline fmsb(unsigned short word) ++{ ++ int order; ++ ++ if (word > 255) { ++ order = 7; ++ word >>= 8; ++ } else { ++ order = -1; ++ } ++ ++ do { ++ order++; ++ word >>= 1; ++ } while (word != 0); ++ ++ return order; ++} ++ ++static void inline ++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, ++ int len, struct ext3_group_info *grp) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned short min, max, chunk, border; ++ ++ mb_debug("mark %u/%u free\n", first, len); ++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ border = 2 << sb->s_blocksize_bits; ++ ++ while (len > 0) { ++ /* find how many blocks can be covered since this position */ ++ max = ffs(first | border) - 1; ++ ++ /* find how many blocks of power 2 we need to mark */ ++ min = fmsb(len); ++ ++ mb_debug(" %u/%u -> max %u, min %u\n", ++ first & ((2 << sb->s_blocksize_bits) - 1), ++ len, max, min); ++ ++ if (max < min) ++ min = max; ++ chunk = 1 << min; ++ ++ /* mark multiblock chunks only */ ++ grp->bb_counters[min]++; ++ if (min > 0) { ++ mb_debug(" set %u at %u \n", first >> min, ++ sbi->s_mb_offsets[min]); ++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); ++ } ++ ++ len -= chunk; ++ first += chunk; ++ } ++} ++ ++static void ++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ++ int group) ++{ ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); ++ 
unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); ++ unsigned short i = 0, first, len; ++ unsigned free = 0, fragments = 0; ++ unsigned long long period = get_cycles(); ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ grp->bb_first_free = i; ++ while (i < max) { ++ fragments++; ++ first = i; ++ i = ext2_find_next_le_bit(bitmap, max, i); ++ len = i - first; ++ free += len; ++ if (len > 1) ++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); ++ else ++ grp->bb_counters[0]++; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ grp->bb_fragments = fragments; ++ ++ /* bb_state shouldn't being modified because all ++ * others waits for init completion on page lock */ ++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); ++ if (free != grp->bb_free) { ++ printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n", ++ group, free, grp->bb_free); ++ grp->bb_free = free; ++ } ++ ++ period = get_cycles() - period; ++ spin_lock(&EXT3_SB(sb)->s_bal_lock); ++ EXT3_SB(sb)->s_mb_buddies_generated++; ++ EXT3_SB(sb)->s_mb_generation_time += period; ++ spin_unlock(&EXT3_SB(sb)->s_bal_lock); ++} ++ ++static int ext3_mb_init_cache(struct page *page) ++{ ++ int blocksize, blocks_per_page, groups_per_page; ++ int err = 0, i, first_group, first_block; ++ struct super_block *sb; ++ struct buffer_head *bhs; ++ struct buffer_head **bh; ++ struct inode *inode; ++ char *data, *bitmap; ++ ++ mb_debug("init page %lu\n", page->index); ++ ++ inode = page->mapping->host; ++ sb = inode->i_sb; ++ blocksize = 1 << inode->i_blkbits; ++ blocks_per_page = PAGE_CACHE_SIZE / blocksize; ++ ++ groups_per_page = blocks_per_page >> 1; ++ if (groups_per_page == 0) ++ groups_per_page = 1; ++ ++ /* allocate buffer_heads to read bitmaps */ ++ if (groups_per_page > 1) { ++ err = -ENOMEM; ++ i = sizeof(struct buffer_head *) * groups_per_page; ++ bh = kmalloc(i, GFP_NOFS); ++ if (bh == NULL) ++ goto out; ++ memset(bh, 0, i); ++ } else ++ bh = &bhs; ++ ++ first_group = page->index * 
blocks_per_page / 2; ++ ++ /* read all groups the page covers into the cache */ ++ for (i = 0; i < groups_per_page; i++) { ++ struct ext3_group_desc * desc; ++ ++ if (first_group + i >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ err = -EIO; ++ desc = ext3_get_group_desc(sb, first_group + i, NULL); ++ if (desc == NULL) ++ goto out; ++ ++ err = -ENOMEM; ++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); ++ if (bh[i] == NULL) ++ goto out; ++ ++ if (buffer_uptodate(bh[i])) ++ continue; ++ ++ lock_buffer(bh[i]); ++ if (buffer_uptodate(bh[i])) { ++ unlock_buffer(bh[i]); ++ continue; ++ } ++ ++ get_bh(bh[i]); ++ bh[i]->b_end_io = end_buffer_read_sync; ++ submit_bh(READ, bh[i]); ++ mb_debug("read bitmap for group %u\n", first_group + i); ++ } ++ ++ /* wait for I/O completion */ ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ wait_on_buffer(bh[i]); ++ ++ err = -EIO; ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ if (!buffer_uptodate(bh[i])) ++ goto out; ++ ++ first_block = page->index * blocks_per_page; ++ for (i = 0; i < blocks_per_page; i++) { ++ int group; ++ ++ group = (first_block + i) >> 1; ++ if (group >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ data = page_address(page) + (i * blocksize); ++ bitmap = bh[group - first_group]->b_data; ++ ++ if ((first_block + i) & 1) { ++ /* this is block of buddy */ ++ mb_debug("put buddy for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memset(data, 0xff, blocksize); ++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; ++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, ++ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); ++ ext3_mb_generate_buddy(sb, data, bitmap, group); ++ } else { ++ /* this is block of bitmap */ ++ mb_debug("put bitmap for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memcpy(data, bitmap, blocksize); ++ } ++ } ++ SetPageUptodate(page); ++ ++out: ++ if (bh) { ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ brelse(bh[i]); ++ if (bh != 
&bhs) ++ kfree(bh); ++ } ++ return err; ++} ++ ++static int ext3_mb_load_buddy(struct super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *inode = sbi->s_buddy_cache; ++ int blocks_per_page, block, pnum, poff; ++ struct page *page; ++ ++ mb_debug("load group %u\n", group); ++ ++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_info = EXT3_GROUP_INFO(sb, group); ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ e3b->bd_buddy_page = NULL; ++ e3b->bd_bitmap_page = NULL; ++ ++ block = group * 2; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ /* we could use find_or_create_page(), but it locks page ++ * what we'd like to avoid in fast path ... */ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ BUG_ON(page->mapping != inode->i_mapping); ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_bitmap_page = page; ++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ block++; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ BUG_ON(page->mapping != inode->i_mapping); ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_buddy_page = page; ++ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ J_ASSERT(e3b->bd_bitmap_page 
!= NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ return 0; ++ ++err: ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++ e3b->bd_buddy = NULL; ++ e3b->bd_bitmap = NULL; ++ return -EIO; ++} ++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++} ++ ++ ++static inline void ++ext3_lock_group(struct super_block *sb, int group) ++{ ++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_GROUP_INFO(sb, group)->bb_state); ++} ++ ++static inline void ++ext3_unlock_group(struct super_block *sb, int group) ++{ ++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_GROUP_INFO(sb, group)->bb_state); ++} ++ ++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) ++{ ++ int order = 1; ++ void *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ while (order <= e3b->bd_blkbits + 1) { ++ block = block >> 1; ++ if (!mb_test_bit(block, bb)) { ++ /* this block is part of buddy of order 'order' */ ++ return order; ++ } ++ bb += 1 << (e3b->bd_blkbits - order); ++ order++; ++ } ++ return 0; ++} ++ ++static inline void mb_clear_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0; ++ cur += 32; ++ continue; ++ } ++ mb_clear_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static inline void mb_set_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0xffffffff; ++ cur += 32; ++ 
continue; ++ } ++ mb_set_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) ++{ ++ int block = 0, max = 0, order; ++ void *buddy, *buddy2; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_info->bb_free += count; ++ if (first < e3b->bd_info->bb_first_free) ++ e3b->bd_info->bb_first_free = first; ++ ++ /* let's maintain fragments counter */ ++ if (first != 0) ++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); ++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); ++ if (block && max) ++ e3b->bd_info->bb_fragments--; ++ else if (!block && !max) ++ e3b->bd_info->bb_fragments++; ++ ++ /* let's maintain buddy itself */ ++ while (count-- > 0) { ++ block = first++; ++ order = 0; ++ ++ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); ++ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); ++ e3b->bd_info->bb_counters[order]++; ++ ++ /* start of the buddy */ ++ buddy = mb_find_buddy(e3b, order, &max); ++ ++ do { ++ block &= ~1UL; ++ if (mb_test_bit(block, buddy) || ++ mb_test_bit(block + 1, buddy)) ++ break; ++ ++ /* both the buddies are free, try to coalesce them */ ++ buddy2 = mb_find_buddy(e3b, order + 1, &max); ++ ++ if (!buddy2) ++ break; ++ ++ if (order > 0) { ++ /* for special purposes, we don't set ++ * free bits in bitmap */ ++ mb_set_bit(block, buddy); ++ mb_set_bit(block + 1, buddy); ++ } ++ e3b->bd_info->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; ++ ++ block = block >> 1; ++ order++; ++ e3b->bd_info->bb_counters[order]++; ++ ++ mb_clear_bit(block, buddy2); ++ buddy = buddy2; ++ } while (1); ++ } ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, ++ int needed, struct ext3_free_extent *ex) ++{ ++ int next = block, max, ord; ++ void *buddy; ++ ++ J_ASSERT(ex != NULL); ++ ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ J_ASSERT(block < max); ++ 
if (mb_test_bit(block, buddy)) { ++ ex->fe_len = 0; ++ ex->fe_start = 0; ++ ex->fe_group = 0; ++ return 0; ++ } ++ ++ if (likely(order == 0)) { ++ /* find actual order */ ++ order = mb_find_order_for_block(e3b, block); ++ block = block >> order; ++ } ++ ++ ex->fe_len = 1 << order; ++ ex->fe_start = block << order; ++ ex->fe_group = e3b->bd_group; ++ ++ /* calc difference from given start */ ++ next = next - ex->fe_start; ++ ex->fe_len -= next; ++ ex->fe_start += next; ++ ++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { ++ ++ if (block + 1 >= max) ++ break; ++ ++ next = (block + 1) * (1 << order); ++ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) ++ break; ++ ++ ord = mb_find_order_for_block(e3b, next); ++ ++ order = ord; ++ block = next >> order; ++ ex->fe_len += 1 << order; ++ } ++ ++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); ++ return ex->fe_len; ++} ++ ++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) ++{ ++ int ord, mlen = 0, max = 0, cur; ++ int start = ex->fe_start; ++ int len = ex->fe_len; ++ unsigned ret = 0; ++ int len0 = len; ++ void *buddy; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_info->bb_free -= len; ++ if (e3b->bd_info->bb_first_free == start) ++ e3b->bd_info->bb_first_free += len; ++ ++ /* let's maintain fragments counter */ ++ if (start != 0) ++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); ++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); ++ if (mlen && max) ++ e3b->bd_info->bb_fragments++; ++ else if (!mlen && !max) ++ e3b->bd_info->bb_fragments--; ++ ++ /* let's maintain buddy itself */ ++ while (len) { ++ ord = mb_find_order_for_block(e3b, start); ++ ++ if (((start >> ord) << ord) == start && len >= (1 << ord)) { ++ /* the whole chunk may be allocated at once! 
*/ ++ mlen = 1 << ord; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ J_ASSERT((start >> ord) < max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_info->bb_counters[ord]--; ++ start += mlen; ++ len -= mlen; ++ J_ASSERT(len >= 0); ++ continue; ++ } ++ ++ /* store for history */ ++ if (ret == 0) ++ ret = len | (ord << 16); ++ ++ /* we have to split large buddy */ ++ J_ASSERT(ord > 0); ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_info->bb_counters[ord]--; ++ ++ ord--; ++ cur = (start >> ord) & ~1U; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_clear_bit(cur, buddy); ++ mb_clear_bit(cur + 1, buddy); ++ e3b->bd_info->bb_counters[ord]++; ++ e3b->bd_info->bb_counters[ord]++; ++ } ++ ++ /* now drop all the bits in bitmap */ ++ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); ++ ++ mb_check_buddy(e3b); ++ ++ return ret; ++} ++ ++/* ++ * Must be called under group lock! ++ */ ++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ unsigned long ret; ++ ++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ++ ret = mb_mark_used(e3b, &ac->ac_b_ex); ++ ++ ac->ac_status = AC_STATUS_FOUND; ++ ac->ac_tail = ret & 0xffff; ++ ac->ac_buddy = ret >> 16; ++ ++ /* hold in-core structures until allocated ++ * blocks are marked non-free in on-disk bitmap */ ++ ac->ac_buddy_page = e3b->bd_buddy_page; ++ page_cache_get(e3b->bd_buddy_page); ++ ac->ac_bitmap_page = e3b->bd_bitmap_page; ++ page_cache_get(e3b->bd_bitmap_page); ++} ++ ++/* ++ * The routine checks whether found extent is good enough. If it is, ++ * then the extent gets marked used and flag is set to the context ++ * to stop scanning. Otherwise, the extent is compared with the ++ * previous found extent and if new one is better, then it's stored ++ * in the context. Later, the best found extent will be used, if ++ * mballoc can't find good enough extent. ++ * ++ * FIXME: real allocation policy is to be designed yet! 
++ */ ++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, ++ struct ext3_free_extent *ex, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent *bex = &ac->ac_b_ex; ++ struct ext3_free_extent *gex = &ac->ac_g_ex; ++ ++ J_ASSERT(ex->fe_len > 0); ++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ ++ ac->ac_found++; ++ ++ /* ++ * The special case - take what you catch first ++ */ ++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Let's check whether the chunk is good enough ++ */ ++ if (ex->fe_len == gex->fe_len) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If this is first found extent, just store it in the context ++ */ ++ if (bex->fe_len == 0) { ++ *bex = *ex; ++ return; ++ } ++ ++ /* ++ * If new found extent is better, store it in the context ++ */ ++ if (bex->fe_len < gex->fe_len) { ++ /* if the request isn't satisfied, any found extent ++ * larger than previous best one is better */ ++ if (ex->fe_len > bex->fe_len) ++ *bex = *ex; ++ } else if (ex->fe_len > gex->fe_len) { ++ /* if the request is satisfied, then we try to find ++ * an extent that still satisfy the request, but is ++ * smaller than previous one */ ++ *bex = *ex; ++ } ++ ++ /* ++ * Let's scan at least few extents and don't pick up a first one ++ */ ++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++ ++ /* ++ * We don't want to scan for a whole year ++ */ ++ if (ac->ac_found > ext3_mb_max_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++} ++ ++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent ex = ac->ac_b_ex; ++ int group = ex.fe_group, max, err; ++ ++ J_ASSERT(ex.fe_len > 0); ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if 
(err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); ++ ++ if (max > 0) { ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_super_block *es = sbi->s_es; ++ struct ext3_free_extent ex; ++ ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ++ unsigned long start; ++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + ++ ex.fe_start + le32_to_cpu(es->s_first_data_block)); ++ if (start % sbi->s_stripe == 0) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ } else if (max >= ac->ac_g_ex.fe_len) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++/* ++ * The routine scans buddy structures (not bitmap!) 
from given order ++ * to max order and tries to find big enough chunk to satisfy the req ++ */ ++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_group_info *grp = e3b->bd_info; ++ void *buddy; ++ int i, k, max; ++ ++ J_ASSERT(ac->ac_2order > 0); ++ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { ++ if (grp->bb_counters[i] == 0) ++ continue; ++ ++ buddy = mb_find_buddy(e3b, i, &max); ++ if (buddy == NULL) { ++ printk(KERN_ALERT "looking for wrong order?\n"); ++ break; ++ } ++ ++ k = mb_find_next_zero_bit(buddy, max, 0); ++ J_ASSERT(k < max); ++ ++ ac->ac_found++; ++ ++ ac->ac_b_ex.fe_len = 1 << i; ++ ac->ac_b_ex.fe_start = k << i; ++ ac->ac_b_ex.fe_group = e3b->bd_group; ++ ++ ext3_mb_use_best_found(ac, e3b); ++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); ++ ++ if (unlikely(ext3_mb_stats)) ++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); ++ ++ break; ++ } ++} ++ ++/* ++ * The routine scans the group and measures all found extents. ++ * In order to optimize scanning, caller must pass number of ++ * free blocks in the group, so the routine can know upper limit. 
++ */ ++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ int i, free; ++ ++ free = e3b->bd_info->bb_free; ++ J_ASSERT(free > 0); ++ ++ i = e3b->bd_info->bb_first_free; ++ ++ while (free && ac->ac_status == AC_STATUS_CONTINUE) { ++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ if (i >= sb->s_blocksize * 8) { ++ J_ASSERT(free == 0); ++ break; ++ } ++ ++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(free >= ex.fe_len); ++ ++ ext3_mb_measure_extent(ac, &ex, e3b); ++ ++ i += ex.fe_len; ++ free -= ex.fe_len; ++ } ++} ++ ++/* ++ * This is a special case for storages like raid5 ++ * we try to find stripe-aligned chunks for stripe-size requests ++ */ ++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ unsigned long i, max; ++ ++ J_ASSERT(sbi->s_stripe != 0); ++ ++ /* find first stripe-aligned block */ ++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + le32_to_cpu(sbi->s_es->s_first_data_block); ++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; ++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) ++ % EXT3_BLOCKS_PER_GROUP(sb); ++ ++ while (i < sb->s_blocksize * 8) { ++ if (!mb_test_bit(i, bitmap)) { ++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); ++ if (max >= sbi->s_stripe) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ break; ++ } ++ } ++ i += sbi->s_stripe; ++ } ++} ++ ++static int ext3_mb_good_group(struct ext3_allocation_context *ac, ++ int group, int cr) ++{ ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); ++ unsigned free, fragments, i, bits; ++ ++ 
J_ASSERT(cr >= 0 && cr < 4); ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); ++ ++ free = grp->bb_free; ++ fragments = grp->bb_fragments; ++ if (free == 0) ++ return 0; ++ if (fragments == 0) ++ return 0; ++ ++ switch (cr) { ++ case 0: ++ J_ASSERT(ac->ac_2order != 0); ++ bits = ac->ac_sb->s_blocksize_bits + 1; ++ for (i = ac->ac_2order; i <= bits; i++) ++ if (grp->bb_counters[i] > 0) ++ return 1; ++ break; ++ case 1: ++ if ((free / fragments) >= ac->ac_g_ex.fe_len) ++ return 1; ++ break; ++ case 2: ++ if (free >= ac->ac_g_ex.fe_len) ++ return 1; ++ break; ++ case 3: ++ return 1; ++ default: ++ BUG(); ++ } ++ ++ return 0; ++} ++ ++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *len, int flags, int *errp) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_allocation_context ac; ++ int i, group, block, cr, err = 0; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ struct buffer_head *gdp_bh; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ ++ J_ASSERT(len != NULL); ++ J_ASSERT(*len > 0); ++ ++ sb = inode->i_sb; ++ if (!sb) { ++ printk("ext3_mb_new_nblocks: nonexistent device"); ++ return 0; ++ } ++ ++ if (!test_opt(sb, MBALLOC)) { ++ static int ext3_mballoc_warning = 0; ++ if (ext3_mballoc_warning == 0) { ++ printk(KERN_ERR "EXT3-fs: multiblock request with " ++ "mballoc disabled!\n"); ++ ext3_mballoc_warning++; ++ } ++ *len = 1; ++ err = ext3_new_block_old(handle, inode, goal, errp); ++ return err; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ ++ /* ++ * We can't allocate > group size ++ */ ++ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) ++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* someone asks for non-reserved blocks */ ++ BUG_ON(*len > 1); ++ err = ext3_mb_reserve_blocks(sb, 1); ++ if (err) { ++ *errp = err; ++ return 0; ++ } ++ } ++ ++ ac.ac_buddy_page = NULL; 
++ ac.ac_bitmap_page = NULL; ++ ++ /* ++ * Check quota for allocation of this blocks. ++ */ ++ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) ++ *len -= 1; ++ if (*len == 0) { ++ *errp = -EDQUOT; ++ block = 0; ++ goto out; ++ } ++ ++ /* start searching from the goal */ ++ if (goal < le32_to_cpu(es->s_first_data_block) || ++ goal >= le32_to_cpu(es->s_blocks_count)) ++ goal = le32_to_cpu(es->s_first_data_block); ++ group = (goal - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ block = ((goal - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ /* set up allocation goals */ ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_groups_scanned = 0; ++ ac.ac_ex_scanned = 0; ++ ac.ac_found = 0; ++ ac.ac_sb = inode->i_sb; ++ ac.ac_g_ex.fe_group = group; ++ ac.ac_g_ex.fe_start = block; ++ ac.ac_g_ex.fe_len = *len; ++ ac.ac_flags = flags; ++ ac.ac_2order = 0; ++ ac.ac_criteria = 0; ++ ++ if (*len == 1 && sbi->s_stripe) { ++ /* looks like a metadata, let's use a dirty hack for raid5 ++ * move all metadata in first groups in hope to hit cached ++ * sectors and thus avoid read-modify cycles in raid5 */ ++ ac.ac_g_ex.fe_group = group = 0; ++ } ++ ++ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ ++ i = ffs(*len); ++ if (i >= ext3_mb_order2_reqs) { ++ i--; ++ if ((*len & (~(1 << i))) == 0) ++ ac.ac_2order = i; ++ } ++ ++ /* first, try the goal */ ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; ++ ++ /* Let's just scan groups to find more-less suitable blocks */ ++ cr = ac.ac_2order ? 
0 : 1; ++repeat: ++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ ac.ac_criteria = cr; ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { ++ if (group == EXT3_SB(sb)->s_groups_count) ++ group = 0; ++ ++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { ++ /* we need full data about the group ++ * to make a good selection */ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ext3_mb_release_desc(&e3b); ++ } ++ ++ /* check is group good for our criteries */ ++ if (!ext3_mb_good_group(&ac, group, cr)) ++ continue; ++ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ++ ext3_lock_group(sb, group); ++ if (!ext3_mb_good_group(&ac, group, cr)) { ++ /* someone did allocation from this group */ ++ ext3_unlock_group(sb, group); ++ ext3_mb_release_desc(&e3b); ++ continue; ++ } ++ ++ ac.ac_groups_scanned++; ++ if (cr == 0) ++ ext3_mb_simple_scan_group(&ac, &e3b); ++ else if (cr == 1 && *len == sbi->s_stripe) ++ ext3_mb_scan_aligned(&ac, &e3b); ++ else ++ ext3_mb_complex_scan_group(&ac, &e3b); ++ ++ ext3_unlock_group(sb, group); ++ ++ ext3_mb_release_desc(&e3b); ++ ++ if (ac.ac_status != AC_STATUS_CONTINUE) ++ break; ++ } ++ } ++ ++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ /* ++ * We've been searching too long. Let's try to allocate ++ * the best chunk we've found so far ++ */ ++ ++ /*if (ac.ac_found > ext3_mb_max_to_scan) ++ printk(KERN_DEBUG "EXT3-fs: too long searching at " ++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, ++ ac.ac_g_ex.fe_len);*/ ++ ext3_mb_try_best_found(&ac, &e3b); ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * Someone more lucky has already allocated it. 
++ * The only thing we can do is just take first ++ * found block(s) ++ printk(KERN_DEBUG "EXT3-fs: someone won our chunk\n"); ++ */ ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_flags |= EXT3_MB_HINT_FIRST; ++ cr = 3; ++ goto repeat; ++ } ++ } ++ ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * We aren't lucky definitely ++ */ ++ DQUOT_FREE_BLOCK(inode, *len); ++ *errp = -ENOSPC; ++ block = 0; ++#if 1 ++ printk(KERN_ERR "EXT3-fs: can't allocate: status %d flags %d\n", ++ ac.ac_status, ac.ac_flags); ++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d cr %d\n", ++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, ++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); ++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", ++ sbi->s_blocks_reserved, ac.ac_found); ++ printk("EXT3-fs: groups: "); ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) ++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); ++ printk("\n"); ++#endif ++ goto out; ++ } ++ ++found: ++ J_ASSERT(ac.ac_b_ex.fe_len > 0); ++ ++ /* good news - free block(s) have been found. 
now it's time ++ * to mark block(s) in good old journaled bitmap */ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ /* we made a desicion, now mark found blocks in good old ++ * bitmap to be journaled */ ++ ++ ext3_debug("using block group %d(%d)\n", ++ ac.ac_b_group.group, gdp->bg_free_blocks_count); ++ ++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); ++ if (!bitmap_bh) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) { ++ *errp = err; ++ goto out_err; ++ } ++ ++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); ++ if (!gdp) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ if (block == le32_to_cpu(gdp->bg_block_bitmap) || ++ block == le32_to_cpu(gdp->bg_inode_bitmap) || ++ in_range(block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error(sb, "ext3_new_block", ++ "Allocating block in system zone - " ++ "block = %u", block); ++#ifdef AGGRESSIVE_CHECK ++ for (i = 0; i < ac.ac_b_ex.fe_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); ++#endif ++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); ++ ++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) ++ - ac.ac_b_ex.fe_len); ++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); ++ ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ if (err) ++ goto out_err; ++ err = ext3_journal_dirty_metadata(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ sb->s_dirt = 1; ++ *errp = 0; ++ 
brelse(bitmap_bh); ++ ++ /* drop non-allocated, but dquote'd blocks */ ++ J_ASSERT(*len >= ac.ac_b_ex.fe_len); ++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); ++ ++ *len = ac.ac_b_ex.fe_len; ++ J_ASSERT(*len > 0); ++ J_ASSERT(block != 0); ++ goto out; ++ ++out_err: ++ /* if we've already allocated something, roll it back */ ++ if (ac.ac_status == AC_STATUS_FOUND) { ++ /* FIXME: free blocks here */ ++ } ++ ++ DQUOT_FREE_BLOCK(inode, *len); ++ brelse(bitmap_bh); ++ *errp = err; ++ block = 0; ++out: ++ if (ac.ac_buddy_page) ++ page_cache_release(ac.ac_buddy_page); ++ if (ac.ac_bitmap_page) ++ page_cache_release(ac.ac_bitmap_page); ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* block wasn't reserved before and we reserved it ++ * at the beginning of allocation. it doesn't matter ++ * whether we allocated anything or we failed: time ++ * to release reservation. NOTE: because I expect ++ * any multiblock request from delayed allocation ++ * path only, here is single block always */ ++ ext3_mb_release_blocks(sb, 1); ++ } ++ ++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { ++ atomic_inc(&sbi->s_bal_reqs); ++ atomic_add(*len, &sbi->s_bal_allocated); ++ if (*len >= ac.ac_g_ex.fe_len) ++ atomic_inc(&sbi->s_bal_success); ++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); ++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && ++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) ++ atomic_inc(&sbi->s_bal_goals); ++ if (ac.ac_found > ext3_mb_max_to_scan) ++ atomic_inc(&sbi->s_bal_breaks); ++ } ++ ++ ext3_mb_store_history(sb, inode->i_ino, &ac); ++ ++ return block; ++} ++EXPORT_SYMBOL(ext3_mb_new_blocks); ++ ++#ifdef EXT3_MB_HISTORY ++struct ext3_mb_proc_session { ++ struct ext3_mb_history *history; ++ struct super_block *sb; ++ int start; ++ int max; ++}; ++ ++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, ++ struct ext3_mb_history *hs, ++ int first) ++{ ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (!first && hs == s->history 
+ s->start) ++ return NULL; ++ while (hs->goal.fe_len == 0) { ++ hs++; ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (hs == s->history + s->start) ++ return NULL; ++ } ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs; ++ int l = *pos; ++ ++ if (l == 0) ++ return SEQ_START_TOKEN; ++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ if (!hs) ++ return NULL; ++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs = v; ++ ++ ++*pos; ++ if (v == SEQ_START_TOKEN) ++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ else ++ return ext3_mb_history_skip_empty(s, ++hs, 0); ++} ++ ++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) ++{ ++ struct ext3_mb_history *hs = v; ++ char buf[20], buf2[20]; ++ ++ if (v == SEQ_START_TOKEN) { ++ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "pid", "inode", "goal", "result", "found", "grps", "cr", ++ "merge", "tail", "broken"); ++ return 0; ++ } ++ ++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, ++ hs->goal.fe_start, hs->goal.fe_len); ++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, ++ hs->result.fe_start, hs->result.fe_len); ++ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", ++ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, ++ hs->cr, hs->merged ? "M" : "", hs->tail, ++ hs->buddy ? 
1 << hs->buddy : 0); ++ return 0; ++} ++ ++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_history_ops = { ++ .start = ext3_mb_seq_history_start, ++ .next = ext3_mb_seq_history_next, ++ .stop = ext3_mb_seq_history_stop, ++ .show = ext3_mb_seq_history_show, ++}; ++ ++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_proc_session *s; ++ int rc, size; ++ ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (s == NULL) ++ return -EIO; ++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; ++ s->history = kmalloc(size, GFP_KERNEL); ++ if (s == NULL) { ++ kfree(s); ++ return -EIO; ++ } ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(s->history, sbi->s_mb_history, size); ++ s->max = sbi->s_mb_history_max; ++ s->start = sbi->s_mb_history_cur % s->max; ++ spin_unlock(&sbi->s_mb_history_lock); ++ ++ rc = seq_open(file, &ext3_mb_seq_history_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = s; ++ } else { ++ kfree(s->history); ++ kfree(s); ++ } ++ return rc; ++ ++} ++ ++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) ++{ ++ struct seq_file *seq = (struct seq_file *)file->private_data; ++ struct ext3_mb_proc_session *s = seq->private; ++ kfree(s->history); ++ kfree(s); ++ return seq_release(inode, file); ++} ++ ++static struct file_operations ext3_mb_seq_history_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, ++}; ++ ++static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ long group; ++ ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ ++ group = 
*pos + 1; ++ return (void *) group; ++} ++ ++static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ long group; ++ ++ ++*pos; ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ group = *pos + 1; ++ return (void *) group;; ++} ++ ++static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) ++{ ++ struct super_block *sb = seq->private; ++ long group = (long) v, i; ++ struct sg { ++ struct ext3_group_info info; ++ unsigned short counters[16]; ++ } sg; ++ ++ group--; ++ if (group == 0) ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", ++ "group", "free", "frags", "first", "2^0", "2^1", "2^2", ++ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", ++ "2^11", "2^12", "2^13"); ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + ++ sizeof(struct ext3_group_info); ++ ext3_lock_group(sb, group); ++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); ++ ext3_unlock_group(sb, group); ++ ++ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) ++ return 0; ++ ++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, ++ sg.info.bb_fragments, sg.info.bb_first_free); ++ for (i = 0; i <= 13; i++) ++ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
++ sg.info.bb_counters[i] : 0); ++ seq_printf(seq, " ]\n"); ++ ++ return 0; ++} ++ ++static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_groups_ops = { ++ .start = ext3_mb_seq_groups_start, ++ .next = ext3_mb_seq_groups_next, ++ .stop = ext3_mb_seq_groups_stop, ++ .show = ext3_mb_seq_groups_show, ++}; ++ ++static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ int rc; ++ ++ rc = seq_open(file, &ext3_mb_seq_groups_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = sb; ++ } ++ return rc; ++ ++} ++ ++static struct file_operations ext3_mb_seq_groups_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static void ext3_mb_history_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_groups", sbi->s_mb_proc); ++ remove_proc_entry("mb_history", sbi->s_mb_proc); ++ remove_proc_entry(name, proc_root_ext3); ++ ++ if (sbi->s_mb_history) ++ kfree(sbi->s_mb_history); ++} ++ ++static void ext3_mb_history_init(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ int i; ++ ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); ++ if (sbi->s_mb_proc != NULL) { ++ struct proc_dir_entry *p; ++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_history_fops; ++ p->data = sb; ++ } ++ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_groups_fops; ++ p->data = sb; ++ } ++ } ++ ++ sbi->s_mb_history_max = 1000; ++ sbi->s_mb_history_cur = 0; ++ 
spin_lock_init(&sbi->s_mb_history_lock); ++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); ++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); ++ memset(sbi->s_mb_history, 0, i); ++ /* if we can't allocate history, then we simple won't use it */ ++} ++ ++static void ++ext3_mb_store_history(struct super_block *sb, unsigned ino, ++ struct ext3_allocation_context *ac) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_history h; ++ ++ if (likely(sbi->s_mb_history == NULL)) ++ return; ++ ++ h.pid = current->pid; ++ h.ino = ino; ++ h.goal = ac->ac_g_ex; ++ h.result = ac->ac_b_ex; ++ h.found = ac->ac_found; ++ h.cr = ac->ac_criteria; ++ h.groups = ac->ac_groups_scanned; ++ h.tail = ac->ac_tail; ++ h.buddy = ac->ac_buddy; ++ h.merged = 0; ++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) ++ h.merged = 1; ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); ++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) ++ sbi->s_mb_history_cur = 0; ++ spin_unlock(&sbi->s_mb_history_lock); ++} ++ ++#else ++#define ext3_mb_history_release(sb) ++#define ext3_mb_history_init(sb) ++#endif ++ ++int ext3_mb_init_backend(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i, j, len, metalen; ++ int num_meta_group_infos = ++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ struct ext3_group_info **meta_group_info; ++ ++ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte ++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. ++ * So a two level scheme suffices for now. 
*/ ++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * ++ num_meta_group_infos, GFP_KERNEL); ++ if (sbi->s_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); ++ return -ENOMEM; ++ } ++ sbi->s_buddy_cache = new_inode(sb); ++ if (sbi->s_buddy_cache == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); ++ goto err_freesgi; ++ } ++ ++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) { ++ if ((i + 1) == num_meta_group_infos) ++ metalen = sizeof(*meta_group_info) * ++ (sbi->s_groups_count - ++ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); ++ meta_group_info = kmalloc(metalen, GFP_KERNEL); ++ if (meta_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " ++ "buddy group\n"); ++ goto err_freemeta; ++ } ++ sbi->s_group_info[i] = meta_group_info; ++ } ++ ++ /* ++ * calculate needed size. if change bb_counters size, ++ * don't forget about ext3_mb_generate_buddy() ++ */ ++ len = sizeof(struct ext3_group_info); ++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ struct ext3_group_desc * desc; ++ ++ meta_group_info = ++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; ++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); ++ ++ meta_group_info[j] = kmalloc(len, GFP_KERNEL); ++ if (meta_group_info[j] == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); ++ i--; ++ goto err_freebuddy; ++ } ++ desc = ext3_get_group_desc(sb, i, NULL); ++ if (desc == NULL) { ++ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); ++ goto err_freebuddy; ++ } ++ memset(meta_group_info[j], 0, len); ++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, ++ &meta_group_info[j]->bb_state); ++ meta_group_info[j]->bb_free = ++ le16_to_cpu(desc->bg_free_blocks_count); ++ } ++ ++ return 0; ++ ++err_freebuddy: ++ while (i >= 0) { ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ i--; ++ } ++ i = num_meta_group_infos; 
++err_freemeta: ++ while (--i >= 0) ++ kfree(sbi->s_group_info[i]); ++ iput(sbi->s_buddy_cache); ++err_freesgi: ++ kfree(sbi->s_group_info); ++ return -ENOMEM; ++} ++ ++int ext3_mb_init(struct super_block *sb, int needs_recovery) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *root = sb->s_root->d_inode; ++ unsigned i, offset, max; ++ struct dentry *dentry; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); ++ ++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_offsets == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ return -ENOMEM; ++ } ++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_maxs == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ ++ /* order 0 is regular bitmap */ ++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; ++ sbi->s_mb_offsets[0] = 0; ++ ++ i = 1; ++ offset = 0; ++ max = sb->s_blocksize << 2; ++ do { ++ sbi->s_mb_offsets[i] = offset; ++ sbi->s_mb_maxs[i] = max; ++ offset += 1 << (sb->s_blocksize_bits - i); ++ max = max >> 1; ++ i++; ++ } while (i <= sb->s_blocksize_bits + 1); ++ ++ /* init file for buddy data */ ++ if ((i = ext3_mb_init_backend(sb))) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return i; ++ } ++ ++ spin_lock_init(&sbi->s_reserve_lock); ++ spin_lock_init(&sbi->s_md_lock); ++ INIT_LIST_HEAD(&sbi->s_active_transaction); ++ INIT_LIST_HEAD(&sbi->s_closed_transaction); ++ INIT_LIST_HEAD(&sbi->s_committed_transaction); ++ spin_lock_init(&sbi->s_bal_lock); ++ ++ /* remove old on-disk buddy file */ ++ down(&root->i_sem); ++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy")); ++ if (dentry->d_inode != NULL) { ++ i = vfs_unlink(root, dentry); ++ if (i != 0) ++ printk("EXT3-fs: can't remove .buddy file: %d\n", i); ++ } ++ dput(dentry); ++ up(&root->i_sem); ++ ++ ext3_mb_history_init(sb); ++ ++ printk("EXT3-fs: mballoc 
enabled\n"); ++ return 0; ++} ++ ++int ext3_mb_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i, num_meta_group_infos; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* release freed, non-committed blocks */ ++ spin_lock(&sbi->s_md_lock); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_committed_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ ext3_mb_free_committed_blocks(sb); ++ ++ if (sbi->s_group_info) { ++ for (i = 0; i < sbi->s_groups_count; i++) ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ num_meta_group_infos = (sbi->s_groups_count + ++ EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) ++ kfree(sbi->s_group_info[i]); ++ kfree(sbi->s_group_info); ++ } ++ if (sbi->s_mb_offsets) ++ kfree(sbi->s_mb_offsets); ++ if (sbi->s_mb_maxs) ++ kfree(sbi->s_mb_maxs); ++ if (sbi->s_buddy_cache) ++ iput(sbi->s_buddy_cache); ++ if (sbi->s_blocks_reserved) ++ printk("ext3-fs: %ld blocks being reserved at umount!\n", ++ sbi->s_blocks_reserved); ++ if (ext3_mb_stats) { ++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", ++ atomic_read(&sbi->s_bal_allocated), ++ atomic_read(&sbi->s_bal_reqs), ++ atomic_read(&sbi->s_bal_success)); ++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " ++ "%u 2^N hits, %u breaks\n", ++ atomic_read(&sbi->s_bal_ex_scanned), ++ atomic_read(&sbi->s_bal_goals), ++ atomic_read(&sbi->s_bal_2orders), ++ atomic_read(&sbi->s_bal_breaks)); ++ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", ++ sbi->s_mb_buddies_generated++, ++ sbi->s_mb_generation_time); ++ } ++ ++ ext3_mb_history_release(sb); ++ ++ return 0; ++} ++ ++void ext3_mb_free_committed_blocks(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int err, i, count = 0, count2 = 0; ++ struct ext3_free_metadata *md; ++ struct ext3_buddy e3b; ++ ++ if 
(list_empty(&sbi->s_committed_transaction)) ++ return; ++ ++ /* there is committed blocks to be freed yet */ ++ do { ++ /* get next array of blocks */ ++ md = NULL; ++ spin_lock(&sbi->s_md_lock); ++ if (!list_empty(&sbi->s_committed_transaction)) { ++ md = list_entry(sbi->s_committed_transaction.next, ++ struct ext3_free_metadata, list); ++ list_del(&md->list); ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ if (md == NULL) ++ break; ++ ++ mb_debug("gonna free %u blocks in group %u (0x%p):", ++ md->num, md->group, md); ++ ++ err = ext3_mb_load_buddy(sb, md->group, &e3b); ++ /* we expect to find existing buddy because it's pinned */ ++ BUG_ON(err != 0); ++ ++ /* there are blocks to put in buddy to make them really free */ ++ count += md->num; ++ count2++; ++ ext3_lock_group(sb, md->group); ++ for (i = 0; i < md->num; i++) { ++ mb_debug(" %u", md->blocks[i]); ++ mb_free_blocks(&e3b, md->blocks[i], 1); ++ } ++ mb_debug("\n"); ++ ext3_unlock_group(sb, md->group); ++ ++ /* balance refcounts from ext3_mb_free_metadata() */ ++ page_cache_release(e3b.bd_buddy_page); ++ page_cache_release(e3b.bd_bitmap_page); ++ ++ kfree(md); ++ ext3_mb_release_desc(&e3b); ++ ++ } while (md); ++ mb_debug("freed %u blocks in %u structures\n", count, count2); ++} ++ ++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ if (sbi->s_last_transaction == handle->h_transaction->t_tid) ++ return; ++ ++ /* new transaction! time to close last one and free blocks for ++ * committed transaction. we know that only transaction can be ++ * active, so previos transaction can be being logged and we ++ * know that transaction before previous is known to be already ++ * logged. this means that now we may free blocks freed in all ++ * transactions before previous one. hope I'm clear enough ... 
*/ ++ ++ spin_lock(&sbi->s_md_lock); ++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { ++ mb_debug("new transaction %lu, old %lu\n", ++ (unsigned long) handle->h_transaction->t_tid, ++ (unsigned long) sbi->s_last_transaction); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_closed_transaction); ++ sbi->s_last_transaction = handle->h_transaction->t_tid; ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ ext3_mb_free_committed_blocks(sb); ++} ++ ++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, ++ int group, int block, int count) ++{ ++ struct ext3_group_info *db = e3b->bd_info; ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_free_metadata *md; ++ int i; ++ ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ ext3_lock_group(sb, group); ++ for (i = 0; i < count; i++) { ++ md = db->bb_md_cur; ++ if (md && db->bb_tid != handle->h_transaction->t_tid) { ++ db->bb_md_cur = NULL; ++ md = NULL; ++ } ++ ++ if (md == NULL) { ++ ext3_unlock_group(sb, group); ++ md = kmalloc(sizeof(*md), GFP_KERNEL); ++ if (md == NULL) ++ return -ENOMEM; ++ md->num = 0; ++ md->group = group; ++ ++ ext3_lock_group(sb, group); ++ if (db->bb_md_cur == NULL) { ++ spin_lock(&sbi->s_md_lock); ++ list_add(&md->list, &sbi->s_active_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ /* protect buddy cache from being freed, ++ * otherwise we'll refresh it from ++ * on-disk bitmap and lose not-yet-available ++ * blocks */ ++ page_cache_get(e3b->bd_buddy_page); ++ page_cache_get(e3b->bd_bitmap_page); ++ db->bb_md_cur = md; ++ db->bb_tid = handle->h_transaction->t_tid; ++ mb_debug("new md 0x%p for group %u\n", ++ md, md->group); ++ } else { ++ kfree(md); ++ md = db->bb_md_cur; ++ } ++ } ++ ++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); ++ md->blocks[md->num] = block + i; ++ md->num++; ++ if (md->num == 
EXT3_BB_MAX_BLOCKS) { ++ /* no more space, put full container on a sb's list */ ++ db->bb_md_cur = NULL; ++ } ++ } ++ ext3_unlock_group(sb, group); ++ return 0; ++} ++ ++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, ++ unsigned long block, unsigned long count, ++ int metadata, int *freed) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ unsigned long bit, overflow; ++ struct buffer_head *gd_bh; ++ unsigned long block_group; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ int err = 0, ret; ++ ++ *freed = 0; ++ sb = inode->i_sb; ++ if (!sb) { ++ printk ("ext3_free_blocks: nonexistent device"); ++ return; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ if (block < le32_to_cpu(es->s_first_data_block) || ++ block + count < block || ++ block + count > le32_to_cpu(es->s_blocks_count)) { ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks not in datazone - " ++ "block = %lu, count = %lu", block, count); ++ goto error_return; ++ } ++ ++ ext3_debug("freeing block %lu\n", block); ++ ++do_more: ++ overflow = 0; ++ block_group = (block - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ bit = (block - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb); ++ /* ++ * Check to see if we are freeing blocks across a group ++ * boundary. 
++ */ ++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { ++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); ++ count -= overflow; ++ } ++ brelse(bitmap_bh); ++ bitmap_bh = read_block_bitmap(sb, block_group); ++ if (!bitmap_bh) ++ goto error_return; ++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); ++ if (!gdp) ++ goto error_return; ++ ++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || ++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || ++ in_range (block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group) || ++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks in system zones - " ++ "Block = %lu, count = %lu", ++ block, count); ++ ++ BUFFER_TRACE(bitmap_bh, "getting write access"); ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) ++ goto error_return; ++ ++ /* ++ * We are about to modify some metadata. Call the journal APIs ++ * to unshare ->b_data if a currently-committing transaction is ++ * using it ++ */ ++ BUFFER_TRACE(gd_bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, gd_bh); ++ if (err) ++ goto error_return; ++ ++ err = ext3_mb_load_buddy(sb, block_group, &e3b); ++ if (err) ++ goto error_return; ++ ++#ifdef AGGRESSIVE_CHECK ++ { ++ int i; ++ for (i = 0; i < count; i++) ++ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); ++ } ++#endif ++ mb_clear_bits(bitmap_bh->b_data, bit, count); ++ ++ /* We dirtied the bitmap block */ ++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ ++ if (metadata) { ++ /* blocks being freed are metadata. 
these blocks shouldn't ++ * be used until this transaction is committed */ ++ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); ++ } else { ++ ext3_lock_group(sb, block_group); ++ mb_free_blocks(&e3b, bit, count); ++ ext3_unlock_group(sb, block_group); ++ } ++ ++ spin_lock(sb_bgl_lock(sbi, block_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); ++ spin_unlock(sb_bgl_lock(sbi, block_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, count); ++ ++ ext3_mb_release_desc(&e3b); ++ ++ *freed = count; ++ ++ /* And the group descriptor block */ ++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ++ ret = ext3_journal_dirty_metadata(handle, gd_bh); ++ if (!err) err = ret; ++ ++ if (overflow && !err) { ++ block += count; ++ count = overflow; ++ goto do_more; ++ } ++ sb->s_dirt = 1; ++error_return: ++ brelse(bitmap_bh); ++ ext3_std_error(sb, err); ++ return; ++} ++ ++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int free, ret = -ENOSPC; ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); ++ if (blocks <= free - sbi->s_blocks_reserved) { ++ sbi->s_blocks_reserved += blocks; ++ ret = 0; ++ } ++ spin_unlock(&sbi->s_reserve_lock); ++ return ret; ++} ++ ++void ext3_mb_release_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ sbi->s_blocks_reserved -= blocks; ++ WARN_ON(sbi->s_blocks_reserved < 0); ++ if (sbi->s_blocks_reserved < 0) ++ sbi->s_blocks_reserved = 0; ++ spin_unlock(&sbi->s_reserve_lock); ++} ++ ++int ext3_new_block(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *errp) ++{ ++ int ret, len; ++ ++ if (!test_opt(inode->i_sb, MBALLOC)) { ++ ret = ext3_new_block_old(handle, inode, goal, errp); ++ goto out; ++ } ++ len = 1; ++ ret = 
ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); ++out: ++ return ret; ++} ++ ++ ++void ext3_free_blocks(handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count, int metadata) ++{ ++ struct super_block *sb; ++ int freed; ++ ++ sb = inode->i_sb; ++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) ++ ext3_free_blocks_sb(handle, sb, block, count, &freed); ++ else ++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); ++ if (freed) ++ DQUOT_FREE_BLOCK(inode, freed); ++ return; ++} ++ ++#define EXT3_ROOT "ext3" ++#define EXT3_MB_STATS_NAME "mb_stats" ++#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" ++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" ++ ++static int ext3_mb_stats_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_stats); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_stats_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_STATS_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); ++ return count; ++} ++ ++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ 
printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_max_to_scan = value; ++ ++ return count; ++} ++ ++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_min_to_scan = value; ++ ++ return count; ++} ++ ++static int ext3_mb_order2_req_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_order2_req_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ 
++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_order2_reqs = value; ++ ++ return count; ++} ++ ++int __init init_ext3_proc(void) ++{ ++ struct proc_dir_entry *proc_ext3_mb_stats; ++ struct proc_dir_entry *proc_ext3_mb_max_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_min_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_order2_req; ++ ++ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); ++ if (proc_root_ext3 == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); ++ return -EIO; ++ } ++ ++ /* Initialize EXT3_MB_STATS_NAME */ ++ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_stats == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_STATS_NAME); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_stats->data = NULL; ++ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; ++ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; ++ ++ /* Initialize EXT3_MAX_TO_SCAN_NAME */ ++ proc_ext3_mb_max_to_scan = create_proc_entry( ++ EXT3_MB_MAX_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_max_to_scan == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_MAX_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_max_to_scan->data = NULL; ++ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; ++ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; ++ ++ /* Initialize EXT3_MIN_TO_SCAN_NAME */ ++ proc_ext3_mb_min_to_scan = create_proc_entry( ++ EXT3_MB_MIN_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_min_to_scan == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_MIN_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, 
proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_min_to_scan->data = NULL; ++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; ++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; ++ ++ /* Initialize EXT3_ORDER2_REQ */ ++ proc_ext3_mb_order2_req = create_proc_entry( ++ EXT3_MB_ORDER2_REQ, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_order2_req == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_ORDER2_REQ); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_order2_req->data = NULL; ++ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; ++ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; ++ ++ return 0; ++} ++ ++void exit_ext3_proc(void) ++{ ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++} +Index: linux-2.6.12.6-bull/fs/ext3/Makefile +=================================================================== +--- linux-2.6.12.6-bull.orig/fs/ext3/Makefile 2006-04-29 20:39:09.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/Makefile 2006-04-29 20:39:10.000000000 +0400 +@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o ++ extents.o mballoc.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff 
--git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch new file mode 100644 index 0000000..20fa78a --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch @@ -0,0 +1,2810 @@ +Index: linux-stage/fs/ext3/mballoc.c +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-stage/fs/ext3/mballoc.c 2006-07-16 02:29:49.000000000 +0800 +@@ -0,0 +1,2434 @@ ++/* ++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++ ++/* ++ * mballoc.c contains the multiblocks allocation routines ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * TODO: ++ * - bitmap read-ahead (proposed by Oleg Drokin aka green) ++ * - track min/max extents in each group for better group selection ++ * - mb_mark_used() may allocate chunk right after splitting buddy ++ * - special flag to advice allocator to look for requested + N blocks ++ * this may improve interaction between extents and mballoc ++ * - tree of groups sorted by number of free blocks ++ * - percpu reservation code (hotpath) ++ * - error handling ++ */ ++ ++/* ++ * with AGRESSIVE_CHECK allocator runs consistency checks over ++ * structures. these checks slow things down a lot ++ */ ++#define AGGRESSIVE_CHECK__ ++ ++/* ++ */ ++#define MB_DEBUG__ ++#ifdef MB_DEBUG ++#define mb_debug(fmt,a...) printk(fmt, ##a) ++#else ++#define mb_debug(fmt,a...) ++#endif ++ ++/* ++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory ++ * and you can monitor it in /proc/fs/ext3//mb_history ++ */ ++#define EXT3_MB_HISTORY ++ ++/* ++ * How long mballoc can look for a best extent (in found extents) ++ */ ++long ext3_mb_max_to_scan = 500; ++ ++/* ++ * How long mballoc must look for a best extent ++ */ ++long ext3_mb_min_to_scan = 30; ++ ++/* ++ * with 'ext3_mb_stats' allocator will collect stats that will be ++ * shown at umount. The collecting costs though! 
++ */ ++ ++long ext3_mb_stats = 1; ++ ++#ifdef EXT3_BB_MAX_BLOCKS ++#undef EXT3_BB_MAX_BLOCKS ++#endif ++#define EXT3_BB_MAX_BLOCKS 30 ++ ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++struct ext3_group_info { ++ unsigned long bb_state; ++ unsigned long bb_tid; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned short bb_fragments; ++ unsigned short bb_counters[]; ++}; ++ ++ ++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 ++#define EXT3_GROUP_INFO_LOCKED_BIT 1 ++ ++#define EXT3_MB_GRP_NEED_INIT(grp) \ ++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) ++ ++struct ext3_free_extent { ++ __u16 fe_start; ++ __u16 fe_len; ++ __u16 fe_group; ++}; ++ ++struct ext3_allocation_context { ++ struct super_block *ac_sb; ++ ++ /* search goals */ ++ struct ext3_free_extent ac_g_ex; ++ ++ /* the best found extent */ ++ struct ext3_free_extent ac_b_ex; ++ ++ /* number of iterations done. 
we have to track to limit searching */ ++ unsigned long ac_ex_scanned; ++ __u16 ac_groups_scanned; ++ __u16 ac_found; ++ __u16 ac_tail; ++ __u16 ac_buddy; ++ __u8 ac_status; ++ __u8 ac_flags; /* allocation hints */ ++ __u8 ac_criteria; ++ __u8 ac_repeats; ++ __u8 ac_2order; /* if request is to allocate 2^N blocks and ++ * N > 0, the field stores N, otherwise 0 */ ++}; ++ ++#define AC_STATUS_CONTINUE 1 ++#define AC_STATUS_FOUND 2 ++#define AC_STATUS_BREAK 3 ++ ++struct ext3_mb_history { ++ struct ext3_free_extent goal; /* goal allocation */ ++ struct ext3_free_extent result; /* result allocation */ ++ __u16 found; /* how many extents have been found */ ++ __u16 groups; /* how many groups have been scanned */ ++ __u16 tail; /* what tail broke some buddy */ ++ __u16 buddy; /* buddy the tail ^^^ broke */ ++ __u8 cr; /* which phase the result extent was found at */ ++ __u8 merged; ++}; ++ ++struct ext3_buddy { ++ struct page *bd_buddy_page; ++ void *bd_buddy; ++ struct page *bd_bitmap_page; ++ void *bd_bitmap; ++ struct ext3_group_info *bd_info; ++ struct super_block *bd_sb; ++ __u16 bd_blkbits; ++ __u16 bd_group; ++}; ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) ++ ++#ifndef EXT3_MB_HISTORY ++#define ext3_mb_store_history(sb,ac) ++#else ++static void ext3_mb_store_history(struct super_block *, ++ struct ext3_allocation_context *ac); ++#endif ++ ++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) ++ ++static struct proc_dir_entry *proc_root_ext3; ++ ++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); ++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); ++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); ++int ext3_mb_reserve_blocks(struct super_block *, int); ++void ext3_mb_release_blocks(struct super_block *, int); ++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); ++void 
ext3_mb_free_committed_blocks(struct super_block *); ++ ++#if BITS_PER_LONG == 64 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 7UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~7UL); \ ++} ++#elif BITS_PER_LONG == 32 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 3UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~3UL); \ ++} ++#else ++#error "how many bits you are?!" ++#endif ++ ++static inline int mb_test_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ return ext2_test_bit(bit, addr); ++} ++ ++static inline void mb_set_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit(bit, addr); ++} ++ ++static inline void mb_set_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit_atomic(NULL, bit, addr); ++} ++ ++static inline void mb_clear_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit(bit, addr); ++} ++ ++static inline void mb_clear_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit_atomic(NULL, bit, addr); ++} ++ ++static inline int mb_find_next_zero_bit(void *addr, int max, int start) ++{ ++ int fix; ++#if BITS_PER_LONG == 64 ++ fix = ((unsigned long) addr & 7UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~7UL); ++#elif BITS_PER_LONG == 32 ++ fix = ((unsigned long) addr & 3UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~3UL); ++#else ++#error "how many bits you are?!" 
++#endif ++ max += fix; ++ start += fix; ++ return ext2_find_next_zero_bit(addr, max, start) - fix; ++} ++ ++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) ++{ ++ char *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(max != NULL); ++ ++ if (order > e3b->bd_blkbits + 1) { ++ *max = 0; ++ return NULL; ++ } ++ ++ /* at order 0 we see each particular block */ ++ *max = 1 << (e3b->bd_blkbits + 3); ++ if (order == 0) ++ return EXT3_MB_BITMAP(e3b); ++ ++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; ++ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; ++ ++ return bb; ++} ++ ++#ifdef AGGRESSIVE_CHECK ++ ++static void mb_check_buddy(struct ext3_buddy *e3b) ++{ ++ int order = e3b->bd_blkbits + 1; ++ int max, max2, i, j, k, count; ++ int fragments = 0, fstart; ++ void *buddy, *buddy2; ++ ++ if (!test_opt(e3b->bd_sb, MBALLOC)) ++ return; ++ ++ { ++ static int mb_check_counter = 0; ++ if (mb_check_counter++ % 300 != 0) ++ return; ++ } ++ ++ while (order > 1) { ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ buddy2 = mb_find_buddy(e3b, order - 1, &max2); ++ J_ASSERT(buddy2); ++ J_ASSERT(buddy != buddy2); ++ J_ASSERT(max * 2 == max2); ++ ++ count = 0; ++ for (i = 0; i < max; i++) { ++ ++ if (mb_test_bit(i, buddy)) { ++ /* only single bit in buddy2 may be 1 */ ++ if (!mb_test_bit(i << 1, buddy2)) ++ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); ++ else if (!mb_test_bit((i << 1) + 1, buddy2)) ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ continue; ++ } ++ ++ /* both bits in buddy2 must be 0 */ ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); ++ ++ for (j = 0; j < (1 << order); j++) { ++ k = (i * (1 << order)) + j; ++ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); ++ } ++ count++; ++ } ++ J_ASSERT(e3b->bd_info->bb_counters[order] == count); ++ order--; ++ } ++ ++ fstart = -1; ++ buddy = mb_find_buddy(e3b, 0, &max); ++ for (i = 0; i < max; i++) { ++ if 
(!mb_test_bit(i, buddy)) { ++ J_ASSERT(i >= e3b->bd_info->bb_first_free); ++ if (fstart == -1) { ++ fragments++; ++ fstart = i; ++ } ++ continue; ++ } ++ fstart = -1; ++ /* check used bits only */ ++ for (j = 0; j < e3b->bd_blkbits + 1; j++) { ++ buddy2 = mb_find_buddy(e3b, j, &max2); ++ k = i >> j; ++ J_ASSERT(k < max2); ++ J_ASSERT(mb_test_bit(k, buddy2)); ++ } ++ } ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); ++ J_ASSERT(e3b->bd_info->bb_fragments == fragments); ++} ++ ++#else ++#define mb_check_buddy(e3b) ++#endif ++ ++/* find most significant bit */ ++static int inline fmsb(unsigned short word) ++{ ++ int order; ++ ++ if (word > 255) { ++ order = 7; ++ word >>= 8; ++ } else { ++ order = -1; ++ } ++ ++ do { ++ order++; ++ word >>= 1; ++ } while (word != 0); ++ ++ return order; ++} ++ ++static void inline ++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, ++ int len, struct ext3_group_info *grp) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned short min, max, chunk, border; ++ ++ mb_debug("mark %u/%u free\n", first, len); ++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ border = 2 << sb->s_blocksize_bits; ++ ++ while (len > 0) { ++ /* find how many blocks can be covered since this position */ ++ max = ffs(first | border) - 1; ++ ++ /* find how many blocks of power 2 we need to mark */ ++ min = fmsb(len); ++ ++ mb_debug(" %u/%u -> max %u, min %u\n", ++ first & ((2 << sb->s_blocksize_bits) - 1), ++ len, max, min); ++ ++ if (max < min) ++ min = max; ++ chunk = 1 << min; ++ ++ /* mark multiblock chunks only */ ++ grp->bb_counters[min]++; ++ if (min > 0) { ++ mb_debug(" set %u at %u \n", first >> min, ++ sbi->s_mb_offsets[min]); ++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); ++ } ++ ++ len -= chunk; ++ first += chunk; ++ } ++} ++ ++static void ++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ++ struct ext3_group_info *grp) ++{ ++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); 
++ unsigned short i = 0, first, len; ++ unsigned free = 0, fragments = 0; ++ unsigned long long period = get_cycles(); ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ grp->bb_first_free = i; ++ while (i < max) { ++ fragments++; ++ first = i; ++ i = find_next_bit(bitmap, max, i); ++ len = i - first; ++ free += len; ++ if (len > 1) ++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); ++ else ++ grp->bb_counters[0]++; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ grp->bb_fragments = fragments; ++ ++ /* bb_state shouldn't being modified because all ++ * others waits for init completion on page lock */ ++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); ++ if (free != grp->bb_free) { ++ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n", ++ free, grp->bb_free); ++ grp->bb_free = free; ++ } ++ ++ period = get_cycles() - period; ++ spin_lock(&EXT3_SB(sb)->s_bal_lock); ++ EXT3_SB(sb)->s_mb_buddies_generated++; ++ EXT3_SB(sb)->s_mb_generation_time += period; ++ spin_unlock(&EXT3_SB(sb)->s_bal_lock); ++} ++ ++static int ext3_mb_init_cache(struct page *page) ++{ ++ int blocksize, blocks_per_page, groups_per_page; ++ int err = 0, i, first_group, first_block; ++ struct super_block *sb; ++ struct buffer_head *bhs; ++ struct buffer_head **bh; ++ struct inode *inode; ++ char *data, *bitmap; ++ ++ mb_debug("init page %lu\n", page->index); ++ ++ inode = page->mapping->host; ++ sb = inode->i_sb; ++ blocksize = 1 << inode->i_blkbits; ++ blocks_per_page = PAGE_CACHE_SIZE / blocksize; ++ ++ groups_per_page = blocks_per_page >> 1; ++ if (groups_per_page == 0) ++ groups_per_page = 1; ++ ++ /* allocate buffer_heads to read bitmaps */ ++ if (groups_per_page > 1) { ++ err = -ENOMEM; ++ i = sizeof(struct buffer_head *) * groups_per_page; ++ bh = kmalloc(i, GFP_NOFS); ++ if (bh == NULL) ++ goto out; ++ memset(bh, 0, i); ++ } else ++ bh = &bhs; ++ ++ first_group = page->index * blocks_per_page / 2; ++ ++ /* read all groups the page covers 
into the cache */ ++ for (i = 0; i < groups_per_page; i++) { ++ struct ext3_group_desc * desc; ++ ++ if (first_group + i >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ err = -EIO; ++ desc = ext3_get_group_desc(sb, first_group + i, NULL); ++ if (desc == NULL) ++ goto out; ++ ++ err = -ENOMEM; ++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); ++ if (bh[i] == NULL) ++ goto out; ++ ++ if (buffer_uptodate(bh[i])) ++ continue; ++ ++ lock_buffer(bh[i]); ++ if (buffer_uptodate(bh[i])) { ++ unlock_buffer(bh[i]); ++ continue; ++ } ++ ++ get_bh(bh[i]); ++ bh[i]->b_end_io = end_buffer_read_sync; ++ submit_bh(READ, bh[i]); ++ mb_debug("read bitmap for group %u\n", first_group + i); ++ } ++ ++ /* wait for I/O completion */ ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ wait_on_buffer(bh[i]); ++ ++ /* XXX: I/O error handling here */ ++ ++ first_block = page->index * blocks_per_page; ++ for (i = 0; i < blocks_per_page; i++) { ++ int group; ++ ++ group = (first_block + i) >> 1; ++ if (group >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ data = page_address(page) + (i * blocksize); ++ bitmap = bh[group - first_group]->b_data; ++ ++ if ((first_block + i) & 1) { ++ /* this is block of buddy */ ++ mb_debug("put buddy for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memset(data, 0xff, blocksize); ++ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0; ++ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0, ++ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); ++ ext3_mb_generate_buddy(sb, data, bitmap, ++ EXT3_SB(sb)->s_group_info[group]); ++ } else { ++ /* this is block of bitmap */ ++ mb_debug("put bitmap for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memcpy(data, bitmap, blocksize); ++ } ++ } ++ SetPageUptodate(page); ++ ++out: ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ brelse(bh[i]); ++ if (bh && bh != &bhs) ++ kfree(bh); ++ return err; ++} ++ ++static int ext3_mb_load_buddy(struct super_block *sb, 
int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *inode = sbi->s_buddy_cache; ++ int blocks_per_page, block, pnum, poff; ++ struct page *page; ++ ++ mb_debug("load group %u\n", group); ++ ++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_info = sbi->s_group_info[group]; ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ e3b->bd_buddy_page = NULL; ++ e3b->bd_bitmap_page = NULL; ++ ++ block = group * 2; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_bitmap_page = page; ++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ block++; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_buddy_page = page; ++ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ return 0; ++ ++err: ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++ e3b->bd_buddy = NULL; ++ e3b->bd_bitmap = NULL; ++ return -EIO; ++} ++ ++static void 
ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++} ++ ++ ++static inline void ++ext3_lock_group(struct super_block *sb, int group) ++{ ++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++} ++ ++static inline void ++ext3_unlock_group(struct super_block *sb, int group) ++{ ++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++} ++ ++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) ++{ ++ int order = 1; ++ void *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ while (order <= e3b->bd_blkbits + 1) { ++ block = block >> 1; ++ if (!mb_test_bit(block, bb)) { ++ /* this block is part of buddy of order 'order' */ ++ return order; ++ } ++ bb += 1 << (e3b->bd_blkbits - order); ++ order++; ++ } ++ return 0; ++} ++ ++static inline void mb_clear_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0; ++ cur += 32; ++ continue; ++ } ++ mb_clear_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static inline void mb_set_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0xffffffff; ++ cur += 32; ++ continue; ++ } ++ mb_set_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) ++{ ++ int block = 0, max = 0, order; ++ void *buddy, *buddy2; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_info->bb_free += count; ++ if (first < 
e3b->bd_info->bb_first_free) ++ e3b->bd_info->bb_first_free = first; ++ ++ /* let's maintain fragments counter */ ++ if (first != 0) ++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); ++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); ++ if (block && max) ++ e3b->bd_info->bb_fragments--; ++ else if (!block && !max) ++ e3b->bd_info->bb_fragments++; ++ ++ /* let's maintain buddy itself */ ++ while (count-- > 0) { ++ block = first++; ++ order = 0; ++ ++ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); ++ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); ++ e3b->bd_info->bb_counters[order]++; ++ ++ /* start of the buddy */ ++ buddy = mb_find_buddy(e3b, order, &max); ++ ++ do { ++ block &= ~1UL; ++ if (mb_test_bit(block, buddy) || ++ mb_test_bit(block + 1, buddy)) ++ break; ++ ++ /* both the buddies are free, try to coalesce them */ ++ buddy2 = mb_find_buddy(e3b, order + 1, &max); ++ ++ if (!buddy2) ++ break; ++ ++ if (order > 0) { ++ /* for special purposes, we don't set ++ * free bits in bitmap */ ++ mb_set_bit(block, buddy); ++ mb_set_bit(block + 1, buddy); ++ } ++ e3b->bd_info->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; ++ ++ block = block >> 1; ++ order++; ++ e3b->bd_info->bb_counters[order]++; ++ ++ mb_clear_bit(block, buddy2); ++ buddy = buddy2; ++ } while (1); ++ } ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, ++ int needed, struct ext3_free_extent *ex) ++{ ++ int next, max, ord; ++ void *buddy; ++ ++ J_ASSERT(ex != NULL); ++ ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ J_ASSERT(block < max); ++ if (mb_test_bit(block, buddy)) { ++ ex->fe_len = 0; ++ ex->fe_start = 0; ++ ex->fe_group = 0; ++ return 0; ++ } ++ ++ if (likely(order == 0)) { ++ /* find actual order */ ++ order = mb_find_order_for_block(e3b, block); ++ block = block >> order; ++ } ++ ++ ex->fe_len = 1 << order; ++ 
ex->fe_start = block << order; ++ ex->fe_group = e3b->bd_group; ++ ++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { ++ ++ if (block + 1 >= max) ++ break; ++ ++ next = (block + 1) * (1 << order); ++ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) ++ break; ++ ++ ord = mb_find_order_for_block(e3b, next); ++ ++ order = ord; ++ block = next >> order; ++ ex->fe_len += 1 << order; ++ } ++ ++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); ++ return ex->fe_len; ++} ++ ++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) ++{ ++ int ord, mlen = 0, max = 0, cur; ++ int start = ex->fe_start; ++ int len = ex->fe_len; ++ unsigned ret = 0; ++ int len0 = len; ++ void *buddy; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_info->bb_free -= len; ++ if (e3b->bd_info->bb_first_free == start) ++ e3b->bd_info->bb_first_free += len; ++ ++ /* let's maintain fragments counter */ ++ if (start != 0) ++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); ++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); ++ if (mlen && max) ++ e3b->bd_info->bb_fragments++; ++ else if (!mlen && !max) ++ e3b->bd_info->bb_fragments--; ++ ++ /* let's maintain buddy itself */ ++ while (len) { ++ ord = mb_find_order_for_block(e3b, start); ++ ++ if (((start >> ord) << ord) == start && len >= (1 << ord)) { ++ /* the whole chunk may be allocated at once! 
*/ ++ mlen = 1 << ord; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ J_ASSERT((start >> ord) < max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_info->bb_counters[ord]--; ++ start += mlen; ++ len -= mlen; ++ J_ASSERT(len >= 0); ++ continue; ++ } ++ ++ /* store for history */ ++ if (ret == 0) ++ ret = len | (ord << 16); ++ ++ /* we have to split large buddy */ ++ J_ASSERT(ord > 0); ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_info->bb_counters[ord]--; ++ ++ ord--; ++ cur = (start >> ord) & ~1U; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_clear_bit(cur, buddy); ++ mb_clear_bit(cur + 1, buddy); ++ e3b->bd_info->bb_counters[ord]++; ++ e3b->bd_info->bb_counters[ord]++; ++ } ++ ++ /* now drop all the bits in bitmap */ ++ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); ++ ++ mb_check_buddy(e3b); ++ ++ return ret; ++} ++ ++/* ++ * Must be called under group lock! ++ */ ++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ unsigned long ret; ++ ++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ++ ret = mb_mark_used(e3b, &ac->ac_b_ex); ++ ++ ac->ac_status = AC_STATUS_FOUND; ++ ac->ac_tail = ret & 0xffff; ++ ac->ac_buddy = ret >> 16; ++} ++ ++/* ++ * The routine checks whether found extent is good enough. If it is, ++ * then the extent gets marked used and flag is set to the context ++ * to stop scanning. Otherwise, the extent is compared with the ++ * previous found extent and if new one is better, then it's stored ++ * in the context. Later, the best found extent will be used, if ++ * mballoc can't find good enough extent. ++ * ++ * FIXME: real allocation policy is to be designed yet! 
++ */ ++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, ++ struct ext3_free_extent *ex, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent *bex = &ac->ac_b_ex; ++ struct ext3_free_extent *gex = &ac->ac_g_ex; ++ ++ J_ASSERT(ex->fe_len > 0); ++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ ++ ac->ac_found++; ++ ++ /* ++ * The special case - take what you catch first ++ */ ++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Let's check whether the chuck is good enough ++ */ ++ if (ex->fe_len == gex->fe_len) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If this is first found extent, just store it in the context ++ */ ++ if (bex->fe_len == 0) { ++ *bex = *ex; ++ return; ++ } ++ ++ /* ++ * If new found extent is better, store it in the context ++ */ ++ if (bex->fe_len < gex->fe_len) { ++ /* if the request isn't satisfied, any found extent ++ * larger than previous best one is better */ ++ if (ex->fe_len > bex->fe_len) ++ *bex = *ex; ++ } else if (ex->fe_len > gex->fe_len) { ++ /* if the request is satisfied, then we try to find ++ * an extent that still satisfy the request, but is ++ * smaller than previous one */ ++ *bex = *ex; ++ } ++ ++ /* ++ * Let's scan at least few extents and don't pick up a first one ++ */ ++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++ ++ /* ++ * We don't want to scan for a whole year ++ */ ++ if (ac->ac_found > ext3_mb_max_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++} ++ ++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent ex = ac->ac_b_ex; ++ int group = ex.fe_group, max, err; ++ ++ J_ASSERT(ex.fe_len > 0); ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if 
(err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); ++ ++ if (max > 0) { ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_free_extent ex; ++ ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max > 0) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++/* ++ * The routine scans buddy structures (not bitmap!) 
from given order ++ * to max order and tries to find big enough chunk to satisfy the req ++ */ ++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_group_info *grp = e3b->bd_info; ++ void *buddy; ++ int i, k, max; ++ ++ J_ASSERT(ac->ac_2order > 0); ++ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) { ++ if (grp->bb_counters[i] == 0) ++ continue; ++ ++ buddy = mb_find_buddy(e3b, i, &max); ++ if (buddy == NULL) { ++ printk(KERN_ALERT "looking for wrong order?\n"); ++ break; ++ } ++ ++ k = mb_find_next_zero_bit(buddy, max, 0); ++ J_ASSERT(k < max); ++ ++ ac->ac_found++; ++ ++ ac->ac_b_ex.fe_len = 1 << i; ++ ac->ac_b_ex.fe_start = k << i; ++ ac->ac_b_ex.fe_group = e3b->bd_group; ++ ++ ext3_mb_use_best_found(ac, e3b); ++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); ++ ++ if (unlikely(ext3_mb_stats)) ++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); ++ ++ break; ++ } ++} ++ ++/* ++ * The routine scans the group and measures all found extents. ++ * In order to optimize scanning, caller must pass number of ++ * free blocks in the group, so the routine can know upper limit. 
++ */ ++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ int i, free; ++ ++ free = e3b->bd_info->bb_free; ++ J_ASSERT(free > 0); ++ ++ i = e3b->bd_info->bb_first_free; ++ ++ while (free && ac->ac_status == AC_STATUS_CONTINUE) { ++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ if (i >= sb->s_blocksize * 8) { ++ J_ASSERT(free == 0); ++ break; ++ } ++ ++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(free >= ex.fe_len); ++ ++ ext3_mb_measure_extent(ac, &ex, e3b); ++ ++ i += ex.fe_len; ++ free -= ex.fe_len; ++ } ++} ++ ++static int ext3_mb_good_group(struct ext3_allocation_context *ac, ++ int group, int cr) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_group_info *grp = sbi->s_group_info[group]; ++ unsigned free, fragments, i, bits; ++ ++ J_ASSERT(cr >= 0 && cr < 4); ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); ++ ++ free = grp->bb_free; ++ fragments = grp->bb_fragments; ++ if (free == 0) ++ return 0; ++ if (fragments == 0) ++ return 0; ++ ++ switch (cr) { ++ case 0: ++ J_ASSERT(ac->ac_2order != 0); ++ bits = ac->ac_sb->s_blocksize_bits + 1; ++ for (i = ac->ac_2order; i < bits; i++) ++ if (grp->bb_counters[i] > 0) ++ return 1; ++ case 1: ++ if ((free / fragments) >= ac->ac_g_ex.fe_len) ++ return 1; ++ case 2: ++ if (free >= ac->ac_g_ex.fe_len) ++ return 1; ++ case 3: ++ return 1; ++ default: ++ BUG(); ++ } ++ ++ return 0; ++} ++ ++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *len, int flags, int *errp) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_allocation_context ac; ++ int i, group, block, cr, err = 0; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ struct buffer_head *gdp_bh; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct 
ext3_buddy e3b; ++ ++ J_ASSERT(len != NULL); ++ J_ASSERT(*len > 0); ++ ++ sb = inode->i_sb; ++ if (!sb) { ++ printk("ext3_mb_new_nblocks: nonexistent device"); ++ return 0; ++ } ++ ++ if (!test_opt(sb, MBALLOC)) { ++ static int ext3_mballoc_warning = 0; ++ if (ext3_mballoc_warning == 0) { ++ printk(KERN_ERR "EXT3-fs: multiblock request with " ++ "mballoc disabled!\n"); ++ ext3_mballoc_warning++; ++ } ++ *len = 1; ++ err = ext3_new_block_old(handle, inode, goal, errp); ++ return err; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ ++ /* ++ * We can't allocate > group size ++ */ ++ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) ++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* someone asks for non-reserved blocks */ ++ BUG_ON(*len > 1); ++ err = ext3_mb_reserve_blocks(sb, 1); ++ if (err) { ++ *errp = err; ++ return 0; ++ } ++ } ++ ++ /* ++ * Check quota for allocation of this blocks. ++ */ ++ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) ++ *len -= 1; ++ if (*len == 0) { ++ *errp = -EDQUOT; ++ block = 0; ++ goto out; ++ } ++ ++ /* start searching from the goal */ ++ if (goal < le32_to_cpu(es->s_first_data_block) || ++ goal >= le32_to_cpu(es->s_blocks_count)) ++ goal = le32_to_cpu(es->s_first_data_block); ++ group = (goal - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ block = ((goal - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ /* set up allocation goals */ ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_groups_scanned = 0; ++ ac.ac_ex_scanned = 0; ++ ac.ac_found = 0; ++ ac.ac_sb = inode->i_sb; ++ ac.ac_g_ex.fe_group = group; ++ ac.ac_g_ex.fe_start = block; ++ ac.ac_g_ex.fe_len = *len; ++ ac.ac_flags = flags; ++ ac.ac_2order = 0; ++ ac.ac_criteria = 0; ++ ++ /* probably, the request is for 2^8+ blocks (1/2/3/... 
MB) */ ++ i = ffs(*len); ++ if (i >= 8) { ++ i--; ++ if ((*len & (~(1 << i))) == 0) ++ ac.ac_2order = i; ++ } ++ ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ ++ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; ++ } ++ ++ /* Let's just scan groups to find more-less suitable blocks */ ++ cr = ac.ac_2order ? 0 : 1; ++repeat: ++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ ac.ac_criteria = cr; ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { ++ if (group == EXT3_SB(sb)->s_groups_count) ++ group = 0; ++ ++ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) { ++ /* we need full data about the group ++ * to make a good selection */ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ext3_mb_release_desc(&e3b); ++ } ++ ++ /* check is group good for our criteries */ ++ if (!ext3_mb_good_group(&ac, group, cr)) ++ continue; ++ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ++ ext3_lock_group(sb, group); ++ if (!ext3_mb_good_group(&ac, group, cr)) { ++ /* someone did allocation from this group */ ++ ext3_unlock_group(sb, group); ++ ext3_mb_release_desc(&e3b); ++ continue; ++ } ++ ++ ac.ac_groups_scanned++; ++ if (cr == 0) ++ ext3_mb_simple_scan_group(&ac, &e3b); ++ else ++ ext3_mb_complex_scan_group(&ac, &e3b); ++ ++ ext3_unlock_group(sb, group); ++ ++ ext3_mb_release_desc(&e3b); ++ ++ if (err) ++ goto out_err; ++ if (ac.ac_status != AC_STATUS_CONTINUE) ++ break; ++ } ++ } ++ ++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ /* ++ * We've been searching too long. 
Let's try to allocate ++ * the best chunk we've found so far ++ */ ++ ++ /*if (ac.ac_found > ext3_mb_max_to_scan) ++ printk(KERN_ERR "EXT3-fs: too long searching at " ++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, ++ ac.ac_g_ex.fe_len);*/ ++ ext3_mb_try_best_found(&ac, &e3b); ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * Someone more lucky has already allocated it. ++ * The only thing we can do is just take first ++ * found block(s) ++ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n"); ++ */ ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_flags |= EXT3_MB_HINT_FIRST; ++ cr = 3; ++ goto repeat; ++ } ++ } ++ ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * We aren't lucky definitely ++ */ ++ DQUOT_FREE_BLOCK(inode, *len); ++ *errp = -ENOSPC; ++ block = 0; ++#if 1 ++ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n", ++ ac.ac_status, ac.ac_flags); ++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n", ++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, ++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); ++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", ++ sbi->s_blocks_reserved, ac.ac_found); ++ printk("EXT3-fs: groups: "); ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) ++ printk("%d: %d ", i, ++ sbi->s_group_info[i]->bb_free); ++ printk("\n"); ++#endif ++ goto out; ++ } ++ ++found: ++ J_ASSERT(ac.ac_b_ex.fe_len > 0); ++ ++ /* good news - free block(s) have been found. 
now it's time ++ * to mark block(s) in good old journaled bitmap */ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ /* we made a desicion, now mark found blocks in good old ++ * bitmap to be journaled */ ++ ++ ext3_debug("using block group %d(%d)\n", ++ ac.ac_b_group.group, gdp->bg_free_blocks_count); ++ ++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); ++ if (!bitmap_bh) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) { ++ *errp = err; ++ goto out_err; ++ } ++ ++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); ++ if (!gdp) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ if (block == le32_to_cpu(gdp->bg_block_bitmap) || ++ block == le32_to_cpu(gdp->bg_inode_bitmap) || ++ in_range(block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error(sb, "ext3_new_block", ++ "Allocating block in system zone - " ++ "block = %u", block); ++#ifdef AGGRESSIVE_CHECK ++ for (i = 0; i < ac.ac_b_ex.fe_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); ++#endif ++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); ++ ++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) ++ - ac.ac_b_ex.fe_len); ++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); ++ ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ if (err) ++ goto out_err; ++ err = ext3_journal_dirty_metadata(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ sb->s_dirt = 1; ++ *errp = 0; ++ 
brelse(bitmap_bh); ++ ++ /* drop non-allocated, but dquote'd blocks */ ++ J_ASSERT(*len >= ac.ac_b_ex.fe_len); ++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); ++ ++ *len = ac.ac_b_ex.fe_len; ++ J_ASSERT(*len > 0); ++ J_ASSERT(block != 0); ++ goto out; ++ ++out_err: ++ /* if we've already allocated something, roll it back */ ++ if (ac.ac_status == AC_STATUS_FOUND) { ++ /* FIXME: free blocks here */ ++ } ++ ++ DQUOT_FREE_BLOCK(inode, *len); ++ brelse(bitmap_bh); ++ *errp = err; ++ block = 0; ++out: ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* block wasn't reserved before and we reserved it ++ * at the beginning of allocation. it doesn't matter ++ * whether we allocated anything or we failed: time ++ * to release reservation. NOTE: because I expect ++ * any multiblock request from delayed allocation ++ * path only, here is single block always */ ++ ext3_mb_release_blocks(sb, 1); ++ } ++ ++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { ++ atomic_inc(&sbi->s_bal_reqs); ++ atomic_add(*len, &sbi->s_bal_allocated); ++ if (*len >= ac.ac_g_ex.fe_len) ++ atomic_inc(&sbi->s_bal_success); ++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); ++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && ++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) ++ atomic_inc(&sbi->s_bal_goals); ++ if (ac.ac_found > ext3_mb_max_to_scan) ++ atomic_inc(&sbi->s_bal_breaks); ++ } ++ ++ ext3_mb_store_history(sb, &ac); ++ ++ return block; ++} ++EXPORT_SYMBOL(ext3_mb_new_blocks); ++ ++#ifdef EXT3_MB_HISTORY ++struct ext3_mb_proc_session { ++ struct ext3_mb_history *history; ++ struct super_block *sb; ++ int start; ++ int max; ++}; ++ ++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, ++ struct ext3_mb_history *hs, ++ int first) ++{ ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (!first && hs == s->history + s->start) ++ return NULL; ++ while (hs->goal.fe_len == 0) { ++ hs++; ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (hs == s->history + 
s->start) ++ return NULL; ++ } ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs; ++ int l = *pos; ++ ++ if (l == 0) ++ return SEQ_START_TOKEN; ++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ if (!hs) ++ return NULL; ++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs = v; ++ ++ ++*pos; ++ if (v == SEQ_START_TOKEN) ++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ else ++ return ext3_mb_history_skip_empty(s, ++hs, 0); ++} ++ ++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) ++{ ++ struct ext3_mb_history *hs = v; ++ char buf[20], buf2[20]; ++ ++ if (v == SEQ_START_TOKEN) { ++ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "goal", "result", "found", "grps", "cr", "merge", ++ "tail", "broken"); ++ return 0; ++ } ++ ++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, ++ hs->goal.fe_start, hs->goal.fe_len); ++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, ++ hs->result.fe_start, hs->result.fe_len); ++ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, ++ buf2, hs->found, hs->groups, hs->cr, ++ hs->merged ? "M" : "", hs->tail, ++ hs->buddy ? 
1 << hs->buddy : 0); ++ return 0; ++} ++ ++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_history_ops = { ++ .start = ext3_mb_seq_history_start, ++ .next = ext3_mb_seq_history_next, ++ .stop = ext3_mb_seq_history_stop, ++ .show = ext3_mb_seq_history_show, ++}; ++ ++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_proc_session *s; ++ int rc, size; ++ ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (s == NULL) ++ return -EIO; ++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; ++ s->history = kmalloc(size, GFP_KERNEL); ++ if (s == NULL) { ++ kfree(s); ++ return -EIO; ++ } ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(s->history, sbi->s_mb_history, size); ++ s->max = sbi->s_mb_history_max; ++ s->start = sbi->s_mb_history_cur % s->max; ++ spin_unlock(&sbi->s_mb_history_lock); ++ ++ rc = seq_open(file, &ext3_mb_seq_history_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = s; ++ } else { ++ kfree(s->history); ++ kfree(s); ++ } ++ return rc; ++ ++} ++ ++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) ++{ ++ struct seq_file *seq = (struct seq_file *)file->private_data; ++ struct ext3_mb_proc_session *s = seq->private; ++ kfree(s->history); ++ kfree(s); ++ return seq_release(inode, file); ++} ++ ++static struct file_operations ext3_mb_seq_history_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, ++}; ++ ++static void ext3_mb_history_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_history", sbi->s_mb_proc); ++ 
remove_proc_entry(name, proc_root_ext3); ++ ++ if (sbi->s_mb_history) ++ kfree(sbi->s_mb_history); ++} ++ ++static void ext3_mb_history_init(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ int i; ++ ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); ++ if (sbi->s_mb_proc != NULL) { ++ struct proc_dir_entry *p; ++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_history_fops; ++ p->data = sb; ++ } ++ } ++ ++ sbi->s_mb_history_max = 1000; ++ sbi->s_mb_history_cur = 0; ++ spin_lock_init(&sbi->s_mb_history_lock); ++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); ++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); ++ memset(sbi->s_mb_history, 0, i); ++ /* if we can't allocate history, then we simple won't use it */ ++} ++ ++static void ++ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_history h; ++ ++ if (likely(sbi->s_mb_history == NULL)) ++ return; ++ ++ h.goal = ac->ac_g_ex; ++ h.result = ac->ac_b_ex; ++ h.found = ac->ac_found; ++ h.cr = ac->ac_criteria; ++ h.groups = ac->ac_groups_scanned; ++ h.tail = ac->ac_tail; ++ h.buddy = ac->ac_buddy; ++ h.merged = 0; ++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) ++ h.merged = 1; ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); ++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) ++ sbi->s_mb_history_cur = 0; ++ spin_unlock(&sbi->s_mb_history_lock); ++} ++ ++#else ++#define ext3_mb_history_release(sb) ++#define ext3_mb_history_init(sb) ++#endif ++ ++int ext3_mb_init_backend(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i, len; ++ ++ len = sizeof(struct ext3_buddy_group_blocks *) * 
sbi->s_groups_count; ++ sbi->s_group_info = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n"); ++ return -ENOMEM; ++ } ++ memset(sbi->s_group_info, 0, len); ++ ++ sbi->s_buddy_cache = new_inode(sb); ++ if (sbi->s_buddy_cache == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); ++ kfree(sbi->s_group_info); ++ return -ENOMEM; ++ } ++ ++ /* ++ * calculate needed size. if change bb_counters size, ++ * don't forget about ext3_mb_generate_buddy() ++ */ ++ len = sizeof(struct ext3_group_info); ++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ struct ext3_group_desc * desc; ++ ++ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_group_info[i] == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ goto err_out; ++ } ++ desc = ext3_get_group_desc(sb, i, NULL); ++ if (desc == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant read descriptor %u\n", i); ++ goto err_out; ++ } ++ memset(sbi->s_group_info[i], 0, len); ++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, ++ &sbi->s_group_info[i]->bb_state); ++ sbi->s_group_info[i]->bb_free = ++ le16_to_cpu(desc->bg_free_blocks_count); ++ } ++ ++ return 0; ++ ++err_out: ++ while (--i >= 0) ++ kfree(sbi->s_group_info[i]); ++ iput(sbi->s_buddy_cache); ++ ++ return -ENOMEM; ++} ++ ++int ext3_mb_init(struct super_block *sb, int needs_recovery) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *root = sb->s_root->d_inode; ++ unsigned i, offset, max; ++ struct dentry *dentry; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); ++ ++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_offsets == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ return -ENOMEM; ++ } ++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_maxs == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ 
kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ ++ /* order 0 is regular bitmap */ ++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; ++ sbi->s_mb_offsets[0] = 0; ++ ++ i = 1; ++ offset = 0; ++ max = sb->s_blocksize << 2; ++ do { ++ sbi->s_mb_offsets[i] = offset; ++ sbi->s_mb_maxs[i] = max; ++ offset += 1 << (sb->s_blocksize_bits - i); ++ max = max >> 1; ++ i++; ++ } while (i <= sb->s_blocksize_bits + 1); ++ ++ ++ /* init file for buddy data */ ++ if ((i = ext3_mb_init_backend(sb))) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return i; ++ } ++ ++ spin_lock_init(&sbi->s_reserve_lock); ++ spin_lock_init(&sbi->s_md_lock); ++ INIT_LIST_HEAD(&sbi->s_active_transaction); ++ INIT_LIST_HEAD(&sbi->s_closed_transaction); ++ INIT_LIST_HEAD(&sbi->s_committed_transaction); ++ spin_lock_init(&sbi->s_bal_lock); ++ ++ /* remove old on-disk buddy file */ ++ mutex_lock(&root->i_mutex); ++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy")); ++ if (dentry->d_inode != NULL) { ++ i = vfs_unlink(root, dentry); ++ if (i != 0) ++ printk("EXT3-fs: can't remove .buddy file: %d\n", i); ++ } ++ dput(dentry); ++ mutex_unlock(&root->i_mutex); ++ ++ ext3_mb_history_init(sb); ++ ++ printk("EXT3-fs: mballoc enabled\n"); ++ return 0; ++} ++ ++int ext3_mb_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* release freed, non-committed blocks */ ++ spin_lock(&sbi->s_md_lock); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_committed_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ ext3_mb_free_committed_blocks(sb); ++ ++ if (sbi->s_group_info) { ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_group_info[i] == NULL) ++ continue; ++ kfree(sbi->s_group_info[i]); ++ } ++ kfree(sbi->s_group_info); ++ } ++ if (sbi->s_mb_offsets) ++ 
kfree(sbi->s_mb_offsets); ++ if (sbi->s_mb_maxs) ++ kfree(sbi->s_mb_maxs); ++ if (sbi->s_buddy_cache) ++ iput(sbi->s_buddy_cache); ++ if (sbi->s_blocks_reserved) ++ printk("ext3-fs: %ld blocks being reserved at umount!\n", ++ sbi->s_blocks_reserved); ++ if (ext3_mb_stats) { ++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", ++ atomic_read(&sbi->s_bal_allocated), ++ atomic_read(&sbi->s_bal_reqs), ++ atomic_read(&sbi->s_bal_success)); ++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " ++ "%u 2^N hits, %u breaks\n", ++ atomic_read(&sbi->s_bal_ex_scanned), ++ atomic_read(&sbi->s_bal_goals), ++ atomic_read(&sbi->s_bal_2orders), ++ atomic_read(&sbi->s_bal_breaks)); ++ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", ++ sbi->s_mb_buddies_generated++, ++ sbi->s_mb_generation_time); ++ } ++ ++ ext3_mb_history_release(sb); ++ ++ return 0; ++} ++ ++void ext3_mb_free_committed_blocks(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int err, i, count = 0, count2 = 0; ++ struct ext3_free_metadata *md; ++ struct ext3_buddy e3b; ++ ++ if (list_empty(&sbi->s_committed_transaction)) ++ return; ++ ++ /* there is committed blocks to be freed yet */ ++ do { ++ /* get next array of blocks */ ++ md = NULL; ++ spin_lock(&sbi->s_md_lock); ++ if (!list_empty(&sbi->s_committed_transaction)) { ++ md = list_entry(sbi->s_committed_transaction.next, ++ struct ext3_free_metadata, list); ++ list_del(&md->list); ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ if (md == NULL) ++ break; ++ ++ mb_debug("gonna free %u blocks in group %u (0x%p):", ++ md->num, md->group, md); ++ ++ err = ext3_mb_load_buddy(sb, md->group, &e3b); ++ BUG_ON(err != 0); ++ ++ /* there are blocks to put in buddy to make them really free */ ++ count += md->num; ++ count2++; ++ ext3_lock_group(sb, md->group); ++ for (i = 0; i < md->num; i++) { ++ mb_debug(" %u", md->blocks[i]); ++ mb_free_blocks(&e3b, md->blocks[i], 1); ++ } ++ mb_debug("\n"); ++ ext3_unlock_group(sb, 
md->group); ++ ++ /* balance refcounts from ext3_mb_free_metadata() */ ++ page_cache_release(e3b.bd_buddy_page); ++ page_cache_release(e3b.bd_bitmap_page); ++ ++ kfree(md); ++ ext3_mb_release_desc(&e3b); ++ ++ } while (md); ++ mb_debug("freed %u blocks in %u structures\n", count, count2); ++} ++ ++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ if (sbi->s_last_transaction == handle->h_transaction->t_tid) ++ return; ++ ++ /* new transaction! time to close last one and free blocks for ++ * committed transaction. we know that only transaction can be ++ * active, so previos transaction can be being logged and we ++ * know that transaction before previous is known to be already ++ * logged. this means that now we may free blocks freed in all ++ * transactions before previous one. hope I'm clear enough ... */ ++ ++ spin_lock(&sbi->s_md_lock); ++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { ++ mb_debug("new transaction %lu, old %lu\n", ++ (unsigned long) handle->h_transaction->t_tid, ++ (unsigned long) sbi->s_last_transaction); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_closed_transaction); ++ sbi->s_last_transaction = handle->h_transaction->t_tid; ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ ext3_mb_free_committed_blocks(sb); ++} ++ ++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, ++ int group, int block, int count) ++{ ++ struct ext3_group_info *db = e3b->bd_info; ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_free_metadata *md; ++ int i; ++ ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ ext3_lock_group(sb, group); ++ for (i = 0; i < count; i++) { ++ md = db->bb_md_cur; ++ if (md && db->bb_tid != handle->h_transaction->t_tid) { ++ db->bb_md_cur = NULL; ++ md = 
NULL; ++ } ++ ++ if (md == NULL) { ++ ext3_unlock_group(sb, group); ++ md = kmalloc(sizeof(*md), GFP_KERNEL); ++ if (md == NULL) ++ return -ENOMEM; ++ md->num = 0; ++ md->group = group; ++ ++ ext3_lock_group(sb, group); ++ if (db->bb_md_cur == NULL) { ++ spin_lock(&sbi->s_md_lock); ++ list_add(&md->list, &sbi->s_active_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ /* protect buddy cache from being freed, ++ * otherwise we'll refresh it from ++ * on-disk bitmap and lose not-yet-available ++ * blocks */ ++ page_cache_get(e3b->bd_buddy_page); ++ page_cache_get(e3b->bd_bitmap_page); ++ db->bb_md_cur = md; ++ db->bb_tid = handle->h_transaction->t_tid; ++ mb_debug("new md 0x%p for group %u\n", ++ md, md->group); ++ } else { ++ kfree(md); ++ md = db->bb_md_cur; ++ } ++ } ++ ++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); ++ md->blocks[md->num] = block + i; ++ md->num++; ++ if (md->num == EXT3_BB_MAX_BLOCKS) { ++ /* no more space, put full container on a sb's list */ ++ db->bb_md_cur = NULL; ++ } ++ } ++ ext3_unlock_group(sb, group); ++ return 0; ++} ++ ++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, ++ unsigned long block, unsigned long count, ++ int metadata, int *freed) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ unsigned long bit, overflow; ++ struct buffer_head *gd_bh; ++ unsigned long block_group; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ int err = 0, ret; ++ ++ *freed = 0; ++ sb = inode->i_sb; ++ if (!sb) { ++ printk ("ext3_free_blocks: nonexistent device"); ++ return; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ if (block < le32_to_cpu(es->s_first_data_block) || ++ block + count < block || ++ block + count > le32_to_cpu(es->s_blocks_count)) { ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks not in datazone - " ++ "block = %lu, count = %lu", block, count); ++ goto 
error_return; ++ } ++ ++ ext3_debug("freeing block %lu\n", block); ++ ++do_more: ++ overflow = 0; ++ block_group = (block - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ bit = (block - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb); ++ /* ++ * Check to see if we are freeing blocks across a group ++ * boundary. ++ */ ++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { ++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); ++ count -= overflow; ++ } ++ brelse(bitmap_bh); ++ bitmap_bh = read_block_bitmap(sb, block_group); ++ if (!bitmap_bh) ++ goto error_return; ++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); ++ if (!gdp) ++ goto error_return; ++ ++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || ++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || ++ in_range (block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group) || ++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks in system zones - " ++ "Block = %lu, count = %lu", ++ block, count); ++ ++ BUFFER_TRACE(bitmap_bh, "getting write access"); ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) ++ goto error_return; ++ ++ /* ++ * We are about to modify some metadata. 
Call the journal APIs ++ * to unshare ->b_data if a currently-committing transaction is ++ * using it ++ */ ++ BUFFER_TRACE(gd_bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, gd_bh); ++ if (err) ++ goto error_return; ++ ++ err = ext3_mb_load_buddy(sb, block_group, &e3b); ++ if (err) ++ goto error_return; ++ ++#ifdef AGGRESSIVE_CHECK ++ { ++ int i; ++ for (i = 0; i < count; i++) ++ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); ++ } ++#endif ++ mb_clear_bits(bitmap_bh->b_data, bit, count); ++ ++ /* We dirtied the bitmap block */ ++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ ++ if (metadata) { ++ /* blocks being freed are metadata. these blocks shouldn't ++ * be used until this transaction is committed */ ++ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); ++ } else { ++ ext3_lock_group(sb, block_group); ++ mb_free_blocks(&e3b, bit, count); ++ ext3_unlock_group(sb, block_group); ++ } ++ ++ spin_lock(sb_bgl_lock(sbi, block_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); ++ spin_unlock(sb_bgl_lock(sbi, block_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, count); ++ ++ ext3_mb_release_desc(&e3b); ++ ++ *freed = count; ++ ++ /* And the group descriptor block */ ++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ++ ret = ext3_journal_dirty_metadata(handle, gd_bh); ++ if (!err) err = ret; ++ ++ if (overflow && !err) { ++ block += count; ++ count = overflow; ++ goto do_more; ++ } ++ sb->s_dirt = 1; ++error_return: ++ brelse(bitmap_bh); ++ ext3_std_error(sb, err); ++ return; ++} ++ ++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int free, ret = -ENOSPC; ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); ++ if (blocks <= free - sbi->s_blocks_reserved) { 
++ sbi->s_blocks_reserved += blocks; ++ ret = 0; ++ } ++ spin_unlock(&sbi->s_reserve_lock); ++ return ret; ++} ++ ++void ext3_mb_release_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ sbi->s_blocks_reserved -= blocks; ++ WARN_ON(sbi->s_blocks_reserved < 0); ++ if (sbi->s_blocks_reserved < 0) ++ sbi->s_blocks_reserved = 0; ++ spin_unlock(&sbi->s_reserve_lock); ++} ++ ++int ext3_new_block(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *errp) ++{ ++ int ret, len; ++ ++ if (!test_opt(inode->i_sb, MBALLOC)) { ++ ret = ext3_new_block_old(handle, inode, goal, errp); ++ goto out; ++ } ++ len = 1; ++ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); ++out: ++ return ret; ++} ++ ++ ++void ext3_free_blocks(handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count, int metadata) ++{ ++ struct super_block *sb; ++ int freed; ++ ++ sb = inode->i_sb; ++ if (!test_opt(sb, MBALLOC)) ++ ext3_free_blocks_sb(handle, sb, block, count, &freed); ++ else ++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); ++ if (freed) ++ DQUOT_FREE_BLOCK(inode, freed); ++ return; ++} ++ ++#define EXT3_ROOT "ext3" ++#define EXT3_MB_STATS_NAME "mb_stats" ++#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" ++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" ++ ++static int ext3_mb_stats_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_stats); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_stats_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", ++ EXT3_MB_STATS_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if 
(copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); ++ return count; ++} ++ ++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", ++ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_max_to_scan = value; ++ ++ return count; ++} ++ ++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_min_to_scan = value; ++ ++ return count; ++} ++ ++int __init init_ext3_proc(void) ++{ ++ struct proc_dir_entry 
*proc_ext3_mb_stats; ++ struct proc_dir_entry *proc_ext3_mb_max_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_min_to_scan; ++ ++ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); ++ if (proc_root_ext3 == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT); ++ return -EIO; ++ } ++ ++ /* Initialize EXT3_MB_STATS_NAME */ ++ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_stats == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_STATS_NAME); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_stats->data = NULL; ++ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; ++ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; ++ ++ /* Initialize EXT3_MAX_TO_SCAN_NAME */ ++ proc_ext3_mb_max_to_scan = create_proc_entry( ++ EXT3_MB_MAX_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_max_to_scan == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_MAX_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_max_to_scan->data = NULL; ++ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; ++ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; ++ ++ /* Initialize EXT3_MIN_TO_SCAN_NAME */ ++ proc_ext3_mb_min_to_scan = create_proc_entry( ++ EXT3_MB_MIN_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_min_to_scan == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_MIN_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_min_to_scan->data = NULL; ++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; ++ 
proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; ++ ++ return 0; ++} ++ ++void exit_ext3_proc(void) ++{ ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++} ++ +Index: linux-stage/fs/ext3/extents.c +=================================================================== +--- linux-stage.orig/fs/ext3/extents.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/extents.c 2006-07-16 02:29:49.000000000 +0800 +@@ -771,7 +771,7 @@ cleanup: + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st + path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + +@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t + bh = sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, 
tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-stage/fs/ext3/xattr.c +=================================================================== +--- linux-stage.orig/fs/ext3/xattr.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/xattr.c 2006-07-16 02:29:49.000000000 +0800 +@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); +- ext3_free_blocks(handle, inode, bh->b_blocknr, 1); ++ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { +@@ -805,7 +805,7 @@ inserted: + new_bh = sb_getblk(sb, block); + if (!new_bh) { + getblk_failed: +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + error = -EIO; + goto cleanup; + } +Index: linux-stage/fs/ext3/balloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/balloc.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/balloc.c 2006-07-16 02:33:13.000000000 +0800 +@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ + * + * Return buffer_head on success or NULL in case of failure. 
+ */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -490,24 +490,6 @@ error_return: + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- ext3_fsblk_t block, unsigned long count) +-{ +- struct super_block * sb; +- unsigned long dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1463,7 +1445,7 @@ out: + return 0; + } + +-ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, ++ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int *errp) + { + unsigned long count = 1; +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/super.c 2006-07-16 02:29:49.000000000 +0800 +@@ -391,6 +391,7 @@ static void ext3_put_super (struct super + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -641,7 +642,7 @@ enum { + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, +- Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_extdebug, Opt_mballoc, + Opt_grpquota + }; + +@@ -696,6 +697,7 @@ static match_table_t tokens = { + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + 
{Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -1047,6 +1049,9 @@ clear_qf_name: + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt (sbi->s_mount_opt, MBALLOC); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1773,6 +1778,7 @@ static int ext3_fill_super (struct super + "writeback"); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + lock_kernel(); + return 0; + +@@ -2712,7 +2718,13 @@ static struct file_system_type ext3_fs_t + + static int __init init_ext3_fs(void) + { +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2734,6 +2746,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); + } + + int ext3_prep_san_write(struct inode *inode, long *blocks, +Index: linux-stage/fs/ext3/Makefile +=================================================================== +--- linux-stage.orig/fs/ext3/Makefile 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/Makefile 2006-07-16 02:29:49.000000000 +0800 +@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o ++ extents.o mballoc.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs.h 2006-07-16 02:29:49.000000000 +0800 +@@ -53,6 +53,14 @@ + #define ext3_debug(f, a...) 
do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -379,6 +387,7 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x4000000/* Buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -749,12 +758,12 @@ ext3_group_first_block_no(struct super_b + /* balloc.c */ + extern int ext3_bg_has_super(struct super_block *sb, int group); + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); +-extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, +- ext3_fsblk_t goal, int *errp); ++//extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, ++// ext3_fsblk_t goal, int *errp); + extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, unsigned long *count, int *errp); + extern void ext3_free_blocks (handle_t *handle, struct inode *inode, +- ext3_fsblk_t block, unsigned long count); ++ ext3_fsblk_t block, unsigned long count, int metadata); + extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb, + ext3_fsblk_t block, unsigned long count, + unsigned long *pdquot_freed_blocks); +@@ -881,6 +890,17 @@ extern void ext3_extents_initialize_bloc + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern long ext3_mb_stats; ++extern long ext3_mb_max_to_scan; ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, 
struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct super_block *, int); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); ++ + #endif /* __KERNEL__ */ + + /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ +Index: linux-stage/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs_sb.h 2006-07-16 02:29:49.000000000 +0800 +@@ -21,8 +21,14 @@ + #include + #include + #include ++#include + #endif + #include ++#include ++ ++struct ext3_buddy_group_blocks; ++struct ext3_mb_history; ++#define EXT3_BB_MAX_BLOCKS + + /* + * third extended-fs super-block data in memory +@@ -78,6 +84,38 @@ struct ext3_sb_info { + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_group_info **s_group_info; ++ struct inode *s_buddy_cache; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ unsigned short *s_mb_offsets, *s_mb_maxs; ++ ++ /* history to debug policy */ ++ struct ext3_mb_history *s_mb_history; ++ int s_mb_history_cur; ++ int s_mb_history_max; ++ struct proc_dir_entry *s_mb_proc; ++ spinlock_t s_mb_history_lock; ++ ++ /* stats for buddy allocator */ ++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ++ atomic_t s_bal_success; /* we found long enough chunks */ ++ atomic_t s_bal_allocated; /* in blocks */ ++ atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_goals; /* goal hits */ ++ atomic_t s_bal_breaks; /* too 
long searches */ ++ atomic_t s_bal_2orders; /* 2^order hits */ ++ spinlock_t s_bal_lock; ++ unsigned long s_mb_buddies_generated; ++ unsigned long long s_mb_generation_time; + }; + + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/inode.c 2006-07-16 02:29:49.000000000 +0800 +@@ -562,7 +562,7 @@ static int ext3_alloc_blocks(handle_t *h + return ret; + failed_out: + for (i = 0; i + #include ++#include + #endif + #endif + #include ++#include ++ ++struct ext3_buddy_group_blocks; ++struct ext3_mb_history; ++#define EXT3_BB_MAX_BLOCKS + + /* + * third extended-fs super-block data in memory +@@ -81,6 +87,43 @@ struct ext3_sb_info { + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_group_info ***s_group_info; ++ struct inode *s_buddy_cache; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ unsigned short *s_mb_offsets, *s_mb_maxs; ++ unsigned long s_stripe; ++ ++ /* history to debug policy */ ++ struct ext3_mb_history *s_mb_history; ++ int s_mb_history_cur; ++ int s_mb_history_max; ++ struct proc_dir_entry *s_mb_proc; ++ spinlock_t s_mb_history_lock; ++ ++ /* stats for buddy allocator */ ++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ++ atomic_t s_bal_success; /* we found long enough chunks */ ++ atomic_t s_bal_allocated; /* in blocks */ ++ atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_goals; /* goal hits */ ++ atomic_t s_bal_breaks; /* too long searches */ ++ atomic_t s_bal_2orders; /* 2^order hits */ ++ 
spinlock_t s_bal_lock; ++ unsigned long s_mb_buddies_generated; ++ unsigned long long s_mb_generation_time; + }; + ++#define EXT3_GROUP_INFO(sb, group) \ ++ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ ++ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] ++ + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-2.6.9-full/fs/ext3/super.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/super.c 2006-06-01 14:58:46.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/super.c 2006-10-24 12:54:31.000000000 +0400 +@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -597,6 +598,7 @@ enum { + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_extents, Opt_noextents, Opt_extdebug, ++ Opt_mballoc, Opt_nomballoc, Opt_stripe, + }; + + static match_table_t tokens = { +@@ -649,6 +651,9 @@ static match_table_t tokens = { + {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, ++ {Opt_nomballoc, "nomballoc"}, ++ {Opt_stripe, "stripe=%u"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -962,6 +967,19 @@ clear_qf_name: + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_nomballoc: ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_stripe: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_stripe = option; ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1651,6 +1669,7 @@ static int ext3_fill_super (struct super + ext3_count_dirs(sb)); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + + return 0; + +@@ -2433,7 
+2452,13 @@ static struct file_system_type ext3_fs_t + + static int __init init_ext3_fs(void) + { +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2455,6 +2480,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); + } + + int ext3_prep_san_write(struct inode *inode, long *blocks, +Index: linux-2.6.9-full/fs/ext3/extents.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/extents.c 2006-06-01 14:58:46.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/extents.c 2006-10-24 12:54:31.000000000 +0400 +@@ -777,7 +777,7 @@ cleanup: + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1434,7 +1434,7 @@ int ext3_ext_rm_idx(handle_t *handle, st + path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + +@@ -1919,10 +1919,12 @@ ext3_remove_blocks(struct ext3_extents_t + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1934,7 +1936,7 @@ ext3_remove_blocks(struct ext3_extents_t + bh = 
sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-2.6.9-full/fs/ext3/inode.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/inode.c 2006-06-01 14:58:46.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/inode.c 2006-10-24 12:54:31.000000000 +0400 +@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -673,7 +673,7 @@ err_out: + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, +- le32_to_cpu(where[i].key), 1); ++ le32_to_cpu(where[i].key), 1, 1); + return err; + } + +@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-2.6.9-full/fs/ext3/balloc.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/balloc.c 2006-03-10 18:20:03.000000000 +0300 ++++ linux-2.6.9-full/fs/ext3/balloc.c 2006-10-24 12:54:31.000000000 +0400 +@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ + * + * Return buffer_head on success or NULL in case of failure. 
+ */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -451,24 +451,6 @@ error_return: + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- unsigned long block, unsigned long count) +-{ +- struct super_block * sb; +- int dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1131,7 +1113,7 @@ int ext3_should_retry_alloc(struct super + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. 
+ */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-2.6.9-full/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/xattr.c 2006-05-18 23:57:04.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/xattr.c 2006-10-24 12:54:31.000000000 +0400 +@@ -1281,7 +1281,7 @@ ext3_xattr_set_handle2(handle_t *handle, + new_bh = sb_getblk(sb, block); + if (!new_bh) { + getblk_failed: +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + error = -EIO; + goto cleanup; + } +@@ -1328,7 +1328,7 @@ getblk_failed: + if (ce) + mb_cache_entry_free(ce); + ea_bdebug(old_bh, "freeing"); +- ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1); ++ ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1, 1); + + /* ext3_forget() calls bforget() for us, but we + let our caller release old_bh, so we need to +@@ -1427,7 +1427,7 @@ ext3_xattr_delete_inode(handle_t *handle + if (HDR(bh)->h_refcount == cpu_to_le32(1)) { + if (ce) + mb_cache_entry_free(ce); +- ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1); ++ ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); + } else { +Index: linux-2.6.9-full/fs/ext3/mballoc.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2006-10-23 18:07:54.821533176 +0400 ++++ linux-2.6.9-full/fs/ext3/mballoc.c 2006-10-24 13:00:56.000000000 +0400 +@@ -0,0 +1,2729 @@ ++/* ++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the 
Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++ ++/* ++ * mballoc.c contains the multiblocks allocation routines ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * TODO: ++ * - bitmap read-ahead (proposed by Oleg Drokin aka green) ++ * - track min/max extents in each group for better group selection ++ * - mb_mark_used() may allocate chunk right after splitting buddy ++ * - special flag to advice allocator to look for requested + N blocks ++ * this may improve interaction between extents and mballoc ++ * - tree of groups sorted by number of free blocks ++ * - percpu reservation code (hotpath) ++ * - error handling ++ */ ++ ++/* ++ * with AGRESSIVE_CHECK allocator runs consistency checks over ++ * structures. these checks slow things down a lot ++ */ ++#define AGGRESSIVE_CHECK__ ++ ++/* ++ */ ++#define MB_DEBUG__ ++#ifdef MB_DEBUG ++#define mb_debug(fmt,a...) printk(fmt, ##a) ++#else ++#define mb_debug(fmt,a...) ++#endif ++ ++/* ++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory ++ * and you can monitor it in /proc/fs/ext3//mb_history ++ */ ++#define EXT3_MB_HISTORY ++ ++/* ++ * How long mballoc can look for a best extent (in found extents) ++ */ ++long ext3_mb_max_to_scan = 500; ++ ++/* ++ * How long mballoc must look for a best extent ++ */ ++long ext3_mb_min_to_scan = 30; ++ ++/* ++ * with 'ext3_mb_stats' allocator will collect stats that will be ++ * shown at umount. 
The collecting costs though! ++ */ ++ ++long ext3_mb_stats = 1; ++ ++/* ++ * for which requests use 2^N search using buddies ++ */ ++long ext3_mb_order2_reqs = 8; ++ ++#ifdef EXT3_BB_MAX_BLOCKS ++#undef EXT3_BB_MAX_BLOCKS ++#endif ++#define EXT3_BB_MAX_BLOCKS 30 ++ ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++struct ext3_group_info { ++ unsigned long bb_state; ++ unsigned long bb_tid; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned short bb_fragments; ++ unsigned short bb_counters[]; ++}; ++ ++ ++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 ++#define EXT3_GROUP_INFO_LOCKED_BIT 1 ++ ++#define EXT3_MB_GRP_NEED_INIT(grp) \ ++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) ++ ++struct ext3_free_extent { ++ __u16 fe_start; ++ __u16 fe_len; ++ __u16 fe_group; ++}; ++ ++struct ext3_allocation_context { ++ struct super_block *ac_sb; ++ ++ /* search goals */ ++ struct ext3_free_extent ac_g_ex; ++ ++ /* the best found extent */ ++ struct ext3_free_extent ac_b_ex; ++ ++ /* number of iterations done. 
we have to track to limit searching */ ++ unsigned long ac_ex_scanned; ++ __u16 ac_groups_scanned; ++ __u16 ac_found; ++ __u16 ac_tail; ++ __u16 ac_buddy; ++ __u8 ac_status; ++ __u8 ac_flags; /* allocation hints */ ++ __u8 ac_criteria; ++ __u8 ac_repeats; ++ __u8 ac_2order; /* if request is to allocate 2^N blocks and ++ * N > 0, the field stores N, otherwise 0 */ ++ ++ struct page *ac_buddy_page; ++ struct page *ac_bitmap_page; ++}; ++ ++#define AC_STATUS_CONTINUE 1 ++#define AC_STATUS_FOUND 2 ++#define AC_STATUS_BREAK 3 ++ ++struct ext3_mb_history { ++ struct ext3_free_extent goal; /* goal allocation */ ++ struct ext3_free_extent result; /* result allocation */ ++ unsigned pid; ++ unsigned ino; ++ __u16 found; /* how many extents have been found */ ++ __u16 groups; /* how many groups have been scanned */ ++ __u16 tail; /* what tail broke some buddy */ ++ __u16 buddy; /* buddy the tail ^^^ broke */ ++ __u8 cr; /* which phase the result extent was found at */ ++ __u8 merged; ++}; ++ ++struct ext3_buddy { ++ struct page *bd_buddy_page; ++ void *bd_buddy; ++ struct page *bd_bitmap_page; ++ void *bd_bitmap; ++ struct ext3_group_info *bd_info; ++ struct super_block *bd_sb; ++ __u16 bd_blkbits; ++ __u16 bd_group; ++}; ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) ++ ++#ifndef EXT3_MB_HISTORY ++#define ext3_mb_store_history(sb,ino,ac) ++#else ++static void ext3_mb_store_history(struct super_block *, unsigned ino, ++ struct ext3_allocation_context *ac); ++#endif ++ ++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) ++ ++static struct proc_dir_entry *proc_root_ext3; ++ ++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); ++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); ++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); ++int ext3_mb_reserve_blocks(struct super_block *, int); ++void ext3_mb_release_blocks(struct 
super_block *, int); ++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); ++void ext3_mb_free_committed_blocks(struct super_block *); ++ ++#if BITS_PER_LONG == 64 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 7UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~7UL); \ ++} ++#elif BITS_PER_LONG == 32 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 3UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~3UL); \ ++} ++#else ++#error "how many bits you are?!" ++#endif ++ ++static inline int mb_test_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ return ext2_test_bit(bit, addr); ++} ++ ++static inline void mb_set_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit(bit, addr); ++} ++ ++static inline void mb_set_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit_atomic(NULL, bit, addr); ++} ++ ++static inline void mb_clear_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit(bit, addr); ++} ++ ++static inline void mb_clear_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit_atomic(NULL, bit, addr); ++} ++ ++static inline int mb_find_next_zero_bit(void *addr, int max, int start) ++{ ++ int fix; ++#if BITS_PER_LONG == 64 ++ fix = ((unsigned long) addr & 7UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~7UL); ++#elif BITS_PER_LONG == 32 ++ fix = ((unsigned long) addr & 3UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~3UL); ++#else ++#error "how many bits you are?!" 
++#endif ++ max += fix; ++ start += fix; ++ return ext2_find_next_zero_bit(addr, max, start) - fix; ++} ++ ++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) ++{ ++ char *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(max != NULL); ++ ++ if (order > e3b->bd_blkbits + 1) { ++ *max = 0; ++ return NULL; ++ } ++ ++ /* at order 0 we see each particular block */ ++ *max = 1 << (e3b->bd_blkbits + 3); ++ if (order == 0) ++ return EXT3_MB_BITMAP(e3b); ++ ++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; ++ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; ++ ++ return bb; ++} ++ ++#ifdef AGGRESSIVE_CHECK ++ ++static void mb_check_buddy(struct ext3_buddy *e3b) ++{ ++ int order = e3b->bd_blkbits + 1; ++ int max, max2, i, j, k, count; ++ int fragments = 0, fstart; ++ void *buddy, *buddy2; ++ ++ if (!test_opt(e3b->bd_sb, MBALLOC)) ++ return; ++ ++ { ++ static int mb_check_counter = 0; ++ if (mb_check_counter++ % 300 != 0) ++ return; ++ } ++ ++ while (order > 1) { ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ buddy2 = mb_find_buddy(e3b, order - 1, &max2); ++ J_ASSERT(buddy2); ++ J_ASSERT(buddy != buddy2); ++ J_ASSERT(max * 2 == max2); ++ ++ count = 0; ++ for (i = 0; i < max; i++) { ++ ++ if (mb_test_bit(i, buddy)) { ++ /* only single bit in buddy2 may be 1 */ ++ if (!mb_test_bit(i << 1, buddy2)) ++ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); ++ else if (!mb_test_bit((i << 1) + 1, buddy2)) ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ continue; ++ } ++ ++ /* both bits in buddy2 must be 0 */ ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); ++ ++ for (j = 0; j < (1 << order); j++) { ++ k = (i * (1 << order)) + j; ++ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); ++ } ++ count++; ++ } ++ J_ASSERT(e3b->bd_info->bb_counters[order] == count); ++ order--; ++ } ++ ++ fstart = -1; ++ buddy = mb_find_buddy(e3b, 0, &max); ++ for (i = 0; i < max; i++) { ++ if 
(!mb_test_bit(i, buddy)) { ++ J_ASSERT(i >= e3b->bd_info->bb_first_free); ++ if (fstart == -1) { ++ fragments++; ++ fstart = i; ++ } ++ continue; ++ } ++ fstart = -1; ++ /* check used bits only */ ++ for (j = 0; j < e3b->bd_blkbits + 1; j++) { ++ buddy2 = mb_find_buddy(e3b, j, &max2); ++ k = i >> j; ++ J_ASSERT(k < max2); ++ J_ASSERT(mb_test_bit(k, buddy2)); ++ } ++ } ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); ++ J_ASSERT(e3b->bd_info->bb_fragments == fragments); ++} ++ ++#else ++#define mb_check_buddy(e3b) ++#endif ++ ++/* find most significant bit */ ++static int inline fmsb(unsigned short word) ++{ ++ int order; ++ ++ if (word > 255) { ++ order = 7; ++ word >>= 8; ++ } else { ++ order = -1; ++ } ++ ++ do { ++ order++; ++ word >>= 1; ++ } while (word != 0); ++ ++ return order; ++} ++ ++static void inline ++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, ++ int len, struct ext3_group_info *grp) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned short min, max, chunk, border; ++ ++ mb_debug("mark %u/%u free\n", first, len); ++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ border = 2 << sb->s_blocksize_bits; ++ ++ while (len > 0) { ++ /* find how many blocks can be covered since this position */ ++ max = ffs(first | border) - 1; ++ ++ /* find how many blocks of power 2 we need to mark */ ++ min = fmsb(len); ++ ++ mb_debug(" %u/%u -> max %u, min %u\n", ++ first & ((2 << sb->s_blocksize_bits) - 1), ++ len, max, min); ++ ++ if (max < min) ++ min = max; ++ chunk = 1 << min; ++ ++ /* mark multiblock chunks only */ ++ grp->bb_counters[min]++; ++ if (min > 0) { ++ mb_debug(" set %u at %u \n", first >> min, ++ sbi->s_mb_offsets[min]); ++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); ++ } ++ ++ len -= chunk; ++ first += chunk; ++ } ++} ++ ++static void ++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ++ int group) ++{ ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); ++ 
unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); ++ unsigned short i = 0, first, len; ++ unsigned free = 0, fragments = 0; ++ unsigned long long period = get_cycles(); ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ grp->bb_first_free = i; ++ while (i < max) { ++ fragments++; ++ first = i; ++ i = ext2_find_next_le_bit(bitmap, max, i); ++ len = i - first; ++ free += len; ++ if (len > 1) ++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); ++ else ++ grp->bb_counters[0]++; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ grp->bb_fragments = fragments; ++ ++ /* bb_state shouldn't being modified because all ++ * others waits for init completion on page lock */ ++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); ++ if (free != grp->bb_free) { ++ printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n", ++ group, free, grp->bb_free); ++ grp->bb_free = free; ++ } ++ ++ period = get_cycles() - period; ++ spin_lock(&EXT3_SB(sb)->s_bal_lock); ++ EXT3_SB(sb)->s_mb_buddies_generated++; ++ EXT3_SB(sb)->s_mb_generation_time += period; ++ spin_unlock(&EXT3_SB(sb)->s_bal_lock); ++} ++ ++static int ext3_mb_init_cache(struct page *page) ++{ ++ int blocksize, blocks_per_page, groups_per_page; ++ int err = 0, i, first_group, first_block; ++ struct super_block *sb; ++ struct buffer_head *bhs; ++ struct buffer_head **bh; ++ struct inode *inode; ++ char *data, *bitmap; ++ ++ mb_debug("init page %lu\n", page->index); ++ ++ inode = page->mapping->host; ++ sb = inode->i_sb; ++ blocksize = 1 << inode->i_blkbits; ++ blocks_per_page = PAGE_CACHE_SIZE / blocksize; ++ ++ groups_per_page = blocks_per_page >> 1; ++ if (groups_per_page == 0) ++ groups_per_page = 1; ++ ++ /* allocate buffer_heads to read bitmaps */ ++ if (groups_per_page > 1) { ++ err = -ENOMEM; ++ i = sizeof(struct buffer_head *) * groups_per_page; ++ bh = kmalloc(i, GFP_NOFS); ++ if (bh == NULL) ++ goto out; ++ memset(bh, 0, i); ++ } else ++ bh = &bhs; ++ ++ first_group = page->index * 
blocks_per_page / 2; ++ ++ /* read all groups the page covers into the cache */ ++ for (i = 0; i < groups_per_page; i++) { ++ struct ext3_group_desc * desc; ++ ++ if (first_group + i >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ err = -EIO; ++ desc = ext3_get_group_desc(sb, first_group + i, NULL); ++ if (desc == NULL) ++ goto out; ++ ++ err = -ENOMEM; ++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); ++ if (bh[i] == NULL) ++ goto out; ++ ++ if (buffer_uptodate(bh[i])) ++ continue; ++ ++ lock_buffer(bh[i]); ++ if (buffer_uptodate(bh[i])) { ++ unlock_buffer(bh[i]); ++ continue; ++ } ++ ++ get_bh(bh[i]); ++ bh[i]->b_end_io = end_buffer_read_sync; ++ submit_bh(READ, bh[i]); ++ mb_debug("read bitmap for group %u\n", first_group + i); ++ } ++ ++ /* wait for I/O completion */ ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ wait_on_buffer(bh[i]); ++ ++ err = -EIO; ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ if (!buffer_uptodate(bh[i])) ++ goto out; ++ ++ first_block = page->index * blocks_per_page; ++ for (i = 0; i < blocks_per_page; i++) { ++ int group; ++ ++ group = (first_block + i) >> 1; ++ if (group >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ data = page_address(page) + (i * blocksize); ++ bitmap = bh[group - first_group]->b_data; ++ ++ if ((first_block + i) & 1) { ++ /* this is block of buddy */ ++ mb_debug("put buddy for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memset(data, 0xff, blocksize); ++ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; ++ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, ++ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); ++ ext3_mb_generate_buddy(sb, data, bitmap, group); ++ } else { ++ /* this is block of bitmap */ ++ mb_debug("put bitmap for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memcpy(data, bitmap, blocksize); ++ } ++ } ++ SetPageUptodate(page); ++ ++out: ++ if (bh) { ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ brelse(bh[i]); ++ if (bh != 
&bhs) ++ kfree(bh); ++ } ++ return err; ++} ++ ++static int ext3_mb_load_buddy(struct super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *inode = sbi->s_buddy_cache; ++ int blocks_per_page, block, pnum, poff; ++ struct page *page; ++ ++ mb_debug("load group %u\n", group); ++ ++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_info = EXT3_GROUP_INFO(sb, group); ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ e3b->bd_buddy_page = NULL; ++ e3b->bd_bitmap_page = NULL; ++ ++ block = group * 2; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ /* we could use find_or_create_page(), but it locks page ++ * what we'd like to avoid in fast path ... */ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ BUG_ON(page->mapping != inode->i_mapping); ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_bitmap_page = page; ++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ block++; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ BUG_ON(page->mapping != inode->i_mapping); ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_buddy_page = page; ++ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ J_ASSERT(e3b->bd_bitmap_page 
!= NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ return 0; ++ ++err: ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++ e3b->bd_buddy = NULL; ++ e3b->bd_bitmap = NULL; ++ return -EIO; ++} ++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++} ++ ++ ++static inline void ++ext3_lock_group(struct super_block *sb, int group) ++{ ++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_GROUP_INFO(sb, group)->bb_state); ++} ++ ++static inline void ++ext3_unlock_group(struct super_block *sb, int group) ++{ ++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_GROUP_INFO(sb, group)->bb_state); ++} ++ ++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) ++{ ++ int order = 1; ++ void *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ while (order <= e3b->bd_blkbits + 1) { ++ block = block >> 1; ++ if (!mb_test_bit(block, bb)) { ++ /* this block is part of buddy of order 'order' */ ++ return order; ++ } ++ bb += 1 << (e3b->bd_blkbits - order); ++ order++; ++ } ++ return 0; ++} ++ ++static inline void mb_clear_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0; ++ cur += 32; ++ continue; ++ } ++ mb_clear_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static inline void mb_set_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0xffffffff; ++ cur += 32; ++ 
continue; ++ } ++ mb_set_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) ++{ ++ int block = 0, max = 0, order; ++ void *buddy, *buddy2; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_info->bb_free += count; ++ if (first < e3b->bd_info->bb_first_free) ++ e3b->bd_info->bb_first_free = first; ++ ++ /* let's maintain fragments counter */ ++ if (first != 0) ++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); ++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); ++ if (block && max) ++ e3b->bd_info->bb_fragments--; ++ else if (!block && !max) ++ e3b->bd_info->bb_fragments++; ++ ++ /* let's maintain buddy itself */ ++ while (count-- > 0) { ++ block = first++; ++ order = 0; ++ ++ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); ++ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); ++ e3b->bd_info->bb_counters[order]++; ++ ++ /* start of the buddy */ ++ buddy = mb_find_buddy(e3b, order, &max); ++ ++ do { ++ block &= ~1UL; ++ if (mb_test_bit(block, buddy) || ++ mb_test_bit(block + 1, buddy)) ++ break; ++ ++ /* both the buddies are free, try to coalesce them */ ++ buddy2 = mb_find_buddy(e3b, order + 1, &max); ++ ++ if (!buddy2) ++ break; ++ ++ if (order > 0) { ++ /* for special purposes, we don't set ++ * free bits in bitmap */ ++ mb_set_bit(block, buddy); ++ mb_set_bit(block + 1, buddy); ++ } ++ e3b->bd_info->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; ++ ++ block = block >> 1; ++ order++; ++ e3b->bd_info->bb_counters[order]++; ++ ++ mb_clear_bit(block, buddy2); ++ buddy = buddy2; ++ } while (1); ++ } ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, ++ int needed, struct ext3_free_extent *ex) ++{ ++ int next = block, max, ord; ++ void *buddy; ++ ++ J_ASSERT(ex != NULL); ++ ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ J_ASSERT(block < max); ++ 
if (mb_test_bit(block, buddy)) { ++ ex->fe_len = 0; ++ ex->fe_start = 0; ++ ex->fe_group = 0; ++ return 0; ++ } ++ ++ if (likely(order == 0)) { ++ /* find actual order */ ++ order = mb_find_order_for_block(e3b, block); ++ block = block >> order; ++ } ++ ++ ex->fe_len = 1 << order; ++ ex->fe_start = block << order; ++ ex->fe_group = e3b->bd_group; ++ ++ /* calc difference from given start */ ++ next = next - ex->fe_start; ++ ex->fe_len -= next; ++ ex->fe_start += next; ++ ++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { ++ ++ if (block + 1 >= max) ++ break; ++ ++ next = (block + 1) * (1 << order); ++ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) ++ break; ++ ++ ord = mb_find_order_for_block(e3b, next); ++ ++ order = ord; ++ block = next >> order; ++ ex->fe_len += 1 << order; ++ } ++ ++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); ++ return ex->fe_len; ++} ++ ++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) ++{ ++ int ord, mlen = 0, max = 0, cur; ++ int start = ex->fe_start; ++ int len = ex->fe_len; ++ unsigned ret = 0; ++ int len0 = len; ++ void *buddy; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_info->bb_free -= len; ++ if (e3b->bd_info->bb_first_free == start) ++ e3b->bd_info->bb_first_free += len; ++ ++ /* let's maintain fragments counter */ ++ if (start != 0) ++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); ++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); ++ if (mlen && max) ++ e3b->bd_info->bb_fragments++; ++ else if (!mlen && !max) ++ e3b->bd_info->bb_fragments--; ++ ++ /* let's maintain buddy itself */ ++ while (len) { ++ ord = mb_find_order_for_block(e3b, start); ++ ++ if (((start >> ord) << ord) == start && len >= (1 << ord)) { ++ /* the whole chunk may be allocated at once! 
*/ ++ mlen = 1 << ord; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ J_ASSERT((start >> ord) < max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_info->bb_counters[ord]--; ++ start += mlen; ++ len -= mlen; ++ J_ASSERT(len >= 0); ++ continue; ++ } ++ ++ /* store for history */ ++ if (ret == 0) ++ ret = len | (ord << 16); ++ ++ /* we have to split large buddy */ ++ J_ASSERT(ord > 0); ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_info->bb_counters[ord]--; ++ ++ ord--; ++ cur = (start >> ord) & ~1U; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_clear_bit(cur, buddy); ++ mb_clear_bit(cur + 1, buddy); ++ e3b->bd_info->bb_counters[ord]++; ++ e3b->bd_info->bb_counters[ord]++; ++ } ++ ++ /* now drop all the bits in bitmap */ ++ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); ++ ++ mb_check_buddy(e3b); ++ ++ return ret; ++} ++ ++/* ++ * Must be called under group lock! ++ */ ++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ unsigned long ret; ++ ++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ++ ret = mb_mark_used(e3b, &ac->ac_b_ex); ++ ++ ac->ac_status = AC_STATUS_FOUND; ++ ac->ac_tail = ret & 0xffff; ++ ac->ac_buddy = ret >> 16; ++ ++ /* hold in-core structures until allocated ++ * blocks are marked non-free in on-disk bitmap */ ++ ac->ac_buddy_page = e3b->bd_buddy_page; ++ page_cache_get(e3b->bd_buddy_page); ++ ac->ac_bitmap_page = e3b->bd_bitmap_page; ++ page_cache_get(e3b->bd_bitmap_page); ++} ++ ++/* ++ * The routine checks whether found extent is good enough. If it is, ++ * then the extent gets marked used and flag is set to the context ++ * to stop scanning. Otherwise, the extent is compared with the ++ * previous found extent and if new one is better, then it's stored ++ * in the context. Later, the best found extent will be used, if ++ * mballoc can't find good enough extent. ++ * ++ * FIXME: real allocation policy is to be designed yet! 
++ */ ++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, ++ struct ext3_free_extent *ex, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent *bex = &ac->ac_b_ex; ++ struct ext3_free_extent *gex = &ac->ac_g_ex; ++ ++ J_ASSERT(ex->fe_len > 0); ++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ ++ ac->ac_found++; ++ ++ /* ++ * The special case - take what you catch first ++ */ ++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Let's check whether the chunk is good enough ++ */ ++ if (ex->fe_len == gex->fe_len) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If this is first found extent, just store it in the context ++ */ ++ if (bex->fe_len == 0) { ++ *bex = *ex; ++ return; ++ } ++ ++ /* ++ * If new found extent is better, store it in the context ++ */ ++ if (bex->fe_len < gex->fe_len) { ++ /* if the request isn't satisfied, any found extent ++ * larger than previous best one is better */ ++ if (ex->fe_len > bex->fe_len) ++ *bex = *ex; ++ } else if (ex->fe_len > gex->fe_len) { ++ /* if the request is satisfied, then we try to find ++ * an extent that still satisfy the request, but is ++ * smaller than previous one */ ++ *bex = *ex; ++ } ++ ++ /* ++ * Let's scan at least few extents and don't pick up a first one ++ */ ++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++ ++ /* ++ * We don't want to scan for a whole year ++ */ ++ if (ac->ac_found > ext3_mb_max_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++} ++ ++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent ex = ac->ac_b_ex; ++ int group = ex.fe_group, max, err; ++ ++ J_ASSERT(ex.fe_len > 0); ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if 
(err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); ++ ++ if (max > 0) { ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_super_block *es = sbi->s_es; ++ struct ext3_free_extent ex; ++ ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ++ unsigned long start; ++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + ++ ex.fe_start + le32_to_cpu(es->s_first_data_block)); ++ if (start % sbi->s_stripe == 0) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ } else if (max >= ac->ac_g_ex.fe_len) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++/* ++ * The routine scans buddy structures (not bitmap!) 
from given order ++ * to max order and tries to find big enough chunk to satisfy the req ++ */ ++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_group_info *grp = e3b->bd_info; ++ void *buddy; ++ int i, k, max; ++ ++ J_ASSERT(ac->ac_2order > 0); ++ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { ++ if (grp->bb_counters[i] == 0) ++ continue; ++ ++ buddy = mb_find_buddy(e3b, i, &max); ++ if (buddy == NULL) { ++ printk(KERN_ALERT "looking for wrong order?\n"); ++ break; ++ } ++ ++ k = mb_find_next_zero_bit(buddy, max, 0); ++ J_ASSERT(k < max); ++ ++ ac->ac_found++; ++ ++ ac->ac_b_ex.fe_len = 1 << i; ++ ac->ac_b_ex.fe_start = k << i; ++ ac->ac_b_ex.fe_group = e3b->bd_group; ++ ++ ext3_mb_use_best_found(ac, e3b); ++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); ++ ++ if (unlikely(ext3_mb_stats)) ++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); ++ ++ break; ++ } ++} ++ ++/* ++ * The routine scans the group and measures all found extents. ++ * In order to optimize scanning, caller must pass number of ++ * free blocks in the group, so the routine can know upper limit. 
++ */ ++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ int i, free; ++ ++ free = e3b->bd_info->bb_free; ++ J_ASSERT(free > 0); ++ ++ i = e3b->bd_info->bb_first_free; ++ ++ while (free && ac->ac_status == AC_STATUS_CONTINUE) { ++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ if (i >= sb->s_blocksize * 8) { ++ J_ASSERT(free == 0); ++ break; ++ } ++ ++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(free >= ex.fe_len); ++ ++ ext3_mb_measure_extent(ac, &ex, e3b); ++ ++ i += ex.fe_len; ++ free -= ex.fe_len; ++ } ++} ++ ++/* ++ * This is a special case for storages like raid5 ++ * we try to find stripe-aligned chunks for stripe-size requests ++ */ ++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ unsigned long i, max; ++ ++ J_ASSERT(sbi->s_stripe != 0); ++ ++ /* find first stripe-aligned block */ ++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + le32_to_cpu(sbi->s_es->s_first_data_block); ++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; ++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) ++ % EXT3_BLOCKS_PER_GROUP(sb); ++ ++ while (i < sb->s_blocksize * 8) { ++ if (!mb_test_bit(i, bitmap)) { ++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); ++ if (max >= sbi->s_stripe) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ break; ++ } ++ } ++ i += sbi->s_stripe; ++ } ++} ++ ++static int ext3_mb_good_group(struct ext3_allocation_context *ac, ++ int group, int cr) ++{ ++ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); ++ unsigned free, fragments, i, bits; ++ ++ 
J_ASSERT(cr >= 0 && cr < 4); ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); ++ ++ free = grp->bb_free; ++ fragments = grp->bb_fragments; ++ if (free == 0) ++ return 0; ++ if (fragments == 0) ++ return 0; ++ ++ switch (cr) { ++ case 0: ++ J_ASSERT(ac->ac_2order != 0); ++ bits = ac->ac_sb->s_blocksize_bits + 1; ++ for (i = ac->ac_2order; i <= bits; i++) ++ if (grp->bb_counters[i] > 0) ++ return 1; ++ break; ++ case 1: ++ if ((free / fragments) >= ac->ac_g_ex.fe_len) ++ return 1; ++ break; ++ case 2: ++ if (free >= ac->ac_g_ex.fe_len) ++ return 1; ++ break; ++ case 3: ++ return 1; ++ default: ++ BUG(); ++ } ++ ++ return 0; ++} ++ ++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *len, int flags, int *errp) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_allocation_context ac; ++ int i, group, block, cr, err = 0; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ struct buffer_head *gdp_bh; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ ++ J_ASSERT(len != NULL); ++ J_ASSERT(*len > 0); ++ ++ sb = inode->i_sb; ++ if (!sb) { ++ printk("ext3_mb_new_nblocks: nonexistent device"); ++ return 0; ++ } ++ ++ if (!test_opt(sb, MBALLOC)) { ++ static int ext3_mballoc_warning = 0; ++ if (ext3_mballoc_warning == 0) { ++ printk(KERN_ERR "EXT3-fs: multiblock request with " ++ "mballoc disabled!\n"); ++ ext3_mballoc_warning++; ++ } ++ *len = 1; ++ err = ext3_new_block_old(handle, inode, goal, errp); ++ return err; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ ++ /* ++ * We can't allocate > group size ++ */ ++ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) ++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* someone asks for non-reserved blocks */ ++ BUG_ON(*len > 1); ++ err = ext3_mb_reserve_blocks(sb, 1); ++ if (err) { ++ *errp = err; ++ return 0; ++ } ++ } ++ ++ ac.ac_buddy_page = NULL; 
++ ac.ac_bitmap_page = NULL; ++ ++ /* ++ * Check quota for allocation of this blocks. ++ */ ++ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) ++ *len -= 1; ++ if (*len == 0) { ++ *errp = -EDQUOT; ++ block = 0; ++ goto out; ++ } ++ ++ /* start searching from the goal */ ++ if (goal < le32_to_cpu(es->s_first_data_block) || ++ goal >= le32_to_cpu(es->s_blocks_count)) ++ goal = le32_to_cpu(es->s_first_data_block); ++ group = (goal - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ block = ((goal - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ /* set up allocation goals */ ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_groups_scanned = 0; ++ ac.ac_ex_scanned = 0; ++ ac.ac_found = 0; ++ ac.ac_sb = inode->i_sb; ++ ac.ac_g_ex.fe_group = group; ++ ac.ac_g_ex.fe_start = block; ++ ac.ac_g_ex.fe_len = *len; ++ ac.ac_flags = flags; ++ ac.ac_2order = 0; ++ ac.ac_criteria = 0; ++ ++ if (*len == 1 && sbi->s_stripe) { ++ /* looks like a metadata, let's use a dirty hack for raid5 ++ * move all metadata in first groups in hope to hit cached ++ * sectors and thus avoid read-modify cycles in raid5 */ ++ ac.ac_g_ex.fe_group = group = 0; ++ } ++ ++ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ ++ i = ffs(*len); ++ if (i >= ext3_mb_order2_reqs) { ++ i--; ++ if ((*len & (~(1 << i))) == 0) ++ ac.ac_2order = i; ++ } ++ ++ /* first, try the goal */ ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; ++ ++ /* Let's just scan groups to find more-less suitable blocks */ ++ cr = ac.ac_2order ? 
0 : 1; ++repeat: ++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ ac.ac_criteria = cr; ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { ++ if (group == EXT3_SB(sb)->s_groups_count) ++ group = 0; ++ ++ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { ++ /* we need full data about the group ++ * to make a good selection */ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ext3_mb_release_desc(&e3b); ++ } ++ ++ /* check is group good for our criteries */ ++ if (!ext3_mb_good_group(&ac, group, cr)) ++ continue; ++ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ++ ext3_lock_group(sb, group); ++ if (!ext3_mb_good_group(&ac, group, cr)) { ++ /* someone did allocation from this group */ ++ ext3_unlock_group(sb, group); ++ ext3_mb_release_desc(&e3b); ++ continue; ++ } ++ ++ ac.ac_groups_scanned++; ++ if (cr == 0) ++ ext3_mb_simple_scan_group(&ac, &e3b); ++ else if (cr == 1 && *len == sbi->s_stripe) ++ ext3_mb_scan_aligned(&ac, &e3b); ++ else ++ ext3_mb_complex_scan_group(&ac, &e3b); ++ ++ ext3_unlock_group(sb, group); ++ ++ ext3_mb_release_desc(&e3b); ++ ++ if (ac.ac_status != AC_STATUS_CONTINUE) ++ break; ++ } ++ } ++ ++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ /* ++ * We've been searching too long. Let's try to allocate ++ * the best chunk we've found so far ++ */ ++ ++ /*if (ac.ac_found > ext3_mb_max_to_scan) ++ printk(KERN_DEBUG "EXT3-fs: too long searching at " ++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, ++ ac.ac_g_ex.fe_len);*/ ++ ext3_mb_try_best_found(&ac, &e3b); ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * Someone more lucky has already allocated it. 
++ * The only thing we can do is just take first ++ * found block(s) ++ printk(KERN_DEBUG "EXT3-fs: someone won our chunk\n"); ++ */ ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_flags |= EXT3_MB_HINT_FIRST; ++ cr = 3; ++ goto repeat; ++ } ++ } ++ ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * We aren't lucky definitely ++ */ ++ DQUOT_FREE_BLOCK(inode, *len); ++ *errp = -ENOSPC; ++ block = 0; ++#if 1 ++ printk(KERN_ERR "EXT3-fs: can't allocate: status %d flags %d\n", ++ ac.ac_status, ac.ac_flags); ++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d cr %d\n", ++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, ++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); ++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", ++ sbi->s_blocks_reserved, ac.ac_found); ++ printk("EXT3-fs: groups: "); ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) ++ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); ++ printk("\n"); ++#endif ++ goto out; ++ } ++ ++found: ++ J_ASSERT(ac.ac_b_ex.fe_len > 0); ++ ++ /* good news - free block(s) have been found. 
now it's time ++ * to mark block(s) in good old journaled bitmap */ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ /* we made a desicion, now mark found blocks in good old ++ * bitmap to be journaled */ ++ ++ ext3_debug("using block group %d(%d)\n", ++ ac.ac_b_group.group, gdp->bg_free_blocks_count); ++ ++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); ++ if (!bitmap_bh) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) { ++ *errp = err; ++ goto out_err; ++ } ++ ++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); ++ if (!gdp) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ if (block == le32_to_cpu(gdp->bg_block_bitmap) || ++ block == le32_to_cpu(gdp->bg_inode_bitmap) || ++ in_range(block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error(sb, "ext3_new_block", ++ "Allocating block in system zone - " ++ "block = %u", block); ++#ifdef AGGRESSIVE_CHECK ++ for (i = 0; i < ac.ac_b_ex.fe_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); ++#endif ++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); ++ ++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) ++ - ac.ac_b_ex.fe_len); ++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); ++ ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ if (err) ++ goto out_err; ++ err = ext3_journal_dirty_metadata(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ sb->s_dirt = 1; ++ *errp = 0; ++ 
brelse(bitmap_bh); ++ ++ /* drop non-allocated, but dquote'd blocks */ ++ J_ASSERT(*len >= ac.ac_b_ex.fe_len); ++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); ++ ++ *len = ac.ac_b_ex.fe_len; ++ J_ASSERT(*len > 0); ++ J_ASSERT(block != 0); ++ goto out; ++ ++out_err: ++ /* if we've already allocated something, roll it back */ ++ if (ac.ac_status == AC_STATUS_FOUND) { ++ /* FIXME: free blocks here */ ++ } ++ ++ DQUOT_FREE_BLOCK(inode, *len); ++ brelse(bitmap_bh); ++ *errp = err; ++ block = 0; ++out: ++ if (ac.ac_buddy_page) ++ page_cache_release(ac.ac_buddy_page); ++ if (ac.ac_bitmap_page) ++ page_cache_release(ac.ac_bitmap_page); ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* block wasn't reserved before and we reserved it ++ * at the beginning of allocation. it doesn't matter ++ * whether we allocated anything or we failed: time ++ * to release reservation. NOTE: because I expect ++ * any multiblock request from delayed allocation ++ * path only, here is single block always */ ++ ext3_mb_release_blocks(sb, 1); ++ } ++ ++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { ++ atomic_inc(&sbi->s_bal_reqs); ++ atomic_add(*len, &sbi->s_bal_allocated); ++ if (*len >= ac.ac_g_ex.fe_len) ++ atomic_inc(&sbi->s_bal_success); ++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); ++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && ++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) ++ atomic_inc(&sbi->s_bal_goals); ++ if (ac.ac_found > ext3_mb_max_to_scan) ++ atomic_inc(&sbi->s_bal_breaks); ++ } ++ ++ ext3_mb_store_history(sb, inode->i_ino, &ac); ++ ++ return block; ++} ++EXPORT_SYMBOL(ext3_mb_new_blocks); ++ ++#ifdef EXT3_MB_HISTORY ++struct ext3_mb_proc_session { ++ struct ext3_mb_history *history; ++ struct super_block *sb; ++ int start; ++ int max; ++}; ++ ++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, ++ struct ext3_mb_history *hs, ++ int first) ++{ ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (!first && hs == s->history 
+ s->start) ++ return NULL; ++ while (hs->goal.fe_len == 0) { ++ hs++; ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (hs == s->history + s->start) ++ return NULL; ++ } ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs; ++ int l = *pos; ++ ++ if (l == 0) ++ return SEQ_START_TOKEN; ++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ if (!hs) ++ return NULL; ++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs = v; ++ ++ ++*pos; ++ if (v == SEQ_START_TOKEN) ++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ else ++ return ext3_mb_history_skip_empty(s, ++hs, 0); ++} ++ ++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) ++{ ++ struct ext3_mb_history *hs = v; ++ char buf[20], buf2[20]; ++ ++ if (v == SEQ_START_TOKEN) { ++ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "pid", "inode", "goal", "result", "found", "grps", "cr", ++ "merge", "tail", "broken"); ++ return 0; ++ } ++ ++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, ++ hs->goal.fe_start, hs->goal.fe_len); ++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, ++ hs->result.fe_start, hs->result.fe_len); ++ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", ++ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, ++ hs->cr, hs->merged ? "M" : "", hs->tail, ++ hs->buddy ? 
1 << hs->buddy : 0); ++ return 0; ++} ++ ++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_history_ops = { ++ .start = ext3_mb_seq_history_start, ++ .next = ext3_mb_seq_history_next, ++ .stop = ext3_mb_seq_history_stop, ++ .show = ext3_mb_seq_history_show, ++}; ++ ++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_proc_session *s; ++ int rc, size; ++ ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (s == NULL) ++ return -EIO; ++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; ++ s->history = kmalloc(size, GFP_KERNEL); ++ if (s == NULL) { ++ kfree(s); ++ return -EIO; ++ } ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(s->history, sbi->s_mb_history, size); ++ s->max = sbi->s_mb_history_max; ++ s->start = sbi->s_mb_history_cur % s->max; ++ spin_unlock(&sbi->s_mb_history_lock); ++ ++ rc = seq_open(file, &ext3_mb_seq_history_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = s; ++ } else { ++ kfree(s->history); ++ kfree(s); ++ } ++ return rc; ++ ++} ++ ++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) ++{ ++ struct seq_file *seq = (struct seq_file *)file->private_data; ++ struct ext3_mb_proc_session *s = seq->private; ++ kfree(s->history); ++ kfree(s); ++ return seq_release(inode, file); ++} ++ ++static struct file_operations ext3_mb_seq_history_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_history_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = ext3_mb_seq_history_release, ++}; ++ ++static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ long group; ++ ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ ++ group = 
*pos + 1; ++ return (void *) group; ++} ++ ++static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ long group; ++ ++ ++*pos; ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ group = *pos + 1; ++ return (void *) group;; ++} ++ ++static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) ++{ ++ struct super_block *sb = seq->private; ++ long group = (long) v, i; ++ struct sg { ++ struct ext3_group_info info; ++ unsigned short counters[16]; ++ } sg; ++ ++ group--; ++ if (group == 0) ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", ++ "group", "free", "frags", "first", "2^0", "2^1", "2^2", ++ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", ++ "2^11", "2^12", "2^13"); ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + ++ sizeof(struct ext3_group_info); ++ ext3_lock_group(sb, group); ++ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); ++ ext3_unlock_group(sb, group); ++ ++ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) ++ return 0; ++ ++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, ++ sg.info.bb_fragments, sg.info.bb_first_free); ++ for (i = 0; i <= 13; i++) ++ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
++ sg.info.bb_counters[i] : 0); ++ seq_printf(seq, " ]\n"); ++ ++ return 0; ++} ++ ++static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_groups_ops = { ++ .start = ext3_mb_seq_groups_start, ++ .next = ext3_mb_seq_groups_next, ++ .stop = ext3_mb_seq_groups_stop, ++ .show = ext3_mb_seq_groups_show, ++}; ++ ++static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ int rc; ++ ++ rc = seq_open(file, &ext3_mb_seq_groups_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = sb; ++ } ++ return rc; ++ ++} ++ ++static struct file_operations ext3_mb_seq_groups_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static void ext3_mb_history_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_groups", sbi->s_mb_proc); ++ remove_proc_entry("mb_history", sbi->s_mb_proc); ++ remove_proc_entry(name, proc_root_ext3); ++ ++ if (sbi->s_mb_history) ++ kfree(sbi->s_mb_history); ++} ++ ++static void ext3_mb_history_init(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ char name[64]; ++ int i; ++ ++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); ++ if (sbi->s_mb_proc != NULL) { ++ struct proc_dir_entry *p; ++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_history_fops; ++ p->data = sb; ++ } ++ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_groups_fops; ++ p->data = sb; ++ } ++ } ++ ++ sbi->s_mb_history_max = 1000; ++ sbi->s_mb_history_cur = 0; ++ 
spin_lock_init(&sbi->s_mb_history_lock); ++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); ++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); ++ memset(sbi->s_mb_history, 0, i); ++ /* if we can't allocate history, then we simple won't use it */ ++} ++ ++static void ++ext3_mb_store_history(struct super_block *sb, unsigned ino, ++ struct ext3_allocation_context *ac) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_history h; ++ ++ if (likely(sbi->s_mb_history == NULL)) ++ return; ++ ++ h.pid = current->pid; ++ h.ino = ino; ++ h.goal = ac->ac_g_ex; ++ h.result = ac->ac_b_ex; ++ h.found = ac->ac_found; ++ h.cr = ac->ac_criteria; ++ h.groups = ac->ac_groups_scanned; ++ h.tail = ac->ac_tail; ++ h.buddy = ac->ac_buddy; ++ h.merged = 0; ++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) ++ h.merged = 1; ++ ++ spin_lock(&sbi->s_mb_history_lock); ++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); ++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) ++ sbi->s_mb_history_cur = 0; ++ spin_unlock(&sbi->s_mb_history_lock); ++} ++ ++#else ++#define ext3_mb_history_release(sb) ++#define ext3_mb_history_init(sb) ++#endif ++ ++int ext3_mb_init_backend(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i, j, len, metalen; ++ int num_meta_group_infos = ++ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ struct ext3_group_info **meta_group_info; ++ ++ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte ++ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. ++ * So a two level scheme suffices for now. 
*/ ++ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * ++ num_meta_group_infos, GFP_KERNEL); ++ if (sbi->s_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); ++ return -ENOMEM; ++ } ++ sbi->s_buddy_cache = new_inode(sb); ++ if (sbi->s_buddy_cache == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); ++ goto err_freesgi; ++ } ++ ++ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) { ++ if ((i + 1) == num_meta_group_infos) ++ metalen = sizeof(*meta_group_info) * ++ (sbi->s_groups_count - ++ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); ++ meta_group_info = kmalloc(metalen, GFP_KERNEL); ++ if (meta_group_info == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " ++ "buddy group\n"); ++ goto err_freemeta; ++ } ++ sbi->s_group_info[i] = meta_group_info; ++ } ++ ++ /* ++ * calculate needed size. if change bb_counters size, ++ * don't forget about ext3_mb_generate_buddy() ++ */ ++ len = sizeof(struct ext3_group_info); ++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ struct ext3_group_desc * desc; ++ ++ meta_group_info = ++ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; ++ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); ++ ++ meta_group_info[j] = kmalloc(len, GFP_KERNEL); ++ if (meta_group_info[j] == NULL) { ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); ++ i--; ++ goto err_freebuddy; ++ } ++ desc = ext3_get_group_desc(sb, i, NULL); ++ if (desc == NULL) { ++ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); ++ goto err_freebuddy; ++ } ++ memset(meta_group_info[j], 0, len); ++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, ++ &meta_group_info[j]->bb_state); ++ meta_group_info[j]->bb_free = ++ le16_to_cpu(desc->bg_free_blocks_count); ++ } ++ ++ return 0; ++ ++err_freebuddy: ++ while (i >= 0) { ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ i--; ++ } ++ i = num_meta_group_infos; 
++err_freemeta: ++ while (--i >= 0) ++ kfree(sbi->s_group_info[i]); ++ iput(sbi->s_buddy_cache); ++err_freesgi: ++ kfree(sbi->s_group_info); ++ return -ENOMEM; ++} ++ ++int ext3_mb_init(struct super_block *sb, int needs_recovery) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *root = sb->s_root->d_inode; ++ unsigned i, offset, max; ++ struct dentry *dentry; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); ++ ++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_offsets == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ return -ENOMEM; ++ } ++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); ++ if (sbi->s_mb_maxs == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ ++ /* order 0 is regular bitmap */ ++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; ++ sbi->s_mb_offsets[0] = 0; ++ ++ i = 1; ++ offset = 0; ++ max = sb->s_blocksize << 2; ++ do { ++ sbi->s_mb_offsets[i] = offset; ++ sbi->s_mb_maxs[i] = max; ++ offset += 1 << (sb->s_blocksize_bits - i); ++ max = max >> 1; ++ i++; ++ } while (i <= sb->s_blocksize_bits + 1); ++ ++ /* init file for buddy data */ ++ if ((i = ext3_mb_init_backend(sb))) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return i; ++ } ++ ++ spin_lock_init(&sbi->s_reserve_lock); ++ spin_lock_init(&sbi->s_md_lock); ++ INIT_LIST_HEAD(&sbi->s_active_transaction); ++ INIT_LIST_HEAD(&sbi->s_closed_transaction); ++ INIT_LIST_HEAD(&sbi->s_committed_transaction); ++ spin_lock_init(&sbi->s_bal_lock); ++ ++ /* remove old on-disk buddy file */ ++ down(&root->i_sem); ++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy")); ++ if (dentry->d_inode != NULL) { ++ i = vfs_unlink(root, dentry); ++ if (i != 0) ++ printk("EXT3-fs: can't remove .buddy file: %d\n", i); ++ } ++ dput(dentry); ++ up(&root->i_sem); ++ ++ ext3_mb_history_init(sb); ++ ++ printk("EXT3-fs: mballoc 
enabled\n"); ++ return 0; ++} ++ ++int ext3_mb_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i, num_meta_group_infos; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* release freed, non-committed blocks */ ++ spin_lock(&sbi->s_md_lock); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_committed_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ ext3_mb_free_committed_blocks(sb); ++ ++ if (sbi->s_group_info) { ++ for (i = 0; i < sbi->s_groups_count; i++) ++ kfree(EXT3_GROUP_INFO(sb, i)); ++ num_meta_group_infos = (sbi->s_groups_count + ++ EXT3_DESC_PER_BLOCK(sb) - 1) >> ++ EXT3_DESC_PER_BLOCK_BITS(sb); ++ for (i = 0; i < num_meta_group_infos; i++) ++ kfree(sbi->s_group_info[i]); ++ kfree(sbi->s_group_info); ++ } ++ if (sbi->s_mb_offsets) ++ kfree(sbi->s_mb_offsets); ++ if (sbi->s_mb_maxs) ++ kfree(sbi->s_mb_maxs); ++ if (sbi->s_buddy_cache) ++ iput(sbi->s_buddy_cache); ++ if (sbi->s_blocks_reserved) ++ printk("ext3-fs: %ld blocks being reserved at umount!\n", ++ sbi->s_blocks_reserved); ++ if (ext3_mb_stats) { ++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", ++ atomic_read(&sbi->s_bal_allocated), ++ atomic_read(&sbi->s_bal_reqs), ++ atomic_read(&sbi->s_bal_success)); ++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " ++ "%u 2^N hits, %u breaks\n", ++ atomic_read(&sbi->s_bal_ex_scanned), ++ atomic_read(&sbi->s_bal_goals), ++ atomic_read(&sbi->s_bal_2orders), ++ atomic_read(&sbi->s_bal_breaks)); ++ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", ++ sbi->s_mb_buddies_generated++, ++ sbi->s_mb_generation_time); ++ } ++ ++ ext3_mb_history_release(sb); ++ ++ return 0; ++} ++ ++void ext3_mb_free_committed_blocks(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int err, i, count = 0, count2 = 0; ++ struct ext3_free_metadata *md; ++ struct ext3_buddy e3b; ++ ++ if 
(list_empty(&sbi->s_committed_transaction)) ++ return; ++ ++ /* there is committed blocks to be freed yet */ ++ do { ++ /* get next array of blocks */ ++ md = NULL; ++ spin_lock(&sbi->s_md_lock); ++ if (!list_empty(&sbi->s_committed_transaction)) { ++ md = list_entry(sbi->s_committed_transaction.next, ++ struct ext3_free_metadata, list); ++ list_del(&md->list); ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ if (md == NULL) ++ break; ++ ++ mb_debug("gonna free %u blocks in group %u (0x%p):", ++ md->num, md->group, md); ++ ++ err = ext3_mb_load_buddy(sb, md->group, &e3b); ++ /* we expect to find existing buddy because it's pinned */ ++ BUG_ON(err != 0); ++ ++ /* there are blocks to put in buddy to make them really free */ ++ count += md->num; ++ count2++; ++ ext3_lock_group(sb, md->group); ++ for (i = 0; i < md->num; i++) { ++ mb_debug(" %u", md->blocks[i]); ++ mb_free_blocks(&e3b, md->blocks[i], 1); ++ } ++ mb_debug("\n"); ++ ext3_unlock_group(sb, md->group); ++ ++ /* balance refcounts from ext3_mb_free_metadata() */ ++ page_cache_release(e3b.bd_buddy_page); ++ page_cache_release(e3b.bd_bitmap_page); ++ ++ kfree(md); ++ ext3_mb_release_desc(&e3b); ++ ++ } while (md); ++ mb_debug("freed %u blocks in %u structures\n", count, count2); ++} ++ ++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ if (sbi->s_last_transaction == handle->h_transaction->t_tid) ++ return; ++ ++ /* new transaction! time to close last one and free blocks for ++ * committed transaction. we know that only transaction can be ++ * active, so previos transaction can be being logged and we ++ * know that transaction before previous is known to be already ++ * logged. this means that now we may free blocks freed in all ++ * transactions before previous one. hope I'm clear enough ... 
*/ ++ ++ spin_lock(&sbi->s_md_lock); ++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { ++ mb_debug("new transaction %lu, old %lu\n", ++ (unsigned long) handle->h_transaction->t_tid, ++ (unsigned long) sbi->s_last_transaction); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_closed_transaction); ++ sbi->s_last_transaction = handle->h_transaction->t_tid; ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ ext3_mb_free_committed_blocks(sb); ++} ++ ++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, ++ int group, int block, int count) ++{ ++ struct ext3_group_info *db = e3b->bd_info; ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_free_metadata *md; ++ int i; ++ ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ ext3_lock_group(sb, group); ++ for (i = 0; i < count; i++) { ++ md = db->bb_md_cur; ++ if (md && db->bb_tid != handle->h_transaction->t_tid) { ++ db->bb_md_cur = NULL; ++ md = NULL; ++ } ++ ++ if (md == NULL) { ++ ext3_unlock_group(sb, group); ++ md = kmalloc(sizeof(*md), GFP_KERNEL); ++ if (md == NULL) ++ return -ENOMEM; ++ md->num = 0; ++ md->group = group; ++ ++ ext3_lock_group(sb, group); ++ if (db->bb_md_cur == NULL) { ++ spin_lock(&sbi->s_md_lock); ++ list_add(&md->list, &sbi->s_active_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ /* protect buddy cache from being freed, ++ * otherwise we'll refresh it from ++ * on-disk bitmap and lose not-yet-available ++ * blocks */ ++ page_cache_get(e3b->bd_buddy_page); ++ page_cache_get(e3b->bd_bitmap_page); ++ db->bb_md_cur = md; ++ db->bb_tid = handle->h_transaction->t_tid; ++ mb_debug("new md 0x%p for group %u\n", ++ md, md->group); ++ } else { ++ kfree(md); ++ md = db->bb_md_cur; ++ } ++ } ++ ++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); ++ md->blocks[md->num] = block + i; ++ md->num++; ++ if (md->num == 
EXT3_BB_MAX_BLOCKS) { ++ /* no more space, put full container on a sb's list */ ++ db->bb_md_cur = NULL; ++ } ++ } ++ ext3_unlock_group(sb, group); ++ return 0; ++} ++ ++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, ++ unsigned long block, unsigned long count, ++ int metadata, int *freed) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ unsigned long bit, overflow; ++ struct buffer_head *gd_bh; ++ unsigned long block_group; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ int err = 0, ret; ++ ++ *freed = 0; ++ sb = inode->i_sb; ++ if (!sb) { ++ printk ("ext3_free_blocks: nonexistent device"); ++ return; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ if (block < le32_to_cpu(es->s_first_data_block) || ++ block + count < block || ++ block + count > le32_to_cpu(es->s_blocks_count)) { ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks not in datazone - " ++ "block = %lu, count = %lu", block, count); ++ goto error_return; ++ } ++ ++ ext3_debug("freeing block %lu\n", block); ++ ++do_more: ++ overflow = 0; ++ block_group = (block - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ bit = (block - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb); ++ /* ++ * Check to see if we are freeing blocks across a group ++ * boundary. 
++ */ ++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { ++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); ++ count -= overflow; ++ } ++ brelse(bitmap_bh); ++ bitmap_bh = read_block_bitmap(sb, block_group); ++ if (!bitmap_bh) ++ goto error_return; ++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); ++ if (!gdp) ++ goto error_return; ++ ++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || ++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || ++ in_range (block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group) || ++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks in system zones - " ++ "Block = %lu, count = %lu", ++ block, count); ++ ++ BUFFER_TRACE(bitmap_bh, "getting write access"); ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) ++ goto error_return; ++ ++ /* ++ * We are about to modify some metadata. Call the journal APIs ++ * to unshare ->b_data if a currently-committing transaction is ++ * using it ++ */ ++ BUFFER_TRACE(gd_bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, gd_bh); ++ if (err) ++ goto error_return; ++ ++ err = ext3_mb_load_buddy(sb, block_group, &e3b); ++ if (err) ++ goto error_return; ++ ++#ifdef AGGRESSIVE_CHECK ++ { ++ int i; ++ for (i = 0; i < count; i++) ++ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); ++ } ++#endif ++ mb_clear_bits(bitmap_bh->b_data, bit, count); ++ ++ /* We dirtied the bitmap block */ ++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ ++ if (metadata) { ++ /* blocks being freed are metadata. 
these blocks shouldn't ++ * be used until this transaction is committed */ ++ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); ++ } else { ++ ext3_lock_group(sb, block_group); ++ mb_free_blocks(&e3b, bit, count); ++ ext3_unlock_group(sb, block_group); ++ } ++ ++ spin_lock(sb_bgl_lock(sbi, block_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); ++ spin_unlock(sb_bgl_lock(sbi, block_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, count); ++ ++ ext3_mb_release_desc(&e3b); ++ ++ *freed = count; ++ ++ /* And the group descriptor block */ ++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ++ ret = ext3_journal_dirty_metadata(handle, gd_bh); ++ if (!err) err = ret; ++ ++ if (overflow && !err) { ++ block += count; ++ count = overflow; ++ goto do_more; ++ } ++ sb->s_dirt = 1; ++error_return: ++ brelse(bitmap_bh); ++ ext3_std_error(sb, err); ++ return; ++} ++ ++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int free, ret = -ENOSPC; ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); ++ if (blocks <= free - sbi->s_blocks_reserved) { ++ sbi->s_blocks_reserved += blocks; ++ ret = 0; ++ } ++ spin_unlock(&sbi->s_reserve_lock); ++ return ret; ++} ++ ++void ext3_mb_release_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ sbi->s_blocks_reserved -= blocks; ++ WARN_ON(sbi->s_blocks_reserved < 0); ++ if (sbi->s_blocks_reserved < 0) ++ sbi->s_blocks_reserved = 0; ++ spin_unlock(&sbi->s_reserve_lock); ++} ++ ++int ext3_new_block(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *errp) ++{ ++ int ret, len; ++ ++ if (!test_opt(inode->i_sb, MBALLOC)) { ++ ret = ext3_new_block_old(handle, inode, goal, errp); ++ goto out; ++ } ++ len = 1; ++ ret = 
ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); ++out: ++ return ret; ++} ++ ++ ++void ext3_free_blocks(handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count, int metadata) ++{ ++ struct super_block *sb; ++ int freed; ++ ++ sb = inode->i_sb; ++ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) ++ ext3_free_blocks_sb(handle, sb, block, count, &freed); ++ else ++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); ++ if (freed) ++ DQUOT_FREE_BLOCK(inode, freed); ++ return; ++} ++ ++#define EXT3_ROOT "ext3" ++#define EXT3_MB_STATS_NAME "mb_stats" ++#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" ++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" ++ ++static int ext3_mb_stats_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_stats); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_stats_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_STATS_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); ++ return count; ++} ++ ++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ 
printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_max_to_scan = value; ++ ++ return count; ++} ++ ++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_min_to_scan = value; ++ ++ return count; ++} ++ ++static int ext3_mb_order2_req_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_order2_req_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ 
++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_order2_reqs = value; ++ ++ return count; ++} ++ ++int __init init_ext3_proc(void) ++{ ++ struct proc_dir_entry *proc_ext3_mb_stats; ++ struct proc_dir_entry *proc_ext3_mb_max_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_min_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_order2_req; ++ ++ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); ++ if (proc_root_ext3 == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); ++ return -EIO; ++ } ++ ++ /* Initialize EXT3_MB_STATS_NAME */ ++ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_stats == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_STATS_NAME); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_stats->data = NULL; ++ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; ++ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; ++ ++ /* Initialize EXT3_MAX_TO_SCAN_NAME */ ++ proc_ext3_mb_max_to_scan = create_proc_entry( ++ EXT3_MB_MAX_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_max_to_scan == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_MAX_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_max_to_scan->data = NULL; ++ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; ++ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; ++ ++ /* Initialize EXT3_MIN_TO_SCAN_NAME */ ++ proc_ext3_mb_min_to_scan = create_proc_entry( ++ EXT3_MB_MIN_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_min_to_scan == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_MIN_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, 
proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_min_to_scan->data = NULL; ++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; ++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; ++ ++ /* Initialize EXT3_ORDER2_REQ */ ++ proc_ext3_mb_order2_req = create_proc_entry( ++ EXT3_MB_ORDER2_REQ, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_order2_req == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_ORDER2_REQ); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_order2_req->data = NULL; ++ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; ++ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; ++ ++ return 0; ++} ++ ++void exit_ext3_proc(void) ++{ ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++} +Index: linux-2.6.9-full/fs/ext3/Makefile +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/Makefile 2006-06-01 14:58:46.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/Makefile 2006-10-24 12:54:31.000000000 +0400 +@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o ++ extents.o mballoc.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff --git 
a/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch b/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch index b20be23..0d360fa 100644 --- a/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch +++ b/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch @@ -26,7 +26,7 @@ Index: linux-2.6.7/fs/ext3/namei.c int err; - if (dir->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAXED(dir)) ++ if (EXT3_DIR_LINK_MAX(dir)) return -EMLINK; handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + @@ -86,7 +86,7 @@ Index: linux-2.6.7/fs/ext3/namei.c int err; - if (inode->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAXED(inode)) ++ if (EXT3_DIR_LINK_MAX(inode)) return -EMLINK; handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + @@ -97,7 +97,7 @@ Index: linux-2.6.7/fs/ext3/namei.c - if (!new_inode && new_dir!=old_dir && - new_dir->i_nlink >= EXT3_LINK_MAX) + if (!new_inode && new_dir != old_dir && -+ EXT3_DIR_LINK_MAXED(new_dir)) ++ EXT3_DIR_LINK_MAX(new_dir)) goto end_rename; } if (!new_bh) { @@ -110,7 +110,7 @@ Index: linux-2.6.7/fs/ext3/namei.c new_inode->i_ctime = CURRENT_TIME; } old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; -@@ -2299,11 +2304,11 @@ static int ext3_rename (struct inode * o +@@ -2299,11 +2304,13 @@ static int ext3_rename (struct inode * o PARENT_INO(dir_bh->b_data) = le32_to_cpu(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); ext3_journal_dirty_metadata(handle, dir_bh); @@ -118,7 +118,9 @@ Index: linux-2.6.7/fs/ext3/namei.c + ext3_dec_count(handle, old_dir); if (new_inode) { - new_inode->i_nlink--; -+ ext3_dec_count(handle, new_inode); ++ /* checked empty_dir above, can't have another parent, ++ * ext3_dec_count() won't work for many-linked dirs */ ++ new_inode->i_nlink = 0; } else { - new_dir->i_nlink++; + ext3_inc_count(handle, new_dir); @@ -129,15 +131,6 @@ Index: linux-2.6.7/include/linux/ext3_fs.h =================================================================== --- 
linux-2.6.7.orig/include/linux/ext3_fs.h 2004-06-15 23:19:36.000000000 -0600 +++ linux-2.6.7/include/linux/ext3_fs.h 2004-08-20 17:41:27.000000000 -0600 -@@ -41,7 +41,7 @@ struct statfs; - /* - * Always enable hashed directories - */ --#define CONFIG_EXT3_INDEX -+#define CONFIG_EXT3_INDEX 1 - - /* - * Debug code @@ -79,7 +81,7 @@ /* * Maximal count of links to a file @@ -147,24 +140,3 @@ Index: linux-2.6.7/include/linux/ext3_fs.h /* * Macro-instructions used to manage several block sizes -@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 { - */ - - #ifdef CONFIG_EXT3_INDEX -- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -- EXT3_FEATURE_COMPAT_DIR_INDEX) && \ -+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ -+ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ - (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) --#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) --#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) -+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \ -+ (is_dx(dir) && (dir)->i_nlink == 1)) - #else - #define is_dx(dir) 0 --#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) -+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) - #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) - #endif - diff --git a/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch b/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch new file mode 100644 index 0000000..37cca81 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch @@ -0,0 +1,142 @@ +diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c +--- orig/fs/ext3/namei.c 2005-10-12 13:58:19.000000000 -0700 ++++ patch/fs/ext3/namei.c 2005-10-12 14:00:33.000000000 -0700 +@@ -1603,11 +1603,17 @@ + static inline void ext3_inc_count(handle_t *handle, struct inode *inode) + { + inode->i_nlink++; ++ if (is_dx(inode) && inode->i_nlink 
> 1) { ++ /* limit is 16-bit i_links_count */ ++ if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2) ++ inode->i_nlink = 1; ++ } + } + + static inline void ext3_dec_count(handle_t *handle, struct inode *inode) + { +- inode->i_nlink--; ++ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) ++ inode->i_nlink--; + } + + static int ext3_add_nondir(handle_t *handle, +@@ -1706,7 +1712,7 @@ static int ext3_add_nondir(handle_t + struct ext3_dir_entry_2 * de; + int err, retries = 0; + +- if (dir->i_nlink >= EXT3_LINK_MAX) ++ if (EXT3_DIR_LINK_MAX(dir)) + return -EMLINK; + + retry: +@@ -1729,7 +1735,7 @@ static int ext3_mkdir(struct inode + inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { +- inode->i_nlink--; /* is this nlink == 0? */ ++ ext3_dec_count(handle, inode); /* is this nlink == 0? */ + ext3_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; +@@ -1761,7 +1767,7 @@ static int ext3_mkdir(struct inode + iput (inode); + goto out_stop; + } +- dir->i_nlink++; ++ ext3_inc_count(handle, dir); + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + d_instantiate(dentry, inode); +@@ -2026,10 +2032,10 @@ static int ext3_rmdir (struct inode + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_rmdir; +- if (inode->i_nlink != 2) +- ext3_warning (inode->i_sb, "ext3_rmdir", +- "empty directory has nlink!=2 (%d)", +- inode->i_nlink); ++ if (!EXT3_DIR_LINK_EMPTY(inode)) ++ ext3_warning(inode->i_sb, "ext3_rmdir", ++ "empty directory has too many links (%d)", ++ inode->i_nlink); + inode->i_version++; + inode->i_nlink = 0; + /* There's no need to set i_disksize: the fact that i_nlink is +@@ -2039,7 +2045,7 @@ static int ext3_rmdir (struct inode + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); +- dir->i_nlink--; ++ ext3_dec_count(handle, dir); + 
ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + +@@ -2090,7 +2096,7 @@ static int ext3_unlink(struct inode + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); +- inode->i_nlink--; ++ ext3_dec_count(handle, inode); + if (!inode->i_nlink) + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime; +@@ -2165,7 +2171,7 @@ static int ext3_link (struct dentry + struct inode *inode = old_dentry->d_inode; + int err, retries = 0; + +- if (inode->i_nlink >= EXT3_LINK_MAX) ++ if (EXT3_DIR_LINK_MAX(inode)) + return -EMLINK; + + retry: +@@ -2252,8 +2258,8 @@ static int ext3_rename (struct inode + if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) + goto end_rename; + retval = -EMLINK; +- if (!new_inode && new_dir!=old_dir && +- new_dir->i_nlink >= EXT3_LINK_MAX) ++ if (!new_inode && new_dir != old_dir && ++ EXT3_DIR_LINK_MAX(new_dir)) + goto end_rename; + } + if (!new_bh) { +@@ -2310,7 +2316,7 @@ static int ext3_rename (struct inode + } + + if (new_inode) { +- new_inode->i_nlink--; ++ ext3_dec_count(handle, new_inode); + new_inode->i_ctime = CURRENT_TIME_SEC; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; +@@ -2321,11 +2327,13 @@ static int ext3_rename (struct inode + PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); + BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, dir_bh); +- old_dir->i_nlink--; ++ ext3_dec_count(handle, old_dir); + if (new_inode) { +- new_inode->i_nlink--; ++ /* checked empty_dir above, can't have another parent, ++ * ext3_dec_count() won't work for many-linked dirs */ ++ new_inode->i_nlink = 0; + } else { +- new_dir->i_nlink++; ++ ext3_inc_count(handle, new_dir); + ext3_update_dx_flag(new_dir); + ext3_mark_inode_dirty(handle, new_dir); + } + +Index: linux-2.6.7/include/linux/ext3_fs.h +=================================================================== +--- 
linux-2.6.7.orig/include/linux/ext3_fs.h 2004-06-15 23:19:36.000000000 -0600 ++++ linux-2.6.7/include/linux/ext3_fs.h 2004-08-20 17:41:27.000000000 -0600 +@@ -79,7 +81,7 @@ + /* + * Maximal count of links to a file + */ +-#define EXT3_LINK_MAX 32000 ++#define EXT3_LINK_MAX 65000 + + /* + * Macro-instructions used to manage several block sizes diff --git a/ldiskfs/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch new file mode 100644 index 0000000..57898d5 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch @@ -0,0 +1,29 @@ +Index: linux-stage/fs/ext3/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/ialloc.c 2005-06-26 10:59:43.048185981 +0200 ++++ linux-stage/fs/ext3/ialloc.c 2005-06-26 11:01:21.317716027 +0200 +@@ -775,7 +775,6 @@ + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); +- cond_resched(); + } + return desc_count; + #endif +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2005-06-26 10:59:43.205412542 +0200 ++++ linux-stage/fs/ext3/super.c 2005-06-26 11:02:29.599941754 +0200 +@@ -2236,11 +2232,9 @@ + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. 
+ */ +- for (i = 0; i < ngroups; i++) { ++ for (i = 0; i < ngroups; i++) + overhead += ext3_bg_has_super(sb, i) + + ext3_bg_num_gdb(sb, i); +- cond_resched(); +- } + + /* + * Every block group has an inode bitmap, a block diff --git a/ldiskfs/kernel_patches/patches/ext3-rename-reserve-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-rename-reserve-2.6-suse.patch new file mode 100644 index 0000000..f323584 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-rename-reserve-2.6-suse.patch @@ -0,0 +1,263 @@ +Index: linux-2.6.5-sles9/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:29:14.878513832 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:32:14.151260232 +0300 +@@ -709,7 +709,7 @@ + unsigned int block_group, + struct buffer_head ** bh); + extern int ext3_should_retry_alloc(struct super_block *sb, int *retries); +-extern void rsv_window_add(struct super_block *sb, struct reserve_window_node *rsv); ++extern void rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv); + + /* dir.c */ + extern int ext3_check_dir_entry(const char *, struct inode *, +Index: linux-2.6.5-sles9/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.6.5-sles9.orig/include/linux/ext3_fs_sb.h 2004-11-09 02:28:18.753046200 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_fs_sb.h 2004-11-09 02:32:27.996155488 +0300 +@@ -86,7 +86,7 @@ + /* root of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; + struct rb_root s_rsv_window_root; +- struct reserve_window_node s_rsv_window_head; ++ struct ext3_reserve_window_node s_rsv_window_head; + + /* Journaling */ + struct inode * s_journal_inode; +Index: linux-2.6.5-sles9/include/linux/ext3_fs_i.h +=================================================================== +--- linux-2.6.5-sles9.orig/include/linux/ext3_fs_i.h 
2004-11-09 02:23:21.606219384 +0300 ++++ linux-2.6.5-sles9/include/linux/ext3_fs_i.h 2004-11-09 02:32:08.752081032 +0300 +@@ -20,17 +20,17 @@ + #include + #include + +-struct reserve_window { ++struct ext3_reserve_window { + __u32 _rsv_start; /* First byte reserved */ + __u32 _rsv_end; /* Last byte reserved or 0 */ + }; + +-struct reserve_window_node { ++struct ext3_reserve_window_node { + struct rb_node rsv_node; + atomic_t rsv_goal_size; + atomic_t rsv_alloc_hit; + seqlock_t rsv_seqlock; +- struct reserve_window rsv_window; ++ struct ext3_reserve_window rsv_window; + }; + + #define rsv_start rsv_window._rsv_start +@@ -76,7 +76,7 @@ + */ + __u32 i_next_alloc_goal; + /* block reservation window */ +- struct reserve_window_node i_rsv_window; ++ struct ext3_reserve_window_node i_rsv_window; + + __u32 i_dir_start_lookup; + #ifdef CONFIG_EXT3_FS_XATTR +Index: linux-2.6.5-sles9/fs/ext3/balloc.c +=================================================================== +--- linux-2.6.5-sles9.orig/fs/ext3/balloc.c 2004-11-09 02:26:53.078070776 +0300 ++++ linux-2.6.5-sles9/fs/ext3/balloc.c 2004-11-09 02:32:43.108858008 +0300 +@@ -115,7 +115,7 @@ + const char *fn) + { + struct rb_node *n; +- struct reserve_window_node *rsv, *prev; ++ struct ext3_reserve_window_node *rsv, *prev; + int bad; + + restart: +@@ -125,7 +125,7 @@ + + printk("Block Allocation Reservation Windows Map (%s):\n", fn); + while (n) { +- rsv = list_entry(n, struct reserve_window_node, rsv_node); ++ rsv = list_entry(n, struct ext3_reserve_window_node, rsv_node); + if (verbose) + printk("reservation window 0x%p " + "start: %d, end: %d\n", +@@ -161,7 +161,7 @@ + #endif + + static int +-goal_in_my_reservation(struct reserve_window *rsv, int goal, ++goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal, + unsigned int group, struct super_block * sb) + { + unsigned long group_first_block, group_last_block; +@@ -184,18 +184,18 @@ + * if the goal is not in any window. 
+ * Returns NULL if there are no windows or if all windows start after the goal. + */ +-static struct reserve_window_node *search_reserve_window(struct rb_root *root, ++static struct ext3_reserve_window_node *search_ext3_reserve_window(struct rb_root *root, + unsigned long goal) + { + struct rb_node *n = root->rb_node; +- struct reserve_window_node *rsv; ++ struct ext3_reserve_window_node *rsv; + + if (!n) + return NULL; + + while (n) + { +- rsv = rb_entry(n, struct reserve_window_node, rsv_node); ++ rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); + + if (goal < rsv->rsv_start) + n = n->rb_left; +@@ -212,13 +212,13 @@ + */ + if (rsv->rsv_start > goal) { + n = rb_prev(&rsv->rsv_node); +- rsv = rb_entry(n, struct reserve_window_node, rsv_node); ++ rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); + } + return rsv; + } + + void rsv_window_add(struct super_block *sb, +- struct reserve_window_node *rsv) ++ struct ext3_reserve_window_node *rsv) + { + struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root; + struct rb_node *node = &rsv->rsv_node; +@@ -226,12 +226,12 @@ + + struct rb_node ** p = &root->rb_node; + struct rb_node * parent = NULL; +- struct reserve_window_node *this; ++ struct ext3_reserve_window_node *this; + + while (*p) + { + parent = *p; +- this = rb_entry(parent, struct reserve_window_node, rsv_node); ++ this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node); + + if (start < this->rsv_start) + p = &(*p)->rb_left; +@@ -246,7 +246,7 @@ + } + + static void rsv_window_remove(struct super_block *sb, +- struct reserve_window_node *rsv) ++ struct ext3_reserve_window_node *rsv) + { + rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; +@@ -254,7 +254,7 @@ + rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root); + } + +-static inline int rsv_is_empty(struct reserve_window *rsv) ++static inline int rsv_is_empty(struct ext3_reserve_window *rsv) + { + /* a valid reservation 
end block could not be 0 */ + return (rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED); +@@ -263,7 +263,7 @@ + void ext3_discard_reservation(struct inode *inode) + { + struct ext3_inode_info *ei = EXT3_I(inode); +- struct reserve_window_node *rsv = &ei->i_rsv_window; ++ struct ext3_reserve_window_node *rsv = &ei->i_rsv_window; + spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock; + + if (!rsv_is_empty(&rsv->rsv_window)) { +@@ -600,7 +600,7 @@ + */ + static int + ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group, +- struct buffer_head *bitmap_bh, int goal, struct reserve_window *my_rsv) ++ struct buffer_head *bitmap_bh, int goal, struct ext3_reserve_window *my_rsv) + { + int group_first_block, start, end; + +@@ -700,13 +700,13 @@ + * on succeed, it returns the reservation window to be appended to. + * failed, return NULL. + */ +-static struct reserve_window_node *find_next_reservable_window( +- struct reserve_window_node *search_head, ++static struct ext3_reserve_window_node *find_next_reservable_window( ++ struct ext3_reserve_window_node *search_head, + unsigned long size, int *start_block, + int last_block) + { + struct rb_node *next; +- struct reserve_window_node *rsv, *prev; ++ struct ext3_reserve_window_node *rsv, *prev; + int cur; + + /* TODO: make the start of the reservation window byte-aligned */ +@@ -734,7 +734,7 @@ + + prev = rsv; + next = rb_next(&rsv->rsv_node); +- rsv = list_entry(next, struct reserve_window_node, rsv_node); ++ rsv = list_entry(next, struct ext3_reserve_window_node, rsv_node); + + /* + * Reached the last reservation, we can just append to the +@@ -801,15 +801,15 @@ + * @group: the group we are trying to allocate in + * @bitmap_bh: the block group block bitmap + */ +-static int alloc_new_reservation(struct reserve_window_node *my_rsv, ++static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, + int goal, struct super_block *sb, + unsigned int group, struct buffer_head *bitmap_bh) + 
{ +- struct reserve_window_node *search_head; ++ struct ext3_reserve_window_node *search_head; + int group_first_block, group_end_block, start_block; + int first_free_block; + int reservable_space_start; +- struct reserve_window_node *prev_rsv; ++ struct ext3_reserve_window_node *prev_rsv; + struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root; + unsigned long size; + +@@ -859,7 +859,7 @@ + /* + * shift the search start to the window near the goal block + */ +- search_head = search_reserve_window(fs_rsv_root, start_block); ++ search_head = search_ext3_reserve_window(fs_rsv_root, start_block); + + /* + * find_next_reservable_window() simply finds a reservable window +@@ -968,7 +968,7 @@ + static int + ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, + unsigned int group, struct buffer_head *bitmap_bh, +- int goal, struct reserve_window_node * my_rsv, ++ int goal, struct ext3_reserve_window_node * my_rsv, + int *errp) + { + spinlock_t *rsv_lock; +@@ -1027,7 +1027,7 @@ + * then we could go to allocate from the reservation window directly. 
+ */ + while (1) { +- struct reserve_window rsv_copy; ++ struct ext3_reserve_window rsv_copy; + unsigned int seq; + + do { +@@ -1159,8 +1159,8 @@ + struct ext3_group_desc *gdp; + struct ext3_super_block *es; + struct ext3_sb_info *sbi; +- struct reserve_window_node *my_rsv = NULL; +- struct reserve_window_node *rsv = &EXT3_I(inode)->i_rsv_window; ++ struct ext3_reserve_window_node *my_rsv = NULL; ++ struct ext3_reserve_window_node *rsv = &EXT3_I(inode)->i_rsv_window; + unsigned short windowsz = 0; + #ifdef EXT3FS_DEBUG + static int goal_hits, goal_attempts; diff --git a/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch new file mode 100644 index 0000000..ef0f4a4 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch @@ -0,0 +1,64 @@ +Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem +From: Mingming Cao + + +If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e. +CONFIG_LBD not defined in the kernel), the calculation of the disk sector +will overflow. Add check at ext3_fill_super() and ext3_group_extend() to +prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4 +bytes. + +Verified this patch on a 32 bit platform without CONFIG_LBD defined +(sector_t is 32 bits long), mount refuse to mount a 10TB ext3. 
+ +Signed-off-by: Mingming Cao +Acked-by: Andreas Dilger +Signed-off-by: Andrew Morton +--- + + fs/ext3/resize.c | 10 ++++++++++ + fs/ext3/super.c | 10 ++++++++++ + 2 files changed, 20 insertions(+) + +diff -puN fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/resize.c +--- devel/fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 ++++ devel-akpm/fs/ext3/resize.c 2006-05-22 14:10:56.000000000 -0700 +@@ -926,6 +926,16 @@ int ext3_group_extend(struct super_block + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + ++ if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ printk(KERN_ERR "EXT3-fs: filesystem on %s: " ++ "too large to resize to %lu blocks safely\n", ++ sb->s_id, n_blocks_count); ++ if (sizeof(sector_t) < 8) ++ ext3_warning(sb, __FUNCTION__, ++ "CONFIG_LBD not enabled\n"); ++ return -EINVAL; ++ } ++ + if (n_blocks_count < o_blocks_count) { + ext3_warning(sb, __FUNCTION__, + "can't shrink FS - resize aborted"); +diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c +--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 ++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700 +@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super + goto failed_mount; + } + ++ if (le32_to_cpu(es->s_blocks_count) > ++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ printk(KERN_ERR "EXT3-fs: filesystem on %s: " ++ "too large to mount safely - %u blocks\n", sb->s_id, ++ le32_to_cpu(es->s_blocks_count)); ++ if (sizeof(sector_t) < 8) ++ printk(KERN_WARNING ++ "EXT3-fs: CONFIG_LBD not enabled\n"); ++ goto failed_mount; ++ } ++ + if (EXT3_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext3; + sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - +_ diff --git 
a/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch b/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch new file mode 100644 index 0000000..fe655da --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch @@ -0,0 +1,44 @@ +Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem +From: Mingming Cao + + +If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e. +CONFIG_LBD not defined in the kernel), the calculation of the disk sector +will overflow. Add check at ext3_fill_super() and ext3_group_extend() to +prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4 +bytes. + +Verified this patch on a 32 bit platform without CONFIG_LBD defined +(sector_t is 32 bits long), mount refuse to mount a 10TB ext3. + +Signed-off-by: Mingming Cao +Acked-by: Andreas Dilger +Signed-off-by: Andrew Morton +--- + + fs/ext3/resize.c | 10 ++++++++++ + fs/ext3/super.c | 10 ++++++++++ + 2 files changed, 20 insertions(+) + +diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c +--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 ++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700 +@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super + goto failed_mount; + } + ++ if (le32_to_cpu(es->s_blocks_count) > ++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ printk(KERN_ERR "EXT3-fs: filesystem on %s: " ++ "too large to mount safely - %u blocks\n", sb->s_id, ++ le32_to_cpu(es->s_blocks_count)); ++ if (sizeof(sector_t) < 8) ++ printk(KERN_WARNING ++ "EXT3-fs: CONFIG_LBD not enabled\n"); ++ goto failed_mount; ++ } ++ + sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) + + EXT3_BLOCKS_PER_GROUP(sb) - 1) / +_ diff --git a/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch 
b/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch new file mode 100644 index 0000000..9bfdf80 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch @@ -0,0 +1,64 @@ +Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem +From: Mingming Cao + + +If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e. +CONFIG_LBD not defined in the kernel), the calculation of the disk sector +will overflow. Add check at ext3_fill_super() and ext3_group_extend() to +prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4 +bytes. + +Verified this patch on a 32 bit platform without CONFIG_LBD defined +(sector_t is 32 bits long), mount refuse to mount a 10TB ext3. + +Signed-off-by: Mingming Cao +Acked-by: Andreas Dilger +Signed-off-by: Andrew Morton +--- + + fs/ext3/resize.c | 10 ++++++++++ + fs/ext3/super.c | 10 ++++++++++ + 2 files changed, 20 insertions(+) + +diff -puN fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/resize.c +--- devel/fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 ++++ devel-akpm/fs/ext3/resize.c 2006-05-22 14:10:56.000000000 -0700 +@@ -926,6 +926,16 @@ int ext3_group_extend(struct super_block + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + ++ if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ printk(KERN_ERR "EXT3-fs: filesystem on %s: " ++ "too large to resize to %lu blocks safely\n", ++ sb->s_id, n_blocks_count); ++ if (sizeof(sector_t) < 8) ++ ext3_warning(sb, __FUNCTION__, ++ "CONFIG_LBD not enabled\n"); ++ return -EINVAL; ++ } ++ + if (n_blocks_count < o_blocks_count) { + ext3_warning(sb, __FUNCTION__, + "can't shrink FS - resize aborted"); +diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c +--- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 
14:09:53.000000000 -0700 ++++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700 +@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super + goto failed_mount; + } + ++ if (le32_to_cpu(es->s_blocks_count) > ++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ printk(KERN_ERR "EXT3-fs: filesystem on %s: " ++ "too large to mount safely - %u blocks\n", sb->s_id, ++ le32_to_cpu(es->s_blocks_count)); ++ if (sizeof(sector_t) < 8) ++ printk(KERN_WARNING ++ "EXT3-fs: CONFIG_LBD not enabled\n"); ++ goto failed_mount; ++ } ++ + sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) + + EXT3_BLOCKS_PER_GROUP(sb) - 1) / +_ diff --git a/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-rhel4.patch new file mode 100644 index 0000000..b586a2f --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-rhel4.patch @@ -0,0 +1,180 @@ + fs/ext3/ialloc.c | 35 ++++++++++++++++++++++++++++++++++- + fs/ext3/ioctl.c | 25 +++++++++++++++++++++++++ + fs/ext3/namei.c | 21 +++++++++++++++++---- + include/linux/dcache.h | 5 +++++ + include/linux/ext3_fs.h | 5 ++++- + 5 files changed, 85 insertions(+), 6 deletions(-) + +Index: uml-2.6.3/fs/ext3/ialloc.c +=================================================================== +--- uml-2.6.3.orig/fs/ext3/ialloc.c 2004-02-20 15:00:48.000000000 +0800 ++++ uml-2.6.3/fs/ext3/ialloc.c 2004-02-21 00:24:45.202693776 +0800 +@@ -420,7 +420,8 @@ + * For other inodes, search forward from the parent directory's block + * group to find a free inode. 
+ */ +-struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) ++struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode, ++ unsigned long goal) + { + struct super_block *sb; + struct buffer_head *bitmap_bh = NULL; +@@ -448,6 +449,41 @@ + + sbi = EXT3_SB(sb); + es = sbi->s_es; ++ if (goal) { ++ group = (goal - 1) / EXT3_INODES_PER_GROUP(sb); ++ ino = (goal - 1) % EXT3_INODES_PER_GROUP(sb); ++ err = -EIO; ++ ++ gdp = ext3_get_group_desc(sb, group, &bh2); ++ if (!gdp) ++ goto fail; ++ ++ bitmap_bh = read_inode_bitmap (sb, group); ++ if (!bitmap_bh) ++ goto fail; ++ ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) goto fail; ++ ++ if (ext3_set_bit_atomic(sb_bgl_lock(sbi, group), ++ ino, bitmap_bh->b_data)) { ++ printk(KERN_ERR "goal inode %lu unavailable\n", goal); ++ /* Oh well, we tried. */ ++ goto continue_allocation; ++ } ++ ++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ if (err) goto fail; ++ ++ /* We've shortcircuited the allocation system successfully, ++ * now finish filling in the inode. 
++ */ ++ goto got; ++ } ++ ++continue_allocation: + if (S_ISDIR(mode)) { + if (test_opt (sb, OLDALLOC)) + group = find_group_dir(sb, dir); +Index: uml-2.6.3/fs/ext3/ioctl.c +=================================================================== +--- uml-2.6.3.orig/fs/ext3/ioctl.c 2004-01-09 14:59:26.000000000 +0800 ++++ uml-2.6.3/fs/ext3/ioctl.c 2004-02-21 00:21:04.541239416 +0800 +@@ -24,6 +24,31 @@ + ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { ++ case EXT3_IOC_CREATE_INUM: { ++ char name[32]; ++ struct dentry *dchild, *dparent; ++ int rc = 0; ++ ++ dparent = list_entry(inode->i_dentry.next, struct dentry, ++ d_alias); ++ snprintf(name, sizeof name, "%lu", arg); ++ dchild = lookup_one_len(name, dparent, strlen(name)); ++ if (dchild->d_inode) { ++ printk(KERN_ERR "%*s/%lu already exists (ino %lu)\n", ++ dparent->d_name.len, dparent->d_name.name, arg, ++ dchild->d_inode->i_ino); ++ rc = -EEXIST; ++ } else { ++ dchild->d_fsdata = (void *)arg; ++ rc = vfs_create(inode, dchild, 0644, NULL); ++ if (rc) ++ printk(KERN_ERR "vfs_create: %d\n", rc); ++ else if (dchild->d_inode->i_ino != arg) ++ rc = -EEXIST; ++ } ++ dput(dchild); ++ return rc; ++ } + case EXT3_IOC_GETFLAGS: + flags = ei->i_flags & EXT3_FL_USER_VISIBLE; + return put_user(flags, (int *) arg); +Index: uml-2.6.3/fs/ext3/namei.c +=================================================================== +--- uml-2.6.3.orig/fs/ext3/namei.c 2004-02-20 15:01:27.000000000 +0800 ++++ uml-2.6.3/fs/ext3/namei.c 2004-02-21 00:21:04.611228776 +0800 +@@ -1617,6 +1617,19 @@ + return err; + } + ++static struct inode * ext3_new_inode_wantedi(handle_t *handle, struct inode *dir, ++ int mode, struct dentry *dentry) ++{ ++ unsigned long inum = 0; ++ ++ if (dentry->d_fsdata != NULL) { ++ struct dentry_params *param = ++ (struct dentry_params *) dentry->d_fsdata; ++ inum = param->p_inum; ++ } ++ return ext3_new_inode(handle, dir, mode, inum); ++} ++ + /* + * By the time this is called, we already have created + * the 
directory cache entry for the new file, but it +@@ -1640,7 +1653,7 @@ + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, mode); ++ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext3_file_inode_operations; +@@ -1670,7 +1683,7 @@ + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, mode); ++ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +@@ -1702,7 +1715,7 @@ + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, S_IFDIR | mode); ++ inode = ext3_new_inode_wantedi (handle, dir, S_IFDIR | mode, dentry); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +@@ -2094,7 +2107,7 @@ + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); ++ inode = ext3_new_inode_wantedi (handle, dir, S_IFLNK|S_IRWXUGO, dentry); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +Index: uml-2.6.3/include/linux/ext3_fs.h +=================================================================== +--- uml-2.6.3.orig/include/linux/ext3_fs.h 2004-01-09 14:59:44.000000000 +0800 ++++ uml-2.6.3/include/linux/ext3_fs.h 2004-02-21 00:21:04.613228472 +0800 +@@ -707,7 +708,8 @@ + dx_hash_info *hinfo); + + /* ialloc.c */ +-extern struct inode * ext3_new_inode (handle_t *, struct inode *, int); ++extern struct inode * ext3_new_inode (handle_t *, struct inode *, int, ++ unsigned long); + extern void ext3_free_inode (handle_t *, struct inode *); + extern struct inode * ext3_orphan_get (struct super_block *, unsigned long); + extern unsigned long ext3_count_free_inodes (struct super_block *); +@@ -792,4 +794,6 @@ + + #endif /* __KERNEL__ */ + ++/* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ ++#define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) + #endif /* _LINUX_EXT3_FS_H */ diff --git a/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-suse.patch index 4fd69a5..33535dc 100644 --- a/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-suse.patch @@ -5,10 +5,10 @@ include/linux/ext3_fs.h | 5 ++++- 5 files changed, 85 insertions(+), 6 deletions(-) -Index: linux-2.6.7/fs/ext3/ialloc.c +Index: uml-2.6.3/fs/ext3/ialloc.c =================================================================== ---- linux-2.6.7.orig/fs/ext3/ialloc.c 2005-03-24 00:27:43.282608616 +0800 -+++ linux-2.6.7/fs/ext3/ialloc.c 2005-03-24 00:27:43.888516504 +0800 +--- uml-2.6.3.orig/fs/ext3/ialloc.c 2004-02-20 15:00:48.000000000 +0800 ++++ uml-2.6.3/fs/ext3/ialloc.c 2004-02-21 00:24:45.202693776 +0800 @@ -420,7 +420,8 @@ * For other inodes, search forward from the parent directory's block * group to find a free inode. 
@@ -19,16 +19,19 @@ Index: linux-2.6.7/fs/ext3/ialloc.c { struct super_block *sb; struct buffer_head *bitmap_bh = NULL; -@@ -448,6 +449,38 @@ +@@ -448,6 +449,41 @@ sbi = EXT3_SB(sb); es = sbi->s_es; + if (goal) { + group = (goal - 1) / EXT3_INODES_PER_GROUP(sb); + ino = (goal - 1) % EXT3_INODES_PER_GROUP(sb); ++ err = -EIO; ++ + gdp = ext3_get_group_desc(sb, group, &bh2); ++ if (!gdp) ++ goto fail; + -+ err = -EIO; + bitmap_bh = read_inode_bitmap (sb, group); + if (!bitmap_bh) + goto fail; @@ -58,19 +61,11 @@ Index: linux-2.6.7/fs/ext3/ialloc.c if (S_ISDIR(mode)) { if (test_opt (sb, OLDALLOC)) group = find_group_dir(sb, dir); -Index: linux-2.6.7/fs/ext3/ioctl.c +Index: uml-2.6.3/fs/ext3/ioctl.c =================================================================== ---- linux-2.6.7.orig/fs/ext3/ioctl.c 2004-06-16 13:19:13.000000000 +0800 -+++ linux-2.6.7/fs/ext3/ioctl.c 2005-03-24 00:31:16.113253440 +0800 -@@ -9,6 +9,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -24,6 +25,31 @@ +--- uml-2.6.3.orig/fs/ext3/ioctl.c 2004-01-09 14:59:26.000000000 +0800 ++++ uml-2.6.3/fs/ext3/ioctl.c 2004-02-21 00:21:04.541239416 +0800 +@@ -24,6 +24,31 @@ ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); switch (cmd) { @@ -101,12 +96,12 @@ Index: linux-2.6.7/fs/ext3/ioctl.c + } case EXT3_IOC_GETFLAGS: flags = ei->i_flags & EXT3_FL_USER_VISIBLE; - return put_user(flags, (int __user *) arg); -Index: linux-2.6.7/fs/ext3/namei.c + return put_user(flags, (int *) arg); +Index: uml-2.6.3/fs/ext3/namei.c =================================================================== ---- linux-2.6.7.orig/fs/ext3/namei.c 2005-03-24 00:27:43.536570008 +0800 -+++ linux-2.6.7/fs/ext3/namei.c 2005-03-24 00:27:43.893515744 +0800 -@@ -1939,6 +1939,19 @@ +--- uml-2.6.3.orig/fs/ext3/namei.c 2004-02-20 15:01:27.000000000 +0800 ++++ uml-2.6.3/fs/ext3/namei.c 2004-02-21 00:21:04.611228776 +0800 +@@ -1617,6 +1617,19 @@ return err; } @@ -126,7 +121,7 @@ Index: linux-2.6.7/fs/ext3/namei.c /* * By 
the time this is called, we already have created * the directory cache entry for the new file, but it -@@ -1963,7 +1976,7 @@ +@@ -1640,7 +1653,7 @@ if (IS_DIRSYNC(dir)) handle->h_sync = 1; @@ -135,7 +130,7 @@ Index: linux-2.6.7/fs/ext3/namei.c err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext3_file_inode_operations; -@@ -1994,7 +2007,7 @@ +@@ -1670,7 +1683,7 @@ if (IS_DIRSYNC(dir)) handle->h_sync = 1; @@ -144,7 +139,7 @@ Index: linux-2.6.7/fs/ext3/namei.c err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); -@@ -2027,7 +2040,7 @@ +@@ -1702,7 +1715,7 @@ if (IS_DIRSYNC(dir)) handle->h_sync = 1; @@ -153,7 +148,7 @@ Index: linux-2.6.7/fs/ext3/namei.c err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; -@@ -2439,7 +2452,7 @@ +@@ -2094,7 +2107,7 @@ if (IS_DIRSYNC(dir)) handle->h_sync = 1; @@ -162,10 +157,10 @@ Index: linux-2.6.7/fs/ext3/namei.c err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; -Index: linux-2.6.7/include/linux/ext3_fs.h +Index: uml-2.6.3/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.7.orig/include/linux/ext3_fs.h 2005-03-24 00:27:43.542569096 +0800 -+++ linux-2.6.7/include/linux/ext3_fs.h 2005-03-24 00:27:43.893515744 +0800 +--- uml-2.6.3.orig/include/linux/ext3_fs.h 2004-01-09 14:59:44.000000000 +0800 ++++ uml-2.6.3/include/linux/ext3_fs.h 2004-02-21 00:21:04.613228472 +0800 @@ -203,6 +203,7 @@ #define EXT3_IOC_SETFLAGS _IOW('f', 2, long) #define EXT3_IOC_GETVERSION _IOR('f', 3, long) @@ -174,7 +169,7 @@ Index: linux-2.6.7/include/linux/ext3_fs.h #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long) #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long) #ifdef CONFIG_JBD_DEBUG -@@ -708,7 +709,8 @@ +@@ -707,7 +708,8 @@ dx_hash_info *hinfo); /* ialloc.c */ @@ -184,7 +179,7 @@ Index: linux-2.6.7/include/linux/ext3_fs.h extern void ext3_free_inode (handle_t *, struct inode *); extern struct inode * ext3_orphan_get (struct super_block *, 
unsigned long); extern unsigned long ext3_count_free_inodes (struct super_block *); -@@ -793,4 +795,5 @@ +@@ -792,4 +794,5 @@ #endif /* __KERNEL__ */ diff --git a/ldiskfs/kernel_patches/patches/iopen-2.6-fc5.patch b/ldiskfs/kernel_patches/patches/iopen-2.6-fc5.patch new file mode 100644 index 0000000..6bbcec5 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/iopen-2.6-fc5.patch @@ -0,0 +1,448 @@ +Index: linux-2.6.16.i686/fs/ext3/iopen.c +=================================================================== +--- linux-2.6.16.i686.orig/fs/ext3/iopen.c 2006-05-31 04:14:15.752410384 +0800 ++++ linux-2.6.16.i686/fs/ext3/iopen.c 2006-05-30 22:52:38.000000000 +0800 +@@ -0,0 +1,259 @@ ++/* ++ * linux/fs/ext3/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. ++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. ++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. ++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "iopen.h" ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#define IOPEN_NAME_LEN 32 ++ ++/* ++ * This implements looking up an inode by number. ++ */ ++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct inode *inode; ++ unsigned long ino; ++ struct list_head *lp; ++ struct dentry *alternate; ++ char buf[IOPEN_NAME_LEN]; ++ ++ if (dentry->d_name.len >= IOPEN_NAME_LEN) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ memcpy(buf, dentry->d_name.name, dentry->d_name.len); ++ buf[dentry->d_name.len] = 0; ++ ++ if (strcmp(buf, ".") == 0) ++ ino = dir->i_ino; ++ else if (strcmp(buf, "..") == 0) ++ ino = EXT3_ROOT_INO; ++ else ++ ino = simple_strtoul(buf, 0, 0); ++ ++ if ((ino != EXT3_ROOT_INO && ++ //ino != EXT3_ACL_IDX_INO && ++ //ino != EXT3_ACL_DATA_INO && ++ ino < EXT3_FIRST_INO(dir->i_sb)) || ++ ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) ++ return ERR_PTR(-ENOENT); ++ ++ inode = iget(dir->i_sb, ino); ++ if (!inode) ++ return ERR_PTR(-EACCES); ++ if (is_bad_inode(inode)) { ++ iput(inode); ++ return ERR_PTR(-ENOENT); ++ } ++ ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ ++ /* preferrably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, struct dentry, d_alias); ++ assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); ++ } ++ ++ if (!list_empty(&inode->i_dentry)) { ++ alternate = list_entry(inode->i_dentry.next, ++ struct dentry, d_alias); ++ dget_locked(alternate); ++ spin_lock(&alternate->d_lock); ++ alternate->d_flags |= DCACHE_REFERENCED; ++ spin_unlock(&alternate->d_lock); ++ iput(inode); ++ spin_unlock(&dcache_lock); ++ return alternate; ++ } ++ dentry->d_flags |= DCACHE_DISCONNECTED; ++ ++ /* d_add(), but don't drop 
dcache_lock before adding dentry to inode */ ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++ spin_unlock(&dcache_lock); ++ ++ d_rehash(dentry); ++ ++ return NULL; ++} ++ ++/* This function is spliced into ext3_lookup and does the move of a ++ * disconnected dentry (if it exists) to a connected dentry. ++ */ ++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, ++ int rehash) ++{ ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* verify this dentry is really new */ ++ assert(dentry->d_inode == NULL); ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ if (rehash) ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ assert(list_empty(&dentry->d_subdirs)); ++ ++ spin_lock(&dcache_lock); ++ if (!inode) ++ goto do_rehash; ++ ++ if (!test_opt(inode->i_sb, IOPEN)) ++ goto do_instantiate; ++ ++ /* preferrably return a connected dentry */ ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ if (tmp->d_flags & DCACHE_DISCONNECTED) { ++ assert(tmp->d_alias.next == &inode->i_dentry); ++ assert(tmp->d_alias.prev == &inode->i_dentry); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ } ++ ++ if (!goal) ++ goto do_instantiate; ++ ++ /* Move the goal to the de hash queue */ ++ goal->d_flags &= ~DCACHE_DISCONNECTED; ++ security_d_instantiate(goal, inode); ++ __d_drop(dentry); ++ spin_unlock(&dcache_lock); ++ d_rehash(dentry); ++ d_move(goal, dentry); ++ iput(inode); ++ ++ return goal; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++do_instantiate: ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++do_rehash: ++ spin_unlock(&dcache_lock); ++ if (rehash) ++ d_rehash(dentry); ++ ++ return NULL; ++} ++ ++/* ++ * These are the special structures for the iopen pseudo directory. 
++ */ ++ ++static struct inode_operations iopen_inode_operations = { ++ lookup: iopen_lookup, /* BKL held */ ++}; ++ ++static struct file_operations iopen_file_operations = { ++ read: generic_read_dir, ++}; ++ ++static int match_dentry(struct dentry *dentry, const char *name) ++{ ++ int len; ++ ++ len = strlen(name); ++ if (dentry->d_name.len != len) ++ return 0; ++ if (strncmp(dentry->d_name.name, name, len)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * This function is spliced into ext3_lookup and returns 1 the file ++ * name is __iopen__ and dentry has been filled in appropriately. ++ */ ++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ ++ if (dir->i_ino != EXT3_ROOT_INO || ++ !test_opt(dir->i_sb, IOPEN) || ++ !match_dentry(dentry, "__iopen__")) ++ return 0; ++ ++ inode = iget(dir->i_sb, EXT3_BAD_INO); ++ ++ if (!inode) ++ return 0; ++ d_add(dentry, inode); ++ return 1; ++} ++ ++/* ++ * This function is spliced into read_inode; it returns 1 if inode ++ * number is the one for /__iopen__, in which case the inode is filled ++ * in appropriately. Otherwise, this fuction returns 0. 
++ */ ++int ext3_iopen_get_inode(struct inode *inode) ++{ ++ if (inode->i_ino != EXT3_BAD_INO) ++ return 0; ++ ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) ++ inode->i_mode |= 0777; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_size = 4096; ++ inode->i_atime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = CURRENT_TIME; ++ EXT3_I(inode)->i_dtime = 0; ++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size ++ * (for stat), not the fs block ++ * size */ ++ inode->i_blocks = 0; ++ inode->i_version = 1; ++ inode->i_generation = 0; ++ ++ inode->i_op = &iopen_inode_operations; ++ inode->i_fop = &iopen_file_operations; ++ inode->i_mapping->a_ops = 0; ++ ++ return 1; ++} +Index: linux-2.6.16.i686/fs/ext3/iopen.h +=================================================================== +--- linux-2.6.16.i686.orig/fs/ext3/iopen.h 2006-05-31 04:14:15.752410384 +0800 ++++ linux-2.6.16.i686/fs/ext3/iopen.h 2006-05-30 22:52:38.000000000 +0800 +@@ -0,0 +1,15 @@ ++/* ++ * iopen.h ++ * ++ * Special support for opening files by inode number. ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. 
++ */ ++ ++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext3_iopen_get_inode(struct inode *inode); ++extern struct dentry *iopen_connect_dentry(struct dentry *dentry, ++ struct inode *inode, int rehash); +Index: linux-2.6.16.i686/fs/ext3/inode.c +=================================================================== +--- linux-2.6.16.i686.orig/fs/ext3/inode.c 2006-05-30 22:52:03.000000000 +0800 ++++ linux-2.6.16.i686/fs/ext3/inode.c 2006-05-30 22:52:38.000000000 +0800 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + static int ext3_writepage_trans_blocks(struct inode *inode); +@@ -2448,6 +2449,8 @@ + ei->i_default_acl = EXT3_ACL_NOT_CACHED; + #endif + ei->i_block_alloc_info = NULL; ++ if (ext3_iopen_get_inode(inode)) ++ return; + + if (__ext3_get_inode_loc(inode, &iloc, 0)) + goto bad_inode; +Index: linux-2.6.16.i686/fs/ext3/super.c +=================================================================== +--- linux-2.6.16.i686.orig/fs/ext3/super.c 2006-05-30 22:52:03.000000000 +0800 ++++ linux-2.6.16.i686/fs/ext3/super.c 2006-05-30 22:52:38.000000000 +0800 +@@ -634,6 +634,7 @@ + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_grpquota + }; + +@@ -682,6 +683,9 @@ + {Opt_noquota, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -996,6 +1000,18 @@ + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; ++ case Opt_iopen: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_noiopen: ++ clear_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt 
(sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_iopen_nopriv: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; + case Opt_ignore: + break; + case Opt_resize: +Index: linux-2.6.16.i686/fs/ext3/namei.c +=================================================================== +--- linux-2.6.16.i686.orig/fs/ext3/namei.c 2006-05-30 22:52:00.000000000 +0800 ++++ linux-2.6.16.i686/fs/ext3/namei.c 2006-05-30 22:55:19.000000000 +0800 +@@ -39,6 +39,7 @@ + + #include "namei.h" + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + /* +@@ -995,6 +996,9 @@ + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + ++ if (ext3_check_for_iopen(dir, dentry)) ++ return NULL; ++ + bh = ext3_find_entry(dentry, &de); + inode = NULL; + if (bh) { +@@ -1005,7 +1009,7 @@ + if (!inode) + return ERR_PTR(-EACCES); + } +- return d_splice_alias(inode, dentry); ++ return iopen_connect_dentry(dentry, inode, 1); + } + + +@@ -2046,10 +2050,6 @@ + inode->i_nlink); + inode->i_version++; + inode->i_nlink = 0; +- /* There's no need to set i_disksize: the fact that i_nlink is +- * zero will ensure that the right thing happens during any +- * recovery. 
*/ +- inode->i_size = 0; + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); +@@ -2173,6 +2173,23 @@ + return err; + } + ++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ ++static int ext3_add_link(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ int err = ext3_add_entry(handle, dentry, inode); ++ if (!err) { ++ err = ext3_mark_inode_dirty(handle, inode); ++ if (err == 0) { ++ dput(iopen_connect_dentry(dentry, inode, 0)); ++ return 0; ++ } ++ } ++ ext3_dec_count(handle, inode); ++ iput(inode); ++ return err; ++} ++ + static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) + { +@@ -2196,7 +2213,8 @@ + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); + +- err = ext3_add_nondir(handle, dentry, inode); ++ err = ext3_add_link(handle, dentry, inode); ++ ext3_orphan_del(handle, inode); + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; +Index: linux-2.6.16.i686/fs/ext3/Makefile +=================================================================== +--- linux-2.6.16.i686.orig/fs/ext3/Makefile 2006-03-20 13:53:29.000000000 +0800 ++++ linux-2.6.16.i686/fs/ext3/Makefile 2006-05-30 22:52:38.000000000 +0800 +@@ -4,7 +4,7 @@ + + obj-$(CONFIG_EXT3_FS) += ext3.o + +-ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +Index: linux-2.6.16.i686/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.16.i686.orig/include/linux/ext3_fs.h 2006-05-30 22:52:00.000000000 +0800 ++++ linux-2.6.16.i686/include/linux/ext3_fs.h 2006-05-30 22:52:38.000000000 +0800 +@@ -375,6 +375,8 @@ + 
#define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */ + #define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ + #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ ++#define EXT3_MOUNT_IOPEN 0x400000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H diff --git a/ldiskfs/kernel_patches/patches/iopen-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/iopen-2.6-rhel4.patch new file mode 100644 index 0000000..98dbca4 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/iopen-2.6-rhel4.patch @@ -0,0 +1,471 @@ +Index: linux-stage/fs/ext3/Makefile +=================================================================== +--- linux-stage.orig/fs/ext3/Makefile 2005-02-25 14:31:53.151076368 +0200 ++++ linux-stage/fs/ext3/Makefile 2005-02-25 14:41:51.259150120 +0200 +@@ -4,7 +4,7 @@ + + obj-$(CONFIG_EXT3_FS) += ext3.o + +-ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2005-02-25 14:37:30.983718000 +0200 ++++ linux-stage/fs/ext3/inode.c 2005-02-25 14:47:42.069818792 +0200 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + /* +@@ -2408,6 +2409,8 @@ + ei->i_default_acl = EXT3_ACL_NOT_CACHED; + #endif + ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; ++ if (ext3_iopen_get_inode(inode)) ++ return; + + if (ext3_get_inode_loc(inode, &iloc, 0)) + goto bad_inode; +Index: linux-stage/fs/ext3/iopen.c +=================================================================== +--- 
linux-stage.orig/fs/ext3/iopen.c 2005-02-25 14:41:01.017787968 +0200 ++++ linux-stage/fs/ext3/iopen.c 2005-02-25 14:41:01.045783712 +0200 +@@ -0,0 +1,278 @@ ++/* ++ * linux/fs/ext3/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. ++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. ++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. ++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "iopen.h" ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#define IOPEN_NAME_LEN 32 ++ ++/* ++ * This implements looking up an inode by number. 
++ */ ++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct inode *inode; ++ unsigned long ino; ++ struct list_head *lp; ++ struct dentry *alternate; ++ char buf[IOPEN_NAME_LEN]; ++ ++ if (dentry->d_name.len >= IOPEN_NAME_LEN) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ memcpy(buf, dentry->d_name.name, dentry->d_name.len); ++ buf[dentry->d_name.len] = 0; ++ ++ if (strcmp(buf, ".") == 0) ++ ino = dir->i_ino; ++ else if (strcmp(buf, "..") == 0) ++ ino = EXT3_ROOT_INO; ++ else ++ ino = simple_strtoul(buf, 0, 0); ++ ++ if ((ino != EXT3_ROOT_INO && ++ //ino != EXT3_ACL_IDX_INO && ++ //ino != EXT3_ACL_DATA_INO && ++ ino < EXT3_FIRST_INO(dir->i_sb)) || ++ ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) ++ return ERR_PTR(-ENOENT); ++ ++ inode = iget(dir->i_sb, ino); ++ if (!inode) ++ return ERR_PTR(-EACCES); ++ if (is_bad_inode(inode)) { ++ iput(inode); ++ return ERR_PTR(-ENOENT); ++ } ++ ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ ++ /* preferrably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, struct dentry, d_alias); ++ assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); ++ } ++ ++ if (!list_empty(&inode->i_dentry)) { ++ alternate = list_entry(inode->i_dentry.next, ++ struct dentry, d_alias); ++ dget_locked(alternate); ++ spin_lock(&alternate->d_lock); ++ alternate->d_flags |= DCACHE_REFERENCED; ++ spin_unlock(&alternate->d_lock); ++ iput(inode); ++ spin_unlock(&dcache_lock); ++ return alternate; ++ } ++ dentry->d_flags |= DCACHE_DISCONNECTED; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++ ++ __d_rehash(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; ++} ++ ++#define do_switch(x,y) do { \ ++ 
__typeof__ (x) __tmp = x; \ ++ x = y; y = __tmp; } while (0) ++ ++static inline void switch_names(struct dentry *dentry, struct dentry *target) ++{ ++ const unsigned char *old_name, *new_name; ++ ++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN_MIN); ++ old_name = target->d_name.name; ++ new_name = dentry->d_name.name; ++ if (old_name == target->d_iname) ++ old_name = dentry->d_iname; ++ if (new_name == dentry->d_iname) ++ new_name = target->d_iname; ++ target->d_name.name = new_name; ++ dentry->d_name.name = old_name; ++} ++ ++/* This function is spliced into ext3_lookup and does the move of a ++ * disconnected dentry (if it exists) to a connected dentry. ++ */ ++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, ++ int rehash) ++{ ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* verify this dentry is really new */ ++ assert(dentry->d_inode == NULL); ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ if (rehash) ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ assert(list_empty(&dentry->d_subdirs)); ++ ++ spin_lock(&dcache_lock); ++ if (!inode) ++ goto do_rehash; ++ ++ if (!test_opt(inode->i_sb, IOPEN)) ++ goto do_instantiate; ++ ++ /* preferrably return a connected dentry */ ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ if (tmp->d_flags & DCACHE_DISCONNECTED) { ++ assert(tmp->d_alias.next == &inode->i_dentry); ++ assert(tmp->d_alias.prev == &inode->i_dentry); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ } ++ ++ if (!goal) ++ goto do_instantiate; ++ ++ /* Move the goal to the de hash queue */ ++ goal->d_flags &= ~DCACHE_DISCONNECTED; ++ security_d_instantiate(goal, inode); ++ __d_drop(dentry); ++ __d_rehash(dentry, 0); ++ __d_move(goal, dentry); ++ spin_unlock(&dcache_lock); ++ iput(inode); ++ ++ return goal; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++do_instantiate: ++ list_add(&dentry->d_alias, 
&inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++do_rehash: ++ if (rehash) ++ __d_rehash(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; ++} ++ ++/* ++ * These are the special structures for the iopen pseudo directory. ++ */ ++ ++static struct inode_operations iopen_inode_operations = { ++ lookup: iopen_lookup, /* BKL held */ ++}; ++ ++static struct file_operations iopen_file_operations = { ++ read: generic_read_dir, ++}; ++ ++static int match_dentry(struct dentry *dentry, const char *name) ++{ ++ int len; ++ ++ len = strlen(name); ++ if (dentry->d_name.len != len) ++ return 0; ++ if (strncmp(dentry->d_name.name, name, len)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * This function is spliced into ext3_lookup and returns 1 the file ++ * name is __iopen__ and dentry has been filled in appropriately. ++ */ ++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ ++ if (dir->i_ino != EXT3_ROOT_INO || ++ !test_opt(dir->i_sb, IOPEN) || ++ !match_dentry(dentry, "__iopen__")) ++ return 0; ++ ++ inode = iget(dir->i_sb, EXT3_BAD_INO); ++ ++ if (!inode) ++ return 0; ++ d_add(dentry, inode); ++ return 1; ++} ++ ++/* ++ * This function is spliced into read_inode; it returns 1 if inode ++ * number is the one for /__iopen__, in which case the inode is filled ++ * in appropriately. Otherwise, this fuction returns 0. 
++ */ ++int ext3_iopen_get_inode(struct inode *inode) ++{ ++ if (inode->i_ino != EXT3_BAD_INO) ++ return 0; ++ ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) ++ inode->i_mode |= 0777; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_size = 4096; ++ inode->i_atime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = CURRENT_TIME; ++ EXT3_I(inode)->i_dtime = 0; ++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size ++ * (for stat), not the fs block ++ * size */ ++ inode->i_blocks = 0; ++ inode->i_version = 1; ++ inode->i_generation = 0; ++ ++ inode->i_op = &iopen_inode_operations; ++ inode->i_fop = &iopen_file_operations; ++ inode->i_mapping->a_ops = 0; ++ ++ return 1; ++} +Index: linux-stage/fs/ext3/iopen.h +=================================================================== +--- linux-stage.orig/fs/ext3/iopen.h 2005-02-25 14:41:01.017787968 +0200 ++++ linux-stage/fs/ext3/iopen.h 2005-02-25 14:41:01.045783712 +0200 +@@ -0,0 +1,15 @@ ++/* ++ * iopen.h ++ * ++ * Special support for opening files by inode number. ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. 
++ */ ++ ++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext3_iopen_get_inode(struct inode *inode); ++extern struct dentry *iopen_connect_dentry(struct dentry *dentry, ++ struct inode *inode, int rehash); +Index: linux-stage/fs/ext3/namei.c +=================================================================== +--- linux-stage.orig/fs/ext3/namei.c 2005-02-25 14:37:28.975023368 +0200 ++++ linux-stage/fs/ext3/namei.c 2005-02-25 14:46:43.090784968 +0200 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + /* +@@ -980,6 +981,9 @@ + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + ++ if (ext3_check_for_iopen(dir, dentry)) ++ return NULL; ++ + bh = ext3_find_entry(dentry, &de); + inode = NULL; + if (bh) { +@@ -990,10 +994,8 @@ + if (!inode) + return ERR_PTR(-EACCES); + } +- if (inode) +- return d_splice_alias(inode, dentry); +- d_add(dentry, inode); +- return NULL; ++ ++ return iopen_connect_dentry(dentry, inode, 1); + } + + +@@ -2037,10 +2039,6 @@ + inode->i_nlink); + inode->i_version++; + inode->i_nlink = 0; +- /* There's no need to set i_disksize: the fact that i_nlink is +- * zero will ensure that the right thing happens during any +- * recovery. 
*/ +- inode->i_size = 0; + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); +@@ -2163,6 +2161,23 @@ + return err; + } + ++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ ++static int ext3_add_link(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ int err = ext3_add_entry(handle, dentry, inode); ++ if (!err) { ++ err = ext3_mark_inode_dirty(handle, inode); ++ if (err == 0) { ++ dput(iopen_connect_dentry(dentry, inode, 0)); ++ return 0; ++ } ++ } ++ ext3_dec_count(handle, inode); ++ iput(inode); ++ return err; ++} ++ + static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) + { +@@ -2186,7 +2201,8 @@ + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); + +- err = ext3_add_nondir(handle, dentry, inode); ++ err = ext3_add_link(handle, dentry, inode); ++ ext3_orphan_del(handle, inode); + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2005-02-25 14:37:30.987717392 +0200 ++++ linux-stage/fs/ext3/super.c 2005-02-25 14:44:50.495901992 +0200 +@@ -586,6 +586,7 @@ + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + }; + + static match_table_t tokens = { +@@ -633,6 +634,9 @@ + {Opt_ignore, "noquota"}, + {Opt_ignore, "quota"}, + {Opt_ignore, "usrquota"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -914,6 +918,18 @@ + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; ++ case Opt_iopen: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ 
clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_noiopen: ++ clear_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_iopen_nopriv: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; + case Opt_ignore: + break; + case Opt_resize: +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 14:37:28.977023064 +0200 ++++ linux-stage/include/linux/ext3_fs.h 2005-02-25 14:49:00.569884968 +0200 +@@ -355,6 +355,8 @@ + #define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ + #define EXT3_MOUNT_BARRIER 0x10000 /* Use block barriers */ + #define EXT3_MOUNT_RESERVATION 0x20000 /* Preallocation */ ++#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H diff --git a/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch b/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch index 4a51eb8..1c5e900 100644 --- a/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch @@ -1,15 +1,7 @@ - fs/ext3/inode.c | 3 - fs/ext3/iopen.c | 239 +++++++++++++++++++++++++++++++++++++ - fs/ext3/iopen.h | 15 ++ - fs/ext3/namei.c | 13 ++ - fs/ext3/super.c | 17 ++ - include/linux/ext3_fs.h | 2 - 7 files changed, 304 insertions(+), 1 deletion(-) - Index: linux-stage/fs/ext3/Makefile =================================================================== ---- linux-stage.orig/fs/ext3/Makefile 2004-11-03 14:41:24.747805262 -0500 -+++ linux-stage/fs/ext3/Makefile 2004-11-03 14:41:25.123696274 -0500 +--- linux-stage.orig/fs/ext3/Makefile 2005-02-25 14:31:53.151076368 +0200 ++++ linux-stage/fs/ext3/Makefile 2005-02-25 14:41:51.259150120 +0200 @@ 
-4,7 +4,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o @@ -21,8 +13,8 @@ Index: linux-stage/fs/ext3/Makefile ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o Index: linux-stage/fs/ext3/inode.c =================================================================== ---- linux-stage.orig/fs/ext3/inode.c 2004-11-03 14:41:25.040720333 -0500 -+++ linux-stage/fs/ext3/inode.c 2004-11-03 14:46:08.458515670 -0500 +--- linux-stage.orig/fs/ext3/inode.c 2005-02-25 14:37:30.983718000 +0200 ++++ linux-stage/fs/ext3/inode.c 2005-02-25 14:47:42.069818792 +0200 @@ -37,6 +37,7 @@ #include #include @@ -31,21 +23,21 @@ Index: linux-stage/fs/ext3/inode.c #include "acl.h" /* -@@ -2401,6 +2402,9 @@ - ei->i_default_acl = EXT3_ACL_NOT_CACHED; +@@ -2408,6 +2409,9 @@ #endif ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; -+ -+ if (ext3_iopen_get_inode(inode)) -+ return; ++ if (ext3_iopen_get_inode(inode)) ++ return; ++ if (ext3_get_inode_loc(inode, &iloc, 0)) goto bad_inode; + bh = iloc.bh; Index: linux-stage/fs/ext3/iopen.c =================================================================== ---- linux-stage.orig/fs/ext3/iopen.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-stage/fs/ext3/iopen.c 2004-11-03 14:41:25.125695694 -0500 -@@ -0,0 +1,272 @@ +--- linux-2.6.5-sles9.orig/fs/ext3/iopen.c 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.6.5-sles9/fs/ext3/iopen.c 2004-11-09 02:18:27.611913312 +0300 +@@ -0,0 +1,278 @@ +/* + * linux/fs/ext3/iopen.c + * @@ -133,7 +125,7 @@ Index: linux-stage/fs/ext3/iopen.c + } + + assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(d_unhashed(dentry)); /* d_rehash */ ++ assert(d_unhashed(dentry)); /* d_rehash */ + + /* preferrably return a connected dentry */ + spin_lock(&dcache_lock); @@ -146,7 +138,9 @@ Index: linux-stage/fs/ext3/iopen.c + alternate = list_entry(inode->i_dentry.next, + struct dentry, d_alias); + dget_locked(alternate); ++ spin_lock(&alternate->d_lock); + alternate->d_vfs_flags |= DCACHE_REFERENCED; ++ 
spin_unlock(&alternate->d_lock); + iput(inode); + spin_unlock(&dcache_lock); + return alternate; @@ -202,6 +196,9 @@ Index: linux-stage/fs/ext3/iopen.c + if (!inode) + goto do_rehash; + ++ if (!test_opt(inode->i_sb, IOPEN)) ++ goto do_instantiate; ++ + /* preferrably return a connected dentry */ + list_for_each(lp, &inode->i_dentry) { + tmp = list_entry(lp, struct dentry, d_alias); @@ -218,8 +215,9 @@ Index: linux-stage/fs/ext3/iopen.c + goto do_instantiate; + + /* Move the goal to the de hash queue */ -+ goal->d_flags &= ~ DCACHE_DISCONNECTED; ++ goal->d_flags &= ~DCACHE_DISCONNECTED; + security_d_instantiate(goal, inode); ++ __d_drop(dentry); + __d_rehash(dentry, 0); + __d_move(goal, dentry); + spin_unlock(&dcache_lock); @@ -320,8 +318,8 @@ Index: linux-stage/fs/ext3/iopen.c +} Index: linux-stage/fs/ext3/iopen.h =================================================================== ---- linux-stage.orig/fs/ext3/iopen.h 1969-12-31 19:00:00.000000000 -0500 -+++ linux-stage/fs/ext3/iopen.h 2004-11-03 14:41:25.126695404 -0500 +--- linux-stage.orig/fs/ext3/iopen.h 2005-02-25 14:41:01.017787968 +0200 ++++ linux-stage/fs/ext3/iopen.h 2005-02-25 14:41:01.045783712 +0200 @@ -0,0 +1,15 @@ +/* + * iopen.h @@ -340,8 +338,8 @@ Index: linux-stage/fs/ext3/iopen.h + struct inode *inode, int rehash); Index: linux-stage/fs/ext3/namei.c =================================================================== ---- linux-stage.orig/fs/ext3/namei.c 2004-11-03 14:41:24.957744391 -0500 -+++ linux-stage/fs/ext3/namei.c 2004-11-03 14:41:25.127695114 -0500 +--- linux-stage.orig/fs/ext3/namei.c 2005-02-25 14:37:28.975023368 +0200 ++++ linux-stage/fs/ext3/namei.c 2005-02-25 14:46:43.090784968 +0200 @@ -37,6 +37,7 @@ #include #include @@ -350,7 +348,7 @@ Index: linux-stage/fs/ext3/namei.c #include "acl.h" /* -@@ -979,6 +980,9 @@ +@@ -980,6 +981,9 @@ if (dentry->d_name.len > EXT3_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -360,7 +358,7 @@ Index: linux-stage/fs/ext3/namei.c bh = 
ext3_find_entry(dentry, &de); inode = NULL; if (bh) { -@@ -989,10 +993,8 @@ +@@ -990,10 +994,8 @@ if (!inode) return ERR_PTR(-EACCES); } @@ -373,7 +371,7 @@ Index: linux-stage/fs/ext3/namei.c } -@@ -2029,10 +2031,6 @@ +@@ -2037,10 +2039,6 @@ inode->i_nlink); inode->i_version++; inode->i_nlink = 0; @@ -384,7 +382,7 @@ Index: linux-stage/fs/ext3/namei.c ext3_orphan_add(handle, inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); -@@ -2152,6 +2150,23 @@ +@@ -2163,6 +2161,23 @@ return err; } @@ -396,7 +394,7 @@ Index: linux-stage/fs/ext3/namei.c + if (!err) { + err = ext3_mark_inode_dirty(handle, inode); + if (err == 0) { -+ (void)iopen_connect_dentry(dentry, inode, 0); ++ dput(iopen_connect_dentry(dentry, inode, 0)); + return 0; + } + } @@ -408,40 +406,39 @@ Index: linux-stage/fs/ext3/namei.c static int ext3_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { -@@ -2175,7 +2190,8 @@ +@@ -2186,7 +2201,8 @@ ext3_inc_count(handle, inode); atomic_inc(&inode->i_count); - err = ext3_add_nondir(handle, dentry, inode); + err = ext3_add_link(handle, dentry, inode); -+ ext3_orphan_del(handle,inode); ++ ext3_orphan_del(handle, inode); ext3_journal_stop(handle); if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; Index: linux-stage/fs/ext3/super.c =================================================================== ---- linux-stage.orig/fs/ext3/super.c 2004-11-03 14:41:25.043719463 -0500 -+++ linux-stage/fs/ext3/super.c 2004-11-03 14:41:25.129694535 -0500 -@@ -534,7 +534,7 @@ - Opt_reservation, Opt_noreservation, Opt_noload, - Opt_commit, Opt_journal_update, Opt_journal_inum, +--- linux-stage.orig/fs/ext3/super.c 2005-02-25 14:37:30.987717392 +0200 ++++ linux-stage/fs/ext3/super.c 2005-02-25 14:44:50.495901992 +0200 +@@ -586,6 +586,7 @@ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, -- Opt_ignore, Opt_barrier, -+ Opt_ignore, Opt_barrier, Opt_iopen, 
Opt_noiopen, Opt_iopen_nopriv, + Opt_ignore, Opt_barrier, Opt_err, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, }; -@@ -577,6 +577,9 @@ + static match_table_t tokens = { +@@ -633,6 +634,9 @@ + {Opt_ignore, "noquota"}, {Opt_ignore, "quota"}, {Opt_ignore, "usrquota"}, - {Opt_barrier, "barrier=%u"}, + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_barrier, "barrier=%u"}, {Opt_err, NULL} }; - -@@ -778,6 +781,18 @@ +@@ -914,6 +918,18 @@ else clear_opt(sbi->s_mount_opt, BARRIER); break; @@ -462,14 +459,14 @@ Index: linux-stage/fs/ext3/super.c default: Index: linux-stage/include/linux/ext3_fs.h =================================================================== ---- linux-stage.orig/include/linux/ext3_fs.h 2004-05-11 17:21:20.000000000 -0400 -+++ linux-stage/include/linux/ext3_fs.h 2004-05-11 17:21:21.000000000 -0400 -@@ -326,6 +326,8 @@ - #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ - #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ - #define EXT3_MOUNT_BARRIER 0x10000 /* Use block barriers */ -+#define EXT3_MOUNT_IOPEN 0x20000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x40000 /* Make iopen world-readable */ +--- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 14:37:28.977023064 +0200 ++++ linux-stage/include/linux/ext3_fs.h 2005-02-25 14:49:00.569884968 +0200 +@@ -355,6 +355,8 @@ + #define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ + #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ + #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ ++#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git a/ldiskfs/kernel_patches/patches/iopen-2.6.12.patch b/ldiskfs/kernel_patches/patches/iopen-2.6.12.patch new file mode 100644 index 
0000000..8d456ac --- /dev/null +++ b/ldiskfs/kernel_patches/patches/iopen-2.6.12.patch @@ -0,0 +1,471 @@ +Index: linux-2.6.12-rc6/fs/ext3/Makefile +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/Makefile 2005-06-14 16:00:45.206720992 +0200 ++++ linux-2.6.12-rc6/fs/ext3/Makefile 2005-06-14 16:14:33.595382720 +0200 +@@ -4,7 +4,7 @@ + + obj-$(CONFIG_EXT3_FS) += ext3.o + +-ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +Index: linux-2.6.12-rc6/fs/ext3/inode.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/inode.c 2005-06-14 16:01:16.272150299 +0200 ++++ linux-2.6.12-rc6/fs/ext3/inode.c 2005-06-14 16:24:55.686195412 +0200 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + static int ext3_writepage_trans_blocks(struct inode *inode); +@@ -2437,6 +2438,8 @@ + ei->i_default_acl = EXT3_ACL_NOT_CACHED; + #endif + ei->i_block_alloc_info = NULL; ++ if (ext3_iopen_get_inode(inode)) ++ return; + + if (__ext3_get_inode_loc(inode, &iloc, 0)) + goto bad_inode; +Index: linux-2.6.12-rc6/fs/ext3/iopen.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/iopen.c 2005-06-14 16:14:33.530929595 +0200 ++++ linux-2.6.12-rc6/fs/ext3/iopen.c 2005-06-14 16:14:33.626632719 +0200 +@@ -0,0 +1,278 @@ ++/* ++ * linux/fs/ext3/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. 
++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. ++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. ++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. ++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "iopen.h" ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#define IOPEN_NAME_LEN 32 ++ ++/* ++ * This implements looking up an inode by number. 
++ */ ++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct inode *inode; ++ unsigned long ino; ++ struct list_head *lp; ++ struct dentry *alternate; ++ char buf[IOPEN_NAME_LEN]; ++ ++ if (dentry->d_name.len >= IOPEN_NAME_LEN) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ memcpy(buf, dentry->d_name.name, dentry->d_name.len); ++ buf[dentry->d_name.len] = 0; ++ ++ if (strcmp(buf, ".") == 0) ++ ino = dir->i_ino; ++ else if (strcmp(buf, "..") == 0) ++ ino = EXT3_ROOT_INO; ++ else ++ ino = simple_strtoul(buf, 0, 0); ++ ++ if ((ino != EXT3_ROOT_INO && ++ //ino != EXT3_ACL_IDX_INO && ++ //ino != EXT3_ACL_DATA_INO && ++ ino < EXT3_FIRST_INO(dir->i_sb)) || ++ ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) ++ return ERR_PTR(-ENOENT); ++ ++ inode = iget(dir->i_sb, ino); ++ if (!inode) ++ return ERR_PTR(-EACCES); ++ if (is_bad_inode(inode)) { ++ iput(inode); ++ return ERR_PTR(-ENOENT); ++ } ++ ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ ++ /* preferrably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, struct dentry, d_alias); ++ assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); ++ } ++ ++ if (!list_empty(&inode->i_dentry)) { ++ alternate = list_entry(inode->i_dentry.next, ++ struct dentry, d_alias); ++ dget_locked(alternate); ++ spin_lock(&alternate->d_lock); ++ alternate->d_flags |= DCACHE_REFERENCED; ++ spin_unlock(&alternate->d_lock); ++ iput(inode); ++ spin_unlock(&dcache_lock); ++ return alternate; ++ } ++ dentry->d_flags |= DCACHE_DISCONNECTED; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++ ++ d_rehash_cond(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; ++} ++ ++#define do_switch(x,y) do { \ ++ 
__typeof__ (x) __tmp = x; \ ++ x = y; y = __tmp; } while (0) ++ ++static inline void switch_names(struct dentry *dentry, struct dentry *target) ++{ ++ const unsigned char *old_name, *new_name; ++ ++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN_MIN); ++ old_name = target->d_name.name; ++ new_name = dentry->d_name.name; ++ if (old_name == target->d_iname) ++ old_name = dentry->d_iname; ++ if (new_name == dentry->d_iname) ++ new_name = target->d_iname; ++ target->d_name.name = new_name; ++ dentry->d_name.name = old_name; ++} ++ ++/* This function is spliced into ext3_lookup and does the move of a ++ * disconnected dentry (if it exists) to a connected dentry. ++ */ ++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, ++ int rehash) ++{ ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* verify this dentry is really new */ ++ assert(dentry->d_inode == NULL); ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ if (rehash) ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ assert(list_empty(&dentry->d_subdirs)); ++ ++ spin_lock(&dcache_lock); ++ if (!inode) ++ goto do_rehash; ++ ++ if (!test_opt(inode->i_sb, IOPEN)) ++ goto do_instantiate; ++ ++ /* preferrably return a connected dentry */ ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ if (tmp->d_flags & DCACHE_DISCONNECTED) { ++ assert(tmp->d_alias.next == &inode->i_dentry); ++ assert(tmp->d_alias.prev == &inode->i_dentry); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ } ++ ++ if (!goal) ++ goto do_instantiate; ++ ++ /* Move the goal to the de hash queue */ ++ goal->d_flags &= ~DCACHE_DISCONNECTED; ++ security_d_instantiate(goal, inode); ++ __d_drop(dentry); ++ d_rehash_cond(dentry, 0); ++ __d_move(goal, dentry); ++ spin_unlock(&dcache_lock); ++ iput(inode); ++ ++ return goal; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++do_instantiate: ++ 
list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++do_rehash: ++ if (rehash) ++ d_rehash_cond(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; ++} ++ ++/* ++ * These are the special structures for the iopen pseudo directory. ++ */ ++ ++static struct inode_operations iopen_inode_operations = { ++ lookup: iopen_lookup, /* BKL held */ ++}; ++ ++static struct file_operations iopen_file_operations = { ++ read: generic_read_dir, ++}; ++ ++static int match_dentry(struct dentry *dentry, const char *name) ++{ ++ int len; ++ ++ len = strlen(name); ++ if (dentry->d_name.len != len) ++ return 0; ++ if (strncmp(dentry->d_name.name, name, len)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * This function is spliced into ext3_lookup and returns 1 the file ++ * name is __iopen__ and dentry has been filled in appropriately. ++ */ ++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ ++ if (dir->i_ino != EXT3_ROOT_INO || ++ !test_opt(dir->i_sb, IOPEN) || ++ !match_dentry(dentry, "__iopen__")) ++ return 0; ++ ++ inode = iget(dir->i_sb, EXT3_BAD_INO); ++ ++ if (!inode) ++ return 0; ++ d_add(dentry, inode); ++ return 1; ++} ++ ++/* ++ * This function is spliced into read_inode; it returns 1 if inode ++ * number is the one for /__iopen__, in which case the inode is filled ++ * in appropriately. Otherwise, this fuction returns 0. 
++ */ ++int ext3_iopen_get_inode(struct inode *inode) ++{ ++ if (inode->i_ino != EXT3_BAD_INO) ++ return 0; ++ ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) ++ inode->i_mode |= 0777; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_size = 4096; ++ inode->i_atime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = CURRENT_TIME; ++ EXT3_I(inode)->i_dtime = 0; ++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size ++ * (for stat), not the fs block ++ * size */ ++ inode->i_blocks = 0; ++ inode->i_version = 1; ++ inode->i_generation = 0; ++ ++ inode->i_op = &iopen_inode_operations; ++ inode->i_fop = &iopen_file_operations; ++ inode->i_mapping->a_ops = 0; ++ ++ return 1; ++} +Index: linux-2.6.12-rc6/fs/ext3/iopen.h +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/iopen.h 2005-06-14 16:14:33.534835845 +0200 ++++ linux-2.6.12-rc6/fs/ext3/iopen.h 2005-06-14 16:14:33.633468657 +0200 +@@ -0,0 +1,15 @@ ++/* ++ * iopen.h ++ * ++ * Special support for opening files by inode number. ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. 
++ */ ++ ++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext3_iopen_get_inode(struct inode *inode); ++extern struct dentry *iopen_connect_dentry(struct dentry *dentry, ++ struct inode *inode, int rehash); +Index: linux-2.6.12-rc6/fs/ext3/namei.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/namei.c 2005-06-14 16:01:14.701837819 +0200 ++++ linux-2.6.12-rc6/fs/ext3/namei.c 2005-06-14 16:14:33.644210844 +0200 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + /* +@@ -985,6 +986,9 @@ + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + ++ if (ext3_check_for_iopen(dir, dentry)) ++ return NULL; ++ + bh = ext3_find_entry(dentry, &de); + inode = NULL; + if (bh) { +@@ -995,10 +999,8 @@ + if (!inode) + return ERR_PTR(-EACCES); + } +- if (inode) +- return d_splice_alias(inode, dentry); +- d_add(dentry, inode); +- return NULL; ++ ++ return iopen_connect_dentry(dentry, inode, 1); + } + + +@@ -2042,10 +2044,6 @@ + inode->i_nlink); + inode->i_version++; + inode->i_nlink = 0; +- /* There's no need to set i_disksize: the fact that i_nlink is +- * zero will ensure that the right thing happens during any +- * recovery. 
*/ +- inode->i_size = 0; + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); +@@ -2168,6 +2166,23 @@ + return err; + } + ++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ ++static int ext3_add_link(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ int err = ext3_add_entry(handle, dentry, inode); ++ if (!err) { ++ err = ext3_mark_inode_dirty(handle, inode); ++ if (err == 0) { ++ dput(iopen_connect_dentry(dentry, inode, 0)); ++ return 0; ++ } ++ } ++ ext3_dec_count(handle, inode); ++ iput(inode); ++ return err; ++} ++ + static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) + { +@@ -2191,7 +2206,8 @@ + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); + +- err = ext3_add_nondir(handle, dentry, inode); ++ err = ext3_add_link(handle, dentry, inode); ++ ext3_orphan_del(handle, inode); + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; +Index: linux-2.6.12-rc6/fs/ext3/super.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/super.c 2005-06-14 16:01:16.287775299 +0200 ++++ linux-2.6.12-rc6/fs/ext3/super.c 2005-06-14 16:14:33.656906156 +0200 +@@ -590,6 +590,7 @@ + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + }; + + static match_table_t tokens = { +@@ -638,6 +639,9 @@ + {Opt_ignore, "noquota"}, + {Opt_ignore, "quota"}, + {Opt_ignore, "usrquota"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -921,6 +925,18 @@ + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; ++ case Opt_iopen: ++ set_opt 
(sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_noiopen: ++ clear_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_iopen_nopriv: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; + case Opt_ignore: + break; + case Opt_resize: +Index: linux-2.6.12-rc6/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_fs.h 2005-06-14 16:01:14.709650318 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_fs.h 2005-06-14 16:28:38.452794245 +0200 +@@ -358,6 +358,8 @@ + #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ + #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ + #define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */ ++#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc3.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc3.series index b4608a9..3661023 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc3.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc3.series @@ -1,19 +1,13 @@ -ext3-pdirops-2.6.10-fc3.patch -ext3-wantedi-2.6.10-fc3.patch +ext3-wantedi-2.6-rhel4.patch ext3-san-jdike-2.6-suse.patch -iopen-2.6.10-fc3.patch -export_symbols-ext3-2.6.10-fc3.patch +iopen-2.6-rhel4.patch +export_symbols-ext3-2.6-suse.patch ext3-map_inode_page-2.6-suse.patch -ext3-init-generation-2.6-suse.patch -ext3-ea-in-inode-2.6-fc3.patch -export-ext3-2.6.10-fc3.patch -ext3-include-fixes-2.6-suse.patch -ext3-extents-2.6.10-fc3.patch -#ext3-extents-in-ea-2.6.10-fc3.patch -#ext3-extents-in-ea-ioctl-2.6.10-fc3.patch -#ext3-extents-in-ea-exports-symbol-2.6.7.patch -ext3-mds-num-2.6.10-fc3.patch 
-ext3-fid-2.6.7.patch -ext3-raw-lookup-2.6.10.patch -ext3-disable-reservation-2.6.10-fc3.patch -ext3-mballoc2-2.6.10-fc3.patch +ext3-ea-in-inode-2.6-rhel4.patch +export-ext3-2.6-rhel4.patch +ext3-include-fixes-2.6-rhel4.patch +ext3-extents-2.6.9-rhel4.patch +ext3-mballoc2-2.6.9-rhel4.patch +ext3-nlinks-2.6.9.patch +ext3-ialloc-2.6.patch +ext3-lookup-dotdot-2.6.9.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc5.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc5.series new file mode 100644 index 0000000..1c853bd --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc5.series @@ -0,0 +1,12 @@ +ext3-wantedi-2.6-rhel4.patch +ext3-san-jdike-2.6-suse.patch +iopen-2.6-fc5.patch +ext3-map_inode_page-2.6-suse.patch +export-ext3-2.6-rhel4.patch +ext3-include-fixes-2.6-rhel4.patch +ext3-extents-2.6.15.patch +ext3-mballoc2-2.6-fc5.patch +ext3-nlinks-2.6.9.patch +ext3-ialloc-2.6.patch +ext3-remove-cond_resched-calls-2.6.12.patch +ext3-filterdata-2.6.15.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series new file mode 100644 index 0000000..7829040 --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series @@ -0,0 +1,15 @@ +ext3-wantedi-2.6-rhel4.patch +ext3-san-jdike-2.6-suse.patch +iopen-2.6-rhel4.patch +export_symbols-ext3-2.6-suse.patch +ext3-map_inode_page-2.6-suse.patch +ext3-ea-in-inode-2.6-rhel4.patch +export-ext3-2.6-rhel4.patch +ext3-include-fixes-2.6-rhel4.patch +ext3-extents-2.6.9-rhel4.patch +ext3-mballoc2-2.6.9-rhel4.patch +ext3-nlinks-2.6.9.patch +ext3-ialloc-2.6.patch +ext3-lookup-dotdot-2.6.9.patch +ext3-sector_t-overflow-2.6.9-rhel4.patch +ext3-check-jbd-errors-2.6.9.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles10.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles10.series new file mode 100644 index 0000000..bfba5fb --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles10.series @@ -0,0 +1,13 @@ 
+ext3-wantedi-2.6-rhel4.patch +ext3-san-jdike-2.6-suse.patch +iopen-2.6-fc5.patch +ext3-map_inode_page-2.6-suse.patch +export-ext3-2.6-rhel4.patch +ext3-include-fixes-2.6-rhel4.patch +ext3-extents-2.6.15.patch +ext3-mballoc2-2.6-fc5.patch +ext3-nlinks-2.6.9.patch +ext3-ialloc-2.6.patch +ext3-remove-cond_resched-calls-2.6.12.patch +ext3-filterdata-2.6.15.patch +ext3-disable-write-barrier-by-default-2.6-sles10.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series index d27088e..6a5f05d 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series @@ -1,10 +1,16 @@ ext3-wantedi-2.6-suse.patch ext3-san-jdike-2.6-suse.patch -iopen-2.6-suse.patch +iopen-2.6-suse.patch export_symbols-ext3-2.6-suse.patch ext3-map_inode_page-2.6-suse.patch -ext3-init-generation-2.6-suse.patch ext3-ea-in-inode-2.6-suse.patch export-ext3-2.6-suse.patch ext3-include-fixes-2.6-suse.patch -ext3-htree-rename_fix.patch +ext3-extents-2.6.5.patch +ext3-mballoc2-2.6-suse.patch +ext3-nlinks-2.6.7.patch +ext3-rename-reserve-2.6-suse.patch +ext3-ialloc-2.6.patch +ext3-lookup-dotdot-2.6.9.patch +ext3-sector_t-overflow-2.6.5-suse.patch +ext3-check-jbd-errors-2.6.5.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series new file mode 100644 index 0000000..53c060b --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series @@ -0,0 +1,15 @@ +ext3-wantedi-2.6-rhel4.patch +ext3-san-jdike-2.6-suse.patch +iopen-2.6.12.patch +ext3-map_inode_page-2.6-suse.patch +export-ext3-2.6-rhel4.patch +ext3-include-fixes-2.6-rhel4.patch +ext3-extents-2.6.12.patch +ext3-mballoc2-2.6.12.patch +ext3-nlinks-2.6.9.patch +ext3-ialloc-2.6.patch +ext3-remove-cond_resched-calls-2.6.12.patch +ext3-htree-dot-2.6.patch +ext3-external-journal-2.6.12.patch +ext3-lookup-dotdot-2.6.9.patch 
+ext3-sector_t-overflow-2.6.12.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6.18-vanilla.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6.18-vanilla.series new file mode 100644 index 0000000..f379cec --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6.18-vanilla.series @@ -0,0 +1,13 @@ +ext3-wantedi-2.6-rhel4.patch +ext3-san-jdike-2.6-suse.patch +iopen-2.6-fc5.patch +ext3-map_inode_page-2.6-suse.patch +export-ext3-2.6-rhel4.patch +ext3-include-fixes-2.6-rhel4.patch +ext3-extents-2.6.18-vanilla.patch +ext3-mballoc2-2.6.18-vanilla.patch +ext3-nlinks-2.6.9.patch +ext3-ialloc-2.6.patch +ext3-remove-cond_resched-calls-2.6.12.patch +ext3-filterdata-2.6.15.patch +ext3-multi-mount-protection-2.6.18-vanilla.patch diff --git a/ldiskfs/ldiskfs/Makefile.in b/ldiskfs/ldiskfs/Makefile.in index acf0b20..e52e62f 100644 --- a/ldiskfs/ldiskfs/Makefile.in +++ b/ldiskfs/ldiskfs/Makefile.in @@ -2,17 +2,28 @@ default: all MODULES := ldiskfs +@QUOTA_TRUE@MODULES += quotafmt_test + # copy makefile over to not break patches ext3_extra := $(wildcard @LINUX@/fs/ext3/Makefile) ext3_headers := $(wildcard @LINUX@/fs/ext3/*.h) linux_headers := $(wildcard @LINUX@/include/linux/ext3*.h) -new_linux_hearders := ext3_extents.h + ext3_sources := $(filter-out %.mod.c,$(wildcard @LINUX@/fs/ext3/*.c)) -new_sources := iopen.c iopen.h extents.c extents-in-ea.c mballoc.c -ldiskfs_sources := $(notdir $(ext3_sources) $(ext3_headers)) $(new_sources) +new_sources := iopen.c iopen.h extents.c mballoc.c +new_headers := ext3_extents.h +ldiskfs_patched_sources := $(notdir $(ext3_sources) $(ext3_headers)) $(new_sources) $(new_headers) +ldiskfs_sources := $(ldiskfs_patched_sources) + +quotafmt_sources := lustre_quota_fmt.c +quotafmt_headers := lustre_quota_fmt.h +@QUOTA_TRUE@ldiskfs_sources += $(quotafmt_sources) $(quotafmt_headers) + ldiskfs-objs := $(filter %.o,$(ldiskfs_sources:.c=.o)) +@QUOTA_TRUE@quotafmt-objs := quotafmt_test.o + EXTRA_PRE_CFLAGS := -I@LINUX@/fs -I@LUSTRE@ 
-I@LUSTRE@/ldiskfs @INCLUDE_RULES@ diff --git a/ldiskfs/ldiskfs/autoMakefile.am b/ldiskfs/ldiskfs/autoMakefile.am index 0d82a44..7e378c2 100644 --- a/ldiskfs/ldiskfs/autoMakefile.am +++ b/ldiskfs/ldiskfs/autoMakefile.am @@ -6,12 +6,14 @@ endif ldiskfs_linux_headers := $(addprefix linux/,$(subst ext3,ldiskfs,$(notdir $(linux_headers)))) -$(filter %.c,$(ldiskfs_sources)): sources $(ldiskfs_linux_headers) $(filter %.h,$(ldiskfs_sources)) +$(filter %.c,$(ldiskfs_patched_sources)): sources $(ldiskfs_linux_headers) $(filter %.h,$(ldiskfs_patched_sources)) ldiskfs_sed_flags = \ -e "s/dx_hash_info/ext3_dx_hash_info/g" \ -e "s/dir_private_info/ext3_dir_private_info/g" \ -e "s/DX_HASH/EXT3_DX_HASH/g" \ + -e "s/reserve_window/ext3_reserve_window/g" \ + -e "s/rsv_window_add/ext3_rsv_window_add/g" \ -e "s/EXT3/LDISKFS/g" -e "s/ext3/ldiskfs/g" %.c: linux-stage/fs/ext3/%.c @@ -36,17 +38,17 @@ sources: $(ext3_sources) $(ext3_headers) $(linux_headers) $(series) cp $(ext3_sources) $(ext3_headers) $(ext3_extra) linux-stage/fs/ext3 cp $(linux_headers) linux-stage/include/linux if USE_QUILT - cd linux-stage && quilt setup -l ../$(series) -d ../$(patches) + ln -s ../$(patches) linux-stage/patches + ln -s ../$(series) linux-stage/series cd linux-stage && quilt push -a -q else @echo -n "Applying ext3 patches:" @cd linux-stage && for i in $$(<../$(series)) ; do \ - echo -n " $$i" ; \ - patch -s -p1 < ../$(patches)/$$i || exit 1 ; \ + echo -n " $$i" ; \ + patch -s -p1 < ../$(patches)/$$i || exit 1 ; \ done @echo endif - mkdir linux @echo -n "Replacing 'ext3' with 'ldiskfs':" @for i in $(notdir $(ext3_headers) $(ext3_sources)) $(new_sources) ; do \ @@ -54,7 +56,7 @@ endif sed $(strip $(ldiskfs_sed_flags)) \ linux-stage/fs/ext3/$$i > $$i ; \ done - @for i in $(subst ext3,,$(notdir $(linux_headers)) $(new_linux_hearders)) ; do \ + @for i in $(subst ext3,,$(notdir $(linux_headers) $(new_headers))) ; do \ echo -n " ext3$$i" ; \ sed $(strip $(ldiskfs_sed_flags)) \ 
linux-stage/include/linux/ext3$$i \ @@ -72,7 +74,9 @@ foo-check: @echo "ldiskfs_LDADD: $(ldiskfs_LDADD)" MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -CLEANFILES = sources *.c *.h +CLEANFILES = sources $(notdir $(linux_headers) $(ext3_headers) $(ext3_sources) $(new_sources) $(new_headers)) + +EXTRA_DIST := lustre_quota_fmt.c lustre_quota_fmt.h quotafmt_test.c clean: clean-am rm -rf linux linux-stage diff --git a/lustre/BUGS b/lustre/BUGS index 9cf6fa2..ba84777 100644 --- a/lustre/BUGS +++ b/lustre/BUGS @@ -1,15 +1 @@ -include /dev/obd in the documentation - - -attach: attaching ext2obd allows ext2 module to be unloaded. Unload, -then do cleanup, get Oops... - -syncing: invalid IOCTL - -create: more than one object - -preallocate: IOCTL - -statfs: - -restoresnap: decrements directory count for ext2 +To report bugs, please visit http://bugzilla.clusterfs.com/ diff --git a/lustre/COPYING b/lustre/COPYING deleted file mode 100644 index c69cfd8..0000000 --- a/lustre/COPYING +++ /dev/null @@ -1,352 +0,0 @@ - - NOTE! This copyright does *not* cover user programs that use kernel - services by normal system calls - this is merely considered normal use - of the kernel, and does *not* fall under the heading of "derived work". - Also note that the GPL below is copyrighted by the Free Software - Foundation, but the instance of code that it refers to (the Linux - kernel) is copyrighted by me and others who actually wrote it. - - Linus Torvalds - ----------------------------------------- - - GNU GENERAL PUBLIC LICENSE - Version 2, June 1991 - - Copyright (C) 1989, 1991 Free Software Foundation, Inc. - 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. 
By contrast, the GNU General Public -License is intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. (Some other Free Software Foundation software is covered by -the GNU Library General Public License instead.) You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. - - To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the rights. -These restrictions translate to certain responsibilities for you if you -distribute copies of the software, or if you modify it. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must give the recipients all the rights that -you have. You must make sure that they, too, receive or can get the -source code. And you must show them these terms so they know their -rights. - - We protect your rights with two steps: (1) copyright the software, and -(2) offer you this license which gives you legal permission to copy, -distribute and/or modify the software. - - Also, for each author's protection and ours, we want to make certain -that everyone understands that there is no warranty for this free -software. 
If the software is modified by someone else and passed on, we -want its recipients to know that what they have is not the original, so -that any problems introduced by others will not reflect on the original -authors' reputations. - - Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that redistributors of a free -program will individually obtain patent licenses, in effect making the -program proprietary. To prevent this, we have made it clear that any -patent must be licensed for everyone's free use or not licensed at all. - - The precise terms and conditions for copying, distribution and -modification follow. - - GNU GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License applies to any program or other work which contains -a notice placed by the copyright holder saying it may be distributed -under the terms of this General Public License. The "Program", below, -refers to any such program or work, and a "work based on the Program" -means either the Program or any derivative work under copyright law: -that is to say, a work containing the Program or a portion of it, -either verbatim or with modifications and/or translated into another -language. (Hereinafter, translation is included without limitation in -the term "modification".) Each licensee is addressed as "you". - -Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running the Program is not restricted, and the output from the Program -is covered only if its contents constitute a work based on the -Program (independent of having been made by running the Program). -Whether that is true depends on what the Program does. - - 1. 
You may copy and distribute verbatim copies of the Program's -source code as you receive it, in any medium, provided that you -conspicuously and appropriately publish on each copy an appropriate -copyright notice and disclaimer of warranty; keep intact all the -notices that refer to this License and to the absence of any warranty; -and give any other recipients of the Program a copy of this License -along with the Program. - -You may charge a fee for the physical act of transferring a copy, and -you may at your option offer warranty protection in exchange for a fee. - - 2. You may modify your copy or copies of the Program or any portion -of it, thus forming a work based on the Program, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) You must cause the modified files to carry prominent notices - stating that you changed the files and the date of any change. - - b) You must cause any work that you distribute or publish, that in - whole or in part contains or is derived from the Program or any - part thereof, to be licensed as a whole at no charge to all third - parties under the terms of this License. - - c) If the modified program normally reads commands interactively - when run, you must cause it, when started running for such - interactive use in the most ordinary way, to print or display an - announcement including an appropriate copyright notice and a - notice that there is no warranty (or else, saying that you provide - a warranty) and that users may redistribute the program under - these conditions, and telling the user how to view a copy of this - License. (Exception: if the Program itself is interactive but - does not normally print such an announcement, your work based on - the Program is not required to print an announcement.) - -These requirements apply to the modified work as a whole. 
If -identifiable sections of that work are not derived from the Program, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Program, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Program. - -In addition, mere aggregation of another work not based on the Program -with the Program (or with a work based on the Program) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may copy and distribute the Program (or a work based on it, -under Section 2) in object code or executable form under the terms of -Sections 1 and 2 above provided that you also do one of the following: - - a) Accompany it with the complete corresponding machine-readable - source code, which must be distributed under the terms of Sections - 1 and 2 above on a medium customarily used for software interchange; or, - - b) Accompany it with a written offer, valid for at least three - years, to give any third party, for a charge no more than your - cost of physically performing source distribution, a complete - machine-readable copy of the corresponding source code, to be - distributed under the terms of Sections 1 and 2 above on a medium - customarily used for software interchange; or, - - c) Accompany it with the information you received as to the offer - to distribute corresponding source code. 
(This alternative is - allowed only for noncommercial distribution and only if you - received the program in object code or executable form with such - an offer, in accord with Subsection b above.) - -The source code for a work means the preferred form of the work for -making modifications to it. For an executable work, complete source -code means all the source code for all modules it contains, plus any -associated interface definition files, plus the scripts used to -control compilation and installation of the executable. However, as a -special exception, the source code distributed need not include -anything that is normally distributed (in either source or binary -form) with the major components (compiler, kernel, and so on) of the -operating system on which the executable runs, unless that component -itself accompanies the executable. - -If distribution of executable or object code is made by offering -access to copy from a designated place, then offering equivalent -access to copy the source code from the same place counts as -distribution of the source code, even though third parties are not -compelled to copy the source along with the object code. - - 4. You may not copy, modify, sublicense, or distribute the Program -except as expressly provided under this License. Any attempt -otherwise to copy, modify, sublicense or distribute the Program is -void, and will automatically terminate your rights under this License. -However, parties who have received copies, or rights, from you under -this License will not have their licenses terminated so long as such -parties remain in full compliance. - - 5. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Program or its derivative works. These actions are -prohibited by law if you do not accept this License. 
Therefore, by -modifying or distributing the Program (or any work based on the -Program), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Program or works based on it. - - 6. Each time you redistribute the Program (or any work based on the -Program), the recipient automatically receives a license from the -original licensor to copy, distribute or modify the Program subject to -these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties to -this License. - - 7. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Program at all. For example, if a patent -license would not permit royalty-free redistribution of the Program by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Program. - -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply and the section as a whole is intended to apply in other -circumstances. 
- -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system, which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 8. If the distribution and/or use of the Program is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Program under this License -may add an explicit geographical distribution limitation excluding -those countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - - 9. The Free Software Foundation may publish revised and/or new versions -of the General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - -Each version is given a distinguishing version number. If the Program -specifies a version number of this License which applies to it and "any -later version", you have the option of following the terms and conditions -either of that version or of any later version published by the Free -Software Foundation. If the Program does not specify a version number of -this License, you may choose any version ever published by the Free Software -Foundation. - - 10. 
If you wish to incorporate parts of the Program into other free -programs whose distribution conditions are different, write to the author -to ask for permission. For software which is copyrighted by the Free -Software Foundation, write to the Free Software Foundation; we sometimes -make exceptions for this. Our decision will be guided by the two goals -of preserving the free status of all derivatives of our free software and -of promoting the sharing and reuse of software generally. - - NO WARRANTY - - 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY -FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN -OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES -PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED -OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS -TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE -PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, -REPAIR OR CORRECTION. - - 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR -REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, -INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING -OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED -TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY -YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER -PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE -POSSIBILITY OF SUCH DAMAGES. 
- - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - - Copyright (C) 19yy - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - -Also add information on how to contact you by electronic and paper mail. - -If the program is interactive, make it output a short notice like this -when it starts in an interactive mode: - - Gnomovision version 69, Copyright (C) 19yy name of author - Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. 
Of course, the commands you use may -be called something other than `show w' and `show c'; they could even be -mouse-clicks or menu items--whatever suits your program. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the program, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the program - `Gnomovision' (which makes passes at compilers) written by James Hacker. - - , 1 April 1989 - Ty Coon, President of Vice - -This General Public License does not permit incorporating your program into -proprietary programs. If your program is a subroutine library, you may -consider it more useful to permit linking proprietary applications with the -library. If this is what you want to do, use the GNU Library General -Public License instead of this License. diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 8fdfc30..19f16fa 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -1,69 +1,2211 @@ tbd Cluster File Systems, Inc. - * version 1.3.4 + * version 1.6.0 + * CONFIGURATION CHANGE. This version of Lustre WILL NOT + INTEROPERATE with older versions automatically. In many cases a + special upgrade step is needed. Please read the + user documentation before upgrading any part of a live system. + * WIRE PROTOCOL CHANGE from previous 1.6 beta versions. This + version will not interoperate with 1.6 betas before beta7 (1.5.97). + * WARNING: Lustre configuration and startup changes are required with + this release. See https://mail.clusterfs.com/wikis/lustre/MountConf + for details. 
+ * Recommended e2fsprogs version: 1.39.cfs2-0 * bug fixes - - fixes from lustre 1.2.8 - - print NAL number in %x format (4645) - - the watchdog thread now runs as interruptible (5246) - - drop import inflight refcount on signal_completed_replay error (5255) - * miscellania - - add pid to ldlm debugging output (4922) - - add --disable-server and --disable-client configure options (5786) -2004-10-08 Cluster File Systems, Inc. - * version 1.3.3 +Severity : enhancement +Bugzilla : 8007 +Description: MountConf +Details : Lustre configuration is now managed via mkfs and mount + commands instead of lmc and lconf. New obd types (MGS, MGC) + are added for dynamic configuration management. See + https://mail.clusterfs.com/wikis/lustre/MountConf for + details. + +Severity : enhancement +Bugzilla : 4482 +Description: dynamic OST addition +Details : OSTs can now be added to a live filesystem + +Severity : enhancement +Bugzilla : 9851 +Description: startup order invariance +Details : MDTs and OSTs can be started in any order. Clients only + require the MDT to complete startup. + +Severity : enhancement +Bugzilla : 4899 +Description: parallel, asynchronous orphan cleanup +Details : orphan cleanup is now performed in separate threads for each + OST, allowing parallel non-blocking operation. + +Severity : enhancement +Bugzilla : 9862 +Description: optimized stripe assignment +Details : stripe assignments are now made based on ost space available, + ost previous usage, and OSS previous usage, in order to try + to optimize storage space and networking resources. + +Severity : enhancement +Bugzilla : 4226 +Description: Permanently set tunables +Details : All writable /proc/fs/lustre tunables can now be permanently + set on a per-server basis, at mkfs time or on a live + system. + +Severity : enhancement +Bugzilla : 10547 +Description: Lustre message v2 +Details : Add lustre message format v2. 
+ +Severity : enhancement +Bugzilla : 9866 +Description: client OST exclusion list +Details : Clients can be started with a list of OSTs that should be + declared "inactive" for known non-responsive OSTs. + +Severity : minor +Frequency : SFS test only (otherwise harmless) +Bugzilla : 6062 +Description: SPEC SFS validation failure on NFS v2 over lustre. +Details : Changes the blocksize for regular files to be 2x RPC size, + and not depend on stripe size. + +Severity : enhancement +Bugzilla : 9293 +Description: Multiple MD RPCs in flight. +Details : Further unserialise some read-only MDS RPCs - learn about intents. + To avoid overly-overloading MDS, introduce a limit on number of + MDS RPCs in flight for a single client and add /proc controls + to adjust this limit. + +Severity : enhancement +Bugzilla : 22484 +Description: client read/write statistics +Details : Add client read/write call usage stats for performance + analysis of user processes. + /proc/fs/lustre/llite/*/offset_stats shows non-sequential + file access. extents_stats shows chunk size distribution. + extents_stats_per_process show chunk size distribution per + user process. 
+ +Severity : enhancement +Bugzilla : 22485 +Description: per-client statistics on server +Details : Add ldlm and operations statistics for each client in + /proc/fs/lustre/mds|obdfilter/*/exports/ + +Severity : enhancement +Bugzilla : 22486 +Description: mds statistics +Details : Add detailed mds operations statistics in + /proc/fs/lustre/mds/*/stats + +Severity : enhancement +Bugzilla : 10968 +Description: VFS operations stats +Details : Add client VFS call stats, trackable by pid, ppid, or gid + /proc/fs/lustre/llite/*/vfs_ops_stats + /proc/fs/lustre/llite/*/track_[pid|ppid|gid] + +Severity : minor +Frequency : always +Bugzilla : 6380 +Description: Fix client-side osc byte counters +Details : The osc read/write byte counters in + /proc/fs/lustre/osc/*/stats are now working + +Severity : minor +Frequency : always as root on SLES +Bugzilla : 10667 +Description: Failure of copying files with lustre special EAs. +Details : Client side always return success for setxattr call for lustre + special xattr (currently only "trusted.lov"). + +Severity : minor +Frequency : always +Bugzilla : 10345 +Description: Refcount LNET uuids +Details : The global LNET uuid list grew linearly with every startup; + refcount repeated list entries instead of always adding to + the list. + +Severity : enhancement +Bugzilla : 2258 +Description: Dynamic service threads +Details : Within a small range, start extra service threads + automatically when the request queue builds up. + +Severity : major +Frequency : mixed-endian client/server environments +Bugzilla : 11214 +Description: mixed-endian crashes +Details : The new msg_v2 system had some failures in mixed-endian + environments. + +Severity : enhancement +Bugzilla : 11229 +Description: Easy OST removal +Details : OSTs can be permanently deactivated with e.g. 
'lctl
+ conf_param lustre-OST0001.osc.active=0'
+
+Severity : enhancement
+Bugzilla : 11335
+Description: MGS proc entries
+Details : Added basic proc entries for the MGS showing what filesystems
+ are served.
+
+Severity : enhancement
+Bugzilla : 10998
+Description: provide MGS failover
+Details : Added config lock reacquisition after MGS server failover.
+
+Severity : enhancement
+Bugzilla : 11461
+Description: add Linux 2.4 support
+Details : Added support for RHEL 2.4.21 kernel for 1.6 servers and clients
+
+Severity : normal
+Bugzilla : 11330
+Description: a large application tries to do I/O to the same resource and dies
+ in the middle of it.
+Details : Check the req->rq_arrival time after the call to
+ ost_brw_lock_get(), but before we do anything about
+ processing it & sending the BULK transfer request. This
+ should help move old stale pending locks off the queue as
+ quickly as obd_timeout.
+
+Severity : major
+Frequency : when an incorrect nid is specified during startup
+Bugzilla : 10743
+Description: ptlrpc connect to non-existent node causes kernel crash
+Details : LNET can't be re-entered from an event callback, which
+ happened when we expire a message after the export has been
+ cleaned up. Instead, hand the zombie cleanup off to another
+ thread.
+
+Severity : normal
+Bugzilla : 10214
+Description: make O_SYNC work on 2.6 kernels
+Details : 2.6 kernels use a different method for marking pages for write,
+ so code was added to Lustre to make O_SYNC work.
+
+Severity : minor
+Frequency : always
+Bugzilla : 11110
+Description: Failure to close file and release space on NFS
+Details : Put inode details into lock acquired in ll_intent_file_open.
+ Use mdc_intent_lock in ll_intent_open to properly + detect all kind of errors unhandled by mdc_enqueue + +Severity : enhancement +Bugzilla : 10902 +Description: plain/inodebits lock performance improvement +Details : Grouping plain/inodebits in granted list by their request modes + and bits policy, thus improving the performance of search through + the granted list. + +Severity : major +Frequency : rare +Bugzilla : 10866 +Description: proc file read during shutdown sometimes raced obd removal, + causing node crash +Details : Add lock to prevent obd access after proc file removal + +Severity : normal +Bugzilla : 11237 +Description: improperly doing page alignment of locks +Details : Modify lustre core code to use CFS_PAGE_* defines instead of + PAGE_*. Make CFS_PAGE_MASK 64bit long. + +------------------------------------------------------------------------------ + +TBD Cluster File Systems, Inc. + * version 1.4.9 + * Support for kernels: + 2.6.9-42.0.3EL (RHEL 4) + 2.6.5-7.276 (SLES 9) + 2.4.21-40.0.1.EL (RHEL 3) + 2.6.12.6 vanilla (kernel.org) * bug fixes - - properly handle portals process identifiers in messages (4165) - - finish default directory EA handling (3048) - - fixes from lustre 1.2.7 - - removed PTL_MD_KIOV usage under CRAY_PORTALS (4420) - - allow EADDRNOTAVAIL as retry for connect in liblustre tcpnal (4822) -2004-09-16 Cluster File Systems, Inc. - * version 1.3.2 +Severity : critical +Frequency : rare +Bugzilla : 11125 +Description: "went back in time" messages on mds failover +Details : The greatest transno may be lost when the current operation + finishes with an error (transno==0) and the client's last_rcvd + record is over-written. Save the greatest transno in the + mds_last_transno for this case. 
+ +Severity : minor +Frequency : always for specific kernels and striping counts +Bugzilla : 11042 +Description: client may get "Matching packet too big" without ACL support +Details : Clients compiled without CONFIG_FS_POSIX_ACL get an error message + when trying to access files in certain configurations. The + clients should in fact be denied when mounting because they do + not understand ACLs. + +Severity : major +Frequency : Cray XT3 with more than 4000 clients and multiple jobs +Bugzilla : 10906 +Description: many clients connecting with IO in progress causes connect timeouts +Details : Avoid synchronous journal commits to avoid delays caused by many + clients connecting/disconnecting when bulk IO is in progress. + Queue liblustre connect requests on OST_REQUEST_PORTAL instead of + OST_IO_PORTAL to avoid delays behind potentially many pending + slow IO requests. + +Severity : normal +Frequency : occasionally with multiple writers to a single file +Bugzilla : 11081 +Description: shared writes to file may result in wrong size reported by stat() +Details : Allow growing of kms when extent lock is cancelled + +Severity : minor +Frequency : always with random mmap IO to multi-striped file +Bugzilla : 10919 +Description: mmap write might be lost if we are writing to a 'hole' in stripe +Details : Only if the hole is at the end of OST object so that kms is too + small. Fix is to increase kms accordingly in ll_nopage. + +Severity : normal +Frequency : rare, only if OST filesystem is inconsistent with MDS filesystem +Bugzilla : 11211 +Description: writes to a missing object would leak memory on the OST +Details : If there is an inconsistency between the MDS and OST filesystems, + such that the MDS references an object that doesn't exist, writes + to that object will leak memory due to incorrect cleanup in the + error handling path, eventually running out of memory on the OST. 
+
+Severity : minor
+Frequency : rare
+Bugzilla : 11040
+Description: Creating too long symlink causes lustre errors
+Details : Check symlink and name lengths before sending requests to MDS.
+
+Severity : normal
+Frequency : only if flock is enabled (not on by default)
+Bugzilla : 11415
+Description: posix locks not released on fd closure on 2.6.9+
+Details : We failed to add posix locks to list of inode locks on 2.6.9+
+ kernels, this caused such locks not to be released on fd close and
+ then assertions on fs unmount about still used locks.
+
+Severity : minor
+Frequency : MDS failover only, very rarely
+Bugzilla : 11277
+Description: clients may get ASSERTION(granted_lock != NULL)
+Details : When request was taking a long time, and a client was resending
+ a getattr by name lock request. There were multiple lock
+ requests with the same client lock handle and
+ mds_getattr_name->fixup_handle_for_resent_request found one
+ of the lock handles but later failed with
+ ASSERTION(granted_lock != NULL).
+
+Severity : major
+Frequency : rare
+Bugzilla : 10891
+Description: handle->h_buffer_credits > 0, assertion failure
+Details : h_buffer_credits is zero after truncate, causing assertion
+ failure. This patch extends the transaction or creates a new
+ one after truncate.
+
+Severity : normal
+Frequency : NFS re-export or patchless client
+Bugzilla : 11179, 10796
+Description: Crash on NFS re-export node (__d_move)
+Details : We do not want to hash the dentry if we don't have a lock.
+ But if this dentry is later used in d_move, we'd hit uninitialised
+ list head d_hash, so we just do this to init d_hash field but
+ leave dentry unhashed.
+
+Severity : normal
+Frequency : NFS re-export or patchless client
+Bugzilla : 11135
+Description: NFS exports have problems with symbolic links
+Details : lustre client didn't properly install dentry when re-exported
+ to NFS or running patchless client.
+
+Severity : normal
+Frequency : NFS re-export or patchless client
+Bugzilla : 10796
+Description: Various nfs/patchless fixes.
+Details : fixes reuse of a disconnected alias for the lookup process - this fixes
+ warning "find_exported_dentry: npd != pd", fix permission
+ error with open files at nfs.
+
+Severity : normal
+Frequency : occasional
+Bugzilla : 11191
+Description: Crash on NFS re-export node
+Details : call clear_page on wrong pointer triggered oops in
+ generic_mapping_read().
+
+Severity : normal
+Frequency : rarely, using O_DIRECT IO
+Bugzilla : 10903
+Description: unaligned directio crashes client with LASSERT
+Details : check for unaligned buffers before trying any requests.
+
+Severity : major
+Frequency : rarely, using CFS RAID5 patches in non-standard kernel series
+Bugzilla : 11313
+Description: stale data returned from RAID cache
+Details : If only a small amount of IO is done to the RAID device before
+ reading it again it is possible to get stale data from the RAID
+ cache instead of reading it from disk.
+
+Severity : major
+Frequency : depends on arch, kernel and compiler version, always on sles10
+ kernel and x86_64
+Bugzilla : 11562
+Description: recursive or deep enough symlinks cause stack overflow
+Details : getting rid of large stack-allocated variable in
+ __vfs_follow_link
+
+Severity : minor
+Frequency : depends on hardware
+Bugzilla : 11540
+Description: lustre write performance loss in the SLES10 kernel
+Details : the performance loss is caused by the use of write barriers in the
+ ext3 code. The SLES10 kernel turns barrier support on by
+ default. The fix is to undo that change for ldiskfs.
+
+------------------------------------------------------------------------------
+
+2006-12-09 Cluster File Systems, Inc.
+ * version 1.4.8 + * Support for kernels: + 2.6.9-42.0.3EL (RHEL 4) + 2.6.5-7.276 (SLES 9) + 2.4.21-47.0.1.EL (RHEL 3) + 2.6.12.6 vanilla (kernel.org) * bug fixes - - many liblustre fixes - - fixes from lustre 1.2.6 - * miscellania - - update to new libsysio-head-0806 - - reorganization of lov code -2004-08-30 Cluster File Systems, Inc. - * version 1.3.1 +Severity : major +Frequency : quota enabled and large files being deleted +Bugzilla : 10707 +Description: releasing more than 4GB of quota at once hangs OST +Details : If a user deletes more than 4GB of files on a single OST it + will cause the OST to spin in an infinite loop. Release + quota in < 4GB chunks, or use a 64-bit value for 1.4.7.1+. + +Severity : minor +Frequency : rare +Bugzilla : 10845 +Description: statfs data retrieved from /proc may be stale or zero +Details : When reading per-device statfs data from /proc, in the + {kbytes,files}_{total,free,avail} files, it may appear + as zero or be out of date. + +Severity : normal +Frequency : always, for aggregate stripe size over 4GB +Bugzilla : 10725 +Description: "lfs setstripe" fails assertion when setting 4GB+ stripe width +Details : Using "lfs setstripe" to set stripe size * stripe count over 4GB + will fail the kernel with "ASSERTION(lsm->lsm_xfersize != 0)" + +Severity : minor +Frequency : always if "lfs find" used on a local file/directory +Bugzilla : 10864 +Description: "lfs find" segfaults if used on a local file/directory +Details : The case where a directory component was not specified wasn't + handled correctly. Handle this properly. + +Severity : normal +Frequency : always on ppc64 +Bugzilla : 10634 +Description: the write to an ext3 filesystem mounted with mballoc got stuck +Details : ext3_mb_generate_buddy() uses find_next_bit() which does not + perform endianness conversion. 
+ +Severity : major +Frequency : rarely (truncate to non-zero file size after write under load) +Bugzilla : 10730, 10687 +Description: Files padded with zeros to next 4K multiple +Details : With filesystems mounted using the "extents" option (2.6 kernels) + it is possible that files that are truncated to a non-zero size + immediately after being written are filled with zero bytes beyond + the truncated size. No file data is lost. + +Severity : enhancement +Frequency : liblustre only +Bugzilla : 10452 +Description: Allow recovery/failover for liblustre clients. +Details : liblustre clients were unaware of failover configurations until + now. + +Severity : enhancement +Bugzilla : 10743 +Description: user file locks should fail when not mounting with flock option +Details : Set up an error-returning stub in ll_file_operations.lock field + to prevent incorrect behaviour when client is mounted without + flock option. Also, set up properly f_op->flock field for + RHEL4 kernels. + +Severity : minor +Frequency : always on ia64 +Bugzilla : 10905 +Description: "lfs df" loops on printing out MDS statfs information +Details : The obd_ioctl_data was not initialized and in some systems + this caused a failure during the ioctl that did not return + an error. Initialize the struct and return an error on failure. + +Severity : minor +Frequency : SLES 9 only +Bugzilla : 10667 +Description: Error of copying files with lustre special EAs as root +Details : Client side always return success for setxattr call for lustre + special xattr (currently only "trusted.lov"). + +Severity : normal +Frequency : rarely on clusters with both ia64+i386 clients +Bugzilla : 10672 +Description: ia64+i686 clients doing shared IO on the same file may LBUG +Details : In rare cases when both ia64+i686 (or other mixed-PAGE_SIZE) + clients are doing concurrent writes to the same file it is + possible that the ia64 clients may LASSERT because the OST + extent locks are not PAGE_SIZE aligned. 
Ensure that grown
+ locks are always aligned on the request boundary.
+
+Severity : normal
+Frequency : specific use, occasional
+Bugzilla : 7040
+Description: Overwriting in use executable truncates on-disk binary image
+Details : If one node attempts to overwrite an executable in use by
+ another node, we now correctly return ETXTBSY instead of
+ truncating the file.
+
+Severity : normal
+Frequency : rare
+Bugzilla : 2707
+Description: chmod on Lustre root is propagated to other clients
+Details : Re-validate root's dentry in ll_lookup_it to avoid having it
+ invalid by the follow_mount time.
+
+Severity : minor
+Frequency : rare
+Bugzilla : 10883
+Description: Race in 'instant cancel' lock handling could lead to such locks
+ never to be granted in case of SMP MDS
+Details : Do not destroy not yet granted but cbpending locks in
+ handle_enqueue
+
+Severity : minor
+Frequency : replay/resend of open
+Bugzilla : 10991
+Description: non null lock assertion failure in mds_intent_policy
+Details : Trying to replay/resend lockless open requests resulted in
+ mds_open() returning 0 with no lock. Now it sets a flag if
+ a lock is going to be returned.
+
+Severity : enhancement
+Bugzilla : 10889
+Description: Checksum enhancements
+Details : New checksum enhancements allow for resending RPCs that failed
+ checksum checks.
+
+Severity : enhancement
+Bugzilla : 7376
+Description: Tunables on number of dirty pages in cache
+Details : Allow setting a limit on the number of dirty pages cached.
+
+Severity : normal
+Frequency : rare
+Bugzilla : 10643
+Description: client crash on unmount - lock still has references
+Details : In some error handling cases it was possible to leak a lock
+ reference on a client while accessing a file. This was not
+ harmful to the client during operation, but would cause the
+ client to crash when the filesystem is unmounted.
+ +Severity : normal +Frequency : specific case, rare +Bugzilla : 10921 +Description: ETXTBSY on mds though file not in use by client +Details : ETXTBSY is no longer incorrectly returned when attempting to + chmod or chown a directory that the user previously tried to + execute or a currently-executing binary. + +Severity : major +Frequency : extremely rare except on liblustre-based clients +Bugzilla : 10480 +Description: Lustre space not freed when files are deleted +Details : Clean up open-unlinked files after client eviction. Previously + the unlink was skipped and the files remained as orphans. + +Severity : normal +Frequency : rare +Bugzilla : 10999 +Description: OST failure "would be an LBUG" in waiting_locks_callback() +Details : In some cases it was possible to send a blocking callback to + a client doing a glimpse, even though that client didn't get + a lock granted. When the glimpse lock is cancelled on the OST + the freed lock is left on the waiting list and corrupted the list. + +Severity : major +Frequency : all core dumps +Bugzilla : 11103 +Description: Broke core dumps to lustre +Details : Negative dentry may be unhashed if parent does not have UPDATE + lock, but some callers, e.g. do_coredump, expect dentry to be + hashed after successful create, hash it in ll_create_it. + +------------------------------------------------------------------------------ + +2006-09-13 Cluster File Systems, Inc. + * version 1.4.7.1 + * Support for kernels: + 2.6.9-42.0.2.EL (RHEL 4) + 2.6.5-7.276 (SLES 9) + 2.4.21-40.EL (RHEL 3) + 2.6.12.6 vanilla (kernel.org) + * bug fix + +Severity : major +Frequency : always on RHEL 3 +Bugzilla : 10867 +Description: Number of open files grows over time +Details : The number of open files grows over time, whether or not + Lustre is started. This was due to a filp leak introduced + by one of our kernel patches. + +------------------------------------------------------------------------------ + +2006-08-20 Cluster File Systems, Inc. 
+ * version 1.4.7
+ * Support for kernels:
+ 2.6.9-42.EL (RHEL 4)
+ 2.6.5-7.267 (SLES 9)
+ 2.4.21-40.EL (RHEL 3)
+ 2.6.12.6 vanilla (kernel.org)
 * bug fixes
- - add locking for mmapped files (2828)
- - lmc/lconf changes to support multiple interfaces (3376)
- - fixes from lustre 1.2.5
-2004-08-14 Cluster File Systems, Inc.
- * version 1.3.0
+Severity : major
+Frequency : rare
+Bugzilla : 5719, 9635, 9792, 9684
+Description: OST (or MDS) trips assertions in (re)connection under heavy load
+Details : If a server is under heavy load and cannot reply to new
+ connection requests before the client resends the (re)connect,
+ the connection handling code can behave badly if two service
+ threads are concurrently handling separate (re)connections from
+ the same client. Add better locking to the connection handling
+ code, and ensure that only a single connection will be processed
+ for a given client UUID, even if the lock is dropped.
+
+Severity : enhancement
+Bugzilla : 3627
+Description: add TCP zero-copy support to kernel
+Details : Add support to the kernel TCP stack to allow zero-copy bulk
+ sends if the hardware supports scatter-gather and checksumming.
+ This allows socklnd to do client-write and server-read more
+ efficiently and reduce CPU utilization from skbuf copying.
+
+Severity : minor
+Frequency : only if NFS exporting from client
+Bugzilla : 10258
+Description: NULL pointer deref in ll_iocontrol() if chattr mknod file
+Details : If setting attributes on a file created under NFS that had
+ never been opened it would be possible to oops the client
+ if the file had no objects.
+
+Severity : major
+Frequency : rare
+Bugzilla : 9326, 10402, 10897
+Description: client crash in ptlrpcd_wake() thread when sending async RPC
+Details : It is possible that ptlrpcd_wake() dereferences a freed async
+ RPC. In rare cases the ptlrpcd thread already processed the RPC
+ before ptlrpcd_wake() was called and the request was freed.
+ +Severity : minor +Frequency : always for liblustre +Bugzilla : 10290 +Description: liblustre client does MDS+OSTs setattr RPC for each write +Details : When doing a write from a liblustre client, the client + incorrectly issued an RPC to the MDS and each OST the file was + striped over in order to update the timestamps. When writing + with small chunks and many clients this could overwhelm the MDS + with RPCs. In all cases it would slow down the write because + these RPCs are unnecessary. + +Severity : enhancement +Bugzilla : 9340 +Description: allow number of MDS service threads to be changed at module load +Details : It is now possible to change the number of MDS service threads + running. Adding "options mds mds_num_threads={N}" to the MDS's + /etc/modprobe.conf will set the number of threads for the next + time Lustre is restarted (assuming the "mds" module is also + reloaded at that time). The default number of threads will + stay the same, 32 for most systems. + +Severity : major +Frequency : rare +Bugzilla : 10300 +Description: OST crash if filesystem is unformatted or corrupt +Details : If an OST is started on a device that has never been formatted + or if the filesystem is corrupt and cannot even mount then the + error handling cleanup routines would dereference a NULL pointer. + +Severity : normal +Frequency : rare +Bugzilla : 10047 +Description: NULL pointer deref in llap_from_page. +Details : get_cache_page_nowait can return a page with NULL (or otherwise + incorrect) mapping if the page was truncated/reclaimed while it was + searched for. Check for this condition and skip such pages when + doing readahead. Introduce extra check to llap_from_page() to + verify page->mapping->host is non-NULL (so page is not anonymous). + +Severity : minor +Frequency : Sometimes when using sys_sendfile +Bugzilla : 7020 +Description: "page not covered by a lock" warnings from ll_readpage +Details : sendfile called ll_readpage without right page locks present. 
+ Now we introduced ll_file_sendfile that does necessary locking + around call to generic_file_sendfile() much like we do in + ll_file_read(). + +Severity : normal +Frequency : with certain MDS communication failures at client mount time +Bugzilla : 10268 +Description: NULL pointer deref after failed client mount +Details : a client connection request may be delayed by the network layer + and not be sent until after the PTLRPC layer has timed out the + request. If the client fails the mount immediately it will try + to clean up before the network times out the request. Add a + reference from the request import to the obd device and delay + the cleanup until the network drops the request. + +Severity : normal +Frequency : occasionally during client (re)connect +Bugzilla : 9387 +Description: assertion failure during client (re)connect +Details : processing a client connection request may be delayed by the + client or server longer than the client connect timeout. This + causes the client to resend the connection request. If the + original connection request is replied in this interval, the + client may trip an assertion failure in ptlrpc_connect_interpret() + which thought it would be the only running connect process. + +Severity : normal +Frequency : only with obd_echo servers and clients that are rebooted +Bugzilla : 10140 +Description: kernel BUG accessing uninitialized data structure +Details : When running an obd_echo server it did not start the ping_evictor + thread, and when a client was evicted an uninitialized data + structure was accessed. Start the ping_evictor in the RPC + service startup instead of the OBD startup. + +Severity : enhancement +Bugzilla : 10193 (patchless) +Description: Remove dependency on various unexported kernel interfaces. +Details : No longer need reparent_to_init, exit_mm, exit_files, + sock_getsockopt, filemap_populate, FMODE_EXEC, put_filp.
+ +Severity : minor +Frequency : rare (only users of deprecated and unsupported LDAP config) +Bugzilla : 9337 +Description: write_conf for zeroconf mount queried LDAP incorrectly for client +Details : LDAP apparently contains 'lustreName' attributes instead of + 'name'. A simple remapping of the name is sufficient. + +Severity : major +Frequency : rare (only with non-default dump_on_timeout debug enabled) +Bugzilla : 10397 +Description: waiting_locks_callback trips kernel BUG if client is evicted +Details : Running with the dump_on_timeout debug flag turned on makes + it possible that the waiting_locks_callback() can try to dump + the Lustre kernel debug logs from an interrupt handler. Defer + this log dumping to the expired_lock_main() thread. + +Severity : enhancement +Bugzilla : 10420 +Description: Support NFS exporting on 2.6 kernels. +Details : Implement non-rawops metadata methods for NFS server to use without + changing NFS server code. + +Severity : normal +Frequency : very rare (synthetic metadata workload only) +Bugzilla : 9974 +Description: two racing renames might cause an MDS thread to deadlock +Details : Running the "racer" program may cause one MDS thread to rename + a file from being the source of a rename to being the target of + a rename at exactly the same time that another thread is doing + so, and the second thread has already enqueued these locks after + doing a lookup of the target and is trying to relock them in + order. Ensure that we don't try to re-lock the same resource. + +Severity : major +Frequency : only very large systems with liblustre clients +Bugzilla : 7304 +Description: slow eviction of liblustre clients with the "evict_by_nid" RPC +Details : Use asynchronous set_info RPCs to send the "evict_by_nid" to + all OSTs in parallel. This allows the eviction of stale liblustre + clients to proceed much faster than if they were done in series, + and also offers similar improvements for other set_info RPCs. 
+ +Severity : minor +Frequency : common +Bugzilla : 10265 +Description: excessive CPU usage during initial read phase on client +Details : During the initial read phase on a client, it would aggressively + retry readahead on the file, consuming too much CPU and impacting + performance (since 1.4.5.8). Improve the readahead algorithm + to avoid this, and also improve some other common cases (read + of small files in particular, where "small" is files smaller than + /proc/fs/lustre/llite/*/max_read_ahead_whole_mb, 2MB by default). + +Severity : minor +Frequency : rare +Bugzilla : 10450 +Description: MDS crash when receiving packet with unknown intent. +Details : Do not LBUG in unknown intent case, just return -EFAULT + +Severity : enhancement +Bugzilla : 9293, 9385 +Description: MDS RPCs are serialised on client. This is unnecessary for some. +Details : Do not serialize getattr (non-intent version) and statfs. + +Severity : minor +Frequency : occasional, when OST network is overloaded/intermittent +Bugzilla : 10416 +Description: client evicted by OST after bulk IO timeout +Details : If a client sends a bulk IO request (read or write) the OST + may evict the client if it is unresponsive to its data GET/PUT + request. This is incorrect if the network is overloaded (takes + too long to transfer the RPC data) or dropped the OST GET/PUT + request. There is no need to evict the client at all, since + the pinger and/or lock callbacks will handle this, and the + client can restart the bulk request. + +Severity : minor +Frequency : Always when mmapping file with no objects +Bugzilla : 10438 +Description: client crashes when mmapping file with no objects +Details : Check that we actually have objects in a file before doing any + operations on objects in ll_vm_open, ll_vm_close and + ll_glimpse_size.
+ +Severity : minor +Frequency : Rare +Bugzilla : 10484 +Description: Request leak when working with deleted CWD +Details : Introduce advanced request refcount tracking for requests + referenced from lustre intent. + +Severity : Enhancement +Bugzilla : 10482 +Description: Cache open file handles on client. +Details : MDS now will return special lock along with openhandle, if + requested and client is allowed to hold openhandle, even if unused, + until such a lock is revoked. Helps NFS a lot, since NFS is opening + and closing files for every read/write operation. + +Severity : Enhancement +Bugzilla : 9291 +Description: Cache open negative dentries on client when possible. +Details : Guard negative dentries with UPDATE lock on parent dir, drop + negative dentries on lock revocation. + +Severity : minor +Frequency : Always +Bugzilla : 10510 +Description: Remounting a client read-only wasn't possible with a zconf mount +Details : It wasn't possible to remount a client read-only with llmount. + +Severity : enhancement +Description: Include MPICH 1.2.6 Lustre ADIO interface patch +Details : In lustre/contrib/ or /usr/share/lustre in RPM a patch for + MPICH is included to add Lustre-specific ADIO interfaces. + This is based closely on the UFS ADIO layer and only differs + in file creation, in order to allow the OST striping to be set. + This is user-contributed code and not supported by CFS. + +Severity : minor +Frequency : Always +Bugzilla : 9486 +Description: extended inode attributes (immutable, append-only) work improperly + when 2.4 and 2.6 kernels are used on client/server or vice versa +Details : Introduce kernel-independent values for these flags. + +Severity : enhancement +Frequency : Always +Bugzilla : 10248 +Description: Allow fractional MB tunings for lustre in /proc/ filesystem. +Details : Many of the /proc/ tunables can only be tuned at a megabyte + granularity. Now, fractional MB granularity is supported; + this is very useful for low-memory systems.
+ +Severity : enhancement +Bugzilla : 9292 +Description: Getattr by fid +Details : Getting a file's attributes by its fid, obtaining UPDATE|LOOKUP + locks, avoids extra getattr rpc requests to MDS, allows '/' to + have locks and avoids getattr rpc requests for it on every stat. + +Severity : major +Frequency : Always, for filesystems larger than 2TB +Bugzilla : 6191 +Description: ldiskfs crash at mount for filesystem larger than 2TB with mballoc +Details : Kernel kmalloc limits allocations to 128kB and this prevents + filesystems larger than 2TB from being mounted with mballoc enabled. + +Severity : critical +Frequency : Always, for 32-bit kernel without CONFIG_LBD and filesystem > 2TB +Bugzilla : 6191 +Description: filesystem corruption for non-standard kernels and very large OSTs +Details : If a 32-bit kernel is compiled without CONFIG_LBD enabled and a + filesystem larger than 2TB is mounted then the kernel will + silently corrupt the start of the filesystem. CONFIG_LBD is + enabled for all CFS-supported kernels, but the possibility of + this happening with a modified kernel config exists. + +Severity : enhancement +Bugzilla : 10462 +Description: add client O_DIRECT support for 2.6 kernels +Details : It is now possible to do O_DIRECT reads and writes to files + in the Lustre client mountpoint on 2.6 kernel clients. + +Severity : enhancement +Bugzilla : 10446 +Description: parallel glimpse, setattr, statfs, punch, destroy requests +Details : Sends glimpse, setattr, statfs, punch, destroy requests to OSTs in + parallel, not waiting for response from every OST before sending + an RPC to the next OST. + +Severity : minor +Frequency : rare +Bugzilla : 10150 +Description: setattr vs write race when updating file timestamps +Details : Client processes that update a file timestamp into the past + right after writing to the file (e.g.
tar) it is possible that + the updated file modification time can be reset to the current + time due to a race between processing the setattr and write RPC. + +Severity : enhancement +Bugzilla : 10318 +Description: Bring 'lfs find' closer in line with regular Linux find. +Details : lfs find util supports -atime, -mtime, -ctime, -maxdepth, -print, + -print0 options and obtains all the needed info through the lustre + ioctls. + +Severity : enhancement +Bugzilla : 6221 +Description: support up to 1024 configured devices on one node +Details : change obd_dev array from statically allocated to dynamically + allocated structs as they are first used to reduce memory usage + +Severity : minor +Frequency : rare +Bugzilla : 10437 +Description: Flush dirty partially truncated pages during truncate +Details : Immediately flush partially truncated pages in filter_setattr, + this way we completely avoid having any pages in page cache on OST + and can retire ugly workarounds during writes to flush such pages. + +Severity : minor +Frequency : rare +Bugzilla : 10409 +Description: i_sem vs transaction deadlock in mds_obd_destroy during unlink. +Details : protect inode from truncation within vfs_unlink() context: + just take a reference before calling vfs_unlink() and release it + when parent's i_sem is free. + +Severity : minor +Frequency : always, if extents are used on OSTs +Bugzilla : 10703 +Description: index ei_leaf_hi (48-bit extension) is not zeroed in extent index +Details : OSTs using the extents format would not zero the high 16 bits of + the index physical block number. This is not a problem for any + OST filesystems smaller than 16TB, and no kernels support ext3 + filesystems larger than 16TB yet. This is fixed in 1.4.7 (all + new/modified files) and can be fixed for existing filesystems + with e2fsprogs-1.39-cfs1.
+ +Severity : minor +Frequency : rare +Bugzilla : 9387 +Description: import connection selection may be incorrect if timer wraps +Details : Using a 32-bit jiffies timer with HZ=1000 may cause backup + import connections to be ignored if the 32-bit jiffies counter + wraps. Use a 64-bit jiffies counter. + +Severity : major +Frequency : during server recovery +Bugzilla : 10479 +Description: crash after server is denying duplicate export +Details : If clients are resending connect requests to the server, the + server refuses to allow a client to connect multiple times. + Fixed a bug in the handling of this case. + +Severity : minor +Frequency : very large clusters immediately after boot +Bugzilla : 10083 +Description: LNET request buffers exhausted under heavy short-term load +Details : If a large number of client requests are generated on a service + that has previously never seen so many requests it is possible + that the request buffer growth cannot keep up with the spike in + demand. Instead of dropping incoming requests, they are held in + the LND until the RPC service can accept more requests. + +Severity : minor +Frequency : Sometimes during replay +Bugzilla : 9314 +Description: Assertion failure in ll_local_open after replay. +Details : If replay happened on an open request reply before we were able + to set replay handler, the reply will not be swabbed, tripping the + assertion in ll_local_open. Now we set the handler right after + recognising the open request. + +Severity : minor +Frequency : very rare +Bugzilla : 10584 +Description: kernel reports "badness in vsnprintf" +Details : Reading from the "recovery_status" /proc file in small chunks + may cause a negative length in lprocfs_obd_rd_recovery_status() + call to vsnprintf() (which is otherwise harmless). Exit early + if there is no more space in the output buffer.
+ +Severity : enhancement +Bugzilla : 2259 +Description: clear OBD RPC statistics by writing to them +Details : It is now possible to clear the OBD RPC statistics by writing + to the "stats" file. + +Severity : minor +Frequency : rare +Bugzilla : 10641 +Description: Client mtime is not the same on different clients after utimes +Details : In some cases, the client was using the utimes() syscall on + a file cached on another node. The clients now validate the + ctime from the MDS + OSTs to determine which one is right. + +Severity : minor +Frequency : always +Bugzilla : 10611 +Description: Inability to activate failout mode +Details : lconf script incorrectly assumed that in python a string's numeric + value is used in comparisons. + +Severity : minor +Frequency : always with multiple stripes per file +Bugzilla : 10671 +Description: Inefficient object allocation for multi-stripe files +Details : When selecting which OSTs to stripe files over, for files with + a stripe count that divides evenly into the number of OSTs, + the MDS is always picking the same starting OST for each file. + Return the OST selection heuristic to the original design. + +Severity : minor +Frequency : rare +Bugzilla : 10673 +Description: mount failures may take full timeout to return an error +Details : Under some heavy load conditions it is possible that a + failed mount can wait for the full obd_timeout interval, + possibly several minutes, before reporting an error. + Instead return an error as soon as the status is known. + +------------------------------------------------------------------------------ + +2006-02-14 Cluster File Systems, Inc. + * version 1.4.6 + * WIRE PROTOCOL CHANGE. This version of Lustre networking WILL NOT + INTEROPERATE with older versions automatically. Please read the + user documentation before upgrading any part of a live system. + * WARNING: Lustre networking configuration changes are required with + this release.
See https://bugzilla.clusterfs.com/show_bug.cgi?id=10052 + for details. * bug fixes - - don't dereference NULL peer_ni in ldlm_handle_ast_error (3258) - - don't allow unlinking open directory if it isn't empty (2904) - - handle partial page writes in filter; fix 512b direct IO (3138) - - handle page cache pages in cleanup path for 2.6 (3335) - - leave liblustre's partial write handling to filter (3274) - - chose better nal ids in liblustre (3292) - - initialize liblustre with uid/group membership (2862) - - let lconf resolve symlinked-to devices (4629) - - balance journal closure when 2.6 filter write fails (3401) - - add second rpc_lock and last_rcvd info for close reqs (3462) - - don't hold llog sem during network request (3652) - - update server last transno after client disconnects (2525) - - replace config semaphore with spinlock (3306) - - ext3 exents and multi-block allocation (3024) - - service time statistics in /proc - - minor fixes to liblustre build (3317) - - client recovery without upcall (3262) - - use transno after validating reply (3892) - - use different name for 2nd ptlrpcd thread (3887) - - get a client lock in ll_inode_revalidate_it (3597) - - direct IO reads on OST (4048) - - process timed out requests if import state changes (3754) - - ignore -ENOENT errors in osc_destroy (3639) - - fixes from lustre 1.2.0-1.2.4 + * Support for kernels: + 2.6.9-22.0.2.EL (RHEL 4) + 2.6.5-7.244 (SLES 9) + 2.6.12.6 vanilla (kernel.org) + + +Severity : enhancement +Bugzilla : 7981/8208 +Description: Introduced Lustre Networking (LNET) +Details : LNET is new networking infrastructure for Lustre, it includes + a reorganized network configuration mode (see the user + documentation for full details) as well as support for routing + between different network fabrics. Lustre Networking Devices + (LNDs) for the supported network fabrics have also been + created for this new infrastructure. 
+ +Severity : enhancement +Description: Introduced Access control lists +Details : clients can set ACLs on files and directories in order to have + more fine-grained permissions than the standard Unix UGO+RWX. + The MDS must be started with the "-o acl" mount option. + +Severity : enhancement +Description: Introduced filesystem quotas +Details : Administrators may now establish per-user quotas on the + filesystem. + +Severity : enhancement +Bugzilla : 7982 +Description: Configuration change for the XT3 + The PTLLND is now used to run Lustre over Portals on the XT3 + The configure option(s) --with-cray-portals are no longer used. + Rather --with-portals= is used to + enable building on the XT3. In addition to enable XT3 specific + features the option --enable-cray-xt3 must be used. + +Severity : major +Frequency : rare +Bugzilla : 7407 +Description: Running on many-way SMP OSTs can trigger oops in llcd_send() +Details : A race between allocating a new llcd and re-getting the llcd_lock + allowed another thread to grab newly-allocated llcd. + +Severity : enhancement +Bugzilla : 7116 +Description: 2.6 OST async journal commit and locking fix to improve performance +Details : The filter_direct_io()+filter_commitrw_write() journal commits for + 2.6 kernels are now async as they already were in 2.4 kernels so + that they can commit concurrently with the network bulk transfer. + For block-allocated files the filter allocation semaphore is held + to avoid filesystem fragmentation during allocation. BKL lock + removed for 2.6 xattr operations where it is no longer needed. + +Severity : minor +Frequency : rare +Bugzilla : 8320 +Description: lconf incorrectly determined whether two IP networks could talk +Details : In some more complicated routing and multiple-network + configurations, lconf will avoid trying to make a network + connection to a disjoint part of the IP space. It was doing the + math incorrectly for one set of cases. 
+ +Severity : major +Frequency : rare +Bugzilla : 7359 +Description: Fix for potential infinite loop processing records in an llog. +Details : If an llog record is corrupted/zeroed, it is possible to loop + forever in llog_process(). Validate the llog record length + and skip the remainder of the block on error. + +Severity : minor +Frequency : occasional (liblustre only) +Bugzilla : 6363 +Description: liblustre could not open files whose last component is a symlink +Details : sysio_path_walk() would incorrectly pass the open intent to + intermediate path components. + +Severity : minor +Frequency : rare (liblustre only with non-standard tuning) +Bugzilla : 7201 (7350) +Description: Tuning the MDC DLM LRU size to zero triggers client LASSERT +Details : llu_lookup_finish_locks() tries to set lock data on a lock + after it has been released, only do this for referenced locks + +Severity : enhancement +Bugzilla : 7328 +Description: specifying an (invalid) directory default stripe_size of -1 + would reset the directory default striping +Details : stripe_size -1 was used internally to signal directory stripe + removal, now use "all default" to signal dir stripe removal + as a directory striping of "all default" is not useful + +Severity : minor +Frequency : common for large clusters running liblustre clients +Bugzilla : 7198 +Description: doing an ls when liblustre clients are running is slow +Details : sending a glimpse AST to a liblustre client waits for every AST + to time out, as liblustre clients will not respond. Since they + cannot cache data we refresh the OST lock LVB from disk instead. + +Severity : enhancement +Bugzilla : 7198 +Description: doing an ls at the same time as file IO can be slow +Details : enqueue and other "small" requests can be blocked behind many + large IO requests. Create a new OST IO portal for non-IO + requests so they can be processed faster. 
+ +Severity : minor +Frequency : rare (only HPUX clients mounting unsupported re-exported NFS vol) +Bugzilla : 5781 +Description: an HPUX NFS client would get -EACCESS when ftruncate()ing a newly + created file with mode 000 +Details : the Linux NFS server relies on an MDS_OPEN_OWNEROVERRIDE hack to + allow an ftruncate() as a non-root user to a file with mode 000. + Lustre now respects this flag to disable mode checks when + truncating a file owned by the user + +Severity : minor +Frequency : liblustre-only, when liblustre client dies unexpectedly or becomes + busy +Bugzilla : 7313 +Description: Revoking locks from clients that went dead or catatonic might take + a lot of time. +Details : New lock flags FL_CANCEL_ON_BLOCK used by liblustre makes + cancellation of such locks instant on servers without waiting for + any reply from clients. Clients drops these locks when cancel + notification from server is received without replying. + +Severity : minor +Frequency : liblustre-only, when liblustre client dies or becomes busy +Bugzilla : 7311 +Description: Doing ls on Linux clients can take a long time with active + liblustre clients +Details : Liblustre client cannot handle ASTs in timely manner, so avoid + granting such locks to it in the first place if possible. Locks + are taken by proxy on the OST during the read or write and + dropped immediately afterward. Add connect flags handling, do + not grant locks to liblustre clients for glimpse ASTs. + +Severity : enhancement +Bugzilla : 6252 +Description: Improve read-ahead algorithm to avoid excessive IO for random reads +Details : Existing read-ahead algorithm is tuned for the case of streamlined + sequential reads and behaves badly with applications doing random + reads. Improve it by reading ahead at least read region, and + avoiding excessive large RPC for small reads. 
+ +Severity : enhancement +Bugzilla : 8330 +Description: Creating more than 1000 files for a single job may cause a load + imbalance on the OSTs if there are also a large number of OSTs. +Details : qos_prep_create() uses an OST index reseed value that is an + even multiple of the number of available OSTs so that if the + reseed happens in the middle of the object allocation it will + still utilize the OSTs as uniformly as possible. + +Severity : major +Frequency : rare +Bugzilla : 8322 +Description: OST or MDS may oops in ping_evictor_main() +Details : ping_evictor_main() drops obd_dev_lock if deleting a stale export + but doesn't restart at beginning of obd_exports_timed list + afterward. + +Severity : enhancement +Bugzilla : 7304 +Description: improve by-nid export eviction on the MDS and OST +Details : allow multiple exports with the same NID to be evicted at one + time without re-searching the exports list. + +Severity : major +Frequency : rare, only with supplementary groups enabled on SMP 2.6 kernels +Bugzilla : 7273 +Description: MDS may oops in groups_free() +Details : in rare race conditions a newly allocated group_info struct is + freed again, and this can be NULL. The 2.4 compatibility code + for groups_free() checked for a NULL pointer, but 2.6 did not. + +Severity : minor +Frequency : common for liblustre clients doing little filesystem IO +Bugzilla : 9352, 7313 +Description: server may evict liblustre clients accessing contended locks +Details : if a client is granted a lock or receives a completion AST + with a blocking AST already set it would not reply to the AST + for LDLM_FL_CANCEL_ON_BLOCK locks. It now replies to such ASTs. + +Severity : minor +Frequency : lfs setstripe, only systems with more than 160 OSTs +Bugzilla : 9440 +Description: unable to set striping with a starting offset beyond OST 160 +Details : llapi_create_file() incorrectly limited the starting stripe + index to the maximum single-file stripe count. 
+ +Severity : minor +Frequency : LDAP users only +Bugzilla : 6163 +Description: lconf did not handle in-kernel recovery with LDAP properly +Details : lconf/LustreDB get_refs() is searching the wrong namespace + +Severity : enhancement +Bugzilla : 7342 +Description: bind OST threads to NUMA nodes to improve performance +Details : all OST threads are uniformly bound to CPUs on a single NUMA + node and do their allocations there to localize memory access + +Severity : enhancement +Bugzilla : 7979 +Description: llmount can determine client NID directly from Myrinet (GM) +Details : the client NID code from gmnalnid was moved directly into + llmount, removing the need to use this or specifying the + client NID explicitly when mounting GM clients with zeroconf + +Severity : minor +Frequency : if client is started with down MDS +Bugzilla : 7184 +Description: if client is started with down MDS mount hangs in ptlrpc_queue_wait +Details : Having an LWI_INTR() wait event (interruptible, but no timeout) + will wait indefinitely in ptlrpc_queue_wait->l_wait_event() after + ptlrpc_import_delayed_req() because we didn't check if the + request was interrupted, and we also didn't break out of the + event loop if there was no timeout + +Severity : major +Frequency : rare +Bugzilla : 5047 +Description: data loss during non-page-aligned writes to a single file from + both multiple nodes and multiple threads on one node at same time +Details : updates to KMS and lsm weren't protected by common lock. Resulting + inconsistency led to false short-reads, that were cached and later + used by ->prepare_write() to fill in partially written page, + leading to data loss. 
+ +Severity : minor +Frequency : always, if lconf --abort_recovery used +Bugzilla : 7047 +Description: lconf --abort_recovery fails with 'Operation not supported' +Details : lconf was attempting to abort recovery on the MDT device and not + the MDS device + +Severity : enhancement +Bugzilla : 9445 +Description: remove cleanup logs +Details : replace lconf-generated cleanup logs with lustre internal + cleanup routines. Eliminates the need for client-cleanup and + mds-cleanup logs. + +Severity : enhancement +Bugzilla : 8592 +Description: add support for EAs (user and system) on lustre filesystems +Details : it is now possible to store extended attributes in the Lustre + client filesystem, and with the user_xattr mount option it + is possible to allow users to store EAs on their files also + +Severity : enhancement +Bugzilla : 7293 +Description: Add possibility (config option) to show minimal available OST free + space. +Details : When compiled with --enable-mindf configure option, statfs(2) + (and so, df) will return least minimal free space available from + all OSTs as amount of free space on FS, instead of summary of + free spaces of all OSTs. + +Severity : enhancement +Bugzilla : 7311 +Description: do not expand extent locks acquired on OST-side +Details : Modify ldlm_extent_policy() to not expand local locks, acquired + by server: they are not cached anyway. + +Severity : major +Frequency : when mmap is used/binaries executed from Lustre +Bugzilla : 9482 +Description: Unmmap pages before throwing them away from read cache. +Details : llap_shrink cache now attempts to unmap pages before discarding + them (if unmapping failed - do not discard). SLES9 kernel has + extra checks that trigger if this unmapping is not done first. 
+ +Severity : minor +Frequency : rare +Bugzilla : 6034 +Description: lconf didn't resolve symlinks before checking to see whether a + given mountpoint was already in use + +Severity : minor +Frequency : when migrating failover services +Bugzilla : 6395, 9514 +Description: When migrating a subset of services from a node (e.g. failback + from a failover service node) the remaining services would + time out and evict clients. +Details : lconf --force (implied by --failover) sets the global obd_timeout + to 5 seconds in order to quickly disconnect, but this caused + other RPCs to time out too quickly. Do not change the global + obd_timeout for force cleanup, only set it for DISCONNECT RPCs. + +Severity : enhancement +Frequency : if MDS is started with down OST +Bugzilla : 9439,5706 +Description: Allow startup/shutdown of an MDS without depending on the + availability of the OSTs. +Details : Asynchronously call mds_lov_synchronize during MDS startup. + Add appropriate locking and lov-osc refcounts for safe + cleaning. Add osc abort_inflight calls in case the + synchronize never started. + +Severity : minor +Frequency : occasional (Cray XT3 only) +Bugzilla : 7305 +Description: root not authorized to access files in CRAY_PORTALS environment +Details : The client process capabilities were not honoured on the MDS in + a CRAY_PORTALS/CRAY_XT3 environment. If the file had previously + been accessed by an authorized user then root was able to access + the file on the local client also. The root user capabilities + are now allowed on the MDS, as this environment has secure UID. + +Severity : minor +Frequency : occasional +Bugzilla : 6449 +Description: ldiskfs "too long searching" message happens too often +Details : A debugging message (otherwise harmless) prints too often on + the OST console. This has been reduced to only happen when + there are fragmentation problems on the filesystem. 
+ +Severity : minor +Frequency : rare +Bugzilla : 9598 +Description: Division by zero in statfs when all OSCs are inactive +Details : lov_get_stripecnt() returns zero due to incorrect order of checks, + lov_statfs divides by value returned by lov_get_stripecnt(). + +Severity : minor +Frequency : common +Bugzilla : 9489, 3273 +Description: First write from each client to each OST was only 4kB in size, + to initialize client writeback cache, which caused sub-optimal + RPCs and poor layout on disk for the first written file. +Details : Clients now request an initial cache grant at (re)connect time + so that they can start streaming writes to the cache right + away and always do full-sized RPCs if there is enough data. + If the OST is rebooted the client also re-establishes its grant + so that client cached writes will be honoured under the grant. + +Severity : minor +Frequency : common +Bugzilla : 7198 +Description: Slow ls (and stat(2) syscall) on files residing on IO-loaded OSTs +Details : Now I/O RPCs go to different portal number and (presumably) fast + lock requests (and glimpses) and other RPCs get their own service + threads pool that should be able to service those RPCs + immediately. + +Severity : enhancement +Bugzilla : 7417 +Description: Ability to exchange lustre version between client and servers and + issue warnings at client side if client is too old. Also for + liblustre clients there is ability to refuse connection of too old + clients. +Details : New 'version' field is added to connect data structure that is + filled with version info. That info is later checked by server and + by client. + +Severity : minor +Frequency : rare, liblustre only. +Bugzilla : 9296, 9581 +Description: Two simultaneous writes from liblustre at offset within same page + might proceed at the same time overwriting each other with stale + data. +Details : I/O lock within llu_file_prwv was released too early, before data + actually was hitting the wire.
Extended lock-holding time until + server acknowledges receiving data. + +Severity : minor +Frequency : extremely rare. Never observed in practice. +Bugzilla : 9652 +Description: avoid generating lustre_handle cookie of 0. +Details : class_handle_hash() generates handle cookies by incrementing + global counter, and can hit 0 occasionally (this is unlikely, but + not impossible, because initial value of cookie counter is + selected randomly). Value of 0 is used as a sentinel meaning + "unassigned handle" --- avoid it. Also coalesce two critical + sections in this function into one. + +Severity : enhancement +Bugzilla : 9528 +Description: allow liblustre clients to delegate truncate locking to OST +Details : To avoid overhead of locking, liblustre client instructs OST to + take extent lock in ost_punch() on client's behalf. New connection + flag is added to handle backward compatibility. + +Severity : enhancement +Bugzilla : 4928, 7341, 9758 +Description: allow number of OST service threads to be specified +Details : a module parameter allows the number of OST service threads + to be specified via "options ost ost_num_threads={N}" in the + OSS's /etc/modules.conf or /etc/modprobe.conf. + +Severity : major +Frequency : rare +Bugzilla : 6146, 9635, 9895 +Description: servers crash with bad pointer in target_handle_connect() +Details : In rare cases when a client is reconnecting it was possible that + the connection request was the last reference for that export. + We would temporarily drop the export reference and get a new + one, but this may have been the last reference and the export + was just destroyed. Get new reference before dropping old one. + +Severity : enhancement +Frequency : if client is started with failover MDS +Bugzilla : 9818 +Description: Allow multiple MDS hostnames in the mount command +Details : Try to read the configuration from all specified MDS + hostnames during a client mount in case the "primary" + MDS is down. 
+ +Severity : enhancement +Bugzilla : 9297 +Description: Stop sending data to evicted clients as soon as possible. +Details : Check if the client we are about to send or are sending data to + was evicted already. (Check is done every second of waiting, + for which l_wait_event interface was extended to allow checking + of exit condition at specified intervals). + +Severity : minor +Frequency : rare, normally only when NFS exporting is done from client +Bugzilla : 9301 +Description: 'bad disk LOV MAGIC: 0x00000000' error when chown'ing files + without objects +Details : Make mds_get_md() recognise empty md case and set lmm size to 0. + +Severity : minor +Frequency : always, if srand() is called before liblustre initialization +Bugzilla : 9794 +Description: Liblustre uses system PRNG disturbing its usage by user application +Details : Introduce internal to lustre fast and high-quality PRNG for + lustre usage and make liblustre and some other places in generic + lustre code to use it. + +Severity : enhancement +Bugzilla : 9477, 9557, 9870 +Description: Verify that the MDS configuration logs are updated when xml is +Details : Check if the .xml configuration logs are newer than the config + logs stored on the MDS and report an error if this is the case. + Request --write-conf, or allow starting with --old_conf. + +Severity : enhancement +Bugzilla : 6034 +Description: Handle symlinks in the path when checking if Lustre is mounted. +Details : Resolve intermediate symlinks when checking if a client has + mounted a filesystem to avoid duplicate client mounts. + +Severity : minor +Frequency : rare +Bugzilla : 9309 +Description: lconf can hit an error exception but still return success. +Details : The lconf command catches the Command error exception at the top + level script context and will exit with the associated exit + status, but doesn't ensure that this exit status is non-zero. 
+ +Severity : minor +Frequency : rare +Bugzilla : 9493 +Description: failure of ptlrpc thread startup can cause oops +Details : Starting a ptlrpc service thread can fail if there are a large + number of threads or the server memory is very fragmented. + Handle this without oopsing. + +Severity : minor +Frequency : always, only if liblustre and non-default acceptor port was used +Bugzilla : 9933 +Description: liblustre cannot connect to servers with non-default acceptor port +Details : tcpnal_set_default_params() was not called and was therefore + ignoring the environment variable TCPNAL_PORT, as well as other + TCPNAL_ environment variables + +Severity : minor +Frequency : rare +Bugzilla : 9923 +Description: two objects could be created on the same OST for a single file +Details : If an OST is down, in some cases it was possible to create two + objects on a single OST for a single file. No problems other + than potential performance impact and spurious error messages. + +Severity : minor +Frequency : rare +Bugzilla : 5681, 9562 +Description: Client may oops in ll_unhash_aliases +Details : Client dcache may become inconsistent in race condition. + In some cases "getcwd" can fail if the current directory is + modified. + +Severity : minor +Frequency : always +Bugzilla : 9942 +Description: Inode refcounting problems in NFS export code +Details : link_raw functions used to call d_instantiate without obtaining + extra inode reference first. + +Severity : minor +Frequency : rare +Bugzilla : 9942, 9903 +Description: Referencing freed requests leading to crash, memleaks with NFS. +Details : We used to require that call to ll_revalidate_it was always + followed by ll_lookup_it. Also with revalidate_special() it is + possible to call ll_revalidate_it() twice for the same dentry + even if first occurrence returned success. This fix changes the semantics + of the DISP_ENQ_COMPLETE disposition flag to mean there is extra + reference on a request referred from the intent. 
+ ll_intent_release() then releases such a request. + +Severity : minor +Frequency : rare, normally benchmark loads only +Bugzilla : 1443 +Description: unlinked inodes were kept in memory on the client +Details : If a client is repeatedly creating and unlinking files it + can accumulate a lot of stale inodes in the inode slab cache. + If there is no other client load running this can cause the + client node to run out of memory. Instead flush old inodes + from client cache that have the same inode number as a new inode. + +Severity : minor +Frequency : SLES9 2.6.5 kernel and long filenames only +Bugzilla : 9969, 10379 +Description: utime reports stale NFS file handle +Details : SLES9 uses out-of-dentry names in some cases, which confused + the lustre dentry revalidation. Change it to always use the + in-dentry qstr. + +Severity : major +Frequency : rare, unless heavy write-truncate concurrency is continuous +Bugzilla : 4180, 6984, 7171, 9963, 9331 +Description: OST becomes very slow and/or deadlocked during object unlink +Details : filter_destroy() was holding onto the parent directory lock + while truncating+unlinking objects. For very large objects this + may block other threads for a long time and slow overall OST + responsiveness. It may also be possible to get a lock ordering + deadlock in this case, or run out of journal credits because of + the combined truncate+unlink. Solution is to do object truncate + first in one transaction without parent lock, and then do the + final unlink in a new transaction with the parent lock. This + reduces the lock hold time dramatically. + +Severity : major +Frequency : rare, 2.4 kernels only +Bugzilla : 9967 +Description: MDS or OST cleanup may trip kernel BUG when dropping kernel lock +Details : mds_cleanup() and filter_cleanup() need to drop the kernel lock + before unmounting their filesystem in order to avoid deadlock. 
+ The kernel_locked() function in 2.4 kernels only checks whether + the kernel lock is held, not whether it is this process that is + holding it as 2.6 kernels do. + +Severity : major +Frequency : rare +Bugzilla : 9635 +Description: MDS or OST may oops/LBUG if a client is connecting multiple times +Details : The client ptlrpc code may be trying to reconnect to a down + server before a previous connection attempt has timed out. + Increase the reconnect interval to be longer than the connection + timeout interval to avoid sending duplicate connections to + servers. + +Severity : minor +Frequency : echo_client brw_test command +Bugzilla : 9919 +Description: fix echo_client to work with OST preallocated code +Details : OST preallocation code (5137) didn't take echo_client IO path + into account: echo_client calls filter methods outside of any + OST thread and, hence, there is no per-thread preallocated + pages and buffers to use. Solution: hijack pga pages for IO. As + a byproduct, this avoids unnecessary data copying. + +Severity : minor +Frequency : rare +Bugzilla : 3555, 5962, 6025, 6155, 6296, 9574 +Description: Client can oops in mdc_commit_close() after open replay +Details : It was possible for the MDS to return an open request with no + transaction number in mds_finish_transno() if the client was + evicted, but without actually returning an error. Clients + would later try to replay that open and may trip an assertion. + Simplify the client close codepath, and always return an error + from the MDS in case the open is not successful. + +Severity : major +Frequency : rare, 2.6 OSTs only +Bugzilla : 10076 +Description: OST may deadlock under high load on fragmented files +Details : If there was a heavy load on highly-fragmented OST filesystems + it was possible to have all the OST threads deadlock waiting on + allocation of biovecs, because the biovecs were not released + until the entire RPC IO was completed. 
Instead, release biovecs + as soon as they are complete to ensure forward IO progress. + +Severity : enhancement +Bugzilla : 9578 +Description: Support for specifying external journal device at mount +Details : If an OST or MDS device is formatted with an external journal + device, this device major/minor is stored in the ext3 superblock + and may not be valid for failover. Allow detecting and + specifying the external journal at mount time. + +Severity : major +Frequency : rare +Bugzilla : 10235 +Description: Mounting an MDS with pending unlinked files may cause oops +Details : target_finish_recovery() calls mds_postrecov() which returned + the number of orphans unlinked. mds_lov_connect->mds_postsetup() + considers this an error and immediately begins cleaning up the + lov, just after starting the mds_lov process + +Severity : enhancement +Bugzilla : 9461 +Description: Implement 'lfs df' to report actual free space on per-OST basis +Details : Add sub-command 'df' on 'lfs' to report the disk space usage of + MDS/OSDs. Usage: lfs df [-i][-h]. Command Options: '-i' to report + usage of objects; '-h' to report in human readable format. + +------------------------------------------------------------------------------ + +2005-08-26 Cluster File Systems, Inc. + * version 1.4.5 + * bug fixes + +Severity : major +Frequency : rare +Bugzilla : 7264 +Description: Mounting an ldiskfs file system with mballoc may crash OST node. +Details : ldiskfs mballoc code may reference an uninitialized buddy struct + at startup during orphan unlinking. Instead, skip buddy update + before setup, as it will be regenerated after recovery is complete. + +Severity : minor +Frequency : rare +Bugzilla : 7039 +Description: If an OST is inactive, its locks might reference stale inodes. +Details : lov_change_cbdata() must iterate over all namespaces, even if + they are inactive to clear inode references from the lock. 
+ +Severity : enhancement +Frequency : occasional, if non-standard max_dirty_mb used +Bugzilla : 7138 +Description: Client will block write RPCs if not enough grant +Details : If a client has max_dirty_mb smaller than max_rpcs_in_flight, + then the client will block writes while waiting for another RPC + to complete instead of consuming its dirty limit. With this change + we get improved performance when max_dirty_mb is small. + +Severity : enhancement +Bugzilla : 3389, 6253 +Description: Add support for supplementary groups on the MDS. +Details : The MDS has an upcall /proc/fs/lustre/mds/{mds}/group_upcall + (set to /usr/sbin/l_getgroups if enabled) which will do MDS-side + lookups for user supplementary groups into a cache. + +Severity : minor +Bugzilla : 7278 +Description: O_CREAT|O_EXCL open flags in liblustre always return -EEXIST +Details : Make libsysio not enforce O_EXCL by clearing the flag, + for liblustre O_EXCL is enforced by MDS. + +Severity : minor +Bugzilla : 6455 +Description: readdir never returns NULL in liblustre. +Details : Corrected llu_iop_getdirentries logic, to return offset of next + dentry in struct dirent. + +Severity : minor +Bugzilla : 7137 +Frequency : liblustre only, depends on application IO pattern +Description: liblustre clients evicted if not contacting servers +Details : Don't put liblustre clients into the ping_evictor list, so + they will not be evicted by the pinger ever. + +Severity : enhancement +Bugzilla : 6902 +Description: Add ability to evict clients by NID from MDS. +Details : By echoing "nid:$NID" string into + /proc/fs/lustre/mds/.../evict_client client with nid that equals to + $NID would be instantly evicted from this MDS and from all active + OSTs connected to it. + +Severity : minor +Bugzilla : 7198 +Description: Do not query file size twice, somewhat slowing stat(2) calls. +Details : lookup_it_finish() used to query file size from OSTs that was not + needed. 
+ +Severity : minor +Bugzilla : 6237 +Description: service threads change working directory to that of init +Details : Starting lustre service threads may pin the working directory + of the parent thread, making that filesystem busy. Threads + now change to the working directory of init to avoid this. + +Severity : minor +Bugzilla : 6827 +Frequency : during shutdown only +Description: shutdown with a failed MDS or OST can cause unmount to hang +Details : Don't resend DISCONNECT messages in ptlrpc_disconnect_import() + if server is down. + +Severity : minor +Bugzilla : 7331 +Frequency : 2.6 only +Description: chmod/chown may include an extra supplementary group +Details : ll{,u}_mdc_pack_op_data() does not properly initialize the + supplementary group and if none is specified this is used. + +Severity : minor +Bugzilla : 5479 (6816) +Frequency : rare +Description: Racing open + rm can assert client in mdc_set_open_replay_data() +Details : If lookup is in progress on a file that is unlinked we might try + to revalidate the inode and fail in revalidate after lookup is + complete and ll_file_open() enqueues the open again but + it_open_error() was not checking DISP_OPEN_OPEN errors correctly. + +Severity : minor +Frequency : always, if lconf --abort_recovery used +Bugzilla : 7047 +Description: lconf --abort_recovery fails with 'Operation not supported' +Details : lconf was attempting to abort recovery on the MDT device and not + the MDS device + +------------------------------------------------------------------------------ + +2005-08-08 Cluster File Systems, Inc. 
+ * version 1.4.4 + * bug fixes + +Severity : major +Frequency : rare (only unsupported configurations with a node running as an + OST and a client) +Bugzilla : 6514, 5137 +Description: Mounting a Lustre file system on a node running as an OST could + lead to deadlocks +Details : OSTs now preallocate memory needed to write out data at + startup, instead of when needed, to avoid having to + allocate memory in possibly low memory situations. + Specifically, if the file system is mounted on an OST, + memory pressure could force it to try to write out data, + which it needed to allocate memory to do. Due to the low + memory, it would be unable to do so and the node would + become unresponsive. + +Severity : enhancement +Bugzilla : 7015 +Description: Addition of lconf --service command line option +Details : lconf now accepts a '--service <svc>' option, which is + shorthand for 'lconf --group <svc> --select <svc>=<node>' + +Severity : enhancement +Bugzilla : 6101 +Description: Failover mode is now the default for OSTs. +Details : By default, OSTs will now run in failover mode. To return to + the old behaviour, add '--failout' to the lmc line for OSTs. + +Severity : enhancement +Bugzilla : 1693 +Description: Health checks are now provided for MDS and OSTs +Details : Additional detailed health check information on MDS and OSTs + is now provided through the procfs health_check value. + +Severity : minor +Frequency : occasional, depends on IO load +Bugzilla : 4466 +Description: Disk fragmentation on the OSTs could eventually cause slowdowns + after numerous create/delete cycles +Details : The ext3 inode allocation policy would not allocate new inodes + very well on the OSTs because there are no new directories + being created. Instead we look for groups with free space if + the parent directories are nearly full. + +Severity : major +Bugzilla : 6302 +Frequency : rare +Description: Network or server problems during mount may cause partially + mounted clients instead of returning an error. 
+Details : The config llog parsing code may overwrite the error return + code during mount error handling, returning success instead + of an error. + +Severity : minor +Bugzilla : 6422 +Frequency : rare +Description: MDS can fail to allocate large reply buffers +Details : After long uptimes the MDS can fail to allocate large reply + buffers (e.g. zconf client mount config records) due to memory + fragmentation or consumption by the buffer cache. Preallocate + some large reply buffers so that these replies can be sent even + under memory pressure. + +Severity : minor +Bugzilla : 6266 +Frequency : rare (liblustre) +Description: fsx running with liblustre complained that using truncate() to + extend the file doesn't work. This patch corrects that issue. +Details : This is the liblustre equivalent of the fix for bug 6196. Fixes + ATTR_SIZE and lsm use in llu_setattr_raw. + +Severity : critical +Bugzilla : 6866 +Frequency : rare, only 2.6 kernels +Description: Unusual file access patterns on the MDS may result in inode + data being lost in very rare circumstances. +Details : Bad interaction between the ea-in-inode patch and the "no-read" + code in the 2.6 kernel caused the inode and/or EA data not to + be read from disk, causing single-file corruption. + +Severity : critical +Bugzilla : 6998 +Frequency : rare, only 2.6 filesystems using extents +Description: Heavy concurrent write and delete load may cause data corruption. +Details : It was possible under high-load situations to have an extent + metadata block in the block device cache from a just-unlinked + file overwrite a newly-allocated data block. We now unmap any + metadata buffers that alias just-allocated data blocks. + +Severity : minor +Bugzilla : 7241 +Frequency : filesystems with default stripe_count larger than 77 +Description: lconf+mke2fs fail when formatting filesystem with > 77 stripes +Details : lconf specifies an inode size of 4096 bytes when the default + stripe_count is larger than 77. 
This conflicts with the default + inode density of 1 per 4096 bytes. Allocate smaller inodes in + this case to avoid pinning too much memory for large EAs. + +------------------------------------------------------------------------------ + +2005-07-07 Cluster File Systems, Inc. + * version 1.4.3 + * bug fixes + +Severity : minor +Frequency : rare (extremely heavy IO load with hundreds of clients) +Bugzilla : 6172 +Description: Client is evicted, gets IO error writing to file +Details : lock ordering changes for bug 5492 reintroduced bug 3267 and + caused clients to be evicted for AST timeouts. The fixes in + bug 5192 mean we no longer need to have such short AST timeouts + so ldlm_timeout has been increased. + +Severity : major +Frequency : occasional during --force or --failover shutdown under load +Bugzilla : 5949, 4834 +Description: Server oops/LBUG if stopped with --force or --failover under load +Details : a collection of import/export refcount and cleanup ordering + issues fixed for safer force cleanup + +Severity : major +Frequency : only filesystems larger than 120 OSTs +Bugzilla : 5990, 6223 +Description: lfs getstripe would oops on a very large filesystem +Details : lov_getconfig used kfree on vmalloc'd memory + +Severity : minor +Frequency : only filesystems exporting via NFS to Solaris 10 clients +Bugzilla : 6242, 6243 +Description: reading from files that had been truncated to a non-zero size + but never opened returned no data +Details : ll_file_read() reads zeros from no-object files to EOF + +Severity : major +Frequency : rare +Bugzilla : 6200 +Description: A bug in MDS/OSS recovery could cause the OSS to fail an assertion +Details : There's little harm in aborting MDS/OSS recovery and letting it + try again, so I removed the LASSERT and return an error instead. 
+ +Severity : enhancement +Bugzilla : 5902 +Description: New debugging infrastructure for tracking down data corruption +Details : The I/O checksum code was replaced to: (a) control it at runtime, + (b) cover more of the client-side code path, and (c) try to narrow + down where problems occurred + +Severity : major +Frequency : rare +Bugzilla : 3819, 4364, 4397, 6313 +Description: Racing close and eviction MDS could cause assertion in mds_close +Details : It was possible to get multiple mfd references during close and + client eviction, leading to one thread referencing a freed mfd. + +Severity : enhancement +Bugzilla : 3262, 6359 +Description: Attempts to reconnect to servers are now more aggressive. +Details : This builds on the enhanced upcall-less recovery that was added + in 1.4.2. When trying to reconnect to servers, clients will + now try each server in the failover group every 10 seconds. By + default, clients would previously try one server every 25 seconds. + +Severity : major +Frequency : rare +Bugzilla : 6371 +Description: After recovery, certain operations trigger a failed + assertion on a client. +Details : Failing over an mds, using lconf -d --failover, while a + client was doing a readdir() call would cause the client to + LBUG after recovery completed and the readdir() was resent. + +Severity : enhancement +Bugzilla : 6296 +Description: Default groups are now added by lconf +Details : You can now run lconf --group <group> without having to + manually add groups with lmc. + +Severity : major +Frequency : occasional +Bugzilla : 6412 +Description: Nodes with an elan id of 0 trigger a failed assertion + +Severity : minor +Frequency : always when accessing e.g. tty/console device nodes +Bugzilla : 3790 +Description: tty and some other device nodes cannot be used on lustre +Details : file's private_data field is used by device data and lustre + values in there got lost. New field was added to struct file to + store fs-specific private data. 
+ +Severity : minor +Frequency : when exporting Lustre via NFS +Bugzilla : 5275 +Description: NFSD failed occasionally when looking up a path component +Details : NFSD is looking up ".." which was broken in ext3 directories + that had grown large enough to become hashed. + +Severity : minor +Frequency : Clusters with multiple interfaces not on the same subnet +Bugzilla : 5541 +Description: Nodes will repeatedly try to reconnect to an interface which it + cannot reach and report an error to the log. +Details : Extra peer list entries will be created by lconf with some peers + unreachable. lconf now validates the peer before adding it. + +Severity : major +Frequency : Only if a default stripe is set on the filesystem root. +Bugzilla : 6367 +Description: Setting a default stripe on the filesystem root prevented the + filesystem from being remounted. +Details : The client was sending extra request flags in the root getattr + request and did not allocate a reply buffer for the dir EA. + +Severity : major +Frequency : occasional, higher if lots of files are accessed by one client +Bugzilla : 6159, 6097 +Description: Client trips assertion regarding lsm mismatch/magic +Details : While revalidating inodes the VFS looks up inodes with ifind() + and in rare cases can find an inode that is being freed. + The ll_test_inode() code will free the lsm during ifind() + when it finds an existing inode and then the VFS later attaches + this free lsm to a new inode. + +Severity : major +Frequency : rare +Bugzilla : 6422, 7030 +Description: MDS deadlock between mkdir and client eviction +Details : Creating a new file via mkdir or mknod (starting a transaction + and getting the ns lock) can deadlock with client eviction + (gets ns lock and trying to finish a synchronous transaction). + +Severity : minor +Frequency : occasional +Description: While starting a server, the fsfilt_ext3 module could not be + loaded. +Details : CFS's improved ext3 filesystem is named ldiskfs for 2.6 + kernels. 
Previously, lconf would still use the ext3 name + when trying to load modules. Now, it will correctly use + ext3 on 2.4 and ldiskfs on 2.6. + +Severity : enhancement +Description: The default stripe count has been changed to 1 +Details : The interpretation of the default stripe count (0, to lfs + or lmc) has been changed to mean striping across a single + OST, rather than all available. For general usage we have + found a stripe count of 1 or 2 works best. + +Severity : enhancement +Description: Add support for compiling against Cray portals. +Details : Conditional compiling for some areas that are different + on Cray Portals. + +Severity : major +Frequency : occasional +Bugzilla : 6409, 6834 +Description: Creating files with an explicit stripe count may lead to + a failed assertion on the MDS +Details : If some OSTs are full or unavailable, creating files may + trigger a failed assertion on the MDS. Now, Lustre will + try to use other servers or return an error to the + client. + +Severity : minor +Frequency : occasional +Bugzilla : 6469 +Description: Multiple concurrent overlapping read+write on multiple SMP nodes + caused lock timeout during readahead (since 1.4.2). +Details : Processes doing readahead might match a lock that hasn't been + granted yet if there are overlapping and conflicting lock + requests. The readahead process waits on ungranted lock + (original lock is CBPENDING), while OST waits for that process + to cancel CBPENDING read lock and eventually evicts client. + +Severity : enhancement +Bugzilla : 6931 +Description: Initial enabling of flock support for clients +Details : Implements fcntl advisory locking and file status functions. + This feature is provided as an optional mount flag (default + off), and is NOT CURRENTLY SUPPORTED. Not all types of record + locking are implemented yet, and those that are are not guaranteed + to be completely correct in production environments. + mount -t lustre -o [flock|noflock] ... 
+ +Severity : major +Frequency : occasional +Bugzilla : 6198 +Description: OSTs running 2.4 kernels but with extents enabled might trip an + assertion in the ext3 JBD (journaling) layer. +Details : The b_committed_data struct is protected by the big kernel lock + in 2.4 kernels, serializing journal_commit_transaction() and + ext3_get_block_handle->ext3_new_block->find_next_usable_block() + access to this struct. In 2.6 kernels there is finer grained + locking to improve SMP performance of the JBD layer. + +Severity : minor +Bugzilla : 6147 +Description: Changes the "SCSI I/O Stats" kernel patch to default to "enabled" + +----------------------------------------------------------------------------- + +2005-05-05 Cluster File Systems, Inc. + * version 1.4.2 + NOTE: Lustre 1.4.2 uses an incompatible network protocol than previous + versions of Lustre. Please update all servers and clients to + version 1.4.2 or later at the same time. You must also run + "lconf --write-conf {config}.xml" on the MDS while it is stopped + to update the configuration logs. 
+ * bug fixes + - fix for HPUX NFS client breakage when NFS exporting Lustre (5781) + - mdc_enqueue does not need max_mds_easize request buffer on send (5707) + - swab llog records of type '0' so we get proper header size/idx (5861) + - send llog cancel req to DLM cancel portal instead of cb portal (5515) + - fix rename of one directory over another leaking an inode (5953) + - avoid SetPageDirty on 2.6 (5981) + - don't re-add just-being-destroyed locks to the waiting list (5653) + - when creating new directories, inherit the parent's custom + striping settings if present (3048) + - flush buffers from cache before direct IO in 2.6 obdfilter (4982) + - don't hold i_size_sem in ll_nopage() and ll_ap_refresh_count (6077) + - don't hold client locks on temporary worklist from l_lru (5666) + - handle IO errors in 2.6 obdfilter bio completion routine (6046) + - automatically evict dead clients (5921) + - Update file size properly in create+truncate+fstat case (6196) + - Do not unhash mountpoint dentries, do not allow removal of + mountpoints (5907) + - Avoid lock ordering deadlock issue with write/truncate (6203,5654) + - reserve enough journal credits in fsfilt_start_log for setattr (4554) + - ldlm_enqueue freed-export error path would always LBUG (6149,6184) + - don't reference lr_lvb_data until after we hold lr_lvb_sem (6170) + - don't overwrite last_rcvd if there is a *_client_add() error (6086) + - Correctly handle reads of files with no objects (6243) + - lctl recover will also mark a device active if deactivate used (5933) * miscellania - - use "CATALOGS" for the llog catalogs, not "CATLIST" (old) (b=2841) - - added kernel patch for /dev/sd I/O stats (4385) + - by default create 1 inode per 4kB space on MDS, per 16kB on OSTs + - allow --write-conf on an MDS with different nettype than client (5619) + - don't write config llogs to MDS for mounts not from that MDS (5617) + - lconf should create multiple TCP connections from a client (5201) + - init scripts are 
now turned off by default; run chkconfig --on + lustre and chkconfig --on lustrefs to use them + - upcalls are no longer needed for clients to recover to failover + servers (3262) + - add --abort-recovery option to lconf to abort recovery on device + startup (6017) + - add support for an arbitrary number of OSTs (3026) + - Quota support protocol changes. + - forward compatibility changes to wire structs (6007) + - rmmod NALs that might be loaded because of /etc/modules.conf (6133) + - support for mountfsoptions and clientoptions to the Lustre LDAP (5873) + - improved "lustre status" script + - initialize blocksize for non-regular files (6062) + - added --disable-server and --disable-client configure options (5782) + - introduce a lookup cache for lconf to avoid repeated DB scans (6204) + - Vanilla 2.4.29 support + - increase maximum number of obd devices to 520 (6242) + - remove the tcp-zero-copy patch from the suse-2.4 series (5902) + - Quadrics Elan drivers are now included for the RHEL 3 2.4.21 and + SLES 9 2.6.5 kernels + - limit stripes per file to 160 (the maximum EA size) (6093) + +2005-03-22 Cluster File Systems, Inc. 
+ * version 1.4.1 + * bug fixes + - don't LASSERT in ll_release on NULL lld with NFS export (4655, 5760) + - hold NS lock when calling handle_ast_error->del_waiting_lock (5746) + - fix setattr mtime regression from lovcleanup merge (4829, 5669) + - workaround for 2.6 crash in ll_unhash_aliases (5687, 5210) + - small ext3 extents cleanups and fixes (5733) + - improved mballoc code, several small races and bugs fixed (5733, 5638) + - kernel version 43 - fix remove_suid bugs in both 2.4 and 2.6 (5695) + - avoid needless client->OST connect, fix handle mismatch (5317) + - fix DLM error path that led to out-of-sync client, long delays (5779) + - support common vfs-enforced mount options (nodev,nosuid,noexec) (5637) + - fix several locking issues related to i_size (5492,5624,5654,5672) + - don't move pending lock onto export if it is already evicted (5683) + - fix kernel oops when creating .foo in unlinked directory (5548) + - fix deadlock in obdfilter statistics vs. object create (5811) + - use time_{before,after} to avoid timer jiffies wrap (5882) + - shutdown --force/--failover stability (3607,3651,4797,5203,4834) + - Do not leak request if server was not able to process it (5154) + - If mds_open unable to find parent dir, make that negative lookup(5154) + - don't create new directories with extent-mapping (5909, 5936) + * miscellania + - fix lustre/lustrefs init scripts for SuSE (patch from Scali, 5702) + - don't hold the pinger_sem in ptlrpc_pinger_sending_on_import + - change obd_increase_kms to obd_adjust_kms (up or down) (5654) + - lconf, lmc search both /usr/lib and /usr/lib64 for Python libs (5800) + - support for RHEL4 kernel on i686 (5773) + - provide error messages when incompatible logs are encountered (5898) + +2005-02-18 Cluster File Systems, Inc. 
+ * version 1.4.0.10 (1.4.1 release candidate 1) + * bug fixes + - don't keep a lock reference when lock is not granted (4238) + - unsafe list practices (rarely) led to infinite eviction loop (4908) + - add per-fs limit of Lustre pages in page cache, avoid OOM (4699) + - drop import inflight refcount on signal_completed_replay error (5255) + - unlock page after async write error during send (3677) + - handle missing objects in filter_preprw_read properly (5265) + - no transno return for symlink open, don't save no-trasno open (3440) + - don't try to complete elan receive that already failed (4012) + - free RPC server reply state on error (5406) + - clean up thread from ptlrpc_start_thread() on error (5160) + - readahead could read extra page into cache that wasn't ejected (5388) + - prevent races in class_attach/setup/cleanup/detach (5260) + - don't dereference de->d_inode after l_dput of de (5458) + - use "int" for stripe value returned from lock_to_stripe (5544) + - mballoc allocation and error-checking fixes in 2.6 (5504) + - block device patches to fix I/O request sizes in 2.6 (5482) + - look up hostnames for IB nals (5602) + - 2.6 changed lock ordering of 2 semaphores, caused deadlock (5654) + - don't start multiple acceptors for the same port (5277) + - fix incorrect LASSERT in mds_getattr_name (5635) + - export a proc file for general "ping" checking (5628) + - fix "lfs check" to not block when the MDS is down (5628) + * miscellania + - service request history (4965) + - put {ll,lov,osc}_async_page structs in a single slab (4699) + - create an "evict_client" /proc entry on OSTs, like the MDS has + - fix mount usage message, return errors per mount(8) (5168) + - change grep [] to grep "[]" in tests so they work in more UMLs + - fix ppc64/x86_64 spec to use %{_libdir} instead of /usr/lib (5389) + - remove ancient LOV_MAGIC_V0 EA support (5047) + - add "disk I/Os in flight" and "I/O req time" stats in obdfilter + - align r/w RPCs to PTLRPC_MAX_BRW_SIZE boundary 
for performance (3451) + - allow readahead allocations to fail when low on memory (5383) + - mmap locking landed again, after considerable improvement (2828) + - add get_hostaddr() to lustreDB.py for LDAP support (5459) + +2004-11-23 Cluster File Systems, Inc. + * version 1.4.0 + * bug fixes + - send OST transaction number in read/write reply to free req (4966) + - don't ASSERT in ptl_send_rpc() if we run out of memory (5119) + - lock /proc/sys/portals/routes internal state, avoiding oops (4827) + - the watchdog thread now runs as interruptible (5246) + - flock/lockf fixes (but it's still disabled, pending 5135) + - don't use EXT3 constants in llite code (5094) + - memory shortage at startup could cause assertion (5176) + * miscellania + - reorganization of lov code + - single portals codebase + - Infiniband NAL + - add extents/mballoc support (5025) + - direct I/O reads in the obdfilter (4048) + - kernel patches from LNXI for 2.6 (bluesmoke, perfctr, mtd, kexec) + +tbd Cluster File Systems, Inc. + * version 1.2.9 + * bug fixes + - send OST transaction number in read/write reply to free req (4966) + - don't ASSERT in ptl_send_rpc() if we run out of memory (5119) + - lock /proc/sys/portals/routes internal state, avoiding oops (4827) + - the watchdog thread now runs as interruptible (5246) + - handle missing objects in filter_preprw_read properly (5265) + - unsafe list practices (rarely) led to infinite eviction loop (4908) + - drop import inflight refcount on signal_completed_replay error (5255) + - unlock page after async write error during send (3677) + - return original error code on reconstructed replies (3761) + - no transno return for symlink open, don't save no-trasno open (3440) + * miscellania + - add pid to ldlm debugging output (4922) + - bump the watchdog timeouts -- we can't handle 30sec yet + - extra debugging for orphan dentry/inode bug (5259) 2004-11-16 Cluster File Systems, Inc. * version 1.2.8 @@ -74,6 +2216,8 @@ tbd Cluster File Systems, Inc. 
- fix NULL dereference in /proc/sys/portals/routes (4827) - allow failed mdc_close() operations to be interrupted (4561) - stop precreate on OST before MDS would time out on it (4778) + - don't send partial-page writes before EOF from client (4410) + - discard client grant for sub-page writes on large-page clients (4520) - don't free dentries not owned by NFS code, check generation (4806) - fix lsm leak if mds_create_objects() fails (4801) - limit debug_daemon file size, always print CERROR messages (4789) @@ -88,6 +2232,8 @@ tbd Cluster File Systems, Inc. - make lustrefs init script start after nfs is mounted - fix CWARN/ERROR duplication (4930) - return async write errors to application if possible (2248) + - add /proc/sys/portal/memused (bytes allocated by PORTALS_ALLOC) + - print NAL number in %x format (4645) - update barely-supported suse-2.4.21-171 series (4842) - support for sles 9 %post scripts - support for building 2.6 kernel-source packages @@ -104,13 +2250,15 @@ tbd Cluster File Systems, Inc. - let lconf resolve symlinked-to devices (4629) - don't unlink "objects" from directory with default EA (4554) - hold socknal file ref over connect in case target is down (4394) - - allow more than 32000 subdirectories in a single directory (3244) + - allow more than 32000 subdirectories in a single directory (3244) + - fix blocks count for O_DIRECT writes (3751) - OST returns ENOSPC from object create when no space left (4539) - don't send truncate RPC if file size isn't changing (4410) - limit OSC precreate to 1/2 of value OST considers bogus (4778) - bind to privileged port in socknal and tcpnal (3689) * miscellania - rate limit CERROR/CWARN console message to avoid overload (4519) + - GETFILEINFO dir ioctl returns LOV EA + MDS stat in 1 call (3327) - basic mmap support (3918) - kernel patch series update from b1_4 (4711) @@ -123,8 +2271,9 @@ tbd Cluster File Systems, Inc. 
- dynamic object preallocation to improve recovery speed (4236) - don't hold spinlock over lock dumping or change debug flags (4401) - don't zero obd_dev when it is force cleaned (3651) + - print grants to console if they go negative (4431) - "lctl deactivate" will stop automatic recovery attempts (3406) - - look for existing replayed locks to avoid duplicates (3764) + - look for existing locks in ldlm_handle_enqueue() (3764) - don't resolve lock handle twice in recovery avoiding race (4401) - revalidate should check working dir is a directory (4134) * miscellania @@ -177,8 +2326,40 @@ tbd Cluster File Systems, Inc. - fix race and lock order deadlock in orphan handling (3450, 3750) - add validity checks when grabbing inodes from l_ast_data (3599) * miscellania + - add /proc/.../recovery_status to obdfilter (3428) + - lightweight CDEBUG infrastructure, debug daemon (3668) + - change default OSC RPC parameters to be better on small clusters + - turn off OST read cache for files smaller than 32MB + - install man pages and include them in rpms (3100) + - add new init script for (un)mounting lustre filesystems (2593) + - run chkconfig in %post for init scripts (3701) - drop scimac NAL (unmaintained) +2004-06-17 Cluster File Systems, Inc. 
+ * version 1.2.3 + * bug fixes + - clean kiobufs before and after use (3485) + - strip trailing '/'s before comparing paths with /proc/mounts (3486) + - remove assertions to work around "in-flight rpcs" recovery bug (3063) + - change init script to fail more clearly if not run as root (1528) + - allow clients to reconnect during replay (1742) + - fix ns_lock/i_sem lock ordering deadlock for kms update (3477) + - don't do DNS lookups on NIDs too small for IP addresses (3442) + - re-awaken ptlrpcd if new requests arrive during check_set (3554) + - fix cond_resched (3554) + - only evict unfinished clients after recovery (3515) + - allow bulk resend, prevent data loss (3570) + - dynamic ptlrpc request buffer allocation (2102) + - don't allow unlinking open directory if it isn't empty (2904) + - set MDS/OST threads to umask 0 to not clobber client modes (3359) + - remove extraneous obd dereference causing LASSERT failure (3334) + - don't use get_cycles() when creating temp. files on the mds (3156) + - hold i_sem when setting i_size in ll_extent_lock() (3564) + - handle EEXIST for set-stripe, set proper directory name (3336) + * miscellania + - servers can dump a log evicting a client - lustre.dump_on_timeout=1 + - fix ksocknal_fmb_callback() error messages (2918) + 2004-05-27 Cluster File Systems, Inc. * version 1.2.2 * bug fixes @@ -207,7 +2388,9 @@ tbd Cluster File Systems, Inc. - don't crash in mdc_close for bad permissions on open (3285) - zero i_rdev for non-device files (3147) - clear page->private before handing to FS, better assertion (3119) + - tune the read pipeline (3236) - fix incorrect decref of invalidated dentry (2350) + - provide read-ahead stats and refine rpc in flight stats (3328) - don't hold journal transaction open across create RPC (3313) - update atime on MDS at close time (3265) - close LDAP connection when recovering to avoid server load (3315) @@ -228,6 +2411,7 @@ tbd Cluster File Systems, Inc. 
- increase maximum number of MDS request buffers for large systems - change liblustreapi to be useful for external progs like lfsck (3098) - increase local configuration timeout for slow disks (3353) + - allow configuring ldlm AST timeout - lustre.ldlm_timeout= 2004-03-22 Cluster File Systems, Inc. * version 1.2.1 @@ -558,7 +2742,7 @@ tbd Cluster File Systems, Inc. - return 0 from revalidate2 if ll_intent_lock returns -EINTR (912) - fix leak in bulk IO when only partially completed (899, 900, 926) - fix O_DIRECT for ia64 (55) - - (almost) eliminate Lustre-kernel-thread effects on load average (722) + - (almost) eliminate Lustre-kernel-thread effects on load average (722) - C-z after timeout could hang a process forever; fixed (977) * Features - client-side I/O cache (678, 924, 929, 941, 970) @@ -572,7 +2756,7 @@ tbd Cluster File Systems, Inc. - Fix ldlm_lock_match on the MDS to avoid matching remote locks (592) - Fix fsfilt_extN_readpage() to read a full page of directory entries, or fake the remainder if PAGE_SIZE != blocksize (500) - - Avoid extra mdc_getattr() in ll_intent_lock when possible (534, 604) + - Avoid extra mdc_getattr() in ll_intent_lock when possible (534, 604) - Fix imbalanced LOV object allocation and out-of-bound access (469) - Most intent operations were removed, in favour of a new RPC mode that does a single RPC to the server and bypasses most of the VFS @@ -680,9 +2864,9 @@ tbd Cluster File Systems, Inc. - fix dbench 2, extN refcount problem (170, 258, 356, 418) - fix double-O_EXCL intent crash (424) - avoid sending multiple lock CANCELs (352) - * Features + * Features - MDS can do multi-client recovery (modulo bugs in new code) - * Documentation + * Documentation - many updates, edits, cleanups 2002-11-18 Phil Schwan @@ -706,12 +2890,12 @@ tbd Cluster File Systems, Inc. 
- properly abstracted the echo client - OSC locked 1 byte too many; fixed - rewrote brw callback code: - - fixed recovery bugs related to LOVs (306) - - fixed too-many-pages-in-one-write crash (191) - - fixed (again) crash in sync_io_timeout (214) - - probably fixed callback-related race (385) + - fixed recovery bugs related to LOVs (306) + - fixed too-many-pages-in-one-write crash (191) + - fixed (again) crash in sync_io_timeout (214) + - probably fixed callback-related race (385) * protocol change - - Add capability to MDS protocol + - Add capability to MDS protocol - LDLM cancellations and callbacks on different portals 2002-10-28 Andreas Dilger @@ -866,8 +3050,8 @@ tbd Cluster File Systems, Inc. * small changes in the DLM wire protocol 2002-07-25 Peter J. Braam - * version 0_5_1 with some initial stability, - * locking on MD and file I/O. + * version 0_5_1 with some initial stability, + * locking on MD and file I/O. * documentation updates * several bug fixes since 0.5.0 * small changes in wire protocol @@ -901,4 +3085,4 @@ tbd Cluster File Systems, Inc. * move forward to latest Lustre kernel 2002-06-25 Peter Braam - * release version v0_4_1. Hopefully stable on single node use. + * release version v0_4_1. Hopefully stable on single node use. diff --git a/lustre/LICENSE b/lustre/LICENSE new file mode 100644 index 0000000..edb73cd --- /dev/null +++ b/lustre/LICENSE @@ -0,0 +1,372 @@ +Each file in this distribution contains a header stating the copyright +owner(s), and the licensing terms for that file. Some files are not +eligible for copyright protection, and contain neither. + +There are many files which may be covered by a separate license that +you signed or otherwise agreed to before downloading this software. +If you did not agree to such an agreement, or if the file does not +mention that license, then you can redistribute and/or modify it under +the terms of version 2 of the GNU General Public License. 
Each file +is very clear about which license is applicable. + +In any case, Lustre is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the license +text for more details. + +Reproduced below is the GNU General Public License version 2, and +Linus's clarifying statement from the Linux kernel source code: + +---------------------------------------- + + NOTE! This copyright does *not* cover user programs that use kernel + services by normal system calls - this is merely considered normal use + of the kernel, and does *not* fall under the heading of "derived work". + Also note that the GPL below is copyrighted by the Free Software + Foundation, but the instance of code that it refers to (the Linux + kernel) is copyrighted by me and others who actually wrote it. + + Linus Torvalds + +---------------------------------------- + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. 
+ + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. 
You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) 19yy + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/lustre/Makefile.in b/lustre/Makefile.in index d383f48..1b7a9be 100644 --- a/lustre/Makefile.in +++ b/lustre/Makefile.in @@ -1,22 +1,15 @@ -@LDISKFS_TRUE@subdir-m += ldiskfs +@LDISKFS_TRUE@subdir-m += ldiskfs subdir-m += lvfs subdir-m += obdclass -subdir-m += sec subdir-m += lov -subdir-m += lmv subdir-m += ptlrpc -subdir-m += obdecho subdir-m += osc -subdir-m += cobd -subdir-m += cmobd - -@SERVER_TRUE@subdir-m += smfs mds obdfilter ost -@CLIENT_TRUE@subdir-m += mdc llite - -ifeq ($(PATCHLEVEL),4) -subdir-m += ptlbd -endif # PATCHLEVEL = 4 +subdir-m += obdecho +subdir-m += mgc +@SERVER_TRUE@subdir-m += mds obdfilter ost mgs +@CLIENT_TRUE@subdir-m += mdc llite +@QUOTA_TRUE@subdir-m += quota @INCLUDE_RULES@ diff --git a/lustre/Rules.in b/lustre/Rules.in deleted file mode 100644 index 293ff3c..0000000 --- a/lustre/Rules.in +++ /dev/null @@ -1,46 +0,0 @@ -# Directories building kernel modules should have two files: -# -# Makefile.in: -# -# MODULES := -# -objs := file1.o file2.o file3.o -# @INCLUDE_RULES@ -# -# and autoMakefile.am: -# -# if LIBLUSTRE -# -# endif -# -# if MODULES -# modulefs_DATA = $(KMODEXT) -# endif -# -# DIST_SOURCES = $(-objs:.o=.c) -# MOSTLYCLEANFILES = *.o *.ko *.mod.c - -ifeq ($(PATCHLEVEL),) - -include autoMakefile - -else - -include @LINUX_CONFIG@ - -EXTRA_CFLAGS := $(EXTRA_PRE_CFLAGS) -EXTRA_CFLAGS += @EXTRA_KCFLAGS@ @UML_CFLAGS@ -EXTRA_CFLAGS += $(EXTRA_POST_CFLAGS) - -obj-m := $(patsubst %,%.o,$(MODULES)) - -ifeq ($(PATCHLEVEL),4) -# 2.4 rules -O_TARGET := $(firstword $(obj-m)) -obj-y := $($(firstword $(MODULES))-objs) -export-objs := $(obj-y) $(filter-out $(O_TARGET),$(obj-m)) -include $(TOPDIR)/Rules.make -$(MODINCL)/%.ver: %.c - @true -endif # PATCHLEVEL - -endif # KERNELRELEASE diff --git a/lustre/autoMakefile.am b/lustre/autoMakefile.am index 84f3c26..f0531e8 100644 --- a/lustre/autoMakefile.am +++ b/lustre/autoMakefile.am @@ -5,14 +5,16 @@ AUTOMAKE_OPTIONS = foreign -ALWAYS_SUBDIRS := include lvfs obdclass lov ldlm sec 
ptlrpc \ - obdecho osc lmv cobd ptlbd cmobd doc utils tests conf scripts \ - autoconf +# also update lustre/autoconf/lustre-core.m4 AC_CONFIG_FILES +ALWAYS_SUBDIRS := include lvfs obdclass ldlm ptlrpc osc lov obdecho \ + mgc doc utils tests scripts autoconf contrib -SERVER_SUBDIRS := ldiskfs smfs snapfs obdfilter ost mds +SERVER_SUBDIRS := ldiskfs obdfilter ost mds mgs CLIENT_SUBDIRS := mdc llite +QUOTA_SUBDIRS := quota + LIBLUSTRE_SUBDIRS := liblustre SUBDIRS := $(ALWAYS_SUBDIRS) @@ -25,6 +27,11 @@ if CLIENT SUBDIRS += $(CLIENT_SUBDIRS) endif +if QUOTA +SUBDIRS += $(QUOTA_SUBDIRS) +endif + +# this needs to be after the client subdirs if LIBLUSTRE if !CLIENT SUBDIRS += $(CLIENT_SUBDIRS) @@ -32,10 +39,10 @@ endif SUBDIRS += $(LIBLUSTRE_SUBDIRS) endif -DIST_SUBDIRS := $(ALWAYS_SUBDIRS) $(SERVER_SUBDIRS) $(CLIENT_SUBDIRS) \ - $(LIBLUSTRE_SUBDIRS) +DIST_SUBDIRS := $(ALWAYS_SUBDIRS) $(SERVER_SUBDIRS) $(CLIENT_SUBDIRS) \ + $(LIBLUSTRE_SUBDIRS) $(QUOTA_SUBDIRS) -EXTRA_DIST = BUGS FDL Rules.in kernel_patches +EXTRA_DIST = BUGS FDL kernel_patches if LDISKFS LDISKFS = ldiskfs-sources @@ -45,32 +52,18 @@ endif lvfs-sources: $(MAKE) sources -C lvfs +obdclass-sources: + $(MAKE) sources -C obdclass -sources: $(LDISKFS) lvfs-sources lustre_build_version +sources: $(LDISKFS) lvfs-sources obdclass-sources lustre_build_version all-recursive: lustre_build_version +BUILD_VER_H=$(top_builddir)/lustre/include/linux/lustre_build_version.h + lustre_build_version: perl $(top_builddir)/lustre/scripts/version_tag.pl $(top_srcdir) $(top_builddir) > tmpver echo "#define LUSTRE_RELEASE @RELEASE@" >> tmpver - cmp -s $(top_builddir)/lustre/include/linux/lustre_build_version.h tmpver \ - 2> /dev/null && \ - $(RM) tmpver || \ - mv tmpver $(top_builddir)/lustre/include/linux/lustre_build_version.h - -CSTK=/tmp/checkstack -CSTKO=/tmp/checkstack.orig - -checkstack: - [ -f ${CSTK} -a ! 
-s ${CSTKO} ] && mv ${CSTK} ${CSTKO} || true - for i in ${SUBDIRS} portals/knals/*; do \ - MOD=$$i/`basename $$i`.o; \ - [ -f $$MOD ] && objdump -d $$MOD | perl tests/checkstack.pl; \ - done | sort -nr > ${CSTK} - [ -f ${CSTKO} ] && ! diff -u ${CSTKO} ${CSTK} || head -30 ${CSTK} - -checkstack-update: - [ -f ${CSTK} ] && mv ${CSTK} ${CSTKO} - -checkstack-clean: - rm -f ${CSTK} ${CSTKO} + cmp -s $(BUILD_VER_H) tmpver > tmpdiff 2> /dev/null && \ + $(RM) tmpver tmpdiff || \ + mv -f tmpver $(BUILD_VER_H) diff --git a/lustre/autoconf/.cvsignore b/lustre/autoconf/.cvsignore new file mode 100644 index 0000000..282522d --- /dev/null +++ b/lustre/autoconf/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index 8012e79..37cf2fe 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -1,3 +1,4 @@ +#* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- # # LC_CONFIG_SRCDIR # @@ -26,9 +27,6 @@ AC_SUBST(demodir) pkgexampledir='${pkgdatadir}/examples' AC_SUBST(pkgexampledir) - -pymoddir='${pkglibdir}/python/Lustre' -AC_SUBST(pymoddir) ]) # @@ -38,7 +36,7 @@ AC_SUBST(pymoddir) # AC_DEFUN([LC_TARGET_SUPPORTED], [case $target_os in - linux*) + linux* | darwin*) $1 ;; *) @@ -59,14 +57,13 @@ AC_DEFUN([LC_CONFIG_EXT3], LB_LINUX_CONFIG([EXT3_FS_XATTR],[$1],[$3]) ]) - # # LC_FSHOOKS # # If we have (and can build) fshooks.h # AC_DEFUN([LC_FSHOOKS], -[AC_CHECK_FILE([$LINUX/include/linux/fshooks.h],[ +[LB_CHECK_FILE([$LINUX/include/linux/fshooks.h],[ AC_MSG_CHECKING([if fshooks.h can be compiled]) LB_LINUX_TRY_COMPILE([ #include @@ -80,17 +77,7 @@ AC_DEFUN([LC_FSHOOKS], ]) $1 ],[ -LB_LINUX_TRY_COMPILE([ - #include -],[ - #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10)) - #error "linux version < 2.6.10, only support 2.6.7" - #endif -],[ $2 -],[ -$3 -]) ]) ]) @@ -139,7 +126,7 @@ LB_LINUX_TRY_COMPILE([ # if zap_page_range() takes a vma arg # AC_DEFUN([LC_FUNC_ZAP_PAGE_RANGE], 
-[AC_MSG_CHECKING([if zap_pag_range with vma parameter]) +[AC_MSG_CHECKING([if zap_page_range with vma parameter]) ZAP_PAGE_RANGE_VMA="`grep -c 'zap_page_range.*struct vm_area_struct' $LINUX/include/linux/mm.h`" if test "$ZAP_PAGE_RANGE_VMA" != 0 ; then AC_DEFINE(ZAP_PAGE_RANGE_VMA, 1, [zap_page_range with vma parameter]) @@ -166,6 +153,25 @@ fi ]) # +# LC_FUNC_FILEMAP_FDATASYNC +# +# if filemap_fdatasync() exists +# +AC_DEFUN([LC_FUNC_FILEMAP_FDATAWRITE], +[AC_MSG_CHECKING([whether filemap_fdatawrite() is defined]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + int (*foo)(struct address_space *)= filemap_fdatawrite; +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_FILEMAP_FDATAWRITE, 1, [filemap_fdatawrite() found]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +# # LC_FUNC_DIRECT_IO # # if direct_IO takes a struct file argument @@ -226,6 +232,50 @@ LB_LINUX_TRY_COMPILE([ ]) ]) +# +# LC_FUNC_REGISTER_CACHE +# +# if register_cache() is defined by kernel +# +AC_DEFUN([LC_FUNC_REGISTER_CACHE], +[AC_MSG_CHECKING([if kernel defines register_cache()]) +LB_LINUX_TRY_COMPILE([ + #include + #include +],[ + struct cache_definition cache; +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_REGISTER_CACHE, 1, [register_cache found]) + AC_MSG_CHECKING([if kernel expects return from cache shrink function]) + HAVE_CACHE_RETURN_INT="`grep -c 'int.*shrink' $LINUX/include/linux/cache_def.h`" + if test "$HAVE_CACHE_RETURN_INT" != 0 ; then + AC_DEFINE(HAVE_CACHE_RETURN_INT, 1, [kernel expects return from shrink_cache]) + AC_MSG_RESULT(yes) + else + AC_MSG_RESULT(no) + fi +],[ + AC_MSG_RESULT([no]) +]) +]) + +# +# LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP +# +# check for our patched grab_cache_page_nowait_gfp() function +# +AC_DEFUN([LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP], +[AC_MSG_CHECKING([if kernel defines grab_cache_page_nowait_gfp()]) +HAVE_GCPN_GFP="`grep -c 'grab_cache_page_nowait_gfp' $LINUX/include/linux/pagemap.h`" +if test "$HAVE_GCPN_GFP" != 0 ; then + AC_DEFINE(HAVE_GRAB_CACHE_PAGE_NOWAIT_GFP, 1, + [kernel 
has grab_cache_page_nowait_gfp()]) + AC_MSG_RESULT(yes) +else + AC_MSG_RESULT(no) +fi +]) # # LC_FUNC_DEV_SET_RDONLY @@ -237,7 +287,7 @@ AC_DEFUN([LC_FUNC_DEV_SET_RDONLY], [AC_MSG_CHECKING([if kernel has old single-device dev_set_rdonly]) HAVE_OLD_DSR="`grep -c -s 'dev_set_rdonly.*no_write' $LINUX/drivers/block/ll_rw_blk.c`" if test x$HAVE_OLD_DSR != "x1" ; then - HAVE_OLD_DSR="`grep -c -s 'dev_set_rdonly.*no_write' $LINUX/drivers/block/blkpg.c`" + HAVE_OLD_DSR="`grep -c -s 'dev_set_rdonly.*no_write' $LINUX/drivers/block/blkpg.c`" fi if test x$HAVE_OLD_DSR = "x1" ; then AC_DEFINE(HAVE_OLD_DEV_SET_RDONLY, 1, @@ -248,25 +298,16 @@ else fi ]) - # # LC_CONFIG_BACKINGFS # -# whether to use extN or ldiskfs instead of ext3 +# whether to use ldiskfs instead of ext3 # AC_DEFUN([LC_CONFIG_BACKINGFS], [ BACKINGFS='ext3' -# LLNL patches their ext3 and calls it extN -AC_MSG_CHECKING([whether to use extN]) -AC_ARG_ENABLE([extN], - AC_HELP_STRING([--enable-extN], - [use extN instead of ext3 for lustre backend]), - [BACKINGFS='extN'],[enable_extN='no']) -AC_MSG_RESULT([$enable_extN]) - -# SuSE gets ldiskfs +# 2.6 gets ldiskfs AC_MSG_CHECKING([whether to enable ldiskfs]) AC_ARG_ENABLE([ldiskfs], AC_HELP_STRING([--enable-ldiskfs], @@ -277,8 +318,19 @@ AC_MSG_RESULT([$enable_ldiskfs]) if test x$enable_ldiskfs = xyes ; then BACKINGFS="ldiskfs" + AC_MSG_CHECKING([whether to enable quilt for making ldiskfs]) + AC_ARG_ENABLE([quilt], + AC_HELP_STRING([--disable-quilt],[disable use of quilt for ldiskfs]), + [],[enable_quilt='yes']) + AC_MSG_RESULT([$enable_quilt]) + AC_PATH_PROG(PATCH, patch, [no]) - AC_PATH_PROG(QUILT, quilt, [no]) + + if test x$enable_quilt = xno ; then + QUILT="no" + else + AC_PATH_PROG(QUILT, quilt, [no]) + fi if test x$enable_ldiskfs$PATCH$QUILT = xyesnono ; then AC_MSG_ERROR([Quilt or patch are needed to build the ldiskfs module (for Linux 2.6)]) @@ -300,37 +352,29 @@ case $BACKINGFS in LC_CONFIG_EXT3([],[ AC_MSG_ERROR([Lustre requires that ext3 is enabled in the 
kernel]) ],[ - AC_MSG_ERROR([Lustre requires that extended attributes for ext3 are enabled in the kernel]) + AC_MSG_WARN([Lustre requires that extended attributes for ext3 are enabled in the kernel]) + AC_MSG_WARN([This build may fail.]) ]) ;; ldiskfs) - LC_FSHOOKS([ - LDISKFS_SERIES="2.6-suse.series" - ],[ - LDISKFS_SERIES="2.6-fc3.series" - ],[ - LDISKFS_SERIES="2.6-vanilla.series" - ] - ) + AC_MSG_CHECKING([which ldiskfs series to use]) + case $LINUXRELEASE in + 2.6.5*) LDISKFS_SERIES="2.6-suse.series" ;; + 2.6.9*) LDISKFS_SERIES="2.6-rhel4.series" ;; + 2.6.10-ac*) LDISKFS_SERIES="2.6-fc3.series" ;; + 2.6.10*) LDISKFS_SERIES="2.6-rhel4.series" ;; + 2.6.12*) LDISKFS_SERIES="2.6.12-vanilla.series" ;; + 2.6.15*) LDISKFS_SERIES="2.6-fc5.series";; + 2.6.16*) LDISKFS_SERIES="2.6-sles10.series";; + 2.6.18*) LDISKFS_SERIES="2.6.18-vanilla.series";; + *) AC_MSG_WARN([Unknown kernel version $LINUXRELEASE, fix lustre/autoconf/lustre-core.m4]) + esac + AC_MSG_RESULT([$LDISKFS_SERIES]) AC_SUBST(LDISKFS_SERIES) ;; esac # $BACKINGFS ]) -# check lookup_raw -AC_DEFUN([LC_CONFIG_LOOKUP_RAW], -[AC_MSG_CHECKING([whether to have raw lookup patch]) -HAVE_LOOKUP_RAW="`grep -c -s 'raw-lookup' $LUSTRE/kernel_patches/series/ldiskfs-$LDISKFS_SERIES`" - -if test x$HAVE_LOOKUP_RAW = "x1" ; then - AC_DEFINE(HAVE_LOOKUP_RAW, 1, - [kernel have lookup raw patch]) - AC_MSG_RESULT(yes) -else - AC_MSG_RESULT(no) -fi -]) - # # LC_CONFIG_PINGER # @@ -349,6 +393,21 @@ fi ]) # +# LC_CONFIG_LIBLUSTRE_RECOVERY +# +AC_DEFUN([LC_CONFIG_LIBLUSTRE_RECOVERY], +[AC_MSG_CHECKING([whether to enable liblustre recovery support]) +AC_ARG_ENABLE([liblustre-recovery], + AC_HELP_STRING([--disable-liblustre-recovery], + [disable liblustre recovery support]), + [],[enable_liblustre_recovery='yes']) +AC_MSG_RESULT([$enable_liblustre_recovery]) +if test x$enable_liblustre_recovery != xno ; then + AC_DEFINE(ENABLE_LIBLUSTRE_RECOVERY, 1, Liblustre Can Recover) +fi +]) + +# # LC_CONFIG_OBD_BUFFER_SIZE # # the maximum buffer 
size of lctl ioctls @@ -368,65 +427,580 @@ AC_DEFINE_UNQUOTED(OBD_MAX_IOCTL_BUFFER, $OBD_BUFFER_SIZE, [IOCTL Buffer Size]) ]) # -# LC_CONFIG_GSS +# LC_STRUCT_STATFS # -# whether build-in gss/krb5 capability +# AIX does not have statfs.f_namelen # -AC_DEFUN([LC_CONFIG_GSS], -[AC_MSG_CHECKING([whether to enable gss/krb5 support]) -AC_ARG_ENABLE([gss], - AC_HELP_STRING([--enable-gss], - [enable gss/krb5 support]), - [],[enable_gss='yes']) -AC_MSG_RESULT([$enable_gss]) -if test x$enable_gss != xno ; then - AC_DEFINE(ENABLE_GSS, 1, Support GSS/krb5) -fi +AC_DEFUN([LC_STRUCT_STATFS], +[AC_MSG_CHECKING([if struct statfs has a f_namelen field]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct statfs sfs; + sfs.f_namelen = 1; +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_STATFS_NAMELEN, 1, [struct statfs has a namelen field]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +AC_DEFUN([LC_FUNC_PAGE_MAPPED], +[AC_MSG_CHECKING([if kernel offers page_mapped]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + page_mapped(NULL); +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_PAGE_MAPPED, 1, [page_mapped found]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +AC_DEFUN([LC_STRUCT_FILE_OPS_UNLOCKED_IOCTL], +[AC_MSG_CHECKING([if struct file_operations has an unlocked_ioctl field]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct file_operations fops; + &fops.unlocked_ioctl; +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_UNLOCKED_IOCTL, 1, [struct file_operations has an unlock ed_ioctl field]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +AC_DEFUN([LC_FILEMAP_POPULATE], +[AC_MSG_CHECKING([for exported filemap_populate]) +LB_LINUX_TRY_COMPILE([ + #include + #include +],[ + filemap_populate(NULL, 0, 0, __pgprot(0), 0, 0); +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_FILEMAP_POPULATE, 1, [Kernel exports filemap_populate]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +AC_DEFUN([LC_D_ADD_UNIQUE], +[AC_MSG_CHECKING([for d_add_unique]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + d_add_unique(NULL, NULL); +],[ + AC_MSG_RESULT([yes]) + 
AC_DEFINE(HAVE_D_ADD_UNIQUE, 1, [Kernel has d_add_unique]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +AC_DEFUN([LC_BIT_SPINLOCK_H], +[LB_CHECK_FILE([$LINUX/include/linux/bit_spinlock.h],[ + AC_MSG_CHECKING([if bit_spinlock.h can be compiled]) + LB_LINUX_TRY_COMPILE([ + #include + #include + #include + ],[],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_BIT_SPINLOCK_H, 1, [Kernel has bit_spinlock.h]) + ],[ + AC_MSG_RESULT([no]) + ]) +], +[]) +]) + +# +# LC_POSIX_ACL_XATTR +# +# If we have xattr_acl.h +# +AC_DEFUN([LC_XATTR_ACL], +[LB_CHECK_FILE([$LINUX/include/linux/xattr_acl.h],[ + AC_MSG_CHECKING([if xattr_acl.h can be compiled]) + LB_LINUX_TRY_COMPILE([ + #include + ],[],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_XATTR_ACL, 1, [Kernel has xattr_acl]) + ],[ + AC_MSG_RESULT([no]) + ]) +], +[]) +]) + +AC_DEFUN([LC_STRUCT_INTENT_FILE], +[AC_MSG_CHECKING([if struct open_intent has a file field]) +LB_LINUX_TRY_COMPILE([ + #include + #include +],[ + struct open_intent intent; + &intent.file; +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_FILE_IN_STRUCT_INTENT, 1, [struct open_intent has a file field]) +],[ + AC_MSG_RESULT([no]) +]) +]) + + +AC_DEFUN([LC_POSIX_ACL_XATTR_H], +[LB_CHECK_FILE([$LINUX/include/linux/posix_acl_xattr.h],[ + AC_MSG_CHECKING([if linux/posix_acl_xattr.h can be compiled]) + LB_LINUX_TRY_COMPILE([ + #include + ],[],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_LINUX_POSIX_ACL_XATTR_H, 1, [linux/posix_acl_xattr.h found]) + + ],[ + AC_MSG_RESULT([no]) + ]) +$1 +],[ +AC_MSG_RESULT([no]) +]) +]) + +AC_DEFUN([LC_LUSTRE_VERSION_H], +[LB_CHECK_FILE([$LINUX/include/linux/lustre_version.h],[ + rm -f "$LUSTRE/include/linux/lustre_version.h" +],[ + touch "$LUSTRE/include/linux/lustre_version.h" + if test x$enable_server = xyes ; then + AC_MSG_WARN([Unpatched kernel detected.]) + AC_MSG_WARN([Lustre servers cannot be built with an unpatched kernel;]) + AC_MSG_WARN([disabling server build]) + enable_server='no' + fi +]) +]) + +AC_DEFUN([LC_FUNC_SET_FS_PWD], +[AC_MSG_CHECKING([if 
kernel exports show_task]) +have_show_task=0 + if grep -q "EXPORT_SYMBOL(show_task)" \ + "$LINUX/fs/namespace.c" 2>/dev/null ; then + AC_DEFINE(HAVE_SET_FS_PWD, 1, [set_fs_pwd is exported]) + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + fi +]) + + +# +# LC_FUNC_MS_FLOCK_LOCK +# +# SLES9 kernel has MS_FLOCK_LOCK sb flag +# +AC_DEFUN([LC_FUNC_MS_FLOCK_LOCK], +[AC_MSG_CHECKING([if kernel has MS_FLOCK_LOCK sb flag]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + int flags = MS_FLOCK_LOCK; +],[ + AC_DEFINE(HAVE_MS_FLOCK_LOCK, 1, + [kernel has MS_FLOCK_LOCK flag]) + AC_MSG_RESULT([yes]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +# +# LC_FUNC_HAVE_CAN_SLEEP_ARG +# +# SLES9 kernel has third arg can_sleep +# in fs/locks.c: flock_lock_file_wait() +# +AC_DEFUN([LC_FUNC_HAVE_CAN_SLEEP_ARG], +[AC_MSG_CHECKING([if kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + int cansleep; + struct file *file; + struct file_lock *file_lock; + flock_lock_file_wait(file, file_lock, cansleep); +],[ + AC_DEFINE(HAVE_CAN_SLEEP_ARG, 1, + [kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()]) + AC_MSG_RESULT([yes]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +# +# LC_FUNC_F_OP_FLOCK +# +# rhel4.2 kernel has f_op->flock field +# +AC_DEFUN([LC_FUNC_F_OP_FLOCK], +[AC_MSG_CHECKING([if struct file_operations has flock field]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct file_operations ll_file_operations_flock; + ll_file_operations_flock.flock = NULL; +],[ + AC_DEFINE(HAVE_F_OP_FLOCK, 1, + [struct file_operations has flock field]) + AC_MSG_RESULT([yes]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +# +# LC_FUNC_MS_FLOCK_LOCK +# +# SLES9 kernel has MS_FLOCK_LOCK sb flag +# +AC_DEFUN([LC_FUNC_MS_FLOCK_LOCK], +[AC_MSG_CHECKING([if kernel has MS_FLOCK_LOCK sb flag]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + int flags = MS_FLOCK_LOCK; +],[ + AC_DEFINE(HAVE_MS_FLOCK_LOCK, 1, + [kernel has MS_FLOCK_LOCK flag]) + AC_MSG_RESULT([yes]) +],[ + 
AC_MSG_RESULT([no]) +]) ]) # -# LC_CONFIG_SNAPFS +# LC_FUNC_HAVE_CAN_SLEEP_ARG # -# Whether snapfs is desired +# SLES9 kernel has third arg can_sleep +# in fs/locks.c: flock_lock_file_wait() # -AC_DEFUN([LC_CONFIG_SNAPFS], -[# snap compilation -AC_MSG_CHECKING([whether to enable snapfs support]) -AC_ARG_ENABLE([snapfs], - AC_HELP_STRING([--enable-snapfs], - [build snapfs]), - [],[enable_snapfs='no']) -AC_MSG_RESULT([$enable_snapfs]) +AC_DEFUN([LC_FUNC_HAVE_CAN_SLEEP_ARG], +[AC_MSG_CHECKING([if kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + int cansleep; + struct file *file; + struct file_lock *file_lock; + flock_lock_file_wait(file, file_lock, cansleep); +],[ + AC_DEFINE(HAVE_CAN_SLEEP_ARG, 1, + [kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()]) + AC_MSG_RESULT([yes]) +],[ + AC_MSG_RESULT([no]) +]) ]) # -# LC_CONFIG_SMFS +# LC_TASK_PPTR # -# whether smfs is desired +# task struct has p_pptr instead of parent # -AC_DEFUN([LC_CONFIG_SMFS], -[AC_MSG_CHECKING([whether to enable smfs support]) -AC_ARG_ENABLE([smfs], - AC_HELP_STRING([--enable-smfs], - [build smfs]), - [],[enable_smfs='no']) -AC_MSG_RESULT([$enable_smfs]) +AC_DEFUN([LC_TASK_PPTR], +[AC_MSG_CHECKING([task p_pptr found]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct task_struct *p; + + p = p->p_pptr; +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_TASK_PPTR, 1, [task p_pptr found]) +],[ + AC_MSG_RESULT([no]) +]) ]) # +# LC_FUNC_F_OP_FLOCK +# +# rhel4.2 kernel has f_op->flock field +# +AC_DEFUN([LC_FUNC_F_OP_FLOCK], +[AC_MSG_CHECKING([if struct file_operations has flock field]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct file_operations ll_file_operations_flock; + ll_file_operations_flock.flock = NULL; +],[ + AC_DEFINE(HAVE_F_OP_FLOCK, 1, + [struct file_operations has flock field]) + AC_MSG_RESULT([yes]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +# LC_INODE_I_MUTEX +# after 2.6.15 inode have i_mutex intead of i_sem 
+AC_DEFUN([LC_INODE_I_MUTEX], +[AC_MSG_CHECKING([use inode have i_mutex ]) +LB_LINUX_TRY_COMPILE([ + #include + #include +],[ + struct inode i; + + mutex_unlock(&i.i_mutex); +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_I_MUTEX, 1, + [after 2.6.15 inode have i_mutex intead of i_sem]) +],[ + AC_MSG_RESULT(NO) +]) +]) + + +# LC_DQUOTOFF_MUTEX +# after 2.6.17 dquote use mutex instead if semaphore +AC_DEFUN([LC_DQUOTOFF_MUTEX], +[AC_MSG_CHECKING([use dqonoff_mutex]) +LB_LINUX_TRY_COMPILE([ + #include + #include + #include +],[ + struct quota_info dq; + + mutex_unlock(&dq.dqonoff_mutex); +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_DQUOTOFF_MUTEX, 1, + [after 2.6.17 dquote use mutex instead if semaphore]) +],[ + AC_MSG_RESULT(NO) +]) +]) + +# +# LC_STATFS_DENTRY_PARAM +# starting from 2.6.18 linux kernel uses dentry instead of +# super_block for first vfs_statfs argument +# +AC_DEFUN([LC_STATFS_DENTRY_PARAM], +[AC_MSG_CHECKING([first vfs_statfs parameter is dentry]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + int vfs_statfs(struct dentry *, struct kstatfs *); +],[ + AC_DEFINE(HAVE_STATFS_DENTRY_PARAM, 1, + [first parameter of vfs_statfs is dentry]) + AC_MSG_RESULT([yes]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +# +# LC_VFS_KERN_MOUNT +# starting from 2.6.18 kernel don`t export do_kern_mount +# and want to use vfs_kern_mount instead. +# +AC_DEFUN([LC_VFS_KERN_MOUNT], +[AC_MSG_CHECKING([vfs_kern_mount exist in kernel]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + vfs_kern_mount(NULL, 0, NULL, NULL); +],[ + AC_DEFINE(HAVE_VFS_KERN_MOUNT, 1, + [vfs_kern_mount exist in kernel]) + AC_MSG_RESULT([yes]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +# +# LC_INVALIDATEPAGE_RETURN_INT +# more 2.6 api changes. 
return type for the invalidatepage +# address_space_operation is 'void' in new kernels but 'int' in old +# +AC_DEFUN([LC_INVALIDATEPAGE_RETURN_INT], +[AC_MSG_CHECKING([invalidatepage has return int]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + int rc = block_invalidatepage(NULL, 0); +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INVALIDATEPAGE_RETURN_INT, 1, + [Define if return type of invalidatepage should be int]) +],[ + AC_MSG_RESULT(NO) +]) +]) + +# LC_UMOUNTBEGIN_HAS_VFSMOUNT +# more 2.6 API changes. 2.6.18 umount_begin has different parameters +AC_DEFUN([LC_UMOUNTBEGIN_HAS_VFSMOUNT], +[AC_MSG_CHECKING([if umount_begin needs vfsmount parameter instead of super_block]) +tmp_flags="$EXTRA_KCFLAGS" +EXTRA_KCFLAGS="-Werror" +LB_LINUX_TRY_COMPILE([ + #include + + struct vfsmount; + static void cfg_umount_begin (struct vfsmount *v, int flags) + { + ; + } + + static struct super_operations cfg_super_operations = { + .umount_begin = cfg_umount_begin, + }; +],[ + cfg_super_operations.umount_begin(NULL,0); +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_UMOUNTBEGIN_VFSMOUNT, 1, + [Define umount_begin need second argument]) +],[ + AC_MSG_RESULT(NO) +]) +EXTRA_KCFLAGS="$tmp_flags" +]) + +# 2.6.19 API changes +# inode don`t have i_blksize field +AC_DEFUN([LC_INODE_BLKSIZE], +[AC_MSG_CHECKING([inode has i_blksize field]) +LB_LINUX_TRY_COMPILE([ +#include +],[ + struct inode i; + i.i_blksize = 0; +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_BLKSIZE, 1, + [struct inode has i_blksize field]) +],[ + AC_MSG_RESULT(NO) +]) +]) + +# LC_VFS_READDIR_U64_INO +# 2.6.19 use u64 for inode number instead of inode_t +AC_DEFUN([LC_VFS_READDIR_U64_INO], +[AC_MSG_CHECKING([check vfs_readdir need 64bit inode number]) +tmp_flags="$EXTRA_KCFLAGS" +EXTRA_KCFLAGS="-Werror" +LB_LINUX_TRY_COMPILE([ +#include + int fillonedir(void * __buf, const char * name, int namlen, loff_t offset, + u64 ino, unsigned int d_type) + { + return 0; + } +],[ + filldir_t filter; + + filter = fillonedir; + return 1; +],[ + 
AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_VFS_READDIR_U64_INO, 1, + [if vfs_readdir need 64bit inode number]) +],[ + AC_MSG_RESULT(NO) +]) +EXTRA_KCFLAGS="$tmp_flags" +]) + +# LC_GENERIC_FILE_WRITE +# 2.6.19 introduce do_sync_write instead of +# generic_file_write +AC_DEFUN([LC_GENERIC_FILE_WRITE], +[AC_MSG_CHECKING([use generic_file_write]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + int result = generic_file_read(NULL, NULL, 0, 0); +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GENERIC_FILE_WRITE, 1, + [use generic_file_write]) +],[ + AC_MSG_RESULT(NO) +]) +]) + +# LC_GENERIC_FILE_READ +# 2.6.19 need to use do_sync_read instead of +# generic_file_read +AC_DEFUN([LC_GENERIC_FILE_READ], +[AC_MSG_CHECKING([use generic_file_read]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + int result = generic_file_read(NULL, NULL, 0, 0); +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GENERIC_FILE_READ, 1, + [use generic_file_read]) +],[ + AC_MSG_RESULT(NO) +]) +]) + +# LC_NR_PAGECACHE +# 2.6.18 don`t export nr_pagecahe +AC_DEFUN([LC_NR_PAGECACHE], +[AC_MSG_CHECKING([kernel export nr_pagecache]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + return atomic_read(&nr_pagecache); +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_NR_PAGECACHE, 1, + [is kernel export nr_pagecache]) +],[ + AC_MSG_RESULT(NO) +]) +]) + + +# # LC_PROG_LINUX # # Lustre linux kernel checks # AC_DEFUN([LC_PROG_LINUX], -[if test x$enable_server = xyes ; then +[ LC_LUSTRE_VERSION_H +if test x$enable_server = xyes ; then LC_CONFIG_BACKINGFS - LC_CONFIG_LOOKUP_RAW fi LC_CONFIG_PINGER -LC_CONFIG_GSS -LC_CONFIG_SNAPFS -LC_CONFIG_SMFS +LC_CONFIG_LIBLUSTRE_RECOVERY +LC_CONFIG_QUOTA + +LC_TASK_PPTR LC_STRUCT_KIOBUF LC_FUNC_COND_RESCHED @@ -435,7 +1009,44 @@ LC_FUNC_PDE LC_FUNC_DIRECT_IO LC_HEADER_MM_INLINE LC_STRUCT_INODE +LC_FUNC_REGISTER_CACHE +LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP LC_FUNC_DEV_SET_RDONLY +LC_FUNC_FILEMAP_FDATAWRITE +LC_STRUCT_STATFS +LC_FUNC_PAGE_MAPPED +LC_STRUCT_FILE_OPS_UNLOCKED_IOCTL +LC_FILEMAP_POPULATE +LC_D_ADD_UNIQUE 
+LC_BIT_SPINLOCK_H +LC_XATTR_ACL +LC_STRUCT_INTENT_FILE +LC_POSIX_ACL_XATTR_H +LC_FUNC_SET_FS_PWD +LC_FUNC_MS_FLOCK_LOCK +LC_FUNC_HAVE_CAN_SLEEP_ARG +LC_FUNC_F_OP_FLOCK +LC_QUOTA_READ +LC_COOKIE_FOLLOW_LINK + +# 2.6.15 +LC_INODE_I_MUTEX + +# 2.6.17 +LC_DQUOTOFF_MUTEX + +# 2.6.18 +LC_NR_PAGECACHE +LC_STATFS_DENTRY_PARAM +LC_VFS_KERN_MOUNT +LC_INVALIDATEPAGE_RETURN_INT +LC_UMOUNTBEGIN_HAS_VFSMOUNT + +# 2.6.19 +LC_INODE_BLKSIZE +LC_VFS_READDIR_U64_INO +LC_GENERIC_FILE_READ +LC_GENERIC_FILE_WRITE ]) # @@ -473,12 +1084,88 @@ AC_MSG_RESULT([$enable_liblustre]) # only build sysio if liblustre is built with_sysio="$enable_liblustre" +AC_MSG_CHECKING([whether to build liblustre tests]) +AC_ARG_ENABLE([liblustre-tests], + AC_HELP_STRING([--enable-liblustre-tests], + [enable liblustre tests, if --disable-tests is used]), + [],[enable_liblustre_tests=$enable_tests]) +if test x$enable_liblustre != xyes ; then + enable_liblustre_tests='no' +fi +AC_MSG_RESULT([$enable_liblustre_tests]) + AC_MSG_CHECKING([whether to build mpitests]) AC_ARG_ENABLE([mpitests], AC_HELP_STRING([--enable-mpitests], [build liblustre mpi tests]), [],[enable_mpitests=no]) AC_MSG_RESULT([$enable_mpitests]) + +AC_MSG_NOTICE([Enabling Lustre configure options for libsysio]) +ac_configure_args="$ac_configure_args --with-lustre-hack --with-sockets" + +LC_CONFIG_PINGER +LC_CONFIG_LIBLUSTRE_RECOVERY +]) + +# +# LC_CONFIG_QUOTA +# +# whether to enable quota support +# +AC_DEFUN([LC_CONFIG_QUOTA], +[AC_MSG_CHECKING([whether to enable quota support]) +AC_ARG_ENABLE([quota], + AC_HELP_STRING([--enable-quota], + [enable quota support]), + [],[enable_quota='yes']) +AC_MSG_RESULT([$enable_quota]) +if test x$linux25 != xyes; then + enable_quota='no' +fi +if test x$enable_quota != xno; then + AC_DEFINE(HAVE_QUOTA_SUPPORT, 1, [Enable quota support]) +fi +]) + +AC_DEFUN([LC_QUOTA_READ], +[AC_MSG_CHECKING([if kernel supports quota_read]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct super_operations sp; + void *i = (void 
*)sp.quota_read; +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(KERNEL_SUPPORTS_QUOTA_READ, 1, [quota_read found]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +# +# LC_COOKIE_FOLLOW_LINK +# +# kernel 2.6.13+ ->follow_link returns a cookie +# + +AC_DEFUN([LC_COOKIE_FOLLOW_LINK], +[AC_MSG_CHECKING([if inode_operations->follow_link returns a cookie]) +LB_LINUX_TRY_COMPILE([ + #include + #include + +],[ + struct dentry dentry; + struct nameidata nd; + + dentry.d_inode->i_op->put_link(&dentry, &nd, NULL); +],[ + AC_DEFINE(HAVE_COOKIE_FOLLOW_LINK, 1, [inode_operations->follow_link returns a cookie]) + AC_MSG_RESULT([yes]) +],[ + AC_MSG_RESULT([no]) +]) ]) # @@ -490,7 +1177,11 @@ AC_DEFUN([LC_CONFIGURE], [LC_CONFIG_OBD_BUFFER_SIZE # include/liblustre.h -AC_CHECK_HEADERS([asm/page.h sys/user.h stdint.h]) +AC_CHECK_HEADERS([asm/page.h sys/user.h sys/vfs.h stdint.h blkid/blkid.h]) + +# include/lustre/lustre_user.h +# See note there re: __ASM_X86_64_PROCESSOR_H +AC_CHECK_HEADERS([linux/quota.h]) # liblustre/llite_lib.h AC_CHECK_HEADERS([xtio.h file.h]) @@ -501,6 +1192,19 @@ AC_CHECK_HEADERS([linux/types.h sys/types.h linux/unistd.h unistd.h]) # liblustre/lutil.c AC_CHECK_HEADERS([netinet/in.h arpa/inet.h catamount/data.h]) AC_CHECK_FUNCS([inet_ntoa]) + +# utils/llverfs.c +AC_CHECK_HEADERS([ext2fs/ext2fs.h]) + +# Super safe df +AC_ARG_ENABLE([mindf], + AC_HELP_STRING([--enable-mindf], + [Make statfs report the minimum available space on any single OST instead of the sum of free space on all OSTs]), + [],[]) +if test "$enable_mindf" = "yes" ; then + AC_DEFINE([MIN_DF], 1, [Report minimum OST free space]) +fi + ]) # @@ -510,18 +1214,16 @@ AC_CHECK_FUNCS([inet_ntoa]) # AC_DEFUN([LC_CONDITIONALS], [AM_CONDITIONAL(LIBLUSTRE, test x$enable_liblustre = xyes) -AM_CONDITIONAL(EXTN, test x$enable_extN = xyes) AM_CONDITIONAL(LDISKFS, test x$enable_ldiskfs = xyes) AM_CONDITIONAL(USE_QUILT, test x$QUILT != xno) -AM_CONDITIONAL(MPITESTS, test x$enable_mpitests = xyes, Build MPI Tests) 
-AM_CONDITIONAL(SNAPFS, test x$enable_snapfs = xyes) -AM_CONDITIONAL(SMFS, test x$enable_smfs = xyes) -AM_CONDITIONAL(GSS, test x$enable_gss = xyes) -AM_CONDITIONAL(LIBLUSTRE, test x$enable_liblustre = xyes) AM_CONDITIONAL(LIBLUSTRE_TESTS, test x$enable_liblustre_tests = xyes) AM_CONDITIONAL(MPITESTS, test x$enable_mpitests = xyes, Build MPI Tests) AM_CONDITIONAL(CLIENT, test x$enable_client = xyes) AM_CONDITIONAL(SERVER, test x$enable_server = xyes) +AM_CONDITIONAL(QUOTA, test x$enable_quota = xyes) +AM_CONDITIONAL(BLKID, test x$ac_cv_header_blkid_blkid_h = xyes) +AM_CONDITIONAL(EXT2FS_DEVEL, test x$ac_cv_header_ext2fs_ext2fs_h = xyes) +AM_CONDITIONAL(LIBPTHREAD, test x$enable_libpthread = xyes) ]) # @@ -534,24 +1236,29 @@ AC_DEFUN([LC_CONFIG_FILES], lustre/Makefile lustre/autoMakefile lustre/autoconf/Makefile -lustre/cmobd/Makefile -lustre/cmobd/autoMakefile -lustre/cobd/Makefile -lustre/cobd/autoMakefile -lustre/conf/Makefile +lustre/contrib/Makefile lustre/doc/Makefile lustre/include/Makefile +lustre/include/lustre_ver.h lustre/include/linux/Makefile lustre/include/lustre/Makefile -lustre/kernel_patches/targets/2.6-fc3.target +lustre/kernel_patches/targets/2.6-suse.target +lustre/kernel_patches/targets/2.6-vanilla.target +lustre/kernel_patches/targets/2.6-rhel4.target +lustre/kernel_patches/targets/2.6-fc5.target +lustre/kernel_patches/targets/2.6-patchless.target +lustre/kernel_patches/targets/hp_pnnl-2.4.target +lustre/kernel_patches/targets/rh-2.4.target +lustre/kernel_patches/targets/rhel-2.4.target +lustre/kernel_patches/targets/suse-2.4.21-2.target +lustre/kernel_patches/targets/sles-2.4.target lustre/ldiskfs/Makefile lustre/ldiskfs/autoMakefile lustre/ldlm/Makefile lustre/liblustre/Makefile +lustre/liblustre/tests/Makefile lustre/llite/Makefile lustre/llite/autoMakefile -lustre/lmv/Makefile -lustre/lmv/autoMakefile lustre/lov/Makefile lustre/lov/autoMakefile lustre/lvfs/Makefile @@ -562,6 +1269,7 @@ lustre/mds/Makefile lustre/mds/autoMakefile 
lustre/obdclass/Makefile lustre/obdclass/autoMakefile +lustre/obdclass/linux/Makefile lustre/obdecho/Makefile lustre/obdecho/autoMakefile lustre/obdfilter/Makefile @@ -570,25 +1278,23 @@ lustre/osc/Makefile lustre/osc/autoMakefile lustre/ost/Makefile lustre/ost/autoMakefile -lustre/ptlbd/Makefile -lustre/ptlbd/autoMakefile +lustre/mgc/Makefile +lustre/mgc/autoMakefile +lustre/mgs/Makefile +lustre/mgs/autoMakefile lustre/ptlrpc/Makefile lustre/ptlrpc/autoMakefile +lustre/quota/Makefile +lustre/quota/autoMakefile lustre/scripts/Makefile lustre/scripts/version_tag.pl -lustre/sec/Makefile -lustre/sec/autoMakefile -lustre/sec/gss/Makefile -lustre/sec/gss/autoMakefile -lustre/sec/gks/Makefile -lustre/sec/gks/autoMakefile -lustre/smfs/Makefile -lustre/smfs/autoMakefile -lustre/snapfs/Makefile -lustre/snapfs/autoMakefile -lustre/snapfs/utils/Makefile lustre/tests/Makefile -lustre/utils/Lustre/Makefile lustre/utils/Makefile ]) +case $lb_target_os in + darwin) + AC_CONFIG_FILES([ lustre/obdclass/darwin/Makefile ]) + ;; +esac + ]) diff --git a/lustre/autoconf/lustre-version.ac b/lustre/autoconf/lustre-version.ac index 3ed12ab..5c061e7 100644 --- a/lustre/autoconf/lustre-version.ac +++ b/lustre/autoconf/lustre-version.ac @@ -1 +1,30 @@ -m4_define([LUSTRE_VERSION],[1.7.0]) +m4_define([LUSTRE_MAJOR],[1]) +m4_define([LUSTRE_MINOR],[5]) +m4_define([LUSTRE_PATCH],[97]) +m4_define([LUSTRE_FIX],[0]) + +dnl # 288 stands for 0.0.1.32 , next version with fixes is ok, but next after +dnl # next release candidate/beta would spill this warning already. +m4_define([LUSTRE_VER_ALLOWED_OFFSET],[288]) +m4_define([LUSTRE_VER_OFFSET_WARN],[288]) + +dnl # User editable part ends here. 
----------------------------------------- + +m4_pattern_allow(AC_LUSTRE) +m4_define([LUSTRE_VERSION],m4_if(LUSTRE_FIX,[0],LUSTRE_MAJOR.LUSTRE_MINOR.LUSTRE_PATCH,LUSTRE_MAJOR.LUSTRE_MINOR.LUSTRE_PATCH.LUSTRE_FIX)) + +[AC_LUSTRE_MAJOR]=LUSTRE_MAJOR +[AC_LUSTRE_MINOR]=LUSTRE_MINOR +[AC_LUSTRE_PATCH]=LUSTRE_PATCH +[AC_LUSTRE_FIX]=LUSTRE_FIX +[AC_LUSTRE_VERSION_STRING]=LUSTRE_VERSION +[AC_LUSTRE_VER_ALLOWED_OFFSET]=LUSTRE_VER_ALLOWED_OFFSET +[AC_LUSTRE_VER_OFFSET_WARN]=LUSTRE_VER_OFFSET_WARN + +AC_SUBST([AC_LUSTRE_MAJOR]) +AC_SUBST([AC_LUSTRE_MINOR]) +AC_SUBST([AC_LUSTRE_PATCH]) +AC_SUBST([AC_LUSTRE_FIX]) +AC_SUBST([AC_LUSTRE_VERSION_STRING]) +AC_SUBST([AC_LUSTRE_VER_ALLOWED_OFFSET]) +AC_SUBST([AC_LUSTRE_VER_OFFSET_WARN]) diff --git a/lustre/cmobd/Makefile.in b/lustre/cmobd/Makefile.in deleted file mode 100644 index 51c1902..0000000 --- a/lustre/cmobd/Makefile.in +++ /dev/null @@ -1,4 +0,0 @@ -MODULES := cmobd -cmobd-objs := cm_obd.o cm_reint.o cm_write.o -cmobd-objs += cm_oss_reint.o cm_mds_reint.o lproc_cm.o -@INCLUDE_RULES@ diff --git a/lustre/cmobd/autoMakefile.am b/lustre/cmobd/autoMakefile.am deleted file mode 100644 index 7a8256b..0000000 --- a/lustre/cmobd/autoMakefile.am +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -if MODULES -modulefs_DATA = cmobd$(KMODEXT) -endif - -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES = $(cmobd-objs:%.o=%.c) cm_internal.h diff --git a/lustre/cmobd/cm_internal.h b/lustre/cmobd/cm_internal.h deleted file mode 100644 index 987eae3..0000000 --- a/lustre/cmobd/cm_internal.h +++ /dev/null @@ -1,40 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002, 2003, 2004 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. 
- * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#ifndef CM_INTERNAL_H -#define CM_INTERNAL_H - -int cmobd_reintegrate(struct obd_device *); -int cmobd_dummy_lsm(struct lov_stripe_md **, int, struct obdo*, __u32); -void cmobd_free_lsm(struct lov_stripe_md **); - -int cmobd_replay_write(struct obd_device *, struct obdo *, - struct ldlm_extent *); - -int cmobd_init_write_srv(struct obd_device *); -void cmobd_cleanup_write_srv(struct obd_device *); - -int cmobd_reint_mds(struct obd_device*obd, void *record, int opcode); -int cmobd_reint_oss(struct obd_device *obd, void *record, int opcode); - -int mds_read_md(struct obd_device *obd, struct lustre_id *id, - char **data, int *datalen); -#endif /* CM_INTERNAL_H */ diff --git a/lustre/cmobd/cm_mds_reint.c b/lustre/cmobd/cm_mds_reint.c deleted file mode 100644 index 34f51a5..0000000 --- a/lustre/cmobd/cm_mds_reint.c +++ /dev/null @@ -1,336 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001-2003 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#define DEBUG_SUBSYSTEM S_CMOBD - -#include -#include -#include -#include -#include -#include -#include - -#include "cm_internal.h" - -/* converts mds_rec_setattr to struct iattr. */ -static inline void cmobd_rec2iattr(struct mds_rec_setattr *rec, - struct iattr *iattr) -{ - iattr->ia_uid = rec->sa_uid; - iattr->ia_gid = rec->sa_gid; - iattr->ia_mode = rec->sa_mode; - iattr->ia_size = rec->sa_size; - iattr->ia_valid = rec->sa_valid; - LTIME_S(iattr->ia_atime) = rec->sa_atime; - LTIME_S(iattr->ia_mtime) = rec->sa_mtime; - LTIME_S(iattr->ia_ctime) = rec->sa_ctime; - iattr->ia_attr_flags = rec->sa_attr_flags; -} - -static void -cmobd_prepare_mdc_data(struct mdc_op_data *data, struct lustre_id *id1, - struct lustre_id *id2, const char *name, - int namelen, __u32 mode, __u32 flags) -{ - LASSERT(id1); - LASSERT(data); - - memset(data, 0, sizeof(*data)); - - data->id1 = *id1; - if (id2) - data->id2 = *id2; - - data->valid = 0; - data->name = name; - data->flags = flags; - data->namelen = namelen; - data->create_mode = mode; - data->mod_time = LTIME_S(CURRENT_TIME); - - /* zeroing out store cookie, as it makes no sense on master MDS and may - * also confuse it as may be considered as recovery case. */ - memset(&data->id1.li_stc, 0, sizeof(data->id1.li_stc)); - memset(&data->id2.li_stc, 0, sizeof(data->id2.li_stc)); -} - -/* If mdc_setattr() is called with an 'iattr', then it is a normal RPC that - * should take the normal semaphore and go to the normal portal. 
- * - * If it is called with iattr->ia_valid & ATTR_FROM_OPEN, then it is a magic - * open-path setattr that should take the setattr semaphore and go to the - * setattr portal. */ -static int cmobd_reint_setattr(struct obd_device *obd, void *record) -{ - struct cm_obd *cmobd = &obd->u.cm; - struct ptlrpc_request *req = NULL; - struct mds_kml_pack_info *mkpi; - struct mds_rec_setattr *rec; - struct mdc_op_data *op_data; - struct lustre_msg *msg; - int ea1len, ea2len; - struct iattr iattr; - void *ea1, *ea2; - int rc = 0; - ENTRY; - - mkpi = (struct mds_kml_pack_info *)record; - msg = (struct lustre_msg *)(record + sizeof(*mkpi)); - - rec = lustre_msg_buf(msg, 0, 0); - if (!rec) - RETURN(-EINVAL); - - /* converting setattr rec to struct iattr. */ - cmobd_rec2iattr(rec, &iattr); - - /* FIXME-UMKA: here should be handling of setattr() from open. Bug - * #249. Will be fixed later. */ - - OBD_ALLOC(op_data, sizeof(*op_data)); - if (op_data == NULL) - RETURN(-ENOMEM); - cmobd_prepare_mdc_data(op_data, &rec->sa_id, NULL, - NULL, 0, 0, MDS_REINT_REQ); - - /* handling possible EAs. */ - ea1 = lustre_msg_buf(msg, 1, 0); - ea1len = ea1 ? msg->buflens[1] : 0; - - ea2 = lustre_msg_buf(msg, 2, 0); - ea2len = ea2 ? 
msg->buflens[2] : 0; - - rc = md_setattr(cmobd->master_exp, op_data, &iattr, - ea1, ea1len, ea2, ea2len, NULL, 0, &req); - OBD_FREE(op_data, sizeof(*op_data)); - - if (req) - ptlrpc_req_finished(req); - RETURN(rc); -} - -static int cmobd_reint_create(struct obd_device *obd, void *record) -{ - struct cm_obd *cmobd = &obd->u.cm; - struct ptlrpc_request *req = NULL; - struct mds_kml_pack_info *mkpi; - int rc = 0, namelen, datalen; - struct mdc_op_data *op_data; - struct mds_rec_create *rec; - struct lustre_msg *msg; - char *name, *data; - ENTRY; - - mkpi = (struct mds_kml_pack_info *)record; - msg = (struct lustre_msg *)(record + sizeof(*mkpi)); - - rec = lustre_msg_buf(msg, 0, 0); - if (!rec) - RETURN(-EINVAL); - - /* getting name to be created and its length */ - name = lustre_msg_string(msg, 1, 0); - namelen = name ? msg->buflens[1] - 1 : 0; - - /* getting misc data (symlink) and its length */ - data = (char *)lustre_msg_buf(msg, 2, 0); - datalen = data ? msg->buflens[2] : 0; - - OBD_ALLOC(op_data, sizeof(*op_data)); - if (op_data == NULL) - GOTO(exit, rc = -ENOMEM); - - /* XXX: here is the issue preventing LMV from being used as master - * device for flushing cache to it. It is allusive to the fact that - * cache MDS parent id with wrong group component is used for forwarding - * reint requests to some MDS from those LMV knows about. As group is - * wrong - LMV forwards reqs to wrong MDS. Do not know how to fix it - * yet. --umka */ - - /* prepare mdc request data. */ - cmobd_prepare_mdc_data(op_data, &rec->cr_id, &rec->cr_replayid, - name, namelen, rec->cr_mode, MDS_REINT_REQ); - - /* requesting to master to create object with passed attributes. 
*/ - rc = md_create(cmobd->master_exp, op_data, data, datalen, - rec->cr_mode, current->fsuid, current->fsgid, - rec->cr_rdev, &req); - OBD_FREE(op_data, sizeof(*op_data)); -exit: - if (req) - ptlrpc_req_finished(req); - - RETURN(rc); -} - -static int cmobd_reint_unlink(struct obd_device *obd, void *record) -{ - struct cm_obd *cmobd = &obd->u.cm; - struct ptlrpc_request *req = NULL; - struct mds_kml_pack_info *mkpi; - struct mdc_op_data *op_data; - struct mds_rec_unlink *rec; - struct lustre_msg *msg; - int rc = 0, namelen; - char *name = NULL; - ENTRY; - - mkpi = (struct mds_kml_pack_info *)record; - msg = (struct lustre_msg *)(record + sizeof(*mkpi)); - - rec = lustre_msg_buf(msg, 0, 0); - if (!rec) - RETURN(-EINVAL); - - /* getting name to be created and its length */ - name = lustre_msg_string(msg, 1, 0); - namelen = name ? msg->buflens[1] - 1 : 0; - - OBD_ALLOC(op_data, sizeof(*op_data)); - if (op_data == NULL) - RETURN(-ENOMEM); - - /* prepare mdc request data. */ - cmobd_prepare_mdc_data(op_data, &rec->ul_id1, NULL, - name, namelen, rec->ul_mode, - MDS_REINT_REQ); - - rc = md_unlink(cmobd->master_exp, op_data, &req); - OBD_FREE(op_data, sizeof(*op_data)); - - if (req) - ptlrpc_req_finished(req); - RETURN(rc); -} - -static int cmobd_reint_link(struct obd_device *obd, void *record) -{ - struct cm_obd *cmobd = &obd->u.cm; - struct ptlrpc_request *req = NULL; - struct mds_kml_pack_info *mkpi; - struct mdc_op_data *op_data; - struct mds_rec_link *rec; - struct lustre_msg *msg; - int rc = 0, namelen; - char *name; - ENTRY; - - mkpi = (struct mds_kml_pack_info *)record; - msg = (struct lustre_msg *)(record + sizeof(*mkpi)); - - rec = lustre_msg_buf(msg, 0, 0); - if (!rec) - RETURN(-EINVAL); - - /* getting name to be created and its length */ - name = lustre_msg_string(msg, 1, 0); - namelen = name ? msg->buflens[1] - 1: 0; - - OBD_ALLOC(op_data, sizeof(*op_data)); - if (op_data == NULL) - RETURN(-ENOMEM); - - /* prepare mdc request data. 
*/ - cmobd_prepare_mdc_data(op_data, &rec->lk_id1, &rec->lk_id2, - name, namelen, 0, MDS_REINT_REQ); - - rc = md_link(cmobd->master_exp, op_data, &req); - OBD_FREE(op_data, sizeof(*op_data)); - - if (req) - ptlrpc_req_finished(req); - RETURN(rc); -} - -static int cmobd_reint_rename(struct obd_device *obd, void *record) -{ - struct cm_obd *cmobd = &obd->u.cm; - struct ptlrpc_request *req = NULL; - struct mds_kml_pack_info *mkpi; - struct mdc_op_data *op_data; - struct mds_rec_rename *rec; - int rc = 0, oldlen, newlen; - struct lustre_msg *msg; - char *old, *new; - ENTRY; - - mkpi = (struct mds_kml_pack_info *)record; - msg = (struct lustre_msg *)(record + sizeof(*mkpi)); - - rec = lustre_msg_buf(msg, 0, 0); - if (!rec) - RETURN(-EINVAL); - - /* getting old name and its length */ - old = lustre_msg_string(msg, 1, 0); - oldlen = old ? msg->buflens[1] - 1 : 0; - - /* getting new len and its length */ - new = lustre_msg_string(msg, 2, 0); - newlen = new ? msg->buflens[2] - 1: 0; - - OBD_ALLOC(op_data, sizeof(*op_data)); - if (op_data == NULL) - RETURN(-ENOMEM); - - /* prepare mdc request data. 
*/ - cmobd_prepare_mdc_data(op_data, &rec->rn_id1, &rec->rn_id1, - NULL, 0, 0, MDS_REINT_REQ); - - rc = md_rename(cmobd->master_exp, op_data, old, oldlen, - new, newlen, &req); - OBD_FREE(op_data, sizeof(*op_data)); - - if (req) - ptlrpc_req_finished(req); - RETURN(rc); -} - -typedef int (*cmobd_reint_rec_func_t)(struct obd_device *, void *); - -static cmobd_reint_rec_func_t mds_reint_handler[REINT_MAX + 1] = { - [REINT_SETATTR] cmobd_reint_setattr, - [REINT_CREATE] cmobd_reint_create, - [REINT_LINK] cmobd_reint_link, - [REINT_UNLINK] cmobd_reint_unlink, - [REINT_RENAME] cmobd_reint_rename, -}; - -int cmobd_reint_mds(struct obd_device *obd, void *record, int dummy) -{ - struct mds_kml_pack_info *mkpi; - struct lustre_msg *msg; - __u32 opcode; - - mkpi = (struct mds_kml_pack_info *)record; - msg = (struct lustre_msg *)(record + sizeof(*mkpi)); - - opcode = *(__u32 *)lustre_msg_buf(msg, 0, 0); - - if (opcode > REINT_MAX || opcode <= 0) { - CERROR("Invalid mds reint opcode %u\n", - opcode); - return -EINVAL; - } - - return mds_reint_handler[opcode](obd, record); -} diff --git a/lustre/cmobd/cm_obd.c b/lustre/cmobd/cm_obd.c deleted file mode 100644 index 417ef9e..0000000 --- a/lustre/cmobd/cm_obd.c +++ /dev/null @@ -1,340 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#define DEBUG_SUBSYSTEM S_CMOBD - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cm_internal.h" - -static int cmobd_attach(struct obd_device *obd, - obd_count len, void *data) -{ - struct lprocfs_static_vars lvars; - - lprocfs_init_vars(cmobd, &lvars); - return lprocfs_obd_attach(obd, lvars.obd_vars); -} - -static int cmobd_detach(struct obd_device *obd) -{ - return lprocfs_obd_detach(obd); -} - -static int cmobd_init_dt_desc(struct obd_device *obd) -{ - struct cm_obd *cmobd = &obd->u.cm; - __u32 valsize; - int rc = 0; - ENTRY; - - /* as CMOBD is stand alone device, that is has not to be connected, we - * have no other way to init EAs correctly but ask master device about - * it. Thus both, DT and MD layers should be able to answer with correct - * lov_desc. LOV knows it explicitly and LMV/MDC have to ask MDS server - * of it. 
*/ - valsize = sizeof(cmobd->master_desc); - memset(&cmobd->master_desc, 0, sizeof(cmobd->master_desc)); - - rc = obd_get_info(cmobd->master_exp, strlen("lovdesc") + 1, - "lovdesc", &valsize, &cmobd->master_desc); - RETURN(rc); -} - -static int cmobd_init_ea_size(struct obd_device *obd) -{ - int rc = 0, tgt_count, easize, cookiesize; - struct cm_obd *cmobd = &obd->u.cm; - ENTRY; - - if (!cmobd->master_exp) - RETURN(-EINVAL); - - tgt_count = cmobd->master_desc.ld_tgt_count; - - /* no EA setup is needed as there is single OST with no LOV */ - if (tgt_count == 0) - RETURN(0); - - easize = lov_mds_md_size(tgt_count); - cookiesize = tgt_count * sizeof(struct llog_cookie); - rc = obd_init_ea_size(cmobd->master_exp, easize, cookiesize); - RETURN(rc); -} - -static char *types[] = { - OBD_LMV_DEVICENAME, OBD_MDC_DEVICENAME, - OBD_LOV_DEVICENAME, OBD_OSC_DEVICENAME -}; - -static struct obd_device * -cmobd_find_obd(struct obd_device *obd, struct obd_uuid *uuid) -{ - struct obd_device *res; - int i = 0; - ENTRY; - - CWARN("%s: looking for client obd %s\n", - obd->obd_uuid.uuid, uuid->uuid); - - for (i = 0; i < sizeof(types) / sizeof(char *); i++) { - res = class_find_client_obd(NULL, types[i], uuid); - if (res) - RETURN(res); - } - RETURN(NULL); -} - -static int cmobd_setup(struct obd_device *obd, obd_count len, void *buf) -{ - struct obd_uuid master_uuid, cache_uuid; - struct lustre_handle conn = { 0 }; - struct cm_obd *cmobd = &obd->u.cm; - struct lustre_cfg* lcfg = buf; - int rc; - ENTRY; - - if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { - CERROR("%s: setup requires master device uuid\n", - obd->obd_name); - RETURN(-EINVAL); - } - - if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) { - CERROR("%s: setup requires cache device uuid\n", - obd->obd_name); - RETURN(-EINVAL); - } - - obd_str2uuid(&master_uuid, lustre_cfg_string(lcfg, 1)); - obd_str2uuid(&cache_uuid, lustre_cfg_string(lcfg, 2)); - - /* getting master obd */ - cmobd->master_obd = cmobd_find_obd(obd, &master_uuid); - if 
(!cmobd->master_obd) { - CERROR("can't find master client obd by uuid %s\n", - master_uuid.uuid); - RETURN(-EINVAL); - } - - /* getting cache obd */ - cmobd->cache_obd = class_uuid2obd(&cache_uuid); - if (cmobd->cache_obd == NULL) { - CERROR("CMOBD: unable to find obd by uuid: %s\n", - cache_uuid.uuid); - RETURN(-EINVAL); - } - - /* connecting master */ - memset(&conn, 0, sizeof(conn)); - rc = obd_connect(&conn, cmobd->master_obd, &obd->obd_uuid, - NULL, OBD_OPT_REAL_CLIENT); - if (rc) - RETURN(rc); - cmobd->master_exp = class_conn2export(&conn); - - /* connecting cache */ - memset(&conn, 0, sizeof(conn)); - rc = class_connect(&conn, cmobd->cache_obd, &obd->obd_uuid); - if (rc) - GOTO(put_master, rc); - cmobd->cache_exp = class_conn2export(&conn); - - /* initialing DT desc. Both, data and metadata layers should be able to - * serve this call. */ - rc = cmobd_init_dt_desc(obd); - if (rc != 0 && rc != -EPROTO) { - CERROR("cannot get DT layer desc from master device %s, " - "err %d.\n", cmobd->master_exp->exp_obd->obd_name, - rc); - GOTO(put_cache, rc); - } - - if (obd_dt_type(cmobd->master_exp->exp_obd)) { - /* for master dt device remove the recovery flag. 
*/ - rc = obd_set_info(cmobd->master_exp, strlen("unrecovery"), - "unrecovery", 0, NULL); - if (rc) - GOTO(put_cache, rc); - - rc = cmobd_init_write_srv(obd); - if (rc) - GOTO(put_cache, rc); - } - - if (obd_md_type(cmobd->master_exp->exp_obd)) { - __u32 size = sizeof(struct fid_extent); - struct fid_extent ext; - - rc = cmobd_init_ea_size(obd); - if (rc) { - CERROR("can't init MD layer EA size, " - "err %d\n", rc); - GOTO(put_cache, rc); - } - cmobd->write_srv = NULL; - - /* getting fid pool from master to set it on cache */ - rc = obd_get_info(cmobd->master_exp, strlen("getext"), - "getext", &size, &ext); - if (rc) { - CERROR("can't get fids extent from master, " - "err %d\n", rc); - GOTO(put_cache, rc); - } - - /* simple checks for validness */ - if (!ext.fe_start || !ext.fe_width || ext.fe_start == ext.fe_width) { - CERROR("invalid fids extent from master, ["LPD64"-"LPD64"]\n", - ext.fe_start, ext.fe_width); - GOTO(put_cache, rc = -EINVAL); - } - - CWARN("setting master fids extent ["LPD64"-"LPD64 - "] -> %s\n", ext.fe_start, ext.fe_width, - cmobd->cache_exp->exp_obd->obd_name); - - rc = obd_set_info(cmobd->cache_exp, strlen("setext"), - "setext", size, &ext); - if (rc) { - CERROR("can't set fids extent to cache, " - "err %d\n", rc); - GOTO(put_cache, rc); - } - } - - RETURN(rc); -put_cache: - class_disconnect(cmobd->cache_exp, 0); -put_master: - obd_disconnect(cmobd->master_exp, 0); - return rc; -} - -static int cmobd_cleanup(struct obd_device *obd, int flags) -{ - struct cm_obd *cmobd = &obd->u.cm; - int rc; - ENTRY; - - if (cmobd->write_srv) { - cmobd_cleanup_write_srv(obd); - cmobd->write_srv = NULL; - } - - rc = obd_disconnect(cmobd->master_exp, flags); - if (rc) { - CERROR("error disconnecting master %s, err %d\n", - cmobd->master_exp->exp_obd->obd_name, rc); - } - - rc = class_disconnect(cmobd->cache_exp, flags); - if (rc) { - CERROR("error disconnecting cache %s, err %d\n", - cmobd->cache_exp->exp_obd->obd_name, rc); - } - - RETURN(0); -} - -static int 
cmobd_iocontrol(unsigned int cmd, struct obd_export *exp, - int len, void *karg, void *uarg) -{ - struct obd_device *obd = exp->exp_obd; - int rc = 0; - ENTRY; - - switch (cmd) { - case OBD_IOC_CMOBD_SYNC: - /* here would be nice to make sure somehow that all data is in - * cache and there are no outstanding requests, as otherwise - * cache is not coherent. But how to check that from CMOBD? I do - * not know. --umka */ - rc = cmobd_reintegrate(obd); - break; - default: - CERROR("unrecognized ioctl %#x\n", cmd); - rc = -EINVAL; - break; - } - - RETURN(rc); -} - -static struct obd_ops cmobd_ops = { - .o_owner = THIS_MODULE, - .o_attach = cmobd_attach, - .o_detach = cmobd_detach, - .o_setup = cmobd_setup, - .o_cleanup = cmobd_cleanup, - .o_iocontrol = cmobd_iocontrol, -}; - -kmem_cache_t *cmobd_extent_slab; - -static int __init cmobd_init(void) -{ - struct lprocfs_static_vars lvars; - int rc; - ENTRY; - - printk(KERN_INFO "Lustre: Cache Manager OBD driver; info@clusterfs.com\n"); - - lprocfs_init_vars(cmobd, &lvars); - rc = class_register_type(&cmobd_ops, NULL, lvars.module_vars, - OBD_CMOBD_DEVICENAME); - if (rc) - RETURN(rc); - cmobd_extent_slab = kmem_cache_create("cmobd_extents", - sizeof(struct cmobd_extent_info), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if (cmobd_extent_slab == NULL) { - class_unregister_type(OBD_CMOBD_DEVICENAME); - RETURN(-ENOMEM); - } - RETURN(0); -} - -static void __exit cmobd_exit(void) -{ - class_unregister_type(OBD_CMOBD_DEVICENAME); - if (kmem_cache_destroy(cmobd_extent_slab) != 0) - CERROR("couldn't free cmobd extent slab\n"); -} - -MODULE_AUTHOR("Cluster File Systems, Inc. 
"); -MODULE_DESCRIPTION("Lustre Cache Manager OBD driver"); -MODULE_LICENSE("GPL"); - -module_init(cmobd_init); -module_exit(cmobd_exit); diff --git a/lustre/cmobd/cm_oss_reint.c b/lustre/cmobd/cm_oss_reint.c deleted file mode 100644 index c460ffb..0000000 --- a/lustre/cmobd/cm_oss_reint.c +++ /dev/null @@ -1,312 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#define DEBUG_SUBSYSTEM S_CMOBD - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cm_internal.h" - -void lov_free_memmd(struct lov_stripe_md **lsmp); - -int lov_alloc_memmd(struct lov_stripe_md **lsmp, int stripe_count, - int pattern); - -int smfs_rec_unpack(struct smfs_proc_args *args, char *record, - char **pbuf, int *opcode); - -/* helper functions for cmobd to construct pseudo lsm */ -int cmobd_dummy_lsm(struct lov_stripe_md **lsmp, int stripe_cnt, - struct obdo *oa, __u32 stripe_size) -{ - int i, rc; - ENTRY; - - rc = lov_alloc_memmd(lsmp, stripe_cnt, LOV_PATTERN_CMOBD); - if (rc < 0) - RETURN(rc); - - for (i = 0; i < stripe_cnt; i++) { - (*lsmp)->lsm_oinfo[i].loi_id = oa->o_id; - (*lsmp)->lsm_object_id = oa->o_id; - if (oa->o_valid & OBD_MD_FLGROUP) { - (*lsmp)->lsm_oinfo[i].loi_gr = oa->o_gr; - (*lsmp)->lsm_object_gr = oa->o_gr; - } - (*lsmp)->lsm_oinfo[i].loi_ost_idx = i; - (*lsmp)->lsm_oinfo[i].loi_ost_gen = 1; - (*lsmp)->lsm_stripe_size = stripe_size; - } - RETURN(0); -} - -void cmobd_free_lsm(struct lov_stripe_md **lsmp) -{ - ENTRY; - lov_free_memmd(lsmp); - EXIT; -} - -/* reintegration functions */ -static int cmobd_setattr_reint(struct obd_device *obd, void *rec) -{ - int rc = 0; - struct lov_stripe_md *lsm; - struct cm_obd *cmobd = &obd->u.cm; - struct obd_export *exp = cmobd->master_exp; - struct obdo *oa = (struct obdo *)rec; - ENTRY; - - rc = cmobd_dummy_lsm(&lsm, cmobd->master_desc.ld_tgt_count, oa, - (__u32)cmobd->master_desc.ld_default_stripe_size); - if (rc) - GOTO(out, rc); - - rc = obd_setattr(exp, oa, lsm, NULL, NULL); - - cmobd_free_lsm(&lsm); -out: - RETURN(rc); -} - -static int cmobd_create_reint(struct obd_device *obd, void *rec) -{ - struct cm_obd *cmobd = &obd->u.cm; - struct obd_export *exp = cmobd->master_exp; - struct obdo *oa = (struct obdo *)rec; - struct obd_trans_info oti = { 0 }; - struct lov_stripe_md *lsm; - int rc; - ENTRY; - - rc = 
cmobd_dummy_lsm(&lsm, cmobd->master_desc.ld_tgt_count, oa, - (__u32)cmobd->master_desc.ld_default_stripe_size); - if (rc) - GOTO(out, rc); - if (cmobd->master_group != oa->o_gr) { - int group = oa->o_gr; - int valsize = sizeof(group); - - rc = obd_set_info(exp, strlen("mds_conn"), - "mds_conn", valsize, &group); - if (rc) - GOTO(out, rc); - cmobd->master_group = oa->o_gr; - } - - oti.oti_flags |= OBD_MODE_CROW; - rc = obd_create(exp, oa, NULL, 0, &lsm, &oti); - cmobd_free_lsm(&lsm); - EXIT; -out: - return rc; -} - -/* direct cut-n-paste of filter_blocking_ast() */ -static int cache_blocking_ast(struct ldlm_lock *lock, - struct ldlm_lock_desc *desc, - void *data, int flag) -{ - int rc, do_ast; - ENTRY; - - if (flag == LDLM_CB_CANCELING) { - /* Don't need to do anything here. */ - RETURN(0); - } - - /* XXX layering violation! -phil */ - lock_res_and_lock(lock); - - /* get this: if filter_blocking_ast() is racing with ldlm_intent_policy, - * such that filter_blocking_ast is called just before l_i_p takes the - * ns_lock, then by the time we get the lock, we might not be the - * correct blocking function anymore. So check, and return early, if - * so. 
*/ - if (lock->l_blocking_ast != cache_blocking_ast) { - unlock_res_and_lock(lock); - RETURN(0); - } - - lock->l_flags |= LDLM_FL_CBPENDING; - do_ast = (!lock->l_readers && !lock->l_writers); - unlock_res_and_lock(lock); - - if (do_ast) { - struct lustre_handle lockh; - LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel"); - ldlm_lock2handle(lock, &lockh); - rc = ldlm_cli_cancel(&lockh); - if (rc < 0) - CERROR("ldlm_cli_cancel: %d\n", rc); - } else { - LDLM_DEBUG(lock, "Lock still has references, will be " - "cancelled later"); - } - RETURN(0); -} - -static int master_blocking_ast(struct ldlm_lock *lock, - struct ldlm_lock_desc *desc, - void *data, int flag) -{ - int rc; - struct lustre_handle lockh; - ENTRY; - - switch (flag) { - case LDLM_CB_BLOCKING: - ldlm_lock2handle(lock, &lockh); - rc = ldlm_cli_cancel(&lockh); - if (rc < 0) { - CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc); - RETURN(rc); - } - break; - case LDLM_CB_CANCELING: - /* do nothing here by now */ - break; - default: - LBUG(); - } - RETURN(0); -} - -static int cmobd_write_extents(struct obd_device *obd, struct obdo *oa, - struct ldlm_extent *extent) -{ - struct cm_obd *cmobd = &obd->u.cm; - struct obd_device *cache = cmobd->cache_exp->exp_obd; - struct lustre_handle lockh_src = { 0 }; - struct lustre_handle lockh_dst = { 0 }; - struct ldlm_res_id res_id; - ldlm_policy_data_t policy; - struct lov_stripe_md *lsm; - int flags = 0, err, rc = 0; - ENTRY; - - /* XXX for debug write replay without smfs and kml */ - res_id.name[0]= oa->o_id; - res_id.name[1]= oa->o_gr; - policy.l_extent = *extent; - - /* get extent read lock on the source replay file */ - rc = ldlm_cli_enqueue(NULL, NULL, cache->obd_namespace, res_id, - LDLM_EXTENT, &policy, LCK_PR, - &flags, cache_blocking_ast, ldlm_completion_ast, - NULL, NULL, NULL, 0, NULL, &lockh_src); - if (rc != ELDLM_OK) - RETURN(rc); - - /* construct the pseudo lsm */ - rc = cmobd_dummy_lsm(&lsm, cmobd->master_desc.ld_tgt_count, oa, - 
(__u32)cmobd->master_desc.ld_default_stripe_size); - if (rc) - GOTO(out_lock, rc); - - rc = obd_enqueue(cmobd->master_exp, lsm, LDLM_EXTENT, &policy, - LCK_PW, &flags, master_blocking_ast, - ldlm_completion_ast, NULL, - NULL, 0, NULL, &lockh_dst); - if (rc != ELDLM_OK) - GOTO(out_lsm, rc); - - err = cmobd_replay_write(obd, oa, &policy.l_extent); - - rc = obd_cancel(cmobd->master_exp, lsm, LCK_PW, &lockh_dst); - if (rc) - GOTO(out_lsm, rc); - - /* XXX in fact, I just want to cancel the only lockh_dst instantly. */ - rc = obd_cancel_unused(cmobd->master_exp, lsm, 0, NULL); - if (err) - rc = err; -out_lsm: - cmobd_free_lsm(&lsm); -out_lock: - ldlm_lock_decref(&lockh_src, LCK_PR); - RETURN(rc); -} - -static int cmobd_write_reint(struct obd_device *obd, void *rec) -{ - struct obdo *oa = (struct obdo *)rec; - struct cm_obd *cmobd = &obd->u.cm; - struct ldlm_extent *extent = NULL; - char *extents_buf = NULL; - struct obd_device *cache; - int rc = 0, ext_num = 0; - unsigned long csb, ino; - __u32 size = 0; - ENTRY; - - size = sizeof(csb); - obd_get_info(cmobd->cache_exp, strlen("cache_sb") + 1, - "cache_sb", &size, &csb); - - ino = *(int*)(&oa->o_inline[0]); - - cache = cmobd->cache_exp->exp_obd; - rc = fsfilt_get_ino_write_extents(cache, (struct super_block *)csb, - ino, &extents_buf, &ext_num); - if (rc) - GOTO(out, rc); - extent = (struct ldlm_extent *)extents_buf; - size = ext_num; - while (extent && size --) { - rc = cmobd_write_extents(obd, oa, extent); - if (rc) - GOTO(out, rc); - extent ++; - } -out: - if (extents_buf) - fsfilt_free_write_extents(cache, (struct super_block *)csb, - ino, extents_buf, ext_num); - RETURN(rc); -} - -int cmobd_reint_oss(struct obd_device *obd, void *record, int opcode) -{ - switch (opcode) { - case OST_CREATE: - return cmobd_create_reint(obd, record); - case OST_SETATTR: - return cmobd_setattr_reint(obd, record); - case OST_WRITE: - return cmobd_write_reint(obd, record); - default: - CERROR("unrecognized oss reint opcode %d\n", - 
opcode); - return -EINVAL; - } -} diff --git a/lustre/cmobd/cm_reint.c b/lustre/cmobd/cm_reint.c deleted file mode 100644 index ddebc7f..0000000 --- a/lustre/cmobd/cm_reint.c +++ /dev/null @@ -1,125 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#define DEBUG_SUBSYSTEM S_CMOBD - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cm_internal.h" - -#define OSS_REINT(opcode) \ -({ \ - int _opcode = (opcode); \ - \ - (_opcode == OST_CREATE || \ - _opcode == OST_SETATTR || \ - _opcode == OST_WRITE); \ -}) - -#define MDS_REINT(opcode) \ - ((opcode) == MDS_REINT) - -static int cmobd_reint_record(struct obd_device *obd, - void *record, int opcode) -{ - if (OSS_REINT(opcode)) - return cmobd_reint_oss(obd, record, opcode); - - if (MDS_REINT(opcode)) - return cmobd_reint_mds(obd, record, opcode); - - CERROR("unrecognized reint opcode %d\n", opcode); - return -EINVAL; -} - -static int cmobd_reint_cb(struct llog_handle *llh, - struct llog_rec_hdr *rec, - void *data) -{ - struct obd_device *obd = (struct obd_device*)data; - char *buf, *pbuf; - int rc = 0, opcode; - - ENTRY; - - if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) { - CERROR("log is not plain log\n"); - RETURN(-EINVAL); - } - - if (rec->lrh_type != SMFS_UPDATE_REC) - RETURN(-EINVAL); - - buf = (char *)(rec + 1); - rc = smfs_rec_unpack(NULL, buf, &pbuf, &opcode); - if (rc) - GOTO(out, rc); - - rc = cmobd_reint_record(obd, pbuf, opcode); - if (rc) - GOTO(out, rc); - - /* delete this record. */ - rc = LLOG_DEL_RECORD; -out: - RETURN(rc); -} - -int cmobd_reintegrate(struct obd_device *obd) -{ - struct cm_obd *cmobd = &obd->u.cm; - struct llog_ctxt *ctxt = NULL; - struct llog_handle *llh; - __u32 val_size; - int rc = 0; - ENTRY; - - /* XXX just fetch the reintegration log context from - * cache ost directly, use logid later ?? */ - val_size = sizeof(ctxt); - rc = obd_get_info(cmobd->cache_exp, strlen("reint_log") + 1, - "reint_log", &val_size, &ctxt); - if (rc) - RETURN(rc); - - /* use the already opened log handle instead of reopen a new log - * handle */ - llh = ctxt ? 
ctxt->loc_handle : NULL; - if (llh == NULL) { - CERROR("reint log is not found, wrong fstype " - "or smfs plugin is used.\n"); - RETURN(-EINVAL); - } - - /* FIXME: should we insert a LLOG_GEN_REC before process log? */ - rc = llog_cat_process(llh, (llog_cb_t)cmobd_reint_cb, obd); - RETURN(rc); -} diff --git a/lustre/cmobd/cm_write.c b/lustre/cmobd/cm_write.c deleted file mode 100644 index aa6bf81..0000000 --- a/lustre/cmobd/cm_write.c +++ /dev/null @@ -1,743 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#define DEBUG_SUBSYSTEM S_CMOBD - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "cm_internal.h" - -extern kmem_cache_t *cmobd_extent_slab; - -/* helper function to split an extent */ -static obd_count split_extent(struct ldlm_extent *ext, unsigned long interval) -{ - obd_count buf_count, remainder; - ENTRY; - - buf_count = ext->end - ext->start + 1; - LASSERT(buf_count > 0); - - remainder = do_div(buf_count, interval); - if (remainder) - buf_count++; - - RETURN(buf_count); -} - -static int cmobd_ap_make_ready(void *data, int cmd) -{ - struct cmobd_async_page *cmap = (struct cmobd_async_page *)data; - struct page *page = cmap->cmap_page; - ENTRY; - - if (cmd == OBD_BRW_READ) - RETURN(0); - - if (TryLockPage(page)) - RETURN(-EAGAIN); - - RETURN(0); -} - -static int cmobd_ap_refresh_count(void *data, int cmd) -{ - struct cmobd_async_page *cmap = (struct cmobd_async_page *)data; - struct page *page = cmap->cmap_page; - struct inode *inode = page->mapping->host; - ENTRY; - - LASSERT(cmd != OBD_BRW_READ); - - /* catch race with truncate */ - if (((loff_t)page->index << PAGE_SHIFT) >= inode->i_size) - RETURN(0); - - /* catch sub-page write at end of file */ - if (((loff_t)page->index << PAGE_SHIFT) + PAGE_SIZE > inode->i_size) - RETURN(inode->i_size % PAGE_SIZE); - - RETURN(PAGE_SIZE); -} - -static void cmobd_ap_fill_obdo(void *data, int cmd, struct obdo *oa) -{ - struct cmobd_async_page *cmap = (struct cmobd_async_page *)data; - obd_valid valid_flags; - struct inode *inode; - ENTRY; - - if (IS_ERR(cmap)) { - EXIT; - return; - } - - inode = cmap->cmap_page->mapping->host; - oa->o_id = cmap->cmap_es->es_oa.o_id; - oa->o_gr = cmap->cmap_es->es_oa.o_gr; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; - valid_flags = OBD_MD_FLTYPE | OBD_MD_FLATIME; - if (cmd == OBD_BRW_WRITE) { - oa->o_valid |= OBD_MD_FLIFID; - - /* FIXME-UMKA: should be here some mds num and mds id? 
*/ - mdc_pack_id(obdo_id(oa), inode->i_ino, 0, - inode->i_mode, 0, 0); - valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME; - } - - obdo_from_inode(oa, inode, valid_flags); - - EXIT; - return; -} - -static void cmobd_ap_completion(void *data, int cmd, struct obdo *oa, int rc) -{ - struct cmobd_async_page *cmap = (struct cmobd_async_page *)data; - struct cmobd_extent_set *set = cmap->cmap_es; - unsigned long flags; - struct page *page; - int wakeup = 0; - ENTRY; - - page = cmap->cmap_page; - LASSERT(PageLocked(page)); - - /* XXX */ - if (rc) - SetPageError(page); - - spin_lock_irqsave(&set->es_lock, flags); - LASSERT(!list_empty(&set->es_pages)); - LASSERT(!list_empty(&cmap->cmap_link)); - - list_del_init(&cmap->cmap_link); - if (list_empty(&set->es_pages) && !set->es_count) - wakeup = 1; - spin_unlock_irqrestore(&set->es_lock, flags); - - obd_teardown_async_page(set->es_exp, set->es_lsm, NULL, - cmap->cmap_cookie); - OBD_FREE(cmap, sizeof(*cmap)); - - unlock_page(page); - page_cache_release(page); - - if (wakeup) - wake_up(&set->es_waitq); - EXIT; - return; -} - -static struct obd_async_page_ops cmobd_async_page_ops = { - .ap_make_ready = cmobd_ap_make_ready, - .ap_refresh_count = cmobd_ap_refresh_count, - .ap_fill_obdo = cmobd_ap_fill_obdo, - .ap_completion = cmobd_ap_completion, -}; - -static int cmobd_send_pages(struct obd_device *obd, - struct niobuf_local *lnb, - obd_count oa_bufs, - struct cmobd_extent_set *set) -{ - struct cm_obd *cmobd = &obd->u.cm; - struct obd_export *exp = cmobd->master_exp; - struct cmobd_async_page *cmap = NULL; - obd_count i; - int rc = 0; - unsigned long flags; - ENTRY; - - for (i = 0; i < oa_bufs; i++, lnb++) { - - OBD_ALLOC(cmap, sizeof(*cmap)); - if (cmap == NULL) { - CERROR("Not enought memory\n"); - rc = -ENOMEM; - break; - } - INIT_LIST_HEAD(&cmap->cmap_link); - cmap->cmap_page = lnb->page; - cmap->cmap_es = set; - - rc = obd_prep_async_page(exp, set->es_lsm, NULL, lnb->page, - lnb->offset, &cmobd_async_page_ops, - cmap, 
&cmap->cmap_cookie); - if (rc) { - CERROR("cmobd prep async page failed page(%p) rc(%d)\n", - lnb->page, rc); - OBD_FREE(cmap, sizeof(*cmap)); - break; - } - - LASSERT(cmap->cmap_page); - LASSERT(!PageLocked(cmap->cmap_page)); - LASSERT(Page_Uptodate(cmap->cmap_page)); - page_cache_get(cmap->cmap_page); - - spin_lock_irqsave(&set->es_lock, flags); - list_add_tail(&cmap->cmap_link, &set->es_pages); - spin_unlock_irqrestore(&set->es_lock, flags); - - rc = obd_queue_async_io(exp, set->es_lsm, NULL, cmap->cmap_cookie, - OBD_BRW_WRITE, 0, 0, 0, 0); - if (rc) { /* try sync io */ - struct obd_io_group *oig; - - spin_lock_irqsave(&set->es_lock, flags); - list_del_init(&cmap->cmap_link); - spin_unlock_irqrestore(&set->es_lock, flags); - - lock_page(cmap->cmap_page); - - rc = oig_init(&oig); - if (rc) - GOTO(free_page, rc); - - rc = obd_queue_group_io(exp, set->es_lsm, NULL, oig, - cmap->cmap_cookie, - OBD_BRW_WRITE, 0, lnb->len, 0, - ASYNC_READY | ASYNC_URGENT | - ASYNC_COUNT_STABLE | - ASYNC_GROUP_SYNC); - - if (rc) - GOTO(free_oig, rc); - - rc = obd_trigger_group_io(exp, set->es_lsm, NULL, oig); - if (rc) - GOTO(free_oig, rc); - - rc = oig_wait(oig); -free_oig: - oig_release(oig); -free_page: - unlock_page(cmap->cmap_page); - page_cache_release(cmap->cmap_page); - obd_teardown_async_page(exp, set->es_lsm, NULL, - cmap->cmap_cookie); - OBD_FREE(cmap, sizeof(*cmap)); - if (rc) { - CERROR("cmobd sync io failed\n"); - break; - } - } - } - RETURN(rc); -} - -static int cmobd_write_extent(struct obd_device *obd, - struct cmobd_extent_info *ei) -{ - struct cmobd_extent_set *set = ei->ei_set; - struct cm_obd *cmobd = &obd->u.cm; - unsigned long flags; - struct obd_ioobj ioo; - struct niobuf_local *lnb; - struct niobuf_remote *rnb; - obd_count i, oa_bufs; - struct obdo *oa; - obd_off offset; - int ret, rc = 0, wakeup = 0; - ENTRY; - - oa_bufs = split_extent(&ei->ei_extent, PAGE_SIZE); - LASSERT(oa_bufs > 0); - - OBD_ALLOC(lnb, oa_bufs * sizeof(struct niobuf_local)); - 
OBD_ALLOC(rnb, oa_bufs * sizeof(struct niobuf_remote)); - oa = obdo_alloc(); - - if (lnb == NULL || rnb == NULL || oa == NULL) - GOTO(out, rc = -ENOMEM); - - LASSERT(ei->ei_extent.end >= ei->ei_extent.start); - LASSERT((ei->ei_extent.start & (PAGE_SIZE -1)) == 0); - - for (i = 0, offset = ei->ei_extent.start; i < oa_bufs; - i++, offset += PAGE_SIZE) { - rnb[i].offset = offset; - rnb[i].len = MIN(PAGE_SIZE, ei->ei_extent.end - offset + 1); - } - - memcpy(oa, &set->es_oa, sizeof(*oa)); - obdo_to_ioobj(oa, &ioo); - ioo.ioo_bufcnt = oa_bufs; - - ret = obd_preprw(OBD_BRW_READ, cmobd->cache_exp, oa, 1, &ioo, - oa_bufs, rnb, lnb, NULL, NULL); - if (ret) - GOTO(out, rc = ret); - - rc = cmobd_send_pages(obd, lnb, oa_bufs, set); - if (rc) - CERROR("cmobd_send_pages failed %d\n", rc); - - rc = obd_commitrw(OBD_BRW_READ, cmobd->cache_exp, oa, 1, &ioo, - oa_bufs, lnb, NULL, ret); - - /* countdown and wake up */ - spin_lock_irqsave(&set->es_lock, flags); - LASSERT(set->es_count); - set->es_count--; - if (!set->es_count) - wakeup = 1; - spin_unlock_irqrestore(&set->es_lock, flags); - - if (wakeup) - wake_up(&set->es_waitq); - - EXIT; -out: - if (lnb) - OBD_FREE(lnb, oa_bufs * sizeof(struct niobuf_local)); - if (rnb) - OBD_FREE(rnb, oa_bufs * sizeof(struct niobuf_remote)); - if (oa) - obdo_free(oa); - - return rc; -} - -static struct cmobd_extent_info* get_next_ei(struct cmobd_write_service *ws) -{ - struct cmobd_extent_info *ei = NULL; - unsigned long flags; - int wakeup = 0; - - spin_lock_irqsave(&ws->ws_extent_lock, flags); - if (!list_empty(&ws->ws_extents)) { - ei = list_entry(ws->ws_extents.next, - struct cmobd_extent_info, ei_link); - list_del_init(&ei->ei_link); - ws->ws_nextents--; - if (ws->ws_nextents < CMOBD_MAX_EXTENTS) - wakeup = 1; - } - spin_unlock_irqrestore(&ws->ws_extent_lock, flags); - - if (wakeup) - wake_up_all(&ws->ws_waitq_provider); - - return ei; -} - -static int cmobd_write_main(void *arg) -{ - struct ptlrpc_svc_data *data = (struct ptlrpc_svc_data 
*)arg; - struct ptlrpc_thread *thread = data->thread; - struct obd_device *obd = data->dev; - struct cm_obd *cmobd = &obd->u.cm; - struct cmobd_write_service *ws = cmobd->write_srv; - struct cmobd_extent_info *extent = NULL; - unsigned long flags; - int rc; - ENTRY; - - lock_kernel(); - ptlrpc_daemonize(); - - SIGNAL_MASK_LOCK(current, flags); - sigfillset(¤t->blocked); - RECALC_SIGPENDING; - SIGNAL_MASK_UNLOCK(current, flags); - - LASSERTF(strlen(data->name) < sizeof(current->comm), - "name %d > len %d\n",strlen(data->name),sizeof(current->comm)); - THREAD_NAME(current->comm, sizeof(current->comm) - 1, "%s", data->name); - - unlock_kernel(); - - thread->t_flags = SVC_RUNNING; - wake_up(&thread->t_ctl_waitq); - - /* Record that the thread is running */ - spin_lock_irqsave(&ws->ws_thread_lock, flags); - ws->ws_nthreads++; - spin_unlock_irqrestore(&ws->ws_thread_lock, flags); - - while ((thread->t_flags & SVC_STOPPING) == 0) { - struct l_wait_info lwi = { 0 }; - - l_wait_event_exclusive(ws->ws_waitq_consumer, - ((thread->t_flags & SVC_STOPPING) || - ((extent = get_next_ei(ws)) != - NULL)), - &lwi); - if (extent == NULL) - continue; - rc = cmobd_write_extent(obd, extent); - if (rc) - CERROR("write extent failed rc=%d\n", rc); - OBD_SLAB_FREE(extent, cmobd_extent_slab, sizeof(*extent)); - extent = NULL; - } - - thread->t_flags = SVC_STOPPED; - wake_up(&thread->t_ctl_waitq); - - spin_lock_irqsave(&ws->ws_thread_lock, flags); - ws->ws_nthreads--; /* must know immediately */ - spin_unlock_irqrestore(&ws->ws_thread_lock, flags); - - RETURN(0); -} - -/* functions for manipulating cmobd write replay threads, similar with - * ptlrpc threads functions */ -static int cmobd_start_thread(struct obd_device *obd, char *name) -{ - struct cm_obd *cmobd = &obd->u.cm; - struct cmobd_write_service *ws = cmobd->write_srv; - struct l_wait_info lwi = { 0 }; - struct ptlrpc_svc_data d; - struct ptlrpc_thread *thread; - unsigned long flags; - int rc; - ENTRY; - - OBD_ALLOC(thread, 
sizeof(*thread)); - if (thread == NULL) - RETURN(-ENOMEM); - init_waitqueue_head(&thread->t_ctl_waitq); - - d.dev = obd; - d.svc = NULL; - d.name = name; - d.thread = thread; - - spin_lock_irqsave(&ws->ws_thread_lock, flags); - list_add(&thread->t_link, &ws->ws_threads); - spin_unlock_irqrestore(&ws->ws_thread_lock, flags); - - /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we - * just drop the VM and FILES in ptlrpc_daemonize() right away. - */ - rc = kernel_thread(cmobd_write_main, &d, CLONE_VM | CLONE_FILES); - if (rc < 0) { - CERROR("cannot start thread: %d\n", rc); - spin_lock_irqsave(&ws->ws_thread_lock, flags); - list_del_init(&thread->t_link); - spin_unlock_irqrestore(&ws->ws_thread_lock, flags); - OBD_FREE(thread, sizeof(*thread)); - RETURN(rc); - } - l_wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_RUNNING, &lwi); - - RETURN(0); - -} - -static void cmobd_stop_thread(struct obd_device *obd, - struct ptlrpc_thread *thread) -{ - struct cm_obd *cmobd = &obd->u.cm; - struct cmobd_write_service *ws = cmobd->write_srv; - struct l_wait_info lwi = { 0 }; - unsigned long flags; - ENTRY; - - thread->t_flags = SVC_STOPPING; - wake_up_all(&ws->ws_waitq_consumer); - - l_wait_event(thread->t_ctl_waitq, (thread->t_flags & SVC_STOPPED), - &lwi); - - spin_lock_irqsave(&ws->ws_thread_lock, flags); - list_del(&thread->t_link); - spin_unlock_irqrestore(&ws->ws_thread_lock, flags); - - OBD_FREE(thread, sizeof(*thread)); - EXIT; -} - -static void cmobd_stop_all_threads(struct obd_device *obd) -{ - struct cm_obd *cmobd = &obd->u.cm; - struct cmobd_write_service *ws = cmobd->write_srv; - unsigned long flags; - struct ptlrpc_thread *thread; - ENTRY; - - spin_lock_irqsave(&ws->ws_thread_lock, flags); - while (!list_empty(&ws->ws_threads)) { - thread = list_entry(ws->ws_threads.next, - struct ptlrpc_thread, t_link); - - spin_unlock_irqrestore(&ws->ws_thread_lock, flags); - cmobd_stop_thread(obd, thread); - spin_lock_irqsave(&ws->ws_thread_lock, flags); - } - 
- spin_unlock_irqrestore(&ws->ws_thread_lock, flags); - EXIT; -} - -static int cmobd_start_n_threads(struct obd_device *obd, int num_threads, - char *base_name) -{ - int i, rc = 0; - ENTRY; - - for (i = 0; i < num_threads; i++) { - char name[32]; - snprintf(name, sizeof(name) - 1, "%s_%02d", base_name, i); - rc = cmobd_start_thread(obd, name); - if (rc) { - CERROR("cannot start %s thread #%d: rc %d\n", base_name, - i, rc); - cmobd_stop_all_threads(obd); - } - } - RETURN(rc); -} - -void cmobd_cleanup_write_srv(struct obd_device *obd) -{ - struct cm_obd *cmobd = &obd->u.cm; - struct list_head *pos, *n; - struct cmobd_extent_info *ei; - ENTRY; - - cmobd_stop_all_threads(obd); - - list_for_each_safe(pos, n, &cmobd->write_srv->ws_extents) { - ei = list_entry(pos, struct cmobd_extent_info, ei_link); - list_del_init(&ei->ei_link); - OBD_FREE(ei, sizeof(*ei)); - } - OBD_FREE(cmobd->write_srv, sizeof(*cmobd->write_srv)); - EXIT; -} - -int cmobd_init_write_srv(struct obd_device *obd) -{ - struct cm_obd *cmobd = &obd->u.cm; - struct cmobd_write_service *ws; - int rc; - ENTRY; - - OBD_ALLOC(cmobd->write_srv, sizeof(*cmobd->write_srv)); - if (cmobd->write_srv == NULL) - RETURN(-ENOMEM); - ws = cmobd->write_srv; - - INIT_LIST_HEAD(&ws->ws_threads); - spin_lock_init(&ws->ws_thread_lock); - ws->ws_nthreads = 0; - - INIT_LIST_HEAD(&ws->ws_extents); - spin_lock_init(&ws->ws_extent_lock); - ws->ws_nextents = 0; - init_waitqueue_head(&ws->ws_waitq_provider); - init_waitqueue_head(&ws->ws_waitq_consumer); - - rc = cmobd_start_n_threads(obd, CMOBD_NUM_THREADS, "cm_write"); - if (rc) - cmobd_cleanup_write_srv(obd); - - RETURN(rc); -} - -static int extent_queue_full(struct cmobd_write_service *ws) -{ - unsigned long flags; - int full = 0; - - spin_lock_irqsave(&ws->ws_extent_lock, flags); - full = (ws->ws_nextents >= CMOBD_MAX_EXTENTS) ? 
1 : 0; - spin_unlock_irqrestore(&ws->ws_extent_lock, flags); - - return full; -} - -static void cmobd_queue_extent(struct obd_device *obd, - struct cmobd_extent_info *ex) -{ - struct cm_obd *cmobd = &obd->u.cm; - struct cmobd_write_service *ws = cmobd->write_srv; - struct cmobd_extent_set *set = ex->ei_set; - unsigned long flags; - struct l_wait_info lwi = { 0 }; - ENTRY; - -wait: - l_wait_event(ws->ws_waitq_provider, !extent_queue_full(ws), &lwi); - - spin_lock_irqsave(&ws->ws_extent_lock, flags); - if (ws->ws_nextents >= CMOBD_MAX_EXTENTS) { - spin_unlock_irqrestore(&ws->ws_extent_lock, flags); - goto wait; - } - list_add_tail(&ex->ei_link, &ws->ws_extents); - ws->ws_nextents++; - spin_unlock_irqrestore(&ws->ws_extent_lock, flags); - - spin_lock_irqsave(&set->es_lock, flags); - set->es_count++; - spin_unlock_irqrestore(&set->es_lock, flags); - - wake_up_all(&ws->ws_waitq_consumer); - - EXIT; -} - -static obd_size cmobd_id2size(struct obd_export *exp, obd_id id, obd_gr grp) -{ - struct lvfs_run_ctxt saved; - struct dentry *de = NULL; - obd_size size; - ENTRY; - - push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL); - - de = obd_lvfs_id2dentry(exp, id, 0, grp); - LASSERT(de); - - size = de->d_inode->i_size; - - dput(de); - pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL); - - RETURN(size); -} - -static int extent_set_done(struct cmobd_extent_set *set, int phase) -{ - int done = 0; - unsigned long flags; - - spin_lock_irqsave(&set->es_lock, flags); - if (phase == 1) - done = set->es_count ? 0 : 1; - else if (phase == 2) - done = (!set->es_count && list_empty(&set->es_pages)) ? 
1 : 0; - spin_unlock_irqrestore(&set->es_lock, flags); - - return done; -} - -int cmobd_replay_write(struct obd_device *obd, struct obdo *oa, - struct ldlm_extent *ext) -{ - struct cm_obd *cmobd = &obd->u.cm; - struct lov_stripe_md *lsm = NULL; - struct cmobd_extent_set set; - struct cmobd_extent_info *ex; - struct l_wait_info lwi = { 0 }; - struct list_head *pos, *n; - struct cmobd_async_page *cmap; - unsigned long flags; - obd_count i, buf_count; - obd_off start; - int rc = 0; - ENTRY; - - rc = cmobd_dummy_lsm(&lsm, cmobd->master_desc.ld_tgt_count, oa, - (__u32)cmobd->master_desc.ld_default_stripe_size); - if (rc) - RETURN(-ENOMEM); - - set.es_extent.start = ext->start; - set.es_extent.end = ext->end; - set.es_lsm = lsm; - set.es_exp = cmobd->master_exp; - set.es_ext_sz = CMOBD_MAX_EXTENT_SZ; - set.es_count = 0; - memcpy(&set.es_oa, oa, sizeof(*oa)); - - INIT_LIST_HEAD(&set.es_pages); - spin_lock_init(&set.es_lock); - init_waitqueue_head(&set.es_waitq); - - if (set.es_extent.end < set.es_extent.start) { - CDEBUG(D_HA, "illegal extent in write replay\n"); - GOTO(out, rc = -EINVAL); - } - /* start of extent is extended to page boundaries */ - set.es_extent.start -= set.es_extent.start & ~PAGE_MASK; - /* if the end of extent is EOF, set it as file size */ - if (set.es_extent.end == OBD_OBJECT_EOF) { - set.es_extent.end = cmobd_id2size(cmobd->cache_exp, - oa->o_id, oa->o_gr) - 1; - if (set.es_extent.end <= 0) - GOTO(out, rc = 0); - } - - buf_count = split_extent(&set.es_extent, set.es_ext_sz); - for (i = 0, start = set.es_extent.start; i < buf_count; - i++, start += set.es_ext_sz) { - OBD_SLAB_ALLOC(ex, cmobd_extent_slab, SLAB_NOFS, sizeof(*ex)); - if (ex == NULL) { - CERROR("not enough memory\n"); - break; - } - - INIT_LIST_HEAD(&ex->ei_link); - ex->ei_set = &set; - ex->ei_extent.start = start; - ex->ei_extent.end = start + set.es_ext_sz - 1; - if (ex->ei_extent.end > set.es_extent.end) - ex->ei_extent.end = set.es_extent.end; - - cmobd_queue_extent(obd, ex); - } - 
- l_wait_event(set.es_waitq, extent_set_done(&set, 1), &lwi); - - /* fire remaining ios */ - spin_lock_irqsave(&set.es_lock, flags); - list_for_each_safe (pos, n, &set.es_pages) { - cmap = list_entry(pos, struct cmobd_async_page, cmap_link); - - /* locked pages are in flight */ - if (PageLocked(cmap->cmap_page)) - continue; - - spin_unlock_irqrestore(&set.es_lock, flags); - rc = obd_set_async_flags(set.es_exp, set.es_lsm, NULL, - cmap->cmap_cookie, - ASYNC_URGENT); - if (rc) - CERROR("cmobd set async flags failed\n"); - spin_lock_irqsave(&set.es_lock, flags); - break; - } - spin_unlock_irqrestore(&set.es_lock, flags); - - l_wait_event(set.es_waitq, extent_set_done(&set, 2), &lwi); -out: - cmobd_free_lsm(&lsm); - RETURN(rc); -} diff --git a/lustre/cmobd/lproc_cm.c b/lustre/cmobd/lproc_cm.c deleted file mode 100644 index f5c6efc..0000000 --- a/lustre/cmobd/lproc_cm.c +++ /dev/null @@ -1,34 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - */ -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include - -#ifndef LPROCFS -static struct lprocfs_vars lprocfs_module_vars[] = { {0} }; -static struct lprocfs_vars lprocfs_obd_vars[] = { {0} }; -#else -static struct lprocfs_vars lprocfs_module_vars[] = { {0} }; -static struct lprocfs_vars lprocfs_obd_vars[] = { {0} }; -#endif /* LPROCFS */ -LPROCFS_INIT_VARS(cmobd, lprocfs_module_vars, lprocfs_obd_vars) diff --git a/lustre/cobd/.cvsignore b/lustre/cobd/.cvsignore deleted file mode 100644 index 642e2e6..0000000 --- a/lustre/cobd/.cvsignore +++ /dev/null @@ -1,10 +0,0 @@ -.deps -Makefile -autoMakefile.in -autoMakefile -*.ko -*.mod.c -.*.cmd -.*.flags -.tmp_versions -.depend diff --git a/lustre/cobd/Makefile.in b/lustre/cobd/Makefile.in deleted file mode 100644 index 4f10283..0000000 --- a/lustre/cobd/Makefile.in +++ /dev/null @@ -1,4 +0,0 @@ -MODULES := cobd -cobd-objs := cache_obd.o lproc_cache.o - -@INCLUDE_RULES@ diff --git a/lustre/cobd/autoMakefile.am b/lustre/cobd/autoMakefile.am deleted file mode 100644 index aa9ca4c..0000000 --- a/lustre/cobd/autoMakefile.am +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (C) 2002 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -if MODULES -modulefs_DATA := cobd$(KMODEXT) -endif - -DIST_SOURCES = $(cobd-objs:.o=.c) -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ diff --git a/lustre/cobd/cache_obd.c b/lustre/cobd/cache_obd.c deleted file mode 100644 index a2b1d17..0000000 --- a/lustre/cobd/cache_obd.c +++ /dev/null @@ -1,1723 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#define DEBUG_SUBSYSTEM S_COBD - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int cobd_attach(struct obd_device *obd, - obd_count len, void *buf) -{ - struct lprocfs_static_vars lvars; - int rc = 0; - ENTRY; - - lprocfs_init_vars(cobd, &lvars); - rc = lprocfs_obd_attach(obd, lvars.obd_vars); - - RETURN(rc); -} - -static int cobd_detach(struct obd_device *obd) -{ - ENTRY; - RETURN(lprocfs_obd_detach(obd)); -} - -static int cobd_setup(struct obd_device *obd, obd_count len, void *buf) -{ - struct lustre_cfg *lcfg = (struct lustre_cfg *)buf; - int inst_len = 0, mname_len = 0, cname_len = 0; - struct obd_device *master_obd, *cache_obd; - struct cache_obd *cobd = &obd->u.cobd; - struct lustre_handle conn = { 0 }; - int rc = 0; - ENTRY; - - sema_init(&cobd->sem, 1); - - if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || - lustre_cfg_buf(lcfg, 1) == NULL) { - CERROR("%s: setup requires master device name\n", - obd->obd_name); - RETURN(-EINVAL); - } - - if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1 || - lustre_cfg_buf(lcfg, 2) == NULL) { - CERROR("%s: setup requires cache device name\n", - obd->obd_name); - RETURN(-EINVAL); - } - inst_len = LUSTRE_CFG_BUFLEN(lcfg, 3); - - if (inst_len) { - LASSERT(lustre_cfg_buf(lcfg, 3) != NULL); - mname_len = LUSTRE_CFG_BUFLEN(lcfg, 1) + inst_len; - cname_len = LUSTRE_CFG_BUFLEN(lcfg, 2) + inst_len; - } else { - mname_len = LUSTRE_CFG_BUFLEN(lcfg, 1); - cname_len = LUSTRE_CFG_BUFLEN(lcfg, 2); - } - - /* get the cache obd name and master 
name */ - OBD_ALLOC(cobd->master_name, mname_len); - if (!cobd->master_name) - RETURN(-ENOMEM); - if(inst_len) - sprintf(cobd->master_name, "%s-%s", lustre_cfg_string(lcfg, 1), - lustre_cfg_string(lcfg, 3)); - else - sprintf(cobd->master_name, "%s", lustre_cfg_string(lcfg, 1)); - - OBD_ALLOC(cobd->cache_name, cname_len); - if (!cobd->cache_name) { - OBD_FREE(cobd->master_name, mname_len); - RETURN(-ENOMEM); - } - if (inst_len) - sprintf(cobd->cache_name, "%s-%s", lustre_cfg_string(lcfg, 2), - lustre_cfg_string(lcfg, 3)); - else - sprintf(cobd->cache_name, "%s", lustre_cfg_string(lcfg, 2)); - - CDEBUG(D_INFO, "master name %s cache name %s\n", cobd->master_name, - cobd->cache_name); - - /* getting master obd */ - master_obd = class_name2obd(cobd->master_name); - if (!master_obd) { - CERROR("can't find master obd by name %s\n", - cobd->master_name); - GOTO(put_names, rc = -EINVAL); - } - - /* connecting master */ - memset(&conn, 0, sizeof(conn)); - rc = class_connect(&conn, master_obd, &obd->obd_uuid); - if (rc) - GOTO(put_names, rc); - - cobd->master_exp = class_conn2export(&conn); - - /* getting cache obd */ - cache_obd = class_name2obd(cobd->cache_name); - if (!cache_obd) { - class_disconnect(cobd->master_exp, 0); - CERROR("can't find cache obd by name %s\n", - cobd->cache_name); - GOTO(put_names, rc = -EINVAL); - } - - /* connecting cache */ - memset(&conn, 0, sizeof(conn)); - rc = class_connect(&conn, cache_obd, &obd->obd_uuid); - if (rc) { - class_disconnect(cobd->master_exp, 0); - GOTO(put_names, rc); - } - cobd->cache_exp = class_conn2export(&conn); - - /* default set cache on, but nothing is realy connected yet, will be - * done in cobd_connect() time. */ - cobd->cache_on = 1; - - /* nothing is connected, make exports reflect this state to not confuse - * cobd_switch() later. 
*/ - cobd->cache_real_exp = NULL; - cobd->master_real_exp = NULL; - - EXIT; -put_names: - if (rc) { - if (cobd->master_name) { - OBD_FREE(cobd->master_name, LUSTRE_CFG_BUFLEN(lcfg, 1)); - cobd->master_name = NULL; - } - if (cobd->cache_name) { - OBD_FREE(cobd->cache_name, LUSTRE_CFG_BUFLEN(lcfg, 2)); - cobd->cache_name = NULL; - } - } - return rc; -} - -static int cobd_cleanup(struct obd_device *obd, int flags) -{ - struct cache_obd *cobd = &obd->u.cobd; - int rc = 0; - ENTRY; - - if (!list_empty(&obd->obd_exports)) - RETURN(-EBUSY); - - if (cobd->cache_name) - OBD_FREE(cobd->cache_name, - strlen(cobd->cache_name) + 1); - if (cobd->master_name) - OBD_FREE(cobd->master_name, - strlen(cobd->master_name) + 1); - - rc = class_disconnect(cobd->master_exp, flags); - if (rc) { - CERROR("error disconnecting master, err %d\n", - rc); - } - rc = class_disconnect(cobd->cache_exp, flags); - if (rc) { - CERROR("error disconnecting master, err %d\n", - rc); - } - - RETURN(0); -} - -static inline struct obd_export * -cobd_get_exp(struct obd_device *obd) -{ - struct cache_obd *cobd = &obd->u.cobd; - ENTRY; - - if (cobd->cache_on) { - CDEBUG(D_TRACE, "get cache exp %p \n", cobd->cache_exp); - if (cobd->cache_real_exp) - RETURN(cobd->cache_real_exp); - RETURN(cobd->cache_exp); - } - - CDEBUG(D_TRACE, "get master exp %p \n", cobd->master_exp); - if (cobd->master_real_exp) - RETURN(cobd->master_real_exp); - RETURN(cobd->master_exp); -} - -static int cobd_init_dt_desc(struct obd_device *obd) -{ - struct cache_obd *cobd = &obd->u.cobd; - struct obd_export *cobd_exp; - __u32 valsize; - int rc = 0; - ENTRY; - - valsize = sizeof(cobd->dt_desc); - memset(&cobd->dt_desc, 0, sizeof(cobd->dt_desc)); - - cobd_exp = cobd_get_exp(obd); - rc = obd_get_info(cobd_exp, strlen("lovdesc") + 1, - "lovdesc", &valsize, &cobd->dt_desc); - RETURN(rc); -} - -static int cobd_init_ea_size(struct obd_device *obd) -{ - int rc = 0, tgt_count, easize, cookiesize; - struct cache_obd *cobd = &obd->u.cobd; - struct 
obd_export *cobd_exp; - ENTRY; - - tgt_count = cobd->dt_desc.ld_tgt_count; - - /* no EA setup is needed as there is single OST with no LOV */ - if (tgt_count == 0) - RETURN(0); - - cobd_exp = cobd_get_exp(obd); - easize = lov_mds_md_size(tgt_count); - cookiesize = tgt_count * sizeof(struct llog_cookie); - rc = obd_init_ea_size(cobd_exp, easize, cookiesize); - RETURN(rc); -} - -static int -cobd_connect_client(struct obd_device *obd, - struct obd_export *exp, - struct lustre_handle *conn, - struct obd_connect_data *data, - unsigned long flags) -{ - struct obd_device *cli_obd; - int rc = 0; - ENTRY; - - LASSERT(obd); - LASSERT(conn); - - cli_obd = class_exp2obd(exp); - if (cli_obd == NULL) - RETURN(-EINVAL); - - rc = obd_connect(conn, cli_obd, &obd->obd_uuid, - data, flags); - if (rc) - CERROR("error connecting err %d\n", rc); - - RETURN(rc); -} - -static int -cobd_disconnect_client(struct obd_device *obd, - struct obd_export *exp, - unsigned long flags) -{ - struct obd_device *cli_obd; - int rc = 0; - ENTRY; - - cli_obd = class_exp2obd(exp); - cli_obd->obd_no_recov = obd->obd_no_recov; - - rc = obd_disconnect(exp, flags); - if (rc) { - CERROR("error disconnecting from %s, err %d\n", - cli_obd->obd_name, rc); - class_export_put(exp); - } - RETURN(rc); -} - -#define COBD_CONNECT (1 << 0) -#define COBD_DISCON (1 << 1) -#define COBD_SWITCH (1 << 2) - -/* magic function for switching cobd between two exports cache and master in - * strong correspondence with passed @cache_on. It also may perform partial - * actions like only turn off old export or only turn on new one. - * - * bias == COBD_CONNECT only connect new export (used in cobd_connect()) - * bias == COBD_DISCON only disconnect old export (used in cobd_disconnect()) - * - * bias == COBD_SWITCH do both (disconnect old and connect new). This will also - * set ->cache_on to passed @cache_on value. 
- */ -static int cobd_switch(struct obd_device *obd, - int cache_on, int bias) -{ - struct cache_obd *cobd = &obd->u.cobd; - struct obd_device *cli_obd = NULL; - struct lustre_handle conn = {0,}; - struct obd_export *discon_exp; - struct obd_export *conn_exp; - int rc = 0; - ENTRY; - - if (cache_on) { - discon_exp = cobd->master_real_exp; - conn_exp = cobd->cache_exp; - } else { - discon_exp = cobd->cache_real_exp; - conn_exp = cobd->master_exp; - } - - /* disconnect old export */ - if (bias == COBD_SWITCH || bias == COBD_DISCON) { - if (discon_exp) { - rc = cobd_disconnect_client(obd, discon_exp, 0); - if (rc) { - CWARN("can't disconnect export %p, err %d\n", - discon_exp, rc); - } - } - - if (cache_on) - cobd->master_real_exp = NULL; - else - cobd->cache_real_exp = NULL; - } - - /* connect new export */ - if (bias == COBD_SWITCH || bias == COBD_CONNECT) { - int connected; - - connected = cache_on ? (cobd->cache_real_exp != NULL) : - (cobd->master_real_exp != NULL); - - /* correct export already may be connected */ - if (!connected) { - rc = cobd_connect_client(obd, conn_exp, &conn, - NULL, OBD_OPT_REAL_CLIENT); - if (rc) { - CERROR("can't connect export %p, err %d\n", - conn_exp, rc); - RETURN(rc); - } - - if (cache_on) { - cobd->cache_real_exp = class_conn2export(&conn); - cli_obd = class_exp2obd(cobd->cache_exp); - } else { - cobd->master_real_exp = class_conn2export(&conn); - cli_obd = class_exp2obd(cobd->master_exp); - } - - /* change flag only if connect is allowed to keep - * ->cache_on coherent with real export connected. */ - cobd->cache_on = cache_on; - - /* re-init EA size for new selected export. This should - * be done after assigining new state to @cobd->cache_on - * to not call disconnected old export. 
*/ - if (obd_md_type(cli_obd)) { - rc = cobd_init_dt_desc(obd); - if (rc == 0) { - rc = cobd_init_ea_size(obd); - if (rc) { - CERROR("can't initialize EA size, " - "err %d\n", rc); - } - } else { - CERROR("can't initialize data lovdesc, " - "err %d\n", rc); - /* ignore cases when we did not manage - * to init lovdesc. This is because some - * devices may not know "lovdesc" info - * command. */ - rc = 0; - } - } - } - } - - RETURN(rc); -} - -static int -cobd_connect(struct lustre_handle *conn, struct obd_device *obd, - struct obd_uuid *cluuid, struct obd_connect_data *data, - unsigned long flags) -{ - struct cache_obd *cobd = &obd->u.cobd; - struct obd_export *exp; - int rc = 0; - ENTRY; - - rc = class_connect(conn, obd, cluuid); - if (rc) - RETURN(rc); - - exp = class_conn2export(conn); - rc = cobd_switch(obd, cobd->cache_on, - COBD_CONNECT); - if (rc) - class_disconnect(exp, 0); - else - class_export_put(exp); - RETURN(rc); -} - -static int -cobd_disconnect(struct obd_export *exp, unsigned long flags) -{ - struct cache_obd *cobd; - struct obd_device *obd; - int rc = 0; - ENTRY; - - LASSERT(exp != NULL); - obd = class_exp2obd(exp); - if (obd == NULL) { - CDEBUG(D_IOCTL, "invalid client cookie " - LPX64"\n", exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - - /* here would be nice also to check that disconnect goes to the same - * export as connect did. But as now we are accepting the notion that - * cache should be switched after client umount this is not needed. - * --umka. 
*/ - cobd = &obd->u.cobd; - rc = cobd_switch(obd, !cobd->cache_on, COBD_DISCON); - class_disconnect(exp, flags); - - RETURN(rc); -} - -static int cobd_get_info(struct obd_export *exp, __u32 keylen, - void *key, __u32 *vallen, void *val) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - - /* intercept cache utilisation info? */ - rc = obd_get_info(cobd_exp, keylen, key, vallen, val); - RETURN(rc); -} - -static int cobd_set_info(struct obd_export *exp, obd_count keylen, - void *key, obd_count vallen, void *val) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - - LASSERT(cobd_exp); - - /* intercept cache utilisation info? */ - rc = obd_set_info(cobd_exp, keylen, key, vallen, val); - RETURN(rc); -} - -static int cobd_statfs(struct obd_device *obd, - struct obd_statfs *osfs, - unsigned long max_age) -{ - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - cobd_exp = cobd_get_exp(obd); - rc = obd_statfs(class_exp2obd(cobd_exp), osfs, max_age); - RETURN(rc); -} - -static int cobd_iocontrol(unsigned int cmd, struct obd_export *exp, - int len, void *karg, void *uarg) -{ - struct obd_device *obd = class_exp2obd(exp); - struct cache_obd *cobd = &obd->u.cobd; - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - down(&cobd->sem); - - /* here would be nice also to make sure somehow that there are no - * out-standing requests which go to wrong MDS after cache switch (close - * RPCs). But how to check that from COBD? I do not know. 
--umka */ - switch (cmd) { - case OBD_IOC_COBD_CON: - if (!cobd->cache_on) - rc = cobd_switch(obd, 1, COBD_SWITCH); - break; - case OBD_IOC_COBD_COFF: - if (cobd->cache_on) - rc = cobd_switch(obd, 0, COBD_SWITCH); - break; - default: - cobd_exp = cobd_get_exp(obd); - rc = obd_iocontrol(cmd, cobd_exp, len, karg, uarg); - } - - up(&cobd->sem); - RETURN(rc); -} - -static int cobd_notify(struct obd_device *obd, struct obd_device *watched, - int active, void *data) -{ - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - cobd_exp = cobd_get_exp(obd); - rc = obd_notify(class_exp2obd(cobd_exp), watched, active, data); - RETURN(rc); -} - -static int cobd_pin(struct obd_export *exp, obd_id ino, __u32 gen, - int type, struct obd_client_handle *handle, - int flag) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_pin(cobd_exp, ino, gen, type, handle, flag); - RETURN(rc); -} - -static int cobd_unpin(struct obd_export *exp, - struct obd_client_handle *handle, - int flag) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_unpin(cobd_exp, handle, flag); - RETURN(rc); -} - -/* data related stuff */ -static int cobd_dt_packmd(struct obd_export *exp, - struct lov_mds_md **disk_tgt, - struct lov_stripe_md *mem_src) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_packmd(cobd_exp, disk_tgt, mem_src); - RETURN(rc); -} - -static int 
cobd_dt_unpackmd(struct obd_export *exp, - struct lov_stripe_md **mem_tgt, - struct lov_mds_md *disk_src, - int disk_len) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_unpackmd(cobd_exp, mem_tgt, disk_src, disk_len); - RETURN(rc); -} - -static int cobd_dt_create(struct obd_export *exp, - struct obdo *obdo, - void *acl, int acl_size, - struct lov_stripe_md **ea, - struct obd_trans_info *oti) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_create(cobd_exp, obdo, acl, acl_size, ea, oti); - RETURN(rc); -} - -static int cobd_dt_destroy(struct obd_export *exp, - struct obdo *obdo, - struct lov_stripe_md *ea, - struct obd_trans_info *oti) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_destroy(cobd_exp, obdo, ea, oti); - RETURN(rc); -} - -static int cobd_dt_precleanup(struct obd_device *obd, int flags) -{ - /* FIXME: do we need some cleanup here? 
*/ - ENTRY; - RETURN(0); -} - -static int cobd_dt_getattr(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *ea) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_getattr(cobd_exp, oa, ea); - RETURN(rc); -} - -static int cobd_dt_getattr_async(struct obd_export *exp, - struct obdo *obdo, struct lov_stripe_md *ea, - struct ptlrpc_request_set *set) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_getattr_async(cobd_exp, obdo, ea, set); - RETURN(rc); -} - -static int cobd_dt_setattr(struct obd_export *exp, struct obdo *obdo, - struct lov_stripe_md *ea, - struct obd_trans_info *oti, - struct lustre_capa *capa) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_setattr(cobd_exp, obdo, ea, oti, capa); - RETURN(rc); -} - -static int cobd_dt_brw(int cmd, struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *ea, obd_count oa_bufs, - struct brw_page *pg, struct obd_trans_info *oti) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_brw(cmd, cobd_exp, oa, ea, oa_bufs, pg, oti); - RETURN(rc); -} - -static int cobd_dt_brw_async(int cmd, struct obd_export *exp, - struct obdo *oa, struct lov_stripe_md *ea, - 
obd_count oa_bufs, struct brw_page *pg, - struct ptlrpc_request_set *set, - struct obd_trans_info *oti) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_brw_async(cmd, cobd_exp, oa, ea, oa_bufs, - pg, set, oti); - RETURN(rc); -} - -static int cobd_dt_prep_async_page(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - struct page *page, obd_off offset, - struct obd_async_page_ops *ops, - void *data, void **res) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_prep_async_page(cobd_exp, lsm, loi, page, - offset, ops, data, res); - RETURN(rc); -} - -static int cobd_dt_queue_async_io(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie, - int cmd, obd_off off, int count, - obd_flags brw_flags, obd_flags async_flags) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_queue_async_io(cobd_exp, lsm, loi, cookie, cmd, off, - count, brw_flags, async_flags); - RETURN(rc); -} - -static int cobd_dt_set_async_flags(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie, - obd_flags async_flags) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = 
cobd_get_exp(obd); - rc = obd_set_async_flags(cobd_exp, lsm, loi, cookie, - async_flags); - RETURN(rc); -} - -static int cobd_dt_queue_group_io(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - struct obd_io_group *oig, - void *cookie, int cmd, obd_off off, - int count, obd_flags brw_flags, - obd_flags async_flags) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_queue_group_io(cobd_exp, lsm, loi, oig, cookie, - cmd, off, count, brw_flags, - async_flags); - RETURN(rc); -} - -static int cobd_dt_trigger_group_io(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - struct obd_io_group *oig) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_trigger_group_io(cobd_exp, lsm, loi, oig); - RETURN(rc); -} - -static int cobd_dt_teardown_async_page(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - void *cookie) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_teardown_async_page(cobd_exp, lsm, loi, cookie); - RETURN(rc); -} - -static int cobd_dt_punch(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *ea, obd_size start, - obd_size end, struct obd_trans_info *oti, - struct lustre_capa *capa) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client 
cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_punch(cobd_exp, oa, ea, start, end, oti, capa); - RETURN(rc); -} - -static int cobd_dt_sync(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *ea, obd_size start, - obd_size end) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_sync(cobd_exp, oa, ea, start, end); - RETURN(rc); -} - -static int cobd_dt_enqueue(struct obd_export *exp, struct lov_stripe_md *ea, - __u32 type, ldlm_policy_data_t *policy, - __u32 mode, int *flags, void *bl_cb, void *cp_cb, - void *gl_cb, void *data, __u32 lvb_len, - void *lvb_swabber, struct lustre_handle *lockh) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_enqueue(cobd_exp, ea, type, policy, mode, flags, - bl_cb, cp_cb, gl_cb, data, lvb_len, - lvb_swabber, lockh); - RETURN(rc); -} - -static int cobd_dt_match(struct obd_export *exp, struct lov_stripe_md *ea, - __u32 type, ldlm_policy_data_t *policy, __u32 mode, - int *flags, void *data, struct lustre_handle *lockh) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_match(cobd_exp, ea, type, policy, mode, flags, - data, lockh); - RETURN(rc); -} -static int cobd_dt_change_cbdata(struct obd_export *exp, - struct lov_stripe_md *lsm, - ldlm_iterator_t it, void *data) -{ - struct obd_device *obd = class_exp2obd(exp); - struct 
obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_change_cbdata(cobd_exp, lsm, it, data); - RETURN(rc); -} - -static int cobd_dt_cancel(struct obd_export *exp, - struct lov_stripe_md *ea, __u32 mode, - struct lustre_handle *lockh) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_cancel(cobd_exp, ea, mode, lockh); - RETURN(rc); -} - -static int cobd_dt_cancel_unused(struct obd_export *exp, - struct lov_stripe_md *ea, - int flags, void *opaque) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_cancel_unused(cobd_exp, ea, flags, opaque); - RETURN(rc); -} - -static int cobd_dt_preprw(int cmd, struct obd_export *exp, - struct obdo *oa, int objcount, - struct obd_ioobj *obj, int niocount, - struct niobuf_remote *nb, - struct niobuf_local *res, - struct obd_trans_info *oti, - struct lustre_capa *capa) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_preprw(cmd, cobd_exp, oa, objcount, obj, - niocount, nb, res, oti, capa); - RETURN(rc); -} - -static int cobd_dt_commitrw(int cmd, struct obd_export *exp, struct obdo *oa, - int objcount, struct obd_ioobj *obj, - int niocount, struct niobuf_local *local, - struct obd_trans_info *oti, int rc) -{ - struct obd_device *obd = 
class_exp2obd(exp); - struct obd_export *cobd_exp; - int err = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - err = obd_commitrw(cmd, cobd_exp, oa, objcount, obj, - niocount, local, oti, rc); - RETURN(err); -} - -static int cobd_dt_adjust_kms(struct obd_export *exp, - struct lov_stripe_md *lsm, - obd_off size, int shrink) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = obd_adjust_kms(cobd_exp, lsm, size, shrink); - - RETURN(rc); -} - -static int cobd_dt_llog_init(struct obd_device *obd, - struct obd_llogs *llogs, - struct obd_device *disk_obd, - int count, struct llog_catid *logid) -{ - struct obd_export *cobd_exp; - struct obd_device *cobd_obd; - int rc = 0; - ENTRY; - - cobd_exp = cobd_get_exp(obd); - cobd_obd = class_exp2obd(cobd_exp); - - rc = obd_llog_init(cobd_obd, &cobd_obd->obd_llogs, - disk_obd, count, logid); - RETURN(rc); -} - -static int cobd_dt_llog_finish(struct obd_device *obd, - struct obd_llogs *llogs, - int count) -{ - struct obd_export *cobd_exp; - struct obd_device *cobd_obd; - int rc = 0; - ENTRY; - - cobd_exp = cobd_get_exp(obd); - cobd_obd = class_exp2obd(cobd_exp); - - rc = obd_llog_finish(cobd_obd, &cobd_obd->obd_llogs, count); - RETURN(rc); -} - -static int cobd_dt_init_ea_size(struct obd_export *exp, int easize, - int cookiesize) -{ - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - cobd_exp = cobd_get_exp(exp->exp_obd); - rc = obd_init_ea_size(cobd_exp, easize, cookiesize); - RETURN(rc); -} - -static int cobd_dt_import_event(struct obd_device *obd, - struct obd_import *imp, - enum obd_import_event event) -{ - struct obd_export *cobd_exp; - ENTRY; - - cobd_exp = cobd_get_exp(obd); - 
obd_import_event(class_exp2obd(cobd_exp), imp, event); - RETURN(0); -} - -/* metadata related stuff */ -static int cobd_md_getstatus(struct obd_export *exp, - struct lustre_id *rootid) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_getstatus(cobd_exp, rootid); - RETURN(rc); -} - -static int cobd_md_getattr(struct obd_export *exp, struct lustre_id *id, - __u64 valid, const char *xattr_name, - const void *xattr_data, unsigned int xattr_datalen, - unsigned int ea_size, struct obd_capa *ocapa, - struct ptlrpc_request **request) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_getattr(cobd_exp, id, valid, xattr_name, xattr_data, - xattr_datalen, ea_size, ocapa, request); - RETURN(rc); -} - -static int cobd_md_req2lustre_md(struct obd_export *mdc_exp, - struct ptlrpc_request *req, - unsigned int offset, - struct obd_export *osc_exp, - struct lustre_md *md) -{ - struct obd_device *obd = class_exp2obd(mdc_exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - mdc_exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_req2lustre_md(cobd_exp, req, offset, osc_exp, md); - RETURN(rc); -} - -static int cobd_md_change_cbdata(struct obd_export *exp, struct lustre_id *id, - ldlm_iterator_t it, void *data) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = 
cobd_get_exp(obd); - rc = md_change_cbdata(cobd_exp, id, it, data); - RETURN(rc); -} - -static int cobd_md_getattr_lock(struct obd_export *exp, struct lustre_id *id, - char *filename, int namelen, __u64 valid, - unsigned int ea_size, struct ptlrpc_request **request) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_getattr_lock(cobd_exp, id, filename, namelen, - valid, ea_size, request); - RETURN(rc); -} - -static int cobd_md_create(struct obd_export *exp, struct mdc_op_data *op_data, - const void *data, int datalen, int mode, - __u32 uid, __u32 gid, __u64 rdev, - struct ptlrpc_request **request) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_create(cobd_exp, op_data, data, datalen, mode, - uid, gid, rdev, request); - RETURN(rc); -} - -static int cobd_md_unlink(struct obd_export *exp, - struct mdc_op_data *data, - struct ptlrpc_request **request) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_unlink(cobd_exp, data, request); - RETURN(rc); -} - -static int cobd_md_valid_attrs(struct obd_export *exp, - struct lustre_id *id) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_valid_attrs(cobd_exp, id); - RETURN(rc); -} - 
-static int cobd_md_rename(struct obd_export *exp, struct mdc_op_data *data, - const char *old, int oldlen, const char *new, - int newlen, struct ptlrpc_request **request) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_rename(cobd_exp, data, old, oldlen, new, newlen, request); - RETURN(rc); -} - -static int cobd_md_link(struct obd_export *exp, struct mdc_op_data *data, - struct ptlrpc_request **request) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_link(cobd_exp, data, request); - RETURN(rc); -} - -static int cobd_md_setattr(struct obd_export *exp, struct mdc_op_data *data, - struct iattr *iattr, void *ea, int ealen, void *ea2, - int ea2len, void *ea3, int ea3len, - struct ptlrpc_request **request) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_setattr(cobd_exp, data, iattr, ea, - ealen, ea2, ea2len, ea3, ea3len, request); - RETURN(rc); -} - -static int cobd_md_readpage(struct obd_export *exp, - struct lustre_id *mdc_id, - __u64 offset, struct page *page, - struct ptlrpc_request **request) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_readpage(cobd_exp, mdc_id, offset, page, request); - RETURN(rc); -} - 
-static int cobd_md_close(struct obd_export *exp, struct mdc_op_data *op_data, - struct obd_client_handle *och, - struct ptlrpc_request **request) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_close(cobd_exp, op_data, och, request); - RETURN(rc); -} - -static int cobd_md_done_writing(struct obd_export *exp, - struct obdo *obdo) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_done_writing(cobd_exp, obdo); - RETURN(rc); -} - -static int cobd_md_sync(struct obd_export *exp, struct lustre_id *id, - struct ptlrpc_request **request) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_sync(cobd_exp, id, request); - RETURN(rc); -} - -static int cobd_md_set_open_replay_data(struct obd_export *exp, - struct obd_client_handle *och, - struct ptlrpc_request *open_req) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_set_open_replay_data(cobd_exp, och, open_req); - RETURN(rc); -} - -static int cobd_md_clear_open_replay_data(struct obd_export *exp, - struct obd_client_handle *och) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client 
cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_clear_open_replay_data(cobd_exp, och); - RETURN(rc); -} - -static int cobd_md_store_inode_generation(struct obd_export *exp, - struct ptlrpc_request *req, - int reqoff, int repoff) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_store_inode_generation(cobd_exp, req, reqoff, repoff); - RETURN(rc); -} - -static int cobd_md_set_lock_data(struct obd_export *exp, - __u64 *l, void *data) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_set_lock_data(cobd_exp, l, data); - RETURN(rc); -} - -static int cobd_md_enqueue(struct obd_export *exp, int lock_type, - struct lookup_intent *it, int lock_mode, - struct mdc_op_data *data, struct lustre_handle *lockh, - void *lmm, int lmmsize, - ldlm_completion_callback cb_completion, - ldlm_blocking_callback cb_blocking, void *cb_data) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_enqueue(cobd_exp, lock_type, it, lock_mode, data, - lockh, lmm, lmmsize, cb_completion, cb_blocking, - cb_data); - RETURN(rc); -} - -static int cobd_md_intent_lock(struct obd_export *exp, struct lustre_id *pid, - const char *name, int len, void *lmm, int lmmsize, - struct lustre_id *cid, struct lookup_intent *it, - int lookup_flags, struct ptlrpc_request **reqp, - ldlm_blocking_callback cb_blocking) -{ - 
struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - lookup_flags |= LOOKUP_COBD; - cobd_exp = cobd_get_exp(obd); - - rc = md_intent_lock(cobd_exp, pid, name, len, lmm, lmmsize, - cid, it, lookup_flags, reqp, cb_blocking); - RETURN(rc); -} - -static struct obd_device *cobd_md_get_real_obd(struct obd_export *exp, - struct lustre_id *id) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - ENTRY; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(NULL); - } - cobd_exp = cobd_get_exp(obd); - RETURN(md_get_real_obd(cobd_exp, id)); -} - -static int cobd_md_change_cbdata_name(struct obd_export *exp, - struct lustre_id *id, char *name, - int namelen, struct lustre_id *id2, - ldlm_iterator_t it, void *data) -{ - struct obd_device *obd = class_exp2obd(exp); - struct obd_export *cobd_exp; - int rc = 0; - - if (obd == NULL) { - CERROR("invalid client cookie "LPX64"\n", - exp->exp_handle.h_cookie); - RETURN(-EINVAL); - } - cobd_exp = cobd_get_exp(obd); - rc = md_change_cbdata_name(cobd_exp, id, name, namelen, - id2, it, data); - RETURN(rc); -} - -static struct obd_ops cobd_obd_ops = { - .o_owner = THIS_MODULE, - .o_attach = cobd_attach, - .o_detach = cobd_detach, - .o_setup = cobd_setup, - .o_cleanup = cobd_cleanup, - .o_connect = cobd_connect, - .o_disconnect = cobd_disconnect, - .o_set_info = cobd_set_info, - .o_get_info = cobd_get_info, - .o_statfs = cobd_statfs, - .o_iocontrol = cobd_iocontrol, - .o_notify = cobd_notify, - .o_pin = cobd_pin, - .o_unpin = cobd_unpin, - - .o_packmd = cobd_dt_packmd, - .o_unpackmd = cobd_dt_unpackmd, - .o_create = cobd_dt_create, - .o_destroy = cobd_dt_destroy, - .o_precleanup = cobd_dt_precleanup, - .o_getattr = cobd_dt_getattr, - .o_getattr_async = cobd_dt_getattr_async, - .o_setattr = 
cobd_dt_setattr, - .o_brw = cobd_dt_brw, - .o_brw_async = cobd_dt_brw_async, - .o_prep_async_page = cobd_dt_prep_async_page, - .o_queue_async_io = cobd_dt_queue_async_io, - .o_set_async_flags = cobd_dt_set_async_flags, - .o_queue_group_io = cobd_dt_queue_group_io, - .o_trigger_group_io = cobd_dt_trigger_group_io, - .o_teardown_async_page = cobd_dt_teardown_async_page, - .o_preprw = cobd_dt_preprw, - .o_punch = cobd_dt_punch, - .o_sync = cobd_dt_sync, - .o_enqueue = cobd_dt_enqueue, - .o_match = cobd_dt_match, - .o_change_cbdata = cobd_dt_change_cbdata, - .o_cancel = cobd_dt_cancel, - .o_cancel_unused = cobd_dt_cancel_unused, - .o_commitrw = cobd_dt_commitrw, - .o_llog_init = cobd_dt_llog_init, - .o_llog_finish = cobd_dt_llog_finish, - .o_import_event = cobd_dt_import_event, - .o_init_ea_size = cobd_dt_init_ea_size, - .o_adjust_kms = cobd_dt_adjust_kms, -}; - -struct md_ops cobd_md_ops = { - .m_getstatus = cobd_md_getstatus, - .m_getattr = cobd_md_getattr, - .m_req2lustre_md = cobd_md_req2lustre_md, - .m_change_cbdata = cobd_md_change_cbdata, - .m_getattr_lock = cobd_md_getattr_lock, - .m_create = cobd_md_create, - .m_unlink = cobd_md_unlink, - .m_valid_attrs = cobd_md_valid_attrs, - .m_rename = cobd_md_rename, - .m_link = cobd_md_link, - .m_setattr = cobd_md_setattr, - .m_readpage = cobd_md_readpage, - .m_close = cobd_md_close, - .m_done_writing = cobd_md_done_writing, - .m_sync = cobd_md_sync, - .m_set_open_replay_data = cobd_md_set_open_replay_data, - .m_clear_open_replay_data = cobd_md_clear_open_replay_data, - .m_store_inode_generation = cobd_md_store_inode_generation, - .m_set_lock_data = cobd_md_set_lock_data, - .m_enqueue = cobd_md_enqueue, - .m_get_real_obd = cobd_md_get_real_obd, - .m_intent_lock = cobd_md_intent_lock, - .m_change_cbdata_name = cobd_md_change_cbdata_name, -}; - -static int __init cobd_init(void) -{ - struct lprocfs_static_vars lvars; - ENTRY; - - printk(KERN_INFO "Lustre: Caching OBD driver; info@clusterfs.com\n"); - - 
lprocfs_init_vars(cobd, &lvars); - RETURN(class_register_type(&cobd_obd_ops, &cobd_md_ops, - lvars.module_vars, OBD_CACHE_DEVICENAME)); -} - -static void /*__exit*/ cobd_exit(void) -{ - class_unregister_type(OBD_CACHE_DEVICENAME); -} - -MODULE_AUTHOR("Cluster File Systems, Inc. "); -MODULE_DESCRIPTION("Lustre Caching OBD driver"); -MODULE_LICENSE("GPL"); - -module_init(cobd_init); -module_exit(cobd_exit); diff --git a/lustre/cobd/lproc_cache.c b/lustre/cobd/lproc_cache.c deleted file mode 100644 index a40f42d..0000000 --- a/lustre/cobd/lproc_cache.c +++ /dev/null @@ -1,83 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - */ -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include - -#ifndef LPROCFS -static struct lprocfs_vars lprocfs_obd_vars[] = { {0} }; -static struct lprocfs_vars lprocfs_module_vars[] = { {0} }; -#else -/* Common STATUS namespace */ -static int cobd_rd_target(char *page, char **start, off_t off, int count, - int *eof, void *data) -{ - struct obd_device *cobd = (struct obd_device *)data; - int rc; - - LASSERT(cobd != NULL); - - if (!cobd->obd_set_up) { - rc = snprintf(page, count, "not set up\n"); - } else { - struct obd_device *tgt = - class_exp2obd(cobd->u.cobd.master_exp); - LASSERT(tgt != NULL); - rc = snprintf(page, count, "%s\n", tgt->obd_uuid.uuid); - } - return rc; -} - -static int cobd_rd_cache(char *page, char **start, off_t off, int count, - int *eof, void *data) -{ - struct obd_device *cobd = (struct obd_device*)data; - int rc; - - LASSERT(cobd != NULL); - - if (!cobd->obd_set_up) { - rc = snprintf(page, count, "not set up\n"); - } else { - struct obd_device *cache = - class_exp2obd(cobd->u.cobd.cache_exp); - LASSERT(cache != NULL); - rc = snprintf(page, count, "%s\n", cache->obd_uuid.uuid); - } - return rc; -} - -static struct lprocfs_vars lprocfs_obd_vars[] = { - { "uuid", lprocfs_rd_uuid, 0, 0 }, - { "target_uuid", cobd_rd_target, 0, 0 }, - { "cache_uuid", cobd_rd_cache, 0, 0 }, - { 0 } -}; - -struct lprocfs_vars lprocfs_module_vars[] = { - { "num_refs", lprocfs_rd_numrefs, 0, 0 }, - { 0 } -}; -#endif /* LPROCFS */ - -LPROCFS_INIT_VARS(cobd, lprocfs_module_vars, lprocfs_obd_vars) diff --git a/lustre/conf/lustre.dtd b/lustre/conf/lustre.dtd index 0e8ad33..360f4a0 100644 --- a/lustre/conf/lustre.dtd +++ b/lustre/conf/lustre.dtd @@ -33,10 +33,11 @@ + echoclient_ref | mountpoint_ref)*> filesystem_ref #REQUIRED > @@ -46,14 +47,11 @@ - - - + target_ref | node_ref | journalsize | mkfsoptions | mountfsoptions)*> @@ -62,8 +60,7 @@ failover ( 1 | 0 ) #IMPLIED> + obd_ref #REQUIRED> @@ -71,7 +68,7 @@ failover ( 1 | 0 ) #IMPLIED> + target_ref | node_ref | 
journalsize | mkfsoptions | mountfsoptions)*> @@ -91,6 +88,8 @@ + + @@ -130,8 +129,6 @@ - - diff --git a/lustre/conf/lustre2ldif.xsl b/lustre/conf/lustre2ldif.xsl index 8c3c24a..58b0649 100644 --- a/lustre/conf/lustre2ldif.xsl +++ b/lustre/conf/lustre2ldif.xsl @@ -125,6 +125,9 @@ journalsize: mkfsoptions: + +mountfsoptions: + nodeRef: targetRef: @@ -179,6 +182,9 @@ journalsize: mkfsoptions: + +mountfsoptions: + @@ -213,6 +219,9 @@ dn: uuid=, objectClass: MOUNTPOINT lustreName: uuid: + +clientoptions: + diff --git a/lustre/conf/modules.conf b/lustre/conf/modules.conf index 0fb0a35..a5bdefa 100644 --- a/lustre/conf/modules.conf +++ b/lustre/conf/modules.conf @@ -1,6 +1,8 @@ # sample modules.conf for autoloading lustre modules on zeroconf clients add below kptlrouter portals +#add below ksocknal kptlrouter +#add below kqswnal kptlrouter add below ptlrpc ksocknal add below llite lov osc alias lustre llite diff --git a/lustre/conf/slapd-lustre.conf b/lustre/conf/slapd-lustre.conf index 8558f64..b93b411 100644 --- a/lustre/conf/slapd-lustre.conf +++ b/lustre/conf/slapd-lustre.conf @@ -8,4 +8,4 @@ rootdn "cn=Manager,fs=lustre" include /etc/openldap/schema/lustre.schema rootpw secret directory /var/lib/ldap/lustre -index objectClass eq, uuid eq +index objectClass,uuid eq diff --git a/lustre/contrib/.cvsignore b/lustre/contrib/.cvsignore new file mode 100644 index 0000000..282522d --- /dev/null +++ b/lustre/contrib/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lustre/contrib/Makefile.am b/lustre/contrib/Makefile.am new file mode 100644 index 0000000..5a8e66c --- /dev/null +++ b/lustre/contrib/Makefile.am @@ -0,0 +1,5 @@ +# Contributions Makefile + +EXTRA_DIST = mpich-*.patch +pkgdata_DATA = $(EXTRA_DIST) + diff --git a/lustre/contrib/README b/lustre/contrib/README new file mode 100644 index 0000000..73270f3 --- /dev/null +++ b/lustre/contrib/README @@ -0,0 +1,2 @@ +The files in this directory are user-contributed and are not supported by +CFS in any way. 
diff --git a/lustre/contrib/mpich-1.2.6-lustre.patch b/lustre/contrib/mpich-1.2.6-lustre.patch new file mode 100644 index 0000000..d32fab9 --- /dev/null +++ b/lustre/contrib/mpich-1.2.6-lustre.patch @@ -0,0 +1,1829 @@ +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre.c +--- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre.c 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre.c 2005-12-06 11:54:37.883130927 -0500 +@@ -0,0 +1,37 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ ++ * ++ * Copyright (C) 2001 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. ++ */ ++ ++#include "ad_lustre.h" ++ ++/* adioi.h has the ADIOI_Fns_struct define */ ++#include "adioi.h" ++ ++struct ADIOI_Fns_struct ADIO_LUSTRE_operations = { ++ ADIOI_LUSTRE_Open, /* Open */ ++ ADIOI_LUSTRE_ReadContig, /* ReadContig */ ++ ADIOI_LUSTRE_WriteContig, /* WriteContig */ ++ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */ ++ ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */ ++ ADIOI_GEN_SeekIndividual, /* SeekIndividual */ ++ ADIOI_LUSTRE_Fcntl, /* Fcntl */ ++ ADIOI_LUSTRE_SetInfo, /* SetInfo */ ++ ADIOI_GEN_ReadStrided, /* ReadStrided */ ++ ADIOI_GEN_WriteStrided, /* WriteStrided */ ++ ADIOI_LUSTRE_Close, /* Close */ ++ ADIOI_LUSTRE_IreadContig, /* IreadContig */ ++ ADIOI_LUSTRE_IwriteContig, /* IwriteContig */ ++ ADIOI_LUSTRE_ReadDone, /* ReadDone */ ++ ADIOI_LUSTRE_WriteDone, /* WriteDone */ ++ ADIOI_LUSTRE_ReadComplete, /* ReadComplete */ ++ ADIOI_LUSTRE_WriteComplete, /* WriteComplete */ ++ ADIOI_LUSTRE_IreadStrided, /* IreadStrided */ ++ ADIOI_LUSTRE_IwriteStrided, /* IwriteStrided */ ++ ADIOI_GEN_Flush, /* Flush */ ++ ADIOI_LUSTRE_Resize, /* Resize */ ++ ADIOI_GEN_Delete, /* Delete */ ++}; +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_close.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_close.c +--- 
mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_close.c 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_close.c 2005-12-06 11:54:37.895129327 -0500 +@@ -0,0 +1,32 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre_close.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. ++ */ ++ ++#include "ad_lustre.h" ++ ++void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code) ++{ ++ int err; ++#if defined(MPICH2) || !defined(PRINT_ERR_MSG) ++ static char myname[] = "ADIOI_LUSTRE_CLOSE"; ++#endif ++ ++ err = close(fd->fd_sys); ++ if (err == -1) { ++#ifdef MPICH2 ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", ++ "**io %s", strerror(errno)); ++#elif defined(PRINT_ERR_MSG) ++ *error_code = MPI_ERR_UNKNOWN; ++#else ++ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, ++ myname, "I/O Error", "%s", strerror(errno)); ++ ADIOI_Error(fd, *error_code, myname); ++#endif ++ } ++ else *error_code = MPI_SUCCESS; ++} +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_done.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_done.c +--- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_done.c 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_done.c 2005-12-06 11:54:37.898128927 -0500 +@@ -0,0 +1,188 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre_done.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. 
++ */ ++ ++#include "ad_lustre.h" ++ ++int ADIOI_LUSTRE_ReadDone(ADIO_Request *request, ADIO_Status *status, int *error_code) ++{ ++#ifndef NO_AIO ++ int done=0; ++#if defined(MPICH2) || !defined(PRINT_ERR_MSG) ++ static char myname[] = "ADIOI_LUSTRE_READDONE"; ++#endif ++#ifdef AIO_SUN ++ aio_result_t *result=0, *tmp; ++#else ++ int err; ++#endif ++#ifdef AIO_HANDLE_IN_AIOCB ++ struct aiocb *tmp1; ++#endif ++#endif ++ ++ if (*request == ADIO_REQUEST_NULL) { ++ *error_code = MPI_SUCCESS; ++ return 1; ++ } ++ ++#ifdef NO_AIO ++/* HP, FreeBSD, Linux */ ++#ifdef HAVE_STATUS_SET_BYTES ++ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); ++#endif ++ (*request)->fd->async_count--; ++ ADIOI_Free_request((ADIOI_Req_node *) (*request)); ++ *request = ADIO_REQUEST_NULL; ++ *error_code = MPI_SUCCESS; ++ return 1; ++#endif ++ ++#ifdef AIO_SUN ++ if ((*request)->queued) { ++ tmp = (aio_result_t *) (*request)->handle; ++ if (tmp->aio_return == AIO_INPROGRESS) { ++ done = 0; ++ *error_code = MPI_SUCCESS; ++ } ++ else if (tmp->aio_return != -1) { ++ result = (aio_result_t *) aiowait(0); /* dequeue any one request */ ++ done = 1; ++ (*request)->nbytes = tmp->aio_return; ++ *error_code = MPI_SUCCESS; ++ } ++ else { ++#ifdef MPICH2 ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", ++ "**io %s", strerror(tmp->aio_errno)); ++ return; ++#elif defined(PRINT_ERR_MSG) ++ *error_code = MPI_ERR_UNKNOWN; ++#else ++ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, ++ myname, "I/O Error", "%s", strerror(tmp->aio_errno)); ++ ADIOI_Error((*request)->fd, *error_code, myname); ++#endif ++ } ++ } /* if ((*request)->queued) ... */ ++ else { ++ /* ADIOI_Complete_Async completed this request, but request object ++ was not freed. 
*/ ++ done = 1; ++ *error_code = MPI_SUCCESS; ++ } ++#ifdef HAVE_STATUS_SET_BYTES ++ if (done && ((*request)->nbytes != -1)) ++ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); ++#endif ++ ++#endif ++ ++#ifdef AIO_HANDLE_IN_AIOCB ++/* IBM */ ++ if ((*request)->queued) { ++ tmp1 = (struct aiocb *) (*request)->handle; ++ errno = aio_error(tmp1->aio_handle); ++ if (errno == EINPROG) { ++ done = 0; ++ *error_code = MPI_SUCCESS; ++ } ++ else { ++ err = aio_return(tmp1->aio_handle); ++ (*request)->nbytes = err; ++ errno = aio_error(tmp1->aio_handle); ++ ++ done = 1; ++ ++ if (err == -1) { ++#ifdef MPICH2 ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", ++ "**io %s", strerror(errno)); ++ return; ++#elif defined(PRINT_ERR_MSG) ++ *error_code = MPI_ERR_UNKNOWN; ++#else ++ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, ++ myname, "I/O Error", "%s", strerror(errno)); ++ ADIOI_Error((*request)->fd, *error_code, myname); ++#endif ++ } ++ else *error_code = MPI_SUCCESS; ++ } ++ } /* if ((*request)->queued) */ ++ else { ++ done = 1; ++ *error_code = MPI_SUCCESS; ++ } ++#ifdef HAVE_STATUS_SET_BYTES ++ if (done && ((*request)->nbytes != -1)) ++ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); ++#endif ++ ++#elif (!defined(NO_AIO) && !defined(AIO_SUN)) ++/* DEC, SGI IRIX 5 and 6 */ ++ if ((*request)->queued) { ++ errno = aio_error((const struct aiocb *) (*request)->handle); ++ if (errno == EINPROGRESS) { ++ done = 0; ++ *error_code = MPI_SUCCESS; ++ } ++ else { ++ err = aio_return((struct aiocb *) (*request)->handle); ++ (*request)->nbytes = err; ++ errno = aio_error((struct aiocb *) (*request)->handle); ++ ++ done = 1; ++ ++ if (err == -1) { ++#ifdef MPICH2 ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", ++ "**io %s", strerror(errno)); ++ return; ++#elif defined(PRINT_ERR_MSG) ++ *error_code = 
MPI_ERR_UNKNOWN; ++#else /* MPICH-1 */ ++ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, ++ myname, "I/O Error", "%s", strerror(errno)); ++ ADIOI_Error((*request)->fd, *error_code, myname); ++#endif ++ } ++ else *error_code = MPI_SUCCESS; ++ } ++ } /* if ((*request)->queued) */ ++ else { ++ done = 1; ++ *error_code = MPI_SUCCESS; ++ } ++#ifdef HAVE_STATUS_SET_BYTES ++ if (done && ((*request)->nbytes != -1)) ++ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); ++#endif ++ ++#endif ++ ++#ifndef NO_AIO ++ if (done) { ++ /* if request is still queued in the system, it is also there ++ on ADIOI_Async_list. Delete it from there. */ ++ if ((*request)->queued) ADIOI_Del_req_from_list(request); ++ ++ (*request)->fd->async_count--; ++ if ((*request)->handle) ADIOI_Free((*request)->handle); ++ ADIOI_Free_request((ADIOI_Req_node *) (*request)); ++ *request = ADIO_REQUEST_NULL; ++ } ++ return done; ++#endif ++ ++} ++ ++ ++int ADIOI_LUSTRE_WriteDone(ADIO_Request *request, ADIO_Status *status, int *error_code) ++{ ++ return ADIOI_LUSTRE_ReadDone(request, status, error_code); ++} +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_fcntl.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_fcntl.c +--- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_fcntl.c 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_fcntl.c 2005-12-06 11:54:37.901128527 -0500 +@@ -0,0 +1,126 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre_fcntl.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. 
++ */ ++ ++#include "ad_lustre.h" ++#include "adio_extern.h" ++/* #ifdef MPISGI ++#include "mpisgi2.h" ++#endif */ ++ ++void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code) ++{ ++ int i, ntimes; ++ ADIO_Offset curr_fsize, alloc_size, size, len, done; ++ ADIO_Status status; ++ char *buf; ++#if defined(MPICH2) || !defined(PRINT_ERR_MSG) ++ static char myname[] = "ADIOI_LUSTRE_FCNTL"; ++#endif ++ ++ switch(flag) { ++ case ADIO_FCNTL_GET_FSIZE: ++ fcntl_struct->fsize = lseek(fd->fd_sys, 0, SEEK_END); ++ if (fd->fp_sys_posn != -1) ++ lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET); ++ if (fcntl_struct->fsize == -1) { ++#ifdef MPICH2 ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", ++ "**io %s", strerror(errno)); ++#elif defined(PRINT_ERR_MSG) ++ *error_code = MPI_ERR_UNKNOWN; ++#else /* MPICH-1 */ ++ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, ++ myname, "I/O Error", "%s", strerror(errno)); ++ ADIOI_Error(fd, *error_code, myname); ++#endif ++ } ++ else *error_code = MPI_SUCCESS; ++ break; ++ ++ case ADIO_FCNTL_SET_DISKSPACE: ++ /* will be called by one process only */ ++ /* On file systems with no preallocation function, I have to ++ explicitly write ++ to allocate space. Since there could be holes in the file, ++ I need to read up to the current file size, write it back, ++ and then write beyond that depending on how much ++ preallocation is needed. 
++ read/write in sizes of no more than ADIOI_PREALLOC_BUFSZ */ ++ ++ curr_fsize = lseek(fd->fd_sys, 0, SEEK_END); ++ alloc_size = fcntl_struct->diskspace; ++ ++ size = ADIOI_MIN(curr_fsize, alloc_size); ++ ++ ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ; ++ buf = (char *) ADIOI_Malloc(ADIOI_PREALLOC_BUFSZ); ++ done = 0; ++ ++ for (i=0; i curr_fsize) { ++ memset(buf, 0, ADIOI_PREALLOC_BUFSZ); ++ size = alloc_size - curr_fsize; ++ ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ; ++ for (i=0; ifp_sys_posn != -1) ++ lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET); ++ *error_code = MPI_SUCCESS; ++ break; ++ ++ case ADIO_FCNTL_SET_IOMODE: ++ /* for implementing PFS I/O modes. will not occur in MPI-IO ++ implementation.*/ ++ if (fd->iomode != fcntl_struct->iomode) { ++ fd->iomode = fcntl_struct->iomode; ++ MPI_Barrier(MPI_COMM_WORLD); ++ } ++ *error_code = MPI_SUCCESS; ++ break; ++ ++ case ADIO_FCNTL_SET_ATOMICITY: ++ fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1; ++ *error_code = MPI_SUCCESS; ++ break; ++ ++ default: ++ FPRINTF(stderr, "Unknown flag passed to ADIOI_LUSTRE_Fcntl\n"); ++ MPI_Abort(MPI_COMM_WORLD, 1); ++ } ++} +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_flush.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_flush.c +--- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_flush.c 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_flush.c 2005-12-06 11:54:37.903128261 -0500 +@@ -0,0 +1,14 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre_flush.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. 
++ */ ++ ++#include "ad_lustre.h" ++ ++void ADIOI_LUSTRE_Flush(ADIO_File fd, int *error_code) ++{ ++ ADIOI_GEN_Flush(fd, error_code); ++} +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre.h mpich-1.2.6/romio/adio/ad_lustre/ad_lustre.h +--- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre.h 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre.h 2005-12-06 11:54:37.891129861 -0500 +@@ -0,0 +1,36 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre.h,v 1.2 2005/07/07 14:38:17 liam Exp $ ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. ++ */ ++ ++#ifndef AD_UNIX_INCLUDE ++#define AD_UNIX_INCLUDE ++ ++/* temp*/ ++#define HAVE_ASM_TYPES_H 1 ++ ++#include ++#include ++#include ++#include ++#include "lustre/lustre_user.h" ++#include "adio.h" ++ ++#ifndef NO_AIO ++#ifdef AIO_SUN ++#include ++#else ++#include ++#ifdef NEEDS_ADIOCB_T ++typedef struct adiocb adiocb_t; ++#endif ++#endif ++#endif ++ ++int ADIOI_LUSTRE_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset, ++ int wr, void *handle); ++ ++#endif +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_hints.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_hints.c +--- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_hints.c 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_hints.c 2005-12-06 11:54:37.904128127 -0500 +@@ -0,0 +1,130 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre_hints.c,v 1.2 2005/07/07 14:38:17 liam Exp $ ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. 
++ */ ++ ++#include "ad_lustre.h" ++ ++void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) ++{ ++ char *value, *value_in_fd; ++ int flag, tmp_val, str_factor=-1, str_unit=0, start_iodev=-1; ++ struct lov_user_md lum = { 0 }; ++ int err, myrank, fd_sys, perm, amode, old_mask; ++ ++ if ( (fd->info) == MPI_INFO_NULL) { ++ /* This must be part of the open call. can set striping parameters ++ if necessary. */ ++ MPI_Info_create(&(fd->info)); ++ ++ /* has user specified striping or server buffering parameters ++ and do they have the same value on all processes? */ ++ if (users_info != MPI_INFO_NULL) { ++ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); ++ ++ MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag) { ++ str_factor=atoi(value); ++ tmp_val = str_factor; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != str_factor) { ++ FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: the value for key \"striping_factor\" must be the same on all processes\n"); ++ MPI_Abort(MPI_COMM_WORLD, 1); ++ } ++ } ++ ++ MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag) { ++ str_unit=atoi(value); ++ tmp_val = str_unit; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != str_unit) { ++ FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: the value for key \"striping_unit\" must be the same on all processes\n"); ++ MPI_Abort(MPI_COMM_WORLD, 1); ++ } ++ } ++ ++ MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag) { ++ start_iodev=atoi(value); ++ tmp_val = start_iodev; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != start_iodev) { ++ FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: the value for key \"start_iodevice\" must be the same on all processes\n"); ++ MPI_Abort(MPI_COMM_WORLD, 1); ++ } ++ } ++ ++ /* if user has specified striping info, process 0 tries to set it */ ++ if ((str_factor > 0) || (str_unit > 0) || 
(start_iodev >= 0)) { ++ MPI_Comm_rank(fd->comm, &myrank); ++ if (!myrank) { ++ if (fd->perm == ADIO_PERM_NULL) { ++ old_mask = umask(022); ++ umask(old_mask); ++ perm = old_mask ^ 0666; ++ } ++ else perm = fd->perm; ++ ++ amode = 0; ++ if (fd->access_mode & ADIO_CREATE) ++ amode = amode | O_CREAT; ++ if (fd->access_mode & ADIO_RDWR || ++ (fd->access_mode & ADIO_RDONLY && ++ fd->access_mode & ADIO_WRONLY)) ++ amode = amode | O_RDWR; ++ else if (fd->access_mode & ADIO_WRONLY) ++ amode = amode | O_WRONLY; ++ else if (fd->access_mode & ADIO_RDONLY) ++ amode = amode | O_RDONLY; ++ if (fd->access_mode & ADIO_EXCL) ++ amode = amode | O_EXCL; ++ ++ /* we need to create file so ensure this is set */ ++ amode = amode | O_LOV_DELAY_CREATE | O_CREAT; ++ ++ fd_sys = open(fd->filename, amode, perm); ++ if (fd_sys == -1) { ++ if (errno != EEXIST) ++ FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: Failure to open file %s %d %d\n",strerror(errno), amode, perm); ++ } else { ++ lum.lmm_magic = LOV_USER_MAGIC; ++ lum.lmm_pattern = 0; ++ lum.lmm_stripe_size = str_unit; ++ lum.lmm_stripe_count = str_factor; ++ lum.lmm_stripe_offset = start_iodev; ++ ++ err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum); ++ if (err == -1 && errno != EEXIST) { ++ FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: Failure to set stripe info %s \n",strerror(errno)); ++ } ++ ++ close(fd_sys); ++ } ++ ++ } ++ MPI_Barrier(fd->comm); ++ } ++ ++ ADIOI_Free(value); ++ } ++ ++ /* set the values for collective I/O and data sieving parameters */ ++ ADIOI_GEN_SetInfo(fd, users_info, error_code); ++ } ++ ++ else { ++ /* The file has been opened previously and fd->fd_sys is a valid ++ file descriptor. cannot set striping parameters now. 
*/ ++ ++ /* set the values for collective I/O and data sieving parameters */ ++ ADIOI_GEN_SetInfo(fd, users_info, error_code); ++ ++ } ++ ++ *error_code = MPI_SUCCESS; ++} +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_iread.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_iread.c +--- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_iread.c 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_iread.c 2005-12-06 11:54:37.904128127 -0500 +@@ -0,0 +1,106 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre_iread.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. ++ */ ++ ++#include "ad_lustre.h" ++ ++void ADIOI_LUSTRE_IreadContig(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Request *request, int *error_code) ++{ ++ int len, typesize; ++#ifdef NO_AIO ++ ADIO_Status status; ++#else ++ int err=-1; ++#if defined(MPICH2) || !defined(PRINT_ERR_MSG) ++ static char myname[] = "ADIOI_LUSTRE_IREADCONTIG"; ++#endif ++#endif ++ ++ (*request) = ADIOI_Malloc_request(); ++ (*request)->optype = ADIOI_READ; ++ (*request)->fd = fd; ++ (*request)->datatype = datatype; ++ ++ MPI_Type_size(datatype, &typesize); ++ len = count * typesize; ++ ++#ifdef NO_AIO ++ /* HP, FreeBSD, Linux */ ++ /* no support for nonblocking I/O. Use blocking I/O. 
*/ ++ ++ ADIOI_LUSTRE_ReadContig(fd, buf, len, MPI_BYTE, file_ptr_type, offset, ++ &status, error_code); ++ (*request)->queued = 0; ++#ifdef HAVE_STATUS_SET_BYTES ++ if (*error_code == MPI_SUCCESS) { ++ MPI_Get_elements(&status, MPI_BYTE, &len); ++ (*request)->nbytes = len; ++ } ++#endif ++ ++#else ++ if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind; ++ err = ADIOI_LUSTRE_aio(fd, buf, len, offset, 0, &((*request)->handle)); ++ if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind += len; ++ ++ (*request)->queued = 1; ++ ADIOI_Add_req_to_list(request); ++ ++ if (err == -1) { ++#ifdef MPICH2 ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", ++ "**io %s", strerror(errno)); ++ return; ++#elif defined(PRINT_ERR_MSG) ++ *error_code = MPI_ERR_UNKNOWN; ++#else /* MPICH-1 */ ++ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, ++ myname, "I/O Error", "%s", strerror(errno)); ++ ADIOI_Error(fd, *error_code, myname); ++#endif ++ } ++ else *error_code = MPI_SUCCESS; ++#endif /* NO_AIO */ ++ ++ fd->fp_sys_posn = -1; /* set it to null. */ ++ fd->async_count++; ++} ++ ++ ++ ++void ADIOI_LUSTRE_IreadStrided(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Request *request, int ++ *error_code) ++{ ++ ADIO_Status status; ++#ifdef HAVE_STATUS_SET_BYTES ++ int typesize; ++#endif ++ ++ *request = ADIOI_Malloc_request(); ++ (*request)->optype = ADIOI_READ; ++ (*request)->fd = fd; ++ (*request)->datatype = datatype; ++ (*request)->queued = 0; ++ (*request)->handle = 0; ++ ++/* call the blocking version. It is faster because it does data sieving. 
*/ ++ ADIOI_LUSTRE_ReadStrided(fd, buf, count, datatype, file_ptr_type, ++ offset, &status, error_code); ++ ++ fd->async_count++; ++ ++#ifdef HAVE_STATUS_SET_BYTES ++ if (*error_code == MPI_SUCCESS) { ++ MPI_Type_size(datatype, &typesize); ++ (*request)->nbytes = count * typesize; ++ } ++#endif ++} +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_iwrite.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_iwrite.c +--- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_iwrite.c 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_iwrite.c 2005-12-06 11:54:37.906127861 -0500 +@@ -0,0 +1,268 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre_iwrite.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. ++ */ ++ ++#include "ad_lustre.h" ++ ++void ADIOI_LUSTRE_IwriteContig(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Request *request, int *error_code) ++{ ++ int len, typesize; ++#ifdef NO_AIO ++ ADIO_Status status; ++#else ++ int err=-1; ++#if defined(MPICH2) || !defined(PRINT_ERR_MSG) ++ static char myname[] = "ADIOI_LUSTRE_IWRITECONTIG"; ++#endif ++#endif ++ ++ *request = ADIOI_Malloc_request(); ++ (*request)->optype = ADIOI_WRITE; ++ (*request)->fd = fd; ++ (*request)->datatype = datatype; ++ ++ MPI_Type_size(datatype, &typesize); ++ len = count * typesize; ++ ++#ifdef NO_AIO ++ /* HP, FreeBSD, Linux */ ++ /* no support for nonblocking I/O. Use blocking I/O. 
*/ ++ ++ ADIOI_LUSTRE_WriteContig(fd, buf, len, MPI_BYTE, file_ptr_type, offset, ++ &status, error_code); ++ (*request)->queued = 0; ++#ifdef HAVE_STATUS_SET_BYTES ++ if (*error_code == MPI_SUCCESS) { ++ MPI_Get_elements(&status, MPI_BYTE, &len); ++ (*request)->nbytes = len; ++ } ++#endif ++ ++#else ++ if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind; ++ err = ADIOI_LUSTRE_aio(fd, buf, len, offset, 1, &((*request)->handle)); ++ if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind += len; ++ ++ (*request)->queued = 1; ++ ADIOI_Add_req_to_list(request); ++ ++ if (err == -1) { ++#ifdef MPICH2 ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", ++ "**io %s", strerror(errno)); ++ return; ++#elif defined(PRINT_ERR_MSG) ++ *error_code = MPI_ERR_UNKNOWN; ++#else /* MPICH-1 */ ++ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, ++ myname, "I/O Error", "%s", strerror(errno)); ++ ADIOI_Error(fd, *error_code, myname); ++#endif ++ } ++ else *error_code = MPI_SUCCESS; ++#endif /* NO_AIO */ ++ ++ fd->fp_sys_posn = -1; /* set it to null. */ ++ fd->async_count++; ++} ++ ++ ++ ++ ++void ADIOI_LUSTRE_IwriteStrided(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Request *request, int ++ *error_code) ++{ ++ ADIO_Status status; ++#ifdef HAVE_STATUS_SET_BYTES ++ int typesize; ++#endif ++ ++ *request = ADIOI_Malloc_request(); ++ (*request)->optype = ADIOI_WRITE; ++ (*request)->fd = fd; ++ (*request)->datatype = datatype; ++ (*request)->queued = 0; ++ (*request)->handle = 0; ++ ++/* call the blocking version. It is faster because it does data sieving. 
*/ ++ ADIOI_LUSTRE_WriteStrided(fd, buf, count, datatype, file_ptr_type, ++ offset, &status, error_code); ++ ++ fd->async_count++; ++ ++#ifdef HAVE_STATUS_SET_BYTES ++ if (*error_code == MPI_SUCCESS) { ++ MPI_Type_size(datatype, &typesize); ++ (*request)->nbytes = count * typesize; ++ } ++#endif ++} ++ ++ ++/* This function is for implementation convenience. It is not user-visible. ++ It takes care of the differences in the interface for nonblocking I/O ++ on various Unix machines! If wr==1 write, wr==0 read. */ ++ ++int ADIOI_LUSTRE_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset, ++ int wr, void *handle) ++{ ++ int err=-1, fd_sys; ++ ++#ifndef NO_AIO ++ int error_code; ++#ifdef AIO_SUN ++ aio_result_t *result; ++#else ++ struct aiocb *aiocbp; ++#endif ++#endif ++ ++ fd_sys = fd->fd_sys; ++ ++#ifdef AIO_SUN ++ result = (aio_result_t *) ADIOI_Malloc(sizeof(aio_result_t)); ++ result->aio_return = AIO_INPROGRESS; ++ if (wr) err = aiowrite(fd_sys, buf, len, offset, SEEK_SET, result); ++ else err = aioread(fd_sys, buf, len, offset, SEEK_SET, result); ++ ++ if (err == -1) { ++ if (errno == EAGAIN) { ++ /* the man pages say EPROCLIM, but in reality errno is set to EAGAIN! */ ++ ++ /* exceeded the max. no. of outstanding requests. ++ complete all previous async. 
requests and try again.*/ ++ ++ ADIOI_Complete_async(&error_code); ++ if (wr) err = aiowrite(fd_sys, buf, len, offset, SEEK_SET, result); ++ else err = aioread(fd_sys, buf, len, offset, SEEK_SET, result); ++ ++ while (err == -1) { ++ if (errno == EAGAIN) { ++ /* sleep and try again */ ++ sleep(1); ++ if (wr) err = aiowrite(fd_sys, buf, len, offset, SEEK_SET, result); ++ else err = aioread(fd_sys, buf, len, offset, SEEK_SET, result); ++ } ++ else { ++ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); ++ MPI_Abort(MPI_COMM_WORLD, 1); ++ } ++ } ++ } ++ else { ++ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); ++ MPI_Abort(MPI_COMM_WORLD, 1); ++ } ++ } ++ ++ *((aio_result_t **) handle) = result; ++#endif ++ ++#ifdef NO_FD_IN_AIOCB ++/* IBM */ ++ aiocbp = (struct aiocb *) ADIOI_Malloc(sizeof(struct aiocb)); ++ aiocbp->aio_whence = SEEK_SET; ++ aiocbp->aio_offset = offset; ++ aiocbp->aio_buf = buf; ++ aiocbp->aio_nbytes = len; ++ if (wr) err = aio_write(fd_sys, aiocbp); ++ else err = aio_read(fd_sys, aiocbp); ++ ++ if (err == -1) { ++ if (errno == EAGAIN) { ++ /* exceeded the max. no. of outstanding requests. ++ complete all previous async. requests and try again. 
*/ ++ ++ ADIOI_Complete_async(&error_code); ++ if (wr) err = aio_write(fd_sys, aiocbp); ++ else err = aio_read(fd_sys, aiocbp); ++ ++ while (err == -1) { ++ if (errno == EAGAIN) { ++ /* sleep and try again */ ++ sleep(1); ++ if (wr) err = aio_write(fd_sys, aiocbp); ++ else err = aio_read(fd_sys, aiocbp); ++ } ++ else { ++ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); ++ MPI_Abort(MPI_COMM_WORLD, 1); ++ } ++ } ++ } ++ else { ++ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); ++ MPI_Abort(MPI_COMM_WORLD, 1); ++ } ++ } ++ ++ *((struct aiocb **) handle) = aiocbp; ++ ++#elif (!defined(NO_AIO) && !defined(AIO_SUN)) ++/* DEC, SGI IRIX 5 and 6 */ ++ ++ aiocbp = (struct aiocb *) ADIOI_Calloc(sizeof(struct aiocb), 1); ++ aiocbp->aio_fildes = fd_sys; ++ aiocbp->aio_offset = offset; ++ aiocbp->aio_buf = buf; ++ aiocbp->aio_nbytes = len; ++ ++#ifdef AIO_PRIORITY_DEFAULT ++/* DEC */ ++ aiocbp->aio_reqprio = AIO_PRIO_DFL; /* not needed in DEC Unix 4.0 */ ++ aiocbp->aio_sigevent.sigev_signo = 0; ++#else ++ aiocbp->aio_reqprio = 0; ++#endif ++ ++#ifdef AIO_SIGNOTIFY_NONE ++/* SGI IRIX 6 */ ++ aiocbp->aio_sigevent.sigev_notify = SIGEV_NONE; ++#else ++ aiocbp->aio_sigevent.sigev_signo = 0; ++#endif ++ ++ if (wr) err = aio_write(aiocbp); ++ else err = aio_read(aiocbp); ++ ++ if (err == -1) { ++ if (errno == EAGAIN) { ++ /* exceeded the max. no. of outstanding requests. ++ complete all previous async. requests and try again. 
*/ ++ ++ ADIOI_Complete_async(&error_code); ++ if (wr) err = aio_write(aiocbp); ++ else err = aio_read(aiocbp); ++ ++ while (err == -1) { ++ if (errno == EAGAIN) { ++ /* sleep and try again */ ++ sleep(1); ++ if (wr) err = aio_write(aiocbp); ++ else err = aio_read(aiocbp); ++ } ++ else { ++ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); ++ MPI_Abort(MPI_COMM_WORLD, 1); ++ } ++ } ++ } ++ else { ++ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); ++ MPI_Abort(MPI_COMM_WORLD, 1); ++ } ++ } ++ ++ *((struct aiocb **) handle) = aiocbp; ++#endif ++ ++ return err; ++} +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_open.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_open.c +--- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_open.c 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_open.c 2005-12-06 11:54:37.906127861 -0500 +@@ -0,0 +1,100 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre_open.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. 
++ */ ++ ++#include "ad_lustre.h" ++ ++void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code) ++{ ++ int perm, old_mask, amode; ++ struct lov_user_md lum = { 0 }; ++ char *value; ++ ++#if defined(MPICH2) || !defined(PRINT_ERR_MSG) ++ static char myname[] = "ADIOI_LUSTRE_OPEN"; ++#endif ++ ++ if (fd->perm == ADIO_PERM_NULL) { ++ old_mask = umask(022); ++ umask(old_mask); ++ perm = old_mask ^ 0666; ++ } ++ else perm = fd->perm; ++ ++ amode = 0; ++ if (fd->access_mode & ADIO_CREATE) ++ amode = amode | O_CREAT; ++ if (fd->access_mode & ADIO_RDONLY) ++ amode = amode | O_RDONLY; ++ if (fd->access_mode & ADIO_WRONLY) ++ amode = amode | O_WRONLY; ++ if (fd->access_mode & ADIO_RDWR) ++ amode = amode | O_RDWR; ++ if (fd->access_mode & ADIO_EXCL) ++ amode = amode | O_EXCL; ++ ++ fd->fd_sys = open(fd->filename, amode, perm); ++ ++ if (fd->fd_sys != -1) { ++ int err; ++ ++ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); ++ ++ /* get file striping information and set it in info */ ++ lum.lmm_magic = LOV_USER_MAGIC; ++ err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum); ++ ++ if (!err) { ++ sprintf(value, "%d", lum.lmm_stripe_size); ++ MPI_Info_set(fd->info, "striping_unit", value); ++ ++ sprintf(value, "%d", lum.lmm_stripe_count); ++ MPI_Info_set(fd->info, "striping_factor", value); ++ ++ sprintf(value, "%d", lum.lmm_stripe_offset); ++ MPI_Info_set(fd->info, "start_iodevice", value); ++ } ++ ADIOI_Free(value); ++ ++ if (fd->access_mode & ADIO_APPEND) ++ fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); ++ } ++ ++ ++ if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND)) ++ fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); ++ ++ if (fd->fd_sys == -1) { ++#ifdef MPICH2 ++ if (errno == ENAMETOOLONG) ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_BAD_FILE, "**filenamelong", "**filenamelong %s %d", fd->filename, strlen(fd->filename) ); ++ else if (errno == ENOENT) ++ 
*error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_NO_SUCH_FILE, "**filenoexist", "**filenoexist %s", fd->filename ); ++ else if (errno == ENOTDIR || errno == ELOOP) ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_BAD_FILE, "**filenamedir", "**filenamedir %s", fd->filename ); ++ else if (errno == EACCES) { ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_ACCESS, "**fileaccess", "**fileaccess %s", ++ fd->filename ); ++ } ++ else if (errno == EROFS) { ++ /* Read only file or file system and write access requested */ ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_READ_ONLY, "**ioneedrd", 0 ); ++ } ++ else { ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", ++ "**io %s", strerror(errno)); ++ } ++#elif defined(PRINT_ERR_MSG) ++ *error_code = MPI_ERR_UNKNOWN; ++#else /* MPICH-1 */ ++ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, ++ myname, "I/O Error", "%s", strerror(errno)); ++ ADIOI_Error(ADIO_FILE_NULL, *error_code, myname); ++#endif ++ } ++ else *error_code = MPI_SUCCESS; ++} +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_rdcoll.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_rdcoll.c +--- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_rdcoll.c 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_rdcoll.c 2005-12-06 11:54:37.907127727 -0500 +@@ -0,0 +1,18 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre_rdcoll.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. 
++ */ ++ ++#include "ad_lustre.h" ++ ++void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, int ++ *error_code) ++{ ++ ADIOI_GEN_ReadStridedColl(fd, buf, count, datatype, file_ptr_type, ++ offset, status, error_code); ++} +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_read.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_read.c +--- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_read.c 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_read.c 2005-12-06 11:54:37.907127727 -0500 +@@ -0,0 +1,67 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre_read.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. ++ */ ++ ++#include "ad_lustre.h" ++ ++void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, int *error_code) ++{ ++ int err=-1, datatype_size, len; ++#if defined(MPICH2) || !defined(PRINT_ERR_MSG) ++ static char myname[] = "ADIOI_LUSTRE_READCONTIG"; ++#endif ++ ++ MPI_Type_size(datatype, &datatype_size); ++ len = datatype_size * count; ++ ++ if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { ++ if (fd->fp_sys_posn != offset) ++ lseek(fd->fd_sys, offset, SEEK_SET); ++ err = read(fd->fd_sys, buf, len); ++ fd->fp_sys_posn = offset + len; ++ /* individual file pointer not updated */ ++ } ++ else { /* read from curr. location of ind. 
file pointer */ ++ if (fd->fp_sys_posn != fd->fp_ind) ++ lseek(fd->fd_sys, fd->fp_ind, SEEK_SET); ++ err = read(fd->fd_sys, buf, len); ++ fd->fp_ind += err; ++ fd->fp_sys_posn = fd->fp_ind; ++ } ++ ++#ifdef HAVE_STATUS_SET_BYTES ++ if (err != -1) MPIR_Status_set_bytes(status, datatype, err); ++#endif ++ ++ if (err == -1) { ++#ifdef MPICH2 ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", ++ "**io %s", strerror(errno)); ++#elif defined(PRINT_ERR_MSG) ++ *error_code = MPI_ERR_UNKNOWN; ++#else /* MPICH-1 */ ++ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, ++ myname, "I/O Error", "%s", strerror(errno)); ++ ADIOI_Error(fd, *error_code, myname); ++#endif ++ } ++ else *error_code = MPI_SUCCESS; ++} ++ ++ ++ ++ ++void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, int ++ *error_code) ++{ ++ ADIOI_GEN_ReadStrided(fd, buf, count, datatype, file_ptr_type, ++ offset, status, error_code); ++} +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_resize.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_resize.c +--- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_resize.c 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_resize.c 2005-12-06 11:54:37.909127460 -0500 +@@ -0,0 +1,32 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre_resize.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. 
++ */ ++ ++#include "ad_lustre.h" ++ ++void ADIOI_LUSTRE_Resize(ADIO_File fd, ADIO_Offset size, int *error_code) ++{ ++ int err; ++#if defined(MPICH2) || !defined(PRINT_ERR_MSG) ++ static char myname[] = "ADIOI_LUSTRE_RESIZE"; ++#endif ++ ++ err = ftruncate(fd->fd_sys, size); ++ if (err == -1) { ++#ifdef MPICH2 ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", ++ "**io %s", strerror(errno)); ++#elif defined(PRINT_ERR_MSG) ++ *error_code = MPI_ERR_UNKNOWN; ++#else /* MPICH-1 */ ++ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, ++ myname, "I/O Error", "%s", strerror(errno)); ++ ADIOI_Error(fd, *error_code, myname); ++#endif ++ } ++ else *error_code = MPI_SUCCESS; ++} +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_seek.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_seek.c +--- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_seek.c 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_seek.c 2005-12-06 11:54:37.911127194 -0500 +@@ -0,0 +1,15 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre_seek.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. 
++ */ ++ ++#include "ad_lustre.h" ++ ++ADIO_Offset ADIOI_LUSTRE_SeekIndividual(ADIO_File fd, ADIO_Offset offset, ++ int whence, int *error_code) ++{ ++ return ADIOI_GEN_SeekIndividual(fd, offset, whence, error_code); ++} +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_wait.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_wait.c +--- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_wait.c 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_wait.c 2005-12-06 11:54:37.914126794 -0500 +@@ -0,0 +1,188 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre_wait.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. ++ */ ++ ++#include "ad_lustre.h" ++ ++void ADIOI_LUSTRE_ReadComplete(ADIO_Request *request, ADIO_Status *status, int *error_code) ++{ ++#ifndef NO_AIO ++#if defined(MPICH2) || !defined(PRINT_ERR_MSG) ++ static char myname[] = "ADIOI_LUSTRE_READCOMPLETE"; ++#endif ++#ifdef AIO_SUN ++ aio_result_t *result=0, *tmp; ++#else ++ int err; ++#endif ++#ifdef AIO_HANDLE_IN_AIOCB ++ struct aiocb *tmp1; ++#endif ++#endif ++ ++ if (*request == ADIO_REQUEST_NULL) { ++ *error_code = MPI_SUCCESS; ++ return; ++ } ++ ++#ifdef AIO_SUN ++ if ((*request)->queued) { /* dequeue it */ ++ tmp = (aio_result_t *) (*request)->handle; ++ while (tmp->aio_return == AIO_INPROGRESS) usleep(1000); ++ /* sleep for 1 ms., until done. Is 1 ms. a good number? 
*/ ++ /* when done, dequeue any one request */ ++ result = (aio_result_t *) aiowait(0); ++ ++ (*request)->nbytes = tmp->aio_return; ++ ++ if (tmp->aio_return == -1) { ++#ifdef MPICH2 ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", ++ "**io %s", strerror(tmp->aio_errno)); ++ return; ++#elif defined(PRINT_ERR_MSG) ++ *error_code = MPI_ERR_UNKNOWN; ++#else /* MPICH-1 */ ++ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, ++ myname, "I/O Error", "%s", strerror(tmp->aio_errno)); ++ ADIOI_Error((*request)->fd, *error_code, myname); ++#endif ++ } ++ else *error_code = MPI_SUCCESS; ++ ++/* aiowait only dequeues a request. The completion of a request can be ++ checked by just checking the aio_return flag in the handle passed ++ to the original aioread()/aiowrite(). Therefore, I need to ensure ++ that aiowait() is called exactly once for each previous ++ aioread()/aiowrite(). This is also taken care of in ADIOI_xxxDone */ ++ } ++ else *error_code = MPI_SUCCESS; ++ ++#ifdef HAVE_STATUS_SET_BYTES ++ if ((*request)->nbytes != -1) ++ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); ++#endif ++ ++#endif ++ ++#ifdef AIO_HANDLE_IN_AIOCB ++/* IBM */ ++ if ((*request)->queued) { ++ do { ++ err = aio_suspend(1, (struct aiocb **) &((*request)->handle)); ++ } while ((err == -1) && (errno == EINTR)); ++ ++ tmp1 = (struct aiocb *) (*request)->handle; ++ if (err != -1) { ++ err = aio_return(tmp1->aio_handle); ++ (*request)->nbytes = err; ++ errno = aio_error(tmp1->aio_handle); ++ } ++ else (*request)->nbytes = -1; ++ ++/* on DEC, it is required to call aio_return to dequeue the request. ++ IBM man pages don't indicate what function to use for dequeue. ++ I'm assuming it is aio_return! POSIX says aio_return may be called ++ only once on a given handle. 
*/ ++ ++ if (err == -1) { ++#ifdef MPICH2 ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", ++ "**io %s", strerror(errno)); ++ return; ++#elif defined(PRINT_ERR_MSG) ++ *error_code = MPI_ERR_UNKNOWN; ++#else /* MPICH-1 */ ++ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, ++ myname, "I/O Error", "%s", strerror(errno)); ++ ADIOI_Error((*request)->fd, *error_code, myname); ++#endif ++ } ++ else *error_code = MPI_SUCCESS; ++ } /* if ((*request)->queued) */ ++ else *error_code = MPI_SUCCESS; ++ ++#ifdef HAVE_STATUS_SET_BYTES ++ if ((*request)->nbytes != -1) ++ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); ++#endif ++ ++#elif (!defined(NO_AIO) && !defined(AIO_SUN)) ++/* DEC, SGI IRIX 5 and 6 */ ++ if ((*request)->queued) { ++ do { ++ err = aio_suspend((const aiocb_t **) &((*request)->handle), 1, 0); ++ } while ((err == -1) && (errno == EINTR)); ++ ++ if (err != -1) { ++ err = aio_return((struct aiocb *) (*request)->handle); ++ (*request)->nbytes = err; ++ errno = aio_error((struct aiocb *) (*request)->handle); ++ } ++ else (*request)->nbytes = -1; ++ ++ if (err == -1) { ++#ifdef MPICH2 ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", ++ "**io %s", strerror(errno)); ++ return; ++#elif defined(PRINT_ERR_MSG) ++ *error_code = MPI_ERR_UNKNOWN; ++#else /* MPICH-1 */ ++ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, ++ myname, "I/O Error", "%s", strerror(errno)); ++ ADIOI_Error((*request)->fd, *error_code, myname); ++#endif ++ } ++ else *error_code = MPI_SUCCESS; ++ } /* if ((*request)->queued) */ ++ else *error_code = MPI_SUCCESS; ++#ifdef HAVE_STATUS_SET_BYTES ++ if ((*request)->nbytes != -1) ++ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); ++#endif ++#endif ++ ++#ifndef NO_AIO ++ if ((*request)->queued != -1) { ++ ++ /* queued = -1 is an internal hack used when the request 
must ++ be completed, but the request object should not be ++ freed. This is used in ADIOI_Complete_async, because the user ++ will call MPI_Wait later, which would require status to ++ be filled. Ugly but works. queued = -1 should be used only ++ in ADIOI_Complete_async. ++ This should not affect the user in any way. */ ++ ++ /* if request is still queued in the system, it is also there ++ on ADIOI_Async_list. Delete it from there. */ ++ if ((*request)->queued) ADIOI_Del_req_from_list(request); ++ ++ (*request)->fd->async_count--; ++ if ((*request)->handle) ADIOI_Free((*request)->handle); ++ ADIOI_Free_request((ADIOI_Req_node *) (*request)); ++ *request = ADIO_REQUEST_NULL; ++ } ++ ++#else ++/* HP, FreeBSD, Linux */ ++ ++#ifdef HAVE_STATUS_SET_BYTES ++ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); ++#endif ++ (*request)->fd->async_count--; ++ ADIOI_Free_request((ADIOI_Req_node *) (*request)); ++ *request = ADIO_REQUEST_NULL; ++ *error_code = MPI_SUCCESS; ++#endif ++} ++ ++ ++void ADIOI_LUSTRE_WriteComplete(ADIO_Request *request, ADIO_Status *status, int *error_code) ++{ ++ ADIOI_LUSTRE_ReadComplete(request, status, error_code); ++} +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_wrcoll.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_wrcoll.c +--- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_wrcoll.c 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_wrcoll.c 2005-12-06 11:54:37.914126794 -0500 +@@ -0,0 +1,18 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre_wrcoll.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. 
++ */ ++ ++#include "ad_lustre.h" ++ ++void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, int ++ *error_code) ++{ ++ ADIOI_GEN_WriteStridedColl(fd, buf, count, datatype, file_ptr_type, ++ offset, status, error_code); ++} +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_write.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_write.c +--- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_write.c 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_write.c 2005-12-06 11:54:37.914126794 -0500 +@@ -0,0 +1,66 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * $Id: ad_lustre_write.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. ++ */ ++ ++#include "ad_lustre.h" ++ ++void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, int *error_code) ++{ ++ int err=-1, datatype_size, len; ++#if defined(MPICH2) || !defined(PRINT_ERR_MSG) ++ static char myname[] = "ADIOI_LUSTRE_WRITECONTIG"; ++#endif ++ ++ MPI_Type_size(datatype, &datatype_size); ++ len = datatype_size * count; ++ ++ if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { ++ if (fd->fp_sys_posn != offset) ++ lseek(fd->fd_sys, offset, SEEK_SET); ++ err = write(fd->fd_sys, buf, len); ++ fd->fp_sys_posn = offset + err; ++ /* individual file pointer not updated */ ++ } ++ else { /* write from curr. location of ind. 
file pointer */ ++ if (fd->fp_sys_posn != fd->fp_ind) ++ lseek(fd->fd_sys, fd->fp_ind, SEEK_SET); ++ err = write(fd->fd_sys, buf, len); ++ fd->fp_ind += err; ++ fd->fp_sys_posn = fd->fp_ind; ++ } ++ ++#ifdef HAVE_STATUS_SET_BYTES ++ if (err != -1 && status) MPIR_Status_set_bytes(status, datatype, err); ++#endif ++ ++ if (err == -1) { ++#ifdef MPICH2 ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", ++ "**io %s", strerror(errno)); ++#elif defined(PRINT_ERR_MSG) ++ *error_code = MPI_ERR_UNKNOWN; ++#else ++ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, ++ myname, "I/O Error", "%s", strerror(errno)); ++ ADIOI_Error(fd, *error_code, myname); ++#endif ++ } ++ else *error_code = MPI_SUCCESS; ++} ++ ++ ++ ++void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, int ++ *error_code) ++{ ++ ADIOI_GEN_WriteStrided(fd, buf, count, datatype, file_ptr_type, ++ offset, status, error_code); ++} +diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/Makefile.in mpich-1.2.6/romio/adio/ad_lustre/Makefile.in +--- mpich-1.2.6/romio/adio/ad_lustre/Makefile.in 1969-12-31 19:00:00.000000000 -0500 ++++ mpich-1.2.6/romio/adio/ad_lustre/Makefile.in 2005-12-06 11:54:37.883130927 -0500 +@@ -0,0 +1,47 @@ ++CC = @CC@ ++AR = @AR@ ++LIBNAME = @LIBNAME@ ++srcdir = @srcdir@ ++CC_SHL = @CC_SHL@ ++SHLIBNAME = @SHLIBNAME@ ++ ++INCLUDE_DIR = -I@MPI_INCLUDE_DIR@ -I${srcdir}/../include -I../include ++CFLAGS = @CFLAGS@ $(INCLUDE_DIR) ++ ++C_COMPILE_SHL = $(CC_SHL) @CFLAGS@ $(INCLUDE_DIR) ++ ++@VPATH@ ++ ++AD_LUSTRE_OBJECTS = ad_lustre_close.o ad_lustre_read.o \ ++ ad_lustre_open.o ad_lustre_write.o ad_lustre_done.o \ ++ ad_lustre_fcntl.o ad_lustre_iread.o ad_lustre_iwrite.o ad_lustre_wait.o \ ++ ad_lustre_resize.o ad_lustre_hints.o \ ++ ad_lustre.o ++ ++ ++default: $(LIBNAME) ++ @if [ "@ENABLE_SHLIB@" != "none" ] ; then \ ++ $(MAKE) 
$(SHLIBNAME).la ;\ ++ fi ++ ++.SUFFIXES: $(SUFFIXES) .p .lo ++ ++.c.o: ++ $(CC) $(CFLAGS) -c $< ++.c.lo: ++ $(C_COMPILE_SHL) -c $< ++ @mv -f $*.o $*.lo ++ ++$(LIBNAME): $(AD_LUSTRE_OBJECTS) ++ $(AR) $(LIBNAME) $(AD_LUSTRE_OBJECTS) ++ ++AD_LUSTRE_LOOBJECTS=$(AD_LUSTRE_OBJECTS:.o=.lo) ++$(SHLIBNAME).la: $(AD_LUSTRE_LOOBJECTS) ++ $(AR) $(SHLIBNAME).la $(AD_LUSTRE_LOOBJECTS) ++ ++coverage: ++ -@for file in ${AD_LUSTRE_OBJECTS:.o=.c} ; do \ ++ gcov -b -f $$file ; done ++ ++clean: ++ @rm -f *.o *.lo +--- mpich-1.2.6/romio/Makefile.in 2004-01-27 18:27:35.000000000 -0500 ++++ mpich-1.2.6/romio/Makefile.in 2005-12-06 11:54:38.000000000 -0500 +@@ -14,7 +14,7 @@ DIRS = mpi-io adio/common + MPIO_DIRS = mpi-io + EXTRA_SRC_DIRS = @EXTRA_SRC_DIRS@ + FILE_SYS_DIRS = @FILE_SYS_DIRS@ +-ALL_DIRS = mpi-io mpi-io/fortran mpi2-other/info mpi2-other/info/fortran mpi2-other/array mpi2-other/array/fortran adio/common adio/ad_pfs adio/ad_piofs adio/ad_nfs adio/ad_ufs adio/ad_xfs adio/ad_hfs adio/ad_sfs adio/ad_testfs adio/ad_pvfs adio/ad_pvfs2 test ++ALL_DIRS = mpi-io mpi-io/fortran mpi2-other/info mpi2-other/info/fortran mpi2-other/array mpi2-other/array/fortran adio/common adio/ad_pfs adio/ad_piofs adio/ad_nfs adio/ad_ufs adio/ad_xfs adio/ad_hfs adio/ad_sfs adio/ad_testfs adio/ad_pvfs adio/ad_pvfs2 adio/ad_lustre test + SHELL = /bin/sh + + @VPATH@ +--- mpich-1.2.6/romio/configure.in 2004-08-02 09:37:31.000000000 -0400 ++++ mpich-1.2.6/romio/configure.in 2005-12-06 11:54:38.000000000 -0500 +@@ -90,7 +90,7 @@ MPIO_REQ_REAL_POBJECTS="_iotest.o _iowai + # + have_aio=no + # +-known_filesystems="nfs ufs pfs piofs pvfs pvfs2 testfs xfs hfs sfs" ++known_filesystems="nfs ufs pfs piofs pvfs pvfs2 testfs xfs hfs sfs lustre" + known_mpi_impls="mpich_mpi sgi_mpi hp_mpi cray_mpi lam_mpi" + # + # Defaults +@@ -1270,6 +1270,9 @@ fi + if test -n "$file_system_testfs"; then + AC_DEFINE(ROMIO_TESTFS,1,[Define for TESTFS]) + fi ++if test -n "$file_system_lustre"; then ++ AC_DEFINE(ROMIO_LUSTRE,1,[Define for 
LUSTRE]) ++fi + if test -n "$file_system_piofs"; then + AC_DEFINE(PIOFS,1,[Define for PIOFS]) + USER_CFLAGS="$USER_CFLAGS -bI:/usr/include/piofs/piofs.exp" +@@ -1634,7 +1637,7 @@ AC_OUTPUT(Makefile localdefs mpi-io/Make + adio/ad_nfs/Makefile adio/ad_ufs/Makefile \ + adio/ad_xfs/Makefile adio/ad_hfs/Makefile \ + adio/ad_sfs/Makefile adio/ad_pfs/Makefile \ +- adio/ad_testfs/Makefile adio/ad_pvfs/Makefile \ ++ adio/ad_testfs/Makefile adio/ad_lustre/Makefile adio/ad_pvfs/Makefile \ + adio/ad_pvfs2/Makefile adio/ad_piofs/Makefile \ + mpi-io/fortran/Makefile mpi2-other/info/fortran/Makefile \ + mpi2-other/array/fortran/Makefile test/fmisc.f \ +--- mpich-1.2.6/romio/configure 2004-08-04 12:08:28.000000000 -0400 ++++ mpich-1.2.6/romio/configure 2005-12-06 11:54:38.000000000 -0500 +@@ -623,7 +623,7 @@ MPIO_REQ_REAL_POBJECTS="_iotest.o _iowai + # + have_aio=no + # +-known_filesystems="nfs ufs pfs piofs pvfs pvfs2 testfs xfs hfs sfs" ++known_filesystems="nfs ufs pfs piofs pvfs pvfs2 testfs lustre xfs hfs sfs" + known_mpi_impls="mpich_mpi sgi_mpi hp_mpi cray_mpi lam_mpi" + # + # Defaults +@@ -4022,6 +4022,13 @@ if test -n "$file_system_testfs"; then + EOF + + fi ++if test -n "$file_system_lustre"; then ++ cat >> confdefs.h <<\EOF ++#define LUSTRE 1 ++EOF ++ ++fi ++ + if test -n "$file_system_piofs"; then + cat >> confdefs.h <<\EOF + #define PIOFS 1 +@@ -4746,7 +4753,7 @@ trap 'rm -fr `echo "Makefile localdefs m + adio/ad_xfs/Makefile adio/ad_hfs/Makefile \ + adio/ad_sfs/Makefile adio/ad_pfs/Makefile \ + adio/ad_testfs/Makefile adio/ad_pvfs/Makefile \ +- adio/ad_pvfs2/Makefile adio/ad_piofs/Makefile \ ++ adio/ad_pvfs2/Makefile adio/ad_piofs/Makefile adio/ad_lustre/Makefile\ + mpi-io/fortran/Makefile mpi2-other/info/fortran/Makefile \ + mpi2-other/array/fortran/Makefile test/fmisc.f \ + test/fcoll_test.f test/pfcoll_test.f test/fperf.f adio/include/romioconf.h" | sed "s/:[^ ]*//g"` conftest*; exit 1' 1 2 15 +@@ -4912,7 +4919,7 @@ CONFIG_FILES=\${CONFIG_FILES-"Makefile l + 
adio/ad_nfs/Makefile adio/ad_ufs/Makefile \ + adio/ad_xfs/Makefile adio/ad_hfs/Makefile \ + adio/ad_sfs/Makefile adio/ad_pfs/Makefile \ +- adio/ad_testfs/Makefile adio/ad_pvfs/Makefile \ ++ adio/ad_testfs/Makefile adio/ad_lustre/Makefile adio/ad_pvfs/Makefile \ + adio/ad_pvfs2/Makefile adio/ad_piofs/Makefile \ + mpi-io/fortran/Makefile mpi2-other/info/fortran/Makefile \ + mpi2-other/array/fortran/Makefile test/fmisc.f \ +--- mpich-1.2.6/romio/adio/include/romioconf.h.in 2004-08-04 12:08:28.000000000 -0400 ++++ mpich-1.2.6/romio/adio/include/romioconf.h.in 2005-12-06 11:54:38.000000000 -0500 +@@ -192,6 +192,9 @@ + /* Define for TESTFS */ + #undef ROMIO_TESTFS + ++/* Define for LUSTRE */ ++#undef LUSTRE ++ + /* Define for PIOFS */ + #undef PIOFS + +--- mpich-1.2.6/romio/adio/include/mpio_error.h 2002-11-15 11:26:23.000000000 -0500 ++++ mpich-1.2.6/romio/adio/include/mpio_error.h 2005-12-06 11:54:38.000000000 -0500 +@@ -62,6 +62,7 @@ + #define MPIR_ERR_FILETYPE 33 + #define MPIR_ERR_NO_NTFS 35 + #define MPIR_ERR_NO_TESTFS 36 ++#define MPIR_ERR_NO_LUSTRE 37 + + /* MPI_ERR_COMM */ + #ifndef MPIR_ERR_COMM_NULL +--- mpich-1.2.6/romio/adio/include/adioi_fs_proto.h 2003-06-24 18:48:23.000000000 -0400 ++++ mpich-1.2.6/romio/adio/include/adioi_fs_proto.h 2005-12-06 11:54:38.000000000 -0500 +@@ -261,6 +261,68 @@ ADIO_Offset ADIOI_UFS_SeekIndividual(ADI + void ADIOI_UFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); + #endif + ++#ifdef LUSTRE ++extern struct ADIOI_Fns_struct ADIO_LUSTRE_operations; ++ ++void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code); ++void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code); ++void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, int ++ *error_code); ++void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, int ++ *error_code); 
++void ADIOI_LUSTRE_IwriteContig(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Request *request, int ++ *error_code); ++void ADIOI_LUSTRE_IreadContig(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Request *request, int ++ *error_code); ++int ADIOI_LUSTRE_ReadDone(ADIO_Request *request, ADIO_Status *status, int ++ *error_code); ++int ADIOI_LUSTRE_WriteDone(ADIO_Request *request, ADIO_Status *status, int ++ *error_code); ++void ADIOI_LUSTRE_ReadComplete(ADIO_Request *request, ADIO_Status *status, int ++ *error_code); ++void ADIOI_LUSTRE_WriteComplete(ADIO_Request *request, ADIO_Status *status, ++ int *error_code); ++void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int ++ *error_code); ++void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, int ++ *error_code); ++void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, int ++ *error_code); ++void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, int ++ *error_code); ++void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, int ++ *error_code); ++void ADIOI_LUSTRE_IreadStrided(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Request *request, int ++ *error_code); ++void ADIOI_LUSTRE_IwriteStrided(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Request *request, int ++ *error_code); ++void ADIOI_LUSTRE_Flush(ADIO_File fd, int *error_code); ++void 
ADIOI_LUSTRE_Resize(ADIO_File fd, ADIO_Offset size, int *error_code); ++ADIO_Offset ADIOI_LUSTRE_SeekIndividual(ADIO_File fd, ADIO_Offset offset, ++ int whence, int *error_code); ++void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); ++#endif ++ + #ifdef ROMIO_NTFS + extern struct ADIOI_Fns_struct ADIO_NTFS_operations; + +--- mpich-1.2.6/romio/adio/include/adio.h 2004-06-07 13:59:57.000000000 -0400 ++++ mpich-1.2.6/romio/adio/include/adio.h 2005-12-06 11:54:38.000000000 -0500 +@@ -276,6 +276,7 @@ typedef struct { + #define ADIO_NTFS 158 /* NTFS for Windows NT */ + #define ADIO_TESTFS 159 /* fake file system for testing */ + #define ADIO_PVFS2 160 /* PVFS2: 2nd generation PVFS */ ++#define ADIO_LUSTRE 161 /* Lustre */ + + #define ADIO_SEEK_SET SEEK_SET + #define ADIO_SEEK_CUR SEEK_CUR +--- mpich-1.2.6/romio/adio/common/setfn.c 2003-06-24 18:48:18.000000000 -0400 ++++ mpich-1.2.6/romio/adio/common/setfn.c 2005-12-06 11:54:38.000000000 -0500 +@@ -114,6 +114,16 @@ void ADIOI_SetFunctions(ADIO_File fd) + #endif + break; + ++ case ADIO_LUSTRE: ++#ifdef LUSTRE ++ *(fd->fns) = ADIO_LUSTRE_operations; ++#else ++ FPRINTF(stderr, "ADIOI_SetFunctions: ROMIO has not been configured to use the LUSTRE file system\n"); ++ MPI_Abort(MPI_COMM_WORLD, 1); ++#endif ++ break; ++ ++ + default: + FPRINTF(stderr, "ADIOI_SetFunctions: Unsupported file system type\n"); + MPI_Abort(MPI_COMM_WORLD, 1); +--- mpich-1.2.6/romio/adio/common/ad_fstype.c 2003-09-04 16:24:44.000000000 -0400 ++++ mpich-1.2.6/romio/adio/common/ad_fstype.c 2005-12-06 11:54:38.000000000 -0500 +@@ -204,6 +204,11 @@ static void ADIO_FileSysType_fncall(char + } + } + #elif defined(LINUX) ++#warning use correct include ++# if defined (LUSTRE) ++#define LL_SUPER_MAGIC 0x0BD00BD0 ++# endif ++ + do { + err = statfs(filename, &fsbuf); + } while (err && (errno == ESTALE)); +@@ -218,6 +223,9 @@ static void ADIO_FileSysType_fncall(char + else { + /* FPRINTF(stderr, "%d\n", fsbuf.f_type);*/ + if 
(fsbuf.f_type == NFS_SUPER_MAGIC) *fstype = ADIO_NFS; ++# if defined (LUSTRE) ++ else if (fsbuf.f_type == LL_SUPER_MAGIC) *fstype = ADIO_LUSTRE; ++#endif + # if defined(ROMIO_PVFS) + else if (fsbuf.f_type == PVFS_SUPER_MAGIC) *fstype = ADIO_PVFS; + # endif +@@ -359,6 +367,11 @@ static void ADIO_FileSysType_prefix(char + { + *fstype = ADIO_TESTFS; + } ++ else if (!strncmp(filename, "lustre:", 7) ++ || !strncmp(filename, "LUSTRE:", 7)) ++ { ++ *fstype = ADIO_LUSTRE; ++ } + else { + #ifdef ROMIO_NTFS + *fstype = ADIO_NTFS; +@@ -644,6 +657,24 @@ void ADIO_ResolveFileType(MPI_Comm comm, + *ops = &ADIO_TESTFS_operations; + #endif + } ++ if (file_system == ADIO_LUSTRE) { ++#ifndef LUSTRE ++# ifdef MPICH2 ++ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**iofstypeunsupported", 0); ++ return; ++# elif defined(PRINT_ERR_MSG) ++ FPRINTF(stderr, "ADIO_ResolveFileType: ROMIO has not been configured to use the LUSTRE file system\n"); ++ MPI_Abort(MPI_COMM_WORLD, 1); ++# else /* MPICH-1 */ ++ myerrcode = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ERR_NO_LUSTRE, ++ myname, (char *) 0, (char *) 0); ++ *error_code = ADIOI_Error(MPI_FILE_NULL, myerrcode, myname); ++# endif ++ return; ++#else ++ *ops = &ADIO_LUSTRE_operations; ++#endif ++ } + *error_code = MPI_SUCCESS; + *fstype = file_system; + return; diff --git a/lustre/doc/Makefile.am b/lustre/doc/Makefile.am index c93427c..1d02c60 100644 --- a/lustre/doc/Makefile.am +++ b/lustre/doc/Makefile.am @@ -14,47 +14,25 @@ PS2PDF = ps2pdf TEXEXPAND = texexpand SUFFIXES = .lin .lyx .pdf .ps .sgml .html .txt .tex .fig .eps .dvi -if DOC - DOCS = lustre.pdf recovery-manual.pdf -else - DOCS = -endif - if UTILS -man_MANS = lfs.1 lmc.1 lwizard.1 lconf.8 lctl.8 +man_MANS = lustre.7 lfs.1 mount.lustre.8 mkfs.lustre.8 tunefs.lustre.8 lctl.8 endif - -IMAGES := $(patsubst %.fig,%.eps,$(wildcard *.fig)) LYXFILES= $(filter-out $(patsubst %.lin,%.lyx,$(wildcard *.lin)),\ $(wildcard *.lin *.lyx)) 
-MAINTAINERCLEANFILES = $(IMAGES) $(DOCS) $(VERSIONED) -CLEANFILES = *.aux *.tex doc.old/*.aux doc.old/*.tex *.eps *.log *.pdf -VERSIONED = lustre.lyx doc.old/lustre.lyx -GENERATED = $(VERSIONED) lustre-full.tex lustre-chbar.tex +CLEANFILES = *.aux *.tex *.log *.pdf -EXTRA_DIST = chbar.sh postbar tex2pdf $(DOCS) $(man_MANS) $(IMAGES) \ - $(LYXFILES) lfs.1 lmc.1 lwizard.1 lconf.8 lctl.8 +EXTRA_DIST = tex2pdf lustre.7 mount.lustre.8 mkfs.lustre.8 tunefs.lustre.8 \ + $(LYXFILES) lfs.1 lctl.8 all: -docs: $(DOCS) # These variables are set by lbuild/check-build. RPMRELEASE ?= RELEASE KERNTYPE ?= chaos KERNRPM ?= kernel-2.4.18lustre13-RELEASE.i386.rpm -# update date and version in document -date := $(shell date +%x) -tag := $(shell echo '$$Name: $$' | sed -e 's/^\$$Na''me: *\$$$$/HEAD/; s/^\$$Na''me: \(.*\) \$$$$/\1/') -addversion = sed -e 's|@T''AG@|$(tag)|g; s|@VER''SION@|$(VERSION)|g; s|@DA''TE@|$(date)|g; s|@RPM''RELEASE@|$(RPMRELEASE)|g; s|@KERN''TYPE@|$(KERNTYPE)|g; s|@KERN''RPM@|$(KERNRPM)|g' - -# Regenerate when the $(VERSION) or $Name: $ changes. -.INTERMEDIATE: $(GENERATED) -$(VERSIONED) : %.lyx: %.lin Makefile - $(addversion) $< > $@ - .lyx.pdf: @echo $(LYX2PDF) $< && $(LYX2PDF) $< || printf "\n*** Warning: not creating PDF docs; install lyx to rectify this\n" @@ -82,49 +60,3 @@ $(VERSIONED) : %.lyx: %.lin Makefile .ps.pdf: $(PS2PDF) $< $@ - -lustre.tex recovery-manual.tex lustre.pdf recovery-manual.pdf lustre.txt lustre.html: $(IMAGES) $(LYXFILES) -.fig.eps: - -fig2dev -L eps $< > $@ - -syncweb: lustre.pdf - cp lustre.pdf /usr/src/www/content/lustre/docs/lustre.pdf - ( cd /usr/src/www ; make lustre ; make synclustre ) -.PHONY: syncweb chbar - -# Build a changebar document from the files in doc.old and this directory. -chbar: lustre-chbar.pdf - -# FIXME: Temporary rules until pdftex displays changebars correctly. 
-lustre-chbar.pdf: lustre-chbar-nopdf.ps - $(PS2PDF) $< $@ -lustre-chbar-nopdf.ps: lustre-chbar-nopdf.dvi - $(DVIPS) $< -o $@ -lustre-chbar-nopdf.dvi: lustre-chbar-nopdf.tex - $(LATEX) $< - $(LATEX) $< -lustre-chbar-nopdf.tex: lustre-chbar.tex - sed -e 's/^\(.*usepackage.*pdftex\)/%\1/' $< > $@ - -%-chbar.tex: chbar.sh postbar doc.old/%-full.tex %-full.tex - $(SHELL) $(srcdir)/chbar.sh doc.old/$*-full.tex $*-full.tex | $(srcdir)/postbar > $@ - -doc.old/%-full.tex: doc.old/%.tex - cd doc.old && $(TEXEXPAND) -texinputs=. -output=$*-full.tex $*.tex - -# This rule needs to come after the more specific doc.old rule. -%-full.tex: %.tex - $(TEXEXPAND) -texinputs=. -texinputs=$(srcdir) -output=$@ $< - -# Check out the old directory if it doesn't exist. -doc.old/lustre.lin: - @if test "X$(OLD)" = X; then \ - echo "You must populate doc.old or specify a CVS tag like OLD=v0_5_1"; \ - exit 1; \ - fi - rm -rf doc.old - mkdir doc.old - cvs checkout -r $(OLD) -d doc.old lustre/doc - -dist-hook: - rm -rf $(distdir)/figs/CVS diff --git a/lustre/doc/blank_template.lyx b/lustre/doc/blank_template.lyx deleted file mode 100644 index 62ee1db..0000000 --- a/lustre/doc/blank_template.lyx +++ /dev/null @@ -1,138 +0,0 @@ -#LyX 1.3 created this file. 
For more info see http://www.lyx.org/ -\lyxformat 221 -\textclass newbook -\begin_preamble -\usepackage{fancyhdr} -\usepackage{array} -\usepackage{epsfig} -\usepackage{applegar} -\usepackage{makeidx} -\usepackage{multicol} -\usepackage{longtable} -\usepackage{listings} -\usepackage{color} -\usepackage{coz} - -\setlength{\parindent}{0pt} -\parskip 5pt - -\newcommand{\tm}{\symbol{'252}} -\newcommand{\rt}{\symbol{'250}} -\newcommand{\cpr}{\symbol{'251}} -\newcommand{\gt}{\symbol{'074}} -\newcommand{\lt}{\symbol{'076}} -\newcommand{\verbar}{\symbol{'174}} -\newcommand{\hdr}[1]{{\bf #1.\ }} -\newcommand{\dbs}{$\backslash$} -\newcommand{\centre}[1]{ \begin{center} #1 \end{center}} -\newcommand{\cfs}{Cluster File System} -\newcommand{\WS}{IWS} - -\newenvironment{tscreen}% - {\begin{quote}\bgroup\small\tt}% - {\egroup\end{quote}} -\newenvironment{summarybox}[1]{\framebox{{\bf #1}}\penalty500\begin{enumerate}}{\end{enumerate}} - -\renewcommand{\chaptermark}[1]{\markboth{\sf \bf \thechapter\ #1}{}} -\renewcommand{\sectionmark}[1]{\markright{\bfseries #1 \ \thesection}} -\lhead[]{\leftmark} -\rhead[\rightmark]{} - -\setlength{\unitlength}{18mm} -\newcommand{\blob}{\rule[-.2\unitlength]{2\unitlength}{.5\unitlength}} -\newcommand\rblob{\thepage - \begin{picture}(0,0) - \put(.25,-\value{chapter}){\blob} - \end{picture}} -\newcommand\lblob{% - \begin{picture}(0,0) - \put(-3,-\value{chapter}){\blob} - \end{picture}% - \thepage} -\newcounter{line} -\newcommand{\secname}[1]{\addtocounter{line}{1}% - \put(1,-\value{line}){\blob} - \put(-7.5,-\value{line}){\Large \arabic{line}} - \put(-7,-\value{line}){\Large #1}} -\newcommand{\overview}{\thepage - \begin{picture}(0,0) - \secname{Introduction} - \secname{The first year} - \secname{Specialisation} - \end{picture}} - -\newcounter{itemnum} -\renewenvironment{enumerate}{\begin{list}{{\bf \arabic{itemnum}. 
}} { -\usecounter{itemnum} -\setlength{\labelwidth}{0.4cm} -\setlength{\labelsep}{0.25cm} -\setlength{\leftmargin}{0.65cm} -\setlength{\rightmargin}{1.0cm} -\setlength{\itemsep}{1pt} -\setlength{\parsep}{3pt} -\setlength{\itemindent}{0pt} -\setlength{\listparindent}{0pt} -\setlength{\topsep}{0.5ex} }} - {\end{list}} - -\renewenvironment{itemize}{\begin{list}{\rule{0.15cm}{0.15cm}}{ -\setlength{\labelwidth}{0.25cm} -\setlength{\labelsep}{0.25cm} -\setlength{\leftmargin}{0.65cm} -\setlength{\rightmargin}{1.0cm} -\setlength{\itemsep}{1pt} -\setlength{\parsep}{3pt} -\setlength{\itemindent}{0pt} -\setlength{\listparindent}{0pt} -\setlength{\topsep}{0.5ex}}} -{\end{list}} - -\makeindex -\newcommand{\lst}[2] { - \noindent\rule[-0.3mm]{\textwidth}{0.3mm}\vspace{-0.3mm} - \lstinputlisting[caption={#2}, - label={#1}, - showstringspaces=false, - numbers=left, - stepnumber=1, - frame=bottomline, - extendedchars=true, - basicstyle=\small\tt, - numberstyle=\tiny, - keywordstyle=\color{red}, - language=C, - emph={1, 2, 3, 4, 5, 6, 7, 8, 9, 0, NULL, lustre, CFS}, - emphstyle=\color{blue}, - commentstyle=\color{cyan}, - stringstyle=\color{green}, - directivestyle=\color{magenta}, - breaklines=true]{#1} - \vspace{0.3mm} -} -\end_preamble -\language english -\inputencoding auto -\fontscheme default -\graphics default -\paperfontsize default -\spacing single -\papersize Default -\paperpackage a4 -\use_geometry 0 -\use_amsmath 0 -\use_natbib 0 -\use_numerical_citations 0 -\paperorientation portrait -\secnumdepth 3 -\tocdepth 3 -\paragraph_separation indent -\defskip medskip -\quotes_language english -\quotes_times 2 -\papercolumns 1 -\papersides 1 -\paperpagestyle fancy - -\layout Standard - -\the_end diff --git a/lustre/doc/lconf.8 b/lustre/doc/lconf.8 index a3e60f1..a6ca88a 100644 --- a/lustre/doc/lconf.8 +++ b/lustre/doc/lconf.8 @@ -4,48 +4,59 @@ lconf \- Lustre filesystem configuration utility .SH SYNOPSIS .br .B lconf -[--node ] [-d,--cleanup] [--noexec] [--gdb] [--nosetup] 
[--nomod] [-n,--noexec] [-v,--verbose] [-h,--help] -[options] --add [args] +[OPTIONS] .br .SH DESCRIPTION .B lconf -, when invoked configures a node following directives in the . There will be single configuration file for all the nodes in a single cluster. This file should be distributed to all the nodes in the cluster or kept in a location accessible to all the nodes. One option is to store the cluster configuration information in LDAP format on an LDAP server that can be reached from all the cluster nodes. +, when invoked configures a node following directives in the +.Can be used to control recovery and startup/shutdown +. There will be single configuration file for all the nodes in a +single cluster. This file should be distributed to all the nodes in +the cluster or kept in a location accessible to all the nodes. The XML file must be specified. When invoked with no options, lconf will attempt to configure the resources owned by the node it is invoked on .PP The arguments that can be used for lconf are: .PP .TP +--abort_recovery - Used to start Lustre when you are certian that +recovery will not succeed, as when an OST or MDS is disabled. +.TP +--acl Enable Access Control List support on the MDS +.TP +--allow_unprivileged_port Allows connections from unprivileged ports +.TP +--clientoptions +Additional options for mounting Lustre clients. Obsolete with +zeroconfig mounting.. +.TP --client_uuid The failed client (required for recovery). .TP ---clientoptions -Additional options for Lustre. +--clumanager Generate a Red Hat Clumanager configuration file for this +node. .TP --config -Cluster configuration name used for LDAP query +Cluster configuration name used for LDAP query (depreciated) .TP --conn_uuid The failed connection (required for recovery). .TP ---d|--cleanup +-d|--cleanup Unconfigure a node. The same config and --node argument used for configuration needs to be used for cleanup as well. 
This will attempt to undo all of the configuration steps done by lconf, including unloading the kernel modules. .TP --debug_path -Path to save debug dumps. +Path to save debug dumps.(default is /tmp/lustre-log) .TP --dump Dump the kernel debug log to the specified file before portals is unloaded during cleanup. .TP ---dump_path -Path to save debug dumps. Default is /tmp/lustre_log -.TP --failover -Used to shutdown without saving state. Default is 0. This will allow the node to give up service to another node for failover purposes. This will not be a clean shutdown. +Used to shutdown without saving state. This will allow the node to give up service to another node for failover purposes. This will not be a clean shutdown. .TP ---force -Forced unmounting and/or obd detach during cleanup. Default is 0. +-f|--force +Forced unmounting and/or obd detach during cleanup. .TP --gdb -Causes lconf to print a message and pause for 5 seconds after creating a gdb module script and before doing any Lustre configuration (the gdb module script is always created, however). +Causes lconf to create a gdb module script and pause 5 seconds before doing any Lustre configuration (the gdb module script is always created, however). .TP --gdb_script Full name of gdb debug script. Default is /tmp/ogdb. @@ -53,6 +64,9 @@ Full name of gdb debug script. Default is /tmp/ogdb. --group The group of devices to cleanup/configure. .TP +--group_upcall +Pathname to the MDS upcall to resolve secondary group membership. Defaults to NONE, meaning that the MDS will use whatever group the client supplies, but this is limited to a single supplementary group. +.TP -h,--help Print help. .TP @@ -63,19 +77,29 @@ The UUID of the service to be ignored by a client mounting Lustre. Allows the cl Dump all ioctls to the specified file .TP --ldapurl -LDAP server URL +LDAP server URL. 
Depreciated +.TP +--lustre=src_dir +Specify the base directory for Lustre sources, this parameter will cause lconf to load the lustre modules from this source tree. .TP --lustre_upcall Set the location of the Lustre upcall scripts used by the client for recovery .TP ---lustre=src_dir -Specify the base directory for Lustre sources, this parameter will cause lconf to load the lustre modules from this soure tree. +--make_service_scripts Create per-service symlinks for use with clumanager HA software .TP --mds_ost_conn Open connections to OSTs on MDS. .TP --maxlevel -Perform configuration of devices and services up to level given. level can take the values net, dev, svc, fs. When used in conjunction with cleanup, services are torn down up to a certain level. Default is 100. +Perform configuration of devices and services up to level given. When +used in conjunction with cleanup, services are torn down up to a +certain level. +Levels are aproximatly like: +10 - network +20 - device, ldlm +30 - osd, mdd +40 - mds, ost +70 - mountpoint, echo_client, osc, mdc, lov .TP --minlevel Specify the minimum level of services to configure/cleanup. Default is 0. @@ -98,24 +122,36 @@ Only setup devices and services, do not load modules. --nosetup Only load modules, do not configure devices or services. .TP +--old_conf Start up service even though config logs appear outdated. +.TP --portals -Specify portals source directory. If this is a relative path, then it is assumed to be relative to lustre. +Specify portals source directory. If this is a relative path, then it +is assumed to be relative to lustre. (Depreciated) .TP --portals_upcall -Specify the location of the Portals upcall scripts used by the client for recovery +Specify the location of the Portals upcall scripts used by the client +for recovery (Depreciated) .TP --ptldebug debug-level This options can be used to set the required debug level. 
.TP +--quota +Enable quota support for client filesystem +.TP +--rawprimary For clumanager, device of the primary quorum +(default=/dev/raw/raw1) +.TP +--rawsecondary For clumanager, device of the secondary quorum (default=/dev/raw/raw2) +.TP --record Write config information on mds. .TP ---record_log -Specify the name of config record log. -.TP --record_device Specify MDS device name that will record the config commands. .TP +--record_log +Specify the name of config record log. +.TP --recover Recover a device. .TP @@ -125,6 +161,14 @@ Reformat all the devices. This is essential on the first time the file system is --select Select a particular node for a service .TP +--service +Shorthand for --group --select = +.TP +--service_scripts For clumanager, directory containing per-service scripts (default=/etc/lustre/services) +.TP +--single_socket The socknal option. Uses only one socket instead of a +bundle. +.TP --subsystem Set the portals debug subsystem. .TP @@ -135,7 +179,10 @@ Specify the failed target (required for recovery). Set the recovery timeout period. .TP --upcall -Set the location of both Lustre and Portals upcall scripts used by the client for recovery +Set the location of both Lustre and Portals upcall scripts used by the +client for recovery +.TP +--user_xattr Enable user_xattr support on MDS .TP --verbose,-v Be verbose and show actions while going along. diff --git a/lustre/doc/lconf.lyx b/lustre/doc/lconf.lyx index e00d94d..2846f48 100644 --- a/lustre/doc/lconf.lyx +++ b/lustre/doc/lconf.lyx @@ -122,6 +122,12 @@ This program configures a node following directives in the . The group of devices to cleanup/configure. \layout Description +--group_upcall\SpecialChar ~ + Pathname to the MDS upcall to resolve secondary group membership. + Defaults to NONE, meaning that the MDS will use whatever group the client + supplies, but this is limited to a single supplementary group. +\layout Description + -h,--help Print help. 
\layout Description diff --git a/lustre/doc/lctl.8 b/lustre/doc/lctl.8 index 55652c0..e56331a 100644 --- a/lustre/doc/lctl.8 +++ b/lustre/doc/lctl.8 @@ -7,385 +7,172 @@ lctl \- Low level Lustre filesystem configuration utility .br .B lctl --device .br -.B lctl --threads -.br .SH DESCRIPTION .B lctl -can be invoked in interactive mode by issuing lctl command. After that, commands are issued as below. The most common commands in lctl are (in matching pairs) +is used to directly control Lustre via an ioctl interface, allowing +various configuration, maintenance, and debugging features to be accessed. + +.B lctl +can be invoked in interactive mode by issuing lctl command. After that, commands are issued as below. The most common commands in lctl are +.B dl +, .B device -and -.B attach -, -.B detach -and -.B setup , -.B cleanup -and -.B connect +.B network +.I +, +.B list_nids +, +.B ping +.I nid , -.B disconnect -and .B help -, and +, .B quit. -To get a complete listing of available commands, type help at the lctl prompt. To get basic help on the meaning and syntax of a command, type help command. Command completion is activated with the TAB key, and command history is available via the up- and down-arrow keys. +To get a complete listing of available commands, type +.B help +at the lctl prompt. To get basic help on the meaning and syntax of a +command, type +.B help +.I command +. Command completion is activated with the TAB key, and command history is available via the up- and down-arrow keys. -For non-interactive single-threaded use, one uses the second invocation, which runs command after connecting to the device. +For non-interactive use, one uses the second invocation, which runs command after connecting to the device. -.B Network Configuration +.SS Network Configuration .TP ---net -Indicate the network type to be used for the operation. -.TP -network -Indicate what kind of network applies for the configuration commands that follow. 
-.TP -interface_list -Print the interface entries. -.TP -add_interface [netmask] -Add an interface entry. +.BI network " |" +Start or stop LNET, or select a network type for other +.I lctl +LNET commands .TP -del_interface [ip] -Delete an interface entry. +.BI list_nids +Print all Network Identifiers on the local node. LNET must be running. .TP -peer_list -Print the peer entries. +.BI which_nid " " +From a list of nids for a remote node, show which interface communication +will take place on. .TP -add_peer -Add a peer entry. +.BI ping " " +Check LNET connectivity via an LNET ping. This will use the fabric +appropriate to the specified NID. .TP -del_peer [] [] [ks] -Remove a peer entry. +.BI interface_list +Print the network interface information for a given +.B network +type. .TP -autoconn_list -Print autoconnect entries. +.BI peer_list +Print the known peers for a given +.B network +type. .TP -add_autoconn nid ipaddr port [ise] -Add an autoconnect entry. -.TP -del_autoconn [] [] [ks] -Remove an autoconnect entry. -.TP -conn_list -Print all the connected remote nid. -.TP -connect [[ ] | ] -This will establish a connection to a remote network network id given by the hostname/port combination, or the elan id. -.TP -disconnect -Disconnect from a remote nid. +.BI conn_list +Print all the connected remote NIDs for a given +.B network +type. .TP -active_tx +.BI active_tx This command should print active transmits, and it is only used for elan network type. .TP -mynid [nid] -Informs the socknal of the local nid. It defaults to hostname for tcp networks and is automatically setup for elan/myrinet networks. -.TP -shownid -Print the local NID. -.TP -add_uuid -Associate a given UUID with an nid. -.TP -close_uuid -Disconnect a UUID. -.TP -del_uuid -Delete a UUID association. -.TP -add_route [target] -Add an entry to the routing table for the given target. -.TP -del_route -Delete an entry for the target from the routing table. -.TP -set_route [