From 090aefec6c5a508f640a0c09cbd16f39b25bf528 Mon Sep 17 00:00:00 2001 From: cvs2svn Date: Tue, 6 Nov 2007 09:18:55 +0000 Subject: [PATCH] This commit was manufactured by cvs2svn to create branch 'unlabeled-1.34.34.4.14'. --- build/README.kernel-source | 103 - build/Rules.in | 78 - build/autoconf/.cvsignore | 2 - build/autoconf/Makefile.am | 4 - build/autoconf/lustre-build-darwin.m4 | 106 - build/checkstack.pl | 87 - build/clearpatches.sh | 13 - build/confirmpatches.sh | 12 - build/cvs-modified-files.pl | 47 - build/land2.sh | 34 - build/linux-merge-config.awk | 317 -- build/linux-merge-modules.awk | 125 - build/merge1.sh | 104 - build/merge2.sh | 35 - build/osxpack/ReadMe.txt | 4 - build/osxpack/Welcome.txt | 4 - build/osxpack/packlustre.sh | 72 - build/osxpack/postflight | 13 - build/osxpack/preflight | 67 - build/osxpack/sysctl.conf | 15 - build/osxpack/uninstall_lustre | 111 - build/osxpack/unload_lustre | 18 - build/replace2.sh | 36 - build/sles8-post.sh | 49 - build/sles8-postun.sh | 22 - build/sles8-pre.sh | 2 - build/sles8-update_INITRD_MODULES.sh | 56 - build/sles8-update_rcfile_setting.sh | 35 - build/suse-functions.sh | 22 - build/suse-trigger-script.sh.in | 9 - build/update_oldconfig | 74 - .../patches/export-ext3-2.6-rhel4.patch | 35 - .../patches/export-ext3-2.6-suse.patch | 35 - .../patches/export_symbols-ext3-2.6-suse.patch | 17 - .../patches/ext3-check-jbd-errors-2.6.5.patch | 113 - .../patches/ext3-check-jbd-errors-2.6.9.patch | 113 - ...3-disable-write-bar-by-default-2.6-sles10.patch | 15 - .../patches/ext3-ea-in-inode-2.6-rhel4.patch | 840 ---- .../patches/ext3-ea-in-inode-2.6-suse.patch | 840 ---- .../patches/ext3-extents-2.6.12.patch | 2940 ------------ .../patches/ext3-extents-2.6.15.patch | 2947 ------------ .../patches/ext3-extents-2.6.16-sles10.patch | 2947 ------------ .../patches/ext3-extents-2.6.18-vanilla.patch | 2950 ------------ .../patches/ext3-extents-2.6.5.patch | 2951 ------------ .../patches/ext3-extents-2.6.9-rhel4.patch | 2926 
------------ .../patches/ext3-extents-bug11324.patch | 252 -- .../patches/ext3-external-journal-2.6.12.patch | 148 - .../patches/ext3-filterdata-2.6.15.patch | 25 - .../patches/ext3-htree-dot-2.6.patch | 23 - .../kernel_patches/patches/ext3-ialloc-2.6.patch | 128 - .../patches/ext3-include-fixes-2.6-rhel4.patch | 20 - .../patches/ext3-include-fixes-2.6-suse.patch | 20 - .../patches/ext3-lookup-dotdot-2.6.9.patch | 63 - .../patches/ext3-map_inode_page-2.6-suse.patch | 86 - .../patches/ext3-mballoc2-2.6-fc5.patch | 3103 ------------- .../patches/ext3-mballoc2-2.6-suse.patch | 3108 ------------- .../patches/ext3-mballoc2-2.6.12.patch | 3102 ------------- .../patches/ext3-mballoc2-2.6.18-vanilla.patch | 3140 ------------- .../patches/ext3-mballoc2-2.6.9-rhel4.patch | 3121 ------------- .../patches/ext3-nanosecond-2.6-rhel4.patch | 401 -- .../patches/ext3-nanosecond-2.6-sles10.patch | 404 -- .../patches/ext3-nanosecond-2.6-suse.patch | 195 - .../patches/ext3-nanosecond-2.6.18-vanilla.patch | 403 -- .../kernel_patches/patches/ext3-nlinks-2.6.7.patch | 156 - .../kernel_patches/patches/ext3-nlinks-2.6.9.patch | 158 - .../ext3-remove-cond_resched-calls-2.6.12.patch | 29 - .../patches/ext3-rename-reserve-2.6-suse.patch | 263 -- .../patches/ext3-san-jdike-2.6-suse.patch | 106 - .../patches/ext3-sector_t-overflow-2.6.12.patch | 64 - .../ext3-sector_t-overflow-2.6.5-suse.patch | 44 - .../ext3-sector_t-overflow-2.6.9-rhel4.patch | 64 - .../patches/ext3-wantedi-2.6-rhel4.patch | 193 - .../patches/ext3-wantedi-2.6-suse.patch | 192 - ldiskfs/kernel_patches/patches/iopen-2.6-fc5.patch | 448 -- .../kernel_patches/patches/iopen-2.6-rhel4.patch | 471 -- .../kernel_patches/patches/iopen-2.6-suse.patch | 472 -- ldiskfs/kernel_patches/patches/iopen-2.6.12.patch | 471 -- .../kernel_patches/series/ldiskfs-2.6-fc3.series | 13 - .../kernel_patches/series/ldiskfs-2.6-fc5.series | 12 - .../kernel_patches/series/ldiskfs-2.6-rhel4.series | 17 - .../series/ldiskfs-2.6-sles10.series | 15 - 
.../kernel_patches/series/ldiskfs-2.6-suse.series | 17 - .../series/ldiskfs-2.6.12-vanilla.series | 15 - .../series/ldiskfs-2.6.18-vanilla.series | 14 - ldiskfs/ldiskfs/Makefile.in | 21 - ldiskfs/ldiskfs/autoMakefile.am | 80 - lustre/.cvsignore | 30 - lustre/BUGS | 1 - lustre/BUILDING | 30 - lustre/ChangeLog | 4689 ------------------- lustre/FDL | 355 -- lustre/LICENSE | 372 -- lustre/Makefile.in | 13 - lustre/autoMakefile.am | 63 - lustre/autoconf/.cvsignore | 2 - lustre/autoconf/Makefile.am | 1 - lustre/autoconf/lustre-core.m4 | 1542 ------- lustre/autoconf/lustre-version.ac | 36 - lustre/conf/.cvsignore | 2 - lustre/conf/Makefile.am | 14 - lustre/conf/lustre.dtd | 145 - lustre/conf/lustre2ldif.xsl | 308 -- lustre/conf/modules.conf | 8 - lustre/conf/slapd-lustre.conf | 11 - lustre/conf/top.ldif | 4 - lustre/contrib/.cvsignore | 2 - lustre/contrib/Makefile.am | 5 - lustre/contrib/README | 2 - lustre/contrib/mpich-1.2.6-lustre.patch | 1829 -------- lustre/contrib/mpich2-1.0.3.patch | 1831 -------- lustre/doc/.cvsignore | 23 - lustre/doc/Makefile.am | 62 - lustre/doc/VERSIONING | 90 - lustre/doc/chbar.sh | 243 - lustre/doc/lconf.8 | 206 - lustre/doc/lconf.lyx | 387 -- lustre/doc/lctl.8 | 190 - lustre/doc/lctl.lyx | 928 ---- lustre/doc/lfs.1 | 136 - lustre/doc/lfs.lyx | 543 --- lustre/doc/llverdev.txt | 48 - lustre/doc/llverfs.txt | 48 - lustre/doc/lmc.1 | 282 -- lustre/doc/lmc.lyx | 691 --- lustre/doc/lustre.7 | 76 - lustre/doc/mkfs.lustre.8 | 132 - lustre/doc/mount.lustre.8 | 105 - lustre/doc/postbar | 151 - lustre/doc/tex2pdf | 3043 ------------- lustre/doc/tunefs.lustre.8 | 101 - lustre/include/.cvsignore | 14 - lustre/include/Makefile.am | 17 - lustre/include/class_hash.h | 120 - lustre/include/darwin/lprocfs_status.h | 57 - lustre/include/darwin/lustre_compat.h | 75 - lustre/include/darwin/lustre_debug.h | 36 - lustre/include/darwin/lustre_dlm.h | 25 - lustre/include/darwin/lustre_fsfilt.h | 32 - lustre/include/darwin/lustre_handles.h | 12 - 
lustre/include/darwin/lustre_lib.h | 76 - lustre/include/darwin/lustre_lite.h | 84 - lustre/include/darwin/lustre_log.h | 11 - lustre/include/darwin/lustre_mds.h | 32 - lustre/include/darwin/lustre_net.h | 34 - lustre/include/darwin/lustre_quota.h | 16 - lustre/include/darwin/lustre_types.h | 7 - lustre/include/darwin/lustre_user.h | 47 - lustre/include/darwin/lvfs.h | 24 - lustre/include/darwin/obd.h | 39 - lustre/include/darwin/obd_class.h | 34 - lustre/include/darwin/obd_support.h | 58 - lustre/include/ioctl.h | 64 - lustre/include/liblustre.h | 926 ---- lustre/include/linux/.cvsignore | 15 - lustre/include/linux/Makefile.am | 17 - lustre/include/linux/lprocfs_status.h | 53 - lustre/include/linux/lustre_compat25.h | 558 --- lustre/include/linux/lustre_debug.h | 41 - lustre/include/linux/lustre_dlm.h | 21 - lustre/include/linux/lustre_fsfilt.h | 471 -- lustre/include/linux/lustre_handles.h | 25 - lustre/include/linux/lustre_intent.h | 35 - lustre/include/linux/lustre_lib.h | 93 - lustre/include/linux/lustre_lite.h | 78 - lustre/include/linux/lustre_log.h | 44 - lustre/include/linux/lustre_mds.h | 57 - lustre/include/linux/lustre_net.h | 46 - lustre/include/linux/lustre_patchless_compat.h | 118 - lustre/include/linux/lustre_quota.h | 18 - lustre/include/linux/lustre_types.h | 47 - lustre/include/linux/lustre_user.h | 91 - lustre/include/linux/lvfs.h | 139 - lustre/include/linux/lvfs_linux.h | 69 - lustre/include/linux/obd.h | 42 - lustre/include/linux/obd_class.h | 60 - lustre/include/linux/obd_support.h | 101 - lustre/include/lprocfs_status.h | 675 --- lustre/include/lustre/.cvsignore | 2 - lustre/include/lustre/Makefile.am | 10 - lustre/include/lustre/liblustreapi.h | 89 - lustre/include/lustre/lustre_idl.h | 1490 ------ lustre/include/lustre/lustre_user.h | 258 -- lustre/include/lustre/types.h | 14 - lustre/include/lustre_cfg.h | 256 -- lustre/include/lustre_commit_confd.h | 59 - lustre/include/lustre_debug.h | 64 - lustre/include/lustre_disk.h | 288 -- 
lustre/include/lustre_dlm.h | 808 ---- lustre/include/lustre_export.h | 143 - lustre/include/lustre_fsfilt.h | 38 - lustre/include/lustre_ha.h | 25 - lustre/include/lustre_handles.h | 54 - lustre/include/lustre_import.h | 162 - lustre/include/lustre_lib.h | 764 ---- lustre/include/lustre_lite.h | 140 - lustre/include/lustre_log.h | 483 -- lustre/include/lustre_mds.h | 254 -- lustre/include/lustre_net.h | 935 ---- lustre/include/lustre_param.h | 65 - lustre/include/lustre_quota.h | 464 -- lustre/include/lustre_ucache.h | 68 - lustre/include/lustre_ver.h.in | 32 - lustre/include/lvfs.h | 61 - lustre/include/obd.h | 1062 ----- lustre/include/obd_cache.h | 11 - lustre/include/obd_class.h | 1429 ------ lustre/include/obd_echo.h | 33 - lustre/include/obd_lov.h | 28 - lustre/include/obd_ost.h | 37 - lustre/include/obd_support.h | 695 --- lustre/kernel-tests/.cvsignore | 5 - lustre/kernel-tests/Makefile | 24 - lustre/kernel_patches/LICENSE | 359 -- lustre/kernel_patches/README | 3 - .../kernel-2.6.15-2.6-fc5-i686-smp.config | 1598 ------- .../kernel-2.6.15-2.6-fc5-i686.config | 1591 ------- .../kernel_configs/kernel-2.6.15-fc5-i686.config | 1598 ------- .../kernel-2.6.16-2.6-patchless-i686-smp.config | 1620 ------- .../kernel-2.6.16-2.6-patchless-i686.config | 1616 ------- .../kernel-2.6.16-2.6-patchless-ia64-smp.config | 1422 ------ .../kernel-2.6.16-2.6-patchless-ia64.config | 1419 ------ .../kernel-2.6.16-2.6-patchless-x86_64-smp.config | 1463 ------ .../kernel-2.6.16-2.6-patchless-x86_64.config | 1462 ------ .../kernel-2.6.16-2.6-sles10-i686-bigsmp.config | 3496 -------------- .../kernel-2.6.16-2.6-sles10-i686.config | 3496 -------------- .../kernel-2.6.16-2.6-sles10-x86_64-smp.config | 3058 ------------- .../kernel-2.6.16-2.6-sles10-x86_64.config | 3046 ------------- .../kernel-2.6.18-2.6-rhel5-i686-smp.config | 3002 ------------ .../kernel-2.6.18-2.6-rhel5-i686.config | 2989 ------------ .../kernel-2.6.18-2.6-rhel5-ia64-smp.config | 2645 ----------- 
.../kernel-2.6.18-2.6-rhel5-ia64.config | 2641 ----------- .../kernel-2.6.18-2.6-rhel5-x86_64-smp.config | 2882 ------------ .../kernel-2.6.18-2.6-rhel5-x86_64.config | 2866 ------------ .../kernel-2.6.18-2.6-vanilla-i686-smp.config | 2936 ------------ .../kernel-2.6.18-2.6-vanilla-i686.config | 2956 ------------ .../kernel-2.6.18-2.6-vanilla-x86_64-smp.config | 2818 ------------ .../kernel-2.6.18-2.6-vanilla-x86_64.config | 2796 ------------ .../kernel-2.6.5-2.6-suse-i686-bigsmp.config | 2888 ------------ .../kernel-2.6.5-2.6-suse-i686-smp.config | 2888 ------------ .../kernel-2.6.5-2.6-suse-i686.config | 2888 ------------ .../kernel-2.6.5-2.6-suse-ia64-smp.config | 2411 ---------- .../kernel-2.6.5-2.6-suse-ia64.config | 2411 ---------- .../kernel-2.6.5-2.6-suse-ppc-pseries64.config | 1454 ------ .../kernel-2.6.5-2.6-suse-ppc.config | 1453 ------ .../kernel-2.6.5-2.6-suse-x86_64-smp.config | 2501 ---------- .../kernel-2.6.5-2.6-suse-x86_64.config | 2501 ---------- .../kernel-2.6.9-2.6-rhel4-i686-smp.config | 2479 ---------- .../kernel-2.6.9-2.6-rhel4-i686.config | 2483 ---------- .../kernel-2.6.9-2.6-rhel4-ia64-smp.config | 2038 --------- .../kernel-2.6.9-2.6-rhel4-ia64.config | 2038 --------- .../kernel-2.6.9-2.6-rhel4-x86_64-smp.config | 2231 --------- .../kernel-2.6.9-2.6-rhel4-x86_64.config | 2230 --------- .../kernel_configs/kgdb_2.6.0_test1_vmware.config | 914 ---- .../kernel_configs/uml-2.6.10-fc3.config | 662 --- .../kernel_configs/uml-vanilla-2.6.6.config | 491 -- .../kernel_configs/uml_2.6.0_test3.config | 325 -- lustre/kernel_patches/patches/2.6.5-quotafix.patch | 2151 --------- lustre/kernel_patches/patches/8kstack-2.6.12.patch | 13 - .../patches/atomic_add_return-sles9.patch | 104 - .../patches/bitops_ext2_find_next_le_bit-2.6.patch | 153 - .../patches/blkdev_tunables-2.6-sles10.patch | 13 - .../patches/blkdev_tunables-2.6-suse.patch | 28 - .../patches/dcache-qstr-api-fix-2.6-suse.patch | 32 - .../patches/dev_read_only-2.6-fc5.patch | 142 - 
.../patches/dev_read_only-2.6-lnxi.patch | 167 - .../patches/dev_read_only-2.6-suse.patch | 147 - .../patches/dev_read_only-2.6.18-vanilla.patch | 142 - lustre/kernel_patches/patches/export-2.6-fc5.patch | 12 - .../kernel_patches/patches/export-2.6-suse.patch | 24 - .../patches/export-2.6.18-vanilla.patch | 24 - .../patches/export-do_kern_mount.patch | 13 - .../patches/export-log-2.6-rhel4.patch | 12 - .../patches/export-show_task-2.6-fc5.patch | 25 - .../patches/export-show_task-2.6-vanilla.patch | 25 - .../patches/export-show_task-2.6.18-vanilla.patch | 25 - .../patches/export-truncate-2.6-suse.patch | 37 - .../patches/export-truncate-2.6.18-vanilla.patch | 39 - .../patches/export_symbol_numa-2.6-fc5.patch | 12 - .../patches/export_symbol_numa-2.6.18.patch | 24 - .../patches/export_symbol_numa.patch | 24 - .../patches/export_symbols-2.6-rhel4.patch | 81 - .../patches/export_symbols-2.6-suse.patch | 57 - .../patches/export_symbols-2.6.12.patch | 64 - .../patches/export_symbols-2.6.18-vanilla.patch | 64 - .../patches/ext3-patch-fuzz-fixup-fc3.patch | 15 - .../kernel_patches/patches/ext3-super-ntohl.patch | 16 - .../patches/fc3_to_rhel4_updates.patch | 12 - lustre/kernel_patches/patches/fsprivate-2.6.patch | 10 - .../patches/header-guards-2.6-suse.patch | 38 - .../patches/hostfs_readdir_large.patch | 32 - lustre/kernel_patches/patches/i_filter_data.patch | 12 - .../patches/iallocsem_consistency.patch | 48 - .../patches/inode-nr_unused-2.6.9-rhel4.patch | 47 - .../patches/iopen-misc-2.6-fc3.patch | 82 - .../patches/iopen-misc-2.6-suse.patch | 80 - .../kernel_patches/patches/iopen-misc-2.6.12.patch | 112 - .../patches/iopen-misc-2.6.18-vanilla.patch | 82 - .../patches/jbd-16tb-overflow-fixes.patch | 43 - .../kernel_patches/patches/jbd-2.6.10-jcberr.patch | 222 - .../patches/jbd-check-for-unmapped-buffer.patch | 91 - .../patches/jbd-jcberr-2.6.18-vanilla.patch | 228 - .../patches/jbd-journal-chksum-2.6-sles10.patch | 617 --- .../jbd-journal-chksum-2.6.18-vanilla.patch 
| 616 --- .../patches/jbd-stats-2.6-rhel5.patch | 744 --- .../patches/jbd-stats-2.6-sles10.patch | 735 --- .../kernel_patches/patches/jbd-stats-2.6.5.patch | 772 ---- .../kernel_patches/patches/jbd-stats-2.6.9.patch | 736 --- .../patches/link_notlast-susefix.patch | 16 - .../patches/linux-2.6-binutils-2.16.patch | 102 - .../linux-2.6.9-ext3-sub-second-timestamp.patch | 631 --- .../lookup_bdev_init_intent-2.6.18-vanilla.patch | 12 - .../patches/lookup_bdev_init_intent.patch | 12 - .../patches/lustre-version-revert_suse.patch | 4 - lustre/kernel_patches/patches/lustre_version.patch | 26 - .../kernel_patches/patches/md_path_lookup-2.6-suse | 25 - .../patches/md_path_lookup-2.6-suse.patch | 25 - .../patches/nfs-cifs-intent-2.6-fc3.patch | 127 - .../patches/nfs-cifs-intent-2.6-fc5.patch | 116 - .../patches/nfs-cifs-intent-2.6-suse.patch | 135 - .../kernel_patches/patches/qsnet-rhel4-2.6.patch | 1741 ------- lustre/kernel_patches/patches/qsnet-suse-2.6.patch | 1690 ------- .../patches/quota-deadlock-on-pagelock-core.patch | 1261 ------ .../patches/quota-deadlock-on-pagelock-ext3.patch | 273 -- .../patches/quota-umount-race-fix.patch | 139 - .../patches/raid5-configurable-cachesize.patch | 50 - lustre/kernel_patches/patches/raid5-large-io.patch | 20 - .../kernel_patches/patches/raid5-merge-ios.patch | 129 - .../patches/raid5-optimize-memcpy.patch | 227 - .../patches/raid5-serialize-ovelapping-reqs.patch | 140 - lustre/kernel_patches/patches/raid5-stats.patch | 200 - .../patches/raid5-stripe-by-stripe-handling.patch | 104 - lustre/kernel_patches/patches/raid5-zerocopy.patch | 374 -- .../patches/remove-suid-2.6-suse.patch | 22 - .../patches/sd_iostats-2.6-rhel4.patch | 498 -- .../patches/tcp-rto_proc-2.6.9.patch | 130 - lustre/kernel_patches/patches/uml-2.6.10-fc3.patch | 3781 ---------------- .../patches/uml-exprt-clearuser.patch | 24 - .../vfs-keep-inode-hashed-for-clear-inode.patch | 32 - .../patches/vfs_intent-2.6-fc3.patch | 756 ---- 
.../patches/vfs_intent-2.6-fc5-fix.patch | 20 - .../patches/vfs_intent-2.6-fc5.patch | 827 ---- .../patches/vfs_intent-2.6-rhel4.patch | 1448 ------ .../patches/vfs_intent-2.6-sles10.patch | 1500 ------ .../patches/vfs_intent-2.6-suse.patch | 833 ---- ..._intent-reduce-stack-usage-2.6-suse-newer.patch | 42 - .../patches/vfs_nointent-2.6-fc5.patch | 472 -- .../patches/vfs_nointent-2.6-rhel4.patch | 487 -- .../patches/vfs_nointent-2.6-sles10.patch | 453 -- .../patches/vfs_nointent-2.6-suse.patch | 472 -- .../kernel_patches/patches/vfs_races-2.6-fc3.patch | 64 - .../patches/vfs_races-2.6-rhel4.patch | 63 - .../patches/vfs_races-2.6-rhel5.patch | 100 - .../patches/vfs_races-2.6-suse.patch | 62 - .../kernel_patches/patches/vfs_races-2.6.12.patch | 61 - .../patches/vfs_races-2.6.18-vanilla.patch | 60 - .../kernel_patches/patches/vm-tunables-rhel4.patch | 19 - lustre/kernel_patches/series/2.6-fc3.series | 23 - lustre/kernel_patches/series/2.6-fc5.series | 18 - .../kernel_patches/series/2.6-rhel4-titech.series | 30 - lustre/kernel_patches/series/2.6-rhel4.series | 31 - lustre/kernel_patches/series/2.6-rhel5.series | 12 - lustre/kernel_patches/series/2.6-sles10.series | 12 - lustre/kernel_patches/series/2.6-suse-newer.series | 15 - lustre/kernel_patches/series/2.6-suse.series | 14 - lustre/kernel_patches/series/2.6.18-vanilla.series | 16 - lustre/kernel_patches/targets/.cvsignore | 1 - lustre/kernel_patches/targets/2.6-fc5.target.in | 18 - .../kernel_patches/targets/2.6-patchless.target.in | 25 - lustre/kernel_patches/targets/2.6-rhel4.target.in | 25 - lustre/kernel_patches/targets/2.6-rhel5.target.in | 24 - lustre/kernel_patches/targets/2.6-sles10.target.in | 31 - lustre/kernel_patches/targets/2.6-suse.target.in | 29 - .../kernel_patches/targets/2.6-vanilla.target.in | 29 - .../kernel_patches/targets/hp_pnnl-2.4.target.in | 17 - lustre/kernel_patches/targets/rh-2.4.target.in | 24 - lustre/kernel_patches/targets/rhel-2.4.target.in | 24 - 
lustre/kernel_patches/targets/sles-2.4.target.in | 26 - .../kernel_patches/targets/suse-2.4.21-2.target.in | 15 - lustre/kernel_patches/which_patch | 18 - lustre/ldlm/.cvsignore | 6 - lustre/ldlm/Makefile.am | 13 - lustre/ldlm/l_lock.c | 71 - lustre/ldlm/ldlm_extent.c | 536 --- lustre/ldlm/ldlm_flock.c | 576 --- lustre/ldlm/ldlm_inodebits.c | 170 - lustre/ldlm/ldlm_internal.h | 130 - lustre/ldlm/ldlm_lib.c | 1676 ------- lustre/ldlm/ldlm_lock.c | 1900 -------- lustre/ldlm/ldlm_lockd.c | 2025 --------- lustre/ldlm/ldlm_plain.c | 144 - lustre/ldlm/ldlm_pool.c | 1032 ----- lustre/ldlm/ldlm_request.c | 1761 -------- lustre/ldlm/ldlm_resource.c | 1007 ----- lustre/liblustre/.cvsignore | 9 - lustre/liblustre/Makefile.am | 69 - lustre/liblustre/dir.c | 275 -- lustre/liblustre/file.c | 521 --- lustre/liblustre/genlib.sh | 111 - lustre/liblustre/llite_lib.c | 390 -- lustre/liblustre/llite_lib.h | 273 -- lustre/liblustre/lutil.c | 239 - lustre/liblustre/lutil.h | 34 - lustre/liblustre/namei.c | 589 --- lustre/liblustre/rw.c | 891 ---- lustre/liblustre/super.c | 2090 --------- lustre/liblustre/tests/.cvsignore | 8 - lustre/liblustre/tests/Makefile.am | 63 - lustre/liblustre/tests/echo_test.c | 313 -- lustre/liblustre/tests/recovery_small.c | 390 -- lustre/liblustre/tests/replay_ost_single.c | 338 -- lustre/liblustre/tests/replay_single.c | 423 -- lustre/liblustre/tests/sanity.c | 1523 ------- lustre/liblustre/tests/test_common.c | 434 -- lustre/liblustre/tests/test_common.h | 40 - lustre/liblustre/tests/test_lock_cancel.c | 194 - lustre/llite/.cvsignore | 16 - lustre/llite/Makefile.in | 12 - lustre/llite/autoMakefile.am | 12 - lustre/llite/dcache.c | 730 --- lustre/llite/dir.c | 1076 ----- lustre/llite/file.c | 2912 ------------ lustre/llite/llite_close.c | 268 -- lustre/llite/llite_internal.h | 859 ---- lustre/llite/llite_lib.c | 2140 --------- lustre/llite/llite_mmap.c | 648 --- lustre/llite/llite_nfs.c | 262 -- lustre/llite/lloop.c | 759 ---- lustre/llite/lproc_llite.c | 
1345 ------ lustre/llite/namei.c | 1332 ------ lustre/llite/rw.c | 1815 -------- lustre/llite/rw24.c | 145 - lustre/llite/rw26.c | 327 -- lustre/llite/statahead.c | 871 ---- lustre/llite/super.c | 127 - lustre/llite/super25.c | 170 - lustre/llite/symlink.c | 219 - lustre/llite/xattr.c | 411 -- lustre/lov/.cvsignore | 11 - lustre/lov/Info.plist | 41 - lustre/lov/Makefile.in | 4 - lustre/lov/autoMakefile.am | 46 - lustre/lov/lov_ea.c | 611 --- lustre/lov/lov_internal.h | 266 -- lustre/lov/lov_log.c | 243 - lustre/lov/lov_merge.c | 179 - lustre/lov/lov_obd.c | 2713 ----------- lustre/lov/lov_offset.c | 279 -- lustre/lov/lov_pack.c | 450 -- lustre/lov/lov_qos.c | 955 ---- lustre/lov/lov_request.c | 1642 ------- lustre/lov/lproc_lov.c | 355 -- lustre/lvfs/.cvsignore | 19 - lustre/lvfs/Info.plist | 37 - lustre/lvfs/Makefile.in | 20 - lustre/lvfs/autoMakefile.am | 83 - lustre/lvfs/fsfilt.c | 109 - lustre/lvfs/fsfilt_ext3.c | 2198 --------- lustre/lvfs/fsfilt_reiserfs.c | 244 - lustre/lvfs/lustre_quota_fmt.c | 998 ---- lustre/lvfs/lustre_quota_fmt.h | 84 - lustre/lvfs/lvfs_common.c | 35 - lustre/lvfs/lvfs_darwin.c | 45 - lustre/lvfs/lvfs_internal.h | 5 - lustre/lvfs/lvfs_lib.c | 164 - lustre/lvfs/lvfs_linux.c | 510 --- lustre/lvfs/lvfs_userfs.c | 44 - lustre/lvfs/prng.c | 107 - lustre/lvfs/quotafmt_test.c | 504 --- lustre/lvfs/upcall_cache.c | 519 --- lustre/mdc/.cvsignore | 15 - lustre/mdc/Makefile.in | 4 - lustre/mdc/autoMakefile.am | 18 - lustre/mdc/lproc_mdc.c | 93 - lustre/mdc/mdc_internal.h | 79 - lustre/mdc/mdc_lib.c | 461 -- lustre/mdc/mdc_locks.c | 900 ---- lustre/mdc/mdc_reint.c | 361 -- lustre/mdc/mdc_request.c | 1432 ------ lustre/mds/.cvsignore | 15 - lustre/mds/Makefile.in | 5 - lustre/mds/autoMakefile.am | 11 - lustre/mds/commit_confd.c | 98 - lustre/mds/handler.c | 2843 ------------ lustre/mds/lproc_mds.c | 382 -- lustre/mds/mds_fs.c | 858 ---- lustre/mds/mds_internal.h | 286 -- lustre/mds/mds_join.c | 506 --- lustre/mds/mds_lib.c | 475 -- 
lustre/mds/mds_log.c | 229 - lustre/mds/mds_lov.c | 955 ---- lustre/mds/mds_open.c | 1542 ------- lustre/mds/mds_reint.c | 2389 ---------- lustre/mds/mds_xattr.c | 366 -- lustre/mgc/.cvsignore | 15 - lustre/mgc/Makefile.in | 4 - lustre/mgc/autoMakefile.am | 18 - lustre/mgc/libmgc.c | 148 - lustre/mgc/mgc_request.c | 1291 ------ lustre/mgs/.cvsignore | 15 - lustre/mgs/Makefile.in | 4 - lustre/mgs/autoMakefile.am | 11 - lustre/mgs/lproc_mgs.c | 153 - lustre/mgs/mgs_fs.c | 200 - lustre/mgs/mgs_handler.c | 745 --- lustre/mgs/mgs_internal.h | 74 - lustre/mgs/mgs_llog.c | 2052 --------- lustre/nodist | 9 - lustre/obdclass/.cvsignore | 17 - lustre/obdclass/Info.plist | 39 - lustre/obdclass/Makefile.in | 37 - lustre/obdclass/autoMakefile.am | 54 - lustre/obdclass/class_hash.c | 576 --- lustre/obdclass/class_obd.c | 635 --- lustre/obdclass/darwin/.cvsignore | 1 - lustre/obdclass/darwin/Makefile.am | 3 - lustre/obdclass/darwin/darwin-module.c | 181 - lustre/obdclass/darwin/darwin-sysctl.c | 154 - lustre/obdclass/debug.c | 187 - lustre/obdclass/genops.c | 1312 ------ lustre/obdclass/linux/.cvsignore | 5 - lustre/obdclass/linux/Makefile.am | 4 - lustre/obdclass/linux/linux-module.c | 429 -- lustre/obdclass/linux/linux-obdo.c | 288 -- lustre/obdclass/linux/linux-sysctl.c | 349 -- lustre/obdclass/llog.c | 424 -- lustre/obdclass/llog_cat.c | 539 --- lustre/obdclass/llog_internal.h | 10 - lustre/obdclass/llog_ioctl.c | 452 -- lustre/obdclass/llog_lvfs.c | 926 ---- lustre/obdclass/llog_obd.c | 434 -- lustre/obdclass/llog_swab.c | 253 -- lustre/obdclass/llog_test.c | 716 --- lustre/obdclass/lprocfs_status.c | 1654 ------- lustre/obdclass/lustre_handles.c | 251 - lustre/obdclass/lustre_peer.c | 181 - lustre/obdclass/obd_config.c | 1234 ----- lustre/obdclass/obd_mount.c | 2042 --------- lustre/obdclass/obdo.c | 91 - lustre/obdclass/statfs_pack.c | 70 - lustre/obdclass/uuid.c | 53 - lustre/obdecho/.cvsignore | 15 - lustre/obdecho/Info.plist | 45 - lustre/obdecho/Makefile.in | 4 - 
lustre/obdecho/autoMakefile.am | 40 - lustre/obdecho/echo.c | 620 --- lustre/obdecho/echo_client.c | 1510 ------- lustre/obdecho/lproc_echo.c | 42 - lustre/obdfilter/.cvsignore | 15 - lustre/obdfilter/Makefile.in | 12 - lustre/obdfilter/autoMakefile.am | 11 - lustre/obdfilter/filter.c | 3755 --------------- lustre/obdfilter/filter_internal.h | 192 - lustre/obdfilter/filter_io.c | 886 ---- lustre/obdfilter/filter_io_24.c | 544 --- lustre/obdfilter/filter_io_26.c | 816 ---- lustre/obdfilter/filter_log.c | 254 -- lustre/obdfilter/filter_lvb.c | 224 - lustre/obdfilter/lproc_obdfilter.c | 444 -- lustre/osc/.cvsignore | 15 - lustre/osc/Info.plist | 43 - lustre/osc/Makefile.in | 4 - lustre/osc/autoMakefile.am | 41 - lustre/osc/lproc_osc.c | 494 -- lustre/osc/osc_create.c | 441 -- lustre/osc/osc_internal.h | 89 - lustre/osc/osc_request.c | 3788 ---------------- lustre/ost/.cvsignore | 15 - lustre/ost/Makefile.in | 4 - lustre/ost/autoMakefile.am | 11 - lustre/ost/lproc_ost.c | 44 - lustre/ost/ost_handler.c | 1952 -------- lustre/ost/ost_internal.h | 42 - lustre/ptlrpc/.cvsignore | 16 - lustre/ptlrpc/Info.plist | 33 - lustre/ptlrpc/Makefile.in | 30 - lustre/ptlrpc/autoMakefile.am | 77 - lustre/ptlrpc/client.c | 2127 --------- lustre/ptlrpc/connection.c | 243 - lustre/ptlrpc/events.c | 726 --- lustre/ptlrpc/import.c | 1235 ----- lustre/ptlrpc/llog_client.c | 323 -- lustre/ptlrpc/llog_net.c | 177 - lustre/ptlrpc/llog_server.c | 716 --- lustre/ptlrpc/lproc_ptlrpc.c | 603 --- lustre/ptlrpc/niobuf.c | 635 --- lustre/ptlrpc/pack_generic.c | 2423 ---------- lustre/ptlrpc/pers.c | 130 - lustre/ptlrpc/pinger.c | 701 --- lustre/ptlrpc/ptlrpc_internal.h | 148 - lustre/ptlrpc/ptlrpc_module.c | 302 -- lustre/ptlrpc/ptlrpcd.c | 306 -- lustre/ptlrpc/recov_thread.c | 644 --- lustre/ptlrpc/recover.c | 319 -- lustre/ptlrpc/service.c | 1728 ------- lustre/ptlrpc/wirehdr.c | 10 - lustre/ptlrpc/wiretest.c | 2127 --------- lustre/quota/.cvsignore | 15 - lustre/quota/Makefile.in | 10 - 
lustre/quota/autoMakefile.am | 19 - lustre/quota/quota_check.c | 237 - lustre/quota/quota_context.c | 962 ---- lustre/quota/quota_ctl.c | 290 -- lustre/quota/quota_interface.c | 831 ---- lustre/quota/quota_internal.h | 100 - lustre/quota/quota_master.c | 1115 ----- lustre/quota/quotacheck_test.c | 218 - lustre/quota/quotactl_test.c | 358 -- lustre/scripts/.cvsignore | 18 - lustre/scripts/Makefile.am | 25 - lustre/scripts/bdev-io-survey.sh | 883 ---- lustre/scripts/dodiff.sh | 5 - lustre/scripts/lc_cluman.in | 524 --- lustre/scripts/lc_common | 591 --- lustre/scripts/lc_hb.in | 644 --- lustre/scripts/lc_lvm.in | 593 --- lustre/scripts/lc_md.in | 511 --- lustre/scripts/lc_modprobe.in | 66 - lustre/scripts/lc_mon | 139 - lustre/scripts/lc_net.in | 226 - lustre/scripts/lc_servip | 250 - lustre/scripts/license-status | 26 - lustre/scripts/llite-group.sh | 67 - lustre/scripts/lmc2csv.pl | 228 - lustre/scripts/lustre | 243 - lustre/scripts/lustre_config.in | 1220 ----- lustre/scripts/lustre_createcsv.in | 2101 --------- lustre/scripts/lustre_req_history | 163 - lustre/scripts/lustre_rmmod | 20 - lustre/scripts/lustre_up14 | 66 - lustre/scripts/lustrefs | 124 - lustre/scripts/maketags.sh | 8 - lustre/scripts/nodelustre | 46 - lustre/scripts/system-profile.sh | 233 - lustre/scripts/version_tag.pl.in | 197 - lustre/tests/.cvsignore | 78 - lustre/tests/2ost.sh | 54 - lustre/tests/Makefile.am | 70 - lustre/tests/README | 85 - lustre/tests/acceptance-metadata-double.sh | 128 - lustre/tests/acceptance-metadata-parallel.sh | 105 - lustre/tests/acceptance-metadata-single.sh | 153 - lustre/tests/acceptance-small.sh | 294 -- lustre/tests/acl/README | 4 - lustre/tests/acl/cp.test | 50 - lustre/tests/acl/getfacl-noacl.test | 55 - lustre/tests/acl/inheritance.test | 131 - lustre/tests/acl/make-tree | 45 - lustre/tests/acl/misc.test | 426 -- lustre/tests/acl/permissions.test | 281 -- lustre/tests/acl/run | 275 -- lustre/tests/acl/setfacl.test | 144 - lustre/tests/busy.sh | 7 - 
lustre/tests/cfg/insanity-local.sh | 67 - lustre/tests/cfg/insanity-ltest.sh | 74 - lustre/tests/cfg/local.sh | 79 - lustre/tests/cfg/lov.sh | 73 - lustre/tests/checkstat.c | 317 -- lustre/tests/chownmany.c | 79 - lustre/tests/cmknod.c | 128 - lustre/tests/cobd.sh | 34 - lustre/tests/compile.sh | 16 - lustre/tests/conf-sanity.sh | 1427 ------ lustre/tests/crash-mod.sh | 11 - lustre/tests/create.pl | 178 - lustre/tests/createdestroy.c | 225 - lustre/tests/createmany-mpi.c | 143 - lustre/tests/createmany.c | 121 - lustre/tests/createtest.c | 142 - lustre/tests/directio.c | 130 - lustre/tests/disk1_4.zip | Bin 216468 -> 0 bytes lustre/tests/echo.sh | 90 - lustre/tests/fchdir_test.c | 41 - lustre/tests/filter_survey.sh | 262 -- lustre/tests/flock.c | 196 - lustre/tests/flock_test.c | 86 - lustre/tests/flocks_test.c | 62 - lustre/tests/fsx.c | 1400 ------ lustre/tests/getdents.c | 31 - lustre/tests/insanity.sh | 550 --- lustre/tests/iopentest1.c | 101 - lustre/tests/iopentest2.c | 186 - lustre/tests/kbuild | 311 -- lustre/tests/ldaptest.c | 27 - lustre/tests/leak_finder.pl | 84 - lustre/tests/lfscktest.sh | 229 - lustre/tests/liblustre_sanity_uml.sh | 83 - lustre/tests/lkcdmap | 13 - lustre/tests/ll_dirstripe_verify.c | 245 - lustre/tests/ll_getstripe_info.c | 57 - lustre/tests/ll_sparseness_verify.c | 102 - lustre/tests/ll_sparseness_write.c | 61 - lustre/tests/llecho.sh | 23 - lustre/tests/llechocleanup.sh | 15 - lustre/tests/llmount.sh | 11 - lustre/tests/llmountcleanup.sh | 10 - lustre/tests/llog-test.sh | 106 - lustre/tests/lockorder.sh | 80 - lustre/tests/lov-sanity.sh | 89 - lustre/tests/lp_utils.c | 265 -- lustre/tests/lp_utils.h | 103 - lustre/tests/lstiming.sh | 51 - lustre/tests/mcr.sh | 45 - lustre/tests/mcreate.c | 23 - lustre/tests/memhog.c | 108 - lustre/tests/mkdirdeep.c | 258 -- lustre/tests/mkdirmany.c | 40 - lustre/tests/mlink.c | 25 - lustre/tests/mmap_sanity.c | 654 --- lustre/tests/mount2fs.sh | 48 - lustre/tests/mrename.c | 19 - 
lustre/tests/multifstat.c | 63 - lustre/tests/multiop.c | 416 -- lustre/tests/munlink.c | 35 - lustre/tests/o_directory.c | 53 - lustre/tests/oos.sh | 90 - lustre/tests/oos2.sh | 93 - lustre/tests/openclose.c | 145 - lustre/tests/opendevunlink.c | 117 - lustre/tests/opendirunlink.c | 125 - lustre/tests/openfile.c | 167 - lustre/tests/openfilleddirunlink.c | 81 - lustre/tests/openme.c | 23 - lustre/tests/openunlink.c | 155 - lustre/tests/ost_oos.sh | 41 - lustre/tests/parallel_grouplock.c | 899 ---- lustre/tests/random-reads.c | 208 - lustre/tests/recovery-cleanup.sh | 145 - lustre/tests/recovery-small.sh | 899 ---- lustre/tests/rename.pl | 210 - lustre/tests/rename_many.c | 262 -- lustre/tests/replay-dual.sh | 445 -- lustre/tests/replay-ost-single.sh | 196 - lustre/tests/replay-single.sh | 1534 ------- lustre/tests/rmdirmany.c | 40 - lustre/tests/routed.sh | 156 - lustre/tests/run-llog.sh | 45 - lustre/tests/run-quotacheck.sh | 30 - lustre/tests/run-quotactl.sh | 30 - lustre/tests/run-quotafmt.sh | 29 - lustre/tests/runas.c | 169 - lustre/tests/rundbench | 16 - lustre/tests/runiozone | 17 - lustre/tests/runobdstat | 7 - lustre/tests/runslabinfo | 6 - lustre/tests/runtests | 157 - lustre/tests/runvmstat | 24 - lustre/tests/sanity-buffalo.sh | 231 - lustre/tests/sanity-quota.sh | 1063 ----- lustre/tests/sanity.sh | 4771 -------------------- lustre/tests/sanityN.sh | 598 --- lustre/tests/set_dates.sh | 4 - lustre/tests/sleeptest.c | 115 - lustre/tests/small_write.c | 145 - lustre/tests/socketclient | 12 - lustre/tests/socketserver | 29 - lustre/tests/stat.c | 906 ---- lustre/tests/stat_fs.h | 37 - lustre/tests/statmany.c | 215 - lustre/tests/statone.c | 59 - lustre/tests/tchmod.c | 18 - lustre/tests/test-framework.sh | 1153 ----- lustre/tests/test2.c | 60 - lustre/tests/test_brw.c | 236 - lustre/tests/tmpfs-sanity.sh | 149 - lustre/tests/toexcl.c | 77 - lustre/tests/truncate.c | 24 - lustre/tests/unlinkmany.c | 84 - lustre/tests/utime.c | 161 - lustre/tests/wantedi.c 
| 49 - lustre/tests/write_append_truncate.c | 286 -- lustre/tests/write_disjoint.c | 195 - lustre/tests/writemany.c | 276 -- lustre/tests/writeme.c | 46 - lustre/utils/.cvsignore | 30 - lustre/utils/Makefile.am | 98 - lustre/utils/l_getgroups.c | 249 - lustre/utils/lctl.c | 330 -- lustre/utils/lfs.c | 1711 ------- lustre/utils/liblustreapi.c | 1520 ------- lustre/utils/llanalyze | 452 -- lustre/utils/llobdstat | 170 - lustre/utils/llog_reader.c | 407 -- lustre/utils/llstat | 204 - lustre/utils/llverdev.c | 553 --- lustre/utils/llverfs.c | 650 --- lustre/utils/loadgen.c | 1037 ----- lustre/utils/lr_reader.c | 209 - lustre/utils/lrun | 17 - lustre/utils/ltrack_stats.c | 493 -- lustre/utils/lustre_cfg.c | 509 --- lustre/utils/mkfs_lustre.c | 1490 ------ lustre/utils/module_cleanup.sh | 22 - lustre/utils/module_setup.sh | 66 - lustre/utils/mount_lustre.c | 612 --- lustre/utils/obd.c | 2264 ---------- lustre/utils/obdbarrier.c | 224 - lustre/utils/obdctl.c | 103 - lustre/utils/obdctl.h | 89 - lustre/utils/obdio.c | 297 -- lustre/utils/obdiolib.c | 386 -- lustre/utils/obdiolib.h | 53 - lustre/utils/parser.c | 772 ---- lustre/utils/parser.h | 74 - lustre/utils/platform.h | 248 - lustre/utils/plot-llstat | 182 - lustre/utils/wirecheck.c | 1303 ------ lustre/utils/wirehdr.c | 26 - lustre/utils/wiretest.c | 2172 --------- 808 files changed, 376130 deletions(-) delete mode 100644 build/README.kernel-source delete mode 100644 build/Rules.in delete mode 100644 build/autoconf/.cvsignore delete mode 100644 build/autoconf/Makefile.am delete mode 100644 build/autoconf/lustre-build-darwin.m4 delete mode 100644 build/checkstack.pl delete mode 100644 build/clearpatches.sh delete mode 100644 build/confirmpatches.sh delete mode 100755 build/cvs-modified-files.pl delete mode 100755 build/land2.sh delete mode 100644 build/linux-merge-config.awk delete mode 100644 build/linux-merge-modules.awk delete mode 100755 build/merge1.sh delete mode 100755 build/merge2.sh delete mode 100644 
build/osxpack/ReadMe.txt delete mode 100644 build/osxpack/Welcome.txt delete mode 100755 build/osxpack/packlustre.sh delete mode 100755 build/osxpack/postflight delete mode 100755 build/osxpack/preflight delete mode 100644 build/osxpack/sysctl.conf delete mode 100755 build/osxpack/uninstall_lustre delete mode 100755 build/osxpack/unload_lustre delete mode 100755 build/replace2.sh delete mode 100644 build/sles8-post.sh delete mode 100644 build/sles8-postun.sh delete mode 100644 build/sles8-pre.sh delete mode 100644 build/sles8-update_INITRD_MODULES.sh delete mode 100644 build/sles8-update_rcfile_setting.sh delete mode 100644 build/suse-functions.sh delete mode 100644 build/suse-trigger-script.sh.in delete mode 100755 build/update_oldconfig delete mode 100644 ldiskfs/kernel_patches/patches/export-ext3-2.6-rhel4.patch delete mode 100644 ldiskfs/kernel_patches/patches/export-ext3-2.6-suse.patch delete mode 100644 ldiskfs/kernel_patches/patches/export_symbols-ext3-2.6-suse.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-check-jbd-errors-2.6.5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-check-jbd-errors-2.6.9.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-disable-write-bar-by-default-2.6-sles10.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-extents-2.6.15.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-extents-2.6.16-sles10.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-extents-bug11324.patch delete mode 100644 
ldiskfs/kernel_patches/patches/ext3-external-journal-2.6.12.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-filterdata-2.6.15.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-htree-dot-2.6.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-ialloc-2.6.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-suse.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-lookup-dotdot-2.6.9.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-map_inode_page-2.6-suse.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-fc5.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-nanosecond-2.6-rhel4.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-nanosecond-2.6-sles10.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-nanosecond-2.6-suse.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-nanosecond-2.6.18-vanilla.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-rename-reserve-2.6-suse.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-san-jdike-2.6-suse.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch delete mode 100644 
ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-rhel4.patch delete mode 100644 ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-suse.patch delete mode 100644 ldiskfs/kernel_patches/patches/iopen-2.6-fc5.patch delete mode 100644 ldiskfs/kernel_patches/patches/iopen-2.6-rhel4.patch delete mode 100644 ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch delete mode 100644 ldiskfs/kernel_patches/patches/iopen-2.6.12.patch delete mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6-fc3.series delete mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6-fc5.series delete mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series delete mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6-sles10.series delete mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series delete mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series delete mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6.18-vanilla.series delete mode 100644 ldiskfs/ldiskfs/Makefile.in delete mode 100644 ldiskfs/ldiskfs/autoMakefile.am delete mode 100644 lustre/.cvsignore delete mode 100644 lustre/BUGS delete mode 100644 lustre/BUILDING delete mode 100644 lustre/ChangeLog delete mode 100644 lustre/FDL delete mode 100644 lustre/LICENSE delete mode 100644 lustre/Makefile.in delete mode 100644 lustre/autoMakefile.am delete mode 100644 lustre/autoconf/.cvsignore delete mode 100644 lustre/autoconf/Makefile.am delete mode 100644 lustre/autoconf/lustre-core.m4 delete mode 100644 lustre/autoconf/lustre-version.ac delete mode 100644 lustre/conf/.cvsignore delete mode 100644 lustre/conf/Makefile.am delete mode 100644 lustre/conf/lustre.dtd delete mode 100644 lustre/conf/lustre2ldif.xsl delete mode 100644 lustre/conf/modules.conf delete mode 100644 lustre/conf/slapd-lustre.conf delete mode 100644 lustre/conf/top.ldif delete mode 100644 lustre/contrib/.cvsignore delete mode 100644 
lustre/contrib/Makefile.am delete mode 100644 lustre/contrib/README delete mode 100644 lustre/contrib/mpich-1.2.6-lustre.patch delete mode 100644 lustre/contrib/mpich2-1.0.3.patch delete mode 100644 lustre/doc/.cvsignore delete mode 100644 lustre/doc/Makefile.am delete mode 100644 lustre/doc/VERSIONING delete mode 100755 lustre/doc/chbar.sh delete mode 100644 lustre/doc/lconf.8 delete mode 100644 lustre/doc/lconf.lyx delete mode 100644 lustre/doc/lctl.8 delete mode 100644 lustre/doc/lctl.lyx delete mode 100644 lustre/doc/lfs.1 delete mode 100644 lustre/doc/lfs.lyx delete mode 100644 lustre/doc/llverdev.txt delete mode 100644 lustre/doc/llverfs.txt delete mode 100644 lustre/doc/lmc.1 delete mode 100644 lustre/doc/lmc.lyx delete mode 100644 lustre/doc/lustre.7 delete mode 100644 lustre/doc/mkfs.lustre.8 delete mode 100644 lustre/doc/mount.lustre.8 delete mode 100755 lustre/doc/postbar delete mode 100755 lustre/doc/tex2pdf delete mode 100644 lustre/doc/tunefs.lustre.8 delete mode 100644 lustre/include/.cvsignore delete mode 100644 lustre/include/Makefile.am delete mode 100644 lustre/include/class_hash.h delete mode 100644 lustre/include/darwin/lprocfs_status.h delete mode 100644 lustre/include/darwin/lustre_compat.h delete mode 100644 lustre/include/darwin/lustre_debug.h delete mode 100644 lustre/include/darwin/lustre_dlm.h delete mode 100644 lustre/include/darwin/lustre_fsfilt.h delete mode 100644 lustre/include/darwin/lustre_handles.h delete mode 100644 lustre/include/darwin/lustre_lib.h delete mode 100644 lustre/include/darwin/lustre_lite.h delete mode 100644 lustre/include/darwin/lustre_log.h delete mode 100644 lustre/include/darwin/lustre_mds.h delete mode 100644 lustre/include/darwin/lustre_net.h delete mode 100644 lustre/include/darwin/lustre_quota.h delete mode 100644 lustre/include/darwin/lustre_types.h delete mode 100644 lustre/include/darwin/lustre_user.h delete mode 100644 lustre/include/darwin/lvfs.h delete mode 100644 lustre/include/darwin/obd.h delete 
mode 100644 lustre/include/darwin/obd_class.h delete mode 100644 lustre/include/darwin/obd_support.h delete mode 100644 lustre/include/ioctl.h delete mode 100644 lustre/include/liblustre.h delete mode 100644 lustre/include/linux/.cvsignore delete mode 100644 lustre/include/linux/Makefile.am delete mode 100644 lustre/include/linux/lprocfs_status.h delete mode 100644 lustre/include/linux/lustre_compat25.h delete mode 100644 lustre/include/linux/lustre_debug.h delete mode 100644 lustre/include/linux/lustre_dlm.h delete mode 100644 lustre/include/linux/lustre_fsfilt.h delete mode 100644 lustre/include/linux/lustre_handles.h delete mode 100644 lustre/include/linux/lustre_intent.h delete mode 100644 lustre/include/linux/lustre_lib.h delete mode 100644 lustre/include/linux/lustre_lite.h delete mode 100644 lustre/include/linux/lustre_log.h delete mode 100644 lustre/include/linux/lustre_mds.h delete mode 100644 lustre/include/linux/lustre_net.h delete mode 100644 lustre/include/linux/lustre_patchless_compat.h delete mode 100644 lustre/include/linux/lustre_quota.h delete mode 100644 lustre/include/linux/lustre_types.h delete mode 100644 lustre/include/linux/lustre_user.h delete mode 100644 lustre/include/linux/lvfs.h delete mode 100644 lustre/include/linux/lvfs_linux.h delete mode 100644 lustre/include/linux/obd.h delete mode 100644 lustre/include/linux/obd_class.h delete mode 100644 lustre/include/linux/obd_support.h delete mode 100644 lustre/include/lprocfs_status.h delete mode 100644 lustre/include/lustre/.cvsignore delete mode 100644 lustre/include/lustre/Makefile.am delete mode 100644 lustre/include/lustre/liblustreapi.h delete mode 100644 lustre/include/lustre/lustre_idl.h delete mode 100644 lustre/include/lustre/lustre_user.h delete mode 100644 lustre/include/lustre/types.h delete mode 100644 lustre/include/lustre_cfg.h delete mode 100644 lustre/include/lustre_commit_confd.h delete mode 100644 lustre/include/lustre_debug.h delete mode 100644 
lustre/include/lustre_disk.h delete mode 100644 lustre/include/lustre_dlm.h delete mode 100644 lustre/include/lustre_export.h delete mode 100644 lustre/include/lustre_fsfilt.h delete mode 100644 lustre/include/lustre_ha.h delete mode 100644 lustre/include/lustre_handles.h delete mode 100644 lustre/include/lustre_import.h delete mode 100644 lustre/include/lustre_lib.h delete mode 100644 lustre/include/lustre_lite.h delete mode 100644 lustre/include/lustre_log.h delete mode 100644 lustre/include/lustre_mds.h delete mode 100644 lustre/include/lustre_net.h delete mode 100644 lustre/include/lustre_param.h delete mode 100644 lustre/include/lustre_quota.h delete mode 100644 lustre/include/lustre_ucache.h delete mode 100644 lustre/include/lustre_ver.h.in delete mode 100644 lustre/include/lvfs.h delete mode 100644 lustre/include/obd.h delete mode 100644 lustre/include/obd_cache.h delete mode 100644 lustre/include/obd_class.h delete mode 100644 lustre/include/obd_echo.h delete mode 100644 lustre/include/obd_lov.h delete mode 100644 lustre/include/obd_ost.h delete mode 100644 lustre/include/obd_support.h delete mode 100644 lustre/kernel-tests/.cvsignore delete mode 100644 lustre/kernel-tests/Makefile delete mode 100644 lustre/kernel_patches/LICENSE delete mode 100644 lustre/kernel_patches/README delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.15-2.6-fc5-i686-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.15-2.6-fc5-i686.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.15-fc5-i686.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-patchless-i686-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-patchless-i686.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-patchless-ia64-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-patchless-ia64.config delete mode 100644 
lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-patchless-x86_64-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-patchless-x86_64.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686-bigsmp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-i686-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-i686.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ia64-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ia64.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-x86_64-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-x86_64.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-vanilla-i686-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-vanilla-i686.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-vanilla-x86_64-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-vanilla-x86_64.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-bigsmp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64.config delete 
mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-x86_64-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-x86_64.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config delete mode 100644 lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64.config delete mode 100644 lustre/kernel_patches/kernel_configs/kgdb_2.6.0_test1_vmware.config delete mode 100644 lustre/kernel_patches/kernel_configs/uml-2.6.10-fc3.config delete mode 100644 lustre/kernel_patches/kernel_configs/uml-vanilla-2.6.6.config delete mode 100644 lustre/kernel_patches/kernel_configs/uml_2.6.0_test3.config delete mode 100644 lustre/kernel_patches/patches/2.6.5-quotafix.patch delete mode 100644 lustre/kernel_patches/patches/8kstack-2.6.12.patch delete mode 100644 lustre/kernel_patches/patches/atomic_add_return-sles9.patch delete mode 100644 lustre/kernel_patches/patches/bitops_ext2_find_next_le_bit-2.6.patch delete mode 100644 lustre/kernel_patches/patches/blkdev_tunables-2.6-sles10.patch delete mode 100644 lustre/kernel_patches/patches/blkdev_tunables-2.6-suse.patch delete mode 100644 lustre/kernel_patches/patches/dcache-qstr-api-fix-2.6-suse.patch delete mode 100644 lustre/kernel_patches/patches/dev_read_only-2.6-fc5.patch delete mode 100644 lustre/kernel_patches/patches/dev_read_only-2.6-lnxi.patch delete mode 100644 
lustre/kernel_patches/patches/dev_read_only-2.6-suse.patch delete mode 100644 lustre/kernel_patches/patches/dev_read_only-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/export-2.6-fc5.patch delete mode 100644 lustre/kernel_patches/patches/export-2.6-suse.patch delete mode 100644 lustre/kernel_patches/patches/export-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/export-do_kern_mount.patch delete mode 100644 lustre/kernel_patches/patches/export-log-2.6-rhel4.patch delete mode 100644 lustre/kernel_patches/patches/export-show_task-2.6-fc5.patch delete mode 100644 lustre/kernel_patches/patches/export-show_task-2.6-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/export-show_task-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/export-truncate-2.6-suse.patch delete mode 100644 lustre/kernel_patches/patches/export-truncate-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/export_symbol_numa-2.6-fc5.patch delete mode 100644 lustre/kernel_patches/patches/export_symbol_numa-2.6.18.patch delete mode 100644 lustre/kernel_patches/patches/export_symbol_numa.patch delete mode 100644 lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch delete mode 100644 lustre/kernel_patches/patches/export_symbols-2.6-suse.patch delete mode 100644 lustre/kernel_patches/patches/export_symbols-2.6.12.patch delete mode 100644 lustre/kernel_patches/patches/export_symbols-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/ext3-patch-fuzz-fixup-fc3.patch delete mode 100644 lustre/kernel_patches/patches/ext3-super-ntohl.patch delete mode 100644 lustre/kernel_patches/patches/fc3_to_rhel4_updates.patch delete mode 100644 lustre/kernel_patches/patches/fsprivate-2.6.patch delete mode 100644 lustre/kernel_patches/patches/header-guards-2.6-suse.patch delete mode 100644 lustre/kernel_patches/patches/hostfs_readdir_large.patch delete mode 100644 
lustre/kernel_patches/patches/i_filter_data.patch delete mode 100644 lustre/kernel_patches/patches/iallocsem_consistency.patch delete mode 100644 lustre/kernel_patches/patches/inode-nr_unused-2.6.9-rhel4.patch delete mode 100644 lustre/kernel_patches/patches/iopen-misc-2.6-fc3.patch delete mode 100644 lustre/kernel_patches/patches/iopen-misc-2.6-suse.patch delete mode 100644 lustre/kernel_patches/patches/iopen-misc-2.6.12.patch delete mode 100644 lustre/kernel_patches/patches/iopen-misc-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/jbd-16tb-overflow-fixes.patch delete mode 100644 lustre/kernel_patches/patches/jbd-2.6.10-jcberr.patch delete mode 100644 lustre/kernel_patches/patches/jbd-check-for-unmapped-buffer.patch delete mode 100644 lustre/kernel_patches/patches/jbd-jcberr-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/jbd-journal-chksum-2.6-sles10.patch delete mode 100644 lustre/kernel_patches/patches/jbd-journal-chksum-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/jbd-stats-2.6-rhel5.patch delete mode 100644 lustre/kernel_patches/patches/jbd-stats-2.6-sles10.patch delete mode 100644 lustre/kernel_patches/patches/jbd-stats-2.6.5.patch delete mode 100644 lustre/kernel_patches/patches/jbd-stats-2.6.9.patch delete mode 100644 lustre/kernel_patches/patches/link_notlast-susefix.patch delete mode 100644 lustre/kernel_patches/patches/linux-2.6-binutils-2.16.patch delete mode 100644 lustre/kernel_patches/patches/linux-2.6.9-ext3-sub-second-timestamp.patch delete mode 100644 lustre/kernel_patches/patches/lookup_bdev_init_intent-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/lookup_bdev_init_intent.patch delete mode 100644 lustre/kernel_patches/patches/lustre-version-revert_suse.patch delete mode 100644 lustre/kernel_patches/patches/lustre_version.patch delete mode 100644 lustre/kernel_patches/patches/md_path_lookup-2.6-suse delete mode 100644 
lustre/kernel_patches/patches/md_path_lookup-2.6-suse.patch delete mode 100644 lustre/kernel_patches/patches/nfs-cifs-intent-2.6-fc3.patch delete mode 100644 lustre/kernel_patches/patches/nfs-cifs-intent-2.6-fc5.patch delete mode 100644 lustre/kernel_patches/patches/nfs-cifs-intent-2.6-suse.patch delete mode 100644 lustre/kernel_patches/patches/qsnet-rhel4-2.6.patch delete mode 100644 lustre/kernel_patches/patches/qsnet-suse-2.6.patch delete mode 100644 lustre/kernel_patches/patches/quota-deadlock-on-pagelock-core.patch delete mode 100644 lustre/kernel_patches/patches/quota-deadlock-on-pagelock-ext3.patch delete mode 100644 lustre/kernel_patches/patches/quota-umount-race-fix.patch delete mode 100644 lustre/kernel_patches/patches/raid5-configurable-cachesize.patch delete mode 100644 lustre/kernel_patches/patches/raid5-large-io.patch delete mode 100644 lustre/kernel_patches/patches/raid5-merge-ios.patch delete mode 100644 lustre/kernel_patches/patches/raid5-optimize-memcpy.patch delete mode 100644 lustre/kernel_patches/patches/raid5-serialize-ovelapping-reqs.patch delete mode 100644 lustre/kernel_patches/patches/raid5-stats.patch delete mode 100644 lustre/kernel_patches/patches/raid5-stripe-by-stripe-handling.patch delete mode 100644 lustre/kernel_patches/patches/raid5-zerocopy.patch delete mode 100644 lustre/kernel_patches/patches/remove-suid-2.6-suse.patch delete mode 100644 lustre/kernel_patches/patches/sd_iostats-2.6-rhel4.patch delete mode 100644 lustre/kernel_patches/patches/tcp-rto_proc-2.6.9.patch delete mode 100644 lustre/kernel_patches/patches/uml-2.6.10-fc3.patch delete mode 100644 lustre/kernel_patches/patches/uml-exprt-clearuser.patch delete mode 100644 lustre/kernel_patches/patches/vfs-keep-inode-hashed-for-clear-inode.patch delete mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6-fc3.patch delete mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6-fc5-fix.patch delete mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6-fc5.patch delete 
mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6-rhel4.patch delete mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6-sles10.patch delete mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch delete mode 100644 lustre/kernel_patches/patches/vfs_intent-reduce-stack-usage-2.6-suse-newer.patch delete mode 100644 lustre/kernel_patches/patches/vfs_nointent-2.6-fc5.patch delete mode 100644 lustre/kernel_patches/patches/vfs_nointent-2.6-rhel4.patch delete mode 100644 lustre/kernel_patches/patches/vfs_nointent-2.6-sles10.patch delete mode 100644 lustre/kernel_patches/patches/vfs_nointent-2.6-suse.patch delete mode 100644 lustre/kernel_patches/patches/vfs_races-2.6-fc3.patch delete mode 100644 lustre/kernel_patches/patches/vfs_races-2.6-rhel4.patch delete mode 100644 lustre/kernel_patches/patches/vfs_races-2.6-rhel5.patch delete mode 100644 lustre/kernel_patches/patches/vfs_races-2.6-suse.patch delete mode 100644 lustre/kernel_patches/patches/vfs_races-2.6.12.patch delete mode 100644 lustre/kernel_patches/patches/vfs_races-2.6.18-vanilla.patch delete mode 100644 lustre/kernel_patches/patches/vm-tunables-rhel4.patch delete mode 100644 lustre/kernel_patches/series/2.6-fc3.series delete mode 100644 lustre/kernel_patches/series/2.6-fc5.series delete mode 100644 lustre/kernel_patches/series/2.6-rhel4-titech.series delete mode 100644 lustre/kernel_patches/series/2.6-rhel4.series delete mode 100644 lustre/kernel_patches/series/2.6-rhel5.series delete mode 100644 lustre/kernel_patches/series/2.6-sles10.series delete mode 100644 lustre/kernel_patches/series/2.6-suse-newer.series delete mode 100644 lustre/kernel_patches/series/2.6-suse.series delete mode 100644 lustre/kernel_patches/series/2.6.18-vanilla.series delete mode 100644 lustre/kernel_patches/targets/.cvsignore delete mode 100644 lustre/kernel_patches/targets/2.6-fc5.target.in delete mode 100644 lustre/kernel_patches/targets/2.6-patchless.target.in delete mode 100644 
lustre/kernel_patches/targets/2.6-rhel4.target.in delete mode 100644 lustre/kernel_patches/targets/2.6-rhel5.target.in delete mode 100644 lustre/kernel_patches/targets/2.6-sles10.target.in delete mode 100644 lustre/kernel_patches/targets/2.6-suse.target.in delete mode 100644 lustre/kernel_patches/targets/2.6-vanilla.target.in delete mode 100644 lustre/kernel_patches/targets/hp_pnnl-2.4.target.in delete mode 100644 lustre/kernel_patches/targets/rh-2.4.target.in delete mode 100644 lustre/kernel_patches/targets/rhel-2.4.target.in delete mode 100644 lustre/kernel_patches/targets/sles-2.4.target.in delete mode 100644 lustre/kernel_patches/targets/suse-2.4.21-2.target.in delete mode 100644 lustre/kernel_patches/which_patch delete mode 100644 lustre/ldlm/.cvsignore delete mode 100644 lustre/ldlm/Makefile.am delete mode 100644 lustre/ldlm/l_lock.c delete mode 100644 lustre/ldlm/ldlm_extent.c delete mode 100644 lustre/ldlm/ldlm_flock.c delete mode 100644 lustre/ldlm/ldlm_inodebits.c delete mode 100644 lustre/ldlm/ldlm_internal.h delete mode 100644 lustre/ldlm/ldlm_lib.c delete mode 100644 lustre/ldlm/ldlm_lock.c delete mode 100644 lustre/ldlm/ldlm_lockd.c delete mode 100644 lustre/ldlm/ldlm_plain.c delete mode 100644 lustre/ldlm/ldlm_pool.c delete mode 100644 lustre/ldlm/ldlm_request.c delete mode 100644 lustre/ldlm/ldlm_resource.c delete mode 100644 lustre/liblustre/.cvsignore delete mode 100644 lustre/liblustre/Makefile.am delete mode 100644 lustre/liblustre/dir.c delete mode 100644 lustre/liblustre/file.c delete mode 100755 lustre/liblustre/genlib.sh delete mode 100644 lustre/liblustre/llite_lib.c delete mode 100644 lustre/liblustre/llite_lib.h delete mode 100644 lustre/liblustre/lutil.c delete mode 100644 lustre/liblustre/lutil.h delete mode 100644 lustre/liblustre/namei.c delete mode 100644 lustre/liblustre/rw.c delete mode 100644 lustre/liblustre/super.c delete mode 100644 lustre/liblustre/tests/.cvsignore delete mode 100644 lustre/liblustre/tests/Makefile.am delete 
mode 100644 lustre/liblustre/tests/echo_test.c delete mode 100644 lustre/liblustre/tests/recovery_small.c delete mode 100644 lustre/liblustre/tests/replay_ost_single.c delete mode 100644 lustre/liblustre/tests/replay_single.c delete mode 100644 lustre/liblustre/tests/sanity.c delete mode 100644 lustre/liblustre/tests/test_common.c delete mode 100644 lustre/liblustre/tests/test_common.h delete mode 100644 lustre/liblustre/tests/test_lock_cancel.c delete mode 100644 lustre/llite/.cvsignore delete mode 100644 lustre/llite/Makefile.in delete mode 100644 lustre/llite/autoMakefile.am delete mode 100644 lustre/llite/dcache.c delete mode 100644 lustre/llite/dir.c delete mode 100644 lustre/llite/file.c delete mode 100644 lustre/llite/llite_close.c delete mode 100644 lustre/llite/llite_internal.h delete mode 100644 lustre/llite/llite_lib.c delete mode 100644 lustre/llite/llite_mmap.c delete mode 100644 lustre/llite/llite_nfs.c delete mode 100644 lustre/llite/lloop.c delete mode 100644 lustre/llite/lproc_llite.c delete mode 100644 lustre/llite/namei.c delete mode 100644 lustre/llite/rw.c delete mode 100644 lustre/llite/rw24.c delete mode 100644 lustre/llite/rw26.c delete mode 100644 lustre/llite/statahead.c delete mode 100644 lustre/llite/super.c delete mode 100644 lustre/llite/super25.c delete mode 100644 lustre/llite/symlink.c delete mode 100644 lustre/llite/xattr.c delete mode 100644 lustre/lov/.cvsignore delete mode 100644 lustre/lov/Info.plist delete mode 100644 lustre/lov/Makefile.in delete mode 100644 lustre/lov/autoMakefile.am delete mode 100755 lustre/lov/lov_ea.c delete mode 100644 lustre/lov/lov_internal.h delete mode 100644 lustre/lov/lov_log.c delete mode 100644 lustre/lov/lov_merge.c delete mode 100644 lustre/lov/lov_obd.c delete mode 100644 lustre/lov/lov_offset.c delete mode 100644 lustre/lov/lov_pack.c delete mode 100644 lustre/lov/lov_qos.c delete mode 100644 lustre/lov/lov_request.c delete mode 100644 lustre/lov/lproc_lov.c delete mode 100644 
lustre/lvfs/.cvsignore delete mode 100644 lustre/lvfs/Info.plist delete mode 100644 lustre/lvfs/Makefile.in delete mode 100644 lustre/lvfs/autoMakefile.am delete mode 100644 lustre/lvfs/fsfilt.c delete mode 100644 lustre/lvfs/fsfilt_ext3.c delete mode 100644 lustre/lvfs/fsfilt_reiserfs.c delete mode 100644 lustre/lvfs/lustre_quota_fmt.c delete mode 100644 lustre/lvfs/lustre_quota_fmt.h delete mode 100644 lustre/lvfs/lvfs_common.c delete mode 100644 lustre/lvfs/lvfs_darwin.c delete mode 100644 lustre/lvfs/lvfs_internal.h delete mode 100644 lustre/lvfs/lvfs_lib.c delete mode 100644 lustre/lvfs/lvfs_linux.c delete mode 100644 lustre/lvfs/lvfs_userfs.c delete mode 100644 lustre/lvfs/prng.c delete mode 100644 lustre/lvfs/quotafmt_test.c delete mode 100644 lustre/lvfs/upcall_cache.c delete mode 100644 lustre/mdc/.cvsignore delete mode 100644 lustre/mdc/Makefile.in delete mode 100644 lustre/mdc/autoMakefile.am delete mode 100644 lustre/mdc/lproc_mdc.c delete mode 100644 lustre/mdc/mdc_internal.h delete mode 100644 lustre/mdc/mdc_lib.c delete mode 100644 lustre/mdc/mdc_locks.c delete mode 100644 lustre/mdc/mdc_reint.c delete mode 100644 lustre/mdc/mdc_request.c delete mode 100644 lustre/mds/.cvsignore delete mode 100644 lustre/mds/Makefile.in delete mode 100644 lustre/mds/autoMakefile.am delete mode 100644 lustre/mds/commit_confd.c delete mode 100644 lustre/mds/handler.c delete mode 100644 lustre/mds/lproc_mds.c delete mode 100644 lustre/mds/mds_fs.c delete mode 100644 lustre/mds/mds_internal.h delete mode 100644 lustre/mds/mds_join.c delete mode 100644 lustre/mds/mds_lib.c delete mode 100644 lustre/mds/mds_log.c delete mode 100644 lustre/mds/mds_lov.c delete mode 100644 lustre/mds/mds_open.c delete mode 100644 lustre/mds/mds_reint.c delete mode 100644 lustre/mds/mds_xattr.c delete mode 100644 lustre/mgc/.cvsignore delete mode 100644 lustre/mgc/Makefile.in delete mode 100644 lustre/mgc/autoMakefile.am delete mode 100644 lustre/mgc/libmgc.c delete mode 100644 
lustre/mgc/mgc_request.c delete mode 100644 lustre/mgs/.cvsignore delete mode 100644 lustre/mgs/Makefile.in delete mode 100644 lustre/mgs/autoMakefile.am delete mode 100644 lustre/mgs/lproc_mgs.c delete mode 100644 lustre/mgs/mgs_fs.c delete mode 100644 lustre/mgs/mgs_handler.c delete mode 100644 lustre/mgs/mgs_internal.h delete mode 100644 lustre/mgs/mgs_llog.c delete mode 100644 lustre/nodist delete mode 100644 lustre/obdclass/.cvsignore delete mode 100644 lustre/obdclass/Info.plist delete mode 100644 lustre/obdclass/Makefile.in delete mode 100644 lustre/obdclass/autoMakefile.am delete mode 100644 lustre/obdclass/class_hash.c delete mode 100644 lustre/obdclass/class_obd.c delete mode 100644 lustre/obdclass/darwin/.cvsignore delete mode 100644 lustre/obdclass/darwin/Makefile.am delete mode 100644 lustre/obdclass/darwin/darwin-module.c delete mode 100644 lustre/obdclass/darwin/darwin-sysctl.c delete mode 100644 lustre/obdclass/debug.c delete mode 100644 lustre/obdclass/genops.c delete mode 100644 lustre/obdclass/linux/.cvsignore delete mode 100644 lustre/obdclass/linux/Makefile.am delete mode 100644 lustre/obdclass/linux/linux-module.c delete mode 100644 lustre/obdclass/linux/linux-obdo.c delete mode 100644 lustre/obdclass/linux/linux-sysctl.c delete mode 100644 lustre/obdclass/llog.c delete mode 100644 lustre/obdclass/llog_cat.c delete mode 100644 lustre/obdclass/llog_internal.h delete mode 100644 lustre/obdclass/llog_ioctl.c delete mode 100644 lustre/obdclass/llog_lvfs.c delete mode 100644 lustre/obdclass/llog_obd.c delete mode 100644 lustre/obdclass/llog_swab.c delete mode 100644 lustre/obdclass/llog_test.c delete mode 100644 lustre/obdclass/lprocfs_status.c delete mode 100644 lustre/obdclass/lustre_handles.c delete mode 100644 lustre/obdclass/lustre_peer.c delete mode 100644 lustre/obdclass/obd_config.c delete mode 100644 lustre/obdclass/obd_mount.c delete mode 100644 lustre/obdclass/obdo.c delete mode 100644 lustre/obdclass/statfs_pack.c delete mode 100644 
lustre/obdclass/uuid.c delete mode 100644 lustre/obdecho/.cvsignore delete mode 100644 lustre/obdecho/Info.plist delete mode 100644 lustre/obdecho/Makefile.in delete mode 100644 lustre/obdecho/autoMakefile.am delete mode 100644 lustre/obdecho/echo.c delete mode 100644 lustre/obdecho/echo_client.c delete mode 100644 lustre/obdecho/lproc_echo.c delete mode 100644 lustre/obdfilter/.cvsignore delete mode 100644 lustre/obdfilter/Makefile.in delete mode 100644 lustre/obdfilter/autoMakefile.am delete mode 100644 lustre/obdfilter/filter.c delete mode 100644 lustre/obdfilter/filter_internal.h delete mode 100644 lustre/obdfilter/filter_io.c delete mode 100644 lustre/obdfilter/filter_io_24.c delete mode 100644 lustre/obdfilter/filter_io_26.c delete mode 100644 lustre/obdfilter/filter_log.c delete mode 100644 lustre/obdfilter/filter_lvb.c delete mode 100644 lustre/obdfilter/lproc_obdfilter.c delete mode 100644 lustre/osc/.cvsignore delete mode 100644 lustre/osc/Info.plist delete mode 100644 lustre/osc/Makefile.in delete mode 100644 lustre/osc/autoMakefile.am delete mode 100644 lustre/osc/lproc_osc.c delete mode 100644 lustre/osc/osc_create.c delete mode 100644 lustre/osc/osc_internal.h delete mode 100644 lustre/osc/osc_request.c delete mode 100644 lustre/ost/.cvsignore delete mode 100644 lustre/ost/Makefile.in delete mode 100644 lustre/ost/autoMakefile.am delete mode 100644 lustre/ost/lproc_ost.c delete mode 100644 lustre/ost/ost_handler.c delete mode 100644 lustre/ost/ost_internal.h delete mode 100644 lustre/ptlrpc/.cvsignore delete mode 100644 lustre/ptlrpc/Info.plist delete mode 100644 lustre/ptlrpc/Makefile.in delete mode 100644 lustre/ptlrpc/autoMakefile.am delete mode 100644 lustre/ptlrpc/client.c delete mode 100644 lustre/ptlrpc/connection.c delete mode 100644 lustre/ptlrpc/events.c delete mode 100644 lustre/ptlrpc/import.c delete mode 100644 lustre/ptlrpc/llog_client.c delete mode 100644 lustre/ptlrpc/llog_net.c delete mode 100644 lustre/ptlrpc/llog_server.c delete 
mode 100644 lustre/ptlrpc/lproc_ptlrpc.c delete mode 100644 lustre/ptlrpc/niobuf.c delete mode 100644 lustre/ptlrpc/pack_generic.c delete mode 100644 lustre/ptlrpc/pers.c delete mode 100644 lustre/ptlrpc/pinger.c delete mode 100644 lustre/ptlrpc/ptlrpc_internal.h delete mode 100644 lustre/ptlrpc/ptlrpc_module.c delete mode 100644 lustre/ptlrpc/ptlrpcd.c delete mode 100644 lustre/ptlrpc/recov_thread.c delete mode 100644 lustre/ptlrpc/recover.c delete mode 100644 lustre/ptlrpc/service.c delete mode 100644 lustre/ptlrpc/wirehdr.c delete mode 100644 lustre/ptlrpc/wiretest.c delete mode 100644 lustre/quota/.cvsignore delete mode 100644 lustre/quota/Makefile.in delete mode 100644 lustre/quota/autoMakefile.am delete mode 100644 lustre/quota/quota_check.c delete mode 100644 lustre/quota/quota_context.c delete mode 100644 lustre/quota/quota_ctl.c delete mode 100644 lustre/quota/quota_interface.c delete mode 100644 lustre/quota/quota_internal.h delete mode 100644 lustre/quota/quota_master.c delete mode 100644 lustre/quota/quotacheck_test.c delete mode 100644 lustre/quota/quotactl_test.c delete mode 100644 lustre/scripts/.cvsignore delete mode 100644 lustre/scripts/Makefile.am delete mode 100755 lustre/scripts/bdev-io-survey.sh delete mode 100755 lustre/scripts/dodiff.sh delete mode 100644 lustre/scripts/lc_cluman.in delete mode 100644 lustre/scripts/lc_common delete mode 100644 lustre/scripts/lc_hb.in delete mode 100644 lustre/scripts/lc_lvm.in delete mode 100644 lustre/scripts/lc_md.in delete mode 100644 lustre/scripts/lc_modprobe.in delete mode 100644 lustre/scripts/lc_mon delete mode 100644 lustre/scripts/lc_net.in delete mode 100644 lustre/scripts/lc_servip delete mode 100755 lustre/scripts/license-status delete mode 100644 lustre/scripts/llite-group.sh delete mode 100644 lustre/scripts/lmc2csv.pl delete mode 100755 lustre/scripts/lustre delete mode 100644 lustre/scripts/lustre_config.in delete mode 100644 lustre/scripts/lustre_createcsv.in delete mode 100644 
lustre/scripts/lustre_req_history delete mode 100755 lustre/scripts/lustre_rmmod delete mode 100644 lustre/scripts/lustre_up14 delete mode 100644 lustre/scripts/lustrefs delete mode 100755 lustre/scripts/maketags.sh delete mode 100755 lustre/scripts/nodelustre delete mode 100755 lustre/scripts/system-profile.sh delete mode 100644 lustre/scripts/version_tag.pl.in delete mode 100644 lustre/tests/.cvsignore delete mode 100644 lustre/tests/2ost.sh delete mode 100644 lustre/tests/Makefile.am delete mode 100644 lustre/tests/README delete mode 100644 lustre/tests/acceptance-metadata-double.sh delete mode 100644 lustre/tests/acceptance-metadata-parallel.sh delete mode 100644 lustre/tests/acceptance-metadata-single.sh delete mode 100755 lustre/tests/acceptance-small.sh delete mode 100644 lustre/tests/acl/README delete mode 100644 lustre/tests/acl/cp.test delete mode 100644 lustre/tests/acl/getfacl-noacl.test delete mode 100644 lustre/tests/acl/inheritance.test delete mode 100755 lustre/tests/acl/make-tree delete mode 100644 lustre/tests/acl/misc.test delete mode 100644 lustre/tests/acl/permissions.test delete mode 100755 lustre/tests/acl/run delete mode 100644 lustre/tests/acl/setfacl.test delete mode 100644 lustre/tests/busy.sh delete mode 100644 lustre/tests/cfg/insanity-local.sh delete mode 100644 lustre/tests/cfg/insanity-ltest.sh delete mode 100644 lustre/tests/cfg/local.sh delete mode 100644 lustre/tests/cfg/lov.sh delete mode 100644 lustre/tests/checkstat.c delete mode 100644 lustre/tests/chownmany.c delete mode 100644 lustre/tests/cmknod.c delete mode 100755 lustre/tests/cobd.sh delete mode 100644 lustre/tests/compile.sh delete mode 100644 lustre/tests/conf-sanity.sh delete mode 100644 lustre/tests/crash-mod.sh delete mode 100644 lustre/tests/create.pl delete mode 100644 lustre/tests/createdestroy.c delete mode 100644 lustre/tests/createmany-mpi.c delete mode 100644 lustre/tests/createmany.c delete mode 100644 lustre/tests/createtest.c delete mode 100644 
lustre/tests/directio.c delete mode 100644 lustre/tests/disk1_4.zip delete mode 100755 lustre/tests/echo.sh delete mode 100644 lustre/tests/fchdir_test.c delete mode 100644 lustre/tests/filter_survey.sh delete mode 100644 lustre/tests/flock.c delete mode 100644 lustre/tests/flock_test.c delete mode 100644 lustre/tests/flocks_test.c delete mode 100644 lustre/tests/fsx.c delete mode 100644 lustre/tests/getdents.c delete mode 100755 lustre/tests/insanity.sh delete mode 100644 lustre/tests/iopentest1.c delete mode 100644 lustre/tests/iopentest2.c delete mode 100755 lustre/tests/kbuild delete mode 100644 lustre/tests/ldaptest.c delete mode 100644 lustre/tests/leak_finder.pl delete mode 100755 lustre/tests/lfscktest.sh delete mode 100644 lustre/tests/liblustre_sanity_uml.sh delete mode 100755 lustre/tests/lkcdmap delete mode 100644 lustre/tests/ll_dirstripe_verify.c delete mode 100644 lustre/tests/ll_getstripe_info.c delete mode 100644 lustre/tests/ll_sparseness_verify.c delete mode 100644 lustre/tests/ll_sparseness_write.c delete mode 100644 lustre/tests/llecho.sh delete mode 100755 lustre/tests/llechocleanup.sh delete mode 100755 lustre/tests/llmount.sh delete mode 100755 lustre/tests/llmountcleanup.sh delete mode 100644 lustre/tests/llog-test.sh delete mode 100644 lustre/tests/lockorder.sh delete mode 100644 lustre/tests/lov-sanity.sh delete mode 100644 lustre/tests/lp_utils.c delete mode 100644 lustre/tests/lp_utils.h delete mode 100644 lustre/tests/lstiming.sh delete mode 100755 lustre/tests/mcr.sh delete mode 100644 lustre/tests/mcreate.c delete mode 100644 lustre/tests/memhog.c delete mode 100644 lustre/tests/mkdirdeep.c delete mode 100755 lustre/tests/mkdirmany.c delete mode 100755 lustre/tests/mlink.c delete mode 100644 lustre/tests/mmap_sanity.c delete mode 100644 lustre/tests/mount2fs.sh delete mode 100644 lustre/tests/mrename.c delete mode 100644 lustre/tests/multifstat.c delete mode 100755 lustre/tests/multiop.c delete mode 100755 lustre/tests/munlink.c 
delete mode 100644 lustre/tests/o_directory.c delete mode 100755 lustre/tests/oos.sh delete mode 100644 lustre/tests/oos2.sh delete mode 100644 lustre/tests/openclose.c delete mode 100644 lustre/tests/opendevunlink.c delete mode 100644 lustre/tests/opendirunlink.c delete mode 100644 lustre/tests/openfile.c delete mode 100644 lustre/tests/openfilleddirunlink.c delete mode 100644 lustre/tests/openme.c delete mode 100644 lustre/tests/openunlink.c delete mode 100644 lustre/tests/ost_oos.sh delete mode 100644 lustre/tests/parallel_grouplock.c delete mode 100644 lustre/tests/random-reads.c delete mode 100755 lustre/tests/recovery-cleanup.sh delete mode 100755 lustre/tests/recovery-small.sh delete mode 100644 lustre/tests/rename.pl delete mode 100644 lustre/tests/rename_many.c delete mode 100755 lustre/tests/replay-dual.sh delete mode 100755 lustre/tests/replay-ost-single.sh delete mode 100755 lustre/tests/replay-single.sh delete mode 100755 lustre/tests/rmdirmany.c delete mode 100644 lustre/tests/routed.sh delete mode 100644 lustre/tests/run-llog.sh delete mode 100644 lustre/tests/run-quotacheck.sh delete mode 100644 lustre/tests/run-quotactl.sh delete mode 100644 lustre/tests/run-quotafmt.sh delete mode 100644 lustre/tests/runas.c delete mode 100755 lustre/tests/rundbench delete mode 100755 lustre/tests/runiozone delete mode 100644 lustre/tests/runobdstat delete mode 100755 lustre/tests/runslabinfo delete mode 100755 lustre/tests/runtests delete mode 100755 lustre/tests/runvmstat delete mode 100755 lustre/tests/sanity-buffalo.sh delete mode 100644 lustre/tests/sanity-quota.sh delete mode 100644 lustre/tests/sanity.sh delete mode 100644 lustre/tests/sanityN.sh delete mode 100644 lustre/tests/set_dates.sh delete mode 100644 lustre/tests/sleeptest.c delete mode 100644 lustre/tests/small_write.c delete mode 100755 lustre/tests/socketclient delete mode 100755 lustre/tests/socketserver delete mode 100644 lustre/tests/stat.c delete mode 100644 lustre/tests/stat_fs.h delete 
mode 100644 lustre/tests/statmany.c delete mode 100644 lustre/tests/statone.c delete mode 100644 lustre/tests/tchmod.c delete mode 100644 lustre/tests/test-framework.sh delete mode 100755 lustre/tests/test2.c delete mode 100644 lustre/tests/test_brw.c delete mode 100755 lustre/tests/tmpfs-sanity.sh delete mode 100644 lustre/tests/toexcl.c delete mode 100644 lustre/tests/truncate.c delete mode 100644 lustre/tests/unlinkmany.c delete mode 100644 lustre/tests/utime.c delete mode 100644 lustre/tests/wantedi.c delete mode 100644 lustre/tests/write_append_truncate.c delete mode 100644 lustre/tests/write_disjoint.c delete mode 100644 lustre/tests/writemany.c delete mode 100644 lustre/tests/writeme.c delete mode 100644 lustre/utils/.cvsignore delete mode 100644 lustre/utils/Makefile.am delete mode 100644 lustre/utils/l_getgroups.c delete mode 100644 lustre/utils/lctl.c delete mode 100644 lustre/utils/lfs.c delete mode 100644 lustre/utils/liblustreapi.c delete mode 100644 lustre/utils/llanalyze delete mode 100644 lustre/utils/llobdstat delete mode 100644 lustre/utils/llog_reader.c delete mode 100644 lustre/utils/llstat delete mode 100644 lustre/utils/llverdev.c delete mode 100644 lustre/utils/llverfs.c delete mode 100644 lustre/utils/loadgen.c delete mode 100644 lustre/utils/lr_reader.c delete mode 100755 lustre/utils/lrun delete mode 100644 lustre/utils/ltrack_stats.c delete mode 100644 lustre/utils/lustre_cfg.c delete mode 100644 lustre/utils/mkfs_lustre.c delete mode 100755 lustre/utils/module_cleanup.sh delete mode 100755 lustre/utils/module_setup.sh delete mode 100644 lustre/utils/mount_lustre.c delete mode 100644 lustre/utils/obd.c delete mode 100644 lustre/utils/obdbarrier.c delete mode 100644 lustre/utils/obdctl.c delete mode 100644 lustre/utils/obdctl.h delete mode 100644 lustre/utils/obdio.c delete mode 100644 lustre/utils/obdiolib.c delete mode 100644 lustre/utils/obdiolib.h delete mode 100644 lustre/utils/parser.c delete mode 100644 lustre/utils/parser.h delete 
mode 100644 lustre/utils/platform.h delete mode 100644 lustre/utils/plot-llstat delete mode 100644 lustre/utils/wirecheck.c delete mode 100644 lustre/utils/wirehdr.c delete mode 100644 lustre/utils/wiretest.c diff --git a/build/README.kernel-source b/build/README.kernel-source deleted file mode 100644 index bb985e9..0000000 --- a/build/README.kernel-source +++ /dev/null @@ -1,103 +0,0 @@ -Lustre can currently build against Red Hat 2.4 and SuSE 2.6 -kernel-source RPMs. All other kernel-source RPMs are *unsupported* at -this time. - -Note that a Lustre-patched kernel is required for building Lustre; in -most cases a kernel-source RPM from your Linux vendor will not contain -the necessary patches. - -## -## Instructions for Red Hat 2.4.x kernel-source RPMs -## - -1. kernel.h - -Building against a kernel-source RPM requires a special header. On -Red Hat systems, this file should be automatically created at boot -time, and saved in /boot/kernel.h. - - *** If you are not running Red Hat Linux, or are not booted into the - *** kernel you are trying to build against, you need to create this - *** file manually. - - *** If you do not, the Lustre build may fail, or may fail to build - *** modules that work with your kernel. - -Here is an example /boot/kernel.h file. If you are building on -x86_64, the first defines should be __MODULE_KERNEL_x86_64, etc. The -other defines should be simple to figure out. - -/* This file is automatically generated at boot time. 
*/ -#ifndef __BOOT_KERNEL_H_ -#define __BOOT_KERNEL_H_ - -/* Kernel type i686-smp */ - -#ifndef __MODULE_KERNEL_i686 -#define __MODULE_KERNEL_i686 1 -#endif - -#ifndef __BOOT_KERNEL_ENTERPRISE -#define __BOOT_KERNEL_ENTERPRISE 0 -#endif - -#ifndef __BOOT_KERNEL_BIGMEM -#define __BOOT_KERNEL_BIGMEM 0 -#endif - -#ifndef __BOOT_KERNEL_HUGEMEM -#define __BOOT_KERNEL_HUGEMEM 0 -#endif - -#ifndef __BOOT_KERNEL_SMP -#define __BOOT_KERNEL_SMP 1 -#endif - -#ifndef __BOOT_KERNEL_UP -#define __BOOT_KERNEL_UP 0 -#endif - -#endif - -You should save this somewhere, and pass the location of this file to -./configure using the --with-kernel-source-header option. - -2. .config - -You will also need to tell Lustre about the .config file for your -kernel. The two likely locations of this file are -/boot/config-$(uname -r), and /usr/src/linux-2.4/configs/. You should -pass the location of this file to Lustre using the --with-linux-config -option. - -3. An Example - -Here is an example for configuring Lustre: - -./configure --with-linux=/usr/src/linux-2.4.20-28.9_lustre.1.0.3 \ ---with-kernel-source-header=/boot/kernel.h \ ---with-linux-config=/boot/config-2.4.20-28.9_lustre.1.0.3smp - -## -## Instructions for SuSE 2.6.x kernel-source RPMs -## - -1. kernel-syms - -In addition to the kernel-source rpm, you may need to install a -kernel-syms rpm. This should be included where you got your kernel -rpm. - -2. linux-obj - -You will need to choose the correct linux-obj directory for your -machine. They will be located in /usr/src/linux-$(uname --r)-obj/$ARCH/$FLAVOR. Lustre should be configured with the ---with-linux-obj option. - -3. 
Example - -Here is an example for configuring Lustre: - -./configure --with-linux=/usr/src/linux-2.6.5-7.97 \ ---with-linux-obj=/usr/src/linux-2.6.5-7.97-obj/ppc64/pseries64 diff --git a/build/Rules.in b/build/Rules.in deleted file mode 100644 index 815b3da..0000000 --- a/build/Rules.in +++ /dev/null @@ -1,78 +0,0 @@ -# Directories building kernel modules should have two files: -# -# Makefile.in: -# -# MODULES := -# -objs := file1.o file2.o file3.o -# @INCLUDE_RULES@ -# -# and autoMakefile.am: -# -# if LIBLUSTRE -# -# endif -# -# if MODULES -# modulefs_DATA = $(KMODEXT) -# endif -# -# DIST_SOURCES = $(-objs:.o=.c) -# MOSTLYCLEANFILES = *.o *.ko *.mod.c - -ifeq ($(PATCHLEVEL),) - -include autoMakefile - -# The kernel ABI files for the nonfree modules. -KABIS := $(NONFREE_MODULES:%$(KMODEXT)=%.kabi) - -all: archive-nonfree-modules - -# Where to archive the nonfree modules for binary distribution. -# If this directory has a colon in it, SSH/SCP are used to go out on the network. -nonfreedir := $$HOME/nonfree -#nonfreedir := moraine.clusterfs.com:/home/lustre-nonfree - -# Put the nonfree modules and corresponding KABI files into the binary -# archive. We assume that if the CVS subdirectory doesn't exist, we -# don't want to archive. -archive-nonfree-modules: $(KABIS) $(NONFREE_MODULES) - test -d CVS || exit 0; \ - list="$(NONFREE_MODULES)"; for mod in $$list; do \ - perl $(top_srcdir)/build/kabi -v archive $(nonfreedir) $$mod || exit $$?; \ - done - -# Generate the Kernel ABI files for the nonfree modules. 
-$(KABIS): $(NONFREE_MODULES) - for mod in $(NONFREE_MODULES); do \ - CC="$(CC)" perl $(top_srcdir)/build/kabi --with-linux="$(LINUX)" module $$mod || exit $$?; \ - done - -fix-kext-ownership: - @if test -d $(DESTDIR)$(kextdir) ; then \ - echo chown -R root:wheel $(DESTDIR)$(kextdir) ; \ - chown -R root:wheel $(DESTDIR)$(kextdir) || \ - echo >&2 "*** WARNING: Could not fix kext ownership for $(DESTDIR)$(kextdir)" ; \ - fi - -else - -include @LINUX_CONFIG@ - -EXTRA_CFLAGS := $(EXTRA_PRE_CFLAGS) -EXTRA_CFLAGS += @EXTRA_KCFLAGS@ @UML_CFLAGS@ @CFLAGS@ -EXTRA_CFLAGS += $(EXTRA_POST_CFLAGS) - -obj-m := $(patsubst %,%.o,$(MODULES)) - -ifeq ($(PATCHLEVEL),4) -# 2.4 rules -O_TARGET := $(firstword $(obj-m)) -obj-y := $($(firstword $(MODULES))-objs) -export-objs := $(obj-y) $(filter-out $(O_TARGET),$(obj-m)) -include $(TOPDIR)/Rules.make -$(MODINCL)/%.ver: %.c - @true -endif # PATCHLEVEL - -endif # KERNELRELEASE diff --git a/build/autoconf/.cvsignore b/build/autoconf/.cvsignore deleted file mode 100644 index 282522d..0000000 --- a/build/autoconf/.cvsignore +++ /dev/null @@ -1,2 +0,0 @@ -Makefile -Makefile.in diff --git a/build/autoconf/Makefile.am b/build/autoconf/Makefile.am deleted file mode 100644 index 5ee43813..0000000 --- a/build/autoconf/Makefile.am +++ /dev/null @@ -1,4 +0,0 @@ -EXTRA_DIST := lustre-build.m4 lustre-build-linux.m4 -if DARWIN -EXTRA_DIST += lustre-build-darwin.m4 -endif diff --git a/build/autoconf/lustre-build-darwin.m4 b/build/autoconf/lustre-build-darwin.m4 deleted file mode 100644 index a14ebf8..0000000 --- a/build/autoconf/lustre-build-darwin.m4 +++ /dev/null @@ -1,106 +0,0 @@ -# -# LB_DARWIN_CHECK_FUNCS -# -# check for functions in the darwin kernel -# Note that this is broken for cross compiling -# -AC_DEFUN([LB_DARWIN_CHECK_FUNCS], -[AC_FOREACH([AC_Func], [$1], - [AH_TEMPLATE(AS_TR_CPP(HAVE_[]AC_Func), - [Define to 1 if you have the `]AC_Func[' function.])])dnl -for ac_func in $1 -do -AC_MSG_CHECKING([for $1]) -AS_IF([AC_TRY_COMMAND(nm /mach | 
grep "[$1]" >/dev/null 2>/dev/null)],[ - AC_MSG_RESULT([yes]) - AC_DEFINE_UNQUOTED([AS_TR_CPP([HAVE_$ac_func])]) $2 -],[ - AC_MSG_RESULT([no]) $3 -])dnl -done -]) - -# -# LB_DARWIN_CONDITIONALS -# -# AM_CONDITIONALs for darwin -# -AC_DEFUN([LB_DARWIN_CONDITIONALS], -[ -]) - -# -# LB_PROG_DARWIN -# -# darwin tests -# -AC_DEFUN([LB_PROG_DARWIN], -[kernel_framework="/System/Library/Frameworks/Kernel.framework" -# -# FIXME: there should be a better way to get these than hard coding them -# -case $target_cpu in - powerpc*) - EXTRA_KCFLAGS="$EXTRA_KCFLAGS -arch ppc -mtune=G4 -mlong-branch" - EXTRA_KLDFLAGS="-arch ppc" - ;; - i?86 | x86_64) - EXTRA_KCFLAGS="$EXTRA_KCFLAGS -arch i386" - EXTRA_KLDFLAGS="-arch i386" - ;; -esac - -# Kernel of OS X is not 64bits(even in Tiger), but -m64 can be taken by gcc in Tiger -# (Tiger can support 64bits applications), so we have to eliminate -m64 while -# building kextensions for and OS X. -CC=`echo $CC | sed -e "s/\-m64//g"` -EXTRA_KCFLAGS="$EXTRA_KCFLAGS -x c -pipe -Wno-trigraphs -fasm-blocks -g -O0" -EXTRA_KCFLAGS="$EXTRA_KCFLAGS -Wno-four-char-constants -Wmost -O0" -EXTRA_KCFLAGS="$EXTRA_KCFLAGS -fmessage-length=0" -EXTRA_KCFLAGS="$EXTRA_KCFLAGS -I$kernel_framework/Headers" -EXTRA_KCFLAGS="$EXTRA_KCFLAGS -I$kernel_framework/Headers/bsd" -EXTRA_KCFLAGS="$EXTRA_KCFLAGS -I$kernel_framework/PrivateHeaders" -EXTRA_KCFLAGS="$EXTRA_KCFLAGS -fno-common -nostdinc -fno-builtin" -EXTRA_KCFLAGS="$EXTRA_KCFLAGS -finline -fno-keep-inline-functions" -EXTRA_KCFLAGS="$EXTRA_KCFLAGS -force_cpusubtype_ALL -fno-exceptions" -EXTRA_KCFLAGS="$EXTRA_KCFLAGS -msoft-float -static" -EXTRA_KCFLAGS="$EXTRA_KCFLAGS -DKERNEL -DKERNEL_PRIVATE" -EXTRA_KCFLAGS="$EXTRA_KCFLAGS -DDRIVER_PRIVATE -DAPPLE -DNeXT" -EXTRA_KCFLAGS="$EXTRA_KCFLAGS -D__KERNEL__ -D__DARWIN__" -# -# C flags for Panther/Tiger -# -case $target_os in - darwin8*) - EXTRA_KCFLAGS="$EXTRA_KCFLAGS -D__DARWIN8__" - ;; - darwin7*) - EXTRA_KCFLAGS="$EXTRA_KCFLAGS -ffix-and-continue" - ;; -esac - -# -# 
Debugging flags. Remove! -# -EXTRA_KCFLAGS="$EXTRA_KCFLAGS -O0 -DMACH_ASSERT=1" -EXTRA_KLDFLAGS="$EXTRA_KLDFLAGS -static -nostdlib -r" -EXTRA_KLIBS="-lkmodc++ -lkmod -lcc_kext" -KMODEXT="" - -AC_SUBST(EXTRA_KLDFLAGS) -AC_SUBST(EXTRA_KLIBS) - -kextdir='/System/Library/Extensions/$(firstword $(macos_PROGRAMS)).kext' -plistdir='$(kextdir)/Contents' -macosdir='$(plistdir)/MacOS' - -AC_SUBST(kextdir) -AC_SUBST(plistdir) -AC_SUBST(macosdir) - -LN_PROG_DARWIN - -LP_PROG_DARWIN - -LC_PROG_DARWIN -]) diff --git a/build/checkstack.pl b/build/checkstack.pl deleted file mode 100644 index 6bdab11..0000000 --- a/build/checkstack.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/perl -# Check the stack usage of functions -# -# Copyright Joern Engel -# Inspired by Linus Torvalds -# Original idea maybe from Keith Owens -# s390 port and big speedup by Arnd Bergmann -# Modified to have simpler output format by Dan Kegel -# -# Usage: -# objdump -d vmlinux | stackcheck.pl [arch] -# -# find -name "*.o" | while read M; do -# objdump -d $M | perl ~/checkstack.pl | \ -# sed "s/^/`basename $M`: /" ; done | \ -# awk '/esp/ { print $5, $2, $4 }' | sort -nr - -# TODO : Port to all architectures (one regex per arch) - -# check for arch -# -# $re is used for three matches: -# $& (whole re) matches the complete objdump line with the stack growth -# $1 (first bracket) matches the code that will be displayed in the output -# $2 (second bracket) matches the size of the stack growth -# -# use anything else and feel the pain ;) -{ - my $arch = shift; - $x = "[0-9a-f]{2,5}"; # hex number >= 256 - $d = "([0-9]{2}|[2-9])[0-9]{2}"; # decimal number >= 200 - if ($arch eq "") { - $arch = `uname -m`; - } - if ($arch =~ /^i[3456]86$/) { - #c0105234: 81 ec ac 05 00 00 sub $0x5ac,%esp - $re = qr/^.*(sub \$(0x$x),\%esp)$/o; - $todec = sub { return hex($_[0]); }; - } elsif ($arch =~ 'x86_64') { - # 2f60: 48 81 ec e8 05 00 00 sub $0x5e8,%rsp - $re = qr/^.*(add \$(0x$x),\%rsp)$/o; - $todec = sub { return hex($_[0]); }; 
- } elsif ($arch =~ /^ia64$/) { - #e0000000044011fc: 01 0f fc 8c adds r12=-384,r12 - $re = qr/.*(adds.*r12=-($d),r12)/o; - $todec = sub { return $_[0]; }; - } elsif ($arch =~ /^mips64$/) { - #8800402c: 67bdfff0 daddiu sp,sp,-16 - $re = qr/.*(daddiu.*sp,sp,-($d))/o; - $todec = sub { return $_[0]; }; - } elsif ($arch =~ /^mips$/) { - #88003254: 27bdffe0 addiu sp,sp,-32 - $re = qr/.*(addiu.*sp,sp,-($d))/o; - $todec = sub { return $_[0]; }; - } elsif ($arch =~ /^ppc$/) { - #c00029f4: 94 21 ff 30 stwu r1,-208(r1) - $re = qr/.*(stwu.*r1,-($x)\(r1\))/o; - $todec = sub { return hex($_[0]); }; - } elsif ($arch =~ /^s390x?$/) { - # 11160: a7 fb ff 60 aghi %r15,-160 - $re = qr/.*(ag?hi.*\%r15,-($d))/o; - $todec = sub { return $_[0]; }; - } else { - print "Usage: objdump -d vmlinux | checkstack.pl [arch]\n"; - print "where arch is i386, ia64, mips, mips64, ppc, or s390\n"; - print "Each output line gives a function's stack usage, name\n"; - print "Lines are output in order of decreasing stack usage\n"; - die("wrong or unknown architecture\n"); - } -} - -$funcre = qr/^[0-9a-f]* \<(.*)\>:$/; -while ($line = ) { - if ($line =~ m/$funcre/) { - ($func = $line) =~ s/$funcre/\1/; - chomp($func); - } - if ($line =~ m/$re/) { - push(@stack, &$todec($2)." ".$func); - # don't expect more than one stack allocation per function - $func .= " ** bug **"; - } -} - -foreach (sort { $b - $a } (@stack)) { - print $_."\n"; -} diff --git a/build/clearpatches.sh b/build/clearpatches.sh deleted file mode 100644 index a0f5741..0000000 --- a/build/clearpatches.sh +++ /dev/null @@ -1,13 +0,0 @@ -BASEDIR=${BASEDIR:-lustre/kernel_patches} -SERIESPATH=${SERIESPATH:-$BASEDIR/series} -PATCHESPATH=${PATCHESPATH:-$BASEDIR/patches} -NOUSEPATH=${NOUSEPATH:-$BASEDIR/unused} - -#mkdir -p $NOUSEPATH -for PATCH in `ls $PATCHESPATH | grep -v CVS` ; do - #echo $PATCH - if ! 
grep -rq $PATCH $SERIESPATH ; then - echo "$PATCH" - #mv $PATCHESPATH/$PATCH $NOUSEPATH - fi -done diff --git a/build/confirmpatches.sh b/build/confirmpatches.sh deleted file mode 100644 index 1c160da..0000000 --- a/build/confirmpatches.sh +++ /dev/null @@ -1,12 +0,0 @@ -BASEDIR=${BASEDIR:-lustre/kernel_patches} -SERIESPATH=${SERIESPATH:-$BASEDIR/series} -PATCHESPATH=${PATCHESPATH:-$BASEDIR/patches} -for SERIES in `ls $SERIESPATH | egrep -v "CVS|~$|.orig"` ; do - #echo $SERIES - for PATCH in `cat $SERIESPATH/$SERIES`; do - #echo $PATCH - if [ ! `find $PATCHESPATH -name $PATCH` ]; then - echo "$SERIESPATH/$SERIES: patch $PATCH was not found !" - fi - done -done diff --git a/build/cvs-modified-files.pl b/build/cvs-modified-files.pl deleted file mode 100755 index d13c4d3..0000000 --- a/build/cvs-modified-files.pl +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env perl - -my $mode = "NONE"; -my @modified, @added, @removed; - -while($line = <>) { - if ($line =~ /Modified Files:/) { - $mode = "MODIFIED"; - next; - } - - if ($line =~ /Added Files:/) { - $mode = "ADDED"; - next; - } - - if ($line =~ /Removed Files:/) { - $mode = "REMOVED"; - next; - } - - if ($mode eq "NONE") { next; } - if ($line =~ /-------/) { next; } - - chop($line); - $line =~ s/^CVS:\s+//; - $line =~ s/\s+$//; - # print "processing $line for $mode\n"; - @files = split(/ /, $line); - # print "new files for $mode: ", join(', ', @files), "\n"; - - if ($mode eq "MODIFIED") { - push(@modified, @files); - } elsif ($mode eq "ADDED") { - push(@added, @files); - } elsif ($mode eq "REMOVED") { - push(@removed, @files); - } else { - die "Unknown mode $mode!"; - } -} - -print join(' ', @modified); -if ($ENV{"SHOW_ALL_FILES"} ne "no") { - print ' ', join(' ', @added), ' ', join(' ', @removed); -} -print "\n"; diff --git a/build/land2.sh b/build/land2.sh deleted file mode 100755 index ea616a9..0000000 --- a/build/land2.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -e - -CVS=cvs - -if [ ! 
-f .mergeinfo ] ; then - echo ".mergeinfo doesn't exist - exit" - exit -fi - -. .mergeinfo - -if [ "$OPERATION" != "Land" ] ; then - echo "OPERATION must be Land - is $OPERATION" - echo "You should probably be running ${OPERATION}2.sh" - exit -fi - -if [ -f "$CONFLICTS" ] ; then - echo "$CONFLICTS exists - clean up first" - cat $CONFLICTS - exit -fi - -cvs update $dir 2>&1 | grep "^M" && echo "uncommitted changes" && exit 1 - -echo -n "Tagging as ${CHILD}_BASE_$date ..." -$CVS rtag -r $parent ${CHILD}_BASE_$date $module -echo "done" -echo -n "Tagging as ${CHILD}_BASE ..." -$CVS rtag -F -r $parent ${CHILD}_BASE $module - -echo "saving .mergeinfo as .mergeinfo-$date" -mv .mergeinfo .mergeinfo-$date -echo "done" diff --git a/build/linux-merge-config.awk b/build/linux-merge-config.awk deleted file mode 100644 index 9a9338c..0000000 --- a/build/linux-merge-config.awk +++ /dev/null @@ -1,317 +0,0 @@ -#!/bin/awk -f -BEGIN { - nsects = 0 -} -{ - ARCH = $1 - ARCHES[ARCH] = 1 - TYPE = $2 - TYPES[TYPE] = 1 - NTOTAL++ - ARCHTYPES[ARCH ":" TYPE] = 1 - NARCHES[TYPE]++ - if (NARCHES[TYPE] == 1) - NTOTALTYPES++ - NTYPES[ARCH]++ - if (NTYPES[ARCH] == 1) - NTOTALARCHES++ - FILE = $3 - cursects = nsects - while ((getline < FILE) > 0) { - if ($0 ~ /^\/\*/ || $0 ~ /^ \*\// || $0 ~ /^[ ]*$/) - continue - if ($0 ~ /^ * /) { - SECTION = gensub(/^ \* /,"",$0) - if (!(SECTION in sectno)) { - sectno[SECTION] = nsects - counts[SECTION] = 0 - nsects++ - } else if (cursects && cursects != nsects) { - no = sectno[SECTION] - diff = nsects - cursects - for (s in sectno) { - if (sectno[s] >= cursects) - sectno[s] = sectno[s] - cursects + no - else if (sectno[s] >= no) - sectno[s] += diff - } - } - cursects = nsect - cursym[SECTION] = counts[SECTION] - continue - } - if ($1 != "#define" && $1 != "#undef") - exit 1 - SYMBOL = $2 - n = index($0,SYMBOL)+length(SYMBOL) - if ($1 == "#define") { - n = index($0,SYMBOL)+length(SYMBOL) - VALUE = gensub(/^[ ]*/,"","",substr($0,n)) - if (VALUE == "") VALUE = 
"__novalue__" - } else - VALUE = "__undefined__" - if (values[SYMBOL]) { - if (present[SYMBOL,ARCH,TYPE]) continue - present[SYMBOL,ARCH,TYPE] = 1 - values[SYMBOL] = values[SYMBOL] SUBSEP ARCH ":" TYPE ":" VALUE - if (SECTION == sections[SYMBOL] && cursym[SECTION] && cursym[SECTION] != counts[SECTION]) { - no = pos[SYMBOL] - diff = counts[SECTION]-cursym[SECTION] - for (s in pos) - if (sections[s] == SECTION) { - if (pos[s] >= cursym[SECTION]) - pos[s] = pos[s] - cursym[SECTION] + no - else if (pos[s] >= no) - pos[s] += diff - } - cursym[SECTION] = counts[SECTION] - } - } else { - present[SYMBOL,ARCH,TYPE] = 1 - values[SYMBOL] = ARCH ":" TYPE ":" VALUE - sections[SYMBOL] = SECTION - pos[SYMBOL] = counts[SECTION] - counts[SECTION]++ - } - } - close(FILE) -} -END { - for (SECTION in sectno) - x[sectno[SECTION]] = SECTION - for (i = 0; i < nsects; i++) { - SECTION = x[i] - if (i > 0) - printf "\n" - printf "/*\n * %s\n */\n", SECTION - split("",lines) - lastelse = "" - for (SYMBOL in sections) - if (sections[SYMBOL] == SECTION) - y[pos[SYMBOL]] = SYMBOL - for (j = 0; j < counts[SECTION]; j++) { - SYMBOL = y[j] - split("",ntype) - split("",total) - split(values[SYMBOL],z,SUBSEP) - split("",val) - totalsum = 0 - for (k in z) { - split(z[k],l,":") - ARCH = l[1] - TYPE = l[2] - VALUE = substr(z[k],length(ARCH)+length(TYPE)+3) - if (val[VALUE]) - val[VALUE] = val[VALUE] " " - val[VALUE] = val[VALUE] ARCH ":" TYPE - ntype[VALUE,TYPE] += 1 - total[VALUE] += 1 - totalsum += 1 - } - split("",curlines) - append = 1 - for (VALUE in val) { - if (total[VALUE] == NTOTAL) { - if (VALUE == "__undefined__") - curlines["1"] = "#undef " SYMBOL "\n" - else if (VALUE == "__novalue__") - curlines["1"] = "#define " SYMBOL "\n" - else - curlines["1"] = "#define " SYMBOL " " VALUE "\n" - if (!lines["1"]) - append = 0 - break - } - shorteststr = "" - curcount = 0 - for (m = 0; m < 4; m++) { - str = "" - split(val[VALUE],yy) - if (total[VALUE] > 1 && total[VALUE] == NTOTAL - 1) { - found = 0 - 
for (arch in ARCHES) { - for (type in TYPES) { - archtype = arch ":" type - if (ARCHTYPES [archtype] == 1) { - for (n in yy) - if (yy[n] == archtype) - break - if (yy[n] != archtype) { - found = 1 - break - } - } - } - if (found) - break - } - if (NARCHES[type] > 1 && NTYPES[arch] > 1) { - str = "!defined(__module__" arch "_" type ")" - shorteststr = str - break - } - } - if (m == 0 || m == 2) { - nfull = 0 - split("",yysave) - for (type in TYPES) - if (ntype[VALUE,type] == NARCHES[type]) { - if (str) str = str " || " - str = str "defined(__module__" type ")" - for (k in yy) { - split(yy[k], z, ":") - if (z[2] == type) { - yysave[k] = yy[k] - delete yy[k] - } - } - nfull++ - } else - NOTYPE = type - if (m < 2 && nfull > 1 && nfull == NTOTALTYPES - 1) { - str = "!defined(__module__" NOTYPE ")" - for (k in yysave) - yy[k] = yysave[k] - for (k in yy) { - split(yy[k], z, ":") - if (z[2] != NOTYPE) - delete yy[k] - } - } - } - savestr = str - nfull = 0 - split("",yysave) - for (arch in ARCHES) { - narch = 0 - for (k in yy) { - split(yy[k], z, ":") - if (z[1] == arch) - narch++ - } - if (narch == NTYPES[arch]) { - if (str) str = str " || " - str = str "defined(__module__" arch ")" - for (k in yy) { - split(yy[k], z, ":") - if (z[1] == arch) { - yysave[k] = yy[k] - delete yy[k] - } - } - nfull++ - } else - NOARCH = arch - } - if (m < 2 && nfull > 1 && nfull == NTOTALARCHES - 1) { - str = savestr - for (k in yysave) - yy[k] = yysave[k] - if (str) str = str " || " - str = str "!defined(__module__" NOARCH ")" - for (k in yy) { - split(yy[k], z, ":") - if (z[1] != NOARCH) - delete yy[k] - } - } - if (m == 1 || m == 3) { - savestr = str - nfull = 0 - split("",yysave) - for (type in TYPES) { - ntypex = 0 - for (k in yy) { - split(yy[k], z, ":") - if (z[2] == type) - ntypex++ - } - if (ntypex == NARCHES[type]) { - if (str) str = str " || " - str = str "defined(__module__" type ")" - for (k in yy) { - split(yy[k], z, ":") - if (z[2] == type) { - yysave[k] = yy[k] - delete yy[k] - 
} - } - nfull++ - } else - NOTYPE = type - } - if (m < 2 && nfull > 1 && nfull == NTOTALTYPES - 1) { - str = savestr - for (k in yysave) - yy[k] = yysave[k] - if (str) str = str " || " - str = "!defined(__module__" NOTYPE ")" - for (k in yy) { - split(yy[k], z, ":") - if (z[2] != NOTYPE) - delete yy[k] - } - } - } - for (k in yy) { - split(yy[k], z, ":") - if (str) str = str " || " - str = str "defined(__module__" z[1] "_" z[2] ")" - } - if (m == 0 || length(str) < length(shorteststr)) - shorteststr = str - } - str = shorteststr - if (VALUE == "__undefined__") - curlines[str] = "#undef " SYMBOL "\n" - else if (VALUE == "__novalue__") - curlines[str] = "#define " SYMBOL "\n" - else - curlines[str] = "#define " SYMBOL " " VALUE "\n" - if (!lines[str]) - append = 0 - } - if (append) { - for (str in curlines) - if (curlines[str]) - lines[str] = lines[str] curlines[str] - } else { - if (lines["1"]) - printf "%s", lines["1"] - else if (j > 0) { - ifstr = "#if " - for (str in lines) - if (lines[str] && str != lastelse) { - printf "%s %s\n%s", ifstr, str, lines[str] - ifstr = "#elif " - } - if (lastelse != "") - printf "#else\n%s", lines[lastelse] - printf "#endif\n" - } - split("",lines) - lastelse = "" - for (str in curlines) - if (curlines[str]) { - lines[str] = curlines[str] - if (totalsum == NTOTAL && length(str) > length(lastelse)) { - lastelse = str - } - } - } - } - if (lines["1"]) - printf "%s", lines["1"] - else if (j > 0) { - ifstr = "#if " - for (str in lines) - if (lines[str] && str != lastelse) { - printf "%s %s\n%s", ifstr, str, lines[str] - ifstr = "#elif " - } - if (lastelse != "") - printf "#else\n%s", lines[lastelse] - printf "#endif\n" - } - } -} diff --git a/build/linux-merge-modules.awk b/build/linux-merge-modules.awk deleted file mode 100644 index babc815..0000000 --- a/build/linux-merge-modules.awk +++ /dev/null @@ -1,125 +0,0 @@ -#!/bin/awk -f -{ - # lines in input look like ARCH TYPE path/to/TYPE/ARCH/modules/foo.ver - ARCH=$1 - ARCHES[ARCH]=1 - 
TYPE=$2 - TYPES[TYPE]=1 - NTOTAL++ - NARCHES[TYPE]++ - NTYPES[ARCH]++ - FILE=$3 - - # read files that look like pairs of repeating - # #define __ver_foo hexstring - # #define foo _set_ver(foo) - while ((getline < FILE) > 0) { - if ($0 ~ /^[ ]*$/) - continue - if ($1 != "#define" || $2 !~ /^__ver_/) - exit 1 - - # this is a "#define __ver_foo somehex" line - SYMBOL=gensub(/^__ver_/,"","",$2) - VALUE=gensub(/^(smp_|2gig_|smp2gig_)/,"","",$3) - VALUE=gensub(/^(smp|2gig|smp2gig)/,"","",VALUE) - values[SYMBOL,ARCH,TYPE]=VALUE - - # skip the "#define foo _set_ver(foo)" line - if ((getline < FILE) <= 0) - exit 2 - if ($1 != "#define" || $2 != SYMBOL || $3 != "_set_ver(" SYMBOL ")") - exit 3 - } - close(FILE) -} -END { - count=0 - for (key in values) - if (values[key]) { - count++ - split(key,x,SUBSEP) - SYMBOL=x[1] - ARCH=x[2] - TYPE=x[3] - - # (re)initialize a few arrays to have no elements - split("",x) - split("",ntype) - split("",total) - - totalsum=0 - for (arch in ARCHES) - for (type in TYPES) - if (values[SYMBOL,arch,type]) { - VALUE = values[SYMBOL,arch,type] - values[SYMBOL,arch,type] = "" - ntype[VALUE,type] += 1 - total[VALUE] += 1 - if (x[VALUE]) - x[VALUE] = x[VALUE] " " - x[VALUE] = x[VALUE] arch ":" type - } - ifstr="#if " - for (VALUE in x) { - if (total[VALUE] == NTOTAL) { - # there is only one checksum for this symbol - printf "#define __ver_%s\t_ver_str(%s)\n", SYMBOL, VALUE - printf "#define %s _set_ver(%s)\n", SYMBOL, SYMBOL - break - } - - totalsum += total[VALUE] - if (totalsum == NTOTAL && ifstr == "#elif") { - # this is the last unique checksum for this symbol - printf "#else\n#define __ver_%s\t_ver_str(%s)\n", SYMBOL, VALUE - printf "#define %s _set_ver(%s)\n", SYMBOL, SYMBOL - break - } - - # there must be more than one checksum still to - # print for this symbol - str="" - split(x[VALUE],y) - for (type in TYPES) - if (ntype[VALUE,type] == NARCHES[type]) { - if (str) str = str " || " - str = str "defined(__module__" type ")" - for (k in y) { - 
split(y[k], z, ":") - if (z[2] == type) - delete y[k] - } - } - for (arch in ARCHES) { - narch=0 - for (k in y) { - split(y[k], z, ":") - if (z[1] == arch) - narch++ - } - if (narch == NTYPES[arch]) { - if (str) str = str " || " - str = str "defined(__module__" arch ")" - for (k in y) { - split(y[k], z, ":") - if (z[1] == arch) - delete y[k] - } - } - } - for (k in y) { - split(y[k], z, ":") - if (str) str = str " || " - str = str "defined(__module__" z[1] "_" z[2] ")" - } - printf "%s %s\n#define __ver_%s\t_ver_str(%s)\n", ifstr, str, SYMBOL, VALUE - printf "#define %s _set_ver(%s)\n", SYMBOL, SYMBOL - ifstr="#elif " - } - if (ifstr == "#elif ") - printf "#endif\n" - } - if (!count) - printf "\n" -} diff --git a/build/merge1.sh b/build/merge1.sh deleted file mode 100755 index cbd3227..0000000 --- a/build/merge1.sh +++ /dev/null @@ -1,104 +0,0 @@ -#!/bin/bash -e - -CONFLICTS=cvs-merge-conflicts -CVS="cvs -z3" - -if [ -f .mergeinfo ] ; then - echo ".mergeinfo exists - clean up first" - exit -fi - -if [ -f $CONFLICTS ] ; then - echo "$CONFLICTS exists - clean up first" - exit -fi - -if [ $# -lt 2 -o $# -gt 3 ]; then - echo "This is phase 1 of merging branches. Usage: $0 parent child dir" - exit -fi - -parent=$1 -PARENT=`echo $parent | sed -e "s/^b_//" | tr "[a-z]" "[A-Z]"` -child=$2 -CHILD=`echo $child | sed -e "s/^b_//" | tr "[a-z]" "[A-Z]"` -date=`date +%Y%m%d_%H%M` - -dir=${3:-.} -module=$(basename $(<$dir/CVS/Repository)) - -if [ "$module" = "lustre" ] ; then - echo >&2 "${progname}: You probably want to merge lustre or portals, not the whole tree." 
- echo >&2 "${progname}: Try using ${0} $parent $child lustre" - exit 1 -fi - - -case $parent in - HEAD) : ;; - b_*|b[1-4]*) : ;; - *) parent="b_$parent" ;; -esac -case $child in - HEAD) : ;; - b_*|b[1-4]*) : ;; - *) child="b_$child" -esac - -if [ "$child" != "HEAD" -a "`cat $dir/CVS/Tag 2> /dev/null`" != "T$child" ]; then - echo "This script must be run within the $child branch" - exit 1 -fi - -TEST_FILE=${TEST_FILE:-ChangeLog} # does this need to be smarter? -[ $dir = "build" ] && TEST_FILE=lbuild -check_tag() { - [ -z "$1" ] && echo "check_tag() missing arg" && exit3 - [ "$1" = "HEAD" ] && return - $CVS log $dir/$TEST_FILE 2> /dev/null | grep -q " $1: " && return - echo "$0: tag $1 not found in $dir/$TEST_FILE" - exit 2 -} - -check_tag $parent -check_tag ${CHILD}_BASE - -cat << EOF > .mergeinfo -parent=$parent -PARENT=$PARENT -child=$child -CHILD=$CHILD -date=$date -dir=$dir -module=$module -CONFLICTS=$CONFLICTS -OPERATION=Merge -OPERWHERE=from -EOF - -echo PARENT: $PARENT parent: $parent CHILD: $CHILD child: $child date: $date - -echo -n "tagging $parent as '${PARENT}_${CHILD}_UPDATE_PARENT_$date' ...." -$CVS rtag -r $parent ${PARENT}_${CHILD}_UPDATE_PARENT_$date $module -echo "done" -echo -n "tagging $child as '${PARENT}_${CHILD}_UPDATE_CHILD_$date' ...." -$CVS rtag -r $child ${PARENT}_${CHILD}_UPDATE_CHILD_$date $module -echo "done" - -# Apply all of the changes to your local tree: -echo "Updating: -j ${CHILD}_BASE -j ${PARENT}_${CHILD}_UPDATE_PARENT_$date ...." -$CVS update -j ${CHILD}_BASE -j ${PARENT}_${CHILD}_UPDATE_PARENT_$date -dP $dir -echo "done" - -echo -n "Recording conflicts in $CONFLICTS ..." -$CVS update | awk '/^C/ { print $2 }' > $CONFLICTS -if [ -s $CONFLICTS ] ; then - echo "Conflicts found, fix before committing." 
- cat $CONFLICTS -else - echo "No conflicts found" - rm -f $CONFLICTS -fi -echo "done" - -echo "Build, test, commit and then run merge2.sh (no arguments)" diff --git a/build/merge2.sh b/build/merge2.sh deleted file mode 100755 index 0ef27cc..0000000 --- a/build/merge2.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -e - -if [ ! -f .mergeinfo ] ; then - echo ".mergeinfo doesn't exist - exit" - exit -fi - -. .mergeinfo - -if [ "$OPERATION" != "Merge" ] ; then - echo "OPERATION must be Merge - is $OPERATION" - echo "You should probably be running ${OPERATION}2.sh" - exit -fi - -if [ -f $CONFLICTS ] ; then - echo "$CONFLICTS exists - clean up first" - cat $CONFLICTS - exit -fi - -cvs update $dir 2>&1 | grep "^M" && echo "uncommitted changes" && exit 1 - -echo -n "Tagging ${PARENT}_${CHILD}_UPDATE_PARENT_$date as ${CHILD}_BASE_$date ..." -cvs rtag -r ${PARENT}_${CHILD}_UPDATE_PARENT_$date ${CHILD}_BASE_$date $module -echo "done" -echo -n "Tagging ${CHILD}_BASE as ${CHILD}_BASE_PREV ...." -cvs rtag -F -r ${CHILD}_BASE ${CHILD}_BASE_PREV $module -echo "done" -echo "${CHILD}_BASE_$date as ${CHILD}_BASE ..." -cvs rtag -F -r ${CHILD}_BASE_$date ${CHILD}_BASE $module - -echo "saving .mergeinfo as .mergeinfo-$date" -mv .mergeinfo .mergeinfo-$date -echo "done" diff --git a/build/osxpack/ReadMe.txt b/build/osxpack/ReadMe.txt deleted file mode 100644 index 06555ba..0000000 --- a/build/osxpack/ReadMe.txt +++ /dev/null @@ -1,4 +0,0 @@ -This is release 0.9.2 Beta of the Lustre OS X client. - -This is not a production quality release, so use it with care, and on non-production systems. The installation will overwrite the OS X kernel on your system. Un-installing this package will restore your current kernel. 
- diff --git a/build/osxpack/Welcome.txt b/build/osxpack/Welcome.txt deleted file mode 100644 index f6fa0ce..0000000 --- a/build/osxpack/Welcome.txt +++ /dev/null @@ -1,4 +0,0 @@ -Welcome to the Lustre OS X client installation - -You will be guided through the steps necessary to install this software. During installation you will be required to reboot your Mac, so make sure you have saved all your data and closed applications before continuing the installation. - diff --git a/build/osxpack/packlustre.sh b/build/osxpack/packlustre.sh deleted file mode 100755 index 7392c0e..0000000 --- a/build/osxpack/packlustre.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/sh -# -# This script is used to create package directory tree used -# by PackageMaker in OS X. - -PREFIX=$1 -STAGE=$2 - -RESOURCE=$PWD/build/osxpack -if ! [ -d $RESOURCE ]; then - echo "Your tree seems to be missing $RESOURCE." >&2 -fi - -if [ "x$PREFIX" == "x" ]; then - PREFIX=/home/cfs/package -fi - -if [ "x$STAGE" == "x" ]; then - STAGE=/System/Library/Extensions -fi - -if ! [ -d $STAGE/llite.kext ]; then - echo "Sorry, cannot find stage files for package" - exit 1 -fi - -if ! [ -d $PREFIX ]; then - mkdir -p $PREFIX -fi - -if ! [ -d $PREFIX/Install_resources ]; then - mkdir -p $PREFIX/Install_resources -fi - -if ! [ -d $PREFIX/Package_contents ]; then - mkdir -p $PREFIX/Package_contents -fi - -CONTENTS=$PREFIX/Package_contents - -if ! [ -d $CONTENTS/System/Libraray/Extensions ]; then - mkdir -p $CONTENTS/System/Library/Extensions -fi -# IMPORTANT -# /etc is symlink of /private/etc in OS X, if we -# just use $CONTENTS/etc, it will OVERWRITE /etc in -# installation target, that means all files in /etc -# will be lost, the system will be corrupted. -if ! [ -d $CONTENTS/private/etc ]; then - mkdir -p $CONTENTS/private/etc -fi - -if ! 
[ -d $CONTENTS/sbin ]; then - mkdir -p $CONTENTS/sbin -fi - -cp -f $RESOURCE/*.txt $PREFIX/Install_resources/ -cp -f $RESOURCE/*flight $PREFIX/Install_resources/ -cp -f $RESOURCE/sysctl.conf $CONTENTS/private/etc -cp -f $RESOURCE/uninstall_lustre $CONTENTS/sbin -cp -f $RESOURCE/unload_lustre $CONTENTS/sbin -cp -rf $STAGE/llite.kext $CONTENTS/System/Library/Extensions -cp -rf $STAGE/mdc.kext $CONTENTS/System/Library/Extensions -cp -rf $STAGE/lov.kext $CONTENTS/System/Library/Extensions -cp -rf $STAGE/osc.kext $CONTENTS/System/Library/Extensions -cp -rf $STAGE/ptlrpc.kext $CONTENTS/System/Library/Extensions -cp -rf $STAGE/ptlrpcs.kext $CONTENTS/System/Library/Extensions -cp -rf $STAGE/obdclass.kext $CONTENTS/System/Library/Extensions -cp -rf $STAGE/lvfs.kext $CONTENTS/System/Library/Extensions -cp -rf $STAGE/ksocknal.kext $CONTENTS/System/Library/Extensions -cp -rf $STAGE/portals.kext $CONTENTS/System/Library/Extensions -cp -rf $STAGE/libcfs.kext $CONTENTS/System/Library/Extensions diff --git a/build/osxpack/postflight b/build/osxpack/postflight deleted file mode 100755 index a4e6633..0000000 --- a/build/osxpack/postflight +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/sh -/usr/sbin/nvram boot-args="debug=0xe zsize=256" - -# Delete cache files -if [ -f /System/Library/Extensions.kextcache ]; then - rm -f /System/Library/Extensions.kextcache -fi - -if [ -d /System/Library/Caches/com.apple.kernelcaches ]; then - rm -rf /System/Library/Caches/com.apple.kernelcaches -fi - -touch /System/Library diff --git a/build/osxpack/preflight b/build/osxpack/preflight deleted file mode 100755 index 582aea3..0000000 --- a/build/osxpack/preflight +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/sh - -date=`date +%Y%m%d_%H%M` - -backpath=/System/LustreBackup -if ! 
[ -d $backpath ]; then - mkdir -p $backpath -fi - -backdir=${backpath}/SystemBackup - -# If system has been backed up, just exit -if [ -d ${backdir} ]; then - exit 0 -fi - -mkdir $backdir -chmod 700 $backdir - -# create backup directory -mkdir -p $backdir/System/Library/Frameworks -mkdir -p $backdir/System/Library/Extensions -mkdir -p $backdir/usr/lib -mkdir -p $backdir/usr/include -mkdir -p $backdir/private/etc - -# backup the old system -cp /mach_kernel $backdir/ -cp -rf /System/Library/Frameworks/IOKit.framework $backdir/System/Library/Frameworks/ -cp -rf /System/Library/Frameworks/Kernel.framework $backdir/System/Library/Frameworks/ -cp -rf /System/Library/Frameworks/System.framework $backdir/System/Library/Frameworks/ -cp -rf /System/Library/Extensions/System.kext $backdir/System/Library/Extensions/ -cp -rf /usr/include/bsm $backdir/usr/include/bsm -cp -rf /usr/include/default_pager $backdir/usr/include/default_pager -cp -rf /usr/include/dev $backdir/usr/include/dev -cp -rf /usr/include/device $backdir/usr/include/device -cp -rf /usr/include/drivers $backdir/usr/include/drivers -cp -rf /usr/include/hfs $backdir/usr/include/hfs -cp -rf /usr/include/isofs $backdir/usr/include/isofs -cp -rf /usr/include/libkern $backdir/usr/include/libkern -cp -rf /usr/include/mach $backdir/usr/include/mach -cp -rf /usr/include/mach_debug $backdir/usr/include/mach_debug -cp -rf /usr/include/machine $backdir/usr/include/machine -cp -rf /usr/include/miscfs $backdir/usr/include/miscfs -cp -rf /usr/include/net $backdir/usr/include/net -cp -rf /usr/include/netat $backdir/usr/include/netat -cp -rf /usr/include/netinet $backdir/usr/include/netinet -cp -rf /usr/include/netinet6 $backdir/usr/include/netinet6 -cp -rf /usr/include/netkey $backdir/usr/include/netkey -cp -rf /usr/include/nfs $backdir/usr/include/nfs -cp -rf /usr/include/pexpert $backdir/usr/include/pexpert -cp -rf /usr/include/ppc $backdir/usr/include/ppc -cp -rf /usr/include/profile $backdir/usr/include/profile -cp -rf 
/usr/include/sys $backdir/usr/include/sys -cp -rf /usr/include/ufs $backdir/usr/include/ufs -cp -rf /usr/include/vfs $backdir/usr/include/vfs -cp -f /usr/lib/libIOKit.A.dylib $backdir/usr/lib/libIOKit.A.dylib -cp -f /usr/lib/libIOKit.dylib $backdir/usr/lib/libIOKit.dylib -cp -f /usr/lib/libkmod.a $backdir/usr/lib/libkmod.a -cp -f /usr/lib/libkmodc++.a $backdir/usr/lib/libkmodc++.a -if [ -f /private/etc/sysctl.conf ]; then - cp -f /private/etc/sysctl.conf $backdir/private/etc/sysctl.conf -fi - -# record sysctem backup path of current installation -# echo $backdir >> $backpath/lustre-current -# echo $date > $backdir/install_date diff --git a/build/osxpack/sysctl.conf b/build/osxpack/sysctl.conf deleted file mode 100644 index f8d2cc2..0000000 --- a/build/osxpack/sysctl.conf +++ /dev/null @@ -1,15 +0,0 @@ -# nmbclusters can not be modified by sysctl(it's read only) -# so we have to change it in kernel patch -## kern.ipc.nmbclusters=32768 - -# Keep a long enough reassembly queue. -net.inet.tcp.reass.maxsegments=4096 - -# max permited sockbuf -kern.ipc.maxsockbuf=8388608 - -# XXX -net.inet.tcp.sendspace=3217968 -net.inet.tcp.recvspace=3217968 - -kern.maxvnodes=65536 diff --git a/build/osxpack/uninstall_lustre b/build/osxpack/uninstall_lustre deleted file mode 100755 index d2624d2..0000000 --- a/build/osxpack/uninstall_lustre +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/sh -# -# It's the uninstall programe of lustre - -mount | grep lustre > /dev/null -if [ $? -eq 0 ]; then - echo "Please unmount Lustre before uninstall Lustre." - exit 1 -fi - -echo "unloading modules......" -/sbin/unload_lustre 2 > /dev/null - -backdir=/System/LustreBackup/SystemBackup -if ! [ -d $backdir ]; then - echo "No backup directory is found, you have not installed lustre or uninstall failed!" - exit 1 -fi - -# if ! [ -f $backpath/lustre-current ]; then -# echo "No version information of Lustre installation, you have uninstalled or uninstall failed!" 
-# exit 2 -# fi -# -# get information of the the backup system -# -# backdir=`tail -1 $backpath/lustre-current` -# if ! [ -d $backdir ]; then -# echo "Miss backup directory, uninstall failed!" -# exit 3 -# fi - -# recover the old system -echo "recover orignal kernel......" -cp -f $backdir/mach_kernel /mach_kernel -echo "recover orignal frameworks......" -cp -rf $backdir/System/Library/Frameworks/IOKit.framework /System/Library/Frameworks/ -cp -rf $backdir/System/Library/Frameworks/Kernel.framework /System/Library/Frameworks/ -cp -rf $backdir/System/Library/Frameworks/System.framework /System/Library/Frameworks/ -echo "recover orignal kernel extensions......" -cp -rf $backdir/System/Library/Extensions/System.kext /System/Library/Extensions/ -echo "recover orignal system headers......" -cp -rf $backdir/usr/include/bsm /usr/include/bsm -cp -rf $backdir/usr/include/default_pager /usr/include/default_pager -cp -rf $backdir/usr/include/dev /usr/include/dev -cp -rf $backdir/usr/include/device /usr/include/device -cp -rf $backdir/usr/include/drivers /usr/include/drivers -cp -rf $backdir/usr/include/hfs /usr/include/hfs -cp -rf $backdir/usr/include/isofs /usr/include/isofs -cp -rf $backdir/usr/include/libkern /usr/include/libkern -cp -rf $backdir/usr/include/mach /usr/include/mach -cp -rf $backdir/usr/include/mach_debug /usr/include/mach_debug -cp -rf $backdir/usr/include/machine /usr/include/machine -cp -rf $backdir/usr/include/miscfs /usr/include/miscfs -cp -rf $backdir/usr/include/net /usr/include/net -cp -rf $backdir/usr/include/netat /usr/include/netat -cp -rf $backdir/usr/include/netinet /usr/include/netinet -cp -rf $backdir/usr/include/netinet6 /usr/include/netinet6 -cp -rf $backdir/usr/include/netkey /usr/include/netkey -cp -rf $backdir/usr/include/nfs /usr/include/nfs -cp -rf $backdir/usr/include/pexpert /usr/include/pexpert -cp -rf $backdir/usr/include/ppc /usr/include/ppc -cp -rf $backdir/usr/include/profile /usr/include/profile -cp -rf $backdir/usr/include/sys 
/usr/include/sys -cp -rf $backdir/usr/include/ufs /usr/include/ufs -cp -rf $backdir/usr/include/vfs /usr/include/vfs -echo "recover orignal system libraries......" -cp -f $backdir/usr/lib/libIOKit.A.dylib /usr/lib/libIOKit.A.dylib -cp -f $backdir/usr/lib/libIOKit.dylib /usr/lib/libIOKit.dylib -cp -f $backdir/usr/lib/libkmod.a /usr/lib/libkmod.a -cp -f $backdir/usr/lib/libkmodc++.a /usr/lib/libkmodc++.a -if [ -f $backdir/private/etc/sysctl.conf ]; then - cp -f $backdir/private/etc/sysctl.conf /private/etc/sysctl.conf -else - rm -f /private/etc/sysctl.conf -fi - -# -# cp $backpath/lustre-current /tmp/lustre-v -# count=`grep "SystemBackup" /tmp/lustre-v|wc -l` -# count=`expr $count - 1` -# if [ $count -gt 0 ]; then -# head -$count /tmp/lustre-v > $backpath/lustre-current -# else -# # The file should be empty now, it's the last uninstall -# rm -f $backpath/lustre-current -# fi -# rm -f /tmp/lustre-v - -# remove unused files -echo "uninstall lustre modules and utilities......" -rm -rf $backdir -rm -f /sbin/lctl -rm -f /sbin/mount_lustre -rm -f /sbin/unload_lustre -rm -rf /System/Library/Extensions/libcfs.kext -rm -rf /System/Library/Extensions/portals.kext -rm -rf /System/Library/Extensions/ksocknal.kext -rm -rf /System/Library/Extensions/lvfs.kext -rm -rf /System/Library/Extensions/obdclass.kext -rm -rf /System/Library/Extensions/ptlrpc.kext -rm -rf /System/Library/Extensions/ptlrpcs.kext -rm -rf /System/Library/Extensions/osc.kext -rm -rf /System/Library/Extensions/lov.kext -rm -rf /System/Library/Extensions/obdecho.kext -rm -rf /System/Library/Extensions/mdc.kext -rm -rf /System/Library/Extensions/llite.kext - -# disable kernel cache by touch /System/Library -touch /System/Library -echo "Uninstall Lustre successfully, please reboot your computer!" 
diff --git a/build/osxpack/unload_lustre b/build/osxpack/unload_lustre deleted file mode 100755 index 54404c1..0000000 --- a/build/osxpack/unload_lustre +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/sh - -mount | grep lustre > /dev/null -if [ $? -eq 0 ]; then - echo "Please unmount lustre before unload modules." - exit 1 -fi -kextunload /System/Library/Extensions/llite.kext -kextunload /System/Library/Extensions/mdc.kext -kextunload /System/Library/Extensions/lov.kext -kextunload /System/Library/Extensions/osc.kext -kextunload /System/Library/Extensions/ptlrpc.kext -kextunload /System/Library/Extensions/ptlrpcs.kext -kextunload /System/Library/Extensions/obdclass.kext -kextunload /System/Library/Extensions/lvfs.kext -kextunload /System/Library/Extensions/ksocknal.kext -kextunload /System/Library/Extensions/portals.kext -kextunload /System/Library/Extensions/libcfs.kext diff --git a/build/replace2.sh b/build/replace2.sh deleted file mode 100755 index e0f7b2b..0000000 --- a/build/replace2.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -e - -CVS=cvs - -if [ ! -f .mergeinfo ] ; then - echo ".mergeinfo doesn't exist - exit" - exit -fi - -. .mergeinfo - -if [ "$OPERATION" != "Replace" ] ; then - echo "OPERATION must be Replace - is $OPERATION" - echo "You should probably be running ${OPERATION}2.sh" - exit -fi - -if [ -f "$CONFLICTS" ] ; then - echo "$CONFLICTS exists - clean up first" - cat $CONFLICTS - exit -fi - -cvs update $dir 2>&1 | grep "^M" && echo "uncommitted changes" && exit 1 - -# Tag parent -echo -n "Tagging as ${CHILD}_REPLACED_${PARENT}_$date ..." -$CVS rtag -r $parent ${CHILD}_REPLACED_${PARENT}_$date $module -echo "done" -# In case someone tries to re-land later -echo -n "Tagging as ${CHILD}_BASE ..." 
-$CVS rtag -F -r $parent ${CHILD}_BASE $module - -echo "saving .mergeinfo as .mergeinfo-$date" -mv .mergeinfo .mergeinfo-$date -echo "done" diff --git a/build/sles8-post.sh b/build/sles8-post.sh deleted file mode 100644 index d8a2e6b..0000000 --- a/build/sles8-post.sh +++ /dev/null @@ -1,49 +0,0 @@ -# Replace fake symlinks with the real ones -relink vmlinuz-%ver_str /boot/vmlinuz -relink initrd-%ver_str /boot/initrd - -if [ -e /etc/sysconfig/kernel ]; then - update_rcfile_setting /etc/sysconfig/kernel INITRD_MODULES 2>&1 -elif [ -e /etc/rc.config ]; then - update_rcfile_setting /etc/rc.config INITRD_MODULES 2>&1 -fi - -# If any trigger scripts have created additional modules, we need to -# run depmod. -run_depmod= -if [ -x /sbin/depmod ]; then - for module in $(find /lib/modules/%ver_str \ - /lib/modules/%{version}-override-%{cfg_name} \ - -type f) ; do - if [ $module -nt /lib/modules/%ver_str/modules.dep ]; then - run_depmod=1 - break - fi - done -fi -if [ -n "$run_depmod" ]; then - depmod -ae %ver_str -fi - -if [ -f /etc/fstab -a -x /sbin/mkinitrd ]; then - cd /boot && \ - /sbin/mkinitrd -k "vmlinuz-%ver_str" -i "initrd-%ver_str" -elif [ -f /etc/fstab -a -x /sbin/mk_initrd ]; then - cd /boot && \ - /sbin/mk_initrd -k "vmlinuz-%ver_str" -i "initrd-%ver_str" -else - echo "please run mkinitrd as soon as your system is complete" -fi - -# Only call new-kernel-pkg during package updates: Otherwise we might -# call this during an initial installation, with a half-initialized -# boot loader. ($1 = number of instances of this package currently -# installed.) -if [ "$1" -gt 1 ]; then - # Notify boot loader that a new kernel image has been installed. 
- if [ -x /sbin/new-kernel-pkg ]; then - /sbin/new-kernel-pkg %ver_str - elif [ -e /etc/lilo.conf -a -x /sbin/lilo ]; then - /sbin/lilo - fi -fi diff --git a/build/sles8-postun.sh b/build/sles8-postun.sh deleted file mode 100644 index fe1ded5..0000000 --- a/build/sles8-postun.sh +++ /dev/null @@ -1,22 +0,0 @@ -rm -f /boot/initrd-%ver_str # created in %post -- clean up. - -if [ "$(readlink /boot/vmlinuz)" = "vmlinuz-%ver_str" -o \ - "$(readlink /boot/initrd)" = "initrd-%ver_str" ]; then - # This may be the last kernel RPM on the system, or it may - # be an update. In both of those cases the symlinks will - # eventually be correct. On the other hand, if this kernel - # is removed and other kernel rpms remain installed, - # find the most recent of the remaining kernels, and make - # the symlinks point to it. This makes sure that the boot - # manager will always have a kernel to boot in its default - # configuration. - for vmlinuz in $(cd /boot ; ls -dt vmlinuz-*); do - version="${vmlinuz#vmlinuz-}" - initrd="initrd-$version" - if [ -f "/boot/$vmlinuz" -a -f "/boot/$initrd" ]; then - relink "$vmlinuz" /boot/vmlinuz - relink "$initrd" /boot/initrd - break - fi - done -fi diff --git a/build/sles8-pre.sh b/build/sles8-pre.sh deleted file mode 100644 index a542caf..0000000 --- a/build/sles8-pre.sh +++ /dev/null @@ -1,2 +0,0 @@ -rm -f /boot/vmlinuz.suse -rm -f /boot/initrd.suse diff --git a/build/sles8-update_INITRD_MODULES.sh b/build/sles8-update_INITRD_MODULES.sh deleted file mode 100644 index 38d5e3d..0000000 --- a/build/sles8-update_INITRD_MODULES.sh +++ /dev/null @@ -1,56 +0,0 @@ -# Check if $1 is equal to any argument in $1 .. $*. -# -contains() { - local x=$1 - shift - - case " $@ " in - *" $x "*) return 0 ;; - *) return 1 ;; - esac -} - -# Check the old value of INITRD_MODULES: -# - Remove modules that no longer exist. -# - Add modules that were built into the kernel before. -# -update_INITRD_MODULES() { - # MD_MODS is the list of modules that require md.o. 
- local MD_MODS="linear multipath raid0 raid1 raid5" - - # NON_SCSI is a whitelist of modules that are no scsi drivers. Any - # module not listed here is assumed to be a scsi driver, and the - # low-level scsi modules are added to INITRD_MODULES. - local NON_SCSI="jbd ext3 jfs xfs reiserfs $MD_MODS md" - - local result maybe_scsi need_md have_md have_scsi have_sd m - for m in "$@" ; do - m="${m%.o}" ; m="${m%.ko}" - - contains "$m" $NON_SCSI || maybe_scsi=1 - contains "$m" $MD_MODS && need_md=1 - [ "$m" == md ] && have_md=1 - if contains "$m" scsi_mod sd_mod ; then - eval have_${m%_mod}=1 - continue - fi - if contains "$m" xfs_dmapi xfs_support ; then - echo "Module $m no longer exists, and was removed from" \ - "INITRD_MODULES." >&2 - continue - fi - - result[${#result[@]}]="$m" - done - if [ -n "$maybe_scsi" -o -n "$have_scsi" -o -n "$have_sd" ]; then - [ -z "$have_scsi" -o -z "$have_sd" ] \ - && echo "Adding SCSI disk modules to INITRD_MODULES" >&2 - result=(scsi_mod sd_mod ${result[@]}) - fi - if [ -n "$need_md" -a -z "$have_md" ]; then - echo "Adding RAID support module to INITRD_MODULES" >&2 - result=(md ${result[@]}) - fi - - echo ${result[@]} -} diff --git a/build/sles8-update_rcfile_setting.sh b/build/sles8-update_rcfile_setting.sh deleted file mode 100644 index 6165cc7..0000000 --- a/build/sles8-update_rcfile_setting.sh +++ /dev/null @@ -1,35 +0,0 @@ -# Update the variable $var in $rcfile: The function update_$VAR must -# exist. It is called with the old value of $var, and must return the -# new value. -# -update_rcfile_setting() { - local rcfile=$1 var=$2 - - # The characters $, `, ", and \ have special meaning inside double - # quoted shell variables. The characters " and \ have special meaning - # inside awk double-quoted variables. 
- - local old=$(source "$rcfile" ; - eval echo \$$var \ - | sed -e 's/\([$`"\\]\)/\\\1/g') - local new=$(eval update_$var "$old" \ - | sed -e 's/\([$`"\\]\)/\\\1/g' \ - -e 's/\(["\\]\)/\\\1/g') - local tmp=$(mktemp /tmp/${rcfile##/*}.XXXXXX) - - # This script breaks for multi-line varables -- I don't think - # we need to handle this special case. - awk ' - function replace() { - if (!done) - print "'"$var"'=\"'"$new"'\"" - done=1 - } - - /^'"$var"'=/ { replace() ; next } - { print } - ' < $rcfile > $tmp && - cat $tmp > $rcfile - - rm -f $tmp -} diff --git a/build/suse-functions.sh b/build/suse-functions.sh deleted file mode 100644 index a7e421d..0000000 --- a/build/suse-functions.sh +++ /dev/null @@ -1,22 +0,0 @@ -# Readlink is not present on some older distributions: emulate it. -readlink() { - local path=$1 ll - - if [ -L "$path" ]; then - ll="$(LC_ALL=C ls -l "$path" 2> /dev/null)" && - echo "${ll/* -> }" - else - return 1 - fi -} -relink() { - if [ -h "$2" ]; then - local old=$(readlink "$2") - [ "$old" = "$1" ] && return 0 - echo "Changing symlink $2 from $old to $1" - elif [ -e "$2" ]; then - echo "Replacing file $2 with symlink to $1" - fi - rm -f "$2" \ - && ln -s "$1" "$2" -} diff --git a/build/suse-trigger-script.sh.in b/build/suse-trigger-script.sh.in deleted file mode 100644 index 0ead9e8..0000000 --- a/build/suse-trigger-script.sh.in +++ /dev/null @@ -1,9 +0,0 @@ -old_shopt=$(shopt -p nullglob || :) -shopt -s nullglob -for script in /lib/modules/scripts/* ; do - if [ -f "$script" -a -x "$script" ] \ - && ! "$script" --@when@ %ver_str $1 ; then - echo "$script failed." 
- fi -done -eval $old_shopt diff --git a/build/update_oldconfig b/build/update_oldconfig deleted file mode 100755 index 6d4d870..0000000 --- a/build/update_oldconfig +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/expect -f - -#enables some diagnostic output -exp_internal 1 - -# log all interaction to a file to diagnose failures -log_file -a [lindex $argv 0] - -# and not stddout -log_user 0 - -set spawnid [spawn make oldconfig] - -#match_max 200 - -# need to allow for the config tool to be built initially -set timeout 30 - -expect { - timeout { - puts "timeout in update_oldconfig waiting for a prompt we recognize" - exit 1 - } - -re "\n *(\[^\n]* \\\[N\/y\/\\?] \\(NEW\\)) " { - puts "$expect_out(1,string) n" - send "n\r" - exp_continue - } - -re "\n *(\[^\n]* \\\[N\/m\/y\/\\?] \\(NEW\\)) " { - puts "$expect_out(1,string) m" - send "m\r" - exp_continue - } - -re "\n *(\[^\n]* \\\[N\/y\/m\/\\?] \\(NEW\\)) " { - puts "$expect_out(1,string) m" - send "m\r" - exp_continue - } - -re "\n *(\[^\n]* \\\[N\/m\/\\?] \\(NEW\\)) " { - puts "$expect_out(1,string) m" - send "m\r" - exp_continue - } - -re "\n *(\[^\n]* \\\[Y\/n\/\\?] \\(NEW\\)) " { - puts "$expect_out(1,string) y" - send "y\r" - exp_continue - } - -re "\n *(\[^\n]* \\\[\[0-9]*] \\(NEW\\)) " { - puts "$expect_out(1,string) " - send "\r" - exp_continue - } - -re "\n *(\[^\n]* \\\[M\/n\/\\?] \\(NEW\\)) " { - puts "$expect_out(1,string) m" - send "m\r" - exp_continue - } - -re "\n *(\[^\n]* \\\[M\/n\/y\/\\?] \\(NEW\\)) " { - puts "$expect_out(1,string) m" - send "m\r" - exp_continue - } - -re "\n *(\[^\n]* \\\[Y\/n\/m\/\\?] \\(NEW\\)) " { - puts "$expect_out(1,string) m" - send "m\r" - exp_continue - } - -re "\n *(\[^\n]* \\\[Y\/\\?] 
\\(NEW\\)) " { - puts "$expect_out(1,string) y" - send "y\r" - exp_continue - } -} diff --git a/ldiskfs/kernel_patches/patches/export-ext3-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/export-ext3-2.6-rhel4.patch deleted file mode 100644 index 9f443b4..0000000 --- a/ldiskfs/kernel_patches/patches/export-ext3-2.6-rhel4.patch +++ /dev/null @@ -1,35 +0,0 @@ -Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/super.c -=================================================================== ---- linux-2.6.9-42.0.10.EL_lustre.1.4.10.orig/fs/ext3/super.c 2007-05-16 08:46:24.000000000 +0200 -+++ linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/super.c 2007-05-16 08:48:58.000000000 +0200 -@@ -123,6 +123,8 @@ void ext3_journal_abort_handle(const cha - journal_abort_handle(handle); - } - -+EXPORT_SYMBOL(ext3_journal_abort_handle); -+ - /* Deal with the reporting of failure conditions on a filesystem such as - * inconsistencies detected or read IO failures. - * -@@ -2064,6 +2066,8 @@ int ext3_force_commit(struct super_block - return ret; - } - -+EXPORT_SYMBOL(ext3_force_commit); -+ - /* - * Ext3 always journals updates to the superblock itself, so we don't - * have to propagate any other updates to the superblock on disk at this -@@ -2586,6 +2590,12 @@ int ext3_map_inode_page(struct inode *in - unsigned long *blocks, int *created, int create); - EXPORT_SYMBOL(ext3_map_inode_page); - -+EXPORT_SYMBOL(ext3_xattr_get); -+EXPORT_SYMBOL(ext3_xattr_set_handle); -+EXPORT_SYMBOL(ext3_bread); -+EXPORT_SYMBOL(ext3_journal_start_sb); -+EXPORT_SYMBOL(__ext3_journal_stop); -+ - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); - MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); - MODULE_LICENSE("GPL"); diff --git a/ldiskfs/kernel_patches/patches/export-ext3-2.6-suse.patch b/ldiskfs/kernel_patches/patches/export-ext3-2.6-suse.patch deleted file mode 100644 index c10431b..0000000 --- 
a/ldiskfs/kernel_patches/patches/export-ext3-2.6-suse.patch +++ /dev/null @@ -1,35 +0,0 @@ -Index: linux-2.6.5-7.283_lustre-1.4.10.1/fs/ext3/super.c -=================================================================== ---- linux-2.6.5-7.283_lustre-1.4.10.1.orig/fs/ext3/super.c 2007-05-30 08:48:29.000000000 +0200 -+++ linux-2.6.5-7.283_lustre-1.4.10.1/fs/ext3/super.c 2007-05-30 08:48:37.000000000 +0200 -@@ -116,6 +116,8 @@ void ext3_journal_abort_handle(const cha - handle->h_err = err; - } - -+EXPORT_SYMBOL(ext3_journal_abort_handle); -+ - static char error_buf[1024]; - - /* Deal with the reporting of failure conditions on a filesystem such as -@@ -1895,6 +1897,8 @@ int ext3_force_commit(struct super_block - return ret; - } - -+EXPORT_SYMBOL(ext3_force_commit); -+ - /* - * Ext3 always journals updates to the superblock itself, so we don't - * have to propagate any other updates to the superblock on disk at this -@@ -2334,6 +2338,12 @@ int ext3_map_inode_page(struct inode *in - unsigned long *blocks, int *created, int create); - EXPORT_SYMBOL(ext3_map_inode_page); - -+EXPORT_SYMBOL(ext3_xattr_get); -+EXPORT_SYMBOL(ext3_xattr_set_handle); -+EXPORT_SYMBOL(ext3_bread); -+EXPORT_SYMBOL(ext3_journal_start); -+EXPORT_SYMBOL(__ext3_journal_stop); -+ - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); - MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); - MODULE_LICENSE("GPL"); diff --git a/ldiskfs/kernel_patches/patches/export_symbols-ext3-2.6-suse.patch b/ldiskfs/kernel_patches/patches/export_symbols-ext3-2.6-suse.patch deleted file mode 100644 index 294a9cd..0000000 --- a/ldiskfs/kernel_patches/patches/export_symbols-ext3-2.6-suse.patch +++ /dev/null @@ -1,17 +0,0 @@ -Index: linux-2.6.5-sles9/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs_sb.h 2004-11-03 08:36:51.000000000 +0300 -+++ 
linux-2.6.5-sles9/include/linux/ext3_fs_sb.h 2004-11-09 02:20:51.598024096 +0300 -@@ -19,9 +19,12 @@ - #ifdef __KERNEL__ - #include - #include -+#ifndef EXT_INCLUDE -+#define EXT_INCLUDE - #include - #include - #endif -+#endif - #include - - /* diff --git a/ldiskfs/kernel_patches/patches/ext3-check-jbd-errors-2.6.5.patch b/ldiskfs/kernel_patches/patches/ext3-check-jbd-errors-2.6.5.patch deleted file mode 100644 index e54774f..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-check-jbd-errors-2.6.5.patch +++ /dev/null @@ -1,113 +0,0 @@ -Index: linux-2.6.5-7.201-full/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.5-7.201-full.orig/include/linux/ext3_fs.h 2006-08-09 17:59:34.000000000 +0400 -+++ linux-2.6.5-7.201-full/include/linux/ext3_fs.h 2006-08-22 12:35:55.000000000 +0400 -@@ -793,6 +793,7 @@ extern void ext3_put_super (struct super - extern void ext3_write_super (struct super_block *); - extern void ext3_write_super_lockfs (struct super_block *); - extern void ext3_unlockfs (struct super_block *); -+extern void ext3_commit_super (struct super_block *, struct ext3_super_block *, int); - extern int ext3_remount (struct super_block *, int *, char *); - extern int ext3_statfs (struct super_block *, struct kstatfs *); - -Index: linux-2.6.5-7.201-full/fs/ext3/super.c -=================================================================== ---- linux-2.6.5-7.201-full.orig/fs/ext3/super.c 2006-08-09 17:59:37.000000000 +0400 -+++ linux-2.6.5-7.201-full/fs/ext3/super.c 2006-08-09 17:59:37.000000000 +0400 -@@ -39,7 +39,7 @@ - static int ext3_load_journal(struct super_block *, struct ext3_super_block *); - static int ext3_create_journal(struct super_block *, struct ext3_super_block *, - int); --static void ext3_commit_super (struct super_block * sb, -+void ext3_commit_super (struct super_block * sb, - struct ext3_super_block * es, - int sync); - static void ext3_mark_recovery_complete(struct super_block * sb, -@@ 
-1781,7 +1781,7 @@ static int ext3_create_journal(struct su - return 0; - } - --static void ext3_commit_super (struct super_block * sb, -+void ext3_commit_super (struct super_block * sb, - struct ext3_super_block * es, - int sync) - { -Index: linux-2.6.5-7.201-full/fs/ext3/namei.c -=================================================================== ---- linux-2.6.5-7.201-full.orig/fs/ext3/namei.c 2006-08-09 17:59:37.000000000 +0400 -+++ linux-2.6.5-7.201-full/fs/ext3/namei.c 2006-08-09 17:59:37.000000000 +0400 -@@ -1598,7 +1598,7 @@ static int ext3_delete_entry (handle_t * - struct buffer_head * bh) - { - struct ext3_dir_entry_2 * de, * pde; -- int i; -+ int i, err; - - i = 0; - pde = NULL; -@@ -1608,7 +1608,9 @@ static int ext3_delete_entry (handle_t * - return -EIO; - if (de == de_del) { - BUFFER_TRACE(bh, "get_write_access"); -- ext3_journal_get_write_access(handle, bh); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ return err; - if (pde) - pde->rec_len = - cpu_to_le16(le16_to_cpu(pde->rec_len) + -Index: linux-2.6.5-7.201-full/fs/ext3/xattr.c -=================================================================== ---- linux-2.6.5-7.201-full.orig/fs/ext3/xattr.c 2006-07-14 01:53:23.000000000 +0400 -+++ linux-2.6.5-7.201-full/fs/ext3/xattr.c 2006-08-09 17:59:37.000000000 +0400 -@@ -107,7 +107,7 @@ ext3_xattr_register(int name_index, stru - { - int error = -EINVAL; - -- if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { -+ if (name_index > 0 && name_index < EXT3_XATTR_INDEX_MAX) { - write_lock(&ext3_handler_lock); - if (!ext3_xattr_handlers[name_index-1]) { - ext3_xattr_handlers[name_index-1] = handler; -Index: linux-2.6.5-7.201-full/fs/ext3/inode.c -=================================================================== ---- linux-2.6.5-7.201-full.orig/fs/ext3/inode.c 2006-07-14 01:53:22.000000000 +0400 -+++ linux-2.6.5-7.201-full/fs/ext3/inode.c 2006-08-22 12:35:28.000000000 +0400 -@@ -1517,9 +1517,14 @@ out_stop: - if (end > inode->i_size) 
{ - ei->i_disksize = end; - i_size_write(inode, end); -- err = ext3_mark_inode_dirty(handle, inode); -- if (!ret) -- ret = err; -+ /* -+ * We're going to return a positive `ret' -+ * here due to non-zero-length I/O, so there's -+ * no way of reporting error returns from -+ * ext3_mark_inode_dirty() to userspace. So -+ * ignore it. -+ */ -+ ext3_mark_inode_dirty(handle, inode); - } - } - err = ext3_journal_stop(handle); -@@ -1811,8 +1816,18 @@ ext3_clear_blocks(handle_t *handle, stru - ext3_mark_inode_dirty(handle, inode); - ext3_journal_test_restart(handle, inode); - if (bh) { -+ int err; - BUFFER_TRACE(bh, "retaking write access"); -- ext3_journal_get_write_access(handle, bh); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) { -+ struct super_block *sb = inode->i_sb; -+ struct ext3_super_block *es = EXT3_SB(sb)->s_es; -+ printk (KERN_CRIT"EXT3-fs: can't continue truncate\n"); -+ EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; -+ es->s_state |= cpu_to_le16(EXT3_ERROR_FS); -+ ext3_commit_super(sb, es, 1); -+ return; -+ } - } - } - diff --git a/ldiskfs/kernel_patches/patches/ext3-check-jbd-errors-2.6.9.patch b/ldiskfs/kernel_patches/patches/ext3-check-jbd-errors-2.6.9.patch deleted file mode 100644 index f6904f2..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-check-jbd-errors-2.6.9.patch +++ /dev/null @@ -1,113 +0,0 @@ -Index: linux-2.6.9-full/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2006-08-09 17:56:39.000000000 +0400 -+++ linux-2.6.9-full/include/linux/ext3_fs.h 2006-08-22 12:36:22.000000000 +0400 -@@ -826,6 +826,7 @@ extern void ext3_put_super (struct super - extern void ext3_write_super (struct super_block *); - extern void ext3_write_super_lockfs (struct super_block *); - extern void ext3_unlockfs (struct super_block *); -+extern void ext3_commit_super (struct super_block *, struct ext3_super_block *, int); - extern int ext3_remount (struct 
super_block *, int *, char *); - extern int ext3_statfs (struct super_block *, struct kstatfs *); - -Index: linux-2.6.9-full/fs/ext3/super.c -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/super.c 2006-08-09 17:56:40.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/super.c 2006-08-09 17:56:40.000000000 +0400 -@@ -43,7 +43,7 @@ static int ext3_load_journal(struct supe - unsigned long journal_devnum); - static int ext3_create_journal(struct super_block *, struct ext3_super_block *, - int); --static void ext3_commit_super (struct super_block * sb, -+void ext3_commit_super (struct super_block * sb, - struct ext3_super_block * es, - int sync); - static void ext3_mark_recovery_complete(struct super_block * sb, -@@ -1991,7 +1991,7 @@ static int ext3_create_journal(struct su - return 0; - } - --static void ext3_commit_super (struct super_block * sb, -+void ext3_commit_super (struct super_block * sb, - struct ext3_super_block * es, - int sync) - { -Index: linux-2.6.9-full/fs/ext3/namei.c -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/namei.c 2006-08-09 17:56:40.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/namei.c 2006-08-09 17:56:40.000000000 +0400 -@@ -1599,7 +1599,7 @@ static int ext3_delete_entry (handle_t * - struct buffer_head * bh) - { - struct ext3_dir_entry_2 * de, * pde; -- int i; -+ int i, err; - - i = 0; - pde = NULL; -@@ -1609,7 +1609,9 @@ static int ext3_delete_entry (handle_t * - return -EIO; - if (de == de_del) { - BUFFER_TRACE(bh, "get_write_access"); -- ext3_journal_get_write_access(handle, bh); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) -+ return err; - if (pde) - pde->rec_len = - cpu_to_le16(le16_to_cpu(pde->rec_len) + -Index: linux-2.6.9-full/fs/ext3/xattr.c -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/xattr.c 2006-06-01 14:58:48.000000000 +0400 -+++ 
linux-2.6.9-full/fs/ext3/xattr.c 2006-08-09 17:56:40.000000000 +0400 -@@ -132,7 +132,7 @@ ext3_xattr_handler(int name_index) - { - struct xattr_handler *handler = NULL; - -- if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) -+ if (name_index > 0 && name_index < EXT3_XATTR_INDEX_MAX) - handler = ext3_xattr_handler_map[name_index]; - return handler; - } -Index: linux-2.6.9-full/fs/ext3/inode.c -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/inode.c 2006-06-02 23:37:38.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/inode.c 2006-08-22 12:34:28.000000000 +0400 -@@ -1513,9 +1513,14 @@ out_stop: - if (end > inode->i_size) { - ei->i_disksize = end; - i_size_write(inode, end); -- err = ext3_mark_inode_dirty(handle, inode); -- if (!ret) -- ret = err; -+ /* -+ * We're going to return a positive `ret' -+ * here due to non-zero-length I/O, so there's -+ * no way of reporting error returns from -+ * ext3_mark_inode_dirty() to userspace. So -+ * ignore it. 
-+ */ -+ ext3_mark_inode_dirty(handle, inode); - } - } - err = ext3_journal_stop(handle); -@@ -1807,8 +1812,18 @@ ext3_clear_blocks(handle_t *handle, stru - ext3_mark_inode_dirty(handle, inode); - ext3_journal_test_restart(handle, inode); - if (bh) { -+ int err; - BUFFER_TRACE(bh, "retaking write access"); -- ext3_journal_get_write_access(handle, bh); -+ err = ext3_journal_get_write_access(handle, bh); -+ if (err) { -+ struct super_block *sb = inode->i_sb; -+ struct ext3_super_block *es = EXT3_SB(sb)->s_es; -+ printk (KERN_CRIT"EXT3-fs: can't continue truncate\n"); -+ EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; -+ es->s_state |= cpu_to_le16(EXT3_ERROR_FS); -+ ext3_commit_super(sb, es, 1); -+ return; -+ } - } - } - diff --git a/ldiskfs/kernel_patches/patches/ext3-disable-write-bar-by-default-2.6-sles10.patch b/ldiskfs/kernel_patches/patches/ext3-disable-write-bar-by-default-2.6-sles10.patch deleted file mode 100644 index 9b8d331..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-disable-write-bar-by-default-2.6-sles10.patch +++ /dev/null @@ -1,15 +0,0 @@ ---- - fs/ext3/super.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - ---- linux-2.6.16.21-0.8.orig/fs/ext3/super.c -+++ linux-2.6.16.21-0.8/fs/ext3/super.c -@@ -1425,7 +1425,7 @@ static int ext3_fill_super (struct super - sbi->s_resgid = le16_to_cpu(es->s_def_resgid); - - /* enable barriers by default */ -- set_opt(sbi->s_mount_opt, BARRIER); -+ /* set_opt(sbi->s_mount_opt, BARRIER); */ - set_opt(sbi->s_mount_opt, RESERVATION); - - if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, diff --git a/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch deleted file mode 100644 index 89cc1b5..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch +++ /dev/null @@ -1,840 +0,0 @@ -Index: linux-stage/fs/ext3/ialloc.c -=================================================================== ---- 
linux-stage.orig/fs/ext3/ialloc.c 2005-10-04 16:53:24.000000000 -0600 -+++ linux-stage/fs/ext3/ialloc.c 2005-10-04 17:07:25.000000000 -0600 -@@ -629,6 +629,9 @@ - spin_unlock(&sbi->s_next_gen_lock); - - ei->i_state = EXT3_STATE_NEW; -+ ei->i_extra_isize = -+ (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? -+ sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; - - ret = inode; - if(DQUOT_ALLOC_INODE(inode)) { -Index: linux-stage/fs/ext3/inode.c -=================================================================== ---- linux-stage.orig/fs/ext3/inode.c 2005-10-04 17:00:22.000000000 -0600 -+++ linux-stage/fs/ext3/inode.c 2005-10-04 17:07:25.000000000 -0600 -@@ -2274,7 +2274,7 @@ - * trying to determine the inode's location on-disk and no read need be - * performed. - */ --static int ext3_get_inode_loc(struct inode *inode, -+int ext3_get_inode_loc(struct inode *inode, - struct ext3_iloc *iloc, int in_mem) - { - unsigned long block; -@@ -2484,6 +2484,11 @@ void ext3_read_inode(struct inode * inod - ei->i_data[block] = raw_inode->i_block[block]; - INIT_LIST_HEAD(&ei->i_orphan); - -+ if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) -+ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); -+ else -+ ei->i_extra_isize = 0; -+ - if (S_ISREG(inode->i_mode)) { - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; -@@ -2619,6 +2624,9 @@ static int ext3_do_update_inode(handle_t - } else for (block = 0; block < EXT3_N_BLOCKS; block++) - raw_inode->i_block[block] = ei->i_data[block]; - -+ if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) -+ raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); -+ - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - rc = ext3_journal_dirty_metadata(handle, bh); - if (!err) -@@ -2849,7 +2857,8 @@ ext3_reserve_inode_write(handle_t *handl - { - int err = 0; - if (handle) { -- err = ext3_get_inode_loc(inode, iloc, 1); -+ err = ext3_get_inode_loc(inode, iloc, 
EXT3_I(inode)->i_state & -+ EXT3_STATE_NEW); - if (!err) { - BUFFER_TRACE(iloc->bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, iloc->bh); -Index: linux-stage/fs/ext3/xattr.c -=================================================================== ---- linux-stage.orig/fs/ext3/xattr.c 2005-10-04 16:50:11.000000000 -0600 -+++ linux-stage/fs/ext3/xattr.c 2005-10-04 17:19:43.000000000 -0600 -@@ -149,17 +149,12 @@ - } - - /* -- * ext3_xattr_get() -- * -- * Copy an extended attribute into the buffer -- * provided, or compute the buffer size required. -- * Buffer is NULL to compute the size of the buffer required. -+ * ext3_xattr_block_get() - * -- * Returns a negative error number on failure, or the number of bytes -- * used / required on success. -+ * routine looks for attribute in EA block and returns it's value and size - */ - int --ext3_xattr_get(struct inode *inode, int name_index, const char *name, -+ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size) - { - struct buffer_head *bh = NULL; -@@ -173,7 +168,6 @@ - - if (name == NULL) - return -EINVAL; -- down_read(&EXT3_I(inode)->xattr_sem); - error = -ENODATA; - if (!EXT3_I(inode)->i_file_acl) - goto cleanup; -@@ -246,15 +240,87 @@ - - cleanup: - brelse(bh); -- up_read(&EXT3_I(inode)->xattr_sem); - - return error; - } - - /* -- * ext3_xattr_list() -+ * ext3_xattr_ibody_get() - * -- * Copy a list of attribute names into the buffer -+ * routine looks for attribute in inode body and returns it's value and size -+ */ -+int -+ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t buffer_size) -+{ -+ int size, name_len = strlen(name), storage_size; -+ struct ext3_xattr_entry *last; -+ struct ext3_inode *raw_inode; -+ struct ext3_iloc iloc; -+ char *start, *end; -+ int ret = -ENOENT; -+ -+ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) -+ return -ENOENT; -+ -+ ret = 
ext3_get_inode_loc(inode, &iloc, 0); -+ if (ret) -+ return ret; -+ raw_inode = ext3_raw_inode(&iloc); -+ -+ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - -+ EXT3_GOOD_OLD_INODE_SIZE - -+ EXT3_I(inode)->i_extra_isize - -+ sizeof(__u32); -+ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + -+ EXT3_I(inode)->i_extra_isize; -+ if (le32_to_cpu((*(__u32*) start)) != EXT3_XATTR_MAGIC) { -+ brelse(iloc.bh); -+ return -ENOENT; -+ } -+ start += sizeof(__u32); -+ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; -+ -+ last = (struct ext3_xattr_entry *) start; -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); -+ if (le32_to_cpu(last->e_value_size) > storage_size || -+ (char *) next >= end) { -+ ext3_error(inode->i_sb, "ext3_xattr_ibody_get", -+ "inode %ld", inode->i_ino); -+ brelse(iloc.bh); -+ return -EIO; -+ } -+ if (name_index == last->e_name_index && -+ name_len == last->e_name_len && -+ !memcmp(name, last->e_name, name_len)) -+ goto found; -+ last = next; -+ } -+ -+ /* can't find EA */ -+ brelse(iloc.bh); -+ return -ENOENT; -+ -+found: -+ size = le32_to_cpu(last->e_value_size); -+ if (buffer) { -+ ret = -ERANGE; -+ if (buffer_size >= size) { -+ memcpy(buffer, start + le16_to_cpu(last->e_value_offs), -+ size); -+ ret = size; -+ } -+ } else -+ ret = size; -+ brelse(iloc.bh); -+ return ret; -+} -+ -+/* -+ * ext3_xattr_get() -+ * -+ * Copy an extended attribute into the buffer - * provided, or compute the buffer size required. - * Buffer is NULL to compute the size of the buffer required. - * -@@ -262,7 +328,31 @@ - * used / required on success. 
- */ - int --ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) -+ext3_xattr_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t buffer_size) -+{ -+ int err; -+ -+ down_read(&EXT3_I(inode)->xattr_sem); -+ -+ /* try to find attribute in inode body */ -+ err = ext3_xattr_ibody_get(inode, name_index, name, -+ buffer, buffer_size); -+ if (err < 0) -+ /* search was unsuccessful, try to find EA in dedicated block */ -+ err = ext3_xattr_block_get(inode, name_index, name, -+ buffer, buffer_size); -+ up_read(&EXT3_I(inode)->xattr_sem); -+ -+ return err; -+} -+ -+/* ext3_xattr_ibody_list() -+ * -+ * generate list of attributes stored in EA block -+ */ -+int -+ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) - { - struct buffer_head *bh = NULL; - struct ext3_xattr_entry *entry; -@@ -273,7 +363,6 @@ - ea_idebug(inode, "buffer=%p, buffer_size=%ld", - buffer, (long)buffer_size); - -- down_read(&EXT3_I(inode)->xattr_sem); - error = 0; - if (!EXT3_I(inode)->i_file_acl) - goto cleanup; -@@ -330,11 +419,149 @@ - - cleanup: - brelse(bh); -- up_read(&EXT3_I(inode)->xattr_sem); - - return error; - } - -+/* ext3_xattr_ibody_list() -+ * -+ * generate list of attributes stored in inode body -+ */ -+int -+ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) -+{ -+ struct ext3_xattr_entry *last; -+ struct ext3_inode *raw_inode; -+ char *start, *end, *buf; -+ struct ext3_iloc iloc; -+ int storage_size; -+ size_t rest = buffer_size; -+ int ret; -+ int size = 0; -+ -+ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) -+ return 0; -+ -+ ret = ext3_get_inode_loc(inode, &iloc, 0); -+ if (ret) -+ return ret; -+ raw_inode = ext3_raw_inode(&iloc); -+ -+ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - -+ EXT3_GOOD_OLD_INODE_SIZE - -+ EXT3_I(inode)->i_extra_isize - -+ sizeof(__u32); -+ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + -+ EXT3_I(inode)->i_extra_isize; -+ if 
(le32_to_cpu((*(__u32*) start)) != EXT3_XATTR_MAGIC) { -+ brelse(iloc.bh); -+ return 0; -+ } -+ start += sizeof(__u32); -+ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; -+ -+ last = (struct ext3_xattr_entry *) start; -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); -+ struct xattr_handler *handler; -+ if (le32_to_cpu(last->e_value_size) > storage_size || -+ (char *) next >= end) { -+ ext3_error(inode->i_sb, "ext3_xattr_ibody_list", -+ "inode %ld", inode->i_ino); -+ brelse(iloc.bh); -+ return -EIO; -+ } -+ handler = ext3_xattr_handler(last->e_name_index); -+ if (handler) -+ size += handler->list(inode, NULL, 0, last->e_name, -+ last->e_name_len); -+ last = next; -+ } -+ -+ if (!buffer) { -+ ret = size; -+ goto cleanup; -+ } else { -+ ret = -ERANGE; -+ if (size > buffer_size) -+ goto cleanup; -+ } -+ -+ last = (struct ext3_xattr_entry *) start; -+ buf = buffer; -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); -+ struct xattr_handler *handler; -+ handler = ext3_xattr_handler(last->e_name_index); -+ if (handler) { -+ size_t size = handler->list(inode, buffer, rest, -+ last->e_name, -+ last->e_name_len); -+ if (buffer) { -+ if (size > rest) { -+ ret = -ERANGE; -+ goto cleanup; -+ } -+ buffer += size; -+ } -+ rest -= size; -+ } -+ last = next; -+ } -+ ret = size; -+cleanup: -+ brelse(iloc.bh); -+ return ret; -+} -+ -+/* -+ * ext3_xattr_list() -+ * -+ * Copy a list of attribute names into the buffer -+ * provided, or compute the buffer size required. -+ * Buffer is NULL to compute the size of the buffer required. -+ * -+ * Returns a negative error number on failure, or the number of bytes -+ * used / required on success. 
-+ */ -+int -+ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) -+{ -+ int error; -+ int size = buffer_size; -+ -+ down_read(&EXT3_I(inode)->xattr_sem); -+ -+ /* get list of attributes stored in inode body */ -+ error = ext3_xattr_ibody_list(inode, buffer, buffer_size); -+ if (error < 0) { -+ /* some error occured while collecting -+ * attributes in inode body */ -+ size = 0; -+ goto cleanup; -+ } -+ size = error; -+ -+ /* get list of attributes stored in dedicated block */ -+ if (buffer) { -+ buffer_size -= error; -+ if (buffer_size <= 0) { -+ buffer = NULL; -+ buffer_size = 0; -+ } else -+ buffer += error; -+ } -+ -+ error = ext3_xattr_block_list(inode, buffer, buffer_size); -+ if (error < 0) -+ /* listing was successful, so we return len */ -+ size = 0; -+ -+cleanup: -+ up_read(&EXT3_I(inode)->xattr_sem); -+ return error + size; -+} -+ - /* - * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is - * not set, set it. -@@ -356,6 +583,279 @@ - } - - /* -+ * ext3_xattr_ibody_find() -+ * -+ * search attribute and calculate free space in inode body -+ * NOTE: free space includes space our attribute hold -+ */ -+int -+ext3_xattr_ibody_find(struct inode *inode, int name_index, -+ const char *name, struct ext3_xattr_entry *rentry, int *free) -+{ -+ struct ext3_xattr_entry *last; -+ struct ext3_inode *raw_inode; -+ int name_len = strlen(name); -+ int err, storage_size; -+ struct ext3_iloc iloc; -+ char *start, *end; -+ int ret = -ENOENT; -+ -+ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) -+ return ret; -+ -+ err = ext3_get_inode_loc(inode, &iloc, 0); -+ if (err) -+ return -EIO; -+ raw_inode = ext3_raw_inode(&iloc); -+ -+ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - -+ EXT3_GOOD_OLD_INODE_SIZE - -+ EXT3_I(inode)->i_extra_isize - -+ sizeof(__u32); -+ *free = storage_size - sizeof(__u32); -+ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + -+ EXT3_I(inode)->i_extra_isize; -+ if (le32_to_cpu((*(__u32*) 
start)) != EXT3_XATTR_MAGIC) { -+ brelse(iloc.bh); -+ return -ENOENT; -+ } -+ start += sizeof(__u32); -+ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; -+ -+ last = (struct ext3_xattr_entry *) start; -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); -+ if (le32_to_cpu(last->e_value_size) > storage_size || -+ (char *) next >= end) { -+ ext3_error(inode->i_sb, "ext3_xattr_ibody_find", -+ "inode %ld", inode->i_ino); -+ brelse(iloc.bh); -+ return -EIO; -+ } -+ -+ if (name_index == last->e_name_index && -+ name_len == last->e_name_len && -+ !memcmp(name, last->e_name, name_len)) { -+ memcpy(rentry, last, sizeof(struct ext3_xattr_entry)); -+ ret = 0; -+ } else { -+ *free -= EXT3_XATTR_LEN(last->e_name_len); -+ *free -= le32_to_cpu(last->e_value_size); -+ } -+ last = next; -+ } -+ -+ brelse(iloc.bh); -+ return ret; -+} -+ -+/* -+ * ext3_xattr_block_find() -+ * -+ * search attribute and calculate free space in EA block (if it allocated) -+ * NOTE: free space includes space our attribute hold -+ */ -+int -+ext3_xattr_block_find(struct inode *inode, int name_index, const char *name, -+ struct ext3_xattr_entry *rentry, int *free) -+{ -+ struct buffer_head *bh = NULL; -+ struct ext3_xattr_entry *entry; -+ char *end; -+ int name_len, error = -ENOENT; -+ -+ if (!EXT3_I(inode)->i_file_acl) { -+ *free = inode->i_sb->s_blocksize - -+ sizeof(struct ext3_xattr_header) - -+ sizeof(__u32); -+ return -ENOENT; -+ } -+ ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl); -+ bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); -+ if (!bh) -+ return -EIO; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); -+ end = bh->b_data + bh->b_size; -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+bad_block: ext3_error(inode->i_sb, "ext3_xattr_get", -+ "inode %ld: bad block %d", inode->i_ino, -+ 
EXT3_I(inode)->i_file_acl); -+ brelse(bh); -+ return -EIO; -+ } -+ /* find named attribute */ -+ name_len = strlen(name); -+ *free = bh->b_size - sizeof(__u32); -+ -+ entry = FIRST_ENTRY(bh); -+ while (!IS_LAST_ENTRY(entry)) { -+ struct ext3_xattr_entry *next = -+ EXT3_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (name_index == entry->e_name_index && -+ name_len == entry->e_name_len && -+ memcmp(name, entry->e_name, name_len) == 0) { -+ memcpy(rentry, entry, sizeof(struct ext3_xattr_entry)); -+ error = 0; -+ } else { -+ *free -= EXT3_XATTR_LEN(entry->e_name_len); -+ *free -= le32_to_cpu(entry->e_value_size); -+ } -+ entry = next; -+ } -+ brelse(bh); -+ -+ return error; -+} -+ -+/* -+ * ext3_xattr_inode_set() -+ * -+ * this routine add/remove/replace attribute in inode body -+ */ -+int -+ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, int name_index, -+ const char *name, const void *value, size_t value_len, -+ int flags) -+{ -+ struct ext3_xattr_entry *last, *next, *here = NULL; -+ struct ext3_inode *raw_inode; -+ int name_len = strlen(name); -+ int esize = EXT3_XATTR_LEN(name_len); -+ struct buffer_head *bh; -+ int err, storage_size; -+ struct ext3_iloc iloc; -+ int free, min_offs; -+ char *start, *end; -+ -+ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) -+ return -ENOSPC; -+ -+ err = ext3_get_inode_loc(inode, &iloc, 0); -+ if (err) -+ return err; -+ raw_inode = ext3_raw_inode(&iloc); -+ bh = iloc.bh; -+ -+ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - -+ EXT3_GOOD_OLD_INODE_SIZE - -+ EXT3_I(inode)->i_extra_isize - -+ sizeof(__u32); -+ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + -+ EXT3_I(inode)->i_extra_isize; -+ if ((*(__u32*) start) != EXT3_XATTR_MAGIC) { -+ /* inode had no attributes before */ -+ *((__u32*) start) = cpu_to_le32(EXT3_XATTR_MAGIC); -+ } -+ start += sizeof(__u32); -+ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; -+ min_offs = storage_size; -+ free = 
storage_size - sizeof(__u32); -+ -+ last = (struct ext3_xattr_entry *) start; -+ while (!IS_LAST_ENTRY(last)) { -+ next = EXT3_XATTR_NEXT(last); -+ if (le32_to_cpu(last->e_value_size) > storage_size || -+ (char *) next >= end) { -+ ext3_error(inode->i_sb, "ext3_xattr_ibody_set", -+ "inode %ld", inode->i_ino); -+ brelse(bh); -+ return -EIO; -+ } -+ -+ if (last->e_value_size) { -+ int offs = le16_to_cpu(last->e_value_offs); -+ if (offs < min_offs) -+ min_offs = offs; -+ } -+ if (name_index == last->e_name_index && -+ name_len == last->e_name_len && -+ !memcmp(name, last->e_name, name_len)) -+ here = last; -+ else { -+ /* we calculate all but our attribute -+ * because it will be removed before changing */ -+ free -= EXT3_XATTR_LEN(last->e_name_len); -+ free -= le32_to_cpu(last->e_value_size); -+ } -+ last = next; -+ } -+ -+ if (value && (esize + value_len > free)) { -+ brelse(bh); -+ return -ENOSPC; -+ } -+ -+ err = ext3_reserve_inode_write(handle, inode, &iloc); -+ if (err) { -+ brelse(bh); -+ return err; -+ } -+ -+ if (here) { -+ /* time to remove old value */ -+ struct ext3_xattr_entry *e; -+ int size = le32_to_cpu(here->e_value_size); -+ int border = le16_to_cpu(here->e_value_offs); -+ char *src; -+ -+ /* move tail */ -+ memmove(start + min_offs + size, start + min_offs, -+ border - min_offs); -+ -+ /* recalculate offsets */ -+ e = (struct ext3_xattr_entry *) start; -+ while (!IS_LAST_ENTRY(e)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(e); -+ int offs = le16_to_cpu(e->e_value_offs); -+ if (offs < border) -+ e->e_value_offs = -+ cpu_to_le16(offs + size); -+ e = next; -+ } -+ min_offs += size; -+ -+ /* remove entry */ -+ border = EXT3_XATTR_LEN(here->e_name_len); -+ src = (char *) here + EXT3_XATTR_LEN(here->e_name_len); -+ size = (char *) last - src; -+ if ((char *) here + size > end) -+ printk("ALERT at %s:%d: 0x%p + %d > 0x%p\n", -+ __FILE__, __LINE__, here, size, end); -+ memmove(here, src, size); -+ last = (struct ext3_xattr_entry *) ((char *) last 
- border); -+ *((__u32 *) last) = 0; -+ } -+ -+ if (value) { -+ int offs = min_offs - value_len; -+ /* use last to create new entry */ -+ last->e_name_len = strlen(name); -+ last->e_name_index = name_index; -+ last->e_value_offs = cpu_to_le16(offs); -+ last->e_value_size = cpu_to_le32(value_len); -+ last->e_hash = last->e_value_block = 0; -+ memset(last->e_name, 0, esize); -+ memcpy(last->e_name, name, last->e_name_len); -+ if (start + offs + value_len > end) -+ printk("ALERT at %s:%d: 0x%p + %d + %zd > 0x%p\n", -+ __FILE__, __LINE__, start, offs, -+ value_len, end); -+ memcpy(start + offs, value, value_len); -+ last = EXT3_XATTR_NEXT(last); -+ *((__u32 *) last) = 0; -+ } -+ -+ ext3_mark_iloc_dirty(handle, inode, &iloc); -+ brelse(bh); -+ -+ return 0; -+} -+ -+/* - * ext3_xattr_set_handle() - * - * Create, replace or remove an extended attribute for this inode. Buffer -@@ -369,6 +869,104 @@ - */ - int - ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, -+ const char *name, const void *value, size_t value_len, -+ int flags) -+{ -+ struct ext3_xattr_entry entry; -+ int err, where = 0, found = 0, total; -+ int free1 = -1, free2 = -1; -+ int name_len; -+ -+ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", -+ name_index, name, value, (long)value_len); -+ -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -+ return -EPERM; -+ if (value == NULL) -+ value_len = 0; -+ if (name == NULL) -+ return -EINVAL; -+ name_len = strlen(name); -+ if (name_len > 255 || value_len > inode->i_sb->s_blocksize) -+ return -ERANGE; -+ down_write(&EXT3_I(inode)->xattr_sem); -+ -+ /* try to find attribute in inode body */ -+ err = ext3_xattr_ibody_find(inode, name_index, name, &entry, &free1); -+ if (err == 0) { -+ /* found EA in inode */ -+ found = 1; -+ where = 0; -+ } else if (err == -ENOENT) { -+ /* there is no such attribute in inode body */ -+ /* try to find attribute in dedicated block */ -+ err = 
ext3_xattr_block_find(inode, name_index, name, -+ &entry, &free2); -+ if (err != 0 && err != -ENOENT) { -+ /* not found EA in block */ -+ goto finish; -+ } else if (err == 0) { -+ /* found EA in block */ -+ where = 1; -+ found = 1; -+ } -+ } else -+ goto finish; -+ -+ /* check flags: may replace? may create ? */ -+ if (found && (flags & XATTR_CREATE)) { -+ err = -EEXIST; -+ goto finish; -+ } else if (!found && (flags & XATTR_REPLACE)) { -+ err = -ENODATA; -+ goto finish; -+ } -+ -+ /* check if we have enough space to store attribute */ -+ total = EXT3_XATTR_LEN(strlen(name)) + value_len; -+ if (free1 >= 0 && total > free1 && free2 >= 0 && total > free2) { -+ /* have no enough space */ -+ err = -ENOSPC; -+ goto finish; -+ } -+ -+ /* time to remove attribute */ -+ if (found) { -+ if (where == 0) { -+ /* EA is stored in inode body */ -+ ext3_xattr_ibody_set(handle, inode, name_index, name, -+ NULL, 0, flags); -+ } else { -+ /* EA is stored in separated block */ -+ ext3_xattr_block_set(handle, inode, name_index, name, -+ NULL, 0, flags); -+ } -+ } -+ -+ /* try to store EA in inode body */ -+ err = ext3_xattr_ibody_set(handle, inode, name_index, name, -+ value, value_len, flags); -+ if (err) { -+ /* can't store EA in inode body */ -+ /* try to store in block */ -+ err = ext3_xattr_block_set(handle, inode, name_index, -+ name, value, value_len, flags); -+ } -+ -+finish: -+ up_write(&EXT3_I(inode)->xattr_sem); -+ return err; -+} -+ -+/* -+ * ext3_xattr_block_set() -+ * -+ * this routine add/remove/replace attribute in EA block -+ */ -+int -+ext3_xattr_block_set(handle_t *handle, struct inode *inode, int name_index, - const char *name, const void *value, size_t value_len, - int flags) - { -@@ -391,22 +989,7 @@ - * towards the end of the block). - * end -- Points right after the block pointed to by header. 
- */ -- -- ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", -- name_index, name, value, (long)value_len); -- -- if (IS_RDONLY(inode)) -- return -EROFS; -- if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -- return -EPERM; -- if (value == NULL) -- value_len = 0; -- if (name == NULL) -- return -EINVAL; - name_len = strlen(name); -- if (name_len > 255 || value_len > sb->s_blocksize) -- return -ERANGE; -- down_write(&EXT3_I(inode)->xattr_sem); - if (EXT3_I(inode)->i_file_acl) { - /* The inode already has an extended attribute block. */ - bh = sb_bread(sb, EXT3_I(inode)->i_file_acl); -@@ -638,7 +1221,6 @@ - brelse(bh); - if (!(bh && header == HDR(bh))) - kfree(header); -- up_write(&EXT3_I(inode)->xattr_sem); - - return error; - } -Index: linux-stage/fs/ext3/xattr.h -=================================================================== ---- linux-stage.orig/fs/ext3/xattr.h 2005-10-04 16:50:11.000000000 -0600 -+++ linux-stage/fs/ext3/xattr.h 2005-10-04 17:07:25.000000000 -0600 -@@ -67,7 +67,8 @@ - extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); - extern int ext3_xattr_list(struct inode *, char *, size_t); - extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int); --extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); -+extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *,const void *,size_t,int); -+extern int ext3_xattr_block_set(handle_t *, struct inode *, int, const char *,const void *,size_t,int); - - extern void ext3_xattr_delete_inode(handle_t *, struct inode *); - extern void ext3_xattr_put_super(struct super_block *); -Index: linux-stage/include/linux/ext3_fs.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs.h 2005-10-04 16:53:29.000000000 -0600 -+++ linux-stage/include/linux/ext3_fs.h 2005-10-04 17:07:25.000000000 -0600 -@@ -293,6 +293,8 @@ - __u32 
m_i_reserved2[2]; - } masix2; - } osd2; /* OS dependent 2 */ -+ __u16 i_extra_isize; -+ __u16 i_pad1; - }; - - #define i_size_high i_dir_acl -@@ -757,6 +759,7 @@ - extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); - extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); - extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); -+int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc, int in_mem); - - extern void ext3_read_inode (struct inode *); - extern int ext3_write_inode (struct inode *, int); -Index: linux-stage/include/linux/ext3_fs_i.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs_i.h 2005-10-04 16:50:11.000000000 -0600 -+++ linux-stage/include/linux/ext3_fs_i.h 2005-10-04 17:07:25.000000000 -0600 -@@ -113,6 +113,9 @@ - */ - loff_t i_disksize; - -+ /* on-disk additional length */ -+ __u16 i_extra_isize; -+ - /* - * truncate_sem is for serialising ext3_truncate() against - * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's diff --git a/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch deleted file mode 100644 index 72c25a4..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch +++ /dev/null @@ -1,840 +0,0 @@ -%patch -Index: linux-2.6.0/fs/ext3/ialloc.c -=================================================================== ---- linux-2.6.0.orig/fs/ext3/ialloc.c 2004-01-14 18:54:11.000000000 +0300 -+++ linux-2.6.0/fs/ext3/ialloc.c 2004-01-14 18:54:12.000000000 +0300 -@@ -627,6 +627,9 @@ - inode->i_generation = EXT3_SB(sb)->s_next_generation++; - - ei->i_state = EXT3_STATE_NEW; -+ ei->i_extra_isize = -+ (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? 
-+ sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; - - ret = inode; - if(DQUOT_ALLOC_INODE(inode)) { -Index: linux-2.6.0/fs/ext3/inode.c -=================================================================== ---- linux-2.6.0.orig/fs/ext3/inode.c 2004-01-14 18:54:12.000000000 +0300 -+++ linux-2.6.0/fs/ext3/inode.c 2004-01-14 19:09:46.000000000 +0300 -@@ -2339,7 +2339,7 @@ - * trying to determine the inode's location on-disk and no read need be - * performed. - */ --static int ext3_get_inode_loc(struct inode *inode, -+int ext3_get_inode_loc(struct inode *inode, - struct ext3_iloc *iloc, int in_mem) - { - unsigned long block; -@@ -2547,6 +2547,11 @@ - ei->i_data[block] = raw_inode->i_block[block]; - INIT_LIST_HEAD(&ei->i_orphan); - -+ if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) -+ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); -+ else -+ ei->i_extra_isize = 0; -+ - if (S_ISREG(inode->i_mode)) { - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; -@@ -2682,6 +2687,9 @@ - } else for (block = 0; block < EXT3_N_BLOCKS; block++) - raw_inode->i_block[block] = ei->i_data[block]; - -+ if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) -+ raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); -+ - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - rc = ext3_journal_dirty_metadata(handle, bh); - if (!err) -@@ -2849,7 +2857,8 @@ ext3_reserve_inode_write(handle_t *handl - { - int err = 0; - if (handle) { -- err = ext3_get_inode_loc(inode, iloc, 1); -+ err = ext3_get_inode_loc(inode, iloc, EXT3_I(inode)->i_state & -+ EXT3_STATE_NEW); - if (!err) { - BUFFER_TRACE(iloc->bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, iloc->bh); -Index: linux-2.6.0/fs/ext3/xattr.c -=================================================================== ---- linux-2.6.0.orig/fs/ext3/xattr.c 2003-12-30 08:33:13.000000000 +0300 -+++ linux-2.6.0/fs/ext3/xattr.c 2004-01-14 18:54:12.000000000 +0300 
-@@ -246,17 +246,12 @@ - } - - /* -- * ext3_xattr_get() -- * -- * Copy an extended attribute into the buffer -- * provided, or compute the buffer size required. -- * Buffer is NULL to compute the size of the buffer required. -+ * ext3_xattr_block_get() - * -- * Returns a negative error number on failure, or the number of bytes -- * used / required on success. -+ * routine looks for attribute in EA block and returns it's value and size - */ - int --ext3_xattr_get(struct inode *inode, int name_index, const char *name, -+ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size) - { - struct buffer_head *bh = NULL; -@@ -270,7 +265,6 @@ - - if (name == NULL) - return -EINVAL; -- down_read(&EXT3_I(inode)->xattr_sem); - error = -ENODATA; - if (!EXT3_I(inode)->i_file_acl) - goto cleanup; -@@ -343,15 +337,87 @@ - - cleanup: - brelse(bh); -- up_read(&EXT3_I(inode)->xattr_sem); - - return error; - } - - /* -- * ext3_xattr_list() -+ * ext3_xattr_ibody_get() - * -- * Copy a list of attribute names into the buffer -+ * routine looks for attribute in inode body and returns it's value and size -+ */ -+int -+ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t buffer_size) -+{ -+ int size, name_len = strlen(name), storage_size; -+ struct ext3_xattr_entry *last; -+ struct ext3_inode *raw_inode; -+ struct ext3_iloc iloc; -+ char *start, *end; -+ int ret = -ENOENT; -+ -+ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) -+ return -ENOENT; -+ -+ ret = ext3_get_inode_loc(inode, &iloc, 0); -+ if (ret) -+ return ret; -+ raw_inode = ext3_raw_inode(&iloc); -+ -+ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - -+ EXT3_GOOD_OLD_INODE_SIZE - -+ EXT3_I(inode)->i_extra_isize - -+ sizeof(__u32); -+ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + -+ EXT3_I(inode)->i_extra_isize; -+ if (le32_to_cpu((*(__u32*) start)) != EXT3_XATTR_MAGIC) { -+ brelse(iloc.bh); -+ return 
-ENOENT; -+ } -+ start += sizeof(__u32); -+ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; -+ -+ last = (struct ext3_xattr_entry *) start; -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); -+ if (le32_to_cpu(last->e_value_size) > storage_size || -+ (char *) next >= end) { -+ ext3_error(inode->i_sb, "ext3_xattr_ibody_get", -+ "inode %ld", inode->i_ino); -+ brelse(iloc.bh); -+ return -EIO; -+ } -+ if (name_index == last->e_name_index && -+ name_len == last->e_name_len && -+ !memcmp(name, last->e_name, name_len)) -+ goto found; -+ last = next; -+ } -+ -+ /* can't find EA */ -+ brelse(iloc.bh); -+ return -ENOENT; -+ -+found: -+ size = le32_to_cpu(last->e_value_size); -+ if (buffer) { -+ ret = -ERANGE; -+ if (buffer_size >= size) { -+ memcpy(buffer, start + le16_to_cpu(last->e_value_offs), -+ size); -+ ret = size; -+ } -+ } else -+ ret = size; -+ brelse(iloc.bh); -+ return ret; -+} -+ -+/* -+ * ext3_xattr_get() -+ * -+ * Copy an extended attribute into the buffer - * provided, or compute the buffer size required. - * Buffer is NULL to compute the size of the buffer required. - * -@@ -359,7 +425,31 @@ - * used / required on success. 
- */ - int --ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) -+ext3_xattr_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t buffer_size) -+{ -+ int err; -+ -+ down_read(&EXT3_I(inode)->xattr_sem); -+ -+ /* try to find attribute in inode body */ -+ err = ext3_xattr_ibody_get(inode, name_index, name, -+ buffer, buffer_size); -+ if (err < 0) -+ /* search was unsuccessful, try to find EA in dedicated block */ -+ err = ext3_xattr_block_get(inode, name_index, name, -+ buffer, buffer_size); -+ up_read(&EXT3_I(inode)->xattr_sem); -+ -+ return err; -+} -+ -+/* ext3_xattr_ibody_list() -+ * -+ * generate list of attributes stored in EA block -+ */ -+int -+ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) - { - struct buffer_head *bh = NULL; - struct ext3_xattr_entry *entry; -@@ -370,7 +460,6 @@ - ea_idebug(inode, "buffer=%p, buffer_size=%ld", - buffer, (long)buffer_size); - -- down_read(&EXT3_I(inode)->xattr_sem); - error = 0; - if (!EXT3_I(inode)->i_file_acl) - goto cleanup; -@@ -431,11 +520,138 @@ - - cleanup: - brelse(bh); -- up_read(&EXT3_I(inode)->xattr_sem); - - return error; - } - -+/* ext3_xattr_ibody_list() -+ * -+ * generate list of attributes stored in inode body -+ */ -+int -+ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) -+{ -+ struct ext3_xattr_entry *last; -+ struct ext3_inode *raw_inode; -+ char *start, *end, *buf; -+ struct ext3_iloc iloc; -+ int storage_size; -+ int ret; -+ int size = 0; -+ -+ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) -+ return 0; -+ -+ ret = ext3_get_inode_loc(inode, &iloc, 0); -+ if (ret) -+ return ret; -+ raw_inode = ext3_raw_inode(&iloc); -+ -+ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - -+ EXT3_GOOD_OLD_INODE_SIZE - -+ EXT3_I(inode)->i_extra_isize - -+ sizeof(__u32); -+ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + -+ EXT3_I(inode)->i_extra_isize; -+ if (le32_to_cpu((*(__u32*) 
start)) != EXT3_XATTR_MAGIC) { -+ brelse(iloc.bh); -+ return 0; -+ } -+ start += sizeof(__u32); -+ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; -+ -+ last = (struct ext3_xattr_entry *) start; -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); -+ struct ext3_xattr_handler *handler; -+ if (le32_to_cpu(last->e_value_size) > storage_size || -+ (char *) next >= end) { -+ ext3_error(inode->i_sb, "ext3_xattr_ibody_list", -+ "inode %ld", inode->i_ino); -+ brelse(iloc.bh); -+ return -EIO; -+ } -+ handler = ext3_xattr_handler(last->e_name_index); -+ if (handler) -+ size += handler->list(NULL, inode, last->e_name, -+ last->e_name_len); -+ last = next; -+ } -+ -+ if (!buffer) { -+ ret = size; -+ goto cleanup; -+ } else { -+ ret = -ERANGE; -+ if (size > buffer_size) -+ goto cleanup; -+ } -+ -+ last = (struct ext3_xattr_entry *) start; -+ buf = buffer; -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); -+ struct ext3_xattr_handler *handler; -+ handler = ext3_xattr_handler(last->e_name_index); -+ if (handler) -+ buf += handler->list(buf, inode, last->e_name, -+ last->e_name_len); -+ last = next; -+ } -+ ret = size; -+cleanup: -+ brelse(iloc.bh); -+ return ret; -+} -+ -+/* -+ * ext3_xattr_list() -+ * -+ * Copy a list of attribute names into the buffer -+ * provided, or compute the buffer size required. -+ * Buffer is NULL to compute the size of the buffer required. -+ * -+ * Returns a negative error number on failure, or the number of bytes -+ * used / required on success. 
-+ */ -+int -+ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) -+{ -+ int error; -+ int size = buffer_size; -+ -+ down_read(&EXT3_I(inode)->xattr_sem); -+ -+ /* get list of attributes stored in inode body */ -+ error = ext3_xattr_ibody_list(inode, buffer, buffer_size); -+ if (error < 0) { -+ /* some error occured while collecting -+ * attributes in inode body */ -+ size = 0; -+ goto cleanup; -+ } -+ size = error; -+ -+ /* get list of attributes stored in dedicated block */ -+ if (buffer) { -+ buffer_size -= error; -+ if (buffer_size <= 0) { -+ buffer = NULL; -+ buffer_size = 0; -+ } else -+ buffer += error; -+ } -+ -+ error = ext3_xattr_block_list(inode, buffer, buffer_size); -+ if (error < 0) -+ /* listing was successful, so we return len */ -+ size = 0; -+ -+cleanup: -+ up_read(&EXT3_I(inode)->xattr_sem); -+ return error + size; -+} -+ - /* - * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is - * not set, set it. -@@ -457,6 +673,279 @@ - } - - /* -+ * ext3_xattr_ibody_find() -+ * -+ * search attribute and calculate free space in inode body -+ * NOTE: free space includes space our attribute hold -+ */ -+int -+ext3_xattr_ibody_find(struct inode *inode, int name_index, -+ const char *name, struct ext3_xattr_entry *rentry, int *free) -+{ -+ struct ext3_xattr_entry *last; -+ struct ext3_inode *raw_inode; -+ int name_len = strlen(name); -+ int err, storage_size; -+ struct ext3_iloc iloc; -+ char *start, *end; -+ int ret = -ENOENT; -+ -+ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) -+ return ret; -+ -+ err = ext3_get_inode_loc(inode, &iloc, 0); -+ if (err) -+ return -EIO; -+ raw_inode = ext3_raw_inode(&iloc); -+ -+ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - -+ EXT3_GOOD_OLD_INODE_SIZE - -+ EXT3_I(inode)->i_extra_isize - -+ sizeof(__u32); -+ *free = storage_size - sizeof(__u32); -+ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + -+ EXT3_I(inode)->i_extra_isize; -+ if (le32_to_cpu((*(__u32*) 
start)) != EXT3_XATTR_MAGIC) { -+ brelse(iloc.bh); -+ return -ENOENT; -+ } -+ start += sizeof(__u32); -+ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; -+ -+ last = (struct ext3_xattr_entry *) start; -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); -+ if (le32_to_cpu(last->e_value_size) > storage_size || -+ (char *) next >= end) { -+ ext3_error(inode->i_sb, "ext3_xattr_ibody_find", -+ "inode %ld", inode->i_ino); -+ brelse(iloc.bh); -+ return -EIO; -+ } -+ -+ if (name_index == last->e_name_index && -+ name_len == last->e_name_len && -+ !memcmp(name, last->e_name, name_len)) { -+ memcpy(rentry, last, sizeof(struct ext3_xattr_entry)); -+ ret = 0; -+ } else { -+ *free -= EXT3_XATTR_LEN(last->e_name_len); -+ *free -= le32_to_cpu(last->e_value_size); -+ } -+ last = next; -+ } -+ -+ brelse(iloc.bh); -+ return ret; -+} -+ -+/* -+ * ext3_xattr_block_find() -+ * -+ * search attribute and calculate free space in EA block (if it allocated) -+ * NOTE: free space includes space our attribute hold -+ */ -+int -+ext3_xattr_block_find(struct inode *inode, int name_index, const char *name, -+ struct ext3_xattr_entry *rentry, int *free) -+{ -+ struct buffer_head *bh = NULL; -+ struct ext3_xattr_entry *entry; -+ char *end; -+ int name_len, error = -ENOENT; -+ -+ if (!EXT3_I(inode)->i_file_acl) { -+ *free = inode->i_sb->s_blocksize - -+ sizeof(struct ext3_xattr_header) - -+ sizeof(__u32); -+ return -ENOENT; -+ } -+ ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl); -+ bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); -+ if (!bh) -+ return -EIO; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); -+ end = bh->b_data + bh->b_size; -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+bad_block: ext3_error(inode->i_sb, "ext3_xattr_get", -+ "inode %ld: bad block %d", inode->i_ino, -+ 
EXT3_I(inode)->i_file_acl); -+ brelse(bh); -+ return -EIO; -+ } -+ /* find named attribute */ -+ name_len = strlen(name); -+ *free = bh->b_size - sizeof(__u32); -+ -+ entry = FIRST_ENTRY(bh); -+ while (!IS_LAST_ENTRY(entry)) { -+ struct ext3_xattr_entry *next = -+ EXT3_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (name_index == entry->e_name_index && -+ name_len == entry->e_name_len && -+ memcmp(name, entry->e_name, name_len) == 0) { -+ memcpy(rentry, entry, sizeof(struct ext3_xattr_entry)); -+ error = 0; -+ } else { -+ *free -= EXT3_XATTR_LEN(entry->e_name_len); -+ *free -= le32_to_cpu(entry->e_value_size); -+ } -+ entry = next; -+ } -+ brelse(bh); -+ -+ return error; -+} -+ -+/* -+ * ext3_xattr_inode_set() -+ * -+ * this routine add/remove/replace attribute in inode body -+ */ -+int -+ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, int name_index, -+ const char *name, const void *value, size_t value_len, -+ int flags) -+{ -+ struct ext3_xattr_entry *last, *next, *here = NULL; -+ struct ext3_inode *raw_inode; -+ int name_len = strlen(name); -+ int esize = EXT3_XATTR_LEN(name_len); -+ struct buffer_head *bh; -+ int err, storage_size; -+ struct ext3_iloc iloc; -+ int free, min_offs; -+ char *start, *end; -+ -+ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) -+ return -ENOSPC; -+ -+ err = ext3_get_inode_loc(inode, &iloc, 0); -+ if (err) -+ return err; -+ raw_inode = ext3_raw_inode(&iloc); -+ bh = iloc.bh; -+ -+ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - -+ EXT3_GOOD_OLD_INODE_SIZE - -+ EXT3_I(inode)->i_extra_isize - -+ sizeof(__u32); -+ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + -+ EXT3_I(inode)->i_extra_isize; -+ if ((*(__u32*) start) != EXT3_XATTR_MAGIC) { -+ /* inode had no attributes before */ -+ *((__u32*) start) = cpu_to_le32(EXT3_XATTR_MAGIC); -+ } -+ start += sizeof(__u32); -+ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; -+ min_offs = storage_size; -+ free = 
storage_size - sizeof(__u32); -+ -+ last = (struct ext3_xattr_entry *) start; -+ while (!IS_LAST_ENTRY(last)) { -+ next = EXT3_XATTR_NEXT(last); -+ if (le32_to_cpu(last->e_value_size) > storage_size || -+ (char *) next >= end) { -+ ext3_error(inode->i_sb, "ext3_xattr_ibody_set", -+ "inode %ld", inode->i_ino); -+ brelse(bh); -+ return -EIO; -+ } -+ -+ if (last->e_value_size) { -+ int offs = le16_to_cpu(last->e_value_offs); -+ if (offs < min_offs) -+ min_offs = offs; -+ } -+ if (name_index == last->e_name_index && -+ name_len == last->e_name_len && -+ !memcmp(name, last->e_name, name_len)) -+ here = last; -+ else { -+ /* we calculate all but our attribute -+ * because it will be removed before changing */ -+ free -= EXT3_XATTR_LEN(last->e_name_len); -+ free -= le32_to_cpu(last->e_value_size); -+ } -+ last = next; -+ } -+ -+ if (value && (esize + value_len > free)) { -+ brelse(bh); -+ return -ENOSPC; -+ } -+ -+ err = ext3_reserve_inode_write(handle, inode, &iloc); -+ if (err) { -+ brelse(bh); -+ return err; -+ } -+ -+ if (here) { -+ /* time to remove old value */ -+ struct ext3_xattr_entry *e; -+ int size = le32_to_cpu(here->e_value_size); -+ int border = le16_to_cpu(here->e_value_offs); -+ char *src; -+ -+ /* move tail */ -+ memmove(start + min_offs + size, start + min_offs, -+ border - min_offs); -+ -+ /* recalculate offsets */ -+ e = (struct ext3_xattr_entry *) start; -+ while (!IS_LAST_ENTRY(e)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(e); -+ int offs = le16_to_cpu(e->e_value_offs); -+ if (offs < border) -+ e->e_value_offs = -+ cpu_to_le16(offs + size); -+ e = next; -+ } -+ min_offs += size; -+ -+ /* remove entry */ -+ border = EXT3_XATTR_LEN(here->e_name_len); -+ src = (char *) here + EXT3_XATTR_LEN(here->e_name_len); -+ size = (char *) last - src; -+ if ((char *) here + size > end) -+ printk("ALERT at %s:%d: 0x%p + %d > 0x%p\n", -+ __FILE__, __LINE__, here, size, end); -+ memmove(here, src, size); -+ last = (struct ext3_xattr_entry *) ((char *) last 
- border); -+ *((__u32 *) last) = 0; -+ } -+ -+ if (value) { -+ int offs = min_offs - value_len; -+ /* use last to create new entry */ -+ last->e_name_len = strlen(name); -+ last->e_name_index = name_index; -+ last->e_value_offs = cpu_to_le16(offs); -+ last->e_value_size = cpu_to_le32(value_len); -+ last->e_hash = last->e_value_block = 0; -+ memset(last->e_name, 0, esize); -+ memcpy(last->e_name, name, last->e_name_len); -+ if (start + offs + value_len > end) -+ printk("ALERT at %s:%d: 0x%p + %d + %zd > 0x%p\n", -+ __FILE__, __LINE__, start, offs, -+ value_len, end); -+ memcpy(start + offs, value, value_len); -+ last = EXT3_XATTR_NEXT(last); -+ *((__u32 *) last) = 0; -+ } -+ -+ ext3_mark_iloc_dirty(handle, inode, &iloc); -+ brelse(bh); -+ -+ return 0; -+} -+ -+/* - * ext3_xattr_set_handle() - * - * Create, replace or remove an extended attribute for this inode. Buffer -@@ -470,6 +959,104 @@ - */ - int - ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, -+ const char *name, const void *value, size_t value_len, -+ int flags) -+{ -+ struct ext3_xattr_entry entry; -+ int err, where = 0, found = 0, total; -+ int free1 = -1, free2 = -1; -+ int name_len; -+ -+ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", -+ name_index, name, value, (long)value_len); -+ -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -+ return -EPERM; -+ if (value == NULL) -+ value_len = 0; -+ if (name == NULL) -+ return -EINVAL; -+ name_len = strlen(name); -+ if (name_len > 255 || value_len > inode->i_sb->s_blocksize) -+ return -ERANGE; -+ down_write(&EXT3_I(inode)->xattr_sem); -+ -+ /* try to find attribute in inode body */ -+ err = ext3_xattr_ibody_find(inode, name_index, name, &entry, &free1); -+ if (err == 0) { -+ /* found EA in inode */ -+ found = 1; -+ where = 0; -+ } else if (err == -ENOENT) { -+ /* there is no such attribute in inode body */ -+ /* try to find attribute in dedicated block */ -+ err = 
ext3_xattr_block_find(inode, name_index, name, -+ &entry, &free2); -+ if (err != 0 && err != -ENOENT) { -+ /* not found EA in block */ -+ goto finish; -+ } else if (err == 0) { -+ /* found EA in block */ -+ where = 1; -+ found = 1; -+ } -+ } else -+ goto finish; -+ -+ /* check flags: may replace? may create ? */ -+ if (found && (flags & XATTR_CREATE)) { -+ err = -EEXIST; -+ goto finish; -+ } else if (!found && (flags & XATTR_REPLACE)) { -+ err = -ENODATA; -+ goto finish; -+ } -+ -+ /* check if we have enough space to store attribute */ -+ total = EXT3_XATTR_LEN(strlen(name)) + value_len; -+ if (free1 >= 0 && total > free1 && free2 >= 0 && total > free2) { -+ /* have no enough space */ -+ err = -ENOSPC; -+ goto finish; -+ } -+ -+ /* time to remove attribute */ -+ if (found) { -+ if (where == 0) { -+ /* EA is stored in inode body */ -+ ext3_xattr_ibody_set(handle, inode, name_index, name, -+ NULL, 0, flags); -+ } else { -+ /* EA is stored in separated block */ -+ ext3_xattr_block_set(handle, inode, name_index, name, -+ NULL, 0, flags); -+ } -+ } -+ -+ /* try to store EA in inode body */ -+ err = ext3_xattr_ibody_set(handle, inode, name_index, name, -+ value, value_len, flags); -+ if (err) { -+ /* can't store EA in inode body */ -+ /* try to store in block */ -+ err = ext3_xattr_block_set(handle, inode, name_index, -+ name, value, value_len, flags); -+ } -+ -+finish: -+ up_write(&EXT3_I(inode)->xattr_sem); -+ return err; -+} -+ -+/* -+ * ext3_xattr_block_set() -+ * -+ * this routine add/remove/replace attribute in EA block -+ */ -+int -+ext3_xattr_block_set(handle_t *handle, struct inode *inode, int name_index, - const char *name, const void *value, size_t value_len, - int flags) - { -@@ -492,22 +1078,7 @@ - * towards the end of the block). - * end -- Points right after the block pointed to by header. 
- */ -- -- ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", -- name_index, name, value, (long)value_len); -- -- if (IS_RDONLY(inode)) -- return -EROFS; -- if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -- return -EPERM; -- if (value == NULL) -- value_len = 0; -- if (name == NULL) -- return -EINVAL; - name_len = strlen(name); -- if (name_len > 255 || value_len > sb->s_blocksize) -- return -ERANGE; -- down_write(&EXT3_I(inode)->xattr_sem); - if (EXT3_I(inode)->i_file_acl) { - /* The inode already has an extended attribute block. */ - bh = sb_bread(sb, EXT3_I(inode)->i_file_acl); -@@ -733,7 +1304,6 @@ - brelse(bh); - if (!(bh && header == HDR(bh))) - kfree(header); -- up_write(&EXT3_I(inode)->xattr_sem); - - return error; - } -Index: linux-2.6.0/fs/ext3/xattr.h -=================================================================== ---- linux-2.6.0.orig/fs/ext3/xattr.h 2003-06-24 18:04:43.000000000 +0400 -+++ linux-2.6.0/fs/ext3/xattr.h 2004-01-14 18:54:12.000000000 +0300 -@@ -77,7 +77,8 @@ - extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); - extern int ext3_xattr_list(struct inode *, char *, size_t); - extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int); --extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); -+extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *,const void *,size_t,int); -+extern int ext3_xattr_block_set(handle_t *, struct inode *, int, const char *,const void *,size_t,int); - - extern void ext3_xattr_delete_inode(handle_t *, struct inode *); - extern void ext3_xattr_put_super(struct super_block *); -Index: linux-2.6.0/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.0.orig/include/linux/ext3_fs.h 2004-01-14 18:54:11.000000000 +0300 -+++ linux-2.6.0/include/linux/ext3_fs.h 2004-01-14 18:54:12.000000000 +0300 -@@ -265,6 +265,8 @@ - __u32 
m_i_reserved2[2]; - } masix2; - } osd2; /* OS dependent 2 */ -+ __u16 i_extra_isize; -+ __u16 i_pad1; - }; - - #define i_size_high i_dir_acl -@@ -721,6 +723,7 @@ - extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); - extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); - extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); -+int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc, int in_mem); - - extern void ext3_read_inode (struct inode *); - extern void ext3_write_inode (struct inode *, int); -Index: linux-2.6.0/include/linux/ext3_fs_i.h -=================================================================== ---- linux-2.6.0.orig/include/linux/ext3_fs_i.h 2003-12-30 08:32:44.000000000 +0300 -+++ linux-2.6.0/include/linux/ext3_fs_i.h 2004-01-14 18:54:12.000000000 +0300 -@@ -96,6 +96,9 @@ - */ - loff_t i_disksize; - -+ /* on-disk additional length */ -+ __u16 i_extra_isize; -+ - /* - * truncate_sem is for serialising ext3_truncate() against - * ext3_getblock(). 
In the 2.4 ext2 design, great chunks of inode's - -%diffstat - fs/ext3/ialloc.c | 5 - fs/ext3/inode.c | 10 - fs/ext3/xattr.c | 634 +++++++++++++++++++++++++++++++++++++++++++--- - fs/ext3/xattr.h | 3 - include/linux/ext3_fs.h | 2 - include/linux/ext3_fs_i.h | 3 - 6 files changed, 623 insertions(+), 34 deletions(-) - diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch deleted file mode 100644 index f421f88..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch +++ /dev/null @@ -1,2940 +0,0 @@ -Index: linux-2.6.12-rc6/fs/ext3/extents.c -=================================================================== ---- linux-2.6.12-rc6.orig/fs/ext3/extents.c 2005-06-14 16:31:25.756503133 +0200 -+++ linux-2.6.12-rc6/fs/ext3/extents.c 2005-06-14 16:31:25.836581257 +0200 -@@ -0,0 +1,2359 @@ -+/* -+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+/* -+ * Extents support for EXT3 -+ * -+ * TODO: -+ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() -+ * - ext3_ext_calc_credits() could take 'mergable' into account -+ * - ext3*_error() should be used in some situations -+ * - find_goal() [to be tested and improved] -+ * - smart tree reduction -+ * - arch-independence -+ * common on-disk format for big/little-endian arch -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+ -+static inline int ext3_ext_check_header(struct ext3_extent_header *eh) -+{ -+ if (eh->eh_magic != EXT3_EXT_MAGIC) { -+ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", -+ (unsigned)eh->eh_magic); -+ return -EIO; -+ } -+ if (eh->eh_max == 0) { -+ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n", -+ (unsigned)eh->eh_max); -+ return -EIO; -+ } -+ if (eh->eh_entries > eh->eh_max) { -+ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", -+ (unsigned)eh->eh_entries); -+ return -EIO; -+ } -+ return 0; -+} -+ -+static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) -+{ -+ int err; -+ -+ if (handle->h_buffer_credits > needed) -+ return handle; -+ if (!ext3_journal_extend(handle, needed)) -+ return handle; -+ err = ext3_journal_restart(handle, needed); -+ -+ return handle; -+} -+ -+static int inline -+ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) -+{ -+ if (tree->ops->get_write_access) -+ return tree->ops->get_write_access(h,tree->buffer); -+ else -+ return 0; -+} -+ -+static int inline -+ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) -+{ -+ if (tree->ops->mark_buffer_dirty) -+ return tree->ops->mark_buffer_dirty(h,tree->buffer); -+ else -+ return 
0; -+} -+ -+/* -+ * could return: -+ * - EROFS -+ * - ENOMEM -+ */ -+static int ext3_ext_get_access(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int err; -+ -+ if (path->p_bh) { -+ /* path points to block */ -+ err = ext3_journal_get_write_access(handle, path->p_bh); -+ } else { -+ /* path points to leaf/index in inode body */ -+ err = ext3_ext_get_access_for_root(handle, tree); -+ } -+ return err; -+} -+ -+/* -+ * could return: -+ * - EROFS -+ * - ENOMEM -+ * - EIO -+ */ -+static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int err; -+ if (path->p_bh) { -+ /* path points to block */ -+ err =ext3_journal_dirty_metadata(handle, path->p_bh); -+ } else { -+ /* path points to leaf/index in inode body */ -+ err = ext3_ext_mark_root_dirty(handle, tree); -+ } -+ return err; -+} -+ -+static int inline -+ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, struct ext3_extent *ex, -+ int *err) -+{ -+ int goal, depth, newblock; -+ struct inode *inode; -+ -+ EXT_ASSERT(tree); -+ if (tree->ops->new_block) -+ return tree->ops->new_block(handle, tree, path, ex, err); -+ -+ inode = tree->inode; -+ depth = EXT_DEPTH(tree); -+ if (path && depth > 0) { -+ goal = path[depth-1].p_block; -+ } else { -+ struct ext3_inode_info *ei = EXT3_I(inode); -+ unsigned long bg_start; -+ unsigned long colour; -+ -+ bg_start = (ei->i_block_group * -+ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + -+ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); -+ colour = (current->pid % 16) * -+ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); -+ goal = bg_start + colour; -+ } -+ -+ newblock = ext3_new_block(handle, inode, goal, err); -+ return newblock; -+} -+ -+static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) -+{ -+ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 
24) | -+ (EXT_HDR_GEN(neh) + 1); -+} -+ -+static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->inode->i_sb->s_blocksize - -+ sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent); -+#ifdef AGRESSIVE_TEST -+ size = 6; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->inode->i_sb->s_blocksize - -+ sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent_idx); -+#ifdef AGRESSIVE_TEST -+ size = 5; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent); -+#ifdef AGRESSIVE_TEST -+ size = 3; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent_idx); -+#ifdef AGRESSIVE_TEST -+ size = 4; -+#endif -+ return size; -+} -+ -+static void ext3_ext_show_path(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+#ifdef EXT_DEBUG -+ int k, l = path->p_depth; -+ -+ ext_debug(tree, "path:"); -+ for (k = 0; k <= l; k++, path++) { -+ if (path->p_idx) { -+ ext_debug(tree, " %d->%d", path->p_idx->ei_block, -+ path->p_idx->ei_leaf); -+ } else if (path->p_ext) { -+ ext_debug(tree, " %d:%d:%d", -+ path->p_ext->ee_block, -+ path->p_ext->ee_len, -+ path->p_ext->ee_start); -+ } else -+ ext_debug(tree, " []"); -+ } -+ ext_debug(tree, "\n"); -+#endif -+} -+ -+static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+#ifdef EXT_DEBUG -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent_header *eh; -+ struct ext3_extent *ex; -+ int i; -+ -+ if (!path) -+ return; -+ -+ eh = path[depth].p_hdr; -+ ex = EXT_FIRST_EXTENT(eh); -+ -+ for (i = 0; i 
< eh->eh_entries; i++, ex++) { -+ ext_debug(tree, "%d:%d:%d ", -+ ex->ee_block, ex->ee_len, ex->ee_start); -+ } -+ ext_debug(tree, "\n"); -+#endif -+} -+ -+static void ext3_ext_drop_refs(struct ext3_ext_path *path) -+{ -+ int depth = path->p_depth; -+ int i; -+ -+ for (i = 0; i <= depth; i++, path++) { -+ if (path->p_bh) { -+ brelse(path->p_bh); -+ path->p_bh = NULL; -+ } -+ } -+} -+ -+/* -+ * binary search for closest index by given block -+ */ -+static inline void -+ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, int block) -+{ -+ struct ext3_extent_header *eh = path->p_hdr; -+ struct ext3_extent_idx *ix; -+ int l = 0, k, r; -+ -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ EXT_ASSERT(eh->eh_entries > 0); -+ -+ ext_debug(tree, "binsearch for %d(idx): ", block); -+ -+ path->p_idx = ix = EXT_FIRST_INDEX(eh); -+ -+ r = k = eh->eh_entries; -+ while (k > 1) { -+ k = (r - l) / 2; -+ if (block < ix[l + k].ei_block) -+ r -= k; -+ else -+ l += k; -+ ext_debug(tree, "%d:%d:%d ", k, l, r); -+ } -+ -+ ix += l; -+ path->p_idx = ix; -+ ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf); -+ -+ while (l++ < r) { -+ if (block < ix->ei_block) -+ break; -+ path->p_idx = ix++; -+ } -+ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, -+ path->p_idx->ei_leaf); -+ -+#ifdef CHECK_BINSEARCH -+ { -+ struct ext3_extent_idx *chix; -+ -+ chix = ix = EXT_FIRST_INDEX(eh); -+ for (k = 0; k < eh->eh_entries; k++, ix++) { -+ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { -+ printk("k=%d, ix=0x%p, first=0x%p\n", k, -+ ix, EXT_FIRST_INDEX(eh)); -+ printk("%u <= %u\n", -+ ix->ei_block,ix[-1].ei_block); -+ } -+ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); -+ if (block < ix->ei_block) -+ break; -+ chix = ix; -+ } -+ EXT_ASSERT(chix == path->p_idx); -+ } -+#endif -+} -+ -+/* -+ * binary search for closest extent by given block -+ */ -+static inline void 
-+ext3_ext_binsearch(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, int block) -+{ -+ struct ext3_extent_header *eh = path->p_hdr; -+ struct ext3_extent *ex; -+ int l = 0, k, r; -+ -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ -+ if (eh->eh_entries == 0) { -+ /* -+ * this leaf is empty yet: -+ * we get such a leaf in split/add case -+ */ -+ return; -+ } -+ -+ ext_debug(tree, "binsearch for %d: ", block); -+ -+ path->p_ext = ex = EXT_FIRST_EXTENT(eh); -+ -+ r = k = eh->eh_entries; -+ while (k > 1) { -+ k = (r - l) / 2; -+ if (block < ex[l + k].ee_block) -+ r -= k; -+ else -+ l += k; -+ ext_debug(tree, "%d:%d:%d ", k, l, r); -+ } -+ -+ ex += l; -+ path->p_ext = ex; -+ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, -+ path->p_ext->ee_start, path->p_ext->ee_len); -+ -+ while (l++ < r) { -+ if (block < ex->ee_block) -+ break; -+ path->p_ext = ex++; -+ } -+ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, -+ path->p_ext->ee_start, path->p_ext->ee_len); -+ -+#ifdef CHECK_BINSEARCH -+ { -+ struct ext3_extent *chex; -+ -+ chex = ex = EXT_FIRST_EXTENT(eh); -+ for (k = 0; k < eh->eh_entries; k++, ex++) { -+ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); -+ if (block < ex->ee_block) -+ break; -+ chex = ex; -+ } -+ EXT_ASSERT(chex == path->p_ext); -+ } -+#endif -+} -+ -+int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) -+{ -+ struct ext3_extent_header *eh; -+ -+ BUG_ON(tree->buffer_len == 0); -+ ext3_ext_get_access_for_root(handle, tree); -+ eh = EXT_ROOT_HDR(tree); -+ eh->eh_depth = 0; -+ eh->eh_entries = 0; -+ eh->eh_magic = EXT3_EXT_MAGIC; -+ eh->eh_max = ext3_ext_space_root(tree); -+ ext3_ext_mark_root_dirty(handle, tree); -+ ext3_ext_invalidate_cache(tree); -+ return 0; -+} -+ -+struct ext3_ext_path * -+ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, -+ struct ext3_ext_path *path) -+{ -+ struct ext3_extent_header *eh; -+ struct buffer_head *bh; 
-+ int depth, i, ppos = 0; -+ -+ EXT_ASSERT(tree); -+ EXT_ASSERT(tree->inode); -+ EXT_ASSERT(tree->root); -+ -+ eh = EXT_ROOT_HDR(tree); -+ EXT_ASSERT(eh); -+ if (ext3_ext_check_header(eh)) { -+ /* don't free previously allocated path -+ * -- caller should take care */ -+ path = NULL; -+ goto err; -+ } -+ -+ i = depth = EXT_DEPTH(tree); -+ EXT_ASSERT(eh->eh_max); -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ -+ /* account possible depth increase */ -+ if (!path) { -+ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), -+ GFP_NOFS); -+ if (!path) -+ return ERR_PTR(-ENOMEM); -+ } -+ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); -+ path[0].p_hdr = eh; -+ -+ /* walk through the tree */ -+ while (i) { -+ ext_debug(tree, "depth %d: num %d, max %d\n", -+ ppos, eh->eh_entries, eh->eh_max); -+ ext3_ext_binsearch_idx(tree, path + ppos, block); -+ path[ppos].p_block = path[ppos].p_idx->ei_leaf; -+ path[ppos].p_depth = i; -+ path[ppos].p_ext = NULL; -+ -+ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); -+ if (!bh) -+ goto err; -+ -+ eh = EXT_BLOCK_HDR(bh); -+ ppos++; -+ EXT_ASSERT(ppos <= depth); -+ path[ppos].p_bh = bh; -+ path[ppos].p_hdr = eh; -+ i--; -+ -+ if (ext3_ext_check_header(eh)) -+ goto err; -+ } -+ -+ path[ppos].p_depth = i; -+ path[ppos].p_hdr = eh; -+ path[ppos].p_ext = NULL; -+ path[ppos].p_idx = NULL; -+ -+ if (ext3_ext_check_header(eh)) -+ goto err; -+ -+ /* find extent */ -+ ext3_ext_binsearch(tree, path + ppos, block); -+ -+ ext3_ext_show_path(tree, path); -+ -+ return path; -+ -+err: -+ printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ return ERR_PTR(-EIO); -+} -+ -+/* -+ * insert new index [logical;ptr] into the block at cupr -+ * it check where to insert: before curp or after curp -+ */ -+static int ext3_ext_insert_index(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *curp, -+ int logical, int ptr) -+{ -+ struct 
ext3_extent_idx *ix; -+ int len, err; -+ -+ if ((err = ext3_ext_get_access(handle, tree, curp))) -+ return err; -+ -+ EXT_ASSERT(logical != curp->p_idx->ei_block); -+ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; -+ if (logical > curp->p_idx->ei_block) { -+ /* insert after */ -+ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { -+ len = (len - 1) * sizeof(struct ext3_extent_idx); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert new index %d after: %d. " -+ "move %d from 0x%p to 0x%p\n", -+ logical, ptr, len, -+ (curp->p_idx + 1), (curp->p_idx + 2)); -+ memmove(curp->p_idx + 2, curp->p_idx + 1, len); -+ } -+ ix = curp->p_idx + 1; -+ } else { -+ /* insert before */ -+ len = len * sizeof(struct ext3_extent_idx); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert new index %d before: %d. " -+ "move %d from 0x%p to 0x%p\n", -+ logical, ptr, len, -+ curp->p_idx, (curp->p_idx + 1)); -+ memmove(curp->p_idx + 1, curp->p_idx, len); -+ ix = curp->p_idx; -+ } -+ -+ ix->ei_block = logical; -+ ix->ei_leaf = ptr; -+ ix->ei_leaf_hi = ix->ei_unused = 0; -+ curp->p_hdr->eh_entries++; -+ -+ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); -+ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); -+ -+ err = ext3_ext_dirty(handle, tree, curp); -+ ext3_std_error(tree->inode->i_sb, err); -+ -+ return err; -+} -+ -+/* -+ * routine inserts new subtree into the path, using free index entry -+ * at depth 'at: -+ * - allocates all needed blocks (new leaf and all intermediate index blocks) -+ * - makes decision where to split -+ * - moves remaining extens and index entries (right to the split point) -+ * into the newly allocated blocks -+ * - initialize subtree -+ */ -+static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext, int at) -+{ -+ struct buffer_head *bh = NULL; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent_header *neh; -+ struct ext3_extent_idx *fidx; -+ struct ext3_extent *ex; -+ 
int i = at, k, m, a; -+ unsigned long newblock, oldblock, border; -+ int *ablocks = NULL; /* array of allocated blocks */ -+ int err = 0; -+ -+ /* make decision: where to split? */ -+ /* FIXME: now desicion is simplest: at current extent */ -+ -+ /* if current leaf will be splitted, then we should use -+ * border from split point */ -+ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); -+ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { -+ border = path[depth].p_ext[1].ee_block; -+ ext_debug(tree, "leaf will be splitted." -+ " next leaf starts at %d\n", -+ (int)border); -+ } else { -+ border = newext->ee_block; -+ ext_debug(tree, "leaf will be added." -+ " next leaf starts at %d\n", -+ (int)border); -+ } -+ -+ /* -+ * if error occurs, then we break processing -+ * and turn filesystem read-only. so, index won't -+ * be inserted and tree will be in consistent -+ * state. next mount will repair buffers too -+ */ -+ -+ /* -+ * get array to track all allocated blocks -+ * we need this to handle errors and free blocks -+ * upon them -+ */ -+ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); -+ if (!ablocks) -+ return -ENOMEM; -+ memset(ablocks, 0, sizeof(unsigned long) * depth); -+ -+ /* allocate all needed blocks */ -+ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); -+ for (a = 0; a < depth - at; a++) { -+ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); -+ if (newblock == 0) -+ goto cleanup; -+ ablocks[a] = newblock; -+ } -+ -+ /* initialize new leaf */ -+ newblock = ablocks[--a]; -+ EXT_ASSERT(newblock); -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) -+ goto cleanup; -+ -+ neh = EXT_BLOCK_HDR(bh); -+ neh->eh_entries = 0; -+ neh->eh_max = ext3_ext_space_block(tree); -+ neh->eh_magic = EXT3_EXT_MAGIC; -+ neh->eh_depth = 0; -+ ex = EXT_FIRST_EXTENT(neh); -+ -+ /* 
move remain of path[depth] to the new leaf */ -+ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); -+ /* start copy from next extent */ -+ /* TODO: we could do it by single memmove */ -+ m = 0; -+ path[depth].p_ext++; -+ while (path[depth].p_ext <= -+ EXT_MAX_EXTENT(path[depth].p_hdr)) { -+ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", -+ path[depth].p_ext->ee_block, -+ path[depth].p_ext->ee_start, -+ path[depth].p_ext->ee_len, -+ newblock); -+ memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent)); -+ neh->eh_entries++; -+ m++; -+ } -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ goto cleanup; -+ brelse(bh); -+ bh = NULL; -+ -+ /* correct old leaf */ -+ if (m) { -+ if ((err = ext3_ext_get_access(handle, tree, path + depth))) -+ goto cleanup; -+ path[depth].p_hdr->eh_entries -= m; -+ if ((err = ext3_ext_dirty(handle, tree, path + depth))) -+ goto cleanup; -+ -+ } -+ -+ /* create intermediate indexes */ -+ k = depth - at - 1; -+ EXT_ASSERT(k >= 0); -+ if (k) -+ ext_debug(tree, "create %d intermediate indices\n", k); -+ /* insert new index into current index block */ -+ /* current depth stored in i var */ -+ i = depth - 1; -+ while (k--) { -+ oldblock = newblock; -+ newblock = ablocks[--a]; -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) -+ goto cleanup; -+ -+ neh = EXT_BLOCK_HDR(bh); -+ neh->eh_entries = 1; -+ neh->eh_magic = EXT3_EXT_MAGIC; -+ neh->eh_max = ext3_ext_space_block_idx(tree); -+ neh->eh_depth = depth - i; -+ fidx = EXT_FIRST_INDEX(neh); -+ fidx->ei_block = border; -+ fidx->ei_leaf = oldblock; -+ fidx->ei_leaf_hi = fidx->ei_unused = 0; -+ -+ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", -+ i, newblock, border, oldblock); -+ /* copy indexes */ -+ m = 0; -+ path[i].p_idx++; -+ -+ ext_debug(tree, "cur 0x%p, last 
0x%p\n", path[i].p_idx, -+ EXT_MAX_INDEX(path[i].p_hdr)); -+ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == -+ EXT_LAST_INDEX(path[i].p_hdr)); -+ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { -+ ext_debug(tree, "%d: move %d:%d in new index %lu\n", -+ i, path[i].p_idx->ei_block, -+ path[i].p_idx->ei_leaf, newblock); -+ memmove(++fidx, path[i].p_idx++, -+ sizeof(struct ext3_extent_idx)); -+ neh->eh_entries++; -+ EXT_ASSERT(neh->eh_entries <= neh->eh_max); -+ m++; -+ } -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ goto cleanup; -+ brelse(bh); -+ bh = NULL; -+ -+ /* correct old index */ -+ if (m) { -+ err = ext3_ext_get_access(handle, tree, path + i); -+ if (err) -+ goto cleanup; -+ path[i].p_hdr->eh_entries -= m; -+ err = ext3_ext_dirty(handle, tree, path + i); -+ if (err) -+ goto cleanup; -+ } -+ -+ i--; -+ } -+ -+ /* insert new index */ -+ if (!err) -+ err = ext3_ext_insert_index(handle, tree, path + at, -+ border, newblock); -+ -+cleanup: -+ if (bh) { -+ if (buffer_locked(bh)) -+ unlock_buffer(bh); -+ brelse(bh); -+ } -+ -+ if (err) { -+ /* free all allocated blocks in error case */ -+ for (i = 0; i < depth; i++) { -+ if (!ablocks[i]) -+ continue; -+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); -+ } -+ } -+ kfree(ablocks); -+ -+ return err; -+} -+ -+/* -+ * routine implements tree growing procedure: -+ * - allocates new block -+ * - moves top-level data (index block or leaf) into the new block -+ * - initialize new top-level, creating index that points to the -+ * just created block -+ */ -+static int ext3_ext_grow_indepth(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) -+{ -+ struct ext3_ext_path *curp = path; -+ struct ext3_extent_header *neh; -+ struct ext3_extent_idx *fidx; -+ struct buffer_head *bh; -+ unsigned long newblock; -+ int err = 0; -+ -+ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); -+ 
if (newblock == 0) -+ return err; -+ -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ ext3_std_error(tree->inode->i_sb, err); -+ return err; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) { -+ unlock_buffer(bh); -+ goto out; -+ } -+ -+ /* move top-level index/leaf into new block */ -+ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); -+ -+ /* set size of new block */ -+ neh = EXT_BLOCK_HDR(bh); -+ /* old root could have indexes or leaves -+ * so calculate eh_max right way */ -+ if (EXT_DEPTH(tree)) -+ neh->eh_max = ext3_ext_space_block_idx(tree); -+ else -+ neh->eh_max = ext3_ext_space_block(tree); -+ neh->eh_magic = EXT3_EXT_MAGIC; -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ goto out; -+ -+ /* create index in new top-level index: num,max,pointer */ -+ if ((err = ext3_ext_get_access(handle, tree, curp))) -+ goto out; -+ -+ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; -+ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); -+ curp->p_hdr->eh_entries = 1; -+ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); -+ /* FIXME: it works, but actually path[0] can be index */ -+ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; -+ curp->p_idx->ei_leaf = newblock; -+ curp->p_idx->ei_leaf_hi = curp->p_idx->ei_unused = 0; -+ -+ neh = EXT_ROOT_HDR(tree); -+ fidx = EXT_FIRST_INDEX(neh); -+ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", -+ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); -+ -+ neh->eh_depth = path->p_depth + 1; -+ err = ext3_ext_dirty(handle, tree, curp); -+out: -+ brelse(bh); -+ -+ return err; -+} -+ -+/* -+ * routine finds empty index and adds new leaf. 
if no free index found
-+ * then it requests in-depth growing
-+ *
-+ * the path is refilled through ext3_ext_find_extent() for
-+ * newext->ee_block after the split or the growth, so on success it
-+ * describes the tree as it looks afterwards.
-+ *
-+ * returns 0 on success, otherwise the error reported by
-+ * ext3_ext_split(), ext3_ext_grow_indepth() or ext3_ext_find_extent().
-+ *
-+ * NOTE(review): ext3_ext_find_extent() releases the path it was given
-+ * on most of its failure paths, and path is passed here by value --
-+ * presumably callers must not reuse their copy after a lookup failure;
-+ * confirm against the callers.
-+ */
-+static int ext3_ext_create_new_leaf(handle_t *handle,
-+					struct ext3_extents_tree *tree,
-+					struct ext3_ext_path *path,
-+					struct ext3_extent *newext)
-+{
-+	struct ext3_ext_path *curp;
-+	int depth, i, err = 0;
-+
-+repeat:
-+	i = depth = EXT_DEPTH(tree);
-+
-+	/* walk up to the tree and look for free index entry */
-+	curp = path + depth;
-+	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
-+		i--;
-+		curp--;
-+	}
-+
-+	/* we use already allocated block for index block
-+	 * so, subsequent data blocks should be contiguous */
-+	if (EXT_HAS_FREE_INDEX(curp)) {
-+		/* if we found index with free entry, then use that
-+		 * entry: create all needed subtree and add new leaf */
-+		err = ext3_ext_split(handle, tree, path, newext, i);
-+
-+		/* refill path */
-+		ext3_ext_drop_refs(path);
-+		path = ext3_ext_find_extent(tree, newext->ee_block, path);
-+		if (IS_ERR(path))
-+			return PTR_ERR(path);
-+
-+		/* a split failure is still reported after the refill */
-+		return err;
-+	}
-+
-+	/* tree is full, time to grow in depth */
-+	err = ext3_ext_grow_indepth(handle, tree, path, newext);
-+
-+	/* refill path */
-+	ext3_ext_drop_refs(path);
-+	path = ext3_ext_find_extent(tree, newext->ee_block, path);
-+	if (IS_ERR(path)) {
-+		/* path is an error value now: it must not be indexed below */
-+		return PTR_ERR(path);
-+	}
-+	if (err) {
-+		/* growing failed: report it instead of jumping back to
-+		 * repeat, which would silently drop the error */
-+		return err;
-+	}
-+
-+	/*
-+	 * only first (depth 0 -> 1) produces free space
-+	 * in all other cases we have to split growed tree
-+	 */
-+	depth = EXT_DEPTH(tree);
-+	if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
-+		/* now we need split */
-+		goto repeat;
-+	}
-+
-+	return 0;
-+}
-+
-+/*
-+ * returns allocated block in subsequent extent or EXT_MAX_BLOCK
-+ * NOTE: it consider block number from index entry as
-+ * allocated block. 
thus, index entries have to be consistent
-+ * with leafs
-+ *
-+ * a leaf that has no current extent (p_ext == NULL) contributes nothing:
-+ * the search simply continues in the index levels above it
-+ */
-+static unsigned long
-+ext3_ext_next_allocated_block(struct ext3_ext_path *path)
-+{
-+	int depth;
-+
-+	EXT_ASSERT(path != NULL);
-+	depth = path->p_depth;
-+
-+	/* root is the leaf and it holds no extent: nothing is allocated */
-+	if (depth == 0 && path->p_ext == NULL)
-+		return EXT_MAX_BLOCK;
-+
-+	/* FIXME: what if index isn't full ?! */
-+	while (depth >= 0) {
-+		if (depth == path->p_depth) {
-+			/* leaf: p_ext stays NULL when the leaf is empty,
-+			 * so it has to be checked before p_ext[1] is read */
-+			if (path[depth].p_ext &&
-+			    path[depth].p_ext !=
-+					EXT_LAST_EXTENT(path[depth].p_hdr))
-+				return path[depth].p_ext[1].ee_block;
-+		} else {
-+			/* index */
-+			if (path[depth].p_idx !=
-+					EXT_LAST_INDEX(path[depth].p_hdr))
-+				return path[depth].p_idx[1].ei_block;
-+		}
-+		depth--;
-+	}
-+
-+	return EXT_MAX_BLOCK;
-+}
-+
-+/*
-+ * returns first allocated block from next leaf or EXT_MAX_BLOCK
-+ *
-+ * only index levels are inspected: starting at the index right above
-+ * the leaf and going up, the first level whose current entry is not the
-+ * last one supplies the answer (the block of the entry that follows it).
-+ * tree is not used by the lookup itself.
-+ */
-+static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree,
-+					struct ext3_ext_path *path)
-+{
-+	int depth;
-+
-+	EXT_ASSERT(path != NULL);
-+	depth = path->p_depth;
-+
-+	/* zero-tree has no leaf blocks at all */
-+	if (depth == 0)
-+		return EXT_MAX_BLOCK;
-+
-+	/* go to index block */
-+	depth--;
-+
-+	while (depth >= 0) {
-+		if (path[depth].p_idx !=
-+				EXT_LAST_INDEX(path[depth].p_hdr))
-+			return path[depth].p_idx[1].ei_block;
-+		depth--;
-+	}
-+
-+	return EXT_MAX_BLOCK;
-+}
-+
-+/*
-+ * if leaf gets modified and modified extent is first in the leaf
-+ * then we have to correct all indexes above
-+ * TODO: do we need to correct tree in all cases? 
-+ */
-+/*
-+ * returns 0 when nothing has to be done (the tree has depth 0, or the
-+ * current extent is not the first one in its leaf) or when all touched
-+ * index entries were updated; otherwise the error from
-+ * ext3_ext_get_access() / ext3_ext_dirty()
-+ */
-+int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree,
-+			     struct ext3_ext_path *path)
-+{
-+	struct ext3_extent_header *eh;
-+	int depth = EXT_DEPTH(tree);
-+	struct ext3_extent *ex;
-+	unsigned long border;
-+	int k, err = 0;
-+
-+	eh = path[depth].p_hdr;
-+	ex = path[depth].p_ext;
-+	EXT_ASSERT(ex);
-+	EXT_ASSERT(eh);
-+
-+	if (depth == 0) {
-+		/* there is no tree at all */
-+		return 0;
-+	}
-+
-+	if (ex != EXT_FIRST_EXTENT(eh)) {
-+		/* we correct tree if first leaf got modified only */
-+		return 0;
-+	}
-+
-+	/*
-+	 * TODO: we need correction if border is smaller then current one
-+	 */
-+	k = depth - 1;
-+	border = path[depth].p_ext->ee_block;
-+	if ((err = ext3_ext_get_access(handle, tree, path + k)))
-+		return err;
-+	path[k].p_idx->ei_block = border;
-+	if ((err = ext3_ext_dirty(handle, tree, path + k)))
-+		return err;
-+
-+	/* keep going up only while the level below sits on its first index */
-+	while (k--) {
-+		/* change all left-side indexes */
-+		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
-+			break;
-+		if ((err = ext3_ext_get_access(handle, tree, path + k)))
-+			break;
-+		path[k].p_idx->ei_block = border;
-+		if ((err = ext3_ext_dirty(handle, tree, path + k)))
-+			break;
-+	}
-+
-+	return err;
-+}
-+
-+/*
-+ * returns non-zero if ex2 may be appended to ex1: ex2 has to start at the
-+ * logical block right after the end of ex1, and the optional
-+ * tree->ops->mergable() callback, when present, has the final word
-+ */
-+static int inline
-+ext3_can_extents_be_merged(struct ext3_extents_tree *tree,
-+			   struct ext3_extent *ex1,
-+			   struct ext3_extent *ex2)
-+{
-+	if (ex1->ee_block + ex1->ee_len != ex2->ee_block)
-+		return 0;
-+
-+#ifdef AGRESSIVE_TEST
-+	if (ex1->ee_len >= 4)
-+		return 0;
-+#endif
-+
-+	if (!tree->ops->mergable)
-+		return 1;
-+
-+	return tree->ops->mergable(ex1, ex2);
-+}
-+
-+/*
-+ * this routine tries to merge requested extent into the existing
-+ * extent or inserts requested extent as new one into the tree,
-+ * creating new leaf in no-space case
-+ *
-+ * path is the lookup result for newext->ee_block.  if the leaf it points
-+ * to is full and newext lies behind its last extent, the next leaf is
-+ * looked up into a private path (npath) and used instead when it has
-+ * room; npath is always released before returning through cleanup.
-+ *
-+ * returns 0 on success or a negative error; the tree generation is
-+ * bumped and the extent cache invalidated on every exit through cleanup,
-+ * including the failing ones.
-+ */
-+int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree,
-+			   struct ext3_ext_path *path,
-+			   struct ext3_extent *newext)
-+{
-+	struct ext3_extent_header * eh;
-+	struct ext3_extent *ex, *fex;
-+	struct ext3_extent *nearex; /* nearest extent */
-+	struct ext3_ext_path *npath = NULL;
-+	int depth, len, err, next;
-+
-+	EXT_ASSERT(newext->ee_len > 0);
-+	depth = EXT_DEPTH(tree);
-+	ex = path[depth].p_ext;
-+	EXT_ASSERT(path[depth].p_hdr);
-+
-+	/* try to insert block into found extent and return */
-+	if (ex && ext3_can_extents_be_merged(tree, ex, newext)) {
-+		ext_debug(tree, "append %d block to %d:%d (from %d)\n",
-+			  newext->ee_len, ex->ee_block, ex->ee_len,
-+			  ex->ee_start);
-+		if ((err = ext3_ext_get_access(handle, tree, path + depth)))
-+			return err;
-+		ex->ee_len += newext->ee_len;
-+		eh = path[depth].p_hdr;
-+		nearex = ex;
-+		goto merge;
-+	}
-+
-+repeat:
-+	depth = EXT_DEPTH(tree);
-+	eh = path[depth].p_hdr;
-+	if (eh->eh_entries < eh->eh_max)
-+		goto has_space;
-+
-+	/* probably next leaf has space for us? */
-+	fex = EXT_LAST_EXTENT(eh);
-+	next = ext3_ext_next_leaf_block(tree, path);
-+	if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) {
-+		ext_debug(tree, "next leaf block - %d\n", next);
-+		EXT_ASSERT(!npath);
-+		npath = ext3_ext_find_extent(tree, next, NULL);
-+		if (IS_ERR(npath))
-+			return PTR_ERR(npath);
-+		EXT_ASSERT(npath->p_depth == path->p_depth);
-+		eh = npath[depth].p_hdr;
-+		if (eh->eh_entries < eh->eh_max) {
-+			ext_debug(tree, "next leaf isnt full(%d)\n",
-+				  eh->eh_entries);
-+			path = npath;
-+			goto repeat;
-+		}
-+		ext_debug(tree, "next leaf hasno free space(%d,%d)\n",
-+			  eh->eh_entries, eh->eh_max);
-+	}
-+
-+	/*
-+	 * there is no free space in found leaf
-+	 * we're gonna add new leaf in the tree
-+	 */
-+	err = ext3_ext_create_new_leaf(handle, tree, path, newext);
-+	if (err)
-+		goto cleanup;
-+	depth = EXT_DEPTH(tree);
-+	eh = path[depth].p_hdr;
-+
-+has_space:
-+	nearex = path[depth].p_ext;
-+
-+	if ((err = ext3_ext_get_access(handle, tree, path + depth)))
-+		goto cleanup;
-+
-+	if (!nearex) {
-+		/* there is no extent in this leaf, create first one */
-+		ext_debug(tree, "first extent in the leaf: %d:%d:%d\n",
-+			  newext->ee_block, newext->ee_start,
-+			  newext->ee_len);
-+		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
-+	} else if (newext->ee_block > nearex->ee_block) {
-+		EXT_ASSERT(newext->ee_block != nearex->ee_block);
-+		if (nearex != EXT_LAST_EXTENT(eh)) {
-+			len = EXT_MAX_EXTENT(eh) - nearex;
-+			len = (len - 1) * sizeof(struct ext3_extent);
-+			len = len < 0 ? 0 : len;
-+			ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, "
-+				  "move %d from 0x%p to 0x%p\n",
-+				  newext->ee_block, newext->ee_start,
-+				  newext->ee_len,
-+				  nearex, len, nearex + 1, nearex + 2);
-+			memmove(nearex + 2, nearex + 1, len);
-+		}
-+		path[depth].p_ext = nearex + 1;
-+	} else {
-+		EXT_ASSERT(newext->ee_block != nearex->ee_block);
-+		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
-+		len = len < 0 ? 0 : len;
-+		/* the trace names the same source/destination the memmove
-+		 * below uses: nearex is shifted to nearex + 1 */
-+		ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, "
-+			  "move %d from 0x%p to 0x%p\n",
-+			  newext->ee_block, newext->ee_start, newext->ee_len,
-+			  nearex, len, nearex, nearex + 1);
-+		memmove(nearex + 1, nearex, len);
-+		path[depth].p_ext = nearex;
-+	}
-+
-+	eh->eh_entries++;
-+	nearex = path[depth].p_ext;
-+	nearex->ee_block = newext->ee_block;
-+	nearex->ee_start = newext->ee_start;
-+	nearex->ee_len = newext->ee_len;
-+	/* FIXME: support for large fs */
-+	nearex->ee_start_hi = 0;
-+
-+merge:
-+	/* try to merge extents to the right */
-+	while (nearex < EXT_LAST_EXTENT(eh)) {
-+		if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1))
-+			break;
-+		/* merge with next extent! */
-+		nearex->ee_len += nearex[1].ee_len;
-+		if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
-+			len = (EXT_LAST_EXTENT(eh) - nearex - 1) *
-+				sizeof(struct ext3_extent);
-+			memmove(nearex + 1, nearex + 2, len);
-+		}
-+		eh->eh_entries--;
-+		EXT_ASSERT(eh->eh_entries > 0);
-+	}
-+
-+	/* try to merge extents to the left */
-+
-+	/* time to correct all indexes above */
-+	err = ext3_ext_correct_indexes(handle, tree, path);
-+	if (err)
-+		goto cleanup;
-+
-+	err = ext3_ext_dirty(handle, tree, path + depth);
-+
-+cleanup:
-+	if (npath) {
-+		ext3_ext_drop_refs(npath);
-+		kfree(npath);
-+	}
-+	ext3_ext_tree_changed(tree);
-+	ext3_ext_invalidate_cache(tree);
-+	return err;
-+}
-+
-+/*
-+ * walks the range [block, block + num) and calls func once for every
-+ * piece of it, handing over a struct ext3_ext_cache that describes either
-+ * a gap (EXT3_EXT_CACHE_GAP, ec_start == 0) or the whole extent covering
-+ * the current block (EXT3_EXT_CACHE_EXTENT).
-+ *
-+ * func steers the walk through its return value:
-+ *  - negative:   stop, the value is returned to the caller
-+ *  - EXT_REPEAT: look the same block up again
-+ *  - EXT_BREAK:  stop, 0 is returned
-+ *  - otherwise:  continue right behind the reported piece
-+ */
-+int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block,
-+			unsigned long num, ext_prepare_callback func)
-+{
-+	struct ext3_ext_path *path = NULL;
-+	struct ext3_ext_cache cbex;
-+	struct ext3_extent *ex;
-+	unsigned long next, start = 0, end = 0;
-+	unsigned long last = block + num;
-+	int depth, exists, err = 0;
-+
-+	EXT_ASSERT(tree);
-+	EXT_ASSERT(func);
-+	EXT_ASSERT(tree->inode);
-+	EXT_ASSERT(tree->root);
-+
-+	while (block < last && block != EXT_MAX_BLOCK) {
-+		num = last - block;
-+		/* find extent for this block */
-+		path = ext3_ext_find_extent(tree, block, path);
-+		if (IS_ERR(path)) {
-+			err = PTR_ERR(path);
-+			path = NULL;
-+			break;
-+		}
-+
-+		depth = EXT_DEPTH(tree);
-+		EXT_ASSERT(path[depth].p_hdr);
-+		ex = path[depth].p_ext;
-+		next = ext3_ext_next_allocated_block(path);
-+
-+		exists = 0;
-+		if (!ex) {
-+			/* there is no extent yet, so try to allocate
-+			 * all requested space */
-+			start = block;
-+			end = block + num;
-+		} else if (ex->ee_block > block) {
-+			/* need to allocate space before found extent */
-+			start = block;
-+			end = ex->ee_block;
-+			if (block + num < end)
-+				end = block + num;
-+		} else if (block >= ex->ee_block + ex->ee_len) {
-+			/* need to allocate space after found extent */
-+			start = block;
-+			end = block + num;
-+			if (end >= next)
-+				end = next;
-+		} else if (block >= ex->ee_block) {
-+			/*
-+			 * some part of requested space is covered
-+			 * by found extent
-+			 */
-+			start = block;
-+			end = ex->ee_block + ex->ee_len;
-+			if (block + num < end)
-+				end = block + num;
-+			exists = 1;
-+		} else {
-+			BUG();
-+		}
-+		EXT_ASSERT(end > start);
-+
-+		if (!exists) {
-+			cbex.ec_block = start;
-+			cbex.ec_len = end - start;
-+			cbex.ec_start = 0;
-+			cbex.ec_type = EXT3_EXT_CACHE_GAP;
-+		} else {
-+			/* the callback sees the extent as stored in the
-+			 * leaf, not clipped to the requested range */
-+			cbex.ec_block = ex->ee_block;
-+			cbex.ec_len = ex->ee_len;
-+			cbex.ec_start = ex->ee_start;
-+			cbex.ec_type = EXT3_EXT_CACHE_EXTENT;
-+		}
-+
-+		EXT_ASSERT(cbex.ec_len > 0);
-+		EXT_ASSERT(path[depth].p_hdr);
-+		err = func(tree, path, &cbex);
-+		ext3_ext_drop_refs(path);
-+
-+		if (err < 0)
-+			break;
-+		if (err == EXT_REPEAT)
-+			continue;
-+		else if (err == EXT_BREAK) {
-+			err = 0;
-+			break;
-+		}
-+
-+		if (EXT_DEPTH(tree) != depth) {
-+			/* depth was changed. we have to realloc path */
-+			kfree(path);
-+			path = NULL;
-+		}
-+
-+		block = cbex.ec_block + cbex.ec_len;
-+	}
-+
-+	if (path) {
-+		ext3_ext_drop_refs(path);
-+		kfree(path);
-+	}
-+
-+	return err;
-+}
-+
-+/*
-+ * stores block/len/start/type in the tree's cache slot;
-+ * does nothing (besides the assertion) when the tree has no slot
-+ */
-+static inline void
-+ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block,
-+			__u32 len, __u32 start, int type)
-+{
-+	EXT_ASSERT(len > 0);
-+	if (tree->cex) {
-+		tree->cex->ec_type = type;
-+		tree->cex->ec_block = block;
-+		tree->cex->ec_len = len;
-+		tree->cex->ec_start = start;
-+	}
-+}
-+
-+/*
-+ * this routine calculate boundaries of the gap requested block fits into
-+ * and cache this gap
-+ *
-+ * block must not be covered by the extent the path points to: that case
-+ * ends in BUG()
-+ */
-+static inline void
-+ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree,
-+				struct ext3_ext_path *path,
-+				unsigned long block)
-+{
-+	int depth = EXT_DEPTH(tree);
-+	unsigned long lblock, len;
-+	struct ext3_extent *ex;
-+
-+	if (!tree->cex)
-+		return;
-+
-+	ex = path[depth].p_ext;
-+	if (ex == NULL) {
-+		/* there is no extent yet, so gap is [0;-] */
-+		lblock = 0;
-+		len = EXT_MAX_BLOCK;
-+		ext_debug(tree, "cache gap(whole file):");
-+	} else if (block < ex->ee_block) {
-+		lblock = block;
-+		len = ex->ee_block - block;
-+		ext_debug(tree, "cache gap(before): %lu [%lu:%lu]",
-+			  (unsigned long) block,
-+			  (unsigned long) ex->ee_block,
-+			  (unsigned long) ex->ee_len);
-+	} else if (block >= ex->ee_block + ex->ee_len) {
-+		lblock = ex->ee_block + ex->ee_len;
-+		/* len temporarily holds the end of the gap */
-+		len = ext3_ext_next_allocated_block(path);
-+		ext_debug(tree, "cache gap(after): [%lu:%lu] %lu",
-+			  (unsigned long) ex->ee_block,
-+			  (unsigned long) ex->ee_len,
-+			  (unsigned long) block);
-+		EXT_ASSERT(len > lblock);
-+		len = len - lblock;
-+	} else {
-+		lblock = len = 0;
-+		BUG();
-+	}
-+
-+	ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len);
-+	ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP);
-+}
-+
-+/*
-+ * looks block up in the tree's cache slot.  on a hit the cached range is
-+ * copied into ex (ee_start_hi is set to 0) and the cached type is
-+ * returned; EXT3_EXT_CACHE_NO is returned when there is no slot, the slot
-+ * holds no valid data or block lies outside the cached range
-+ */
-+static inline int
-+ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block,
-+			struct ext3_extent *ex)
-+{
-+	struct ext3_ext_cache *cex = tree->cex;
-+
-+	/* is there cache storage at all? */
-+	if (!cex)
-+		return EXT3_EXT_CACHE_NO;
-+
-+	/* has cache valid data? */
-+	if (cex->ec_type == EXT3_EXT_CACHE_NO)
-+		return EXT3_EXT_CACHE_NO;
-+
-+	EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP ||
-+			cex->ec_type == EXT3_EXT_CACHE_EXTENT);
-+	if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
-+		ex->ee_block = cex->ec_block;
-+		ex->ee_start = cex->ec_start;
-+		ex->ee_start_hi = 0;
-+		ex->ee_len = cex->ec_len;
-+		ext_debug(tree, "%lu cached by %lu:%lu:%lu\n",
-+			  (unsigned long) block,
-+			  (unsigned long) ex->ee_block,
-+			  (unsigned long) ex->ee_len,
-+			  (unsigned long) ex->ee_start);
-+		return cex->ec_type;
-+	}
-+
-+	/* not in cache */
-+	return EXT3_EXT_CACHE_NO;
-+}
-+
-+/*
-+ * routine removes index from the index block
-+ * it's used in truncate case only. 
thus all requests are for -+ * last index in the block only -+ */ -+int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ struct buffer_head *bh; -+ int err; -+ -+ /* free index block */ -+ path--; -+ EXT_ASSERT(path->p_hdr->eh_entries); -+ if ((err = ext3_ext_get_access(handle, tree, path))) -+ return err; -+ path->p_hdr->eh_entries--; -+ if ((err = ext3_ext_dirty(handle, tree, path))) -+ return err; -+ ext_debug(tree, "index is empty, remove it, free block %d\n", -+ path->p_idx->ei_leaf); -+ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); -+ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); -+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); -+ return err; -+} -+ -+int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int depth = EXT_DEPTH(tree); -+ int needed; -+ -+ if (path) { -+ /* probably there is space in leaf? */ -+ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) -+ return 1; -+ } -+ -+ /* -+ * the worste case we're expecting is creation of the -+ * new root (growing in depth) with index splitting -+ * for splitting we have to consider depth + 1 because -+ * previous growing could increase it -+ */ -+ depth = depth + 1; -+ -+ /* -+ * growing in depth: -+ * block allocation + new root + old root -+ */ -+ needed = EXT3_ALLOC_NEEDED + 2; -+ -+ /* index split. 
we may need: -+ * allocate intermediate indexes and new leaf -+ * change two blocks at each level, but root -+ * modify root block (inode) -+ */ -+ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; -+ -+ return needed; -+} -+ -+static int -+ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, unsigned long start, -+ unsigned long end) -+{ -+ struct ext3_extent *ex, tex; -+ struct ext3_ext_path *npath; -+ int depth, creds, err; -+ -+ depth = EXT_DEPTH(tree); -+ ex = path[depth].p_ext; -+ EXT_ASSERT(ex); -+ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); -+ EXT_ASSERT(ex->ee_block < start); -+ -+ /* calculate tail extent */ -+ tex.ee_block = end + 1; -+ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); -+ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; -+ -+ creds = ext3_ext_calc_credits_for_insert(tree, path); -+ handle = ext3_ext_journal_restart(handle, creds); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ /* calculate head extent. use primary extent */ -+ err = ext3_ext_get_access(handle, tree, path + depth); -+ if (err) -+ return err; -+ ex->ee_len = start - ex->ee_block; -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ if (err) -+ return err; -+ -+ /* FIXME: some callback to free underlying resource -+ * and correct ee_start? 
*/ -+ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", -+ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); -+ -+ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); -+ if (IS_ERR(npath)) -+ return PTR_ERR(npath); -+ depth = EXT_DEPTH(tree); -+ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); -+ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); -+ -+ err = ext3_ext_insert_extent(handle, tree, npath, &tex); -+ ext3_ext_drop_refs(npath); -+ kfree(npath); -+ -+ return err; -+} -+ -+static int -+ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, unsigned long start, -+ unsigned long end) -+{ -+ struct ext3_extent *ex, *fu = NULL, *lu, *le; -+ int err = 0, correct_index = 0; -+ int depth = EXT_DEPTH(tree), credits; -+ struct ext3_extent_header *eh; -+ unsigned a, b, block, num; -+ -+ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); -+ if (!path[depth].p_hdr) -+ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); -+ eh = path[depth].p_hdr; -+ EXT_ASSERT(eh); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ -+ /* find where to start removing */ -+ le = ex = EXT_LAST_EXTENT(eh); -+ while (ex != EXT_FIRST_EXTENT(eh)) { -+ if (ex->ee_block <= end) -+ break; -+ ex--; -+ } -+ -+ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { -+ /* removal of internal part of the extent requested -+ * tail and head must be placed in different extent -+ * so, we have to insert one more extent */ -+ path[depth].p_ext = ex; -+ return ext3_ext_split_for_rm(handle, tree, path, start, end); -+ } -+ -+ lu = ex; -+ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) { -+ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); -+ path[depth].p_ext = ex; -+ -+ a = ex->ee_block > start ? ex->ee_block : start; -+ b = ex->ee_block + ex->ee_len - 1 < end ? 
-+ ex->ee_block + ex->ee_len - 1 : end; -+ -+ ext_debug(tree, " border %u:%u\n", a, b); -+ -+ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { -+ block = 0; -+ num = 0; -+ BUG(); -+ } else if (a != ex->ee_block) { -+ /* remove tail of the extent */ -+ block = ex->ee_block; -+ num = a - block; -+ } else if (b != ex->ee_block + ex->ee_len - 1) { -+ /* remove head of the extent */ -+ block = a; -+ num = b - a; -+ } else { -+ /* remove whole extent: excelent! */ -+ block = ex->ee_block; -+ num = 0; -+ EXT_ASSERT(a == ex->ee_block && -+ b == ex->ee_block + ex->ee_len - 1); -+ } -+ -+ if (ex == EXT_FIRST_EXTENT(eh)) -+ correct_index = 1; -+ -+ credits = 1; -+ if (correct_index) -+ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; -+ if (tree->ops->remove_extent_credits) -+ credits+=tree->ops->remove_extent_credits(tree,ex,a,b); -+ -+ handle = ext3_ext_journal_restart(handle, credits); -+ if (IS_ERR(handle)) { -+ err = PTR_ERR(handle); -+ goto out; -+ } -+ -+ err = ext3_ext_get_access(handle, tree, path + depth); -+ if (err) -+ goto out; -+ -+ if (tree->ops->remove_extent) -+ err = tree->ops->remove_extent(tree, ex, a, b); -+ if (err) -+ goto out; -+ -+ if (num == 0) { -+ /* this extent is removed entirely mark slot unused */ -+ ex->ee_start = ex->ee_start_hi = 0; -+ eh->eh_entries--; -+ fu = ex; -+ } -+ -+ ex->ee_block = block; -+ ex->ee_len = num; -+ -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ if (err) -+ goto out; -+ -+ ext_debug(tree, "new extent: %u:%u:%u\n", -+ ex->ee_block, ex->ee_len, ex->ee_start); -+ ex--; -+ } -+ -+ if (fu) { -+ /* reuse unused slots */ -+ while (lu < le) { -+ if (lu->ee_start) { -+ *fu = *lu; -+ lu->ee_start = lu->ee_start_hi = 0; -+ fu++; -+ } -+ lu++; -+ } -+ } -+ -+ if (correct_index && eh->eh_entries) -+ err = ext3_ext_correct_indexes(handle, tree, path); -+ -+ /* if this leaf is free, then we should -+ * remove it from index block above */ -+ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) -+ 
err = ext3_ext_rm_idx(handle, tree, path + depth); -+ -+out: -+ return err; -+} -+ -+ -+static struct ext3_extent_idx * -+ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) -+{ -+ struct ext3_extent_idx *ix; -+ -+ ix = EXT_LAST_INDEX(hdr); -+ while (ix != EXT_FIRST_INDEX(hdr)) { -+ if (ix->ei_block <= block) -+ break; -+ ix--; -+ } -+ return ix; -+} -+ -+/* -+ * returns 1 if current index have to be freed (even partial) -+ */ -+static int inline -+ext3_ext_more_to_rm(struct ext3_ext_path *path) -+{ -+ EXT_ASSERT(path->p_idx); -+ -+ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) -+ return 0; -+ -+ /* -+ * if truncate on deeper level happened it it wasn't partial -+ * so we have to consider current index for truncation -+ */ -+ if (path->p_hdr->eh_entries == path->p_block) -+ return 0; -+ return 1; -+} -+ -+int ext3_ext_remove_space(struct ext3_extents_tree *tree, -+ unsigned long start, unsigned long end) -+{ -+ struct inode *inode = tree->inode; -+ struct super_block *sb = inode->i_sb; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_ext_path *path; -+ handle_t *handle; -+ int i = 0, err = 0; -+ -+ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); -+ -+ /* probably first extent we're gonna free will be last in block */ -+ handle = ext3_journal_start(inode, depth + 1); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ ext3_ext_invalidate_cache(tree); -+ -+ /* -+ * we start scanning from right side freeing all the blocks -+ * after i_size and walking into the deep -+ */ -+ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); -+ if (IS_ERR(path)) { -+ ext3_error(sb, __FUNCTION__, "Can't allocate path array"); -+ ext3_journal_stop(handle); -+ return -ENOMEM; -+ } -+ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); -+ path[i].p_hdr = EXT_ROOT_HDR(tree); -+ -+ while (i >= 0 && err == 0) { -+ if (i == depth) { -+ /* this is leaf block */ -+ err = ext3_ext_rm_leaf(handle, tree, path, start, end); -+ 
/* root level have p_bh == NULL, brelse() eats this */ -+ brelse(path[i].p_bh); -+ i--; -+ continue; -+ } -+ -+ /* this is index block */ -+ if (!path[i].p_hdr) { -+ ext_debug(tree, "initialize header\n"); -+ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); -+ } -+ -+ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); -+ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); -+ -+ if (!path[i].p_idx) { -+ /* this level hasn't touched yet */ -+ path[i].p_idx = -+ ext3_ext_last_covered(path[i].p_hdr, end); -+ path[i].p_block = path[i].p_hdr->eh_entries + 1; -+ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", -+ path[i].p_hdr, path[i].p_hdr->eh_entries); -+ } else { -+ /* we've already was here, see at next index */ -+ path[i].p_idx--; -+ } -+ -+ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", -+ i, EXT_FIRST_INDEX(path[i].p_hdr), -+ path[i].p_idx); -+ if (ext3_ext_more_to_rm(path + i)) { -+ /* go to the next level */ -+ ext_debug(tree, "move to level %d (block %d)\n", -+ i + 1, path[i].p_idx->ei_leaf); -+ memset(path + i + 1, 0, sizeof(*path)); -+ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); -+ if (!path[i+1].p_bh) { -+ /* should we reset i_size? 
*/ -+ err = -EIO; -+ break; -+ } -+ /* put actual number of indexes to know is this -+ * number got changed at the next iteration */ -+ path[i].p_block = path[i].p_hdr->eh_entries; -+ i++; -+ } else { -+ /* we finish processing this index, go up */ -+ if (path[i].p_hdr->eh_entries == 0 && i > 0) { -+ /* index is empty, remove it -+ * handle must be already prepared by the -+ * truncatei_leaf() */ -+ err = ext3_ext_rm_idx(handle, tree, path + i); -+ } -+ /* root level have p_bh == NULL, brelse() eats this */ -+ brelse(path[i].p_bh); -+ i--; -+ ext_debug(tree, "return to level %d\n", i); -+ } -+ } -+ -+ /* TODO: flexible tree reduction should be here */ -+ if (path->p_hdr->eh_entries == 0) { -+ /* -+ * truncate to zero freed all the tree -+ * so, we need to correct eh_depth -+ */ -+ err = ext3_ext_get_access(handle, tree, path); -+ if (err == 0) { -+ EXT_ROOT_HDR(tree)->eh_depth = 0; -+ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree); -+ err = ext3_ext_dirty(handle, tree, path); -+ } -+ } -+ ext3_ext_tree_changed(tree); -+ -+ kfree(path); -+ ext3_journal_stop(handle); -+ -+ return err; -+} -+ -+int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks) -+{ -+ int lcap, icap, rcap, leafs, idxs, num; -+ -+ rcap = ext3_ext_space_root(tree); -+ if (blocks <= rcap) { -+ /* all extents fit to the root */ -+ return 0; -+ } -+ -+ rcap = ext3_ext_space_root_idx(tree); -+ lcap = ext3_ext_space_block(tree); -+ icap = ext3_ext_space_block_idx(tree); -+ -+ num = leafs = (blocks + lcap - 1) / lcap; -+ if (leafs <= rcap) { -+ /* all pointers to leafs fit to the root */ -+ return leafs; -+ } -+ -+ /* ok. 
we need separate index block(s) to link all leaf blocks */ -+ idxs = (leafs + icap - 1) / icap; -+ do { -+ num += idxs; -+ idxs = (idxs + icap - 1) / icap; -+ } while (idxs > rcap); -+ -+ return num; -+} -+ -+/* -+ * called at mount time -+ */ -+void ext3_ext_init(struct super_block *sb) -+{ -+ /* -+ * possible initialization would be here -+ */ -+ -+ if (test_opt(sb, EXTENTS)) { -+ printk("EXT3-fs: file extents enabled"); -+#ifdef AGRESSIVE_TEST -+ printk(", agressive tests"); -+#endif -+#ifdef CHECK_BINSEARCH -+ printk(", check binsearch"); -+#endif -+ printk("\n"); -+ } -+} -+ -+/* -+ * called at umount time -+ */ -+void ext3_ext_release(struct super_block *sb) -+{ -+} -+ -+/************************************************************************ -+ * VFS related routines -+ ************************************************************************/ -+ -+static int ext3_get_inode_write_access(handle_t *handle, void *buffer) -+{ -+ /* we use in-core data, not bh */ -+ return 0; -+} -+ -+static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) -+{ -+ struct inode *inode = buffer; -+ return ext3_mark_inode_dirty(handle, inode); -+} -+ -+static int ext3_ext_mergable(struct ext3_extent *ex1, -+ struct ext3_extent *ex2) -+{ -+ /* FIXME: support for large fs */ -+ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) -+ return 1; -+ return 0; -+} -+ -+static int -+ext3_remove_blocks_credits(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex, -+ unsigned long from, unsigned long to) -+{ -+ int needed; -+ -+ /* at present, extent can't cross block group */; -+ needed = 4; /* bitmap + group desc + sb + inode */ -+ -+#ifdef CONFIG_QUOTA -+ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; -+#endif -+ return needed; -+} -+ -+static int -+ext3_remove_blocks(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex, -+ unsigned long from, unsigned long to) -+{ -+ int needed = ext3_remove_blocks_credits(tree, ex, from, to); -+ handle_t *handle = 
ext3_journal_start(tree->inode, needed); -+ struct buffer_head *bh; -+ int i; -+ -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { -+ /* tail removal */ -+ unsigned long num, start; -+ num = ex->ee_block + ex->ee_len - from; -+ start = ex->ee_start + ex->ee_len - num; -+ ext_debug(tree, "free last %lu blocks starting %lu\n", -+ num, start); -+ for (i = 0; i < num; i++) { -+ bh = sb_find_get_block(tree->inode->i_sb, start + i); -+ ext3_forget(handle, 0, tree->inode, bh, start + i); -+ } -+ ext3_free_blocks(handle, tree->inode, start, num); -+ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { -+ printk("strange request: removal %lu-%lu from %u:%u\n", -+ from, to, ex->ee_block, ex->ee_len); -+ } else { -+ printk("strange request: removal(2) %lu-%lu from %u:%u\n", -+ from, to, ex->ee_block, ex->ee_len); -+ } -+ ext3_journal_stop(handle); -+ return 0; -+} -+ -+static int ext3_ext_find_goal(struct inode *inode, -+ struct ext3_ext_path *path, unsigned long block) -+{ -+ struct ext3_inode_info *ei = EXT3_I(inode); -+ unsigned long bg_start; -+ unsigned long colour; -+ int depth; -+ -+ if (path) { -+ struct ext3_extent *ex; -+ depth = path->p_depth; -+ -+ /* try to predict block placement */ -+ if ((ex = path[depth].p_ext)) -+ return ex->ee_start + (block - ex->ee_block); -+ -+ /* it looks index is empty -+ * try to find starting from index itself */ -+ if (path[depth].p_bh) -+ return path[depth].p_bh->b_blocknr; -+ } -+ -+ /* OK. 
use inode's group */ -+ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + -+ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); -+ colour = (current->pid % 16) * -+ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); -+ return bg_start + colour + block; -+} -+ -+static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *ex, int *err) -+{ -+ struct inode *inode = tree->inode; -+ int newblock, goal; -+ -+ EXT_ASSERT(path); -+ EXT_ASSERT(ex); -+ EXT_ASSERT(ex->ee_start); -+ EXT_ASSERT(ex->ee_len); -+ -+ /* reuse block from the extent to order data/metadata */ -+ newblock = ex->ee_start++; -+ ex->ee_len--; -+ if (ex->ee_len == 0) { -+ ex->ee_len = 1; -+ /* allocate new block for the extent */ -+ goal = ext3_ext_find_goal(inode, path, ex->ee_block); -+ ex->ee_start = ext3_new_block(handle, inode, goal, err); -+ ex->ee_start_hi = 0; -+ if (ex->ee_start == 0) { -+ /* error occured: restore old extent */ -+ ex->ee_start = newblock; -+ return 0; -+ } -+ } -+ return newblock; -+} -+ -+static struct ext3_extents_helpers ext3_blockmap_helpers = { -+ .get_write_access = ext3_get_inode_write_access, -+ .mark_buffer_dirty = ext3_mark_buffer_dirty, -+ .mergable = ext3_ext_mergable, -+ .new_block = ext3_new_block_cb, -+ .remove_extent = ext3_remove_blocks, -+ .remove_extent_credits = ext3_remove_blocks_credits, -+}; -+ -+void ext3_init_tree_desc(struct ext3_extents_tree *tree, -+ struct inode *inode) -+{ -+ tree->inode = inode; -+ tree->root = (void *) EXT3_I(inode)->i_data; -+ tree->buffer = (void *) inode; -+ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); -+ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; -+ tree->ops = &ext3_blockmap_helpers; -+} -+ -+int ext3_ext_get_block(handle_t *handle, struct inode *inode, -+ long iblock, struct buffer_head *bh_result, -+ int create, int extend_disksize) -+{ -+ struct ext3_ext_path *path = NULL; -+ struct ext3_extent 
newex; -+ struct ext3_extent *ex; -+ int goal, newblock, err = 0, depth; -+ struct ext3_extents_tree tree; -+ -+ clear_buffer_new(bh_result); -+ ext3_init_tree_desc(&tree, inode); -+ ext_debug(&tree, "block %d requested for inode %u\n", -+ (int) iblock, (unsigned) inode->i_ino); -+ down(&EXT3_I(inode)->truncate_sem); -+ -+ /* check in cache */ -+ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { -+ if (goal == EXT3_EXT_CACHE_GAP) { -+ if (!create) { -+ /* block isn't allocated yet and -+ * user don't want to allocate it */ -+ goto out2; -+ } -+ /* we should allocate requested block */ -+ } else if (goal == EXT3_EXT_CACHE_EXTENT) { -+ /* block is already allocated */ -+ newblock = iblock - newex.ee_block + newex.ee_start; -+ goto out; -+ } else { -+ EXT_ASSERT(0); -+ } -+ } -+ -+ /* find extent for this block */ -+ path = ext3_ext_find_extent(&tree, iblock, NULL); -+ if (IS_ERR(path)) { -+ err = PTR_ERR(path); -+ path = NULL; -+ goto out2; -+ } -+ -+ depth = EXT_DEPTH(&tree); -+ -+ /* -+ * consistent leaf must not be empty -+ * this situations is possible, though, _during_ tree modification -+ * this is why assert can't be put in ext3_ext_find_extent() -+ */ -+ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); -+ -+ if ((ex = path[depth].p_ext)) { -+ /* if found exent covers block, simple return it */ -+ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { -+ newblock = iblock - ex->ee_block + ex->ee_start; -+ ext_debug(&tree, "%d fit into %d:%d -> %d\n", -+ (int) iblock, ex->ee_block, ex->ee_len, -+ newblock); -+ ext3_ext_put_in_cache(&tree, ex->ee_block, -+ ex->ee_len, ex->ee_start, -+ EXT3_EXT_CACHE_EXTENT); -+ goto out; -+ } -+ } -+ -+ /* -+ * requested block isn't allocated yet -+ * we couldn't try to create block if create flag is zero -+ */ -+ if (!create) { -+ /* put just found gap into cache to speedup subsequest reqs */ -+ ext3_ext_put_gap_in_cache(&tree, path, iblock); -+ goto out2; -+ } -+ -+ /* allocate new block */ -+ goal = 
ext3_ext_find_goal(inode, path, iblock); -+ newblock = ext3_new_block(handle, inode, goal, &err); -+ if (!newblock) -+ goto out2; -+ ext_debug(&tree, "allocate new block: goal %d, found %d\n", -+ goal, newblock); -+ -+ /* try to insert new extent into found leaf and return */ -+ newex.ee_block = iblock; -+ newex.ee_start = newblock; -+ newex.ee_start_hi = 0; -+ newex.ee_len = 1; -+ err = ext3_ext_insert_extent(handle, &tree, path, &newex); -+ if (err) -+ goto out2; -+ -+ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ -+ /* previous routine could use block we allocated */ -+ newblock = newex.ee_start; -+ set_buffer_new(bh_result); -+ -+ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, -+ newex.ee_start, EXT3_EXT_CACHE_EXTENT); -+out: -+ ext3_ext_show_leaf(&tree, path); -+ map_bh(bh_result, inode->i_sb, newblock); -+out2: -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ up(&EXT3_I(inode)->truncate_sem); -+ -+ return err; -+} -+ -+void ext3_ext_truncate(struct inode * inode, struct page *page) -+{ -+ struct address_space *mapping = inode->i_mapping; -+ struct super_block *sb = inode->i_sb; -+ struct ext3_extents_tree tree; -+ unsigned long last_block; -+ handle_t *handle; -+ int err = 0; -+ -+ ext3_init_tree_desc(&tree, inode); -+ -+ /* -+ * probably first extent we're gonna free will be last in block -+ */ -+ err = ext3_writepage_trans_blocks(inode) + 3; -+ handle = ext3_journal_start(inode, err); -+ if (IS_ERR(handle)) { -+ if (page) { -+ clear_highpage(page); -+ flush_dcache_page(page); -+ unlock_page(page); -+ page_cache_release(page); -+ } -+ return; -+ } -+ -+ if (page) -+ ext3_block_truncate_page(handle, page, mapping, inode->i_size); -+ -+ down(&EXT3_I(inode)->truncate_sem); -+ ext3_ext_invalidate_cache(&tree); -+ -+ /* -+ * TODO: optimization is possible here -+ * probably we need not scaning at all, -+ * because page truncation is enough -+ */ -+ if 
(ext3_orphan_add(handle, inode)) -+ goto out_stop; -+ -+ /* we have to know where to truncate from in crash case */ -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ ext3_mark_inode_dirty(handle, inode); -+ -+ last_block = (inode->i_size + sb->s_blocksize - 1) >> -+ EXT3_BLOCK_SIZE_BITS(sb); -+ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); -+ -+ /* In a multi-transaction truncate, we only make the final -+ * transaction synchronous */ -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ -+out_stop: -+ /* -+ * If this was a simple ftruncate(), and the file will remain alive -+ * then we need to clear up the orphan record which we created above. -+ * However, if this was a real unlink then we were called by -+ * ext3_delete_inode(), and we allow that function to clean up the -+ * orphan info for us. -+ */ -+ if (inode->i_nlink) -+ ext3_orphan_del(handle, inode); -+ -+ up(&EXT3_I(inode)->truncate_sem); -+ ext3_journal_stop(handle); -+} -+ -+/* -+ * this routine calculate max number of blocks we could modify -+ * in order to allocate new block for an inode -+ */ -+int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) -+{ -+ struct ext3_extents_tree tree; -+ int needed; -+ -+ ext3_init_tree_desc(&tree, inode); -+ -+ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); -+ -+ /* caller want to allocate num blocks */ -+ needed *= num; -+ -+#ifdef CONFIG_QUOTA -+ /* -+ * FIXME: real calculation should be here -+ * it depends on blockmap format of qouta file -+ */ -+ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; -+#endif -+ -+ return needed; -+} -+ -+void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) -+{ -+ struct ext3_extents_tree tree; -+ -+ ext3_init_tree_desc(&tree, inode); -+ ext3_extent_tree_init(handle, &tree); -+} -+ -+int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) -+{ -+ struct ext3_extents_tree tree; -+ -+ ext3_init_tree_desc(&tree, inode); -+ return ext3_ext_calc_metadata_amount(&tree, 
blocks); -+} -+ -+static int -+ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_ext_cache *newex) -+{ -+ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; -+ -+ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) -+ return EXT_CONTINUE; -+ -+ if (buf->err < 0) -+ return EXT_BREAK; -+ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) -+ return EXT_BREAK; -+ -+ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { -+ buf->err++; -+ buf->cur += sizeof(*newex); -+ } else { -+ buf->err = -EFAULT; -+ return EXT_BREAK; -+ } -+ return EXT_CONTINUE; -+} -+ -+static int -+ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_ext_cache *ex) -+{ -+ struct ext3_extent_tree_stats *buf = -+ (struct ext3_extent_tree_stats *) tree->private; -+ int depth; -+ -+ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) -+ return EXT_CONTINUE; -+ -+ depth = EXT_DEPTH(tree); -+ buf->extents_num++; -+ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) -+ buf->leaf_num++; -+ return EXT_CONTINUE; -+} -+ -+int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, -+ unsigned long arg) -+{ -+ int err = 0; -+ -+ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) -+ return -EINVAL; -+ -+ if (cmd == EXT3_IOC_GET_EXTENTS) { -+ struct ext3_extent_buf buf; -+ struct ext3_extents_tree tree; -+ -+ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) -+ return -EFAULT; -+ -+ ext3_init_tree_desc(&tree, inode); -+ buf.cur = buf.buffer; -+ buf.err = 0; -+ tree.private = &buf; -+ down(&EXT3_I(inode)->truncate_sem); -+ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, -+ ext3_ext_store_extent_cb); -+ up(&EXT3_I(inode)->truncate_sem); -+ if (err == 0) -+ err = buf.err; -+ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { -+ struct ext3_extent_tree_stats buf; -+ struct ext3_extents_tree tree; -+ -+ ext3_init_tree_desc(&tree, inode); -+ 
down(&EXT3_I(inode)->truncate_sem); -+ buf.depth = EXT_DEPTH(&tree); -+ buf.extents_num = 0; -+ buf.leaf_num = 0; -+ tree.private = &buf; -+ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, -+ ext3_ext_collect_stats_cb); -+ up(&EXT3_I(inode)->truncate_sem); -+ if (!err) -+ err = copy_to_user((void *) arg, &buf, sizeof(buf)); -+ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { -+ struct ext3_extents_tree tree; -+ ext3_init_tree_desc(&tree, inode); -+ down(&EXT3_I(inode)->truncate_sem); -+ err = EXT_DEPTH(&tree); -+ up(&EXT3_I(inode)->truncate_sem); -+ } -+ -+ return err; -+} -+ -+EXPORT_SYMBOL(ext3_init_tree_desc); -+EXPORT_SYMBOL(ext3_mark_inode_dirty); -+EXPORT_SYMBOL(ext3_ext_invalidate_cache); -+EXPORT_SYMBOL(ext3_ext_insert_extent); -+EXPORT_SYMBOL(ext3_ext_walk_space); -+EXPORT_SYMBOL(ext3_ext_find_goal); -+EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); -Index: linux-2.6.12-rc6/fs/ext3/ialloc.c -=================================================================== ---- linux-2.6.12-rc6.orig/fs/ext3/ialloc.c 2005-06-14 16:31:08.634433030 +0200 -+++ linux-2.6.12-rc6/fs/ext3/ialloc.c 2005-06-14 16:31:25.846346882 +0200 -@@ -598,7 +598,7 @@ - ei->i_dir_start_lookup = 0; - ei->i_disksize = 0; - -- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; -+ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); - /* dirsync only applies to directories */ -@@ -639,6 +639,18 @@ - DQUOT_FREE_INODE(inode); - goto fail2; - } -+ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { -+ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; -+ ext3_extents_initialize_blockmap(handle, inode); -+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { -+ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); -+ if (err) goto fail; -+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); -+ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); -+ err = 
ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); -+ } -+ } -+ - err = ext3_mark_inode_dirty(handle, inode); - if (err) { - ext3_std_error(sb, err); -Index: linux-2.6.12-rc6/fs/ext3/inode.c -=================================================================== ---- linux-2.6.12-rc6.orig/fs/ext3/inode.c 2005-06-14 16:31:09.701815830 +0200 -+++ linux-2.6.12-rc6/fs/ext3/inode.c 2005-06-14 16:31:25.861971882 +0200 -@@ -40,7 +40,7 @@ - #include "iopen.h" - #include "acl.h" - --static int ext3_writepage_trans_blocks(struct inode *inode); -+int ext3_writepage_trans_blocks(struct inode *inode); - - /* - * Test whether an inode is a fast symlink. -@@ -784,6 +784,17 @@ - return err; - } - -+static inline int -+ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, -+ struct buffer_head *bh, int create, int extend_disksize) -+{ -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_get_block(handle, inode, block, bh, create, -+ extend_disksize); -+ return ext3_get_block_handle(handle, inode, block, bh, create, -+ extend_disksize); -+} -+ - static int ext3_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) - { -@@ -794,8 +805,8 @@ - handle = ext3_journal_current_handle(); - J_ASSERT(handle != 0); - } -- ret = ext3_get_block_handle(handle, inode, iblock, -- bh_result, create, 1); -+ ret = ext3_get_block_wrap(handle, inode, iblock, -+ bh_result, create, 1); - return ret; - } - -@@ -839,7 +850,7 @@ - - get_block: - if (ret == 0) -- ret = ext3_get_block_handle(handle, inode, iblock, -+ ret = ext3_get_block_wrap(handle, inode, iblock, - bh_result, create, 0); - bh_result->b_size = (1 << inode->i_blkbits); - return ret; -@@ -859,7 +870,7 @@ - dummy.b_state = 0; - dummy.b_blocknr = -1000; - buffer_trace_init(&dummy.b_history); -- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); -+ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); - if (!*errp && 
buffer_mapped(&dummy)) { - struct buffer_head *bh; - bh = sb_getblk(inode->i_sb, dummy.b_blocknr); -@@ -1593,7 +1604,7 @@ - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ --static int ext3_block_truncate_page(handle_t *handle, struct page *page, -+int ext3_block_truncate_page(handle_t *handle, struct page *page, - struct address_space *mapping, loff_t from) - { - unsigned long index = from >> PAGE_CACHE_SHIFT; -@@ -2104,6 +2115,9 @@ - return; - } - -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_truncate(inode, page); -+ - handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { -@@ -2850,12 +2864,15 @@ - * block and work out the exact number of indirects which are touched. Pah. - */ - --static int ext3_writepage_trans_blocks(struct inode *inode) -+int ext3_writepage_trans_blocks(struct inode *inode) - { - int bpp = ext3_journal_blocks_per_page(inode); - int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 
5 : 3; - int ret; - -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_writepage_trans_blocks(inode, bpp); -+ - if (ext3_should_journal_data(inode)) - ret = 3 * (bpp + indirects) + 2; - else -Index: linux-2.6.12-rc6/fs/ext3/Makefile -=================================================================== ---- linux-2.6.12-rc6.orig/fs/ext3/Makefile 2005-06-14 16:31:09.179354899 +0200 -+++ linux-2.6.12-rc6/fs/ext3/Makefile 2005-06-14 16:31:25.872714069 +0200 -@@ -5,7 +5,8 @@ - obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ -- ioctl.o namei.o super.o symlink.o hash.o resize.o -+ ioctl.o namei.o super.o symlink.o hash.o resize.o \ -+ extents.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-2.6.12-rc6/fs/ext3/super.c -=================================================================== ---- linux-2.6.12-rc6.orig/fs/ext3/super.c 2005-06-14 16:31:09.950839264 +0200 -+++ linux-2.6.12-rc6/fs/ext3/super.c 2005-06-14 16:31:25.886385944 +0200 -@@ -387,6 +387,7 @@ - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); - if (!(sb->s_flags & MS_RDONLY)) { -@@ -451,6 +452,8 @@ - #endif - ei->i_block_alloc_info = NULL; - ei->vfs_inode.i_version = 1; -+ -+ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); - return &ei->vfs_inode; - } - -@@ -593,6 +596,7 @@ - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, - Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -+ Opt_extents, Opt_noextents, Opt_extdebug, - }; - - static match_table_t tokens = { -@@ -644,6 +647,9 @@ - {Opt_iopen, "iopen"}, - {Opt_noiopen, "noiopen"}, - {Opt_iopen_nopriv, "iopen_nopriv"}, -+ {Opt_extents, "extents"}, -+ {Opt_noextents, "noextents"}, -+ {Opt_extdebug, "extdebug"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL}, - 
{Opt_resize, "resize"}, -@@ -953,6 +958,15 @@ - case Opt_nobh: - set_opt(sbi->s_mount_opt, NOBH); - break; -+ case Opt_extents: -+ set_opt (sbi->s_mount_opt, EXTENTS); -+ break; -+ case Opt_noextents: -+ clear_opt (sbi->s_mount_opt, EXTENTS); -+ break; -+ case Opt_extdebug: -+ set_opt (sbi->s_mount_opt, EXTDEBUG); -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1668,6 +1681,7 @@ - percpu_counter_mod(&sbi->s_dirs_counter, - ext3_count_dirs(sb)); - -+ ext3_ext_init(sb); - lock_kernel(); - return 0; - -Index: linux-2.6.12-rc6/fs/ext3/ioctl.c -=================================================================== ---- linux-2.6.12-rc6.orig/fs/ext3/ioctl.c 2005-06-14 16:31:08.646151780 +0200 -+++ linux-2.6.12-rc6/fs/ext3/ioctl.c 2005-06-14 16:31:25.897128131 +0200 -@@ -124,6 +124,10 @@ - err = ext3_change_inode_journal_flag(inode, jflag); - return err; - } -+ case EXT3_IOC_GET_EXTENTS: -+ case EXT3_IOC_GET_TREE_STATS: -+ case EXT3_IOC_GET_TREE_DEPTH: -+ return ext3_ext_ioctl(inode, filp, cmd, arg); - case EXT3_IOC_GETVERSION: - case EXT3_IOC_GETVERSION_OLD: - return put_user(inode->i_generation, (int __user *) arg); -Index: linux-2.6.12-rc6/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.12-rc6.orig/include/linux/ext3_fs.h 2005-06-14 16:31:10.185214261 +0200 -+++ linux-2.6.12-rc6/include/linux/ext3_fs.h 2005-06-14 16:31:52.859041864 +0200 -@@ -186,8 +186,9 @@ - #define EXT3_NOTAIL_FL 0x00008000 /* don't merge file tail */ - #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ - #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ -+#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ - #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ - --#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ -+#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ - #define EXT3_FL_USER_MODIFIABLE 
0x000380FF /* User modifiable flags */ - -@@ -237,6 +238,9 @@ - #endif - #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) - #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) -+#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) -+#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) -+#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) - - /* - * Structure of an inode on the disk -@@ -360,6 +364,8 @@ - #define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */ - #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ -+#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ -+#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -548,11 +554,13 @@ - #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ - #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ - #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 -+#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ - - #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR - #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ - EXT3_FEATURE_INCOMPAT_RECOVER| \ -- EXT3_FEATURE_INCOMPAT_META_BG) -+ EXT3_FEATURE_INCOMPAT_META_BG| \ -+ EXT3_FEATURE_INCOMPAT_EXTENTS) - #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT3_FEATURE_RO_COMPAT_BTREE_DIR) -@@ -759,6 +767,9 @@ - - - /* inode.c */ -+extern int ext3_block_truncate_page(handle_t *, struct page *, -+ struct address_space *, loff_t); -+extern int ext3_writepage_trans_blocks(struct inode *inode); - extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); - extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); - extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); -@@ -828,6 +837,16 @@ 
- extern struct inode_operations ext3_symlink_inode_operations; - extern struct inode_operations ext3_fast_symlink_inode_operations; - -+/* extents.c */ -+extern int ext3_ext_writepage_trans_blocks(struct inode *, int); -+extern int ext3_ext_get_block(handle_t *, struct inode *, long, -+ struct buffer_head *, int, int); -+extern void ext3_ext_truncate(struct inode *, struct page *); -+extern void ext3_ext_init(struct super_block *); -+extern void ext3_ext_release(struct super_block *); -+extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); -+extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg); - - #endif /* __KERNEL__ */ - -Index: linux-2.6.12-rc6/include/linux/ext3_extents.h -=================================================================== ---- linux-2.6.12-rc6.orig/include/linux/ext3_extents.h 2005-06-14 16:31:25.780917195 +0200 -+++ linux-2.6.12-rc6/include/linux/ext3_extents.h 2005-06-14 16:31:25.932284381 +0200 -@@ -0,0 +1,262 @@ -+/* -+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+#ifndef _LINUX_EXT3_EXTENTS -+#define _LINUX_EXT3_EXTENTS -+ -+/* -+ * with AGRESSIVE_TEST defined capacity of index/leaf blocks -+ * become very little, so index split, in-depth growing and -+ * other hard changes happens much more often -+ * this is for debug purposes only -+ */ -+#define AGRESSIVE_TEST_ -+ -+/* -+ * if CHECK_BINSEARCH defined, then results of binary search -+ * will be checked by linear search -+ */ -+#define CHECK_BINSEARCH_ -+ -+/* -+ * if EXT_DEBUG is defined you can use 'extdebug' mount option -+ * to get lots of info what's going on -+ */ -+#define EXT_DEBUG_ -+#ifdef EXT_DEBUG -+#define ext_debug(tree,fmt,a...) \ -+do { \ -+ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ -+ printk(fmt, ##a); \ -+} while (0); -+#else -+#define ext_debug(tree,fmt,a...) -+#endif -+ -+/* -+ * if EXT_STATS is defined then stats numbers are collected -+ * these number will be displayed at umount time -+ */ -+#define EXT_STATS_ -+ -+ -+#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */ -+ -+/* -+ * ext3_inode has i_block array (total 60 bytes) -+ * first 4 bytes are used to store: -+ * - tree depth (0 mean there is no tree yet. 
all extents in the inode) -+ * - number of alive extents in the inode -+ */ -+ -+/* -+ * this is extent on-disk structure -+ * it's used at the bottom of the tree -+ */ -+struct ext3_extent { -+ __u32 ee_block; /* first logical block extent covers */ -+ __u16 ee_len; /* number of blocks covered by extent */ -+ __u16 ee_start_hi; /* high 16 bits of physical block */ -+ __u32 ee_start; /* low 32 bigs of physical block */ -+}; -+ -+/* -+ * this is index on-disk structure -+ * it's used at all the levels, but the bottom -+ */ -+struct ext3_extent_idx { -+ __u32 ei_block; /* index covers logical blocks from 'block' */ -+ __u32 ei_leaf; /* pointer to the physical block of the next * -+ * level. leaf or next index could bet here */ -+ __u16 ei_leaf_hi; /* high 16 bits of physical block */ -+ __u16 ei_unused; -+}; -+ -+/* -+ * each block (leaves and indexes), even inode-stored has header -+ */ -+struct ext3_extent_header { -+ __u16 eh_magic; /* probably will support different formats */ -+ __u16 eh_entries; /* number of valid entries */ -+ __u16 eh_max; /* capacity of store in entries */ -+ __u16 eh_depth; /* has tree real underlaying blocks? 
*/ -+ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ -+}; -+ -+#define EXT3_EXT_MAGIC 0xf30a -+ -+/* -+ * array of ext3_ext_path contains path to some extent -+ * creation/lookup routines use it for traversal/splitting/etc -+ * truncate uses it to simulate recursive walking -+ */ -+struct ext3_ext_path { -+ __u32 p_block; -+ __u16 p_depth; -+ struct ext3_extent *p_ext; -+ struct ext3_extent_idx *p_idx; -+ struct ext3_extent_header *p_hdr; -+ struct buffer_head *p_bh; -+}; -+ -+/* -+ * structure for external API -+ */ -+ -+/* -+ * storage for cached extent -+ */ -+struct ext3_ext_cache { -+ __u32 ec_start; -+ __u32 ec_block; -+ __u32 ec_len; -+ __u32 ec_type; -+}; -+ -+#define EXT3_EXT_CACHE_NO 0 -+#define EXT3_EXT_CACHE_GAP 1 -+#define EXT3_EXT_CACHE_EXTENT 2 -+ -+/* -+ * ext3_extents_tree is used to pass initial information -+ * to top-level extents API -+ */ -+struct ext3_extents_helpers; -+struct ext3_extents_tree { -+ struct inode *inode; /* inode which tree belongs to */ -+ void *root; /* ptr to data top of tree resides at */ -+ void *buffer; /* will be passed as arg to ^^ routines */ -+ int buffer_len; -+ void *private; -+ struct ext3_ext_cache *cex;/* last found extent */ -+ struct ext3_extents_helpers *ops; -+}; -+ -+struct ext3_extents_helpers { -+ int (*get_write_access)(handle_t *h, void *buffer); -+ int (*mark_buffer_dirty)(handle_t *h, void *buffer); -+ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); -+ int (*remove_extent_credits)(struct ext3_extents_tree *, -+ struct ext3_extent *, unsigned long, -+ unsigned long); -+ int (*remove_extent)(struct ext3_extents_tree *, -+ struct ext3_extent *, unsigned long, -+ unsigned long); -+ int (*new_block)(handle_t *, struct ext3_extents_tree *, -+ struct ext3_ext_path *, struct ext3_extent *, -+ int *); -+}; -+ -+/* -+ * to be called by ext3_ext_walk_space() -+ * negative retcode - error -+ * positive retcode - signal for ext3_ext_walk_space(), see below -+ * callback must 
return valid extent (passed or newly created) -+ */ -+typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, -+ struct ext3_ext_path *, -+ struct ext3_ext_cache *); -+ -+#define EXT_CONTINUE 0 -+#define EXT_BREAK 1 -+#define EXT_REPEAT 2 -+ -+ -+#define EXT_MAX_BLOCK 0xffffffff -+ -+ -+#define EXT_FIRST_EXTENT(__hdr__) \ -+ ((struct ext3_extent *) (((char *) (__hdr__)) + \ -+ sizeof(struct ext3_extent_header))) -+#define EXT_FIRST_INDEX(__hdr__) \ -+ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ -+ sizeof(struct ext3_extent_header))) -+#define EXT_HAS_FREE_INDEX(__path__) \ -+ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) -+#define EXT_LAST_EXTENT(__hdr__) \ -+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) -+#define EXT_LAST_INDEX(__hdr__) \ -+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) -+#define EXT_MAX_EXTENT(__hdr__) \ -+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) -+#define EXT_MAX_INDEX(__hdr__) \ -+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) -+#define EXT_HDR_GEN(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) -+#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) -+#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ -+ -+#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) -+#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) -+#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) -+#define EXT_GENERATION(__tree__) EXT_HDR_GEN(EXT_ROOT_HDR(__tree__)) -+ -+#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); -+ -+#define EXT_CHECK_PATH(tree,path) \ -+{ \ -+ int depth = EXT_DEPTH(tree); \ -+ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ -+ BUG_ON((unsigned long) (path)[depth].p_idx < \ -+ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ -+ BUG_ON((unsigned long) (path)[depth].p_ext < \ -+ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ -+ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ 
-+ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ -+ && depth != 0); \ -+ BUG_ON((path)[0].p_depth != depth); \ -+} -+ -+ -+/* -+ * this structure is used to gather extents from the tree via ioctl -+ */ -+struct ext3_extent_buf { -+ unsigned long start; -+ int buflen; -+ void *buffer; -+ void *cur; -+ int err; -+}; -+ -+/* -+ * this structure is used to collect stats info about the tree -+ */ -+struct ext3_extent_tree_stats { -+ int depth; -+ int extents_num; -+ int leaf_num; -+}; -+ -+extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); -+extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); -+extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); -+extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); -+extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); -+extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); -+extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); -+extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); -+ -+static inline void -+ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) -+{ -+ if (tree->cex) -+ tree->cex->ec_type = EXT3_EXT_CACHE_NO; -+} -+ -+ -+#endif /* _LINUX_EXT3_EXTENTS */ -Index: linux-2.6.12-rc6/include/linux/ext3_fs_i.h -=================================================================== ---- linux-2.6.12-rc6.orig/include/linux/ext3_fs_i.h 2005-06-06 17:22:29.000000000 +0200 -+++ linux-2.6.12-rc6/include/linux/ext3_fs_i.h 2005-06-14 16:31:25.941073443 +0200 -@@ -133,6 +133,8 @@ - */ - struct semaphore truncate_sem; - struct inode vfs_inode; -+ -+ __u32 i_cached_extent[4]; - }; - - #endif /* _LINUX_EXT3_FS_I */ diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.15.patch 
b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.15.patch deleted file mode 100644 index ea69e84..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.15.patch +++ /dev/null @@ -1,2947 +0,0 @@ -Index: linux-2.6.16.21-0.8/fs/ext3/extents.c -=================================================================== ---- /dev/null -+++ linux-2.6.16.21-0.8/fs/ext3/extents.c -@@ -0,0 +1,2359 @@ -+/* -+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+/* -+ * Extents support for EXT3 -+ * -+ * TODO: -+ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() -+ * - ext3_ext_calc_credits() could take 'mergable' into account -+ * - ext3*_error() should be used in some situations -+ * - find_goal() [to be tested and improved] -+ * - smart tree reduction -+ * - arch-independence -+ * common on-disk format for big/little-endian arch -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+ -+static inline int ext3_ext_check_header(struct ext3_extent_header *eh) -+{ -+ if (eh->eh_magic != EXT3_EXT_MAGIC) { -+ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", -+ (unsigned)eh->eh_magic); -+ return -EIO; -+ } -+ if (eh->eh_max == 0) { -+ printk(KERN_ERR 
"EXT3-fs: invalid eh_max = %u\n", -+ (unsigned)eh->eh_max); -+ return -EIO; -+ } -+ if (eh->eh_entries > eh->eh_max) { -+ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", -+ (unsigned)eh->eh_entries); -+ return -EIO; -+ } -+ return 0; -+} -+ -+static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) -+{ -+ int err; -+ -+ if (handle->h_buffer_credits > needed) -+ return handle; -+ if (!ext3_journal_extend(handle, needed)) -+ return handle; -+ err = ext3_journal_restart(handle, needed); -+ -+ return handle; -+} -+ -+static int inline -+ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) -+{ -+ if (tree->ops->get_write_access) -+ return tree->ops->get_write_access(h,tree->buffer); -+ else -+ return 0; -+} -+ -+static int inline -+ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) -+{ -+ if (tree->ops->mark_buffer_dirty) -+ return tree->ops->mark_buffer_dirty(h,tree->buffer); -+ else -+ return 0; -+} -+ -+/* -+ * could return: -+ * - EROFS -+ * - ENOMEM -+ */ -+static int ext3_ext_get_access(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int err; -+ -+ if (path->p_bh) { -+ /* path points to block */ -+ err = ext3_journal_get_write_access(handle, path->p_bh); -+ } else { -+ /* path points to leaf/index in inode body */ -+ err = ext3_ext_get_access_for_root(handle, tree); -+ } -+ return err; -+} -+ -+/* -+ * could return: -+ * - EROFS -+ * - ENOMEM -+ * - EIO -+ */ -+static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int err; -+ if (path->p_bh) { -+ /* path points to block */ -+ err =ext3_journal_dirty_metadata(handle, path->p_bh); -+ } else { -+ /* path points to leaf/index in inode body */ -+ err = ext3_ext_mark_root_dirty(handle, tree); -+ } -+ return err; -+} -+ -+static int inline -+ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, struct ext3_extent 
*ex, -+ int *err) -+{ -+ int goal, depth, newblock; -+ struct inode *inode; -+ -+ EXT_ASSERT(tree); -+ if (tree->ops->new_block) -+ return tree->ops->new_block(handle, tree, path, ex, err); -+ -+ inode = tree->inode; -+ depth = EXT_DEPTH(tree); -+ if (path && depth > 0) { -+ goal = path[depth-1].p_block; -+ } else { -+ struct ext3_inode_info *ei = EXT3_I(inode); -+ unsigned long bg_start; -+ unsigned long colour; -+ -+ bg_start = (ei->i_block_group * -+ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + -+ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); -+ colour = (current->pid % 16) * -+ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); -+ goal = bg_start + colour; -+ } -+ -+ newblock = ext3_new_block(handle, inode, goal, err); -+ return newblock; -+} -+ -+static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) -+{ -+ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) | -+ (EXT_HDR_GEN(neh) + 1); -+} -+ -+static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->inode->i_sb->s_blocksize - -+ sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent); -+#ifdef AGRESSIVE_TEST -+ size = 6; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->inode->i_sb->s_blocksize - -+ sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent_idx); -+#ifdef AGRESSIVE_TEST -+ size = 5; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent); -+#ifdef AGRESSIVE_TEST -+ size = 3; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / -+ 
sizeof(struct ext3_extent_idx); -+#ifdef AGRESSIVE_TEST -+ size = 4; -+#endif -+ return size; -+} -+ -+static void ext3_ext_show_path(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+#ifdef EXT_DEBUG -+ int k, l = path->p_depth; -+ -+ ext_debug(tree, "path:"); -+ for (k = 0; k <= l; k++, path++) { -+ if (path->p_idx) { -+ ext_debug(tree, " %d->%d", path->p_idx->ei_block, -+ path->p_idx->ei_leaf); -+ } else if (path->p_ext) { -+ ext_debug(tree, " %d:%d:%d", -+ path->p_ext->ee_block, -+ path->p_ext->ee_len, -+ path->p_ext->ee_start); -+ } else -+ ext_debug(tree, " []"); -+ } -+ ext_debug(tree, "\n"); -+#endif -+} -+ -+static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+#ifdef EXT_DEBUG -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent_header *eh; -+ struct ext3_extent *ex; -+ int i; -+ -+ if (!path) -+ return; -+ -+ eh = path[depth].p_hdr; -+ ex = EXT_FIRST_EXTENT(eh); -+ -+ for (i = 0; i < eh->eh_entries; i++, ex++) { -+ ext_debug(tree, "%d:%d:%d ", -+ ex->ee_block, ex->ee_len, ex->ee_start); -+ } -+ ext_debug(tree, "\n"); -+#endif -+} -+ -+static void ext3_ext_drop_refs(struct ext3_ext_path *path) -+{ -+ int depth = path->p_depth; -+ int i; -+ -+ for (i = 0; i <= depth; i++, path++) { -+ if (path->p_bh) { -+ brelse(path->p_bh); -+ path->p_bh = NULL; -+ } -+ } -+} -+ -+/* -+ * binary search for closest index by given block -+ */ -+static inline void -+ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, int block) -+{ -+ struct ext3_extent_header *eh = path->p_hdr; -+ struct ext3_extent_idx *ix; -+ int l = 0, k, r; -+ -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ EXT_ASSERT(eh->eh_entries > 0); -+ -+ ext_debug(tree, "binsearch for %d(idx): ", block); -+ -+ path->p_idx = ix = EXT_FIRST_INDEX(eh); -+ -+ r = k = eh->eh_entries; -+ while (k > 1) { -+ k = (r - l) / 2; -+ if (block < ix[l + k].ei_block) -+ r -= k; -+ 
else -+ l += k; -+ ext_debug(tree, "%d:%d:%d ", k, l, r); -+ } -+ -+ ix += l; -+ path->p_idx = ix; -+ ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf); -+ -+ while (l++ < r) { -+ if (block < ix->ei_block) -+ break; -+ path->p_idx = ix++; -+ } -+ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, -+ path->p_idx->ei_leaf); -+ -+#ifdef CHECK_BINSEARCH -+ { -+ struct ext3_extent_idx *chix; -+ -+ chix = ix = EXT_FIRST_INDEX(eh); -+ for (k = 0; k < eh->eh_entries; k++, ix++) { -+ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { -+ printk("k=%d, ix=0x%p, first=0x%p\n", k, -+ ix, EXT_FIRST_INDEX(eh)); -+ printk("%u <= %u\n", -+ ix->ei_block,ix[-1].ei_block); -+ } -+ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); -+ if (block < ix->ei_block) -+ break; -+ chix = ix; -+ } -+ EXT_ASSERT(chix == path->p_idx); -+ } -+#endif -+} -+ -+/* -+ * binary search for closest extent by given block -+ */ -+static inline void -+ext3_ext_binsearch(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, int block) -+{ -+ struct ext3_extent_header *eh = path->p_hdr; -+ struct ext3_extent *ex; -+ int l = 0, k, r; -+ -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ -+ if (eh->eh_entries == 0) { -+ /* -+ * this leaf is empty yet: -+ * we get such a leaf in split/add case -+ */ -+ return; -+ } -+ -+ ext_debug(tree, "binsearch for %d: ", block); -+ -+ path->p_ext = ex = EXT_FIRST_EXTENT(eh); -+ -+ r = k = eh->eh_entries; -+ while (k > 1) { -+ k = (r - l) / 2; -+ if (block < ex[l + k].ee_block) -+ r -= k; -+ else -+ l += k; -+ ext_debug(tree, "%d:%d:%d ", k, l, r); -+ } -+ -+ ex += l; -+ path->p_ext = ex; -+ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, -+ path->p_ext->ee_start, path->p_ext->ee_len); -+ -+ while (l++ < r) { -+ if (block < ex->ee_block) -+ break; -+ path->p_ext = ex++; -+ } -+ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, -+ path->p_ext->ee_start, path->p_ext->ee_len); -+ 
-+#ifdef CHECK_BINSEARCH -+ { -+ struct ext3_extent *chex; -+ -+ chex = ex = EXT_FIRST_EXTENT(eh); -+ for (k = 0; k < eh->eh_entries; k++, ex++) { -+ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); -+ if (block < ex->ee_block) -+ break; -+ chex = ex; -+ } -+ EXT_ASSERT(chex == path->p_ext); -+ } -+#endif -+} -+ -+int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) -+{ -+ struct ext3_extent_header *eh; -+ -+ BUG_ON(tree->buffer_len == 0); -+ ext3_ext_get_access_for_root(handle, tree); -+ eh = EXT_ROOT_HDR(tree); -+ eh->eh_depth = 0; -+ eh->eh_entries = 0; -+ eh->eh_magic = EXT3_EXT_MAGIC; -+ eh->eh_max = ext3_ext_space_root(tree); -+ ext3_ext_mark_root_dirty(handle, tree); -+ ext3_ext_invalidate_cache(tree); -+ return 0; -+} -+ -+struct ext3_ext_path * -+ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, -+ struct ext3_ext_path *path) -+{ -+ struct ext3_extent_header *eh; -+ struct buffer_head *bh; -+ int depth, i, ppos = 0; -+ -+ EXT_ASSERT(tree); -+ EXT_ASSERT(tree->inode); -+ EXT_ASSERT(tree->root); -+ -+ eh = EXT_ROOT_HDR(tree); -+ EXT_ASSERT(eh); -+ if (ext3_ext_check_header(eh)) { -+ /* don't free previously allocated path -+ * -- caller should take care */ -+ path = NULL; -+ goto err; -+ } -+ -+ i = depth = EXT_DEPTH(tree); -+ EXT_ASSERT(eh->eh_max); -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ -+ /* account possible depth increase */ -+ if (!path) { -+ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), -+ GFP_NOFS); -+ if (!path) -+ return ERR_PTR(-ENOMEM); -+ } -+ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); -+ path[0].p_hdr = eh; -+ -+ /* walk through the tree */ -+ while (i) { -+ ext_debug(tree, "depth %d: num %d, max %d\n", -+ ppos, eh->eh_entries, eh->eh_max); -+ ext3_ext_binsearch_idx(tree, path + ppos, block); -+ path[ppos].p_block = path[ppos].p_idx->ei_leaf; -+ path[ppos].p_depth = i; -+ path[ppos].p_ext = NULL; -+ -+ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); -+ if 
(!bh) -+ goto err; -+ -+ eh = EXT_BLOCK_HDR(bh); -+ ppos++; -+ EXT_ASSERT(ppos <= depth); -+ path[ppos].p_bh = bh; -+ path[ppos].p_hdr = eh; -+ i--; -+ -+ if (ext3_ext_check_header(eh)) -+ goto err; -+ } -+ -+ path[ppos].p_depth = i; -+ path[ppos].p_hdr = eh; -+ path[ppos].p_ext = NULL; -+ path[ppos].p_idx = NULL; -+ -+ if (ext3_ext_check_header(eh)) -+ goto err; -+ -+ /* find extent */ -+ ext3_ext_binsearch(tree, path + ppos, block); -+ -+ ext3_ext_show_path(tree, path); -+ -+ return path; -+ -+err: -+ printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ return ERR_PTR(-EIO); -+} -+ -+/* -+ * insert new index [logical;ptr] into the block at cupr -+ * it check where to insert: before curp or after curp -+ */ -+static int ext3_ext_insert_index(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *curp, -+ int logical, int ptr) -+{ -+ struct ext3_extent_idx *ix; -+ int len, err; -+ -+ if ((err = ext3_ext_get_access(handle, tree, curp))) -+ return err; -+ -+ EXT_ASSERT(logical != curp->p_idx->ei_block); -+ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; -+ if (logical > curp->p_idx->ei_block) { -+ /* insert after */ -+ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { -+ len = (len - 1) * sizeof(struct ext3_extent_idx); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert new index %d after: %d. " -+ "move %d from 0x%p to 0x%p\n", -+ logical, ptr, len, -+ (curp->p_idx + 1), (curp->p_idx + 2)); -+ memmove(curp->p_idx + 2, curp->p_idx + 1, len); -+ } -+ ix = curp->p_idx + 1; -+ } else { -+ /* insert before */ -+ len = len * sizeof(struct ext3_extent_idx); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert new index %d before: %d. 
" -+ "move %d from 0x%p to 0x%p\n", -+ logical, ptr, len, -+ curp->p_idx, (curp->p_idx + 1)); -+ memmove(curp->p_idx + 1, curp->p_idx, len); -+ ix = curp->p_idx; -+ } -+ -+ ix->ei_block = logical; -+ ix->ei_leaf = ptr; -+ ix->ei_leaf_hi = ix->ei_unused = 0; -+ curp->p_hdr->eh_entries++; -+ -+ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); -+ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); -+ -+ err = ext3_ext_dirty(handle, tree, curp); -+ ext3_std_error(tree->inode->i_sb, err); -+ -+ return err; -+} -+ -+/* -+ * routine inserts new subtree into the path, using free index entry -+ * at depth 'at: -+ * - allocates all needed blocks (new leaf and all intermediate index blocks) -+ * - makes decision where to split -+ * - moves remaining extens and index entries (right to the split point) -+ * into the newly allocated blocks -+ * - initialize subtree -+ */ -+static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext, int at) -+{ -+ struct buffer_head *bh = NULL; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent_header *neh; -+ struct ext3_extent_idx *fidx; -+ struct ext3_extent *ex; -+ int i = at, k, m, a; -+ unsigned long newblock, oldblock, border; -+ int *ablocks = NULL; /* array of allocated blocks */ -+ int err = 0; -+ -+ /* make decision: where to split? */ -+ /* FIXME: now desicion is simplest: at current extent */ -+ -+ /* if current leaf will be splitted, then we should use -+ * border from split point */ -+ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); -+ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { -+ border = path[depth].p_ext[1].ee_block; -+ ext_debug(tree, "leaf will be splitted." -+ " next leaf starts at %d\n", -+ (int)border); -+ } else { -+ border = newext->ee_block; -+ ext_debug(tree, "leaf will be added." 
-+ " next leaf starts at %d\n", -+ (int)border); -+ } -+ -+ /* -+ * if error occurs, then we break processing -+ * and turn filesystem read-only. so, index won't -+ * be inserted and tree will be in consistent -+ * state. next mount will repair buffers too -+ */ -+ -+ /* -+ * get array to track all allocated blocks -+ * we need this to handle errors and free blocks -+ * upon them -+ */ -+ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); -+ if (!ablocks) -+ return -ENOMEM; -+ memset(ablocks, 0, sizeof(unsigned long) * depth); -+ -+ /* allocate all needed blocks */ -+ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); -+ for (a = 0; a < depth - at; a++) { -+ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); -+ if (newblock == 0) -+ goto cleanup; -+ ablocks[a] = newblock; -+ } -+ -+ /* initialize new leaf */ -+ newblock = ablocks[--a]; -+ EXT_ASSERT(newblock); -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) -+ goto cleanup; -+ -+ neh = EXT_BLOCK_HDR(bh); -+ neh->eh_entries = 0; -+ neh->eh_max = ext3_ext_space_block(tree); -+ neh->eh_magic = EXT3_EXT_MAGIC; -+ neh->eh_depth = 0; -+ ex = EXT_FIRST_EXTENT(neh); -+ -+ /* move remain of path[depth] to the new leaf */ -+ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); -+ /* start copy from next extent */ -+ /* TODO: we could do it by single memmove */ -+ m = 0; -+ path[depth].p_ext++; -+ while (path[depth].p_ext <= -+ EXT_MAX_EXTENT(path[depth].p_hdr)) { -+ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", -+ path[depth].p_ext->ee_block, -+ path[depth].p_ext->ee_start, -+ path[depth].p_ext->ee_len, -+ newblock); -+ memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent)); -+ neh->eh_entries++; -+ m++; -+ } -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ 
goto cleanup; -+ brelse(bh); -+ bh = NULL; -+ -+ /* correct old leaf */ -+ if (m) { -+ if ((err = ext3_ext_get_access(handle, tree, path + depth))) -+ goto cleanup; -+ path[depth].p_hdr->eh_entries -= m; -+ if ((err = ext3_ext_dirty(handle, tree, path + depth))) -+ goto cleanup; -+ -+ } -+ -+ /* create intermediate indexes */ -+ k = depth - at - 1; -+ EXT_ASSERT(k >= 0); -+ if (k) -+ ext_debug(tree, "create %d intermediate indices\n", k); -+ /* insert new index into current index block */ -+ /* current depth stored in i var */ -+ i = depth - 1; -+ while (k--) { -+ oldblock = newblock; -+ newblock = ablocks[--a]; -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) -+ goto cleanup; -+ -+ neh = EXT_BLOCK_HDR(bh); -+ neh->eh_entries = 1; -+ neh->eh_magic = EXT3_EXT_MAGIC; -+ neh->eh_max = ext3_ext_space_block_idx(tree); -+ neh->eh_depth = depth - i; -+ fidx = EXT_FIRST_INDEX(neh); -+ fidx->ei_block = border; -+ fidx->ei_leaf = oldblock; -+ fidx->ei_leaf_hi = fidx->ei_unused = 0; -+ -+ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", -+ i, newblock, border, oldblock); -+ /* copy indexes */ -+ m = 0; -+ path[i].p_idx++; -+ -+ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx, -+ EXT_MAX_INDEX(path[i].p_hdr)); -+ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == -+ EXT_LAST_INDEX(path[i].p_hdr)); -+ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { -+ ext_debug(tree, "%d: move %d:%d in new index %lu\n", -+ i, path[i].p_idx->ei_block, -+ path[i].p_idx->ei_leaf, newblock); -+ memmove(++fidx, path[i].p_idx++, -+ sizeof(struct ext3_extent_idx)); -+ neh->eh_entries++; -+ EXT_ASSERT(neh->eh_entries <= neh->eh_max); -+ m++; -+ } -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ goto cleanup; -+ brelse(bh); -+ bh = NULL; -+ -+ /* correct old index */ -+ if (m) { -+ err = 
ext3_ext_get_access(handle, tree, path + i); -+ if (err) -+ goto cleanup; -+ path[i].p_hdr->eh_entries -= m; -+ err = ext3_ext_dirty(handle, tree, path + i); -+ if (err) -+ goto cleanup; -+ } -+ -+ i--; -+ } -+ -+ /* insert new index */ -+ if (!err) -+ err = ext3_ext_insert_index(handle, tree, path + at, -+ border, newblock); -+ -+cleanup: -+ if (bh) { -+ if (buffer_locked(bh)) -+ unlock_buffer(bh); -+ brelse(bh); -+ } -+ -+ if (err) { -+ /* free all allocated blocks in error case */ -+ for (i = 0; i < depth; i++) { -+ if (!ablocks[i]) -+ continue; -+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); -+ } -+ } -+ kfree(ablocks); -+ -+ return err; -+} -+ -+/* -+ * routine implements tree growing procedure: -+ * - allocates new block -+ * - moves top-level data (index block or leaf) into the new block -+ * - initialize new top-level, creating index that points to the -+ * just created block -+ */ -+static int ext3_ext_grow_indepth(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) -+{ -+ struct ext3_ext_path *curp = path; -+ struct ext3_extent_header *neh; -+ struct ext3_extent_idx *fidx; -+ struct buffer_head *bh; -+ unsigned long newblock; -+ int err = 0; -+ -+ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); -+ if (newblock == 0) -+ return err; -+ -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ ext3_std_error(tree->inode->i_sb, err); -+ return err; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) { -+ unlock_buffer(bh); -+ goto out; -+ } -+ -+ /* move top-level index/leaf into new block */ -+ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); -+ -+ /* set size of new block */ -+ neh = EXT_BLOCK_HDR(bh); -+ /* old root could have indexes or leaves -+ * so calculate eh_max right way */ -+ if (EXT_DEPTH(tree)) -+ neh->eh_max = ext3_ext_space_block_idx(tree); -+ else -+ neh->eh_max = ext3_ext_space_block(tree); -+ 
neh->eh_magic = EXT3_EXT_MAGIC; -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ goto out; -+ -+ /* create index in new top-level index: num,max,pointer */ -+ if ((err = ext3_ext_get_access(handle, tree, curp))) -+ goto out; -+ -+ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; -+ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); -+ curp->p_hdr->eh_entries = 1; -+ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); -+ /* FIXME: it works, but actually path[0] can be index */ -+ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; -+ curp->p_idx->ei_leaf = newblock; -+ curp->p_idx->ei_leaf_hi = curp->p_idx->ei_unused = 0; -+ -+ neh = EXT_ROOT_HDR(tree); -+ fidx = EXT_FIRST_INDEX(neh); -+ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", -+ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); -+ -+ neh->eh_depth = path->p_depth + 1; -+ err = ext3_ext_dirty(handle, tree, curp); -+out: -+ brelse(bh); -+ -+ return err; -+} -+ -+/* -+ * routine finds empty index and adds new leaf. 
if no free index found -+ * then it requests in-depth growing -+ */ -+static int ext3_ext_create_new_leaf(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) -+{ -+ struct ext3_ext_path *curp; -+ int depth, i, err = 0; -+ -+repeat: -+ i = depth = EXT_DEPTH(tree); -+ -+ /* walk up to the tree and look for free index entry */ -+ curp = path + depth; -+ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { -+ i--; -+ curp--; -+ } -+ -+ /* we use already allocated block for index block -+ * so, subsequent data blocks should be contigoues */ -+ if (EXT_HAS_FREE_INDEX(curp)) { -+ /* if we found index with free entry, then use that -+ * entry: create all needed subtree and add new leaf */ -+ err = ext3_ext_split(handle, tree, path, newext, i); -+ -+ /* refill path */ -+ ext3_ext_drop_refs(path); -+ path = ext3_ext_find_extent(tree, newext->ee_block, path); -+ if (IS_ERR(path)) -+ err = PTR_ERR(path); -+ } else { -+ /* tree is full, time to grow in depth */ -+ err = ext3_ext_grow_indepth(handle, tree, path, newext); -+ -+ /* refill path */ -+ ext3_ext_drop_refs(path); -+ path = ext3_ext_find_extent(tree, newext->ee_block, path); -+ if (IS_ERR(path)) -+ err = PTR_ERR(path); -+ -+ /* -+ * only first (depth 0 -> 1) produces free space -+ * in all other cases we have to split growed tree -+ */ -+ depth = EXT_DEPTH(tree); -+ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { -+ /* now we need split */ -+ goto repeat; -+ } -+ } -+ -+ if (err) -+ return err; -+ -+ return 0; -+} -+ -+/* -+ * returns allocated block in subsequent extent or EXT_MAX_BLOCK -+ * NOTE: it consider block number from index entry as -+ * allocated block. 
thus, index entries have to be consistent -+ * with leafs -+ */ -+static unsigned long -+ext3_ext_next_allocated_block(struct ext3_ext_path *path) -+{ -+ int depth; -+ -+ EXT_ASSERT(path != NULL); -+ depth = path->p_depth; -+ -+ if (depth == 0 && path->p_ext == NULL) -+ return EXT_MAX_BLOCK; -+ -+ /* FIXME: what if index isn't full ?! */ -+ while (depth >= 0) { -+ if (depth == path->p_depth) { -+ /* leaf */ -+ if (path[depth].p_ext != -+ EXT_LAST_EXTENT(path[depth].p_hdr)) -+ return path[depth].p_ext[1].ee_block; -+ } else { -+ /* index */ -+ if (path[depth].p_idx != -+ EXT_LAST_INDEX(path[depth].p_hdr)) -+ return path[depth].p_idx[1].ei_block; -+ } -+ depth--; -+ } -+ -+ return EXT_MAX_BLOCK; -+} -+ -+/* -+ * returns first allocated block from next leaf or EXT_MAX_BLOCK -+ */ -+static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int depth; -+ -+ EXT_ASSERT(path != NULL); -+ depth = path->p_depth; -+ -+ /* zero-tree has no leaf blocks at all */ -+ if (depth == 0) -+ return EXT_MAX_BLOCK; -+ -+ /* go to index block */ -+ depth--; -+ -+ while (depth >= 0) { -+ if (path[depth].p_idx != -+ EXT_LAST_INDEX(path[depth].p_hdr)) -+ return path[depth].p_idx[1].ei_block; -+ depth--; -+ } -+ -+ return EXT_MAX_BLOCK; -+} -+ -+/* -+ * if leaf gets modified and modified extent is first in the leaf -+ * then we have to correct all indexes above -+ * TODO: do we need to correct tree in all cases? 
-+ */ -+int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ struct ext3_extent_header *eh; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent *ex; -+ unsigned long border; -+ int k, err = 0; -+ -+ eh = path[depth].p_hdr; -+ ex = path[depth].p_ext; -+ EXT_ASSERT(ex); -+ EXT_ASSERT(eh); -+ -+ if (depth == 0) { -+ /* there is no tree at all */ -+ return 0; -+ } -+ -+ if (ex != EXT_FIRST_EXTENT(eh)) { -+ /* we correct tree if first leaf got modified only */ -+ return 0; -+ } -+ -+ /* -+ * TODO: we need correction if border is smaller then current one -+ */ -+ k = depth - 1; -+ border = path[depth].p_ext->ee_block; -+ if ((err = ext3_ext_get_access(handle, tree, path + k))) -+ return err; -+ path[k].p_idx->ei_block = border; -+ if ((err = ext3_ext_dirty(handle, tree, path + k))) -+ return err; -+ -+ while (k--) { -+ /* change all left-side indexes */ -+ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) -+ break; -+ if ((err = ext3_ext_get_access(handle, tree, path + k))) -+ break; -+ path[k].p_idx->ei_block = border; -+ if ((err = ext3_ext_dirty(handle, tree, path + k))) -+ break; -+ } -+ -+ return err; -+} -+ -+static int inline -+ext3_can_extents_be_merged(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex1, -+ struct ext3_extent *ex2) -+{ -+ if (ex1->ee_block + ex1->ee_len != ex2->ee_block) -+ return 0; -+ -+#ifdef AGRESSIVE_TEST -+ if (ex1->ee_len >= 4) -+ return 0; -+#endif -+ -+ if (!tree->ops->mergable) -+ return 1; -+ -+ return tree->ops->mergable(ex1, ex2); -+} -+ -+/* -+ * this routine tries to merge requsted extent into the existing -+ * extent or inserts requested extent as new one into the tree, -+ * creating new leaf in no-space case -+ */ -+int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) -+{ -+ struct ext3_extent_header * eh; -+ struct ext3_extent *ex, *fex; -+ struct ext3_extent 
*nearex; /* nearest extent */ -+ struct ext3_ext_path *npath = NULL; -+ int depth, len, err, next; -+ -+ EXT_ASSERT(newext->ee_len > 0); -+ depth = EXT_DEPTH(tree); -+ ex = path[depth].p_ext; -+ EXT_ASSERT(path[depth].p_hdr); -+ -+ /* try to insert block into found extent and return */ -+ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { -+ ext_debug(tree, "append %d block to %d:%d (from %d)\n", -+ newext->ee_len, ex->ee_block, ex->ee_len, -+ ex->ee_start); -+ if ((err = ext3_ext_get_access(handle, tree, path + depth))) -+ return err; -+ ex->ee_len += newext->ee_len; -+ eh = path[depth].p_hdr; -+ nearex = ex; -+ goto merge; -+ } -+ -+repeat: -+ depth = EXT_DEPTH(tree); -+ eh = path[depth].p_hdr; -+ if (eh->eh_entries < eh->eh_max) -+ goto has_space; -+ -+ /* probably next leaf has space for us? */ -+ fex = EXT_LAST_EXTENT(eh); -+ next = ext3_ext_next_leaf_block(tree, path); -+ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { -+ ext_debug(tree, "next leaf block - %d\n", next); -+ EXT_ASSERT(!npath); -+ npath = ext3_ext_find_extent(tree, next, NULL); -+ if (IS_ERR(npath)) -+ return PTR_ERR(npath); -+ EXT_ASSERT(npath->p_depth == path->p_depth); -+ eh = npath[depth].p_hdr; -+ if (eh->eh_entries < eh->eh_max) { -+ ext_debug(tree, "next leaf isnt full(%d)\n", -+ eh->eh_entries); -+ path = npath; -+ goto repeat; -+ } -+ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", -+ eh->eh_entries, eh->eh_max); -+ } -+ -+ /* -+ * there is no free space in found leaf -+ * we're gonna add new leaf in the tree -+ */ -+ err = ext3_ext_create_new_leaf(handle, tree, path, newext); -+ if (err) -+ goto cleanup; -+ depth = EXT_DEPTH(tree); -+ eh = path[depth].p_hdr; -+ -+has_space: -+ nearex = path[depth].p_ext; -+ -+ if ((err = ext3_ext_get_access(handle, tree, path + depth))) -+ goto cleanup; -+ -+ if (!nearex) { -+ /* there is no extent in this leaf, create first one */ -+ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", -+ newext->ee_block, 
newext->ee_start, -+ newext->ee_len); -+ path[depth].p_ext = EXT_FIRST_EXTENT(eh); -+ } else if (newext->ee_block > nearex->ee_block) { -+ EXT_ASSERT(newext->ee_block != nearex->ee_block); -+ if (nearex != EXT_LAST_EXTENT(eh)) { -+ len = EXT_MAX_EXTENT(eh) - nearex; -+ len = (len - 1) * sizeof(struct ext3_extent); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " -+ "move %d from 0x%p to 0x%p\n", -+ newext->ee_block, newext->ee_start, -+ newext->ee_len, -+ nearex, len, nearex + 1, nearex + 2); -+ memmove(nearex + 2, nearex + 1, len); -+ } -+ path[depth].p_ext = nearex + 1; -+ } else { -+ EXT_ASSERT(newext->ee_block != nearex->ee_block); -+ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " -+ "move %d from 0x%p to 0x%p\n", -+ newext->ee_block, newext->ee_start, newext->ee_len, -+ nearex, len, nearex + 1, nearex + 2); -+ memmove(nearex + 1, nearex, len); -+ path[depth].p_ext = nearex; -+ } -+ -+ eh->eh_entries++; -+ nearex = path[depth].p_ext; -+ nearex->ee_block = newext->ee_block; -+ nearex->ee_start = newext->ee_start; -+ nearex->ee_len = newext->ee_len; -+ /* FIXME: support for large fs */ -+ nearex->ee_start_hi = 0; -+ -+merge: -+ /* try to merge extents to the right */ -+ while (nearex < EXT_LAST_EXTENT(eh)) { -+ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) -+ break; -+ /* merge with next extent! 
*/ -+ nearex->ee_len += nearex[1].ee_len; -+ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { -+ len = (EXT_LAST_EXTENT(eh) - nearex - 1) * -+ sizeof(struct ext3_extent); -+ memmove(nearex + 1, nearex + 2, len); -+ } -+ eh->eh_entries--; -+ EXT_ASSERT(eh->eh_entries > 0); -+ } -+ -+ /* try to merge extents to the left */ -+ -+ /* time to correct all indexes above */ -+ err = ext3_ext_correct_indexes(handle, tree, path); -+ if (err) -+ goto cleanup; -+ -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ -+cleanup: -+ if (npath) { -+ ext3_ext_drop_refs(npath); -+ kfree(npath); -+ } -+ ext3_ext_tree_changed(tree); -+ ext3_ext_invalidate_cache(tree); -+ return err; -+} -+ -+int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, -+ unsigned long num, ext_prepare_callback func) -+{ -+ struct ext3_ext_path *path = NULL; -+ struct ext3_ext_cache cbex; -+ struct ext3_extent *ex; -+ unsigned long next, start = 0, end = 0; -+ unsigned long last = block + num; -+ int depth, exists, err = 0; -+ -+ EXT_ASSERT(tree); -+ EXT_ASSERT(func); -+ EXT_ASSERT(tree->inode); -+ EXT_ASSERT(tree->root); -+ -+ while (block < last && block != EXT_MAX_BLOCK) { -+ num = last - block; -+ /* find extent for this block */ -+ path = ext3_ext_find_extent(tree, block, path); -+ if (IS_ERR(path)) { -+ err = PTR_ERR(path); -+ path = NULL; -+ break; -+ } -+ -+ depth = EXT_DEPTH(tree); -+ EXT_ASSERT(path[depth].p_hdr); -+ ex = path[depth].p_ext; -+ next = ext3_ext_next_allocated_block(path); -+ -+ exists = 0; -+ if (!ex) { -+ /* there is no extent yet, so try to allocate -+ * all requested space */ -+ start = block; -+ end = block + num; -+ } else if (ex->ee_block > block) { -+ /* need to allocate space before found extent */ -+ start = block; -+ end = ex->ee_block; -+ if (block + num < end) -+ end = block + num; -+ } else if (block >= ex->ee_block + ex->ee_len) { -+ /* need to allocate space after found extent */ -+ start = block; -+ end = block + num; -+ if (end >= next) -+ end = next; 
-+ } else if (block >= ex->ee_block) { -+ /* -+ * some part of requested space is covered -+ * by found extent -+ */ -+ start = block; -+ end = ex->ee_block + ex->ee_len; -+ if (block + num < end) -+ end = block + num; -+ exists = 1; -+ } else { -+ BUG(); -+ } -+ EXT_ASSERT(end > start); -+ -+ if (!exists) { -+ cbex.ec_block = start; -+ cbex.ec_len = end - start; -+ cbex.ec_start = 0; -+ cbex.ec_type = EXT3_EXT_CACHE_GAP; -+ } else { -+ cbex.ec_block = ex->ee_block; -+ cbex.ec_len = ex->ee_len; -+ cbex.ec_start = ex->ee_start; -+ cbex.ec_type = EXT3_EXT_CACHE_EXTENT; -+ } -+ -+ EXT_ASSERT(cbex.ec_len > 0); -+ EXT_ASSERT(path[depth].p_hdr); -+ err = func(tree, path, &cbex); -+ ext3_ext_drop_refs(path); -+ -+ if (err < 0) -+ break; -+ if (err == EXT_REPEAT) -+ continue; -+ else if (err == EXT_BREAK) { -+ err = 0; -+ break; -+ } -+ -+ if (EXT_DEPTH(tree) != depth) { -+ /* depth was changed. we have to realloc path */ -+ kfree(path); -+ path = NULL; -+ } -+ -+ block = cbex.ec_block + cbex.ec_len; -+ } -+ -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ -+ return err; -+} -+ -+static inline void -+ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, -+ __u32 len, __u32 start, int type) -+{ -+ EXT_ASSERT(len > 0); -+ if (tree->cex) { -+ tree->cex->ec_type = type; -+ tree->cex->ec_block = block; -+ tree->cex->ec_len = len; -+ tree->cex->ec_start = start; -+ } -+} -+ -+/* -+ * this routine calculate boundaries of the gap requested block fits into -+ * and cache this gap -+ */ -+static inline void -+ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ unsigned long block) -+{ -+ int depth = EXT_DEPTH(tree); -+ unsigned long lblock, len; -+ struct ext3_extent *ex; -+ -+ if (!tree->cex) -+ return; -+ -+ ex = path[depth].p_ext; -+ if (ex == NULL) { -+ /* there is no extent yet, so gap is [0;-] */ -+ lblock = 0; -+ len = EXT_MAX_BLOCK; -+ ext_debug(tree, "cache gap(whole file):"); -+ } else if (block < 
ex->ee_block) { -+ lblock = block; -+ len = ex->ee_block - block; -+ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", -+ (unsigned long) block, -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len); -+ } else if (block >= ex->ee_block + ex->ee_len) { -+ lblock = ex->ee_block + ex->ee_len; -+ len = ext3_ext_next_allocated_block(path); -+ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len, -+ (unsigned long) block); -+ EXT_ASSERT(len > lblock); -+ len = len - lblock; -+ } else { -+ lblock = len = 0; -+ BUG(); -+ } -+ -+ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len); -+ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP); -+} -+ -+static inline int -+ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, -+ struct ext3_extent *ex) -+{ -+ struct ext3_ext_cache *cex = tree->cex; -+ -+ /* is there cache storage at all? */ -+ if (!cex) -+ return EXT3_EXT_CACHE_NO; -+ -+ /* has cache valid data? */ -+ if (cex->ec_type == EXT3_EXT_CACHE_NO) -+ return EXT3_EXT_CACHE_NO; -+ -+ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || -+ cex->ec_type == EXT3_EXT_CACHE_EXTENT); -+ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { -+ ex->ee_block = cex->ec_block; -+ ex->ee_start = cex->ec_start; -+ ex->ee_start_hi = 0; -+ ex->ee_len = cex->ec_len; -+ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", -+ (unsigned long) block, -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len, -+ (unsigned long) ex->ee_start); -+ return cex->ec_type; -+ } -+ -+ /* not in cache */ -+ return EXT3_EXT_CACHE_NO; -+} -+ -+/* -+ * routine removes index from the index block -+ * it's used in truncate case only. 
thus all requests are for -+ * last index in the block only -+ */ -+int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ struct buffer_head *bh; -+ int err; -+ -+ /* free index block */ -+ path--; -+ EXT_ASSERT(path->p_hdr->eh_entries); -+ if ((err = ext3_ext_get_access(handle, tree, path))) -+ return err; -+ path->p_hdr->eh_entries--; -+ if ((err = ext3_ext_dirty(handle, tree, path))) -+ return err; -+ ext_debug(tree, "index is empty, remove it, free block %d\n", -+ path->p_idx->ei_leaf); -+ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); -+ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); -+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); -+ return err; -+} -+ -+int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int depth = EXT_DEPTH(tree); -+ int needed; -+ -+ if (path) { -+ /* probably there is space in leaf? */ -+ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) -+ return 1; -+ } -+ -+ /* -+ * the worste case we're expecting is creation of the -+ * new root (growing in depth) with index splitting -+ * for splitting we have to consider depth + 1 because -+ * previous growing could increase it -+ */ -+ depth = depth + 1; -+ -+ /* -+ * growing in depth: -+ * block allocation + new root + old root -+ */ -+ needed = EXT3_ALLOC_NEEDED + 2; -+ -+ /* index split. 
we may need: -+ * allocate intermediate indexes and new leaf -+ * change two blocks at each level, but root -+ * modify root block (inode) -+ */ -+ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; -+ -+ return needed; -+} -+ -+static int -+ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, unsigned long start, -+ unsigned long end) -+{ -+ struct ext3_extent *ex, tex; -+ struct ext3_ext_path *npath; -+ int depth, creds, err; -+ -+ depth = EXT_DEPTH(tree); -+ ex = path[depth].p_ext; -+ EXT_ASSERT(ex); -+ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); -+ EXT_ASSERT(ex->ee_block < start); -+ -+ /* calculate tail extent */ -+ tex.ee_block = end + 1; -+ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); -+ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; -+ -+ creds = ext3_ext_calc_credits_for_insert(tree, path); -+ handle = ext3_ext_journal_restart(handle, creds); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ /* calculate head extent. use primary extent */ -+ err = ext3_ext_get_access(handle, tree, path + depth); -+ if (err) -+ return err; -+ ex->ee_len = start - ex->ee_block; -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ if (err) -+ return err; -+ -+ /* FIXME: some callback to free underlying resource -+ * and correct ee_start? 
*/ -+ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", -+ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); -+ -+ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); -+ if (IS_ERR(npath)) -+ return PTR_ERR(npath); -+ depth = EXT_DEPTH(tree); -+ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); -+ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); -+ -+ err = ext3_ext_insert_extent(handle, tree, npath, &tex); -+ ext3_ext_drop_refs(npath); -+ kfree(npath); -+ -+ return err; -+} -+ -+static int -+ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, unsigned long start, -+ unsigned long end) -+{ -+ struct ext3_extent *ex, *fu = NULL, *lu, *le; -+ int err = 0, correct_index = 0; -+ int depth = EXT_DEPTH(tree), credits; -+ struct ext3_extent_header *eh; -+ unsigned a, b, block, num; -+ -+ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); -+ if (!path[depth].p_hdr) -+ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); -+ eh = path[depth].p_hdr; -+ EXT_ASSERT(eh); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ -+ /* find where to start removing */ -+ le = ex = EXT_LAST_EXTENT(eh); -+ while (ex != EXT_FIRST_EXTENT(eh)) { -+ if (ex->ee_block <= end) -+ break; -+ ex--; -+ } -+ -+ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { -+ /* removal of internal part of the extent requested -+ * tail and head must be placed in different extent -+ * so, we have to insert one more extent */ -+ path[depth].p_ext = ex; -+ return ext3_ext_split_for_rm(handle, tree, path, start, end); -+ } -+ -+ lu = ex; -+ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) { -+ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); -+ path[depth].p_ext = ex; -+ -+ a = ex->ee_block > start ? ex->ee_block : start; -+ b = ex->ee_block + ex->ee_len - 1 < end ? 
-+ ex->ee_block + ex->ee_len - 1 : end; -+ -+ ext_debug(tree, " border %u:%u\n", a, b); -+ -+ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { -+ block = 0; -+ num = 0; -+ BUG(); -+ } else if (a != ex->ee_block) { -+ /* remove tail of the extent */ -+ block = ex->ee_block; -+ num = a - block; -+ } else if (b != ex->ee_block + ex->ee_len - 1) { -+ /* remove head of the extent */ -+ block = a; -+ num = b - a; -+ } else { -+ /* remove whole extent: excelent! */ -+ block = ex->ee_block; -+ num = 0; -+ EXT_ASSERT(a == ex->ee_block && -+ b == ex->ee_block + ex->ee_len - 1); -+ } -+ -+ if (ex == EXT_FIRST_EXTENT(eh)) -+ correct_index = 1; -+ -+ credits = 1; -+ if (correct_index) -+ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; -+ if (tree->ops->remove_extent_credits) -+ credits+=tree->ops->remove_extent_credits(tree,ex,a,b); -+ -+ handle = ext3_ext_journal_restart(handle, credits); -+ if (IS_ERR(handle)) { -+ err = PTR_ERR(handle); -+ goto out; -+ } -+ -+ err = ext3_ext_get_access(handle, tree, path + depth); -+ if (err) -+ goto out; -+ -+ if (tree->ops->remove_extent) -+ err = tree->ops->remove_extent(tree, ex, a, b); -+ if (err) -+ goto out; -+ -+ if (num == 0) { -+ /* this extent is removed entirely mark slot unused */ -+ ex->ee_start = ex->ee_start_hi = 0; -+ eh->eh_entries--; -+ fu = ex; -+ } -+ -+ ex->ee_block = block; -+ ex->ee_len = num; -+ -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ if (err) -+ goto out; -+ -+ ext_debug(tree, "new extent: %u:%u:%u\n", -+ ex->ee_block, ex->ee_len, ex->ee_start); -+ ex--; -+ } -+ -+ if (fu) { -+ /* reuse unused slots */ -+ while (lu < le) { -+ if (lu->ee_start) { -+ *fu = *lu; -+ lu->ee_start = lu->ee_start_hi = 0; -+ fu++; -+ } -+ lu++; -+ } -+ } -+ -+ if (correct_index && eh->eh_entries) -+ err = ext3_ext_correct_indexes(handle, tree, path); -+ -+ /* if this leaf is free, then we should -+ * remove it from index block above */ -+ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) -+ 
err = ext3_ext_rm_idx(handle, tree, path + depth); -+ -+out: -+ return err; -+} -+ -+ -+static struct ext3_extent_idx * -+ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) -+{ -+ struct ext3_extent_idx *ix; -+ -+ ix = EXT_LAST_INDEX(hdr); -+ while (ix != EXT_FIRST_INDEX(hdr)) { -+ if (ix->ei_block <= block) -+ break; -+ ix--; -+ } -+ return ix; -+} -+ -+/* -+ * returns 1 if current index have to be freed (even partial) -+ */ -+static int inline -+ext3_ext_more_to_rm(struct ext3_ext_path *path) -+{ -+ EXT_ASSERT(path->p_idx); -+ -+ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) -+ return 0; -+ -+ /* -+ * if truncate on deeper level happened it it wasn't partial -+ * so we have to consider current index for truncation -+ */ -+ if (path->p_hdr->eh_entries == path->p_block) -+ return 0; -+ return 1; -+} -+ -+int ext3_ext_remove_space(struct ext3_extents_tree *tree, -+ unsigned long start, unsigned long end) -+{ -+ struct inode *inode = tree->inode; -+ struct super_block *sb = inode->i_sb; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_ext_path *path; -+ handle_t *handle; -+ int i = 0, err = 0; -+ -+ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); -+ -+ /* probably first extent we're gonna free will be last in block */ -+ handle = ext3_journal_start(inode, depth + 1); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ ext3_ext_invalidate_cache(tree); -+ -+ /* -+ * we start scanning from right side freeing all the blocks -+ * after i_size and walking into the deep -+ */ -+ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); -+ if (IS_ERR(path)) { -+ ext3_error(sb, __FUNCTION__, "Can't allocate path array"); -+ ext3_journal_stop(handle); -+ return -ENOMEM; -+ } -+ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); -+ path[i].p_hdr = EXT_ROOT_HDR(tree); -+ -+ while (i >= 0 && err == 0) { -+ if (i == depth) { -+ /* this is leaf block */ -+ err = ext3_ext_rm_leaf(handle, tree, path, start, end); -+ 
/* root level have p_bh == NULL, brelse() eats this */ -+ brelse(path[i].p_bh); -+ i--; -+ continue; -+ } -+ -+ /* this is index block */ -+ if (!path[i].p_hdr) { -+ ext_debug(tree, "initialize header\n"); -+ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); -+ } -+ -+ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); -+ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); -+ -+ if (!path[i].p_idx) { -+ /* this level hasn't touched yet */ -+ path[i].p_idx = -+ ext3_ext_last_covered(path[i].p_hdr, end); -+ path[i].p_block = path[i].p_hdr->eh_entries + 1; -+ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", -+ path[i].p_hdr, path[i].p_hdr->eh_entries); -+ } else { -+ /* we've already was here, see at next index */ -+ path[i].p_idx--; -+ } -+ -+ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", -+ i, EXT_FIRST_INDEX(path[i].p_hdr), -+ path[i].p_idx); -+ if (ext3_ext_more_to_rm(path + i)) { -+ /* go to the next level */ -+ ext_debug(tree, "move to level %d (block %d)\n", -+ i + 1, path[i].p_idx->ei_leaf); -+ memset(path + i + 1, 0, sizeof(*path)); -+ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); -+ if (!path[i+1].p_bh) { -+ /* should we reset i_size? 
*/ -+ err = -EIO; -+ break; -+ } -+ /* put actual number of indexes to know is this -+ * number got changed at the next iteration */ -+ path[i].p_block = path[i].p_hdr->eh_entries; -+ i++; -+ } else { -+ /* we finish processing this index, go up */ -+ if (path[i].p_hdr->eh_entries == 0 && i > 0) { -+ /* index is empty, remove it -+ * handle must be already prepared by the -+ * truncatei_leaf() */ -+ err = ext3_ext_rm_idx(handle, tree, path + i); -+ } -+ /* root level have p_bh == NULL, brelse() eats this */ -+ brelse(path[i].p_bh); -+ i--; -+ ext_debug(tree, "return to level %d\n", i); -+ } -+ } -+ -+ /* TODO: flexible tree reduction should be here */ -+ if (path->p_hdr->eh_entries == 0) { -+ /* -+ * truncate to zero freed all the tree -+ * so, we need to correct eh_depth -+ */ -+ err = ext3_ext_get_access(handle, tree, path); -+ if (err == 0) { -+ EXT_ROOT_HDR(tree)->eh_depth = 0; -+ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree); -+ err = ext3_ext_dirty(handle, tree, path); -+ } -+ } -+ ext3_ext_tree_changed(tree); -+ -+ kfree(path); -+ ext3_journal_stop(handle); -+ -+ return err; -+} -+ -+int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks) -+{ -+ int lcap, icap, rcap, leafs, idxs, num; -+ -+ rcap = ext3_ext_space_root(tree); -+ if (blocks <= rcap) { -+ /* all extents fit to the root */ -+ return 0; -+ } -+ -+ rcap = ext3_ext_space_root_idx(tree); -+ lcap = ext3_ext_space_block(tree); -+ icap = ext3_ext_space_block_idx(tree); -+ -+ num = leafs = (blocks + lcap - 1) / lcap; -+ if (leafs <= rcap) { -+ /* all pointers to leafs fit to the root */ -+ return leafs; -+ } -+ -+ /* ok. 
we need separate index block(s) to link all leaf blocks */ -+ idxs = (leafs + icap - 1) / icap; -+ do { -+ num += idxs; -+ idxs = (idxs + icap - 1) / icap; -+ } while (idxs > rcap); -+ -+ return num; -+} -+ -+/* -+ * called at mount time -+ */ -+void ext3_ext_init(struct super_block *sb) -+{ -+ /* -+ * possible initialization would be here -+ */ -+ -+ if (test_opt(sb, EXTENTS)) { -+ printk("EXT3-fs: file extents enabled"); -+#ifdef AGRESSIVE_TEST -+ printk(", agressive tests"); -+#endif -+#ifdef CHECK_BINSEARCH -+ printk(", check binsearch"); -+#endif -+ printk("\n"); -+ } -+} -+ -+/* -+ * called at umount time -+ */ -+void ext3_ext_release(struct super_block *sb) -+{ -+} -+ -+/************************************************************************ -+ * VFS related routines -+ ************************************************************************/ -+ -+static int ext3_get_inode_write_access(handle_t *handle, void *buffer) -+{ -+ /* we use in-core data, not bh */ -+ return 0; -+} -+ -+static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) -+{ -+ struct inode *inode = buffer; -+ return ext3_mark_inode_dirty(handle, inode); -+} -+ -+static int ext3_ext_mergable(struct ext3_extent *ex1, -+ struct ext3_extent *ex2) -+{ -+ /* FIXME: support for large fs */ -+ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) -+ return 1; -+ return 0; -+} -+ -+static int -+ext3_remove_blocks_credits(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex, -+ unsigned long from, unsigned long to) -+{ -+ int needed; -+ -+ /* at present, extent can't cross block group */; -+ needed = 4; /* bitmap + group desc + sb + inode */ -+ -+#ifdef CONFIG_QUOTA -+ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; -+#endif -+ return needed; -+} -+ -+static int -+ext3_remove_blocks(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex, -+ unsigned long from, unsigned long to) -+{ -+ int needed = ext3_remove_blocks_credits(tree, ex, from, to); -+ handle_t *handle = 
ext3_journal_start(tree->inode, needed); -+ struct buffer_head *bh; -+ int i; -+ -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { -+ /* tail removal */ -+ unsigned long num, start; -+ num = ex->ee_block + ex->ee_len - from; -+ start = ex->ee_start + ex->ee_len - num; -+ ext_debug(tree, "free last %lu blocks starting %lu\n", -+ num, start); -+ for (i = 0; i < num; i++) { -+ bh = sb_find_get_block(tree->inode->i_sb, start + i); -+ ext3_forget(handle, 0, tree->inode, bh, start + i); -+ } -+ ext3_free_blocks(handle, tree->inode, start, num); -+ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { -+ printk("strange request: removal %lu-%lu from %u:%u\n", -+ from, to, ex->ee_block, ex->ee_len); -+ } else { -+ printk("strange request: removal(2) %lu-%lu from %u:%u\n", -+ from, to, ex->ee_block, ex->ee_len); -+ } -+ ext3_journal_stop(handle); -+ return 0; -+} -+ -+static int ext3_ext_find_goal(struct inode *inode, -+ struct ext3_ext_path *path, unsigned long block) -+{ -+ struct ext3_inode_info *ei = EXT3_I(inode); -+ unsigned long bg_start; -+ unsigned long colour; -+ int depth; -+ -+ if (path) { -+ struct ext3_extent *ex; -+ depth = path->p_depth; -+ -+ /* try to predict block placement */ -+ if ((ex = path[depth].p_ext)) -+ return ex->ee_start + (block - ex->ee_block); -+ -+ /* it looks index is empty -+ * try to find starting from index itself */ -+ if (path[depth].p_bh) -+ return path[depth].p_bh->b_blocknr; -+ } -+ -+ /* OK. 
use inode's group */ -+ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + -+ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); -+ colour = (current->pid % 16) * -+ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); -+ return bg_start + colour + block; -+} -+ -+static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *ex, int *err) -+{ -+ struct inode *inode = tree->inode; -+ int newblock, goal; -+ -+ EXT_ASSERT(path); -+ EXT_ASSERT(ex); -+ EXT_ASSERT(ex->ee_start); -+ EXT_ASSERT(ex->ee_len); -+ -+ /* reuse block from the extent to order data/metadata */ -+ newblock = ex->ee_start++; -+ ex->ee_len--; -+ if (ex->ee_len == 0) { -+ ex->ee_len = 1; -+ /* allocate new block for the extent */ -+ goal = ext3_ext_find_goal(inode, path, ex->ee_block); -+ ex->ee_start = ext3_new_block(handle, inode, goal, err); -+ ex->ee_start_hi = 0; -+ if (ex->ee_start == 0) { -+ /* error occured: restore old extent */ -+ ex->ee_start = newblock; -+ return 0; -+ } -+ } -+ return newblock; -+} -+ -+static struct ext3_extents_helpers ext3_blockmap_helpers = { -+ .get_write_access = ext3_get_inode_write_access, -+ .mark_buffer_dirty = ext3_mark_buffer_dirty, -+ .mergable = ext3_ext_mergable, -+ .new_block = ext3_new_block_cb, -+ .remove_extent = ext3_remove_blocks, -+ .remove_extent_credits = ext3_remove_blocks_credits, -+}; -+ -+void ext3_init_tree_desc(struct ext3_extents_tree *tree, -+ struct inode *inode) -+{ -+ tree->inode = inode; -+ tree->root = (void *) EXT3_I(inode)->i_data; -+ tree->buffer = (void *) inode; -+ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); -+ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; -+ tree->ops = &ext3_blockmap_helpers; -+} -+ -+int ext3_ext_get_block(handle_t *handle, struct inode *inode, -+ long iblock, struct buffer_head *bh_result, -+ int create, int extend_disksize) -+{ -+ struct ext3_ext_path *path = NULL; -+ struct ext3_extent 
newex; -+ struct ext3_extent *ex; -+ int goal, newblock, err = 0, depth; -+ struct ext3_extents_tree tree; -+ -+ clear_buffer_new(bh_result); -+ ext3_init_tree_desc(&tree, inode); -+ ext_debug(&tree, "block %d requested for inode %u\n", -+ (int) iblock, (unsigned) inode->i_ino); -+ down(&EXT3_I(inode)->truncate_sem); -+ -+ /* check in cache */ -+ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { -+ if (goal == EXT3_EXT_CACHE_GAP) { -+ if (!create) { -+ /* block isn't allocated yet and -+ * user don't want to allocate it */ -+ goto out2; -+ } -+ /* we should allocate requested block */ -+ } else if (goal == EXT3_EXT_CACHE_EXTENT) { -+ /* block is already allocated */ -+ newblock = iblock - newex.ee_block + newex.ee_start; -+ goto out; -+ } else { -+ EXT_ASSERT(0); -+ } -+ } -+ -+ /* find extent for this block */ -+ path = ext3_ext_find_extent(&tree, iblock, NULL); -+ if (IS_ERR(path)) { -+ err = PTR_ERR(path); -+ path = NULL; -+ goto out2; -+ } -+ -+ depth = EXT_DEPTH(&tree); -+ -+ /* -+ * consistent leaf must not be empty -+ * this situations is possible, though, _during_ tree modification -+ * this is why assert can't be put in ext3_ext_find_extent() -+ */ -+ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); -+ -+ if ((ex = path[depth].p_ext)) { -+ /* if found exent covers block, simple return it */ -+ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { -+ newblock = iblock - ex->ee_block + ex->ee_start; -+ ext_debug(&tree, "%d fit into %d:%d -> %d\n", -+ (int) iblock, ex->ee_block, ex->ee_len, -+ newblock); -+ ext3_ext_put_in_cache(&tree, ex->ee_block, -+ ex->ee_len, ex->ee_start, -+ EXT3_EXT_CACHE_EXTENT); -+ goto out; -+ } -+ } -+ -+ /* -+ * requested block isn't allocated yet -+ * we couldn't try to create block if create flag is zero -+ */ -+ if (!create) { -+ /* put just found gap into cache to speedup subsequest reqs */ -+ ext3_ext_put_gap_in_cache(&tree, path, iblock); -+ goto out2; -+ } -+ -+ /* allocate new block */ -+ goal = 
ext3_ext_find_goal(inode, path, iblock); -+ newblock = ext3_new_block(handle, inode, goal, &err); -+ if (!newblock) -+ goto out2; -+ ext_debug(&tree, "allocate new block: goal %d, found %d\n", -+ goal, newblock); -+ -+ /* try to insert new extent into found leaf and return */ -+ newex.ee_block = iblock; -+ newex.ee_start = newblock; -+ newex.ee_start_hi = 0; -+ newex.ee_len = 1; -+ err = ext3_ext_insert_extent(handle, &tree, path, &newex); -+ if (err) -+ goto out2; -+ -+ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ -+ /* previous routine could use block we allocated */ -+ newblock = newex.ee_start; -+ set_buffer_new(bh_result); -+ -+ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, -+ newex.ee_start, EXT3_EXT_CACHE_EXTENT); -+out: -+ ext3_ext_show_leaf(&tree, path); -+ map_bh(bh_result, inode->i_sb, newblock); -+out2: -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ up(&EXT3_I(inode)->truncate_sem); -+ -+ return err; -+} -+ -+void ext3_ext_truncate(struct inode * inode, struct page *page) -+{ -+ struct address_space *mapping = inode->i_mapping; -+ struct super_block *sb = inode->i_sb; -+ struct ext3_extents_tree tree; -+ unsigned long last_block; -+ handle_t *handle; -+ int err = 0; -+ -+ ext3_init_tree_desc(&tree, inode); -+ -+ /* -+ * probably first extent we're gonna free will be last in block -+ */ -+ err = ext3_writepage_trans_blocks(inode) + 3; -+ handle = ext3_journal_start(inode, err); -+ if (IS_ERR(handle)) { -+ if (page) { -+ clear_highpage(page); -+ flush_dcache_page(page); -+ unlock_page(page); -+ page_cache_release(page); -+ } -+ return; -+ } -+ -+ if (page) -+ ext3_block_truncate_page(handle, page, mapping, inode->i_size); -+ -+ down(&EXT3_I(inode)->truncate_sem); -+ ext3_ext_invalidate_cache(&tree); -+ -+ /* -+ * TODO: optimization is possible here -+ * probably we need not scaning at all, -+ * because page truncation is enough -+ */ -+ if 
(ext3_orphan_add(handle, inode)) -+ goto out_stop; -+ -+ /* we have to know where to truncate from in crash case */ -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ ext3_mark_inode_dirty(handle, inode); -+ -+ last_block = (inode->i_size + sb->s_blocksize - 1) >> -+ EXT3_BLOCK_SIZE_BITS(sb); -+ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); -+ -+ /* In a multi-transaction truncate, we only make the final -+ * transaction synchronous */ -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ -+out_stop: -+ /* -+ * If this was a simple ftruncate(), and the file will remain alive -+ * then we need to clear up the orphan record which we created above. -+ * However, if this was a real unlink then we were called by -+ * ext3_delete_inode(), and we allow that function to clean up the -+ * orphan info for us. -+ */ -+ if (inode->i_nlink) -+ ext3_orphan_del(handle, inode); -+ -+ up(&EXT3_I(inode)->truncate_sem); -+ ext3_journal_stop(handle); -+} -+ -+/* -+ * this routine calculate max number of blocks we could modify -+ * in order to allocate new block for an inode -+ */ -+int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) -+{ -+ struct ext3_extents_tree tree; -+ int needed; -+ -+ ext3_init_tree_desc(&tree, inode); -+ -+ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); -+ -+ /* caller want to allocate num blocks */ -+ needed *= num; -+ -+#ifdef CONFIG_QUOTA -+ /* -+ * FIXME: real calculation should be here -+ * it depends on blockmap format of qouta file -+ */ -+ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; -+#endif -+ -+ return needed; -+} -+ -+void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) -+{ -+ struct ext3_extents_tree tree; -+ -+ ext3_init_tree_desc(&tree, inode); -+ ext3_extent_tree_init(handle, &tree); -+} -+ -+int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) -+{ -+ struct ext3_extents_tree tree; -+ -+ ext3_init_tree_desc(&tree, inode); -+ return ext3_ext_calc_metadata_amount(&tree, 
blocks); -+} -+ -+static int -+ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_ext_cache *newex) -+{ -+ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; -+ -+ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) -+ return EXT_CONTINUE; -+ -+ if (buf->err < 0) -+ return EXT_BREAK; -+ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) -+ return EXT_BREAK; -+ -+ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { -+ buf->err++; -+ buf->cur += sizeof(*newex); -+ } else { -+ buf->err = -EFAULT; -+ return EXT_BREAK; -+ } -+ return EXT_CONTINUE; -+} -+ -+static int -+ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_ext_cache *ex) -+{ -+ struct ext3_extent_tree_stats *buf = -+ (struct ext3_extent_tree_stats *) tree->private; -+ int depth; -+ -+ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) -+ return EXT_CONTINUE; -+ -+ depth = EXT_DEPTH(tree); -+ buf->extents_num++; -+ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) -+ buf->leaf_num++; -+ return EXT_CONTINUE; -+} -+ -+int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, -+ unsigned long arg) -+{ -+ int err = 0; -+ -+ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) -+ return -EINVAL; -+ -+ if (cmd == EXT3_IOC_GET_EXTENTS) { -+ struct ext3_extent_buf buf; -+ struct ext3_extents_tree tree; -+ -+ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) -+ return -EFAULT; -+ -+ ext3_init_tree_desc(&tree, inode); -+ buf.cur = buf.buffer; -+ buf.err = 0; -+ tree.private = &buf; -+ down(&EXT3_I(inode)->truncate_sem); -+ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, -+ ext3_ext_store_extent_cb); -+ up(&EXT3_I(inode)->truncate_sem); -+ if (err == 0) -+ err = buf.err; -+ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { -+ struct ext3_extent_tree_stats buf; -+ struct ext3_extents_tree tree; -+ -+ ext3_init_tree_desc(&tree, inode); -+ 
down(&EXT3_I(inode)->truncate_sem); -+ buf.depth = EXT_DEPTH(&tree); -+ buf.extents_num = 0; -+ buf.leaf_num = 0; -+ tree.private = &buf; -+ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, -+ ext3_ext_collect_stats_cb); -+ up(&EXT3_I(inode)->truncate_sem); -+ if (!err) -+ err = copy_to_user((void *) arg, &buf, sizeof(buf)); -+ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { -+ struct ext3_extents_tree tree; -+ ext3_init_tree_desc(&tree, inode); -+ down(&EXT3_I(inode)->truncate_sem); -+ err = EXT_DEPTH(&tree); -+ up(&EXT3_I(inode)->truncate_sem); -+ } -+ -+ return err; -+} -+ -+EXPORT_SYMBOL(ext3_init_tree_desc); -+EXPORT_SYMBOL(ext3_mark_inode_dirty); -+EXPORT_SYMBOL(ext3_ext_invalidate_cache); -+EXPORT_SYMBOL(ext3_ext_insert_extent); -+EXPORT_SYMBOL(ext3_ext_walk_space); -+EXPORT_SYMBOL(ext3_ext_find_goal); -+EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); -Index: linux-2.6.16.21-0.8/fs/ext3/ialloc.c -=================================================================== ---- linux-2.6.16.21-0.8.orig/fs/ext3/ialloc.c -+++ linux-2.6.16.21-0.8/fs/ext3/ialloc.c -@@ -598,7 +598,7 @@ got: - ei->i_dir_start_lookup = 0; - ei->i_disksize = 0; - -- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; -+ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); - /* dirsync only applies to directories */ -@@ -642,6 +642,18 @@ got: - if (err) - goto fail_free_drop; - -+ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { -+ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; -+ ext3_extents_initialize_blockmap(handle, inode); -+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { -+ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); -+ if (err) goto fail; -+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); -+ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); -+ } -+ } 
-+ - err = ext3_mark_inode_dirty(handle, inode); - if (err) { - ext3_std_error(sb, err); -Index: linux-2.6.16.21-0.8/fs/ext3/inode.c -=================================================================== ---- linux-2.6.16.21-0.8.orig/fs/ext3/inode.c -+++ linux-2.6.16.21-0.8/fs/ext3/inode.c -@@ -40,7 +40,7 @@ - #include "iopen.h" - #include "acl.h" - --static int ext3_writepage_trans_blocks(struct inode *inode); -+int ext3_writepage_trans_blocks(struct inode *inode); - - /* - * Test whether an inode is a fast symlink. -@@ -788,6 +788,17 @@ out: - return err; - } - -+static inline int -+ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, -+ struct buffer_head *bh, int create, int extend_disksize) -+{ -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_get_block(handle, inode, block, bh, create, -+ extend_disksize); -+ return ext3_get_block_handle(handle, inode, block, bh, create, -+ extend_disksize); -+} -+ - static int ext3_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) - { -@@ -798,8 +809,8 @@ static int ext3_get_block(struct inode * - handle = ext3_journal_current_handle(); - J_ASSERT(handle != 0); - } -- ret = ext3_get_block_handle(handle, inode, iblock, -- bh_result, create, 1); -+ ret = ext3_get_block_wrap(handle, inode, iblock, -+ bh_result, create, 1); - return ret; - } - -@@ -843,7 +854,7 @@ ext3_direct_io_get_blocks(struct inode * - - get_block: - if (ret == 0) -- ret = ext3_get_block_handle(handle, inode, iblock, -+ ret = ext3_get_block_wrap(handle, inode, iblock, - bh_result, create, 0); - bh_result->b_size = (1 << inode->i_blkbits); - return ret; -@@ -863,7 +874,7 @@ struct buffer_head *ext3_getblk(handle_t - dummy.b_state = 0; - dummy.b_blocknr = -1000; - buffer_trace_init(&dummy.b_history); -- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); -+ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); - if (!*errp && buffer_mapped(&dummy)) 
{ - struct buffer_head *bh; - bh = sb_getblk(inode->i_sb, dummy.b_blocknr); -@@ -1606,7 +1617,7 @@ void ext3_set_aops(struct inode *inode) - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ --static int ext3_block_truncate_page(handle_t *handle, struct page *page, -+int ext3_block_truncate_page(handle_t *handle, struct page *page, - struct address_space *mapping, loff_t from) - { - unsigned long index = from >> PAGE_CACHE_SHIFT; -@@ -2116,6 +2127,9 @@ void ext3_truncate(struct inode * inode) - return; - } - -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_truncate(inode, page); -+ - handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { -@@ -2863,12 +2877,15 @@ err_out: - * block and work out the exact number of indirects which are touched. Pah. - */ - --static int ext3_writepage_trans_blocks(struct inode *inode) -+int ext3_writepage_trans_blocks(struct inode *inode) - { - int bpp = ext3_journal_blocks_per_page(inode); - int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 
5 : 3; - int ret; - -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_writepage_trans_blocks(inode, bpp); -+ - if (ext3_should_journal_data(inode)) - ret = 3 * (bpp + indirects) + 2; - else -Index: linux-2.6.16.21-0.8/fs/ext3/Makefile -=================================================================== ---- linux-2.6.16.21-0.8.orig/fs/ext3/Makefile -+++ linux-2.6.16.21-0.8/fs/ext3/Makefile -@@ -5,7 +5,8 @@ - obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ -- ioctl.o namei.o super.o symlink.o hash.o resize.o -+ ioctl.o namei.o super.o symlink.o hash.o resize.o \ -+ extents.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-2.6.16.21-0.8/fs/ext3/super.c -=================================================================== ---- linux-2.6.16.21-0.8.orig/fs/ext3/super.c -+++ linux-2.6.16.21-0.8/fs/ext3/super.c -@@ -392,6 +392,7 @@ static void ext3_put_super (struct super - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); - if (!(sb->s_flags & MS_RDONLY)) { -@@ -456,6 +457,8 @@ static struct inode *ext3_alloc_inode(st - #endif - ei->i_block_alloc_info = NULL; - ei->vfs_inode.i_version = 1; -+ -+ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); - return &ei->vfs_inode; - } - -@@ -638,6 +641,7 @@ enum { - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -+ Opt_extents, Opt_noextents, Opt_extdebug, - Opt_grpquota - }; - -@@ -689,6 +693,9 @@ static match_table_t tokens = { - {Opt_iopen, "iopen"}, - {Opt_noiopen, "noiopen"}, - {Opt_iopen_nopriv, "iopen_nopriv"}, -+ {Opt_extents, "extents"}, -+ {Opt_noextents, "noextents"}, -+ {Opt_extdebug, "extdebug"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, 
NULL}, - {Opt_resize, "resize"}, -@@ -1030,6 +1036,15 @@ clear_qf_name: - case Opt_nobh: - set_opt(sbi->s_mount_opt, NOBH); - break; -+ case Opt_extents: -+ set_opt (sbi->s_mount_opt, EXTENTS); -+ break; -+ case Opt_noextents: -+ clear_opt (sbi->s_mount_opt, EXTENTS); -+ break; -+ case Opt_extdebug: -+ set_opt (sbi->s_mount_opt, EXTDEBUG); -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1756,6 +1768,7 @@ static int ext3_fill_super (struct super - percpu_counter_mod(&sbi->s_dirs_counter, - ext3_count_dirs(sb)); - -+ ext3_ext_init(sb); - lock_kernel(); - return 0; - -Index: linux-2.6.16.21-0.8/fs/ext3/ioctl.c -=================================================================== ---- linux-2.6.16.21-0.8.orig/fs/ext3/ioctl.c -+++ linux-2.6.16.21-0.8/fs/ext3/ioctl.c -@@ -125,6 +125,10 @@ flags_err: - err = ext3_change_inode_journal_flag(inode, jflag); - return err; - } -+ case EXT3_IOC_GET_EXTENTS: -+ case EXT3_IOC_GET_TREE_STATS: -+ case EXT3_IOC_GET_TREE_DEPTH: -+ return ext3_ext_ioctl(inode, filp, cmd, arg); - case EXT3_IOC_GETVERSION: - case EXT3_IOC_GETVERSION_OLD: - return put_user(inode->i_generation, (int __user *) arg); -Index: linux-2.6.16.21-0.8/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.16.21-0.8.orig/include/linux/ext3_fs.h -+++ linux-2.6.16.21-0.8/include/linux/ext3_fs.h -@@ -185,9 +185,10 @@ struct ext3_group_desc - #define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */ - #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ - #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ -+#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ - #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ - --#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ -+#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ - #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User 
modifiable flags */ - - /* -@@ -237,6 +238,9 @@ struct ext3_new_group_data { - #endif - #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) - #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) -+#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) -+#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) -+#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) - - /* - * Mount options -@@ -377,6 +381,8 @@ struct ext3_inode { - #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ - #define EXT3_MOUNT_IOPEN 0x400000 /* Allow access via iopen */ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ -+#define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ -+#define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -565,11 +571,13 @@ static inline struct ext3_inode_info *EX - #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ - #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ - #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 -+#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ - - #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR - #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ - EXT3_FEATURE_INCOMPAT_RECOVER| \ -- EXT3_FEATURE_INCOMPAT_META_BG) -+ EXT3_FEATURE_INCOMPAT_META_BG| \ -+ EXT3_FEATURE_INCOMPAT_EXTENTS) - #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT3_FEATURE_RO_COMPAT_BTREE_DIR) -@@ -776,6 +784,7 @@ extern unsigned long ext3_count_free (st - - - /* inode.c */ -+extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); - extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); - extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); - extern struct buffer_head * ext3_bread (handle_t *, struct inode 
*, int, int, int *); -@@ -792,6 +801,7 @@ extern int ext3_get_inode_loc(struct ino - extern void ext3_truncate (struct inode *); - extern void ext3_set_inode_flags(struct inode *); - extern void ext3_set_aops(struct inode *inode); -+extern int ext3_writepage_trans_blocks(struct inode *inode); - - /* ioctl.c */ - extern int ext3_ioctl (struct inode *, struct file *, unsigned int, -@@ -845,6 +855,16 @@ extern struct inode_operations ext3_spec - extern struct inode_operations ext3_symlink_inode_operations; - extern struct inode_operations ext3_fast_symlink_inode_operations; - -+/* extents.c */ -+extern int ext3_ext_writepage_trans_blocks(struct inode *, int); -+extern int ext3_ext_get_block(handle_t *, struct inode *, long, -+ struct buffer_head *, int, int); -+extern void ext3_ext_truncate(struct inode *, struct page *); -+extern void ext3_ext_init(struct super_block *); -+extern void ext3_ext_release(struct super_block *); -+extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); -+extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg); - - #endif /* __KERNEL__ */ - -Index: linux-2.6.16.21-0.8/include/linux/ext3_extents.h -=================================================================== ---- /dev/null -+++ linux-2.6.16.21-0.8/include/linux/ext3_extents.h -@@ -0,0 +1,262 @@ -+/* -+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+#ifndef _LINUX_EXT3_EXTENTS -+#define _LINUX_EXT3_EXTENTS -+ -+/* -+ * with AGRESSIVE_TEST defined capacity of index/leaf blocks -+ * become very little, so index split, in-depth growing and -+ * other hard changes happens much more often -+ * this is for debug purposes only -+ */ -+#define AGRESSIVE_TEST_ -+ -+/* -+ * if CHECK_BINSEARCH defined, then results of binary search -+ * will be checked by linear search -+ */ -+#define CHECK_BINSEARCH_ -+ -+/* -+ * if EXT_DEBUG is defined you can use 'extdebug' mount option -+ * to get lots of info what's going on -+ */ -+#define EXT_DEBUG_ -+#ifdef EXT_DEBUG -+#define ext_debug(tree,fmt,a...) \ -+do { \ -+ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ -+ printk(fmt, ##a); \ -+} while (0); -+#else -+#define ext_debug(tree,fmt,a...) -+#endif -+ -+/* -+ * if EXT_STATS is defined then stats numbers are collected -+ * these number will be displayed at umount time -+ */ -+#define EXT_STATS_ -+ -+ -+#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */ -+ -+/* -+ * ext3_inode has i_block array (total 60 bytes) -+ * first 4 bytes are used to store: -+ * - tree depth (0 mean there is no tree yet. 
all extents in the inode) -+ * - number of alive extents in the inode -+ */ -+ -+/* -+ * this is extent on-disk structure -+ * it's used at the bottom of the tree -+ */ -+struct ext3_extent { -+ __u32 ee_block; /* first logical block extent covers */ -+ __u16 ee_len; /* number of blocks covered by extent */ -+ __u16 ee_start_hi; /* high 16 bits of physical block */ -+ __u32 ee_start; /* low 32 bigs of physical block */ -+}; -+ -+/* -+ * this is index on-disk structure -+ * it's used at all the levels, but the bottom -+ */ -+struct ext3_extent_idx { -+ __u32 ei_block; /* index covers logical blocks from 'block' */ -+ __u32 ei_leaf; /* pointer to the physical block of the next * -+ * level. leaf or next index could bet here */ -+ __u16 ei_leaf_hi; /* high 16 bits of physical block */ -+ __u16 ei_unused; -+}; -+ -+/* -+ * each block (leaves and indexes), even inode-stored has header -+ */ -+struct ext3_extent_header { -+ __u16 eh_magic; /* probably will support different formats */ -+ __u16 eh_entries; /* number of valid entries */ -+ __u16 eh_max; /* capacity of store in entries */ -+ __u16 eh_depth; /* has tree real underlaying blocks? 
*/ -+ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ -+}; -+ -+#define EXT3_EXT_MAGIC 0xf30a -+ -+/* -+ * array of ext3_ext_path contains path to some extent -+ * creation/lookup routines use it for traversal/splitting/etc -+ * truncate uses it to simulate recursive walking -+ */ -+struct ext3_ext_path { -+ __u32 p_block; -+ __u16 p_depth; -+ struct ext3_extent *p_ext; -+ struct ext3_extent_idx *p_idx; -+ struct ext3_extent_header *p_hdr; -+ struct buffer_head *p_bh; -+}; -+ -+/* -+ * structure for external API -+ */ -+ -+/* -+ * storage for cached extent -+ */ -+struct ext3_ext_cache { -+ __u32 ec_start; -+ __u32 ec_block; -+ __u32 ec_len; -+ __u32 ec_type; -+}; -+ -+#define EXT3_EXT_CACHE_NO 0 -+#define EXT3_EXT_CACHE_GAP 1 -+#define EXT3_EXT_CACHE_EXTENT 2 -+ -+/* -+ * ext3_extents_tree is used to pass initial information -+ * to top-level extents API -+ */ -+struct ext3_extents_helpers; -+struct ext3_extents_tree { -+ struct inode *inode; /* inode which tree belongs to */ -+ void *root; /* ptr to data top of tree resides at */ -+ void *buffer; /* will be passed as arg to ^^ routines */ -+ int buffer_len; -+ void *private; -+ struct ext3_ext_cache *cex;/* last found extent */ -+ struct ext3_extents_helpers *ops; -+}; -+ -+struct ext3_extents_helpers { -+ int (*get_write_access)(handle_t *h, void *buffer); -+ int (*mark_buffer_dirty)(handle_t *h, void *buffer); -+ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); -+ int (*remove_extent_credits)(struct ext3_extents_tree *, -+ struct ext3_extent *, unsigned long, -+ unsigned long); -+ int (*remove_extent)(struct ext3_extents_tree *, -+ struct ext3_extent *, unsigned long, -+ unsigned long); -+ int (*new_block)(handle_t *, struct ext3_extents_tree *, -+ struct ext3_ext_path *, struct ext3_extent *, -+ int *); -+}; -+ -+/* -+ * to be called by ext3_ext_walk_space() -+ * negative retcode - error -+ * positive retcode - signal for ext3_ext_walk_space(), see below -+ * callback must 
return valid extent (passed or newly created) -+ */ -+typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, -+ struct ext3_ext_path *, -+ struct ext3_ext_cache *); -+ -+#define EXT_CONTINUE 0 -+#define EXT_BREAK 1 -+#define EXT_REPEAT 2 -+ -+ -+#define EXT_MAX_BLOCK 0xffffffff -+ -+ -+#define EXT_FIRST_EXTENT(__hdr__) \ -+ ((struct ext3_extent *) (((char *) (__hdr__)) + \ -+ sizeof(struct ext3_extent_header))) -+#define EXT_FIRST_INDEX(__hdr__) \ -+ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ -+ sizeof(struct ext3_extent_header))) -+#define EXT_HAS_FREE_INDEX(__path__) \ -+ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) -+#define EXT_LAST_EXTENT(__hdr__) \ -+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) -+#define EXT_LAST_INDEX(__hdr__) \ -+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) -+#define EXT_MAX_EXTENT(__hdr__) \ -+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) -+#define EXT_MAX_INDEX(__hdr__) \ -+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) -+#define EXT_HDR_GEN(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) -+#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) -+#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ -+ -+#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) -+#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) -+#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) -+#define EXT_GENERATION(__tree__) EXT_HDR_GEN(EXT_ROOT_HDR(__tree__)) -+ -+#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); -+ -+#define EXT_CHECK_PATH(tree,path) \ -+{ \ -+ int depth = EXT_DEPTH(tree); \ -+ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ -+ BUG_ON((unsigned long) (path)[depth].p_idx < \ -+ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ -+ BUG_ON((unsigned long) (path)[depth].p_ext < \ -+ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ -+ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ 
-+ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ -+ && depth != 0); \ -+ BUG_ON((path)[0].p_depth != depth); \ -+} -+ -+ -+/* -+ * this structure is used to gather extents from the tree via ioctl -+ */ -+struct ext3_extent_buf { -+ unsigned long start; -+ int buflen; -+ void *buffer; -+ void *cur; -+ int err; -+}; -+ -+/* -+ * this structure is used to collect stats info about the tree -+ */ -+struct ext3_extent_tree_stats { -+ int depth; -+ int extents_num; -+ int leaf_num; -+}; -+ -+extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); -+extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); -+extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); -+extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); -+extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); -+extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); -+extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); -+extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); -+ -+static inline void -+ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) -+{ -+ if (tree->cex) -+ tree->cex->ec_type = EXT3_EXT_CACHE_NO; -+} -+ -+ -+#endif /* _LINUX_EXT3_EXTENTS */ -Index: linux-2.6.16.21-0.8/include/linux/ext3_fs_i.h -=================================================================== ---- linux-2.6.16.21-0.8.orig/include/linux/ext3_fs_i.h -+++ linux-2.6.16.21-0.8/include/linux/ext3_fs_i.h -@@ -133,6 +133,8 @@ struct ext3_inode_info { - */ - struct semaphore truncate_sem; - struct inode vfs_inode; -+ -+ __u32 i_cached_extent[4]; - }; - - #endif /* _LINUX_EXT3_FS_I */ diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.16-sles10.patch 
b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.16-sles10.patch deleted file mode 100644 index fd17dab..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.16-sles10.patch +++ /dev/null @@ -1,2947 +0,0 @@ -Index: linux-2.6.16.27-0.9/fs/ext3/extents.c -=================================================================== ---- /dev/null -+++ linux-2.6.16.27-0.9/fs/ext3/extents.c -@@ -0,0 +1,2359 @@ -+/* -+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+/* -+ * Extents support for EXT3 -+ * -+ * TODO: -+ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() -+ * - ext3_ext_calc_credits() could take 'mergable' into account -+ * - ext3*_error() should be used in some situations -+ * - find_goal() [to be tested and improved] -+ * - smart tree reduction -+ * - arch-independence -+ * common on-disk format for big/little-endian arch -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+ -+static inline int ext3_ext_check_header(struct ext3_extent_header *eh) -+{ -+ if (eh->eh_magic != EXT3_EXT_MAGIC) { -+ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", -+ (unsigned)eh->eh_magic); -+ return -EIO; -+ } -+ if (eh->eh_max == 0) { -+ 
printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n", -+ (unsigned)eh->eh_max); -+ return -EIO; -+ } -+ if (eh->eh_entries > eh->eh_max) { -+ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", -+ (unsigned)eh->eh_entries); -+ return -EIO; -+ } -+ return 0; -+} -+ -+static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) -+{ -+ int err; -+ -+ if (handle->h_buffer_credits > needed) -+ return handle; -+ if (!ext3_journal_extend(handle, needed)) -+ return handle; -+ err = ext3_journal_restart(handle, needed); -+ -+ return handle; -+} -+ -+static int inline -+ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) -+{ -+ if (tree->ops->get_write_access) -+ return tree->ops->get_write_access(h,tree->buffer); -+ else -+ return 0; -+} -+ -+static int inline -+ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) -+{ -+ if (tree->ops->mark_buffer_dirty) -+ return tree->ops->mark_buffer_dirty(h,tree->buffer); -+ else -+ return 0; -+} -+ -+/* -+ * could return: -+ * - EROFS -+ * - ENOMEM -+ */ -+static int ext3_ext_get_access(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int err; -+ -+ if (path->p_bh) { -+ /* path points to block */ -+ err = ext3_journal_get_write_access(handle, path->p_bh); -+ } else { -+ /* path points to leaf/index in inode body */ -+ err = ext3_ext_get_access_for_root(handle, tree); -+ } -+ return err; -+} -+ -+/* -+ * could return: -+ * - EROFS -+ * - ENOMEM -+ * - EIO -+ */ -+static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int err; -+ if (path->p_bh) { -+ /* path points to block */ -+ err =ext3_journal_dirty_metadata(handle, path->p_bh); -+ } else { -+ /* path points to leaf/index in inode body */ -+ err = ext3_ext_mark_root_dirty(handle, tree); -+ } -+ return err; -+} -+ -+static int inline -+ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, 
struct ext3_extent *ex, -+ int *err) -+{ -+ int goal, depth, newblock; -+ struct inode *inode; -+ -+ EXT_ASSERT(tree); -+ if (tree->ops->new_block) -+ return tree->ops->new_block(handle, tree, path, ex, err); -+ -+ inode = tree->inode; -+ depth = EXT_DEPTH(tree); -+ if (path && depth > 0) { -+ goal = path[depth-1].p_block; -+ } else { -+ struct ext3_inode_info *ei = EXT3_I(inode); -+ unsigned long bg_start; -+ unsigned long colour; -+ -+ bg_start = (ei->i_block_group * -+ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + -+ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); -+ colour = (current->pid % 16) * -+ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); -+ goal = bg_start + colour; -+ } -+ -+ newblock = ext3_new_block(handle, inode, goal, err); -+ return newblock; -+} -+ -+static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) -+{ -+ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) | -+ (EXT_HDR_GEN(neh) + 1); -+} -+ -+static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->inode->i_sb->s_blocksize - -+ sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent); -+#ifdef AGRESSIVE_TEST -+ size = 6; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->inode->i_sb->s_blocksize - -+ sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent_idx); -+#ifdef AGRESSIVE_TEST -+ size = 5; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent); -+#ifdef AGRESSIVE_TEST -+ size = 3; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->buffer_len - sizeof(struct 
ext3_extent_header)) / -+ sizeof(struct ext3_extent_idx); -+#ifdef AGRESSIVE_TEST -+ size = 4; -+#endif -+ return size; -+} -+ -+static void ext3_ext_show_path(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+#ifdef EXT_DEBUG -+ int k, l = path->p_depth; -+ -+ ext_debug(tree, "path:"); -+ for (k = 0; k <= l; k++, path++) { -+ if (path->p_idx) { -+ ext_debug(tree, " %d->%d", path->p_idx->ei_block, -+ path->p_idx->ei_leaf); -+ } else if (path->p_ext) { -+ ext_debug(tree, " %d:%d:%d", -+ path->p_ext->ee_block, -+ path->p_ext->ee_len, -+ path->p_ext->ee_start); -+ } else -+ ext_debug(tree, " []"); -+ } -+ ext_debug(tree, "\n"); -+#endif -+} -+ -+static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+#ifdef EXT_DEBUG -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent_header *eh; -+ struct ext3_extent *ex; -+ int i; -+ -+ if (!path) -+ return; -+ -+ eh = path[depth].p_hdr; -+ ex = EXT_FIRST_EXTENT(eh); -+ -+ for (i = 0; i < eh->eh_entries; i++, ex++) { -+ ext_debug(tree, "%d:%d:%d ", -+ ex->ee_block, ex->ee_len, ex->ee_start); -+ } -+ ext_debug(tree, "\n"); -+#endif -+} -+ -+static void ext3_ext_drop_refs(struct ext3_ext_path *path) -+{ -+ int depth = path->p_depth; -+ int i; -+ -+ for (i = 0; i <= depth; i++, path++) { -+ if (path->p_bh) { -+ brelse(path->p_bh); -+ path->p_bh = NULL; -+ } -+ } -+} -+ -+/* -+ * binary search for closest index by given block -+ */ -+static inline void -+ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, int block) -+{ -+ struct ext3_extent_header *eh = path->p_hdr; -+ struct ext3_extent_idx *ix; -+ int l = 0, k, r; -+ -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ EXT_ASSERT(eh->eh_entries > 0); -+ -+ ext_debug(tree, "binsearch for %d(idx): ", block); -+ -+ path->p_idx = ix = EXT_FIRST_INDEX(eh); -+ -+ r = k = eh->eh_entries; -+ while (k > 1) { -+ k = (r - l) / 2; -+ if (block < ix[l + 
k].ei_block) -+ r -= k; -+ else -+ l += k; -+ ext_debug(tree, "%d:%d:%d ", k, l, r); -+ } -+ -+ ix += l; -+ path->p_idx = ix; -+ ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf); -+ -+ while (l++ < r) { -+ if (block < ix->ei_block) -+ break; -+ path->p_idx = ix++; -+ } -+ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, -+ path->p_idx->ei_leaf); -+ -+#ifdef CHECK_BINSEARCH -+ { -+ struct ext3_extent_idx *chix; -+ -+ chix = ix = EXT_FIRST_INDEX(eh); -+ for (k = 0; k < eh->eh_entries; k++, ix++) { -+ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { -+ printk("k=%d, ix=0x%p, first=0x%p\n", k, -+ ix, EXT_FIRST_INDEX(eh)); -+ printk("%u <= %u\n", -+ ix->ei_block,ix[-1].ei_block); -+ } -+ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); -+ if (block < ix->ei_block) -+ break; -+ chix = ix; -+ } -+ EXT_ASSERT(chix == path->p_idx); -+ } -+#endif -+} -+ -+/* -+ * binary search for closest extent by given block -+ */ -+static inline void -+ext3_ext_binsearch(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, int block) -+{ -+ struct ext3_extent_header *eh = path->p_hdr; -+ struct ext3_extent *ex; -+ int l = 0, k, r; -+ -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ -+ if (eh->eh_entries == 0) { -+ /* -+ * this leaf is empty yet: -+ * we get such a leaf in split/add case -+ */ -+ return; -+ } -+ -+ ext_debug(tree, "binsearch for %d: ", block); -+ -+ path->p_ext = ex = EXT_FIRST_EXTENT(eh); -+ -+ r = k = eh->eh_entries; -+ while (k > 1) { -+ k = (r - l) / 2; -+ if (block < ex[l + k].ee_block) -+ r -= k; -+ else -+ l += k; -+ ext_debug(tree, "%d:%d:%d ", k, l, r); -+ } -+ -+ ex += l; -+ path->p_ext = ex; -+ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, -+ path->p_ext->ee_start, path->p_ext->ee_len); -+ -+ while (l++ < r) { -+ if (block < ex->ee_block) -+ break; -+ path->p_ext = ex++; -+ } -+ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, -+ path->p_ext->ee_start, 
path->p_ext->ee_len); -+ -+#ifdef CHECK_BINSEARCH -+ { -+ struct ext3_extent *chex; -+ -+ chex = ex = EXT_FIRST_EXTENT(eh); -+ for (k = 0; k < eh->eh_entries; k++, ex++) { -+ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); -+ if (block < ex->ee_block) -+ break; -+ chex = ex; -+ } -+ EXT_ASSERT(chex == path->p_ext); -+ } -+#endif -+} -+ -+int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) -+{ -+ struct ext3_extent_header *eh; -+ -+ BUG_ON(tree->buffer_len == 0); -+ ext3_ext_get_access_for_root(handle, tree); -+ eh = EXT_ROOT_HDR(tree); -+ eh->eh_depth = 0; -+ eh->eh_entries = 0; -+ eh->eh_magic = EXT3_EXT_MAGIC; -+ eh->eh_max = ext3_ext_space_root(tree); -+ ext3_ext_mark_root_dirty(handle, tree); -+ ext3_ext_invalidate_cache(tree); -+ return 0; -+} -+ -+struct ext3_ext_path * -+ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, -+ struct ext3_ext_path *path) -+{ -+ struct ext3_extent_header *eh; -+ struct buffer_head *bh; -+ int depth, i, ppos = 0; -+ -+ EXT_ASSERT(tree); -+ EXT_ASSERT(tree->inode); -+ EXT_ASSERT(tree->root); -+ -+ eh = EXT_ROOT_HDR(tree); -+ EXT_ASSERT(eh); -+ if (ext3_ext_check_header(eh)) { -+ /* don't free previously allocated path -+ * -- caller should take care */ -+ path = NULL; -+ goto err; -+ } -+ -+ i = depth = EXT_DEPTH(tree); -+ EXT_ASSERT(eh->eh_max); -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ -+ /* account possible depth increase */ -+ if (!path) { -+ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), -+ GFP_NOFS); -+ if (!path) -+ return ERR_PTR(-ENOMEM); -+ } -+ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); -+ path[0].p_hdr = eh; -+ -+ /* walk through the tree */ -+ while (i) { -+ ext_debug(tree, "depth %d: num %d, max %d\n", -+ ppos, eh->eh_entries, eh->eh_max); -+ ext3_ext_binsearch_idx(tree, path + ppos, block); -+ path[ppos].p_block = path[ppos].p_idx->ei_leaf; -+ path[ppos].p_depth = i; -+ path[ppos].p_ext = NULL; -+ -+ bh = sb_bread(tree->inode->i_sb, 
path[ppos].p_block); -+ if (!bh) -+ goto err; -+ -+ eh = EXT_BLOCK_HDR(bh); -+ ppos++; -+ EXT_ASSERT(ppos <= depth); -+ path[ppos].p_bh = bh; -+ path[ppos].p_hdr = eh; -+ i--; -+ -+ if (ext3_ext_check_header(eh)) -+ goto err; -+ } -+ -+ path[ppos].p_depth = i; -+ path[ppos].p_hdr = eh; -+ path[ppos].p_ext = NULL; -+ path[ppos].p_idx = NULL; -+ -+ if (ext3_ext_check_header(eh)) -+ goto err; -+ -+ /* find extent */ -+ ext3_ext_binsearch(tree, path + ppos, block); -+ -+ ext3_ext_show_path(tree, path); -+ -+ return path; -+ -+err: -+ printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ return ERR_PTR(-EIO); -+} -+ -+/* -+ * insert new index [logical;ptr] into the block at cupr -+ * it check where to insert: before curp or after curp -+ */ -+static int ext3_ext_insert_index(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *curp, -+ int logical, int ptr) -+{ -+ struct ext3_extent_idx *ix; -+ int len, err; -+ -+ if ((err = ext3_ext_get_access(handle, tree, curp))) -+ return err; -+ -+ EXT_ASSERT(logical != curp->p_idx->ei_block); -+ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; -+ if (logical > curp->p_idx->ei_block) { -+ /* insert after */ -+ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { -+ len = (len - 1) * sizeof(struct ext3_extent_idx); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert new index %d after: %d. " -+ "move %d from 0x%p to 0x%p\n", -+ logical, ptr, len, -+ (curp->p_idx + 1), (curp->p_idx + 2)); -+ memmove(curp->p_idx + 2, curp->p_idx + 1, len); -+ } -+ ix = curp->p_idx + 1; -+ } else { -+ /* insert before */ -+ len = len * sizeof(struct ext3_extent_idx); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert new index %d before: %d. 
" -+ "move %d from 0x%p to 0x%p\n", -+ logical, ptr, len, -+ curp->p_idx, (curp->p_idx + 1)); -+ memmove(curp->p_idx + 1, curp->p_idx, len); -+ ix = curp->p_idx; -+ } -+ -+ ix->ei_block = logical; -+ ix->ei_leaf = ptr; -+ ix->ei_leaf_hi = ix->ei_unused = 0; -+ curp->p_hdr->eh_entries++; -+ -+ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); -+ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); -+ -+ err = ext3_ext_dirty(handle, tree, curp); -+ ext3_std_error(tree->inode->i_sb, err); -+ -+ return err; -+} -+ -+/* -+ * routine inserts new subtree into the path, using free index entry -+ * at depth 'at: -+ * - allocates all needed blocks (new leaf and all intermediate index blocks) -+ * - makes decision where to split -+ * - moves remaining extens and index entries (right to the split point) -+ * into the newly allocated blocks -+ * - initialize subtree -+ */ -+static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext, int at) -+{ -+ struct buffer_head *bh = NULL; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent_header *neh; -+ struct ext3_extent_idx *fidx; -+ struct ext3_extent *ex; -+ int i = at, k, m, a; -+ unsigned long newblock, oldblock, border; -+ int *ablocks = NULL; /* array of allocated blocks */ -+ int err = 0; -+ -+ /* make decision: where to split? */ -+ /* FIXME: now desicion is simplest: at current extent */ -+ -+ /* if current leaf will be splitted, then we should use -+ * border from split point */ -+ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); -+ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { -+ border = path[depth].p_ext[1].ee_block; -+ ext_debug(tree, "leaf will be splitted." -+ " next leaf starts at %d\n", -+ (int)border); -+ } else { -+ border = newext->ee_block; -+ ext_debug(tree, "leaf will be added." 
-+ " next leaf starts at %d\n", -+ (int)border); -+ } -+ -+ /* -+ * if error occurs, then we break processing -+ * and turn filesystem read-only. so, index won't -+ * be inserted and tree will be in consistent -+ * state. next mount will repair buffers too -+ */ -+ -+ /* -+ * get array to track all allocated blocks -+ * we need this to handle errors and free blocks -+ * upon them -+ */ -+ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); -+ if (!ablocks) -+ return -ENOMEM; -+ memset(ablocks, 0, sizeof(unsigned long) * depth); -+ -+ /* allocate all needed blocks */ -+ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); -+ for (a = 0; a < depth - at; a++) { -+ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); -+ if (newblock == 0) -+ goto cleanup; -+ ablocks[a] = newblock; -+ } -+ -+ /* initialize new leaf */ -+ newblock = ablocks[--a]; -+ EXT_ASSERT(newblock); -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) -+ goto cleanup; -+ -+ neh = EXT_BLOCK_HDR(bh); -+ neh->eh_entries = 0; -+ neh->eh_max = ext3_ext_space_block(tree); -+ neh->eh_magic = EXT3_EXT_MAGIC; -+ neh->eh_depth = 0; -+ ex = EXT_FIRST_EXTENT(neh); -+ -+ /* move remain of path[depth] to the new leaf */ -+ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); -+ /* start copy from next extent */ -+ /* TODO: we could do it by single memmove */ -+ m = 0; -+ path[depth].p_ext++; -+ while (path[depth].p_ext <= -+ EXT_MAX_EXTENT(path[depth].p_hdr)) { -+ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", -+ path[depth].p_ext->ee_block, -+ path[depth].p_ext->ee_start, -+ path[depth].p_ext->ee_len, -+ newblock); -+ memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent)); -+ neh->eh_entries++; -+ m++; -+ } -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ 
goto cleanup; -+ brelse(bh); -+ bh = NULL; -+ -+ /* correct old leaf */ -+ if (m) { -+ if ((err = ext3_ext_get_access(handle, tree, path + depth))) -+ goto cleanup; -+ path[depth].p_hdr->eh_entries -= m; -+ if ((err = ext3_ext_dirty(handle, tree, path + depth))) -+ goto cleanup; -+ -+ } -+ -+ /* create intermediate indexes */ -+ k = depth - at - 1; -+ EXT_ASSERT(k >= 0); -+ if (k) -+ ext_debug(tree, "create %d intermediate indices\n", k); -+ /* insert new index into current index block */ -+ /* current depth stored in i var */ -+ i = depth - 1; -+ while (k--) { -+ oldblock = newblock; -+ newblock = ablocks[--a]; -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) -+ goto cleanup; -+ -+ neh = EXT_BLOCK_HDR(bh); -+ neh->eh_entries = 1; -+ neh->eh_magic = EXT3_EXT_MAGIC; -+ neh->eh_max = ext3_ext_space_block_idx(tree); -+ neh->eh_depth = depth - i; -+ fidx = EXT_FIRST_INDEX(neh); -+ fidx->ei_block = border; -+ fidx->ei_leaf = oldblock; -+ fidx->ei_leaf_hi = fidx->ei_unused = 0; -+ -+ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", -+ i, newblock, border, oldblock); -+ /* copy indexes */ -+ m = 0; -+ path[i].p_idx++; -+ -+ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx, -+ EXT_MAX_INDEX(path[i].p_hdr)); -+ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == -+ EXT_LAST_INDEX(path[i].p_hdr)); -+ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { -+ ext_debug(tree, "%d: move %d:%d in new index %lu\n", -+ i, path[i].p_idx->ei_block, -+ path[i].p_idx->ei_leaf, newblock); -+ memmove(++fidx, path[i].p_idx++, -+ sizeof(struct ext3_extent_idx)); -+ neh->eh_entries++; -+ EXT_ASSERT(neh->eh_entries <= neh->eh_max); -+ m++; -+ } -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ goto cleanup; -+ brelse(bh); -+ bh = NULL; -+ -+ /* correct old index */ -+ if (m) { -+ err = 
ext3_ext_get_access(handle, tree, path + i); -+ if (err) -+ goto cleanup; -+ path[i].p_hdr->eh_entries -= m; -+ err = ext3_ext_dirty(handle, tree, path + i); -+ if (err) -+ goto cleanup; -+ } -+ -+ i--; -+ } -+ -+ /* insert new index */ -+ if (!err) -+ err = ext3_ext_insert_index(handle, tree, path + at, -+ border, newblock); -+ -+cleanup: -+ if (bh) { -+ if (buffer_locked(bh)) -+ unlock_buffer(bh); -+ brelse(bh); -+ } -+ -+ if (err) { -+ /* free all allocated blocks in error case */ -+ for (i = 0; i < depth; i++) { -+ if (!ablocks[i]) -+ continue; -+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); -+ } -+ } -+ kfree(ablocks); -+ -+ return err; -+} -+ -+/* -+ * routine implements tree growing procedure: -+ * - allocates new block -+ * - moves top-level data (index block or leaf) into the new block -+ * - initialize new top-level, creating index that points to the -+ * just created block -+ */ -+static int ext3_ext_grow_indepth(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) -+{ -+ struct ext3_ext_path *curp = path; -+ struct ext3_extent_header *neh; -+ struct ext3_extent_idx *fidx; -+ struct buffer_head *bh; -+ unsigned long newblock; -+ int err = 0; -+ -+ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); -+ if (newblock == 0) -+ return err; -+ -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ ext3_std_error(tree->inode->i_sb, err); -+ return err; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) { -+ unlock_buffer(bh); -+ goto out; -+ } -+ -+ /* move top-level index/leaf into new block */ -+ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); -+ -+ /* set size of new block */ -+ neh = EXT_BLOCK_HDR(bh); -+ /* old root could have indexes or leaves -+ * so calculate eh_max right way */ -+ if (EXT_DEPTH(tree)) -+ neh->eh_max = ext3_ext_space_block_idx(tree); -+ else -+ neh->eh_max = ext3_ext_space_block(tree); -+ 
neh->eh_magic = EXT3_EXT_MAGIC; -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ goto out; -+ -+ /* create index in new top-level index: num,max,pointer */ -+ if ((err = ext3_ext_get_access(handle, tree, curp))) -+ goto out; -+ -+ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; -+ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); -+ curp->p_hdr->eh_entries = 1; -+ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); -+ /* FIXME: it works, but actually path[0] can be index */ -+ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; -+ curp->p_idx->ei_leaf = newblock; -+ curp->p_idx->ei_leaf_hi = curp->p_idx->ei_unused = 0; -+ -+ neh = EXT_ROOT_HDR(tree); -+ fidx = EXT_FIRST_INDEX(neh); -+ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", -+ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); -+ -+ neh->eh_depth = path->p_depth + 1; -+ err = ext3_ext_dirty(handle, tree, curp); -+out: -+ brelse(bh); -+ -+ return err; -+} -+ -+/* -+ * routine finds empty index and adds new leaf. 
if no free index found -+ * then it requests in-depth growing -+ */ -+static int ext3_ext_create_new_leaf(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) -+{ -+ struct ext3_ext_path *curp; -+ int depth, i, err = 0; -+ -+repeat: -+ i = depth = EXT_DEPTH(tree); -+ -+ /* walk up to the tree and look for free index entry */ -+ curp = path + depth; -+ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { -+ i--; -+ curp--; -+ } -+ -+ /* we use already allocated block for index block -+ * so, subsequent data blocks should be contigoues */ -+ if (EXT_HAS_FREE_INDEX(curp)) { -+ /* if we found index with free entry, then use that -+ * entry: create all needed subtree and add new leaf */ -+ err = ext3_ext_split(handle, tree, path, newext, i); -+ -+ /* refill path */ -+ ext3_ext_drop_refs(path); -+ path = ext3_ext_find_extent(tree, newext->ee_block, path); -+ if (IS_ERR(path)) -+ err = PTR_ERR(path); -+ } else { -+ /* tree is full, time to grow in depth */ -+ err = ext3_ext_grow_indepth(handle, tree, path, newext); -+ -+ /* refill path */ -+ ext3_ext_drop_refs(path); -+ path = ext3_ext_find_extent(tree, newext->ee_block, path); -+ if (IS_ERR(path)) -+ err = PTR_ERR(path); -+ -+ /* -+ * only first (depth 0 -> 1) produces free space -+ * in all other cases we have to split growed tree -+ */ -+ depth = EXT_DEPTH(tree); -+ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { -+ /* now we need split */ -+ goto repeat; -+ } -+ } -+ -+ if (err) -+ return err; -+ -+ return 0; -+} -+ -+/* -+ * returns allocated block in subsequent extent or EXT_MAX_BLOCK -+ * NOTE: it consider block number from index entry as -+ * allocated block. 
thus, index entries have to be consistent -+ * with leafs -+ */ -+static unsigned long -+ext3_ext_next_allocated_block(struct ext3_ext_path *path) -+{ -+ int depth; -+ -+ EXT_ASSERT(path != NULL); -+ depth = path->p_depth; -+ -+ if (depth == 0 && path->p_ext == NULL) -+ return EXT_MAX_BLOCK; -+ -+ /* FIXME: what if index isn't full ?! */ -+ while (depth >= 0) { -+ if (depth == path->p_depth) { -+ /* leaf */ -+ if (path[depth].p_ext != -+ EXT_LAST_EXTENT(path[depth].p_hdr)) -+ return path[depth].p_ext[1].ee_block; -+ } else { -+ /* index */ -+ if (path[depth].p_idx != -+ EXT_LAST_INDEX(path[depth].p_hdr)) -+ return path[depth].p_idx[1].ei_block; -+ } -+ depth--; -+ } -+ -+ return EXT_MAX_BLOCK; -+} -+ -+/* -+ * returns first allocated block from next leaf or EXT_MAX_BLOCK -+ */ -+static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int depth; -+ -+ EXT_ASSERT(path != NULL); -+ depth = path->p_depth; -+ -+ /* zero-tree has no leaf blocks at all */ -+ if (depth == 0) -+ return EXT_MAX_BLOCK; -+ -+ /* go to index block */ -+ depth--; -+ -+ while (depth >= 0) { -+ if (path[depth].p_idx != -+ EXT_LAST_INDEX(path[depth].p_hdr)) -+ return path[depth].p_idx[1].ei_block; -+ depth--; -+ } -+ -+ return EXT_MAX_BLOCK; -+} -+ -+/* -+ * if leaf gets modified and modified extent is first in the leaf -+ * then we have to correct all indexes above -+ * TODO: do we need to correct tree in all cases? 
-+ */ -+int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ struct ext3_extent_header *eh; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent *ex; -+ unsigned long border; -+ int k, err = 0; -+ -+ eh = path[depth].p_hdr; -+ ex = path[depth].p_ext; -+ EXT_ASSERT(ex); -+ EXT_ASSERT(eh); -+ -+ if (depth == 0) { -+ /* there is no tree at all */ -+ return 0; -+ } -+ -+ if (ex != EXT_FIRST_EXTENT(eh)) { -+ /* we correct tree if first leaf got modified only */ -+ return 0; -+ } -+ -+ /* -+ * TODO: we need correction if border is smaller then current one -+ */ -+ k = depth - 1; -+ border = path[depth].p_ext->ee_block; -+ if ((err = ext3_ext_get_access(handle, tree, path + k))) -+ return err; -+ path[k].p_idx->ei_block = border; -+ if ((err = ext3_ext_dirty(handle, tree, path + k))) -+ return err; -+ -+ while (k--) { -+ /* change all left-side indexes */ -+ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) -+ break; -+ if ((err = ext3_ext_get_access(handle, tree, path + k))) -+ break; -+ path[k].p_idx->ei_block = border; -+ if ((err = ext3_ext_dirty(handle, tree, path + k))) -+ break; -+ } -+ -+ return err; -+} -+ -+static int inline -+ext3_can_extents_be_merged(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex1, -+ struct ext3_extent *ex2) -+{ -+ if (ex1->ee_block + ex1->ee_len != ex2->ee_block) -+ return 0; -+ -+#ifdef AGRESSIVE_TEST -+ if (ex1->ee_len >= 4) -+ return 0; -+#endif -+ -+ if (!tree->ops->mergable) -+ return 1; -+ -+ return tree->ops->mergable(ex1, ex2); -+} -+ -+/* -+ * this routine tries to merge requsted extent into the existing -+ * extent or inserts requested extent as new one into the tree, -+ * creating new leaf in no-space case -+ */ -+int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) -+{ -+ struct ext3_extent_header * eh; -+ struct ext3_extent *ex, *fex; -+ struct ext3_extent 
*nearex; /* nearest extent */ -+ struct ext3_ext_path *npath = NULL; -+ int depth, len, err, next; -+ -+ EXT_ASSERT(newext->ee_len > 0); -+ depth = EXT_DEPTH(tree); -+ ex = path[depth].p_ext; -+ EXT_ASSERT(path[depth].p_hdr); -+ -+ /* try to insert block into found extent and return */ -+ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { -+ ext_debug(tree, "append %d block to %d:%d (from %d)\n", -+ newext->ee_len, ex->ee_block, ex->ee_len, -+ ex->ee_start); -+ if ((err = ext3_ext_get_access(handle, tree, path + depth))) -+ return err; -+ ex->ee_len += newext->ee_len; -+ eh = path[depth].p_hdr; -+ nearex = ex; -+ goto merge; -+ } -+ -+repeat: -+ depth = EXT_DEPTH(tree); -+ eh = path[depth].p_hdr; -+ if (eh->eh_entries < eh->eh_max) -+ goto has_space; -+ -+ /* probably next leaf has space for us? */ -+ fex = EXT_LAST_EXTENT(eh); -+ next = ext3_ext_next_leaf_block(tree, path); -+ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { -+ ext_debug(tree, "next leaf block - %d\n", next); -+ EXT_ASSERT(!npath); -+ npath = ext3_ext_find_extent(tree, next, NULL); -+ if (IS_ERR(npath)) -+ return PTR_ERR(npath); -+ EXT_ASSERT(npath->p_depth == path->p_depth); -+ eh = npath[depth].p_hdr; -+ if (eh->eh_entries < eh->eh_max) { -+ ext_debug(tree, "next leaf isnt full(%d)\n", -+ eh->eh_entries); -+ path = npath; -+ goto repeat; -+ } -+ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", -+ eh->eh_entries, eh->eh_max); -+ } -+ -+ /* -+ * there is no free space in found leaf -+ * we're gonna add new leaf in the tree -+ */ -+ err = ext3_ext_create_new_leaf(handle, tree, path, newext); -+ if (err) -+ goto cleanup; -+ depth = EXT_DEPTH(tree); -+ eh = path[depth].p_hdr; -+ -+has_space: -+ nearex = path[depth].p_ext; -+ -+ if ((err = ext3_ext_get_access(handle, tree, path + depth))) -+ goto cleanup; -+ -+ if (!nearex) { -+ /* there is no extent in this leaf, create first one */ -+ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", -+ newext->ee_block, 
newext->ee_start, -+ newext->ee_len); -+ path[depth].p_ext = EXT_FIRST_EXTENT(eh); -+ } else if (newext->ee_block > nearex->ee_block) { -+ EXT_ASSERT(newext->ee_block != nearex->ee_block); -+ if (nearex != EXT_LAST_EXTENT(eh)) { -+ len = EXT_MAX_EXTENT(eh) - nearex; -+ len = (len - 1) * sizeof(struct ext3_extent); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " -+ "move %d from 0x%p to 0x%p\n", -+ newext->ee_block, newext->ee_start, -+ newext->ee_len, -+ nearex, len, nearex + 1, nearex + 2); -+ memmove(nearex + 2, nearex + 1, len); -+ } -+ path[depth].p_ext = nearex + 1; -+ } else { -+ EXT_ASSERT(newext->ee_block != nearex->ee_block); -+ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " -+ "move %d from 0x%p to 0x%p\n", -+ newext->ee_block, newext->ee_start, newext->ee_len, -+ nearex, len, nearex + 1, nearex + 2); -+ memmove(nearex + 1, nearex, len); -+ path[depth].p_ext = nearex; -+ } -+ -+ eh->eh_entries++; -+ nearex = path[depth].p_ext; -+ nearex->ee_block = newext->ee_block; -+ nearex->ee_start = newext->ee_start; -+ nearex->ee_len = newext->ee_len; -+ /* FIXME: support for large fs */ -+ nearex->ee_start_hi = 0; -+ -+merge: -+ /* try to merge extents to the right */ -+ while (nearex < EXT_LAST_EXTENT(eh)) { -+ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) -+ break; -+ /* merge with next extent! 
*/ -+ nearex->ee_len += nearex[1].ee_len; -+ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { -+ len = (EXT_LAST_EXTENT(eh) - nearex - 1) * -+ sizeof(struct ext3_extent); -+ memmove(nearex + 1, nearex + 2, len); -+ } -+ eh->eh_entries--; -+ EXT_ASSERT(eh->eh_entries > 0); -+ } -+ -+ /* try to merge extents to the left */ -+ -+ /* time to correct all indexes above */ -+ err = ext3_ext_correct_indexes(handle, tree, path); -+ if (err) -+ goto cleanup; -+ -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ -+cleanup: -+ if (npath) { -+ ext3_ext_drop_refs(npath); -+ kfree(npath); -+ } -+ ext3_ext_tree_changed(tree); -+ ext3_ext_invalidate_cache(tree); -+ return err; -+} -+ -+int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, -+ unsigned long num, ext_prepare_callback func) -+{ -+ struct ext3_ext_path *path = NULL; -+ struct ext3_ext_cache cbex; -+ struct ext3_extent *ex; -+ unsigned long next, start = 0, end = 0; -+ unsigned long last = block + num; -+ int depth, exists, err = 0; -+ -+ EXT_ASSERT(tree); -+ EXT_ASSERT(func); -+ EXT_ASSERT(tree->inode); -+ EXT_ASSERT(tree->root); -+ -+ while (block < last && block != EXT_MAX_BLOCK) { -+ num = last - block; -+ /* find extent for this block */ -+ path = ext3_ext_find_extent(tree, block, path); -+ if (IS_ERR(path)) { -+ err = PTR_ERR(path); -+ path = NULL; -+ break; -+ } -+ -+ depth = EXT_DEPTH(tree); -+ EXT_ASSERT(path[depth].p_hdr); -+ ex = path[depth].p_ext; -+ next = ext3_ext_next_allocated_block(path); -+ -+ exists = 0; -+ if (!ex) { -+ /* there is no extent yet, so try to allocate -+ * all requested space */ -+ start = block; -+ end = block + num; -+ } else if (ex->ee_block > block) { -+ /* need to allocate space before found extent */ -+ start = block; -+ end = ex->ee_block; -+ if (block + num < end) -+ end = block + num; -+ } else if (block >= ex->ee_block + ex->ee_len) { -+ /* need to allocate space after found extent */ -+ start = block; -+ end = block + num; -+ if (end >= next) -+ end = next; 
-+ } else if (block >= ex->ee_block) { -+ /* -+ * some part of requested space is covered -+ * by found extent -+ */ -+ start = block; -+ end = ex->ee_block + ex->ee_len; -+ if (block + num < end) -+ end = block + num; -+ exists = 1; -+ } else { -+ BUG(); -+ } -+ EXT_ASSERT(end > start); -+ -+ if (!exists) { -+ cbex.ec_block = start; -+ cbex.ec_len = end - start; -+ cbex.ec_start = 0; -+ cbex.ec_type = EXT3_EXT_CACHE_GAP; -+ } else { -+ cbex.ec_block = ex->ee_block; -+ cbex.ec_len = ex->ee_len; -+ cbex.ec_start = ex->ee_start; -+ cbex.ec_type = EXT3_EXT_CACHE_EXTENT; -+ } -+ -+ EXT_ASSERT(cbex.ec_len > 0); -+ EXT_ASSERT(path[depth].p_hdr); -+ err = func(tree, path, &cbex); -+ ext3_ext_drop_refs(path); -+ -+ if (err < 0) -+ break; -+ if (err == EXT_REPEAT) -+ continue; -+ else if (err == EXT_BREAK) { -+ err = 0; -+ break; -+ } -+ -+ if (EXT_DEPTH(tree) != depth) { -+ /* depth was changed. we have to realloc path */ -+ kfree(path); -+ path = NULL; -+ } -+ -+ block = cbex.ec_block + cbex.ec_len; -+ } -+ -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ -+ return err; -+} -+ -+static inline void -+ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, -+ __u32 len, __u32 start, int type) -+{ -+ EXT_ASSERT(len > 0); -+ if (tree->cex) { -+ tree->cex->ec_type = type; -+ tree->cex->ec_block = block; -+ tree->cex->ec_len = len; -+ tree->cex->ec_start = start; -+ } -+} -+ -+/* -+ * this routine calculate boundaries of the gap requested block fits into -+ * and cache this gap -+ */ -+static inline void -+ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ unsigned long block) -+{ -+ int depth = EXT_DEPTH(tree); -+ unsigned long lblock, len; -+ struct ext3_extent *ex; -+ -+ if (!tree->cex) -+ return; -+ -+ ex = path[depth].p_ext; -+ if (ex == NULL) { -+ /* there is no extent yet, so gap is [0;-] */ -+ lblock = 0; -+ len = EXT_MAX_BLOCK; -+ ext_debug(tree, "cache gap(whole file):"); -+ } else if (block < 
ex->ee_block) { -+ lblock = block; -+ len = ex->ee_block - block; -+ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", -+ (unsigned long) block, -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len); -+ } else if (block >= ex->ee_block + ex->ee_len) { -+ lblock = ex->ee_block + ex->ee_len; -+ len = ext3_ext_next_allocated_block(path); -+ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len, -+ (unsigned long) block); -+ EXT_ASSERT(len > lblock); -+ len = len - lblock; -+ } else { -+ lblock = len = 0; -+ BUG(); -+ } -+ -+ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len); -+ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP); -+} -+ -+static inline int -+ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, -+ struct ext3_extent *ex) -+{ -+ struct ext3_ext_cache *cex = tree->cex; -+ -+ /* is there cache storage at all? */ -+ if (!cex) -+ return EXT3_EXT_CACHE_NO; -+ -+ /* has cache valid data? */ -+ if (cex->ec_type == EXT3_EXT_CACHE_NO) -+ return EXT3_EXT_CACHE_NO; -+ -+ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || -+ cex->ec_type == EXT3_EXT_CACHE_EXTENT); -+ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { -+ ex->ee_block = cex->ec_block; -+ ex->ee_start = cex->ec_start; -+ ex->ee_start_hi = 0; -+ ex->ee_len = cex->ec_len; -+ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", -+ (unsigned long) block, -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len, -+ (unsigned long) ex->ee_start); -+ return cex->ec_type; -+ } -+ -+ /* not in cache */ -+ return EXT3_EXT_CACHE_NO; -+} -+ -+/* -+ * routine removes index from the index block -+ * it's used in truncate case only. 
thus all requests are for -+ * last index in the block only -+ */ -+int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ struct buffer_head *bh; -+ int err; -+ -+ /* free index block */ -+ path--; -+ EXT_ASSERT(path->p_hdr->eh_entries); -+ if ((err = ext3_ext_get_access(handle, tree, path))) -+ return err; -+ path->p_hdr->eh_entries--; -+ if ((err = ext3_ext_dirty(handle, tree, path))) -+ return err; -+ ext_debug(tree, "index is empty, remove it, free block %d\n", -+ path->p_idx->ei_leaf); -+ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); -+ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); -+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); -+ return err; -+} -+ -+int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int depth = EXT_DEPTH(tree); -+ int needed; -+ -+ if (path) { -+ /* probably there is space in leaf? */ -+ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) -+ return 1; -+ } -+ -+ /* -+ * the worste case we're expecting is creation of the -+ * new root (growing in depth) with index splitting -+ * for splitting we have to consider depth + 1 because -+ * previous growing could increase it -+ */ -+ depth = depth + 1; -+ -+ /* -+ * growing in depth: -+ * block allocation + new root + old root -+ */ -+ needed = EXT3_ALLOC_NEEDED + 2; -+ -+ /* index split. 
we may need: -+ * allocate intermediate indexes and new leaf -+ * change two blocks at each level, but root -+ * modify root block (inode) -+ */ -+ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; -+ -+ return needed; -+} -+ -+static int -+ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, unsigned long start, -+ unsigned long end) -+{ -+ struct ext3_extent *ex, tex; -+ struct ext3_ext_path *npath; -+ int depth, creds, err; -+ -+ depth = EXT_DEPTH(tree); -+ ex = path[depth].p_ext; -+ EXT_ASSERT(ex); -+ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); -+ EXT_ASSERT(ex->ee_block < start); -+ -+ /* calculate tail extent */ -+ tex.ee_block = end + 1; -+ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); -+ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; -+ -+ creds = ext3_ext_calc_credits_for_insert(tree, path); -+ handle = ext3_ext_journal_restart(handle, creds); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ /* calculate head extent. use primary extent */ -+ err = ext3_ext_get_access(handle, tree, path + depth); -+ if (err) -+ return err; -+ ex->ee_len = start - ex->ee_block; -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ if (err) -+ return err; -+ -+ /* FIXME: some callback to free underlying resource -+ * and correct ee_start? 
*/ -+ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", -+ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); -+ -+ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); -+ if (IS_ERR(npath)) -+ return PTR_ERR(npath); -+ depth = EXT_DEPTH(tree); -+ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); -+ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); -+ -+ err = ext3_ext_insert_extent(handle, tree, npath, &tex); -+ ext3_ext_drop_refs(npath); -+ kfree(npath); -+ -+ return err; -+} -+ -+static int -+ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, unsigned long start, -+ unsigned long end) -+{ -+ struct ext3_extent *ex, *fu = NULL, *lu, *le; -+ int err = 0, correct_index = 0; -+ int depth = EXT_DEPTH(tree), credits; -+ struct ext3_extent_header *eh; -+ unsigned a, b, block, num; -+ -+ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); -+ if (!path[depth].p_hdr) -+ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); -+ eh = path[depth].p_hdr; -+ EXT_ASSERT(eh); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ -+ /* find where to start removing */ -+ le = ex = EXT_LAST_EXTENT(eh); -+ while (ex != EXT_FIRST_EXTENT(eh)) { -+ if (ex->ee_block <= end) -+ break; -+ ex--; -+ } -+ -+ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { -+ /* removal of internal part of the extent requested -+ * tail and head must be placed in different extent -+ * so, we have to insert one more extent */ -+ path[depth].p_ext = ex; -+ return ext3_ext_split_for_rm(handle, tree, path, start, end); -+ } -+ -+ lu = ex; -+ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) { -+ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); -+ path[depth].p_ext = ex; -+ -+ a = ex->ee_block > start ? ex->ee_block : start; -+ b = ex->ee_block + ex->ee_len - 1 < end ? 
-+ ex->ee_block + ex->ee_len - 1 : end; -+ -+ ext_debug(tree, " border %u:%u\n", a, b); -+ -+ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { -+ block = 0; -+ num = 0; -+ BUG(); -+ } else if (a != ex->ee_block) { -+ /* remove tail of the extent */ -+ block = ex->ee_block; -+ num = a - block; -+ } else if (b != ex->ee_block + ex->ee_len - 1) { -+ /* remove head of the extent */ -+ block = a; -+ num = b - a; -+ } else { -+ /* remove whole extent: excelent! */ -+ block = ex->ee_block; -+ num = 0; -+ EXT_ASSERT(a == ex->ee_block && -+ b == ex->ee_block + ex->ee_len - 1); -+ } -+ -+ if (ex == EXT_FIRST_EXTENT(eh)) -+ correct_index = 1; -+ -+ credits = 1; -+ if (correct_index) -+ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; -+ if (tree->ops->remove_extent_credits) -+ credits+=tree->ops->remove_extent_credits(tree,ex,a,b); -+ -+ handle = ext3_ext_journal_restart(handle, credits); -+ if (IS_ERR(handle)) { -+ err = PTR_ERR(handle); -+ goto out; -+ } -+ -+ err = ext3_ext_get_access(handle, tree, path + depth); -+ if (err) -+ goto out; -+ -+ if (tree->ops->remove_extent) -+ err = tree->ops->remove_extent(tree, ex, a, b); -+ if (err) -+ goto out; -+ -+ if (num == 0) { -+ /* this extent is removed entirely mark slot unused */ -+ ex->ee_start = ex->ee_start_hi = 0; -+ eh->eh_entries--; -+ fu = ex; -+ } -+ -+ ex->ee_block = block; -+ ex->ee_len = num; -+ -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ if (err) -+ goto out; -+ -+ ext_debug(tree, "new extent: %u:%u:%u\n", -+ ex->ee_block, ex->ee_len, ex->ee_start); -+ ex--; -+ } -+ -+ if (fu) { -+ /* reuse unused slots */ -+ while (lu < le) { -+ if (lu->ee_start) { -+ *fu = *lu; -+ lu->ee_start = lu->ee_start_hi = 0; -+ fu++; -+ } -+ lu++; -+ } -+ } -+ -+ if (correct_index && eh->eh_entries) -+ err = ext3_ext_correct_indexes(handle, tree, path); -+ -+ /* if this leaf is free, then we should -+ * remove it from index block above */ -+ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) -+ 
err = ext3_ext_rm_idx(handle, tree, path + depth); -+ -+out: -+ return err; -+} -+ -+ -+static struct ext3_extent_idx * -+ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) -+{ -+ struct ext3_extent_idx *ix; -+ -+ ix = EXT_LAST_INDEX(hdr); -+ while (ix != EXT_FIRST_INDEX(hdr)) { -+ if (ix->ei_block <= block) -+ break; -+ ix--; -+ } -+ return ix; -+} -+ -+/* -+ * returns 1 if current index have to be freed (even partial) -+ */ -+static int inline -+ext3_ext_more_to_rm(struct ext3_ext_path *path) -+{ -+ EXT_ASSERT(path->p_idx); -+ -+ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) -+ return 0; -+ -+ /* -+ * if truncate on deeper level happened it it wasn't partial -+ * so we have to consider current index for truncation -+ */ -+ if (path->p_hdr->eh_entries == path->p_block) -+ return 0; -+ return 1; -+} -+ -+int ext3_ext_remove_space(struct ext3_extents_tree *tree, -+ unsigned long start, unsigned long end) -+{ -+ struct inode *inode = tree->inode; -+ struct super_block *sb = inode->i_sb; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_ext_path *path; -+ handle_t *handle; -+ int i = 0, err = 0; -+ -+ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); -+ -+ /* probably first extent we're gonna free will be last in block */ -+ handle = ext3_journal_start(inode, depth + 1); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ ext3_ext_invalidate_cache(tree); -+ -+ /* -+ * we start scanning from right side freeing all the blocks -+ * after i_size and walking into the deep -+ */ -+ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); -+ if (IS_ERR(path)) { -+ ext3_error(sb, __FUNCTION__, "Can't allocate path array"); -+ ext3_journal_stop(handle); -+ return -ENOMEM; -+ } -+ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); -+ path[i].p_hdr = EXT_ROOT_HDR(tree); -+ -+ while (i >= 0 && err == 0) { -+ if (i == depth) { -+ /* this is leaf block */ -+ err = ext3_ext_rm_leaf(handle, tree, path, start, end); -+ 
/* root level have p_bh == NULL, brelse() eats this */ -+ brelse(path[i].p_bh); -+ i--; -+ continue; -+ } -+ -+ /* this is index block */ -+ if (!path[i].p_hdr) { -+ ext_debug(tree, "initialize header\n"); -+ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); -+ } -+ -+ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); -+ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); -+ -+ if (!path[i].p_idx) { -+ /* this level hasn't touched yet */ -+ path[i].p_idx = -+ ext3_ext_last_covered(path[i].p_hdr, end); -+ path[i].p_block = path[i].p_hdr->eh_entries + 1; -+ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", -+ path[i].p_hdr, path[i].p_hdr->eh_entries); -+ } else { -+ /* we've already was here, see at next index */ -+ path[i].p_idx--; -+ } -+ -+ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", -+ i, EXT_FIRST_INDEX(path[i].p_hdr), -+ path[i].p_idx); -+ if (ext3_ext_more_to_rm(path + i)) { -+ /* go to the next level */ -+ ext_debug(tree, "move to level %d (block %d)\n", -+ i + 1, path[i].p_idx->ei_leaf); -+ memset(path + i + 1, 0, sizeof(*path)); -+ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); -+ if (!path[i+1].p_bh) { -+ /* should we reset i_size? 
*/ -+ err = -EIO; -+ break; -+ } -+ /* put actual number of indexes to know is this -+ * number got changed at the next iteration */ -+ path[i].p_block = path[i].p_hdr->eh_entries; -+ i++; -+ } else { -+ /* we finish processing this index, go up */ -+ if (path[i].p_hdr->eh_entries == 0 && i > 0) { -+ /* index is empty, remove it -+ * handle must be already prepared by the -+ * truncatei_leaf() */ -+ err = ext3_ext_rm_idx(handle, tree, path + i); -+ } -+ /* root level have p_bh == NULL, brelse() eats this */ -+ brelse(path[i].p_bh); -+ i--; -+ ext_debug(tree, "return to level %d\n", i); -+ } -+ } -+ -+ /* TODO: flexible tree reduction should be here */ -+ if (path->p_hdr->eh_entries == 0) { -+ /* -+ * truncate to zero freed all the tree -+ * so, we need to correct eh_depth -+ */ -+ err = ext3_ext_get_access(handle, tree, path); -+ if (err == 0) { -+ EXT_ROOT_HDR(tree)->eh_depth = 0; -+ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree); -+ err = ext3_ext_dirty(handle, tree, path); -+ } -+ } -+ ext3_ext_tree_changed(tree); -+ -+ kfree(path); -+ ext3_journal_stop(handle); -+ -+ return err; -+} -+ -+int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks) -+{ -+ int lcap, icap, rcap, leafs, idxs, num; -+ -+ rcap = ext3_ext_space_root(tree); -+ if (blocks <= rcap) { -+ /* all extents fit to the root */ -+ return 0; -+ } -+ -+ rcap = ext3_ext_space_root_idx(tree); -+ lcap = ext3_ext_space_block(tree); -+ icap = ext3_ext_space_block_idx(tree); -+ -+ num = leafs = (blocks + lcap - 1) / lcap; -+ if (leafs <= rcap) { -+ /* all pointers to leafs fit to the root */ -+ return leafs; -+ } -+ -+ /* ok. 
we need separate index block(s) to link all leaf blocks */ -+ idxs = (leafs + icap - 1) / icap; -+ do { -+ num += idxs; -+ idxs = (idxs + icap - 1) / icap; -+ } while (idxs > rcap); -+ -+ return num; -+} -+ -+/* -+ * called at mount time -+ */ -+void ext3_ext_init(struct super_block *sb) -+{ -+ /* -+ * possible initialization would be here -+ */ -+ -+ if (test_opt(sb, EXTENTS)) { -+ printk("EXT3-fs: file extents enabled"); -+#ifdef AGRESSIVE_TEST -+ printk(", agressive tests"); -+#endif -+#ifdef CHECK_BINSEARCH -+ printk(", check binsearch"); -+#endif -+ printk("\n"); -+ } -+} -+ -+/* -+ * called at umount time -+ */ -+void ext3_ext_release(struct super_block *sb) -+{ -+} -+ -+/************************************************************************ -+ * VFS related routines -+ ************************************************************************/ -+ -+static int ext3_get_inode_write_access(handle_t *handle, void *buffer) -+{ -+ /* we use in-core data, not bh */ -+ return 0; -+} -+ -+static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) -+{ -+ struct inode *inode = buffer; -+ return ext3_mark_inode_dirty(handle, inode); -+} -+ -+static int ext3_ext_mergable(struct ext3_extent *ex1, -+ struct ext3_extent *ex2) -+{ -+ /* FIXME: support for large fs */ -+ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) -+ return 1; -+ return 0; -+} -+ -+static int -+ext3_remove_blocks_credits(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex, -+ unsigned long from, unsigned long to) -+{ -+ int needed; -+ -+ /* at present, extent can't cross block group */; -+ needed = 4; /* bitmap + group desc + sb + inode */ -+ -+#ifdef CONFIG_QUOTA -+ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; -+#endif -+ return needed; -+} -+ -+static int -+ext3_remove_blocks(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex, -+ unsigned long from, unsigned long to) -+{ -+ int needed = ext3_remove_blocks_credits(tree, ex, from, to); -+ handle_t *handle = 
ext3_journal_start(tree->inode, needed); -+ struct buffer_head *bh; -+ int i; -+ -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { -+ /* tail removal */ -+ unsigned long num, start; -+ num = ex->ee_block + ex->ee_len - from; -+ start = ex->ee_start + ex->ee_len - num; -+ ext_debug(tree, "free last %lu blocks starting %lu\n", -+ num, start); -+ for (i = 0; i < num; i++) { -+ bh = sb_find_get_block(tree->inode->i_sb, start + i); -+ ext3_forget(handle, 0, tree->inode, bh, start + i); -+ } -+ ext3_free_blocks(handle, tree->inode, start, num); -+ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { -+ printk("strange request: removal %lu-%lu from %u:%u\n", -+ from, to, ex->ee_block, ex->ee_len); -+ } else { -+ printk("strange request: removal(2) %lu-%lu from %u:%u\n", -+ from, to, ex->ee_block, ex->ee_len); -+ } -+ ext3_journal_stop(handle); -+ return 0; -+} -+ -+static int ext3_ext_find_goal(struct inode *inode, -+ struct ext3_ext_path *path, unsigned long block) -+{ -+ struct ext3_inode_info *ei = EXT3_I(inode); -+ unsigned long bg_start; -+ unsigned long colour; -+ int depth; -+ -+ if (path) { -+ struct ext3_extent *ex; -+ depth = path->p_depth; -+ -+ /* try to predict block placement */ -+ if ((ex = path[depth].p_ext)) -+ return ex->ee_start + (block - ex->ee_block); -+ -+ /* it looks index is empty -+ * try to find starting from index itself */ -+ if (path[depth].p_bh) -+ return path[depth].p_bh->b_blocknr; -+ } -+ -+ /* OK. 
use inode's group */ -+ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + -+ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); -+ colour = (current->pid % 16) * -+ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); -+ return bg_start + colour + block; -+} -+ -+static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *ex, int *err) -+{ -+ struct inode *inode = tree->inode; -+ int newblock, goal; -+ -+ EXT_ASSERT(path); -+ EXT_ASSERT(ex); -+ EXT_ASSERT(ex->ee_start); -+ EXT_ASSERT(ex->ee_len); -+ -+ /* reuse block from the extent to order data/metadata */ -+ newblock = ex->ee_start++; -+ ex->ee_len--; -+ if (ex->ee_len == 0) { -+ ex->ee_len = 1; -+ /* allocate new block for the extent */ -+ goal = ext3_ext_find_goal(inode, path, ex->ee_block); -+ ex->ee_start = ext3_new_block(handle, inode, goal, err); -+ ex->ee_start_hi = 0; -+ if (ex->ee_start == 0) { -+ /* error occured: restore old extent */ -+ ex->ee_start = newblock; -+ return 0; -+ } -+ } -+ return newblock; -+} -+ -+static struct ext3_extents_helpers ext3_blockmap_helpers = { -+ .get_write_access = ext3_get_inode_write_access, -+ .mark_buffer_dirty = ext3_mark_buffer_dirty, -+ .mergable = ext3_ext_mergable, -+ .new_block = ext3_new_block_cb, -+ .remove_extent = ext3_remove_blocks, -+ .remove_extent_credits = ext3_remove_blocks_credits, -+}; -+ -+void ext3_init_tree_desc(struct ext3_extents_tree *tree, -+ struct inode *inode) -+{ -+ tree->inode = inode; -+ tree->root = (void *) EXT3_I(inode)->i_data; -+ tree->buffer = (void *) inode; -+ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); -+ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; -+ tree->ops = &ext3_blockmap_helpers; -+} -+ -+int ext3_ext_get_block(handle_t *handle, struct inode *inode, -+ long iblock, struct buffer_head *bh_result, -+ int create, int extend_disksize) -+{ -+ struct ext3_ext_path *path = NULL; -+ struct ext3_extent 
newex; -+ struct ext3_extent *ex; -+ int goal, newblock, err = 0, depth; -+ struct ext3_extents_tree tree; -+ -+ clear_buffer_new(bh_result); -+ ext3_init_tree_desc(&tree, inode); -+ ext_debug(&tree, "block %d requested for inode %u\n", -+ (int) iblock, (unsigned) inode->i_ino); -+ down(&EXT3_I(inode)->truncate_sem); -+ -+ /* check in cache */ -+ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { -+ if (goal == EXT3_EXT_CACHE_GAP) { -+ if (!create) { -+ /* block isn't allocated yet and -+ * user don't want to allocate it */ -+ goto out2; -+ } -+ /* we should allocate requested block */ -+ } else if (goal == EXT3_EXT_CACHE_EXTENT) { -+ /* block is already allocated */ -+ newblock = iblock - newex.ee_block + newex.ee_start; -+ goto out; -+ } else { -+ EXT_ASSERT(0); -+ } -+ } -+ -+ /* find extent for this block */ -+ path = ext3_ext_find_extent(&tree, iblock, NULL); -+ if (IS_ERR(path)) { -+ err = PTR_ERR(path); -+ path = NULL; -+ goto out2; -+ } -+ -+ depth = EXT_DEPTH(&tree); -+ -+ /* -+ * consistent leaf must not be empty -+ * this situations is possible, though, _during_ tree modification -+ * this is why assert can't be put in ext3_ext_find_extent() -+ */ -+ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); -+ -+ if ((ex = path[depth].p_ext)) { -+ /* if found exent covers block, simple return it */ -+ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { -+ newblock = iblock - ex->ee_block + ex->ee_start; -+ ext_debug(&tree, "%d fit into %d:%d -> %d\n", -+ (int) iblock, ex->ee_block, ex->ee_len, -+ newblock); -+ ext3_ext_put_in_cache(&tree, ex->ee_block, -+ ex->ee_len, ex->ee_start, -+ EXT3_EXT_CACHE_EXTENT); -+ goto out; -+ } -+ } -+ -+ /* -+ * requested block isn't allocated yet -+ * we couldn't try to create block if create flag is zero -+ */ -+ if (!create) { -+ /* put just found gap into cache to speedup subsequest reqs */ -+ ext3_ext_put_gap_in_cache(&tree, path, iblock); -+ goto out2; -+ } -+ -+ /* allocate new block */ -+ goal = 
ext3_ext_find_goal(inode, path, iblock); -+ newblock = ext3_new_block(handle, inode, goal, &err); -+ if (!newblock) -+ goto out2; -+ ext_debug(&tree, "allocate new block: goal %d, found %d\n", -+ goal, newblock); -+ -+ /* try to insert new extent into found leaf and return */ -+ newex.ee_block = iblock; -+ newex.ee_start = newblock; -+ newex.ee_start_hi = 0; -+ newex.ee_len = 1; -+ err = ext3_ext_insert_extent(handle, &tree, path, &newex); -+ if (err) -+ goto out2; -+ -+ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ -+ /* previous routine could use block we allocated */ -+ newblock = newex.ee_start; -+ set_buffer_new(bh_result); -+ -+ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, -+ newex.ee_start, EXT3_EXT_CACHE_EXTENT); -+out: -+ ext3_ext_show_leaf(&tree, path); -+ map_bh(bh_result, inode->i_sb, newblock); -+out2: -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ up(&EXT3_I(inode)->truncate_sem); -+ -+ return err; -+} -+ -+void ext3_ext_truncate(struct inode * inode, struct page *page) -+{ -+ struct address_space *mapping = inode->i_mapping; -+ struct super_block *sb = inode->i_sb; -+ struct ext3_extents_tree tree; -+ unsigned long last_block; -+ handle_t *handle; -+ int err = 0; -+ -+ ext3_init_tree_desc(&tree, inode); -+ -+ /* -+ * probably first extent we're gonna free will be last in block -+ */ -+ err = ext3_writepage_trans_blocks(inode) + 3; -+ handle = ext3_journal_start(inode, err); -+ if (IS_ERR(handle)) { -+ if (page) { -+ clear_highpage(page); -+ flush_dcache_page(page); -+ unlock_page(page); -+ page_cache_release(page); -+ } -+ return; -+ } -+ -+ if (page) -+ ext3_block_truncate_page(handle, page, mapping, inode->i_size); -+ -+ down(&EXT3_I(inode)->truncate_sem); -+ ext3_ext_invalidate_cache(&tree); -+ -+ /* -+ * TODO: optimization is possible here -+ * probably we need not scaning at all, -+ * because page truncation is enough -+ */ -+ if 
(ext3_orphan_add(handle, inode)) -+ goto out_stop; -+ -+ /* we have to know where to truncate from in crash case */ -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ ext3_mark_inode_dirty(handle, inode); -+ -+ last_block = (inode->i_size + sb->s_blocksize - 1) >> -+ EXT3_BLOCK_SIZE_BITS(sb); -+ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); -+ -+ /* In a multi-transaction truncate, we only make the final -+ * transaction synchronous */ -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ -+out_stop: -+ /* -+ * If this was a simple ftruncate(), and the file will remain alive -+ * then we need to clear up the orphan record which we created above. -+ * However, if this was a real unlink then we were called by -+ * ext3_delete_inode(), and we allow that function to clean up the -+ * orphan info for us. -+ */ -+ if (inode->i_nlink) -+ ext3_orphan_del(handle, inode); -+ -+ up(&EXT3_I(inode)->truncate_sem); -+ ext3_journal_stop(handle); -+} -+ -+/* -+ * this routine calculate max number of blocks we could modify -+ * in order to allocate new block for an inode -+ */ -+int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) -+{ -+ struct ext3_extents_tree tree; -+ int needed; -+ -+ ext3_init_tree_desc(&tree, inode); -+ -+ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); -+ -+ /* caller want to allocate num blocks */ -+ needed *= num; -+ -+#ifdef CONFIG_QUOTA -+ /* -+ * FIXME: real calculation should be here -+ * it depends on blockmap format of qouta file -+ */ -+ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; -+#endif -+ -+ return needed; -+} -+ -+void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) -+{ -+ struct ext3_extents_tree tree; -+ -+ ext3_init_tree_desc(&tree, inode); -+ ext3_extent_tree_init(handle, &tree); -+} -+ -+int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) -+{ -+ struct ext3_extents_tree tree; -+ -+ ext3_init_tree_desc(&tree, inode); -+ return ext3_ext_calc_metadata_amount(&tree, 
blocks); -+} -+ -+static int -+ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_ext_cache *newex) -+{ -+ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; -+ -+ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) -+ return EXT_CONTINUE; -+ -+ if (buf->err < 0) -+ return EXT_BREAK; -+ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) -+ return EXT_BREAK; -+ -+ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { -+ buf->err++; -+ buf->cur += sizeof(*newex); -+ } else { -+ buf->err = -EFAULT; -+ return EXT_BREAK; -+ } -+ return EXT_CONTINUE; -+} -+ -+static int -+ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_ext_cache *ex) -+{ -+ struct ext3_extent_tree_stats *buf = -+ (struct ext3_extent_tree_stats *) tree->private; -+ int depth; -+ -+ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) -+ return EXT_CONTINUE; -+ -+ depth = EXT_DEPTH(tree); -+ buf->extents_num++; -+ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) -+ buf->leaf_num++; -+ return EXT_CONTINUE; -+} -+ -+int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, -+ unsigned long arg) -+{ -+ int err = 0; -+ -+ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) -+ return -EINVAL; -+ -+ if (cmd == EXT3_IOC_GET_EXTENTS) { -+ struct ext3_extent_buf buf; -+ struct ext3_extents_tree tree; -+ -+ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) -+ return -EFAULT; -+ -+ ext3_init_tree_desc(&tree, inode); -+ buf.cur = buf.buffer; -+ buf.err = 0; -+ tree.private = &buf; -+ down(&EXT3_I(inode)->truncate_sem); -+ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, -+ ext3_ext_store_extent_cb); -+ up(&EXT3_I(inode)->truncate_sem); -+ if (err == 0) -+ err = buf.err; -+ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { -+ struct ext3_extent_tree_stats buf; -+ struct ext3_extents_tree tree; -+ -+ ext3_init_tree_desc(&tree, inode); -+ 
down(&EXT3_I(inode)->truncate_sem); -+ buf.depth = EXT_DEPTH(&tree); -+ buf.extents_num = 0; -+ buf.leaf_num = 0; -+ tree.private = &buf; -+ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, -+ ext3_ext_collect_stats_cb); -+ up(&EXT3_I(inode)->truncate_sem); -+ if (!err) -+ err = copy_to_user((void *) arg, &buf, sizeof(buf)); -+ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { -+ struct ext3_extents_tree tree; -+ ext3_init_tree_desc(&tree, inode); -+ down(&EXT3_I(inode)->truncate_sem); -+ err = EXT_DEPTH(&tree); -+ up(&EXT3_I(inode)->truncate_sem); -+ } -+ -+ return err; -+} -+ -+EXPORT_SYMBOL(ext3_init_tree_desc); -+EXPORT_SYMBOL(ext3_mark_inode_dirty); -+EXPORT_SYMBOL(ext3_ext_invalidate_cache); -+EXPORT_SYMBOL(ext3_ext_insert_extent); -+EXPORT_SYMBOL(ext3_ext_walk_space); -+EXPORT_SYMBOL(ext3_ext_find_goal); -+EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); -Index: linux-2.6.16.27-0.9/fs/ext3/ialloc.c -=================================================================== ---- linux-2.6.16.27-0.9.orig/fs/ext3/ialloc.c -+++ linux-2.6.16.27-0.9/fs/ext3/ialloc.c -@@ -601,7 +601,7 @@ got: - ei->i_dir_start_lookup = 0; - ei->i_disksize = 0; - -- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; -+ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); - /* dirsync only applies to directories */ -@@ -645,6 +645,18 @@ got: - if (err) - goto fail_free_drop; - -+ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { -+ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; -+ ext3_extents_initialize_blockmap(handle, inode); -+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { -+ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); -+ if (err) goto fail; -+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); -+ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); -+ } -+ } 
-+ - err = ext3_mark_inode_dirty(handle, inode); - if (err) { - ext3_std_error(sb, err); -Index: linux-2.6.16.27-0.9/fs/ext3/inode.c -=================================================================== ---- linux-2.6.16.27-0.9.orig/fs/ext3/inode.c -+++ linux-2.6.16.27-0.9/fs/ext3/inode.c -@@ -40,7 +40,7 @@ - #include "iopen.h" - #include "acl.h" - --static int ext3_writepage_trans_blocks(struct inode *inode); -+int ext3_writepage_trans_blocks(struct inode *inode); - - /* - * Test whether an inode is a fast symlink. -@@ -788,6 +788,17 @@ out: - return err; - } - -+static inline int -+ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, -+ struct buffer_head *bh, int create, int extend_disksize) -+{ -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_get_block(handle, inode, block, bh, create, -+ extend_disksize); -+ return ext3_get_block_handle(handle, inode, block, bh, create, -+ extend_disksize); -+} -+ - static int ext3_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) - { -@@ -798,8 +809,8 @@ static int ext3_get_block(struct inode * - handle = ext3_journal_current_handle(); - J_ASSERT(handle != 0); - } -- ret = ext3_get_block_handle(handle, inode, iblock, -- bh_result, create, 1); -+ ret = ext3_get_block_wrap(handle, inode, iblock, -+ bh_result, create, 1); - return ret; - } - -@@ -843,7 +854,7 @@ ext3_direct_io_get_blocks(struct inode * - - get_block: - if (ret == 0) -- ret = ext3_get_block_handle(handle, inode, iblock, -+ ret = ext3_get_block_wrap(handle, inode, iblock, - bh_result, create, 0); - bh_result->b_size = (1 << inode->i_blkbits); - return ret; -@@ -863,7 +874,7 @@ struct buffer_head *ext3_getblk(handle_t - dummy.b_state = 0; - dummy.b_blocknr = -1000; - buffer_trace_init(&dummy.b_history); -- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); -+ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); - if (!*errp && buffer_mapped(&dummy)) 
{ - struct buffer_head *bh; - bh = sb_getblk(inode->i_sb, dummy.b_blocknr); -@@ -1606,7 +1617,7 @@ void ext3_set_aops(struct inode *inode) - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ --static int ext3_block_truncate_page(handle_t *handle, struct page *page, -+int ext3_block_truncate_page(handle_t *handle, struct page *page, - struct address_space *mapping, loff_t from) - { - unsigned long index = from >> PAGE_CACHE_SHIFT; -@@ -2116,6 +2127,9 @@ void ext3_truncate(struct inode * inode) - return; - } - -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_truncate(inode, page); -+ - handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { -@@ -2863,12 +2877,15 @@ err_out: - * block and work out the exact number of indirects which are touched. Pah. - */ - --static int ext3_writepage_trans_blocks(struct inode *inode) -+int ext3_writepage_trans_blocks(struct inode *inode) - { - int bpp = ext3_journal_blocks_per_page(inode); - int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 
5 : 3; - int ret; - -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_writepage_trans_blocks(inode, bpp); -+ - if (ext3_should_journal_data(inode)) - ret = 3 * (bpp + indirects) + 2; - else -Index: linux-2.6.16.27-0.9/fs/ext3/Makefile -=================================================================== ---- linux-2.6.16.27-0.9.orig/fs/ext3/Makefile -+++ linux-2.6.16.27-0.9/fs/ext3/Makefile -@@ -5,7 +5,8 @@ - obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ -- ioctl.o namei.o super.o symlink.o hash.o resize.o -+ ioctl.o namei.o super.o symlink.o hash.o resize.o \ -+ extents.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-2.6.16.27-0.9/fs/ext3/super.c -=================================================================== ---- linux-2.6.16.27-0.9.orig/fs/ext3/super.c -+++ linux-2.6.16.27-0.9/fs/ext3/super.c -@@ -392,6 +392,7 @@ static void ext3_put_super (struct super - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); - if (!(sb->s_flags & MS_RDONLY)) { -@@ -456,6 +457,8 @@ static struct inode *ext3_alloc_inode(st - #endif - ei->i_block_alloc_info = NULL; - ei->vfs_inode.i_version = 1; -+ -+ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); - return &ei->vfs_inode; - } - -@@ -681,6 +684,7 @@ enum { - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -+ Opt_extents, Opt_noextents, Opt_extdebug, - Opt_grpquota - }; - -@@ -732,6 +736,9 @@ static match_table_t tokens = { - {Opt_iopen, "iopen"}, - {Opt_noiopen, "noiopen"}, - {Opt_iopen_nopriv, "iopen_nopriv"}, -+ {Opt_extents, "extents"}, -+ {Opt_noextents, "noextents"}, -+ {Opt_extdebug, "extdebug"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, 
NULL}, - {Opt_resize, "resize"}, -@@ -1073,6 +1080,15 @@ clear_qf_name: - case Opt_nobh: - set_opt(sbi->s_mount_opt, NOBH); - break; -+ case Opt_extents: -+ set_opt (sbi->s_mount_opt, EXTENTS); -+ break; -+ case Opt_noextents: -+ clear_opt (sbi->s_mount_opt, EXTENTS); -+ break; -+ case Opt_extdebug: -+ set_opt (sbi->s_mount_opt, EXTDEBUG); -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1799,6 +1815,7 @@ static int ext3_fill_super (struct super - percpu_counter_mod(&sbi->s_dirs_counter, - ext3_count_dirs(sb)); - -+ ext3_ext_init(sb); - lock_kernel(); - return 0; - -Index: linux-2.6.16.27-0.9/fs/ext3/ioctl.c -=================================================================== ---- linux-2.6.16.27-0.9.orig/fs/ext3/ioctl.c -+++ linux-2.6.16.27-0.9/fs/ext3/ioctl.c -@@ -125,6 +125,10 @@ flags_err: - err = ext3_change_inode_journal_flag(inode, jflag); - return err; - } -+ case EXT3_IOC_GET_EXTENTS: -+ case EXT3_IOC_GET_TREE_STATS: -+ case EXT3_IOC_GET_TREE_DEPTH: -+ return ext3_ext_ioctl(inode, filp, cmd, arg); - case EXT3_IOC_GETVERSION: - case EXT3_IOC_GETVERSION_OLD: - return put_user(inode->i_generation, (int __user *) arg); -Index: linux-2.6.16.27-0.9/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.16.27-0.9.orig/include/linux/ext3_fs.h -+++ linux-2.6.16.27-0.9/include/linux/ext3_fs.h -@@ -185,9 +185,10 @@ struct ext3_group_desc - #define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */ - #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ - #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ -+#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ - #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ - --#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ -+#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ - #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User 
modifiable flags */ - - /* -@@ -237,6 +238,9 @@ struct ext3_new_group_data { - #endif - #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) - #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) -+#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) -+#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) -+#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) - - /* - * Mount options -@@ -377,6 +381,8 @@ struct ext3_inode { - #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ - #define EXT3_MOUNT_IOPEN 0x400000 /* Allow access via iopen */ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ -+#define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ -+#define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -565,11 +571,13 @@ static inline struct ext3_inode_info *EX - #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ - #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ - #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 -+#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ - - #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR - #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ - EXT3_FEATURE_INCOMPAT_RECOVER| \ -- EXT3_FEATURE_INCOMPAT_META_BG) -+ EXT3_FEATURE_INCOMPAT_META_BG| \ -+ EXT3_FEATURE_INCOMPAT_EXTENTS) - #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT3_FEATURE_RO_COMPAT_BTREE_DIR) -@@ -776,6 +784,7 @@ extern unsigned long ext3_count_free (st - - - /* inode.c */ -+extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); - int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); - struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); - struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); 
-@@ -795,6 +804,7 @@ extern int ext3_get_inode_loc(struct ino - extern void ext3_truncate (struct inode *); - extern void ext3_set_inode_flags(struct inode *); - extern void ext3_set_aops(struct inode *inode); -+extern int ext3_writepage_trans_blocks(struct inode *inode); - - /* ioctl.c */ - extern int ext3_ioctl (struct inode *, struct file *, unsigned int, -@@ -848,6 +858,16 @@ extern struct inode_operations ext3_spec - extern struct inode_operations ext3_symlink_inode_operations; - extern struct inode_operations ext3_fast_symlink_inode_operations; - -+/* extents.c */ -+extern int ext3_ext_writepage_trans_blocks(struct inode *, int); -+extern int ext3_ext_get_block(handle_t *, struct inode *, long, -+ struct buffer_head *, int, int); -+extern void ext3_ext_truncate(struct inode *, struct page *); -+extern void ext3_ext_init(struct super_block *); -+extern void ext3_ext_release(struct super_block *); -+extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); -+extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg); - - #endif /* __KERNEL__ */ - -Index: linux-2.6.16.27-0.9/include/linux/ext3_extents.h -=================================================================== ---- /dev/null -+++ linux-2.6.16.27-0.9/include/linux/ext3_extents.h -@@ -0,0 +1,262 @@ -+/* -+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+#ifndef _LINUX_EXT3_EXTENTS -+#define _LINUX_EXT3_EXTENTS -+ -+/* -+ * with AGRESSIVE_TEST defined capacity of index/leaf blocks -+ * become very little, so index split, in-depth growing and -+ * other hard changes happens much more often -+ * this is for debug purposes only -+ */ -+#define AGRESSIVE_TEST_ -+ -+/* -+ * if CHECK_BINSEARCH defined, then results of binary search -+ * will be checked by linear search -+ */ -+#define CHECK_BINSEARCH_ -+ -+/* -+ * if EXT_DEBUG is defined you can use 'extdebug' mount option -+ * to get lots of info what's going on -+ */ -+#define EXT_DEBUG_ -+#ifdef EXT_DEBUG -+#define ext_debug(tree,fmt,a...) \ -+do { \ -+ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ -+ printk(fmt, ##a); \ -+} while (0); -+#else -+#define ext_debug(tree,fmt,a...) -+#endif -+ -+/* -+ * if EXT_STATS is defined then stats numbers are collected -+ * these number will be displayed at umount time -+ */ -+#define EXT_STATS_ -+ -+ -+#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */ -+ -+/* -+ * ext3_inode has i_block array (total 60 bytes) -+ * first 4 bytes are used to store: -+ * - tree depth (0 mean there is no tree yet. 
all extents in the inode) -+ * - number of alive extents in the inode -+ */ -+ -+/* -+ * this is extent on-disk structure -+ * it's used at the bottom of the tree -+ */ -+struct ext3_extent { -+ __u32 ee_block; /* first logical block extent covers */ -+ __u16 ee_len; /* number of blocks covered by extent */ -+ __u16 ee_start_hi; /* high 16 bits of physical block */ -+ __u32 ee_start; /* low 32 bigs of physical block */ -+}; -+ -+/* -+ * this is index on-disk structure -+ * it's used at all the levels, but the bottom -+ */ -+struct ext3_extent_idx { -+ __u32 ei_block; /* index covers logical blocks from 'block' */ -+ __u32 ei_leaf; /* pointer to the physical block of the next * -+ * level. leaf or next index could bet here */ -+ __u16 ei_leaf_hi; /* high 16 bits of physical block */ -+ __u16 ei_unused; -+}; -+ -+/* -+ * each block (leaves and indexes), even inode-stored has header -+ */ -+struct ext3_extent_header { -+ __u16 eh_magic; /* probably will support different formats */ -+ __u16 eh_entries; /* number of valid entries */ -+ __u16 eh_max; /* capacity of store in entries */ -+ __u16 eh_depth; /* has tree real underlaying blocks? 
*/ -+ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ -+}; -+ -+#define EXT3_EXT_MAGIC 0xf30a -+ -+/* -+ * array of ext3_ext_path contains path to some extent -+ * creation/lookup routines use it for traversal/splitting/etc -+ * truncate uses it to simulate recursive walking -+ */ -+struct ext3_ext_path { -+ __u32 p_block; -+ __u16 p_depth; -+ struct ext3_extent *p_ext; -+ struct ext3_extent_idx *p_idx; -+ struct ext3_extent_header *p_hdr; -+ struct buffer_head *p_bh; -+}; -+ -+/* -+ * structure for external API -+ */ -+ -+/* -+ * storage for cached extent -+ */ -+struct ext3_ext_cache { -+ __u32 ec_start; -+ __u32 ec_block; -+ __u32 ec_len; -+ __u32 ec_type; -+}; -+ -+#define EXT3_EXT_CACHE_NO 0 -+#define EXT3_EXT_CACHE_GAP 1 -+#define EXT3_EXT_CACHE_EXTENT 2 -+ -+/* -+ * ext3_extents_tree is used to pass initial information -+ * to top-level extents API -+ */ -+struct ext3_extents_helpers; -+struct ext3_extents_tree { -+ struct inode *inode; /* inode which tree belongs to */ -+ void *root; /* ptr to data top of tree resides at */ -+ void *buffer; /* will be passed as arg to ^^ routines */ -+ int buffer_len; -+ void *private; -+ struct ext3_ext_cache *cex;/* last found extent */ -+ struct ext3_extents_helpers *ops; -+}; -+ -+struct ext3_extents_helpers { -+ int (*get_write_access)(handle_t *h, void *buffer); -+ int (*mark_buffer_dirty)(handle_t *h, void *buffer); -+ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); -+ int (*remove_extent_credits)(struct ext3_extents_tree *, -+ struct ext3_extent *, unsigned long, -+ unsigned long); -+ int (*remove_extent)(struct ext3_extents_tree *, -+ struct ext3_extent *, unsigned long, -+ unsigned long); -+ int (*new_block)(handle_t *, struct ext3_extents_tree *, -+ struct ext3_ext_path *, struct ext3_extent *, -+ int *); -+}; -+ -+/* -+ * to be called by ext3_ext_walk_space() -+ * negative retcode - error -+ * positive retcode - signal for ext3_ext_walk_space(), see below -+ * callback must 
return valid extent (passed or newly created) -+ */ -+typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, -+ struct ext3_ext_path *, -+ struct ext3_ext_cache *); -+ -+#define EXT_CONTINUE 0 -+#define EXT_BREAK 1 -+#define EXT_REPEAT 2 -+ -+ -+#define EXT_MAX_BLOCK 0xffffffff -+ -+ -+#define EXT_FIRST_EXTENT(__hdr__) \ -+ ((struct ext3_extent *) (((char *) (__hdr__)) + \ -+ sizeof(struct ext3_extent_header))) -+#define EXT_FIRST_INDEX(__hdr__) \ -+ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ -+ sizeof(struct ext3_extent_header))) -+#define EXT_HAS_FREE_INDEX(__path__) \ -+ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) -+#define EXT_LAST_EXTENT(__hdr__) \ -+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) -+#define EXT_LAST_INDEX(__hdr__) \ -+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) -+#define EXT_MAX_EXTENT(__hdr__) \ -+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) -+#define EXT_MAX_INDEX(__hdr__) \ -+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) -+#define EXT_HDR_GEN(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) -+#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) -+#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ -+ -+#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) -+#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) -+#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) -+#define EXT_GENERATION(__tree__) EXT_HDR_GEN(EXT_ROOT_HDR(__tree__)) -+ -+#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); -+ -+#define EXT_CHECK_PATH(tree,path) \ -+{ \ -+ int depth = EXT_DEPTH(tree); \ -+ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ -+ BUG_ON((unsigned long) (path)[depth].p_idx < \ -+ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ -+ BUG_ON((unsigned long) (path)[depth].p_ext < \ -+ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ -+ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ 
-+ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ -+ && depth != 0); \ -+ BUG_ON((path)[0].p_depth != depth); \ -+} -+ -+ -+/* -+ * this structure is used to gather extents from the tree via ioctl -+ */ -+struct ext3_extent_buf { -+ unsigned long start; -+ int buflen; -+ void *buffer; -+ void *cur; -+ int err; -+}; -+ -+/* -+ * this structure is used to collect stats info about the tree -+ */ -+struct ext3_extent_tree_stats { -+ int depth; -+ int extents_num; -+ int leaf_num; -+}; -+ -+extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); -+extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); -+extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); -+extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); -+extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); -+extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); -+extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); -+extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); -+ -+static inline void -+ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) -+{ -+ if (tree->cex) -+ tree->cex->ec_type = EXT3_EXT_CACHE_NO; -+} -+ -+ -+#endif /* _LINUX_EXT3_EXTENTS */ -Index: linux-2.6.16.27-0.9/include/linux/ext3_fs_i.h -=================================================================== ---- linux-2.6.16.27-0.9.orig/include/linux/ext3_fs_i.h -+++ linux-2.6.16.27-0.9/include/linux/ext3_fs_i.h -@@ -133,6 +133,8 @@ struct ext3_inode_info { - */ - struct semaphore truncate_sem; - struct inode vfs_inode; -+ -+ __u32 i_cached_extent[4]; - }; - - #endif /* _LINUX_EXT3_FS_I */ diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch 
b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch deleted file mode 100644 index 7bc712e..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch +++ /dev/null @@ -1,2950 +0,0 @@ -Index: linux-stage/fs/ext3/extents.c -=================================================================== ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ linux-stage/fs/ext3/extents.c 2006-07-16 14:10:21.000000000 +0800 -@@ -0,0 +1,2359 @@ -+/* -+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+/* -+ * Extents support for EXT3 -+ * -+ * TODO: -+ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() -+ * - ext3_ext_calc_credits() could take 'mergable' into account -+ * - ext3*_error() should be used in some situations -+ * - find_goal() [to be tested and improved] -+ * - smart tree reduction -+ * - arch-independence -+ * common on-disk format for big/little-endian arch -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+ -+static inline int ext3_ext_check_header(struct ext3_extent_header *eh) -+{ -+ if (eh->eh_magic != EXT3_EXT_MAGIC) { -+ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", -+ (unsigned)eh->eh_magic); -+ return -EIO; -+ } -+ if (eh->eh_max == 0) { -+ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n", -+ (unsigned)eh->eh_max); -+ return -EIO; -+ } -+ if (eh->eh_entries > eh->eh_max) { -+ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", -+ (unsigned)eh->eh_entries); -+ return -EIO; -+ } -+ return 0; -+} -+ -+static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) -+{ -+ int err; -+ -+ if (handle->h_buffer_credits > needed) -+ return handle; -+ if (!ext3_journal_extend(handle, needed)) -+ return handle; -+ err = ext3_journal_restart(handle, needed); -+ -+ return handle; -+} -+ -+static int inline -+ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) -+{ -+ if (tree->ops->get_write_access) -+ return tree->ops->get_write_access(h,tree->buffer); -+ else -+ return 0; -+} -+ -+static int inline -+ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) -+{ -+ if (tree->ops->mark_buffer_dirty) -+ return tree->ops->mark_buffer_dirty(h,tree->buffer); -+ else -+ return 
0; -+} -+ -+/* -+ * could return: -+ * - EROFS -+ * - ENOMEM -+ */ -+static int ext3_ext_get_access(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int err; -+ -+ if (path->p_bh) { -+ /* path points to block */ -+ err = ext3_journal_get_write_access(handle, path->p_bh); -+ } else { -+ /* path points to leaf/index in inode body */ -+ err = ext3_ext_get_access_for_root(handle, tree); -+ } -+ return err; -+} -+ -+/* -+ * could return: -+ * - EROFS -+ * - ENOMEM -+ * - EIO -+ */ -+static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int err; -+ if (path->p_bh) { -+ /* path points to block */ -+ err =ext3_journal_dirty_metadata(handle, path->p_bh); -+ } else { -+ /* path points to leaf/index in inode body */ -+ err = ext3_ext_mark_root_dirty(handle, tree); -+ } -+ return err; -+} -+ -+static int inline -+ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, struct ext3_extent *ex, -+ int *err) -+{ -+ int goal, depth, newblock; -+ struct inode *inode; -+ -+ EXT_ASSERT(tree); -+ if (tree->ops->new_block) -+ return tree->ops->new_block(handle, tree, path, ex, err); -+ -+ inode = tree->inode; -+ depth = EXT_DEPTH(tree); -+ if (path && depth > 0) { -+ goal = path[depth-1].p_block; -+ } else { -+ struct ext3_inode_info *ei = EXT3_I(inode); -+ unsigned long bg_start; -+ unsigned long colour; -+ -+ bg_start = (ei->i_block_group * -+ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + -+ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); -+ colour = (current->pid % 16) * -+ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); -+ goal = bg_start + colour; -+ } -+ -+ newblock = ext3_new_block(handle, inode, goal, err); -+ return newblock; -+} -+ -+static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) -+{ -+ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 
24) | -+ (EXT_HDR_GEN(neh) + 1); -+} -+ -+static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->inode->i_sb->s_blocksize - -+ sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent); -+#ifdef AGRESSIVE_TEST -+ size = 6; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->inode->i_sb->s_blocksize - -+ sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent_idx); -+#ifdef AGRESSIVE_TEST -+ size = 5; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent); -+#ifdef AGRESSIVE_TEST -+ size = 3; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent_idx); -+#ifdef AGRESSIVE_TEST -+ size = 4; -+#endif -+ return size; -+} -+ -+static void ext3_ext_show_path(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+#ifdef EXT_DEBUG -+ int k, l = path->p_depth; -+ -+ ext_debug(tree, "path:"); -+ for (k = 0; k <= l; k++, path++) { -+ if (path->p_idx) { -+ ext_debug(tree, " %d->%d", path->p_idx->ei_block, -+ path->p_idx->ei_leaf); -+ } else if (path->p_ext) { -+ ext_debug(tree, " %d:%d:%d", -+ path->p_ext->ee_block, -+ path->p_ext->ee_len, -+ path->p_ext->ee_start); -+ } else -+ ext_debug(tree, " []"); -+ } -+ ext_debug(tree, "\n"); -+#endif -+} -+ -+static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+#ifdef EXT_DEBUG -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent_header *eh; -+ struct ext3_extent *ex; -+ int i; -+ -+ if (!path) -+ return; -+ -+ eh = path[depth].p_hdr; -+ ex = EXT_FIRST_EXTENT(eh); -+ -+ for (i = 0; i 
< eh->eh_entries; i++, ex++) { -+ ext_debug(tree, "%d:%d:%d ", -+ ex->ee_block, ex->ee_len, ex->ee_start); -+ } -+ ext_debug(tree, "\n"); -+#endif -+} -+ -+static void ext3_ext_drop_refs(struct ext3_ext_path *path) -+{ -+ int depth = path->p_depth; -+ int i; -+ -+ for (i = 0; i <= depth; i++, path++) { -+ if (path->p_bh) { -+ brelse(path->p_bh); -+ path->p_bh = NULL; -+ } -+ } -+} -+ -+/* -+ * binary search for closest index by given block -+ */ -+static inline void -+ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, int block) -+{ -+ struct ext3_extent_header *eh = path->p_hdr; -+ struct ext3_extent_idx *ix; -+ int l = 0, k, r; -+ -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ EXT_ASSERT(eh->eh_entries > 0); -+ -+ ext_debug(tree, "binsearch for %d(idx): ", block); -+ -+ path->p_idx = ix = EXT_FIRST_INDEX(eh); -+ -+ r = k = eh->eh_entries; -+ while (k > 1) { -+ k = (r - l) / 2; -+ if (block < ix[l + k].ei_block) -+ r -= k; -+ else -+ l += k; -+ ext_debug(tree, "%d:%d:%d ", k, l, r); -+ } -+ -+ ix += l; -+ path->p_idx = ix; -+ ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf); -+ -+ while (l++ < r) { -+ if (block < ix->ei_block) -+ break; -+ path->p_idx = ix++; -+ } -+ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, -+ path->p_idx->ei_leaf); -+ -+#ifdef CHECK_BINSEARCH -+ { -+ struct ext3_extent_idx *chix; -+ -+ chix = ix = EXT_FIRST_INDEX(eh); -+ for (k = 0; k < eh->eh_entries; k++, ix++) { -+ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { -+ printk("k=%d, ix=0x%p, first=0x%p\n", k, -+ ix, EXT_FIRST_INDEX(eh)); -+ printk("%u <= %u\n", -+ ix->ei_block,ix[-1].ei_block); -+ } -+ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); -+ if (block < ix->ei_block) -+ break; -+ chix = ix; -+ } -+ EXT_ASSERT(chix == path->p_idx); -+ } -+#endif -+} -+ -+/* -+ * binary search for closest extent by given block -+ */ -+static inline void 
-+ext3_ext_binsearch(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, int block) -+{ -+ struct ext3_extent_header *eh = path->p_hdr; -+ struct ext3_extent *ex; -+ int l = 0, k, r; -+ -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ -+ if (eh->eh_entries == 0) { -+ /* -+ * this leaf is empty yet: -+ * we get such a leaf in split/add case -+ */ -+ return; -+ } -+ -+ ext_debug(tree, "binsearch for %d: ", block); -+ -+ path->p_ext = ex = EXT_FIRST_EXTENT(eh); -+ -+ r = k = eh->eh_entries; -+ while (k > 1) { -+ k = (r - l) / 2; -+ if (block < ex[l + k].ee_block) -+ r -= k; -+ else -+ l += k; -+ ext_debug(tree, "%d:%d:%d ", k, l, r); -+ } -+ -+ ex += l; -+ path->p_ext = ex; -+ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, -+ path->p_ext->ee_start, path->p_ext->ee_len); -+ -+ while (l++ < r) { -+ if (block < ex->ee_block) -+ break; -+ path->p_ext = ex++; -+ } -+ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, -+ path->p_ext->ee_start, path->p_ext->ee_len); -+ -+#ifdef CHECK_BINSEARCH -+ { -+ struct ext3_extent *chex; -+ -+ chex = ex = EXT_FIRST_EXTENT(eh); -+ for (k = 0; k < eh->eh_entries; k++, ex++) { -+ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); -+ if (block < ex->ee_block) -+ break; -+ chex = ex; -+ } -+ EXT_ASSERT(chex == path->p_ext); -+ } -+#endif -+} -+ -+int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) -+{ -+ struct ext3_extent_header *eh; -+ -+ BUG_ON(tree->buffer_len == 0); -+ ext3_ext_get_access_for_root(handle, tree); -+ eh = EXT_ROOT_HDR(tree); -+ eh->eh_depth = 0; -+ eh->eh_entries = 0; -+ eh->eh_magic = EXT3_EXT_MAGIC; -+ eh->eh_max = ext3_ext_space_root(tree); -+ ext3_ext_mark_root_dirty(handle, tree); -+ ext3_ext_invalidate_cache(tree); -+ return 0; -+} -+ -+struct ext3_ext_path * -+ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, -+ struct ext3_ext_path *path) -+{ -+ struct ext3_extent_header *eh; -+ struct buffer_head *bh; 
-+ int depth, i, ppos = 0; -+ -+ EXT_ASSERT(tree); -+ EXT_ASSERT(tree->inode); -+ EXT_ASSERT(tree->root); -+ -+ eh = EXT_ROOT_HDR(tree); -+ EXT_ASSERT(eh); -+ if (ext3_ext_check_header(eh)) { -+ /* don't free previously allocated path -+ * -- caller should take care */ -+ path = NULL; -+ goto err; -+ } -+ -+ i = depth = EXT_DEPTH(tree); -+ EXT_ASSERT(eh->eh_max); -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ -+ /* account possible depth increase */ -+ if (!path) { -+ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), -+ GFP_NOFS); -+ if (!path) -+ return ERR_PTR(-ENOMEM); -+ } -+ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); -+ path[0].p_hdr = eh; -+ -+ /* walk through the tree */ -+ while (i) { -+ ext_debug(tree, "depth %d: num %d, max %d\n", -+ ppos, eh->eh_entries, eh->eh_max); -+ ext3_ext_binsearch_idx(tree, path + ppos, block); -+ path[ppos].p_block = path[ppos].p_idx->ei_leaf; -+ path[ppos].p_depth = i; -+ path[ppos].p_ext = NULL; -+ -+ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); -+ if (!bh) -+ goto err; -+ -+ eh = EXT_BLOCK_HDR(bh); -+ ppos++; -+ EXT_ASSERT(ppos <= depth); -+ path[ppos].p_bh = bh; -+ path[ppos].p_hdr = eh; -+ i--; -+ -+ if (ext3_ext_check_header(eh)) -+ goto err; -+ } -+ -+ path[ppos].p_depth = i; -+ path[ppos].p_hdr = eh; -+ path[ppos].p_ext = NULL; -+ path[ppos].p_idx = NULL; -+ -+ if (ext3_ext_check_header(eh)) -+ goto err; -+ -+ /* find extent */ -+ ext3_ext_binsearch(tree, path + ppos, block); -+ -+ ext3_ext_show_path(tree, path); -+ -+ return path; -+ -+err: -+ printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ return ERR_PTR(-EIO); -+} -+ -+/* -+ * insert new index [logical;ptr] into the block at cupr -+ * it check where to insert: before curp or after curp -+ */ -+static int ext3_ext_insert_index(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *curp, -+ int logical, int ptr) -+{ -+ struct 
ext3_extent_idx *ix; -+ int len, err; -+ -+ if ((err = ext3_ext_get_access(handle, tree, curp))) -+ return err; -+ -+ EXT_ASSERT(logical != curp->p_idx->ei_block); -+ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; -+ if (logical > curp->p_idx->ei_block) { -+ /* insert after */ -+ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { -+ len = (len - 1) * sizeof(struct ext3_extent_idx); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert new index %d after: %d. " -+ "move %d from 0x%p to 0x%p\n", -+ logical, ptr, len, -+ (curp->p_idx + 1), (curp->p_idx + 2)); -+ memmove(curp->p_idx + 2, curp->p_idx + 1, len); -+ } -+ ix = curp->p_idx + 1; -+ } else { -+ /* insert before */ -+ len = len * sizeof(struct ext3_extent_idx); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert new index %d before: %d. " -+ "move %d from 0x%p to 0x%p\n", -+ logical, ptr, len, -+ curp->p_idx, (curp->p_idx + 1)); -+ memmove(curp->p_idx + 1, curp->p_idx, len); -+ ix = curp->p_idx; -+ } -+ -+ ix->ei_block = logical; -+ ix->ei_leaf = ptr; -+ ix->ei_leaf_hi = ix->ei_unused = 0; -+ curp->p_hdr->eh_entries++; -+ -+ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); -+ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); -+ -+ err = ext3_ext_dirty(handle, tree, curp); -+ ext3_std_error(tree->inode->i_sb, err); -+ -+ return err; -+} -+ -+/* -+ * routine inserts new subtree into the path, using free index entry -+ * at depth 'at: -+ * - allocates all needed blocks (new leaf and all intermediate index blocks) -+ * - makes decision where to split -+ * - moves remaining extens and index entries (right to the split point) -+ * into the newly allocated blocks -+ * - initialize subtree -+ */ -+static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext, int at) -+{ -+ struct buffer_head *bh = NULL; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent_header *neh; -+ struct ext3_extent_idx *fidx; -+ struct ext3_extent *ex; -+ 
int i = at, k, m, a; -+ unsigned long newblock, oldblock, border; -+ int *ablocks = NULL; /* array of allocated blocks */ -+ int err = 0; -+ -+ /* make decision: where to split? */ -+ /* FIXME: now desicion is simplest: at current extent */ -+ -+ /* if current leaf will be splitted, then we should use -+ * border from split point */ -+ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); -+ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { -+ border = path[depth].p_ext[1].ee_block; -+ ext_debug(tree, "leaf will be splitted." -+ " next leaf starts at %d\n", -+ (int)border); -+ } else { -+ border = newext->ee_block; -+ ext_debug(tree, "leaf will be added." -+ " next leaf starts at %d\n", -+ (int)border); -+ } -+ -+ /* -+ * if error occurs, then we break processing -+ * and turn filesystem read-only. so, index won't -+ * be inserted and tree will be in consistent -+ * state. next mount will repair buffers too -+ */ -+ -+ /* -+ * get array to track all allocated blocks -+ * we need this to handle errors and free blocks -+ * upon them -+ */ -+ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); -+ if (!ablocks) -+ return -ENOMEM; -+ memset(ablocks, 0, sizeof(unsigned long) * depth); -+ -+ /* allocate all needed blocks */ -+ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); -+ for (a = 0; a < depth - at; a++) { -+ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); -+ if (newblock == 0) -+ goto cleanup; -+ ablocks[a] = newblock; -+ } -+ -+ /* initialize new leaf */ -+ newblock = ablocks[--a]; -+ EXT_ASSERT(newblock); -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) -+ goto cleanup; -+ -+ neh = EXT_BLOCK_HDR(bh); -+ neh->eh_entries = 0; -+ neh->eh_max = ext3_ext_space_block(tree); -+ neh->eh_magic = EXT3_EXT_MAGIC; -+ neh->eh_depth = 0; -+ ex = EXT_FIRST_EXTENT(neh); -+ -+ /* 
move remain of path[depth] to the new leaf */ -+ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); -+ /* start copy from next extent */ -+ /* TODO: we could do it by single memmove */ -+ m = 0; -+ path[depth].p_ext++; -+ while (path[depth].p_ext <= -+ EXT_MAX_EXTENT(path[depth].p_hdr)) { -+ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", -+ path[depth].p_ext->ee_block, -+ path[depth].p_ext->ee_start, -+ path[depth].p_ext->ee_len, -+ newblock); -+ memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent)); -+ neh->eh_entries++; -+ m++; -+ } -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ goto cleanup; -+ brelse(bh); -+ bh = NULL; -+ -+ /* correct old leaf */ -+ if (m) { -+ if ((err = ext3_ext_get_access(handle, tree, path + depth))) -+ goto cleanup; -+ path[depth].p_hdr->eh_entries -= m; -+ if ((err = ext3_ext_dirty(handle, tree, path + depth))) -+ goto cleanup; -+ -+ } -+ -+ /* create intermediate indexes */ -+ k = depth - at - 1; -+ EXT_ASSERT(k >= 0); -+ if (k) -+ ext_debug(tree, "create %d intermediate indices\n", k); -+ /* insert new index into current index block */ -+ /* current depth stored in i var */ -+ i = depth - 1; -+ while (k--) { -+ oldblock = newblock; -+ newblock = ablocks[--a]; -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) -+ goto cleanup; -+ -+ neh = EXT_BLOCK_HDR(bh); -+ neh->eh_entries = 1; -+ neh->eh_magic = EXT3_EXT_MAGIC; -+ neh->eh_max = ext3_ext_space_block_idx(tree); -+ neh->eh_depth = depth - i; -+ fidx = EXT_FIRST_INDEX(neh); -+ fidx->ei_block = border; -+ fidx->ei_leaf = oldblock; -+ fidx->ei_leaf_hi = fidx->ei_unused = 0; -+ -+ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", -+ i, newblock, border, oldblock); -+ /* copy indexes */ -+ m = 0; -+ path[i].p_idx++; -+ -+ ext_debug(tree, "cur 0x%p, last 
0x%p\n", path[i].p_idx, -+ EXT_MAX_INDEX(path[i].p_hdr)); -+ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == -+ EXT_LAST_INDEX(path[i].p_hdr)); -+ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { -+ ext_debug(tree, "%d: move %d:%d in new index %lu\n", -+ i, path[i].p_idx->ei_block, -+ path[i].p_idx->ei_leaf, newblock); -+ memmove(++fidx, path[i].p_idx++, -+ sizeof(struct ext3_extent_idx)); -+ neh->eh_entries++; -+ EXT_ASSERT(neh->eh_entries <= neh->eh_max); -+ m++; -+ } -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ goto cleanup; -+ brelse(bh); -+ bh = NULL; -+ -+ /* correct old index */ -+ if (m) { -+ err = ext3_ext_get_access(handle, tree, path + i); -+ if (err) -+ goto cleanup; -+ path[i].p_hdr->eh_entries -= m; -+ err = ext3_ext_dirty(handle, tree, path + i); -+ if (err) -+ goto cleanup; -+ } -+ -+ i--; -+ } -+ -+ /* insert new index */ -+ if (!err) -+ err = ext3_ext_insert_index(handle, tree, path + at, -+ border, newblock); -+ -+cleanup: -+ if (bh) { -+ if (buffer_locked(bh)) -+ unlock_buffer(bh); -+ brelse(bh); -+ } -+ -+ if (err) { -+ /* free all allocated blocks in error case */ -+ for (i = 0; i < depth; i++) { -+ if (!ablocks[i]) -+ continue; -+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); -+ } -+ } -+ kfree(ablocks); -+ -+ return err; -+} -+ -+/* -+ * routine implements tree growing procedure: -+ * - allocates new block -+ * - moves top-level data (index block or leaf) into the new block -+ * - initialize new top-level, creating index that points to the -+ * just created block -+ */ -+static int ext3_ext_grow_indepth(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) -+{ -+ struct ext3_ext_path *curp = path; -+ struct ext3_extent_header *neh; -+ struct ext3_extent_idx *fidx; -+ struct buffer_head *bh; -+ unsigned long newblock; -+ int err = 0; -+ -+ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); -+ 
if (newblock == 0) -+ return err; -+ -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ ext3_std_error(tree->inode->i_sb, err); -+ return err; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) { -+ unlock_buffer(bh); -+ goto out; -+ } -+ -+ /* move top-level index/leaf into new block */ -+ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); -+ -+ /* set size of new block */ -+ neh = EXT_BLOCK_HDR(bh); -+ /* old root could have indexes or leaves -+ * so calculate eh_max right way */ -+ if (EXT_DEPTH(tree)) -+ neh->eh_max = ext3_ext_space_block_idx(tree); -+ else -+ neh->eh_max = ext3_ext_space_block(tree); -+ neh->eh_magic = EXT3_EXT_MAGIC; -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ goto out; -+ -+ /* create index in new top-level index: num,max,pointer */ -+ if ((err = ext3_ext_get_access(handle, tree, curp))) -+ goto out; -+ -+ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; -+ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); -+ curp->p_hdr->eh_entries = 1; -+ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); -+ /* FIXME: it works, but actually path[0] can be index */ -+ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; -+ curp->p_idx->ei_leaf = newblock; -+ curp->p_idx->ei_leaf_hi = curp->p_idx->ei_unused = 0; -+ -+ neh = EXT_ROOT_HDR(tree); -+ fidx = EXT_FIRST_INDEX(neh); -+ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", -+ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); -+ -+ neh->eh_depth = path->p_depth + 1; -+ err = ext3_ext_dirty(handle, tree, curp); -+out: -+ brelse(bh); -+ -+ return err; -+} -+ -+/* -+ * routine finds empty index and adds new leaf. 
if no free index found -+ * then it requests in-depth growing -+ */ -+static int ext3_ext_create_new_leaf(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) -+{ -+ struct ext3_ext_path *curp; -+ int depth, i, err = 0; -+ -+repeat: -+ i = depth = EXT_DEPTH(tree); -+ -+ /* walk up to the tree and look for free index entry */ -+ curp = path + depth; -+ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { -+ i--; -+ curp--; -+ } -+ -+ /* we use already allocated block for index block -+ * so, subsequent data blocks should be contigoues */ -+ if (EXT_HAS_FREE_INDEX(curp)) { -+ /* if we found index with free entry, then use that -+ * entry: create all needed subtree and add new leaf */ -+ err = ext3_ext_split(handle, tree, path, newext, i); -+ -+ /* refill path */ -+ ext3_ext_drop_refs(path); -+ path = ext3_ext_find_extent(tree, newext->ee_block, path); -+ if (IS_ERR(path)) -+ err = PTR_ERR(path); -+ } else { -+ /* tree is full, time to grow in depth */ -+ err = ext3_ext_grow_indepth(handle, tree, path, newext); -+ -+ /* refill path */ -+ ext3_ext_drop_refs(path); -+ path = ext3_ext_find_extent(tree, newext->ee_block, path); -+ if (IS_ERR(path)) -+ err = PTR_ERR(path); -+ -+ /* -+ * only first (depth 0 -> 1) produces free space -+ * in all other cases we have to split growed tree -+ */ -+ depth = EXT_DEPTH(tree); -+ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { -+ /* now we need split */ -+ goto repeat; -+ } -+ } -+ -+ if (err) -+ return err; -+ -+ return 0; -+} -+ -+/* -+ * returns allocated block in subsequent extent or EXT_MAX_BLOCK -+ * NOTE: it consider block number from index entry as -+ * allocated block. 
thus, index entries have to be consistent -+ * with leafs -+ */ -+static unsigned long -+ext3_ext_next_allocated_block(struct ext3_ext_path *path) -+{ -+ int depth; -+ -+ EXT_ASSERT(path != NULL); -+ depth = path->p_depth; -+ -+ if (depth == 0 && path->p_ext == NULL) -+ return EXT_MAX_BLOCK; -+ -+ /* FIXME: what if index isn't full ?! */ -+ while (depth >= 0) { -+ if (depth == path->p_depth) { -+ /* leaf */ -+ if (path[depth].p_ext != -+ EXT_LAST_EXTENT(path[depth].p_hdr)) -+ return path[depth].p_ext[1].ee_block; -+ } else { -+ /* index */ -+ if (path[depth].p_idx != -+ EXT_LAST_INDEX(path[depth].p_hdr)) -+ return path[depth].p_idx[1].ei_block; -+ } -+ depth--; -+ } -+ -+ return EXT_MAX_BLOCK; -+} -+ -+/* -+ * returns first allocated block from next leaf or EXT_MAX_BLOCK -+ */ -+static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int depth; -+ -+ EXT_ASSERT(path != NULL); -+ depth = path->p_depth; -+ -+ /* zero-tree has no leaf blocks at all */ -+ if (depth == 0) -+ return EXT_MAX_BLOCK; -+ -+ /* go to index block */ -+ depth--; -+ -+ while (depth >= 0) { -+ if (path[depth].p_idx != -+ EXT_LAST_INDEX(path[depth].p_hdr)) -+ return path[depth].p_idx[1].ei_block; -+ depth--; -+ } -+ -+ return EXT_MAX_BLOCK; -+} -+ -+/* -+ * if leaf gets modified and modified extent is first in the leaf -+ * then we have to correct all indexes above -+ * TODO: do we need to correct tree in all cases? 
-+ */ -+int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ struct ext3_extent_header *eh; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent *ex; -+ unsigned long border; -+ int k, err = 0; -+ -+ eh = path[depth].p_hdr; -+ ex = path[depth].p_ext; -+ EXT_ASSERT(ex); -+ EXT_ASSERT(eh); -+ -+ if (depth == 0) { -+ /* there is no tree at all */ -+ return 0; -+ } -+ -+ if (ex != EXT_FIRST_EXTENT(eh)) { -+ /* we correct tree if first leaf got modified only */ -+ return 0; -+ } -+ -+ /* -+ * TODO: we need correction if border is smaller then current one -+ */ -+ k = depth - 1; -+ border = path[depth].p_ext->ee_block; -+ if ((err = ext3_ext_get_access(handle, tree, path + k))) -+ return err; -+ path[k].p_idx->ei_block = border; -+ if ((err = ext3_ext_dirty(handle, tree, path + k))) -+ return err; -+ -+ while (k--) { -+ /* change all left-side indexes */ -+ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) -+ break; -+ if ((err = ext3_ext_get_access(handle, tree, path + k))) -+ break; -+ path[k].p_idx->ei_block = border; -+ if ((err = ext3_ext_dirty(handle, tree, path + k))) -+ break; -+ } -+ -+ return err; -+} -+ -+static int inline -+ext3_can_extents_be_merged(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex1, -+ struct ext3_extent *ex2) -+{ -+ if (ex1->ee_block + ex1->ee_len != ex2->ee_block) -+ return 0; -+ -+#ifdef AGRESSIVE_TEST -+ if (ex1->ee_len >= 4) -+ return 0; -+#endif -+ -+ if (!tree->ops->mergable) -+ return 1; -+ -+ return tree->ops->mergable(ex1, ex2); -+} -+ -+/* -+ * this routine tries to merge requsted extent into the existing -+ * extent or inserts requested extent as new one into the tree, -+ * creating new leaf in no-space case -+ */ -+int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) -+{ -+ struct ext3_extent_header * eh; -+ struct ext3_extent *ex, *fex; -+ struct ext3_extent 
*nearex; /* nearest extent */ -+ struct ext3_ext_path *npath = NULL; -+ int depth, len, err, next; -+ -+ EXT_ASSERT(newext->ee_len > 0); -+ depth = EXT_DEPTH(tree); -+ ex = path[depth].p_ext; -+ EXT_ASSERT(path[depth].p_hdr); -+ -+ /* try to insert block into found extent and return */ -+ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { -+ ext_debug(tree, "append %d block to %d:%d (from %d)\n", -+ newext->ee_len, ex->ee_block, ex->ee_len, -+ ex->ee_start); -+ if ((err = ext3_ext_get_access(handle, tree, path + depth))) -+ return err; -+ ex->ee_len += newext->ee_len; -+ eh = path[depth].p_hdr; -+ nearex = ex; -+ goto merge; -+ } -+ -+repeat: -+ depth = EXT_DEPTH(tree); -+ eh = path[depth].p_hdr; -+ if (eh->eh_entries < eh->eh_max) -+ goto has_space; -+ -+ /* probably next leaf has space for us? */ -+ fex = EXT_LAST_EXTENT(eh); -+ next = ext3_ext_next_leaf_block(tree, path); -+ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { -+ ext_debug(tree, "next leaf block - %d\n", next); -+ EXT_ASSERT(!npath); -+ npath = ext3_ext_find_extent(tree, next, NULL); -+ if (IS_ERR(npath)) -+ return PTR_ERR(npath); -+ EXT_ASSERT(npath->p_depth == path->p_depth); -+ eh = npath[depth].p_hdr; -+ if (eh->eh_entries < eh->eh_max) { -+ ext_debug(tree, "next leaf isnt full(%d)\n", -+ eh->eh_entries); -+ path = npath; -+ goto repeat; -+ } -+ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", -+ eh->eh_entries, eh->eh_max); -+ } -+ -+ /* -+ * there is no free space in found leaf -+ * we're gonna add new leaf in the tree -+ */ -+ err = ext3_ext_create_new_leaf(handle, tree, path, newext); -+ if (err) -+ goto cleanup; -+ depth = EXT_DEPTH(tree); -+ eh = path[depth].p_hdr; -+ -+has_space: -+ nearex = path[depth].p_ext; -+ -+ if ((err = ext3_ext_get_access(handle, tree, path + depth))) -+ goto cleanup; -+ -+ if (!nearex) { -+ /* there is no extent in this leaf, create first one */ -+ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", -+ newext->ee_block, 
newext->ee_start, -+ newext->ee_len); -+ path[depth].p_ext = EXT_FIRST_EXTENT(eh); -+ } else if (newext->ee_block > nearex->ee_block) { -+ EXT_ASSERT(newext->ee_block != nearex->ee_block); -+ if (nearex != EXT_LAST_EXTENT(eh)) { -+ len = EXT_MAX_EXTENT(eh) - nearex; -+ len = (len - 1) * sizeof(struct ext3_extent); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " -+ "move %d from 0x%p to 0x%p\n", -+ newext->ee_block, newext->ee_start, -+ newext->ee_len, -+ nearex, len, nearex + 1, nearex + 2); -+ memmove(nearex + 2, nearex + 1, len); -+ } -+ path[depth].p_ext = nearex + 1; -+ } else { -+ EXT_ASSERT(newext->ee_block != nearex->ee_block); -+ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " -+ "move %d from 0x%p to 0x%p\n", -+ newext->ee_block, newext->ee_start, newext->ee_len, -+ nearex, len, nearex + 1, nearex + 2); -+ memmove(nearex + 1, nearex, len); -+ path[depth].p_ext = nearex; -+ } -+ -+ eh->eh_entries++; -+ nearex = path[depth].p_ext; -+ nearex->ee_block = newext->ee_block; -+ nearex->ee_start = newext->ee_start; -+ nearex->ee_len = newext->ee_len; -+ /* FIXME: support for large fs */ -+ nearex->ee_start_hi = 0; -+ -+merge: -+ /* try to merge extents to the right */ -+ while (nearex < EXT_LAST_EXTENT(eh)) { -+ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) -+ break; -+ /* merge with next extent! 
*/ -+ nearex->ee_len += nearex[1].ee_len; -+ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { -+ len = (EXT_LAST_EXTENT(eh) - nearex - 1) * -+ sizeof(struct ext3_extent); -+ memmove(nearex + 1, nearex + 2, len); -+ } -+ eh->eh_entries--; -+ EXT_ASSERT(eh->eh_entries > 0); -+ } -+ -+ /* try to merge extents to the left */ -+ -+ /* time to correct all indexes above */ -+ err = ext3_ext_correct_indexes(handle, tree, path); -+ if (err) -+ goto cleanup; -+ -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ -+cleanup: -+ if (npath) { -+ ext3_ext_drop_refs(npath); -+ kfree(npath); -+ } -+ ext3_ext_tree_changed(tree); -+ ext3_ext_invalidate_cache(tree); -+ return err; -+} -+ -+int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, -+ unsigned long num, ext_prepare_callback func) -+{ -+ struct ext3_ext_path *path = NULL; -+ struct ext3_ext_cache cbex; -+ struct ext3_extent *ex; -+ unsigned long next, start = 0, end = 0; -+ unsigned long last = block + num; -+ int depth, exists, err = 0; -+ -+ EXT_ASSERT(tree); -+ EXT_ASSERT(func); -+ EXT_ASSERT(tree->inode); -+ EXT_ASSERT(tree->root); -+ -+ while (block < last && block != EXT_MAX_BLOCK) { -+ num = last - block; -+ /* find extent for this block */ -+ path = ext3_ext_find_extent(tree, block, path); -+ if (IS_ERR(path)) { -+ err = PTR_ERR(path); -+ path = NULL; -+ break; -+ } -+ -+ depth = EXT_DEPTH(tree); -+ EXT_ASSERT(path[depth].p_hdr); -+ ex = path[depth].p_ext; -+ next = ext3_ext_next_allocated_block(path); -+ -+ exists = 0; -+ if (!ex) { -+ /* there is no extent yet, so try to allocate -+ * all requested space */ -+ start = block; -+ end = block + num; -+ } else if (ex->ee_block > block) { -+ /* need to allocate space before found extent */ -+ start = block; -+ end = ex->ee_block; -+ if (block + num < end) -+ end = block + num; -+ } else if (block >= ex->ee_block + ex->ee_len) { -+ /* need to allocate space after found extent */ -+ start = block; -+ end = block + num; -+ if (end >= next) -+ end = next; 
-+ } else if (block >= ex->ee_block) { -+ /* -+ * some part of requested space is covered -+ * by found extent -+ */ -+ start = block; -+ end = ex->ee_block + ex->ee_len; -+ if (block + num < end) -+ end = block + num; -+ exists = 1; -+ } else { -+ BUG(); -+ } -+ EXT_ASSERT(end > start); -+ -+ if (!exists) { -+ cbex.ec_block = start; -+ cbex.ec_len = end - start; -+ cbex.ec_start = 0; -+ cbex.ec_type = EXT3_EXT_CACHE_GAP; -+ } else { -+ cbex.ec_block = ex->ee_block; -+ cbex.ec_len = ex->ee_len; -+ cbex.ec_start = ex->ee_start; -+ cbex.ec_type = EXT3_EXT_CACHE_EXTENT; -+ } -+ -+ EXT_ASSERT(cbex.ec_len > 0); -+ EXT_ASSERT(path[depth].p_hdr); -+ err = func(tree, path, &cbex); -+ ext3_ext_drop_refs(path); -+ -+ if (err < 0) -+ break; -+ if (err == EXT_REPEAT) -+ continue; -+ else if (err == EXT_BREAK) { -+ err = 0; -+ break; -+ } -+ -+ if (EXT_DEPTH(tree) != depth) { -+ /* depth was changed. we have to realloc path */ -+ kfree(path); -+ path = NULL; -+ } -+ -+ block = cbex.ec_block + cbex.ec_len; -+ } -+ -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ -+ return err; -+} -+ -+static inline void -+ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, -+ __u32 len, __u32 start, int type) -+{ -+ EXT_ASSERT(len > 0); -+ if (tree->cex) { -+ tree->cex->ec_type = type; -+ tree->cex->ec_block = block; -+ tree->cex->ec_len = len; -+ tree->cex->ec_start = start; -+ } -+} -+ -+/* -+ * this routine calculate boundaries of the gap requested block fits into -+ * and cache this gap -+ */ -+static inline void -+ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ unsigned long block) -+{ -+ int depth = EXT_DEPTH(tree); -+ unsigned long lblock, len; -+ struct ext3_extent *ex; -+ -+ if (!tree->cex) -+ return; -+ -+ ex = path[depth].p_ext; -+ if (ex == NULL) { -+ /* there is no extent yet, so gap is [0;-] */ -+ lblock = 0; -+ len = EXT_MAX_BLOCK; -+ ext_debug(tree, "cache gap(whole file):"); -+ } else if (block < 
ex->ee_block) { -+ lblock = block; -+ len = ex->ee_block - block; -+ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", -+ (unsigned long) block, -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len); -+ } else if (block >= ex->ee_block + ex->ee_len) { -+ lblock = ex->ee_block + ex->ee_len; -+ len = ext3_ext_next_allocated_block(path); -+ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len, -+ (unsigned long) block); -+ EXT_ASSERT(len > lblock); -+ len = len - lblock; -+ } else { -+ lblock = len = 0; -+ BUG(); -+ } -+ -+ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len); -+ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP); -+} -+ -+static inline int -+ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, -+ struct ext3_extent *ex) -+{ -+ struct ext3_ext_cache *cex = tree->cex; -+ -+ /* is there cache storage at all? */ -+ if (!cex) -+ return EXT3_EXT_CACHE_NO; -+ -+ /* has cache valid data? */ -+ if (cex->ec_type == EXT3_EXT_CACHE_NO) -+ return EXT3_EXT_CACHE_NO; -+ -+ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || -+ cex->ec_type == EXT3_EXT_CACHE_EXTENT); -+ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { -+ ex->ee_block = cex->ec_block; -+ ex->ee_start = cex->ec_start; -+ ex->ee_start_hi = 0; -+ ex->ee_len = cex->ec_len; -+ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", -+ (unsigned long) block, -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len, -+ (unsigned long) ex->ee_start); -+ return cex->ec_type; -+ } -+ -+ /* not in cache */ -+ return EXT3_EXT_CACHE_NO; -+} -+ -+/* -+ * routine removes index from the index block -+ * it's used in truncate case only. 
thus all requests are for -+ * last index in the block only -+ */ -+int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ struct buffer_head *bh; -+ int err; -+ -+ /* free index block */ -+ path--; -+ EXT_ASSERT(path->p_hdr->eh_entries); -+ if ((err = ext3_ext_get_access(handle, tree, path))) -+ return err; -+ path->p_hdr->eh_entries--; -+ if ((err = ext3_ext_dirty(handle, tree, path))) -+ return err; -+ ext_debug(tree, "index is empty, remove it, free block %d\n", -+ path->p_idx->ei_leaf); -+ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); -+ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); -+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); -+ return err; -+} -+ -+int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int depth = EXT_DEPTH(tree); -+ int needed; -+ -+ if (path) { -+ /* probably there is space in leaf? */ -+ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) -+ return 1; -+ } -+ -+ /* -+ * the worste case we're expecting is creation of the -+ * new root (growing in depth) with index splitting -+ * for splitting we have to consider depth + 1 because -+ * previous growing could increase it -+ */ -+ depth = depth + 1; -+ -+ /* -+ * growing in depth: -+ * block allocation + new root + old root -+ */ -+ needed = EXT3_ALLOC_NEEDED + 2; -+ -+ /* index split. 
we may need: -+ * allocate intermediate indexes and new leaf -+ * change two blocks at each level, but root -+ * modify root block (inode) -+ */ -+ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; -+ -+ return needed; -+} -+ -+static int -+ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, unsigned long start, -+ unsigned long end) -+{ -+ struct ext3_extent *ex, tex; -+ struct ext3_ext_path *npath; -+ int depth, creds, err; -+ -+ depth = EXT_DEPTH(tree); -+ ex = path[depth].p_ext; -+ EXT_ASSERT(ex); -+ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); -+ EXT_ASSERT(ex->ee_block < start); -+ -+ /* calculate tail extent */ -+ tex.ee_block = end + 1; -+ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); -+ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; -+ -+ creds = ext3_ext_calc_credits_for_insert(tree, path); -+ handle = ext3_ext_journal_restart(handle, creds); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ /* calculate head extent. use primary extent */ -+ err = ext3_ext_get_access(handle, tree, path + depth); -+ if (err) -+ return err; -+ ex->ee_len = start - ex->ee_block; -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ if (err) -+ return err; -+ -+ /* FIXME: some callback to free underlying resource -+ * and correct ee_start? 
*/ -+ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", -+ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); -+ -+ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); -+ if (IS_ERR(npath)) -+ return PTR_ERR(npath); -+ depth = EXT_DEPTH(tree); -+ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); -+ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); -+ -+ err = ext3_ext_insert_extent(handle, tree, npath, &tex); -+ ext3_ext_drop_refs(npath); -+ kfree(npath); -+ -+ return err; -+} -+ -+static int -+ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, unsigned long start, -+ unsigned long end) -+{ -+ struct ext3_extent *ex, *fu = NULL, *lu, *le; -+ int err = 0, correct_index = 0; -+ int depth = EXT_DEPTH(tree), credits; -+ struct ext3_extent_header *eh; -+ unsigned a, b, block, num; -+ -+ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); -+ if (!path[depth].p_hdr) -+ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); -+ eh = path[depth].p_hdr; -+ EXT_ASSERT(eh); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ -+ /* find where to start removing */ -+ le = ex = EXT_LAST_EXTENT(eh); -+ while (ex != EXT_FIRST_EXTENT(eh)) { -+ if (ex->ee_block <= end) -+ break; -+ ex--; -+ } -+ -+ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { -+ /* removal of internal part of the extent requested -+ * tail and head must be placed in different extent -+ * so, we have to insert one more extent */ -+ path[depth].p_ext = ex; -+ return ext3_ext_split_for_rm(handle, tree, path, start, end); -+ } -+ -+ lu = ex; -+ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) { -+ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); -+ path[depth].p_ext = ex; -+ -+ a = ex->ee_block > start ? ex->ee_block : start; -+ b = ex->ee_block + ex->ee_len - 1 < end ? 
-+ ex->ee_block + ex->ee_len - 1 : end; -+ -+ ext_debug(tree, " border %u:%u\n", a, b); -+ -+ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { -+ block = 0; -+ num = 0; -+ BUG(); -+ } else if (a != ex->ee_block) { -+ /* remove tail of the extent */ -+ block = ex->ee_block; -+ num = a - block; -+ } else if (b != ex->ee_block + ex->ee_len - 1) { -+ /* remove head of the extent */ -+ block = a; -+ num = b - a; -+ } else { -+ /* remove whole extent: excelent! */ -+ block = ex->ee_block; -+ num = 0; -+ EXT_ASSERT(a == ex->ee_block && -+ b == ex->ee_block + ex->ee_len - 1); -+ } -+ -+ if (ex == EXT_FIRST_EXTENT(eh)) -+ correct_index = 1; -+ -+ credits = 1; -+ if (correct_index) -+ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; -+ if (tree->ops->remove_extent_credits) -+ credits+=tree->ops->remove_extent_credits(tree,ex,a,b); -+ -+ handle = ext3_ext_journal_restart(handle, credits); -+ if (IS_ERR(handle)) { -+ err = PTR_ERR(handle); -+ goto out; -+ } -+ -+ err = ext3_ext_get_access(handle, tree, path + depth); -+ if (err) -+ goto out; -+ -+ if (tree->ops->remove_extent) -+ err = tree->ops->remove_extent(tree, ex, a, b); -+ if (err) -+ goto out; -+ -+ if (num == 0) { -+ /* this extent is removed entirely mark slot unused */ -+ ex->ee_start = ex->ee_start_hi = 0; -+ eh->eh_entries--; -+ fu = ex; -+ } -+ -+ ex->ee_block = block; -+ ex->ee_len = num; -+ -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ if (err) -+ goto out; -+ -+ ext_debug(tree, "new extent: %u:%u:%u\n", -+ ex->ee_block, ex->ee_len, ex->ee_start); -+ ex--; -+ } -+ -+ if (fu) { -+ /* reuse unused slots */ -+ while (lu < le) { -+ if (lu->ee_start) { -+ *fu = *lu; -+ lu->ee_start = lu->ee_start_hi = 0; -+ fu++; -+ } -+ lu++; -+ } -+ } -+ -+ if (correct_index && eh->eh_entries) -+ err = ext3_ext_correct_indexes(handle, tree, path); -+ -+ /* if this leaf is free, then we should -+ * remove it from index block above */ -+ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) -+ 
err = ext3_ext_rm_idx(handle, tree, path + depth); -+ -+out: -+ return err; -+} -+ -+ -+static struct ext3_extent_idx * -+ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) -+{ -+ struct ext3_extent_idx *ix; -+ -+ ix = EXT_LAST_INDEX(hdr); -+ while (ix != EXT_FIRST_INDEX(hdr)) { -+ if (ix->ei_block <= block) -+ break; -+ ix--; -+ } -+ return ix; -+} -+ -+/* -+ * returns 1 if current index have to be freed (even partial) -+ */ -+static int inline -+ext3_ext_more_to_rm(struct ext3_ext_path *path) -+{ -+ EXT_ASSERT(path->p_idx); -+ -+ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) -+ return 0; -+ -+ /* -+ * if truncate on deeper level happened it it wasn't partial -+ * so we have to consider current index for truncation -+ */ -+ if (path->p_hdr->eh_entries == path->p_block) -+ return 0; -+ return 1; -+} -+ -+int ext3_ext_remove_space(struct ext3_extents_tree *tree, -+ unsigned long start, unsigned long end) -+{ -+ struct inode *inode = tree->inode; -+ struct super_block *sb = inode->i_sb; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_ext_path *path; -+ handle_t *handle; -+ int i = 0, err = 0; -+ -+ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); -+ -+ /* probably first extent we're gonna free will be last in block */ -+ handle = ext3_journal_start(inode, depth + 1); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ ext3_ext_invalidate_cache(tree); -+ -+ /* -+ * we start scanning from right side freeing all the blocks -+ * after i_size and walking into the deep -+ */ -+ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); -+ if (IS_ERR(path)) { -+ ext3_error(sb, __FUNCTION__, "Can't allocate path array"); -+ ext3_journal_stop(handle); -+ return -ENOMEM; -+ } -+ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); -+ path[i].p_hdr = EXT_ROOT_HDR(tree); -+ -+ while (i >= 0 && err == 0) { -+ if (i == depth) { -+ /* this is leaf block */ -+ err = ext3_ext_rm_leaf(handle, tree, path, start, end); -+ 
/* root level have p_bh == NULL, brelse() eats this */ -+ brelse(path[i].p_bh); -+ i--; -+ continue; -+ } -+ -+ /* this is index block */ -+ if (!path[i].p_hdr) { -+ ext_debug(tree, "initialize header\n"); -+ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); -+ } -+ -+ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); -+ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); -+ -+ if (!path[i].p_idx) { -+ /* this level hasn't touched yet */ -+ path[i].p_idx = -+ ext3_ext_last_covered(path[i].p_hdr, end); -+ path[i].p_block = path[i].p_hdr->eh_entries + 1; -+ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", -+ path[i].p_hdr, path[i].p_hdr->eh_entries); -+ } else { -+ /* we've already was here, see at next index */ -+ path[i].p_idx--; -+ } -+ -+ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", -+ i, EXT_FIRST_INDEX(path[i].p_hdr), -+ path[i].p_idx); -+ if (ext3_ext_more_to_rm(path + i)) { -+ /* go to the next level */ -+ ext_debug(tree, "move to level %d (block %d)\n", -+ i + 1, path[i].p_idx->ei_leaf); -+ memset(path + i + 1, 0, sizeof(*path)); -+ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); -+ if (!path[i+1].p_bh) { -+ /* should we reset i_size? 
*/ -+ err = -EIO; -+ break; -+ } -+ /* put actual number of indexes to know is this -+ * number got changed at the next iteration */ -+ path[i].p_block = path[i].p_hdr->eh_entries; -+ i++; -+ } else { -+ /* we finish processing this index, go up */ -+ if (path[i].p_hdr->eh_entries == 0 && i > 0) { -+ /* index is empty, remove it -+ * handle must be already prepared by the -+ * truncatei_leaf() */ -+ err = ext3_ext_rm_idx(handle, tree, path + i); -+ } -+ /* root level have p_bh == NULL, brelse() eats this */ -+ brelse(path[i].p_bh); -+ i--; -+ ext_debug(tree, "return to level %d\n", i); -+ } -+ } -+ -+ /* TODO: flexible tree reduction should be here */ -+ if (path->p_hdr->eh_entries == 0) { -+ /* -+ * truncate to zero freed all the tree -+ * so, we need to correct eh_depth -+ */ -+ err = ext3_ext_get_access(handle, tree, path); -+ if (err == 0) { -+ EXT_ROOT_HDR(tree)->eh_depth = 0; -+ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree); -+ err = ext3_ext_dirty(handle, tree, path); -+ } -+ } -+ ext3_ext_tree_changed(tree); -+ -+ kfree(path); -+ ext3_journal_stop(handle); -+ -+ return err; -+} -+ -+int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks) -+{ -+ int lcap, icap, rcap, leafs, idxs, num; -+ -+ rcap = ext3_ext_space_root(tree); -+ if (blocks <= rcap) { -+ /* all extents fit to the root */ -+ return 0; -+ } -+ -+ rcap = ext3_ext_space_root_idx(tree); -+ lcap = ext3_ext_space_block(tree); -+ icap = ext3_ext_space_block_idx(tree); -+ -+ num = leafs = (blocks + lcap - 1) / lcap; -+ if (leafs <= rcap) { -+ /* all pointers to leafs fit to the root */ -+ return leafs; -+ } -+ -+ /* ok. 
we need separate index block(s) to link all leaf blocks */ -+ idxs = (leafs + icap - 1) / icap; -+ do { -+ num += idxs; -+ idxs = (idxs + icap - 1) / icap; -+ } while (idxs > rcap); -+ -+ return num; -+} -+ -+/* -+ * called at mount time -+ */ -+void ext3_ext_init(struct super_block *sb) -+{ -+ /* -+ * possible initialization would be here -+ */ -+ -+ if (test_opt(sb, EXTENTS)) { -+ printk("EXT3-fs: file extents enabled"); -+#ifdef AGRESSIVE_TEST -+ printk(", agressive tests"); -+#endif -+#ifdef CHECK_BINSEARCH -+ printk(", check binsearch"); -+#endif -+ printk("\n"); -+ } -+} -+ -+/* -+ * called at umount time -+ */ -+void ext3_ext_release(struct super_block *sb) -+{ -+} -+ -+/************************************************************************ -+ * VFS related routines -+ ************************************************************************/ -+ -+static int ext3_get_inode_write_access(handle_t *handle, void *buffer) -+{ -+ /* we use in-core data, not bh */ -+ return 0; -+} -+ -+static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) -+{ -+ struct inode *inode = buffer; -+ return ext3_mark_inode_dirty(handle, inode); -+} -+ -+static int ext3_ext_mergable(struct ext3_extent *ex1, -+ struct ext3_extent *ex2) -+{ -+ /* FIXME: support for large fs */ -+ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) -+ return 1; -+ return 0; -+} -+ -+static int -+ext3_remove_blocks_credits(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex, -+ unsigned long from, unsigned long to) -+{ -+ int needed; -+ -+ /* at present, extent can't cross block group */; -+ needed = 4; /* bitmap + group desc + sb + inode */ -+ -+#ifdef CONFIG_QUOTA -+ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; -+#endif -+ return needed; -+} -+ -+static int -+ext3_remove_blocks(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex, -+ unsigned long from, unsigned long to) -+{ -+ int needed = ext3_remove_blocks_credits(tree, ex, from, to); -+ handle_t *handle = 
ext3_journal_start(tree->inode, needed); -+ struct buffer_head *bh; -+ int i; -+ -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { -+ /* tail removal */ -+ unsigned long num, start; -+ num = ex->ee_block + ex->ee_len - from; -+ start = ex->ee_start + ex->ee_len - num; -+ ext_debug(tree, "free last %lu blocks starting %lu\n", -+ num, start); -+ for (i = 0; i < num; i++) { -+ bh = sb_find_get_block(tree->inode->i_sb, start + i); -+ ext3_forget(handle, 0, tree->inode, bh, start + i); -+ } -+ ext3_free_blocks(handle, tree->inode, start, num); -+ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { -+ printk("strange request: removal %lu-%lu from %u:%u\n", -+ from, to, ex->ee_block, ex->ee_len); -+ } else { -+ printk("strange request: removal(2) %lu-%lu from %u:%u\n", -+ from, to, ex->ee_block, ex->ee_len); -+ } -+ ext3_journal_stop(handle); -+ return 0; -+} -+ -+static int ext3_ext_find_goal(struct inode *inode, -+ struct ext3_ext_path *path, unsigned long block) -+{ -+ struct ext3_inode_info *ei = EXT3_I(inode); -+ unsigned long bg_start; -+ unsigned long colour; -+ int depth; -+ -+ if (path) { -+ struct ext3_extent *ex; -+ depth = path->p_depth; -+ -+ /* try to predict block placement */ -+ if ((ex = path[depth].p_ext)) -+ return ex->ee_start + (block - ex->ee_block); -+ -+ /* it looks index is empty -+ * try to find starting from index itself */ -+ if (path[depth].p_bh) -+ return path[depth].p_bh->b_blocknr; -+ } -+ -+ /* OK. 
use inode's group */ -+ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + -+ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); -+ colour = (current->pid % 16) * -+ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); -+ return bg_start + colour + block; -+} -+ -+static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *ex, int *err) -+{ -+ struct inode *inode = tree->inode; -+ int newblock, goal; -+ -+ EXT_ASSERT(path); -+ EXT_ASSERT(ex); -+ EXT_ASSERT(ex->ee_start); -+ EXT_ASSERT(ex->ee_len); -+ -+ /* reuse block from the extent to order data/metadata */ -+ newblock = ex->ee_start++; -+ ex->ee_len--; -+ if (ex->ee_len == 0) { -+ ex->ee_len = 1; -+ /* allocate new block for the extent */ -+ goal = ext3_ext_find_goal(inode, path, ex->ee_block); -+ ex->ee_start = ext3_new_block(handle, inode, goal, err); -+ ex->ee_start_hi = 0; -+ if (ex->ee_start == 0) { -+ /* error occured: restore old extent */ -+ ex->ee_start = newblock; -+ return 0; -+ } -+ } -+ return newblock; -+} -+ -+static struct ext3_extents_helpers ext3_blockmap_helpers = { -+ .get_write_access = ext3_get_inode_write_access, -+ .mark_buffer_dirty = ext3_mark_buffer_dirty, -+ .mergable = ext3_ext_mergable, -+ .new_block = ext3_new_block_cb, -+ .remove_extent = ext3_remove_blocks, -+ .remove_extent_credits = ext3_remove_blocks_credits, -+}; -+ -+void ext3_init_tree_desc(struct ext3_extents_tree *tree, -+ struct inode *inode) -+{ -+ tree->inode = inode; -+ tree->root = (void *) EXT3_I(inode)->i_data; -+ tree->buffer = (void *) inode; -+ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); -+ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; -+ tree->ops = &ext3_blockmap_helpers; -+} -+ -+int ext3_ext_get_block(handle_t *handle, struct inode *inode, -+ long iblock, struct buffer_head *bh_result, -+ int create, int extend_disksize) -+{ -+ struct ext3_ext_path *path = NULL; -+ struct ext3_extent 
newex; -+ struct ext3_extent *ex; -+ int goal, newblock, err = 0, depth; -+ struct ext3_extents_tree tree; -+ -+ clear_buffer_new(bh_result); -+ ext3_init_tree_desc(&tree, inode); -+ ext_debug(&tree, "block %d requested for inode %u\n", -+ (int) iblock, (unsigned) inode->i_ino); -+ mutex_lock(&EXT3_I(inode)->truncate_mutex); -+ -+ /* check in cache */ -+ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { -+ if (goal == EXT3_EXT_CACHE_GAP) { -+ if (!create) { -+ /* block isn't allocated yet and -+ * user don't want to allocate it */ -+ goto out2; -+ } -+ /* we should allocate requested block */ -+ } else if (goal == EXT3_EXT_CACHE_EXTENT) { -+ /* block is already allocated */ -+ newblock = iblock - newex.ee_block + newex.ee_start; -+ goto out; -+ } else { -+ EXT_ASSERT(0); -+ } -+ } -+ -+ /* find extent for this block */ -+ path = ext3_ext_find_extent(&tree, iblock, NULL); -+ if (IS_ERR(path)) { -+ err = PTR_ERR(path); -+ path = NULL; -+ goto out2; -+ } -+ -+ depth = EXT_DEPTH(&tree); -+ -+ /* -+ * consistent leaf must not be empty -+ * this situations is possible, though, _during_ tree modification -+ * this is why assert can't be put in ext3_ext_find_extent() -+ */ -+ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); -+ -+ if ((ex = path[depth].p_ext)) { -+ /* if found exent covers block, simple return it */ -+ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { -+ newblock = iblock - ex->ee_block + ex->ee_start; -+ ext_debug(&tree, "%d fit into %d:%d -> %d\n", -+ (int) iblock, ex->ee_block, ex->ee_len, -+ newblock); -+ ext3_ext_put_in_cache(&tree, ex->ee_block, -+ ex->ee_len, ex->ee_start, -+ EXT3_EXT_CACHE_EXTENT); -+ goto out; -+ } -+ } -+ -+ /* -+ * requested block isn't allocated yet -+ * we couldn't try to create block if create flag is zero -+ */ -+ if (!create) { -+ /* put just found gap into cache to speedup subsequest reqs */ -+ ext3_ext_put_gap_in_cache(&tree, path, iblock); -+ goto out2; -+ } -+ -+ /* allocate new block */ -+ 
goal = ext3_ext_find_goal(inode, path, iblock); -+ newblock = ext3_new_block(handle, inode, goal, &err); -+ if (!newblock) -+ goto out2; -+ ext_debug(&tree, "allocate new block: goal %d, found %d\n", -+ goal, newblock); -+ -+ /* try to insert new extent into found leaf and return */ -+ newex.ee_block = iblock; -+ newex.ee_start = newblock; -+ newex.ee_start_hi = 0; -+ newex.ee_len = 1; -+ err = ext3_ext_insert_extent(handle, &tree, path, &newex); -+ if (err) -+ goto out2; -+ -+ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ -+ /* previous routine could use block we allocated */ -+ newblock = newex.ee_start; -+ set_buffer_new(bh_result); -+ -+ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, -+ newex.ee_start, EXT3_EXT_CACHE_EXTENT); -+out: -+ ext3_ext_show_leaf(&tree, path); -+ map_bh(bh_result, inode->i_sb, newblock); -+out2: -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ mutex_unlock(&EXT3_I(inode)->truncate_mutex); -+ -+ return err; -+} -+ -+void ext3_ext_truncate(struct inode * inode, struct page *page) -+{ -+ struct address_space *mapping = inode->i_mapping; -+ struct super_block *sb = inode->i_sb; -+ struct ext3_extents_tree tree; -+ unsigned long last_block; -+ handle_t *handle; -+ int err = 0; -+ -+ ext3_init_tree_desc(&tree, inode); -+ -+ /* -+ * probably first extent we're gonna free will be last in block -+ */ -+ err = ext3_writepage_trans_blocks(inode) + 3; -+ handle = ext3_journal_start(inode, err); -+ if (IS_ERR(handle)) { -+ if (page) { -+ clear_highpage(page); -+ flush_dcache_page(page); -+ unlock_page(page); -+ page_cache_release(page); -+ } -+ return; -+ } -+ -+ if (page) -+ ext3_block_truncate_page(handle, page, mapping, inode->i_size); -+ -+ mutex_lock(&EXT3_I(inode)->truncate_mutex); -+ ext3_ext_invalidate_cache(&tree); -+ -+ /* -+ * TODO: optimization is possible here -+ * probably we need not scaning at all, -+ * because page truncation is 
enough -+ */ -+ if (ext3_orphan_add(handle, inode)) -+ goto out_stop; -+ -+ /* we have to know where to truncate from in crash case */ -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ ext3_mark_inode_dirty(handle, inode); -+ -+ last_block = (inode->i_size + sb->s_blocksize - 1) >> -+ EXT3_BLOCK_SIZE_BITS(sb); -+ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); -+ -+ /* In a multi-transaction truncate, we only make the final -+ * transaction synchronous */ -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ -+out_stop: -+ /* -+ * If this was a simple ftruncate(), and the file will remain alive -+ * then we need to clear up the orphan record which we created above. -+ * However, if this was a real unlink then we were called by -+ * ext3_delete_inode(), and we allow that function to clean up the -+ * orphan info for us. -+ */ -+ if (inode->i_nlink) -+ ext3_orphan_del(handle, inode); -+ -+ mutex_unlock(&EXT3_I(inode)->truncate_mutex); -+ ext3_journal_stop(handle); -+} -+ -+/* -+ * this routine calculate max number of blocks we could modify -+ * in order to allocate new block for an inode -+ */ -+int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) -+{ -+ struct ext3_extents_tree tree; -+ int needed; -+ -+ ext3_init_tree_desc(&tree, inode); -+ -+ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); -+ -+ /* caller want to allocate num blocks */ -+ needed *= num; -+ -+#ifdef CONFIG_QUOTA -+ /* -+ * FIXME: real calculation should be here -+ * it depends on blockmap format of qouta file -+ */ -+ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; -+#endif -+ -+ return needed; -+} -+ -+void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) -+{ -+ struct ext3_extents_tree tree; -+ -+ ext3_init_tree_desc(&tree, inode); -+ ext3_extent_tree_init(handle, &tree); -+} -+ -+int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) -+{ -+ struct ext3_extents_tree tree; -+ -+ ext3_init_tree_desc(&tree, inode); -+ return 
ext3_ext_calc_metadata_amount(&tree, blocks); -+} -+ -+static int -+ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_ext_cache *newex) -+{ -+ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; -+ -+ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) -+ return EXT_CONTINUE; -+ -+ if (buf->err < 0) -+ return EXT_BREAK; -+ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) -+ return EXT_BREAK; -+ -+ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { -+ buf->err++; -+ buf->cur += sizeof(*newex); -+ } else { -+ buf->err = -EFAULT; -+ return EXT_BREAK; -+ } -+ return EXT_CONTINUE; -+} -+ -+static int -+ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_ext_cache *ex) -+{ -+ struct ext3_extent_tree_stats *buf = -+ (struct ext3_extent_tree_stats *) tree->private; -+ int depth; -+ -+ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) -+ return EXT_CONTINUE; -+ -+ depth = EXT_DEPTH(tree); -+ buf->extents_num++; -+ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) -+ buf->leaf_num++; -+ return EXT_CONTINUE; -+} -+ -+int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, -+ unsigned long arg) -+{ -+ int err = 0; -+ -+ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) -+ return -EINVAL; -+ -+ if (cmd == EXT3_IOC_GET_EXTENTS) { -+ struct ext3_extent_buf buf; -+ struct ext3_extents_tree tree; -+ -+ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) -+ return -EFAULT; -+ -+ ext3_init_tree_desc(&tree, inode); -+ buf.cur = buf.buffer; -+ buf.err = 0; -+ tree.private = &buf; -+ mutex_lock(&EXT3_I(inode)->truncate_mutex); -+ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, -+ ext3_ext_store_extent_cb); -+ mutex_unlock(&EXT3_I(inode)->truncate_mutex); -+ if (err == 0) -+ err = buf.err; -+ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { -+ struct ext3_extent_tree_stats buf; -+ struct ext3_extents_tree tree; -+ -+ 
ext3_init_tree_desc(&tree, inode); -+ mutex_lock(&EXT3_I(inode)->truncate_mutex); -+ buf.depth = EXT_DEPTH(&tree); -+ buf.extents_num = 0; -+ buf.leaf_num = 0; -+ tree.private = &buf; -+ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, -+ ext3_ext_collect_stats_cb); -+ mutex_unlock(&EXT3_I(inode)->truncate_mutex); -+ if (!err) -+ err = copy_to_user((void *) arg, &buf, sizeof(buf)); -+ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { -+ struct ext3_extents_tree tree; -+ ext3_init_tree_desc(&tree, inode); -+ mutex_lock(&EXT3_I(inode)->truncate_mutex); -+ err = EXT_DEPTH(&tree); -+ mutex_unlock(&EXT3_I(inode)->truncate_mutex); -+ } -+ -+ return err; -+} -+ -+EXPORT_SYMBOL(ext3_init_tree_desc); -+EXPORT_SYMBOL(ext3_mark_inode_dirty); -+EXPORT_SYMBOL(ext3_ext_invalidate_cache); -+EXPORT_SYMBOL(ext3_ext_insert_extent); -+EXPORT_SYMBOL(ext3_ext_walk_space); -+EXPORT_SYMBOL(ext3_ext_find_goal); -+EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); -Index: linux-stage/fs/ext3/ialloc.c -=================================================================== ---- linux-stage.orig/fs/ext3/ialloc.c 2006-07-16 13:55:31.000000000 +0800 -+++ linux-stage/fs/ext3/ialloc.c 2006-07-16 14:10:20.000000000 +0800 -@@ -600,7 +600,7 @@ got: - ei->i_dir_start_lookup = 0; - ei->i_disksize = 0; - -- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; -+ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); - /* dirsync only applies to directories */ -@@ -644,6 +644,18 @@ got: - if (err) - goto fail_free_drop; - -+ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { -+ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; -+ ext3_extents_initialize_blockmap(handle, inode); -+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { -+ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); -+ if (err) goto fail; -+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); -+ 
BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); -+ } -+ } -+ - err = ext3_mark_inode_dirty(handle, inode); - if (err) { - ext3_std_error(sb, err); -Index: linux-stage/fs/ext3/inode.c -=================================================================== ---- linux-stage.orig/fs/ext3/inode.c 2006-07-16 13:55:31.000000000 +0800 -+++ linux-stage/fs/ext3/inode.c 2006-07-16 14:11:28.000000000 +0800 -@@ -40,7 +40,7 @@ - #include "iopen.h" - #include "acl.h" - --static int ext3_writepage_trans_blocks(struct inode *inode); -+int ext3_writepage_trans_blocks(struct inode *inode); - - /* - * Test whether an inode is a fast symlink. -@@ -944,6 +944,17 @@ out: - - #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32) - -+static inline int -+ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, -+ struct buffer_head *bh, int create, int extend_disksize) -+{ -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_get_block(handle, inode, block, bh, create, -+ extend_disksize); -+ return ext3_get_blocks_handle(handle, inode, block, 1, bh, create, -+ extend_disksize); -+} -+ - static int ext3_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) - { -@@ -984,8 +995,8 @@ static int ext3_get_block(struct inode * - - get_block: - if (ret == 0) { -- ret = ext3_get_blocks_handle(handle, inode, iblock, -- max_blocks, bh_result, create, 0); -+ ret = ext3_get_block_wrap(handle, inode, iblock, -+ bh_result, create, 0); - if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - ret = 0; -@@ -1008,7 +1019,7 @@ struct buffer_head *ext3_getblk(handle_t - dummy.b_state = 0; - dummy.b_blocknr = -1000; - buffer_trace_init(&dummy.b_history); -- err = ext3_get_blocks_handle(handle, inode, block, 1, -+ err = ext3_get_block_wrap(handle, inode, block, - &dummy, create, 1); - if (err == 1) { - err = 0; -@@ -1756,7 +1767,7 @@ void 
ext3_set_aops(struct inode *inode) - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ --static int ext3_block_truncate_page(handle_t *handle, struct page *page, -+int ext3_block_truncate_page(handle_t *handle, struct page *page, - struct address_space *mapping, loff_t from) - { - ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; -@@ -2260,6 +2271,9 @@ void ext3_truncate(struct inode *inode) - return; - } - -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_truncate(inode, page); -+ - handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { -@@ -3004,12 +3018,15 @@ err_out: - * block and work out the exact number of indirects which are touched. Pah. - */ - --static int ext3_writepage_trans_blocks(struct inode *inode) -+int ext3_writepage_trans_blocks(struct inode *inode) - { - int bpp = ext3_journal_blocks_per_page(inode); - int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 
5 : 3; - int ret; - -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_writepage_trans_blocks(inode, bpp); -+ - if (ext3_should_journal_data(inode)) - ret = 3 * (bpp + indirects) + 2; - else -@@ -3277,7 +3294,7 @@ int ext3_prep_san_write(struct inode *in - - /* alloc blocks one by one */ - for (i = 0; i < nblocks; i++) { -- ret = ext3_get_block_handle(handle, inode, blocks[i], -+ ret = ext3_get_blocks_handle(handle, inode, blocks[i], 1, - &bh_tmp, 1, 1); - if (ret) - break; -@@ -3337,7 +3354,7 @@ int ext3_map_inode_page(struct inode *in - if (blocks[i] != 0) - continue; - -- rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1, 1); -+ rc = ext3_get_blocks_handle(handle, inode, iblock, 1, &dummy, 1, 1); - if (rc) { - printk(KERN_INFO "ext3_map_inode_page: error reading " - "block %ld\n", iblock); -Index: linux-stage/fs/ext3/Makefile -=================================================================== ---- linux-stage.orig/fs/ext3/Makefile 2006-07-16 13:55:31.000000000 +0800 -+++ linux-stage/fs/ext3/Makefile 2006-07-16 14:10:21.000000000 +0800 -@@ -5,7 +5,8 @@ - obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ -- ioctl.o namei.o super.o symlink.o hash.o resize.o -+ ioctl.o namei.o super.o symlink.o hash.o resize.o \ -+ extents.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-stage/fs/ext3/super.c -=================================================================== ---- linux-stage.orig/fs/ext3/super.c 2006-07-16 13:55:31.000000000 +0800 -+++ linux-stage/fs/ext3/super.c 2006-07-16 14:10:21.000000000 +0800 -@@ -391,6 +391,7 @@ static void ext3_put_super (struct super - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); - if (!(sb->s_flags & MS_RDONLY)) { -@@ -455,6 +456,8 @@ static struct inode 
*ext3_alloc_inode(st - #endif - ei->i_block_alloc_info = NULL; - ei->vfs_inode.i_version = 1; -+ -+ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); - return &ei->vfs_inode; - } - -@@ -638,6 +641,7 @@ enum { - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -+ Opt_extents, Opt_noextents, Opt_extdebug, - Opt_grpquota - }; - -@@ -690,6 +694,9 @@ static match_table_t tokens = { - {Opt_iopen, "iopen"}, - {Opt_noiopen, "noiopen"}, - {Opt_iopen_nopriv, "iopen_nopriv"}, -+ {Opt_extents, "extents"}, -+ {Opt_noextents, "noextents"}, -+ {Opt_extdebug, "extdebug"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL}, - {Opt_resize, "resize"}, -@@ -1035,6 +1041,15 @@ clear_qf_name: - case Opt_bh: - clear_opt(sbi->s_mount_opt, NOBH); - break; -+ case Opt_extents: -+ set_opt (sbi->s_mount_opt, EXTENTS); -+ break; -+ case Opt_noextents: -+ clear_opt (sbi->s_mount_opt, EXTENTS); -+ break; -+ case Opt_extdebug: -+ set_opt (sbi->s_mount_opt, EXTDEBUG); -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1760,6 +1772,7 @@ static int ext3_fill_super (struct super - test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? 
"ordered": - "writeback"); - -+ ext3_ext_init(sb); - lock_kernel(); - return 0; - -Index: linux-stage/fs/ext3/ioctl.c -=================================================================== ---- linux-stage.orig/fs/ext3/ioctl.c 2006-07-16 13:55:31.000000000 +0800 -+++ linux-stage/fs/ext3/ioctl.c 2006-07-16 13:55:31.000000000 +0800 -@@ -135,6 +135,10 @@ flags_err: - mutex_unlock(&inode->i_mutex); - return err; - } -+ case EXT3_IOC_GET_EXTENTS: -+ case EXT3_IOC_GET_TREE_STATS: -+ case EXT3_IOC_GET_TREE_DEPTH: -+ return ext3_ext_ioctl(inode, filp, cmd, arg); - case EXT3_IOC_GETVERSION: - case EXT3_IOC_GETVERSION_OLD: - return put_user(inode->i_generation, (int __user *) arg); -Index: linux-stage/include/linux/ext3_fs.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs.h 2006-07-16 13:55:31.000000000 +0800 -+++ linux-stage/include/linux/ext3_fs.h 2006-07-16 14:10:21.000000000 +0800 -@@ -181,9 +181,10 @@ struct ext3_group_desc - #define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */ - #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ - #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ -+#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ - #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ - --#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ -+#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ - #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ - - /* -@@ -233,6 +234,9 @@ struct ext3_new_group_data { - #endif - #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) - #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) -+#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) -+#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) -+#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) - - /* - * Mount options -@@ -373,6 +377,8 @@ struct ext3_inode { - #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota 
*/ - #define EXT3_MOUNT_IOPEN 0x400000 /* Allow access via iopen */ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ -+#define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ -+#define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -563,11 +569,13 @@ static inline struct ext3_inode_info *EX - #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ - #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ - #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 -+#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ - - #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR - #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ - EXT3_FEATURE_INCOMPAT_RECOVER| \ -- EXT3_FEATURE_INCOMPAT_META_BG) -+ EXT3_FEATURE_INCOMPAT_META_BG| \ -+ EXT3_FEATURE_INCOMPAT_EXTENTS) - #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT3_FEATURE_RO_COMPAT_BTREE_DIR) -@@ -787,6 +795,9 @@ extern unsigned long ext3_count_free (st - - - /* inode.c */ -+extern int ext3_block_truncate_page(handle_t *, struct page *, -+ struct address_space *, loff_t); -+extern int ext3_writepage_trans_blocks(struct inode *inode); - int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext3_fsblk_t blocknr); - struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); -@@ -860,6 +870,16 @@ extern struct inode_operations ext3_spec - extern struct inode_operations ext3_symlink_inode_operations; - extern struct inode_operations ext3_fast_symlink_inode_operations; - -+/* extents.c */ -+extern int ext3_ext_writepage_trans_blocks(struct inode *, int); -+extern int ext3_ext_get_block(handle_t *, struct inode *, long, -+ struct buffer_head *, int, int); -+extern void ext3_ext_truncate(struct inode *, 
struct page *); -+extern void ext3_ext_init(struct super_block *); -+extern void ext3_ext_release(struct super_block *); -+extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); -+extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg); - - #endif /* __KERNEL__ */ - -Index: linux-stage/include/linux/ext3_extents.h -=================================================================== ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ linux-stage/include/linux/ext3_extents.h 2006-07-16 13:55:31.000000000 +0800 -@@ -0,0 +1,262 @@ -+/* -+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+#ifndef _LINUX_EXT3_EXTENTS -+#define _LINUX_EXT3_EXTENTS -+ -+/* -+ * with AGRESSIVE_TEST defined capacity of index/leaf blocks -+ * become very little, so index split, in-depth growing and -+ * other hard changes happens much more often -+ * this is for debug purposes only -+ */ -+#define AGRESSIVE_TEST_ -+ -+/* -+ * if CHECK_BINSEARCH defined, then results of binary search -+ * will be checked by linear search -+ */ -+#define CHECK_BINSEARCH_ -+ -+/* -+ * if EXT_DEBUG is defined you can use 'extdebug' mount option -+ * to get lots of info what's going on -+ */ -+#define EXT_DEBUG_ -+#ifdef EXT_DEBUG -+#define ext_debug(tree,fmt,a...) \ -+do { \ -+ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ -+ printk(fmt, ##a); \ -+} while (0); -+#else -+#define ext_debug(tree,fmt,a...) -+#endif -+ -+/* -+ * if EXT_STATS is defined then stats numbers are collected -+ * these number will be displayed at umount time -+ */ -+#define EXT_STATS_ -+ -+ -+#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */ -+ -+/* -+ * ext3_inode has i_block array (total 60 bytes) -+ * first 4 bytes are used to store: -+ * - tree depth (0 mean there is no tree yet. 
all extents in the inode) -+ * - number of alive extents in the inode -+ */ -+ -+/* -+ * this is extent on-disk structure -+ * it's used at the bottom of the tree -+ */ -+struct ext3_extent { -+ __u32 ee_block; /* first logical block extent covers */ -+ __u16 ee_len; /* number of blocks covered by extent */ -+ __u16 ee_start_hi; /* high 16 bits of physical block */ -+ __u32 ee_start; /* low 32 bigs of physical block */ -+}; -+ -+/* -+ * this is index on-disk structure -+ * it's used at all the levels, but the bottom -+ */ -+struct ext3_extent_idx { -+ __u32 ei_block; /* index covers logical blocks from 'block' */ -+ __u32 ei_leaf; /* pointer to the physical block of the next * -+ * level. leaf or next index could bet here */ -+ __u16 ei_leaf_hi; /* high 16 bits of physical block */ -+ __u16 ei_unused; -+}; -+ -+/* -+ * each block (leaves and indexes), even inode-stored has header -+ */ -+struct ext3_extent_header { -+ __u16 eh_magic; /* probably will support different formats */ -+ __u16 eh_entries; /* number of valid entries */ -+ __u16 eh_max; /* capacity of store in entries */ -+ __u16 eh_depth; /* has tree real underlaying blocks? 
*/ -+ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ -+}; -+ -+#define EXT3_EXT_MAGIC 0xf30a -+ -+/* -+ * array of ext3_ext_path contains path to some extent -+ * creation/lookup routines use it for traversal/splitting/etc -+ * truncate uses it to simulate recursive walking -+ */ -+struct ext3_ext_path { -+ __u32 p_block; -+ __u16 p_depth; -+ struct ext3_extent *p_ext; -+ struct ext3_extent_idx *p_idx; -+ struct ext3_extent_header *p_hdr; -+ struct buffer_head *p_bh; -+}; -+ -+/* -+ * structure for external API -+ */ -+ -+/* -+ * storage for cached extent -+ */ -+struct ext3_ext_cache { -+ __u32 ec_start; -+ __u32 ec_block; -+ __u32 ec_len; -+ __u32 ec_type; -+}; -+ -+#define EXT3_EXT_CACHE_NO 0 -+#define EXT3_EXT_CACHE_GAP 1 -+#define EXT3_EXT_CACHE_EXTENT 2 -+ -+/* -+ * ext3_extents_tree is used to pass initial information -+ * to top-level extents API -+ */ -+struct ext3_extents_helpers; -+struct ext3_extents_tree { -+ struct inode *inode; /* inode which tree belongs to */ -+ void *root; /* ptr to data top of tree resides at */ -+ void *buffer; /* will be passed as arg to ^^ routines */ -+ int buffer_len; -+ void *private; -+ struct ext3_ext_cache *cex;/* last found extent */ -+ struct ext3_extents_helpers *ops; -+}; -+ -+struct ext3_extents_helpers { -+ int (*get_write_access)(handle_t *h, void *buffer); -+ int (*mark_buffer_dirty)(handle_t *h, void *buffer); -+ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); -+ int (*remove_extent_credits)(struct ext3_extents_tree *, -+ struct ext3_extent *, unsigned long, -+ unsigned long); -+ int (*remove_extent)(struct ext3_extents_tree *, -+ struct ext3_extent *, unsigned long, -+ unsigned long); -+ int (*new_block)(handle_t *, struct ext3_extents_tree *, -+ struct ext3_ext_path *, struct ext3_extent *, -+ int *); -+}; -+ -+/* -+ * to be called by ext3_ext_walk_space() -+ * negative retcode - error -+ * positive retcode - signal for ext3_ext_walk_space(), see below -+ * callback must 
return valid extent (passed or newly created) -+ */ -+typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, -+ struct ext3_ext_path *, -+ struct ext3_ext_cache *); -+ -+#define EXT_CONTINUE 0 -+#define EXT_BREAK 1 -+#define EXT_REPEAT 2 -+ -+ -+#define EXT_MAX_BLOCK 0xffffffff -+ -+ -+#define EXT_FIRST_EXTENT(__hdr__) \ -+ ((struct ext3_extent *) (((char *) (__hdr__)) + \ -+ sizeof(struct ext3_extent_header))) -+#define EXT_FIRST_INDEX(__hdr__) \ -+ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ -+ sizeof(struct ext3_extent_header))) -+#define EXT_HAS_FREE_INDEX(__path__) \ -+ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) -+#define EXT_LAST_EXTENT(__hdr__) \ -+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) -+#define EXT_LAST_INDEX(__hdr__) \ -+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) -+#define EXT_MAX_EXTENT(__hdr__) \ -+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) -+#define EXT_MAX_INDEX(__hdr__) \ -+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) -+#define EXT_HDR_GEN(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) -+#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) -+#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ -+ -+#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) -+#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) -+#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) -+#define EXT_GENERATION(__tree__) EXT_HDR_GEN(EXT_ROOT_HDR(__tree__)) -+ -+#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); -+ -+#define EXT_CHECK_PATH(tree,path) \ -+{ \ -+ int depth = EXT_DEPTH(tree); \ -+ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ -+ BUG_ON((unsigned long) (path)[depth].p_idx < \ -+ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ -+ BUG_ON((unsigned long) (path)[depth].p_ext < \ -+ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ -+ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ 
-+ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ -+ && depth != 0); \ -+ BUG_ON((path)[0].p_depth != depth); \ -+} -+ -+ -+/* -+ * this structure is used to gather extents from the tree via ioctl -+ */ -+struct ext3_extent_buf { -+ unsigned long start; -+ int buflen; -+ void *buffer; -+ void *cur; -+ int err; -+}; -+ -+/* -+ * this structure is used to collect stats info about the tree -+ */ -+struct ext3_extent_tree_stats { -+ int depth; -+ int extents_num; -+ int leaf_num; -+}; -+ -+extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); -+extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); -+extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); -+extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); -+extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); -+extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); -+extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); -+extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); -+ -+static inline void -+ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) -+{ -+ if (tree->cex) -+ tree->cex->ec_type = EXT3_EXT_CACHE_NO; -+} -+ -+ -+#endif /* _LINUX_EXT3_EXTENTS */ -Index: linux-stage/include/linux/ext3_fs_i.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs_i.h 2006-07-16 13:55:30.000000000 +0800 -+++ linux-stage/include/linux/ext3_fs_i.h 2006-07-16 14:10:20.000000000 +0800 -@@ -142,6 +142,8 @@ struct ext3_inode_info { - */ - struct mutex truncate_mutex; - struct inode vfs_inode; -+ -+ __u32 i_cached_extent[4]; - }; - - #endif /* _LINUX_EXT3_FS_I */ diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch 
b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch deleted file mode 100644 index b6c37c1..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch +++ /dev/null @@ -1,2951 +0,0 @@ -%patch -Index: linux-2.6.5-sles9/fs/ext3/extents.c -=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2005-02-17 22:07:57.023609040 +0300 -+++ linux-2.6.5-sles9/fs/ext3/extents.c 2005-02-23 01:02:37.396435640 +0300 -@@ -0,0 +1,2361 @@ -+/* -+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+/* -+ * Extents support for EXT3 -+ * -+ * TODO: -+ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() -+ * - ext3_ext_calc_credits() could take 'mergable' into account -+ * - ext3*_error() should be used in some situations -+ * - find_goal() [to be tested and improved] -+ * - smart tree reduction -+ * - arch-independence -+ * common on-disk format for big/little-endian arch -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+ -+static inline int ext3_ext_check_header(struct ext3_extent_header *eh) -+{ -+ if (eh->eh_magic != EXT3_EXT_MAGIC) { -+ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", -+ (unsigned)eh->eh_magic); -+ return -EIO; -+ } -+ if (eh->eh_max == 0) { -+ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n", -+ (unsigned)eh->eh_max); -+ return -EIO; -+ } -+ if (eh->eh_entries > eh->eh_max) { -+ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", -+ (unsigned)eh->eh_entries); -+ return -EIO; -+ } -+ return 0; -+} -+ -+static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) -+{ -+ int err; -+ -+ if (handle->h_buffer_credits > needed) -+ return handle; -+ if (!ext3_journal_extend(handle, needed)) -+ return handle; -+ err = ext3_journal_restart(handle, needed); -+ -+ return handle; -+} -+ -+static int inline -+ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) -+{ -+ if (tree->ops->get_write_access) -+ return tree->ops->get_write_access(h,tree->buffer); -+ else -+ return 0; -+} -+ -+static int inline -+ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) -+{ -+ if (tree->ops->mark_buffer_dirty) -+ return tree->ops->mark_buffer_dirty(h,tree->buffer); -+ else -+ return 
0; -+} -+ -+/* -+ * could return: -+ * - EROFS -+ * - ENOMEM -+ */ -+static int ext3_ext_get_access(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int err; -+ -+ if (path->p_bh) { -+ /* path points to block */ -+ err = ext3_journal_get_write_access(handle, path->p_bh); -+ } else { -+ /* path points to leaf/index in inode body */ -+ err = ext3_ext_get_access_for_root(handle, tree); -+ } -+ return err; -+} -+ -+/* -+ * could return: -+ * - EROFS -+ * - ENOMEM -+ * - EIO -+ */ -+static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int err; -+ if (path->p_bh) { -+ /* path points to block */ -+ err =ext3_journal_dirty_metadata(handle, path->p_bh); -+ } else { -+ /* path points to leaf/index in inode body */ -+ err = ext3_ext_mark_root_dirty(handle, tree); -+ } -+ return err; -+} -+ -+static int inline -+ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, struct ext3_extent *ex, -+ int *err) -+{ -+ int goal, depth, newblock; -+ struct inode *inode; -+ -+ EXT_ASSERT(tree); -+ if (tree->ops->new_block) -+ return tree->ops->new_block(handle, tree, path, ex, err); -+ -+ inode = tree->inode; -+ depth = EXT_DEPTH(tree); -+ if (path && depth > 0) { -+ goal = path[depth-1].p_block; -+ } else { -+ struct ext3_inode_info *ei = EXT3_I(inode); -+ unsigned long bg_start; -+ unsigned long colour; -+ -+ bg_start = (ei->i_block_group * -+ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + -+ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); -+ colour = (current->pid % 16) * -+ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); -+ goal = bg_start + colour; -+ } -+ -+ newblock = ext3_new_block(handle, inode, goal, err); -+ return newblock; -+} -+ -+static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) -+{ -+ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 
24) | -+ (EXT_HDR_GEN(neh) + 1); -+} -+ -+static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->inode->i_sb->s_blocksize - -+ sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent); -+#ifdef AGRESSIVE_TEST -+ size = 6; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->inode->i_sb->s_blocksize - -+ sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent_idx); -+#ifdef AGRESSIVE_TEST -+ size = 5; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent); -+#ifdef AGRESSIVE_TEST -+ size = 3; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent_idx); -+#ifdef AGRESSIVE_TEST -+ size = 4; -+#endif -+ return size; -+} -+ -+static void ext3_ext_show_path(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+#ifdef EXT_DEBUG -+ int k, l = path->p_depth; -+ -+ ext_debug(tree, "path:"); -+ for (k = 0; k <= l; k++, path++) { -+ if (path->p_idx) { -+ ext_debug(tree, " %d->%d", path->p_idx->ei_block, -+ path->p_idx->ei_leaf); -+ } else if (path->p_ext) { -+ ext_debug(tree, " %d:%d:%d", -+ path->p_ext->ee_block, -+ path->p_ext->ee_len, -+ path->p_ext->ee_start); -+ } else -+ ext_debug(tree, " []"); -+ } -+ ext_debug(tree, "\n"); -+#endif -+} -+ -+static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+#ifdef EXT_DEBUG -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent_header *eh; -+ struct ext3_extent *ex; -+ int i; -+ -+ if (!path) -+ return; -+ -+ eh = path[depth].p_hdr; -+ ex = EXT_FIRST_EXTENT(eh); -+ -+ for (i = 0; i 
< eh->eh_entries; i++, ex++) { -+ ext_debug(tree, "%d:%d:%d ", -+ ex->ee_block, ex->ee_len, ex->ee_start); -+ } -+ ext_debug(tree, "\n"); -+#endif -+} -+ -+static void ext3_ext_drop_refs(struct ext3_ext_path *path) -+{ -+ int depth = path->p_depth; -+ int i; -+ -+ for (i = 0; i <= depth; i++, path++) { -+ if (path->p_bh) { -+ brelse(path->p_bh); -+ path->p_bh = NULL; -+ } -+ } -+} -+ -+/* -+ * binary search for closest index by given block -+ */ -+static inline void -+ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, int block) -+{ -+ struct ext3_extent_header *eh = path->p_hdr; -+ struct ext3_extent_idx *ix; -+ int l = 0, k, r; -+ -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ EXT_ASSERT(eh->eh_entries > 0); -+ -+ ext_debug(tree, "binsearch for %d(idx): ", block); -+ -+ path->p_idx = ix = EXT_FIRST_INDEX(eh); -+ -+ r = k = eh->eh_entries; -+ while (k > 1) { -+ k = (r - l) / 2; -+ if (block < ix[l + k].ei_block) -+ r -= k; -+ else -+ l += k; -+ ext_debug(tree, "%d:%d:%d ", k, l, r); -+ } -+ -+ ix += l; -+ path->p_idx = ix; -+ ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf); -+ -+ while (l++ < r) { -+ if (block < ix->ei_block) -+ break; -+ path->p_idx = ix++; -+ } -+ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, -+ path->p_idx->ei_leaf); -+ -+#ifdef CHECK_BINSEARCH -+ { -+ struct ext3_extent_idx *chix; -+ -+ chix = ix = EXT_FIRST_INDEX(eh); -+ for (k = 0; k < eh->eh_entries; k++, ix++) { -+ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { -+ printk("k=%d, ix=0x%p, first=0x%p\n", k, -+ ix, EXT_FIRST_INDEX(eh)); -+ printk("%u <= %u\n", -+ ix->ei_block,ix[-1].ei_block); -+ } -+ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); -+ if (block < ix->ei_block) -+ break; -+ chix = ix; -+ } -+ EXT_ASSERT(chix == path->p_idx); -+ } -+#endif -+} -+ -+/* -+ * binary search for closest extent by given block -+ */ -+static inline void 
-+ext3_ext_binsearch(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, int block) -+{ -+ struct ext3_extent_header *eh = path->p_hdr; -+ struct ext3_extent *ex; -+ int l = 0, k, r; -+ -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ -+ if (eh->eh_entries == 0) { -+ /* -+ * this leaf is empty yet: -+ * we get such a leaf in split/add case -+ */ -+ return; -+ } -+ -+ ext_debug(tree, "binsearch for %d: ", block); -+ -+ path->p_ext = ex = EXT_FIRST_EXTENT(eh); -+ -+ r = k = eh->eh_entries; -+ while (k > 1) { -+ k = (r - l) / 2; -+ if (block < ex[l + k].ee_block) -+ r -= k; -+ else -+ l += k; -+ ext_debug(tree, "%d:%d:%d ", k, l, r); -+ } -+ -+ ex += l; -+ path->p_ext = ex; -+ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, -+ path->p_ext->ee_start, path->p_ext->ee_len); -+ -+ while (l++ < r) { -+ if (block < ex->ee_block) -+ break; -+ path->p_ext = ex++; -+ } -+ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, -+ path->p_ext->ee_start, path->p_ext->ee_len); -+ -+#ifdef CHECK_BINSEARCH -+ { -+ struct ext3_extent *chex; -+ -+ chex = ex = EXT_FIRST_EXTENT(eh); -+ for (k = 0; k < eh->eh_entries; k++, ex++) { -+ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); -+ if (block < ex->ee_block) -+ break; -+ chex = ex; -+ } -+ EXT_ASSERT(chex == path->p_ext); -+ } -+#endif -+} -+ -+int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) -+{ -+ struct ext3_extent_header *eh; -+ -+ BUG_ON(tree->buffer_len == 0); -+ ext3_ext_get_access_for_root(handle, tree); -+ eh = EXT_ROOT_HDR(tree); -+ eh->eh_depth = 0; -+ eh->eh_entries = 0; -+ eh->eh_magic = EXT3_EXT_MAGIC; -+ eh->eh_max = ext3_ext_space_root(tree); -+ ext3_ext_mark_root_dirty(handle, tree); -+ ext3_ext_invalidate_cache(tree); -+ return 0; -+} -+ -+struct ext3_ext_path * -+ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, -+ struct ext3_ext_path *path) -+{ -+ struct ext3_extent_header *eh; -+ struct buffer_head *bh; 
-+ int depth, i, ppos = 0; -+ -+ EXT_ASSERT(tree); -+ EXT_ASSERT(tree->inode); -+ EXT_ASSERT(tree->root); -+ -+ eh = EXT_ROOT_HDR(tree); -+ EXT_ASSERT(eh); -+ if (ext3_ext_check_header(eh)) { -+ /* don't free previously allocated path -+ * -- caller should take care */ -+ path = NULL; -+ goto err; -+ } -+ -+ i = depth = EXT_DEPTH(tree); -+ EXT_ASSERT(eh->eh_max); -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ -+ /* account possible depth increase */ -+ if (!path) { -+ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), -+ GFP_NOFS); -+ if (!path) -+ return ERR_PTR(-ENOMEM); -+ } -+ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); -+ path[0].p_hdr = eh; -+ -+ /* walk through the tree */ -+ while (i) { -+ ext_debug(tree, "depth %d: num %d, max %d\n", -+ ppos, eh->eh_entries, eh->eh_max); -+ ext3_ext_binsearch_idx(tree, path + ppos, block); -+ path[ppos].p_block = path[ppos].p_idx->ei_leaf; -+ path[ppos].p_depth = i; -+ path[ppos].p_ext = NULL; -+ -+ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); -+ if (!bh) -+ goto err; -+ -+ eh = EXT_BLOCK_HDR(bh); -+ ppos++; -+ EXT_ASSERT(ppos <= depth); -+ path[ppos].p_bh = bh; -+ path[ppos].p_hdr = eh; -+ i--; -+ -+ if (ext3_ext_check_header(eh)) -+ goto err; -+ } -+ -+ path[ppos].p_depth = i; -+ path[ppos].p_hdr = eh; -+ path[ppos].p_ext = NULL; -+ path[ppos].p_idx = NULL; -+ -+ if (ext3_ext_check_header(eh)) -+ goto err; -+ -+ /* find extent */ -+ ext3_ext_binsearch(tree, path + ppos, block); -+ -+ ext3_ext_show_path(tree, path); -+ -+ return path; -+ -+err: -+ printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ return ERR_PTR(-EIO); -+} -+ -+/* -+ * insert new index [logical;ptr] into the block at cupr -+ * it check where to insert: before curp or after curp -+ */ -+static int ext3_ext_insert_index(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *curp, -+ int logical, int ptr) -+{ -+ struct 
ext3_extent_idx *ix; -+ int len, err; -+ -+ if ((err = ext3_ext_get_access(handle, tree, curp))) -+ return err; -+ -+ EXT_ASSERT(logical != curp->p_idx->ei_block); -+ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; -+ if (logical > curp->p_idx->ei_block) { -+ /* insert after */ -+ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { -+ len = (len - 1) * sizeof(struct ext3_extent_idx); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert new index %d after: %d. " -+ "move %d from 0x%p to 0x%p\n", -+ logical, ptr, len, -+ (curp->p_idx + 1), (curp->p_idx + 2)); -+ memmove(curp->p_idx + 2, curp->p_idx + 1, len); -+ } -+ ix = curp->p_idx + 1; -+ } else { -+ /* insert before */ -+ len = len * sizeof(struct ext3_extent_idx); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert new index %d before: %d. " -+ "move %d from 0x%p to 0x%p\n", -+ logical, ptr, len, -+ curp->p_idx, (curp->p_idx + 1)); -+ memmove(curp->p_idx + 1, curp->p_idx, len); -+ ix = curp->p_idx; -+ } -+ -+ ix->ei_block = logical; -+ ix->ei_leaf = ptr; -+ ix->ei_leaf_hi = ix->ei_unused = 0; -+ curp->p_hdr->eh_entries++; -+ -+ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); -+ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); -+ -+ err = ext3_ext_dirty(handle, tree, curp); -+ ext3_std_error(tree->inode->i_sb, err); -+ -+ return err; -+} -+ -+/* -+ * routine inserts new subtree into the path, using free index entry -+ * at depth 'at: -+ * - allocates all needed blocks (new leaf and all intermediate index blocks) -+ * - makes decision where to split -+ * - moves remaining extens and index entries (right to the split point) -+ * into the newly allocated blocks -+ * - initialize subtree -+ */ -+static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext, int at) -+{ -+ struct buffer_head *bh = NULL; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent_header *neh; -+ struct ext3_extent_idx *fidx; -+ struct ext3_extent *ex; -+ 
int i = at, k, m, a; -+ unsigned long newblock, oldblock, border; -+ int *ablocks = NULL; /* array of allocated blocks */ -+ int err = 0; -+ -+ /* make decision: where to split? */ -+ /* FIXME: now desicion is simplest: at current extent */ -+ -+ /* if current leaf will be splitted, then we should use -+ * border from split point */ -+ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); -+ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { -+ border = path[depth].p_ext[1].ee_block; -+ ext_debug(tree, "leaf will be splitted." -+ " next leaf starts at %d\n", -+ (int)border); -+ } else { -+ border = newext->ee_block; -+ ext_debug(tree, "leaf will be added." -+ " next leaf starts at %d\n", -+ (int)border); -+ } -+ -+ /* -+ * if error occurs, then we break processing -+ * and turn filesystem read-only. so, index won't -+ * be inserted and tree will be in consistent -+ * state. next mount will repair buffers too -+ */ -+ -+ /* -+ * get array to track all allocated blocks -+ * we need this to handle errors and free blocks -+ * upon them -+ */ -+ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); -+ if (!ablocks) -+ return -ENOMEM; -+ memset(ablocks, 0, sizeof(unsigned long) * depth); -+ -+ /* allocate all needed blocks */ -+ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); -+ for (a = 0; a < depth - at; a++) { -+ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); -+ if (newblock == 0) -+ goto cleanup; -+ ablocks[a] = newblock; -+ } -+ -+ /* initialize new leaf */ -+ newblock = ablocks[--a]; -+ EXT_ASSERT(newblock); -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) -+ goto cleanup; -+ -+ neh = EXT_BLOCK_HDR(bh); -+ neh->eh_entries = 0; -+ neh->eh_max = ext3_ext_space_block(tree); -+ neh->eh_magic = EXT3_EXT_MAGIC; -+ neh->eh_depth = 0; -+ ex = EXT_FIRST_EXTENT(neh); -+ -+ /* 
move remain of path[depth] to the new leaf */ -+ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); -+ /* start copy from next extent */ -+ /* TODO: we could do it by single memmove */ -+ m = 0; -+ path[depth].p_ext++; -+ while (path[depth].p_ext <= -+ EXT_MAX_EXTENT(path[depth].p_hdr)) { -+ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", -+ path[depth].p_ext->ee_block, -+ path[depth].p_ext->ee_start, -+ path[depth].p_ext->ee_len, -+ newblock); -+ memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent)); -+ neh->eh_entries++; -+ m++; -+ } -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ goto cleanup; -+ brelse(bh); -+ bh = NULL; -+ -+ /* correct old leaf */ -+ if (m) { -+ if ((err = ext3_ext_get_access(handle, tree, path + depth))) -+ goto cleanup; -+ path[depth].p_hdr->eh_entries -= m; -+ if ((err = ext3_ext_dirty(handle, tree, path + depth))) -+ goto cleanup; -+ -+ } -+ -+ /* create intermediate indexes */ -+ k = depth - at - 1; -+ EXT_ASSERT(k >= 0); -+ if (k) -+ ext_debug(tree, "create %d intermediate indices\n", k); -+ /* insert new index into current index block */ -+ /* current depth stored in i var */ -+ i = depth - 1; -+ while (k--) { -+ oldblock = newblock; -+ newblock = ablocks[--a]; -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) -+ goto cleanup; -+ -+ neh = EXT_BLOCK_HDR(bh); -+ neh->eh_entries = 1; -+ neh->eh_magic = EXT3_EXT_MAGIC; -+ neh->eh_max = ext3_ext_space_block_idx(tree); -+ neh->eh_depth = depth - i; -+ fidx = EXT_FIRST_INDEX(neh); -+ fidx->ei_block = border; -+ fidx->ei_leaf = oldblock; -+ fidx->ei_leaf_hi = fidx->ei_unused = 0; -+ -+ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", -+ i, newblock, border, oldblock); -+ /* copy indexes */ -+ m = 0; -+ path[i].p_idx++; -+ -+ ext_debug(tree, "cur 0x%p, last 
0x%p\n", path[i].p_idx, -+ EXT_MAX_INDEX(path[i].p_hdr)); -+ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == -+ EXT_LAST_INDEX(path[i].p_hdr)); -+ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { -+ ext_debug(tree, "%d: move %d:%d in new index %lu\n", -+ i, path[i].p_idx->ei_block, -+ path[i].p_idx->ei_leaf, newblock); -+ memmove(++fidx, path[i].p_idx++, -+ sizeof(struct ext3_extent_idx)); -+ neh->eh_entries++; -+ EXT_ASSERT(neh->eh_entries <= neh->eh_max); -+ m++; -+ } -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ goto cleanup; -+ brelse(bh); -+ bh = NULL; -+ -+ /* correct old index */ -+ if (m) { -+ err = ext3_ext_get_access(handle, tree, path + i); -+ if (err) -+ goto cleanup; -+ path[i].p_hdr->eh_entries -= m; -+ err = ext3_ext_dirty(handle, tree, path + i); -+ if (err) -+ goto cleanup; -+ } -+ -+ i--; -+ } -+ -+ /* insert new index */ -+ if (!err) -+ err = ext3_ext_insert_index(handle, tree, path + at, -+ border, newblock); -+ -+cleanup: -+ if (bh) { -+ if (buffer_locked(bh)) -+ unlock_buffer(bh); -+ brelse(bh); -+ } -+ -+ if (err) { -+ /* free all allocated blocks in error case */ -+ for (i = 0; i < depth; i++) { -+ if (!ablocks[i]) -+ continue; -+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); -+ } -+ } -+ kfree(ablocks); -+ -+ return err; -+} -+ -+/* -+ * routine implements tree growing procedure: -+ * - allocates new block -+ * - moves top-level data (index block or leaf) into the new block -+ * - initialize new top-level, creating index that points to the -+ * just created block -+ */ -+static int ext3_ext_grow_indepth(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) -+{ -+ struct ext3_ext_path *curp = path; -+ struct ext3_extent_header *neh; -+ struct ext3_extent_idx *fidx; -+ struct buffer_head *bh; -+ unsigned long newblock; -+ int err = 0; -+ -+ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); -+ 
if (newblock == 0) -+ return err; -+ -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ ext3_std_error(tree->inode->i_sb, err); -+ return err; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) { -+ unlock_buffer(bh); -+ goto out; -+ } -+ -+ /* move top-level index/leaf into new block */ -+ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); -+ -+ /* set size of new block */ -+ neh = EXT_BLOCK_HDR(bh); -+ /* old root could have indexes or leaves -+ * so calculate eh_max right way */ -+ if (EXT_DEPTH(tree)) -+ neh->eh_max = ext3_ext_space_block_idx(tree); -+ else -+ neh->eh_max = ext3_ext_space_block(tree); -+ neh->eh_magic = EXT3_EXT_MAGIC; -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ goto out; -+ -+ /* create index in new top-level index: num,max,pointer */ -+ if ((err = ext3_ext_get_access(handle, tree, curp))) -+ goto out; -+ -+ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; -+ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); -+ curp->p_hdr->eh_entries = 1; -+ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); -+ /* FIXME: it works, but actually path[0] can be index */ -+ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; -+ curp->p_idx->ei_leaf = newblock; -+ curp->p_idx->ei_leaf_hi = curp->p_idx->ei_unused = 0; -+ -+ neh = EXT_ROOT_HDR(tree); -+ fidx = EXT_FIRST_INDEX(neh); -+ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", -+ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); -+ -+ neh->eh_depth = path->p_depth + 1; -+ err = ext3_ext_dirty(handle, tree, curp); -+out: -+ brelse(bh); -+ -+ return err; -+} -+ -+/* -+ * routine finds empty index and adds new leaf. 
if no free index found -+ * then it requests in-depth growing -+ */ -+static int ext3_ext_create_new_leaf(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) -+{ -+ struct ext3_ext_path *curp; -+ int depth, i, err = 0; -+ -+repeat: -+ i = depth = EXT_DEPTH(tree); -+ -+ /* walk up to the tree and look for free index entry */ -+ curp = path + depth; -+ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { -+ i--; -+ curp--; -+ } -+ -+ /* we use already allocated block for index block -+ * so, subsequent data blocks should be contigoues */ -+ if (EXT_HAS_FREE_INDEX(curp)) { -+ /* if we found index with free entry, then use that -+ * entry: create all needed subtree and add new leaf */ -+ err = ext3_ext_split(handle, tree, path, newext, i); -+ -+ /* refill path */ -+ ext3_ext_drop_refs(path); -+ path = ext3_ext_find_extent(tree, newext->ee_block, path); -+ if (IS_ERR(path)) -+ err = PTR_ERR(path); -+ } else { -+ /* tree is full, time to grow in depth */ -+ err = ext3_ext_grow_indepth(handle, tree, path, newext); -+ -+ /* refill path */ -+ ext3_ext_drop_refs(path); -+ path = ext3_ext_find_extent(tree, newext->ee_block, path); -+ if (IS_ERR(path)) -+ err = PTR_ERR(path); -+ -+ /* -+ * only first (depth 0 -> 1) produces free space -+ * in all other cases we have to split growed tree -+ */ -+ depth = EXT_DEPTH(tree); -+ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { -+ /* now we need split */ -+ goto repeat; -+ } -+ } -+ -+ if (err) -+ return err; -+ -+ return 0; -+} -+ -+/* -+ * returns allocated block in subsequent extent or EXT_MAX_BLOCK -+ * NOTE: it consider block number from index entry as -+ * allocated block. 
thus, index entries have to be consistent -+ * with leafs -+ */ -+static unsigned long -+ext3_ext_next_allocated_block(struct ext3_ext_path *path) -+{ -+ int depth; -+ -+ EXT_ASSERT(path != NULL); -+ depth = path->p_depth; -+ -+ if (depth == 0 && path->p_ext == NULL) -+ return EXT_MAX_BLOCK; -+ -+ /* FIXME: what if index isn't full ?! */ -+ while (depth >= 0) { -+ if (depth == path->p_depth) { -+ /* leaf */ -+ if (path[depth].p_ext != -+ EXT_LAST_EXTENT(path[depth].p_hdr)) -+ return path[depth].p_ext[1].ee_block; -+ } else { -+ /* index */ -+ if (path[depth].p_idx != -+ EXT_LAST_INDEX(path[depth].p_hdr)) -+ return path[depth].p_idx[1].ei_block; -+ } -+ depth--; -+ } -+ -+ return EXT_MAX_BLOCK; -+} -+ -+/* -+ * returns first allocated block from next leaf or EXT_MAX_BLOCK -+ */ -+static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int depth; -+ -+ EXT_ASSERT(path != NULL); -+ depth = path->p_depth; -+ -+ /* zero-tree has no leaf blocks at all */ -+ if (depth == 0) -+ return EXT_MAX_BLOCK; -+ -+ /* go to index block */ -+ depth--; -+ -+ while (depth >= 0) { -+ if (path[depth].p_idx != -+ EXT_LAST_INDEX(path[depth].p_hdr)) -+ return path[depth].p_idx[1].ei_block; -+ depth--; -+ } -+ -+ return EXT_MAX_BLOCK; -+} -+ -+/* -+ * if leaf gets modified and modified extent is first in the leaf -+ * then we have to correct all indexes above -+ * TODO: do we need to correct tree in all cases? 
-+ */ -+int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ struct ext3_extent_header *eh; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent *ex; -+ unsigned long border; -+ int k, err = 0; -+ -+ eh = path[depth].p_hdr; -+ ex = path[depth].p_ext; -+ EXT_ASSERT(ex); -+ EXT_ASSERT(eh); -+ -+ if (depth == 0) { -+ /* there is no tree at all */ -+ return 0; -+ } -+ -+ if (ex != EXT_FIRST_EXTENT(eh)) { -+ /* we correct tree if first leaf got modified only */ -+ return 0; -+ } -+ -+ /* -+ * TODO: we need correction if border is smaller then current one -+ */ -+ k = depth - 1; -+ border = path[depth].p_ext->ee_block; -+ if ((err = ext3_ext_get_access(handle, tree, path + k))) -+ return err; -+ path[k].p_idx->ei_block = border; -+ if ((err = ext3_ext_dirty(handle, tree, path + k))) -+ return err; -+ -+ while (k--) { -+ /* change all left-side indexes */ -+ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) -+ break; -+ if ((err = ext3_ext_get_access(handle, tree, path + k))) -+ break; -+ path[k].p_idx->ei_block = border; -+ if ((err = ext3_ext_dirty(handle, tree, path + k))) -+ break; -+ } -+ -+ return err; -+} -+ -+static int inline -+ext3_can_extents_be_merged(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex1, -+ struct ext3_extent *ex2) -+{ -+ if (ex1->ee_block + ex1->ee_len != ex2->ee_block) -+ return 0; -+ -+#ifdef AGRESSIVE_TEST -+ if (ex1->ee_len >= 4) -+ return 0; -+#endif -+ -+ if (!tree->ops->mergable) -+ return 1; -+ -+ return tree->ops->mergable(ex1, ex2); -+} -+ -+/* -+ * this routine tries to merge requsted extent into the existing -+ * extent or inserts requested extent as new one into the tree, -+ * creating new leaf in no-space case -+ */ -+int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) -+{ -+ struct ext3_extent_header * eh; -+ struct ext3_extent *ex, *fex; -+ struct ext3_extent 
*nearex; /* nearest extent */ -+ struct ext3_ext_path *npath = NULL; -+ int depth, len, err, next; -+ -+ EXT_ASSERT(newext->ee_len > 0); -+ depth = EXT_DEPTH(tree); -+ ex = path[depth].p_ext; -+ EXT_ASSERT(path[depth].p_hdr); -+ -+ /* try to insert block into found extent and return */ -+ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { -+ ext_debug(tree, "append %d block to %d:%d (from %d)\n", -+ newext->ee_len, ex->ee_block, ex->ee_len, -+ ex->ee_start); -+ if ((err = ext3_ext_get_access(handle, tree, path + depth))) -+ return err; -+ ex->ee_len += newext->ee_len; -+ eh = path[depth].p_hdr; -+ nearex = ex; -+ goto merge; -+ } -+ -+repeat: -+ depth = EXT_DEPTH(tree); -+ eh = path[depth].p_hdr; -+ if (eh->eh_entries < eh->eh_max) -+ goto has_space; -+ -+ /* probably next leaf has space for us? */ -+ fex = EXT_LAST_EXTENT(eh); -+ next = ext3_ext_next_leaf_block(tree, path); -+ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { -+ ext_debug(tree, "next leaf block - %d\n", next); -+ EXT_ASSERT(!npath); -+ npath = ext3_ext_find_extent(tree, next, NULL); -+ if (IS_ERR(npath)) -+ return PTR_ERR(npath); -+ EXT_ASSERT(npath->p_depth == path->p_depth); -+ eh = npath[depth].p_hdr; -+ if (eh->eh_entries < eh->eh_max) { -+ ext_debug(tree, "next leaf isnt full(%d)\n", -+ eh->eh_entries); -+ path = npath; -+ goto repeat; -+ } -+ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", -+ eh->eh_entries, eh->eh_max); -+ } -+ -+ /* -+ * there is no free space in found leaf -+ * we're gonna add new leaf in the tree -+ */ -+ err = ext3_ext_create_new_leaf(handle, tree, path, newext); -+ if (err) -+ goto cleanup; -+ depth = EXT_DEPTH(tree); -+ eh = path[depth].p_hdr; -+ -+has_space: -+ nearex = path[depth].p_ext; -+ -+ if ((err = ext3_ext_get_access(handle, tree, path + depth))) -+ goto cleanup; -+ -+ if (!nearex) { -+ /* there is no extent in this leaf, create first one */ -+ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", -+ newext->ee_block, 
newext->ee_start, -+ newext->ee_len); -+ path[depth].p_ext = EXT_FIRST_EXTENT(eh); -+ } else if (newext->ee_block > nearex->ee_block) { -+ EXT_ASSERT(newext->ee_block != nearex->ee_block); -+ if (nearex != EXT_LAST_EXTENT(eh)) { -+ len = EXT_MAX_EXTENT(eh) - nearex; -+ len = (len - 1) * sizeof(struct ext3_extent); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " -+ "move %d from 0x%p to 0x%p\n", -+ newext->ee_block, newext->ee_start, -+ newext->ee_len, -+ nearex, len, nearex + 1, nearex + 2); -+ memmove(nearex + 2, nearex + 1, len); -+ } -+ path[depth].p_ext = nearex + 1; -+ } else { -+ EXT_ASSERT(newext->ee_block != nearex->ee_block); -+ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " -+ "move %d from 0x%p to 0x%p\n", -+ newext->ee_block, newext->ee_start, newext->ee_len, -+ nearex, len, nearex + 1, nearex + 2); -+ memmove(nearex + 1, nearex, len); -+ path[depth].p_ext = nearex; -+ } -+ -+ eh->eh_entries++; -+ nearex = path[depth].p_ext; -+ nearex->ee_block = newext->ee_block; -+ nearex->ee_start = newext->ee_start; -+ nearex->ee_len = newext->ee_len; -+ /* FIXME: support for large fs */ -+ nearex->ee_start_hi = 0; -+ -+merge: -+ /* try to merge extents to the right */ -+ while (nearex < EXT_LAST_EXTENT(eh)) { -+ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) -+ break; -+ /* merge with next extent! 
*/ -+ nearex->ee_len += nearex[1].ee_len; -+ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { -+ len = (EXT_LAST_EXTENT(eh) - nearex - 1) * -+ sizeof(struct ext3_extent); -+ memmove(nearex + 1, nearex + 2, len); -+ } -+ eh->eh_entries--; -+ EXT_ASSERT(eh->eh_entries > 0); -+ } -+ -+ /* try to merge extents to the left */ -+ -+ /* time to correct all indexes above */ -+ err = ext3_ext_correct_indexes(handle, tree, path); -+ if (err) -+ goto cleanup; -+ -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ -+cleanup: -+ if (npath) { -+ ext3_ext_drop_refs(npath); -+ kfree(npath); -+ } -+ ext3_ext_tree_changed(tree); -+ ext3_ext_invalidate_cache(tree); -+ return err; -+} -+ -+int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, -+ unsigned long num, ext_prepare_callback func) -+{ -+ struct ext3_ext_path *path = NULL; -+ struct ext3_ext_cache cbex; -+ struct ext3_extent *ex; -+ unsigned long next, start = 0, end = 0; -+ unsigned long last = block + num; -+ int depth, exists, err = 0; -+ -+ EXT_ASSERT(tree); -+ EXT_ASSERT(func); -+ EXT_ASSERT(tree->inode); -+ EXT_ASSERT(tree->root); -+ -+ while (block < last && block != EXT_MAX_BLOCK) { -+ num = last - block; -+ /* find extent for this block */ -+ path = ext3_ext_find_extent(tree, block, path); -+ if (IS_ERR(path)) { -+ err = PTR_ERR(path); -+ path = NULL; -+ break; -+ } -+ -+ depth = EXT_DEPTH(tree); -+ EXT_ASSERT(path[depth].p_hdr); -+ ex = path[depth].p_ext; -+ next = ext3_ext_next_allocated_block(path); -+ -+ exists = 0; -+ if (!ex) { -+ /* there is no extent yet, so try to allocate -+ * all requested space */ -+ start = block; -+ end = block + num; -+ } else if (ex->ee_block > block) { -+ /* need to allocate space before found extent */ -+ start = block; -+ end = ex->ee_block; -+ if (block + num < end) -+ end = block + num; -+ } else if (block >= ex->ee_block + ex->ee_len) { -+ /* need to allocate space after found extent */ -+ start = block; -+ end = block + num; -+ if (end >= next) -+ end = next; 
-+ } else if (block >= ex->ee_block) { -+ /* -+ * some part of requested space is covered -+ * by found extent -+ */ -+ start = block; -+ end = ex->ee_block + ex->ee_len; -+ if (block + num < end) -+ end = block + num; -+ exists = 1; -+ } else { -+ BUG(); -+ } -+ EXT_ASSERT(end > start); -+ -+ if (!exists) { -+ cbex.ec_block = start; -+ cbex.ec_len = end - start; -+ cbex.ec_start = 0; -+ cbex.ec_type = EXT3_EXT_CACHE_GAP; -+ } else { -+ cbex.ec_block = ex->ee_block; -+ cbex.ec_len = ex->ee_len; -+ cbex.ec_start = ex->ee_start; -+ cbex.ec_type = EXT3_EXT_CACHE_EXTENT; -+ } -+ -+ EXT_ASSERT(cbex.ec_len > 0); -+ EXT_ASSERT(path[depth].p_hdr); -+ err = func(tree, path, &cbex); -+ ext3_ext_drop_refs(path); -+ -+ if (err < 0) -+ break; -+ if (err == EXT_REPEAT) -+ continue; -+ else if (err == EXT_BREAK) { -+ err = 0; -+ break; -+ } -+ -+ if (EXT_DEPTH(tree) != depth) { -+ /* depth was changed. we have to realloc path */ -+ kfree(path); -+ path = NULL; -+ } -+ -+ block = cbex.ec_block + cbex.ec_len; -+ } -+ -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ -+ return err; -+} -+ -+static inline void -+ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, -+ __u32 len, __u32 start, int type) -+{ -+ EXT_ASSERT(len > 0); -+ if (tree->cex) { -+ tree->cex->ec_type = type; -+ tree->cex->ec_block = block; -+ tree->cex->ec_len = len; -+ tree->cex->ec_start = start; -+ } -+} -+ -+/* -+ * this routine calculate boundaries of the gap requested block fits into -+ * and cache this gap -+ */ -+static inline void -+ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ unsigned long block) -+{ -+ int depth = EXT_DEPTH(tree); -+ unsigned long lblock, len; -+ struct ext3_extent *ex; -+ -+ if (!tree->cex) -+ return; -+ -+ ex = path[depth].p_ext; -+ if (ex == NULL) { -+ /* there is no extent yet, so gap is [0;-] */ -+ lblock = 0; -+ len = EXT_MAX_BLOCK; -+ ext_debug(tree, "cache gap(whole file):"); -+ } else if (block < 
ex->ee_block) { -+ lblock = block; -+ len = ex->ee_block - block; -+ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", -+ (unsigned long) block, -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len); -+ } else if (block >= ex->ee_block + ex->ee_len) { -+ lblock = ex->ee_block + ex->ee_len; -+ len = ext3_ext_next_allocated_block(path); -+ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len, -+ (unsigned long) block); -+ EXT_ASSERT(len > lblock); -+ len = len - lblock; -+ } else { -+ lblock = len = 0; -+ BUG(); -+ } -+ -+ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len); -+ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP); -+} -+ -+static inline int -+ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, -+ struct ext3_extent *ex) -+{ -+ struct ext3_ext_cache *cex = tree->cex; -+ -+ /* is there cache storage at all? */ -+ if (!cex) -+ return EXT3_EXT_CACHE_NO; -+ -+ /* has cache valid data? */ -+ if (cex->ec_type == EXT3_EXT_CACHE_NO) -+ return EXT3_EXT_CACHE_NO; -+ -+ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || -+ cex->ec_type == EXT3_EXT_CACHE_EXTENT); -+ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { -+ ex->ee_block = cex->ec_block; -+ ex->ee_start = cex->ec_start; -+ ex->ee_start_hi = 0; -+ ex->ee_len = cex->ec_len; -+ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", -+ (unsigned long) block, -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len, -+ (unsigned long) ex->ee_start); -+ return cex->ec_type; -+ } -+ -+ /* not in cache */ -+ return EXT3_EXT_CACHE_NO; -+} -+ -+/* -+ * routine removes index from the index block -+ * it's used in truncate case only. 
thus all requests are for -+ * last index in the block only -+ */ -+int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ struct buffer_head *bh; -+ int err; -+ -+ /* free index block */ -+ path--; -+ EXT_ASSERT(path->p_hdr->eh_entries); -+ if ((err = ext3_ext_get_access(handle, tree, path))) -+ return err; -+ path->p_hdr->eh_entries--; -+ if ((err = ext3_ext_dirty(handle, tree, path))) -+ return err; -+ ext_debug(tree, "index is empty, remove it, free block %d\n", -+ path->p_idx->ei_leaf); -+ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); -+ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); -+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); -+ return err; -+} -+ -+int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int depth = EXT_DEPTH(tree); -+ int needed; -+ -+ if (path) { -+ /* probably there is space in leaf? */ -+ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) -+ return 1; -+ } -+ -+ /* -+ * the worste case we're expecting is creation of the -+ * new root (growing in depth) with index splitting -+ * for splitting we have to consider depth + 1 because -+ * previous growing could increase it -+ */ -+ depth = depth + 1; -+ -+ /* -+ * growing in depth: -+ * block allocation + new root + old root -+ */ -+ needed = EXT3_ALLOC_NEEDED + 2; -+ -+ /* index split. 
we may need: -+ * allocate intermediate indexes and new leaf -+ * change two blocks at each level, but root -+ * modify root block (inode) -+ */ -+ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; -+ -+ return needed; -+} -+ -+static int -+ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, unsigned long start, -+ unsigned long end) -+{ -+ struct ext3_extent *ex, tex; -+ struct ext3_ext_path *npath; -+ int depth, creds, err; -+ -+ depth = EXT_DEPTH(tree); -+ ex = path[depth].p_ext; -+ EXT_ASSERT(ex); -+ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); -+ EXT_ASSERT(ex->ee_block < start); -+ -+ /* calculate tail extent */ -+ tex.ee_block = end + 1; -+ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); -+ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; -+ -+ creds = ext3_ext_calc_credits_for_insert(tree, path); -+ handle = ext3_ext_journal_restart(handle, creds); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ /* calculate head extent. use primary extent */ -+ err = ext3_ext_get_access(handle, tree, path + depth); -+ if (err) -+ return err; -+ ex->ee_len = start - ex->ee_block; -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ if (err) -+ return err; -+ -+ /* FIXME: some callback to free underlying resource -+ * and correct ee_start? 
*/ -+ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", -+ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); -+ -+ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); -+ if (IS_ERR(npath)) -+ return PTR_ERR(npath); -+ depth = EXT_DEPTH(tree); -+ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); -+ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); -+ -+ err = ext3_ext_insert_extent(handle, tree, npath, &tex); -+ ext3_ext_drop_refs(npath); -+ kfree(npath); -+ -+ return err; -+} -+ -+static int -+ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, unsigned long start, -+ unsigned long end) -+{ -+ struct ext3_extent *ex, *fu = NULL, *lu, *le; -+ int err = 0, correct_index = 0; -+ int depth = EXT_DEPTH(tree), credits; -+ struct ext3_extent_header *eh; -+ unsigned a, b, block, num; -+ -+ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); -+ if (!path[depth].p_hdr) -+ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); -+ eh = path[depth].p_hdr; -+ EXT_ASSERT(eh); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ -+ /* find where to start removing */ -+ le = ex = EXT_LAST_EXTENT(eh); -+ while (ex != EXT_FIRST_EXTENT(eh)) { -+ if (ex->ee_block <= end) -+ break; -+ ex--; -+ } -+ -+ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { -+ /* removal of internal part of the extent requested -+ * tail and head must be placed in different extent -+ * so, we have to insert one more extent */ -+ path[depth].p_ext = ex; -+ return ext3_ext_split_for_rm(handle, tree, path, start, end); -+ } -+ -+ lu = ex; -+ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) { -+ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); -+ path[depth].p_ext = ex; -+ -+ a = ex->ee_block > start ? ex->ee_block : start; -+ b = ex->ee_block + ex->ee_len - 1 < end ? 
-+ ex->ee_block + ex->ee_len - 1 : end; -+ -+ ext_debug(tree, " border %u:%u\n", a, b); -+ -+ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { -+ block = 0; -+ num = 0; -+ BUG(); -+ } else if (a != ex->ee_block) { -+ /* remove tail of the extent */ -+ block = ex->ee_block; -+ num = a - block; -+ } else if (b != ex->ee_block + ex->ee_len - 1) { -+ /* remove head of the extent */ -+ block = a; -+ num = b - a; -+ } else { -+ /* remove whole extent: excelent! */ -+ block = ex->ee_block; -+ num = 0; -+ EXT_ASSERT(a == ex->ee_block && -+ b == ex->ee_block + ex->ee_len - 1); -+ } -+ -+ if (ex == EXT_FIRST_EXTENT(eh)) -+ correct_index = 1; -+ -+ credits = 1; -+ if (correct_index) -+ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; -+ if (tree->ops->remove_extent_credits) -+ credits+=tree->ops->remove_extent_credits(tree,ex,a,b); -+ -+ handle = ext3_ext_journal_restart(handle, credits); -+ if (IS_ERR(handle)) { -+ err = PTR_ERR(handle); -+ goto out; -+ } -+ -+ err = ext3_ext_get_access(handle, tree, path + depth); -+ if (err) -+ goto out; -+ -+ if (tree->ops->remove_extent) -+ err = tree->ops->remove_extent(tree, ex, a, b); -+ if (err) -+ goto out; -+ -+ if (num == 0) { -+ /* this extent is removed entirely mark slot unused */ -+ ex->ee_start = ex->ee_start_hi = 0; -+ eh->eh_entries--; -+ fu = ex; -+ } -+ -+ ex->ee_block = block; -+ ex->ee_len = num; -+ -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ if (err) -+ goto out; -+ -+ ext_debug(tree, "new extent: %u:%u:%u\n", -+ ex->ee_block, ex->ee_len, ex->ee_start); -+ ex--; -+ } -+ -+ if (fu) { -+ /* reuse unused slots */ -+ while (lu < le) { -+ if (lu->ee_start) { -+ *fu = *lu; -+ lu->ee_start = lu->ee_start_hi = 0; -+ fu++; -+ } -+ lu++; -+ } -+ } -+ -+ if (correct_index && eh->eh_entries) -+ err = ext3_ext_correct_indexes(handle, tree, path); -+ -+ /* if this leaf is free, then we should -+ * remove it from index block above */ -+ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) -+ 
err = ext3_ext_rm_idx(handle, tree, path + depth); -+ -+out: -+ return err; -+} -+ -+ -+static struct ext3_extent_idx * -+ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) -+{ -+ struct ext3_extent_idx *ix; -+ -+ ix = EXT_LAST_INDEX(hdr); -+ while (ix != EXT_FIRST_INDEX(hdr)) { -+ if (ix->ei_block <= block) -+ break; -+ ix--; -+ } -+ return ix; -+} -+ -+/* -+ * returns 1 if current index have to be freed (even partial) -+ */ -+static int inline -+ext3_ext_more_to_rm(struct ext3_ext_path *path) -+{ -+ EXT_ASSERT(path->p_idx); -+ -+ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) -+ return 0; -+ -+ /* -+ * if truncate on deeper level happened it it wasn't partial -+ * so we have to consider current index for truncation -+ */ -+ if (path->p_hdr->eh_entries == path->p_block) -+ return 0; -+ return 1; -+} -+ -+int ext3_ext_remove_space(struct ext3_extents_tree *tree, -+ unsigned long start, unsigned long end) -+{ -+ struct inode *inode = tree->inode; -+ struct super_block *sb = inode->i_sb; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_ext_path *path; -+ handle_t *handle; -+ int i = 0, err = 0; -+ -+ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); -+ -+ /* probably first extent we're gonna free will be last in block */ -+ handle = ext3_journal_start(inode, depth + 1); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ ext3_ext_invalidate_cache(tree); -+ -+ /* -+ * we start scanning from right side freeing all the blocks -+ * after i_size and walking into the deep -+ */ -+ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); -+ if (IS_ERR(path)) { -+ ext3_error(sb, __FUNCTION__, "Can't allocate path array"); -+ ext3_journal_stop(handle); -+ return -ENOMEM; -+ } -+ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); -+ path[i].p_hdr = EXT_ROOT_HDR(tree); -+ -+ while (i >= 0 && err == 0) { -+ if (i == depth) { -+ /* this is leaf block */ -+ err = ext3_ext_rm_leaf(handle, tree, path, start, end); -+ 
/* root level have p_bh == NULL, brelse() eats this */ -+ brelse(path[i].p_bh); -+ i--; -+ continue; -+ } -+ -+ /* this is index block */ -+ if (!path[i].p_hdr) { -+ ext_debug(tree, "initialize header\n"); -+ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); -+ } -+ -+ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); -+ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); -+ -+ if (!path[i].p_idx) { -+ /* this level hasn't touched yet */ -+ path[i].p_idx = -+ ext3_ext_last_covered(path[i].p_hdr, end); -+ path[i].p_block = path[i].p_hdr->eh_entries + 1; -+ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", -+ path[i].p_hdr, path[i].p_hdr->eh_entries); -+ } else { -+ /* we've already was here, see at next index */ -+ path[i].p_idx--; -+ } -+ -+ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", -+ i, EXT_FIRST_INDEX(path[i].p_hdr), -+ path[i].p_idx); -+ if (ext3_ext_more_to_rm(path + i)) { -+ /* go to the next level */ -+ ext_debug(tree, "move to level %d (block %d)\n", -+ i + 1, path[i].p_idx->ei_leaf); -+ memset(path + i + 1, 0, sizeof(*path)); -+ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); -+ if (!path[i+1].p_bh) { -+ /* should we reset i_size? 
*/ -+ err = -EIO; -+ break; -+ } -+ /* put actual number of indexes to know is this -+ * number got changed at the next iteration */ -+ path[i].p_block = path[i].p_hdr->eh_entries; -+ i++; -+ } else { -+ /* we finish processing this index, go up */ -+ if (path[i].p_hdr->eh_entries == 0 && i > 0) { -+ /* index is empty, remove it -+ * handle must be already prepared by the -+ * truncatei_leaf() */ -+ err = ext3_ext_rm_idx(handle, tree, path + i); -+ } -+ /* root level have p_bh == NULL, brelse() eats this */ -+ brelse(path[i].p_bh); -+ i--; -+ ext_debug(tree, "return to level %d\n", i); -+ } -+ } -+ -+ /* TODO: flexible tree reduction should be here */ -+ if (path->p_hdr->eh_entries == 0) { -+ /* -+ * truncate to zero freed all the tree -+ * so, we need to correct eh_depth -+ */ -+ err = ext3_ext_get_access(handle, tree, path); -+ if (err == 0) { -+ EXT_ROOT_HDR(tree)->eh_depth = 0; -+ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree); -+ err = ext3_ext_dirty(handle, tree, path); -+ } -+ } -+ ext3_ext_tree_changed(tree); -+ -+ kfree(path); -+ ext3_journal_stop(handle); -+ -+ return err; -+} -+ -+int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks) -+{ -+ int lcap, icap, rcap, leafs, idxs, num; -+ -+ rcap = ext3_ext_space_root(tree); -+ if (blocks <= rcap) { -+ /* all extents fit to the root */ -+ return 0; -+ } -+ -+ rcap = ext3_ext_space_root_idx(tree); -+ lcap = ext3_ext_space_block(tree); -+ icap = ext3_ext_space_block_idx(tree); -+ -+ num = leafs = (blocks + lcap - 1) / lcap; -+ if (leafs <= rcap) { -+ /* all pointers to leafs fit to the root */ -+ return leafs; -+ } -+ -+ /* ok. 
we need separate index block(s) to link all leaf blocks */ -+ idxs = (leafs + icap - 1) / icap; -+ do { -+ num += idxs; -+ idxs = (idxs + icap - 1) / icap; -+ } while (idxs > rcap); -+ -+ return num; -+} -+ -+/* -+ * called at mount time -+ */ -+void ext3_ext_init(struct super_block *sb) -+{ -+ /* -+ * possible initialization would be here -+ */ -+ -+ if (test_opt(sb, EXTENTS)) { -+ printk("EXT3-fs: file extents enabled"); -+#ifdef AGRESSIVE_TEST -+ printk(", agressive tests"); -+#endif -+#ifdef CHECK_BINSEARCH -+ printk(", check binsearch"); -+#endif -+ printk("\n"); -+ } -+} -+ -+/* -+ * called at umount time -+ */ -+void ext3_ext_release(struct super_block *sb) -+{ -+} -+ -+/************************************************************************ -+ * VFS related routines -+ ************************************************************************/ -+ -+static int ext3_get_inode_write_access(handle_t *handle, void *buffer) -+{ -+ /* we use in-core data, not bh */ -+ return 0; -+} -+ -+static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) -+{ -+ struct inode *inode = buffer; -+ return ext3_mark_inode_dirty(handle, inode); -+} -+ -+static int ext3_ext_mergable(struct ext3_extent *ex1, -+ struct ext3_extent *ex2) -+{ -+ /* FIXME: support for large fs */ -+ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) -+ return 1; -+ return 0; -+} -+ -+static int -+ext3_remove_blocks_credits(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex, -+ unsigned long from, unsigned long to) -+{ -+ int needed; -+ -+ /* at present, extent can't cross block group */; -+ needed = 4; /* bitmap + group desc + sb + inode */ -+ -+#ifdef CONFIG_QUOTA -+ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; -+#endif -+ return needed; -+} -+ -+static int -+ext3_remove_blocks(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex, -+ unsigned long from, unsigned long to) -+{ -+ int needed = ext3_remove_blocks_credits(tree, ex, from, to); -+ handle_t *handle = 
ext3_journal_start(tree->inode, needed); -+ struct buffer_head *bh; -+ int i; -+ -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { -+ /* tail removal */ -+ unsigned long num, start; -+ num = ex->ee_block + ex->ee_len - from; -+ start = ex->ee_start + ex->ee_len - num; -+ ext_debug(tree, "free last %lu blocks starting %lu\n", -+ num, start); -+ for (i = 0; i < num; i++) { -+ bh = sb_find_get_block(tree->inode->i_sb, start + i); -+ ext3_forget(handle, 0, tree->inode, bh, start + i); -+ } -+ ext3_free_blocks(handle, tree->inode, start, num); -+ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { -+ printk("strange request: removal %lu-%lu from %u:%u\n", -+ from, to, ex->ee_block, ex->ee_len); -+ } else { -+ printk("strange request: removal(2) %lu-%lu from %u:%u\n", -+ from, to, ex->ee_block, ex->ee_len); -+ } -+ ext3_journal_stop(handle); -+ return 0; -+} -+ -+static int ext3_ext_find_goal(struct inode *inode, -+ struct ext3_ext_path *path, unsigned long block) -+{ -+ struct ext3_inode_info *ei = EXT3_I(inode); -+ unsigned long bg_start; -+ unsigned long colour; -+ int depth; -+ -+ if (path) { -+ struct ext3_extent *ex; -+ depth = path->p_depth; -+ -+ /* try to predict block placement */ -+ if ((ex = path[depth].p_ext)) -+ return ex->ee_start + (block - ex->ee_block); -+ -+ /* it looks index is empty -+ * try to find starting from index itself */ -+ if (path[depth].p_bh) -+ return path[depth].p_bh->b_blocknr; -+ } -+ -+ /* OK. 
use inode's group */ -+ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + -+ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); -+ colour = (current->pid % 16) * -+ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); -+ return bg_start + colour + block; -+} -+ -+static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *ex, int *err) -+{ -+ struct inode *inode = tree->inode; -+ int newblock, goal; -+ -+ EXT_ASSERT(path); -+ EXT_ASSERT(ex); -+ EXT_ASSERT(ex->ee_start); -+ EXT_ASSERT(ex->ee_len); -+ -+ /* reuse block from the extent to order data/metadata */ -+ newblock = ex->ee_start++; -+ ex->ee_len--; -+ if (ex->ee_len == 0) { -+ ex->ee_len = 1; -+ /* allocate new block for the extent */ -+ goal = ext3_ext_find_goal(inode, path, ex->ee_block); -+ ex->ee_start = ext3_new_block(handle, inode, goal, err); -+ ex->ee_start_hi = 0; -+ if (ex->ee_start == 0) { -+ /* error occured: restore old extent */ -+ ex->ee_start = newblock; -+ return 0; -+ } -+ } -+ return newblock; -+} -+ -+static struct ext3_extents_helpers ext3_blockmap_helpers = { -+ .get_write_access = ext3_get_inode_write_access, -+ .mark_buffer_dirty = ext3_mark_buffer_dirty, -+ .mergable = ext3_ext_mergable, -+ .new_block = ext3_new_block_cb, -+ .remove_extent = ext3_remove_blocks, -+ .remove_extent_credits = ext3_remove_blocks_credits, -+}; -+ -+void ext3_init_tree_desc(struct ext3_extents_tree *tree, -+ struct inode *inode) -+{ -+ tree->inode = inode; -+ tree->root = (void *) EXT3_I(inode)->i_data; -+ tree->buffer = (void *) inode; -+ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); -+ tree->cex = &EXT3_I(inode)->i_cached_extent; -+ tree->ops = &ext3_blockmap_helpers; -+} -+ -+int ext3_ext_get_block(handle_t *handle, struct inode *inode, -+ long iblock, struct buffer_head *bh_result, -+ int create, int extend_disksize) -+{ -+ struct ext3_ext_path *path = NULL; -+ struct ext3_extent newex; -+ struct ext3_extent 
*ex; -+ int goal, newblock, err = 0, depth; -+ struct ext3_extents_tree tree; -+ -+ __clear_bit(BH_New, &bh_result->b_state); -+ ext3_init_tree_desc(&tree, inode); -+ ext_debug(&tree, "block %d requested for inode %u\n", -+ (int) iblock, (unsigned) inode->i_ino); -+ down(&EXT3_I(inode)->truncate_sem); -+ -+ /* check in cache */ -+ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { -+ if (goal == EXT3_EXT_CACHE_GAP) { -+ if (!create) { -+ /* block isn't allocated yet and -+ * user don't want to allocate it */ -+ goto out2; -+ } -+ /* we should allocate requested block */ -+ } else if (goal == EXT3_EXT_CACHE_EXTENT) { -+ /* block is already allocated */ -+ newblock = iblock - newex.ee_block + newex.ee_start; -+ goto out; -+ } else { -+ EXT_ASSERT(0); -+ } -+ } -+ -+ /* find extent for this block */ -+ path = ext3_ext_find_extent(&tree, iblock, NULL); -+ if (IS_ERR(path)) { -+ err = PTR_ERR(path); -+ path = NULL; -+ goto out2; -+ } -+ -+ depth = EXT_DEPTH(&tree); -+ -+ /* -+ * consistent leaf must not be empty -+ * this situations is possible, though, _during_ tree modification -+ * this is why assert can't be put in ext3_ext_find_extent() -+ */ -+ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); -+ -+ if ((ex = path[depth].p_ext)) { -+ /* if found exent covers block, simple return it */ -+ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { -+ newblock = iblock - ex->ee_block + ex->ee_start; -+ ext_debug(&tree, "%d fit into %d:%d -> %d\n", -+ (int) iblock, ex->ee_block, ex->ee_len, -+ newblock); -+ ext3_ext_put_in_cache(&tree, ex->ee_block, -+ ex->ee_len, ex->ee_start, -+ EXT3_EXT_CACHE_EXTENT); -+ goto out; -+ } -+ } -+ -+ /* -+ * requested block isn't allocated yet -+ * we couldn't try to create block if create flag is zero -+ */ -+ if (!create) { -+ /* put just found gap into cache to speedup subsequest reqs */ -+ ext3_ext_put_gap_in_cache(&tree, path, iblock); -+ goto out2; -+ } -+ -+ /* allocate new block */ -+ goal = 
ext3_ext_find_goal(inode, path, iblock); -+ newblock = ext3_new_block(handle, inode, goal, &err); -+ if (!newblock) -+ goto out2; -+ ext_debug(&tree, "allocate new block: goal %d, found %d\n", -+ goal, newblock); -+ -+ /* try to insert new extent into found leaf and return */ -+ newex.ee_block = iblock; -+ newex.ee_start = newblock; -+ newex.ee_start_hi = 0; -+ newex.ee_len = 1; -+ err = ext3_ext_insert_extent(handle, &tree, path, &newex); -+ if (err) -+ goto out2; -+ -+ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ -+ /* previous routine could use block we allocated */ -+ newblock = newex.ee_start; -+ __set_bit(BH_New, &bh_result->b_state); -+ -+ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, -+ newex.ee_start, EXT3_EXT_CACHE_EXTENT); -+out: -+ ext3_ext_show_leaf(&tree, path); -+ __set_bit(BH_Mapped, &bh_result->b_state); -+ bh_result->b_bdev = inode->i_sb->s_bdev; -+ bh_result->b_blocknr = newblock; -+out2: -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ up(&EXT3_I(inode)->truncate_sem); -+ -+ return err; -+} -+ -+void ext3_ext_truncate(struct inode * inode, struct page *page) -+{ -+ struct address_space *mapping = inode->i_mapping; -+ struct super_block *sb = inode->i_sb; -+ struct ext3_extents_tree tree; -+ unsigned long last_block; -+ handle_t *handle; -+ int err = 0; -+ -+ ext3_init_tree_desc(&tree, inode); -+ -+ /* -+ * probably first extent we're gonna free will be last in block -+ */ -+ err = ext3_writepage_trans_blocks(inode) + 3; -+ handle = ext3_journal_start(inode, err); -+ if (IS_ERR(handle)) { -+ if (page) { -+ clear_highpage(page); -+ flush_dcache_page(page); -+ unlock_page(page); -+ page_cache_release(page); -+ } -+ return; -+ } -+ -+ if (page) -+ ext3_block_truncate_page(handle, page, mapping, inode->i_size); -+ -+ down(&EXT3_I(inode)->truncate_sem); -+ ext3_ext_invalidate_cache(&tree); -+ -+ /* -+ * TODO: optimization is possible here -+ * 
probably we need not scaning at all, -+ * because page truncation is enough -+ */ -+ if (ext3_orphan_add(handle, inode)) -+ goto out_stop; -+ -+ /* we have to know where to truncate from in crash case */ -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ ext3_mark_inode_dirty(handle, inode); -+ -+ last_block = (inode->i_size + sb->s_blocksize - 1) >> -+ EXT3_BLOCK_SIZE_BITS(sb); -+ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); -+ -+ /* In a multi-transaction truncate, we only make the final -+ * transaction synchronous */ -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ -+out_stop: -+ /* -+ * If this was a simple ftruncate(), and the file will remain alive -+ * then we need to clear up the orphan record which we created above. -+ * However, if this was a real unlink then we were called by -+ * ext3_delete_inode(), and we allow that function to clean up the -+ * orphan info for us. -+ */ -+ if (inode->i_nlink) -+ ext3_orphan_del(handle, inode); -+ -+ up(&EXT3_I(inode)->truncate_sem); -+ ext3_journal_stop(handle); -+} -+ -+/* -+ * this routine calculate max number of blocks we could modify -+ * in order to allocate new block for an inode -+ */ -+int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) -+{ -+ struct ext3_extents_tree tree; -+ int needed; -+ -+ ext3_init_tree_desc(&tree, inode); -+ -+ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); -+ -+ /* caller want to allocate num blocks */ -+ needed *= num; -+ -+#ifdef CONFIG_QUOTA -+ /* -+ * FIXME: real calculation should be here -+ * it depends on blockmap format of qouta file -+ */ -+ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; -+#endif -+ -+ return needed; -+} -+ -+void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) -+{ -+ struct ext3_extents_tree tree; -+ -+ ext3_init_tree_desc(&tree, inode); -+ ext3_extent_tree_init(handle, &tree); -+} -+ -+int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) -+{ -+ struct ext3_extents_tree tree; -+ -+ 
ext3_init_tree_desc(&tree, inode); -+ return ext3_ext_calc_metadata_amount(&tree, blocks); -+} -+ -+static int -+ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_ext_cache *newex) -+{ -+ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; -+ -+ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) -+ return EXT_CONTINUE; -+ -+ if (buf->err < 0) -+ return EXT_BREAK; -+ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) -+ return EXT_BREAK; -+ -+ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { -+ buf->err++; -+ buf->cur += sizeof(*newex); -+ } else { -+ buf->err = -EFAULT; -+ return EXT_BREAK; -+ } -+ return EXT_CONTINUE; -+} -+ -+static int -+ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_ext_cache *ex) -+{ -+ struct ext3_extent_tree_stats *buf = -+ (struct ext3_extent_tree_stats *) tree->private; -+ int depth; -+ -+ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) -+ return EXT_CONTINUE; -+ -+ depth = EXT_DEPTH(tree); -+ buf->extents_num++; -+ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) -+ buf->leaf_num++; -+ return EXT_CONTINUE; -+} -+ -+int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, -+ unsigned long arg) -+{ -+ int err = 0; -+ -+ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) -+ return -EINVAL; -+ -+ if (cmd == EXT3_IOC_GET_EXTENTS) { -+ struct ext3_extent_buf buf; -+ struct ext3_extents_tree tree; -+ -+ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) -+ return -EFAULT; -+ -+ ext3_init_tree_desc(&tree, inode); -+ buf.cur = buf.buffer; -+ buf.err = 0; -+ tree.private = &buf; -+ down(&EXT3_I(inode)->truncate_sem); -+ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, -+ ext3_ext_store_extent_cb); -+ up(&EXT3_I(inode)->truncate_sem); -+ if (err == 0) -+ err = buf.err; -+ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { -+ struct ext3_extent_tree_stats buf; -+ struct 
ext3_extents_tree tree; -+ -+ ext3_init_tree_desc(&tree, inode); -+ down(&EXT3_I(inode)->truncate_sem); -+ buf.depth = EXT_DEPTH(&tree); -+ buf.extents_num = 0; -+ buf.leaf_num = 0; -+ tree.private = &buf; -+ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, -+ ext3_ext_collect_stats_cb); -+ up(&EXT3_I(inode)->truncate_sem); -+ if (!err) -+ err = copy_to_user((void *) arg, &buf, sizeof(buf)); -+ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { -+ struct ext3_extents_tree tree; -+ ext3_init_tree_desc(&tree, inode); -+ down(&EXT3_I(inode)->truncate_sem); -+ err = EXT_DEPTH(&tree); -+ up(&EXT3_I(inode)->truncate_sem); -+ } -+ -+ return err; -+} -+ -+EXPORT_SYMBOL(ext3_init_tree_desc); -+EXPORT_SYMBOL(ext3_mark_inode_dirty); -+EXPORT_SYMBOL(ext3_ext_invalidate_cache); -+EXPORT_SYMBOL(ext3_ext_insert_extent); -+EXPORT_SYMBOL(ext3_ext_walk_space); -+EXPORT_SYMBOL(ext3_ext_find_goal); -+EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); -Index: linux-2.6.5-sles9/fs/ext3/ialloc.c -=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/ialloc.c 2005-02-23 01:01:52.366281264 +0300 -+++ linux-2.6.5-sles9/fs/ext3/ialloc.c 2005-02-23 01:02:37.398435336 +0300 -@@ -566,7 +566,7 @@ repeat: - ei->i_dir_start_lookup = 0; - ei->i_disksize = 0; - -- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; -+ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); - /* dirsync only applies to directories */ -@@ -647,6 +647,18 @@ - DQUOT_FREE_INODE(inode); - goto fail2; - } -+ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { -+ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; -+ ext3_extents_initialize_blockmap(handle, inode); -+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { -+ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); -+ if (err) goto fail; -+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); -+ 
BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); -+ } -+ } -+ - err = ext3_mark_inode_dirty(handle, inode); - if (err) { - ext3_std_error(sb, err); -Index: linux-2.6.5-sles9/fs/ext3/inode.c -=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2005-02-23 01:01:52.373280200 +0300 -+++ linux-2.6.5-sles9/fs/ext3/inode.c 2005-02-23 01:02:37.404434424 +0300 -@@ -796,6 +796,17 @@ - goto reread; - } - -+static inline int -+ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, -+ struct buffer_head *bh, int create, int extend_disksize) -+{ -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_get_block(handle, inode, block, bh, create, -+ extend_disksize); -+ return ext3_get_block_handle(handle, inode, block, bh, create, -+ extend_disksize); -+} -+ - static int ext3_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) - { -@@ -806,8 +817,8 @@ - handle = ext3_journal_current_handle(); - J_ASSERT(handle != 0); - } -- ret = ext3_get_block_handle(handle, inode, iblock, -- bh_result, create, 1); -+ ret = ext3_get_block_wrap(handle, inode, iblock, -+ bh_result, create, 1); - return ret; - } - -@@ -833,8 +844,8 @@ - } - } - if (ret == 0) -- ret = ext3_get_block_handle(handle, inode, iblock, -- bh_result, create, 0); -+ ret = ext3_get_block_wrap(handle, inode, iblock, -+ bh_result, create, 0); - if (ret == 0) - bh_result->b_size = (1 << inode->i_blkbits); - return ret; -@@ -855,7 +866,7 @@ - dummy.b_state = 0; - dummy.b_blocknr = -1000; - buffer_trace_init(&dummy.b_history); -- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); -+ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); - if (!*errp && buffer_mapped(&dummy)) { - struct buffer_head *bh; - bh = sb_getblk(inode->i_sb, dummy.b_blocknr); -@@ -1587,7 +1598,7 @@ - * This 
required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ --static int ext3_block_truncate_page(handle_t *handle, struct page *page, -+int ext3_block_truncate_page(handle_t *handle, struct page *page, - struct address_space *mapping, loff_t from) - { - unsigned long index = from >> PAGE_CACHE_SHIFT; -@@ -2083,6 +2094,9 @@ - return; - } - -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_truncate(inode, page); -+ - handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { -@@ -2789,6 +2803,9 @@ - int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; - int ret; - -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_writepage_trans_blocks(inode, bpp); -+ - if (ext3_should_journal_data(inode)) - ret = 3 * (bpp + indirects) + 2; - else -Index: linux-2.6.5-sles9/fs/ext3/Makefile -=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2005-02-23 01:01:46.501172896 +0300 -+++ linux-2.6.5-sles9/fs/ext3/Makefile 2005-02-23 01:02:37.405434272 +0300 -@@ -5,7 +5,8 @@ - obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ -- ioctl.o namei.o super.o symlink.o hash.o -+ ioctl.o namei.o super.o symlink.o hash.o \ -+ extents.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-2.6.5-sles9/fs/ext3/super.c -=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/super.c 2005-02-23 01:02:34.072940888 +0300 -+++ linux-2.6.5-sles9/fs/ext3/super.c 2005-02-23 01:47:15.291333736 +0300 -@@ -389,6 +389,7 @@ - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); - if (!(sb->s_flags & MS_RDONLY)) { -@@ -447,6 +448,8 @@ - #endif - 
ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - ei->vfs_inode.i_version = 1; -+ -+ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); - return &ei->vfs_inode; - } - -@@ -537,6 +540,7 @@ - Opt_ignore, Opt_barrier, - Opt_err, - Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -+ Opt_extents, Opt_noextents, Opt_extdebug, - }; - - static match_table_t tokens = { -@@ -582,6 +585,9 @@ - {Opt_iopen, "iopen"}, - {Opt_noiopen, "noiopen"}, - {Opt_iopen_nopriv, "iopen_nopriv"}, -+ {Opt_extents, "extents"}, -+ {Opt_noextents, "noextents"}, -+ {Opt_extdebug, "extdebug"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL} - }; -@@ -797,6 +802,15 @@ - break; - case Opt_ignore: - break; -+ case Opt_extents: -+ set_opt (sbi->s_mount_opt, EXTENTS); -+ break; -+ case Opt_noextents: -+ clear_opt (sbi->s_mount_opt, EXTENTS); -+ break; -+ case Opt_extdebug: -+ set_opt (sbi->s_mount_opt, EXTDEBUG); -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1449,6 +1460,8 @@ - percpu_counter_mod(&sbi->s_dirs_counter, - ext3_count_dirs(sb)); - -+ ext3_ext_init(sb); -+ - return 0; - - failed_mount3: -Index: linux-2.6.5-sles9/fs/ext3/ioctl.c -=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/ioctl.c 2005-02-23 01:01:42.887722224 +0300 -+++ linux-2.6.5-sles9/fs/ext3/ioctl.c 2005-02-23 01:02:37.412433208 +0300 -@@ -124,6 +124,10 @@ - err = ext3_change_inode_journal_flag(inode, jflag); - return err; - } -+ case EXT3_IOC_GET_EXTENTS: -+ case EXT3_IOC_GET_TREE_STATS: -+ case EXT3_IOC_GET_TREE_DEPTH: -+ return ext3_ext_ioctl(inode, filp, cmd, arg); - case EXT3_IOC_GETVERSION: - case EXT3_IOC_GETVERSION_OLD: - return put_user(inode->i_generation, (int *) arg); -Index: linux-2.6.5-sles9/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2005-02-23 01:02:35.823674736 +0300 -+++ 
linux-2.6.5-sles9/include/linux/ext3_fs.h 2005-02-23 01:02:37.414432904 +0300 -@@ -186,8 +186,9 @@ - #define EXT3_NOTAIL_FL 0x00008000 /* don't merge file tail */ - #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ - #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ -+#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ - #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ - --#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ -+#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ - #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ - -@@ -211,6 +212,9 @@ - #endif - #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) - #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) -+#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) -+#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) -+#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) - - /* - * Structure of an inode on the disk -@@ -333,6 +337,8 @@ - #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ - #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ -+#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ -+#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -503,11 +509,13 @@ - #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ - #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ - #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 -+#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ - - #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR - #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ - EXT3_FEATURE_INCOMPAT_RECOVER| \ -- EXT3_FEATURE_INCOMPAT_META_BG) -+ EXT3_FEATURE_INCOMPAT_META_BG| \ -+ EXT3_FEATURE_INCOMPAT_EXTENTS) - #define 
EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT3_FEATURE_RO_COMPAT_BTREE_DIR) -@@ -729,6 +735,9 @@ - - - /* inode.c */ -+extern int ext3_block_truncate_page(handle_t *, struct page *, -+ struct address_space *, loff_t); -+extern int ext3_writepage_trans_blocks(struct inode *inode); - extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); - extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); - extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); -@@ -802,6 +809,16 @@ - extern struct inode_operations ext3_symlink_inode_operations; - extern struct inode_operations ext3_fast_symlink_inode_operations; - -+/* extents.c */ -+extern int ext3_ext_writepage_trans_blocks(struct inode *, int); -+extern int ext3_ext_get_block(handle_t *, struct inode *, long, -+ struct buffer_head *, int, int); -+extern void ext3_ext_truncate(struct inode *, struct page *); -+extern void ext3_ext_init(struct super_block *); -+extern void ext3_ext_release(struct super_block *); -+extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); -+extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg); - - #endif /* __KERNEL__ */ - -Index: linux-2.6.5-sles9/include/linux/ext3_extents.h -=================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h 2005-02-17 22:07:57.023609040 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_extents.h 2005-02-23 01:02:37.416432600 +0300 -@@ -0,0 +1,262 @@ -+/* -+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. 
-+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+#ifndef _LINUX_EXT3_EXTENTS -+#define _LINUX_EXT3_EXTENTS -+ -+/* -+ * with AGRESSIVE_TEST defined capacity of index/leaf blocks -+ * become very little, so index split, in-depth growing and -+ * other hard changes happens much more often -+ * this is for debug purposes only -+ */ -+#define AGRESSIVE_TEST_ -+ -+/* -+ * if CHECK_BINSEARCH defined, then results of binary search -+ * will be checked by linear search -+ */ -+#define CHECK_BINSEARCH_ -+ -+/* -+ * if EXT_DEBUG is defined you can use 'extdebug' mount option -+ * to get lots of info what's going on -+ */ -+#define EXT_DEBUG_ -+#ifdef EXT_DEBUG -+#define ext_debug(tree,fmt,a...) \ -+do { \ -+ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ -+ printk(fmt, ##a); \ -+} while (0); -+#else -+#define ext_debug(tree,fmt,a...) -+#endif -+ -+/* -+ * if EXT_STATS is defined then stats numbers are collected -+ * these number will be displayed at umount time -+ */ -+#define EXT_STATS_ -+ -+ -+#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */ -+ -+/* -+ * ext3_inode has i_block array (total 60 bytes) -+ * first 4 bytes are used to store: -+ * - tree depth (0 mean there is no tree yet. 
all extents in the inode) -+ * - number of alive extents in the inode -+ */ -+ -+/* -+ * this is extent on-disk structure -+ * it's used at the bottom of the tree -+ */ -+struct ext3_extent { -+ __u32 ee_block; /* first logical block extent covers */ -+ __u16 ee_len; /* number of blocks covered by extent */ -+ __u16 ee_start_hi; /* high 16 bits of physical block */ -+ __u32 ee_start; /* low 32 bigs of physical block */ -+}; -+ -+/* -+ * this is index on-disk structure -+ * it's used at all the levels, but the bottom -+ */ -+struct ext3_extent_idx { -+ __u32 ei_block; /* index covers logical blocks from 'block' */ -+ __u32 ei_leaf; /* pointer to the physical block of the next * -+ * level. leaf or next index could bet here */ -+ __u16 ei_leaf_hi; /* high 16 bits of physical block */ -+ __u16 ei_unused; -+}; -+ -+/* -+ * each block (leaves and indexes), even inode-stored has header -+ */ -+struct ext3_extent_header { -+ __u16 eh_magic; /* probably will support different formats */ -+ __u16 eh_entries; /* number of valid entries */ -+ __u16 eh_max; /* capacity of store in entries */ -+ __u16 eh_depth; /* has tree real underlaying blocks? 
*/ -+ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ -+}; -+ -+#define EXT3_EXT_MAGIC 0xf30a -+ -+/* -+ * array of ext3_ext_path contains path to some extent -+ * creation/lookup routines use it for traversal/splitting/etc -+ * truncate uses it to simulate recursive walking -+ */ -+struct ext3_ext_path { -+ __u32 p_block; -+ __u16 p_depth; -+ struct ext3_extent *p_ext; -+ struct ext3_extent_idx *p_idx; -+ struct ext3_extent_header *p_hdr; -+ struct buffer_head *p_bh; -+}; -+ -+/* -+ * structure for external API -+ */ -+ -+/* -+ * storage for cached extent -+ */ -+struct ext3_ext_cache { -+ __u32 ec_start; -+ __u32 ec_block; -+ __u32 ec_len; -+ __u32 ec_type; -+}; -+ -+#define EXT3_EXT_CACHE_NO 0 -+#define EXT3_EXT_CACHE_GAP 1 -+#define EXT3_EXT_CACHE_EXTENT 2 -+ -+/* -+ * ext3_extents_tree is used to pass initial information -+ * to top-level extents API -+ */ -+struct ext3_extents_helpers; -+struct ext3_extents_tree { -+ struct inode *inode; /* inode which tree belongs to */ -+ void *root; /* ptr to data top of tree resides at */ -+ void *buffer; /* will be passed as arg to ^^ routines */ -+ int buffer_len; -+ void *private; -+ struct ext3_ext_cache *cex;/* last found extent */ -+ struct ext3_extents_helpers *ops; -+}; -+ -+struct ext3_extents_helpers { -+ int (*get_write_access)(handle_t *h, void *buffer); -+ int (*mark_buffer_dirty)(handle_t *h, void *buffer); -+ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); -+ int (*remove_extent_credits)(struct ext3_extents_tree *, -+ struct ext3_extent *, unsigned long, -+ unsigned long); -+ int (*remove_extent)(struct ext3_extents_tree *, -+ struct ext3_extent *, unsigned long, -+ unsigned long); -+ int (*new_block)(handle_t *, struct ext3_extents_tree *, -+ struct ext3_ext_path *, struct ext3_extent *, -+ int *); -+}; -+ -+/* -+ * to be called by ext3_ext_walk_space() -+ * negative retcode - error -+ * positive retcode - signal for ext3_ext_walk_space(), see below -+ * callback must 
return valid extent (passed or newly created) -+ */ -+typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, -+ struct ext3_ext_path *, -+ struct ext3_ext_cache *); -+ -+#define EXT_CONTINUE 0 -+#define EXT_BREAK 1 -+#define EXT_REPEAT 2 -+ -+ -+#define EXT_MAX_BLOCK 0xffffffff -+ -+ -+#define EXT_FIRST_EXTENT(__hdr__) \ -+ ((struct ext3_extent *) (((char *) (__hdr__)) + \ -+ sizeof(struct ext3_extent_header))) -+#define EXT_FIRST_INDEX(__hdr__) \ -+ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ -+ sizeof(struct ext3_extent_header))) -+#define EXT_HAS_FREE_INDEX(__path__) \ -+ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) -+#define EXT_LAST_EXTENT(__hdr__) \ -+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) -+#define EXT_LAST_INDEX(__hdr__) \ -+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) -+#define EXT_MAX_EXTENT(__hdr__) \ -+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) -+#define EXT_MAX_INDEX(__hdr__) \ -+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) -+#define EXT_HDR_GEN(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff) -+#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> 24) -+#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ -+ -+#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) -+#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) -+#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) -+#define EXT_GENERATION(__tree__) EXT_HDR_GEN(EXT_ROOT_HDR(__tree__)) -+ -+#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); -+ -+#define EXT_CHECK_PATH(tree,path) \ -+{ \ -+ int depth = EXT_DEPTH(tree); \ -+ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ -+ BUG_ON((unsigned long) (path)[depth].p_idx < \ -+ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ -+ BUG_ON((unsigned long) (path)[depth].p_ext < \ -+ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ -+ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ 
-+ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ -+ && depth != 0); \ -+ BUG_ON((path)[0].p_depth != depth); \ -+} -+ -+ -+/* -+ * this structure is used to gather extents from the tree via ioctl -+ */ -+struct ext3_extent_buf { -+ unsigned long start; -+ int buflen; -+ void *buffer; -+ void *cur; -+ int err; -+}; -+ -+/* -+ * this structure is used to collect stats info about the tree -+ */ -+struct ext3_extent_tree_stats { -+ int depth; -+ int extents_num; -+ int leaf_num; -+}; -+ -+extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); -+extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); -+extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); -+extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); -+extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); -+extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); -+extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); -+extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); -+ -+static inline void -+ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) -+{ -+ if (tree->cex) -+ tree->cex->ec_type = EXT3_EXT_CACHE_NO; -+} -+ -+ -+#endif /* _LINUX_EXT3_EXTENTS */ -Index: linux-2.6.5-sles9/include/linux/ext3_fs_i.h -=================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs_i.h 2005-02-23 01:01:52.425272296 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs_i.h 2005-02-23 01:45:55.611446920 +0300 -@@ -19,6 +19,7 @@ - #include - #include - #include -+#include - - struct reserve_window { - __u32 _rsv_start; /* First byte reserved */ -@@ -128,6 +129,8 @@ - */ - struct semaphore truncate_sem; - struct inode vfs_inode; -+ -+ struct 
ext3_ext_cache i_cached_extent; - }; - - #endif /* _LINUX_EXT3_FS_I */ - -%diffstat - fs/ext3/Makefile | 2 - fs/ext3/extents.c | 2356 +++++++++++++++++++++++++++++++++++++++++++ - fs/ext3/ialloc.c | 4 - fs/ext3/inode.c | 29 - fs/ext3/ioctl.c | 4 - fs/ext3/super.c | 15 - include/linux/ext3_extents.h | 265 ++++ - include/linux/ext3_fs.h | 17 - include/linux/ext3_fs_i.h | 3 - 9 files changed, 2687 insertions(+), 8 deletions(-) - diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch deleted file mode 100644 index 67d6236..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch +++ /dev/null @@ -1,2926 +0,0 @@ -Index: linux-stage/fs/ext3/extents.c -=================================================================== ---- linux-stage.orig/fs/ext3/extents.c 2005-02-25 15:33:48.890198160 +0200 -+++ linux-stage/fs/ext3/extents.c 2005-02-25 15:33:48.917194056 +0200 -@@ -0,0 +1,2360 @@ -+/* -+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+/* -+ * Extents support for EXT3 -+ * -+ * TODO: -+ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() -+ * - ext3_ext_calc_credits() could take 'mergable' into account -+ * - ext3*_error() should be used in some situations -+ * - find_goal() [to be tested and improved] -+ * - smart tree reduction -+ * - arch-independence -+ * common on-disk format for big/little-endian arch -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+ -+static inline int ext3_ext_check_header(struct ext3_extent_header *eh) -+{ -+ if (eh->eh_magic != EXT3_EXT_MAGIC) { -+ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", -+ (unsigned)eh->eh_magic); -+ return -EIO; -+ } -+ if (eh->eh_max == 0) { -+ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n", -+ (unsigned)eh->eh_max); -+ return -EIO; -+ } -+ if (eh->eh_entries > eh->eh_max) { -+ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", -+ (unsigned)eh->eh_entries); -+ return -EIO; -+ } -+ return 0; -+} -+ -+static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) -+{ -+ int err; -+ -+ if (handle->h_buffer_credits > needed) -+ return handle; -+ if (!ext3_journal_extend(handle, needed)) -+ return handle; -+ err = ext3_journal_restart(handle, needed); -+ -+ return handle; -+} -+ -+static int inline -+ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) -+{ -+ if (tree->ops->get_write_access) -+ return tree->ops->get_write_access(h,tree->buffer); -+ else -+ return 0; -+} -+ -+static int inline -+ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) -+{ -+ if (tree->ops->mark_buffer_dirty) -+ return tree->ops->mark_buffer_dirty(h,tree->buffer); -+ else -+ return 
0; -+} -+ -+/* -+ * could return: -+ * - EROFS -+ * - ENOMEM -+ */ -+static int ext3_ext_get_access(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int err; -+ -+ if (path->p_bh) { -+ /* path points to block */ -+ err = ext3_journal_get_write_access(handle, path->p_bh); -+ } else { -+ /* path points to leaf/index in inode body */ -+ err = ext3_ext_get_access_for_root(handle, tree); -+ } -+ return err; -+} -+ -+/* -+ * could return: -+ * - EROFS -+ * - ENOMEM -+ * - EIO -+ */ -+static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int err; -+ if (path->p_bh) { -+ /* path points to block */ -+ err =ext3_journal_dirty_metadata(handle, path->p_bh); -+ } else { -+ /* path points to leaf/index in inode body */ -+ err = ext3_ext_mark_root_dirty(handle, tree); -+ } -+ return err; -+} -+ -+static int inline -+ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, struct ext3_extent *ex, -+ int *err) -+{ -+ int goal, depth, newblock; -+ struct inode *inode; -+ -+ EXT_ASSERT(tree); -+ if (tree->ops->new_block) -+ return tree->ops->new_block(handle, tree, path, ex, err); -+ -+ inode = tree->inode; -+ depth = EXT_DEPTH(tree); -+ if (path && depth > 0) { -+ goal = path[depth-1].p_block; -+ } else { -+ struct ext3_inode_info *ei = EXT3_I(inode); -+ unsigned long bg_start; -+ unsigned long colour; -+ -+ bg_start = (ei->i_block_group * -+ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + -+ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); -+ colour = (current->pid % 16) * -+ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); -+ goal = bg_start + colour; -+ } -+ -+ newblock = ext3_new_block(handle, inode, goal, err); -+ return newblock; -+} -+ -+static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) -+{ -+ struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); -+ neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 
-+ EXT_HDR_GEN_BITS) | -+ ((EXT_HDR_GEN(neh) + 1) & EXT_HDR_GEN_MASK); -+} -+ -+static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->inode->i_sb->s_blocksize - -+ sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent); -+#ifdef AGRESSIVE_TEST -+ size = 6; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->inode->i_sb->s_blocksize - -+ sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent_idx); -+#ifdef AGRESSIVE_TEST -+ size = 5; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent); -+#ifdef AGRESSIVE_TEST -+ size = 3; -+#endif -+ return size; -+} -+ -+static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) -+{ -+ int size; -+ -+ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / -+ sizeof(struct ext3_extent_idx); -+#ifdef AGRESSIVE_TEST -+ size = 4; -+#endif -+ return size; -+} -+ -+static void ext3_ext_show_path(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+#ifdef EXT_DEBUG -+ int k, l = path->p_depth; -+ -+ ext_debug(tree, "path:"); -+ for (k = 0; k <= l; k++, path++) { -+ if (path->p_idx) { -+ ext_debug(tree, " %d->%d", path->p_idx->ei_block, -+ path->p_idx->ei_leaf); -+ } else if (path->p_ext) { -+ ext_debug(tree, " %d:%d:%d", -+ path->p_ext->ee_block, -+ path->p_ext->ee_len, -+ path->p_ext->ee_start); -+ } else -+ ext_debug(tree, " []"); -+ } -+ ext_debug(tree, "\n"); -+#endif -+} -+ -+static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+#ifdef EXT_DEBUG -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent_header *eh; -+ struct ext3_extent *ex; -+ int i; -+ -+ if (!path) -+ return; -+ -+ eh = path[depth].p_hdr; -+ ex = 
EXT_FIRST_EXTENT(eh); -+ -+ for (i = 0; i < eh->eh_entries; i++, ex++) { -+ ext_debug(tree, "%d:%d:%d ", -+ ex->ee_block, ex->ee_len, ex->ee_start); -+ } -+ ext_debug(tree, "\n"); -+#endif -+} -+ -+static void ext3_ext_drop_refs(struct ext3_ext_path *path) -+{ -+ int depth = path->p_depth; -+ int i; -+ -+ for (i = 0; i <= depth; i++, path++) { -+ if (path->p_bh) { -+ brelse(path->p_bh); -+ path->p_bh = NULL; -+ } -+ } -+} -+ -+/* -+ * binary search for closest index by given block -+ */ -+static inline void -+ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, int block) -+{ -+ struct ext3_extent_header *eh = path->p_hdr; -+ struct ext3_extent_idx *ix; -+ int l = 0, k, r; -+ -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ EXT_ASSERT(eh->eh_entries > 0); -+ -+ ext_debug(tree, "binsearch for %d(idx): ", block); -+ -+ path->p_idx = ix = EXT_FIRST_INDEX(eh); -+ -+ r = k = eh->eh_entries; -+ while (k > 1) { -+ k = (r - l) / 2; -+ if (block < ix[l + k].ei_block) -+ r -= k; -+ else -+ l += k; -+ ext_debug(tree, "%d:%d:%d ", k, l, r); -+ } -+ -+ ix += l; -+ path->p_idx = ix; -+ ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf); -+ -+ while (l++ < r) { -+ if (block < ix->ei_block) -+ break; -+ path->p_idx = ix++; -+ } -+ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, -+ path->p_idx->ei_leaf); -+ -+#ifdef CHECK_BINSEARCH -+ { -+ struct ext3_extent_idx *chix; -+ -+ chix = ix = EXT_FIRST_INDEX(eh); -+ for (k = 0; k < eh->eh_entries; k++, ix++) { -+ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { -+ printk("k=%d, ix=0x%p, first=0x%p\n", k, -+ ix, EXT_FIRST_INDEX(eh)); -+ printk("%u <= %u\n", -+ ix->ei_block,ix[-1].ei_block); -+ } -+ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); -+ if (block < ix->ei_block) -+ break; -+ chix = ix; -+ } -+ EXT_ASSERT(chix == path->p_idx); -+ } -+#endif -+} -+ -+/* -+ * binary search for closest extent by given block -+ */ 
-+static inline void -+ext3_ext_binsearch(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, int block) -+{ -+ struct ext3_extent_header *eh = path->p_hdr; -+ struct ext3_extent *ex; -+ int l = 0, k, r; -+ -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ -+ if (eh->eh_entries == 0) { -+ /* -+ * this leaf is empty yet: -+ * we get such a leaf in split/add case -+ */ -+ return; -+ } -+ -+ ext_debug(tree, "binsearch for %d: ", block); -+ -+ path->p_ext = ex = EXT_FIRST_EXTENT(eh); -+ -+ r = k = eh->eh_entries; -+ while (k > 1) { -+ k = (r - l) / 2; -+ if (block < ex[l + k].ee_block) -+ r -= k; -+ else -+ l += k; -+ ext_debug(tree, "%d:%d:%d ", k, l, r); -+ } -+ -+ ex += l; -+ path->p_ext = ex; -+ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, -+ path->p_ext->ee_start, path->p_ext->ee_len); -+ -+ while (l++ < r) { -+ if (block < ex->ee_block) -+ break; -+ path->p_ext = ex++; -+ } -+ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, -+ path->p_ext->ee_start, path->p_ext->ee_len); -+ -+#ifdef CHECK_BINSEARCH -+ { -+ struct ext3_extent *chex; -+ -+ chex = ex = EXT_FIRST_EXTENT(eh); -+ for (k = 0; k < eh->eh_entries; k++, ex++) { -+ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); -+ if (block < ex->ee_block) -+ break; -+ chex = ex; -+ } -+ EXT_ASSERT(chex == path->p_ext); -+ } -+#endif -+} -+ -+int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) -+{ -+ struct ext3_extent_header *eh; -+ -+ BUG_ON(tree->buffer_len == 0); -+ ext3_ext_get_access_for_root(handle, tree); -+ eh = EXT_ROOT_HDR(tree); -+ eh->eh_depth = 0; -+ eh->eh_entries = 0; -+ eh->eh_magic = EXT3_EXT_MAGIC; -+ eh->eh_max = ext3_ext_space_root(tree); -+ ext3_ext_mark_root_dirty(handle, tree); -+ ext3_ext_invalidate_cache(tree); -+ return 0; -+} -+ -+struct ext3_ext_path * -+ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, -+ struct ext3_ext_path *path) -+{ -+ struct ext3_extent_header *eh; -+ 
struct buffer_head *bh; -+ int depth, i, ppos = 0; -+ -+ EXT_ASSERT(tree); -+ EXT_ASSERT(tree->inode); -+ EXT_ASSERT(tree->root); -+ -+ eh = EXT_ROOT_HDR(tree); -+ EXT_ASSERT(eh); -+ if (ext3_ext_check_header(eh)) { -+ /* don't free previously allocated path -+ * -- caller should take care */ -+ path = NULL; -+ goto err; -+ } -+ -+ i = depth = EXT_DEPTH(tree); -+ EXT_ASSERT(eh->eh_max); -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ -+ /* account possible depth increase */ -+ if (!path) { -+ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), -+ GFP_NOFS); -+ if (!path) -+ return ERR_PTR(-ENOMEM); -+ } -+ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); -+ path[0].p_hdr = eh; -+ -+ /* walk through the tree */ -+ while (i) { -+ ext_debug(tree, "depth %d: num %d, max %d\n", -+ ppos, eh->eh_entries, eh->eh_max); -+ ext3_ext_binsearch_idx(tree, path + ppos, block); -+ path[ppos].p_block = path[ppos].p_idx->ei_leaf; -+ path[ppos].p_depth = i; -+ path[ppos].p_ext = NULL; -+ -+ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); -+ if (!bh) -+ goto err; -+ -+ eh = EXT_BLOCK_HDR(bh); -+ ppos++; -+ EXT_ASSERT(ppos <= depth); -+ path[ppos].p_bh = bh; -+ path[ppos].p_hdr = eh; -+ i--; -+ -+ if (ext3_ext_check_header(eh)) -+ goto err; -+ } -+ -+ path[ppos].p_depth = i; -+ path[ppos].p_hdr = eh; -+ path[ppos].p_ext = NULL; -+ path[ppos].p_idx = NULL; -+ -+ if (ext3_ext_check_header(eh)) -+ goto err; -+ -+ /* find extent */ -+ ext3_ext_binsearch(tree, path + ppos, block); -+ -+ ext3_ext_show_path(tree, path); -+ -+ return path; -+ -+err: -+ printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ return ERR_PTR(-EIO); -+} -+ -+/* -+ * insert new index [logical;ptr] into the block at cupr -+ * it check where to insert: before curp or after curp -+ */ -+static int ext3_ext_insert_index(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *curp, -+ int logical, int ptr) 
-+{ -+ struct ext3_extent_idx *ix; -+ int len, err; -+ -+ if ((err = ext3_ext_get_access(handle, tree, curp))) -+ return err; -+ -+ EXT_ASSERT(logical != curp->p_idx->ei_block); -+ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; -+ if (logical > curp->p_idx->ei_block) { -+ /* insert after */ -+ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { -+ len = (len - 1) * sizeof(struct ext3_extent_idx); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert new index %d after: %d. " -+ "move %d from 0x%p to 0x%p\n", -+ logical, ptr, len, -+ (curp->p_idx + 1), (curp->p_idx + 2)); -+ memmove(curp->p_idx + 2, curp->p_idx + 1, len); -+ } -+ ix = curp->p_idx + 1; -+ } else { -+ /* insert before */ -+ len = len * sizeof(struct ext3_extent_idx); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert new index %d before: %d. " -+ "move %d from 0x%p to 0x%p\n", -+ logical, ptr, len, -+ curp->p_idx, (curp->p_idx + 1)); -+ memmove(curp->p_idx + 1, curp->p_idx, len); -+ ix = curp->p_idx; -+ } -+ -+ ix->ei_block = logical; -+ ix->ei_leaf = ptr; -+ ix->ei_leaf_hi = ix->ei_unused = 0; -+ curp->p_hdr->eh_entries++; -+ -+ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); -+ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); -+ -+ err = ext3_ext_dirty(handle, tree, curp); -+ ext3_std_error(tree->inode->i_sb, err); -+ -+ return err; -+} -+ -+/* -+ * routine inserts new subtree into the path, using free index entry -+ * at depth 'at: -+ * - allocates all needed blocks (new leaf and all intermediate index blocks) -+ * - makes decision where to split -+ * - moves remaining extens and index entries (right to the split point) -+ * into the newly allocated blocks -+ * - initialize subtree -+ */ -+static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext, int at) -+{ -+ struct buffer_head *bh = NULL; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent_header *neh; -+ struct ext3_extent_idx *fidx; -+ struct 
ext3_extent *ex; -+ int i = at, k, m, a; -+ unsigned long newblock, oldblock, border; -+ int *ablocks = NULL; /* array of allocated blocks */ -+ int err = 0; -+ -+ /* make decision: where to split? */ -+ /* FIXME: now desicion is simplest: at current extent */ -+ -+ /* if current leaf will be splitted, then we should use -+ * border from split point */ -+ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); -+ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { -+ border = path[depth].p_ext[1].ee_block; -+ ext_debug(tree, "leaf will be splitted." -+ " next leaf starts at %d\n", -+ (int)border); -+ } else { -+ border = newext->ee_block; -+ ext_debug(tree, "leaf will be added." -+ " next leaf starts at %d\n", -+ (int)border); -+ } -+ -+ /* -+ * if error occurs, then we break processing -+ * and turn filesystem read-only. so, index won't -+ * be inserted and tree will be in consistent -+ * state. next mount will repair buffers too -+ */ -+ -+ /* -+ * get array to track all allocated blocks -+ * we need this to handle errors and free blocks -+ * upon them -+ */ -+ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); -+ if (!ablocks) -+ return -ENOMEM; -+ memset(ablocks, 0, sizeof(unsigned long) * depth); -+ -+ /* allocate all needed blocks */ -+ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); -+ for (a = 0; a < depth - at; a++) { -+ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); -+ if (newblock == 0) -+ goto cleanup; -+ ablocks[a] = newblock; -+ } -+ -+ /* initialize new leaf */ -+ newblock = ablocks[--a]; -+ EXT_ASSERT(newblock); -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) -+ goto cleanup; -+ -+ neh = EXT_BLOCK_HDR(bh); -+ neh->eh_entries = 0; -+ neh->eh_max = ext3_ext_space_block(tree); -+ neh->eh_magic = EXT3_EXT_MAGIC; -+ neh->eh_depth = 0; -+ ex = 
EXT_FIRST_EXTENT(neh); -+ -+ /* move remain of path[depth] to the new leaf */ -+ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); -+ /* start copy from next extent */ -+ /* TODO: we could do it by single memmove */ -+ m = 0; -+ path[depth].p_ext++; -+ while (path[depth].p_ext <= -+ EXT_MAX_EXTENT(path[depth].p_hdr)) { -+ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", -+ path[depth].p_ext->ee_block, -+ path[depth].p_ext->ee_start, -+ path[depth].p_ext->ee_len, -+ newblock); -+ memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent)); -+ neh->eh_entries++; -+ m++; -+ } -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ goto cleanup; -+ brelse(bh); -+ bh = NULL; -+ -+ /* correct old leaf */ -+ if (m) { -+ if ((err = ext3_ext_get_access(handle, tree, path + depth))) -+ goto cleanup; -+ path[depth].p_hdr->eh_entries -= m; -+ if ((err = ext3_ext_dirty(handle, tree, path + depth))) -+ goto cleanup; -+ -+ } -+ -+ /* create intermediate indexes */ -+ k = depth - at - 1; -+ EXT_ASSERT(k >= 0); -+ if (k) -+ ext_debug(tree, "create %d intermediate indices\n", k); -+ /* insert new index into current index block */ -+ /* current depth stored in i var */ -+ i = depth - 1; -+ while (k--) { -+ oldblock = newblock; -+ newblock = ablocks[--a]; -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) -+ goto cleanup; -+ -+ neh = EXT_BLOCK_HDR(bh); -+ neh->eh_entries = 1; -+ neh->eh_magic = EXT3_EXT_MAGIC; -+ neh->eh_max = ext3_ext_space_block_idx(tree); -+ neh->eh_depth = depth - i; -+ fidx = EXT_FIRST_INDEX(neh); -+ fidx->ei_block = border; -+ fidx->ei_leaf = oldblock; -+ fidx->ei_leaf_hi = fidx->ei_unused = 0; -+ -+ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", -+ i, newblock, border, oldblock); -+ /* copy indexes */ -+ m = 0; -+ path[i].p_idx++; -+ -+ 
ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx, -+ EXT_MAX_INDEX(path[i].p_hdr)); -+ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == -+ EXT_LAST_INDEX(path[i].p_hdr)); -+ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { -+ ext_debug(tree, "%d: move %d:%d in new index %lu\n", -+ i, path[i].p_idx->ei_block, -+ path[i].p_idx->ei_leaf, newblock); -+ memmove(++fidx, path[i].p_idx++, -+ sizeof(struct ext3_extent_idx)); -+ neh->eh_entries++; -+ EXT_ASSERT(neh->eh_entries <= neh->eh_max); -+ m++; -+ } -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ goto cleanup; -+ brelse(bh); -+ bh = NULL; -+ -+ /* correct old index */ -+ if (m) { -+ err = ext3_ext_get_access(handle, tree, path + i); -+ if (err) -+ goto cleanup; -+ path[i].p_hdr->eh_entries -= m; -+ err = ext3_ext_dirty(handle, tree, path + i); -+ if (err) -+ goto cleanup; -+ } -+ -+ i--; -+ } -+ -+ /* insert new index */ -+ if (!err) -+ err = ext3_ext_insert_index(handle, tree, path + at, -+ border, newblock); -+ -+cleanup: -+ if (bh) { -+ if (buffer_locked(bh)) -+ unlock_buffer(bh); -+ brelse(bh); -+ } -+ -+ if (err) { -+ /* free all allocated blocks in error case */ -+ for (i = 0; i < depth; i++) { -+ if (!ablocks[i]) -+ continue; -+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); -+ } -+ } -+ kfree(ablocks); -+ -+ return err; -+} -+ -+/* -+ * routine implements tree growing procedure: -+ * - allocates new block -+ * - moves top-level data (index block or leaf) into the new block -+ * - initialize new top-level, creating index that points to the -+ * just created block -+ */ -+static int ext3_ext_grow_indepth(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) -+{ -+ struct ext3_ext_path *curp = path; -+ struct ext3_extent_header *neh; -+ struct ext3_extent_idx *fidx; -+ struct buffer_head *bh; -+ unsigned long newblock; -+ int err = 0; -+ -+ newblock = 
ext3_ext_new_block(handle, tree, path, newext, &err); -+ if (newblock == 0) -+ return err; -+ -+ bh = sb_getblk(tree->inode->i_sb, newblock); -+ if (!bh) { -+ err = -EIO; -+ ext3_std_error(tree->inode->i_sb, err); -+ return err; -+ } -+ lock_buffer(bh); -+ -+ if ((err = ext3_journal_get_create_access(handle, bh))) { -+ unlock_buffer(bh); -+ goto out; -+ } -+ -+ /* move top-level index/leaf into new block */ -+ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); -+ -+ /* set size of new block */ -+ neh = EXT_BLOCK_HDR(bh); -+ /* old root could have indexes or leaves -+ * so calculate eh_max right way */ -+ if (EXT_DEPTH(tree)) -+ neh->eh_max = ext3_ext_space_block_idx(tree); -+ else -+ neh->eh_max = ext3_ext_space_block(tree); -+ neh->eh_magic = EXT3_EXT_MAGIC; -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ if ((err = ext3_journal_dirty_metadata(handle, bh))) -+ goto out; -+ -+ /* create index in new top-level index: num,max,pointer */ -+ if ((err = ext3_ext_get_access(handle, tree, curp))) -+ goto out; -+ -+ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; -+ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); -+ curp->p_hdr->eh_entries = 1; -+ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); -+ /* FIXME: it works, but actually path[0] can be index */ -+ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; -+ curp->p_idx->ei_leaf = newblock; -+ curp->p_idx->ei_leaf_hi = curp->p_idx->ei_unused = 0; -+ -+ neh = EXT_ROOT_HDR(tree); -+ fidx = EXT_FIRST_INDEX(neh); -+ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", -+ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); -+ -+ neh->eh_depth = path->p_depth + 1; -+ err = ext3_ext_dirty(handle, tree, curp); -+out: -+ brelse(bh); -+ -+ return err; -+} -+ -+/* -+ * routine finds empty index and adds new leaf. 
if no free index found -+ * then it requests in-depth growing -+ */ -+static int ext3_ext_create_new_leaf(handle_t *handle, -+ struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) -+{ -+ struct ext3_ext_path *curp; -+ int depth, i, err = 0; -+ -+repeat: -+ i = depth = EXT_DEPTH(tree); -+ -+ /* walk up to the tree and look for free index entry */ -+ curp = path + depth; -+ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { -+ i--; -+ curp--; -+ } -+ -+ /* we use already allocated block for index block -+ * so, subsequent data blocks should be contigoues */ -+ if (EXT_HAS_FREE_INDEX(curp)) { -+ /* if we found index with free entry, then use that -+ * entry: create all needed subtree and add new leaf */ -+ err = ext3_ext_split(handle, tree, path, newext, i); -+ -+ /* refill path */ -+ ext3_ext_drop_refs(path); -+ path = ext3_ext_find_extent(tree, newext->ee_block, path); -+ if (IS_ERR(path)) -+ err = PTR_ERR(path); -+ } else { -+ /* tree is full, time to grow in depth */ -+ err = ext3_ext_grow_indepth(handle, tree, path, newext); -+ -+ /* refill path */ -+ ext3_ext_drop_refs(path); -+ path = ext3_ext_find_extent(tree, newext->ee_block, path); -+ if (IS_ERR(path)) -+ err = PTR_ERR(path); -+ -+ /* -+ * only first (depth 0 -> 1) produces free space -+ * in all other cases we have to split growed tree -+ */ -+ depth = EXT_DEPTH(tree); -+ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { -+ /* now we need split */ -+ goto repeat; -+ } -+ } -+ -+ if (err) -+ return err; -+ -+ return 0; -+} -+ -+/* -+ * returns allocated block in subsequent extent or EXT_MAX_BLOCK -+ * NOTE: it consider block number from index entry as -+ * allocated block. 
thus, index entries have to be consistent -+ * with leafs -+ */ -+static unsigned long -+ext3_ext_next_allocated_block(struct ext3_ext_path *path) -+{ -+ int depth; -+ -+ EXT_ASSERT(path != NULL); -+ depth = path->p_depth; -+ -+ if (depth == 0 && path->p_ext == NULL) -+ return EXT_MAX_BLOCK; -+ -+ /* FIXME: what if index isn't full ?! */ -+ while (depth >= 0) { -+ if (depth == path->p_depth) { -+ /* leaf */ -+ if (path[depth].p_ext != -+ EXT_LAST_EXTENT(path[depth].p_hdr)) -+ return path[depth].p_ext[1].ee_block; -+ } else { -+ /* index */ -+ if (path[depth].p_idx != -+ EXT_LAST_INDEX(path[depth].p_hdr)) -+ return path[depth].p_idx[1].ei_block; -+ } -+ depth--; -+ } -+ -+ return EXT_MAX_BLOCK; -+} -+ -+/* -+ * returns first allocated block from next leaf or EXT_MAX_BLOCK -+ */ -+static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int depth; -+ -+ EXT_ASSERT(path != NULL); -+ depth = path->p_depth; -+ -+ /* zero-tree has no leaf blocks at all */ -+ if (depth == 0) -+ return EXT_MAX_BLOCK; -+ -+ /* go to index block */ -+ depth--; -+ -+ while (depth >= 0) { -+ if (path[depth].p_idx != -+ EXT_LAST_INDEX(path[depth].p_hdr)) -+ return path[depth].p_idx[1].ei_block; -+ depth--; -+ } -+ -+ return EXT_MAX_BLOCK; -+} -+ -+/* -+ * if leaf gets modified and modified extent is first in the leaf -+ * then we have to correct all indexes above -+ * TODO: do we need to correct tree in all cases? 
-+ */ -+int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ struct ext3_extent_header *eh; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_extent *ex; -+ unsigned long border; -+ int k, err = 0; -+ -+ eh = path[depth].p_hdr; -+ ex = path[depth].p_ext; -+ EXT_ASSERT(ex); -+ EXT_ASSERT(eh); -+ -+ if (depth == 0) { -+ /* there is no tree at all */ -+ return 0; -+ } -+ -+ if (ex != EXT_FIRST_EXTENT(eh)) { -+ /* we correct tree if first leaf got modified only */ -+ return 0; -+ } -+ -+ /* -+ * TODO: we need correction if border is smaller then current one -+ */ -+ k = depth - 1; -+ border = path[depth].p_ext->ee_block; -+ if ((err = ext3_ext_get_access(handle, tree, path + k))) -+ return err; -+ path[k].p_idx->ei_block = border; -+ if ((err = ext3_ext_dirty(handle, tree, path + k))) -+ return err; -+ -+ while (k--) { -+ /* change all left-side indexes */ -+ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) -+ break; -+ if ((err = ext3_ext_get_access(handle, tree, path + k))) -+ break; -+ path[k].p_idx->ei_block = border; -+ if ((err = ext3_ext_dirty(handle, tree, path + k))) -+ break; -+ } -+ -+ return err; -+} -+ -+static int inline -+ext3_can_extents_be_merged(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex1, -+ struct ext3_extent *ex2) -+{ -+ if (ex1->ee_block + ex1->ee_len != ex2->ee_block) -+ return 0; -+ -+#ifdef AGRESSIVE_TEST -+ if (ex1->ee_len >= 4) -+ return 0; -+#endif -+ -+ if (!tree->ops->mergable) -+ return 1; -+ -+ return tree->ops->mergable(ex1, ex2); -+} -+ -+/* -+ * this routine tries to merge requsted extent into the existing -+ * extent or inserts requested extent as new one into the tree, -+ * creating new leaf in no-space case -+ */ -+int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *newext) -+{ -+ struct ext3_extent_header * eh; -+ struct ext3_extent *ex, *fex; -+ struct ext3_extent 
*nearex; /* nearest extent */ -+ struct ext3_ext_path *npath = NULL; -+ int depth, len, err, next; -+ -+ EXT_ASSERT(newext->ee_len > 0); -+ depth = EXT_DEPTH(tree); -+ ex = path[depth].p_ext; -+ EXT_ASSERT(path[depth].p_hdr); -+ -+ /* try to insert block into found extent and return */ -+ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { -+ ext_debug(tree, "append %d block to %d:%d (from %d)\n", -+ newext->ee_len, ex->ee_block, ex->ee_len, -+ ex->ee_start); -+ if ((err = ext3_ext_get_access(handle, tree, path + depth))) -+ return err; -+ ex->ee_len += newext->ee_len; -+ eh = path[depth].p_hdr; -+ nearex = ex; -+ goto merge; -+ } -+ -+repeat: -+ depth = EXT_DEPTH(tree); -+ eh = path[depth].p_hdr; -+ if (eh->eh_entries < eh->eh_max) -+ goto has_space; -+ -+ /* probably next leaf has space for us? */ -+ fex = EXT_LAST_EXTENT(eh); -+ next = ext3_ext_next_leaf_block(tree, path); -+ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { -+ ext_debug(tree, "next leaf block - %d\n", next); -+ EXT_ASSERT(!npath); -+ npath = ext3_ext_find_extent(tree, next, NULL); -+ if (IS_ERR(npath)) -+ return PTR_ERR(npath); -+ EXT_ASSERT(npath->p_depth == path->p_depth); -+ eh = npath[depth].p_hdr; -+ if (eh->eh_entries < eh->eh_max) { -+ ext_debug(tree, "next leaf isnt full(%d)\n", -+ eh->eh_entries); -+ path = npath; -+ goto repeat; -+ } -+ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", -+ eh->eh_entries, eh->eh_max); -+ } -+ -+ /* -+ * there is no free space in found leaf -+ * we're gonna add new leaf in the tree -+ */ -+ err = ext3_ext_create_new_leaf(handle, tree, path, newext); -+ if (err) -+ goto cleanup; -+ depth = EXT_DEPTH(tree); -+ eh = path[depth].p_hdr; -+ -+has_space: -+ nearex = path[depth].p_ext; -+ -+ if ((err = ext3_ext_get_access(handle, tree, path + depth))) -+ goto cleanup; -+ -+ if (!nearex) { -+ /* there is no extent in this leaf, create first one */ -+ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", -+ newext->ee_block, 
newext->ee_start, -+ newext->ee_len); -+ path[depth].p_ext = EXT_FIRST_EXTENT(eh); -+ } else if (newext->ee_block > nearex->ee_block) { -+ EXT_ASSERT(newext->ee_block != nearex->ee_block); -+ if (nearex != EXT_LAST_EXTENT(eh)) { -+ len = EXT_MAX_EXTENT(eh) - nearex; -+ len = (len - 1) * sizeof(struct ext3_extent); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " -+ "move %d from 0x%p to 0x%p\n", -+ newext->ee_block, newext->ee_start, -+ newext->ee_len, -+ nearex, len, nearex + 1, nearex + 2); -+ memmove(nearex + 2, nearex + 1, len); -+ } -+ path[depth].p_ext = nearex + 1; -+ } else { -+ EXT_ASSERT(newext->ee_block != nearex->ee_block); -+ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); -+ len = len < 0 ? 0 : len; -+ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " -+ "move %d from 0x%p to 0x%p\n", -+ newext->ee_block, newext->ee_start, newext->ee_len, -+ nearex, len, nearex + 1, nearex + 2); -+ memmove(nearex + 1, nearex, len); -+ path[depth].p_ext = nearex; -+ } -+ -+ eh->eh_entries++; -+ nearex = path[depth].p_ext; -+ nearex->ee_block = newext->ee_block; -+ nearex->ee_start = newext->ee_start; -+ nearex->ee_len = newext->ee_len; -+ /* FIXME: support for large fs */ -+ nearex->ee_start_hi = 0; -+ -+merge: -+ /* try to merge extents to the right */ -+ while (nearex < EXT_LAST_EXTENT(eh)) { -+ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) -+ break; -+ /* merge with next extent! 
*/ -+ nearex->ee_len += nearex[1].ee_len; -+ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { -+ len = (EXT_LAST_EXTENT(eh) - nearex - 1) * -+ sizeof(struct ext3_extent); -+ memmove(nearex + 1, nearex + 2, len); -+ } -+ eh->eh_entries--; -+ EXT_ASSERT(eh->eh_entries > 0); -+ } -+ -+ /* try to merge extents to the left */ -+ -+ /* time to correct all indexes above */ -+ err = ext3_ext_correct_indexes(handle, tree, path); -+ if (err) -+ goto cleanup; -+ -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ -+cleanup: -+ if (npath) { -+ ext3_ext_drop_refs(npath); -+ kfree(npath); -+ } -+ ext3_ext_tree_changed(tree); -+ ext3_ext_invalidate_cache(tree); -+ return err; -+} -+ -+int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, -+ unsigned long num, ext_prepare_callback func) -+{ -+ struct ext3_ext_path *path = NULL; -+ struct ext3_ext_cache cbex; -+ struct ext3_extent *ex; -+ unsigned long next, start = 0, end = 0; -+ unsigned long last = block + num; -+ int depth, exists, err = 0; -+ -+ EXT_ASSERT(tree); -+ EXT_ASSERT(func); -+ EXT_ASSERT(tree->inode); -+ EXT_ASSERT(tree->root); -+ -+ while (block < last && block != EXT_MAX_BLOCK) { -+ num = last - block; -+ /* find extent for this block */ -+ path = ext3_ext_find_extent(tree, block, path); -+ if (IS_ERR(path)) { -+ err = PTR_ERR(path); -+ path = NULL; -+ break; -+ } -+ -+ depth = EXT_DEPTH(tree); -+ EXT_ASSERT(path[depth].p_hdr); -+ ex = path[depth].p_ext; -+ next = ext3_ext_next_allocated_block(path); -+ -+ exists = 0; -+ if (!ex) { -+ /* there is no extent yet, so try to allocate -+ * all requested space */ -+ start = block; -+ end = block + num; -+ } else if (ex->ee_block > block) { -+ /* need to allocate space before found extent */ -+ start = block; -+ end = ex->ee_block; -+ if (block + num < end) -+ end = block + num; -+ } else if (block >= ex->ee_block + ex->ee_len) { -+ /* need to allocate space after found extent */ -+ start = block; -+ end = block + num; -+ if (end >= next) -+ end = next; 
-+ } else if (block >= ex->ee_block) { -+ /* -+ * some part of requested space is covered -+ * by found extent -+ */ -+ start = block; -+ end = ex->ee_block + ex->ee_len; -+ if (block + num < end) -+ end = block + num; -+ exists = 1; -+ } else { -+ BUG(); -+ } -+ EXT_ASSERT(end > start); -+ -+ if (!exists) { -+ cbex.ec_block = start; -+ cbex.ec_len = end - start; -+ cbex.ec_start = 0; -+ cbex.ec_type = EXT3_EXT_CACHE_GAP; -+ } else { -+ cbex.ec_block = ex->ee_block; -+ cbex.ec_len = ex->ee_len; -+ cbex.ec_start = ex->ee_start; -+ cbex.ec_type = EXT3_EXT_CACHE_EXTENT; -+ } -+ -+ EXT_ASSERT(cbex.ec_len > 0); -+ EXT_ASSERT(path[depth].p_hdr); -+ err = func(tree, path, &cbex); -+ ext3_ext_drop_refs(path); -+ -+ if (err < 0) -+ break; -+ if (err == EXT_REPEAT) -+ continue; -+ else if (err == EXT_BREAK) { -+ err = 0; -+ break; -+ } -+ -+ if (EXT_DEPTH(tree) != depth) { -+ /* depth was changed. we have to realloc path */ -+ kfree(path); -+ path = NULL; -+ } -+ -+ block = cbex.ec_block + cbex.ec_len; -+ } -+ -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ -+ return err; -+} -+ -+static inline void -+ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, -+ __u32 len, __u32 start, int type) -+{ -+ EXT_ASSERT(len > 0); -+ if (tree->cex) { -+ tree->cex->ec_type = type; -+ tree->cex->ec_block = block; -+ tree->cex->ec_len = len; -+ tree->cex->ec_start = start; -+ } -+} -+ -+/* -+ * this routine calculate boundaries of the gap requested block fits into -+ * and cache this gap -+ */ -+static inline void -+ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ unsigned long block) -+{ -+ int depth = EXT_DEPTH(tree); -+ unsigned long lblock, len; -+ struct ext3_extent *ex; -+ -+ if (!tree->cex) -+ return; -+ -+ ex = path[depth].p_ext; -+ if (ex == NULL) { -+ /* there is no extent yet, so gap is [0;-] */ -+ lblock = 0; -+ len = EXT_MAX_BLOCK; -+ ext_debug(tree, "cache gap(whole file):"); -+ } else if (block < 
ex->ee_block) { -+ lblock = block; -+ len = ex->ee_block - block; -+ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", -+ (unsigned long) block, -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len); -+ } else if (block >= ex->ee_block + ex->ee_len) { -+ lblock = ex->ee_block + ex->ee_len; -+ len = ext3_ext_next_allocated_block(path); -+ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len, -+ (unsigned long) block); -+ EXT_ASSERT(len > lblock); -+ len = len - lblock; -+ } else { -+ lblock = len = 0; -+ BUG(); -+ } -+ -+ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len); -+ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP); -+} -+ -+static inline int -+ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, -+ struct ext3_extent *ex) -+{ -+ struct ext3_ext_cache *cex = tree->cex; -+ -+ /* is there cache storage at all? */ -+ if (!cex) -+ return EXT3_EXT_CACHE_NO; -+ -+ /* has cache valid data? */ -+ if (cex->ec_type == EXT3_EXT_CACHE_NO) -+ return EXT3_EXT_CACHE_NO; -+ -+ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || -+ cex->ec_type == EXT3_EXT_CACHE_EXTENT); -+ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { -+ ex->ee_block = cex->ec_block; -+ ex->ee_start = cex->ec_start; -+ ex->ee_start_hi = 0; -+ ex->ee_len = cex->ec_len; -+ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", -+ (unsigned long) block, -+ (unsigned long) ex->ee_block, -+ (unsigned long) ex->ee_len, -+ (unsigned long) ex->ee_start); -+ return cex->ec_type; -+ } -+ -+ /* not in cache */ -+ return EXT3_EXT_CACHE_NO; -+} -+ -+/* -+ * routine removes index from the index block -+ * it's used in truncate case only. 
thus all requests are for -+ * last index in the block only -+ */ -+int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ struct buffer_head *bh; -+ int err; -+ -+ /* free index block */ -+ path--; -+ EXT_ASSERT(path->p_hdr->eh_entries); -+ if ((err = ext3_ext_get_access(handle, tree, path))) -+ return err; -+ path->p_hdr->eh_entries--; -+ if ((err = ext3_ext_dirty(handle, tree, path))) -+ return err; -+ ext_debug(tree, "index is empty, remove it, free block %d\n", -+ path->p_idx->ei_leaf); -+ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); -+ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); -+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); -+ return err; -+} -+ -+int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path) -+{ -+ int depth = EXT_DEPTH(tree); -+ int needed; -+ -+ if (path) { -+ /* probably there is space in leaf? */ -+ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) -+ return 1; -+ } -+ -+ /* -+ * the worste case we're expecting is creation of the -+ * new root (growing in depth) with index splitting -+ * for splitting we have to consider depth + 1 because -+ * previous growing could increase it -+ */ -+ depth = depth + 1; -+ -+ /* -+ * growing in depth: -+ * block allocation + new root + old root -+ */ -+ needed = EXT3_ALLOC_NEEDED + 2; -+ -+ /* index split. 
we may need: -+ * allocate intermediate indexes and new leaf -+ * change two blocks at each level, but root -+ * modify root block (inode) -+ */ -+ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; -+ -+ return needed; -+} -+ -+static int -+ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, unsigned long start, -+ unsigned long end) -+{ -+ struct ext3_extent *ex, tex; -+ struct ext3_ext_path *npath; -+ int depth, creds, err; -+ -+ depth = EXT_DEPTH(tree); -+ ex = path[depth].p_ext; -+ EXT_ASSERT(ex); -+ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); -+ EXT_ASSERT(ex->ee_block < start); -+ -+ /* calculate tail extent */ -+ tex.ee_block = end + 1; -+ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); -+ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; -+ -+ creds = ext3_ext_calc_credits_for_insert(tree, path); -+ handle = ext3_ext_journal_restart(handle, creds); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ /* calculate head extent. use primary extent */ -+ err = ext3_ext_get_access(handle, tree, path + depth); -+ if (err) -+ return err; -+ ex->ee_len = start - ex->ee_block; -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ if (err) -+ return err; -+ -+ /* FIXME: some callback to free underlying resource -+ * and correct ee_start? 
*/ -+ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", -+ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); -+ -+ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); -+ if (IS_ERR(npath)) -+ return PTR_ERR(npath); -+ depth = EXT_DEPTH(tree); -+ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); -+ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); -+ -+ err = ext3_ext_insert_extent(handle, tree, npath, &tex); -+ ext3_ext_drop_refs(npath); -+ kfree(npath); -+ -+ return err; -+} -+ -+static int -+ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, unsigned long start, -+ unsigned long end) -+{ -+ struct ext3_extent *ex, *fu = NULL, *lu, *le; -+ int err = 0, correct_index = 0; -+ int depth = EXT_DEPTH(tree), credits; -+ struct ext3_extent_header *eh; -+ unsigned a, b, block, num; -+ -+ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); -+ if (!path[depth].p_hdr) -+ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); -+ eh = path[depth].p_hdr; -+ EXT_ASSERT(eh); -+ EXT_ASSERT(eh->eh_entries <= eh->eh_max); -+ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -+ -+ /* find where to start removing */ -+ le = ex = EXT_LAST_EXTENT(eh); -+ while (ex != EXT_FIRST_EXTENT(eh)) { -+ if (ex->ee_block <= end) -+ break; -+ ex--; -+ } -+ -+ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { -+ /* removal of internal part of the extent requested -+ * tail and head must be placed in different extent -+ * so, we have to insert one more extent */ -+ path[depth].p_ext = ex; -+ return ext3_ext_split_for_rm(handle, tree, path, start, end); -+ } -+ -+ lu = ex; -+ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) { -+ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); -+ path[depth].p_ext = ex; -+ -+ a = ex->ee_block > start ? ex->ee_block : start; -+ b = ex->ee_block + ex->ee_len - 1 < end ? 
-+ ex->ee_block + ex->ee_len - 1 : end; -+ -+ ext_debug(tree, " border %u:%u\n", a, b); -+ -+ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { -+ block = 0; -+ num = 0; -+ BUG(); -+ } else if (a != ex->ee_block) { -+ /* remove tail of the extent */ -+ block = ex->ee_block; -+ num = a - block; -+ } else if (b != ex->ee_block + ex->ee_len - 1) { -+ /* remove head of the extent */ -+ block = a; -+ num = b - a; -+ } else { -+ /* remove whole extent: excelent! */ -+ block = ex->ee_block; -+ num = 0; -+ EXT_ASSERT(a == ex->ee_block && -+ b == ex->ee_block + ex->ee_len - 1); -+ } -+ -+ if (ex == EXT_FIRST_EXTENT(eh)) -+ correct_index = 1; -+ -+ credits = 1; -+ if (correct_index) -+ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; -+ if (tree->ops->remove_extent_credits) -+ credits+=tree->ops->remove_extent_credits(tree,ex,a,b); -+ -+ handle = ext3_ext_journal_restart(handle, credits); -+ if (IS_ERR(handle)) { -+ err = PTR_ERR(handle); -+ goto out; -+ } -+ -+ err = ext3_ext_get_access(handle, tree, path + depth); -+ if (err) -+ goto out; -+ -+ if (tree->ops->remove_extent) -+ err = tree->ops->remove_extent(tree, ex, a, b); -+ if (err) -+ goto out; -+ -+ if (num == 0) { -+ /* this extent is removed entirely mark slot unused */ -+ ex->ee_start = ex->ee_start_hi = 0; -+ eh->eh_entries--; -+ fu = ex; -+ } -+ -+ ex->ee_block = block; -+ ex->ee_len = num; -+ -+ err = ext3_ext_dirty(handle, tree, path + depth); -+ if (err) -+ goto out; -+ -+ ext_debug(tree, "new extent: %u:%u:%u\n", -+ ex->ee_block, ex->ee_len, ex->ee_start); -+ ex--; -+ } -+ -+ if (fu) { -+ /* reuse unused slots */ -+ while (lu < le) { -+ if (lu->ee_start) { -+ *fu = *lu; -+ lu->ee_start = lu->ee_start_hi = 0; -+ fu++; -+ } -+ lu++; -+ } -+ } -+ -+ if (correct_index && eh->eh_entries) -+ err = ext3_ext_correct_indexes(handle, tree, path); -+ -+ /* if this leaf is free, then we should -+ * remove it from index block above */ -+ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) -+ 
err = ext3_ext_rm_idx(handle, tree, path + depth); -+ -+out: -+ return err; -+} -+ -+ -+static struct ext3_extent_idx * -+ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) -+{ -+ struct ext3_extent_idx *ix; -+ -+ ix = EXT_LAST_INDEX(hdr); -+ while (ix != EXT_FIRST_INDEX(hdr)) { -+ if (ix->ei_block <= block) -+ break; -+ ix--; -+ } -+ return ix; -+} -+ -+/* -+ * returns 1 if current index have to be freed (even partial) -+ */ -+static int inline -+ext3_ext_more_to_rm(struct ext3_ext_path *path) -+{ -+ EXT_ASSERT(path->p_idx); -+ -+ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) -+ return 0; -+ -+ /* -+ * if truncate on deeper level happened it it wasn't partial -+ * so we have to consider current index for truncation -+ */ -+ if (path->p_hdr->eh_entries == path->p_block) -+ return 0; -+ return 1; -+} -+ -+int ext3_ext_remove_space(struct ext3_extents_tree *tree, -+ unsigned long start, unsigned long end) -+{ -+ struct inode *inode = tree->inode; -+ struct super_block *sb = inode->i_sb; -+ int depth = EXT_DEPTH(tree); -+ struct ext3_ext_path *path; -+ handle_t *handle; -+ int i = 0, err = 0; -+ -+ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); -+ -+ /* probably first extent we're gonna free will be last in block */ -+ handle = ext3_journal_start(inode, depth + 1); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ ext3_ext_invalidate_cache(tree); -+ -+ /* -+ * we start scanning from right side freeing all the blocks -+ * after i_size and walking into the deep -+ */ -+ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); -+ if (IS_ERR(path)) { -+ ext3_error(sb, __FUNCTION__, "Can't allocate path array"); -+ ext3_journal_stop(handle); -+ return -ENOMEM; -+ } -+ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); -+ path[i].p_hdr = EXT_ROOT_HDR(tree); -+ -+ while (i >= 0 && err == 0) { -+ if (i == depth) { -+ /* this is leaf block */ -+ err = ext3_ext_rm_leaf(handle, tree, path, start, end); -+ 
/* root level have p_bh == NULL, brelse() eats this */ -+ brelse(path[i].p_bh); -+ i--; -+ continue; -+ } -+ -+ /* this is index block */ -+ if (!path[i].p_hdr) { -+ ext_debug(tree, "initialize header\n"); -+ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); -+ } -+ -+ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); -+ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); -+ -+ if (!path[i].p_idx) { -+ /* this level hasn't touched yet */ -+ path[i].p_idx = -+ ext3_ext_last_covered(path[i].p_hdr, end); -+ path[i].p_block = path[i].p_hdr->eh_entries + 1; -+ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", -+ path[i].p_hdr, path[i].p_hdr->eh_entries); -+ } else { -+ /* we've already was here, see at next index */ -+ path[i].p_idx--; -+ } -+ -+ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", -+ i, EXT_FIRST_INDEX(path[i].p_hdr), -+ path[i].p_idx); -+ if (ext3_ext_more_to_rm(path + i)) { -+ /* go to the next level */ -+ ext_debug(tree, "move to level %d (block %d)\n", -+ i + 1, path[i].p_idx->ei_leaf); -+ memset(path + i + 1, 0, sizeof(*path)); -+ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); -+ if (!path[i+1].p_bh) { -+ /* should we reset i_size? 
*/ -+ err = -EIO; -+ break; -+ } -+ /* put actual number of indexes to know is this -+ * number got changed at the next iteration */ -+ path[i].p_block = path[i].p_hdr->eh_entries; -+ i++; -+ } else { -+ /* we finish processing this index, go up */ -+ if (path[i].p_hdr->eh_entries == 0 && i > 0) { -+ /* index is empty, remove it -+ * handle must be already prepared by the -+ * truncatei_leaf() */ -+ err = ext3_ext_rm_idx(handle, tree, path + i); -+ } -+ /* root level have p_bh == NULL, brelse() eats this */ -+ brelse(path[i].p_bh); -+ i--; -+ ext_debug(tree, "return to level %d\n", i); -+ } -+ } -+ -+ /* TODO: flexible tree reduction should be here */ -+ if (path->p_hdr->eh_entries == 0) { -+ /* -+ * truncate to zero freed all the tree -+ * so, we need to correct eh_depth -+ */ -+ err = ext3_ext_get_access(handle, tree, path); -+ if (err == 0) { -+ EXT_ROOT_HDR(tree)->eh_depth = 0; -+ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree); -+ err = ext3_ext_dirty(handle, tree, path); -+ } -+ } -+ ext3_ext_tree_changed(tree); -+ -+ kfree(path); -+ ext3_journal_stop(handle); -+ -+ return err; -+} -+ -+int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks) -+{ -+ int lcap, icap, rcap, leafs, idxs, num; -+ -+ rcap = ext3_ext_space_root(tree); -+ if (blocks <= rcap) { -+ /* all extents fit to the root */ -+ return 0; -+ } -+ -+ rcap = ext3_ext_space_root_idx(tree); -+ lcap = ext3_ext_space_block(tree); -+ icap = ext3_ext_space_block_idx(tree); -+ -+ num = leafs = (blocks + lcap - 1) / lcap; -+ if (leafs <= rcap) { -+ /* all pointers to leafs fit to the root */ -+ return leafs; -+ } -+ -+ /* ok. 
we need separate index block(s) to link all leaf blocks */ -+ idxs = (leafs + icap - 1) / icap; -+ do { -+ num += idxs; -+ idxs = (idxs + icap - 1) / icap; -+ } while (idxs > rcap); -+ -+ return num; -+} -+ -+/* -+ * called at mount time -+ */ -+void ext3_ext_init(struct super_block *sb) -+{ -+ /* -+ * possible initialization would be here -+ */ -+ -+ if (test_opt(sb, EXTENTS)) { -+ printk("EXT3-fs: file extents enabled"); -+#ifdef AGRESSIVE_TEST -+ printk(", agressive tests"); -+#endif -+#ifdef CHECK_BINSEARCH -+ printk(", check binsearch"); -+#endif -+ printk("\n"); -+ } -+} -+ -+/* -+ * called at umount time -+ */ -+void ext3_ext_release(struct super_block *sb) -+{ -+} -+ -+/************************************************************************ -+ * VFS related routines -+ ************************************************************************/ -+ -+static int ext3_get_inode_write_access(handle_t *handle, void *buffer) -+{ -+ /* we use in-core data, not bh */ -+ return 0; -+} -+ -+static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) -+{ -+ struct inode *inode = buffer; -+ return ext3_mark_inode_dirty(handle, inode); -+} -+ -+static int ext3_ext_mergable(struct ext3_extent *ex1, -+ struct ext3_extent *ex2) -+{ -+ /* FIXME: support for large fs */ -+ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) -+ return 1; -+ return 0; -+} -+ -+static int -+ext3_remove_blocks_credits(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex, -+ unsigned long from, unsigned long to) -+{ -+ int needed; -+ -+ /* at present, extent can't cross block group */; -+ needed = 4; /* bitmap + group desc + sb + inode */ -+ -+#ifdef CONFIG_QUOTA -+ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; -+#endif -+ return needed; -+} -+ -+static int -+ext3_remove_blocks(struct ext3_extents_tree *tree, -+ struct ext3_extent *ex, -+ unsigned long from, unsigned long to) -+{ -+ int needed = ext3_remove_blocks_credits(tree, ex, from, to); -+ handle_t *handle = 
ext3_journal_start(tree->inode, needed); -+ struct buffer_head *bh; -+ int i; -+ -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { -+ /* tail removal */ -+ unsigned long num, start; -+ num = ex->ee_block + ex->ee_len - from; -+ start = ex->ee_start + ex->ee_len - num; -+ ext_debug(tree, "free last %lu blocks starting %lu\n", -+ num, start); -+ for (i = 0; i < num; i++) { -+ bh = sb_find_get_block(tree->inode->i_sb, start + i); -+ ext3_forget(handle, 0, tree->inode, bh, start + i); -+ } -+ ext3_free_blocks(handle, tree->inode, start, num); -+ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { -+ printk("strange request: removal %lu-%lu from %u:%u\n", -+ from, to, ex->ee_block, ex->ee_len); -+ } else { -+ printk("strange request: removal(2) %lu-%lu from %u:%u\n", -+ from, to, ex->ee_block, ex->ee_len); -+ } -+ ext3_journal_stop(handle); -+ return 0; -+} -+ -+static int ext3_ext_find_goal(struct inode *inode, -+ struct ext3_ext_path *path, unsigned long block) -+{ -+ struct ext3_inode_info *ei = EXT3_I(inode); -+ unsigned long bg_start; -+ unsigned long colour; -+ int depth; -+ -+ if (path) { -+ struct ext3_extent *ex; -+ depth = path->p_depth; -+ -+ /* try to predict block placement */ -+ if ((ex = path[depth].p_ext)) -+ return ex->ee_start + (block - ex->ee_block); -+ -+ /* it looks index is empty -+ * try to find starting from index itself */ -+ if (path[depth].p_bh) -+ return path[depth].p_bh->b_blocknr; -+ } -+ -+ /* OK. 
use inode's group */ -+ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + -+ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); -+ colour = (current->pid % 16) * -+ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); -+ return bg_start + colour + block; -+} -+ -+static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_extent *ex, int *err) -+{ -+ struct inode *inode = tree->inode; -+ int newblock, goal; -+ -+ EXT_ASSERT(path); -+ EXT_ASSERT(ex); -+ EXT_ASSERT(ex->ee_start); -+ EXT_ASSERT(ex->ee_len); -+ -+ /* reuse block from the extent to order data/metadata */ -+ newblock = ex->ee_start++; -+ ex->ee_len--; -+ if (ex->ee_len == 0) { -+ ex->ee_len = 1; -+ /* allocate new block for the extent */ -+ goal = ext3_ext_find_goal(inode, path, ex->ee_block); -+ ex->ee_start = ext3_new_block(handle, inode, goal, err); -+ ex->ee_start_hi = 0; -+ if (ex->ee_start == 0) { -+ /* error occured: restore old extent */ -+ ex->ee_start = newblock; -+ return 0; -+ } -+ } -+ return newblock; -+} -+ -+static struct ext3_extents_helpers ext3_blockmap_helpers = { -+ .get_write_access = ext3_get_inode_write_access, -+ .mark_buffer_dirty = ext3_mark_buffer_dirty, -+ .mergable = ext3_ext_mergable, -+ .new_block = ext3_new_block_cb, -+ .remove_extent = ext3_remove_blocks, -+ .remove_extent_credits = ext3_remove_blocks_credits, -+}; -+ -+void ext3_init_tree_desc(struct ext3_extents_tree *tree, -+ struct inode *inode) -+{ -+ tree->inode = inode; -+ tree->root = (void *) EXT3_I(inode)->i_data; -+ tree->buffer = (void *) inode; -+ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); -+ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; -+ tree->ops = &ext3_blockmap_helpers; -+} -+ -+int ext3_ext_get_block(handle_t *handle, struct inode *inode, -+ long iblock, struct buffer_head *bh_result, -+ int create, int extend_disksize) -+{ -+ struct ext3_ext_path *path = NULL; -+ struct ext3_extent 
newex; -+ struct ext3_extent *ex; -+ int goal, newblock, err = 0, depth; -+ struct ext3_extents_tree tree; -+ -+ clear_buffer_new(bh_result); -+ ext3_init_tree_desc(&tree, inode); -+ ext_debug(&tree, "block %d requested for inode %u\n", -+ (int) iblock, (unsigned) inode->i_ino); -+ down(&EXT3_I(inode)->truncate_sem); -+ -+ /* check in cache */ -+ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { -+ if (goal == EXT3_EXT_CACHE_GAP) { -+ if (!create) { -+ /* block isn't allocated yet and -+ * user don't want to allocate it */ -+ goto out2; -+ } -+ /* we should allocate requested block */ -+ } else if (goal == EXT3_EXT_CACHE_EXTENT) { -+ /* block is already allocated */ -+ newblock = iblock - newex.ee_block + newex.ee_start; -+ goto out; -+ } else { -+ EXT_ASSERT(0); -+ } -+ } -+ -+ /* find extent for this block */ -+ path = ext3_ext_find_extent(&tree, iblock, NULL); -+ if (IS_ERR(path)) { -+ err = PTR_ERR(path); -+ path = NULL; -+ goto out2; -+ } -+ -+ depth = EXT_DEPTH(&tree); -+ -+ /* -+ * consistent leaf must not be empty -+ * this situations is possible, though, _during_ tree modification -+ * this is why assert can't be put in ext3_ext_find_extent() -+ */ -+ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); -+ -+ if ((ex = path[depth].p_ext)) { -+ /* if found exent covers block, simple return it */ -+ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { -+ newblock = iblock - ex->ee_block + ex->ee_start; -+ ext_debug(&tree, "%d fit into %d:%d -> %d\n", -+ (int) iblock, ex->ee_block, ex->ee_len, -+ newblock); -+ ext3_ext_put_in_cache(&tree, ex->ee_block, -+ ex->ee_len, ex->ee_start, -+ EXT3_EXT_CACHE_EXTENT); -+ goto out; -+ } -+ } -+ -+ /* -+ * requested block isn't allocated yet -+ * we couldn't try to create block if create flag is zero -+ */ -+ if (!create) { -+ /* put just found gap into cache to speedup subsequest reqs */ -+ ext3_ext_put_gap_in_cache(&tree, path, iblock); -+ goto out2; -+ } -+ -+ /* allocate new block */ -+ goal = 
ext3_ext_find_goal(inode, path, iblock); -+ newblock = ext3_new_block(handle, inode, goal, &err); -+ if (!newblock) -+ goto out2; -+ ext_debug(&tree, "allocate new block: goal %d, found %d\n", -+ goal, newblock); -+ -+ /* try to insert new extent into found leaf and return */ -+ newex.ee_block = iblock; -+ newex.ee_start = newblock; -+ newex.ee_start_hi = 0; -+ newex.ee_len = 1; -+ err = ext3_ext_insert_extent(handle, &tree, path, &newex); -+ if (err) -+ goto out2; -+ -+ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ -+ /* previous routine could use block we allocated */ -+ newblock = newex.ee_start; -+ set_buffer_new(bh_result); -+ -+ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, -+ newex.ee_start, EXT3_EXT_CACHE_EXTENT); -+out: -+ ext3_ext_show_leaf(&tree, path); -+ map_bh(bh_result, inode->i_sb, newblock); -+out2: -+ if (path) { -+ ext3_ext_drop_refs(path); -+ kfree(path); -+ } -+ up(&EXT3_I(inode)->truncate_sem); -+ -+ return err; -+} -+ -+void ext3_ext_truncate(struct inode * inode, struct page *page) -+{ -+ struct address_space *mapping = inode->i_mapping; -+ struct super_block *sb = inode->i_sb; -+ struct ext3_extents_tree tree; -+ unsigned long last_block; -+ handle_t *handle; -+ int err = 0; -+ -+ ext3_init_tree_desc(&tree, inode); -+ -+ /* -+ * probably first extent we're gonna free will be last in block -+ */ -+ err = ext3_writepage_trans_blocks(inode) + 3; -+ handle = ext3_journal_start(inode, err); -+ if (IS_ERR(handle)) { -+ if (page) { -+ clear_highpage(page); -+ flush_dcache_page(page); -+ unlock_page(page); -+ page_cache_release(page); -+ } -+ return; -+ } -+ -+ if (page) -+ ext3_block_truncate_page(handle, page, mapping, inode->i_size); -+ -+ down(&EXT3_I(inode)->truncate_sem); -+ ext3_ext_invalidate_cache(&tree); -+ -+ /* -+ * TODO: optimization is possible here -+ * probably we need not scaning at all, -+ * because page truncation is enough -+ */ -+ if 
(ext3_orphan_add(handle, inode)) -+ goto out_stop; -+ -+ /* we have to know where to truncate from in crash case */ -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ ext3_mark_inode_dirty(handle, inode); -+ -+ last_block = (inode->i_size + sb->s_blocksize - 1) >> -+ EXT3_BLOCK_SIZE_BITS(sb); -+ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); -+ -+ /* In a multi-transaction truncate, we only make the final -+ * transaction synchronous */ -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ -+out_stop: -+ /* -+ * If this was a simple ftruncate(), and the file will remain alive -+ * then we need to clear up the orphan record which we created above. -+ * However, if this was a real unlink then we were called by -+ * ext3_delete_inode(), and we allow that function to clean up the -+ * orphan info for us. -+ */ -+ if (inode->i_nlink) -+ ext3_orphan_del(handle, inode); -+ -+ up(&EXT3_I(inode)->truncate_sem); -+ ext3_journal_stop(handle); -+} -+ -+/* -+ * this routine calculate max number of blocks we could modify -+ * in order to allocate new block for an inode -+ */ -+int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) -+{ -+ struct ext3_extents_tree tree; -+ int needed; -+ -+ ext3_init_tree_desc(&tree, inode); -+ -+ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); -+ -+ /* caller want to allocate num blocks */ -+ needed *= num; -+ -+#ifdef CONFIG_QUOTA -+ /* -+ * FIXME: real calculation should be here -+ * it depends on blockmap format of qouta file -+ */ -+ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; -+#endif -+ -+ return needed; -+} -+ -+void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) -+{ -+ struct ext3_extents_tree tree; -+ -+ ext3_init_tree_desc(&tree, inode); -+ ext3_extent_tree_init(handle, &tree); -+} -+ -+int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) -+{ -+ struct ext3_extents_tree tree; -+ -+ ext3_init_tree_desc(&tree, inode); -+ return ext3_ext_calc_metadata_amount(&tree, 
blocks); -+} -+ -+static int -+ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_ext_cache *newex) -+{ -+ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; -+ -+ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) -+ return EXT_CONTINUE; -+ -+ if (buf->err < 0) -+ return EXT_BREAK; -+ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) -+ return EXT_BREAK; -+ -+ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { -+ buf->err++; -+ buf->cur += sizeof(*newex); -+ } else { -+ buf->err = -EFAULT; -+ return EXT_BREAK; -+ } -+ return EXT_CONTINUE; -+} -+ -+static int -+ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, -+ struct ext3_ext_path *path, -+ struct ext3_ext_cache *ex) -+{ -+ struct ext3_extent_tree_stats *buf = -+ (struct ext3_extent_tree_stats *) tree->private; -+ int depth; -+ -+ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) -+ return EXT_CONTINUE; -+ -+ depth = EXT_DEPTH(tree); -+ buf->extents_num++; -+ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) -+ buf->leaf_num++; -+ return EXT_CONTINUE; -+} -+ -+int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, -+ unsigned long arg) -+{ -+ int err = 0; -+ -+ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) -+ return -EINVAL; -+ -+ if (cmd == EXT3_IOC_GET_EXTENTS) { -+ struct ext3_extent_buf buf; -+ struct ext3_extents_tree tree; -+ -+ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) -+ return -EFAULT; -+ -+ ext3_init_tree_desc(&tree, inode); -+ buf.cur = buf.buffer; -+ buf.err = 0; -+ tree.private = &buf; -+ down(&EXT3_I(inode)->truncate_sem); -+ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, -+ ext3_ext_store_extent_cb); -+ up(&EXT3_I(inode)->truncate_sem); -+ if (err == 0) -+ err = buf.err; -+ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { -+ struct ext3_extent_tree_stats buf; -+ struct ext3_extents_tree tree; -+ -+ ext3_init_tree_desc(&tree, inode); -+ 
down(&EXT3_I(inode)->truncate_sem); -+ buf.depth = EXT_DEPTH(&tree); -+ buf.extents_num = 0; -+ buf.leaf_num = 0; -+ tree.private = &buf; -+ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, -+ ext3_ext_collect_stats_cb); -+ up(&EXT3_I(inode)->truncate_sem); -+ if (!err) -+ err = copy_to_user((void *) arg, &buf, sizeof(buf)); -+ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { -+ struct ext3_extents_tree tree; -+ ext3_init_tree_desc(&tree, inode); -+ down(&EXT3_I(inode)->truncate_sem); -+ err = EXT_DEPTH(&tree); -+ up(&EXT3_I(inode)->truncate_sem); -+ } -+ -+ return err; -+} -+ -+EXPORT_SYMBOL(ext3_init_tree_desc); -+EXPORT_SYMBOL(ext3_mark_inode_dirty); -+EXPORT_SYMBOL(ext3_ext_invalidate_cache); -+EXPORT_SYMBOL(ext3_ext_insert_extent); -+EXPORT_SYMBOL(ext3_ext_walk_space); -+EXPORT_SYMBOL(ext3_ext_find_goal); -+EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); -Index: linux-stage/fs/ext3/ialloc.c -=================================================================== ---- linux-stage.orig/fs/ext3/ialloc.c 2005-02-25 14:50:50.304202816 +0200 -+++ linux-stage/fs/ext3/ialloc.c 2005-02-25 15:33:48.920193600 +0200 -@@ -566,7 +566,7 @@ repeat: - ei->i_dir_start_lookup = 0; - ei->i_disksize = 0; - -- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; -+ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); - /* dirsync only applies to directories */ -@@ -646,6 +646,18 @@ - DQUOT_FREE_INODE(inode); - goto fail2; - } -+ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { -+ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; -+ ext3_extents_initialize_blockmap(handle, inode); -+ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { -+ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); -+ if (err) goto fail; -+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); -+ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); -+ err = 
ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); -+ } -+ } -+ - err = ext3_mark_inode_dirty(handle, inode); - if (err) { - ext3_std_error(sb, err); -Index: linux-stage/fs/ext3/inode.c -=================================================================== ---- linux-stage.orig/fs/ext3/inode.c 2005-02-25 14:50:50.309202056 +0200 -+++ linux-stage/fs/ext3/inode.c 2005-02-25 15:36:51.846384592 +0200 -@@ -796,6 +796,17 @@ - goto reread; - } - -+static inline int -+ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, -+ struct buffer_head *bh, int create, int extend_disksize) -+{ -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_get_block(handle, inode, block, bh, create, -+ extend_disksize); -+ return ext3_get_block_handle(handle, inode, block, bh, create, -+ extend_disksize); -+} -+ - static int ext3_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) - { -@@ -806,8 +817,8 @@ - handle = ext3_journal_current_handle(); - J_ASSERT(handle != 0); - } -- ret = ext3_get_block_handle(handle, inode, iblock, -- bh_result, create, 1); -+ ret = ext3_get_block_wrap(handle, inode, iblock, -+ bh_result, create, 1); - return ret; - } - -@@ -851,7 +862,7 @@ - - get_block: - if (ret == 0) -- ret = ext3_get_block_handle(handle, inode, iblock, -+ ret = ext3_get_block_wrap(handle, inode, iblock, - bh_result, create, 0); - bh_result->b_size = (1 << inode->i_blkbits); - return ret; -@@ -871,7 +882,7 @@ - dummy.b_state = 0; - dummy.b_blocknr = -1000; - buffer_trace_init(&dummy.b_history); -- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); -+ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); - if (!*errp && buffer_mapped(&dummy)) { - struct buffer_head *bh; - bh = sb_getblk(inode->i_sb, dummy.b_blocknr); -@@ -1589,7 +1600,7 @@ - * This required during truncate. 
We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ --static int ext3_block_truncate_page(handle_t *handle, struct page *page, -+int ext3_block_truncate_page(handle_t *handle, struct page *page, - struct address_space *mapping, loff_t from) - { - unsigned long index = from >> PAGE_CACHE_SHIFT; -@@ -2087,6 +2098,9 @@ - return; - } - -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_truncate(inode, page); -+ - handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { -@@ -2814,6 +2828,9 @@ - int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; - int ret; - -+ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) -+ return ext3_ext_writepage_trans_blocks(inode, bpp); -+ - if (ext3_should_journal_data(inode)) - ret = 3 * (bpp + indirects) + 2; - else -Index: linux-stage/fs/ext3/Makefile -=================================================================== ---- linux-stage.orig/fs/ext3/Makefile 2005-02-25 14:49:42.168561008 +0200 -+++ linux-stage/fs/ext3/Makefile 2005-02-25 15:39:28.384587168 +0200 -@@ -5,7 +5,8 @@ - obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ -- ioctl.o namei.o super.o symlink.o hash.o resize.o -+ ioctl.o namei.o super.o symlink.o hash.o resize.o \ -+ extents.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-stage/fs/ext3/super.c -=================================================================== ---- linux-stage.orig/fs/ext3/super.c 2005-02-25 14:52:33.550506992 +0200 -+++ linux-stage/fs/ext3/super.c 2005-02-25 15:38:10.474431312 +0200 -@@ -394,6 +394,7 @@ - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); - if (!(sb->s_flags & MS_RDONLY)) { -@@ -457,6 +458,8 @@ - #endif - ei->i_rsv_window.rsv_end = 
EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - ei->vfs_inode.i_version = 1; -+ -+ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); - return &ei->vfs_inode; - } - -@@ -589,6 +594,7 @@ - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, - Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, -+ Opt_extents, Opt_noextents, Opt_extdebug, - }; - - static match_table_t tokens = { -@@ -639,6 +644,9 @@ - {Opt_iopen, "iopen"}, - {Opt_noiopen, "noiopen"}, - {Opt_iopen_nopriv, "iopen_nopriv"}, -+ {Opt_extents, "extents"}, -+ {Opt_noextents, "noextents"}, -+ {Opt_extdebug, "extdebug"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL}, - {Opt_resize, "resize"}, -@@ -943,6 +950,15 @@ - match_int(&args[0], &option); - *n_blocks_count = option; - break; -+ case Opt_extents: -+ set_opt (sbi->s_mount_opt, EXTENTS); -+ break; -+ case Opt_noextents: -+ clear_opt (sbi->s_mount_opt, EXTENTS); -+ break; -+ case Opt_extdebug: -+ set_opt (sbi->s_mount_opt, EXTDEBUG); -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1625,6 +1638,8 @@ - percpu_counter_mod(&sbi->s_dirs_counter, - ext3_count_dirs(sb)); - -+ ext3_ext_init(sb); -+ - return 0; - - failed_mount3: -Index: linux-stage/fs/ext3/ioctl.c -=================================================================== ---- linux-stage.orig/fs/ext3/ioctl.c 2005-02-25 14:37:28.971023976 +0200 -+++ linux-stage/fs/ext3/ioctl.c 2005-02-25 15:33:48.938190864 +0200 -@@ -124,6 +124,10 @@ - err = ext3_change_inode_journal_flag(inode, jflag); - return err; - } -+ case EXT3_IOC_GET_EXTENTS: -+ case EXT3_IOC_GET_TREE_STATS: -+ case EXT3_IOC_GET_TREE_DEPTH: -+ return ext3_ext_ioctl(inode, filp, cmd, arg); - case EXT3_IOC_GETVERSION: - case EXT3_IOC_GETVERSION_OLD: - return put_user(inode->i_generation, (int __user *) arg); -Index: linux-stage/include/linux/ext3_fs.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 
14:53:56.424908168 +0200 -+++ linux-stage/include/linux/ext3_fs.h 2005-02-25 15:39:12.841950008 +0200 -@@ -186,8 +186,9 @@ - #define EXT3_NOTAIL_FL 0x00008000 /* don't merge file tail */ - #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ - #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ -+#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ - #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ - --#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ -+#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ - #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ - -@@ -237,6 +238,9 @@ - #endif - #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) - #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) -+#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) -+#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) -+#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) - - /* - * Structure of an inode on the disk -@@ -359,6 +363,8 @@ - #define EXT3_MOUNT_RESERVATION 0x20000 /* Preallocation */ - #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ -+#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ -+#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H -@@ -503,11 +509,13 @@ - #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ - #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ - #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 -+#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ - - #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR - #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ - EXT3_FEATURE_INCOMPAT_RECOVER| \ -- EXT3_FEATURE_INCOMPAT_META_BG) -+ EXT3_FEATURE_INCOMPAT_META_BG| \ -+ 
EXT3_FEATURE_INCOMPAT_EXTENTS) - #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT3_FEATURE_RO_COMPAT_BTREE_DIR) -@@ -756,6 +763,9 @@ - - - /* inode.c */ -+extern int ext3_block_truncate_page(handle_t *, struct page *, -+ struct address_space *, loff_t); -+extern int ext3_writepage_trans_blocks(struct inode *inode); - extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); - extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); - extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); -@@ -836,6 +844,16 @@ - extern struct inode_operations ext3_symlink_inode_operations; - extern struct inode_operations ext3_fast_symlink_inode_operations; - -+/* extents.c */ -+extern int ext3_ext_writepage_trans_blocks(struct inode *, int); -+extern int ext3_ext_get_block(handle_t *, struct inode *, long, -+ struct buffer_head *, int, int); -+extern void ext3_ext_truncate(struct inode *, struct page *); -+extern void ext3_ext_init(struct super_block *); -+extern void ext3_ext_release(struct super_block *); -+extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); -+extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, -+ unsigned int cmd, unsigned long arg); - - #endif /* __KERNEL__ */ - -Index: linux-stage/include/linux/ext3_extents.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_extents.h 2005-02-25 15:33:48.891198008 +0200 -+++ linux-stage/include/linux/ext3_extents.h 2005-02-25 15:33:48.944189952 +0200 -@@ -0,0 +1,262 @@ -+/* -+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. 
-+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+#ifndef _LINUX_EXT3_EXTENTS -+#define _LINUX_EXT3_EXTENTS -+ -+/* -+ * with AGRESSIVE_TEST defined capacity of index/leaf blocks -+ * become very little, so index split, in-depth growing and -+ * other hard changes happens much more often -+ * this is for debug purposes only -+ */ -+#define AGRESSIVE_TEST_ -+ -+/* -+ * if CHECK_BINSEARCH defined, then results of binary search -+ * will be checked by linear search -+ */ -+#define CHECK_BINSEARCH_ -+ -+/* -+ * if EXT_DEBUG is defined you can use 'extdebug' mount option -+ * to get lots of info what's going on -+ */ -+#define EXT_DEBUG_ -+#ifdef EXT_DEBUG -+#define ext_debug(tree,fmt,a...) \ -+do { \ -+ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ -+ printk(fmt, ##a); \ -+} while (0); -+#else -+#define ext_debug(tree,fmt,a...) -+#endif -+ -+/* -+ * if EXT_STATS is defined then stats numbers are collected -+ * these number will be displayed at umount time -+ */ -+#define EXT_STATS_ -+ -+ -+#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */ -+ -+/* -+ * ext3_inode has i_block array (total 60 bytes) -+ * first 4 bytes are used to store: -+ * - tree depth (0 mean there is no tree yet. 
all extents in the inode) -+ * - number of alive extents in the inode -+ */ -+ -+/* -+ * this is extent on-disk structure -+ * it's used at the bottom of the tree -+ */ -+struct ext3_extent { -+ __u32 ee_block; /* first logical block extent covers */ -+ __u16 ee_len; /* number of blocks covered by extent */ -+ __u16 ee_start_hi; /* high 16 bits of physical block */ -+ __u32 ee_start; /* low 32 bits of physical block */ -+}; -+ -+/* -+ * this is index on-disk structure -+ * it's used at all the levels, but the bottom -+ */ -+struct ext3_extent_idx { -+ __u32 ei_block; /* index covers logical blocks from 'block' */ -+ __u32 ei_leaf; /* pointer to the physical block of the next * -+ * level. leaf or next index could bet here */ -+ __u16 ei_leaf_hi; /* high 16 bits of physical block */ -+ __u16 ei_unused; -+}; -+ -+/* -+ * each block (leaves and indexes), even inode-stored has header -+ */ -+struct ext3_extent_header { -+ __u16 eh_magic; /* probably will support different formats */ -+ __u16 eh_entries; /* number of valid entries */ -+ __u16 eh_max; /* capacity of store in entries */ -+ __u16 eh_depth; /* has tree real underlaying blocks? 
*/ -+ __u32 eh_generation; /* flags(8 bits) | generation of the tree */ -+}; -+ -+#define EXT3_EXT_MAGIC 0xf30a -+ -+/* -+ * array of ext3_ext_path contains path to some extent -+ * creation/lookup routines use it for traversal/splitting/etc -+ * truncate uses it to simulate recursive walking -+ */ -+struct ext3_ext_path { -+ __u32 p_block; -+ __u16 p_depth; -+ struct ext3_extent *p_ext; -+ struct ext3_extent_idx *p_idx; -+ struct ext3_extent_header *p_hdr; -+ struct buffer_head *p_bh; -+}; -+ -+/* -+ * structure for external API -+ */ -+ -+/* -+ * storage for cached extent -+ */ -+struct ext3_ext_cache { -+ __u32 ec_start; -+ __u32 ec_block; -+ __u32 ec_len; -+ __u32 ec_type; -+}; -+ -+#define EXT3_EXT_CACHE_NO 0 -+#define EXT3_EXT_CACHE_GAP 1 -+#define EXT3_EXT_CACHE_EXTENT 2 -+ -+/* -+ * ext3_extents_tree is used to pass initial information -+ * to top-level extents API -+ */ -+struct ext3_extents_helpers; -+struct ext3_extents_tree { -+ struct inode *inode; /* inode which tree belongs to */ -+ void *root; /* ptr to data top of tree resides at */ -+ void *buffer; /* will be passed as arg to ^^ routines */ -+ int buffer_len; -+ void *private; -+ struct ext3_ext_cache *cex;/* last found extent */ -+ struct ext3_extents_helpers *ops; -+}; -+ -+struct ext3_extents_helpers { -+ int (*get_write_access)(handle_t *h, void *buffer); -+ int (*mark_buffer_dirty)(handle_t *h, void *buffer); -+ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); -+ int (*remove_extent_credits)(struct ext3_extents_tree *, -+ struct ext3_extent *, unsigned long, -+ unsigned long); -+ int (*remove_extent)(struct ext3_extents_tree *, -+ struct ext3_extent *, unsigned long, -+ unsigned long); -+ int (*new_block)(handle_t *, struct ext3_extents_tree *, -+ struct ext3_ext_path *, struct ext3_extent *, -+ int *); -+}; -+ -+/* -+ * to be called by ext3_ext_walk_space() -+ * negative retcode - error -+ * positive retcode - signal for ext3_ext_walk_space(), see below -+ * callback must 
return valid extent (passed or newly created) -+ */ -+typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, -+ struct ext3_ext_path *, -+ struct ext3_ext_cache *); -+ -+#define EXT_CONTINUE 0 -+#define EXT_BREAK 1 -+#define EXT_REPEAT 2 -+ -+ -+#define EXT_MAX_BLOCK 0xffffffff -+ -+ -+#define EXT_FIRST_EXTENT(__hdr__) \ -+ ((struct ext3_extent *) (((char *) (__hdr__)) + \ -+ sizeof(struct ext3_extent_header))) -+#define EXT_FIRST_INDEX(__hdr__) \ -+ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ -+ sizeof(struct ext3_extent_header))) -+#define EXT_HAS_FREE_INDEX(__path__) \ -+ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) -+#define EXT_LAST_EXTENT(__hdr__) \ -+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) -+#define EXT_LAST_INDEX(__hdr__) \ -+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) -+#define EXT_MAX_EXTENT(__hdr__) \ -+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) -+#define EXT_MAX_INDEX(__hdr__) \ -+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) -+#define EXT_HDR_GEN_BITS 24 -+#define EXT_HDR_GEN_MASK ((1 << EXT_HDR_GEN_BITS) - 1) -+#define EXT_HDR_GEN(__hdr__) ((__hdr__)->eh_generation & EXT_HDR_GEN_MASK) -+#define EXT_FLAGS(__hdr__) ((__hdr__)->eh_generation >> EXT_HDR_GEN_BITS) -+#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ -+ -+#define EXT_BLOCK_HDR(__bh__) ((struct ext3_extent_header *)(__bh__)->b_data) -+#define EXT_ROOT_HDR(__tree__) ((struct ext3_extent_header *)(__tree__)->root) -+#define EXT_DEPTH(__tree__) (EXT_ROOT_HDR(__tree__)->eh_depth) -+#define EXT_GENERATION(__tree__) EXT_HDR_GEN(EXT_ROOT_HDR(__tree__)) -+ -+#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); -+ -+#define EXT_CHECK_PATH(tree,path) \ -+{ \ -+ int depth = EXT_DEPTH(tree); \ -+ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ -+ BUG_ON((unsigned long) (path)[depth].p_idx < \ -+ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ -+ BUG_ON((unsigned long) (path)[depth].p_ext < \ -+ 
__PAGE_OFFSET && (path)[depth].p_ext != NULL); \ -+ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ -+ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ -+ && depth != 0); \ -+ BUG_ON((path)[0].p_depth != depth); \ -+} -+ -+ -+/* -+ * this structure is used to gather extents from the tree via ioctl -+ */ -+struct ext3_extent_buf { -+ unsigned long start; -+ int buflen; -+ void *buffer; -+ void *cur; -+ int err; -+}; -+ -+/* -+ * this structure is used to collect stats info about the tree -+ */ -+struct ext3_extent_tree_stats { -+ int depth; -+ int extents_num; -+ int leaf_num; -+}; -+ -+extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); -+extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); -+extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); -+extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); -+extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); -+extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); -+extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); -+extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); -+ -+static inline void -+ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) -+{ -+ if (tree->cex) -+ tree->cex->ec_type = EXT3_EXT_CACHE_NO; -+} -+#endif /* _LINUX_EXT3_EXTENTS */ -Index: linux-stage/include/linux/ext3_fs_i.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs_i.h 2005-02-25 14:50:50.320200384 +0200 -+++ linux-stage/include/linux/ext3_fs_i.h 2005-02-25 15:33:48.945189800 +0200 -@@ -128,6 +128,8 @@ - */ - struct semaphore truncate_sem; - struct inode vfs_inode; -+ -+ __u32 i_cached_extent[4]; - }; - - #endif /* _LINUX_EXT3_FS_I 
*/ diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-bug11324.patch b/ldiskfs/kernel_patches/patches/ext3-extents-bug11324.patch deleted file mode 100644 index c7ed475..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-extents-bug11324.patch +++ /dev/null @@ -1,252 +0,0 @@ -Index: linux-stage/fs/ext3/extents.c -=================================================================== ---- linux-stage.orig/fs/ext3/extents.c 2007-04-17 22:09:19.000000000 -0700 -+++ linux-stage/fs/ext3/extents.c 2007-04-17 22:12:05.000000000 -0700 -@@ -44,26 +44,49 @@ - #include - - --static inline int ext3_ext_check_header(struct ext3_extent_header *eh) --{ -- if (eh->eh_magic != EXT3_EXT_MAGIC) { -- printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", -- (unsigned)eh->eh_magic); -- return -EIO; -- } -- if (eh->eh_max == 0) { -- printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n", -- (unsigned)eh->eh_max); -- return -EIO; -- } -- if (eh->eh_entries > eh->eh_max) { -- printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", -- (unsigned)eh->eh_entries); -- return -EIO; -+static int __ext3_ext_check_header(const char *function, struct inode *inode, -+ struct ext3_extent_header *eh, int depth, -+ int max) -+{ -+ const char *error_msg = NULL; -+ -+ if (unlikely(eh->eh_magic != EXT3_EXT_MAGIC)) { -+ error_msg = "invalid magic"; -+ goto corrupted; -+ } -+ if (unlikely(eh->eh_depth != depth)) { -+ error_msg = "unexpected eh_depth"; -+ goto corrupted; -+ } -+ if (unlikely(eh->eh_max == 0)) { -+ error_msg = "too small eh_max"; -+ goto corrupted; -+ } -+ if (unlikely(eh->eh_max > max)) { -+ error_msg = "too large eh_max"; -+ goto corrupted; -+ } -+ if (unlikely(eh->eh_entries > eh->eh_max)) { -+ error_msg = "invalid eh_entries"; -+ goto corrupted; - } - return 0; -+ -+corrupted: -+ ext3_error(inode->i_sb, function, -+ "bad header in inode #%lu: %s - magic %x, " -+ "entries %u, max %u(%u), depth %u(%u)", -+ inode->i_ino, error_msg, eh->eh_magic, -+ eh->eh_entries, eh->eh_max, max, -+ eh->eh_depth, 
depth); -+ -+ return -EIO; - } - -+#define ext3_ext_check_header(inode,eh,depth,max) \ -+ __ext3_ext_check_header(__FUNCTION__,inode,eh,depth,max) -+ -+ - static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) - { - int err; -@@ -226,6 +249,26 @@ - return size; - } - -+static inline int -+ext3_ext_max_entries(struct ext3_extents_tree *tree, int root, int depth) -+{ -+ int max; -+ -+ if (root) { -+ if (depth == 0) -+ max = ext3_ext_space_root(tree); -+ else -+ max = ext3_ext_space_root_idx(tree); -+ } else { -+ if (depth == 0) -+ max = ext3_ext_space_block(tree); -+ else -+ max = ext3_ext_space_block_idx(tree); -+ } -+ -+ return max; -+} -+ - static void ext3_ext_show_path(struct ext3_extents_tree *tree, - struct ext3_ext_path *path) - { -@@ -296,10 +339,6 @@ - struct ext3_extent_idx *ix; - int l = 0, k, r; - -- EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -- EXT_ASSERT(eh->eh_entries <= eh->eh_max); -- EXT_ASSERT(eh->eh_entries > 0); -- - ext_debug(tree, "binsearch for %d(idx): ", block); - - path->p_idx = ix = EXT_FIRST_INDEX(eh); -@@ -359,9 +398,6 @@ - struct ext3_extent *ex; - int l = 0, k, r; - -- EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -- EXT_ASSERT(eh->eh_entries <= eh->eh_max); -- - if (eh->eh_entries == 0) { - /* - * this leaf is empty yet: -@@ -436,6 +472,7 @@ - struct ext3_extent_header *eh; - struct buffer_head *bh; - int depth, i, ppos = 0; -+ int max; - - EXT_ASSERT(tree); - EXT_ASSERT(tree->inode); -@@ -443,17 +480,15 @@ - - eh = EXT_ROOT_HDR(tree); - EXT_ASSERT(eh); -- if (ext3_ext_check_header(eh)) { -+ i = depth = EXT_DEPTH(tree); -+ max = ext3_ext_max_entries(tree, 1, i); -+ if (ext3_ext_check_header(tree->inode, eh, i, max)) { - /* don't free previously allocated path - * -- caller should take care */ - path = NULL; - goto err; - } - -- i = depth = EXT_DEPTH(tree); -- EXT_ASSERT(eh->eh_max); -- EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); -- - /* account possible depth increase */ - if (!path) { - path = kmalloc(sizeof(struct 
ext3_ext_path) * (depth + 2), -@@ -484,8 +519,10 @@ - path[ppos].p_hdr = eh; - i--; - -- if (ext3_ext_check_header(eh)) -+ max = ext3_ext_max_entries(tree, 0, i); -+ if (ext3_ext_check_header(tree->inode, eh, i, max)) - goto err; -+ - } - - path[ppos].p_depth = i; -@@ -493,9 +530,6 @@ - path[ppos].p_ext = NULL; - path[ppos].p_idx = NULL; - -- if (ext3_ext_check_header(eh)) -- goto err; -- - /* find extent */ - ext3_ext_binsearch(tree, path + ppos, block); - -@@ -1545,6 +1579,8 @@ - ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); - if (!path[depth].p_hdr) - path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); -+ -+ /* the header must be checked already in ext3_ext_remove_space() */ - eh = path[depth].p_hdr; - EXT_ASSERT(eh); - EXT_ASSERT(eh->eh_entries <= eh->eh_max); -@@ -1707,7 +1743,7 @@ - int depth = EXT_DEPTH(tree); - struct ext3_ext_path *path; - handle_t *handle; -- int i = 0, err = 0; -+ int i = 0, err = 0, max; - - ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); - -@@ -1730,7 +1766,13 @@ - } - memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); - path[i].p_hdr = EXT_ROOT_HDR(tree); -- -+ -+ max = ext3_ext_max_entries(tree, 1, depth); -+ if (ext3_ext_check_header(inode, path[i].p_hdr, depth, max)) { -+ err = -EIO; -+ goto out; -+ } -+ - while (i >= 0 && err == 0) { - if (i == depth) { - /* this is leaf block */ -@@ -1740,16 +1782,13 @@ - i--; - continue; - } -- -+ - /* this is index block */ - if (!path[i].p_hdr) { - ext_debug(tree, "initialize header\n"); - path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); - } - -- EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); -- EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); -- - if (!path[i].p_idx) { - /* this level hasn't touched yet */ - path[i].p_idx = -@@ -1776,6 +1815,14 @@ - err = -EIO; - break; - } -+ BUG_ON(i + 1 > depth); -+ max = ext3_ext_max_entries(tree, 0, depth - i - 1); -+ if (ext3_ext_check_header(inode, -+ EXT_BLOCK_HDR(path[i+1].p_bh), -+ depth - i 
- 1, max)) { -+ err = -EIO; -+ break; -+ } - /* put actual number of indexes to know is this - * number got changed at the next iteration */ - path[i].p_block = path[i].p_hdr->eh_entries; -@@ -1796,7 +1843,7 @@ - } - - /* TODO: flexible tree reduction should be here */ -- if (path->p_hdr->eh_entries == 0) { -+ if (err == 0 && path->p_hdr->eh_entries == 0) { - /* - * truncate to zero freed all the tree - * so, we need to correct eh_depth -@@ -1810,6 +1857,7 @@ - } - ext3_ext_tree_changed(tree); - -+out: - kfree(path); - ext3_journal_stop(handle); - diff --git a/ldiskfs/kernel_patches/patches/ext3-external-journal-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-external-journal-2.6.12.patch deleted file mode 100644 index bcfdae2..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-external-journal-2.6.12.patch +++ /dev/null @@ -1,148 +0,0 @@ -Signed-off-by: Johann Lombardi - ---- linux-2.6.12.orig/fs/ext3/super.c 2005-06-17 21:48:29.000000000 +0200 -+++ linux-2.6.12/fs/ext3/super.c 2005-11-07 13:37:30.000000000 +0100 -@@ -39,7 +39,8 @@ - #include "xattr.h" - #include "acl.h" - --static int ext3_load_journal(struct super_block *, struct ext3_super_block *); -+static int ext3_load_journal(struct super_block *, struct ext3_super_block *, -+ unsigned long journal_devnum); - static int ext3_create_journal(struct super_block *, struct ext3_super_block *, - int); - static void ext3_commit_super (struct super_block * sb, -@@ -586,7 +587,7 @@ enum { - Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, - Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, -- Opt_commit, Opt_journal_update, Opt_journal_inum, -+ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, - Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, -@@ -624,6 +625,7 @@ static match_table_t 
tokens = { - {Opt_commit, "commit=%u"}, - {Opt_journal_update, "journal=update"}, - {Opt_journal_inum, "journal=%u"}, -+ {Opt_journal_dev, "journal_dev=%u"}, - {Opt_abort, "abort"}, - {Opt_data_journal, "data=journal"}, - {Opt_data_ordered, "data=ordered"}, -@@ -663,8 +665,9 @@ static unsigned long get_sb_block(void * - return sb_block; - } - --static int parse_options (char * options, struct super_block *sb, -- unsigned long * inum, unsigned long *n_blocks_count, int is_remount) -+static int parse_options (char *options, struct super_block *sb, -+ unsigned long *inum, unsigned long *journal_devnum, -+ unsigned long *n_blocks_count, int is_remount) - { - struct ext3_sb_info *sbi = EXT3_SB(sb); - char * p; -@@ -805,6 +808,16 @@ static int parse_options (char * options - return 0; - *inum = option; - break; -+ case Opt_journal_dev: -+ if (is_remount) { -+ printk(KERN_ERR "EXT3-fs: cannot specify " -+ "journal on remount\n"); -+ return 0; -+ } -+ if (match_int(&args[0], &option)) -+ return 0; -+ *journal_devnum = option; -+ break; - case Opt_noload: - set_opt (sbi->s_mount_opt, NOLOAD); - break; -@@ -1250,6 +1263,7 @@ static int ext3_fill_super (struct super - unsigned long logic_sb_block; - unsigned long offset = 0; - unsigned long journal_inum = 0; -+ unsigned long journal_devnum = 0; - unsigned long def_mount_opts; - struct inode *root; - int blocksize; -@@ -1330,7 +1344,8 @@ static int ext3_fill_super (struct super - - set_opt(sbi->s_mount_opt, RESERVATION); - -- if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0)) -+ if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, -+ NULL, 0)) - goto failed_mount; - - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | -@@ -1541,7 +1556,7 @@ static int ext3_fill_super (struct super - */ - if (!test_opt(sb, NOLOAD) && - EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { -- if (ext3_load_journal(sb, es)) -+ if (ext3_load_journal(sb, es, journal_devnum)) - goto failed_mount2; - } else if 
(journal_inum) { - if (ext3_create_journal(sb, es, journal_inum)) -@@ -1821,15 +1836,24 @@ out_bdev: - return NULL; - } - --static int ext3_load_journal(struct super_block * sb, -- struct ext3_super_block * es) -+static int ext3_load_journal(struct super_block *sb, -+ struct ext3_super_block *es, -+ unsigned long journal_devnum) - { - journal_t *journal; - int journal_inum = le32_to_cpu(es->s_journal_inum); -- dev_t journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); -+ dev_t journal_dev; - int err = 0; - int really_read_only; - -+ if (journal_devnum && -+ journal_devnum != le32_to_cpu(es->s_journal_dev)) { -+ printk(KERN_INFO "EXT3-fs: external journal device major/minor " -+ "numbers have changed\n"); -+ journal_dev = new_decode_dev(journal_devnum); -+ } else -+ journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); -+ - really_read_only = bdev_read_only(sb->s_bdev); - - /* -@@ -1888,6 +1912,16 @@ static int ext3_load_journal(struct supe - - EXT3_SB(sb)->s_journal = journal; - ext3_clear_journal_err(sb, es); -+ -+ if (journal_devnum && -+ journal_devnum != le32_to_cpu(es->s_journal_dev)) { -+ es->s_journal_dev = cpu_to_le32(journal_devnum); -+ sb->s_dirt = 1; -+ -+ /* Make sure we flush the recovery flag to disk. */ -+ ext3_commit_super(sb, es, 1); -+ } -+ - return 0; - } - -@@ -2093,13 +2127,13 @@ static int ext3_remount (struct super_bl - { - struct ext3_super_block * es; - struct ext3_sb_info *sbi = EXT3_SB(sb); -- unsigned long tmp; -+ unsigned long tmp1, tmp2; - unsigned long n_blocks_count = 0; - - /* - * Allow the "check" option to be passed as a remount option. 
- */ -- if (!parse_options(data, sb, &tmp, &n_blocks_count, 1)) -+ if (!parse_options(data, sb, &tmp1, &tmp2, &n_blocks_count, 1)) - return -EINVAL; - - if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) diff --git a/ldiskfs/kernel_patches/patches/ext3-filterdata-2.6.15.patch b/ldiskfs/kernel_patches/patches/ext3-filterdata-2.6.15.patch deleted file mode 100644 index e6d431f..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-filterdata-2.6.15.patch +++ /dev/null @@ -1,25 +0,0 @@ -Index: linux-2.6.15/include/linux/ext3_fs_i.h -=================================================================== ---- linux-2.6.15.orig/include/linux/ext3_fs_i.h 2006-02-24 15:41:30.000000000 +0300 -+++ linux-2.6.15/include/linux/ext3_fs_i.h 2006-02-24 15:41:31.000000000 +0300 -@@ -135,6 +135,8 @@ struct ext3_inode_info { - struct inode vfs_inode; - - __u32 i_cached_extent[4]; -+ -+ void *i_filterdata; - }; - - #endif /* _LINUX_EXT3_FS_I */ -Index: linux-2.6.15/fs/ext3/super.c -=================================================================== ---- linux-2.6.15.orig/fs/ext3/super.c 2006-02-24 15:41:30.000000000 +0300 -+++ linux-2.6.15/fs/ext3/super.c 2006-02-24 15:42:02.000000000 +0300 -@@ -459,6 +459,7 @@ static struct inode *ext3_alloc_inode(st - ei->vfs_inode.i_version = 1; - - memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); -+ ei->i_filterdata = NULL; - return &ei->vfs_inode; - } - diff --git a/ldiskfs/kernel_patches/patches/ext3-htree-dot-2.6.patch b/ldiskfs/kernel_patches/patches/ext3-htree-dot-2.6.patch deleted file mode 100644 index 9192112..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-htree-dot-2.6.patch +++ /dev/null @@ -1,23 +0,0 @@ -Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/fs/ext3/namei.c -=================================================================== ---- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/fs/ext3/namei.c 2005-04-04 05:06:46.000000000 -0600 -+++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/fs/ext3/namei.c 2005-04-04 
05:09:18.000000000 -0600 -@@ -926,8 +926,16 @@ - struct inode *dir = dentry->d_parent->d_inode; - - sb = dir->i_sb; -- if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) -- return NULL; -+ /* NFS may look up ".." - look at dx_root directory block */ -+ if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ -+ if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) -+ return NULL; -+ } else { -+ frame = frames; -+ frame->bh = NULL; /* for dx_release() */ -+ frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ -+ dx_set_block(frame->at, 0); /* dx_root block is 0 */ -+ } - hash = hinfo.hash; - do { - block = dx_get_block(frame->at); diff --git a/ldiskfs/kernel_patches/patches/ext3-ialloc-2.6.patch b/ldiskfs/kernel_patches/patches/ext3-ialloc-2.6.patch deleted file mode 100644 index 15d37a9..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-ialloc-2.6.patch +++ /dev/null @@ -1,128 +0,0 @@ -Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/fs/ext3/ialloc.c -=================================================================== ---- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/fs/ext3/ialloc.c 2005-05-16 14:10:54.000000000 -0600 -+++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/fs/ext3/ialloc.c 2005-05-16 14:18:29.000000000 -0600 -@@ -352,13 +352,17 @@ - return -1; - } - --static int find_group_other(struct super_block *sb, struct inode *parent) -+static int find_group_other(struct super_block *sb, struct inode *parent, -+ int mode) - { - int parent_group = EXT3_I(parent)->i_block_group; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); - int ngroups = EXT3_SB(sb)->s_groups_count; - struct ext3_group_desc *desc; - struct buffer_head *bh; - int group, i; -+ int best_group = -1; -+ int avefreeb, freeb, best_group_freeb = 0; - - /* - * Try to place the inode in its parent directory -@@ -366,9 +370,9 @@ - group = parent_group; - desc = ext3_get_group_desc (sb, group, &bh); - if (desc && le16_to_cpu(desc->bg_free_inodes_count) && -- 
le16_to_cpu(desc->bg_free_blocks_count)) -+ (!S_ISREG(mode) || le16_to_cpu(desc->bg_free_blocks_count))) - return group; -- -+ avefreeb = le32_to_cpu(sbi->s_es->s_free_blocks_count) / ngroups; - /* - * We're going to place this inode in a different blockgroup from its - * parent. We want to cause files in a common directory to all land in -@@ -381,33 +385,47 @@ - group = (group + parent->i_ino) % ngroups; - - /* -- * Use a quadratic hash to find a group with a free inode and some free -- * blocks. -+ * Use a quadratic hash to find a group with a free inode and -+ * average number of free blocks. - */ - for (i = 1; i < ngroups; i <<= 1) { - group += i; - if (group >= ngroups) - group -= ngroups; - desc = ext3_get_group_desc (sb, group, &bh); -- if (desc && le16_to_cpu(desc->bg_free_inodes_count) && -- le16_to_cpu(desc->bg_free_blocks_count)) -+ if (!desc || !desc->bg_free_inodes_count) -+ continue; -+ if (!S_ISREG(mode)) -+ return group; -+ if (le16_to_cpu(desc->bg_free_blocks_count) >= avefreeb) - return group; - } - - /* -- * That failed: try linear search for a free inode, even if that group -- * has no free blocks. -+ * That failed: start from last group used to allocate inode -+ * try linear search for a free inode and prefereably -+ * free blocks. 
- */ -- group = parent_group; -+ group = sbi->s_last_alloc_group; -+ if (group == -1) -+ group = parent_group; -+ - for (i = 0; i < ngroups; i++) { - if (++group >= ngroups) - group = 0; - desc = ext3_get_group_desc (sb, group, &bh); -- if (desc && le16_to_cpu(desc->bg_free_inodes_count)) -- return group; -+ if (!desc || !desc->bg_free_inodes_count) -+ continue; -+ freeb = le16_to_cpu(desc->bg_free_blocks_count); -+ if (freeb > best_group_freeb) { -+ best_group_freeb = freeb; -+ best_group = group; -+ if (freeb >= avefreeb || !S_ISREG(mode)) -+ break; -+ } - } -- -- return -1; -+ sbi->s_last_alloc_group = best_group; -+ return best_group; - } - - /* -@@ -454,7 +472,7 @@ - else - group = find_group_orlov(sb, dir); - } else -- group = find_group_other(sb, dir); -+ group = find_group_other(sb, dir, mode); - - err = -ENOSPC; - if (group == -1) -Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/fs/ext3/super.c -=================================================================== ---- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/fs/ext3/super.c 2005-05-16 14:10:54.000000000 -0600 -+++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/fs/ext3/super.c 2005-05-16 14:17:14.000000000 -0600 -@@ -1297,6 +1297,7 @@ - percpu_counter_init(&sbi->s_dirs_counter); - bgl_lock_init(&sbi->s_blockgroup_lock); - -+ sbi->s_last_alloc_group = -1; - for (i = 0; i < db_count; i++) { - block = descriptor_loc(sb, logic_sb_block, i); - sbi->s_group_desc[i] = sb_bread(sb, block); -Index: linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891.orig/include/linux/ext3_fs_sb.h 2005-05-16 14:10:54.000000000 -0600 -+++ linux-2.6.5-SLES9_SP1_BRANCH_2004111114454891/include/linux/ext3_fs_sb.h 2005-05-16 14:17:14.000000000 -0600 -@@ -59,6 +59,8 @@ - struct percpu_counter s_freeinodes_counter; - struct percpu_counter s_dirs_counter; - struct blockgroup_lock 
s_blockgroup_lock; -+ /* Last group used to allocate inode */ -+ int s_last_alloc_group; - - /* root of the per fs reservation window tree */ - spinlock_t s_rsv_window_lock; diff --git a/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch deleted file mode 100644 index 52e5521..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch +++ /dev/null @@ -1,20 +0,0 @@ -Index: linux-stage/include/linux/ext3_fs.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 14:53:56.424908168 +0200 -+++ linux-stage/include/linux/ext3_fs.h 2005-02-25 14:53:59.376459464 +0200 -@@ -361,12 +361,13 @@ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ --#ifndef _LINUX_EXT2_FS_H -+#ifndef clear_opt - #define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt - #define set_opt(o, opt) o |= EXT3_MOUNT_##opt - #define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \ - EXT3_MOUNT_##opt) --#else -+#endif -+#ifndef EXT2_MOUNT_NOLOAD - #define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD - #define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT - #define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS diff --git a/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-suse.patch deleted file mode 100644 index 1ac944b..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-suse.patch +++ /dev/null @@ -1,20 +0,0 @@ -Index: linux-stage/include/linux/ext3_fs.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs.h 2004-04-02 16:43:37.000000000 -0500 -+++ linux-stage/include/linux/ext3_fs.h 2004-04-02 16:43:37.000000000 -0500 -@@ -331,12 +331,13 @@ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ 
- - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ --#ifndef _LINUX_EXT2_FS_H -+#ifndef clear_opt - #define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt - #define set_opt(o, opt) o |= EXT3_MOUNT_##opt - #define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \ - EXT3_MOUNT_##opt) --#else -+#endif -+#ifndef EXT2_MOUNT_NOLOAD - #define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD - #define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT - #define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS diff --git a/ldiskfs/kernel_patches/patches/ext3-lookup-dotdot-2.6.9.patch b/ldiskfs/kernel_patches/patches/ext3-lookup-dotdot-2.6.9.patch deleted file mode 100644 index a05256b..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-lookup-dotdot-2.6.9.patch +++ /dev/null @@ -1,63 +0,0 @@ -Index: linux-2.6.9-full/fs/ext3/iopen.c -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/iopen.c 2006-04-25 08:51:11.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/iopen.c 2006-05-06 01:21:11.000000000 +0400 -@@ -94,9 +94,12 @@ static struct dentry *iopen_lookup(struc - assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); - } - -- if (!list_empty(&inode->i_dentry)) { -- alternate = list_entry(inode->i_dentry.next, -- struct dentry, d_alias); -+ list_for_each(lp, &inode->i_dentry) { -+ alternate = list_entry(lp, struct dentry, d_alias); -+ /* ignore dentries created for ".." 
to preserve -+ * proper dcache hierarchy -- bug 10458 */ -+ if (alternate->d_flags & DCACHE_NFSFS_RENAMED) -+ continue; - dget_locked(alternate); - spin_lock(&alternate->d_lock); - alternate->d_flags |= DCACHE_REFERENCED; -Index: linux-2.6.9-full/fs/ext3/namei.c -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/namei.c 2006-05-06 01:21:10.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/namei.c 2006-05-06 01:29:30.000000000 +0400 -@@ -1003,6 +1003,38 @@ static struct dentry *ext3_lookup(struct - return ERR_PTR(-EACCES); - } - -+ /* ".." shouldn't go into dcache to preserve dcache hierarchy -+ * otherwise we'll get parent being a child of actual child. -+ * see bug 10458 for details -bzzz */ -+ if (inode && (dentry->d_name.name[0] == '.' && (dentry->d_name.len == 1 || -+ (dentry->d_name.len == 2 && dentry->d_name.name[1] == '.')))) { -+ struct dentry *tmp, *goal = NULL; -+ struct list_head *lp; -+ -+ /* first, look for an existing dentry - any one is good */ -+ spin_lock(&dcache_lock); -+ list_for_each(lp, &inode->i_dentry) { -+ tmp = list_entry(lp, struct dentry, d_alias); -+ goal = tmp; -+ dget_locked(goal); -+ break; -+ } -+ if (goal == NULL) { -+ /* there is no alias, we need to make current dentry: -+ * a) inaccessible for __d_lookup() -+ * b) inaccessible for iopen */ -+ J_ASSERT(list_empty(&dentry->d_alias)); -+ dentry->d_flags |= DCACHE_NFSFS_RENAMED; -+ /* this is d_instantiate() ... 
*/ -+ list_add(&dentry->d_alias, &inode->i_dentry); -+ dentry->d_inode = inode; -+ } -+ spin_unlock(&dcache_lock); -+ if (goal) -+ iput(inode); -+ return goal; -+ } -+ - return iopen_connect_dentry(dentry, inode, 1); - } - diff --git a/ldiskfs/kernel_patches/patches/ext3-map_inode_page-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-map_inode_page-2.6-suse.patch deleted file mode 100644 index 2b6bcf1..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-map_inode_page-2.6-suse.patch +++ /dev/null @@ -1,86 +0,0 @@ - fs/ext3/inode.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ - fs/ext3/super.c | 3 +++ - 2 files changed, 55 insertions(+) - -Index: linux-2.6.0/fs/ext3/inode.c -=================================================================== ---- linux-2.6.0.orig/fs/ext3/inode.c 2003-12-31 00:33:49.000000000 +0300 -+++ linux-2.6.0/fs/ext3/inode.c 2003-12-31 01:14:17.000000000 +0300 -@@ -3136,3 +3136,62 @@ - ret = ret2; - return ret; - } -+ -+int ext3_map_inode_page(struct inode *inode, struct page *page, -+ unsigned long *blocks, int *created, int create) -+{ -+ unsigned int blocksize, blocks_per_page; -+ unsigned long iblock; -+ struct buffer_head dummy; -+ void *handle; -+ int i, rc = 0, failed = 0, needed_blocks; -+ -+ blocksize = inode->i_sb->s_blocksize; -+ blocks_per_page = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; -+ iblock = page->index * blocks_per_page; -+ -+ for (i = 0; i < blocks_per_page; i++, iblock++) { -+ blocks[i] = ext3_bmap(inode->i_mapping, iblock); -+ if (blocks[i] == 0) { -+ failed++; -+ if (created) -+ created[i] = -1; -+ } else if (created) { -+ created[i] = 0; -+ } -+ } -+ -+ if (failed == 0 || create == 0) -+ return 0; -+ -+ needed_blocks = ext3_writepage_trans_blocks(inode); -+ handle = ext3_journal_start(inode, needed_blocks); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ iblock = page->index * blocks_per_page; -+ for (i = 0; i < blocks_per_page; i++, iblock++) { -+ if (blocks[i] != 0) -+ continue; -+ -+ rc = 
ext3_get_block_handle(handle, inode, iblock, &dummy, 1, 1); -+ if (rc) { -+ printk(KERN_INFO "ext3_map_inode_page: error reading " -+ "block %ld\n", iblock); -+ goto out; -+ } -+ /* Unmap any metadata buffers from the block mapping, to avoid -+ * data corruption due to direct-write from Lustre being -+ * clobbered by a later flush of the blockdev metadata buffer.*/ -+ if (buffer_new(&dummy)) -+ unmap_underlying_metadata(dummy.b_bdev, -+ dummy.b_blocknr); -+ blocks[i] = dummy.b_blocknr; -+ if (created) -+ created[i] = 1; -+ } -+ -+ out: -+ ext3_journal_stop(handle); -+ return rc; -+} -Index: linux-2.6.0/fs/ext3/super.c -=================================================================== ---- linux-2.6.0.orig/fs/ext3/super.c 2003-12-31 00:33:49.000000000 +0300 -+++ linux-2.6.0/fs/ext3/super.c 2003-12-31 01:10:40.000000000 +0300 -@@ -2051,6 +2051,10 @@ - int nblocks, loff_t newsize); - EXPORT_SYMBOL(ext3_prep_san_write); - -+int ext3_map_inode_page(struct inode *inode, struct page *page, -+ unsigned long *blocks, int *created, int create); -+EXPORT_SYMBOL(ext3_map_inode_page); -+ - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); - MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); - MODULE_LICENSE("GPL"); diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-fc5.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-fc5.patch deleted file mode 100644 index 90a9123..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-fc5.patch +++ /dev/null @@ -1,3103 +0,0 @@ -Index: linux-2.6.16.i686/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.16.i686.orig/include/linux/ext3_fs.h 2006-05-30 22:55:32.000000000 +0800 -+++ linux-2.6.16.i686/include/linux/ext3_fs.h 2006-05-30 23:02:59.000000000 +0800 -@@ -57,6 +57,14 @@ - #define ext3_debug(f, a...) 
do {} while (0) - #endif - -+#define EXT3_MULTIBLOCK_ALLOCATOR 1 -+ -+#define EXT3_MB_HINT_MERGE 1 -+#define EXT3_MB_HINT_RESERVED 2 -+#define EXT3_MB_HINT_METADATA 4 -+#define EXT3_MB_HINT_FIRST 8 -+#define EXT3_MB_HINT_BEST 16 -+ - /* - * Special inodes numbers - */ -@@ -383,6 +391,7 @@ - #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x4000000/* Buddy allocation support */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -404,6 +413,14 @@ - #define ext3_find_first_zero_bit ext2_find_first_zero_bit - #define ext3_find_next_zero_bit ext2_find_next_zero_bit - -+#ifndef ext2_find_next_le_bit -+#ifdef __LITTLE_ENDIAN -+#define ext2_find_next_le_bit(addr, size, off) find_next_bit((addr), (size), (off)) -+#else -+#error "mballoc needs a patch for big-endian systems - CFS bug 10634" -+#endif /* __LITTLE_ENDIAN */ -+#endif /* !ext2_find_next_le_bit */ -+ - /* - * Maximal mount counts between two filesystem checks - */ -@@ -744,7 +753,9 @@ - extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); - extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); - extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, -- unsigned long); -+ unsigned long, int); -+extern int ext3_new_block_old(handle_t *handle, struct inode *inode, -+ unsigned long goal, int *errp); - extern void ext3_free_blocks_sb (handle_t *, struct super_block *, - unsigned long, unsigned long, int *); - extern unsigned long ext3_count_free_blocks (struct super_block *); -@@ -865,6 +874,17 @@ - extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg); - -+/* mballoc.c */ -+extern long ext3_mb_stats; -+extern long ext3_mb_max_to_scan; -+extern int ext3_mb_init(struct super_block *, 
int); -+extern int ext3_mb_release(struct super_block *); -+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); -+extern int ext3_mb_reserve_blocks(struct super_block *, int); -+extern void ext3_mb_release_blocks(struct super_block *, int); -+int __init init_ext3_proc(void); -+void exit_ext3_proc(void); -+ - #endif /* __KERNEL__ */ - - /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ -Index: linux-2.6.16.i686/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.6.16.i686.orig/include/linux/ext3_fs_sb.h 2006-03-20 13:53:29.000000000 +0800 -+++ linux-2.6.16.i686/include/linux/ext3_fs_sb.h 2006-05-30 23:02:59.000000000 +0800 -@@ -21,8 +21,14 @@ - #include - #include - #include -+#include - #endif - #include -+#include -+ -+struct ext3_buddy_group_blocks; -+struct ext3_mb_history; -+#define EXT3_BB_MAX_BLOCKS - - /* - * third extended-fs super-block data in memory -@@ -78,6 +84,43 @@ - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ - #endif -+ -+ /* for buddy allocator */ -+ struct ext3_group_info ***s_group_info; -+ struct inode *s_buddy_cache; -+ long s_blocks_reserved; -+ spinlock_t s_reserve_lock; -+ struct list_head s_active_transaction; -+ struct list_head s_closed_transaction; -+ struct list_head s_committed_transaction; -+ spinlock_t s_md_lock; -+ tid_t s_last_transaction; -+ int s_mb_factor; -+ unsigned short *s_mb_offsets, *s_mb_maxs; -+ unsigned long s_stripe; -+ -+ /* history to debug policy */ -+ struct ext3_mb_history *s_mb_history; -+ int s_mb_history_cur; -+ int s_mb_history_max; -+ struct proc_dir_entry *s_mb_proc; -+ spinlock_t s_mb_history_lock; -+ -+ /* stats for buddy allocator */ -+ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ -+ atomic_t s_bal_success; /* we found long enough chunks */ -+ atomic_t s_bal_allocated; /* in blocks */ -+ 
atomic_t s_bal_ex_scanned; /* total extents scanned */ -+ atomic_t s_bal_goals; /* goal hits */ -+ atomic_t s_bal_breaks; /* too long searches */ -+ atomic_t s_bal_2orders; /* 2^order hits */ -+ spinlock_t s_bal_lock; -+ unsigned long s_mb_buddies_generated; -+ unsigned long long s_mb_generation_time; - }; -+ -+#define EXT3_GROUP_INFO(sb, group) \ -+ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ -+ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] - - #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.6.16.i686/fs/ext3/super.c -=================================================================== ---- linux-2.6.16.i686.orig/fs/ext3/super.c 2006-05-30 22:55:32.000000000 +0800 -+++ linux-2.6.16.i686/fs/ext3/super.c 2006-05-30 23:02:59.000000000 +0800 -@@ -392,6 +392,7 @@ - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_mb_release(sb); - ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); -@@ -640,6 +641,7 @@ - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, - Opt_extents, Opt_noextents, Opt_extdebug, -+ Opt_mballoc, Opt_nomballoc, Opt_stripe, - Opt_grpquota - }; - -@@ -694,6 +695,9 @@ - {Opt_extents, "extents"}, - {Opt_noextents, "noextents"}, - {Opt_extdebug, "extdebug"}, -+ {Opt_mballoc, "mballoc"}, -+ {Opt_nomballoc, "nomballoc"}, -+ {Opt_stripe, "stripe=%u"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL}, - {Opt_resize, "resize"}, -@@ -1041,6 +1043,19 @@ - case Opt_extdebug: - set_opt (sbi->s_mount_opt, EXTDEBUG); - break; -+ case Opt_mballoc: -+ set_opt(sbi->s_mount_opt, MBALLOC); -+ break; -+ case Opt_nomballoc: -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ break; -+ case Opt_stripe: -+ if (match_int(&args[0], &option)) -+ return 0; -+ if (option < 0) -+ return 0; -+ sbi->s_stripe = option; -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1766,6 +1771,7 @@ - ext3_count_dirs(sb)); - - ext3_ext_init(sb); -+ 
ext3_mb_init(sb, needs_recovery); - lock_kernel(); - return 0; - -@@ -2699,7 +2705,13 @@ - - static int __init init_ext3_fs(void) - { -- int err = init_ext3_xattr(); -+ int err; -+ -+ err = init_ext3_proc(); -+ if (err) -+ return err; -+ -+ err = init_ext3_xattr(); - if (err) - return err; - err = init_inodecache(); -@@ -2721,6 +2733,7 @@ - unregister_filesystem(&ext3_fs_type); - destroy_inodecache(); - exit_ext3_xattr(); -+ exit_ext3_proc(); - } - - int ext3_prep_san_write(struct inode *inode, long *blocks, -Index: linux-2.6.16.i686/fs/ext3/extents.c -=================================================================== ---- linux-2.6.16.i686.orig/fs/ext3/extents.c 2006-05-30 22:55:32.000000000 +0800 -+++ linux-2.6.16.i686/fs/ext3/extents.c 2006-05-30 23:02:59.000000000 +0800 -@@ -771,7 +771,7 @@ - for (i = 0; i < depth; i++) { - if (!ablocks[i]) - continue; -- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); -+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); - } - } - kfree(ablocks); -@@ -1428,7 +1428,7 @@ - path->p_idx->ei_leaf); - bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); - ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); -- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); -+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); - return err; - } - -@@ -1913,10 +1913,12 @@ - int needed = ext3_remove_blocks_credits(tree, ex, from, to); - handle_t *handle = ext3_journal_start(tree->inode, needed); - struct buffer_head *bh; -- int i; -+ int i, metadata = 0; - - if (IS_ERR(handle)) - return PTR_ERR(handle); -+ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) -+ metadata = 1; - if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { - /* tail removal */ - unsigned long num, start; -@@ -1928,7 +1930,7 @@ - bh = sb_find_get_block(tree->inode->i_sb, start + i); - ext3_forget(handle, 0, tree->inode, bh, start + i); - } -- ext3_free_blocks(handle, tree->inode, start, 
num); -+ ext3_free_blocks(handle, tree->inode, start, num, metadata); - } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { - printk("strange request: removal %lu-%lu from %u:%u\n", - from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.16.i686/fs/ext3/inode.c -=================================================================== ---- linux-2.6.16.i686.orig/fs/ext3/inode.c 2006-05-30 22:55:32.000000000 +0800 -+++ linux-2.6.16.i686/fs/ext3/inode.c 2006-05-30 23:02:59.000000000 +0800 -@@ -568,7 +568,7 @@ - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < keys; i++) -- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); - return err; - } - -@@ -1862,7 +1862,7 @@ - } - } - -- ext3_free_blocks(handle, inode, block_to_free, count); -+ ext3_free_blocks(handle, inode, block_to_free, count, 1); - } - - /** -@@ -2035,7 +2035,7 @@ - ext3_journal_test_restart(handle, inode); - } - -- ext3_free_blocks(handle, inode, nr, 1); -+ ext3_free_blocks(handle, inode, nr, 1, 1); - - if (parent_bh) { - /* -Index: linux-2.6.16.i686/fs/ext3/balloc.c -=================================================================== ---- linux-2.6.16.i686.orig/fs/ext3/balloc.c 2006-03-20 13:53:29.000000000 +0800 -+++ linux-2.6.16.i686/fs/ext3/balloc.c 2006-05-30 23:02:59.000000000 +0800 -@@ -80,7 +80,7 @@ - * - * Return buffer_head on success or NULL in case of failure. 
- */ --static struct buffer_head * -+struct buffer_head * - read_block_bitmap(struct super_block *sb, unsigned int block_group) - { - struct ext3_group_desc * desc; -@@ -491,24 +491,6 @@ - return; - } - --/* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks(handle_t *handle, struct inode *inode, -- unsigned long block, unsigned long count) --{ -- struct super_block * sb; -- int dquot_freed_blocks; -- -- sb = inode->i_sb; -- if (!sb) { -- printk ("ext3_free_blocks: nonexistent device"); -- return; -- } -- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); -- if (dquot_freed_blocks) -- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); -- return; --} -- - /* - * For ext3 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This -@@ -1154,7 +1136,7 @@ - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. - */ --int ext3_new_block(handle_t *handle, struct inode *inode, -+int ext3_new_block_old(handle_t *handle, struct inode *inode, - unsigned long goal, int *errp) - { - struct buffer_head *bitmap_bh = NULL; -Index: linux-2.6.16.i686/fs/ext3/xattr.c -=================================================================== ---- linux-2.6.16.i686.orig/fs/ext3/xattr.c 2006-03-20 13:53:29.000000000 +0800 -+++ linux-2.6.16.i686/fs/ext3/xattr.c 2006-05-30 23:02:59.000000000 +0800 -@@ -484,7 +484,7 @@ - ea_bdebug(bh, "refcount now=0; freeing"); - if (ce) - mb_cache_entry_free(ce); -- ext3_free_blocks(handle, inode, bh->b_blocknr, 1); -+ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); - get_bh(bh); - ext3_forget(handle, 1, inode, bh, bh->b_blocknr); - } else { -@@ -804,7 +804,7 @@ - new_bh = sb_getblk(sb, block); - if (!new_bh) { - getblk_failed: -- ext3_free_blocks(handle, inode, block, 1); -+ ext3_free_blocks(handle, inode, block, 1, 1); - error = -EIO; - goto cleanup; - } -Index: linux-2.6.16.i686/fs/ext3/mballoc.c 
-=================================================================== ---- linux-2.6.16.i686.orig/fs/ext3/mballoc.c 2006-05-31 04:14:15.752410384 +0800 -+++ linux-2.6.16.i686/fs/ext3/mballoc.c 2006-05-30 23:03:38.000000000 +0800 -@@ -0,0 +1,2725 @@ -+/* -+ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+ -+/* -+ * mballoc.c contains the multiblocks allocation routines -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * TODO: -+ * - bitmap read-ahead (proposed by Oleg Drokin aka green) -+ * - track min/max extents in each group for better group selection -+ * - mb_mark_used() may allocate chunk right after splitting buddy -+ * - special flag to advice allocator to look for requested + N blocks -+ * this may improve interaction between extents and mballoc -+ * - tree of groups sorted by number of free blocks -+ * - percpu reservation code (hotpath) -+ * - error handling -+ */ -+ -+/* -+ * with AGRESSIVE_CHECK allocator runs consistency checks over -+ * structures. these checks slow things down a lot -+ */ -+#define AGGRESSIVE_CHECK__ -+ -+/* -+ */ -+#define MB_DEBUG__ -+#ifdef MB_DEBUG -+#define mb_debug(fmt,a...) 
printk(fmt, ##a) -+#else -+#define mb_debug(fmt,a...) -+#endif -+ -+/* -+ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory -+ * and you can monitor it in /proc/fs/ext3//mb_history -+ */ -+#define EXT3_MB_HISTORY -+ -+/* -+ * How long mballoc can look for a best extent (in found extents) -+ */ -+long ext3_mb_max_to_scan = 500; -+ -+/* -+ * How long mballoc must look for a best extent -+ */ -+long ext3_mb_min_to_scan = 30; -+ -+/* -+ * with 'ext3_mb_stats' allocator will collect stats that will be -+ * shown at umount. The collecting costs though! -+ */ -+ -+long ext3_mb_stats = 1; -+ -+/* -+ * for which requests use 2^N search using buddies -+ */ -+long ext3_mb_order2_reqs = 8; -+ -+#ifdef EXT3_BB_MAX_BLOCKS -+#undef EXT3_BB_MAX_BLOCKS -+#endif -+#define EXT3_BB_MAX_BLOCKS 30 -+ -+struct ext3_free_metadata { -+ unsigned short group; -+ unsigned short num; -+ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; -+ struct list_head list; -+}; -+ -+struct ext3_group_info { -+ unsigned long bb_state; -+ unsigned long bb_tid; -+ struct ext3_free_metadata *bb_md_cur; -+ unsigned short bb_first_free; -+ unsigned short bb_free; -+ unsigned short bb_fragments; -+ unsigned short bb_counters[]; -+}; -+ -+ -+#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 -+#define EXT3_GROUP_INFO_LOCKED_BIT 1 -+ -+#define EXT3_MB_GRP_NEED_INIT(grp) \ -+ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) -+ -+struct ext3_free_extent { -+ __u16 fe_start; -+ __u16 fe_len; -+ __u16 fe_group; -+}; -+ -+struct ext3_allocation_context { -+ struct super_block *ac_sb; -+ -+ /* search goals */ -+ struct ext3_free_extent ac_g_ex; -+ -+ /* the best found extent */ -+ struct ext3_free_extent ac_b_ex; -+ -+ /* number of iterations done. 
we have to track to limit searching */ -+ unsigned long ac_ex_scanned; -+ __u16 ac_groups_scanned; -+ __u16 ac_found; -+ __u16 ac_tail; -+ __u16 ac_buddy; -+ __u8 ac_status; -+ __u8 ac_flags; /* allocation hints */ -+ __u8 ac_criteria; -+ __u8 ac_repeats; -+ __u8 ac_2order; /* if request is to allocate 2^N blocks and -+ * N > 0, the field stores N, otherwise 0 */ -+ -+ struct page *ac_buddy_page; -+ struct page *ac_bitmap_page; -+}; -+ -+#define AC_STATUS_CONTINUE 1 -+#define AC_STATUS_FOUND 2 -+#define AC_STATUS_BREAK 3 -+ -+struct ext3_mb_history { -+ struct ext3_free_extent goal; /* goal allocation */ -+ struct ext3_free_extent result; /* result allocation */ -+ unsigned pid; -+ unsigned ino; -+ __u16 found; /* how many extents have been found */ -+ __u16 groups; /* how many groups have been scanned */ -+ __u16 tail; /* what tail broke some buddy */ -+ __u16 buddy; /* buddy the tail ^^^ broke */ -+ __u8 cr; /* which phase the result extent was found at */ -+ __u8 merged; -+}; -+ -+struct ext3_buddy { -+ struct page *bd_buddy_page; -+ void *bd_buddy; -+ struct page *bd_bitmap_page; -+ void *bd_bitmap; -+ struct ext3_group_info *bd_info; -+ struct super_block *bd_sb; -+ __u16 bd_blkbits; -+ __u16 bd_group; -+}; -+#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) -+#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) -+ -+#ifndef EXT3_MB_HISTORY -+#define ext3_mb_store_history(sb,ino,ac) -+#else -+static void ext3_mb_store_history(struct super_block *, unsigned ino, -+ struct ext3_allocation_context *ac); -+#endif -+ -+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) -+ -+static struct proc_dir_entry *proc_root_ext3; -+ -+struct buffer_head *read_block_bitmap(struct super_block *, unsigned int); -+void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); -+void ext3_mb_free_committed_blocks(struct super_block *); -+ -+#if BITS_PER_LONG == 64 -+#define mb_correct_addr_and_bit(bit,addr) \ -+{ \ -+ bit += ((unsigned long) addr & 7UL) << 
3; \ -+ addr = (void *) ((unsigned long) addr & ~7UL); \ -+} -+#elif BITS_PER_LONG == 32 -+#define mb_correct_addr_and_bit(bit,addr) \ -+{ \ -+ bit += ((unsigned long) addr & 3UL) << 3; \ -+ addr = (void *) ((unsigned long) addr & ~3UL); \ -+} -+#else -+#error "how many bits you are?!" -+#endif -+ -+static inline int mb_test_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ return ext2_test_bit(bit, addr); -+} -+ -+static inline void mb_set_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit(bit, addr); -+} -+ -+static inline void mb_set_bit_atomic(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit_atomic(NULL, bit, addr); -+} -+ -+static inline void mb_clear_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit(bit, addr); -+} -+ -+static inline void mb_clear_bit_atomic(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit_atomic(NULL, bit, addr); -+} -+ -+static inline int mb_find_next_zero_bit(void *addr, int max, int start) -+{ -+ int fix; -+#if BITS_PER_LONG == 64 -+ fix = ((unsigned long) addr & 7UL) << 3; -+ addr = (void *) ((unsigned long) addr & ~7UL); -+#elif BITS_PER_LONG == 32 -+ fix = ((unsigned long) addr & 3UL) << 3; -+ addr = (void *) ((unsigned long) addr & ~3UL); -+#else -+#error "how many bits you are?!" 
-+#endif -+ max += fix; -+ start += fix; -+ return ext2_find_next_zero_bit(addr, max, start) - fix; -+} -+ -+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) -+{ -+ char *bb; -+ -+ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); -+ J_ASSERT(max != NULL); -+ -+ if (order > e3b->bd_blkbits + 1) { -+ *max = 0; -+ return NULL; -+ } -+ -+ /* at order 0 we see each particular block */ -+ *max = 1 << (e3b->bd_blkbits + 3); -+ if (order == 0) -+ return EXT3_MB_BITMAP(e3b); -+ -+ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; -+ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; -+ -+ return bb; -+} -+ -+#ifdef AGGRESSIVE_CHECK -+ -+static void mb_check_buddy(struct ext3_buddy *e3b) -+{ -+ int order = e3b->bd_blkbits + 1; -+ int max, max2, i, j, k, count; -+ int fragments = 0, fstart; -+ void *buddy, *buddy2; -+ -+ if (!test_opt(e3b->bd_sb, MBALLOC)) -+ return; -+ -+ { -+ static int mb_check_counter = 0; -+ if (mb_check_counter++ % 300 != 0) -+ return; -+ } -+ -+ while (order > 1) { -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); -+ buddy2 = mb_find_buddy(e3b, order - 1, &max2); -+ J_ASSERT(buddy2); -+ J_ASSERT(buddy != buddy2); -+ J_ASSERT(max * 2 == max2); -+ -+ count = 0; -+ for (i = 0; i < max; i++) { -+ -+ if (mb_test_bit(i, buddy)) { -+ /* only single bit in buddy2 may be 1 */ -+ if (!mb_test_bit(i << 1, buddy2)) -+ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); -+ else if (!mb_test_bit((i << 1) + 1, buddy2)) -+ J_ASSERT(mb_test_bit(i << 1, buddy2)); -+ continue; -+ } -+ -+ /* both bits in buddy2 must be 0 */ -+ J_ASSERT(mb_test_bit(i << 1, buddy2)); -+ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); -+ -+ for (j = 0; j < (1 << order); j++) { -+ k = (i * (1 << order)) + j; -+ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); -+ } -+ count++; -+ } -+ J_ASSERT(e3b->bd_info->bb_counters[order] == count); -+ order--; -+ } -+ -+ fstart = -1; -+ buddy = mb_find_buddy(e3b, 0, &max); -+ for (i = 0; i < max; i++) { -+ if 
(!mb_test_bit(i, buddy)) { -+ J_ASSERT(i >= e3b->bd_info->bb_first_free); -+ if (fstart == -1) { -+ fragments++; -+ fstart = i; -+ } -+ continue; -+ } -+ fstart = -1; -+ /* check used bits only */ -+ for (j = 0; j < e3b->bd_blkbits + 1; j++) { -+ buddy2 = mb_find_buddy(e3b, j, &max2); -+ k = i >> j; -+ J_ASSERT(k < max2); -+ J_ASSERT(mb_test_bit(k, buddy2)); -+ } -+ } -+ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); -+ J_ASSERT(e3b->bd_info->bb_fragments == fragments); -+} -+ -+#else -+#define mb_check_buddy(e3b) -+#endif -+ -+/* find most significant bit */ -+static int inline fmsb(unsigned short word) -+{ -+ int order; -+ -+ if (word > 255) { -+ order = 7; -+ word >>= 8; -+ } else { -+ order = -1; -+ } -+ -+ do { -+ order++; -+ word >>= 1; -+ } while (word != 0); -+ -+ return order; -+} -+ -+static void inline -+ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, -+ int len, struct ext3_group_info *grp) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ unsigned short min, max, chunk, border; -+ -+ mb_debug("mark %u/%u free\n", first, len); -+ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); -+ -+ border = 2 << sb->s_blocksize_bits; -+ -+ while (len > 0) { -+ /* find how many blocks can be covered since this position */ -+ max = ffs(first | border) - 1; -+ -+ /* find how many blocks of power 2 we need to mark */ -+ min = fmsb(len); -+ -+ mb_debug(" %u/%u -> max %u, min %u\n", -+ first & ((2 << sb->s_blocksize_bits) - 1), -+ len, max, min); -+ -+ if (max < min) -+ min = max; -+ chunk = 1 << min; -+ -+ /* mark multiblock chunks only */ -+ grp->bb_counters[min]++; -+ if (min > 0) { -+ mb_debug(" set %u at %u \n", first >> min, -+ sbi->s_mb_offsets[min]); -+ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); -+ } -+ -+ len -= chunk; -+ first += chunk; -+ } -+} -+ -+static void -+ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, -+ int group) -+{ -+ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); -+ 
unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); -+ unsigned short i = 0, first, len; -+ unsigned free = 0, fragments = 0; -+ unsigned long long period = get_cycles(); -+ -+ i = mb_find_next_zero_bit(bitmap, max, 0); -+ grp->bb_first_free = i; -+ while (i < max) { -+ fragments++; -+ first = i; -+ i = ext2_find_next_le_bit(bitmap, max, i); -+ len = i - first; -+ free += len; -+ if (len > 1) -+ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); -+ else -+ grp->bb_counters[0]++; -+ if (i < max) -+ i = mb_find_next_zero_bit(bitmap, max, i); -+ } -+ grp->bb_fragments = fragments; -+ -+ /* bb_state shouldn't being modified because all -+ * others waits for init completion on page lock */ -+ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); -+ if (free != grp->bb_free) { -+ printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n", -+ group, free, grp->bb_free); -+ grp->bb_free = free; -+ } -+ -+ period = get_cycles() - period; -+ spin_lock(&EXT3_SB(sb)->s_bal_lock); -+ EXT3_SB(sb)->s_mb_buddies_generated++; -+ EXT3_SB(sb)->s_mb_generation_time += period; -+ spin_unlock(&EXT3_SB(sb)->s_bal_lock); -+} -+ -+static int ext3_mb_init_cache(struct page *page) -+{ -+ int blocksize, blocks_per_page, groups_per_page; -+ int err = 0, i, first_group, first_block; -+ struct super_block *sb; -+ struct buffer_head *bhs; -+ struct buffer_head **bh; -+ struct inode *inode; -+ char *data, *bitmap; -+ -+ mb_debug("init page %lu\n", page->index); -+ -+ inode = page->mapping->host; -+ sb = inode->i_sb; -+ blocksize = 1 << inode->i_blkbits; -+ blocks_per_page = PAGE_CACHE_SIZE / blocksize; -+ -+ groups_per_page = blocks_per_page >> 1; -+ if (groups_per_page == 0) -+ groups_per_page = 1; -+ -+ /* allocate buffer_heads to read bitmaps */ -+ if (groups_per_page > 1) { -+ err = -ENOMEM; -+ i = sizeof(struct buffer_head *) * groups_per_page; -+ bh = kmalloc(i, GFP_NOFS); -+ if (bh == NULL) -+ goto out; -+ memset(bh, 0, i); -+ } else -+ bh = &bhs; -+ -+ first_group = page->index * 
blocks_per_page / 2; -+ -+ /* read all groups the page covers into the cache */ -+ for (i = 0; i < groups_per_page; i++) { -+ struct ext3_group_desc * desc; -+ -+ if (first_group + i >= EXT3_SB(sb)->s_groups_count) -+ break; -+ -+ err = -EIO; -+ desc = ext3_get_group_desc(sb, first_group + i, NULL); -+ if (desc == NULL) -+ goto out; -+ -+ err = -ENOMEM; -+ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); -+ if (bh[i] == NULL) -+ goto out; -+ -+ if (buffer_uptodate(bh[i])) -+ continue; -+ -+ lock_buffer(bh[i]); -+ if (buffer_uptodate(bh[i])) { -+ unlock_buffer(bh[i]); -+ continue; -+ } -+ -+ get_bh(bh[i]); -+ bh[i]->b_end_io = end_buffer_read_sync; -+ submit_bh(READ, bh[i]); -+ mb_debug("read bitmap for group %u\n", first_group + i); -+ } -+ -+ /* wait for I/O completion */ -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ wait_on_buffer(bh[i]); -+ -+ err = -EIO; -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ if (!buffer_uptodate(bh[i])) -+ goto out; -+ -+ first_block = page->index * blocks_per_page; -+ for (i = 0; i < blocks_per_page; i++) { -+ int group; -+ -+ group = (first_block + i) >> 1; -+ if (group >= EXT3_SB(sb)->s_groups_count) -+ break; -+ -+ data = page_address(page) + (i * blocksize); -+ bitmap = bh[group - first_group]->b_data; -+ -+ if ((first_block + i) & 1) { -+ /* this is block of buddy */ -+ mb_debug("put buddy for group %u in page %lu/%x\n", -+ group, page->index, i * blocksize); -+ memset(data, 0xff, blocksize); -+ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; -+ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, -+ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); -+ ext3_mb_generate_buddy(sb, data, bitmap, group); -+ } else { -+ /* this is block of bitmap */ -+ mb_debug("put bitmap for group %u in page %lu/%x\n", -+ group, page->index, i * blocksize); -+ memcpy(data, bitmap, blocksize); -+ } -+ } -+ SetPageUptodate(page); -+ -+out: -+ if (bh) { -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ brelse(bh[i]); -+ if (bh != 
&bhs) -+ kfree(bh); -+ } -+ return err; -+} -+ -+static int ext3_mb_load_buddy(struct super_block *sb, int group, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct inode *inode = sbi->s_buddy_cache; -+ int blocks_per_page, block, pnum, poff; -+ struct page *page; -+ -+ mb_debug("load group %u\n", group); -+ -+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; -+ -+ e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_info = EXT3_GROUP_INFO(sb, group); -+ e3b->bd_sb = sb; -+ e3b->bd_group = group; -+ e3b->bd_buddy_page = NULL; -+ e3b->bd_bitmap_page = NULL; -+ -+ block = group * 2; -+ pnum = block / blocks_per_page; -+ poff = block % blocks_per_page; -+ -+ /* we could use find_or_create_page(), but it locks page -+ * what we'd like to avoid in fast path ... */ -+ page = find_get_page(inode->i_mapping, pnum); -+ if (page == NULL || !PageUptodate(page)) { -+ if (page) -+ page_cache_release(page); -+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); -+ if (page) { -+ BUG_ON(page->mapping != inode->i_mapping); -+ if (!PageUptodate(page)) -+ ext3_mb_init_cache(page); -+ unlock_page(page); -+ } -+ } -+ if (page == NULL || !PageUptodate(page)) -+ goto err; -+ e3b->bd_bitmap_page = page; -+ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); -+ mark_page_accessed(page); -+ -+ block++; -+ pnum = block / blocks_per_page; -+ poff = block % blocks_per_page; -+ -+ page = find_get_page(inode->i_mapping, pnum); -+ if (page == NULL || !PageUptodate(page)) { -+ if (page) -+ page_cache_release(page); -+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); -+ if (page) { -+ BUG_ON(page->mapping != inode->i_mapping); -+ if (!PageUptodate(page)) -+ ext3_mb_init_cache(page); -+ unlock_page(page); -+ } -+ } -+ if (page == NULL || !PageUptodate(page)) -+ goto err; -+ e3b->bd_buddy_page = page; -+ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); -+ mark_page_accessed(page); -+ -+ J_ASSERT(e3b->bd_bitmap_page 
!= NULL); -+ J_ASSERT(e3b->bd_buddy_page != NULL); -+ -+ return 0; -+ -+err: -+ if (e3b->bd_bitmap_page) -+ page_cache_release(e3b->bd_bitmap_page); -+ if (e3b->bd_buddy_page) -+ page_cache_release(e3b->bd_buddy_page); -+ e3b->bd_buddy = NULL; -+ e3b->bd_bitmap = NULL; -+ return -EIO; -+} -+ -+static void ext3_mb_release_desc(struct ext3_buddy *e3b) -+{ -+ if (e3b->bd_bitmap_page) -+ page_cache_release(e3b->bd_bitmap_page); -+ if (e3b->bd_buddy_page) -+ page_cache_release(e3b->bd_buddy_page); -+} -+ -+ -+static inline void -+ext3_lock_group(struct super_block *sb, int group) -+{ -+ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_GROUP_INFO(sb, group)->bb_state); -+} -+ -+static inline void -+ext3_unlock_group(struct super_block *sb, int group) -+{ -+ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_GROUP_INFO(sb, group)->bb_state); -+} -+ -+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) -+{ -+ int order = 1; -+ void *bb; -+ -+ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); -+ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); -+ -+ bb = EXT3_MB_BUDDY(e3b); -+ while (order <= e3b->bd_blkbits + 1) { -+ block = block >> 1; -+ if (!mb_test_bit(block, bb)) { -+ /* this block is part of buddy of order 'order' */ -+ return order; -+ } -+ bb += 1 << (e3b->bd_blkbits - order); -+ order++; -+ } -+ return 0; -+} -+ -+static inline void mb_clear_bits(void *bm, int cur, int len) -+{ -+ __u32 *addr; -+ -+ len = cur + len; -+ while (cur < len) { -+ if ((cur & 31) == 0 && (len - cur) >= 32) { -+ /* fast path: clear whole word at once */ -+ addr = bm + (cur >> 3); -+ *addr = 0; -+ cur += 32; -+ continue; -+ } -+ mb_clear_bit_atomic(cur, bm); -+ cur++; -+ } -+} -+ -+static inline void mb_set_bits(void *bm, int cur, int len) -+{ -+ __u32 *addr; -+ -+ len = cur + len; -+ while (cur < len) { -+ if ((cur & 31) == 0 && (len - cur) >= 32) { -+ /* fast path: clear whole word at once */ -+ addr = bm + (cur >> 3); -+ *addr = 0xffffffff; -+ cur += 32; -+ 
continue; -+ } -+ mb_set_bit_atomic(cur, bm); -+ cur++; -+ } -+} -+ -+static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) -+{ -+ int block = 0, max = 0, order; -+ void *buddy, *buddy2; -+ -+ mb_check_buddy(e3b); -+ -+ e3b->bd_info->bb_free += count; -+ if (first < e3b->bd_info->bb_first_free) -+ e3b->bd_info->bb_first_free = first; -+ -+ /* let's maintain fragments counter */ -+ if (first != 0) -+ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); -+ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) -+ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); -+ if (block && max) -+ e3b->bd_info->bb_fragments--; -+ else if (!block && !max) -+ e3b->bd_info->bb_fragments++; -+ -+ /* let's maintain buddy itself */ -+ while (count-- > 0) { -+ block = first++; -+ order = 0; -+ -+ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); -+ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); -+ e3b->bd_info->bb_counters[order]++; -+ -+ /* start of the buddy */ -+ buddy = mb_find_buddy(e3b, order, &max); -+ -+ do { -+ block &= ~1UL; -+ if (mb_test_bit(block, buddy) || -+ mb_test_bit(block + 1, buddy)) -+ break; -+ -+ /* both the buddies are free, try to coalesce them */ -+ buddy2 = mb_find_buddy(e3b, order + 1, &max); -+ -+ if (!buddy2) -+ break; -+ -+ if (order > 0) { -+ /* for special purposes, we don't set -+ * free bits in bitmap */ -+ mb_set_bit(block, buddy); -+ mb_set_bit(block + 1, buddy); -+ } -+ e3b->bd_info->bb_counters[order]--; -+ e3b->bd_info->bb_counters[order]--; -+ -+ block = block >> 1; -+ order++; -+ e3b->bd_info->bb_counters[order]++; -+ -+ mb_clear_bit(block, buddy2); -+ buddy = buddy2; -+ } while (1); -+ } -+ mb_check_buddy(e3b); -+ -+ return 0; -+} -+ -+static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, -+ int needed, struct ext3_free_extent *ex) -+{ -+ int next = block, max, ord; -+ void *buddy; -+ -+ J_ASSERT(ex != NULL); -+ -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); -+ J_ASSERT(block < max); -+ 
if (mb_test_bit(block, buddy)) { -+ ex->fe_len = 0; -+ ex->fe_start = 0; -+ ex->fe_group = 0; -+ return 0; -+ } -+ -+ if (likely(order == 0)) { -+ /* find actual order */ -+ order = mb_find_order_for_block(e3b, block); -+ block = block >> order; -+ } -+ -+ ex->fe_len = 1 << order; -+ ex->fe_start = block << order; -+ ex->fe_group = e3b->bd_group; -+ -+ /* calc difference from given start */ -+ next = next - ex->fe_start; -+ ex->fe_len -= next; -+ ex->fe_start += next; -+ -+ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { -+ -+ if (block + 1 >= max) -+ break; -+ -+ next = (block + 1) * (1 << order); -+ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) -+ break; -+ -+ ord = mb_find_order_for_block(e3b, next); -+ -+ order = ord; -+ block = next >> order; -+ ex->fe_len += 1 << order; -+ } -+ -+ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); -+ return ex->fe_len; -+} -+ -+static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) -+{ -+ int ord, mlen = 0, max = 0, cur; -+ int start = ex->fe_start; -+ int len = ex->fe_len; -+ unsigned ret = 0; -+ int len0 = len; -+ void *buddy; -+ -+ mb_check_buddy(e3b); -+ -+ e3b->bd_info->bb_free -= len; -+ if (e3b->bd_info->bb_first_free == start) -+ e3b->bd_info->bb_first_free += len; -+ -+ /* let's maintain fragments counter */ -+ if (start != 0) -+ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); -+ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) -+ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); -+ if (mlen && max) -+ e3b->bd_info->bb_fragments++; -+ else if (!mlen && !max) -+ e3b->bd_info->bb_fragments--; -+ -+ /* let's maintain buddy itself */ -+ while (len) { -+ ord = mb_find_order_for_block(e3b, start); -+ -+ if (((start >> ord) << ord) == start && len >= (1 << ord)) { -+ /* the whole chunk may be allocated at once! 
*/ -+ mlen = 1 << ord; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ J_ASSERT((start >> ord) < max); -+ mb_set_bit(start >> ord, buddy); -+ e3b->bd_info->bb_counters[ord]--; -+ start += mlen; -+ len -= mlen; -+ J_ASSERT(len >= 0); -+ continue; -+ } -+ -+ /* store for history */ -+ if (ret == 0) -+ ret = len | (ord << 16); -+ -+ /* we have to split large buddy */ -+ J_ASSERT(ord > 0); -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_set_bit(start >> ord, buddy); -+ e3b->bd_info->bb_counters[ord]--; -+ -+ ord--; -+ cur = (start >> ord) & ~1U; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_clear_bit(cur, buddy); -+ mb_clear_bit(cur + 1, buddy); -+ e3b->bd_info->bb_counters[ord]++; -+ e3b->bd_info->bb_counters[ord]++; -+ } -+ -+ /* now drop all the bits in bitmap */ -+ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); -+ -+ mb_check_buddy(e3b); -+ -+ return ret; -+} -+ -+/* -+ * Must be called under group lock! -+ */ -+static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ unsigned long ret; -+ -+ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); -+ ret = mb_mark_used(e3b, &ac->ac_b_ex); -+ -+ ac->ac_status = AC_STATUS_FOUND; -+ ac->ac_tail = ret & 0xffff; -+ ac->ac_buddy = ret >> 16; -+ -+ /* hold in-core structures until allocated -+ * blocks are marked non-free in on-disk bitmap */ -+ ac->ac_buddy_page = e3b->bd_buddy_page; -+ page_cache_get(e3b->bd_buddy_page); -+ ac->ac_bitmap_page = e3b->bd_bitmap_page; -+ page_cache_get(e3b->bd_bitmap_page); -+} -+ -+/* -+ * The routine checks whether found extent is good enough. If it is, -+ * then the extent gets marked used and flag is set to the context -+ * to stop scanning. Otherwise, the extent is compared with the -+ * previous found extent and if new one is better, then it's stored -+ * in the context. Later, the best found extent will be used, if -+ * mballoc can't find good enough extent. -+ * -+ * FIXME: real allocation policy is to be designed yet! 
-+ */ -+static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, -+ struct ext3_free_extent *ex, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_free_extent *bex = &ac->ac_b_ex; -+ struct ext3_free_extent *gex = &ac->ac_g_ex; -+ -+ J_ASSERT(ex->fe_len > 0); -+ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); -+ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); -+ -+ ac->ac_found++; -+ -+ /* -+ * The special case - take what you catch first -+ */ -+ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * Let's check whether the chunk is good enough -+ */ -+ if (ex->fe_len == gex->fe_len) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * If this is first found extent, just store it in the context -+ */ -+ if (bex->fe_len == 0) { -+ *bex = *ex; -+ return; -+ } -+ -+ /* -+ * If new found extent is better, store it in the context -+ */ -+ if (bex->fe_len < gex->fe_len) { -+ /* if the request isn't satisfied, any found extent -+ * larger than previous best one is better */ -+ if (ex->fe_len > bex->fe_len) -+ *bex = *ex; -+ } else if (ex->fe_len > gex->fe_len) { -+ /* if the request is satisfied, then we try to find -+ * an extent that still satisfy the request, but is -+ * smaller than previous one */ -+ *bex = *ex; -+ } -+ -+ /* -+ * Let's scan at least few extents and don't pick up a first one -+ */ -+ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) -+ ac->ac_status = AC_STATUS_BREAK; -+ -+ /* -+ * We don't want to scan for a whole year -+ */ -+ if (ac->ac_found > ext3_mb_max_to_scan) -+ ac->ac_status = AC_STATUS_BREAK; -+} -+ -+static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_free_extent ex = ac->ac_b_ex; -+ int group = ex.fe_group, max, err; -+ -+ J_ASSERT(ex.fe_len > 0); -+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); -+ if 
(err) -+ return err; -+ -+ ext3_lock_group(ac->ac_sb, group); -+ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); -+ -+ if (max > 0) { -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ -+ ext3_unlock_group(ac->ac_sb, group); -+ -+ ext3_mb_release_desc(e3b); -+ -+ return 0; -+} -+ -+static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ int group = ac->ac_g_ex.fe_group, max, err; -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ struct ext3_super_block *es = sbi->s_es; -+ struct ext3_free_extent ex; -+ -+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); -+ if (err) -+ return err; -+ -+ ext3_lock_group(ac->ac_sb, group); -+ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, -+ ac->ac_g_ex.fe_len, &ex); -+ -+ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { -+ unsigned long start; -+ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + -+ ex.fe_start + le32_to_cpu(es->s_first_data_block)); -+ if (start % sbi->s_stripe == 0) { -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ } else if (max >= ac->ac_g_ex.fe_len) { -+ J_ASSERT(ex.fe_len > 0); -+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); -+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { -+ /* Sometimes, caller may want to merge even small -+ * number of blocks to an existing extent */ -+ J_ASSERT(ex.fe_len > 0); -+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); -+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ ext3_unlock_group(ac->ac_sb, group); -+ -+ ext3_mb_release_desc(e3b); -+ -+ return 0; -+} -+ -+/* -+ * The routine scans buddy structures (not bitmap!) 
from given order -+ * to max order and tries to find big enough chunk to satisfy the req -+ */ -+static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ struct ext3_group_info *grp = e3b->bd_info; -+ void *buddy; -+ int i, k, max; -+ -+ J_ASSERT(ac->ac_2order > 0); -+ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { -+ if (grp->bb_counters[i] == 0) -+ continue; -+ -+ buddy = mb_find_buddy(e3b, i, &max); -+ if (buddy == NULL) { -+ printk(KERN_ALERT "looking for wrong order?\n"); -+ break; -+ } -+ -+ k = mb_find_next_zero_bit(buddy, max, 0); -+ J_ASSERT(k < max); -+ -+ ac->ac_found++; -+ -+ ac->ac_b_ex.fe_len = 1 << i; -+ ac->ac_b_ex.fe_start = k << i; -+ ac->ac_b_ex.fe_group = e3b->bd_group; -+ -+ ext3_mb_use_best_found(ac, e3b); -+ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); -+ -+ if (unlikely(ext3_mb_stats)) -+ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); -+ -+ break; -+ } -+} -+ -+/* -+ * The routine scans the group and measures all found extents. -+ * In order to optimize scanning, caller must pass number of -+ * free blocks in the group, so the routine can know upper limit. 
-+ */ -+static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ void *bitmap = EXT3_MB_BITMAP(e3b); -+ struct ext3_free_extent ex; -+ int i, free; -+ -+ free = e3b->bd_info->bb_free; -+ J_ASSERT(free > 0); -+ -+ i = e3b->bd_info->bb_first_free; -+ -+ while (free && ac->ac_status == AC_STATUS_CONTINUE) { -+ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); -+ if (i >= sb->s_blocksize * 8) { -+ J_ASSERT(free == 0); -+ break; -+ } -+ -+ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); -+ J_ASSERT(ex.fe_len > 0); -+ J_ASSERT(free >= ex.fe_len); -+ -+ ext3_mb_measure_extent(ac, &ex, e3b); -+ -+ i += ex.fe_len; -+ free -= ex.fe_len; -+ } -+} -+ -+/* -+ * This is a special case for storages like raid5 -+ * we try to find stripe-aligned chunks for stripe-size requests -+ */ -+static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ void *bitmap = EXT3_MB_BITMAP(e3b); -+ struct ext3_free_extent ex; -+ unsigned long i, max; -+ -+ J_ASSERT(sbi->s_stripe != 0); -+ -+ /* find first stripe-aligned block */ -+ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + le32_to_cpu(sbi->s_es->s_first_data_block); -+ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; -+ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) -+ % EXT3_BLOCKS_PER_GROUP(sb); -+ -+ while (i < sb->s_blocksize * 8) { -+ if (!mb_test_bit(i, bitmap)) { -+ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); -+ if (max >= sbi->s_stripe) { -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ break; -+ } -+ } -+ i += sbi->s_stripe; -+ } -+} -+ -+static int ext3_mb_good_group(struct ext3_allocation_context *ac, -+ int group, int cr) -+{ -+ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); -+ unsigned free, fragments, i, bits; -+ -+ 
J_ASSERT(cr >= 0 && cr < 4); -+ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); -+ -+ free = grp->bb_free; -+ fragments = grp->bb_fragments; -+ if (free == 0) -+ return 0; -+ if (fragments == 0) -+ return 0; -+ -+ switch (cr) { -+ case 0: -+ J_ASSERT(ac->ac_2order != 0); -+ bits = ac->ac_sb->s_blocksize_bits + 1; -+ for (i = ac->ac_2order; i <= bits; i++) -+ if (grp->bb_counters[i] > 0) -+ return 1; -+ break; -+ case 1: -+ if ((free / fragments) >= ac->ac_g_ex.fe_len) -+ return 1; -+ break; -+ case 2: -+ if (free >= ac->ac_g_ex.fe_len) -+ return 1; -+ break; -+ case 3: -+ return 1; -+ default: -+ BUG(); -+ } -+ -+ return 0; -+} -+ -+int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, -+ unsigned long goal, int *len, int flags, int *errp) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_allocation_context ac; -+ int i, group, block, cr, err = 0; -+ struct ext3_group_desc *gdp; -+ struct ext3_super_block *es; -+ struct buffer_head *gdp_bh; -+ struct ext3_sb_info *sbi; -+ struct super_block *sb; -+ struct ext3_buddy e3b; -+ -+ J_ASSERT(len != NULL); -+ J_ASSERT(*len > 0); -+ -+ sb = inode->i_sb; -+ if (!sb) { -+ printk("ext3_mb_new_nblocks: nonexistent device"); -+ return 0; -+ } -+ -+ if (!test_opt(sb, MBALLOC)) { -+ static int ext3_mballoc_warning = 0; -+ if (ext3_mballoc_warning == 0) { -+ printk(KERN_ERR "EXT3-fs: multiblock request with " -+ "mballoc disabled!\n"); -+ ext3_mballoc_warning++; -+ } -+ *len = 1; -+ err = ext3_new_block_old(handle, inode, goal, errp); -+ return err; -+ } -+ -+ ext3_mb_poll_new_transaction(sb, handle); -+ -+ sbi = EXT3_SB(sb); -+ es = EXT3_SB(sb)->s_es; -+ -+ /* -+ * We can't allocate > group size -+ */ -+ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) -+ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; -+ -+ if (!(flags & EXT3_MB_HINT_RESERVED)) { -+ /* someone asks for non-reserved blocks */ -+ BUG_ON(*len > 1); -+ err = ext3_mb_reserve_blocks(sb, 1); -+ if (err) { -+ *errp = err; -+ return 0; -+ } -+ } -+ -+ ac.ac_buddy_page = NULL; 
-+ ac.ac_bitmap_page = NULL; -+ -+ /* -+ * Check quota for allocation of this blocks. -+ */ -+ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) -+ *len -= 1; -+ if (*len == 0) { -+ *errp = -EDQUOT; -+ block = 0; -+ goto out; -+ } -+ -+ /* start searching from the goal */ -+ if (goal < le32_to_cpu(es->s_first_data_block) || -+ goal >= le32_to_cpu(es->s_blocks_count)) -+ goal = le32_to_cpu(es->s_first_data_block); -+ group = (goal - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ block = ((goal - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb)); -+ -+ /* set up allocation goals */ -+ ac.ac_b_ex.fe_group = 0; -+ ac.ac_b_ex.fe_start = 0; -+ ac.ac_b_ex.fe_len = 0; -+ ac.ac_status = AC_STATUS_CONTINUE; -+ ac.ac_groups_scanned = 0; -+ ac.ac_ex_scanned = 0; -+ ac.ac_found = 0; -+ ac.ac_sb = inode->i_sb; -+ ac.ac_g_ex.fe_group = group; -+ ac.ac_g_ex.fe_start = block; -+ ac.ac_g_ex.fe_len = *len; -+ ac.ac_flags = flags; -+ ac.ac_2order = 0; -+ ac.ac_criteria = 0; -+ -+ if (*len == 1 && sbi->s_stripe) { -+ /* looks like a metadata, let's use a dirty hack for raid5 -+ * move all metadata in first groups in hope to hit cached -+ * sectors and thus avoid read-modify cycles in raid5 */ -+ ac.ac_g_ex.fe_group = group = 0; -+ } -+ -+ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ -+ i = ffs(*len); -+ if (i >= ext3_mb_order2_reqs) { -+ i--; -+ if ((*len & (~(1 << i))) == 0) -+ ac.ac_2order = i; -+ } -+ -+ /* first, try the goal */ -+ err = ext3_mb_find_by_goal(&ac, &e3b); -+ if (err) -+ goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) -+ goto found; -+ -+ /* Let's just scan groups to find more-less suitable blocks */ -+ cr = ac.ac_2order ? 
0 : 1; -+repeat: -+ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { -+ ac.ac_criteria = cr; -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { -+ if (group == EXT3_SB(sb)->s_groups_count) -+ group = 0; -+ -+ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { -+ /* we need full data about the group -+ * to make a good selection */ -+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); -+ if (err) -+ goto out_err; -+ ext3_mb_release_desc(&e3b); -+ } -+ -+ /* check is group good for our criteries */ -+ if (!ext3_mb_good_group(&ac, group, cr)) -+ continue; -+ -+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); -+ if (err) -+ goto out_err; -+ -+ ext3_lock_group(sb, group); -+ if (!ext3_mb_good_group(&ac, group, cr)) { -+ /* someone did allocation from this group */ -+ ext3_unlock_group(sb, group); -+ ext3_mb_release_desc(&e3b); -+ continue; -+ } -+ -+ ac.ac_groups_scanned++; -+ if (cr == 0) -+ ext3_mb_simple_scan_group(&ac, &e3b); -+ else if (cr == 1 && *len == sbi->s_stripe) -+ ext3_mb_scan_aligned(&ac, &e3b); -+ else -+ ext3_mb_complex_scan_group(&ac, &e3b); -+ -+ ext3_unlock_group(sb, group); -+ -+ ext3_mb_release_desc(&e3b); -+ -+ if (ac.ac_status != AC_STATUS_CONTINUE) -+ break; -+ } -+ } -+ -+ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && -+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { -+ /* -+ * We've been searching too long. Let's try to allocate -+ * the best chunk we've found so far -+ */ -+ -+ /*if (ac.ac_found > ext3_mb_max_to_scan) -+ printk(KERN_DEBUG "EXT3-fs: too long searching at " -+ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, -+ ac.ac_g_ex.fe_len);*/ -+ ext3_mb_try_best_found(&ac, &e3b); -+ if (ac.ac_status != AC_STATUS_FOUND) { -+ /* -+ * Someone more lucky has already allocated it. 
-+ * The only thing we can do is just take first -+ * found block(s) -+ printk(KERN_DEBUG "EXT3-fs: someone won our chunk\n"); -+ */ -+ ac.ac_b_ex.fe_group = 0; -+ ac.ac_b_ex.fe_start = 0; -+ ac.ac_b_ex.fe_len = 0; -+ ac.ac_status = AC_STATUS_CONTINUE; -+ ac.ac_flags |= EXT3_MB_HINT_FIRST; -+ cr = 3; -+ goto repeat; -+ } -+ } -+ -+ if (ac.ac_status != AC_STATUS_FOUND) { -+ /* -+ * We aren't lucky definitely -+ */ -+ DQUOT_FREE_BLOCK(inode, *len); -+ *errp = -ENOSPC; -+ block = 0; -+#if 1 -+ printk(KERN_ERR "EXT3-fs: can't allocate: status %d flags %d\n", -+ ac.ac_status, ac.ac_flags); -+ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d cr %d\n", -+ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, -+ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); -+ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", -+ sbi->s_blocks_reserved, ac.ac_found); -+ printk("EXT3-fs: groups: "); -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); -+ printk("\n"); -+#endif -+ goto out; -+ } -+ -+found: -+ J_ASSERT(ac.ac_b_ex.fe_len > 0); -+ -+ /* good news - free block(s) have been found. 
now it's time -+ * to mark block(s) in good old journaled bitmap */ -+ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + ac.ac_b_ex.fe_start -+ + le32_to_cpu(es->s_first_data_block); -+ -+ /* we made a desicion, now mark found blocks in good old -+ * bitmap to be journaled */ -+ -+ ext3_debug("using block group %d(%d)\n", -+ ac.ac_b_group.group, gdp->bg_free_blocks_count); -+ -+ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); -+ if (!bitmap_bh) { -+ *errp = -EIO; -+ goto out_err; -+ } -+ -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) { -+ *errp = err; -+ goto out_err; -+ } -+ -+ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); -+ if (!gdp) { -+ *errp = -EIO; -+ goto out_err; -+ } -+ -+ err = ext3_journal_get_write_access(handle, gdp_bh); -+ if (err) -+ goto out_err; -+ -+ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + ac.ac_b_ex.fe_start -+ + le32_to_cpu(es->s_first_data_block); -+ -+ if (block == le32_to_cpu(gdp->bg_block_bitmap) || -+ block == le32_to_cpu(gdp->bg_inode_bitmap) || -+ in_range(block, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group)) -+ ext3_error(sb, "ext3_new_block", -+ "Allocating block in system zone - " -+ "block = %u", block); -+#ifdef AGGRESSIVE_CHECK -+ for (i = 0; i < ac.ac_b_ex.fe_len; i++) -+ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); -+#endif -+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); -+ -+ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -+ - ac.ac_b_ex.fe_len); -+ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); -+ -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ if (err) -+ goto out_err; -+ err = ext3_journal_dirty_metadata(handle, gdp_bh); -+ if (err) -+ goto out_err; -+ -+ sb->s_dirt = 1; -+ *errp = 0; -+ 
brelse(bitmap_bh); -+ -+ /* drop non-allocated, but dquote'd blocks */ -+ J_ASSERT(*len >= ac.ac_b_ex.fe_len); -+ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); -+ -+ *len = ac.ac_b_ex.fe_len; -+ J_ASSERT(*len > 0); -+ J_ASSERT(block != 0); -+ goto out; -+ -+out_err: -+ /* if we've already allocated something, roll it back */ -+ if (ac.ac_status == AC_STATUS_FOUND) { -+ /* FIXME: free blocks here */ -+ } -+ -+ DQUOT_FREE_BLOCK(inode, *len); -+ brelse(bitmap_bh); -+ *errp = err; -+ block = 0; -+out: -+ if (ac.ac_buddy_page) -+ page_cache_release(ac.ac_buddy_page); -+ if (ac.ac_bitmap_page) -+ page_cache_release(ac.ac_bitmap_page); -+ -+ if (!(flags & EXT3_MB_HINT_RESERVED)) { -+ /* block wasn't reserved before and we reserved it -+ * at the beginning of allocation. it doesn't matter -+ * whether we allocated anything or we failed: time -+ * to release reservation. NOTE: because I expect -+ * any multiblock request from delayed allocation -+ * path only, here is single block always */ -+ ext3_mb_release_blocks(sb, 1); -+ } -+ -+ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { -+ atomic_inc(&sbi->s_bal_reqs); -+ atomic_add(*len, &sbi->s_bal_allocated); -+ if (*len >= ac.ac_g_ex.fe_len) -+ atomic_inc(&sbi->s_bal_success); -+ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); -+ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && -+ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) -+ atomic_inc(&sbi->s_bal_goals); -+ if (ac.ac_found > ext3_mb_max_to_scan) -+ atomic_inc(&sbi->s_bal_breaks); -+ } -+ -+ ext3_mb_store_history(sb, inode->i_ino, &ac); -+ -+ return block; -+} -+EXPORT_SYMBOL(ext3_mb_new_blocks); -+ -+#ifdef EXT3_MB_HISTORY -+struct ext3_mb_proc_session { -+ struct ext3_mb_history *history; -+ struct super_block *sb; -+ int start; -+ int max; -+}; -+ -+static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, -+ struct ext3_mb_history *hs, -+ int first) -+{ -+ if (hs == s->history + s->max) -+ hs = s->history; -+ if (!first && hs == s->history 
+ s->start) -+ return NULL; -+ while (hs->goal.fe_len == 0) { -+ hs++; -+ if (hs == s->history + s->max) -+ hs = s->history; -+ if (hs == s->history + s->start) -+ return NULL; -+ } -+ return hs; -+} -+ -+static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) -+{ -+ struct ext3_mb_proc_session *s = seq->private; -+ struct ext3_mb_history *hs; -+ int l = *pos; -+ -+ if (l == 0) -+ return SEQ_START_TOKEN; -+ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); -+ if (!hs) -+ return NULL; -+ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); -+ return hs; -+} -+ -+static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) -+{ -+ struct ext3_mb_proc_session *s = seq->private; -+ struct ext3_mb_history *hs = v; -+ -+ ++*pos; -+ if (v == SEQ_START_TOKEN) -+ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); -+ else -+ return ext3_mb_history_skip_empty(s, ++hs, 0); -+} -+ -+static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) -+{ -+ struct ext3_mb_history *hs = v; -+ char buf[20], buf2[20]; -+ -+ if (v == SEQ_START_TOKEN) { -+ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", -+ "pid", "inode", "goal", "result", "found", "grps", "cr", -+ "merge", "tail", "broken"); -+ return 0; -+ } -+ -+ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, -+ hs->goal.fe_start, hs->goal.fe_len); -+ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, -+ hs->result.fe_start, hs->result.fe_len); -+ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", -+ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, -+ hs->cr, hs->merged ? "M" : "", hs->tail, -+ hs->buddy ? 
1 << hs->buddy : 0); -+ return 0; -+} -+ -+static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) -+{ -+} -+ -+static struct seq_operations ext3_mb_seq_history_ops = { -+ .start = ext3_mb_seq_history_start, -+ .next = ext3_mb_seq_history_next, -+ .stop = ext3_mb_seq_history_stop, -+ .show = ext3_mb_seq_history_show, -+}; -+ -+static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) -+{ -+ struct super_block *sb = PDE(inode)->data; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_mb_proc_session *s; -+ int rc, size; -+ -+ s = kmalloc(sizeof(*s), GFP_KERNEL); -+ if (s == NULL) -+ return -EIO; -+ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; -+ s->history = kmalloc(size, GFP_KERNEL); -+ if (s == NULL) { -+ kfree(s); -+ return -EIO; -+ } -+ -+ spin_lock(&sbi->s_mb_history_lock); -+ memcpy(s->history, sbi->s_mb_history, size); -+ s->max = sbi->s_mb_history_max; -+ s->start = sbi->s_mb_history_cur % s->max; -+ spin_unlock(&sbi->s_mb_history_lock); -+ -+ rc = seq_open(file, &ext3_mb_seq_history_ops); -+ if (rc == 0) { -+ struct seq_file *m = (struct seq_file *)file->private_data; -+ m->private = s; -+ } else { -+ kfree(s->history); -+ kfree(s); -+ } -+ return rc; -+ -+} -+ -+static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) -+{ -+ struct seq_file *seq = (struct seq_file *)file->private_data; -+ struct ext3_mb_proc_session *s = seq->private; -+ kfree(s->history); -+ kfree(s); -+ return seq_release(inode, file); -+} -+ -+static struct file_operations ext3_mb_seq_history_fops = { -+ .owner = THIS_MODULE, -+ .open = ext3_mb_seq_history_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = ext3_mb_seq_history_release, -+}; -+ -+static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) -+{ -+ struct super_block *sb = seq->private; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ long group; -+ -+ if (*pos < 0 || *pos >= sbi->s_groups_count) -+ return NULL; -+ -+ group = 
*pos + 1; -+ return (void *) group; -+} -+ -+static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) -+{ -+ struct super_block *sb = seq->private; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ long group; -+ -+ ++*pos; -+ if (*pos < 0 || *pos >= sbi->s_groups_count) -+ return NULL; -+ group = *pos + 1; -+ return (void *) group;; -+} -+ -+static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) -+{ -+ struct super_block *sb = seq->private; -+ long group = (long) v, i; -+ struct sg { -+ struct ext3_group_info info; -+ unsigned short counters[16]; -+ } sg; -+ -+ group--; -+ if (group == 0) -+ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", -+ "group", "free", "frags", "first", "2^0", "2^1", "2^2", -+ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", -+ "2^11", "2^12", "2^13"); -+ -+ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + -+ sizeof(struct ext3_group_info); -+ ext3_lock_group(sb, group); -+ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); -+ ext3_unlock_group(sb, group); -+ -+ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) -+ return 0; -+ -+ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, -+ sg.info.bb_fragments, sg.info.bb_first_free); -+ for (i = 0; i <= 13; i++) -+ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
-+ sg.info.bb_counters[i] : 0); -+ seq_printf(seq, " ]\n"); -+ -+ return 0; -+} -+ -+static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) -+{ -+} -+ -+static struct seq_operations ext3_mb_seq_groups_ops = { -+ .start = ext3_mb_seq_groups_start, -+ .next = ext3_mb_seq_groups_next, -+ .stop = ext3_mb_seq_groups_stop, -+ .show = ext3_mb_seq_groups_show, -+}; -+ -+static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) -+{ -+ struct super_block *sb = PDE(inode)->data; -+ int rc; -+ -+ rc = seq_open(file, &ext3_mb_seq_groups_ops); -+ if (rc == 0) { -+ struct seq_file *m = (struct seq_file *)file->private_data; -+ m->private = sb; -+ } -+ return rc; -+ -+} -+ -+static struct file_operations ext3_mb_seq_groups_fops = { -+ .owner = THIS_MODULE, -+ .open = ext3_mb_seq_groups_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ -+static void ext3_mb_history_release(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ char name[64]; -+ -+ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); -+ remove_proc_entry("mb_groups", sbi->s_mb_proc); -+ remove_proc_entry("mb_history", sbi->s_mb_proc); -+ remove_proc_entry(name, proc_root_ext3); -+ -+ if (sbi->s_mb_history) -+ kfree(sbi->s_mb_history); -+} -+ -+static void ext3_mb_history_init(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ char name[64]; -+ int i; -+ -+ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); -+ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); -+ if (sbi->s_mb_proc != NULL) { -+ struct proc_dir_entry *p; -+ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); -+ if (p) { -+ p->proc_fops = &ext3_mb_seq_history_fops; -+ p->data = sb; -+ } -+ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); -+ if (p) { -+ p->proc_fops = &ext3_mb_seq_groups_fops; -+ p->data = sb; -+ } -+ } -+ -+ sbi->s_mb_history_max = 1000; -+ sbi->s_mb_history_cur = 0; -+ 
spin_lock_init(&sbi->s_mb_history_lock); -+ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); -+ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); -+ memset(sbi->s_mb_history, 0, i); -+ /* if we can't allocate history, then we simple won't use it */ -+} -+ -+static void -+ext3_mb_store_history(struct super_block *sb, unsigned ino, -+ struct ext3_allocation_context *ac) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_mb_history h; -+ -+ if (likely(sbi->s_mb_history == NULL)) -+ return; -+ -+ h.pid = current->pid; -+ h.ino = ino; -+ h.goal = ac->ac_g_ex; -+ h.result = ac->ac_b_ex; -+ h.found = ac->ac_found; -+ h.cr = ac->ac_criteria; -+ h.groups = ac->ac_groups_scanned; -+ h.tail = ac->ac_tail; -+ h.buddy = ac->ac_buddy; -+ h.merged = 0; -+ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && -+ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) -+ h.merged = 1; -+ -+ spin_lock(&sbi->s_mb_history_lock); -+ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); -+ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) -+ sbi->s_mb_history_cur = 0; -+ spin_unlock(&sbi->s_mb_history_lock); -+} -+ -+#else -+#define ext3_mb_history_release(sb) -+#define ext3_mb_history_init(sb) -+#endif -+ -+int ext3_mb_init_backend(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, j, len, metalen; -+ int num_meta_group_infos = -+ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> -+ EXT3_DESC_PER_BLOCK_BITS(sb); -+ struct ext3_group_info **meta_group_info; -+ -+ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte -+ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. -+ * So a two level scheme suffices for now. 
*/ -+ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * -+ num_meta_group_infos, GFP_KERNEL); -+ if (sbi->s_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); -+ return -ENOMEM; -+ } -+ sbi->s_buddy_cache = new_inode(sb); -+ if (sbi->s_buddy_cache == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); -+ goto err_freesgi; -+ } -+ -+ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); -+ for (i = 0; i < num_meta_group_infos; i++) { -+ if ((i + 1) == num_meta_group_infos) -+ metalen = sizeof(*meta_group_info) * -+ (sbi->s_groups_count - -+ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); -+ meta_group_info = kmalloc(metalen, GFP_KERNEL); -+ if (meta_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " -+ "buddy group\n"); -+ goto err_freemeta; -+ } -+ sbi->s_group_info[i] = meta_group_info; -+ } -+ -+ /* -+ * calculate needed size. if change bb_counters size, -+ * don't forget about ext3_mb_generate_buddy() -+ */ -+ len = sizeof(struct ext3_group_info); -+ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ struct ext3_group_desc * desc; -+ -+ meta_group_info = -+ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; -+ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); -+ -+ meta_group_info[j] = kmalloc(len, GFP_KERNEL); -+ if (meta_group_info[j] == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); -+ i--; -+ goto err_freebuddy; -+ } -+ desc = ext3_get_group_desc(sb, i, NULL); -+ if (desc == NULL) { -+ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); -+ goto err_freebuddy; -+ } -+ memset(meta_group_info[j], 0, len); -+ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, -+ &meta_group_info[j]->bb_state); -+ meta_group_info[j]->bb_free = -+ le16_to_cpu(desc->bg_free_blocks_count); -+ } -+ -+ return 0; -+ -+err_freebuddy: -+ while (i >= 0) { -+ kfree(EXT3_GROUP_INFO(sb, i)); -+ i--; -+ } -+ i = num_meta_group_infos; 
-+err_freemeta: -+ while (--i >= 0) -+ kfree(sbi->s_group_info[i]); -+ iput(sbi->s_buddy_cache); -+err_freesgi: -+ kfree(sbi->s_group_info); -+ return -ENOMEM; -+} -+ -+int ext3_mb_init(struct super_block *sb, int needs_recovery) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct inode *root = sb->s_root->d_inode; -+ unsigned i, offset, max; -+ struct dentry *dentry; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); -+ -+ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); -+ if (sbi->s_mb_offsets == NULL) { -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ return -ENOMEM; -+ } -+ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); -+ if (sbi->s_mb_maxs == NULL) { -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ kfree(sbi->s_mb_maxs); -+ return -ENOMEM; -+ } -+ -+ /* order 0 is regular bitmap */ -+ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; -+ sbi->s_mb_offsets[0] = 0; -+ -+ i = 1; -+ offset = 0; -+ max = sb->s_blocksize << 2; -+ do { -+ sbi->s_mb_offsets[i] = offset; -+ sbi->s_mb_maxs[i] = max; -+ offset += 1 << (sb->s_blocksize_bits - i); -+ max = max >> 1; -+ i++; -+ } while (i <= sb->s_blocksize_bits + 1); -+ -+ /* init file for buddy data */ -+ if ((i = ext3_mb_init_backend(sb))) { -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ kfree(sbi->s_mb_offsets); -+ kfree(sbi->s_mb_maxs); -+ return i; -+ } -+ -+ spin_lock_init(&sbi->s_reserve_lock); -+ spin_lock_init(&sbi->s_md_lock); -+ INIT_LIST_HEAD(&sbi->s_active_transaction); -+ INIT_LIST_HEAD(&sbi->s_closed_transaction); -+ INIT_LIST_HEAD(&sbi->s_committed_transaction); -+ spin_lock_init(&sbi->s_bal_lock); -+ -+ /* remove old on-disk buddy file */ -+ mutex_lock(&root->i_mutex); -+ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy")); -+ if (dentry->d_inode != NULL) { -+ i = vfs_unlink(root, dentry); -+ if (i != 0) -+ printk("EXT3-fs: can't remove .buddy file: %d\n", i); -+ } -+ dput(dentry); -+ mutex_unlock(&root->i_mutex); -+ -+ ext3_mb_history_init(sb); -+ -+ 
printk("EXT3-fs: mballoc enabled\n"); -+ return 0; -+} -+ -+int ext3_mb_release(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, num_meta_group_infos; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ /* release freed, non-committed blocks */ -+ spin_lock(&sbi->s_md_lock); -+ list_splice_init(&sbi->s_closed_transaction, -+ &sbi->s_committed_transaction); -+ list_splice_init(&sbi->s_active_transaction, -+ &sbi->s_committed_transaction); -+ spin_unlock(&sbi->s_md_lock); -+ ext3_mb_free_committed_blocks(sb); -+ -+ if (sbi->s_group_info) { -+ for (i = 0; i < sbi->s_groups_count; i++) -+ kfree(EXT3_GROUP_INFO(sb, i)); -+ num_meta_group_infos = (sbi->s_groups_count + -+ EXT3_DESC_PER_BLOCK(sb) - 1) >> -+ EXT3_DESC_PER_BLOCK_BITS(sb); -+ for (i = 0; i < num_meta_group_infos; i++) -+ kfree(sbi->s_group_info[i]); -+ kfree(sbi->s_group_info); -+ } -+ if (sbi->s_mb_offsets) -+ kfree(sbi->s_mb_offsets); -+ if (sbi->s_mb_maxs) -+ kfree(sbi->s_mb_maxs); -+ if (sbi->s_buddy_cache) -+ iput(sbi->s_buddy_cache); -+ if (sbi->s_blocks_reserved) -+ printk("ext3-fs: %ld blocks being reserved at umount!\n", -+ sbi->s_blocks_reserved); -+ if (ext3_mb_stats) { -+ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", -+ atomic_read(&sbi->s_bal_allocated), -+ atomic_read(&sbi->s_bal_reqs), -+ atomic_read(&sbi->s_bal_success)); -+ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " -+ "%u 2^N hits, %u breaks\n", -+ atomic_read(&sbi->s_bal_ex_scanned), -+ atomic_read(&sbi->s_bal_goals), -+ atomic_read(&sbi->s_bal_2orders), -+ atomic_read(&sbi->s_bal_breaks)); -+ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", -+ sbi->s_mb_buddies_generated++, -+ sbi->s_mb_generation_time); -+ } -+ -+ ext3_mb_history_release(sb); -+ -+ return 0; -+} -+ -+void ext3_mb_free_committed_blocks(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int err, i, count = 0, count2 = 0; -+ struct ext3_free_metadata *md; -+ struct 
ext3_buddy e3b; -+ -+ if (list_empty(&sbi->s_committed_transaction)) -+ return; -+ -+ /* there is committed blocks to be freed yet */ -+ do { -+ /* get next array of blocks */ -+ md = NULL; -+ spin_lock(&sbi->s_md_lock); -+ if (!list_empty(&sbi->s_committed_transaction)) { -+ md = list_entry(sbi->s_committed_transaction.next, -+ struct ext3_free_metadata, list); -+ list_del(&md->list); -+ } -+ spin_unlock(&sbi->s_md_lock); -+ -+ if (md == NULL) -+ break; -+ -+ mb_debug("gonna free %u blocks in group %u (0x%p):", -+ md->num, md->group, md); -+ -+ err = ext3_mb_load_buddy(sb, md->group, &e3b); -+ /* we expect to find existing buddy because it's pinned */ -+ BUG_ON(err != 0); -+ -+ /* there are blocks to put in buddy to make them really free */ -+ count += md->num; -+ count2++; -+ ext3_lock_group(sb, md->group); -+ for (i = 0; i < md->num; i++) { -+ mb_debug(" %u", md->blocks[i]); -+ mb_free_blocks(&e3b, md->blocks[i], 1); -+ } -+ mb_debug("\n"); -+ ext3_unlock_group(sb, md->group); -+ -+ /* balance refcounts from ext3_mb_free_metadata() */ -+ page_cache_release(e3b.bd_buddy_page); -+ page_cache_release(e3b.bd_bitmap_page); -+ -+ kfree(md); -+ ext3_mb_release_desc(&e3b); -+ -+ } while (md); -+ mb_debug("freed %u blocks in %u structures\n", count, count2); -+} -+ -+void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ if (sbi->s_last_transaction == handle->h_transaction->t_tid) -+ return; -+ -+ /* new transaction! time to close last one and free blocks for -+ * committed transaction. we know that only transaction can be -+ * active, so previos transaction can be being logged and we -+ * know that transaction before previous is known to be already -+ * logged. this means that now we may free blocks freed in all -+ * transactions before previous one. hope I'm clear enough ... 
*/ -+ -+ spin_lock(&sbi->s_md_lock); -+ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { -+ mb_debug("new transaction %lu, old %lu\n", -+ (unsigned long) handle->h_transaction->t_tid, -+ (unsigned long) sbi->s_last_transaction); -+ list_splice_init(&sbi->s_closed_transaction, -+ &sbi->s_committed_transaction); -+ list_splice_init(&sbi->s_active_transaction, -+ &sbi->s_closed_transaction); -+ sbi->s_last_transaction = handle->h_transaction->t_tid; -+ } -+ spin_unlock(&sbi->s_md_lock); -+ -+ ext3_mb_free_committed_blocks(sb); -+} -+ -+int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, -+ int group, int block, int count) -+{ -+ struct ext3_group_info *db = e3b->bd_info; -+ struct super_block *sb = e3b->bd_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_free_metadata *md; -+ int i; -+ -+ J_ASSERT(e3b->bd_bitmap_page != NULL); -+ J_ASSERT(e3b->bd_buddy_page != NULL); -+ -+ ext3_lock_group(sb, group); -+ for (i = 0; i < count; i++) { -+ md = db->bb_md_cur; -+ if (md && db->bb_tid != handle->h_transaction->t_tid) { -+ db->bb_md_cur = NULL; -+ md = NULL; -+ } -+ -+ if (md == NULL) { -+ ext3_unlock_group(sb, group); -+ md = kmalloc(sizeof(*md), GFP_KERNEL); -+ if (md == NULL) -+ return -ENOMEM; -+ md->num = 0; -+ md->group = group; -+ -+ ext3_lock_group(sb, group); -+ if (db->bb_md_cur == NULL) { -+ spin_lock(&sbi->s_md_lock); -+ list_add(&md->list, &sbi->s_active_transaction); -+ spin_unlock(&sbi->s_md_lock); -+ /* protect buddy cache from being freed, -+ * otherwise we'll refresh it from -+ * on-disk bitmap and lose not-yet-available -+ * blocks */ -+ page_cache_get(e3b->bd_buddy_page); -+ page_cache_get(e3b->bd_bitmap_page); -+ db->bb_md_cur = md; -+ db->bb_tid = handle->h_transaction->t_tid; -+ mb_debug("new md 0x%p for group %u\n", -+ md, md->group); -+ } else { -+ kfree(md); -+ md = db->bb_md_cur; -+ } -+ } -+ -+ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); -+ md->blocks[md->num] = block + i; -+ md->num++; -+ if (md->num == 
EXT3_BB_MAX_BLOCKS) { -+ /* no more space, put full container on a sb's list */ -+ db->bb_md_cur = NULL; -+ } -+ } -+ ext3_unlock_group(sb, group); -+ return 0; -+} -+ -+void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, -+ unsigned long block, unsigned long count, -+ int metadata, int *freed) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_group_desc *gdp; -+ struct ext3_super_block *es; -+ unsigned long bit, overflow; -+ struct buffer_head *gd_bh; -+ unsigned long block_group; -+ struct ext3_sb_info *sbi; -+ struct super_block *sb; -+ struct ext3_buddy e3b; -+ int err = 0, ret; -+ -+ *freed = 0; -+ sb = inode->i_sb; -+ if (!sb) { -+ printk ("ext3_free_blocks: nonexistent device"); -+ return; -+ } -+ -+ ext3_mb_poll_new_transaction(sb, handle); -+ -+ sbi = EXT3_SB(sb); -+ es = EXT3_SB(sb)->s_es; -+ if (block < le32_to_cpu(es->s_first_data_block) || -+ block + count < block || -+ block + count > le32_to_cpu(es->s_blocks_count)) { -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks not in datazone - " -+ "block = %lu, count = %lu", block, count); -+ goto error_return; -+ } -+ -+ ext3_debug("freeing block %lu\n", block); -+ -+do_more: -+ overflow = 0; -+ block_group = (block - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ bit = (block - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb); -+ /* -+ * Check to see if we are freeing blocks across a group -+ * boundary. 
-+ */ -+ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { -+ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); -+ count -= overflow; -+ } -+ brelse(bitmap_bh); -+ bitmap_bh = read_block_bitmap(sb, block_group); -+ if (!bitmap_bh) -+ goto error_return; -+ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); -+ if (!gdp) -+ goto error_return; -+ -+ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || -+ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || -+ in_range (block, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group) || -+ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group)) -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks in system zones - " -+ "Block = %lu, count = %lu", -+ block, count); -+ -+ BUFFER_TRACE(bitmap_bh, "getting write access"); -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) -+ goto error_return; -+ -+ /* -+ * We are about to modify some metadata. Call the journal APIs -+ * to unshare ->b_data if a currently-committing transaction is -+ * using it -+ */ -+ BUFFER_TRACE(gd_bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, gd_bh); -+ if (err) -+ goto error_return; -+ -+ err = ext3_mb_load_buddy(sb, block_group, &e3b); -+ if (err) -+ goto error_return; -+ -+#ifdef AGGRESSIVE_CHECK -+ { -+ int i; -+ for (i = 0; i < count; i++) -+ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); -+ } -+#endif -+ mb_clear_bits(bitmap_bh->b_data, bit, count); -+ -+ /* We dirtied the bitmap block */ -+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ -+ if (metadata) { -+ /* blocks being freed are metadata. 
these blocks shouldn't -+ * be used until this transaction is committed */ -+ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); -+ } else { -+ ext3_lock_group(sb, block_group); -+ mb_free_blocks(&e3b, bit, count); -+ ext3_unlock_group(sb, block_group); -+ } -+ -+ spin_lock(sb_bgl_lock(sbi, block_group)); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); -+ spin_unlock(sb_bgl_lock(sbi, block_group)); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, count); -+ -+ ext3_mb_release_desc(&e3b); -+ -+ *freed = count; -+ -+ /* And the group descriptor block */ -+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); -+ ret = ext3_journal_dirty_metadata(handle, gd_bh); -+ if (!err) err = ret; -+ -+ if (overflow && !err) { -+ block += count; -+ count = overflow; -+ goto do_more; -+ } -+ sb->s_dirt = 1; -+error_return: -+ brelse(bitmap_bh); -+ ext3_std_error(sb, err); -+ return; -+} -+ -+int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int free, ret = -ENOSPC; -+ -+ BUG_ON(blocks < 0); -+ spin_lock(&sbi->s_reserve_lock); -+ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); -+ if (blocks <= free - sbi->s_blocks_reserved) { -+ sbi->s_blocks_reserved += blocks; -+ ret = 0; -+ } -+ spin_unlock(&sbi->s_reserve_lock); -+ return ret; -+} -+ -+void ext3_mb_release_blocks(struct super_block *sb, int blocks) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ BUG_ON(blocks < 0); -+ spin_lock(&sbi->s_reserve_lock); -+ sbi->s_blocks_reserved -= blocks; -+ WARN_ON(sbi->s_blocks_reserved < 0); -+ if (sbi->s_blocks_reserved < 0) -+ sbi->s_blocks_reserved = 0; -+ spin_unlock(&sbi->s_reserve_lock); -+} -+ -+int ext3_new_block(handle_t *handle, struct inode *inode, -+ unsigned long goal, int *errp) -+{ -+ int ret, len; -+ -+ if (!test_opt(inode->i_sb, MBALLOC)) { -+ ret = ext3_new_block_old(handle, inode, goal, errp); -+ goto out; -+ } -+ len = 1; -+ ret = 
ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); -+out: -+ return ret; -+} -+ -+ -+void ext3_free_blocks(handle_t *handle, struct inode * inode, -+ unsigned long block, unsigned long count, int metadata) -+{ -+ struct super_block *sb; -+ int freed; -+ -+ sb = inode->i_sb; -+ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) -+ ext3_free_blocks_sb(handle, sb, block, count, &freed); -+ else -+ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); -+ if (freed) -+ DQUOT_FREE_BLOCK(inode, freed); -+ return; -+} -+ -+#define EXT3_ROOT "ext3" -+#define EXT3_MB_STATS_NAME "mb_stats" -+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" -+#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" -+#define EXT3_MB_ORDER2_REQ "mb_order2_req" -+ -+static int ext3_mb_stats_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_stats); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_stats_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_STATS_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); -+ return count; -+} -+ -+static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ 
printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ ext3_mb_max_to_scan = value; -+ -+ return count; -+} -+ -+static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ ext3_mb_min_to_scan = value; -+ -+ return count; -+} -+ -+static int ext3_mb_order2_req_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_order2_req_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ 
-+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ ext3_mb_order2_reqs = value; -+ -+ return count; -+} -+ -+int __init init_ext3_proc(void) -+{ -+ struct proc_dir_entry *proc_ext3_mb_stats; -+ struct proc_dir_entry *proc_ext3_mb_max_to_scan; -+ struct proc_dir_entry *proc_ext3_mb_min_to_scan; -+ struct proc_dir_entry *proc_ext3_mb_order2_req; -+ -+ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); -+ if (proc_root_ext3 == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); -+ return -EIO; -+ } -+ -+ /* Initialize EXT3_MB_STATS_NAME */ -+ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_stats == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_STATS_NAME); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_stats->data = NULL; -+ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; -+ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; -+ -+ /* Initialize EXT3_MAX_TO_SCAN_NAME */ -+ proc_ext3_mb_max_to_scan = create_proc_entry( -+ EXT3_MB_MAX_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_max_to_scan == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_MAX_TO_SCAN_NAME); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_max_to_scan->data = NULL; -+ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; -+ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; -+ -+ /* Initialize EXT3_MIN_TO_SCAN_NAME */ -+ proc_ext3_mb_min_to_scan = create_proc_entry( -+ EXT3_MB_MIN_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_min_to_scan == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_MIN_TO_SCAN_NAME); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, 
proc_root_ext3); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_min_to_scan->data = NULL; -+ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; -+ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; -+ -+ /* Initialize EXT3_ORDER2_REQ */ -+ proc_ext3_mb_order2_req = create_proc_entry( -+ EXT3_MB_ORDER2_REQ, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_order2_req == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_ORDER2_REQ); -+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_order2_req->data = NULL; -+ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; -+ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; -+ -+ return 0; -+} -+ -+void exit_ext3_proc(void) -+{ -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+} -Index: linux-2.6.16.i686/fs/ext3/Makefile -=================================================================== ---- linux-2.6.16.i686.orig/fs/ext3/Makefile 2006-05-30 22:55:32.000000000 +0800 -+++ linux-2.6.16.i686/fs/ext3/Makefile 2006-05-30 23:02:59.000000000 +0800 -@@ -6,7 +6,7 @@ - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o \ -- extents.o -+ extents.o mballoc.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff --git 
a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch deleted file mode 100644 index cec1877..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch +++ /dev/null @@ -1,3108 +0,0 @@ -Index: linux-2.6.5-7.282-full/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.5-7.282-full.orig/include/linux/ext3_fs.h 2006-10-24 22:18:28.000000000 +0400 -+++ linux-2.6.5-7.282-full/include/linux/ext3_fs.h 2006-10-24 22:18:28.000000000 +0400 -@@ -57,6 +57,14 @@ struct statfs; - #define ext3_debug(f, a...) do {} while (0) - #endif - -+#define EXT3_MULTIBLOCK_ALLOCATOR 1 -+ -+#define EXT3_MB_HINT_MERGE 1 -+#define EXT3_MB_HINT_RESERVED 2 -+#define EXT3_MB_HINT_METADATA 4 -+#define EXT3_MB_HINT_FIRST 8 -+#define EXT3_MB_HINT_BEST 16 -+ - /* - * Special inodes numbers - */ -@@ -339,6 +347,7 @@ struct ext3_inode { - #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -361,6 +370,14 @@ struct ext3_inode { - #define ext3_find_first_zero_bit ext2_find_first_zero_bit - #define ext3_find_next_zero_bit ext2_find_next_zero_bit - -+#ifndef ext2_find_next_le_bit -+#ifdef __LITTLE_ENDIAN -+#define ext2_find_next_le_bit(addr, size, off) find_next_bit((addr), (size), (off)) -+#else -+#error "mballoc needs a patch for big-endian systems - CFS bug 10634" -+#endif /* __LITTLE_ENDIAN */ -+#endif /* !ext2_find_next_le_bit */ -+ - /* - * Maximal mount counts between two filesystem checks - */ -@@ -700,7 +717,10 @@ extern int ext3_bg_has_super(struct supe - extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); - extern int ext3_new_block 
(handle_t *, struct inode *, unsigned long, int *); - extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, -- unsigned long); -+ unsigned long, int); -+extern int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); -+extern void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long, -+ unsigned long); - extern unsigned long ext3_count_free_blocks (struct super_block *); - extern void ext3_check_blocks_bitmap (struct super_block *); - extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, -@@ -824,6 +843,17 @@ extern void ext3_extents_initialize_bloc - extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg); - -+/* mballoc.c */ -+extern long ext3_mb_stats; -+extern long ext3_mb_max_to_scan; -+extern int ext3_mb_init(struct super_block *, int); -+extern int ext3_mb_release(struct super_block *); -+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); -+extern int ext3_mb_reserve_blocks(struct super_block *, int); -+extern void ext3_mb_release_blocks(struct super_block *, int); -+int __init init_ext3_proc(void); -+void exit_ext3_proc(void); -+ - #endif /* __KERNEL__ */ - - #define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) -Index: linux-2.6.5-7.282-full/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.6.5-7.282-full.orig/include/linux/ext3_fs_sb.h 2006-10-24 22:18:28.000000000 +0400 -+++ linux-2.6.5-7.282-full/include/linux/ext3_fs_sb.h 2006-10-24 22:18:28.000000000 +0400 -@@ -23,9 +23,15 @@ - #define EXT_INCLUDE - #include - #include -+#include - #endif - #endif - #include -+#include -+ -+struct ext3_buddy_group_blocks; -+struct ext3_mb_history; -+#define EXT3_BB_MAX_BLOCKS - - /* - * third extended-fs super-block data in memory -@@ -78,6 +84,43 @@ struct ext3_sb_info { - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - 
wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ - #endif -+ -+ /* for buddy allocator */ -+ struct ext3_group_info ***s_group_info; -+ struct inode *s_buddy_cache; -+ long s_blocks_reserved; -+ spinlock_t s_reserve_lock; -+ struct list_head s_active_transaction; -+ struct list_head s_closed_transaction; -+ struct list_head s_committed_transaction; -+ spinlock_t s_md_lock; -+ tid_t s_last_transaction; -+ int s_mb_factor; -+ unsigned short *s_mb_offsets, *s_mb_maxs; -+ unsigned long s_stripe; -+ -+ /* history to debug policy */ -+ struct ext3_mb_history *s_mb_history; -+ int s_mb_history_cur; -+ int s_mb_history_max; -+ struct proc_dir_entry *s_mb_proc; -+ spinlock_t s_mb_history_lock; -+ -+ /* stats for buddy allocator */ -+ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ -+ atomic_t s_bal_success; /* we found long enough chunks */ -+ atomic_t s_bal_allocated; /* in blocks */ -+ atomic_t s_bal_ex_scanned; /* total extents scanned */ -+ atomic_t s_bal_goals; /* goal hits */ -+ atomic_t s_bal_breaks; /* too long searches */ -+ atomic_t s_bal_2orders; /* 2^order hits */ -+ spinlock_t s_bal_lock; -+ unsigned long s_mb_buddies_generated; -+ unsigned long long s_mb_generation_time; - }; - -+#define EXT3_GROUP_INFO(sb, group) \ -+ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ -+ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] -+ - #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.6.5-7.282-full/fs/ext3/super.c -=================================================================== ---- linux-2.6.5-7.282-full.orig/fs/ext3/super.c 2006-10-24 22:18:28.000000000 +0400 -+++ linux-2.6.5-7.282-full/fs/ext3/super.c 2006-10-24 22:18:28.000000000 +0400 -@@ -389,6 +389,7 @@ void ext3_put_super (struct super_block - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_mb_release(sb); - ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); -@@ -588,6 +589,7 @@ enum { - Opt_err, - Opt_iopen, 
Opt_noiopen, Opt_iopen_nopriv, - Opt_extents, Opt_noextents, Opt_extdebug, -+ Opt_mballoc, Opt_nomballoc, Opt_stripe, - }; - - static match_table_t tokens = { -@@ -634,6 +636,9 @@ static match_table_t tokens = { - {Opt_extents, "extents"}, - {Opt_noextents, "noextents"}, - {Opt_extdebug, "extdebug"}, -+ {Opt_mballoc, "mballoc"}, -+ {Opt_nomballoc, "nomballoc"}, -+ {Opt_stripe, "stripe=%u"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL} - }; -@@ -859,6 +864,19 @@ static int parse_options (char * options - case Opt_extdebug: - set_opt (sbi->s_mount_opt, EXTDEBUG); - break; -+ case Opt_mballoc: -+ set_opt(sbi->s_mount_opt, MBALLOC); -+ break; -+ case Opt_nomballoc: -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ break; -+ case Opt_stripe: -+ if (match_int(&args[0], &option)) -+ return 0; -+ if (option < 0) -+ return 0; -+ sbi->s_stripe = option; -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1512,6 +1530,7 @@ static int ext3_fill_super (struct super - ext3_count_dirs(sb)); - - ext3_ext_init(sb); -+ ext3_mb_init(sb, needs_recovery); - - return 0; - -@@ -2160,7 +2179,13 @@ static struct file_system_type ext3_fs_t - - static int __init init_ext3_fs(void) - { -- int err = init_ext3_xattr(); -+ int err; -+ -+ err = init_ext3_proc(); -+ if (err) -+ return err; -+ -+ err = init_ext3_xattr(); - if (err) - return err; - err = init_inodecache(); -@@ -2189,6 +2214,7 @@ static void __exit exit_ext3_fs(void) - unregister_filesystem(&ext3_fs_type); - destroy_inodecache(); - exit_ext3_xattr(); -+ exit_ext3_proc(); - } - - int ext3_prep_san_write(struct inode *inode, long *blocks, -Index: linux-2.6.5-7.282-full/fs/ext3/extents.c -=================================================================== ---- linux-2.6.5-7.282-full.orig/fs/ext3/extents.c 2006-10-24 22:18:28.000000000 +0400 -+++ linux-2.6.5-7.282-full/fs/ext3/extents.c 2006-10-24 22:18:28.000000000 +0400 -@@ -779,7 +779,7 @@ cleanup: - for (i = 0; i < depth; i++) { - if 
(!ablocks[i]) - continue; -- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); -+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); - } - } - kfree(ablocks); -@@ -1438,7 +1438,7 @@ int ext3_ext_rm_idx(handle_t *handle, st - path->p_idx->ei_leaf); - bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); - ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); -- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); -+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); - return err; - } - -@@ -1923,10 +1923,12 @@ ext3_remove_blocks(struct ext3_extents_t - int needed = ext3_remove_blocks_credits(tree, ex, from, to); - handle_t *handle = ext3_journal_start(tree->inode, needed); - struct buffer_head *bh; -- int i; -+ int i, metadata = 0; - - if (IS_ERR(handle)) - return PTR_ERR(handle); -+ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) -+ metadata = 1; - if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { - /* tail removal */ - unsigned long num, start; -@@ -1938,7 +1940,7 @@ ext3_remove_blocks(struct ext3_extents_t - bh = sb_find_get_block(tree->inode->i_sb, start + i); - ext3_forget(handle, 0, tree->inode, bh, start + i); - } -- ext3_free_blocks(handle, tree->inode, start, num); -+ ext3_free_blocks(handle, tree->inode, start, num, metadata); - } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { - printk("strange request: removal %lu-%lu from %u:%u\n", - from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.5-7.282-full/fs/ext3/inode.c -=================================================================== ---- linux-2.6.5-7.282-full.orig/fs/ext3/inode.c 2006-10-24 22:18:28.000000000 +0400 -+++ linux-2.6.5-7.282-full/fs/ext3/inode.c 2006-10-24 22:18:28.000000000 +0400 -@@ -574,7 +574,7 @@ static int ext3_alloc_branch(handle_t *h - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < keys; i++) -- ext3_free_blocks(handle, inode, 
le32_to_cpu(branch[i].key), 1); -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); - return err; - } - -@@ -675,7 +675,7 @@ err_out: - if (err == -EAGAIN) - for (i = 0; i < num; i++) - ext3_free_blocks(handle, inode, -- le32_to_cpu(where[i].key), 1); -+ le32_to_cpu(where[i].key), 1, 1); - return err; - } - -@@ -1837,7 +1837,7 @@ ext3_clear_blocks(handle_t *handle, stru - } - } - -- ext3_free_blocks(handle, inode, block_to_free, count); -+ ext3_free_blocks(handle, inode, block_to_free, count, 1); - } - - /** -@@ -2008,7 +2008,7 @@ static void ext3_free_branches(handle_t - ext3_journal_test_restart(handle, inode); - } - -- ext3_free_blocks(handle, inode, nr, 1); -+ ext3_free_blocks(handle, inode, nr, 1, 1); - - if (parent_bh) { - /* -Index: linux-2.6.5-7.282-full/fs/ext3/balloc.c -=================================================================== ---- linux-2.6.5-7.282-full.orig/fs/ext3/balloc.c 2006-08-30 18:12:13.000000000 +0400 -+++ linux-2.6.5-7.282-full/fs/ext3/balloc.c 2006-10-24 22:18:28.000000000 +0400 -@@ -78,7 +78,7 @@ struct ext3_group_desc * ext3_get_group_ - * - * Return buffer_head on success or NULL in case of failure. - */ --static struct buffer_head * -+struct buffer_head * - read_block_bitmap(struct super_block *sb, unsigned int block_group) - { - struct ext3_group_desc * desc; -@@ -274,7 +274,7 @@ void ext3_discard_reservation(struct ino - } - - /* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks(handle_t *handle, struct inode *inode, -+void ext3_free_blocks_old(handle_t *handle, struct inode *inode, - unsigned long block, unsigned long count) - { - struct buffer_head *bitmap_bh = NULL; -@@ -1142,7 +1142,7 @@ int ext3_should_retry_alloc(struct super - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. 
- */ --int ext3_new_block(handle_t *handle, struct inode *inode, -+int ext3_new_block_old(handle_t *handle, struct inode *inode, - unsigned long goal, int *errp) - { - struct buffer_head *bitmap_bh = NULL; -Index: linux-2.6.5-7.282-full/fs/ext3/xattr.c -=================================================================== ---- linux-2.6.5-7.282-full.orig/fs/ext3/xattr.c 2006-10-24 22:18:28.000000000 +0400 -+++ linux-2.6.5-7.282-full/fs/ext3/xattr.c 2006-10-24 22:18:28.000000000 +0400 -@@ -1371,7 +1371,7 @@ ext3_xattr_set_handle2(handle_t *handle, - new_bh = sb_getblk(sb, block); - if (!new_bh) { - getblk_failed: -- ext3_free_blocks(handle, inode, block, 1); -+ ext3_free_blocks(handle, inode, block, 1, 1); - error = -EIO; - goto cleanup; - } -@@ -1411,7 +1411,7 @@ getblk_failed: - if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { - /* Free the old block. */ - ea_bdebug(old_bh, "freeing"); -- ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1); -+ ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1, 1); - - /* ext3_forget() calls bforget() for us, but we - let our caller release old_bh, so we need to -@@ -1519,7 +1519,7 @@ ext3_xattr_delete_inode(handle_t *handle - mb_cache_entry_free(ce); - ce = NULL; - } -- ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1); -+ ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1, 1); - get_bh(bh); - ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); - } else { -Index: linux-2.6.5-7.282-full/fs/ext3/mballoc.c -=================================================================== ---- linux-2.6.5-7.282-full.orig/fs/ext3/mballoc.c 2006-10-23 18:07:54.821533176 +0400 -+++ linux-2.6.5-7.282-full/fs/ext3/mballoc.c 2006-10-24 22:20:45.000000000 +0400 -@@ -0,0 +1,2726 @@ -+/* -+ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License 
version 2 as -+ * published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+ -+/* -+ * mballoc.c contains the multiblocks allocation routines -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * TODO: -+ * - bitmap read-ahead (proposed by Oleg Drokin aka green) -+ * - track min/max extents in each group for better group selection -+ * - mb_mark_used() may allocate chunk right after splitting buddy -+ * - special flag to advice allocator to look for requested + N blocks -+ * this may improve interaction between extents and mballoc -+ * - tree of groups sorted by number of free blocks -+ * - percpu reservation code (hotpath) -+ * - error handling -+ */ -+ -+/* -+ * with AGRESSIVE_CHECK allocator runs consistency checks over -+ * structures. these checks slow things down a lot -+ */ -+#define AGGRESSIVE_CHECK__ -+ -+/* -+ */ -+#define MB_DEBUG__ -+#ifdef MB_DEBUG -+#define mb_debug(fmt,a...) printk(fmt, ##a) -+#else -+#define mb_debug(fmt,a...) 
-+#endif -+ -+/* -+ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory -+ * and you can monitor it in /proc/fs/ext3//mb_history -+ */ -+#define EXT3_MB_HISTORY -+ -+/* -+ * How long mballoc can look for a best extent (in found extents) -+ */ -+long ext3_mb_max_to_scan = 500; -+ -+/* -+ * How long mballoc must look for a best extent -+ */ -+long ext3_mb_min_to_scan = 30; -+ -+/* -+ * with 'ext3_mb_stats' allocator will collect stats that will be -+ * shown at umount. The collecting costs though! -+ */ -+ -+long ext3_mb_stats = 1; -+ -+/* -+ * for which requests use 2^N search using buddies -+ */ -+long ext3_mb_order2_reqs = 8; -+ -+#ifdef EXT3_BB_MAX_BLOCKS -+#undef EXT3_BB_MAX_BLOCKS -+#endif -+#define EXT3_BB_MAX_BLOCKS 30 -+ -+struct ext3_free_metadata { -+ unsigned short group; -+ unsigned short num; -+ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; -+ struct list_head list; -+}; -+ -+struct ext3_group_info { -+ unsigned long bb_state; -+ unsigned long bb_tid; -+ struct ext3_free_metadata *bb_md_cur; -+ unsigned short bb_first_free; -+ unsigned short bb_free; -+ unsigned short bb_fragments; -+ unsigned short bb_counters[]; -+}; -+ -+ -+#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 -+#define EXT3_GROUP_INFO_LOCKED_BIT 1 -+ -+#define EXT3_MB_GRP_NEED_INIT(grp) \ -+ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) -+ -+struct ext3_free_extent { -+ __u16 fe_start; -+ __u16 fe_len; -+ __u16 fe_group; -+}; -+ -+struct ext3_allocation_context { -+ struct super_block *ac_sb; -+ -+ /* search goals */ -+ struct ext3_free_extent ac_g_ex; -+ -+ /* the best found extent */ -+ struct ext3_free_extent ac_b_ex; -+ -+ /* number of iterations done. 
we have to track to limit searching */ -+ unsigned long ac_ex_scanned; -+ __u16 ac_groups_scanned; -+ __u16 ac_found; -+ __u16 ac_tail; -+ __u16 ac_buddy; -+ __u8 ac_status; -+ __u8 ac_flags; /* allocation hints */ -+ __u8 ac_criteria; -+ __u8 ac_repeats; -+ __u8 ac_2order; /* if request is to allocate 2^N blocks and -+ * N > 0, the field stores N, otherwise 0 */ -+ -+ struct page *ac_buddy_page; -+ struct page *ac_bitmap_page; -+}; -+ -+#define AC_STATUS_CONTINUE 1 -+#define AC_STATUS_FOUND 2 -+#define AC_STATUS_BREAK 3 -+ -+struct ext3_mb_history { -+ struct ext3_free_extent goal; /* goal allocation */ -+ struct ext3_free_extent result; /* result allocation */ -+ unsigned pid; -+ unsigned ino; -+ __u16 found; /* how many extents have been found */ -+ __u16 groups; /* how many groups have been scanned */ -+ __u16 tail; /* what tail broke some buddy */ -+ __u16 buddy; /* buddy the tail ^^^ broke */ -+ __u8 cr; /* which phase the result extent was found at */ -+ __u8 merged; -+}; -+ -+struct ext3_buddy { -+ struct page *bd_buddy_page; -+ void *bd_buddy; -+ struct page *bd_bitmap_page; -+ void *bd_bitmap; -+ struct ext3_group_info *bd_info; -+ struct super_block *bd_sb; -+ __u16 bd_blkbits; -+ __u16 bd_group; -+}; -+#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) -+#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) -+ -+#ifndef EXT3_MB_HISTORY -+#define ext3_mb_store_history(sb,ino,ac) -+#else -+static void ext3_mb_store_history(struct super_block *, unsigned ino, -+ struct ext3_allocation_context *ac); -+#endif -+ -+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) -+ -+static struct proc_dir_entry *proc_root_ext3; -+ -+struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); -+void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); -+void ext3_mb_free_committed_blocks(struct super_block *); -+ -+#if BITS_PER_LONG == 64 -+#define mb_correct_addr_and_bit(bit,addr) \ -+{ \ -+ bit += ((unsigned long) addr & 7UL) << 
3; \ -+ addr = (void *) ((unsigned long) addr & ~7UL); \ -+} -+#elif BITS_PER_LONG == 32 -+#define mb_correct_addr_and_bit(bit,addr) \ -+{ \ -+ bit += ((unsigned long) addr & 3UL) << 3; \ -+ addr = (void *) ((unsigned long) addr & ~3UL); \ -+} -+#else -+#error "how many bits you are?!" -+#endif -+ -+static inline int mb_test_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ return ext2_test_bit(bit, addr); -+} -+ -+static inline void mb_set_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit(bit, addr); -+} -+ -+static inline void mb_set_bit_atomic(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit_atomic(NULL, bit, addr); -+} -+ -+static inline void mb_clear_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit(bit, addr); -+} -+ -+static inline void mb_clear_bit_atomic(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit_atomic(NULL, bit, addr); -+} -+ -+static inline int mb_find_next_zero_bit(void *addr, int max, int start) -+{ -+ int fix; -+#if BITS_PER_LONG == 64 -+ fix = ((unsigned long) addr & 7UL) << 3; -+ addr = (void *) ((unsigned long) addr & ~7UL); -+#elif BITS_PER_LONG == 32 -+ fix = ((unsigned long) addr & 3UL) << 3; -+ addr = (void *) ((unsigned long) addr & ~3UL); -+#else -+#error "how many bits you are?!" 
-+#endif -+ max += fix; -+ start += fix; -+ return ext2_find_next_zero_bit(addr, max, start) - fix; -+} -+ -+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) -+{ -+ char *bb; -+ -+ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); -+ J_ASSERT(max != NULL); -+ -+ if (order > e3b->bd_blkbits + 1) { -+ *max = 0; -+ return NULL; -+ } -+ -+ /* at order 0 we see each particular block */ -+ *max = 1 << (e3b->bd_blkbits + 3); -+ if (order == 0) -+ return EXT3_MB_BITMAP(e3b); -+ -+ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; -+ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; -+ -+ return bb; -+} -+ -+#ifdef AGGRESSIVE_CHECK -+ -+static void mb_check_buddy(struct ext3_buddy *e3b) -+{ -+ int order = e3b->bd_blkbits + 1; -+ int max, max2, i, j, k, count; -+ int fragments = 0, fstart; -+ void *buddy, *buddy2; -+ -+ if (!test_opt(e3b->bd_sb, MBALLOC)) -+ return; -+ -+ { -+ static int mb_check_counter = 0; -+ if (mb_check_counter++ % 300 != 0) -+ return; -+ } -+ -+ while (order > 1) { -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); -+ buddy2 = mb_find_buddy(e3b, order - 1, &max2); -+ J_ASSERT(buddy2); -+ J_ASSERT(buddy != buddy2); -+ J_ASSERT(max * 2 == max2); -+ -+ count = 0; -+ for (i = 0; i < max; i++) { -+ -+ if (mb_test_bit(i, buddy)) { -+ /* only single bit in buddy2 may be 1 */ -+ if (!mb_test_bit(i << 1, buddy2)) -+ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); -+ else if (!mb_test_bit((i << 1) + 1, buddy2)) -+ J_ASSERT(mb_test_bit(i << 1, buddy2)); -+ continue; -+ } -+ -+ /* both bits in buddy2 must be 0 */ -+ J_ASSERT(mb_test_bit(i << 1, buddy2)); -+ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); -+ -+ for (j = 0; j < (1 << order); j++) { -+ k = (i * (1 << order)) + j; -+ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); -+ } -+ count++; -+ } -+ J_ASSERT(e3b->bd_info->bb_counters[order] == count); -+ order--; -+ } -+ -+ fstart = -1; -+ buddy = mb_find_buddy(e3b, 0, &max); -+ for (i = 0; i < max; i++) { -+ if 
(!mb_test_bit(i, buddy)) { -+ J_ASSERT(i >= e3b->bd_info->bb_first_free); -+ if (fstart == -1) { -+ fragments++; -+ fstart = i; -+ } -+ continue; -+ } -+ fstart = -1; -+ /* check used bits only */ -+ for (j = 0; j < e3b->bd_blkbits + 1; j++) { -+ buddy2 = mb_find_buddy(e3b, j, &max2); -+ k = i >> j; -+ J_ASSERT(k < max2); -+ J_ASSERT(mb_test_bit(k, buddy2)); -+ } -+ } -+ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); -+ J_ASSERT(e3b->bd_info->bb_fragments == fragments); -+} -+ -+#else -+#define mb_check_buddy(e3b) -+#endif -+ -+/* find most significant bit */ -+static int inline fmsb(unsigned short word) -+{ -+ int order; -+ -+ if (word > 255) { -+ order = 7; -+ word >>= 8; -+ } else { -+ order = -1; -+ } -+ -+ do { -+ order++; -+ word >>= 1; -+ } while (word != 0); -+ -+ return order; -+} -+ -+static void inline -+ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, -+ int len, struct ext3_group_info *grp) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ unsigned short min, max, chunk, border; -+ -+ mb_debug("mark %u/%u free\n", first, len); -+ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); -+ -+ border = 2 << sb->s_blocksize_bits; -+ -+ while (len > 0) { -+ /* find how many blocks can be covered since this position */ -+ max = ffs(first | border) - 1; -+ -+ /* find how many blocks of power 2 we need to mark */ -+ min = fmsb(len); -+ -+ mb_debug(" %u/%u -> max %u, min %u\n", -+ first & ((2 << sb->s_blocksize_bits) - 1), -+ len, max, min); -+ -+ if (max < min) -+ min = max; -+ chunk = 1 << min; -+ -+ /* mark multiblock chunks only */ -+ grp->bb_counters[min]++; -+ if (min > 0) { -+ mb_debug(" set %u at %u \n", first >> min, -+ sbi->s_mb_offsets[min]); -+ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); -+ } -+ -+ len -= chunk; -+ first += chunk; -+ } -+} -+ -+static void -+ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, -+ int group) -+{ -+ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); -+ 
unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); -+ unsigned short i = 0, first, len; -+ unsigned free = 0, fragments = 0; -+ unsigned long long period = get_cycles(); -+ -+ i = mb_find_next_zero_bit(bitmap, max, 0); -+ grp->bb_first_free = i; -+ while (i < max) { -+ fragments++; -+ first = i; -+ i = ext2_find_next_le_bit(bitmap, max, i); -+ len = i - first; -+ free += len; -+ if (len > 1) -+ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); -+ else -+ grp->bb_counters[0]++; -+ if (i < max) -+ i = mb_find_next_zero_bit(bitmap, max, i); -+ } -+ grp->bb_fragments = fragments; -+ -+ /* bb_state shouldn't being modified because all -+ * others waits for init completion on page lock */ -+ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); -+ if (free != grp->bb_free) { -+ printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n", -+ group, free, grp->bb_free); -+ grp->bb_free = free; -+ } -+ -+ period = get_cycles() - period; -+ spin_lock(&EXT3_SB(sb)->s_bal_lock); -+ EXT3_SB(sb)->s_mb_buddies_generated++; -+ EXT3_SB(sb)->s_mb_generation_time += period; -+ spin_unlock(&EXT3_SB(sb)->s_bal_lock); -+} -+ -+static int ext3_mb_init_cache(struct page *page) -+{ -+ int blocksize, blocks_per_page, groups_per_page; -+ int err = 0, i, first_group, first_block; -+ struct super_block *sb; -+ struct buffer_head *bhs; -+ struct buffer_head **bh; -+ struct inode *inode; -+ char *data, *bitmap; -+ -+ mb_debug("init page %lu\n", page->index); -+ -+ inode = page->mapping->host; -+ sb = inode->i_sb; -+ blocksize = 1 << inode->i_blkbits; -+ blocks_per_page = PAGE_CACHE_SIZE / blocksize; -+ -+ groups_per_page = blocks_per_page >> 1; -+ if (groups_per_page == 0) -+ groups_per_page = 1; -+ -+ /* allocate buffer_heads to read bitmaps */ -+ if (groups_per_page > 1) { -+ err = -ENOMEM; -+ i = sizeof(struct buffer_head *) * groups_per_page; -+ bh = kmalloc(i, GFP_NOFS); -+ if (bh == NULL) -+ goto out; -+ memset(bh, 0, i); -+ } else -+ bh = &bhs; -+ -+ first_group = page->index * 
blocks_per_page / 2; -+ -+ /* read all groups the page covers into the cache */ -+ for (i = 0; i < groups_per_page; i++) { -+ struct ext3_group_desc * desc; -+ -+ if (first_group + i >= EXT3_SB(sb)->s_groups_count) -+ break; -+ -+ err = -EIO; -+ desc = ext3_get_group_desc(sb, first_group + i, NULL); -+ if (desc == NULL) -+ goto out; -+ -+ err = -ENOMEM; -+ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); -+ if (bh[i] == NULL) -+ goto out; -+ -+ if (buffer_uptodate(bh[i])) -+ continue; -+ -+ lock_buffer(bh[i]); -+ if (buffer_uptodate(bh[i])) { -+ unlock_buffer(bh[i]); -+ continue; -+ } -+ -+ get_bh(bh[i]); -+ bh[i]->b_end_io = end_buffer_read_sync; -+ submit_bh(READ, bh[i]); -+ mb_debug("read bitmap for group %u\n", first_group + i); -+ } -+ -+ /* wait for I/O completion */ -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ wait_on_buffer(bh[i]); -+ -+ err = -EIO; -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ if (!buffer_uptodate(bh[i])) -+ goto out; -+ -+ first_block = page->index * blocks_per_page; -+ for (i = 0; i < blocks_per_page; i++) { -+ int group; -+ -+ group = (first_block + i) >> 1; -+ if (group >= EXT3_SB(sb)->s_groups_count) -+ break; -+ -+ data = page_address(page) + (i * blocksize); -+ bitmap = bh[group - first_group]->b_data; -+ -+ if ((first_block + i) & 1) { -+ /* this is block of buddy */ -+ mb_debug("put buddy for group %u in page %lu/%x\n", -+ group, page->index, i * blocksize); -+ memset(data, 0xff, blocksize); -+ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; -+ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, -+ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); -+ ext3_mb_generate_buddy(sb, data, bitmap, group); -+ } else { -+ /* this is block of bitmap */ -+ mb_debug("put bitmap for group %u in page %lu/%x\n", -+ group, page->index, i * blocksize); -+ memcpy(data, bitmap, blocksize); -+ } -+ } -+ SetPageUptodate(page); -+ -+out: -+ if (bh) { -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ brelse(bh[i]); -+ if (bh != 
&bhs) -+ kfree(bh); -+ } -+ return err; -+} -+ -+static int ext3_mb_load_buddy(struct super_block *sb, int group, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct inode *inode = sbi->s_buddy_cache; -+ int blocks_per_page, block, pnum, poff; -+ struct page *page; -+ -+ mb_debug("load group %u\n", group); -+ -+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; -+ -+ e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_info = EXT3_GROUP_INFO(sb, group); -+ e3b->bd_sb = sb; -+ e3b->bd_group = group; -+ e3b->bd_buddy_page = NULL; -+ e3b->bd_bitmap_page = NULL; -+ -+ block = group * 2; -+ pnum = block / blocks_per_page; -+ poff = block % blocks_per_page; -+ -+ /* we could use find_or_create_page(), but it locks page -+ * what we'd like to avoid in fast path ... */ -+ page = find_get_page(inode->i_mapping, pnum); -+ if (page == NULL || !PageUptodate(page)) { -+ if (page) -+ page_cache_release(page); -+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); -+ if (page) { -+ BUG_ON(page->mapping != inode->i_mapping); -+ if (!PageUptodate(page)) -+ ext3_mb_init_cache(page); -+ unlock_page(page); -+ } -+ } -+ if (page == NULL || !PageUptodate(page)) -+ goto err; -+ e3b->bd_bitmap_page = page; -+ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); -+ mark_page_accessed(page); -+ -+ block++; -+ pnum = block / blocks_per_page; -+ poff = block % blocks_per_page; -+ -+ page = find_get_page(inode->i_mapping, pnum); -+ if (page == NULL || !PageUptodate(page)) { -+ if (page) -+ page_cache_release(page); -+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); -+ if (page) { -+ BUG_ON(page->mapping != inode->i_mapping); -+ if (!PageUptodate(page)) -+ ext3_mb_init_cache(page); -+ unlock_page(page); -+ } -+ } -+ if (page == NULL || !PageUptodate(page)) -+ goto err; -+ e3b->bd_buddy_page = page; -+ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); -+ mark_page_accessed(page); -+ -+ J_ASSERT(e3b->bd_bitmap_page 
!= NULL); -+ J_ASSERT(e3b->bd_buddy_page != NULL); -+ -+ return 0; -+ -+err: -+ if (e3b->bd_bitmap_page) -+ page_cache_release(e3b->bd_bitmap_page); -+ if (e3b->bd_buddy_page) -+ page_cache_release(e3b->bd_buddy_page); -+ e3b->bd_buddy = NULL; -+ e3b->bd_bitmap = NULL; -+ return -EIO; -+} -+ -+static void ext3_mb_release_desc(struct ext3_buddy *e3b) -+{ -+ if (e3b->bd_bitmap_page) -+ page_cache_release(e3b->bd_bitmap_page); -+ if (e3b->bd_buddy_page) -+ page_cache_release(e3b->bd_buddy_page); -+} -+ -+ -+static inline void -+ext3_lock_group(struct super_block *sb, int group) -+{ -+ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_GROUP_INFO(sb, group)->bb_state); -+} -+ -+static inline void -+ext3_unlock_group(struct super_block *sb, int group) -+{ -+ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_GROUP_INFO(sb, group)->bb_state); -+} -+ -+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) -+{ -+ int order = 1; -+ void *bb; -+ -+ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); -+ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); -+ -+ bb = EXT3_MB_BUDDY(e3b); -+ while (order <= e3b->bd_blkbits + 1) { -+ block = block >> 1; -+ if (!mb_test_bit(block, bb)) { -+ /* this block is part of buddy of order 'order' */ -+ return order; -+ } -+ bb += 1 << (e3b->bd_blkbits - order); -+ order++; -+ } -+ return 0; -+} -+ -+static inline void mb_clear_bits(void *bm, int cur, int len) -+{ -+ __u32 *addr; -+ -+ len = cur + len; -+ while (cur < len) { -+ if ((cur & 31) == 0 && (len - cur) >= 32) { -+ /* fast path: clear whole word at once */ -+ addr = bm + (cur >> 3); -+ *addr = 0; -+ cur += 32; -+ continue; -+ } -+ mb_clear_bit_atomic(cur, bm); -+ cur++; -+ } -+} -+ -+static inline void mb_set_bits(void *bm, int cur, int len) -+{ -+ __u32 *addr; -+ -+ len = cur + len; -+ while (cur < len) { -+ if ((cur & 31) == 0 && (len - cur) >= 32) { -+ /* fast path: clear whole word at once */ -+ addr = bm + (cur >> 3); -+ *addr = 0xffffffff; -+ cur += 32; -+ 
continue; -+ } -+ mb_set_bit_atomic(cur, bm); -+ cur++; -+ } -+} -+ -+static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) -+{ -+ int block = 0, max = 0, order; -+ void *buddy, *buddy2; -+ -+ mb_check_buddy(e3b); -+ -+ e3b->bd_info->bb_free += count; -+ if (first < e3b->bd_info->bb_first_free) -+ e3b->bd_info->bb_first_free = first; -+ -+ /* let's maintain fragments counter */ -+ if (first != 0) -+ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); -+ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) -+ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); -+ if (block && max) -+ e3b->bd_info->bb_fragments--; -+ else if (!block && !max) -+ e3b->bd_info->bb_fragments++; -+ -+ /* let's maintain buddy itself */ -+ while (count-- > 0) { -+ block = first++; -+ order = 0; -+ -+ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); -+ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); -+ e3b->bd_info->bb_counters[order]++; -+ -+ /* start of the buddy */ -+ buddy = mb_find_buddy(e3b, order, &max); -+ -+ do { -+ block &= ~1UL; -+ if (mb_test_bit(block, buddy) || -+ mb_test_bit(block + 1, buddy)) -+ break; -+ -+ /* both the buddies are free, try to coalesce them */ -+ buddy2 = mb_find_buddy(e3b, order + 1, &max); -+ -+ if (!buddy2) -+ break; -+ -+ if (order > 0) { -+ /* for special purposes, we don't set -+ * free bits in bitmap */ -+ mb_set_bit(block, buddy); -+ mb_set_bit(block + 1, buddy); -+ } -+ e3b->bd_info->bb_counters[order]--; -+ e3b->bd_info->bb_counters[order]--; -+ -+ block = block >> 1; -+ order++; -+ e3b->bd_info->bb_counters[order]++; -+ -+ mb_clear_bit(block, buddy2); -+ buddy = buddy2; -+ } while (1); -+ } -+ mb_check_buddy(e3b); -+ -+ return 0; -+} -+ -+static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, -+ int needed, struct ext3_free_extent *ex) -+{ -+ int next = block, max, ord; -+ void *buddy; -+ -+ J_ASSERT(ex != NULL); -+ -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); -+ J_ASSERT(block < max); -+ 
if (mb_test_bit(block, buddy)) { -+ ex->fe_len = 0; -+ ex->fe_start = 0; -+ ex->fe_group = 0; -+ return 0; -+ } -+ -+ if (likely(order == 0)) { -+ /* find actual order */ -+ order = mb_find_order_for_block(e3b, block); -+ block = block >> order; -+ } -+ -+ ex->fe_len = 1 << order; -+ ex->fe_start = block << order; -+ ex->fe_group = e3b->bd_group; -+ -+ /* calc difference from given start */ -+ next = next - ex->fe_start; -+ ex->fe_len -= next; -+ ex->fe_start += next; -+ -+ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { -+ -+ if (block + 1 >= max) -+ break; -+ -+ next = (block + 1) * (1 << order); -+ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) -+ break; -+ -+ ord = mb_find_order_for_block(e3b, next); -+ -+ order = ord; -+ block = next >> order; -+ ex->fe_len += 1 << order; -+ } -+ -+ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); -+ return ex->fe_len; -+} -+ -+static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) -+{ -+ int ord, mlen = 0, max = 0, cur; -+ int start = ex->fe_start; -+ int len = ex->fe_len; -+ unsigned ret = 0; -+ int len0 = len; -+ void *buddy; -+ -+ mb_check_buddy(e3b); -+ -+ e3b->bd_info->bb_free -= len; -+ if (e3b->bd_info->bb_first_free == start) -+ e3b->bd_info->bb_first_free += len; -+ -+ /* let's maintain fragments counter */ -+ if (start != 0) -+ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); -+ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) -+ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); -+ if (mlen && max) -+ e3b->bd_info->bb_fragments++; -+ else if (!mlen && !max) -+ e3b->bd_info->bb_fragments--; -+ -+ /* let's maintain buddy itself */ -+ while (len) { -+ ord = mb_find_order_for_block(e3b, start); -+ -+ if (((start >> ord) << ord) == start && len >= (1 << ord)) { -+ /* the whole chunk may be allocated at once! 
*/ -+ mlen = 1 << ord; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ J_ASSERT((start >> ord) < max); -+ mb_set_bit(start >> ord, buddy); -+ e3b->bd_info->bb_counters[ord]--; -+ start += mlen; -+ len -= mlen; -+ J_ASSERT(len >= 0); -+ continue; -+ } -+ -+ /* store for history */ -+ if (ret == 0) -+ ret = len | (ord << 16); -+ -+ /* we have to split large buddy */ -+ J_ASSERT(ord > 0); -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_set_bit(start >> ord, buddy); -+ e3b->bd_info->bb_counters[ord]--; -+ -+ ord--; -+ cur = (start >> ord) & ~1U; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_clear_bit(cur, buddy); -+ mb_clear_bit(cur + 1, buddy); -+ e3b->bd_info->bb_counters[ord]++; -+ e3b->bd_info->bb_counters[ord]++; -+ } -+ -+ /* now drop all the bits in bitmap */ -+ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); -+ -+ mb_check_buddy(e3b); -+ -+ return ret; -+} -+ -+/* -+ * Must be called under group lock! -+ */ -+static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ unsigned long ret; -+ -+ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); -+ ret = mb_mark_used(e3b, &ac->ac_b_ex); -+ -+ ac->ac_status = AC_STATUS_FOUND; -+ ac->ac_tail = ret & 0xffff; -+ ac->ac_buddy = ret >> 16; -+ -+ /* hold in-core structures until allocated -+ * blocks are marked non-free in on-disk bitmap */ -+ ac->ac_buddy_page = e3b->bd_buddy_page; -+ page_cache_get(e3b->bd_buddy_page); -+ ac->ac_bitmap_page = e3b->bd_bitmap_page; -+ page_cache_get(e3b->bd_bitmap_page); -+} -+ -+/* -+ * The routine checks whether found extent is good enough. If it is, -+ * then the extent gets marked used and flag is set to the context -+ * to stop scanning. Otherwise, the extent is compared with the -+ * previous found extent and if new one is better, then it's stored -+ * in the context. Later, the best found extent will be used, if -+ * mballoc can't find good enough extent. -+ * -+ * FIXME: real allocation policy is to be designed yet! 
-+ */ -+static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, -+ struct ext3_free_extent *ex, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_free_extent *bex = &ac->ac_b_ex; -+ struct ext3_free_extent *gex = &ac->ac_g_ex; -+ -+ J_ASSERT(ex->fe_len > 0); -+ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); -+ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); -+ -+ ac->ac_found++; -+ -+ /* -+ * The special case - take what you catch first -+ */ -+ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * Let's check whether the chunk is good enough -+ */ -+ if (ex->fe_len == gex->fe_len) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * If this is first found extent, just store it in the context -+ */ -+ if (bex->fe_len == 0) { -+ *bex = *ex; -+ return; -+ } -+ -+ /* -+ * If new found extent is better, store it in the context -+ */ -+ if (bex->fe_len < gex->fe_len) { -+ /* if the request isn't satisfied, any found extent -+ * larger than previous best one is better */ -+ if (ex->fe_len > bex->fe_len) -+ *bex = *ex; -+ } else if (ex->fe_len > gex->fe_len) { -+ /* if the request is satisfied, then we try to find -+ * an extent that still satisfy the request, but is -+ * smaller than previous one */ -+ *bex = *ex; -+ } -+ -+ /* -+ * Let's scan at least few extents and don't pick up a first one -+ */ -+ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) -+ ac->ac_status = AC_STATUS_BREAK; -+ -+ /* -+ * We don't want to scan for a whole year -+ */ -+ if (ac->ac_found > ext3_mb_max_to_scan) -+ ac->ac_status = AC_STATUS_BREAK; -+} -+ -+static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_free_extent ex = ac->ac_b_ex; -+ int group = ex.fe_group, max, err; -+ -+ J_ASSERT(ex.fe_len > 0); -+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); -+ if 
(err) -+ return err; -+ -+ ext3_lock_group(ac->ac_sb, group); -+ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); -+ -+ if (max > 0) { -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ -+ ext3_unlock_group(ac->ac_sb, group); -+ -+ ext3_mb_release_desc(e3b); -+ -+ return 0; -+} -+ -+static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ int group = ac->ac_g_ex.fe_group, max, err; -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ struct ext3_super_block *es = sbi->s_es; -+ struct ext3_free_extent ex; -+ -+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); -+ if (err) -+ return err; -+ -+ ext3_lock_group(ac->ac_sb, group); -+ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, -+ ac->ac_g_ex.fe_len, &ex); -+ -+ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { -+ unsigned long start; -+ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + -+ ex.fe_start + le32_to_cpu(es->s_first_data_block)); -+ if (start % sbi->s_stripe == 0) { -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ } else if (max >= ac->ac_g_ex.fe_len) { -+ J_ASSERT(ex.fe_len > 0); -+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); -+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { -+ /* Sometimes, caller may want to merge even small -+ * number of blocks to an existing extent */ -+ J_ASSERT(ex.fe_len > 0); -+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); -+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ ext3_unlock_group(ac->ac_sb, group); -+ -+ ext3_mb_release_desc(e3b); -+ -+ return 0; -+} -+ -+/* -+ * The routine scans buddy structures (not bitmap!) 
from given order -+ * to max order and tries to find big enough chunk to satisfy the req -+ */ -+static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ struct ext3_group_info *grp = e3b->bd_info; -+ void *buddy; -+ int i, k, max; -+ -+ J_ASSERT(ac->ac_2order > 0); -+ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { -+ if (grp->bb_counters[i] == 0) -+ continue; -+ -+ buddy = mb_find_buddy(e3b, i, &max); -+ if (buddy == NULL) { -+ printk(KERN_ALERT "looking for wrong order?\n"); -+ break; -+ } -+ -+ k = mb_find_next_zero_bit(buddy, max, 0); -+ J_ASSERT(k < max); -+ -+ ac->ac_found++; -+ -+ ac->ac_b_ex.fe_len = 1 << i; -+ ac->ac_b_ex.fe_start = k << i; -+ ac->ac_b_ex.fe_group = e3b->bd_group; -+ -+ ext3_mb_use_best_found(ac, e3b); -+ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); -+ -+ if (unlikely(ext3_mb_stats)) -+ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); -+ -+ break; -+ } -+} -+ -+/* -+ * The routine scans the group and measures all found extents. -+ * In order to optimize scanning, caller must pass number of -+ * free blocks in the group, so the routine can know upper limit. 
-+ */ -+static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ void *bitmap = EXT3_MB_BITMAP(e3b); -+ struct ext3_free_extent ex; -+ int i, free; -+ -+ free = e3b->bd_info->bb_free; -+ J_ASSERT(free > 0); -+ -+ i = e3b->bd_info->bb_first_free; -+ -+ while (free && ac->ac_status == AC_STATUS_CONTINUE) { -+ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); -+ if (i >= sb->s_blocksize * 8) { -+ J_ASSERT(free == 0); -+ break; -+ } -+ -+ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); -+ J_ASSERT(ex.fe_len > 0); -+ J_ASSERT(free >= ex.fe_len); -+ -+ ext3_mb_measure_extent(ac, &ex, e3b); -+ -+ i += ex.fe_len; -+ free -= ex.fe_len; -+ } -+} -+ -+/* -+ * This is a special case for storages like raid5 -+ * we try to find stripe-aligned chunks for stripe-size requests -+ */ -+static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ void *bitmap = EXT3_MB_BITMAP(e3b); -+ struct ext3_free_extent ex; -+ unsigned long i, max; -+ -+ J_ASSERT(sbi->s_stripe != 0); -+ -+ /* find first stripe-aligned block */ -+ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + le32_to_cpu(sbi->s_es->s_first_data_block); -+ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; -+ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) -+ % EXT3_BLOCKS_PER_GROUP(sb); -+ -+ while (i < sb->s_blocksize * 8) { -+ if (!mb_test_bit(i, bitmap)) { -+ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); -+ if (max >= sbi->s_stripe) { -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ break; -+ } -+ } -+ i += sbi->s_stripe; -+ } -+} -+ -+static int ext3_mb_good_group(struct ext3_allocation_context *ac, -+ int group, int cr) -+{ -+ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); -+ unsigned free, fragments, i, bits; -+ -+ 
J_ASSERT(cr >= 0 && cr < 4); -+ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); -+ -+ free = grp->bb_free; -+ fragments = grp->bb_fragments; -+ if (free == 0) -+ return 0; -+ if (fragments == 0) -+ return 0; -+ -+ switch (cr) { -+ case 0: -+ J_ASSERT(ac->ac_2order != 0); -+ bits = ac->ac_sb->s_blocksize_bits + 1; -+ for (i = ac->ac_2order; i <= bits; i++) -+ if (grp->bb_counters[i] > 0) -+ return 1; -+ break; -+ case 1: -+ if ((free / fragments) >= ac->ac_g_ex.fe_len) -+ return 1; -+ break; -+ case 2: -+ if (free >= ac->ac_g_ex.fe_len) -+ return 1; -+ break; -+ case 3: -+ return 1; -+ default: -+ BUG(); -+ } -+ -+ return 0; -+} -+ -+int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, -+ unsigned long goal, int *len, int flags, int *errp) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_allocation_context ac; -+ int i, group, block, cr, err = 0; -+ struct ext3_group_desc *gdp; -+ struct ext3_super_block *es; -+ struct buffer_head *gdp_bh; -+ struct ext3_sb_info *sbi; -+ struct super_block *sb; -+ struct ext3_buddy e3b; -+ -+ J_ASSERT(len != NULL); -+ J_ASSERT(*len > 0); -+ -+ sb = inode->i_sb; -+ if (!sb) { -+ printk("ext3_mb_new_nblocks: nonexistent device"); -+ return 0; -+ } -+ -+ if (!test_opt(sb, MBALLOC)) { -+ static int ext3_mballoc_warning = 0; -+ if (ext3_mballoc_warning == 0) { -+ printk(KERN_ERR "EXT3-fs: multiblock request with " -+ "mballoc disabled!\n"); -+ ext3_mballoc_warning++; -+ } -+ *len = 1; -+ err = ext3_new_block_old(handle, inode, goal, errp); -+ return err; -+ } -+ -+ ext3_mb_poll_new_transaction(sb, handle); -+ -+ sbi = EXT3_SB(sb); -+ es = EXT3_SB(sb)->s_es; -+ -+ /* -+ * We can't allocate > group size -+ */ -+ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) -+ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; -+ -+ if (!(flags & EXT3_MB_HINT_RESERVED)) { -+ /* someone asks for non-reserved blocks */ -+ BUG_ON(*len > 1); -+ err = ext3_mb_reserve_blocks(sb, 1); -+ if (err) { -+ *errp = err; -+ return 0; -+ } -+ } -+ -+ ac.ac_buddy_page = NULL; 
-+ ac.ac_bitmap_page = NULL; -+ -+ /* -+ * Check quota for allocation of this blocks. -+ */ -+ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) -+ *len -= 1; -+ if (*len == 0) { -+ *errp = -EDQUOT; -+ block = 0; -+ goto out; -+ } -+ -+ /* start searching from the goal */ -+ if (goal < le32_to_cpu(es->s_first_data_block) || -+ goal >= le32_to_cpu(es->s_blocks_count)) -+ goal = le32_to_cpu(es->s_first_data_block); -+ group = (goal - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ block = ((goal - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb)); -+ -+ /* set up allocation goals */ -+ ac.ac_b_ex.fe_group = 0; -+ ac.ac_b_ex.fe_start = 0; -+ ac.ac_b_ex.fe_len = 0; -+ ac.ac_status = AC_STATUS_CONTINUE; -+ ac.ac_groups_scanned = 0; -+ ac.ac_ex_scanned = 0; -+ ac.ac_found = 0; -+ ac.ac_sb = inode->i_sb; -+ ac.ac_g_ex.fe_group = group; -+ ac.ac_g_ex.fe_start = block; -+ ac.ac_g_ex.fe_len = *len; -+ ac.ac_flags = flags; -+ ac.ac_2order = 0; -+ ac.ac_criteria = 0; -+ -+ if (*len == 1 && sbi->s_stripe) { -+ /* looks like a metadata, let's use a dirty hack for raid5 -+ * move all metadata in first groups in hope to hit cached -+ * sectors and thus avoid read-modify cycles in raid5 */ -+ ac.ac_g_ex.fe_group = group = 0; -+ } -+ -+ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ -+ i = ffs(*len); -+ if (i >= ext3_mb_order2_reqs) { -+ i--; -+ if ((*len & (~(1 << i))) == 0) -+ ac.ac_2order = i; -+ } -+ -+ /* first, try the goal */ -+ err = ext3_mb_find_by_goal(&ac, &e3b); -+ if (err) -+ goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) -+ goto found; -+ -+ /* Let's just scan groups to find more-less suitable blocks */ -+ cr = ac.ac_2order ? 
0 : 1; -+repeat: -+ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { -+ ac.ac_criteria = cr; -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { -+ if (group == EXT3_SB(sb)->s_groups_count) -+ group = 0; -+ -+ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { -+ /* we need full data about the group -+ * to make a good selection */ -+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); -+ if (err) -+ goto out_err; -+ ext3_mb_release_desc(&e3b); -+ } -+ -+ /* check is group good for our criteries */ -+ if (!ext3_mb_good_group(&ac, group, cr)) -+ continue; -+ -+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); -+ if (err) -+ goto out_err; -+ -+ ext3_lock_group(sb, group); -+ if (!ext3_mb_good_group(&ac, group, cr)) { -+ /* someone did allocation from this group */ -+ ext3_unlock_group(sb, group); -+ ext3_mb_release_desc(&e3b); -+ continue; -+ } -+ -+ ac.ac_groups_scanned++; -+ if (cr == 0) -+ ext3_mb_simple_scan_group(&ac, &e3b); -+ else if (cr == 1 && *len == sbi->s_stripe) -+ ext3_mb_scan_aligned(&ac, &e3b); -+ else -+ ext3_mb_complex_scan_group(&ac, &e3b); -+ -+ ext3_unlock_group(sb, group); -+ -+ ext3_mb_release_desc(&e3b); -+ -+ if (ac.ac_status != AC_STATUS_CONTINUE) -+ break; -+ } -+ } -+ -+ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && -+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { -+ /* -+ * We've been searching too long. Let's try to allocate -+ * the best chunk we've found so far -+ */ -+ -+ /*if (ac.ac_found > ext3_mb_max_to_scan) -+ printk(KERN_DEBUG "EXT3-fs: too long searching at " -+ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, -+ ac.ac_g_ex.fe_len);*/ -+ ext3_mb_try_best_found(&ac, &e3b); -+ if (ac.ac_status != AC_STATUS_FOUND) { -+ /* -+ * Someone more lucky has already allocated it. 
-+ * The only thing we can do is just take first -+ * found block(s) -+ printk(KERN_DEBUG "EXT3-fs: someone won our chunk\n"); -+ */ -+ ac.ac_b_ex.fe_group = 0; -+ ac.ac_b_ex.fe_start = 0; -+ ac.ac_b_ex.fe_len = 0; -+ ac.ac_status = AC_STATUS_CONTINUE; -+ ac.ac_flags |= EXT3_MB_HINT_FIRST; -+ cr = 3; -+ goto repeat; -+ } -+ } -+ -+ if (ac.ac_status != AC_STATUS_FOUND) { -+ /* -+ * We aren't lucky definitely -+ */ -+ DQUOT_FREE_BLOCK(inode, *len); -+ *errp = -ENOSPC; -+ block = 0; -+#if 1 -+ printk(KERN_ERR "EXT3-fs: can't allocate: status %d flags %d\n", -+ ac.ac_status, ac.ac_flags); -+ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d cr %d\n", -+ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, -+ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); -+ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", -+ sbi->s_blocks_reserved, ac.ac_found); -+ printk("EXT3-fs: groups: "); -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); -+ printk("\n"); -+#endif -+ goto out; -+ } -+ -+found: -+ J_ASSERT(ac.ac_b_ex.fe_len > 0); -+ -+ /* good news - free block(s) have been found. 
now it's time -+ * to mark block(s) in good old journaled bitmap */ -+ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + ac.ac_b_ex.fe_start -+ + le32_to_cpu(es->s_first_data_block); -+ -+ /* we made a desicion, now mark found blocks in good old -+ * bitmap to be journaled */ -+ -+ ext3_debug("using block group %d(%d)\n", -+ ac.ac_b_group.group, gdp->bg_free_blocks_count); -+ -+ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); -+ if (!bitmap_bh) { -+ *errp = -EIO; -+ goto out_err; -+ } -+ -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) { -+ *errp = err; -+ goto out_err; -+ } -+ -+ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); -+ if (!gdp) { -+ *errp = -EIO; -+ goto out_err; -+ } -+ -+ err = ext3_journal_get_write_access(handle, gdp_bh); -+ if (err) -+ goto out_err; -+ -+ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + ac.ac_b_ex.fe_start -+ + le32_to_cpu(es->s_first_data_block); -+ -+ if (block == le32_to_cpu(gdp->bg_block_bitmap) || -+ block == le32_to_cpu(gdp->bg_inode_bitmap) || -+ in_range(block, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group)) -+ ext3_error(sb, "ext3_new_block", -+ "Allocating block in system zone - " -+ "block = %u", block); -+#ifdef AGGRESSIVE_CHECK -+ for (i = 0; i < ac.ac_b_ex.fe_len; i++) -+ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); -+#endif -+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); -+ -+ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -+ - ac.ac_b_ex.fe_len); -+ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); -+ -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ if (err) -+ goto out_err; -+ err = ext3_journal_dirty_metadata(handle, gdp_bh); -+ if (err) -+ goto out_err; -+ -+ sb->s_dirt = 1; -+ *errp = 0; -+ 
brelse(bitmap_bh); -+ -+ /* drop non-allocated, but dquote'd blocks */ -+ J_ASSERT(*len >= ac.ac_b_ex.fe_len); -+ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); -+ -+ *len = ac.ac_b_ex.fe_len; -+ J_ASSERT(*len > 0); -+ J_ASSERT(block != 0); -+ goto out; -+ -+out_err: -+ /* if we've already allocated something, roll it back */ -+ if (ac.ac_status == AC_STATUS_FOUND) { -+ /* FIXME: free blocks here */ -+ } -+ -+ DQUOT_FREE_BLOCK(inode, *len); -+ brelse(bitmap_bh); -+ *errp = err; -+ block = 0; -+out: -+ if (ac.ac_buddy_page) -+ page_cache_release(ac.ac_buddy_page); -+ if (ac.ac_bitmap_page) -+ page_cache_release(ac.ac_bitmap_page); -+ -+ if (!(flags & EXT3_MB_HINT_RESERVED)) { -+ /* block wasn't reserved before and we reserved it -+ * at the beginning of allocation. it doesn't matter -+ * whether we allocated anything or we failed: time -+ * to release reservation. NOTE: because I expect -+ * any multiblock request from delayed allocation -+ * path only, here is single block always */ -+ ext3_mb_release_blocks(sb, 1); -+ } -+ -+ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { -+ atomic_inc(&sbi->s_bal_reqs); -+ atomic_add(*len, &sbi->s_bal_allocated); -+ if (*len >= ac.ac_g_ex.fe_len) -+ atomic_inc(&sbi->s_bal_success); -+ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); -+ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && -+ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) -+ atomic_inc(&sbi->s_bal_goals); -+ if (ac.ac_found > ext3_mb_max_to_scan) -+ atomic_inc(&sbi->s_bal_breaks); -+ } -+ -+ ext3_mb_store_history(sb, inode->i_ino, &ac); -+ -+ return block; -+} -+EXPORT_SYMBOL(ext3_mb_new_blocks); -+ -+#ifdef EXT3_MB_HISTORY -+struct ext3_mb_proc_session { -+ struct ext3_mb_history *history; -+ struct super_block *sb; -+ int start; -+ int max; -+}; -+ -+static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, -+ struct ext3_mb_history *hs, -+ int first) -+{ -+ if (hs == s->history + s->max) -+ hs = s->history; -+ if (!first && hs == s->history 
+ s->start) -+ return NULL; -+ while (hs->goal.fe_len == 0) { -+ hs++; -+ if (hs == s->history + s->max) -+ hs = s->history; -+ if (hs == s->history + s->start) -+ return NULL; -+ } -+ return hs; -+} -+ -+static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) -+{ -+ struct ext3_mb_proc_session *s = seq->private; -+ struct ext3_mb_history *hs; -+ int l = *pos; -+ -+ if (l == 0) -+ return SEQ_START_TOKEN; -+ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); -+ if (!hs) -+ return NULL; -+ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); -+ return hs; -+} -+ -+static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) -+{ -+ struct ext3_mb_proc_session *s = seq->private; -+ struct ext3_mb_history *hs = v; -+ -+ ++*pos; -+ if (v == SEQ_START_TOKEN) -+ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); -+ else -+ return ext3_mb_history_skip_empty(s, ++hs, 0); -+} -+ -+static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) -+{ -+ struct ext3_mb_history *hs = v; -+ char buf[20], buf2[20]; -+ -+ if (v == SEQ_START_TOKEN) { -+ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", -+ "pid", "inode", "goal", "result", "found", "grps", "cr", -+ "merge", "tail", "broken"); -+ return 0; -+ } -+ -+ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, -+ hs->goal.fe_start, hs->goal.fe_len); -+ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, -+ hs->result.fe_start, hs->result.fe_len); -+ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", -+ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, -+ hs->cr, hs->merged ? "M" : "", hs->tail, -+ hs->buddy ? 
1 << hs->buddy : 0); -+ return 0; -+} -+ -+static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) -+{ -+} -+ -+static struct seq_operations ext3_mb_seq_history_ops = { -+ .start = ext3_mb_seq_history_start, -+ .next = ext3_mb_seq_history_next, -+ .stop = ext3_mb_seq_history_stop, -+ .show = ext3_mb_seq_history_show, -+}; -+ -+static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) -+{ -+ struct super_block *sb = PDE(inode)->data; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_mb_proc_session *s; -+ int rc, size; -+ -+ s = kmalloc(sizeof(*s), GFP_KERNEL); -+ if (s == NULL) -+ return -EIO; -+ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; -+ s->history = kmalloc(size, GFP_KERNEL); -+ if (s == NULL) { -+ kfree(s); -+ return -EIO; -+ } -+ -+ spin_lock(&sbi->s_mb_history_lock); -+ memcpy(s->history, sbi->s_mb_history, size); -+ s->max = sbi->s_mb_history_max; -+ s->start = sbi->s_mb_history_cur % s->max; -+ spin_unlock(&sbi->s_mb_history_lock); -+ -+ rc = seq_open(file, &ext3_mb_seq_history_ops); -+ if (rc == 0) { -+ struct seq_file *m = (struct seq_file *)file->private_data; -+ m->private = s; -+ } else { -+ kfree(s->history); -+ kfree(s); -+ } -+ return rc; -+ -+} -+ -+static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) -+{ -+ struct seq_file *seq = (struct seq_file *)file->private_data; -+ struct ext3_mb_proc_session *s = seq->private; -+ kfree(s->history); -+ kfree(s); -+ return seq_release(inode, file); -+} -+ -+static struct file_operations ext3_mb_seq_history_fops = { -+ .owner = THIS_MODULE, -+ .open = ext3_mb_seq_history_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = ext3_mb_seq_history_release, -+}; -+ -+static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) -+{ -+ struct super_block *sb = seq->private; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ long group; -+ -+ if (*pos < 0 || *pos >= sbi->s_groups_count) -+ return NULL; -+ -+ group = 
*pos + 1; -+ return (void *) group; -+} -+ -+static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) -+{ -+ struct super_block *sb = seq->private; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ long group; -+ -+ ++*pos; -+ if (*pos < 0 || *pos >= sbi->s_groups_count) -+ return NULL; -+ group = *pos + 1; -+ return (void *) group;; -+} -+ -+static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) -+{ -+ struct super_block *sb = seq->private; -+ long group = (long) v, i; -+ struct sg { -+ struct ext3_group_info info; -+ unsigned short counters[16]; -+ } sg; -+ -+ group--; -+ if (group == 0) -+ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", -+ "group", "free", "frags", "first", "2^0", "2^1", "2^2", -+ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", -+ "2^11", "2^12", "2^13"); -+ -+ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + -+ sizeof(struct ext3_group_info); -+ ext3_lock_group(sb, group); -+ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); -+ ext3_unlock_group(sb, group); -+ -+ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) -+ return 0; -+ -+ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, -+ sg.info.bb_fragments, sg.info.bb_first_free); -+ for (i = 0; i <= 13; i++) -+ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
-+ sg.info.bb_counters[i] : 0); -+ seq_printf(seq, " ]\n"); -+ -+ return 0; -+} -+ -+static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) -+{ -+} -+ -+static struct seq_operations ext3_mb_seq_groups_ops = { -+ .start = ext3_mb_seq_groups_start, -+ .next = ext3_mb_seq_groups_next, -+ .stop = ext3_mb_seq_groups_stop, -+ .show = ext3_mb_seq_groups_show, -+}; -+ -+static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) -+{ -+ struct super_block *sb = PDE(inode)->data; -+ int rc; -+ -+ rc = seq_open(file, &ext3_mb_seq_groups_ops); -+ if (rc == 0) { -+ struct seq_file *m = (struct seq_file *)file->private_data; -+ m->private = sb; -+ } -+ return rc; -+ -+} -+ -+static struct file_operations ext3_mb_seq_groups_fops = { -+ .owner = THIS_MODULE, -+ .open = ext3_mb_seq_groups_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ -+static void ext3_mb_history_release(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ char name[64]; -+ -+ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); -+ remove_proc_entry("mb_groups", sbi->s_mb_proc); -+ remove_proc_entry("mb_history", sbi->s_mb_proc); -+ remove_proc_entry(name, proc_root_ext3); -+ -+ if (sbi->s_mb_history) -+ kfree(sbi->s_mb_history); -+} -+ -+static void ext3_mb_history_init(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ char name[64]; -+ int i; -+ -+ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); -+ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); -+ if (sbi->s_mb_proc != NULL) { -+ struct proc_dir_entry *p; -+ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); -+ if (p) { -+ p->proc_fops = &ext3_mb_seq_history_fops; -+ p->data = sb; -+ } -+ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); -+ if (p) { -+ p->proc_fops = &ext3_mb_seq_groups_fops; -+ p->data = sb; -+ } -+ } -+ -+ sbi->s_mb_history_max = 1000; -+ sbi->s_mb_history_cur = 0; -+ 
spin_lock_init(&sbi->s_mb_history_lock); -+ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); -+ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); -+ memset(sbi->s_mb_history, 0, i); -+ /* if we can't allocate history, then we simple won't use it */ -+} -+ -+static void -+ext3_mb_store_history(struct super_block *sb, unsigned ino, -+ struct ext3_allocation_context *ac) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_mb_history h; -+ -+ if (likely(sbi->s_mb_history == NULL)) -+ return; -+ -+ h.pid = current->pid; -+ h.ino = ino; -+ h.goal = ac->ac_g_ex; -+ h.result = ac->ac_b_ex; -+ h.found = ac->ac_found; -+ h.cr = ac->ac_criteria; -+ h.groups = ac->ac_groups_scanned; -+ h.tail = ac->ac_tail; -+ h.buddy = ac->ac_buddy; -+ h.merged = 0; -+ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && -+ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) -+ h.merged = 1; -+ -+ spin_lock(&sbi->s_mb_history_lock); -+ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); -+ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) -+ sbi->s_mb_history_cur = 0; -+ spin_unlock(&sbi->s_mb_history_lock); -+} -+ -+#else -+#define ext3_mb_history_release(sb) -+#define ext3_mb_history_init(sb) -+#endif -+ -+int ext3_mb_init_backend(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, j, len, metalen; -+ int num_meta_group_infos = -+ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> -+ EXT3_DESC_PER_BLOCK_BITS(sb); -+ struct ext3_group_info **meta_group_info; -+ -+ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte -+ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. -+ * So a two level scheme suffices for now. 
*/ -+ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * -+ num_meta_group_infos, GFP_KERNEL); -+ if (sbi->s_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); -+ return -ENOMEM; -+ } -+ sbi->s_buddy_cache = new_inode(sb); -+ if (sbi->s_buddy_cache == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); -+ goto err_freesgi; -+ } -+ -+ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); -+ for (i = 0; i < num_meta_group_infos; i++) { -+ if ((i + 1) == num_meta_group_infos) -+ metalen = sizeof(*meta_group_info) * -+ (sbi->s_groups_count - -+ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); -+ meta_group_info = kmalloc(metalen, GFP_KERNEL); -+ if (meta_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " -+ "buddy group\n"); -+ goto err_freemeta; -+ } -+ sbi->s_group_info[i] = meta_group_info; -+ } -+ -+ /* -+ * calculate needed size. if change bb_counters size, -+ * don't forget about ext3_mb_generate_buddy() -+ */ -+ len = sizeof(struct ext3_group_info); -+ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ struct ext3_group_desc * desc; -+ -+ meta_group_info = -+ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; -+ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); -+ -+ meta_group_info[j] = kmalloc(len, GFP_KERNEL); -+ if (meta_group_info[j] == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); -+ i--; -+ goto err_freebuddy; -+ } -+ desc = ext3_get_group_desc(sb, i, NULL); -+ if (desc == NULL) { -+ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); -+ goto err_freebuddy; -+ } -+ memset(meta_group_info[j], 0, len); -+ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, -+ &meta_group_info[j]->bb_state); -+ meta_group_info[j]->bb_free = -+ le16_to_cpu(desc->bg_free_blocks_count); -+ } -+ -+ return 0; -+ -+err_freebuddy: -+ while (i >= 0) { -+ kfree(EXT3_GROUP_INFO(sb, i)); -+ i--; -+ } -+ i = num_meta_group_infos; 
-+err_freemeta: -+ while (--i >= 0) -+ kfree(sbi->s_group_info[i]); -+ iput(sbi->s_buddy_cache); -+err_freesgi: -+ kfree(sbi->s_group_info); -+ return -ENOMEM; -+} -+ -+int ext3_mb_init(struct super_block *sb, int needs_recovery) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct inode *root = sb->s_root->d_inode; -+ unsigned i, offset, max; -+ struct dentry *dentry; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); -+ -+ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); -+ if (sbi->s_mb_offsets == NULL) { -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ return -ENOMEM; -+ } -+ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); -+ if (sbi->s_mb_maxs == NULL) { -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ kfree(sbi->s_mb_maxs); -+ return -ENOMEM; -+ } -+ -+ /* order 0 is regular bitmap */ -+ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; -+ sbi->s_mb_offsets[0] = 0; -+ -+ i = 1; -+ offset = 0; -+ max = sb->s_blocksize << 2; -+ do { -+ sbi->s_mb_offsets[i] = offset; -+ sbi->s_mb_maxs[i] = max; -+ offset += 1 << (sb->s_blocksize_bits - i); -+ max = max >> 1; -+ i++; -+ } while (i <= sb->s_blocksize_bits + 1); -+ -+ /* init file for buddy data */ -+ if ((i = ext3_mb_init_backend(sb))) { -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ kfree(sbi->s_mb_offsets); -+ kfree(sbi->s_mb_maxs); -+ return i; -+ } -+ -+ spin_lock_init(&sbi->s_reserve_lock); -+ spin_lock_init(&sbi->s_md_lock); -+ INIT_LIST_HEAD(&sbi->s_active_transaction); -+ INIT_LIST_HEAD(&sbi->s_closed_transaction); -+ INIT_LIST_HEAD(&sbi->s_committed_transaction); -+ spin_lock_init(&sbi->s_bal_lock); -+ -+ /* remove old on-disk buddy file */ -+ down(&root->i_sem); -+ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy")); -+ if (dentry->d_inode != NULL) { -+ i = vfs_unlink(root, dentry); -+ if (i != 0) -+ printk("EXT3-fs: can't remove .buddy file: %d\n", i); -+ } -+ dput(dentry); -+ up(&root->i_sem); -+ -+ ext3_mb_history_init(sb); -+ -+ printk("EXT3-fs: mballoc 
enabled\n"); -+ return 0; -+} -+ -+int ext3_mb_release(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, num_meta_group_infos; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ /* release freed, non-committed blocks */ -+ spin_lock(&sbi->s_md_lock); -+ list_splice_init(&sbi->s_closed_transaction, -+ &sbi->s_committed_transaction); -+ list_splice_init(&sbi->s_active_transaction, -+ &sbi->s_committed_transaction); -+ spin_unlock(&sbi->s_md_lock); -+ ext3_mb_free_committed_blocks(sb); -+ -+ if (sbi->s_group_info) { -+ for (i = 0; i < sbi->s_groups_count; i++) -+ kfree(EXT3_GROUP_INFO(sb, i)); -+ num_meta_group_infos = (sbi->s_groups_count + -+ EXT3_DESC_PER_BLOCK(sb) - 1) >> -+ EXT3_DESC_PER_BLOCK_BITS(sb); -+ for (i = 0; i < num_meta_group_infos; i++) -+ kfree(sbi->s_group_info[i]); -+ kfree(sbi->s_group_info); -+ } -+ if (sbi->s_mb_offsets) -+ kfree(sbi->s_mb_offsets); -+ if (sbi->s_mb_maxs) -+ kfree(sbi->s_mb_maxs); -+ if (sbi->s_buddy_cache) -+ iput(sbi->s_buddy_cache); -+ if (sbi->s_blocks_reserved) -+ printk("ext3-fs: %ld blocks being reserved at umount!\n", -+ sbi->s_blocks_reserved); -+ if (ext3_mb_stats) { -+ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", -+ atomic_read(&sbi->s_bal_allocated), -+ atomic_read(&sbi->s_bal_reqs), -+ atomic_read(&sbi->s_bal_success)); -+ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " -+ "%u 2^N hits, %u breaks\n", -+ atomic_read(&sbi->s_bal_ex_scanned), -+ atomic_read(&sbi->s_bal_goals), -+ atomic_read(&sbi->s_bal_2orders), -+ atomic_read(&sbi->s_bal_breaks)); -+ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", -+ sbi->s_mb_buddies_generated++, -+ sbi->s_mb_generation_time); -+ } -+ -+ ext3_mb_history_release(sb); -+ -+ return 0; -+} -+ -+void ext3_mb_free_committed_blocks(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int err, i, count = 0, count2 = 0; -+ struct ext3_free_metadata *md; -+ struct ext3_buddy e3b; -+ -+ if 
(list_empty(&sbi->s_committed_transaction)) -+ return; -+ -+ /* there is committed blocks to be freed yet */ -+ do { -+ /* get next array of blocks */ -+ md = NULL; -+ spin_lock(&sbi->s_md_lock); -+ if (!list_empty(&sbi->s_committed_transaction)) { -+ md = list_entry(sbi->s_committed_transaction.next, -+ struct ext3_free_metadata, list); -+ list_del(&md->list); -+ } -+ spin_unlock(&sbi->s_md_lock); -+ -+ if (md == NULL) -+ break; -+ -+ mb_debug("gonna free %u blocks in group %u (0x%p):", -+ md->num, md->group, md); -+ -+ err = ext3_mb_load_buddy(sb, md->group, &e3b); -+ /* we expect to find existing buddy because it's pinned */ -+ BUG_ON(err != 0); -+ -+ /* there are blocks to put in buddy to make them really free */ -+ count += md->num; -+ count2++; -+ ext3_lock_group(sb, md->group); -+ for (i = 0; i < md->num; i++) { -+ mb_debug(" %u", md->blocks[i]); -+ mb_free_blocks(&e3b, md->blocks[i], 1); -+ } -+ mb_debug("\n"); -+ ext3_unlock_group(sb, md->group); -+ -+ /* balance refcounts from ext3_mb_free_metadata() */ -+ page_cache_release(e3b.bd_buddy_page); -+ page_cache_release(e3b.bd_bitmap_page); -+ -+ kfree(md); -+ ext3_mb_release_desc(&e3b); -+ -+ } while (md); -+ mb_debug("freed %u blocks in %u structures\n", count, count2); -+} -+ -+void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ if (sbi->s_last_transaction == handle->h_transaction->t_tid) -+ return; -+ -+ /* new transaction! time to close last one and free blocks for -+ * committed transaction. we know that only transaction can be -+ * active, so previos transaction can be being logged and we -+ * know that transaction before previous is known to be already -+ * logged. this means that now we may free blocks freed in all -+ * transactions before previous one. hope I'm clear enough ... 
*/ -+ -+ spin_lock(&sbi->s_md_lock); -+ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { -+ mb_debug("new transaction %lu, old %lu\n", -+ (unsigned long) handle->h_transaction->t_tid, -+ (unsigned long) sbi->s_last_transaction); -+ list_splice_init(&sbi->s_closed_transaction, -+ &sbi->s_committed_transaction); -+ list_splice_init(&sbi->s_active_transaction, -+ &sbi->s_closed_transaction); -+ sbi->s_last_transaction = handle->h_transaction->t_tid; -+ } -+ spin_unlock(&sbi->s_md_lock); -+ -+ ext3_mb_free_committed_blocks(sb); -+} -+ -+int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, -+ int group, int block, int count) -+{ -+ struct ext3_group_info *db = e3b->bd_info; -+ struct super_block *sb = e3b->bd_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_free_metadata *md; -+ int i; -+ -+ J_ASSERT(e3b->bd_bitmap_page != NULL); -+ J_ASSERT(e3b->bd_buddy_page != NULL); -+ -+ ext3_lock_group(sb, group); -+ for (i = 0; i < count; i++) { -+ md = db->bb_md_cur; -+ if (md && db->bb_tid != handle->h_transaction->t_tid) { -+ db->bb_md_cur = NULL; -+ md = NULL; -+ } -+ -+ if (md == NULL) { -+ ext3_unlock_group(sb, group); -+ md = kmalloc(sizeof(*md), GFP_KERNEL); -+ if (md == NULL) -+ return -ENOMEM; -+ md->num = 0; -+ md->group = group; -+ -+ ext3_lock_group(sb, group); -+ if (db->bb_md_cur == NULL) { -+ spin_lock(&sbi->s_md_lock); -+ list_add(&md->list, &sbi->s_active_transaction); -+ spin_unlock(&sbi->s_md_lock); -+ /* protect buddy cache from being freed, -+ * otherwise we'll refresh it from -+ * on-disk bitmap and lose not-yet-available -+ * blocks */ -+ page_cache_get(e3b->bd_buddy_page); -+ page_cache_get(e3b->bd_bitmap_page); -+ db->bb_md_cur = md; -+ db->bb_tid = handle->h_transaction->t_tid; -+ mb_debug("new md 0x%p for group %u\n", -+ md, md->group); -+ } else { -+ kfree(md); -+ md = db->bb_md_cur; -+ } -+ } -+ -+ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); -+ md->blocks[md->num] = block + i; -+ md->num++; -+ if (md->num == 
EXT3_BB_MAX_BLOCKS) { -+ /* no more space, put full container on a sb's list */ -+ db->bb_md_cur = NULL; -+ } -+ } -+ ext3_unlock_group(sb, group); -+ return 0; -+} -+ -+void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, -+ unsigned long block, unsigned long count, -+ int metadata, int *freed) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_group_desc *gdp; -+ struct ext3_super_block *es; -+ unsigned long bit, overflow; -+ struct buffer_head *gd_bh; -+ unsigned long block_group; -+ struct ext3_sb_info *sbi; -+ struct super_block *sb; -+ struct ext3_buddy e3b; -+ int err = 0, ret; -+ -+ *freed = 0; -+ sb = inode->i_sb; -+ if (!sb) { -+ printk ("ext3_free_blocks: nonexistent device"); -+ return; -+ } -+ -+ ext3_mb_poll_new_transaction(sb, handle); -+ -+ sbi = EXT3_SB(sb); -+ es = EXT3_SB(sb)->s_es; -+ if (block < le32_to_cpu(es->s_first_data_block) || -+ block + count < block || -+ block + count > le32_to_cpu(es->s_blocks_count)) { -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks not in datazone - " -+ "block = %lu, count = %lu", block, count); -+ goto error_return; -+ } -+ -+ ext3_debug("freeing block %lu\n", block); -+ -+do_more: -+ overflow = 0; -+ block_group = (block - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ bit = (block - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb); -+ /* -+ * Check to see if we are freeing blocks across a group -+ * boundary. 
-+ */ -+ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { -+ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); -+ count -= overflow; -+ } -+ brelse(bitmap_bh); -+ bitmap_bh = read_block_bitmap(sb, block_group); -+ if (!bitmap_bh) -+ goto error_return; -+ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); -+ if (!gdp) -+ goto error_return; -+ -+ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || -+ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || -+ in_range (block, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group) || -+ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group)) -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks in system zones - " -+ "Block = %lu, count = %lu", -+ block, count); -+ -+ BUFFER_TRACE(bitmap_bh, "getting write access"); -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) -+ goto error_return; -+ -+ /* -+ * We are about to modify some metadata. Call the journal APIs -+ * to unshare ->b_data if a currently-committing transaction is -+ * using it -+ */ -+ BUFFER_TRACE(gd_bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, gd_bh); -+ if (err) -+ goto error_return; -+ -+ err = ext3_mb_load_buddy(sb, block_group, &e3b); -+ if (err) -+ goto error_return; -+ -+#ifdef AGGRESSIVE_CHECK -+ { -+ int i; -+ for (i = 0; i < count; i++) -+ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); -+ } -+#endif -+ mb_clear_bits(bitmap_bh->b_data, bit, count); -+ -+ /* We dirtied the bitmap block */ -+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ -+ if (metadata) { -+ /* blocks being freed are metadata. 
these blocks shouldn't -+ * be used until this transaction is committed */ -+ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); -+ } else { -+ ext3_lock_group(sb, block_group); -+ mb_free_blocks(&e3b, bit, count); -+ ext3_unlock_group(sb, block_group); -+ } -+ -+ spin_lock(sb_bgl_lock(sbi, block_group)); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); -+ spin_unlock(sb_bgl_lock(sbi, block_group)); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, count); -+ -+ ext3_mb_release_desc(&e3b); -+ -+ *freed = count; -+ -+ /* And the group descriptor block */ -+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); -+ ret = ext3_journal_dirty_metadata(handle, gd_bh); -+ if (!err) err = ret; -+ -+ if (overflow && !err) { -+ block += count; -+ count = overflow; -+ goto do_more; -+ } -+ sb->s_dirt = 1; -+error_return: -+ brelse(bitmap_bh); -+ ext3_std_error(sb, err); -+ return; -+} -+ -+int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int free, ret = -ENOSPC; -+ -+ BUG_ON(blocks < 0); -+ spin_lock(&sbi->s_reserve_lock); -+ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); -+ if (blocks <= free - sbi->s_blocks_reserved) { -+ sbi->s_blocks_reserved += blocks; -+ ret = 0; -+ } -+ spin_unlock(&sbi->s_reserve_lock); -+ return ret; -+} -+ -+void ext3_mb_release_blocks(struct super_block *sb, int blocks) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ BUG_ON(blocks < 0); -+ spin_lock(&sbi->s_reserve_lock); -+ sbi->s_blocks_reserved -= blocks; -+ WARN_ON(sbi->s_blocks_reserved < 0); -+ if (sbi->s_blocks_reserved < 0) -+ sbi->s_blocks_reserved = 0; -+ spin_unlock(&sbi->s_reserve_lock); -+} -+ -+int ext3_new_block(handle_t *handle, struct inode *inode, -+ unsigned long goal, int *errp) -+{ -+ int ret, len; -+ -+ if (!test_opt(inode->i_sb, MBALLOC)) { -+ ret = ext3_new_block_old(handle, inode, goal, errp); -+ goto out; -+ } -+ len = 1; -+ ret = 
ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); -+out: -+ return ret; -+} -+ -+ -+void ext3_free_blocks(handle_t *handle, struct inode * inode, -+ unsigned long block, unsigned long count, int metadata) -+{ -+ struct super_block *sb; -+ int freed; -+ -+ sb = inode->i_sb; -+ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) -+ ext3_free_blocks_old(handle, inode, block, count); -+ else { -+ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); -+ if (freed) -+ DQUOT_FREE_BLOCK(inode, freed); -+ } -+ return; -+} -+ -+#define EXT3_ROOT "ext3" -+#define EXT3_MB_STATS_NAME "mb_stats" -+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" -+#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" -+#define EXT3_MB_ORDER2_REQ "mb_order2_req" -+ -+static int ext3_mb_stats_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_stats); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_stats_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_STATS_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); -+ return count; -+} -+ -+static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ 
printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ ext3_mb_max_to_scan = value; -+ -+ return count; -+} -+ -+static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ ext3_mb_min_to_scan = value; -+ -+ return count; -+} -+ -+static int ext3_mb_order2_req_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_order2_req_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ 
-+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ ext3_mb_order2_reqs = value; -+ -+ return count; -+} -+ -+int __init init_ext3_proc(void) -+{ -+ struct proc_dir_entry *proc_ext3_mb_stats; -+ struct proc_dir_entry *proc_ext3_mb_max_to_scan; -+ struct proc_dir_entry *proc_ext3_mb_min_to_scan; -+ struct proc_dir_entry *proc_ext3_mb_order2_req; -+ -+ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); -+ if (proc_root_ext3 == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); -+ return -EIO; -+ } -+ -+ /* Initialize EXT3_MB_STATS_NAME */ -+ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_stats == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_STATS_NAME); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_stats->data = NULL; -+ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; -+ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; -+ -+ /* Initialize EXT3_MAX_TO_SCAN_NAME */ -+ proc_ext3_mb_max_to_scan = create_proc_entry( -+ EXT3_MB_MAX_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_max_to_scan == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_MAX_TO_SCAN_NAME); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_max_to_scan->data = NULL; -+ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; -+ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; -+ -+ /* Initialize EXT3_MIN_TO_SCAN_NAME */ -+ proc_ext3_mb_min_to_scan = create_proc_entry( -+ EXT3_MB_MIN_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_min_to_scan == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_MIN_TO_SCAN_NAME); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, 
proc_root_ext3); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_min_to_scan->data = NULL; -+ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; -+ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; -+ -+ /* Initialize EXT3_ORDER2_REQ */ -+ proc_ext3_mb_order2_req = create_proc_entry( -+ EXT3_MB_ORDER2_REQ, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_order2_req == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_ORDER2_REQ); -+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_order2_req->data = NULL; -+ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; -+ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; -+ -+ return 0; -+} -+ -+void exit_ext3_proc(void) -+{ -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+} -Index: linux-2.6.5-7.282-full/fs/ext3/Makefile -=================================================================== ---- linux-2.6.5-7.282-full.orig/fs/ext3/Makefile 2006-10-24 22:18:28.000000000 +0400 -+++ linux-2.6.5-7.282-full/fs/ext3/Makefile 2006-10-24 22:18:28.000000000 +0400 -@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o \ -- extents.o -+ extents.o mballoc.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff 
--git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch deleted file mode 100644 index 135262a..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch +++ /dev/null @@ -1,3102 +0,0 @@ -Index: linux-2.6.12.6-bull/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.12.6-bull.orig/include/linux/ext3_fs.h 2006-04-29 20:39:09.000000000 +0400 -+++ linux-2.6.12.6-bull/include/linux/ext3_fs.h 2006-04-29 20:39:10.000000000 +0400 -@@ -57,6 +57,14 @@ struct statfs; - #define ext3_debug(f, a...) do {} while (0) - #endif - -+#define EXT3_MULTIBLOCK_ALLOCATOR 1 -+ -+#define EXT3_MB_HINT_MERGE 1 -+#define EXT3_MB_HINT_RESERVED 2 -+#define EXT3_MB_HINT_METADATA 4 -+#define EXT3_MB_HINT_FIRST 8 -+#define EXT3_MB_HINT_BEST 16 -+ - /* - * Special inodes numbers - */ -@@ -366,6 +374,7 @@ struct ext3_inode { - #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -387,6 +396,14 @@ struct ext3_inode { - #define ext3_find_first_zero_bit ext2_find_first_zero_bit - #define ext3_find_next_zero_bit ext2_find_next_zero_bit - -+#ifndef ext2_find_next_le_bit -+#ifdef __LITTLE_ENDIAN -+#define ext2_find_next_le_bit(addr, size, off) find_next_bit((addr), (size), (off)) -+#else -+#error "mballoc needs a patch for big-endian systems - CFS bug 10634" -+#endif /* __LITTLE_ENDIAN */ -+#endif /* !ext2_find_next_le_bit */ -+ - /* - * Maximal mount counts between two filesystem checks - */ -@@ -727,7 +736,8 @@ extern int ext3_bg_has_super(struct supe - extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); - extern int ext3_new_block (handle_t 
*, struct inode *, unsigned long, int *); - extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, -- unsigned long); -+ unsigned long, int); -+extern int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); - extern void ext3_free_blocks_sb (handle_t *, struct super_block *, - unsigned long, unsigned long, int *); - extern unsigned long ext3_count_free_blocks (struct super_block *); -@@ -848,6 +857,17 @@ extern void ext3_extents_initialize_bloc - extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg); - -+/* mballoc.c */ -+extern long ext3_mb_stats; -+extern long ext3_mb_max_to_scan; -+extern int ext3_mb_init(struct super_block *, int); -+extern int ext3_mb_release(struct super_block *); -+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); -+extern int ext3_mb_reserve_blocks(struct super_block *, int); -+extern void ext3_mb_release_blocks(struct super_block *, int); -+int __init init_ext3_proc(void); -+void exit_ext3_proc(void); -+ - #endif /* __KERNEL__ */ - - /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ -Index: linux-2.6.12.6-bull/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.6.12.6-bull.orig/include/linux/ext3_fs_sb.h 2005-08-29 20:55:27.000000000 +0400 -+++ linux-2.6.12.6-bull/include/linux/ext3_fs_sb.h 2006-04-29 20:39:10.000000000 +0400 -@@ -21,8 +21,14 @@ - #include - #include - #include -+#include - #endif - #include -+#include -+ -+struct ext3_buddy_group_blocks; -+struct ext3_mb_history; -+#define EXT3_BB_MAX_BLOCKS - - /* - * third extended-fs super-block data in memory -@@ -78,6 +84,43 @@ struct ext3_sb_info { - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ - #endif -+ -+ /* for buddy allocator */ -+ struct ext3_group_info ***s_group_info; -+ struct inode *s_buddy_cache; -+ long s_blocks_reserved; -+ spinlock_t s_reserve_lock; -+ struct list_head s_active_transaction; -+ struct list_head s_closed_transaction; -+ struct list_head s_committed_transaction; -+ spinlock_t s_md_lock; -+ tid_t s_last_transaction; -+ int s_mb_factor; -+ unsigned short *s_mb_offsets, *s_mb_maxs; -+ unsigned long s_stripe; -+ -+ /* history to debug policy */ -+ struct ext3_mb_history *s_mb_history; -+ int s_mb_history_cur; -+ int s_mb_history_max; -+ struct proc_dir_entry *s_mb_proc; -+ spinlock_t s_mb_history_lock; -+ -+ /* stats for buddy allocator */ -+ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ -+ atomic_t s_bal_success; /* we found long enough chunks */ -+ atomic_t s_bal_allocated; /* in blocks */ -+ atomic_t s_bal_ex_scanned; /* total extents scanned */ -+ atomic_t s_bal_goals; /* goal hits */ -+ atomic_t s_bal_breaks; /* too long searches */ -+ atomic_t s_bal_2orders; /* 2^order hits */ -+ spinlock_t s_bal_lock; -+ unsigned long s_mb_buddies_generated; -+ unsigned long long s_mb_generation_time; - }; -+ -+#define EXT3_GROUP_INFO(sb, group) \ -+ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ -+ 
[(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] - - #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.6.12.6-bull/fs/ext3/super.c -=================================================================== ---- linux-2.6.12.6-bull.orig/fs/ext3/super.c 2006-04-29 20:39:09.000000000 +0400 -+++ linux-2.6.12.6-bull/fs/ext3/super.c 2006-04-29 20:39:10.000000000 +0400 -@@ -387,6 +387,7 @@ static void ext3_put_super (struct super - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_mb_release(sb); - ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); -@@ -597,6 +598,7 @@ enum { - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, - Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, - Opt_extents, Opt_noextents, Opt_extdebug, -+ Opt_mballoc, Opt_nomballoc, Opt_stripe, - }; - - static match_table_t tokens = { -@@ -650,6 +651,9 @@ static match_table_t tokens = { - {Opt_extents, "extents"}, - {Opt_noextents, "noextents"}, - {Opt_extdebug, "extdebug"}, -+ {Opt_mballoc, "mballoc"}, -+ {Opt_nomballoc, "nomballoc"}, -+ {Opt_stripe, "stripe=%u"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL}, - {Opt_resize, "resize"}, -@@ -965,6 +967,19 @@ clear_qf_name: - case Opt_extdebug: - set_opt (sbi->s_mount_opt, EXTDEBUG); - break; -+ case Opt_mballoc: -+ set_opt(sbi->s_mount_opt, MBALLOC); -+ break; -+ case Opt_nomballoc: -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ break; -+ case Opt_stripe: -+ if (match_int(&args[0], &option)) -+ return 0; -+ if (option < 0) -+ return 0; -+ sbi->s_stripe = option; -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1670,6 +1675,7 @@ static int ext3_fill_super (struct super - ext3_count_dirs(sb)); - - ext3_ext_init(sb); -+ ext3_mb_init(sb, needs_recovery); - lock_kernel(); - return 0; - -@@ -2549,7 +2555,13 @@ static struct file_system_type ext3_fs_t - - static int __init init_ext3_fs(void) - { -- int err = init_ext3_xattr(); -+ int err; -+ -+ err = init_ext3_proc(); -+ if (err) -+ return err; -+ -+ 
err = init_ext3_xattr(); - if (err) - return err; - err = init_inodecache(); -@@ -2571,6 +2583,7 @@ static void __exit exit_ext3_fs(void) - unregister_filesystem(&ext3_fs_type); - destroy_inodecache(); - exit_ext3_xattr(); -+ exit_ext3_proc(); - } - - int ext3_prep_san_write(struct inode *inode, long *blocks, -Index: linux-2.6.12.6-bull/fs/ext3/extents.c -=================================================================== ---- linux-2.6.12.6-bull.orig/fs/ext3/extents.c 2006-04-29 20:39:09.000000000 +0400 -+++ linux-2.6.12.6-bull/fs/ext3/extents.c 2006-04-29 20:39:10.000000000 +0400 -@@ -777,7 +777,7 @@ cleanup: - for (i = 0; i < depth; i++) { - if (!ablocks[i]) - continue; -- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); -+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); - } - } - kfree(ablocks); -@@ -1434,7 +1434,7 @@ int ext3_ext_rm_idx(handle_t *handle, st - path->p_idx->ei_leaf); - bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); - ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); -- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); -+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); - return err; - } - -@@ -1919,10 +1919,12 @@ ext3_remove_blocks(struct ext3_extents_t - int needed = ext3_remove_blocks_credits(tree, ex, from, to); - handle_t *handle = ext3_journal_start(tree->inode, needed); - struct buffer_head *bh; -- int i; -+ int i, metadata = 0; - - if (IS_ERR(handle)) - return PTR_ERR(handle); -+ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) -+ metadata = 1; - if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { - /* tail removal */ - unsigned long num, start; -@@ -1934,7 +1936,7 @@ ext3_remove_blocks(struct ext3_extents_t - bh = sb_find_get_block(tree->inode->i_sb, start + i); - ext3_forget(handle, 0, tree->inode, bh, start + i); - } -- ext3_free_blocks(handle, tree->inode, start, num); -+ ext3_free_blocks(handle, tree->inode, start, num, 
metadata); - } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { - printk("strange request: removal %lu-%lu from %u:%u\n", - from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.12.6-bull/fs/ext3/inode.c -=================================================================== ---- linux-2.6.12.6-bull.orig/fs/ext3/inode.c 2006-04-29 20:39:09.000000000 +0400 -+++ linux-2.6.12.6-bull/fs/ext3/inode.c 2006-04-29 20:39:10.000000000 +0400 -@@ -564,7 +564,7 @@ static int ext3_alloc_branch(handle_t *h - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < keys; i++) -- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); - return err; - } - -@@ -1850,7 +1850,7 @@ ext3_clear_blocks(handle_t *handle, stru - } - } - -- ext3_free_blocks(handle, inode, block_to_free, count); -+ ext3_free_blocks(handle, inode, block_to_free, count, 1); - } - - /** -@@ -2023,7 +2023,7 @@ static void ext3_free_branches(handle_t - ext3_journal_test_restart(handle, inode); - } - -- ext3_free_blocks(handle, inode, nr, 1); -+ ext3_free_blocks(handle, inode, nr, 1, 1); - - if (parent_bh) { - /* -Index: linux-2.6.12.6-bull/fs/ext3/balloc.c -=================================================================== ---- linux-2.6.12.6-bull.orig/fs/ext3/balloc.c 2005-08-29 20:55:27.000000000 +0400 -+++ linux-2.6.12.6-bull/fs/ext3/balloc.c 2006-04-29 20:39:10.000000000 +0400 -@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ - * - * Return buffer_head on success or NULL in case of failure. 
- */ --static struct buffer_head * -+struct buffer_head * - read_block_bitmap(struct super_block *sb, unsigned int block_group) - { - struct ext3_group_desc * desc; -@@ -490,24 +490,6 @@ error_return: - return; - } - --/* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks(handle_t *handle, struct inode *inode, -- unsigned long block, unsigned long count) --{ -- struct super_block * sb; -- int dquot_freed_blocks; -- -- sb = inode->i_sb; -- if (!sb) { -- printk ("ext3_free_blocks: nonexistent device"); -- return; -- } -- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); -- if (dquot_freed_blocks) -- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); -- return; --} -- - /* - * For ext3 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This -@@ -1162,7 +1144,7 @@ int ext3_should_retry_alloc(struct super - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. 
- */ --int ext3_new_block(handle_t *handle, struct inode *inode, -+int ext3_new_block_old(handle_t *handle, struct inode *inode, - unsigned long goal, int *errp) - { - struct buffer_head *bitmap_bh = NULL; -Index: linux-2.6.12.6-bull/fs/ext3/xattr.c -=================================================================== ---- linux-2.6.12.6-bull.orig/fs/ext3/xattr.c 2005-08-29 20:55:27.000000000 +0400 -+++ linux-2.6.12.6-bull/fs/ext3/xattr.c 2006-04-29 20:39:10.000000000 +0400 -@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl - ea_bdebug(bh, "refcount now=0; freeing"); - if (ce) - mb_cache_entry_free(ce); -- ext3_free_blocks(handle, inode, bh->b_blocknr, 1); -+ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); - get_bh(bh); - ext3_forget(handle, 1, inode, bh, bh->b_blocknr); - } else { -@@ -804,7 +804,7 @@ inserted: - new_bh = sb_getblk(sb, block); - if (!new_bh) { - getblk_failed: -- ext3_free_blocks(handle, inode, block, 1); -+ ext3_free_blocks(handle, inode, block, 1, 1); - error = -EIO; - goto cleanup; - } -Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c -=================================================================== ---- linux-2.6.12.6-bull.orig/fs/ext3/mballoc.c 2006-04-22 17:31:47.543334750 +0400 -+++ linux-2.6.12.6-bull/fs/ext3/mballoc.c 2006-04-30 01:24:11.000000000 +0400 -@@ -0,0 +1,2725 @@ -+/* -+ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+ -+/* -+ * mballoc.c contains the multiblocks allocation routines -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * TODO: -+ * - bitmap read-ahead (proposed by Oleg Drokin aka green) -+ * - track min/max extents in each group for better group selection -+ * - mb_mark_used() may allocate chunk right after splitting buddy -+ * - special flag to advice allocator to look for requested + N blocks -+ * this may improve interaction between extents and mballoc -+ * - tree of groups sorted by number of free blocks -+ * - percpu reservation code (hotpath) -+ * - error handling -+ */ -+ -+/* -+ * with AGRESSIVE_CHECK allocator runs consistency checks over -+ * structures. these checks slow things down a lot -+ */ -+#define AGGRESSIVE_CHECK__ -+ -+/* -+ */ -+#define MB_DEBUG__ -+#ifdef MB_DEBUG -+#define mb_debug(fmt,a...) printk(fmt, ##a) -+#else -+#define mb_debug(fmt,a...) -+#endif -+ -+/* -+ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory -+ * and you can monitor it in /proc/fs/ext3//mb_history -+ */ -+#define EXT3_MB_HISTORY -+ -+/* -+ * How long mballoc can look for a best extent (in found extents) -+ */ -+long ext3_mb_max_to_scan = 500; -+ -+/* -+ * How long mballoc must look for a best extent -+ */ -+long ext3_mb_min_to_scan = 30; -+ -+/* -+ * with 'ext3_mb_stats' allocator will collect stats that will be -+ * shown at umount. The collecting costs though! 
-+ */ -+ -+long ext3_mb_stats = 1; -+ -+/* -+ * for which requests use 2^N search using buddies -+ */ -+long ext3_mb_order2_reqs = 8; -+ -+#ifdef EXT3_BB_MAX_BLOCKS -+#undef EXT3_BB_MAX_BLOCKS -+#endif -+#define EXT3_BB_MAX_BLOCKS 30 -+ -+struct ext3_free_metadata { -+ unsigned short group; -+ unsigned short num; -+ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; -+ struct list_head list; -+}; -+ -+struct ext3_group_info { -+ unsigned long bb_state; -+ unsigned long bb_tid; -+ struct ext3_free_metadata *bb_md_cur; -+ unsigned short bb_first_free; -+ unsigned short bb_free; -+ unsigned short bb_fragments; -+ unsigned short bb_counters[]; -+}; -+ -+ -+#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 -+#define EXT3_GROUP_INFO_LOCKED_BIT 1 -+ -+#define EXT3_MB_GRP_NEED_INIT(grp) \ -+ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) -+ -+struct ext3_free_extent { -+ __u16 fe_start; -+ __u16 fe_len; -+ __u16 fe_group; -+}; -+ -+struct ext3_allocation_context { -+ struct super_block *ac_sb; -+ -+ /* search goals */ -+ struct ext3_free_extent ac_g_ex; -+ -+ /* the best found extent */ -+ struct ext3_free_extent ac_b_ex; -+ -+ /* number of iterations done. 
we have to track to limit searching */ -+ unsigned long ac_ex_scanned; -+ __u16 ac_groups_scanned; -+ __u16 ac_found; -+ __u16 ac_tail; -+ __u16 ac_buddy; -+ __u8 ac_status; -+ __u8 ac_flags; /* allocation hints */ -+ __u8 ac_criteria; -+ __u8 ac_repeats; -+ __u8 ac_2order; /* if request is to allocate 2^N blocks and -+ * N > 0, the field stores N, otherwise 0 */ -+ -+ struct page *ac_buddy_page; -+ struct page *ac_bitmap_page; -+}; -+ -+#define AC_STATUS_CONTINUE 1 -+#define AC_STATUS_FOUND 2 -+#define AC_STATUS_BREAK 3 -+ -+struct ext3_mb_history { -+ struct ext3_free_extent goal; /* goal allocation */ -+ struct ext3_free_extent result; /* result allocation */ -+ unsigned pid; -+ unsigned ino; -+ __u16 found; /* how many extents have been found */ -+ __u16 groups; /* how many groups have been scanned */ -+ __u16 tail; /* what tail broke some buddy */ -+ __u16 buddy; /* buddy the tail ^^^ broke */ -+ __u8 cr; /* which phase the result extent was found at */ -+ __u8 merged; -+}; -+ -+struct ext3_buddy { -+ struct page *bd_buddy_page; -+ void *bd_buddy; -+ struct page *bd_bitmap_page; -+ void *bd_bitmap; -+ struct ext3_group_info *bd_info; -+ struct super_block *bd_sb; -+ __u16 bd_blkbits; -+ __u16 bd_group; -+}; -+#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) -+#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) -+ -+#ifndef EXT3_MB_HISTORY -+#define ext3_mb_store_history(sb,ino,ac) -+#else -+static void ext3_mb_store_history(struct super_block *, unsigned ino, -+ struct ext3_allocation_context *ac); -+#endif -+ -+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) -+ -+static struct proc_dir_entry *proc_root_ext3; -+ -+struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); -+void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); -+void ext3_mb_free_committed_blocks(struct super_block *); -+ -+#if BITS_PER_LONG == 64 -+#define mb_correct_addr_and_bit(bit,addr) \ -+{ \ -+ bit += ((unsigned long) addr & 7UL) << 
3; \ -+ addr = (void *) ((unsigned long) addr & ~7UL); \ -+} -+#elif BITS_PER_LONG == 32 -+#define mb_correct_addr_and_bit(bit,addr) \ -+{ \ -+ bit += ((unsigned long) addr & 3UL) << 3; \ -+ addr = (void *) ((unsigned long) addr & ~3UL); \ -+} -+#else -+#error "how many bits you are?!" -+#endif -+ -+static inline int mb_test_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ return ext2_test_bit(bit, addr); -+} -+ -+static inline void mb_set_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit(bit, addr); -+} -+ -+static inline void mb_set_bit_atomic(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit_atomic(NULL, bit, addr); -+} -+ -+static inline void mb_clear_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit(bit, addr); -+} -+ -+static inline void mb_clear_bit_atomic(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit_atomic(NULL, bit, addr); -+} -+ -+static inline int mb_find_next_zero_bit(void *addr, int max, int start) -+{ -+ int fix; -+#if BITS_PER_LONG == 64 -+ fix = ((unsigned long) addr & 7UL) << 3; -+ addr = (void *) ((unsigned long) addr & ~7UL); -+#elif BITS_PER_LONG == 32 -+ fix = ((unsigned long) addr & 3UL) << 3; -+ addr = (void *) ((unsigned long) addr & ~3UL); -+#else -+#error "how many bits you are?!" 
-+#endif -+ max += fix; -+ start += fix; -+ return ext2_find_next_zero_bit(addr, max, start) - fix; -+} -+ -+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) -+{ -+ char *bb; -+ -+ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); -+ J_ASSERT(max != NULL); -+ -+ if (order > e3b->bd_blkbits + 1) { -+ *max = 0; -+ return NULL; -+ } -+ -+ /* at order 0 we see each particular block */ -+ *max = 1 << (e3b->bd_blkbits + 3); -+ if (order == 0) -+ return EXT3_MB_BITMAP(e3b); -+ -+ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; -+ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; -+ -+ return bb; -+} -+ -+#ifdef AGGRESSIVE_CHECK -+ -+static void mb_check_buddy(struct ext3_buddy *e3b) -+{ -+ int order = e3b->bd_blkbits + 1; -+ int max, max2, i, j, k, count; -+ int fragments = 0, fstart; -+ void *buddy, *buddy2; -+ -+ if (!test_opt(e3b->bd_sb, MBALLOC)) -+ return; -+ -+ { -+ static int mb_check_counter = 0; -+ if (mb_check_counter++ % 300 != 0) -+ return; -+ } -+ -+ while (order > 1) { -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); -+ buddy2 = mb_find_buddy(e3b, order - 1, &max2); -+ J_ASSERT(buddy2); -+ J_ASSERT(buddy != buddy2); -+ J_ASSERT(max * 2 == max2); -+ -+ count = 0; -+ for (i = 0; i < max; i++) { -+ -+ if (mb_test_bit(i, buddy)) { -+ /* only single bit in buddy2 may be 1 */ -+ if (!mb_test_bit(i << 1, buddy2)) -+ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); -+ else if (!mb_test_bit((i << 1) + 1, buddy2)) -+ J_ASSERT(mb_test_bit(i << 1, buddy2)); -+ continue; -+ } -+ -+ /* both bits in buddy2 must be 0 */ -+ J_ASSERT(mb_test_bit(i << 1, buddy2)); -+ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); -+ -+ for (j = 0; j < (1 << order); j++) { -+ k = (i * (1 << order)) + j; -+ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); -+ } -+ count++; -+ } -+ J_ASSERT(e3b->bd_info->bb_counters[order] == count); -+ order--; -+ } -+ -+ fstart = -1; -+ buddy = mb_find_buddy(e3b, 0, &max); -+ for (i = 0; i < max; i++) { -+ if 
(!mb_test_bit(i, buddy)) { -+ J_ASSERT(i >= e3b->bd_info->bb_first_free); -+ if (fstart == -1) { -+ fragments++; -+ fstart = i; -+ } -+ continue; -+ } -+ fstart = -1; -+ /* check used bits only */ -+ for (j = 0; j < e3b->bd_blkbits + 1; j++) { -+ buddy2 = mb_find_buddy(e3b, j, &max2); -+ k = i >> j; -+ J_ASSERT(k < max2); -+ J_ASSERT(mb_test_bit(k, buddy2)); -+ } -+ } -+ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); -+ J_ASSERT(e3b->bd_info->bb_fragments == fragments); -+} -+ -+#else -+#define mb_check_buddy(e3b) -+#endif -+ -+/* find most significant bit */ -+static int inline fmsb(unsigned short word) -+{ -+ int order; -+ -+ if (word > 255) { -+ order = 7; -+ word >>= 8; -+ } else { -+ order = -1; -+ } -+ -+ do { -+ order++; -+ word >>= 1; -+ } while (word != 0); -+ -+ return order; -+} -+ -+static void inline -+ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, -+ int len, struct ext3_group_info *grp) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ unsigned short min, max, chunk, border; -+ -+ mb_debug("mark %u/%u free\n", first, len); -+ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); -+ -+ border = 2 << sb->s_blocksize_bits; -+ -+ while (len > 0) { -+ /* find how many blocks can be covered since this position */ -+ max = ffs(first | border) - 1; -+ -+ /* find how many blocks of power 2 we need to mark */ -+ min = fmsb(len); -+ -+ mb_debug(" %u/%u -> max %u, min %u\n", -+ first & ((2 << sb->s_blocksize_bits) - 1), -+ len, max, min); -+ -+ if (max < min) -+ min = max; -+ chunk = 1 << min; -+ -+ /* mark multiblock chunks only */ -+ grp->bb_counters[min]++; -+ if (min > 0) { -+ mb_debug(" set %u at %u \n", first >> min, -+ sbi->s_mb_offsets[min]); -+ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); -+ } -+ -+ len -= chunk; -+ first += chunk; -+ } -+} -+ -+static void -+ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, -+ int group) -+{ -+ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); -+ 
unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); -+ unsigned short i = 0, first, len; -+ unsigned free = 0, fragments = 0; -+ unsigned long long period = get_cycles(); -+ -+ i = mb_find_next_zero_bit(bitmap, max, 0); -+ grp->bb_first_free = i; -+ while (i < max) { -+ fragments++; -+ first = i; -+ i = ext2_find_next_le_bit(bitmap, max, i); -+ len = i - first; -+ free += len; -+ if (len > 1) -+ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); -+ else -+ grp->bb_counters[0]++; -+ if (i < max) -+ i = mb_find_next_zero_bit(bitmap, max, i); -+ } -+ grp->bb_fragments = fragments; -+ -+ /* bb_state shouldn't being modified because all -+ * others waits for init completion on page lock */ -+ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); -+ if (free != grp->bb_free) { -+ printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n", -+ group, free, grp->bb_free); -+ grp->bb_free = free; -+ } -+ -+ period = get_cycles() - period; -+ spin_lock(&EXT3_SB(sb)->s_bal_lock); -+ EXT3_SB(sb)->s_mb_buddies_generated++; -+ EXT3_SB(sb)->s_mb_generation_time += period; -+ spin_unlock(&EXT3_SB(sb)->s_bal_lock); -+} -+ -+static int ext3_mb_init_cache(struct page *page) -+{ -+ int blocksize, blocks_per_page, groups_per_page; -+ int err = 0, i, first_group, first_block; -+ struct super_block *sb; -+ struct buffer_head *bhs; -+ struct buffer_head **bh; -+ struct inode *inode; -+ char *data, *bitmap; -+ -+ mb_debug("init page %lu\n", page->index); -+ -+ inode = page->mapping->host; -+ sb = inode->i_sb; -+ blocksize = 1 << inode->i_blkbits; -+ blocks_per_page = PAGE_CACHE_SIZE / blocksize; -+ -+ groups_per_page = blocks_per_page >> 1; -+ if (groups_per_page == 0) -+ groups_per_page = 1; -+ -+ /* allocate buffer_heads to read bitmaps */ -+ if (groups_per_page > 1) { -+ err = -ENOMEM; -+ i = sizeof(struct buffer_head *) * groups_per_page; -+ bh = kmalloc(i, GFP_NOFS); -+ if (bh == NULL) -+ goto out; -+ memset(bh, 0, i); -+ } else -+ bh = &bhs; -+ -+ first_group = page->index * 
blocks_per_page / 2; -+ -+ /* read all groups the page covers into the cache */ -+ for (i = 0; i < groups_per_page; i++) { -+ struct ext3_group_desc * desc; -+ -+ if (first_group + i >= EXT3_SB(sb)->s_groups_count) -+ break; -+ -+ err = -EIO; -+ desc = ext3_get_group_desc(sb, first_group + i, NULL); -+ if (desc == NULL) -+ goto out; -+ -+ err = -ENOMEM; -+ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); -+ if (bh[i] == NULL) -+ goto out; -+ -+ if (buffer_uptodate(bh[i])) -+ continue; -+ -+ lock_buffer(bh[i]); -+ if (buffer_uptodate(bh[i])) { -+ unlock_buffer(bh[i]); -+ continue; -+ } -+ -+ get_bh(bh[i]); -+ bh[i]->b_end_io = end_buffer_read_sync; -+ submit_bh(READ, bh[i]); -+ mb_debug("read bitmap for group %u\n", first_group + i); -+ } -+ -+ /* wait for I/O completion */ -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ wait_on_buffer(bh[i]); -+ -+ err = -EIO; -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ if (!buffer_uptodate(bh[i])) -+ goto out; -+ -+ first_block = page->index * blocks_per_page; -+ for (i = 0; i < blocks_per_page; i++) { -+ int group; -+ -+ group = (first_block + i) >> 1; -+ if (group >= EXT3_SB(sb)->s_groups_count) -+ break; -+ -+ data = page_address(page) + (i * blocksize); -+ bitmap = bh[group - first_group]->b_data; -+ -+ if ((first_block + i) & 1) { -+ /* this is block of buddy */ -+ mb_debug("put buddy for group %u in page %lu/%x\n", -+ group, page->index, i * blocksize); -+ memset(data, 0xff, blocksize); -+ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; -+ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, -+ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); -+ ext3_mb_generate_buddy(sb, data, bitmap, group); -+ } else { -+ /* this is block of bitmap */ -+ mb_debug("put bitmap for group %u in page %lu/%x\n", -+ group, page->index, i * blocksize); -+ memcpy(data, bitmap, blocksize); -+ } -+ } -+ SetPageUptodate(page); -+ -+out: -+ if (bh) { -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ brelse(bh[i]); -+ if (bh != 
&bhs) -+ kfree(bh); -+ } -+ return err; -+} -+ -+static int ext3_mb_load_buddy(struct super_block *sb, int group, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct inode *inode = sbi->s_buddy_cache; -+ int blocks_per_page, block, pnum, poff; -+ struct page *page; -+ -+ mb_debug("load group %u\n", group); -+ -+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; -+ -+ e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_info = EXT3_GROUP_INFO(sb, group); -+ e3b->bd_sb = sb; -+ e3b->bd_group = group; -+ e3b->bd_buddy_page = NULL; -+ e3b->bd_bitmap_page = NULL; -+ -+ block = group * 2; -+ pnum = block / blocks_per_page; -+ poff = block % blocks_per_page; -+ -+ /* we could use find_or_create_page(), but it locks page -+ * what we'd like to avoid in fast path ... */ -+ page = find_get_page(inode->i_mapping, pnum); -+ if (page == NULL || !PageUptodate(page)) { -+ if (page) -+ page_cache_release(page); -+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); -+ if (page) { -+ BUG_ON(page->mapping != inode->i_mapping); -+ if (!PageUptodate(page)) -+ ext3_mb_init_cache(page); -+ unlock_page(page); -+ } -+ } -+ if (page == NULL || !PageUptodate(page)) -+ goto err; -+ e3b->bd_bitmap_page = page; -+ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); -+ mark_page_accessed(page); -+ -+ block++; -+ pnum = block / blocks_per_page; -+ poff = block % blocks_per_page; -+ -+ page = find_get_page(inode->i_mapping, pnum); -+ if (page == NULL || !PageUptodate(page)) { -+ if (page) -+ page_cache_release(page); -+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); -+ if (page) { -+ BUG_ON(page->mapping != inode->i_mapping); -+ if (!PageUptodate(page)) -+ ext3_mb_init_cache(page); -+ unlock_page(page); -+ } -+ } -+ if (page == NULL || !PageUptodate(page)) -+ goto err; -+ e3b->bd_buddy_page = page; -+ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); -+ mark_page_accessed(page); -+ -+ J_ASSERT(e3b->bd_bitmap_page 
!= NULL); -+ J_ASSERT(e3b->bd_buddy_page != NULL); -+ -+ return 0; -+ -+err: -+ if (e3b->bd_bitmap_page) -+ page_cache_release(e3b->bd_bitmap_page); -+ if (e3b->bd_buddy_page) -+ page_cache_release(e3b->bd_buddy_page); -+ e3b->bd_buddy = NULL; -+ e3b->bd_bitmap = NULL; -+ return -EIO; -+} -+ -+static void ext3_mb_release_desc(struct ext3_buddy *e3b) -+{ -+ if (e3b->bd_bitmap_page) -+ page_cache_release(e3b->bd_bitmap_page); -+ if (e3b->bd_buddy_page) -+ page_cache_release(e3b->bd_buddy_page); -+} -+ -+ -+static inline void -+ext3_lock_group(struct super_block *sb, int group) -+{ -+ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_GROUP_INFO(sb, group)->bb_state); -+} -+ -+static inline void -+ext3_unlock_group(struct super_block *sb, int group) -+{ -+ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_GROUP_INFO(sb, group)->bb_state); -+} -+ -+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) -+{ -+ int order = 1; -+ void *bb; -+ -+ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); -+ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); -+ -+ bb = EXT3_MB_BUDDY(e3b); -+ while (order <= e3b->bd_blkbits + 1) { -+ block = block >> 1; -+ if (!mb_test_bit(block, bb)) { -+ /* this block is part of buddy of order 'order' */ -+ return order; -+ } -+ bb += 1 << (e3b->bd_blkbits - order); -+ order++; -+ } -+ return 0; -+} -+ -+static inline void mb_clear_bits(void *bm, int cur, int len) -+{ -+ __u32 *addr; -+ -+ len = cur + len; -+ while (cur < len) { -+ if ((cur & 31) == 0 && (len - cur) >= 32) { -+ /* fast path: clear whole word at once */ -+ addr = bm + (cur >> 3); -+ *addr = 0; -+ cur += 32; -+ continue; -+ } -+ mb_clear_bit_atomic(cur, bm); -+ cur++; -+ } -+} -+ -+static inline void mb_set_bits(void *bm, int cur, int len) -+{ -+ __u32 *addr; -+ -+ len = cur + len; -+ while (cur < len) { -+ if ((cur & 31) == 0 && (len - cur) >= 32) { -+ /* fast path: clear whole word at once */ -+ addr = bm + (cur >> 3); -+ *addr = 0xffffffff; -+ cur += 32; -+ 
continue; -+ } -+ mb_set_bit_atomic(cur, bm); -+ cur++; -+ } -+} -+ -+static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) -+{ -+ int block = 0, max = 0, order; -+ void *buddy, *buddy2; -+ -+ mb_check_buddy(e3b); -+ -+ e3b->bd_info->bb_free += count; -+ if (first < e3b->bd_info->bb_first_free) -+ e3b->bd_info->bb_first_free = first; -+ -+ /* let's maintain fragments counter */ -+ if (first != 0) -+ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); -+ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) -+ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); -+ if (block && max) -+ e3b->bd_info->bb_fragments--; -+ else if (!block && !max) -+ e3b->bd_info->bb_fragments++; -+ -+ /* let's maintain buddy itself */ -+ while (count-- > 0) { -+ block = first++; -+ order = 0; -+ -+ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); -+ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); -+ e3b->bd_info->bb_counters[order]++; -+ -+ /* start of the buddy */ -+ buddy = mb_find_buddy(e3b, order, &max); -+ -+ do { -+ block &= ~1UL; -+ if (mb_test_bit(block, buddy) || -+ mb_test_bit(block + 1, buddy)) -+ break; -+ -+ /* both the buddies are free, try to coalesce them */ -+ buddy2 = mb_find_buddy(e3b, order + 1, &max); -+ -+ if (!buddy2) -+ break; -+ -+ if (order > 0) { -+ /* for special purposes, we don't set -+ * free bits in bitmap */ -+ mb_set_bit(block, buddy); -+ mb_set_bit(block + 1, buddy); -+ } -+ e3b->bd_info->bb_counters[order]--; -+ e3b->bd_info->bb_counters[order]--; -+ -+ block = block >> 1; -+ order++; -+ e3b->bd_info->bb_counters[order]++; -+ -+ mb_clear_bit(block, buddy2); -+ buddy = buddy2; -+ } while (1); -+ } -+ mb_check_buddy(e3b); -+ -+ return 0; -+} -+ -+static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, -+ int needed, struct ext3_free_extent *ex) -+{ -+ int next = block, max, ord; -+ void *buddy; -+ -+ J_ASSERT(ex != NULL); -+ -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); -+ J_ASSERT(block < max); -+ 
if (mb_test_bit(block, buddy)) { -+ ex->fe_len = 0; -+ ex->fe_start = 0; -+ ex->fe_group = 0; -+ return 0; -+ } -+ -+ if (likely(order == 0)) { -+ /* find actual order */ -+ order = mb_find_order_for_block(e3b, block); -+ block = block >> order; -+ } -+ -+ ex->fe_len = 1 << order; -+ ex->fe_start = block << order; -+ ex->fe_group = e3b->bd_group; -+ -+ /* calc difference from given start */ -+ next = next - ex->fe_start; -+ ex->fe_len -= next; -+ ex->fe_start += next; -+ -+ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { -+ -+ if (block + 1 >= max) -+ break; -+ -+ next = (block + 1) * (1 << order); -+ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) -+ break; -+ -+ ord = mb_find_order_for_block(e3b, next); -+ -+ order = ord; -+ block = next >> order; -+ ex->fe_len += 1 << order; -+ } -+ -+ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); -+ return ex->fe_len; -+} -+ -+static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) -+{ -+ int ord, mlen = 0, max = 0, cur; -+ int start = ex->fe_start; -+ int len = ex->fe_len; -+ unsigned ret = 0; -+ int len0 = len; -+ void *buddy; -+ -+ mb_check_buddy(e3b); -+ -+ e3b->bd_info->bb_free -= len; -+ if (e3b->bd_info->bb_first_free == start) -+ e3b->bd_info->bb_first_free += len; -+ -+ /* let's maintain fragments counter */ -+ if (start != 0) -+ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); -+ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) -+ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); -+ if (mlen && max) -+ e3b->bd_info->bb_fragments++; -+ else if (!mlen && !max) -+ e3b->bd_info->bb_fragments--; -+ -+ /* let's maintain buddy itself */ -+ while (len) { -+ ord = mb_find_order_for_block(e3b, start); -+ -+ if (((start >> ord) << ord) == start && len >= (1 << ord)) { -+ /* the whole chunk may be allocated at once! 
*/ -+ mlen = 1 << ord; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ J_ASSERT((start >> ord) < max); -+ mb_set_bit(start >> ord, buddy); -+ e3b->bd_info->bb_counters[ord]--; -+ start += mlen; -+ len -= mlen; -+ J_ASSERT(len >= 0); -+ continue; -+ } -+ -+ /* store for history */ -+ if (ret == 0) -+ ret = len | (ord << 16); -+ -+ /* we have to split large buddy */ -+ J_ASSERT(ord > 0); -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_set_bit(start >> ord, buddy); -+ e3b->bd_info->bb_counters[ord]--; -+ -+ ord--; -+ cur = (start >> ord) & ~1U; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_clear_bit(cur, buddy); -+ mb_clear_bit(cur + 1, buddy); -+ e3b->bd_info->bb_counters[ord]++; -+ e3b->bd_info->bb_counters[ord]++; -+ } -+ -+ /* now drop all the bits in bitmap */ -+ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); -+ -+ mb_check_buddy(e3b); -+ -+ return ret; -+} -+ -+/* -+ * Must be called under group lock! -+ */ -+static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ unsigned long ret; -+ -+ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); -+ ret = mb_mark_used(e3b, &ac->ac_b_ex); -+ -+ ac->ac_status = AC_STATUS_FOUND; -+ ac->ac_tail = ret & 0xffff; -+ ac->ac_buddy = ret >> 16; -+ -+ /* hold in-core structures until allocated -+ * blocks are marked non-free in on-disk bitmap */ -+ ac->ac_buddy_page = e3b->bd_buddy_page; -+ page_cache_get(e3b->bd_buddy_page); -+ ac->ac_bitmap_page = e3b->bd_bitmap_page; -+ page_cache_get(e3b->bd_bitmap_page); -+} -+ -+/* -+ * The routine checks whether found extent is good enough. If it is, -+ * then the extent gets marked used and flag is set to the context -+ * to stop scanning. Otherwise, the extent is compared with the -+ * previous found extent and if new one is better, then it's stored -+ * in the context. Later, the best found extent will be used, if -+ * mballoc can't find good enough extent. -+ * -+ * FIXME: real allocation policy is to be designed yet! 
-+ */ -+static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, -+ struct ext3_free_extent *ex, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_free_extent *bex = &ac->ac_b_ex; -+ struct ext3_free_extent *gex = &ac->ac_g_ex; -+ -+ J_ASSERT(ex->fe_len > 0); -+ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); -+ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); -+ -+ ac->ac_found++; -+ -+ /* -+ * The special case - take what you catch first -+ */ -+ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * Let's check whether the chunk is good enough -+ */ -+ if (ex->fe_len == gex->fe_len) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * If this is first found extent, just store it in the context -+ */ -+ if (bex->fe_len == 0) { -+ *bex = *ex; -+ return; -+ } -+ -+ /* -+ * If new found extent is better, store it in the context -+ */ -+ if (bex->fe_len < gex->fe_len) { -+ /* if the request isn't satisfied, any found extent -+ * larger than previous best one is better */ -+ if (ex->fe_len > bex->fe_len) -+ *bex = *ex; -+ } else if (ex->fe_len > gex->fe_len) { -+ /* if the request is satisfied, then we try to find -+ * an extent that still satisfy the request, but is -+ * smaller than previous one */ -+ *bex = *ex; -+ } -+ -+ /* -+ * Let's scan at least few extents and don't pick up a first one -+ */ -+ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) -+ ac->ac_status = AC_STATUS_BREAK; -+ -+ /* -+ * We don't want to scan for a whole year -+ */ -+ if (ac->ac_found > ext3_mb_max_to_scan) -+ ac->ac_status = AC_STATUS_BREAK; -+} -+ -+static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_free_extent ex = ac->ac_b_ex; -+ int group = ex.fe_group, max, err; -+ -+ J_ASSERT(ex.fe_len > 0); -+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); -+ if 
(err) -+ return err; -+ -+ ext3_lock_group(ac->ac_sb, group); -+ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); -+ -+ if (max > 0) { -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ -+ ext3_unlock_group(ac->ac_sb, group); -+ -+ ext3_mb_release_desc(e3b); -+ -+ return 0; -+} -+ -+static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ int group = ac->ac_g_ex.fe_group, max, err; -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ struct ext3_super_block *es = sbi->s_es; -+ struct ext3_free_extent ex; -+ -+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); -+ if (err) -+ return err; -+ -+ ext3_lock_group(ac->ac_sb, group); -+ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, -+ ac->ac_g_ex.fe_len, &ex); -+ -+ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { -+ unsigned long start; -+ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + -+ ex.fe_start + le32_to_cpu(es->s_first_data_block)); -+ if (start % sbi->s_stripe == 0) { -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ } else if (max >= ac->ac_g_ex.fe_len) { -+ J_ASSERT(ex.fe_len > 0); -+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); -+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { -+ /* Sometimes, caller may want to merge even small -+ * number of blocks to an existing extent */ -+ J_ASSERT(ex.fe_len > 0); -+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); -+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ ext3_unlock_group(ac->ac_sb, group); -+ -+ ext3_mb_release_desc(e3b); -+ -+ return 0; -+} -+ -+/* -+ * The routine scans buddy structures (not bitmap!) 
from given order -+ * to max order and tries to find big enough chunk to satisfy the req -+ */ -+static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ struct ext3_group_info *grp = e3b->bd_info; -+ void *buddy; -+ int i, k, max; -+ -+ J_ASSERT(ac->ac_2order > 0); -+ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { -+ if (grp->bb_counters[i] == 0) -+ continue; -+ -+ buddy = mb_find_buddy(e3b, i, &max); -+ if (buddy == NULL) { -+ printk(KERN_ALERT "looking for wrong order?\n"); -+ break; -+ } -+ -+ k = mb_find_next_zero_bit(buddy, max, 0); -+ J_ASSERT(k < max); -+ -+ ac->ac_found++; -+ -+ ac->ac_b_ex.fe_len = 1 << i; -+ ac->ac_b_ex.fe_start = k << i; -+ ac->ac_b_ex.fe_group = e3b->bd_group; -+ -+ ext3_mb_use_best_found(ac, e3b); -+ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); -+ -+ if (unlikely(ext3_mb_stats)) -+ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); -+ -+ break; -+ } -+} -+ -+/* -+ * The routine scans the group and measures all found extents. -+ * In order to optimize scanning, caller must pass number of -+ * free blocks in the group, so the routine can know upper limit. 
-+ */ -+static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ void *bitmap = EXT3_MB_BITMAP(e3b); -+ struct ext3_free_extent ex; -+ int i, free; -+ -+ free = e3b->bd_info->bb_free; -+ J_ASSERT(free > 0); -+ -+ i = e3b->bd_info->bb_first_free; -+ -+ while (free && ac->ac_status == AC_STATUS_CONTINUE) { -+ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); -+ if (i >= sb->s_blocksize * 8) { -+ J_ASSERT(free == 0); -+ break; -+ } -+ -+ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); -+ J_ASSERT(ex.fe_len > 0); -+ J_ASSERT(free >= ex.fe_len); -+ -+ ext3_mb_measure_extent(ac, &ex, e3b); -+ -+ i += ex.fe_len; -+ free -= ex.fe_len; -+ } -+} -+ -+/* -+ * This is a special case for storages like raid5 -+ * we try to find stripe-aligned chunks for stripe-size requests -+ */ -+static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ void *bitmap = EXT3_MB_BITMAP(e3b); -+ struct ext3_free_extent ex; -+ unsigned long i, max; -+ -+ J_ASSERT(sbi->s_stripe != 0); -+ -+ /* find first stripe-aligned block */ -+ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) + -+ le32_to_cpu(sbi->s_es->s_first_data_block); -+ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; -+ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb); -+ -+ while (i < sb->s_blocksize * 8) { -+ if (!mb_test_bit(i, bitmap)) { -+ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); -+ if (max >= sbi->s_stripe) { -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ break; -+ } -+ } -+ i += sbi->s_stripe; -+ } -+} -+ -+static int ext3_mb_good_group(struct ext3_allocation_context *ac, -+ int group, int cr) -+{ -+ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); -+ unsigned free, fragments, i, bits; -+ -+ 
J_ASSERT(cr >= 0 && cr < 4); -+ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); -+ -+ free = grp->bb_free; -+ fragments = grp->bb_fragments; -+ if (free == 0) -+ return 0; -+ if (fragments == 0) -+ return 0; -+ -+ switch (cr) { -+ case 0: -+ J_ASSERT(ac->ac_2order != 0); -+ bits = ac->ac_sb->s_blocksize_bits + 1; -+ for (i = ac->ac_2order; i <= bits; i++) -+ if (grp->bb_counters[i] > 0) -+ return 1; -+ break; -+ case 1: -+ if ((free / fragments) >= ac->ac_g_ex.fe_len) -+ return 1; -+ break; -+ case 2: -+ if (free >= ac->ac_g_ex.fe_len) -+ return 1; -+ break; -+ case 3: -+ return 1; -+ default: -+ BUG(); -+ } -+ -+ return 0; -+} -+ -+int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, -+ unsigned long goal, int *len, int flags, int *errp) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_allocation_context ac; -+ int i, group, block, cr, err = 0; -+ struct ext3_group_desc *gdp; -+ struct ext3_super_block *es; -+ struct buffer_head *gdp_bh; -+ struct ext3_sb_info *sbi; -+ struct super_block *sb; -+ struct ext3_buddy e3b; -+ -+ J_ASSERT(len != NULL); -+ J_ASSERT(*len > 0); -+ -+ sb = inode->i_sb; -+ if (!sb) { -+ printk("ext3_mb_new_nblocks: nonexistent device"); -+ return 0; -+ } -+ -+ if (!test_opt(sb, MBALLOC)) { -+ static int ext3_mballoc_warning = 0; -+ if (ext3_mballoc_warning == 0) { -+ printk(KERN_ERR "EXT3-fs: multiblock request with " -+ "mballoc disabled!\n"); -+ ext3_mballoc_warning++; -+ } -+ *len = 1; -+ err = ext3_new_block_old(handle, inode, goal, errp); -+ return err; -+ } -+ -+ ext3_mb_poll_new_transaction(sb, handle); -+ -+ sbi = EXT3_SB(sb); -+ es = EXT3_SB(sb)->s_es; -+ -+ /* -+ * We can't allocate > group size -+ */ -+ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) -+ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; -+ -+ if (!(flags & EXT3_MB_HINT_RESERVED)) { -+ /* someone asks for non-reserved blocks */ -+ BUG_ON(*len > 1); -+ err = ext3_mb_reserve_blocks(sb, 1); -+ if (err) { -+ *errp = err; -+ return 0; -+ } -+ } -+ -+ ac.ac_buddy_page = NULL; 
-+ ac.ac_bitmap_page = NULL; -+ -+ /* -+ * Check quota for allocation of this blocks. -+ */ -+ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) -+ *len -= 1; -+ if (*len == 0) { -+ *errp = -EDQUOT; -+ block = 0; -+ goto out; -+ } -+ -+ /* start searching from the goal */ -+ if (goal < le32_to_cpu(es->s_first_data_block) || -+ goal >= le32_to_cpu(es->s_blocks_count)) -+ goal = le32_to_cpu(es->s_first_data_block); -+ group = (goal - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ block = ((goal - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb)); -+ -+ /* set up allocation goals */ -+ ac.ac_b_ex.fe_group = 0; -+ ac.ac_b_ex.fe_start = 0; -+ ac.ac_b_ex.fe_len = 0; -+ ac.ac_status = AC_STATUS_CONTINUE; -+ ac.ac_groups_scanned = 0; -+ ac.ac_ex_scanned = 0; -+ ac.ac_found = 0; -+ ac.ac_sb = inode->i_sb; -+ ac.ac_g_ex.fe_group = group; -+ ac.ac_g_ex.fe_start = block; -+ ac.ac_g_ex.fe_len = *len; -+ ac.ac_flags = flags; -+ ac.ac_2order = 0; -+ ac.ac_criteria = 0; -+ -+ if (*len == 1 && sbi->s_stripe) { -+ /* looks like a metadata, let's use a dirty hack for raid5 -+ * move all metadata in first groups in hope to hit cached -+ * sectors and thus avoid read-modify cycles in raid5 */ -+ ac.ac_g_ex.fe_group = group = 0; -+ } -+ -+ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ -+ i = ffs(*len); -+ if (i >= ext3_mb_order2_reqs) { -+ i--; -+ if ((*len & (~(1 << i))) == 0) -+ ac.ac_2order = i; -+ } -+ -+ /* first, try the goal */ -+ err = ext3_mb_find_by_goal(&ac, &e3b); -+ if (err) -+ goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) -+ goto found; -+ -+ /* Let's just scan groups to find more-less suitable blocks */ -+ cr = ac.ac_2order ? 
0 : 1; -+repeat: -+ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { -+ ac.ac_criteria = cr; -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { -+ if (group == EXT3_SB(sb)->s_groups_count) -+ group = 0; -+ -+ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { -+ /* we need full data about the group -+ * to make a good selection */ -+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); -+ if (err) -+ goto out_err; -+ ext3_mb_release_desc(&e3b); -+ } -+ -+ /* check is group good for our criteries */ -+ if (!ext3_mb_good_group(&ac, group, cr)) -+ continue; -+ -+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); -+ if (err) -+ goto out_err; -+ -+ ext3_lock_group(sb, group); -+ if (!ext3_mb_good_group(&ac, group, cr)) { -+ /* someone did allocation from this group */ -+ ext3_unlock_group(sb, group); -+ ext3_mb_release_desc(&e3b); -+ continue; -+ } -+ -+ ac.ac_groups_scanned++; -+ if (cr == 0) -+ ext3_mb_simple_scan_group(&ac, &e3b); -+ else if (cr == 1 && *len == sbi->s_stripe) -+ ext3_mb_scan_aligned(&ac, &e3b); -+ else -+ ext3_mb_complex_scan_group(&ac, &e3b); -+ -+ ext3_unlock_group(sb, group); -+ -+ ext3_mb_release_desc(&e3b); -+ -+ if (ac.ac_status != AC_STATUS_CONTINUE) -+ break; -+ } -+ } -+ -+ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && -+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { -+ /* -+ * We've been searching too long. Let's try to allocate -+ * the best chunk we've found so far -+ */ -+ -+ /*if (ac.ac_found > ext3_mb_max_to_scan) -+ printk(KERN_DEBUG "EXT3-fs: too long searching at " -+ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, -+ ac.ac_g_ex.fe_len);*/ -+ ext3_mb_try_best_found(&ac, &e3b); -+ if (ac.ac_status != AC_STATUS_FOUND) { -+ /* -+ * Someone more lucky has already allocated it. 
-+ * The only thing we can do is just take first -+ * found block(s) -+ printk(KERN_DEBUG "EXT3-fs: someone won our chunk\n"); -+ */ -+ ac.ac_b_ex.fe_group = 0; -+ ac.ac_b_ex.fe_start = 0; -+ ac.ac_b_ex.fe_len = 0; -+ ac.ac_status = AC_STATUS_CONTINUE; -+ ac.ac_flags |= EXT3_MB_HINT_FIRST; -+ cr = 3; -+ goto repeat; -+ } -+ } -+ -+ if (ac.ac_status != AC_STATUS_FOUND) { -+ /* -+ * We aren't lucky definitely -+ */ -+ DQUOT_FREE_BLOCK(inode, *len); -+ *errp = -ENOSPC; -+ block = 0; -+#if 1 -+ printk(KERN_ERR "EXT3-fs: can't allocate: status %d flags %d\n", -+ ac.ac_status, ac.ac_flags); -+ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d cr %d\n", -+ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, -+ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); -+ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", -+ sbi->s_blocks_reserved, ac.ac_found); -+ printk("EXT3-fs: groups: "); -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); -+ printk("\n"); -+#endif -+ goto out; -+ } -+ -+found: -+ J_ASSERT(ac.ac_b_ex.fe_len > 0); -+ -+ /* good news - free block(s) have been found. 
now it's time -+ * to mark block(s) in good old journaled bitmap */ -+ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + ac.ac_b_ex.fe_start -+ + le32_to_cpu(es->s_first_data_block); -+ -+ /* we made a desicion, now mark found blocks in good old -+ * bitmap to be journaled */ -+ -+ ext3_debug("using block group %d(%d)\n", -+ ac.ac_b_group.group, gdp->bg_free_blocks_count); -+ -+ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); -+ if (!bitmap_bh) { -+ *errp = -EIO; -+ goto out_err; -+ } -+ -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) { -+ *errp = err; -+ goto out_err; -+ } -+ -+ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); -+ if (!gdp) { -+ *errp = -EIO; -+ goto out_err; -+ } -+ -+ err = ext3_journal_get_write_access(handle, gdp_bh); -+ if (err) -+ goto out_err; -+ -+ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + ac.ac_b_ex.fe_start -+ + le32_to_cpu(es->s_first_data_block); -+ -+ if (block == le32_to_cpu(gdp->bg_block_bitmap) || -+ block == le32_to_cpu(gdp->bg_inode_bitmap) || -+ in_range(block, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group)) -+ ext3_error(sb, "ext3_new_block", -+ "Allocating block in system zone - " -+ "block = %u", block); -+#ifdef AGGRESSIVE_CHECK -+ for (i = 0; i < ac.ac_b_ex.fe_len; i++) -+ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); -+#endif -+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); -+ -+ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -+ - ac.ac_b_ex.fe_len); -+ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); -+ -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ if (err) -+ goto out_err; -+ err = ext3_journal_dirty_metadata(handle, gdp_bh); -+ if (err) -+ goto out_err; -+ -+ sb->s_dirt = 1; -+ *errp = 0; -+ 
brelse(bitmap_bh); -+ -+ /* drop non-allocated, but dquote'd blocks */ -+ J_ASSERT(*len >= ac.ac_b_ex.fe_len); -+ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); -+ -+ *len = ac.ac_b_ex.fe_len; -+ J_ASSERT(*len > 0); -+ J_ASSERT(block != 0); -+ goto out; -+ -+out_err: -+ /* if we've already allocated something, roll it back */ -+ if (ac.ac_status == AC_STATUS_FOUND) { -+ /* FIXME: free blocks here */ -+ } -+ -+ DQUOT_FREE_BLOCK(inode, *len); -+ brelse(bitmap_bh); -+ *errp = err; -+ block = 0; -+out: -+ if (ac.ac_buddy_page) -+ page_cache_release(ac.ac_buddy_page); -+ if (ac.ac_bitmap_page) -+ page_cache_release(ac.ac_bitmap_page); -+ -+ if (!(flags & EXT3_MB_HINT_RESERVED)) { -+ /* block wasn't reserved before and we reserved it -+ * at the beginning of allocation. it doesn't matter -+ * whether we allocated anything or we failed: time -+ * to release reservation. NOTE: because I expect -+ * any multiblock request from delayed allocation -+ * path only, here is single block always */ -+ ext3_mb_release_blocks(sb, 1); -+ } -+ -+ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { -+ atomic_inc(&sbi->s_bal_reqs); -+ atomic_add(*len, &sbi->s_bal_allocated); -+ if (*len >= ac.ac_g_ex.fe_len) -+ atomic_inc(&sbi->s_bal_success); -+ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); -+ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && -+ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) -+ atomic_inc(&sbi->s_bal_goals); -+ if (ac.ac_found > ext3_mb_max_to_scan) -+ atomic_inc(&sbi->s_bal_breaks); -+ } -+ -+ ext3_mb_store_history(sb, inode->i_ino, &ac); -+ -+ return block; -+} -+EXPORT_SYMBOL(ext3_mb_new_blocks); -+ -+#ifdef EXT3_MB_HISTORY -+struct ext3_mb_proc_session { -+ struct ext3_mb_history *history; -+ struct super_block *sb; -+ int start; -+ int max; -+}; -+ -+static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, -+ struct ext3_mb_history *hs, -+ int first) -+{ -+ if (hs == s->history + s->max) -+ hs = s->history; -+ if (!first && hs == s->history 
+ s->start) -+ return NULL; -+ while (hs->goal.fe_len == 0) { -+ hs++; -+ if (hs == s->history + s->max) -+ hs = s->history; -+ if (hs == s->history + s->start) -+ return NULL; -+ } -+ return hs; -+} -+ -+static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) -+{ -+ struct ext3_mb_proc_session *s = seq->private; -+ struct ext3_mb_history *hs; -+ int l = *pos; -+ -+ if (l == 0) -+ return SEQ_START_TOKEN; -+ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); -+ if (!hs) -+ return NULL; -+ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); -+ return hs; -+} -+ -+static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) -+{ -+ struct ext3_mb_proc_session *s = seq->private; -+ struct ext3_mb_history *hs = v; -+ -+ ++*pos; -+ if (v == SEQ_START_TOKEN) -+ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); -+ else -+ return ext3_mb_history_skip_empty(s, ++hs, 0); -+} -+ -+static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) -+{ -+ struct ext3_mb_history *hs = v; -+ char buf[20], buf2[20]; -+ -+ if (v == SEQ_START_TOKEN) { -+ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", -+ "pid", "inode", "goal", "result", "found", "grps", "cr", -+ "merge", "tail", "broken"); -+ return 0; -+ } -+ -+ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, -+ hs->goal.fe_start, hs->goal.fe_len); -+ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, -+ hs->result.fe_start, hs->result.fe_len); -+ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", -+ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, -+ hs->cr, hs->merged ? "M" : "", hs->tail, -+ hs->buddy ? 
1 << hs->buddy : 0); -+ return 0; -+} -+ -+static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) -+{ -+} -+ -+static struct seq_operations ext3_mb_seq_history_ops = { -+ .start = ext3_mb_seq_history_start, -+ .next = ext3_mb_seq_history_next, -+ .stop = ext3_mb_seq_history_stop, -+ .show = ext3_mb_seq_history_show, -+}; -+ -+static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) -+{ -+ struct super_block *sb = PDE(inode)->data; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_mb_proc_session *s; -+ int rc, size; -+ -+ s = kmalloc(sizeof(*s), GFP_KERNEL); -+ if (s == NULL) -+ return -EIO; -+ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; -+ s->history = kmalloc(size, GFP_KERNEL); -+ if (s == NULL) { -+ kfree(s); -+ return -EIO; -+ } -+ -+ spin_lock(&sbi->s_mb_history_lock); -+ memcpy(s->history, sbi->s_mb_history, size); -+ s->max = sbi->s_mb_history_max; -+ s->start = sbi->s_mb_history_cur % s->max; -+ spin_unlock(&sbi->s_mb_history_lock); -+ -+ rc = seq_open(file, &ext3_mb_seq_history_ops); -+ if (rc == 0) { -+ struct seq_file *m = (struct seq_file *)file->private_data; -+ m->private = s; -+ } else { -+ kfree(s->history); -+ kfree(s); -+ } -+ return rc; -+ -+} -+ -+static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) -+{ -+ struct seq_file *seq = (struct seq_file *)file->private_data; -+ struct ext3_mb_proc_session *s = seq->private; -+ kfree(s->history); -+ kfree(s); -+ return seq_release(inode, file); -+} -+ -+static struct file_operations ext3_mb_seq_history_fops = { -+ .owner = THIS_MODULE, -+ .open = ext3_mb_seq_history_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = ext3_mb_seq_history_release, -+}; -+ -+static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) -+{ -+ struct super_block *sb = seq->private; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ long group; -+ -+ if (*pos < 0 || *pos >= sbi->s_groups_count) -+ return NULL; -+ -+ group = 
*pos + 1; -+ return (void *) group; -+} -+ -+static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) -+{ -+ struct super_block *sb = seq->private; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ long group; -+ -+ ++*pos; -+ if (*pos < 0 || *pos >= sbi->s_groups_count) -+ return NULL; -+ group = *pos + 1; -+ return (void *) group;; -+} -+ -+static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) -+{ -+ struct super_block *sb = seq->private; -+ long group = (long) v, i; -+ struct sg { -+ struct ext3_group_info info; -+ unsigned short counters[16]; -+ } sg; -+ -+ group--; -+ if (group == 0) -+ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", -+ "group", "free", "frags", "first", "2^0", "2^1", "2^2", -+ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", -+ "2^11", "2^12", "2^13"); -+ -+ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + -+ sizeof(struct ext3_group_info); -+ ext3_lock_group(sb, group); -+ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); -+ ext3_unlock_group(sb, group); -+ -+ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) -+ return 0; -+ -+ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, -+ sg.info.bb_fragments, sg.info.bb_first_free); -+ for (i = 0; i <= 13; i++) -+ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
-+ sg.info.bb_counters[i] : 0); -+ seq_printf(seq, " ]\n"); -+ -+ return 0; -+} -+ -+static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) -+{ -+} -+ -+static struct seq_operations ext3_mb_seq_groups_ops = { -+ .start = ext3_mb_seq_groups_start, -+ .next = ext3_mb_seq_groups_next, -+ .stop = ext3_mb_seq_groups_stop, -+ .show = ext3_mb_seq_groups_show, -+}; -+ -+static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) -+{ -+ struct super_block *sb = PDE(inode)->data; -+ int rc; -+ -+ rc = seq_open(file, &ext3_mb_seq_groups_ops); -+ if (rc == 0) { -+ struct seq_file *m = (struct seq_file *)file->private_data; -+ m->private = sb; -+ } -+ return rc; -+ -+} -+ -+static struct file_operations ext3_mb_seq_groups_fops = { -+ .owner = THIS_MODULE, -+ .open = ext3_mb_seq_groups_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ -+static void ext3_mb_history_release(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ char name[64]; -+ -+ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); -+ remove_proc_entry("mb_groups", sbi->s_mb_proc); -+ remove_proc_entry("mb_history", sbi->s_mb_proc); -+ remove_proc_entry(name, proc_root_ext3); -+ -+ if (sbi->s_mb_history) -+ kfree(sbi->s_mb_history); -+} -+ -+static void ext3_mb_history_init(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ char name[64]; -+ int i; -+ -+ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); -+ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); -+ if (sbi->s_mb_proc != NULL) { -+ struct proc_dir_entry *p; -+ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); -+ if (p) { -+ p->proc_fops = &ext3_mb_seq_history_fops; -+ p->data = sb; -+ } -+ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); -+ if (p) { -+ p->proc_fops = &ext3_mb_seq_groups_fops; -+ p->data = sb; -+ } -+ } -+ -+ sbi->s_mb_history_max = 1000; -+ sbi->s_mb_history_cur = 0; -+ 
spin_lock_init(&sbi->s_mb_history_lock); -+ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); -+ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); -+ memset(sbi->s_mb_history, 0, i); -+ /* if we can't allocate history, then we simple won't use it */ -+} -+ -+static void -+ext3_mb_store_history(struct super_block *sb, unsigned ino, -+ struct ext3_allocation_context *ac) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_mb_history h; -+ -+ if (likely(sbi->s_mb_history == NULL)) -+ return; -+ -+ h.pid = current->pid; -+ h.ino = ino; -+ h.goal = ac->ac_g_ex; -+ h.result = ac->ac_b_ex; -+ h.found = ac->ac_found; -+ h.cr = ac->ac_criteria; -+ h.groups = ac->ac_groups_scanned; -+ h.tail = ac->ac_tail; -+ h.buddy = ac->ac_buddy; -+ h.merged = 0; -+ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && -+ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) -+ h.merged = 1; -+ -+ spin_lock(&sbi->s_mb_history_lock); -+ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); -+ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) -+ sbi->s_mb_history_cur = 0; -+ spin_unlock(&sbi->s_mb_history_lock); -+} -+ -+#else -+#define ext3_mb_history_release(sb) -+#define ext3_mb_history_init(sb) -+#endif -+ -+int ext3_mb_init_backend(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, j, len, metalen; -+ int num_meta_group_infos = -+ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> -+ EXT3_DESC_PER_BLOCK_BITS(sb); -+ struct ext3_group_info **meta_group_info; -+ -+ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte -+ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. -+ * So a two level scheme suffices for now. 
*/ -+ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * -+ num_meta_group_infos, GFP_KERNEL); -+ if (sbi->s_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); -+ return -ENOMEM; -+ } -+ sbi->s_buddy_cache = new_inode(sb); -+ if (sbi->s_buddy_cache == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); -+ goto err_freesgi; -+ } -+ -+ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); -+ for (i = 0; i < num_meta_group_infos; i++) { -+ if ((i + 1) == num_meta_group_infos) -+ metalen = sizeof(*meta_group_info) * -+ (sbi->s_groups_count - -+ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); -+ meta_group_info = kmalloc(metalen, GFP_KERNEL); -+ if (meta_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " -+ "buddy group\n"); -+ goto err_freemeta; -+ } -+ sbi->s_group_info[i] = meta_group_info; -+ } -+ -+ /* -+ * calculate needed size. if change bb_counters size, -+ * don't forget about ext3_mb_generate_buddy() -+ */ -+ len = sizeof(struct ext3_group_info); -+ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ struct ext3_group_desc * desc; -+ -+ meta_group_info = -+ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; -+ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); -+ -+ meta_group_info[j] = kmalloc(len, GFP_KERNEL); -+ if (meta_group_info[j] == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); -+ i--; -+ goto err_freebuddy; -+ } -+ desc = ext3_get_group_desc(sb, i, NULL); -+ if (desc == NULL) { -+ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); -+ goto err_freebuddy; -+ } -+ memset(meta_group_info[j], 0, len); -+ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, -+ &meta_group_info[j]->bb_state); -+ meta_group_info[j]->bb_free = -+ le16_to_cpu(desc->bg_free_blocks_count); -+ } -+ -+ return 0; -+ -+err_freebuddy: -+ while (i >= 0) { -+ kfree(EXT3_GROUP_INFO(sb, i)); -+ i--; -+ } -+ i = num_meta_group_infos; 
-+err_freemeta: -+ while (--i >= 0) -+ kfree(sbi->s_group_info[i]); -+ iput(sbi->s_buddy_cache); -+err_freesgi: -+ kfree(sbi->s_group_info); -+ return -ENOMEM; -+} -+ -+int ext3_mb_init(struct super_block *sb, int needs_recovery) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct inode *root = sb->s_root->d_inode; -+ unsigned i, offset, max; -+ struct dentry *dentry; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); -+ -+ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); -+ if (sbi->s_mb_offsets == NULL) { -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ return -ENOMEM; -+ } -+ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); -+ if (sbi->s_mb_maxs == NULL) { -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ kfree(sbi->s_mb_maxs); -+ return -ENOMEM; -+ } -+ -+ /* order 0 is regular bitmap */ -+ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; -+ sbi->s_mb_offsets[0] = 0; -+ -+ i = 1; -+ offset = 0; -+ max = sb->s_blocksize << 2; -+ do { -+ sbi->s_mb_offsets[i] = offset; -+ sbi->s_mb_maxs[i] = max; -+ offset += 1 << (sb->s_blocksize_bits - i); -+ max = max >> 1; -+ i++; -+ } while (i <= sb->s_blocksize_bits + 1); -+ -+ /* init file for buddy data */ -+ if ((i = ext3_mb_init_backend(sb))) { -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ kfree(sbi->s_mb_offsets); -+ kfree(sbi->s_mb_maxs); -+ return i; -+ } -+ -+ spin_lock_init(&sbi->s_reserve_lock); -+ spin_lock_init(&sbi->s_md_lock); -+ INIT_LIST_HEAD(&sbi->s_active_transaction); -+ INIT_LIST_HEAD(&sbi->s_closed_transaction); -+ INIT_LIST_HEAD(&sbi->s_committed_transaction); -+ spin_lock_init(&sbi->s_bal_lock); -+ -+ /* remove old on-disk buddy file */ -+ down(&root->i_sem); -+ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy")); -+ if (dentry->d_inode != NULL) { -+ i = vfs_unlink(root, dentry); -+ if (i != 0) -+ printk("EXT3-fs: can't remove .buddy file: %d\n", i); -+ } -+ dput(dentry); -+ up(&root->i_sem); -+ -+ ext3_mb_history_init(sb); -+ -+ printk("EXT3-fs: mballoc 
enabled\n"); -+ return 0; -+} -+ -+int ext3_mb_release(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, num_meta_group_infos; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ /* release freed, non-committed blocks */ -+ spin_lock(&sbi->s_md_lock); -+ list_splice_init(&sbi->s_closed_transaction, -+ &sbi->s_committed_transaction); -+ list_splice_init(&sbi->s_active_transaction, -+ &sbi->s_committed_transaction); -+ spin_unlock(&sbi->s_md_lock); -+ ext3_mb_free_committed_blocks(sb); -+ -+ if (sbi->s_group_info) { -+ for (i = 0; i < sbi->s_groups_count; i++) -+ kfree(EXT3_GROUP_INFO(sb, i)); -+ num_meta_group_infos = (sbi->s_groups_count + -+ EXT3_DESC_PER_BLOCK(sb) - 1) >> -+ EXT3_DESC_PER_BLOCK_BITS(sb); -+ for (i = 0; i < num_meta_group_infos; i++) -+ kfree(sbi->s_group_info[i]); -+ kfree(sbi->s_group_info); -+ } -+ if (sbi->s_mb_offsets) -+ kfree(sbi->s_mb_offsets); -+ if (sbi->s_mb_maxs) -+ kfree(sbi->s_mb_maxs); -+ if (sbi->s_buddy_cache) -+ iput(sbi->s_buddy_cache); -+ if (sbi->s_blocks_reserved) -+ printk("ext3-fs: %ld blocks being reserved at umount!\n", -+ sbi->s_blocks_reserved); -+ if (ext3_mb_stats) { -+ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", -+ atomic_read(&sbi->s_bal_allocated), -+ atomic_read(&sbi->s_bal_reqs), -+ atomic_read(&sbi->s_bal_success)); -+ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " -+ "%u 2^N hits, %u breaks\n", -+ atomic_read(&sbi->s_bal_ex_scanned), -+ atomic_read(&sbi->s_bal_goals), -+ atomic_read(&sbi->s_bal_2orders), -+ atomic_read(&sbi->s_bal_breaks)); -+ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", -+ sbi->s_mb_buddies_generated++, -+ sbi->s_mb_generation_time); -+ } -+ -+ ext3_mb_history_release(sb); -+ -+ return 0; -+} -+ -+void ext3_mb_free_committed_blocks(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int err, i, count = 0, count2 = 0; -+ struct ext3_free_metadata *md; -+ struct ext3_buddy e3b; -+ -+ if 
(list_empty(&sbi->s_committed_transaction)) -+ return; -+ -+ /* there is committed blocks to be freed yet */ -+ do { -+ /* get next array of blocks */ -+ md = NULL; -+ spin_lock(&sbi->s_md_lock); -+ if (!list_empty(&sbi->s_committed_transaction)) { -+ md = list_entry(sbi->s_committed_transaction.next, -+ struct ext3_free_metadata, list); -+ list_del(&md->list); -+ } -+ spin_unlock(&sbi->s_md_lock); -+ -+ if (md == NULL) -+ break; -+ -+ mb_debug("gonna free %u blocks in group %u (0x%p):", -+ md->num, md->group, md); -+ -+ err = ext3_mb_load_buddy(sb, md->group, &e3b); -+ /* we expect to find existing buddy because it's pinned */ -+ BUG_ON(err != 0); -+ -+ /* there are blocks to put in buddy to make them really free */ -+ count += md->num; -+ count2++; -+ ext3_lock_group(sb, md->group); -+ for (i = 0; i < md->num; i++) { -+ mb_debug(" %u", md->blocks[i]); -+ mb_free_blocks(&e3b, md->blocks[i], 1); -+ } -+ mb_debug("\n"); -+ ext3_unlock_group(sb, md->group); -+ -+ /* balance refcounts from ext3_mb_free_metadata() */ -+ page_cache_release(e3b.bd_buddy_page); -+ page_cache_release(e3b.bd_bitmap_page); -+ -+ kfree(md); -+ ext3_mb_release_desc(&e3b); -+ -+ } while (md); -+ mb_debug("freed %u blocks in %u structures\n", count, count2); -+} -+ -+void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ if (sbi->s_last_transaction == handle->h_transaction->t_tid) -+ return; -+ -+ /* new transaction! time to close last one and free blocks for -+ * committed transaction. we know that only transaction can be -+ * active, so previos transaction can be being logged and we -+ * know that transaction before previous is known to be already -+ * logged. this means that now we may free blocks freed in all -+ * transactions before previous one. hope I'm clear enough ... 
*/ -+ -+ spin_lock(&sbi->s_md_lock); -+ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { -+ mb_debug("new transaction %lu, old %lu\n", -+ (unsigned long) handle->h_transaction->t_tid, -+ (unsigned long) sbi->s_last_transaction); -+ list_splice_init(&sbi->s_closed_transaction, -+ &sbi->s_committed_transaction); -+ list_splice_init(&sbi->s_active_transaction, -+ &sbi->s_closed_transaction); -+ sbi->s_last_transaction = handle->h_transaction->t_tid; -+ } -+ spin_unlock(&sbi->s_md_lock); -+ -+ ext3_mb_free_committed_blocks(sb); -+} -+ -+int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, -+ int group, int block, int count) -+{ -+ struct ext3_group_info *db = e3b->bd_info; -+ struct super_block *sb = e3b->bd_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_free_metadata *md; -+ int i; -+ -+ J_ASSERT(e3b->bd_bitmap_page != NULL); -+ J_ASSERT(e3b->bd_buddy_page != NULL); -+ -+ ext3_lock_group(sb, group); -+ for (i = 0; i < count; i++) { -+ md = db->bb_md_cur; -+ if (md && db->bb_tid != handle->h_transaction->t_tid) { -+ db->bb_md_cur = NULL; -+ md = NULL; -+ } -+ -+ if (md == NULL) { -+ ext3_unlock_group(sb, group); -+ md = kmalloc(sizeof(*md), GFP_KERNEL); -+ if (md == NULL) -+ return -ENOMEM; -+ md->num = 0; -+ md->group = group; -+ -+ ext3_lock_group(sb, group); -+ if (db->bb_md_cur == NULL) { -+ spin_lock(&sbi->s_md_lock); -+ list_add(&md->list, &sbi->s_active_transaction); -+ spin_unlock(&sbi->s_md_lock); -+ /* protect buddy cache from being freed, -+ * otherwise we'll refresh it from -+ * on-disk bitmap and lose not-yet-available -+ * blocks */ -+ page_cache_get(e3b->bd_buddy_page); -+ page_cache_get(e3b->bd_bitmap_page); -+ db->bb_md_cur = md; -+ db->bb_tid = handle->h_transaction->t_tid; -+ mb_debug("new md 0x%p for group %u\n", -+ md, md->group); -+ } else { -+ kfree(md); -+ md = db->bb_md_cur; -+ } -+ } -+ -+ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); -+ md->blocks[md->num] = block + i; -+ md->num++; -+ if (md->num == 
EXT3_BB_MAX_BLOCKS) { -+ /* no more space, put full container on a sb's list */ -+ db->bb_md_cur = NULL; -+ } -+ } -+ ext3_unlock_group(sb, group); -+ return 0; -+} -+ -+void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, -+ unsigned long block, unsigned long count, -+ int metadata, int *freed) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_group_desc *gdp; -+ struct ext3_super_block *es; -+ unsigned long bit, overflow; -+ struct buffer_head *gd_bh; -+ unsigned long block_group; -+ struct ext3_sb_info *sbi; -+ struct super_block *sb; -+ struct ext3_buddy e3b; -+ int err = 0, ret; -+ -+ *freed = 0; -+ sb = inode->i_sb; -+ if (!sb) { -+ printk ("ext3_free_blocks: nonexistent device"); -+ return; -+ } -+ -+ ext3_mb_poll_new_transaction(sb, handle); -+ -+ sbi = EXT3_SB(sb); -+ es = EXT3_SB(sb)->s_es; -+ if (block < le32_to_cpu(es->s_first_data_block) || -+ block + count < block || -+ block + count > le32_to_cpu(es->s_blocks_count)) { -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks not in datazone - " -+ "block = %lu, count = %lu", block, count); -+ goto error_return; -+ } -+ -+ ext3_debug("freeing block %lu\n", block); -+ -+do_more: -+ overflow = 0; -+ block_group = (block - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ bit = (block - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb); -+ /* -+ * Check to see if we are freeing blocks across a group -+ * boundary. 
-+ */ -+ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { -+ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); -+ count -= overflow; -+ } -+ brelse(bitmap_bh); -+ bitmap_bh = read_block_bitmap(sb, block_group); -+ if (!bitmap_bh) -+ goto error_return; -+ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); -+ if (!gdp) -+ goto error_return; -+ -+ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || -+ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || -+ in_range (block, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group) || -+ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group)) -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks in system zones - " -+ "Block = %lu, count = %lu", -+ block, count); -+ -+ BUFFER_TRACE(bitmap_bh, "getting write access"); -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) -+ goto error_return; -+ -+ /* -+ * We are about to modify some metadata. Call the journal APIs -+ * to unshare ->b_data if a currently-committing transaction is -+ * using it -+ */ -+ BUFFER_TRACE(gd_bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, gd_bh); -+ if (err) -+ goto error_return; -+ -+ err = ext3_mb_load_buddy(sb, block_group, &e3b); -+ if (err) -+ goto error_return; -+ -+#ifdef AGGRESSIVE_CHECK -+ { -+ int i; -+ for (i = 0; i < count; i++) -+ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); -+ } -+#endif -+ mb_clear_bits(bitmap_bh->b_data, bit, count); -+ -+ /* We dirtied the bitmap block */ -+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ -+ if (metadata) { -+ /* blocks being freed are metadata. 
these blocks shouldn't -+ * be used until this transaction is committed */ -+ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); -+ } else { -+ ext3_lock_group(sb, block_group); -+ mb_free_blocks(&e3b, bit, count); -+ ext3_unlock_group(sb, block_group); -+ } -+ -+ spin_lock(sb_bgl_lock(sbi, block_group)); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); -+ spin_unlock(sb_bgl_lock(sbi, block_group)); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, count); -+ -+ ext3_mb_release_desc(&e3b); -+ -+ *freed = count; -+ -+ /* And the group descriptor block */ -+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); -+ ret = ext3_journal_dirty_metadata(handle, gd_bh); -+ if (!err) err = ret; -+ -+ if (overflow && !err) { -+ block += count; -+ count = overflow; -+ goto do_more; -+ } -+ sb->s_dirt = 1; -+error_return: -+ brelse(bitmap_bh); -+ ext3_std_error(sb, err); -+ return; -+} -+ -+int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int free, ret = -ENOSPC; -+ -+ BUG_ON(blocks < 0); -+ spin_lock(&sbi->s_reserve_lock); -+ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); -+ if (blocks <= free - sbi->s_blocks_reserved) { -+ sbi->s_blocks_reserved += blocks; -+ ret = 0; -+ } -+ spin_unlock(&sbi->s_reserve_lock); -+ return ret; -+} -+ -+void ext3_mb_release_blocks(struct super_block *sb, int blocks) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ BUG_ON(blocks < 0); -+ spin_lock(&sbi->s_reserve_lock); -+ sbi->s_blocks_reserved -= blocks; -+ WARN_ON(sbi->s_blocks_reserved < 0); -+ if (sbi->s_blocks_reserved < 0) -+ sbi->s_blocks_reserved = 0; -+ spin_unlock(&sbi->s_reserve_lock); -+} -+ -+int ext3_new_block(handle_t *handle, struct inode *inode, -+ unsigned long goal, int *errp) -+{ -+ int ret, len; -+ -+ if (!test_opt(inode->i_sb, MBALLOC)) { -+ ret = ext3_new_block_old(handle, inode, goal, errp); -+ goto out; -+ } -+ len = 1; -+ ret = 
ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); -+out: -+ return ret; -+} -+ -+ -+void ext3_free_blocks(handle_t *handle, struct inode * inode, -+ unsigned long block, unsigned long count, int metadata) -+{ -+ struct super_block *sb; -+ int freed; -+ -+ sb = inode->i_sb; -+ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) -+ ext3_free_blocks_sb(handle, sb, block, count, &freed); -+ else -+ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); -+ if (freed) -+ DQUOT_FREE_BLOCK(inode, freed); -+ return; -+} -+ -+#define EXT3_ROOT "ext3" -+#define EXT3_MB_STATS_NAME "mb_stats" -+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" -+#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" -+#define EXT3_MB_ORDER2_REQ "mb_order2_req" -+ -+static int ext3_mb_stats_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_stats); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_stats_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_STATS_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); -+ return count; -+} -+ -+static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ 
printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ ext3_mb_max_to_scan = value; -+ -+ return count; -+} -+ -+static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ ext3_mb_min_to_scan = value; -+ -+ return count; -+} -+ -+static int ext3_mb_order2_req_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_order2_req_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ 
-+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ ext3_mb_order2_reqs = value; -+ -+ return count; -+} -+ -+int __init init_ext3_proc(void) -+{ -+ struct proc_dir_entry *proc_ext3_mb_stats; -+ struct proc_dir_entry *proc_ext3_mb_max_to_scan; -+ struct proc_dir_entry *proc_ext3_mb_min_to_scan; -+ struct proc_dir_entry *proc_ext3_mb_order2_req; -+ -+ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); -+ if (proc_root_ext3 == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); -+ return -EIO; -+ } -+ -+ /* Initialize EXT3_MB_STATS_NAME */ -+ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_stats == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_STATS_NAME); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_stats->data = NULL; -+ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; -+ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; -+ -+ /* Initialize EXT3_MAX_TO_SCAN_NAME */ -+ proc_ext3_mb_max_to_scan = create_proc_entry( -+ EXT3_MB_MAX_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_max_to_scan == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_MAX_TO_SCAN_NAME); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_max_to_scan->data = NULL; -+ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; -+ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; -+ -+ /* Initialize EXT3_MIN_TO_SCAN_NAME */ -+ proc_ext3_mb_min_to_scan = create_proc_entry( -+ EXT3_MB_MIN_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_min_to_scan == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_MIN_TO_SCAN_NAME); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, 
proc_root_ext3); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_min_to_scan->data = NULL; -+ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; -+ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; -+ -+ /* Initialize EXT3_ORDER2_REQ */ -+ proc_ext3_mb_order2_req = create_proc_entry( -+ EXT3_MB_ORDER2_REQ, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_order2_req == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_ORDER2_REQ); -+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_order2_req->data = NULL; -+ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; -+ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; -+ -+ return 0; -+} -+ -+void exit_ext3_proc(void) -+{ -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+} -Index: linux-2.6.12.6-bull/fs/ext3/Makefile -=================================================================== ---- linux-2.6.12.6-bull.orig/fs/ext3/Makefile 2006-04-29 20:39:09.000000000 +0400 -+++ linux-2.6.12.6-bull/fs/ext3/Makefile 2006-04-29 20:39:10.000000000 +0400 -@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o \ -- extents.o -+ extents.o mballoc.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff 
--git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch deleted file mode 100644 index 702dfcc..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch +++ /dev/null @@ -1,3140 +0,0 @@ -Index: linux-stage/include/linux/ext3_fs.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs.h 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/include/linux/ext3_fs.h 2006-07-16 02:29:49.000000000 +0800 -@@ -53,6 +53,14 @@ - #define ext3_debug(f, a...) do {} while (0) - #endif - -+#define EXT3_MULTIBLOCK_ALLOCATOR 1 -+ -+#define EXT3_MB_HINT_MERGE 1 -+#define EXT3_MB_HINT_RESERVED 2 -+#define EXT3_MB_HINT_METADATA 4 -+#define EXT3_MB_HINT_FIRST 8 -+#define EXT3_MB_HINT_BEST 16 -+ - /* - * Special inodes numbers - */ -@@ -379,6 +387,7 @@ struct ext3_inode { - #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x4000000/* Buddy allocation support */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -405,6 +413,14 @@ - #define ext3_find_first_zero_bit ext2_find_first_zero_bit - #define ext3_find_next_zero_bit ext2_find_next_zero_bit - -+#ifndef ext2_find_next_le_bit -+#ifdef __LITTLE_ENDIAN -+#define ext2_find_next_le_bit(addr, size, off) find_next_bit((addr), (size), (off)) -+#else -+#error "mballoc needs a patch for big-endian systems - CFS bug 10634" -+#endif /* __LITTLE_ENDIAN */ -+#endif /* !ext2_find_next_le_bit */ -+ - /* - * Maximal mount counts between two filesystem checks - */ -@@ -749,12 +758,12 @@ ext3_group_first_block_no(struct super_b - /* balloc.c */ - extern int ext3_bg_has_super(struct super_block *sb, int group); - extern unsigned long ext3_bg_num_gdb(struct super_block 
*sb, int group); --extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, -+extern ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, int *errp); - extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, unsigned long *count, int *errp); - extern void ext3_free_blocks (handle_t *handle, struct inode *inode, -- ext3_fsblk_t block, unsigned long count); -+ ext3_fsblk_t block, unsigned long count, int metadata); - extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb, - ext3_fsblk_t block, unsigned long count, - unsigned long *pdquot_freed_blocks); -@@ -881,6 +890,21 @@ extern void ext3_extents_initialize_bloc - extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg); - -+/* mballoc.c */ -+extern long ext3_mb_stats; -+extern long ext3_mb_max_to_scan; -+extern int ext3_mb_init(struct super_block *sb, int needs_recovery); -+extern int ext3_mb_release(struct super_block *sb); -+extern ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, -+ ext3_fsblk_t goal, int *errp); -+extern ext3_fsblk_t ext3_mb_new_blocks(handle_t *handle, struct inode *inode, -+ ext3_fsblk_t goal, int *len, int flags, -+ int *errp); -+extern int ext3_mb_reserve_blocks(struct super_block *sb, int); -+extern void ext3_mb_release_blocks(struct super_block *sb, int); -+int __init init_ext3_proc(void); -+void exit_ext3_proc(void); -+ - #endif /* __KERNEL__ */ - - /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ -Index: linux-stage/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/include/linux/ext3_fs_sb.h 2006-07-16 02:29:49.000000000 +0800 -@@ -21,8 +21,14 @@ - #include - #include - #include -+#include - #endif - #include -+#include -+ -+struct ext3_buddy_group_blocks; -+struct ext3_mb_history; -+#define EXT3_BB_MAX_BLOCKS - - /* - * third extended-fs super-block data in memory -@@ -78,6 +84,43 @@ struct ext3_sb_info { - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ - #endif -+ -+ /* for buddy allocator */ -+ struct ext3_group_info ***s_group_info; -+ struct inode *s_buddy_cache; -+ long s_blocks_reserved; -+ spinlock_t s_reserve_lock; -+ struct list_head s_active_transaction; -+ struct list_head s_closed_transaction; -+ struct list_head s_committed_transaction; -+ spinlock_t s_md_lock; -+ tid_t s_last_transaction; -+ int s_mb_factor; -+ unsigned short *s_mb_offsets, *s_mb_maxs; -+ unsigned long s_stripe; -+ -+ /* history to debug policy */ -+ struct ext3_mb_history *s_mb_history; -+ int s_mb_history_cur; -+ int s_mb_history_max; -+ struct proc_dir_entry *s_mb_proc; -+ spinlock_t s_mb_history_lock; -+ -+ /* stats for buddy allocator */ -+ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ -+ atomic_t s_bal_success; /* we found long enough chunks */ -+ atomic_t s_bal_allocated; /* in blocks */ -+ atomic_t s_bal_ex_scanned; /* total extents scanned */ -+ atomic_t s_bal_goals; /* goal hits */ -+ atomic_t s_bal_breaks; /* too long searches */ -+ atomic_t s_bal_2orders; /* 2^order hits */ -+ spinlock_t s_bal_lock; -+ unsigned long s_mb_buddies_generated; -+ unsigned long long s_mb_generation_time; - }; -+ -+#define EXT3_GROUP_INFO(sb, group) \ -+ EXT3_SB(sb)->s_group_info[(group) >> EXT3_DESC_PER_BLOCK_BITS(sb)] \ -+ [(group) & 
(EXT3_DESC_PER_BLOCK(sb) - 1)] - - #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-stage/fs/ext3/super.c -=================================================================== ---- linux-stage.orig/fs/ext3/super.c 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/fs/ext3/super.c 2006-07-16 02:29:49.000000000 +0800 -@@ -391,6 +391,7 @@ static void ext3_put_super (struct super - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_mb_release(sb); - ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); -@@ -642,6 +643,7 @@ enum { - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, - Opt_extents, Opt_noextents, Opt_extdebug, -+ Opt_mballoc, Opt_nomballoc, Opt_stripe, - Opt_grpquota - }; - -@@ -696,6 +697,9 @@ static match_table_t tokens = { - {Opt_extents, "extents"}, - {Opt_noextents, "noextents"}, - {Opt_extdebug, "extdebug"}, -+ {Opt_mballoc, "mballoc"}, -+ {Opt_nomballoc, "nomballoc"}, -+ {Opt_stripe, "stripe=%u"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL}, - {Opt_resize, "resize"}, -@@ -1047,6 +1049,19 @@ clear_qf_name: - case Opt_extdebug: - set_opt (sbi->s_mount_opt, EXTDEBUG); - break; -+ case Opt_mballoc: -+ set_opt(sbi->s_mount_opt, MBALLOC); -+ break; -+ case Opt_nomballoc: -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ break; -+ case Opt_stripe: -+ if (match_int(&args[0], &option)) -+ return 0; -+ if (option < 0) -+ return 0; -+ sbi->s_stripe = option; -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1773,6 +1778,7 @@ static int ext3_fill_super (struct super - "writeback"); - - ext3_ext_init(sb); -+ ext3_mb_init(sb, needs_recovery); - lock_kernel(); - return 0; - -@@ -2712,7 +2718,13 @@ static struct file_system_type ext3_fs_t - - static int __init init_ext3_fs(void) - { -- int err = init_ext3_xattr(); -+ int err; -+ -+ err = init_ext3_proc(); -+ if (err) -+ return err; -+ -+ err = init_ext3_xattr(); - if (err) - return 
err; - err = init_inodecache(); -@@ -2734,6 +2746,7 @@ static void __exit exit_ext3_fs(void) - unregister_filesystem(&ext3_fs_type); - destroy_inodecache(); - exit_ext3_xattr(); -+ exit_ext3_proc(); - } - - int ext3_prep_san_write(struct inode *inode, long *blocks, -Index: linux-stage/fs/ext3/extents.c -=================================================================== ---- linux-stage.orig/fs/ext3/extents.c 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/fs/ext3/extents.c 2006-07-16 02:29:49.000000000 +0800 -@@ -771,7 +771,7 @@ cleanup: - for (i = 0; i < depth; i++) { - if (!ablocks[i]) - continue; -- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); -+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); - } - } - kfree(ablocks); -@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st - path->p_idx->ei_leaf); - bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); - ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); -- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); -+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); - return err; - } - -@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t - int needed = ext3_remove_blocks_credits(tree, ex, from, to); - handle_t *handle = ext3_journal_start(tree->inode, needed); - struct buffer_head *bh; -- int i; -+ int i, metadata = 0; - - if (IS_ERR(handle)) - return PTR_ERR(handle); -+ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) -+ metadata = 1; - if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { - /* tail removal */ - unsigned long num, start; -@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t - bh = sb_find_get_block(tree->inode->i_sb, start + i); - ext3_forget(handle, 0, tree->inode, bh, start + i); - } -- ext3_free_blocks(handle, tree->inode, start, num); -+ ext3_free_blocks(handle, tree->inode, start, num, metadata); - } else if (from == ex->ee_block && to <= ex->ee_block + 
ex->ee_len - 1) { - printk("strange request: removal %lu-%lu from %u:%u\n", - from, to, ex->ee_block, ex->ee_len); -Index: linux-stage/fs/ext3/inode.c -=================================================================== ---- linux-stage.orig/fs/ext3/inode.c 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/fs/ext3/inode.c 2006-07-16 02:29:49.000000000 +0800 -@@ -562,7 +562,7 @@ static int ext3_alloc_blocks(handle_t *h - return ret; - failed_out: - for (i = 0; i i_sb; -- if (!sb) { -- printk ("ext3_free_blocks: nonexistent device"); -- return; -- } -- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); -- if (dquot_freed_blocks) -- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); -- return; --} -- - /* - * For ext3 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This -@@ -1463,7 +1445,7 @@ out: - return 0; - } - --ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, -+ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, int *errp) - { - unsigned long count = 1; -Index: linux-stage/fs/ext3/xattr.c -=================================================================== ---- linux-stage.orig/fs/ext3/xattr.c 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/fs/ext3/xattr.c 2006-07-16 02:29:49.000000000 +0800 -@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl - ea_bdebug(bh, "refcount now=0; freeing"); - if (ce) - mb_cache_entry_free(ce); -- ext3_free_blocks(handle, inode, bh->b_blocknr, 1); -+ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); - get_bh(bh); - ext3_forget(handle, 1, inode, bh, bh->b_blocknr); - } else { -@@ -805,7 +805,7 @@ inserted: - new_bh = sb_getblk(sb, block); - if (!new_bh) { - getblk_failed: -- ext3_free_blocks(handle, inode, block, 1); -+ ext3_free_blocks(handle, inode, block, 1, 1); - error = -EIO; - goto cleanup; - } -Index: linux-stage/fs/ext3/mballoc.c 
-=================================================================== ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ linux-stage/fs/ext3/mballoc.c 2006-07-16 02:29:49.000000000 +0800 -@@ -0,0 +1,2730 @@ -+/* -+ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+ -+/* -+ * mballoc.c contains the multiblocks allocation routines -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * TODO: -+ * - bitmap read-ahead (proposed by Oleg Drokin aka green) -+ * - track min/max extents in each group for better group selection -+ * - mb_mark_used() may allocate chunk right after splitting buddy -+ * - special flag to advice allocator to look for requested + N blocks -+ * this may improve interaction between extents and mballoc -+ * - tree of groups sorted by number of free blocks -+ * - percpu reservation code (hotpath) -+ * - error handling -+ */ -+ -+/* -+ * with AGRESSIVE_CHECK allocator runs consistency checks over -+ * structures. these checks slow things down a lot -+ */ -+#define AGGRESSIVE_CHECK__ -+ -+/* -+ */ -+#define MB_DEBUG__ -+#ifdef MB_DEBUG -+#define mb_debug(fmt,a...) printk(fmt, ##a) -+#else -+#define mb_debug(fmt,a...) 
-+#endif -+ -+/* -+ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory -+ * and you can monitor it in /proc/fs/ext3//mb_history -+ */ -+#define EXT3_MB_HISTORY -+ -+/* -+ * How long mballoc can look for a best extent (in found extents) -+ */ -+long ext3_mb_max_to_scan = 500; -+ -+/* -+ * How long mballoc must look for a best extent -+ */ -+long ext3_mb_min_to_scan = 30; -+ -+/* -+ * with 'ext3_mb_stats' allocator will collect stats that will be -+ * shown at umount. The collecting costs though! -+ */ -+ -+long ext3_mb_stats = 1; -+ -+/* -+ * for which requests use 2^N search using buddies -+ */ -+long ext3_mb_order2_reqs = 8; -+ -+#ifdef EXT3_BB_MAX_BLOCKS -+#undef EXT3_BB_MAX_BLOCKS -+#endif -+#define EXT3_BB_MAX_BLOCKS 30 -+ -+struct ext3_free_metadata { -+ unsigned short group; -+ unsigned short num; -+ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; -+ struct list_head list; -+}; -+ -+struct ext3_group_info { -+ unsigned long bb_state; -+ unsigned long bb_tid; -+ struct ext3_free_metadata *bb_md_cur; -+ unsigned short bb_first_free; -+ unsigned short bb_free; -+ unsigned short bb_fragments; -+ unsigned short bb_counters[]; -+}; -+ -+ -+#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 -+#define EXT3_GROUP_INFO_LOCKED_BIT 1 -+ -+#define EXT3_MB_GRP_NEED_INIT(grp) \ -+ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) -+ -+struct ext3_free_extent { -+ __u16 fe_start; -+ __u16 fe_len; -+ __u16 fe_group; -+}; -+ -+struct ext3_allocation_context { -+ struct super_block *ac_sb; -+ -+ /* search goals */ -+ struct ext3_free_extent ac_g_ex; -+ -+ /* the best found extent */ -+ struct ext3_free_extent ac_b_ex; -+ -+ /* number of iterations done. 
we have to track to limit searching */ -+ unsigned long ac_ex_scanned; -+ __u16 ac_groups_scanned; -+ __u16 ac_found; -+ __u16 ac_tail; -+ __u16 ac_buddy; -+ __u8 ac_status; -+ __u8 ac_flags; /* allocation hints */ -+ __u8 ac_criteria; -+ __u8 ac_repeats; -+ __u8 ac_2order; /* if request is to allocate 2^N blocks and -+ * N > 0, the field stores N, otherwise 0 */ -+ -+ struct page *ac_buddy_page; -+ struct page *ac_bitmap_page; -+}; -+ -+#define AC_STATUS_CONTINUE 1 -+#define AC_STATUS_FOUND 2 -+#define AC_STATUS_BREAK 3 -+ -+struct ext3_mb_history { -+ struct ext3_free_extent goal; /* goal allocation */ -+ struct ext3_free_extent result; /* result allocation */ -+ unsigned pid; -+ unsigned ino; -+ __u16 found; /* how many extents have been found */ -+ __u16 groups; /* how many groups have been scanned */ -+ __u16 tail; /* what tail broke some buddy */ -+ __u16 buddy; /* buddy the tail ^^^ broke */ -+ __u8 cr; /* which phase the result extent was found at */ -+ __u8 merged; -+}; -+ -+struct ext3_buddy { -+ struct page *bd_buddy_page; -+ void *bd_buddy; -+ struct page *bd_bitmap_page; -+ void *bd_bitmap; -+ struct ext3_group_info *bd_info; -+ struct super_block *bd_sb; -+ __u16 bd_blkbits; -+ __u16 bd_group; -+}; -+#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) -+#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) -+ -+#ifndef EXT3_MB_HISTORY -+#define ext3_mb_store_history(sb,ino,ac) -+#else -+static void ext3_mb_store_history(struct super_block *, unsigned ino, -+ struct ext3_allocation_context *ac); -+#endif -+ -+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) -+ -+static struct proc_dir_entry *proc_root_ext3; -+ -+int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); -+struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); -+int ext3_mb_reserve_blocks(struct super_block *, int); -+void ext3_mb_release_blocks(struct super_block *, int); -+void ext3_mb_poll_new_transaction(struct super_block *, 
handle_t *); -+void ext3_mb_free_committed_blocks(struct super_block *); -+ -+#if BITS_PER_LONG == 64 -+#define mb_correct_addr_and_bit(bit,addr) \ -+{ \ -+ bit += ((unsigned long) addr & 7UL) << 3; \ -+ addr = (void *) ((unsigned long) addr & ~7UL); \ -+} -+#elif BITS_PER_LONG == 32 -+#define mb_correct_addr_and_bit(bit,addr) \ -+{ \ -+ bit += ((unsigned long) addr & 3UL) << 3; \ -+ addr = (void *) ((unsigned long) addr & ~3UL); \ -+} -+#else -+#error "how many bits you are?!" -+#endif -+ -+static inline int mb_test_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ return ext2_test_bit(bit, addr); -+} -+ -+static inline void mb_set_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit(bit, addr); -+} -+ -+static inline void mb_set_bit_atomic(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit_atomic(NULL, bit, addr); -+} -+ -+static inline void mb_clear_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit(bit, addr); -+} -+ -+static inline void mb_clear_bit_atomic(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit_atomic(NULL, bit, addr); -+} -+ -+static inline int mb_find_next_zero_bit(void *addr, int max, int start) -+{ -+ int fix; -+#if BITS_PER_LONG == 64 -+ fix = ((unsigned long) addr & 7UL) << 3; -+ addr = (void *) ((unsigned long) addr & ~7UL); -+#elif BITS_PER_LONG == 32 -+ fix = ((unsigned long) addr & 3UL) << 3; -+ addr = (void *) ((unsigned long) addr & ~3UL); -+#else -+#error "how many bits you are?!" 
-+#endif -+ max += fix; -+ start += fix; -+ return ext2_find_next_zero_bit(addr, max, start) - fix; -+} -+ -+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) -+{ -+ char *bb; -+ -+ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); -+ J_ASSERT(max != NULL); -+ -+ if (order > e3b->bd_blkbits + 1) { -+ *max = 0; -+ return NULL; -+ } -+ -+ /* at order 0 we see each particular block */ -+ *max = 1 << (e3b->bd_blkbits + 3); -+ if (order == 0) -+ return EXT3_MB_BITMAP(e3b); -+ -+ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; -+ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; -+ -+ return bb; -+} -+ -+#ifdef AGGRESSIVE_CHECK -+ -+static void mb_check_buddy(struct ext3_buddy *e3b) -+{ -+ int order = e3b->bd_blkbits + 1; -+ int max, max2, i, j, k, count; -+ int fragments = 0, fstart; -+ void *buddy, *buddy2; -+ -+ if (!test_opt(e3b->bd_sb, MBALLOC)) -+ return; -+ -+ { -+ static int mb_check_counter = 0; -+ if (mb_check_counter++ % 300 != 0) -+ return; -+ } -+ -+ while (order > 1) { -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); -+ buddy2 = mb_find_buddy(e3b, order - 1, &max2); -+ J_ASSERT(buddy2); -+ J_ASSERT(buddy != buddy2); -+ J_ASSERT(max * 2 == max2); -+ -+ count = 0; -+ for (i = 0; i < max; i++) { -+ -+ if (mb_test_bit(i, buddy)) { -+ /* only single bit in buddy2 may be 1 */ -+ if (!mb_test_bit(i << 1, buddy2)) -+ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); -+ else if (!mb_test_bit((i << 1) + 1, buddy2)) -+ J_ASSERT(mb_test_bit(i << 1, buddy2)); -+ continue; -+ } -+ -+ /* both bits in buddy2 must be 0 */ -+ J_ASSERT(mb_test_bit(i << 1, buddy2)); -+ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); -+ -+ for (j = 0; j < (1 << order); j++) { -+ k = (i * (1 << order)) + j; -+ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); -+ } -+ count++; -+ } -+ J_ASSERT(e3b->bd_info->bb_counters[order] == count); -+ order--; -+ } -+ -+ fstart = -1; -+ buddy = mb_find_buddy(e3b, 0, &max); -+ for (i = 0; i < max; i++) { -+ if 
(!mb_test_bit(i, buddy)) { -+ J_ASSERT(i >= e3b->bd_info->bb_first_free); -+ if (fstart == -1) { -+ fragments++; -+ fstart = i; -+ } -+ continue; -+ } -+ fstart = -1; -+ /* check used bits only */ -+ for (j = 0; j < e3b->bd_blkbits + 1; j++) { -+ buddy2 = mb_find_buddy(e3b, j, &max2); -+ k = i >> j; -+ J_ASSERT(k < max2); -+ J_ASSERT(mb_test_bit(k, buddy2)); -+ } -+ } -+ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); -+ J_ASSERT(e3b->bd_info->bb_fragments == fragments); -+} -+ -+#else -+#define mb_check_buddy(e3b) -+#endif -+ -+/* find most significant bit */ -+static int inline fmsb(unsigned short word) -+{ -+ int order; -+ -+ if (word > 255) { -+ order = 7; -+ word >>= 8; -+ } else { -+ order = -1; -+ } -+ -+ do { -+ order++; -+ word >>= 1; -+ } while (word != 0); -+ -+ return order; -+} -+ -+static void inline -+ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, -+ int len, struct ext3_group_info *grp) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ unsigned short min, max, chunk, border; -+ -+ mb_debug("mark %u/%u free\n", first, len); -+ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); -+ -+ border = 2 << sb->s_blocksize_bits; -+ -+ while (len > 0) { -+ /* find how many blocks can be covered since this position */ -+ max = ffs(first | border) - 1; -+ -+ /* find how many blocks of power 2 we need to mark */ -+ min = fmsb(len); -+ -+ mb_debug(" %u/%u -> max %u, min %u\n", -+ first & ((2 << sb->s_blocksize_bits) - 1), -+ len, max, min); -+ -+ if (max < min) -+ min = max; -+ chunk = 1 << min; -+ -+ /* mark multiblock chunks only */ -+ grp->bb_counters[min]++; -+ if (min > 0) { -+ mb_debug(" set %u at %u \n", first >> min, -+ sbi->s_mb_offsets[min]); -+ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); -+ } -+ -+ len -= chunk; -+ first += chunk; -+ } -+} -+ -+static void -+ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, -+ int group) -+{ -+ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); -+ 
unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); -+ unsigned short i = 0, first, len; -+ unsigned free = 0, fragments = 0; -+ unsigned long long period = get_cycles(); -+ -+ i = mb_find_next_zero_bit(bitmap, max, 0); -+ grp->bb_first_free = i; -+ while (i < max) { -+ fragments++; -+ first = i; -+ i = ext2_find_next_le_bit(bitmap, max, i); -+ len = i - first; -+ free += len; -+ if (len > 1) -+ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); -+ else -+ grp->bb_counters[0]++; -+ if (i < max) -+ i = mb_find_next_zero_bit(bitmap, max, i); -+ } -+ grp->bb_fragments = fragments; -+ -+ /* bb_state shouldn't being modified because all -+ * others waits for init completion on page lock */ -+ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); -+ if (free != grp->bb_free) { -+ printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n", -+ group, free, grp->bb_free); -+ grp->bb_free = free; -+ } -+ -+ period = get_cycles() - period; -+ spin_lock(&EXT3_SB(sb)->s_bal_lock); -+ EXT3_SB(sb)->s_mb_buddies_generated++; -+ EXT3_SB(sb)->s_mb_generation_time += period; -+ spin_unlock(&EXT3_SB(sb)->s_bal_lock); -+} -+ -+static int ext3_mb_init_cache(struct page *page) -+{ -+ int blocksize, blocks_per_page, groups_per_page; -+ int err = 0, i, first_group, first_block; -+ struct super_block *sb; -+ struct buffer_head *bhs; -+ struct buffer_head **bh; -+ struct inode *inode; -+ char *data, *bitmap; -+ -+ mb_debug("init page %lu\n", page->index); -+ -+ inode = page->mapping->host; -+ sb = inode->i_sb; -+ blocksize = 1 << inode->i_blkbits; -+ blocks_per_page = PAGE_CACHE_SIZE / blocksize; -+ -+ groups_per_page = blocks_per_page >> 1; -+ if (groups_per_page == 0) -+ groups_per_page = 1; -+ -+ /* allocate buffer_heads to read bitmaps */ -+ if (groups_per_page > 1) { -+ err = -ENOMEM; -+ i = sizeof(struct buffer_head *) * groups_per_page; -+ bh = kmalloc(i, GFP_NOFS); -+ if (bh == NULL) -+ goto out; -+ memset(bh, 0, i); -+ } else -+ bh = &bhs; -+ -+ first_group = page->index * 
blocks_per_page / 2; -+ -+ /* read all groups the page covers into the cache */ -+ for (i = 0; i < groups_per_page; i++) { -+ struct ext3_group_desc * desc; -+ -+ if (first_group + i >= EXT3_SB(sb)->s_groups_count) -+ break; -+ -+ err = -EIO; -+ desc = ext3_get_group_desc(sb, first_group + i, NULL); -+ if (desc == NULL) -+ goto out; -+ -+ err = -ENOMEM; -+ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); -+ if (bh[i] == NULL) -+ goto out; -+ -+ if (buffer_uptodate(bh[i])) -+ continue; -+ -+ lock_buffer(bh[i]); -+ if (buffer_uptodate(bh[i])) { -+ unlock_buffer(bh[i]); -+ continue; -+ } -+ -+ get_bh(bh[i]); -+ bh[i]->b_end_io = end_buffer_read_sync; -+ submit_bh(READ, bh[i]); -+ mb_debug("read bitmap for group %u\n", first_group + i); -+ } -+ -+ /* wait for I/O completion */ -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ wait_on_buffer(bh[i]); -+ -+ err = -EIO; -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ if (!buffer_uptodate(bh[i])) -+ goto out; -+ -+ first_block = page->index * blocks_per_page; -+ for (i = 0; i < blocks_per_page; i++) { -+ int group; -+ -+ group = (first_block + i) >> 1; -+ if (group >= EXT3_SB(sb)->s_groups_count) -+ break; -+ -+ data = page_address(page) + (i * blocksize); -+ bitmap = bh[group - first_group]->b_data; -+ -+ if ((first_block + i) & 1) { -+ /* this is block of buddy */ -+ mb_debug("put buddy for group %u in page %lu/%x\n", -+ group, page->index, i * blocksize); -+ memset(data, 0xff, blocksize); -+ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; -+ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, -+ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); -+ ext3_mb_generate_buddy(sb, data, bitmap, group); -+ } else { -+ /* this is block of bitmap */ -+ mb_debug("put bitmap for group %u in page %lu/%x\n", -+ group, page->index, i * blocksize); -+ memcpy(data, bitmap, blocksize); -+ } -+ } -+ SetPageUptodate(page); -+ -+out: -+ if (bh) { -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ brelse(bh[i]); -+ if (bh != 
&bhs) -+ kfree(bh); -+ } -+ return err; -+} -+ -+static int ext3_mb_load_buddy(struct super_block *sb, int group, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct inode *inode = sbi->s_buddy_cache; -+ int blocks_per_page, block, pnum, poff; -+ struct page *page; -+ -+ mb_debug("load group %u\n", group); -+ -+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; -+ -+ e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_info = EXT3_GROUP_INFO(sb, group); -+ e3b->bd_sb = sb; -+ e3b->bd_group = group; -+ e3b->bd_buddy_page = NULL; -+ e3b->bd_bitmap_page = NULL; -+ -+ block = group * 2; -+ pnum = block / blocks_per_page; -+ poff = block % blocks_per_page; -+ -+ /* we could use find_or_create_page(), but it locks page -+ * what we'd like to avoid in fast path ... */ -+ page = find_get_page(inode->i_mapping, pnum); -+ if (page == NULL || !PageUptodate(page)) { -+ if (page) -+ page_cache_release(page); -+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); -+ if (page) { -+ BUG_ON(page->mapping != inode->i_mapping); -+ if (!PageUptodate(page)) -+ ext3_mb_init_cache(page); -+ unlock_page(page); -+ } -+ } -+ if (page == NULL || !PageUptodate(page)) -+ goto err; -+ e3b->bd_bitmap_page = page; -+ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); -+ mark_page_accessed(page); -+ -+ block++; -+ pnum = block / blocks_per_page; -+ poff = block % blocks_per_page; -+ -+ page = find_get_page(inode->i_mapping, pnum); -+ if (page == NULL || !PageUptodate(page)) { -+ if (page) -+ page_cache_release(page); -+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); -+ if (page) { -+ BUG_ON(page->mapping != inode->i_mapping); -+ if (!PageUptodate(page)) -+ ext3_mb_init_cache(page); -+ unlock_page(page); -+ } -+ } -+ if (page == NULL || !PageUptodate(page)) -+ goto err; -+ e3b->bd_buddy_page = page; -+ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); -+ mark_page_accessed(page); -+ -+ J_ASSERT(e3b->bd_bitmap_page 
!= NULL); -+ J_ASSERT(e3b->bd_buddy_page != NULL); -+ -+ return 0; -+ -+err: -+ if (e3b->bd_bitmap_page) -+ page_cache_release(e3b->bd_bitmap_page); -+ if (e3b->bd_buddy_page) -+ page_cache_release(e3b->bd_buddy_page); -+ e3b->bd_buddy = NULL; -+ e3b->bd_bitmap = NULL; -+ return -EIO; -+} -+ -+static void ext3_mb_release_desc(struct ext3_buddy *e3b) -+{ -+ if (e3b->bd_bitmap_page) -+ page_cache_release(e3b->bd_bitmap_page); -+ if (e3b->bd_buddy_page) -+ page_cache_release(e3b->bd_buddy_page); -+} -+ -+ -+static inline void -+ext3_lock_group(struct super_block *sb, int group) -+{ -+ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_GROUP_INFO(sb, group)->bb_state); -+} -+ -+static inline void -+ext3_unlock_group(struct super_block *sb, int group) -+{ -+ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_GROUP_INFO(sb, group)->bb_state); -+} -+ -+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) -+{ -+ int order = 1; -+ void *bb; -+ -+ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); -+ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); -+ -+ bb = EXT3_MB_BUDDY(e3b); -+ while (order <= e3b->bd_blkbits + 1) { -+ block = block >> 1; -+ if (!mb_test_bit(block, bb)) { -+ /* this block is part of buddy of order 'order' */ -+ return order; -+ } -+ bb += 1 << (e3b->bd_blkbits - order); -+ order++; -+ } -+ return 0; -+} -+ -+static inline void mb_clear_bits(void *bm, int cur, int len) -+{ -+ __u32 *addr; -+ -+ len = cur + len; -+ while (cur < len) { -+ if ((cur & 31) == 0 && (len - cur) >= 32) { -+ /* fast path: clear whole word at once */ -+ addr = bm + (cur >> 3); -+ *addr = 0; -+ cur += 32; -+ continue; -+ } -+ mb_clear_bit_atomic(cur, bm); -+ cur++; -+ } -+} -+ -+static inline void mb_set_bits(void *bm, int cur, int len) -+{ -+ __u32 *addr; -+ -+ len = cur + len; -+ while (cur < len) { -+ if ((cur & 31) == 0 && (len - cur) >= 32) { -+ /* fast path: clear whole word at once */ -+ addr = bm + (cur >> 3); -+ *addr = 0xffffffff; -+ cur += 32; -+ 
continue; -+ } -+ mb_set_bit_atomic(cur, bm); -+ cur++; -+ } -+} -+ -+static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) -+{ -+ int block = 0, max = 0, order; -+ void *buddy, *buddy2; -+ -+ mb_check_buddy(e3b); -+ -+ e3b->bd_info->bb_free += count; -+ if (first < e3b->bd_info->bb_first_free) -+ e3b->bd_info->bb_first_free = first; -+ -+ /* let's maintain fragments counter */ -+ if (first != 0) -+ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); -+ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) -+ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); -+ if (block && max) -+ e3b->bd_info->bb_fragments--; -+ else if (!block && !max) -+ e3b->bd_info->bb_fragments++; -+ -+ /* let's maintain buddy itself */ -+ while (count-- > 0) { -+ block = first++; -+ order = 0; -+ -+ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); -+ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); -+ e3b->bd_info->bb_counters[order]++; -+ -+ /* start of the buddy */ -+ buddy = mb_find_buddy(e3b, order, &max); -+ -+ do { -+ block &= ~1UL; -+ if (mb_test_bit(block, buddy) || -+ mb_test_bit(block + 1, buddy)) -+ break; -+ -+ /* both the buddies are free, try to coalesce them */ -+ buddy2 = mb_find_buddy(e3b, order + 1, &max); -+ -+ if (!buddy2) -+ break; -+ -+ if (order > 0) { -+ /* for special purposes, we don't set -+ * free bits in bitmap */ -+ mb_set_bit(block, buddy); -+ mb_set_bit(block + 1, buddy); -+ } -+ e3b->bd_info->bb_counters[order]--; -+ e3b->bd_info->bb_counters[order]--; -+ -+ block = block >> 1; -+ order++; -+ e3b->bd_info->bb_counters[order]++; -+ -+ mb_clear_bit(block, buddy2); -+ buddy = buddy2; -+ } while (1); -+ } -+ mb_check_buddy(e3b); -+ -+ return 0; -+} -+ -+static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, -+ int needed, struct ext3_free_extent *ex) -+{ -+ int next = block, max, ord; -+ void *buddy; -+ -+ J_ASSERT(ex != NULL); -+ -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); -+ J_ASSERT(block < max); -+ 
if (mb_test_bit(block, buddy)) { -+ ex->fe_len = 0; -+ ex->fe_start = 0; -+ ex->fe_group = 0; -+ return 0; -+ } -+ -+ if (likely(order == 0)) { -+ /* find actual order */ -+ order = mb_find_order_for_block(e3b, block); -+ block = block >> order; -+ } -+ -+ ex->fe_len = 1 << order; -+ ex->fe_start = block << order; -+ ex->fe_group = e3b->bd_group; -+ -+ /* calc difference from given start */ -+ next = next - ex->fe_start; -+ ex->fe_len -= next; -+ ex->fe_start += next; -+ -+ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { -+ -+ if (block + 1 >= max) -+ break; -+ -+ next = (block + 1) * (1 << order); -+ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) -+ break; -+ -+ ord = mb_find_order_for_block(e3b, next); -+ -+ order = ord; -+ block = next >> order; -+ ex->fe_len += 1 << order; -+ } -+ -+ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); -+ return ex->fe_len; -+} -+ -+static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) -+{ -+ int ord, mlen = 0, max = 0, cur; -+ int start = ex->fe_start; -+ int len = ex->fe_len; -+ unsigned ret = 0; -+ int len0 = len; -+ void *buddy; -+ -+ mb_check_buddy(e3b); -+ -+ e3b->bd_info->bb_free -= len; -+ if (e3b->bd_info->bb_first_free == start) -+ e3b->bd_info->bb_first_free += len; -+ -+ /* let's maintain fragments counter */ -+ if (start != 0) -+ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); -+ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) -+ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); -+ if (mlen && max) -+ e3b->bd_info->bb_fragments++; -+ else if (!mlen && !max) -+ e3b->bd_info->bb_fragments--; -+ -+ /* let's maintain buddy itself */ -+ while (len) { -+ ord = mb_find_order_for_block(e3b, start); -+ -+ if (((start >> ord) << ord) == start && len >= (1 << ord)) { -+ /* the whole chunk may be allocated at once! 
*/ -+ mlen = 1 << ord; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ J_ASSERT((start >> ord) < max); -+ mb_set_bit(start >> ord, buddy); -+ e3b->bd_info->bb_counters[ord]--; -+ start += mlen; -+ len -= mlen; -+ J_ASSERT(len >= 0); -+ continue; -+ } -+ -+ /* store for history */ -+ if (ret == 0) -+ ret = len | (ord << 16); -+ -+ /* we have to split large buddy */ -+ J_ASSERT(ord > 0); -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_set_bit(start >> ord, buddy); -+ e3b->bd_info->bb_counters[ord]--; -+ -+ ord--; -+ cur = (start >> ord) & ~1U; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_clear_bit(cur, buddy); -+ mb_clear_bit(cur + 1, buddy); -+ e3b->bd_info->bb_counters[ord]++; -+ e3b->bd_info->bb_counters[ord]++; -+ } -+ -+ /* now drop all the bits in bitmap */ -+ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); -+ -+ mb_check_buddy(e3b); -+ -+ return ret; -+} -+ -+/* -+ * Must be called under group lock! -+ */ -+static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ unsigned long ret; -+ -+ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); -+ ret = mb_mark_used(e3b, &ac->ac_b_ex); -+ -+ ac->ac_status = AC_STATUS_FOUND; -+ ac->ac_tail = ret & 0xffff; -+ ac->ac_buddy = ret >> 16; -+ -+ /* hold in-core structures until allocated -+ * blocks are marked non-free in on-disk bitmap */ -+ ac->ac_buddy_page = e3b->bd_buddy_page; -+ page_cache_get(e3b->bd_buddy_page); -+ ac->ac_bitmap_page = e3b->bd_bitmap_page; -+ page_cache_get(e3b->bd_bitmap_page); -+} -+ -+/* -+ * The routine checks whether found extent is good enough. If it is, -+ * then the extent gets marked used and flag is set to the context -+ * to stop scanning. Otherwise, the extent is compared with the -+ * previous found extent and if new one is better, then it's stored -+ * in the context. Later, the best found extent will be used, if -+ * mballoc can't find good enough extent. -+ * -+ * FIXME: real allocation policy is to be designed yet! 
-+ */ -+static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, -+ struct ext3_free_extent *ex, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_free_extent *bex = &ac->ac_b_ex; -+ struct ext3_free_extent *gex = &ac->ac_g_ex; -+ -+ J_ASSERT(ex->fe_len > 0); -+ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); -+ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); -+ -+ ac->ac_found++; -+ -+ /* -+ * The special case - take what you catch first -+ */ -+ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * Let's check whether the chunk is good enough -+ */ -+ if (ex->fe_len == gex->fe_len) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * If this is first found extent, just store it in the context -+ */ -+ if (bex->fe_len == 0) { -+ *bex = *ex; -+ return; -+ } -+ -+ /* -+ * If new found extent is better, store it in the context -+ */ -+ if (bex->fe_len < gex->fe_len) { -+ /* if the request isn't satisfied, any found extent -+ * larger than previous best one is better */ -+ if (ex->fe_len > bex->fe_len) -+ *bex = *ex; -+ } else if (ex->fe_len > gex->fe_len) { -+ /* if the request is satisfied, then we try to find -+ * an extent that still satisfy the request, but is -+ * smaller than previous one */ -+ *bex = *ex; -+ } -+ -+ /* -+ * Let's scan at least few extents and don't pick up a first one -+ */ -+ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) -+ ac->ac_status = AC_STATUS_BREAK; -+ -+ /* -+ * We don't want to scan for a whole year -+ */ -+ if (ac->ac_found > ext3_mb_max_to_scan) -+ ac->ac_status = AC_STATUS_BREAK; -+} -+ -+static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_free_extent ex = ac->ac_b_ex; -+ int group = ex.fe_group, max, err; -+ -+ J_ASSERT(ex.fe_len > 0); -+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); -+ if 
(err) -+ return err; -+ -+ ext3_lock_group(ac->ac_sb, group); -+ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); -+ -+ if (max > 0) { -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ -+ ext3_unlock_group(ac->ac_sb, group); -+ -+ ext3_mb_release_desc(e3b); -+ -+ return 0; -+} -+ -+static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ int group = ac->ac_g_ex.fe_group, max, err; -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ struct ext3_super_block *es = sbi->s_es; -+ struct ext3_free_extent ex; -+ -+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); -+ if (err) -+ return err; -+ -+ ext3_lock_group(ac->ac_sb, group); -+ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, -+ ac->ac_g_ex.fe_len, &ex); -+ -+ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { -+ ext3_fsblk_t start; -+ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + -+ ex.fe_start + le32_to_cpu(es->s_first_data_block)); -+ if (start % sbi->s_stripe == 0) { -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ } else if (max >= ac->ac_g_ex.fe_len) { -+ J_ASSERT(ex.fe_len > 0); -+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); -+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { -+ /* Sometimes, caller may want to merge even small -+ * number of blocks to an existing extent */ -+ J_ASSERT(ex.fe_len > 0); -+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); -+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ ext3_unlock_group(ac->ac_sb, group); -+ -+ ext3_mb_release_desc(e3b); -+ -+ return 0; -+} -+ -+/* -+ * The routine scans buddy structures (not bitmap!) 
from given order -+ * to max order and tries to find big enough chunk to satisfy the req -+ */ -+static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ struct ext3_group_info *grp = e3b->bd_info; -+ void *buddy; -+ int i, k, max; -+ -+ J_ASSERT(ac->ac_2order > 0); -+ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { -+ if (grp->bb_counters[i] == 0) -+ continue; -+ -+ buddy = mb_find_buddy(e3b, i, &max); -+ if (buddy == NULL) { -+ printk(KERN_ALERT "looking for wrong order?\n"); -+ break; -+ } -+ -+ k = mb_find_next_zero_bit(buddy, max, 0); -+ J_ASSERT(k < max); -+ -+ ac->ac_found++; -+ -+ ac->ac_b_ex.fe_len = 1 << i; -+ ac->ac_b_ex.fe_start = k << i; -+ ac->ac_b_ex.fe_group = e3b->bd_group; -+ -+ ext3_mb_use_best_found(ac, e3b); -+ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); -+ -+ if (unlikely(ext3_mb_stats)) -+ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); -+ -+ break; -+ } -+} -+ -+/* -+ * The routine scans the group and measures all found extents. -+ * In order to optimize scanning, caller must pass number of -+ * free blocks in the group, so the routine can know upper limit. 
-+ */ -+static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ void *bitmap = EXT3_MB_BITMAP(e3b); -+ struct ext3_free_extent ex; -+ int i, free; -+ -+ free = e3b->bd_info->bb_free; -+ J_ASSERT(free > 0); -+ -+ i = e3b->bd_info->bb_first_free; -+ -+ while (free && ac->ac_status == AC_STATUS_CONTINUE) { -+ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); -+ if (i >= sb->s_blocksize * 8) { -+ J_ASSERT(free == 0); -+ break; -+ } -+ -+ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); -+ J_ASSERT(ex.fe_len > 0); -+ J_ASSERT(free >= ex.fe_len); -+ -+ ext3_mb_measure_extent(ac, &ex, e3b); -+ -+ i += ex.fe_len; -+ free -= ex.fe_len; -+ } -+} -+ -+/* -+ * This is a special case for storages like raid5 -+ * we try to find stripe-aligned chunks for stripe-size requests -+ */ -+static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ void *bitmap = EXT3_MB_BITMAP(e3b); -+ struct ext3_free_extent ex; -+ ext3_fsblk_t i, max; -+ -+ J_ASSERT(sbi->s_stripe != 0); -+ -+ /* find first stripe-aligned block */ -+ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) + -+ le32_to_cpu(sbi->s_es->s_first_data_block); -+ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; -+ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb); -+ -+ while (i < sb->s_blocksize * 8) { -+ if (!mb_test_bit(i, bitmap)) { -+ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); -+ if (max >= sbi->s_stripe) { -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ break; -+ } -+ } -+ i += sbi->s_stripe; -+ } -+} -+ -+static int ext3_mb_good_group(struct ext3_allocation_context *ac, -+ int group, int cr) -+{ -+ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); -+ unsigned free, fragments, i, bits; -+ -+ 
J_ASSERT(cr >= 0 && cr < 4); -+ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); -+ -+ free = grp->bb_free; -+ fragments = grp->bb_fragments; -+ if (free == 0) -+ return 0; -+ if (fragments == 0) -+ return 0; -+ -+ switch (cr) { -+ case 0: -+ J_ASSERT(ac->ac_2order != 0); -+ bits = ac->ac_sb->s_blocksize_bits + 1; -+ for (i = ac->ac_2order; i <= bits; i++) -+ if (grp->bb_counters[i] > 0) -+ return 1; -+ break; -+ case 1: -+ if ((free / fragments) >= ac->ac_g_ex.fe_len) -+ return 1; -+ break; -+ case 2: -+ if (free >= ac->ac_g_ex.fe_len) -+ return 1; -+ break; -+ case 3: -+ return 1; -+ default: -+ BUG(); -+ } -+ -+ return 0; -+} -+ -+ext3_fsblk_t ext3_mb_new_blocks(handle_t *handle, struct inode *inode, -+ ext3_fsblk_t goal, int *len,int flags,int *errp) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_allocation_context ac; -+ int i, group, cr, err = 0; -+ struct ext3_group_desc *gdp; -+ struct ext3_super_block *es; -+ struct buffer_head *gdp_bh; -+ struct ext3_sb_info *sbi; -+ struct super_block *sb; -+ struct ext3_buddy e3b; -+ ext3_fsblk_t block; -+ -+ J_ASSERT(len != NULL); -+ J_ASSERT(*len > 0); -+ -+ sb = inode->i_sb; -+ if (!sb) { -+ printk("ext3_mb_new_nblocks: nonexistent device"); -+ return 0; -+ } -+ -+ if (!test_opt(sb, MBALLOC)) { -+ static int ext3_mballoc_warning = 0; -+ if (ext3_mballoc_warning == 0) { -+ printk(KERN_ERR "EXT3-fs: multiblock request with " -+ "mballoc disabled!\n"); -+ ext3_mballoc_warning++; -+ } -+ *len = 1; -+ err = ext3_new_block_old(handle, inode, goal, errp); -+ return err; -+ } -+ -+ ext3_mb_poll_new_transaction(sb, handle); -+ -+ sbi = EXT3_SB(sb); -+ es = EXT3_SB(sb)->s_es; -+ -+ /* -+ * We can't allocate > group size -+ */ -+ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) -+ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; -+ -+ if (!(flags & EXT3_MB_HINT_RESERVED)) { -+ /* someone asks for non-reserved blocks */ -+ BUG_ON(*len > 1); -+ err = ext3_mb_reserve_blocks(sb, 1); -+ if (err) { -+ *errp = err; -+ return 0; -+ } -+ } -+ -+ 
ac.ac_buddy_page = NULL; -+ ac.ac_bitmap_page = NULL; -+ -+ /* -+ * Check quota for allocation of this blocks. -+ */ -+ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) -+ *len -= 1; -+ if (*len == 0) { -+ *errp = -EDQUOT; -+ block = 0; -+ goto out; -+ } -+ -+ /* start searching from the goal */ -+ if (goal < le32_to_cpu(es->s_first_data_block) || -+ goal >= le32_to_cpu(es->s_blocks_count)) -+ goal = le32_to_cpu(es->s_first_data_block); -+ group = (goal - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ block = ((goal - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb)); -+ -+ /* set up allocation goals */ -+ ac.ac_b_ex.fe_group = 0; -+ ac.ac_b_ex.fe_start = 0; -+ ac.ac_b_ex.fe_len = 0; -+ ac.ac_status = AC_STATUS_CONTINUE; -+ ac.ac_groups_scanned = 0; -+ ac.ac_ex_scanned = 0; -+ ac.ac_found = 0; -+ ac.ac_sb = inode->i_sb; -+ ac.ac_g_ex.fe_group = group; -+ ac.ac_g_ex.fe_start = block; -+ ac.ac_g_ex.fe_len = *len; -+ ac.ac_flags = flags; -+ ac.ac_2order = 0; -+ ac.ac_criteria = 0; -+ -+ if (*len == 1 && sbi->s_stripe) { -+ /* looks like a metadata, let's use a dirty hack for raid5 -+ * move all metadata in first groups in hope to hit cached -+ * sectors and thus avoid read-modify cycles in raid5 */ -+ ac.ac_g_ex.fe_group = group = 0; -+ } -+ -+ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ -+ i = ffs(*len); -+ if (i >= ext3_mb_order2_reqs) { -+ i--; -+ if ((*len & (~(1 << i))) == 0) -+ ac.ac_2order = i; -+ } -+ -+ /* first, try the goal */ -+ err = ext3_mb_find_by_goal(&ac, &e3b); -+ if (err) -+ goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) -+ goto found; -+ -+ /* Let's just scan groups to find more-less suitable blocks */ -+ cr = ac.ac_2order ? 
0 : 1; -+repeat: -+ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { -+ ac.ac_criteria = cr; -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { -+ if (group == EXT3_SB(sb)->s_groups_count) -+ group = 0; -+ -+ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { -+ /* we need full data about the group -+ * to make a good selection */ -+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); -+ if (err) -+ goto out_err; -+ ext3_mb_release_desc(&e3b); -+ } -+ -+ /* check is group good for our criteries */ -+ if (!ext3_mb_good_group(&ac, group, cr)) -+ continue; -+ -+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); -+ if (err) -+ goto out_err; -+ -+ ext3_lock_group(sb, group); -+ if (!ext3_mb_good_group(&ac, group, cr)) { -+ /* someone did allocation from this group */ -+ ext3_unlock_group(sb, group); -+ ext3_mb_release_desc(&e3b); -+ continue; -+ } -+ -+ ac.ac_groups_scanned++; -+ if (cr == 0) -+ ext3_mb_simple_scan_group(&ac, &e3b); -+ else if (cr == 1 && *len == sbi->s_stripe) -+ ext3_mb_scan_aligned(&ac, &e3b); -+ else -+ ext3_mb_complex_scan_group(&ac, &e3b); -+ -+ ext3_unlock_group(sb, group); -+ -+ ext3_mb_release_desc(&e3b); -+ -+ if (ac.ac_status != AC_STATUS_CONTINUE) -+ break; -+ } -+ } -+ -+ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && -+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { -+ /* -+ * We've been searching too long. Let's try to allocate -+ * the best chunk we've found so far -+ */ -+ -+ /*if (ac.ac_found > ext3_mb_max_to_scan) -+ printk(KERN_DEBUG "EXT3-fs: too long searching at " -+ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, -+ ac.ac_g_ex.fe_len);*/ -+ ext3_mb_try_best_found(&ac, &e3b); -+ if (ac.ac_status != AC_STATUS_FOUND) { -+ /* -+ * Someone more lucky has already allocated it. 
-+ * The only thing we can do is just take first -+ * found block(s) -+ printk(KERN_DEBUG "EXT3-fs: someone won our chunk\n"); -+ */ -+ ac.ac_b_ex.fe_group = 0; -+ ac.ac_b_ex.fe_start = 0; -+ ac.ac_b_ex.fe_len = 0; -+ ac.ac_status = AC_STATUS_CONTINUE; -+ ac.ac_flags |= EXT3_MB_HINT_FIRST; -+ cr = 3; -+ goto repeat; -+ } -+ } -+ -+ if (ac.ac_status != AC_STATUS_FOUND) { -+ /* -+ * We aren't lucky definitely -+ */ -+ DQUOT_FREE_BLOCK(inode, *len); -+ *errp = -ENOSPC; -+ block = 0; -+#if 1 -+ printk(KERN_ERR "EXT3-fs: can't allocate: status %d flags %d\n", -+ ac.ac_status, ac.ac_flags); -+ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d cr %d\n", -+ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, -+ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); -+ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", -+ sbi->s_blocks_reserved, ac.ac_found); -+ printk("EXT3-fs: groups: "); -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); -+ printk("\n"); -+#endif -+ goto out; -+ } -+ -+found: -+ J_ASSERT(ac.ac_b_ex.fe_len > 0); -+ -+ /* good news - free block(s) have been found. 
now it's time -+ * to mark block(s) in good old journaled bitmap */ -+ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + ac.ac_b_ex.fe_start -+ + le32_to_cpu(es->s_first_data_block); -+ -+ /* we made a desicion, now mark found blocks in good old -+ * bitmap to be journaled */ -+ -+ ext3_debug("using block group %d(%d)\n", -+ ac.ac_b_group.group, gdp->bg_free_blocks_count); -+ -+ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); -+ if (!bitmap_bh) { -+ *errp = -EIO; -+ goto out_err; -+ } -+ -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) { -+ *errp = err; -+ goto out_err; -+ } -+ -+ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); -+ if (!gdp) { -+ *errp = -EIO; -+ goto out_err; -+ } -+ -+ err = ext3_journal_get_write_access(handle, gdp_bh); -+ if (err) -+ goto out_err; -+ -+ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + ac.ac_b_ex.fe_start -+ + le32_to_cpu(es->s_first_data_block); -+ -+ if (block == le32_to_cpu(gdp->bg_block_bitmap) || -+ block == le32_to_cpu(gdp->bg_inode_bitmap) || -+ in_range(block, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group)) -+ ext3_error(sb, "ext3_new_block", -+ "Allocating block in system zone - " -+ "block = "E3FSBLK, block); -+#ifdef AGGRESSIVE_CHECK -+ for (i = 0; i < ac.ac_b_ex.fe_len; i++) -+ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); -+#endif -+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); -+ -+ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -+ - ac.ac_b_ex.fe_len); -+ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); -+ -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ if (err) -+ goto out_err; -+ err = ext3_journal_dirty_metadata(handle, gdp_bh); -+ if (err) -+ goto out_err; -+ -+ sb->s_dirt = 1; -+ *errp = 0; -+ 
brelse(bitmap_bh); -+ -+ /* drop non-allocated, but dquote'd blocks */ -+ J_ASSERT(*len >= ac.ac_b_ex.fe_len); -+ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); -+ -+ *len = ac.ac_b_ex.fe_len; -+ J_ASSERT(*len > 0); -+ J_ASSERT(block != 0); -+ goto out; -+ -+out_err: -+ /* if we've already allocated something, roll it back */ -+ if (ac.ac_status == AC_STATUS_FOUND) { -+ /* FIXME: free blocks here */ -+ } -+ -+ DQUOT_FREE_BLOCK(inode, *len); -+ brelse(bitmap_bh); -+ *errp = err; -+ block = 0; -+out: -+ if (ac.ac_buddy_page) -+ page_cache_release(ac.ac_buddy_page); -+ if (ac.ac_bitmap_page) -+ page_cache_release(ac.ac_bitmap_page); -+ -+ if (!(flags & EXT3_MB_HINT_RESERVED)) { -+ /* block wasn't reserved before and we reserved it -+ * at the beginning of allocation. it doesn't matter -+ * whether we allocated anything or we failed: time -+ * to release reservation. NOTE: because I expect -+ * any multiblock request from delayed allocation -+ * path only, here is single block always */ -+ ext3_mb_release_blocks(sb, 1); -+ } -+ -+ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { -+ atomic_inc(&sbi->s_bal_reqs); -+ atomic_add(*len, &sbi->s_bal_allocated); -+ if (*len >= ac.ac_g_ex.fe_len) -+ atomic_inc(&sbi->s_bal_success); -+ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); -+ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && -+ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) -+ atomic_inc(&sbi->s_bal_goals); -+ if (ac.ac_found > ext3_mb_max_to_scan) -+ atomic_inc(&sbi->s_bal_breaks); -+ } -+ -+ ext3_mb_store_history(sb, inode->i_ino, &ac); -+ -+ return block; -+} -+EXPORT_SYMBOL(ext3_mb_new_blocks); -+ -+#ifdef EXT3_MB_HISTORY -+struct ext3_mb_proc_session { -+ struct ext3_mb_history *history; -+ struct super_block *sb; -+ int start; -+ int max; -+}; -+ -+static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, -+ struct ext3_mb_history *hs, -+ int first) -+{ -+ if (hs == s->history + s->max) -+ hs = s->history; -+ if (!first && hs == s->history 
+ s->start) -+ return NULL; -+ while (hs->goal.fe_len == 0) { -+ hs++; -+ if (hs == s->history + s->max) -+ hs = s->history; -+ if (hs == s->history + s->start) -+ return NULL; -+ } -+ return hs; -+} -+ -+static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) -+{ -+ struct ext3_mb_proc_session *s = seq->private; -+ struct ext3_mb_history *hs; -+ int l = *pos; -+ -+ if (l == 0) -+ return SEQ_START_TOKEN; -+ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); -+ if (!hs) -+ return NULL; -+ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); -+ return hs; -+} -+ -+static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) -+{ -+ struct ext3_mb_proc_session *s = seq->private; -+ struct ext3_mb_history *hs = v; -+ -+ ++*pos; -+ if (v == SEQ_START_TOKEN) -+ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); -+ else -+ return ext3_mb_history_skip_empty(s, ++hs, 0); -+} -+ -+static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) -+{ -+ struct ext3_mb_history *hs = v; -+ char buf[20], buf2[20]; -+ -+ if (v == SEQ_START_TOKEN) { -+ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", -+ "pid", "inode", "goal", "result", "found", "grps", "cr", -+ "merge", "tail", "broken"); -+ return 0; -+ } -+ -+ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, -+ hs->goal.fe_start, hs->goal.fe_len); -+ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, -+ hs->result.fe_start, hs->result.fe_len); -+ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", -+ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, -+ hs->cr, hs->merged ? "M" : "", hs->tail, -+ hs->buddy ? 
1 << hs->buddy : 0); -+ return 0; -+} -+ -+static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) -+{ -+} -+ -+static struct seq_operations ext3_mb_seq_history_ops = { -+ .start = ext3_mb_seq_history_start, -+ .next = ext3_mb_seq_history_next, -+ .stop = ext3_mb_seq_history_stop, -+ .show = ext3_mb_seq_history_show, -+}; -+ -+static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) -+{ -+ struct super_block *sb = PDE(inode)->data; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_mb_proc_session *s; -+ int rc, size; -+ -+ s = kmalloc(sizeof(*s), GFP_KERNEL); -+ if (s == NULL) -+ return -EIO; -+ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; -+ s->history = kmalloc(size, GFP_KERNEL); -+ if (s == NULL) { -+ kfree(s); -+ return -EIO; -+ } -+ -+ spin_lock(&sbi->s_mb_history_lock); -+ memcpy(s->history, sbi->s_mb_history, size); -+ s->max = sbi->s_mb_history_max; -+ s->start = sbi->s_mb_history_cur % s->max; -+ spin_unlock(&sbi->s_mb_history_lock); -+ -+ rc = seq_open(file, &ext3_mb_seq_history_ops); -+ if (rc == 0) { -+ struct seq_file *m = (struct seq_file *)file->private_data; -+ m->private = s; -+ } else { -+ kfree(s->history); -+ kfree(s); -+ } -+ return rc; -+ -+} -+ -+static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) -+{ -+ struct seq_file *seq = (struct seq_file *)file->private_data; -+ struct ext3_mb_proc_session *s = seq->private; -+ kfree(s->history); -+ kfree(s); -+ return seq_release(inode, file); -+} -+ -+static struct file_operations ext3_mb_seq_history_fops = { -+ .owner = THIS_MODULE, -+ .open = ext3_mb_seq_history_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = ext3_mb_seq_history_release, -+}; -+ -+static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) -+{ -+ struct super_block *sb = seq->private; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ long group; -+ -+ if (*pos < 0 || *pos >= sbi->s_groups_count) -+ return NULL; -+ -+ group = 
*pos + 1; -+ return (void *) group; -+} -+ -+static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) -+{ -+ struct super_block *sb = seq->private; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ long group; -+ -+ ++*pos; -+ if (*pos < 0 || *pos >= sbi->s_groups_count) -+ return NULL; -+ group = *pos + 1; -+ return (void *) group;; -+} -+ -+static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) -+{ -+ struct super_block *sb = seq->private; -+ long group = (long) v, i; -+ struct sg { -+ struct ext3_group_info info; -+ unsigned short counters[16]; -+ } sg; -+ -+ group--; -+ if (group == 0) -+ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", -+ "group", "free", "frags", "first", "2^0", "2^1", "2^2", -+ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", -+ "2^11", "2^12", "2^13"); -+ -+ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + -+ sizeof(struct ext3_group_info); -+ ext3_lock_group(sb, group); -+ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); -+ ext3_unlock_group(sb, group); -+ -+ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) -+ return 0; -+ -+ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, -+ sg.info.bb_fragments, sg.info.bb_first_free); -+ for (i = 0; i <= 13; i++) -+ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
-+ sg.info.bb_counters[i] : 0); -+ seq_printf(seq, " ]\n"); -+ -+ return 0; -+} -+ -+static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) -+{ -+} -+ -+static struct seq_operations ext3_mb_seq_groups_ops = { -+ .start = ext3_mb_seq_groups_start, -+ .next = ext3_mb_seq_groups_next, -+ .stop = ext3_mb_seq_groups_stop, -+ .show = ext3_mb_seq_groups_show, -+}; -+ -+static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) -+{ -+ struct super_block *sb = PDE(inode)->data; -+ int rc; -+ -+ rc = seq_open(file, &ext3_mb_seq_groups_ops); -+ if (rc == 0) { -+ struct seq_file *m = (struct seq_file *)file->private_data; -+ m->private = sb; -+ } -+ return rc; -+ -+} -+ -+static struct file_operations ext3_mb_seq_groups_fops = { -+ .owner = THIS_MODULE, -+ .open = ext3_mb_seq_groups_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ -+static void ext3_mb_history_release(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ char name[64]; -+ -+ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); -+ remove_proc_entry("mb_groups", sbi->s_mb_proc); -+ remove_proc_entry("mb_history", sbi->s_mb_proc); -+ remove_proc_entry(name, proc_root_ext3); -+ -+ if (sbi->s_mb_history) -+ kfree(sbi->s_mb_history); -+} -+ -+static void ext3_mb_history_init(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ char name[64]; -+ int i; -+ -+ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); -+ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); -+ if (sbi->s_mb_proc != NULL) { -+ struct proc_dir_entry *p; -+ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); -+ if (p) { -+ p->proc_fops = &ext3_mb_seq_history_fops; -+ p->data = sb; -+ } -+ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); -+ if (p) { -+ p->proc_fops = &ext3_mb_seq_groups_fops; -+ p->data = sb; -+ } -+ } -+ -+ sbi->s_mb_history_max = 1000; -+ sbi->s_mb_history_cur = 0; -+ 
spin_lock_init(&sbi->s_mb_history_lock); -+ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); -+ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); -+ memset(sbi->s_mb_history, 0, i); -+ /* if we can't allocate history, then we simple won't use it */ -+} -+ -+static void -+ext3_mb_store_history(struct super_block *sb, unsigned ino, -+ struct ext3_allocation_context *ac) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_mb_history h; -+ -+ if (likely(sbi->s_mb_history == NULL)) -+ return; -+ -+ h.pid = current->pid; -+ h.ino = ino; -+ h.goal = ac->ac_g_ex; -+ h.result = ac->ac_b_ex; -+ h.found = ac->ac_found; -+ h.cr = ac->ac_criteria; -+ h.groups = ac->ac_groups_scanned; -+ h.tail = ac->ac_tail; -+ h.buddy = ac->ac_buddy; -+ h.merged = 0; -+ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && -+ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) -+ h.merged = 1; -+ -+ spin_lock(&sbi->s_mb_history_lock); -+ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); -+ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) -+ sbi->s_mb_history_cur = 0; -+ spin_unlock(&sbi->s_mb_history_lock); -+} -+ -+#else -+#define ext3_mb_history_release(sb) -+#define ext3_mb_history_init(sb) -+#endif -+ -+int ext3_mb_init_backend(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, j, len, metalen; -+ int num_meta_group_infos = -+ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> -+ EXT3_DESC_PER_BLOCK_BITS(sb); -+ struct ext3_group_info **meta_group_info; -+ -+ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte -+ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. -+ * So a two level scheme suffices for now. 
*/ -+ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * -+ num_meta_group_infos, GFP_KERNEL); -+ if (sbi->s_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); -+ return -ENOMEM; -+ } -+ sbi->s_buddy_cache = new_inode(sb); -+ if (sbi->s_buddy_cache == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); -+ goto err_freesgi; -+ } -+ -+ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); -+ for (i = 0; i < num_meta_group_infos; i++) { -+ if ((i + 1) == num_meta_group_infos) -+ metalen = sizeof(*meta_group_info) * -+ (sbi->s_groups_count - -+ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); -+ meta_group_info = kmalloc(metalen, GFP_KERNEL); -+ if (meta_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " -+ "buddy group\n"); -+ goto err_freemeta; -+ } -+ sbi->s_group_info[i] = meta_group_info; -+ } -+ -+ /* -+ * calculate needed size. if change bb_counters size, -+ * don't forget about ext3_mb_generate_buddy() -+ */ -+ len = sizeof(struct ext3_group_info); -+ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ struct ext3_group_desc * desc; -+ -+ meta_group_info = -+ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; -+ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); -+ -+ meta_group_info[j] = kmalloc(len, GFP_KERNEL); -+ if (meta_group_info[j] == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); -+ i--; -+ goto err_freebuddy; -+ } -+ desc = ext3_get_group_desc(sb, i, NULL); -+ if (desc == NULL) { -+ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); -+ goto err_freebuddy; -+ } -+ memset(meta_group_info[j], 0, len); -+ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, -+ &meta_group_info[j]->bb_state); -+ meta_group_info[j]->bb_free = -+ le16_to_cpu(desc->bg_free_blocks_count); -+ } -+ -+ return 0; -+ -+err_freebuddy: -+ while (i >= 0) { -+ kfree(EXT3_GROUP_INFO(sb, i)); -+ i--; -+ } -+ i = num_meta_group_infos; 
-+err_freemeta: -+ while (--i >= 0) -+ kfree(sbi->s_group_info[i]); -+ iput(sbi->s_buddy_cache); -+err_freesgi: -+ kfree(sbi->s_group_info); -+ return -ENOMEM; -+} -+ -+int ext3_mb_init(struct super_block *sb, int needs_recovery) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct inode *root = sb->s_root->d_inode; -+ unsigned i, offset, max; -+ struct dentry *dentry; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); -+ -+ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); -+ if (sbi->s_mb_offsets == NULL) { -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ return -ENOMEM; -+ } -+ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); -+ if (sbi->s_mb_maxs == NULL) { -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ kfree(sbi->s_mb_maxs); -+ return -ENOMEM; -+ } -+ -+ /* order 0 is regular bitmap */ -+ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; -+ sbi->s_mb_offsets[0] = 0; -+ -+ i = 1; -+ offset = 0; -+ max = sb->s_blocksize << 2; -+ do { -+ sbi->s_mb_offsets[i] = offset; -+ sbi->s_mb_maxs[i] = max; -+ offset += 1 << (sb->s_blocksize_bits - i); -+ max = max >> 1; -+ i++; -+ } while (i <= sb->s_blocksize_bits + 1); -+ -+ /* init file for buddy data */ -+ if ((i = ext3_mb_init_backend(sb))) { -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ kfree(sbi->s_mb_offsets); -+ kfree(sbi->s_mb_maxs); -+ return i; -+ } -+ -+ spin_lock_init(&sbi->s_reserve_lock); -+ spin_lock_init(&sbi->s_md_lock); -+ INIT_LIST_HEAD(&sbi->s_active_transaction); -+ INIT_LIST_HEAD(&sbi->s_closed_transaction); -+ INIT_LIST_HEAD(&sbi->s_committed_transaction); -+ spin_lock_init(&sbi->s_bal_lock); -+ -+ /* remove old on-disk buddy file */ -+ mutex_lock(&root->i_mutex); -+ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy")); -+ if (dentry->d_inode != NULL) { -+ i = vfs_unlink(root, dentry); -+ if (i != 0) -+ printk("EXT3-fs: can't remove .buddy file: %d\n", i); -+ } -+ dput(dentry); -+ mutex_unlock(&root->i_mutex); -+ -+ ext3_mb_history_init(sb); -+ -+ 
printk("EXT3-fs: mballoc enabled\n"); -+ return 0; -+} -+ -+int ext3_mb_release(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, num_meta_group_infos; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ /* release freed, non-committed blocks */ -+ spin_lock(&sbi->s_md_lock); -+ list_splice_init(&sbi->s_closed_transaction, -+ &sbi->s_committed_transaction); -+ list_splice_init(&sbi->s_active_transaction, -+ &sbi->s_committed_transaction); -+ spin_unlock(&sbi->s_md_lock); -+ ext3_mb_free_committed_blocks(sb); -+ -+ if (sbi->s_group_info) { -+ for (i = 0; i < sbi->s_groups_count; i++) -+ kfree(EXT3_GROUP_INFO(sb, i)); -+ num_meta_group_infos = (sbi->s_groups_count + -+ EXT3_DESC_PER_BLOCK(sb) - 1) >> -+ EXT3_DESC_PER_BLOCK_BITS(sb); -+ for (i = 0; i < num_meta_group_infos; i++) -+ kfree(sbi->s_group_info[i]); -+ kfree(sbi->s_group_info); -+ } -+ if (sbi->s_mb_offsets) -+ kfree(sbi->s_mb_offsets); -+ if (sbi->s_mb_maxs) -+ kfree(sbi->s_mb_maxs); -+ if (sbi->s_buddy_cache) -+ iput(sbi->s_buddy_cache); -+ if (sbi->s_blocks_reserved) -+ printk("ext3-fs: %ld blocks being reserved at umount!\n", -+ sbi->s_blocks_reserved); -+ if (ext3_mb_stats) { -+ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", -+ atomic_read(&sbi->s_bal_allocated), -+ atomic_read(&sbi->s_bal_reqs), -+ atomic_read(&sbi->s_bal_success)); -+ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " -+ "%u 2^N hits, %u breaks\n", -+ atomic_read(&sbi->s_bal_ex_scanned), -+ atomic_read(&sbi->s_bal_goals), -+ atomic_read(&sbi->s_bal_2orders), -+ atomic_read(&sbi->s_bal_breaks)); -+ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", -+ sbi->s_mb_buddies_generated++, -+ sbi->s_mb_generation_time); -+ } -+ -+ ext3_mb_history_release(sb); -+ -+ return 0; -+} -+ -+void ext3_mb_free_committed_blocks(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int err, i, count = 0, count2 = 0; -+ struct ext3_free_metadata *md; -+ struct 
ext3_buddy e3b; -+ -+ if (list_empty(&sbi->s_committed_transaction)) -+ return; -+ -+ /* there is committed blocks to be freed yet */ -+ do { -+ /* get next array of blocks */ -+ md = NULL; -+ spin_lock(&sbi->s_md_lock); -+ if (!list_empty(&sbi->s_committed_transaction)) { -+ md = list_entry(sbi->s_committed_transaction.next, -+ struct ext3_free_metadata, list); -+ list_del(&md->list); -+ } -+ spin_unlock(&sbi->s_md_lock); -+ -+ if (md == NULL) -+ break; -+ -+ mb_debug("gonna free %u blocks in group %u (0x%p):", -+ md->num, md->group, md); -+ -+ err = ext3_mb_load_buddy(sb, md->group, &e3b); -+ /* we expect to find existing buddy because it's pinned */ -+ BUG_ON(err != 0); -+ -+ /* there are blocks to put in buddy to make them really free */ -+ count += md->num; -+ count2++; -+ ext3_lock_group(sb, md->group); -+ for (i = 0; i < md->num; i++) { -+ mb_debug(" %u", md->blocks[i]); -+ mb_free_blocks(&e3b, md->blocks[i], 1); -+ } -+ mb_debug("\n"); -+ ext3_unlock_group(sb, md->group); -+ -+ /* balance refcounts from ext3_mb_free_metadata() */ -+ page_cache_release(e3b.bd_buddy_page); -+ page_cache_release(e3b.bd_bitmap_page); -+ -+ kfree(md); -+ ext3_mb_release_desc(&e3b); -+ -+ } while (md); -+ mb_debug("freed %u blocks in %u structures\n", count, count2); -+} -+ -+void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ if (sbi->s_last_transaction == handle->h_transaction->t_tid) -+ return; -+ -+ /* new transaction! time to close last one and free blocks for -+ * committed transaction. we know that only transaction can be -+ * active, so previos transaction can be being logged and we -+ * know that transaction before previous is known to be already -+ * logged. this means that now we may free blocks freed in all -+ * transactions before previous one. hope I'm clear enough ... 
*/ -+ -+ spin_lock(&sbi->s_md_lock); -+ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { -+ mb_debug("new transaction %lu, old %lu\n", -+ (unsigned long) handle->h_transaction->t_tid, -+ (unsigned long) sbi->s_last_transaction); -+ list_splice_init(&sbi->s_closed_transaction, -+ &sbi->s_committed_transaction); -+ list_splice_init(&sbi->s_active_transaction, -+ &sbi->s_closed_transaction); -+ sbi->s_last_transaction = handle->h_transaction->t_tid; -+ } -+ spin_unlock(&sbi->s_md_lock); -+ -+ ext3_mb_free_committed_blocks(sb); -+} -+ -+int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, -+ int group, int block, int count) -+{ -+ struct ext3_group_info *db = e3b->bd_info; -+ struct super_block *sb = e3b->bd_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_free_metadata *md; -+ int i; -+ -+ J_ASSERT(e3b->bd_bitmap_page != NULL); -+ J_ASSERT(e3b->bd_buddy_page != NULL); -+ -+ ext3_lock_group(sb, group); -+ for (i = 0; i < count; i++) { -+ md = db->bb_md_cur; -+ if (md && db->bb_tid != handle->h_transaction->t_tid) { -+ db->bb_md_cur = NULL; -+ md = NULL; -+ } -+ -+ if (md == NULL) { -+ ext3_unlock_group(sb, group); -+ md = kmalloc(sizeof(*md), GFP_KERNEL); -+ if (md == NULL) -+ return -ENOMEM; -+ md->num = 0; -+ md->group = group; -+ -+ ext3_lock_group(sb, group); -+ if (db->bb_md_cur == NULL) { -+ spin_lock(&sbi->s_md_lock); -+ list_add(&md->list, &sbi->s_active_transaction); -+ spin_unlock(&sbi->s_md_lock); -+ /* protect buddy cache from being freed, -+ * otherwise we'll refresh it from -+ * on-disk bitmap and lose not-yet-available -+ * blocks */ -+ page_cache_get(e3b->bd_buddy_page); -+ page_cache_get(e3b->bd_bitmap_page); -+ db->bb_md_cur = md; -+ db->bb_tid = handle->h_transaction->t_tid; -+ mb_debug("new md 0x%p for group %u\n", -+ md, md->group); -+ } else { -+ kfree(md); -+ md = db->bb_md_cur; -+ } -+ } -+ -+ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); -+ md->blocks[md->num] = block + i; -+ md->num++; -+ if (md->num == 
EXT3_BB_MAX_BLOCKS) { -+ /* no more space, put full container on a sb's list */ -+ db->bb_md_cur = NULL; -+ } -+ } -+ ext3_unlock_group(sb, group); -+ return 0; -+} -+ -+void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, -+ ext3_fsblk_t block, unsigned long count, -+ int metadata, unsigned long *freed) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_group_desc *gdp; -+ struct ext3_super_block *es; -+ unsigned long bit, overflow; -+ struct buffer_head *gd_bh; -+ unsigned long block_group; -+ struct ext3_sb_info *sbi; -+ struct super_block *sb; -+ struct ext3_buddy e3b; -+ int err = 0, ret; -+ -+ *freed = 0; -+ sb = inode->i_sb; -+ if (!sb) { -+ printk ("ext3_free_blocks: nonexistent device"); -+ return; -+ } -+ -+ ext3_mb_poll_new_transaction(sb, handle); -+ -+ sbi = EXT3_SB(sb); -+ es = EXT3_SB(sb)->s_es; -+ if (block < le32_to_cpu(es->s_first_data_block) || -+ block + count < block || -+ block + count > le32_to_cpu(es->s_blocks_count)) { -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks not in datazone - " -+ "block = %lu, count = %lu", block, count); -+ goto error_return; -+ } -+ -+ ext3_debug("freeing block %lu\n", block); -+ -+do_more: -+ overflow = 0; -+ block_group = (block - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ bit = (block - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb); -+ /* -+ * Check to see if we are freeing blocks across a group -+ * boundary. 
-+ */ -+ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { -+ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); -+ count -= overflow; -+ } -+ brelse(bitmap_bh); -+ bitmap_bh = read_block_bitmap(sb, block_group); -+ if (!bitmap_bh) -+ goto error_return; -+ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); -+ if (!gdp) -+ goto error_return; -+ -+ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || -+ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || -+ in_range (block, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group) || -+ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group)) -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks in system zones - " -+ "Block = %lu, count = %lu", -+ block, count); -+ -+ BUFFER_TRACE(bitmap_bh, "getting write access"); -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) -+ goto error_return; -+ -+ /* -+ * We are about to modify some metadata. Call the journal APIs -+ * to unshare ->b_data if a currently-committing transaction is -+ * using it -+ */ -+ BUFFER_TRACE(gd_bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, gd_bh); -+ if (err) -+ goto error_return; -+ -+ err = ext3_mb_load_buddy(sb, block_group, &e3b); -+ if (err) -+ goto error_return; -+ -+#ifdef AGGRESSIVE_CHECK -+ { -+ int i; -+ for (i = 0; i < count; i++) -+ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); -+ } -+#endif -+ mb_clear_bits(bitmap_bh->b_data, bit, count); -+ -+ /* We dirtied the bitmap block */ -+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ -+ if (metadata) { -+ /* blocks being freed are metadata. 
these blocks shouldn't -+ * be used until this transaction is committed */ -+ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); -+ } else { -+ ext3_lock_group(sb, block_group); -+ mb_free_blocks(&e3b, bit, count); -+ ext3_unlock_group(sb, block_group); -+ } -+ -+ spin_lock(sb_bgl_lock(sbi, block_group)); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); -+ spin_unlock(sb_bgl_lock(sbi, block_group)); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, count); -+ -+ ext3_mb_release_desc(&e3b); -+ -+ *freed = count; -+ -+ /* And the group descriptor block */ -+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); -+ ret = ext3_journal_dirty_metadata(handle, gd_bh); -+ if (!err) err = ret; -+ -+ if (overflow && !err) { -+ block += count; -+ count = overflow; -+ goto do_more; -+ } -+ sb->s_dirt = 1; -+error_return: -+ brelse(bitmap_bh); -+ ext3_std_error(sb, err); -+ return; -+} -+ -+int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int free, ret = -ENOSPC; -+ -+ BUG_ON(blocks < 0); -+ spin_lock(&sbi->s_reserve_lock); -+ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); -+ if (blocks <= free - sbi->s_blocks_reserved) { -+ sbi->s_blocks_reserved += blocks; -+ ret = 0; -+ } -+ spin_unlock(&sbi->s_reserve_lock); -+ return ret; -+} -+ -+void ext3_mb_release_blocks(struct super_block *sb, int blocks) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ BUG_ON(blocks < 0); -+ spin_lock(&sbi->s_reserve_lock); -+ sbi->s_blocks_reserved -= blocks; -+ WARN_ON(sbi->s_blocks_reserved < 0); -+ if (sbi->s_blocks_reserved < 0) -+ sbi->s_blocks_reserved = 0; -+ spin_unlock(&sbi->s_reserve_lock); -+} -+ -+ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, -+ ext3_fsblk_t goal, int *errp) -+{ -+ ext3_fsblk_t ret; -+ int len; -+ -+ if (!test_opt(inode->i_sb, MBALLOC)) { -+ ret = ext3_new_block_old(handle, inode, goal, errp); -+ goto out; -+ 
} -+ len = 1; -+ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); -+out: -+ return ret; -+} -+ -+void ext3_free_blocks(handle_t *handle, struct inode * inode, -+ ext3_fsblk_t block, unsigned long count, int metadata) -+{ -+ struct super_block *sb; -+ unsigned long freed; -+ -+ sb = inode->i_sb; -+ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) -+ ext3_free_blocks_sb(handle, sb, block, count, &freed); -+ else -+ ext3_mb_free_blocks(handle, inode, block, count, metadata, -+ &freed); -+ if (freed) -+ DQUOT_FREE_BLOCK(inode, freed); -+ return; -+} -+ -+#define EXT3_ROOT "ext3" -+#define EXT3_MB_STATS_NAME "mb_stats" -+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" -+#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" -+#define EXT3_MB_ORDER2_REQ "mb_order2_req" -+ -+static int ext3_mb_stats_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_stats); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_stats_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_STATS_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); -+ return count; -+} -+ -+static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ long value; -+ -+ if (count 
>= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ ext3_mb_max_to_scan = value; -+ -+ return count; -+} -+ -+static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ ext3_mb_min_to_scan = value; -+ -+ return count; -+} -+ -+static int ext3_mb_order2_req_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_order2_req_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; 
zero->0; non-zero->1 */ -+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ ext3_mb_order2_reqs = value; -+ -+ return count; -+} -+ -+int __init init_ext3_proc(void) -+{ -+ struct proc_dir_entry *proc_ext3_mb_stats; -+ struct proc_dir_entry *proc_ext3_mb_max_to_scan; -+ struct proc_dir_entry *proc_ext3_mb_min_to_scan; -+ struct proc_dir_entry *proc_ext3_mb_order2_req; -+ -+ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); -+ if (proc_root_ext3 == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); -+ return -EIO; -+ } -+ -+ /* Initialize EXT3_MB_STATS_NAME */ -+ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_stats == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_STATS_NAME); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_stats->data = NULL; -+ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; -+ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; -+ -+ /* Initialize EXT3_MAX_TO_SCAN_NAME */ -+ proc_ext3_mb_max_to_scan = create_proc_entry( -+ EXT3_MB_MAX_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_max_to_scan == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_MAX_TO_SCAN_NAME); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_max_to_scan->data = NULL; -+ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; -+ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; -+ -+ /* Initialize EXT3_MIN_TO_SCAN_NAME */ -+ proc_ext3_mb_min_to_scan = create_proc_entry( -+ EXT3_MB_MIN_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_min_to_scan == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_MIN_TO_SCAN_NAME); -+ 
remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_min_to_scan->data = NULL; -+ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; -+ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; -+ -+ /* Initialize EXT3_ORDER2_REQ */ -+ proc_ext3_mb_order2_req = create_proc_entry( -+ EXT3_MB_ORDER2_REQ, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_order2_req == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_ORDER2_REQ); -+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_order2_req->data = NULL; -+ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; -+ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; -+ -+ return 0; -+} -+ -+void exit_ext3_proc(void) -+{ -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+} -Index: linux-stage/fs/ext3/Makefile -=================================================================== ---- linux-stage.orig/fs/ext3/Makefile 2006-07-16 02:29:43.000000000 +0800 -+++ linux-stage/fs/ext3/Makefile 2006-07-16 02:29:49.000000000 +0800 -@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o \ -- extents.o -+ extents.o mballoc.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - 
ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch deleted file mode 100644 index 4512098..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch +++ /dev/null @@ -1,3121 +0,0 @@ -Index: linux-2.6.9-full/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2006-06-01 14:58:46.000000000 +0400 -+++ linux-2.6.9-full/include/linux/ext3_fs.h 2006-10-24 12:54:31.000000000 +0400 -@@ -57,6 +57,14 @@ struct statfs; - #define ext3_debug(f, a...) do {} while (0) - #endif - -+#define EXT3_MULTIBLOCK_ALLOCATOR 1 -+ -+#define EXT3_MB_HINT_MERGE 1 -+#define EXT3_MB_HINT_RESERVED 2 -+#define EXT3_MB_HINT_METADATA 4 -+#define EXT3_MB_HINT_FIRST 8 -+#define EXT3_MB_HINT_BEST 16 -+ - /* - * Special inodes numbers - */ -@@ -365,6 +373,7 @@ struct ext3_inode { - #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -387,6 +396,14 @@ struct ext3_inode { - #define ext3_find_first_zero_bit ext2_find_first_zero_bit - #define ext3_find_next_zero_bit ext2_find_next_zero_bit - -+#ifndef ext2_find_next_le_bit -+#ifdef __LITTLE_ENDIAN -+#define ext2_find_next_le_bit(addr, size, off) find_next_bit((addr), (size), (off)) -+#else -+#error "mballoc needs a patch for big-endian systems - CFS bug 10634" -+#endif /* __LITTLE_ENDIAN */ -+#endif /* !ext2_find_next_le_bit */ -+ - /* - * Maximal mount counts between two filesystem checks - */ -@@ -726,7 +743,8 @@ extern int ext3_bg_has_super(struct supe - extern unsigned long ext3_bg_num_gdb(struct super_block 
*sb, int group); - extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); - extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, -- unsigned long); -+ unsigned long, int); -+extern int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); - extern void ext3_free_blocks_sb (handle_t *, struct super_block *, - unsigned long, unsigned long, int *); - extern unsigned long ext3_count_free_blocks (struct super_block *); -@@ -857,6 +874,17 @@ extern void ext3_extents_initialize_bloc - extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg); - -+/* mballoc.c */ -+extern long ext3_mb_stats; -+extern long ext3_mb_max_to_scan; -+extern int ext3_mb_init(struct super_block *, int); -+extern int ext3_mb_release(struct super_block *); -+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); -+extern int ext3_mb_reserve_blocks(struct super_block *, int); -+extern void ext3_mb_release_blocks(struct super_block *, int); -+int __init init_ext3_proc(void); -+void exit_ext3_proc(void); -+ - #endif /* __KERNEL__ */ - - /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ -Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2006-05-18 23:57:04.000000000 +0400 -+++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2006-10-24 12:54:31.000000000 +0400 -@@ -23,9 +23,15 @@ - #define EXT_INCLUDE - #include - #include -+#include - #endif - #endif - #include -+#include -+ -+struct ext3_buddy_group_blocks; -+struct ext3_mb_history; -+#define EXT3_BB_MAX_BLOCKS - - /* - * third extended-fs super-block data in memory -@@ -81,6 +87,43 @@ struct ext3_sb_info { - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ - #endif -+ -+ /* for buddy allocator */ -+ struct ext3_group_info ***s_group_info; -+ struct inode *s_buddy_cache; -+ long s_blocks_reserved; -+ spinlock_t s_reserve_lock; -+ struct list_head s_active_transaction; -+ struct list_head s_closed_transaction; -+ struct list_head s_committed_transaction; -+ spinlock_t s_md_lock; -+ tid_t s_last_transaction; -+ int s_mb_factor; -+ unsigned short *s_mb_offsets, *s_mb_maxs; -+ unsigned long s_stripe; -+ -+ /* history to debug policy */ -+ struct ext3_mb_history *s_mb_history; -+ int s_mb_history_cur; -+ int s_mb_history_max; -+ struct proc_dir_entry *s_mb_proc; -+ spinlock_t s_mb_history_lock; -+ -+ /* stats for buddy allocator */ -+ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ -+ atomic_t s_bal_success; /* we found long enough chunks */ -+ atomic_t s_bal_allocated; /* in blocks */ -+ atomic_t s_bal_ex_scanned; /* total extents scanned */ -+ atomic_t s_bal_goals; /* goal hits */ -+ atomic_t s_bal_breaks; /* too long searches */ -+ atomic_t s_bal_2orders; /* 2^order hits */ -+ spinlock_t s_bal_lock; -+ unsigned long s_mb_buddies_generated; -+ unsigned long long s_mb_generation_time; - }; - -+#define EXT3_GROUP_INFO(sb, group) \ -+ EXT3_SB(sb)->s_group_info[(group) >> 
EXT3_DESC_PER_BLOCK_BITS(sb)] \ -+ [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] -+ - #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.6.9-full/fs/ext3/super.c -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/super.c 2006-06-01 14:58:46.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/super.c 2006-10-24 12:54:31.000000000 +0400 -@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_mb_release(sb); - ext3_ext_release(sb); - ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); -@@ -597,6 +598,7 @@ enum { - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, - Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, - Opt_extents, Opt_noextents, Opt_extdebug, -+ Opt_mballoc, Opt_nomballoc, Opt_stripe, - }; - - static match_table_t tokens = { -@@ -649,6 +651,9 @@ static match_table_t tokens = { - {Opt_extents, "extents"}, - {Opt_noextents, "noextents"}, - {Opt_extdebug, "extdebug"}, -+ {Opt_mballoc, "mballoc"}, -+ {Opt_nomballoc, "nomballoc"}, -+ {Opt_stripe, "stripe=%u"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL}, - {Opt_resize, "resize"}, -@@ -962,6 +967,19 @@ clear_qf_name: - case Opt_extdebug: - set_opt (sbi->s_mount_opt, EXTDEBUG); - break; -+ case Opt_mballoc: -+ set_opt(sbi->s_mount_opt, MBALLOC); -+ break; -+ case Opt_nomballoc: -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ break; -+ case Opt_stripe: -+ if (match_int(&args[0], &option)) -+ return 0; -+ if (option < 0) -+ return 0; -+ sbi->s_stripe = option; -+ break; - default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1651,6 +1669,7 @@ static int ext3_fill_super (struct super - ext3_count_dirs(sb)); - - ext3_ext_init(sb); -+ ext3_mb_init(sb, needs_recovery); - - return 0; - -@@ -2433,7 +2452,13 @@ static struct file_system_type ext3_fs_t - - static int __init init_ext3_fs(void) - { -- int err = init_ext3_xattr(); -+ int err; -+ -+ err = init_ext3_proc(); -+ if (err) -+ return 
err; -+ -+ err = init_ext3_xattr(); - if (err) - return err; - err = init_inodecache(); -@@ -2455,6 +2480,7 @@ static void __exit exit_ext3_fs(void) - unregister_filesystem(&ext3_fs_type); - destroy_inodecache(); - exit_ext3_xattr(); -+ exit_ext3_proc(); - } - - int ext3_prep_san_write(struct inode *inode, long *blocks, -Index: linux-2.6.9-full/fs/ext3/extents.c -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/extents.c 2006-06-01 14:58:46.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/extents.c 2006-10-24 12:54:31.000000000 +0400 -@@ -777,7 +777,7 @@ cleanup: - for (i = 0; i < depth; i++) { - if (!ablocks[i]) - continue; -- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); -+ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); - } - } - kfree(ablocks); -@@ -1434,7 +1434,7 @@ int ext3_ext_rm_idx(handle_t *handle, st - path->p_idx->ei_leaf); - bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); - ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); -- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); -+ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); - return err; - } - -@@ -1919,10 +1919,12 @@ ext3_remove_blocks(struct ext3_extents_t - int needed = ext3_remove_blocks_credits(tree, ex, from, to); - handle_t *handle = ext3_journal_start(tree->inode, needed); - struct buffer_head *bh; -- int i; -+ int i, metadata = 0; - - if (IS_ERR(handle)) - return PTR_ERR(handle); -+ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) -+ metadata = 1; - if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { - /* tail removal */ - unsigned long num, start; -@@ -1934,7 +1936,7 @@ ext3_remove_blocks(struct ext3_extents_t - bh = sb_find_get_block(tree->inode->i_sb, start + i); - ext3_forget(handle, 0, tree->inode, bh, start + i); - } -- ext3_free_blocks(handle, tree->inode, start, num); -+ ext3_free_blocks(handle, tree->inode, start, num, 
metadata); - } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { - printk("strange request: removal %lu-%lu from %u:%u\n", - from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.9-full/fs/ext3/inode.c -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/inode.c 2006-06-01 14:58:46.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/inode.c 2006-10-24 12:54:31.000000000 +0400 -@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < keys; i++) -- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); - return err; - } - -@@ -673,7 +673,7 @@ err_out: - if (err == -EAGAIN) - for (i = 0; i < num; i++) - ext3_free_blocks(handle, inode, -- le32_to_cpu(where[i].key), 1); -+ le32_to_cpu(where[i].key), 1, 1); - return err; - } - -@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru - } - } - -- ext3_free_blocks(handle, inode, block_to_free, count); -+ ext3_free_blocks(handle, inode, block_to_free, count, 1); - } - - /** -@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t - ext3_journal_test_restart(handle, inode); - } - -- ext3_free_blocks(handle, inode, nr, 1); -+ ext3_free_blocks(handle, inode, nr, 1, 1); - - if (parent_bh) { - /* -Index: linux-2.6.9-full/fs/ext3/balloc.c -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/balloc.c 2006-03-10 18:20:03.000000000 +0300 -+++ linux-2.6.9-full/fs/ext3/balloc.c 2006-10-24 12:54:31.000000000 +0400 -@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ - * - * Return buffer_head on success or NULL in case of failure. 
- */ --static struct buffer_head * -+struct buffer_head * - read_block_bitmap(struct super_block *sb, unsigned int block_group) - { - struct ext3_group_desc * desc; -@@ -451,24 +451,6 @@ error_return: - return; - } - --/* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks(handle_t *handle, struct inode *inode, -- unsigned long block, unsigned long count) --{ -- struct super_block * sb; -- int dquot_freed_blocks; -- -- sb = inode->i_sb; -- if (!sb) { -- printk ("ext3_free_blocks: nonexistent device"); -- return; -- } -- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); -- if (dquot_freed_blocks) -- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); -- return; --} -- - /* - * For ext3 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This -@@ -1131,7 +1113,7 @@ int ext3_should_retry_alloc(struct super - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. 
- */ --int ext3_new_block(handle_t *handle, struct inode *inode, -+int ext3_new_block_old(handle_t *handle, struct inode *inode, - unsigned long goal, int *errp) - { - struct buffer_head *bitmap_bh = NULL; -Index: linux-2.6.9-full/fs/ext3/xattr.c -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/xattr.c 2006-05-18 23:57:04.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/xattr.c 2006-10-24 12:54:31.000000000 +0400 -@@ -1281,7 +1281,7 @@ ext3_xattr_set_handle2(handle_t *handle, - new_bh = sb_getblk(sb, block); - if (!new_bh) { - getblk_failed: -- ext3_free_blocks(handle, inode, block, 1); -+ ext3_free_blocks(handle, inode, block, 1, 1); - error = -EIO; - goto cleanup; - } -@@ -1328,7 +1328,7 @@ getblk_failed: - if (ce) - mb_cache_entry_free(ce); - ea_bdebug(old_bh, "freeing"); -- ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1); -+ ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1, 1); - - /* ext3_forget() calls bforget() for us, but we - let our caller release old_bh, so we need to -@@ -1427,7 +1427,7 @@ ext3_xattr_delete_inode(handle_t *handle - if (HDR(bh)->h_refcount == cpu_to_le32(1)) { - if (ce) - mb_cache_entry_free(ce); -- ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1); -+ ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1, 1); - get_bh(bh); - ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); - } else { -Index: linux-2.6.9-full/fs/ext3/mballoc.c -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2006-10-23 18:07:54.821533176 +0400 -+++ linux-2.6.9-full/fs/ext3/mballoc.c 2006-10-24 13:00:56.000000000 +0400 -@@ -0,0 +1,2725 @@ -+/* -+ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com -+ * Written by Alex Tomas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the 
Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public Licens -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- -+ */ -+ -+ -+/* -+ * mballoc.c contains the multiblocks allocation routines -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * TODO: -+ * - bitmap read-ahead (proposed by Oleg Drokin aka green) -+ * - track min/max extents in each group for better group selection -+ * - mb_mark_used() may allocate chunk right after splitting buddy -+ * - special flag to advice allocator to look for requested + N blocks -+ * this may improve interaction between extents and mballoc -+ * - tree of groups sorted by number of free blocks -+ * - percpu reservation code (hotpath) -+ * - error handling -+ */ -+ -+/* -+ * with AGRESSIVE_CHECK allocator runs consistency checks over -+ * structures. these checks slow things down a lot -+ */ -+#define AGGRESSIVE_CHECK__ -+ -+/* -+ */ -+#define MB_DEBUG__ -+#ifdef MB_DEBUG -+#define mb_debug(fmt,a...) printk(fmt, ##a) -+#else -+#define mb_debug(fmt,a...) -+#endif -+ -+/* -+ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory -+ * and you can monitor it in /proc/fs/ext3//mb_history -+ */ -+#define EXT3_MB_HISTORY -+ -+/* -+ * How long mballoc can look for a best extent (in found extents) -+ */ -+long ext3_mb_max_to_scan = 500; -+ -+/* -+ * How long mballoc must look for a best extent -+ */ -+long ext3_mb_min_to_scan = 30; -+ -+/* -+ * with 'ext3_mb_stats' allocator will collect stats that will be -+ * shown at umount. 
The collecting costs though! -+ */ -+ -+long ext3_mb_stats = 1; -+ -+/* -+ * for which requests use 2^N search using buddies -+ */ -+long ext3_mb_order2_reqs = 8; -+ -+#ifdef EXT3_BB_MAX_BLOCKS -+#undef EXT3_BB_MAX_BLOCKS -+#endif -+#define EXT3_BB_MAX_BLOCKS 30 -+ -+struct ext3_free_metadata { -+ unsigned short group; -+ unsigned short num; -+ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; -+ struct list_head list; -+}; -+ -+struct ext3_group_info { -+ unsigned long bb_state; -+ unsigned long bb_tid; -+ struct ext3_free_metadata *bb_md_cur; -+ unsigned short bb_first_free; -+ unsigned short bb_free; -+ unsigned short bb_fragments; -+ unsigned short bb_counters[]; -+}; -+ -+ -+#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 -+#define EXT3_GROUP_INFO_LOCKED_BIT 1 -+ -+#define EXT3_MB_GRP_NEED_INIT(grp) \ -+ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) -+ -+struct ext3_free_extent { -+ __u16 fe_start; -+ __u16 fe_len; -+ __u16 fe_group; -+}; -+ -+struct ext3_allocation_context { -+ struct super_block *ac_sb; -+ -+ /* search goals */ -+ struct ext3_free_extent ac_g_ex; -+ -+ /* the best found extent */ -+ struct ext3_free_extent ac_b_ex; -+ -+ /* number of iterations done. 
we have to track to limit searching */ -+ unsigned long ac_ex_scanned; -+ __u16 ac_groups_scanned; -+ __u16 ac_found; -+ __u16 ac_tail; -+ __u16 ac_buddy; -+ __u8 ac_status; -+ __u8 ac_flags; /* allocation hints */ -+ __u8 ac_criteria; -+ __u8 ac_repeats; -+ __u8 ac_2order; /* if request is to allocate 2^N blocks and -+ * N > 0, the field stores N, otherwise 0 */ -+ -+ struct page *ac_buddy_page; -+ struct page *ac_bitmap_page; -+}; -+ -+#define AC_STATUS_CONTINUE 1 -+#define AC_STATUS_FOUND 2 -+#define AC_STATUS_BREAK 3 -+ -+struct ext3_mb_history { -+ struct ext3_free_extent goal; /* goal allocation */ -+ struct ext3_free_extent result; /* result allocation */ -+ unsigned pid; -+ unsigned ino; -+ __u16 found; /* how many extents have been found */ -+ __u16 groups; /* how many groups have been scanned */ -+ __u16 tail; /* what tail broke some buddy */ -+ __u16 buddy; /* buddy the tail ^^^ broke */ -+ __u8 cr; /* which phase the result extent was found at */ -+ __u8 merged; -+}; -+ -+struct ext3_buddy { -+ struct page *bd_buddy_page; -+ void *bd_buddy; -+ struct page *bd_bitmap_page; -+ void *bd_bitmap; -+ struct ext3_group_info *bd_info; -+ struct super_block *bd_sb; -+ __u16 bd_blkbits; -+ __u16 bd_group; -+}; -+#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) -+#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) -+ -+#ifndef EXT3_MB_HISTORY -+#define ext3_mb_store_history(sb,ino,ac) -+#else -+static void ext3_mb_store_history(struct super_block *, unsigned ino, -+ struct ext3_allocation_context *ac); -+#endif -+ -+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) -+ -+static struct proc_dir_entry *proc_root_ext3; -+ -+struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); -+void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); -+void ext3_mb_free_committed_blocks(struct super_block *); -+ -+#if BITS_PER_LONG == 64 -+#define mb_correct_addr_and_bit(bit,addr) \ -+{ \ -+ bit += ((unsigned long) addr & 7UL) << 
3; \ -+ addr = (void *) ((unsigned long) addr & ~7UL); \ -+} -+#elif BITS_PER_LONG == 32 -+#define mb_correct_addr_and_bit(bit,addr) \ -+{ \ -+ bit += ((unsigned long) addr & 3UL) << 3; \ -+ addr = (void *) ((unsigned long) addr & ~3UL); \ -+} -+#else -+#error "how many bits you are?!" -+#endif -+ -+static inline int mb_test_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ return ext2_test_bit(bit, addr); -+} -+ -+static inline void mb_set_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit(bit, addr); -+} -+ -+static inline void mb_set_bit_atomic(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_set_bit_atomic(NULL, bit, addr); -+} -+ -+static inline void mb_clear_bit(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit(bit, addr); -+} -+ -+static inline void mb_clear_bit_atomic(int bit, void *addr) -+{ -+ mb_correct_addr_and_bit(bit,addr); -+ ext2_clear_bit_atomic(NULL, bit, addr); -+} -+ -+static inline int mb_find_next_zero_bit(void *addr, int max, int start) -+{ -+ int fix; -+#if BITS_PER_LONG == 64 -+ fix = ((unsigned long) addr & 7UL) << 3; -+ addr = (void *) ((unsigned long) addr & ~7UL); -+#elif BITS_PER_LONG == 32 -+ fix = ((unsigned long) addr & 3UL) << 3; -+ addr = (void *) ((unsigned long) addr & ~3UL); -+#else -+#error "how many bits you are?!" 
-+#endif -+ max += fix; -+ start += fix; -+ return ext2_find_next_zero_bit(addr, max, start) - fix; -+} -+ -+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) -+{ -+ char *bb; -+ -+ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); -+ J_ASSERT(max != NULL); -+ -+ if (order > e3b->bd_blkbits + 1) { -+ *max = 0; -+ return NULL; -+ } -+ -+ /* at order 0 we see each particular block */ -+ *max = 1 << (e3b->bd_blkbits + 3); -+ if (order == 0) -+ return EXT3_MB_BITMAP(e3b); -+ -+ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; -+ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; -+ -+ return bb; -+} -+ -+#ifdef AGGRESSIVE_CHECK -+ -+static void mb_check_buddy(struct ext3_buddy *e3b) -+{ -+ int order = e3b->bd_blkbits + 1; -+ int max, max2, i, j, k, count; -+ int fragments = 0, fstart; -+ void *buddy, *buddy2; -+ -+ if (!test_opt(e3b->bd_sb, MBALLOC)) -+ return; -+ -+ { -+ static int mb_check_counter = 0; -+ if (mb_check_counter++ % 300 != 0) -+ return; -+ } -+ -+ while (order > 1) { -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); -+ buddy2 = mb_find_buddy(e3b, order - 1, &max2); -+ J_ASSERT(buddy2); -+ J_ASSERT(buddy != buddy2); -+ J_ASSERT(max * 2 == max2); -+ -+ count = 0; -+ for (i = 0; i < max; i++) { -+ -+ if (mb_test_bit(i, buddy)) { -+ /* only single bit in buddy2 may be 1 */ -+ if (!mb_test_bit(i << 1, buddy2)) -+ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); -+ else if (!mb_test_bit((i << 1) + 1, buddy2)) -+ J_ASSERT(mb_test_bit(i << 1, buddy2)); -+ continue; -+ } -+ -+ /* both bits in buddy2 must be 0 */ -+ J_ASSERT(mb_test_bit(i << 1, buddy2)); -+ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); -+ -+ for (j = 0; j < (1 << order); j++) { -+ k = (i * (1 << order)) + j; -+ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); -+ } -+ count++; -+ } -+ J_ASSERT(e3b->bd_info->bb_counters[order] == count); -+ order--; -+ } -+ -+ fstart = -1; -+ buddy = mb_find_buddy(e3b, 0, &max); -+ for (i = 0; i < max; i++) { -+ if 
(!mb_test_bit(i, buddy)) { -+ J_ASSERT(i >= e3b->bd_info->bb_first_free); -+ if (fstart == -1) { -+ fragments++; -+ fstart = i; -+ } -+ continue; -+ } -+ fstart = -1; -+ /* check used bits only */ -+ for (j = 0; j < e3b->bd_blkbits + 1; j++) { -+ buddy2 = mb_find_buddy(e3b, j, &max2); -+ k = i >> j; -+ J_ASSERT(k < max2); -+ J_ASSERT(mb_test_bit(k, buddy2)); -+ } -+ } -+ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); -+ J_ASSERT(e3b->bd_info->bb_fragments == fragments); -+} -+ -+#else -+#define mb_check_buddy(e3b) -+#endif -+ -+/* find most significant bit */ -+static int inline fmsb(unsigned short word) -+{ -+ int order; -+ -+ if (word > 255) { -+ order = 7; -+ word >>= 8; -+ } else { -+ order = -1; -+ } -+ -+ do { -+ order++; -+ word >>= 1; -+ } while (word != 0); -+ -+ return order; -+} -+ -+static void inline -+ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, -+ int len, struct ext3_group_info *grp) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ unsigned short min, max, chunk, border; -+ -+ mb_debug("mark %u/%u free\n", first, len); -+ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); -+ -+ border = 2 << sb->s_blocksize_bits; -+ -+ while (len > 0) { -+ /* find how many blocks can be covered since this position */ -+ max = ffs(first | border) - 1; -+ -+ /* find how many blocks of power 2 we need to mark */ -+ min = fmsb(len); -+ -+ mb_debug(" %u/%u -> max %u, min %u\n", -+ first & ((2 << sb->s_blocksize_bits) - 1), -+ len, max, min); -+ -+ if (max < min) -+ min = max; -+ chunk = 1 << min; -+ -+ /* mark multiblock chunks only */ -+ grp->bb_counters[min]++; -+ if (min > 0) { -+ mb_debug(" set %u at %u \n", first >> min, -+ sbi->s_mb_offsets[min]); -+ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); -+ } -+ -+ len -= chunk; -+ first += chunk; -+ } -+} -+ -+static void -+ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, -+ int group) -+{ -+ struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); -+ 
unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); -+ unsigned short i = 0, first, len; -+ unsigned free = 0, fragments = 0; -+ unsigned long long period = get_cycles(); -+ -+ i = mb_find_next_zero_bit(bitmap, max, 0); -+ grp->bb_first_free = i; -+ while (i < max) { -+ fragments++; -+ first = i; -+ i = ext2_find_next_le_bit(bitmap, max, i); -+ len = i - first; -+ free += len; -+ if (len > 1) -+ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); -+ else -+ grp->bb_counters[0]++; -+ if (i < max) -+ i = mb_find_next_zero_bit(bitmap, max, i); -+ } -+ grp->bb_fragments = fragments; -+ -+ /* bb_state shouldn't being modified because all -+ * others waits for init completion on page lock */ -+ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); -+ if (free != grp->bb_free) { -+ printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n", -+ group, free, grp->bb_free); -+ grp->bb_free = free; -+ } -+ -+ period = get_cycles() - period; -+ spin_lock(&EXT3_SB(sb)->s_bal_lock); -+ EXT3_SB(sb)->s_mb_buddies_generated++; -+ EXT3_SB(sb)->s_mb_generation_time += period; -+ spin_unlock(&EXT3_SB(sb)->s_bal_lock); -+} -+ -+static int ext3_mb_init_cache(struct page *page) -+{ -+ int blocksize, blocks_per_page, groups_per_page; -+ int err = 0, i, first_group, first_block; -+ struct super_block *sb; -+ struct buffer_head *bhs; -+ struct buffer_head **bh; -+ struct inode *inode; -+ char *data, *bitmap; -+ -+ mb_debug("init page %lu\n", page->index); -+ -+ inode = page->mapping->host; -+ sb = inode->i_sb; -+ blocksize = 1 << inode->i_blkbits; -+ blocks_per_page = PAGE_CACHE_SIZE / blocksize; -+ -+ groups_per_page = blocks_per_page >> 1; -+ if (groups_per_page == 0) -+ groups_per_page = 1; -+ -+ /* allocate buffer_heads to read bitmaps */ -+ if (groups_per_page > 1) { -+ err = -ENOMEM; -+ i = sizeof(struct buffer_head *) * groups_per_page; -+ bh = kmalloc(i, GFP_NOFS); -+ if (bh == NULL) -+ goto out; -+ memset(bh, 0, i); -+ } else -+ bh = &bhs; -+ -+ first_group = page->index * 
blocks_per_page / 2; -+ -+ /* read all groups the page covers into the cache */ -+ for (i = 0; i < groups_per_page; i++) { -+ struct ext3_group_desc * desc; -+ -+ if (first_group + i >= EXT3_SB(sb)->s_groups_count) -+ break; -+ -+ err = -EIO; -+ desc = ext3_get_group_desc(sb, first_group + i, NULL); -+ if (desc == NULL) -+ goto out; -+ -+ err = -ENOMEM; -+ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); -+ if (bh[i] == NULL) -+ goto out; -+ -+ if (buffer_uptodate(bh[i])) -+ continue; -+ -+ lock_buffer(bh[i]); -+ if (buffer_uptodate(bh[i])) { -+ unlock_buffer(bh[i]); -+ continue; -+ } -+ -+ get_bh(bh[i]); -+ bh[i]->b_end_io = end_buffer_read_sync; -+ submit_bh(READ, bh[i]); -+ mb_debug("read bitmap for group %u\n", first_group + i); -+ } -+ -+ /* wait for I/O completion */ -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ wait_on_buffer(bh[i]); -+ -+ err = -EIO; -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ if (!buffer_uptodate(bh[i])) -+ goto out; -+ -+ first_block = page->index * blocks_per_page; -+ for (i = 0; i < blocks_per_page; i++) { -+ int group; -+ -+ group = (first_block + i) >> 1; -+ if (group >= EXT3_SB(sb)->s_groups_count) -+ break; -+ -+ data = page_address(page) + (i * blocksize); -+ bitmap = bh[group - first_group]->b_data; -+ -+ if ((first_block + i) & 1) { -+ /* this is block of buddy */ -+ mb_debug("put buddy for group %u in page %lu/%x\n", -+ group, page->index, i * blocksize); -+ memset(data, 0xff, blocksize); -+ EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; -+ memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, -+ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); -+ ext3_mb_generate_buddy(sb, data, bitmap, group); -+ } else { -+ /* this is block of bitmap */ -+ mb_debug("put bitmap for group %u in page %lu/%x\n", -+ group, page->index, i * blocksize); -+ memcpy(data, bitmap, blocksize); -+ } -+ } -+ SetPageUptodate(page); -+ -+out: -+ if (bh) { -+ for (i = 0; i < groups_per_page && bh[i]; i++) -+ brelse(bh[i]); -+ if (bh != 
&bhs) -+ kfree(bh); -+ } -+ return err; -+} -+ -+static int ext3_mb_load_buddy(struct super_block *sb, int group, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct inode *inode = sbi->s_buddy_cache; -+ int blocks_per_page, block, pnum, poff; -+ struct page *page; -+ -+ mb_debug("load group %u\n", group); -+ -+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; -+ -+ e3b->bd_blkbits = sb->s_blocksize_bits; -+ e3b->bd_info = EXT3_GROUP_INFO(sb, group); -+ e3b->bd_sb = sb; -+ e3b->bd_group = group; -+ e3b->bd_buddy_page = NULL; -+ e3b->bd_bitmap_page = NULL; -+ -+ block = group * 2; -+ pnum = block / blocks_per_page; -+ poff = block % blocks_per_page; -+ -+ /* we could use find_or_create_page(), but it locks page -+ * what we'd like to avoid in fast path ... */ -+ page = find_get_page(inode->i_mapping, pnum); -+ if (page == NULL || !PageUptodate(page)) { -+ if (page) -+ page_cache_release(page); -+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); -+ if (page) { -+ BUG_ON(page->mapping != inode->i_mapping); -+ if (!PageUptodate(page)) -+ ext3_mb_init_cache(page); -+ unlock_page(page); -+ } -+ } -+ if (page == NULL || !PageUptodate(page)) -+ goto err; -+ e3b->bd_bitmap_page = page; -+ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); -+ mark_page_accessed(page); -+ -+ block++; -+ pnum = block / blocks_per_page; -+ poff = block % blocks_per_page; -+ -+ page = find_get_page(inode->i_mapping, pnum); -+ if (page == NULL || !PageUptodate(page)) { -+ if (page) -+ page_cache_release(page); -+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); -+ if (page) { -+ BUG_ON(page->mapping != inode->i_mapping); -+ if (!PageUptodate(page)) -+ ext3_mb_init_cache(page); -+ unlock_page(page); -+ } -+ } -+ if (page == NULL || !PageUptodate(page)) -+ goto err; -+ e3b->bd_buddy_page = page; -+ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); -+ mark_page_accessed(page); -+ -+ J_ASSERT(e3b->bd_bitmap_page 
!= NULL); -+ J_ASSERT(e3b->bd_buddy_page != NULL); -+ -+ return 0; -+ -+err: -+ if (e3b->bd_bitmap_page) -+ page_cache_release(e3b->bd_bitmap_page); -+ if (e3b->bd_buddy_page) -+ page_cache_release(e3b->bd_buddy_page); -+ e3b->bd_buddy = NULL; -+ e3b->bd_bitmap = NULL; -+ return -EIO; -+} -+ -+static void ext3_mb_release_desc(struct ext3_buddy *e3b) -+{ -+ if (e3b->bd_bitmap_page) -+ page_cache_release(e3b->bd_bitmap_page); -+ if (e3b->bd_buddy_page) -+ page_cache_release(e3b->bd_buddy_page); -+} -+ -+ -+static inline void -+ext3_lock_group(struct super_block *sb, int group) -+{ -+ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_GROUP_INFO(sb, group)->bb_state); -+} -+ -+static inline void -+ext3_unlock_group(struct super_block *sb, int group) -+{ -+ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, -+ &EXT3_GROUP_INFO(sb, group)->bb_state); -+} -+ -+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) -+{ -+ int order = 1; -+ void *bb; -+ -+ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); -+ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); -+ -+ bb = EXT3_MB_BUDDY(e3b); -+ while (order <= e3b->bd_blkbits + 1) { -+ block = block >> 1; -+ if (!mb_test_bit(block, bb)) { -+ /* this block is part of buddy of order 'order' */ -+ return order; -+ } -+ bb += 1 << (e3b->bd_blkbits - order); -+ order++; -+ } -+ return 0; -+} -+ -+static inline void mb_clear_bits(void *bm, int cur, int len) -+{ -+ __u32 *addr; -+ -+ len = cur + len; -+ while (cur < len) { -+ if ((cur & 31) == 0 && (len - cur) >= 32) { -+ /* fast path: clear whole word at once */ -+ addr = bm + (cur >> 3); -+ *addr = 0; -+ cur += 32; -+ continue; -+ } -+ mb_clear_bit_atomic(cur, bm); -+ cur++; -+ } -+} -+ -+static inline void mb_set_bits(void *bm, int cur, int len) -+{ -+ __u32 *addr; -+ -+ len = cur + len; -+ while (cur < len) { -+ if ((cur & 31) == 0 && (len - cur) >= 32) { -+ /* fast path: clear whole word at once */ -+ addr = bm + (cur >> 3); -+ *addr = 0xffffffff; -+ cur += 32; -+ 
continue; -+ } -+ mb_set_bit_atomic(cur, bm); -+ cur++; -+ } -+} -+ -+static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) -+{ -+ int block = 0, max = 0, order; -+ void *buddy, *buddy2; -+ -+ mb_check_buddy(e3b); -+ -+ e3b->bd_info->bb_free += count; -+ if (first < e3b->bd_info->bb_first_free) -+ e3b->bd_info->bb_first_free = first; -+ -+ /* let's maintain fragments counter */ -+ if (first != 0) -+ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); -+ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) -+ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); -+ if (block && max) -+ e3b->bd_info->bb_fragments--; -+ else if (!block && !max) -+ e3b->bd_info->bb_fragments++; -+ -+ /* let's maintain buddy itself */ -+ while (count-- > 0) { -+ block = first++; -+ order = 0; -+ -+ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); -+ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); -+ e3b->bd_info->bb_counters[order]++; -+ -+ /* start of the buddy */ -+ buddy = mb_find_buddy(e3b, order, &max); -+ -+ do { -+ block &= ~1UL; -+ if (mb_test_bit(block, buddy) || -+ mb_test_bit(block + 1, buddy)) -+ break; -+ -+ /* both the buddies are free, try to coalesce them */ -+ buddy2 = mb_find_buddy(e3b, order + 1, &max); -+ -+ if (!buddy2) -+ break; -+ -+ if (order > 0) { -+ /* for special purposes, we don't set -+ * free bits in bitmap */ -+ mb_set_bit(block, buddy); -+ mb_set_bit(block + 1, buddy); -+ } -+ e3b->bd_info->bb_counters[order]--; -+ e3b->bd_info->bb_counters[order]--; -+ -+ block = block >> 1; -+ order++; -+ e3b->bd_info->bb_counters[order]++; -+ -+ mb_clear_bit(block, buddy2); -+ buddy = buddy2; -+ } while (1); -+ } -+ mb_check_buddy(e3b); -+ -+ return 0; -+} -+ -+static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, -+ int needed, struct ext3_free_extent *ex) -+{ -+ int next = block, max, ord; -+ void *buddy; -+ -+ J_ASSERT(ex != NULL); -+ -+ buddy = mb_find_buddy(e3b, order, &max); -+ J_ASSERT(buddy); -+ J_ASSERT(block < max); -+ 
if (mb_test_bit(block, buddy)) { -+ ex->fe_len = 0; -+ ex->fe_start = 0; -+ ex->fe_group = 0; -+ return 0; -+ } -+ -+ if (likely(order == 0)) { -+ /* find actual order */ -+ order = mb_find_order_for_block(e3b, block); -+ block = block >> order; -+ } -+ -+ ex->fe_len = 1 << order; -+ ex->fe_start = block << order; -+ ex->fe_group = e3b->bd_group; -+ -+ /* calc difference from given start */ -+ next = next - ex->fe_start; -+ ex->fe_len -= next; -+ ex->fe_start += next; -+ -+ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { -+ -+ if (block + 1 >= max) -+ break; -+ -+ next = (block + 1) * (1 << order); -+ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) -+ break; -+ -+ ord = mb_find_order_for_block(e3b, next); -+ -+ order = ord; -+ block = next >> order; -+ ex->fe_len += 1 << order; -+ } -+ -+ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); -+ return ex->fe_len; -+} -+ -+static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) -+{ -+ int ord, mlen = 0, max = 0, cur; -+ int start = ex->fe_start; -+ int len = ex->fe_len; -+ unsigned ret = 0; -+ int len0 = len; -+ void *buddy; -+ -+ mb_check_buddy(e3b); -+ -+ e3b->bd_info->bb_free -= len; -+ if (e3b->bd_info->bb_first_free == start) -+ e3b->bd_info->bb_first_free += len; -+ -+ /* let's maintain fragments counter */ -+ if (start != 0) -+ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); -+ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) -+ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); -+ if (mlen && max) -+ e3b->bd_info->bb_fragments++; -+ else if (!mlen && !max) -+ e3b->bd_info->bb_fragments--; -+ -+ /* let's maintain buddy itself */ -+ while (len) { -+ ord = mb_find_order_for_block(e3b, start); -+ -+ if (((start >> ord) << ord) == start && len >= (1 << ord)) { -+ /* the whole chunk may be allocated at once! 
*/ -+ mlen = 1 << ord; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ J_ASSERT((start >> ord) < max); -+ mb_set_bit(start >> ord, buddy); -+ e3b->bd_info->bb_counters[ord]--; -+ start += mlen; -+ len -= mlen; -+ J_ASSERT(len >= 0); -+ continue; -+ } -+ -+ /* store for history */ -+ if (ret == 0) -+ ret = len | (ord << 16); -+ -+ /* we have to split large buddy */ -+ J_ASSERT(ord > 0); -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_set_bit(start >> ord, buddy); -+ e3b->bd_info->bb_counters[ord]--; -+ -+ ord--; -+ cur = (start >> ord) & ~1U; -+ buddy = mb_find_buddy(e3b, ord, &max); -+ mb_clear_bit(cur, buddy); -+ mb_clear_bit(cur + 1, buddy); -+ e3b->bd_info->bb_counters[ord]++; -+ e3b->bd_info->bb_counters[ord]++; -+ } -+ -+ /* now drop all the bits in bitmap */ -+ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); -+ -+ mb_check_buddy(e3b); -+ -+ return ret; -+} -+ -+/* -+ * Must be called under group lock! -+ */ -+static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ unsigned long ret; -+ -+ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); -+ ret = mb_mark_used(e3b, &ac->ac_b_ex); -+ -+ ac->ac_status = AC_STATUS_FOUND; -+ ac->ac_tail = ret & 0xffff; -+ ac->ac_buddy = ret >> 16; -+ -+ /* hold in-core structures until allocated -+ * blocks are marked non-free in on-disk bitmap */ -+ ac->ac_buddy_page = e3b->bd_buddy_page; -+ page_cache_get(e3b->bd_buddy_page); -+ ac->ac_bitmap_page = e3b->bd_bitmap_page; -+ page_cache_get(e3b->bd_bitmap_page); -+} -+ -+/* -+ * The routine checks whether found extent is good enough. If it is, -+ * then the extent gets marked used and flag is set to the context -+ * to stop scanning. Otherwise, the extent is compared with the -+ * previous found extent and if new one is better, then it's stored -+ * in the context. Later, the best found extent will be used, if -+ * mballoc can't find good enough extent. -+ * -+ * FIXME: real allocation policy is to be designed yet! 
-+ */ -+static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, -+ struct ext3_free_extent *ex, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_free_extent *bex = &ac->ac_b_ex; -+ struct ext3_free_extent *gex = &ac->ac_g_ex; -+ -+ J_ASSERT(ex->fe_len > 0); -+ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); -+ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); -+ -+ ac->ac_found++; -+ -+ /* -+ * The special case - take what you catch first -+ */ -+ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * Let's check whether the chunk is good enough -+ */ -+ if (ex->fe_len == gex->fe_len) { -+ *bex = *ex; -+ ext3_mb_use_best_found(ac, e3b); -+ return; -+ } -+ -+ /* -+ * If this is first found extent, just store it in the context -+ */ -+ if (bex->fe_len == 0) { -+ *bex = *ex; -+ return; -+ } -+ -+ /* -+ * If new found extent is better, store it in the context -+ */ -+ if (bex->fe_len < gex->fe_len) { -+ /* if the request isn't satisfied, any found extent -+ * larger than previous best one is better */ -+ if (ex->fe_len > bex->fe_len) -+ *bex = *ex; -+ } else if (ex->fe_len > gex->fe_len) { -+ /* if the request is satisfied, then we try to find -+ * an extent that still satisfy the request, but is -+ * smaller than previous one */ -+ *bex = *ex; -+ } -+ -+ /* -+ * Let's scan at least few extents and don't pick up a first one -+ */ -+ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) -+ ac->ac_status = AC_STATUS_BREAK; -+ -+ /* -+ * We don't want to scan for a whole year -+ */ -+ if (ac->ac_found > ext3_mb_max_to_scan) -+ ac->ac_status = AC_STATUS_BREAK; -+} -+ -+static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct ext3_free_extent ex = ac->ac_b_ex; -+ int group = ex.fe_group, max, err; -+ -+ J_ASSERT(ex.fe_len > 0); -+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); -+ if 
(err) -+ return err; -+ -+ ext3_lock_group(ac->ac_sb, group); -+ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); -+ -+ if (max > 0) { -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ -+ ext3_unlock_group(ac->ac_sb, group); -+ -+ ext3_mb_release_desc(e3b); -+ -+ return 0; -+} -+ -+static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ int group = ac->ac_g_ex.fe_group, max, err; -+ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); -+ struct ext3_super_block *es = sbi->s_es; -+ struct ext3_free_extent ex; -+ -+ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); -+ if (err) -+ return err; -+ -+ ext3_lock_group(ac->ac_sb, group); -+ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, -+ ac->ac_g_ex.fe_len, &ex); -+ -+ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { -+ unsigned long start; -+ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + -+ ex.fe_start + le32_to_cpu(es->s_first_data_block)); -+ if (start % sbi->s_stripe == 0) { -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ } else if (max >= ac->ac_g_ex.fe_len) { -+ J_ASSERT(ex.fe_len > 0); -+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); -+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { -+ /* Sometimes, caller may want to merge even small -+ * number of blocks to an existing extent */ -+ J_ASSERT(ex.fe_len > 0); -+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); -+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ } -+ ext3_unlock_group(ac->ac_sb, group); -+ -+ ext3_mb_release_desc(e3b); -+ -+ return 0; -+} -+ -+/* -+ * The routine scans buddy structures (not bitmap!) 
from given order -+ * to max order and tries to find big enough chunk to satisfy the req -+ */ -+static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ struct ext3_group_info *grp = e3b->bd_info; -+ void *buddy; -+ int i, k, max; -+ -+ J_ASSERT(ac->ac_2order > 0); -+ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { -+ if (grp->bb_counters[i] == 0) -+ continue; -+ -+ buddy = mb_find_buddy(e3b, i, &max); -+ if (buddy == NULL) { -+ printk(KERN_ALERT "looking for wrong order?\n"); -+ break; -+ } -+ -+ k = mb_find_next_zero_bit(buddy, max, 0); -+ J_ASSERT(k < max); -+ -+ ac->ac_found++; -+ -+ ac->ac_b_ex.fe_len = 1 << i; -+ ac->ac_b_ex.fe_start = k << i; -+ ac->ac_b_ex.fe_group = e3b->bd_group; -+ -+ ext3_mb_use_best_found(ac, e3b); -+ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); -+ -+ if (unlikely(ext3_mb_stats)) -+ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); -+ -+ break; -+ } -+} -+ -+/* -+ * The routine scans the group and measures all found extents. -+ * In order to optimize scanning, caller must pass number of -+ * free blocks in the group, so the routine can know upper limit. 
-+ */ -+static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ void *bitmap = EXT3_MB_BITMAP(e3b); -+ struct ext3_free_extent ex; -+ int i, free; -+ -+ free = e3b->bd_info->bb_free; -+ J_ASSERT(free > 0); -+ -+ i = e3b->bd_info->bb_first_free; -+ -+ while (free && ac->ac_status == AC_STATUS_CONTINUE) { -+ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); -+ if (i >= sb->s_blocksize * 8) { -+ J_ASSERT(free == 0); -+ break; -+ } -+ -+ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); -+ J_ASSERT(ex.fe_len > 0); -+ J_ASSERT(free >= ex.fe_len); -+ -+ ext3_mb_measure_extent(ac, &ex, e3b); -+ -+ i += ex.fe_len; -+ free -= ex.fe_len; -+ } -+} -+ -+/* -+ * This is a special case for storages like raid5 -+ * we try to find stripe-aligned chunks for stripe-size requests -+ */ -+static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, -+ struct ext3_buddy *e3b) -+{ -+ struct super_block *sb = ac->ac_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ void *bitmap = EXT3_MB_BITMAP(e3b); -+ struct ext3_free_extent ex; -+ unsigned long i, max; -+ -+ J_ASSERT(sbi->s_stripe != 0); -+ -+ /* find first stripe-aligned block */ -+ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) + -+ le32_to_cpu(sbi->s_es->s_first_data_block); -+ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; -+ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb); -+ -+ while (i < sb->s_blocksize * 8) { -+ if (!mb_test_bit(i, bitmap)) { -+ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); -+ if (max >= sbi->s_stripe) { -+ ac->ac_found++; -+ ac->ac_b_ex = ex; -+ ext3_mb_use_best_found(ac, e3b); -+ break; -+ } -+ } -+ i += sbi->s_stripe; -+ } -+} -+ -+static int ext3_mb_good_group(struct ext3_allocation_context *ac, -+ int group, int cr) -+{ -+ struct ext3_group_info *grp = EXT3_GROUP_INFO(ac->ac_sb, group); -+ unsigned free, fragments, i, bits; -+ -+ 
J_ASSERT(cr >= 0 && cr < 4); -+ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); -+ -+ free = grp->bb_free; -+ fragments = grp->bb_fragments; -+ if (free == 0) -+ return 0; -+ if (fragments == 0) -+ return 0; -+ -+ switch (cr) { -+ case 0: -+ J_ASSERT(ac->ac_2order != 0); -+ bits = ac->ac_sb->s_blocksize_bits + 1; -+ for (i = ac->ac_2order; i <= bits; i++) -+ if (grp->bb_counters[i] > 0) -+ return 1; -+ break; -+ case 1: -+ if ((free / fragments) >= ac->ac_g_ex.fe_len) -+ return 1; -+ break; -+ case 2: -+ if (free >= ac->ac_g_ex.fe_len) -+ return 1; -+ break; -+ case 3: -+ return 1; -+ default: -+ BUG(); -+ } -+ -+ return 0; -+} -+ -+int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, -+ unsigned long goal, int *len, int flags, int *errp) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_allocation_context ac; -+ int i, group, block, cr, err = 0; -+ struct ext3_group_desc *gdp; -+ struct ext3_super_block *es; -+ struct buffer_head *gdp_bh; -+ struct ext3_sb_info *sbi; -+ struct super_block *sb; -+ struct ext3_buddy e3b; -+ -+ J_ASSERT(len != NULL); -+ J_ASSERT(*len > 0); -+ -+ sb = inode->i_sb; -+ if (!sb) { -+ printk("ext3_mb_new_nblocks: nonexistent device"); -+ return 0; -+ } -+ -+ if (!test_opt(sb, MBALLOC)) { -+ static int ext3_mballoc_warning = 0; -+ if (ext3_mballoc_warning == 0) { -+ printk(KERN_ERR "EXT3-fs: multiblock request with " -+ "mballoc disabled!\n"); -+ ext3_mballoc_warning++; -+ } -+ *len = 1; -+ err = ext3_new_block_old(handle, inode, goal, errp); -+ return err; -+ } -+ -+ ext3_mb_poll_new_transaction(sb, handle); -+ -+ sbi = EXT3_SB(sb); -+ es = EXT3_SB(sb)->s_es; -+ -+ /* -+ * We can't allocate > group size -+ */ -+ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) -+ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; -+ -+ if (!(flags & EXT3_MB_HINT_RESERVED)) { -+ /* someone asks for non-reserved blocks */ -+ BUG_ON(*len > 1); -+ err = ext3_mb_reserve_blocks(sb, 1); -+ if (err) { -+ *errp = err; -+ return 0; -+ } -+ } -+ -+ ac.ac_buddy_page = NULL; 
-+ ac.ac_bitmap_page = NULL; -+ -+ /* -+ * Check quota for allocation of this blocks. -+ */ -+ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) -+ *len -= 1; -+ if (*len == 0) { -+ *errp = -EDQUOT; -+ block = 0; -+ goto out; -+ } -+ -+ /* start searching from the goal */ -+ if (goal < le32_to_cpu(es->s_first_data_block) || -+ goal >= le32_to_cpu(es->s_blocks_count)) -+ goal = le32_to_cpu(es->s_first_data_block); -+ group = (goal - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ block = ((goal - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb)); -+ -+ /* set up allocation goals */ -+ ac.ac_b_ex.fe_group = 0; -+ ac.ac_b_ex.fe_start = 0; -+ ac.ac_b_ex.fe_len = 0; -+ ac.ac_status = AC_STATUS_CONTINUE; -+ ac.ac_groups_scanned = 0; -+ ac.ac_ex_scanned = 0; -+ ac.ac_found = 0; -+ ac.ac_sb = inode->i_sb; -+ ac.ac_g_ex.fe_group = group; -+ ac.ac_g_ex.fe_start = block; -+ ac.ac_g_ex.fe_len = *len; -+ ac.ac_flags = flags; -+ ac.ac_2order = 0; -+ ac.ac_criteria = 0; -+ -+ if (*len == 1 && sbi->s_stripe) { -+ /* looks like a metadata, let's use a dirty hack for raid5 -+ * move all metadata in first groups in hope to hit cached -+ * sectors and thus avoid read-modify cycles in raid5 */ -+ ac.ac_g_ex.fe_group = group = 0; -+ } -+ -+ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ -+ i = ffs(*len); -+ if (i >= ext3_mb_order2_reqs) { -+ i--; -+ if ((*len & (~(1 << i))) == 0) -+ ac.ac_2order = i; -+ } -+ -+ /* first, try the goal */ -+ err = ext3_mb_find_by_goal(&ac, &e3b); -+ if (err) -+ goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) -+ goto found; -+ -+ /* Let's just scan groups to find more-less suitable blocks */ -+ cr = ac.ac_2order ? 
0 : 1; -+repeat: -+ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { -+ ac.ac_criteria = cr; -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { -+ if (group == EXT3_SB(sb)->s_groups_count) -+ group = 0; -+ -+ if (EXT3_MB_GRP_NEED_INIT(EXT3_GROUP_INFO(sb, group))) { -+ /* we need full data about the group -+ * to make a good selection */ -+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); -+ if (err) -+ goto out_err; -+ ext3_mb_release_desc(&e3b); -+ } -+ -+ /* check is group good for our criteries */ -+ if (!ext3_mb_good_group(&ac, group, cr)) -+ continue; -+ -+ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); -+ if (err) -+ goto out_err; -+ -+ ext3_lock_group(sb, group); -+ if (!ext3_mb_good_group(&ac, group, cr)) { -+ /* someone did allocation from this group */ -+ ext3_unlock_group(sb, group); -+ ext3_mb_release_desc(&e3b); -+ continue; -+ } -+ -+ ac.ac_groups_scanned++; -+ if (cr == 0) -+ ext3_mb_simple_scan_group(&ac, &e3b); -+ else if (cr == 1 && *len == sbi->s_stripe) -+ ext3_mb_scan_aligned(&ac, &e3b); -+ else -+ ext3_mb_complex_scan_group(&ac, &e3b); -+ -+ ext3_unlock_group(sb, group); -+ -+ ext3_mb_release_desc(&e3b); -+ -+ if (ac.ac_status != AC_STATUS_CONTINUE) -+ break; -+ } -+ } -+ -+ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND && -+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { -+ /* -+ * We've been searching too long. Let's try to allocate -+ * the best chunk we've found so far -+ */ -+ -+ /*if (ac.ac_found > ext3_mb_max_to_scan) -+ printk(KERN_DEBUG "EXT3-fs: too long searching at " -+ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len, -+ ac.ac_g_ex.fe_len);*/ -+ ext3_mb_try_best_found(&ac, &e3b); -+ if (ac.ac_status != AC_STATUS_FOUND) { -+ /* -+ * Someone more lucky has already allocated it. 
-+ * The only thing we can do is just take first -+ * found block(s) -+ printk(KERN_DEBUG "EXT3-fs: someone won our chunk\n"); -+ */ -+ ac.ac_b_ex.fe_group = 0; -+ ac.ac_b_ex.fe_start = 0; -+ ac.ac_b_ex.fe_len = 0; -+ ac.ac_status = AC_STATUS_CONTINUE; -+ ac.ac_flags |= EXT3_MB_HINT_FIRST; -+ cr = 3; -+ goto repeat; -+ } -+ } -+ -+ if (ac.ac_status != AC_STATUS_FOUND) { -+ /* -+ * We aren't lucky definitely -+ */ -+ DQUOT_FREE_BLOCK(inode, *len); -+ *errp = -ENOSPC; -+ block = 0; -+#if 1 -+ printk(KERN_ERR "EXT3-fs: can't allocate: status %d flags %d\n", -+ ac.ac_status, ac.ac_flags); -+ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d cr %d\n", -+ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, -+ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); -+ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", -+ sbi->s_blocks_reserved, ac.ac_found); -+ printk("EXT3-fs: groups: "); -+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) -+ printk("%d: %d ", i, EXT3_GROUP_INFO(sb, i)->bb_free); -+ printk("\n"); -+#endif -+ goto out; -+ } -+ -+found: -+ J_ASSERT(ac.ac_b_ex.fe_len > 0); -+ -+ /* good news - free block(s) have been found. 
now it's time -+ * to mark block(s) in good old journaled bitmap */ -+ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + ac.ac_b_ex.fe_start -+ + le32_to_cpu(es->s_first_data_block); -+ -+ /* we made a desicion, now mark found blocks in good old -+ * bitmap to be journaled */ -+ -+ ext3_debug("using block group %d(%d)\n", -+ ac.ac_b_group.group, gdp->bg_free_blocks_count); -+ -+ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); -+ if (!bitmap_bh) { -+ *errp = -EIO; -+ goto out_err; -+ } -+ -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) { -+ *errp = err; -+ goto out_err; -+ } -+ -+ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); -+ if (!gdp) { -+ *errp = -EIO; -+ goto out_err; -+ } -+ -+ err = ext3_journal_get_write_access(handle, gdp_bh); -+ if (err) -+ goto out_err; -+ -+ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) -+ + ac.ac_b_ex.fe_start -+ + le32_to_cpu(es->s_first_data_block); -+ -+ if (block == le32_to_cpu(gdp->bg_block_bitmap) || -+ block == le32_to_cpu(gdp->bg_inode_bitmap) || -+ in_range(block, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group)) -+ ext3_error(sb, "ext3_new_block", -+ "Allocating block in system zone - " -+ "block = %u", block); -+#ifdef AGGRESSIVE_CHECK -+ for (i = 0; i < ac.ac_b_ex.fe_len; i++) -+ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); -+#endif -+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); -+ -+ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -+ - ac.ac_b_ex.fe_len); -+ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); -+ -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ if (err) -+ goto out_err; -+ err = ext3_journal_dirty_metadata(handle, gdp_bh); -+ if (err) -+ goto out_err; -+ -+ sb->s_dirt = 1; -+ *errp = 0; -+ 
brelse(bitmap_bh); -+ -+ /* drop non-allocated, but dquote'd blocks */ -+ J_ASSERT(*len >= ac.ac_b_ex.fe_len); -+ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); -+ -+ *len = ac.ac_b_ex.fe_len; -+ J_ASSERT(*len > 0); -+ J_ASSERT(block != 0); -+ goto out; -+ -+out_err: -+ /* if we've already allocated something, roll it back */ -+ if (ac.ac_status == AC_STATUS_FOUND) { -+ /* FIXME: free blocks here */ -+ } -+ -+ DQUOT_FREE_BLOCK(inode, *len); -+ brelse(bitmap_bh); -+ *errp = err; -+ block = 0; -+out: -+ if (ac.ac_buddy_page) -+ page_cache_release(ac.ac_buddy_page); -+ if (ac.ac_bitmap_page) -+ page_cache_release(ac.ac_bitmap_page); -+ -+ if (!(flags & EXT3_MB_HINT_RESERVED)) { -+ /* block wasn't reserved before and we reserved it -+ * at the beginning of allocation. it doesn't matter -+ * whether we allocated anything or we failed: time -+ * to release reservation. NOTE: because I expect -+ * any multiblock request from delayed allocation -+ * path only, here is single block always */ -+ ext3_mb_release_blocks(sb, 1); -+ } -+ -+ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { -+ atomic_inc(&sbi->s_bal_reqs); -+ atomic_add(*len, &sbi->s_bal_allocated); -+ if (*len >= ac.ac_g_ex.fe_len) -+ atomic_inc(&sbi->s_bal_success); -+ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); -+ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && -+ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) -+ atomic_inc(&sbi->s_bal_goals); -+ if (ac.ac_found > ext3_mb_max_to_scan) -+ atomic_inc(&sbi->s_bal_breaks); -+ } -+ -+ ext3_mb_store_history(sb, inode->i_ino, &ac); -+ -+ return block; -+} -+EXPORT_SYMBOL(ext3_mb_new_blocks); -+ -+#ifdef EXT3_MB_HISTORY -+struct ext3_mb_proc_session { -+ struct ext3_mb_history *history; -+ struct super_block *sb; -+ int start; -+ int max; -+}; -+ -+static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, -+ struct ext3_mb_history *hs, -+ int first) -+{ -+ if (hs == s->history + s->max) -+ hs = s->history; -+ if (!first && hs == s->history 
+ s->start) -+ return NULL; -+ while (hs->goal.fe_len == 0) { -+ hs++; -+ if (hs == s->history + s->max) -+ hs = s->history; -+ if (hs == s->history + s->start) -+ return NULL; -+ } -+ return hs; -+} -+ -+static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) -+{ -+ struct ext3_mb_proc_session *s = seq->private; -+ struct ext3_mb_history *hs; -+ int l = *pos; -+ -+ if (l == 0) -+ return SEQ_START_TOKEN; -+ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); -+ if (!hs) -+ return NULL; -+ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); -+ return hs; -+} -+ -+static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) -+{ -+ struct ext3_mb_proc_session *s = seq->private; -+ struct ext3_mb_history *hs = v; -+ -+ ++*pos; -+ if (v == SEQ_START_TOKEN) -+ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); -+ else -+ return ext3_mb_history_skip_empty(s, ++hs, 0); -+} -+ -+static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) -+{ -+ struct ext3_mb_history *hs = v; -+ char buf[20], buf2[20]; -+ -+ if (v == SEQ_START_TOKEN) { -+ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", -+ "pid", "inode", "goal", "result", "found", "grps", "cr", -+ "merge", "tail", "broken"); -+ return 0; -+ } -+ -+ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, -+ hs->goal.fe_start, hs->goal.fe_len); -+ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, -+ hs->result.fe_start, hs->result.fe_len); -+ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", -+ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, -+ hs->cr, hs->merged ? "M" : "", hs->tail, -+ hs->buddy ? 
1 << hs->buddy : 0); -+ return 0; -+} -+ -+static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v) -+{ -+} -+ -+static struct seq_operations ext3_mb_seq_history_ops = { -+ .start = ext3_mb_seq_history_start, -+ .next = ext3_mb_seq_history_next, -+ .stop = ext3_mb_seq_history_stop, -+ .show = ext3_mb_seq_history_show, -+}; -+ -+static int ext3_mb_seq_history_open(struct inode *inode, struct file *file) -+{ -+ struct super_block *sb = PDE(inode)->data; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_mb_proc_session *s; -+ int rc, size; -+ -+ s = kmalloc(sizeof(*s), GFP_KERNEL); -+ if (s == NULL) -+ return -EIO; -+ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max; -+ s->history = kmalloc(size, GFP_KERNEL); -+ if (s == NULL) { -+ kfree(s); -+ return -EIO; -+ } -+ -+ spin_lock(&sbi->s_mb_history_lock); -+ memcpy(s->history, sbi->s_mb_history, size); -+ s->max = sbi->s_mb_history_max; -+ s->start = sbi->s_mb_history_cur % s->max; -+ spin_unlock(&sbi->s_mb_history_lock); -+ -+ rc = seq_open(file, &ext3_mb_seq_history_ops); -+ if (rc == 0) { -+ struct seq_file *m = (struct seq_file *)file->private_data; -+ m->private = s; -+ } else { -+ kfree(s->history); -+ kfree(s); -+ } -+ return rc; -+ -+} -+ -+static int ext3_mb_seq_history_release(struct inode *inode, struct file *file) -+{ -+ struct seq_file *seq = (struct seq_file *)file->private_data; -+ struct ext3_mb_proc_session *s = seq->private; -+ kfree(s->history); -+ kfree(s); -+ return seq_release(inode, file); -+} -+ -+static struct file_operations ext3_mb_seq_history_fops = { -+ .owner = THIS_MODULE, -+ .open = ext3_mb_seq_history_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = ext3_mb_seq_history_release, -+}; -+ -+static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) -+{ -+ struct super_block *sb = seq->private; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ long group; -+ -+ if (*pos < 0 || *pos >= sbi->s_groups_count) -+ return NULL; -+ -+ group = 
*pos + 1; -+ return (void *) group; -+} -+ -+static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) -+{ -+ struct super_block *sb = seq->private; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ long group; -+ -+ ++*pos; -+ if (*pos < 0 || *pos >= sbi->s_groups_count) -+ return NULL; -+ group = *pos + 1; -+ return (void *) group;; -+} -+ -+static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) -+{ -+ struct super_block *sb = seq->private; -+ long group = (long) v, i; -+ struct sg { -+ struct ext3_group_info info; -+ unsigned short counters[16]; -+ } sg; -+ -+ group--; -+ if (group == 0) -+ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", -+ "group", "free", "frags", "first", "2^0", "2^1", "2^2", -+ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", -+ "2^11", "2^12", "2^13"); -+ -+ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + -+ sizeof(struct ext3_group_info); -+ ext3_lock_group(sb, group); -+ memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); -+ ext3_unlock_group(sb, group); -+ -+ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) -+ return 0; -+ -+ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, -+ sg.info.bb_fragments, sg.info.bb_first_free); -+ for (i = 0; i <= 13; i++) -+ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
-+ sg.info.bb_counters[i] : 0); -+ seq_printf(seq, " ]\n"); -+ -+ return 0; -+} -+ -+static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) -+{ -+} -+ -+static struct seq_operations ext3_mb_seq_groups_ops = { -+ .start = ext3_mb_seq_groups_start, -+ .next = ext3_mb_seq_groups_next, -+ .stop = ext3_mb_seq_groups_stop, -+ .show = ext3_mb_seq_groups_show, -+}; -+ -+static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) -+{ -+ struct super_block *sb = PDE(inode)->data; -+ int rc; -+ -+ rc = seq_open(file, &ext3_mb_seq_groups_ops); -+ if (rc == 0) { -+ struct seq_file *m = (struct seq_file *)file->private_data; -+ m->private = sb; -+ } -+ return rc; -+ -+} -+ -+static struct file_operations ext3_mb_seq_groups_fops = { -+ .owner = THIS_MODULE, -+ .open = ext3_mb_seq_groups_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ -+static void ext3_mb_history_release(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ char name[64]; -+ -+ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); -+ remove_proc_entry("mb_groups", sbi->s_mb_proc); -+ remove_proc_entry("mb_history", sbi->s_mb_proc); -+ remove_proc_entry(name, proc_root_ext3); -+ -+ if (sbi->s_mb_history) -+ kfree(sbi->s_mb_history); -+} -+ -+static void ext3_mb_history_init(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ char name[64]; -+ int i; -+ -+ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); -+ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3); -+ if (sbi->s_mb_proc != NULL) { -+ struct proc_dir_entry *p; -+ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); -+ if (p) { -+ p->proc_fops = &ext3_mb_seq_history_fops; -+ p->data = sb; -+ } -+ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); -+ if (p) { -+ p->proc_fops = &ext3_mb_seq_groups_fops; -+ p->data = sb; -+ } -+ } -+ -+ sbi->s_mb_history_max = 1000; -+ sbi->s_mb_history_cur = 0; -+ 
spin_lock_init(&sbi->s_mb_history_lock); -+ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history); -+ sbi->s_mb_history = kmalloc(i, GFP_KERNEL); -+ memset(sbi->s_mb_history, 0, i); -+ /* if we can't allocate history, then we simple won't use it */ -+} -+ -+static void -+ext3_mb_store_history(struct super_block *sb, unsigned ino, -+ struct ext3_allocation_context *ac) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_mb_history h; -+ -+ if (likely(sbi->s_mb_history == NULL)) -+ return; -+ -+ h.pid = current->pid; -+ h.ino = ino; -+ h.goal = ac->ac_g_ex; -+ h.result = ac->ac_b_ex; -+ h.found = ac->ac_found; -+ h.cr = ac->ac_criteria; -+ h.groups = ac->ac_groups_scanned; -+ h.tail = ac->ac_tail; -+ h.buddy = ac->ac_buddy; -+ h.merged = 0; -+ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && -+ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) -+ h.merged = 1; -+ -+ spin_lock(&sbi->s_mb_history_lock); -+ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); -+ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) -+ sbi->s_mb_history_cur = 0; -+ spin_unlock(&sbi->s_mb_history_lock); -+} -+ -+#else -+#define ext3_mb_history_release(sb) -+#define ext3_mb_history_init(sb) -+#endif -+ -+int ext3_mb_init_backend(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, j, len, metalen; -+ int num_meta_group_infos = -+ (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) >> -+ EXT3_DESC_PER_BLOCK_BITS(sb); -+ struct ext3_group_info **meta_group_info; -+ -+ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte -+ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. -+ * So a two level scheme suffices for now. 
*/ -+ sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * -+ num_meta_group_infos, GFP_KERNEL); -+ if (sbi->s_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate buddy meta group\n"); -+ return -ENOMEM; -+ } -+ sbi->s_buddy_cache = new_inode(sb); -+ if (sbi->s_buddy_cache == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't get new inode\n"); -+ goto err_freesgi; -+ } -+ -+ metalen = sizeof(*meta_group_info) << EXT3_DESC_PER_BLOCK_BITS(sb); -+ for (i = 0; i < num_meta_group_infos; i++) { -+ if ((i + 1) == num_meta_group_infos) -+ metalen = sizeof(*meta_group_info) * -+ (sbi->s_groups_count - -+ (i << EXT3_DESC_PER_BLOCK_BITS(sb))); -+ meta_group_info = kmalloc(metalen, GFP_KERNEL); -+ if (meta_group_info == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate mem for a " -+ "buddy group\n"); -+ goto err_freemeta; -+ } -+ sbi->s_group_info[i] = meta_group_info; -+ } -+ -+ /* -+ * calculate needed size. if change bb_counters size, -+ * don't forget about ext3_mb_generate_buddy() -+ */ -+ len = sizeof(struct ext3_group_info); -+ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ struct ext3_group_desc * desc; -+ -+ meta_group_info = -+ sbi->s_group_info[i >> EXT3_DESC_PER_BLOCK_BITS(sb)]; -+ j = i & (EXT3_DESC_PER_BLOCK(sb) - 1); -+ -+ meta_group_info[j] = kmalloc(len, GFP_KERNEL); -+ if (meta_group_info[j] == NULL) { -+ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); -+ i--; -+ goto err_freebuddy; -+ } -+ desc = ext3_get_group_desc(sb, i, NULL); -+ if (desc == NULL) { -+ printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i); -+ goto err_freebuddy; -+ } -+ memset(meta_group_info[j], 0, len); -+ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, -+ &meta_group_info[j]->bb_state); -+ meta_group_info[j]->bb_free = -+ le16_to_cpu(desc->bg_free_blocks_count); -+ } -+ -+ return 0; -+ -+err_freebuddy: -+ while (i >= 0) { -+ kfree(EXT3_GROUP_INFO(sb, i)); -+ i--; -+ } -+ i = num_meta_group_infos; 
-+err_freemeta: -+ while (--i >= 0) -+ kfree(sbi->s_group_info[i]); -+ iput(sbi->s_buddy_cache); -+err_freesgi: -+ kfree(sbi->s_group_info); -+ return -ENOMEM; -+} -+ -+int ext3_mb_init(struct super_block *sb, int needs_recovery) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct inode *root = sb->s_root->d_inode; -+ unsigned i, offset, max; -+ struct dentry *dentry; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); -+ -+ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); -+ if (sbi->s_mb_offsets == NULL) { -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ return -ENOMEM; -+ } -+ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); -+ if (sbi->s_mb_maxs == NULL) { -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ kfree(sbi->s_mb_maxs); -+ return -ENOMEM; -+ } -+ -+ /* order 0 is regular bitmap */ -+ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; -+ sbi->s_mb_offsets[0] = 0; -+ -+ i = 1; -+ offset = 0; -+ max = sb->s_blocksize << 2; -+ do { -+ sbi->s_mb_offsets[i] = offset; -+ sbi->s_mb_maxs[i] = max; -+ offset += 1 << (sb->s_blocksize_bits - i); -+ max = max >> 1; -+ i++; -+ } while (i <= sb->s_blocksize_bits + 1); -+ -+ /* init file for buddy data */ -+ if ((i = ext3_mb_init_backend(sb))) { -+ clear_opt(sbi->s_mount_opt, MBALLOC); -+ kfree(sbi->s_mb_offsets); -+ kfree(sbi->s_mb_maxs); -+ return i; -+ } -+ -+ spin_lock_init(&sbi->s_reserve_lock); -+ spin_lock_init(&sbi->s_md_lock); -+ INIT_LIST_HEAD(&sbi->s_active_transaction); -+ INIT_LIST_HEAD(&sbi->s_closed_transaction); -+ INIT_LIST_HEAD(&sbi->s_committed_transaction); -+ spin_lock_init(&sbi->s_bal_lock); -+ -+ /* remove old on-disk buddy file */ -+ down(&root->i_sem); -+ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy")); -+ if (dentry->d_inode != NULL) { -+ i = vfs_unlink(root, dentry); -+ if (i != 0) -+ printk("EXT3-fs: can't remove .buddy file: %d\n", i); -+ } -+ dput(dentry); -+ up(&root->i_sem); -+ -+ ext3_mb_history_init(sb); -+ -+ printk("EXT3-fs: mballoc 
enabled\n"); -+ return 0; -+} -+ -+int ext3_mb_release(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int i, num_meta_group_infos; -+ -+ if (!test_opt(sb, MBALLOC)) -+ return 0; -+ -+ /* release freed, non-committed blocks */ -+ spin_lock(&sbi->s_md_lock); -+ list_splice_init(&sbi->s_closed_transaction, -+ &sbi->s_committed_transaction); -+ list_splice_init(&sbi->s_active_transaction, -+ &sbi->s_committed_transaction); -+ spin_unlock(&sbi->s_md_lock); -+ ext3_mb_free_committed_blocks(sb); -+ -+ if (sbi->s_group_info) { -+ for (i = 0; i < sbi->s_groups_count; i++) -+ kfree(EXT3_GROUP_INFO(sb, i)); -+ num_meta_group_infos = (sbi->s_groups_count + -+ EXT3_DESC_PER_BLOCK(sb) - 1) >> -+ EXT3_DESC_PER_BLOCK_BITS(sb); -+ for (i = 0; i < num_meta_group_infos; i++) -+ kfree(sbi->s_group_info[i]); -+ kfree(sbi->s_group_info); -+ } -+ if (sbi->s_mb_offsets) -+ kfree(sbi->s_mb_offsets); -+ if (sbi->s_mb_maxs) -+ kfree(sbi->s_mb_maxs); -+ if (sbi->s_buddy_cache) -+ iput(sbi->s_buddy_cache); -+ if (sbi->s_blocks_reserved) -+ printk("ext3-fs: %ld blocks being reserved at umount!\n", -+ sbi->s_blocks_reserved); -+ if (ext3_mb_stats) { -+ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", -+ atomic_read(&sbi->s_bal_allocated), -+ atomic_read(&sbi->s_bal_reqs), -+ atomic_read(&sbi->s_bal_success)); -+ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " -+ "%u 2^N hits, %u breaks\n", -+ atomic_read(&sbi->s_bal_ex_scanned), -+ atomic_read(&sbi->s_bal_goals), -+ atomic_read(&sbi->s_bal_2orders), -+ atomic_read(&sbi->s_bal_breaks)); -+ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", -+ sbi->s_mb_buddies_generated++, -+ sbi->s_mb_generation_time); -+ } -+ -+ ext3_mb_history_release(sb); -+ -+ return 0; -+} -+ -+void ext3_mb_free_committed_blocks(struct super_block *sb) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int err, i, count = 0, count2 = 0; -+ struct ext3_free_metadata *md; -+ struct ext3_buddy e3b; -+ -+ if 
(list_empty(&sbi->s_committed_transaction)) -+ return; -+ -+ /* there is committed blocks to be freed yet */ -+ do { -+ /* get next array of blocks */ -+ md = NULL; -+ spin_lock(&sbi->s_md_lock); -+ if (!list_empty(&sbi->s_committed_transaction)) { -+ md = list_entry(sbi->s_committed_transaction.next, -+ struct ext3_free_metadata, list); -+ list_del(&md->list); -+ } -+ spin_unlock(&sbi->s_md_lock); -+ -+ if (md == NULL) -+ break; -+ -+ mb_debug("gonna free %u blocks in group %u (0x%p):", -+ md->num, md->group, md); -+ -+ err = ext3_mb_load_buddy(sb, md->group, &e3b); -+ /* we expect to find existing buddy because it's pinned */ -+ BUG_ON(err != 0); -+ -+ /* there are blocks to put in buddy to make them really free */ -+ count += md->num; -+ count2++; -+ ext3_lock_group(sb, md->group); -+ for (i = 0; i < md->num; i++) { -+ mb_debug(" %u", md->blocks[i]); -+ mb_free_blocks(&e3b, md->blocks[i], 1); -+ } -+ mb_debug("\n"); -+ ext3_unlock_group(sb, md->group); -+ -+ /* balance refcounts from ext3_mb_free_metadata() */ -+ page_cache_release(e3b.bd_buddy_page); -+ page_cache_release(e3b.bd_bitmap_page); -+ -+ kfree(md); -+ ext3_mb_release_desc(&e3b); -+ -+ } while (md); -+ mb_debug("freed %u blocks in %u structures\n", count, count2); -+} -+ -+void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ if (sbi->s_last_transaction == handle->h_transaction->t_tid) -+ return; -+ -+ /* new transaction! time to close last one and free blocks for -+ * committed transaction. we know that only transaction can be -+ * active, so previos transaction can be being logged and we -+ * know that transaction before previous is known to be already -+ * logged. this means that now we may free blocks freed in all -+ * transactions before previous one. hope I'm clear enough ... 
*/ -+ -+ spin_lock(&sbi->s_md_lock); -+ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { -+ mb_debug("new transaction %lu, old %lu\n", -+ (unsigned long) handle->h_transaction->t_tid, -+ (unsigned long) sbi->s_last_transaction); -+ list_splice_init(&sbi->s_closed_transaction, -+ &sbi->s_committed_transaction); -+ list_splice_init(&sbi->s_active_transaction, -+ &sbi->s_closed_transaction); -+ sbi->s_last_transaction = handle->h_transaction->t_tid; -+ } -+ spin_unlock(&sbi->s_md_lock); -+ -+ ext3_mb_free_committed_blocks(sb); -+} -+ -+int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, -+ int group, int block, int count) -+{ -+ struct ext3_group_info *db = e3b->bd_info; -+ struct super_block *sb = e3b->bd_sb; -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ struct ext3_free_metadata *md; -+ int i; -+ -+ J_ASSERT(e3b->bd_bitmap_page != NULL); -+ J_ASSERT(e3b->bd_buddy_page != NULL); -+ -+ ext3_lock_group(sb, group); -+ for (i = 0; i < count; i++) { -+ md = db->bb_md_cur; -+ if (md && db->bb_tid != handle->h_transaction->t_tid) { -+ db->bb_md_cur = NULL; -+ md = NULL; -+ } -+ -+ if (md == NULL) { -+ ext3_unlock_group(sb, group); -+ md = kmalloc(sizeof(*md), GFP_KERNEL); -+ if (md == NULL) -+ return -ENOMEM; -+ md->num = 0; -+ md->group = group; -+ -+ ext3_lock_group(sb, group); -+ if (db->bb_md_cur == NULL) { -+ spin_lock(&sbi->s_md_lock); -+ list_add(&md->list, &sbi->s_active_transaction); -+ spin_unlock(&sbi->s_md_lock); -+ /* protect buddy cache from being freed, -+ * otherwise we'll refresh it from -+ * on-disk bitmap and lose not-yet-available -+ * blocks */ -+ page_cache_get(e3b->bd_buddy_page); -+ page_cache_get(e3b->bd_bitmap_page); -+ db->bb_md_cur = md; -+ db->bb_tid = handle->h_transaction->t_tid; -+ mb_debug("new md 0x%p for group %u\n", -+ md, md->group); -+ } else { -+ kfree(md); -+ md = db->bb_md_cur; -+ } -+ } -+ -+ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); -+ md->blocks[md->num] = block + i; -+ md->num++; -+ if (md->num == 
EXT3_BB_MAX_BLOCKS) { -+ /* no more space, put full container on a sb's list */ -+ db->bb_md_cur = NULL; -+ } -+ } -+ ext3_unlock_group(sb, group); -+ return 0; -+} -+ -+void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, -+ unsigned long block, unsigned long count, -+ int metadata, int *freed) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct ext3_group_desc *gdp; -+ struct ext3_super_block *es; -+ unsigned long bit, overflow; -+ struct buffer_head *gd_bh; -+ unsigned long block_group; -+ struct ext3_sb_info *sbi; -+ struct super_block *sb; -+ struct ext3_buddy e3b; -+ int err = 0, ret; -+ -+ *freed = 0; -+ sb = inode->i_sb; -+ if (!sb) { -+ printk ("ext3_free_blocks: nonexistent device"); -+ return; -+ } -+ -+ ext3_mb_poll_new_transaction(sb, handle); -+ -+ sbi = EXT3_SB(sb); -+ es = EXT3_SB(sb)->s_es; -+ if (block < le32_to_cpu(es->s_first_data_block) || -+ block + count < block || -+ block + count > le32_to_cpu(es->s_blocks_count)) { -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks not in datazone - " -+ "block = %lu, count = %lu", block, count); -+ goto error_return; -+ } -+ -+ ext3_debug("freeing block %lu\n", block); -+ -+do_more: -+ overflow = 0; -+ block_group = (block - le32_to_cpu(es->s_first_data_block)) / -+ EXT3_BLOCKS_PER_GROUP(sb); -+ bit = (block - le32_to_cpu(es->s_first_data_block)) % -+ EXT3_BLOCKS_PER_GROUP(sb); -+ /* -+ * Check to see if we are freeing blocks across a group -+ * boundary. 
-+ */ -+ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { -+ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); -+ count -= overflow; -+ } -+ brelse(bitmap_bh); -+ bitmap_bh = read_block_bitmap(sb, block_group); -+ if (!bitmap_bh) -+ goto error_return; -+ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); -+ if (!gdp) -+ goto error_return; -+ -+ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || -+ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || -+ in_range (block, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group) || -+ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), -+ EXT3_SB(sb)->s_itb_per_group)) -+ ext3_error (sb, "ext3_free_blocks", -+ "Freeing blocks in system zones - " -+ "Block = %lu, count = %lu", -+ block, count); -+ -+ BUFFER_TRACE(bitmap_bh, "getting write access"); -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) -+ goto error_return; -+ -+ /* -+ * We are about to modify some metadata. Call the journal APIs -+ * to unshare ->b_data if a currently-committing transaction is -+ * using it -+ */ -+ BUFFER_TRACE(gd_bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, gd_bh); -+ if (err) -+ goto error_return; -+ -+ err = ext3_mb_load_buddy(sb, block_group, &e3b); -+ if (err) -+ goto error_return; -+ -+#ifdef AGGRESSIVE_CHECK -+ { -+ int i; -+ for (i = 0; i < count; i++) -+ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); -+ } -+#endif -+ mb_clear_bits(bitmap_bh->b_data, bit, count); -+ -+ /* We dirtied the bitmap block */ -+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ -+ if (metadata) { -+ /* blocks being freed are metadata. 
these blocks shouldn't -+ * be used until this transaction is committed */ -+ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); -+ } else { -+ ext3_lock_group(sb, block_group); -+ mb_free_blocks(&e3b, bit, count); -+ ext3_unlock_group(sb, block_group); -+ } -+ -+ spin_lock(sb_bgl_lock(sbi, block_group)); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); -+ spin_unlock(sb_bgl_lock(sbi, block_group)); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, count); -+ -+ ext3_mb_release_desc(&e3b); -+ -+ *freed = count; -+ -+ /* And the group descriptor block */ -+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); -+ ret = ext3_journal_dirty_metadata(handle, gd_bh); -+ if (!err) err = ret; -+ -+ if (overflow && !err) { -+ block += count; -+ count = overflow; -+ goto do_more; -+ } -+ sb->s_dirt = 1; -+error_return: -+ brelse(bitmap_bh); -+ ext3_std_error(sb, err); -+ return; -+} -+ -+int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ int free, ret = -ENOSPC; -+ -+ BUG_ON(blocks < 0); -+ spin_lock(&sbi->s_reserve_lock); -+ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); -+ if (blocks <= free - sbi->s_blocks_reserved) { -+ sbi->s_blocks_reserved += blocks; -+ ret = 0; -+ } -+ spin_unlock(&sbi->s_reserve_lock); -+ return ret; -+} -+ -+void ext3_mb_release_blocks(struct super_block *sb, int blocks) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ BUG_ON(blocks < 0); -+ spin_lock(&sbi->s_reserve_lock); -+ sbi->s_blocks_reserved -= blocks; -+ WARN_ON(sbi->s_blocks_reserved < 0); -+ if (sbi->s_blocks_reserved < 0) -+ sbi->s_blocks_reserved = 0; -+ spin_unlock(&sbi->s_reserve_lock); -+} -+ -+int ext3_new_block(handle_t *handle, struct inode *inode, -+ unsigned long goal, int *errp) -+{ -+ int ret, len; -+ -+ if (!test_opt(inode->i_sb, MBALLOC)) { -+ ret = ext3_new_block_old(handle, inode, goal, errp); -+ goto out; -+ } -+ len = 1; -+ ret = 
ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); -+out: -+ return ret; -+} -+ -+ -+void ext3_free_blocks(handle_t *handle, struct inode * inode, -+ unsigned long block, unsigned long count, int metadata) -+{ -+ struct super_block *sb; -+ int freed; -+ -+ sb = inode->i_sb; -+ if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info) -+ ext3_free_blocks_sb(handle, sb, block, count, &freed); -+ else -+ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); -+ if (freed) -+ DQUOT_FREE_BLOCK(inode, freed); -+ return; -+} -+ -+#define EXT3_ROOT "ext3" -+#define EXT3_MB_STATS_NAME "mb_stats" -+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" -+#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" -+#define EXT3_MB_ORDER2_REQ "mb_order2_req" -+ -+static int ext3_mb_stats_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_stats); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_stats_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_STATS_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); -+ return count; -+} -+ -+static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ 
printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ ext3_mb_max_to_scan = value; -+ -+ return count; -+} -+ -+static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ -+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ ext3_mb_min_to_scan = value; -+ -+ return count; -+} -+ -+static int ext3_mb_order2_req_read(char *page, char **start, off_t off, -+ int count, int *eof, void *data) -+{ -+ int len; -+ -+ *eof = 1; -+ if (off != 0) -+ return 0; -+ -+ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); -+ *start = page; -+ return len; -+} -+ -+static int ext3_mb_order2_req_write(struct file *file, const char *buffer, -+ unsigned long count, void *data) -+{ -+ char str[32]; -+ long value; -+ -+ if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", -+ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); -+ return -EOVERFLOW; -+ } -+ -+ if (copy_from_user(str, buffer, count)) -+ return -EFAULT; -+ -+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ 
-+ value = simple_strtol(str, NULL, 0); -+ if (value <= 0) -+ return -ERANGE; -+ -+ ext3_mb_order2_reqs = value; -+ -+ return count; -+} -+ -+int __init init_ext3_proc(void) -+{ -+ struct proc_dir_entry *proc_ext3_mb_stats; -+ struct proc_dir_entry *proc_ext3_mb_max_to_scan; -+ struct proc_dir_entry *proc_ext3_mb_min_to_scan; -+ struct proc_dir_entry *proc_ext3_mb_order2_req; -+ -+ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); -+ if (proc_root_ext3 == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); -+ return -EIO; -+ } -+ -+ /* Initialize EXT3_MB_STATS_NAME */ -+ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_stats == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_STATS_NAME); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_stats->data = NULL; -+ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; -+ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; -+ -+ /* Initialize EXT3_MAX_TO_SCAN_NAME */ -+ proc_ext3_mb_max_to_scan = create_proc_entry( -+ EXT3_MB_MAX_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_max_to_scan == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_MAX_TO_SCAN_NAME); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_max_to_scan->data = NULL; -+ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; -+ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; -+ -+ /* Initialize EXT3_MIN_TO_SCAN_NAME */ -+ proc_ext3_mb_min_to_scan = create_proc_entry( -+ EXT3_MB_MIN_TO_SCAN_NAME, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_min_to_scan == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_MIN_TO_SCAN_NAME); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, 
proc_root_ext3); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_min_to_scan->data = NULL; -+ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; -+ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; -+ -+ /* Initialize EXT3_ORDER2_REQ */ -+ proc_ext3_mb_order2_req = create_proc_entry( -+ EXT3_MB_ORDER2_REQ, -+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); -+ if (proc_ext3_mb_order2_req == NULL) { -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", -+ EXT3_MB_ORDER2_REQ); -+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+ return -EIO; -+ } -+ -+ proc_ext3_mb_order2_req->data = NULL; -+ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; -+ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; -+ -+ return 0; -+} -+ -+void exit_ext3_proc(void) -+{ -+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); -+ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); -+} -Index: linux-2.6.9-full/fs/ext3/Makefile -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/Makefile 2006-06-01 14:58:46.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/Makefile 2006-10-24 12:54:31.000000000 +0400 -@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o \ -- extents.o -+ extents.o mballoc.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff --git 
a/ldiskfs/kernel_patches/patches/ext3-nanosecond-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-nanosecond-2.6-rhel4.patch deleted file mode 100644 index de039fc..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-nanosecond-2.6-rhel4.patch +++ /dev/null @@ -1,401 +0,0 @@ -Index: linux-stage/fs/ext3/ialloc.c -=================================================================== ---- linux-stage.orig/fs/ext3/ialloc.c -+++ linux-stage/fs/ext3/ialloc.c -@@ -726,7 +726,8 @@ got: - /* This is the optimal IO size (for stat), not the fs block size */ - inode->i_blksize = PAGE_SIZE; - inode->i_blocks = 0; -- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = -+ ext3_current_time(inode); - - memset(ei->i_data, 0, sizeof(ei->i_data)); - ei->i_next_alloc_block = 0; -@@ -764,9 +765,8 @@ got: - spin_unlock(&sbi->s_next_gen_lock); - - ei->i_state = EXT3_STATE_NEW; -- ei->i_extra_isize = -- (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? -- sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; -+ -+ ei->i_extra_isize = EXT3_SB(sb)->s_want_extra_isize; - - ret = inode; - if(DQUOT_ALLOC_INODE(inode)) { -Index: linux-stage/fs/ext3/inode.c -=================================================================== ---- linux-stage.orig/fs/ext3/inode.c -+++ linux-stage/fs/ext3/inode.c -@@ -627,7 +627,7 @@ static int ext3_splice_branch(handle_t * - - /* We are done with atomic stuff, now do the rest of housekeeping */ - -- inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_ctime = ext3_current_time(inode); - ext3_mark_inode_dirty(handle, inode); - - /* had we spliced it onto indirect block? 
*/ -@@ -2230,7 +2230,7 @@ do_indirects: - ; - } - up(&ei->truncate_sem); -- inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_mtime = inode->i_ctime = ext3_current_time(inode); - ext3_mark_inode_dirty(handle, inode); - - /* In a multi-transaction truncate, we only make the final -@@ -2457,10 +2457,6 @@ void ext3_read_inode(struct inode * inod - } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); - inode->i_size = le32_to_cpu(raw_inode->i_size); -- inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime); -- inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime); -- inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime); -- inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; - - ei->i_state = 0; - ei->i_next_alloc_block = 0; -@@ -2521,6 +2517,11 @@ void ext3_read_inode(struct inode * inod - else - ei->i_extra_isize = 0; - -+ EXT3_INODE_GET_XTIME(i_ctime, inode, raw_inode); -+ EXT3_INODE_GET_XTIME(i_mtime, inode, raw_inode); -+ EXT3_INODE_GET_XTIME(i_atime, inode, raw_inode); -+ EXT3_EINODE_GET_XTIME(i_crtime, ei, raw_inode); -+ - if (S_ISREG(inode->i_mode)) { - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; -@@ -2601,9 +2602,12 @@ static int ext3_do_update_inode(handle_t - } - raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - raw_inode->i_size = cpu_to_le32(ei->i_disksize); -- raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); -- raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); -- raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); -+ -+ EXT3_INODE_SET_XTIME(i_ctime, inode, raw_inode); -+ EXT3_INODE_SET_XTIME(i_mtime, inode, raw_inode); -+ EXT3_INODE_SET_XTIME(i_atime, inode, raw_inode); -+ EXT3_EINODE_SET_XTIME(i_crtime, ei, raw_inode); -+ - raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); - raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = cpu_to_le32(ei->i_flags); -Index: linux-stage/fs/ext3/ioctl.c 
-=================================================================== ---- linux-stage.orig/fs/ext3/ioctl.c -+++ linux-stage/fs/ext3/ioctl.c -@@ -112,7 +112,7 @@ int ext3_ioctl (struct inode * inode, st - ei->i_flags = flags; - - ext3_set_inode_flags(inode); -- inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_ctime = ext3_current_time(inode); - - err = ext3_mark_iloc_dirty(handle, inode, &iloc); - flags_err: -@@ -150,7 +150,7 @@ flags_err: - return PTR_ERR(handle); - err = ext3_reserve_inode_write(handle, inode, &iloc); - if (err == 0) { -- inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_ctime = ext3_current_time(inode); - inode->i_generation = generation; - err = ext3_mark_iloc_dirty(handle, inode, &iloc); - } -Index: linux-stage/fs/ext3/namei.c -=================================================================== ---- linux-stage.orig/fs/ext3/namei.c -+++ linux-stage/fs/ext3/namei.c -@@ -1302,7 +1302,7 @@ static int add_dirent_to_buf(handle_t *h - * happen is that the times are slightly out of date - * and/or different from the directory change time. 
- */ -- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; -+ dir->i_mtime = dir->i_ctime = ext3_current_time(dir); - ext3_update_dx_flag(dir); - dir->i_version++; - ext3_mark_inode_dirty(handle, dir); -@@ -2098,7 +2098,7 @@ static int ext3_rmdir (struct inode * di - inode->i_version++; - inode->i_nlink = 0; - ext3_orphan_add(handle, inode); -- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; -+ inode->i_ctime = dir->i_ctime = dir->i_mtime = ext3_current_time(inode); - ext3_mark_inode_dirty(handle, inode); - ext3_dec_count(handle, dir); - ext3_update_dx_flag(dir); -@@ -2148,13 +2148,13 @@ static int ext3_unlink(struct inode * di - retval = ext3_delete_entry(handle, dir, de, bh); - if (retval) - goto end_unlink; -- dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; -+ dir->i_ctime = dir->i_mtime = ext3_current_time(dir); - ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - ext3_dec_count(handle, inode); - if (!inode->i_nlink) - ext3_orphan_add(handle, inode); -- inode->i_ctime = dir->i_ctime; -+ inode->i_ctime = ext3_current_time(inode); - ext3_mark_inode_dirty(handle, inode); - retval = 0; - -@@ -2255,7 +2255,7 @@ retry: - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - -- inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_ctime = ext3_current_time(inode); - ext3_inc_count(handle, inode); - atomic_inc(&inode->i_count); - -@@ -2357,7 +2357,7 @@ static int ext3_rename (struct inode * o - * Like most other Unix systems, set the ctime for inodes on a - * rename. 
- */ -- old_inode->i_ctime = CURRENT_TIME_SEC; -+ old_inode->i_ctime = ext3_current_time(old_inode); - ext3_mark_inode_dirty(handle, old_inode); - - /* -@@ -2390,9 +2390,9 @@ static int ext3_rename (struct inode * o - - if (new_inode) { - ext3_dec_count(handle, new_inode); -- new_inode->i_ctime = CURRENT_TIME_SEC; -+ new_inode->i_ctime = ext3_current_time(new_inode); - } -- old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; -+ old_dir->i_ctime = old_dir->i_mtime = ext3_current_time(old_dir); - ext3_update_dx_flag(old_dir); - if (dir_bh) { - BUFFER_TRACE(dir_bh, "get_write_access"); -Index: linux-stage/fs/ext3/super.c -=================================================================== ---- linux-stage.orig/fs/ext3/super.c -+++ linux-stage/fs/ext3/super.c -@@ -1573,6 +1573,8 @@ static int ext3_fill_super (struct super - sbi->s_inode_size); - goto failed_mount; - } -+ if (sbi->s_inode_size > EXT3_GOOD_OLD_INODE_SIZE) -+ sb->s_time_gran = 1 << (EXT3_EPOCH_BITS - 2); - } - sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << - le32_to_cpu(es->s_log_frag_size); -@@ -1759,6 +1761,32 @@ static int ext3_fill_super (struct super - } - - ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); -+ -+ /* determine the minimum size of new large inodes, if present */ -+ if (sbi->s_inode_size > EXT3_GOOD_OLD_INODE_SIZE) { -+ sbi->s_want_extra_isize = sizeof(struct ext3_inode) - -+ EXT3_GOOD_OLD_INODE_SIZE; -+ if (EXT3_HAS_RO_COMPAT_FEATURE(sb, -+ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) { -+ if (sbi->s_want_extra_isize < -+ le16_to_cpu(es->s_want_extra_isize)) -+ sbi->s_want_extra_isize = -+ le16_to_cpu(es->s_want_extra_isize); -+ if (sbi->s_want_extra_isize < -+ le16_to_cpu(es->s_min_extra_isize)) -+ sbi->s_want_extra_isize = -+ le16_to_cpu(es->s_min_extra_isize); -+ } -+ } -+ /* Check if enough inode space is available */ -+ if (EXT3_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > -+ sbi->s_inode_size) { -+ sbi->s_want_extra_isize = sizeof(struct ext3_inode) - -+ EXT3_GOOD_OLD_INODE_SIZE; -+ 
printk(KERN_INFO "EXT3-fs: required extra inode space not" -+ "available.\n"); -+ } -+ - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock -Index: linux-stage/fs/ext3/xattr.c -=================================================================== ---- linux-stage.orig/fs/ext3/xattr.c -+++ linux-stage/fs/ext3/xattr.c -@@ -1305,7 +1305,7 @@ getblk_failed: - - /* Update the inode. */ - EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; -- inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_ctime = ext3_current_time(inode); - ext3_mark_inode_dirty(handle, inode); - if (IS_SYNC(inode)) - handle->h_sync = 1; -Index: linux-stage/include/linux/ext3_fs.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs.h -+++ linux-stage/include/linux/ext3_fs.h -@@ -280,7 +280,7 @@ struct ext3_inode { - __le16 i_uid; /* Low 16 bits of Owner Uid */ - __le32 i_size; /* Size in bytes */ - __le32 i_atime; /* Access time */ -- __le32 i_ctime; /* Creation time */ -+ __le32 i_ctime; /* Inode Change time */ - __le32 i_mtime; /* Modification time */ - __le32 i_dtime; /* Deletion Time */ - __le16 i_gid; /* Low 16 bits of Group Id */ -@@ -329,10 +329,73 @@ struct ext3_inode { - } osd2; /* OS dependent 2 */ - __u16 i_extra_isize; - __u16 i_pad1; -+ __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ -+ __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ -+ __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ -+ __le32 i_crtime; /* File Creation time */ -+ __le32 i_crtime_extra; /* extra File Creation time (nsec << 2 | epoch) */ - }; - - #define i_size_high i_dir_acl - -+#define EXT3_EPOCH_BITS 2 -+#define EXT3_EPOCH_MASK ((1 << EXT3_EPOCH_BITS) - 1) -+#define EXT3_NSEC_MASK (~0UL << EXT3_EPOCH_BITS) -+ -+#define EXT3_FITS_IN_INODE(ext3_inode, einode, field) \ -+ ((offsetof(typeof(*ext3_inode), field) 
+ \ -+ sizeof((ext3_inode)->field)) \ -+ <= (EXT3_GOOD_OLD_INODE_SIZE + \ -+ (einode)->i_extra_isize)) \ -+ -+static inline __le32 ext3_encode_extra_time(struct timespec *time) -+{ -+ return cpu_to_le32((sizeof(time->tv_sec) > 4 ? -+ time->tv_sec >> 32 : 0) | -+ ((time->tv_nsec << 2) & EXT3_NSEC_MASK)); -+} -+ -+static inline void ext3_decode_extra_time(struct timespec *time, __le32 extra) { -+ if (sizeof(time->tv_sec) > 4) -+ time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT3_EPOCH_MASK) -+ << 32; -+ time->tv_nsec = (le32_to_cpu(extra) & EXT3_NSEC_MASK) >> 2; -+} -+ -+#define EXT3_INODE_SET_XTIME(xtime, inode, raw_inode) \ -+do { \ -+ (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ -+ if (EXT3_FITS_IN_INODE(raw_inode, EXT3_I(inode), xtime ## _extra))\ -+ (raw_inode)->xtime ## _extra = \ -+ ext3_encode_extra_time(&(inode)->xtime); \ -+} while (0) -+ -+#define EXT3_EINODE_SET_XTIME(xtime, einode, raw_inode) \ -+do { \ -+ if (EXT3_FITS_IN_INODE(raw_inode, einode, xtime)) \ -+ (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ -+ if (EXT3_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ -+ (raw_inode)->xtime ## _extra = \ -+ ext3_encode_extra_time(&(einode)->xtime); \ -+} while (0) -+ -+#define EXT3_INODE_GET_XTIME(xtime, inode, raw_inode) \ -+do { \ -+ (inode)->xtime.tv_sec = le32_to_cpu((raw_inode)->xtime); \ -+ if (EXT3_FITS_IN_INODE(raw_inode, EXT3_I(inode), xtime ## _extra))\ -+ ext3_decode_extra_time(&(inode)->xtime, \ -+ raw_inode->xtime ## _extra); \ -+} while (0) -+ -+#define EXT3_EINODE_GET_XTIME(xtime, einode, raw_inode) \ -+do { \ -+ if (EXT3_FITS_IN_INODE(raw_inode, einode, xtime)) \ -+ (einode)->xtime.tv_sec = le32_to_cpu((raw_inode)->xtime); \ -+ if (EXT3_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ -+ ext3_decode_extra_time(&(einode)->xtime, \ -+ raw_inode->xtime ## _extra); \ -+} while (0) -+ - #if defined(__KERNEL__) || defined(__linux__) - #define i_reserved1 osd1.linux1.l_i_reserved1 - #define i_frag 
osd2.linux2.l_i_frag -@@ -500,11 +563,19 @@ struct ext3_super_block { - __le32 s_last_orphan; /* start of list of inodes to delete */ - __le32 s_hash_seed[4]; /* HTREE hash seed */ - __u8 s_def_hash_version; /* Default hash version to use */ -- __u8 s_reserved_char_pad; -- __u16 s_reserved_word_pad; -+ __u8 s_jnl_backup_type; /* Default type of journal backup */ -+ __le16 s_desc_size; /* Group desc. size: INCOMPAT_64BIT */ - __le32 s_default_mount_opts; -- __le32 s_first_meta_bg; /* First metablock block group */ -- __u32 s_reserved[190]; /* Padding to the end of the block */ -+ __le32 s_first_meta_bg; /* First metablock block group */ -+ __le32 s_mkfs_time; /* When the filesystem was created */ -+ __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ -+ __le32 s_blocks_count_hi; /* Blocks count high 32 bits */ -+ __le32 s_r_blocks_count_hi; /* Reserved blocks count high 32 bits*/ -+ __le32 s_free_blocks_hi; /* Free blocks count high 32 bits */ -+ __le16 s_min_extra_isize; /* All inodes have at least # bytes */ -+ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ -+ __le32 s_flags; /* Miscellaneous flags */ -+ __u32 s_reserved[167]; /* Padding to the end of the block */ - }; - - #ifdef __KERNEL__ -@@ -580,6 +651,7 @@ static inline struct ext3_inode_info *EX - #define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 - #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 - #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 -+#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 - - #define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 - #define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002 -@@ -597,6 +669,7 @@ static inline struct ext3_inode_info *EX - EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ - EXT4_FEATURE_RO_COMPAT_DIR_NLINK| \ -+ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE| \ - EXT3_FEATURE_RO_COMPAT_BTREE_DIR) - - /* -@@ -724,6 +797,12 @@ static inline struct ext3_inode *ext3_ra - return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset); - } - 
-+static inline struct timespec ext3_current_time(struct inode *inode) -+{ -+ return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? -+ current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; -+} -+ - /* - * This structure is stuffed into the struct file's private_data field - * for directories. It is where we put information so that we can do -Index: linux-stage/include/linux/ext3_fs_i.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs_i.h -+++ linux-stage/include/linux/ext3_fs_i.h -@@ -130,6 +130,7 @@ struct ext3_inode_info { - - /* on-disk additional length */ - __u16 i_extra_isize; -+ struct timespec i_crtime; - - /* - * truncate_sem is for serialising ext3_truncate() against -Index: linux-stage/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs_sb.h -+++ linux-stage/include/linux/ext3_fs_sb.h -@@ -71,6 +71,8 @@ struct ext3_sb_info { - /* Last group used to allocate inode */ - int s_last_alloc_group; - -+ unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ -+ - /* root of the per fs reservation window tree */ - spinlock_t s_rsv_window_lock; - struct rb_root s_rsv_window_root; diff --git a/ldiskfs/kernel_patches/patches/ext3-nanosecond-2.6-sles10.patch b/ldiskfs/kernel_patches/patches/ext3-nanosecond-2.6-sles10.patch deleted file mode 100644 index 04c6e61..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-nanosecond-2.6-sles10.patch +++ /dev/null @@ -1,404 +0,0 @@ -Index: linux-2.6.16.27-0.9/fs/ext3/ialloc.c -=================================================================== ---- linux-2.6.16.27-0.9.orig/fs/ext3/ialloc.c -+++ linux-2.6.16.27-0.9/fs/ext3/ialloc.c -@@ -577,7 +577,8 @@ got: - /* This is the optimal IO size (for stat), not the fs block size */ - inode->i_blksize = PAGE_SIZE; - inode->i_blocks = 0; -- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_mtime = 
inode->i_atime = inode->i_ctime = ei->i_crtime = -+ ext3_current_time(inode); - - memset(ei->i_data, 0, sizeof(ei->i_data)); - ei->i_dir_start_lookup = 0; -@@ -609,9 +610,8 @@ got: - spin_unlock(&sbi->s_next_gen_lock); - - ei->i_state = EXT3_STATE_NEW; -- ei->i_extra_isize = -- (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? -- sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; -+ -+ ei->i_extra_isize = EXT3_SB(sb)->s_want_extra_isize; - - ret = inode; - if(DQUOT_ALLOC_INODE(inode)) { -Index: linux-2.6.16.27-0.9/fs/ext3/inode.c -=================================================================== ---- linux-2.6.16.27-0.9.orig/fs/ext3/inode.c -+++ linux-2.6.16.27-0.9/fs/ext3/inode.c -@@ -620,7 +620,7 @@ static int ext3_splice_branch(handle_t * - - /* We are done with atomic stuff, now do the rest of housekeeping */ - -- inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_ctime = ext3_current_time(inode); - ext3_mark_inode_dirty(handle, inode); - - /* had we spliced it onto indirect block? 
*/ -@@ -2244,7 +2244,7 @@ do_indirects: - ext3_discard_reservation(inode); - - up(&ei->truncate_sem); -- inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_mtime = inode->i_ctime = ext3_current_time(inode); - ext3_mark_inode_dirty(handle, inode); - - /* In a multi-transaction truncate, we only make the final -@@ -2479,10 +2479,6 @@ void ext3_read_inode(struct inode * inod - } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); - inode->i_size = le32_to_cpu(raw_inode->i_size); -- inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime); -- inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime); -- inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime); -- inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; - - ei->i_state = 0; - ei->i_dir_start_lookup = 0; -@@ -2557,6 +2553,11 @@ void ext3_read_inode(struct inode * inod - } else - ei->i_extra_isize = 0; - -+ EXT3_INODE_GET_XTIME(i_ctime, inode, raw_inode); -+ EXT3_INODE_GET_XTIME(i_mtime, inode, raw_inode); -+ EXT3_INODE_GET_XTIME(i_atime, inode, raw_inode); -+ EXT3_EINODE_GET_XTIME(i_crtime, ei, raw_inode); -+ - if (S_ISREG(inode->i_mode)) { - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; -@@ -2637,9 +2638,12 @@ static int ext3_do_update_inode(handle_t - } - raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - raw_inode->i_size = cpu_to_le32(ei->i_disksize); -- raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); -- raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); -- raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); -+ -+ EXT3_INODE_SET_XTIME(i_ctime, inode, raw_inode); -+ EXT3_INODE_SET_XTIME(i_mtime, inode, raw_inode); -+ EXT3_INODE_SET_XTIME(i_atime, inode, raw_inode); -+ EXT3_EINODE_SET_XTIME(i_crtime, ei, raw_inode); -+ - raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); - raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = cpu_to_le32(ei->i_flags); -Index: 
linux-2.6.16.27-0.9/fs/ext3/ioctl.c -=================================================================== ---- linux-2.6.16.27-0.9.orig/fs/ext3/ioctl.c -+++ linux-2.6.16.27-0.9/fs/ext3/ioctl.c -@@ -88,7 +88,7 @@ int ext3_ioctl (struct inode * inode, st - ei->i_flags = flags; - - ext3_set_inode_flags(inode); -- inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_ctime = ext3_current_time(inode); - - err = ext3_mark_iloc_dirty(handle, inode, &iloc); - flags_err: -@@ -126,7 +126,7 @@ flags_err: - return PTR_ERR(handle); - err = ext3_reserve_inode_write(handle, inode, &iloc); - if (err == 0) { -- inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_ctime = ext3_current_time(inode); - inode->i_generation = generation; - err = ext3_mark_iloc_dirty(handle, inode, &iloc); - } -Index: linux-2.6.16.27-0.9/fs/ext3/namei.c -=================================================================== ---- linux-2.6.16.27-0.9.orig/fs/ext3/namei.c -+++ linux-2.6.16.27-0.9/fs/ext3/namei.c -@@ -1276,7 +1276,7 @@ static int add_dirent_to_buf(handle_t *h - * happen is that the times are slightly out of date - * and/or different from the directory change time. 
- */ -- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; -+ dir->i_mtime = dir->i_ctime = ext3_current_time(dir); - ext3_update_dx_flag(dir); - dir->i_version++; - ext3_mark_inode_dirty(handle, dir); -@@ -2056,7 +2056,7 @@ static int ext3_rmdir (struct inode * di - inode->i_version++; - inode->i_nlink = 0; - ext3_orphan_add(handle, inode); -- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; -+ inode->i_ctime = dir->i_ctime = dir->i_mtime = ext3_current_time(inode); - ext3_mark_inode_dirty(handle, inode); - ext3_dec_count(handle, dir); - ext3_update_dx_flag(dir); -@@ -2106,13 +2106,13 @@ static int ext3_unlink(struct inode * di - retval = ext3_delete_entry(handle, dir, de, bh); - if (retval) - goto end_unlink; -- dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; -+ dir->i_ctime = dir->i_mtime = ext3_current_time(dir); - ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - ext3_dec_count(handle, inode); - if (!inode->i_nlink) - ext3_orphan_add(handle, inode); -- inode->i_ctime = dir->i_ctime; -+ inode->i_ctime = ext3_current_time(inode); - ext3_mark_inode_dirty(handle, inode); - retval = 0; - -@@ -2214,7 +2214,7 @@ retry: - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - -- inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_ctime = ext3_current_time(inode); - ext3_inc_count(handle, inode); - atomic_inc(&inode->i_count); - -@@ -2317,7 +2317,7 @@ static int ext3_rename (struct inode * o - * Like most other Unix systems, set the ctime for inodes on a - * rename. 
- */ -- old_inode->i_ctime = CURRENT_TIME_SEC; -+ old_inode->i_ctime = ext3_current_time(old_inode); - ext3_mark_inode_dirty(handle, old_inode); - - /* -@@ -2350,9 +2350,9 @@ static int ext3_rename (struct inode * o - - if (new_inode) { - ext3_dec_count(handle, new_inode); -- new_inode->i_ctime = CURRENT_TIME_SEC; -+ new_inode->i_ctime = ext3_current_time(new_inode); - } -- old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; -+ old_dir->i_ctime = old_dir->i_mtime = ext3_current_time(old_dir); - ext3_update_dx_flag(old_dir); - if (dir_bh) { - BUFFER_TRACE(dir_bh, "get_write_access"); -Index: linux-2.6.16.27-0.9/fs/ext3/super.c -=================================================================== ---- linux-2.6.16.27-0.9.orig/fs/ext3/super.c -+++ linux-2.6.16.27-0.9/fs/ext3/super.c -@@ -1614,6 +1614,8 @@ static int ext3_fill_super (struct super - sbi->s_inode_size); - goto failed_mount; - } -+ if (sbi->s_inode_size > EXT3_GOOD_OLD_INODE_SIZE) -+ sb->s_time_gran = 1 << (EXT3_EPOCH_BITS - 2); - } - sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << - le32_to_cpu(es->s_log_frag_size); -@@ -1809,6 +1811,32 @@ static int ext3_fill_super (struct super - } - - ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); -+ -+ /* determine the minimum size of new large inodes, if present */ -+ if (sbi->s_inode_size > EXT3_GOOD_OLD_INODE_SIZE) { -+ sbi->s_want_extra_isize = sizeof(struct ext3_inode) - -+ EXT3_GOOD_OLD_INODE_SIZE; -+ if (EXT3_HAS_RO_COMPAT_FEATURE(sb, -+ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) { -+ if (sbi->s_want_extra_isize < -+ le16_to_cpu(es->s_want_extra_isize)) -+ sbi->s_want_extra_isize = -+ le16_to_cpu(es->s_want_extra_isize); -+ if (sbi->s_want_extra_isize < -+ le16_to_cpu(es->s_min_extra_isize)) -+ sbi->s_want_extra_isize = -+ le16_to_cpu(es->s_min_extra_isize); -+ } -+ } -+ /* Check if enough inode space is available */ -+ if (EXT3_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > -+ sbi->s_inode_size) { -+ sbi->s_want_extra_isize = sizeof(struct ext3_inode) - -+ 
EXT3_GOOD_OLD_INODE_SIZE; -+ printk(KERN_INFO "EXT3-fs: required extra inode space not" -+ "available.\n"); -+ } -+ - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock -Index: linux-2.6.16.27-0.9/fs/ext3/xattr.c -=================================================================== ---- linux-2.6.16.27-0.9.orig/fs/ext3/xattr.c -+++ linux-2.6.16.27-0.9/fs/ext3/xattr.c -@@ -1006,8 +1006,8 @@ ext3_xattr_set_handle(handle_t *handle, - } - if (!error) { - ext3_xattr_update_super_block(handle, inode->i_sb); -- inode->i_ctime = CURRENT_TIME_SEC; -- error = ext3_mark_iloc_dirty(handle, inode, &is.iloc); -+ inode->i_ctime = ext3_current_time(inode); -+ ext3_mark_inode_dirty(handle, inode); - /* - * The bh is consumed by ext3_mark_iloc_dirty, even with - * error != 0. -Index: linux-2.6.16.27-0.9/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.16.27-0.9.orig/include/linux/ext3_fs.h -+++ linux-2.6.16.27-0.9/include/linux/ext3_fs.h -@@ -272,7 +272,7 @@ struct ext3_inode { - __le16 i_uid; /* Low 16 bits of Owner Uid */ - __le32 i_size; /* Size in bytes */ - __le32 i_atime; /* Access time */ -- __le32 i_ctime; /* Creation time */ -+ __le32 i_ctime; /* Inode Change time */ - __le32 i_mtime; /* Modification time */ - __le32 i_dtime; /* Deletion Time */ - __le16 i_gid; /* Low 16 bits of Group Id */ -@@ -321,10 +321,73 @@ struct ext3_inode { - } osd2; /* OS dependent 2 */ - __le16 i_extra_isize; - __le16 i_pad1; -+ __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ -+ __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ -+ __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ -+ __le32 i_crtime; /* File Creation time */ -+ __le32 i_crtime_extra; /* extra File Creation time (nsec << 2 | epoch) */ - }; - - #define i_size_high i_dir_acl - -+#define EXT3_EPOCH_BITS 2 -+#define EXT3_EPOCH_MASK 
((1 << EXT3_EPOCH_BITS) - 1) -+#define EXT3_NSEC_MASK (~0UL << EXT3_EPOCH_BITS) -+ -+#define EXT3_FITS_IN_INODE(ext3_inode, einode, field) \ -+ ((offsetof(typeof(*ext3_inode), field) + \ -+ sizeof((ext3_inode)->field)) \ -+ <= (EXT3_GOOD_OLD_INODE_SIZE + \ -+ (einode)->i_extra_isize)) \ -+ -+static inline __le32 ext3_encode_extra_time(struct timespec *time) -+{ -+ return cpu_to_le32((sizeof(time->tv_sec) > 4 ? -+ time->tv_sec >> 32 : 0) | -+ ((time->tv_nsec << 2) & EXT3_NSEC_MASK)); -+} -+ -+static inline void ext3_decode_extra_time(struct timespec *time, __le32 extra) { -+ if (sizeof(time->tv_sec) > 4) -+ time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT3_EPOCH_MASK) -+ << 32; -+ time->tv_nsec = (le32_to_cpu(extra) & EXT3_NSEC_MASK) >> 2; -+} -+ -+#define EXT3_INODE_SET_XTIME(xtime, inode, raw_inode) \ -+do { \ -+ (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ -+ if (EXT3_FITS_IN_INODE(raw_inode, EXT3_I(inode), xtime ## _extra)) \ -+ (raw_inode)->xtime ## _extra = \ -+ ext3_encode_extra_time(&(inode)->xtime); \ -+} while (0) -+ -+#define EXT3_EINODE_SET_XTIME(xtime, einode, raw_inode) \ -+do { \ -+ if (EXT3_FITS_IN_INODE(raw_inode, einode, xtime)) \ -+ (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ -+ if (EXT3_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ -+ (raw_inode)->xtime ## _extra = \ -+ ext3_encode_extra_time(&(einode)->xtime); \ -+} while (0) -+ -+#define EXT3_INODE_GET_XTIME(xtime, inode, raw_inode) \ -+do { \ -+ (inode)->xtime.tv_sec = le32_to_cpu((raw_inode)->xtime); \ -+ if (EXT3_FITS_IN_INODE(raw_inode, EXT3_I(inode), xtime ## _extra))\ -+ ext3_decode_extra_time(&(inode)->xtime, \ -+ raw_inode->xtime ## _extra); \ -+} while (0) -+ -+#define EXT3_EINODE_GET_XTIME(xtime, einode, raw_inode) \ -+do { \ -+ if (EXT3_FITS_IN_INODE(raw_inode, einode, xtime)) \ -+ (einode)->xtime.tv_sec = le32_to_cpu((raw_inode)->xtime);\ -+ if (EXT3_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ -+ ext3_decode_extra_time(&(einode)->xtime, 
\ -+ raw_inode->xtime ## _extra); \ -+} while (0) -+ - #if defined(__KERNEL__) || defined(__linux__) - #define i_reserved1 osd1.linux1.l_i_reserved1 - #define i_frag osd2.linux2.l_i_frag -@@ -504,11 +567,19 @@ struct ext3_super_block { - __le32 s_last_orphan; /* start of list of inodes to delete */ - __le32 s_hash_seed[4]; /* HTREE hash seed */ - __u8 s_def_hash_version; /* Default hash version to use */ -- __u8 s_reserved_char_pad; -- __u16 s_reserved_word_pad; -+ __u8 s_jnl_backup_type; /* Default type of journal backup */ -+ __le16 s_desc_size; /* Group desc. size: INCOMPAT_64BIT */ - __le32 s_default_mount_opts; -- __le32 s_first_meta_bg; /* First metablock block group */ -- __u32 s_reserved[190]; /* Padding to the end of the block */ -+ __le32 s_first_meta_bg; /* First metablock block group */ -+ __le32 s_mkfs_time; /* When the filesystem was created */ -+ __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ -+ __le32 s_blocks_count_hi; /* Blocks count high 32 bits */ -+ __le32 s_r_blocks_count_hi; /* Reserved blocks count high 32 bits*/ -+ __le32 s_free_blocks_hi; /* Free blocks count high 32 bits */ -+ __le16 s_min_extra_isize; /* All inodes have at least # bytes */ -+ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ -+ __le32 s_flags; /* Miscellaneous flags */ -+ __u32 s_reserved[167]; /* Padding to the end of the block */ - }; - - #ifdef __KERNEL__ -@@ -583,6 +648,8 @@ static inline struct ext3_inode_info *EX - #define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 - #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 - #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 -+#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 -+ - - #define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 - #define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002 -@@ -599,6 +666,7 @@ static inline struct ext3_inode_info *EX - EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ - EXT4_FEATURE_RO_COMPAT_DIR_NLINK| \ -+ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE| \ - 
EXT3_FEATURE_RO_COMPAT_BTREE_DIR) - - /* -@@ -726,6 +794,12 @@ static inline struct ext3_inode *ext3_ra - return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset); - } - -+static inline struct timespec ext3_current_time(struct inode *inode) -+{ -+ return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? -+ current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; -+} -+ - /* - * This structure is stuffed into the struct file's private_data field - * for directories. It is where we put information so that we can do -Index: linux-2.6.16.27-0.9/include/linux/ext3_fs_i.h -=================================================================== ---- linux-2.6.16.27-0.9.orig/include/linux/ext3_fs_i.h -+++ linux-2.6.16.27-0.9/include/linux/ext3_fs_i.h -@@ -130,6 +130,7 @@ struct ext3_inode_info { - - /* on-disk additional length */ - __u16 i_extra_isize; -+ struct timespec i_crtime; - - /* - * truncate_sem is for serialising ext3_truncate() against -Index: linux-2.6.16.27-0.9/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.6.16.27-0.9.orig/include/linux/ext3_fs_sb.h -+++ linux-2.6.16.27-0.9/include/linux/ext3_fs_sb.h -@@ -71,6 +71,8 @@ struct ext3_sb_info { - /* Last group used to allocate inode */ - int s_last_alloc_group; - -+ unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ -+ - /* root of the per fs reservation window tree */ - spinlock_t s_rsv_window_lock; - struct rb_root s_rsv_window_root; diff --git a/ldiskfs/kernel_patches/patches/ext3-nanosecond-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-nanosecond-2.6-suse.patch deleted file mode 100644 index 7824114..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-nanosecond-2.6-suse.patch +++ /dev/null @@ -1,195 +0,0 @@ -Index: linux-2.6.5-7.283/fs/ext3/ialloc.c -=================================================================== ---- linux-2.6.5-7.283.orig/fs/ext3/ialloc.c -+++ linux-2.6.5-7.283/fs/ext3/ialloc.c -@@ -613,7 +613,8 @@ got: - /* 
This is the optimal IO size (for stat), not the fs block size */ - inode->i_blksize = PAGE_SIZE; - inode->i_blocks = 0; -- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; -+ inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = -+ CURRENT_TIME; - - memset(ei->i_data, 0, sizeof(ei->i_data)); - ei->i_next_alloc_block = 0; -@@ -651,9 +652,8 @@ got: - spin_unlock(&sbi->s_next_gen_lock); - - ei->i_state = EXT3_STATE_NEW; -- ei->i_extra_isize = -- (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? -- sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; -+ -+ ei->i_extra_isize = EXT3_SB(sb)->s_want_extra_isize; - - ret = inode; - if(DQUOT_ALLOC_INODE(inode)) { -Index: linux-2.6.5-7.283/fs/ext3/inode.c -=================================================================== ---- linux-2.6.5-7.283.orig/fs/ext3/inode.c -+++ linux-2.6.5-7.283/fs/ext3/inode.c -@@ -2459,7 +2459,11 @@ void ext3_read_inode(struct inode * inod - inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime); - inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime); - inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime); -- inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; -+ if (EXT3_FITS_IN_INODE(raw_inode, ei, i_crtime)) { -+ ei->i_crtime.tv_sec = le32_to_cpu(raw_inode->i_crtime); -+ } -+ inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = -+ ei->i_crtime.tv_nsec = 0; - - ei->i_state = 0; - ei->i_next_alloc_block = 0; -@@ -2603,6 +2607,10 @@ static int ext3_do_update_inode(handle_t - raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); - raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); - raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); -+ if (EXT3_FITS_IN_INODE(raw_inode, ei, i_crtime)) { -+ raw_inode->i_crtime = cpu_to_le32(ei->i_crtime.tv_sec); -+ } -+ - raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); - raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = 
cpu_to_le32(ei->i_flags); -Index: linux-2.6.5-7.283/fs/ext3/super.c -=================================================================== ---- linux-2.6.5-7.283.orig/fs/ext3/super.c -+++ linux-2.6.5-7.283/fs/ext3/super.c -@@ -1515,6 +1515,32 @@ static int ext3_fill_super (struct super - } - - ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); -+ -+ /* determine the minimum size of new large inodes, if present */ -+ if (sbi->s_inode_size > EXT3_GOOD_OLD_INODE_SIZE) { -+ sbi->s_want_extra_isize = sizeof(struct ext3_inode) - -+ EXT3_GOOD_OLD_INODE_SIZE; -+ if (EXT3_HAS_RO_COMPAT_FEATURE(sb, -+ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) { -+ if (sbi->s_want_extra_isize < -+ le16_to_cpu(es->s_want_extra_isize)) -+ sbi->s_want_extra_isize = -+ le16_to_cpu(es->s_want_extra_isize); -+ if (sbi->s_want_extra_isize < -+ le16_to_cpu(es->s_min_extra_isize)) -+ sbi->s_want_extra_isize = -+ le16_to_cpu(es->s_min_extra_isize); -+ } -+ } -+ /* Check if enough inode space is available */ -+ if (EXT3_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > -+ sbi->s_inode_size) { -+ sbi->s_want_extra_isize = sizeof(struct ext3_inode) - -+ EXT3_GOOD_OLD_INODE_SIZE; -+ printk(KERN_INFO "EXT3-fs: required extra inode space not" -+ "available.\n"); -+ } -+ - /* - * akpm: core read_super() calls in here with the superblock locked. 
- * That deadlocks, because orphan cleanup needs to lock the superblock -Index: linux-2.6.5-7.283/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.5-7.283.orig/include/linux/ext3_fs.h -+++ linux-2.6.5-7.283/include/linux/ext3_fs.h -@@ -232,7 +232,7 @@ struct ext3_inode { - __u16 i_uid; /* Low 16 bits of Owner Uid */ - __u32 i_size; /* Size in bytes */ - __u32 i_atime; /* Access time */ -- __u32 i_ctime; /* Creation time */ -+ __u32 i_ctime; /* Inode Change time */ - __u32 i_mtime; /* Modification time */ - __u32 i_dtime; /* Deletion Time */ - __u16 i_gid; /* Low 16 bits of Group Id */ -@@ -281,10 +281,25 @@ struct ext3_inode { - } osd2; /* OS dependent 2 */ - __u16 i_extra_isize; - __u16 i_pad1; -+ __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ -+ __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ -+ __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ -+ __le32 i_crtime; /* File Creation time */ -+ __le32 i_crtime_extra; /* extra File Creation time (nsec << 2 | epoch) */ - }; - - #define i_size_high i_dir_acl - -+#define EXT3_EPOCH_BITS 2 -+#define EXT3_EPOCH_MASK ((1 << EXT3_EPOCH_BITS) - 1) -+#define EXT3_NSEC_MASK (~0UL << EXT3_EPOCH_BITS) -+ -+#define EXT3_FITS_IN_INODE(ext3_inode, einode, field) \ -+ ((offsetof(typeof(*ext3_inode), field) + \ -+ sizeof((ext3_inode)->field)) \ -+ <= (EXT3_GOOD_OLD_INODE_SIZE + \ -+ (einode)->i_extra_isize)) \ -+ - #if defined(__KERNEL__) || defined(__linux__) - #define i_reserved1 osd1.linux1.l_i_reserved1 - #define i_frag osd2.linux2.l_i_frag -@@ -460,11 +475,19 @@ struct ext3_super_block { - __u32 s_last_orphan; /* start of list of inodes to delete */ - __u32 s_hash_seed[4]; /* HTREE hash seed */ - __u8 s_def_hash_version; /* Default hash version to use */ -- __u8 s_reserved_char_pad; -- __u16 s_reserved_word_pad; -+ __u8 s_jnl_backup_type; /* Default type of journal backup */ -+ __u16 s_desc_size; /* Group desc. 
size: INCOMPAT_64BIT */ - __u32 s_default_mount_opts; -- __u32 s_first_meta_bg; /* First metablock block group */ -- __u32 s_reserved[190]; /* Padding to the end of the block */ -+ __u32 s_first_meta_bg; /* First metablock block group */ -+ __u32 s_mkfs_time; /* When the filesystem was created */ -+ __u32 s_jnl_blocks[17]; /* Backup of the journal inode */ -+ __u32 s_blocks_count_hi; /* Blocks count high 32 bits */ -+ __u32 s_r_blocks_count_hi; /* Reserved blocks count high 32 bits*/ -+ __u32 s_free_blocks_hi; /* Free blocks count high 32 bits */ -+ __u16 s_min_extra_isize; /* All inodes have at least # bytes */ -+ __u16 s_want_extra_isize; /* New inodes should reserve # bytes */ -+ __u32 s_flags; /* Miscellaneous flags */ -+ __u32 s_reserved[167]; /* Padding to the end of the block */ - }; - - #ifdef __KERNEL__ -@@ -539,6 +556,7 @@ static inline struct ext3_inode_info *EX - #define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 - #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 - #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 -+#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 - - #define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 - #define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002 -@@ -555,6 +573,7 @@ static inline struct ext3_inode_info *EX - EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ - EXT4_FEATURE_RO_COMPAT_DIR_NLINK| \ -+ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE| \ - EXT3_FEATURE_RO_COMPAT_BTREE_DIR) - - /* -Index: linux-2.6.5-7.283/include/linux/ext3_fs_i.h -=================================================================== ---- linux-2.6.5-7.283.orig/include/linux/ext3_fs_i.h -+++ linux-2.6.5-7.283/include/linux/ext3_fs_i.h -@@ -130,6 +130,7 @@ struct ext3_inode_info { - - /* on-disk additional length */ - __u16 i_extra_isize; -+ struct timespec i_crtime; - - /* - * truncate_sem is for serialising ext3_truncate() against -Index: linux-2.6.5-7.283/include/linux/ext3_fs_sb.h -=================================================================== ---- 
linux-2.6.5-7.283.orig/include/linux/ext3_fs_sb.h -+++ linux-2.6.5-7.283/include/linux/ext3_fs_sb.h -@@ -71,6 +71,8 @@ struct ext3_sb_info { - /* Last group used to allocate inode */ - int s_last_alloc_group; - -+ unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ -+ - /* root of the per fs reservation window tree */ - spinlock_t s_rsv_window_lock; - struct rb_root s_rsv_window_root; diff --git a/ldiskfs/kernel_patches/patches/ext3-nanosecond-2.6.18-vanilla.patch b/ldiskfs/kernel_patches/patches/ext3-nanosecond-2.6.18-vanilla.patch deleted file mode 100644 index 6dd7ad7..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-nanosecond-2.6.18-vanilla.patch +++ /dev/null @@ -1,403 +0,0 @@ -Index: linux-2.6.18/fs/ext3/ialloc.c -=================================================================== ---- linux-2.6.18.orig/fs/ext3/ialloc.c -+++ linux-2.6.18/fs/ext3/ialloc.c -@@ -615,7 +615,8 @@ got: - /* This is the optimal IO size (for stat), not the fs block size */ - inode->i_blksize = PAGE_SIZE; - inode->i_blocks = 0; -- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = -+ ext3_current_time(inode); - - memset(ei->i_data, 0, sizeof(ei->i_data)); - ei->i_dir_start_lookup = 0; -@@ -647,9 +648,8 @@ got: - spin_unlock(&sbi->s_next_gen_lock); - - ei->i_state = EXT3_STATE_NEW; -- ei->i_extra_isize = -- (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? 
-- sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; -+ -+ ei->i_extra_isize = EXT3_SB(sb)->s_want_extra_isize; - - ret = inode; - if(DQUOT_ALLOC_INODE(inode)) { -Index: linux-2.6.18/fs/ext3/inode.c -=================================================================== ---- linux-2.6.18.orig/fs/ext3/inode.c -+++ linux-2.6.18/fs/ext3/inode.c -@@ -729,7 +729,7 @@ static int ext3_splice_branch(handle_t * - - /* We are done with atomic stuff, now do the rest of housekeeping */ - -- inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_ctime = ext3_current_time(inode); - ext3_mark_inode_dirty(handle, inode); - - /* had we spliced it onto indirect block? */ -@@ -2388,7 +2388,7 @@ do_indirects: - ext3_discard_reservation(inode); - - mutex_unlock(&ei->truncate_mutex); -- inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_mtime = inode->i_ctime = ext3_current_time(inode); - ext3_mark_inode_dirty(handle, inode); - - /* -@@ -2624,10 +2624,6 @@ void ext3_read_inode(struct inode * inod - } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); - inode->i_size = le32_to_cpu(raw_inode->i_size); -- inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime); -- inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime); -- inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime); -- inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; - - ei->i_state = 0; - ei->i_dir_start_lookup = 0; -@@ -2702,6 +2698,11 @@ void ext3_read_inode(struct inode * inod - } else - ei->i_extra_isize = 0; - -+ EXT3_INODE_GET_XTIME(i_ctime, inode, raw_inode); -+ EXT3_INODE_GET_XTIME(i_mtime, inode, raw_inode); -+ EXT3_INODE_GET_XTIME(i_atime, inode, raw_inode); -+ EXT3_EINODE_GET_XTIME(i_crtime, ei, raw_inode); -+ - if (S_ISREG(inode->i_mode)) { - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; -@@ -2782,9 +2783,12 @@ static int ext3_do_update_inode(handle_t - } - raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - raw_inode->i_size 
= cpu_to_le32(ei->i_disksize); -- raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); -- raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); -- raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); -+ -+ EXT3_INODE_SET_XTIME(i_ctime, inode, raw_inode); -+ EXT3_INODE_SET_XTIME(i_mtime, inode, raw_inode); -+ EXT3_INODE_SET_XTIME(i_atime, inode, raw_inode); -+ EXT3_EINODE_SET_XTIME(i_crtime, ei, raw_inode); -+ - raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); - raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = cpu_to_le32(ei->i_flags); -Index: linux-2.6.18/fs/ext3/ioctl.c -=================================================================== ---- linux-2.6.18.orig/fs/ext3/ioctl.c -+++ linux-2.6.18/fs/ext3/ioctl.c -@@ -120,7 +120,7 @@ int ext3_ioctl (struct inode * inode, st - ei->i_flags = flags; - - ext3_set_inode_flags(inode); -- inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_ctime = ext3_current_time(inode); - - err = ext3_mark_iloc_dirty(handle, inode, &iloc); - flags_err: -@@ -161,7 +161,7 @@ flags_err: - return PTR_ERR(handle); - err = ext3_reserve_inode_write(handle, inode, &iloc); - if (err == 0) { -- inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_ctime = ext3_current_time(inode); - inode->i_generation = generation; - err = ext3_mark_iloc_dirty(handle, inode, &iloc); - } -Index: linux-2.6.18/fs/ext3/namei.c -=================================================================== ---- linux-2.6.18.orig/fs/ext3/namei.c -+++ linux-2.6.18/fs/ext3/namei.c -@@ -1287,7 +1287,7 @@ static int add_dirent_to_buf(handle_t *h - * happen is that the times are slightly out of date - * and/or different from the directory change time. 
- */ -- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; -+ dir->i_mtime = dir->i_ctime = ext3_current_time(dir); - ext3_update_dx_flag(dir); - dir->i_version++; - ext3_mark_inode_dirty(handle, dir); -@@ -2079,7 +2079,7 @@ static int ext3_rmdir (struct inode * di - inode->i_version++; - inode->i_nlink = 0; - ext3_orphan_add(handle, inode); -- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; -+ inode->i_ctime = dir->i_ctime = dir->i_mtime = ext3_current_time(inode); - ext3_mark_inode_dirty(handle, inode); - ext3_dec_count(handle, dir); - ext3_update_dx_flag(dir); -@@ -2129,13 +2129,13 @@ static int ext3_unlink(struct inode * di - retval = ext3_delete_entry(handle, dir, de, bh); - if (retval) - goto end_unlink; -- dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; -+ dir->i_ctime = dir->i_mtime = ext3_current_time(dir); - ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - ext3_dec_count(handle, inode); - if (!inode->i_nlink) - ext3_orphan_add(handle, inode); -- inode->i_ctime = dir->i_ctime; -+ inode->i_ctime = ext3_current_time(inode); - ext3_mark_inode_dirty(handle, inode); - retval = 0; - -@@ -2237,7 +2237,7 @@ retry: - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - -- inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_ctime = ext3_current_time(inode); - ext3_inc_count(handle, inode); - atomic_inc(&inode->i_count); - -@@ -2340,7 +2340,7 @@ static int ext3_rename (struct inode * o - * Like most other Unix systems, set the ctime for inodes on a - * rename. 
- */ -- old_inode->i_ctime = CURRENT_TIME_SEC; -+ old_inode->i_ctime = ext3_current_time(old_inode); - ext3_mark_inode_dirty(handle, old_inode); - - /* -@@ -2373,9 +2373,9 @@ static int ext3_rename (struct inode * o - - if (new_inode) { - ext3_dec_count(handle, new_inode); -- new_inode->i_ctime = CURRENT_TIME_SEC; -+ new_inode->i_ctime = ext3_current_time(new_inode); - } -- old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; -+ old_dir->i_ctime = old_dir->i_mtime = ext3_current_time(old_dir); - ext3_update_dx_flag(old_dir); - if (dir_bh) { - BUFFER_TRACE(dir_bh, "get_write_access"); -Index: linux-2.6.18/fs/ext3/super.c -=================================================================== ---- linux-2.6.18.orig/fs/ext3/super.c -+++ linux-2.6.18/fs/ext3/super.c -@@ -1615,6 +1615,8 @@ static int ext3_fill_super (struct super - sbi->s_inode_size); - goto failed_mount; - } -+ if (sbi->s_inode_size > EXT3_GOOD_OLD_INODE_SIZE) -+ sb->s_time_gran = 1 << (EXT3_EPOCH_BITS - 2); - } - sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << - le32_to_cpu(es->s_log_frag_size); -@@ -1819,6 +1821,32 @@ static int ext3_fill_super (struct super - } - - ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); -+ -+ /* determine the minimum size of new large inodes, if present */ -+ if (sbi->s_inode_size > EXT3_GOOD_OLD_INODE_SIZE) { -+ sbi->s_want_extra_isize = sizeof(struct ext3_inode) - -+ EXT3_GOOD_OLD_INODE_SIZE; -+ if (EXT3_HAS_RO_COMPAT_FEATURE(sb, -+ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) { -+ if (sbi->s_want_extra_isize < -+ le16_to_cpu(es->s_want_extra_isize)) -+ sbi->s_want_extra_isize = -+ le16_to_cpu(es->s_want_extra_isize); -+ if (sbi->s_want_extra_isize < -+ le16_to_cpu(es->s_min_extra_isize)) -+ sbi->s_want_extra_isize = -+ le16_to_cpu(es->s_min_extra_isize); -+ } -+ } -+ /* Check if enough inode space is available */ -+ if (EXT3_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > -+ sbi->s_inode_size) { -+ sbi->s_want_extra_isize = sizeof(struct ext3_inode) - -+ EXT3_GOOD_OLD_INODE_SIZE; 
-+ printk(KERN_INFO "EXT3-fs: required extra inode space not" -+ "available.\n"); -+ } -+ - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock -Index: linux-2.6.18/fs/ext3/xattr.c -=================================================================== ---- linux-2.6.18.orig/fs/ext3/xattr.c -+++ linux-2.6.18/fs/ext3/xattr.c -@@ -1007,7 +1007,7 @@ ext3_xattr_set_handle(handle_t *handle, - } - if (!error) { - ext3_xattr_update_super_block(handle, inode->i_sb); -- inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_ctime = ext3_current_time(inode); - error = ext3_mark_iloc_dirty(handle, inode, &is.iloc); - /* - * The bh is consumed by ext3_mark_iloc_dirty, even with -Index: linux-2.6.18/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.18.orig/include/linux/ext3_fs.h -+++ linux-2.6.18/include/linux/ext3_fs.h -@@ -268,7 +268,7 @@ struct ext3_inode { - __le16 i_uid; /* Low 16 bits of Owner Uid */ - __le32 i_size; /* Size in bytes */ - __le32 i_atime; /* Access time */ -- __le32 i_ctime; /* Creation time */ -+ __le32 i_ctime; /* Inode Change time */ - __le32 i_mtime; /* Modification time */ - __le32 i_dtime; /* Deletion Time */ - __le16 i_gid; /* Low 16 bits of Group Id */ -@@ -317,10 +317,73 @@ struct ext3_inode { - } osd2; /* OS dependent 2 */ - __le16 i_extra_isize; - __le16 i_pad1; -+ __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ -+ __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ -+ __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ -+ __le32 i_crtime; /* File Creation time */ -+ __le32 i_crtime_extra; /* extra File Creation time (nsec << 2 | epoch) */ - }; - - #define i_size_high i_dir_acl - -+#define EXT3_EPOCH_BITS 2 -+#define EXT3_EPOCH_MASK ((1 << EXT3_EPOCH_BITS) - 1) -+#define EXT3_NSEC_MASK (~0UL << EXT3_EPOCH_BITS) -+ -+#define EXT3_FITS_IN_INODE(ext3_inode, 
einode, field) \ -+ ((offsetof(typeof(*ext3_inode), field) + \ -+ sizeof((ext3_inode)->field)) \ -+ <= (EXT3_GOOD_OLD_INODE_SIZE + \ -+ (einode)->i_extra_isize)) \ -+ -+static inline __le32 ext3_encode_extra_time(struct timespec *time) -+{ -+ return cpu_to_le32((sizeof(time->tv_sec) > 4 ? -+ time->tv_sec >> 32 : 0) | -+ ((time->tv_nsec << 2) & EXT3_NSEC_MASK)); -+} -+ -+static inline void ext3_decode_extra_time(struct timespec *time, __le32 extra) { -+ if (sizeof(time->tv_sec) > 4) -+ time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT3_EPOCH_MASK) -+ << 32; -+ time->tv_nsec = (le32_to_cpu(extra) & EXT3_NSEC_MASK) >> 2; -+} -+ -+#define EXT3_INODE_SET_XTIME(xtime, inode, raw_inode) \ -+do { \ -+ (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ -+ if (EXT3_FITS_IN_INODE(raw_inode, EXT3_I(inode), xtime ## _extra)) \ -+ (raw_inode)->xtime ## _extra = \ -+ ext3_encode_extra_time(&(inode)->xtime); \ -+} while (0) -+ -+#define EXT3_EINODE_SET_XTIME(xtime, einode, raw_inode)\ -+do { \ -+ if (EXT3_FITS_IN_INODE(raw_inode, einode, xtime)) \ -+ (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ -+ if (EXT3_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ -+ (raw_inode)->xtime ## _extra = \ -+ ext3_encode_extra_time(&(einode)->xtime); \ -+} while (0) -+ -+#define EXT3_INODE_GET_XTIME(xtime, inode, raw_inode) \ -+do { \ -+ (inode)->xtime.tv_sec = le32_to_cpu((raw_inode)->xtime); \ -+ if (EXT3_FITS_IN_INODE(raw_inode, EXT3_I(inode), xtime ## _extra)) \ -+ ext3_decode_extra_time(&(inode)->xtime, \ -+ raw_inode->xtime ## _extra); \ -+} while (0) -+ -+#define EXT3_EINODE_GET_XTIME(xtime, einode, raw_inode) \ -+do { \ -+ if (EXT3_FITS_IN_INODE(raw_inode, einode, xtime)) \ -+ (einode)->xtime.tv_sec = le32_to_cpu((raw_inode)->xtime); \ -+ if (EXT3_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ -+ ext3_decode_extra_time(&(einode)->xtime, \ -+ raw_inode->xtime ## _extra); \ -+} while (0) -+ - #if defined(__KERNEL__) || defined(__linux__) - #define i_reserved1 
osd1.linux1.l_i_reserved1 - #define i_frag osd2.linux2.l_i_frag -@@ -498,11 +562,19 @@ struct ext3_super_block { - __le32 s_last_orphan; /* start of list of inodes to delete */ - __le32 s_hash_seed[4]; /* HTREE hash seed */ - __u8 s_def_hash_version; /* Default hash version to use */ -- __u8 s_reserved_char_pad; -- __u16 s_reserved_word_pad; -+ __u8 s_jnl_backup_type; /* Default type of journal backup */ -+ __le16 s_desc_size; /* Group desc. size: INCOMPAT_64BIT */ - __le32 s_default_mount_opts; -- __le32 s_first_meta_bg; /* First metablock block group */ -- __u32 s_reserved[190]; /* Padding to the end of the block */ -+ __le32 s_first_meta_bg; /* First metablock block group */ -+ __le32 s_mkfs_time; /* When the filesystem was created */ -+ __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ -+ __le32 s_blocks_count_hi; /* Blocks count high 32 bits */ -+ __le32 s_r_blocks_count_hi; /* Reserved blocks count high 32 bits*/ -+ __le32 s_free_blocks_count_hi; /* Free blocks count high 32 bits */ -+ __le16 s_min_extra_isize; /* All inodes have at least # bytes */ -+ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ -+ __le32 s_flags; /* Miscellaneous flags */ -+ __u32 s_reserved[167]; /* Padding to the end of the block */ - }; - - #ifdef __KERNEL__ -@@ -519,6 +584,13 @@ static inline struct ext3_inode_info *EX - return container_of(inode, struct ext3_inode_info, vfs_inode); - } - -+static inline struct timespec ext3_current_time(struct inode *inode) -+{ -+ return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? 
-+ current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; -+} -+ -+ - static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino) - { - return ino == EXT3_ROOT_INO || -@@ -590,6 +662,8 @@ static inline int ext3_valid_inum(struct - #define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 - #define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 - #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 -+#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 -+ - - #define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 - #define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002 -@@ -606,6 +680,7 @@ static inline int ext3_valid_inum(struct - #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT4_FEATURE_RO_COMPAT_DIR_NLINK| \ -+ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE| \ - EXT3_FEATURE_RO_COMPAT_BTREE_DIR) - - /* -Index: linux-2.6.18/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.6.18.orig/include/linux/ext3_fs_sb.h -+++ linux-2.6.18/include/linux/ext3_fs_sb.h -@@ -119,6 +119,8 @@ struct ext3_sb_info { - spinlock_t s_bal_lock; - unsigned long s_mb_buddies_generated; - unsigned long long s_mb_generation_time; -+ -+ unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ - }; - - #define EXT3_GROUP_INFO(sb, group) \ -Index: linux-2.6.18/include/linux/ext3_fs_i.h -=================================================================== ---- linux-2.6.18.orig/include/linux/ext3_fs_i.h -+++ linux-2.6.18/include/linux/ext3_fs_i.h -@@ -144,6 +144,7 @@ struct ext3_inode_info { - struct inode vfs_inode; - - __u32 i_cached_extent[4]; -+ struct timespec i_crtime; - - void *i_filterdata; - }; diff --git a/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch b/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch deleted file mode 100644 index 5054b0c..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch +++ /dev/null @@ -1,156 +0,0 @@ -Index: 
linux-2.6.5-7.283/fs/ext3/namei.c -=================================================================== ---- linux-2.6.5-7.283.orig/fs/ext3/namei.c -+++ linux-2.6.5-7.283/fs/ext3/namei.c -@@ -1613,11 +1613,17 @@ static int ext3_delete_entry (handle_t * - static inline void ext3_inc_count(handle_t *handle, struct inode *inode) - { - inode->i_nlink++; -+ if (is_dx(inode) && inode->i_nlink > 1) { -+ /* limit is 16-bit i_links_count */ -+ if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2) -+ inode->i_nlink = 1; -+ } - } - - static inline void ext3_dec_count(handle_t *handle, struct inode *inode) - { -- inode->i_nlink--; -+ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) -+ inode->i_nlink--; - } - - static int ext3_add_nondir(handle_t *handle, -@@ -1730,7 +1736,7 @@ static int ext3_mkdir(struct inode * dir - int retries = 0; - int err; - -- if (dir->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAX(dir)) - return -EMLINK; - - retry: -@@ -1752,7 +1758,7 @@ retry: - inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; - dir_block = ext3_bread (handle, inode, 0, 1, &err); - if (!dir_block) { -- inode->i_nlink--; /* is this nlink == 0? */ -+ ext3_dec_count(handle, inode); /* is this nlink == 0? 
*/ - ext3_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; -@@ -1784,7 +1790,7 @@ retry: - iput (inode); - goto out_stop; - } -- dir->i_nlink++; -+ ext3_inc_count(handle, dir); - ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - d_instantiate(dentry, inode); -@@ -2042,16 +2048,16 @@ static int ext3_rmdir (struct inode * di - retval = ext3_delete_entry(handle, dir, de, bh); - if (retval) - goto end_rmdir; -- if (inode->i_nlink != 2) -- ext3_warning (inode->i_sb, "ext3_rmdir", -- "empty directory has nlink!=2 (%d)", -- inode->i_nlink); -+ if (!EXT3_DIR_LINK_EMPTY(inode)) -+ ext3_warning(inode->i_sb, "ext3_rmdir", -+ "empty directory has too many links (%d)", -+ inode->i_nlink); - inode->i_version++; - inode->i_nlink = 0; - ext3_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; - ext3_mark_inode_dirty(handle, inode); -- dir->i_nlink--; -+ ext3_dec_count(handle, dir); - ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - -@@ -2100,7 +2106,7 @@ static int ext3_unlink(struct inode * di - dir->i_ctime = dir->i_mtime = CURRENT_TIME; - ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); -- inode->i_nlink--; -+ ext3_dec_count(handle, inode); - if (!inode->i_nlink) - ext3_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime; -@@ -2191,7 +2197,7 @@ static int ext3_link (struct dentry * ol - struct inode *inode = old_dentry->d_inode; - int err, retries = 0; - -- if (inode->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAX(inode)) - return -EMLINK; - - retry: -@@ -2277,8 +2283,8 @@ static int ext3_rename (struct inode * o - if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) - goto end_rename; - retval = -EMLINK; -- if (!new_inode && new_dir!=old_dir && -- new_dir->i_nlink >= EXT3_LINK_MAX) -+ if (!new_inode && new_dir != old_dir && -+ EXT3_DIR_LINK_MAX(new_dir)) - goto end_rename; - } - if (!new_bh) { -@@ -2335,7 +2341,7 @@ static int ext3_rename (struct inode * 
o - } - - if (new_inode) { -- new_inode->i_nlink--; -+ ext3_dec_count(handle, new_inode); - new_inode->i_ctime = CURRENT_TIME; - } - old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; -@@ -2346,11 +2352,13 @@ static int ext3_rename (struct inode * o - PARENT_INO(dir_bh->b_data) = le32_to_cpu(new_dir->i_ino); - BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, dir_bh); -- old_dir->i_nlink--; -+ ext3_dec_count(handle, old_dir); - if (new_inode) { -- new_inode->i_nlink--; -+ /* checked empty_dir above, can't have another parent, -+ * ext3_dec_count() won't work for many-linked dirs */ -+ new_inode->i_nlink = 0; - } else { -- new_dir->i_nlink++; -+ ext3_inc_count(handle, new_dir); - ext3_update_dx_flag(new_dir); - ext3_mark_inode_dirty(handle, new_dir); - } -Index: linux-2.6.5-7.283/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.5-7.283.orig/include/linux/ext3_fs.h -+++ linux-2.6.5-7.283/include/linux/ext3_fs.h -@@ -86,7 +86,7 @@ struct statfs; - /* - * Maximal count of links to a file - */ --#define EXT3_LINK_MAX 32000 -+#define EXT3_LINK_MAX 65000 - - /* - * Macro-instructions used to manage several block sizes -@@ -538,6 +538,7 @@ static inline struct ext3_inode_info *EX - #define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 - #define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 - #define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 -+#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 - - #define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 - #define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002 -@@ -553,6 +554,7 @@ static inline struct ext3_inode_info *EX - EXT3_FEATURE_INCOMPAT_EXTENTS) - #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ -+ EXT4_FEATURE_RO_COMPAT_DIR_NLINK| \ - EXT3_FEATURE_RO_COMPAT_BTREE_DIR) - - /* diff --git a/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch 
b/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch deleted file mode 100644 index d572c8f..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch +++ /dev/null @@ -1,158 +0,0 @@ -Index: linux-2.6.12/fs/ext3/namei.c -=================================================================== ---- linux-2.6.12.orig/fs/ext3/namei.c -+++ linux-2.6.12/fs/ext3/namei.c -@@ -1600,11 +1600,17 @@ static int ext3_delete_entry (handle_t * - static inline void ext3_inc_count(handle_t *handle, struct inode *inode) - { - inode->i_nlink++; -+ if (is_dx(inode) && inode->i_nlink > 1) { -+ /* limit is 16-bit i_links_count */ -+ if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2) -+ inode->i_nlink = 1; -+ } - } - - static inline void ext3_dec_count(handle_t *handle, struct inode *inode) - { -- inode->i_nlink--; -+ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) -+ inode->i_nlink--; - } - - static int ext3_add_nondir(handle_t *handle, -@@ -1703,7 +1709,7 @@ static int ext3_mkdir(struct inode * dir - struct ext3_dir_entry_2 * de; - int err, retries = 0; - -- if (dir->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAX(dir)) - return -EMLINK; - - retry: -@@ -1726,7 +1732,7 @@ retry: - inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; - dir_block = ext3_bread (handle, inode, 0, 1, &err); - if (!dir_block) { -- inode->i_nlink--; /* is this nlink == 0? */ -+ ext3_dec_count(handle, inode); /* is this nlink == 0? 
*/ - ext3_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; -@@ -1758,7 +1764,7 @@ retry: - iput (inode); - goto out_stop; - } -- dir->i_nlink++; -+ ext3_inc_count(handle, dir); - ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - d_instantiate(dentry, inode); -@@ -2023,10 +2029,10 @@ static int ext3_rmdir (struct inode * di - retval = ext3_delete_entry(handle, dir, de, bh); - if (retval) - goto end_rmdir; -- if (inode->i_nlink != 2) -- ext3_warning (inode->i_sb, "ext3_rmdir", -- "empty directory has nlink!=2 (%d)", -- inode->i_nlink); -+ if (!EXT3_DIR_LINK_EMPTY(inode)) -+ ext3_warning(inode->i_sb, "ext3_rmdir", -+ "empty directory has too many links (%d)", -+ inode->i_nlink); - inode->i_version++; - inode->i_nlink = 0; - /* There's no need to set i_disksize: the fact that i_nlink is -@@ -2036,7 +2042,7 @@ static int ext3_rmdir (struct inode * di - ext3_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, inode); -- dir->i_nlink--; -+ ext3_dec_count(handle, dir); - ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - -@@ -2087,7 +2093,7 @@ static int ext3_unlink(struct inode * di - dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; - ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); -- inode->i_nlink--; -+ ext3_dec_count(handle, inode); - if (!inode->i_nlink) - ext3_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime; -@@ -2162,7 +2168,7 @@ static int ext3_link (struct dentry * ol - struct inode *inode = old_dentry->d_inode; - int err, retries = 0; - -- if (inode->i_nlink >= EXT3_LINK_MAX) -+ if (EXT3_DIR_LINK_MAX(inode)) - return -EMLINK; - - retry: -@@ -2249,8 +2255,8 @@ static int ext3_rename (struct inode * o - if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) - goto end_rename; - retval = -EMLINK; -- if (!new_inode && new_dir!=old_dir && -- new_dir->i_nlink >= EXT3_LINK_MAX) -+ if (!new_inode && new_dir != old_dir 
&& -+ EXT3_DIR_LINK_MAX(new_dir)) - goto end_rename; - } - if (!new_bh) { -@@ -2307,7 +2313,7 @@ static int ext3_rename (struct inode * o - } - - if (new_inode) { -- new_inode->i_nlink--; -+ ext3_dec_count(handle, new_inode); - new_inode->i_ctime = CURRENT_TIME_SEC; - } - old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; -@@ -2318,11 +2324,13 @@ static int ext3_rename (struct inode * o - PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); - BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, dir_bh); -- old_dir->i_nlink--; -+ ext3_dec_count(handle, old_dir); - if (new_inode) { -- new_inode->i_nlink--; -+ /* checked empty_dir above, can't have another parent, -+ * ext3_dec_count() won't work for many-linked dirs */ -+ new_inode->i_nlink = 0; - } else { -- new_dir->i_nlink++; -+ ext3_inc_count(handle, new_dir); - ext3_update_dx_flag(new_dir); - ext3_mark_inode_dirty(handle, new_dir); - } -Index: linux-2.6.12/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.12.orig/include/linux/ext3_fs.h -+++ linux-2.6.12/include/linux/ext3_fs.h -@@ -78,7 +78,7 @@ struct statfs; - /* - * Maximal count of links to a file - */ --#define EXT3_LINK_MAX 32000 -+#define EXT3_LINK_MAX 65000 - - /* - * Macro-instructions used to manage several block sizes -@@ -539,6 +539,7 @@ static inline struct ext3_inode_info *EX - #define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 - #define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 - #define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 -+#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 - - #define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 - #define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002 -@@ -552,6 +553,7 @@ static inline struct ext3_inode_info *EX - EXT3_FEATURE_INCOMPAT_META_BG) - #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ -+ EXT4_FEATURE_RO_COMPAT_DIR_NLINK| \ - 
EXT3_FEATURE_RO_COMPAT_BTREE_DIR) - - /* diff --git a/ldiskfs/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch deleted file mode 100644 index 57898d5..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-remove-cond_resched-calls-2.6.12.patch +++ /dev/null @@ -1,29 +0,0 @@ -Index: linux-stage/fs/ext3/ialloc.c -=================================================================== ---- linux-stage.orig/fs/ext3/ialloc.c 2005-06-26 10:59:43.048185981 +0200 -+++ linux-stage/fs/ext3/ialloc.c 2005-06-26 11:01:21.317716027 +0200 -@@ -775,7 +775,6 @@ - if (!gdp) - continue; - desc_count += le16_to_cpu(gdp->bg_free_inodes_count); -- cond_resched(); - } - return desc_count; - #endif -Index: linux-stage/fs/ext3/super.c -=================================================================== ---- linux-stage.orig/fs/ext3/super.c 2005-06-26 10:59:43.205412542 +0200 -+++ linux-stage/fs/ext3/super.c 2005-06-26 11:02:29.599941754 +0200 -@@ -2236,11 +2232,9 @@ - * block group descriptors. If the sparse superblocks - * feature is turned on, then not all groups have this. 
- */ -- for (i = 0; i < ngroups; i++) { -+ for (i = 0; i < ngroups; i++) - overhead += ext3_bg_has_super(sb, i) + - ext3_bg_num_gdb(sb, i); -- cond_resched(); -- } - - /* - * Every block group has an inode bitmap, a block diff --git a/ldiskfs/kernel_patches/patches/ext3-rename-reserve-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-rename-reserve-2.6-suse.patch deleted file mode 100644 index f323584..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-rename-reserve-2.6-suse.patch +++ /dev/null @@ -1,263 +0,0 @@ -Index: linux-2.6.5-sles9/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:29:14.878513832 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:32:14.151260232 +0300 -@@ -709,7 +709,7 @@ - unsigned int block_group, - struct buffer_head ** bh); - extern int ext3_should_retry_alloc(struct super_block *sb, int *retries); --extern void rsv_window_add(struct super_block *sb, struct reserve_window_node *rsv); -+extern void rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv); - - /* dir.c */ - extern int ext3_check_dir_entry(const char *, struct inode *, -Index: linux-2.6.5-sles9/include/linux/ext3_fs_sb.h -=================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs_sb.h 2004-11-09 02:28:18.753046200 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs_sb.h 2004-11-09 02:32:27.996155488 +0300 -@@ -86,7 +86,7 @@ - /* root of the per fs reservation window tree */ - spinlock_t s_rsv_window_lock; - struct rb_root s_rsv_window_root; -- struct reserve_window_node s_rsv_window_head; -+ struct ext3_reserve_window_node s_rsv_window_head; - - /* Journaling */ - struct inode * s_journal_inode; -Index: linux-2.6.5-sles9/include/linux/ext3_fs_i.h -=================================================================== ---- linux-2.6.5-sles9.orig/include/linux/ext3_fs_i.h 
2004-11-09 02:23:21.606219384 +0300 -+++ linux-2.6.5-sles9/include/linux/ext3_fs_i.h 2004-11-09 02:32:08.752081032 +0300 -@@ -20,17 +20,17 @@ - #include - #include - --struct reserve_window { -+struct ext3_reserve_window { - __u32 _rsv_start; /* First byte reserved */ - __u32 _rsv_end; /* Last byte reserved or 0 */ - }; - --struct reserve_window_node { -+struct ext3_reserve_window_node { - struct rb_node rsv_node; - atomic_t rsv_goal_size; - atomic_t rsv_alloc_hit; - seqlock_t rsv_seqlock; -- struct reserve_window rsv_window; -+ struct ext3_reserve_window rsv_window; - }; - - #define rsv_start rsv_window._rsv_start -@@ -76,7 +76,7 @@ - */ - __u32 i_next_alloc_goal; - /* block reservation window */ -- struct reserve_window_node i_rsv_window; -+ struct ext3_reserve_window_node i_rsv_window; - - __u32 i_dir_start_lookup; - #ifdef CONFIG_EXT3_FS_XATTR -Index: linux-2.6.5-sles9/fs/ext3/balloc.c -=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/balloc.c 2004-11-09 02:26:53.078070776 +0300 -+++ linux-2.6.5-sles9/fs/ext3/balloc.c 2004-11-09 02:32:43.108858008 +0300 -@@ -115,7 +115,7 @@ - const char *fn) - { - struct rb_node *n; -- struct reserve_window_node *rsv, *prev; -+ struct ext3_reserve_window_node *rsv, *prev; - int bad; - - restart: -@@ -125,7 +125,7 @@ - - printk("Block Allocation Reservation Windows Map (%s):\n", fn); - while (n) { -- rsv = list_entry(n, struct reserve_window_node, rsv_node); -+ rsv = list_entry(n, struct ext3_reserve_window_node, rsv_node); - if (verbose) - printk("reservation window 0x%p " - "start: %d, end: %d\n", -@@ -161,7 +161,7 @@ - #endif - - static int --goal_in_my_reservation(struct reserve_window *rsv, int goal, -+goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal, - unsigned int group, struct super_block * sb) - { - unsigned long group_first_block, group_last_block; -@@ -184,18 +184,18 @@ - * if the goal is not in any window. 
- * Returns NULL if there are no windows or if all windows start after the goal. - */ --static struct reserve_window_node *search_reserve_window(struct rb_root *root, -+static struct ext3_reserve_window_node *search_ext3_reserve_window(struct rb_root *root, - unsigned long goal) - { - struct rb_node *n = root->rb_node; -- struct reserve_window_node *rsv; -+ struct ext3_reserve_window_node *rsv; - - if (!n) - return NULL; - - while (n) - { -- rsv = rb_entry(n, struct reserve_window_node, rsv_node); -+ rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); - - if (goal < rsv->rsv_start) - n = n->rb_left; -@@ -212,13 +212,13 @@ - */ - if (rsv->rsv_start > goal) { - n = rb_prev(&rsv->rsv_node); -- rsv = rb_entry(n, struct reserve_window_node, rsv_node); -+ rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); - } - return rsv; - } - - void rsv_window_add(struct super_block *sb, -- struct reserve_window_node *rsv) -+ struct ext3_reserve_window_node *rsv) - { - struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root; - struct rb_node *node = &rsv->rsv_node; -@@ -226,12 +226,12 @@ - - struct rb_node ** p = &root->rb_node; - struct rb_node * parent = NULL; -- struct reserve_window_node *this; -+ struct ext3_reserve_window_node *this; - - while (*p) - { - parent = *p; -- this = rb_entry(parent, struct reserve_window_node, rsv_node); -+ this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node); - - if (start < this->rsv_start) - p = &(*p)->rb_left; -@@ -246,7 +246,7 @@ - } - - static void rsv_window_remove(struct super_block *sb, -- struct reserve_window_node *rsv) -+ struct ext3_reserve_window_node *rsv) - { - rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; -@@ -254,7 +254,7 @@ - rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root); - } - --static inline int rsv_is_empty(struct reserve_window *rsv) -+static inline int rsv_is_empty(struct ext3_reserve_window *rsv) - { - /* a valid reservation 
end block could not be 0 */ - return (rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED); -@@ -263,7 +263,7 @@ - void ext3_discard_reservation(struct inode *inode) - { - struct ext3_inode_info *ei = EXT3_I(inode); -- struct reserve_window_node *rsv = &ei->i_rsv_window; -+ struct ext3_reserve_window_node *rsv = &ei->i_rsv_window; - spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock; - - if (!rsv_is_empty(&rsv->rsv_window)) { -@@ -600,7 +600,7 @@ - */ - static int - ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group, -- struct buffer_head *bitmap_bh, int goal, struct reserve_window *my_rsv) -+ struct buffer_head *bitmap_bh, int goal, struct ext3_reserve_window *my_rsv) - { - int group_first_block, start, end; - -@@ -700,13 +700,13 @@ - * on succeed, it returns the reservation window to be appended to. - * failed, return NULL. - */ --static struct reserve_window_node *find_next_reservable_window( -- struct reserve_window_node *search_head, -+static struct ext3_reserve_window_node *find_next_reservable_window( -+ struct ext3_reserve_window_node *search_head, - unsigned long size, int *start_block, - int last_block) - { - struct rb_node *next; -- struct reserve_window_node *rsv, *prev; -+ struct ext3_reserve_window_node *rsv, *prev; - int cur; - - /* TODO: make the start of the reservation window byte-aligned */ -@@ -734,7 +734,7 @@ - - prev = rsv; - next = rb_next(&rsv->rsv_node); -- rsv = list_entry(next, struct reserve_window_node, rsv_node); -+ rsv = list_entry(next, struct ext3_reserve_window_node, rsv_node); - - /* - * Reached the last reservation, we can just append to the -@@ -801,15 +801,15 @@ - * @group: the group we are trying to allocate in - * @bitmap_bh: the block group block bitmap - */ --static int alloc_new_reservation(struct reserve_window_node *my_rsv, -+static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, - int goal, struct super_block *sb, - unsigned int group, struct buffer_head *bitmap_bh) - 
{ -- struct reserve_window_node *search_head; -+ struct ext3_reserve_window_node *search_head; - int group_first_block, group_end_block, start_block; - int first_free_block; - int reservable_space_start; -- struct reserve_window_node *prev_rsv; -+ struct ext3_reserve_window_node *prev_rsv; - struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root; - unsigned long size; - -@@ -859,7 +859,7 @@ - /* - * shift the search start to the window near the goal block - */ -- search_head = search_reserve_window(fs_rsv_root, start_block); -+ search_head = search_ext3_reserve_window(fs_rsv_root, start_block); - - /* - * find_next_reservable_window() simply finds a reservable window -@@ -968,7 +968,7 @@ - static int - ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, - unsigned int group, struct buffer_head *bitmap_bh, -- int goal, struct reserve_window_node * my_rsv, -+ int goal, struct ext3_reserve_window_node * my_rsv, - int *errp) - { - spinlock_t *rsv_lock; -@@ -1027,7 +1027,7 @@ - * then we could go to allocate from the reservation window directly. 
- */ - while (1) { -- struct reserve_window rsv_copy; -+ struct ext3_reserve_window rsv_copy; - unsigned int seq; - - do { -@@ -1159,8 +1159,8 @@ - struct ext3_group_desc *gdp; - struct ext3_super_block *es; - struct ext3_sb_info *sbi; -- struct reserve_window_node *my_rsv = NULL; -- struct reserve_window_node *rsv = &EXT3_I(inode)->i_rsv_window; -+ struct ext3_reserve_window_node *my_rsv = NULL; -+ struct ext3_reserve_window_node *rsv = &EXT3_I(inode)->i_rsv_window; - unsigned short windowsz = 0; - #ifdef EXT3FS_DEBUG - static int goal_hits, goal_attempts; diff --git a/ldiskfs/kernel_patches/patches/ext3-san-jdike-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-san-jdike-2.6-suse.patch deleted file mode 100644 index afda0bd..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-san-jdike-2.6-suse.patch +++ /dev/null @@ -1,106 +0,0 @@ - fs/ext3/inode.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - fs/ext3/super.c | 4 ++ - 2 files changed, 85 insertions(+) - ---- linux-2.5.73/fs/ext3/inode.c~ext3-san-jdike-2.5.73 2003-06-22 12:32:58.000000000 -0600 -+++ linux-2.5.73-braam/fs/ext3/inode.c 2003-06-30 12:19:21.000000000 -0600 -@@ -2945,3 +2945,84 @@ int ext3_change_inode_journal_flag(struc - - return err; - } -+ -+/* for each block: 1 ind + 1 dind + 1 tind -+ * for each block: 3 bitmap blocks -+ * for each block: 3 group descriptor blocks -+ * i inode block -+ * 1 superblock -+ * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files -+ * ((1+1+1) * 3 * nblocks) + 1 + 1 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS -+ * -+ * XXX assuming: -+ * (1) fs logic block size == page size -+ * (2) ext3 in writeback mode -+ */ -+static inline int ext3_san_write_trans_blocks(int nblocks) -+{ -+ int ret; -+ -+ ret = (1 + 1 + 1) * 3 * nblocks + 1 + 1; -+ -+#ifdef CONFIG_QUOTA -+ ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; -+#endif -+ -+ return ret; -+} -+ -+/* Alloc blocks for an inode, while don't create any buffer/page -+ * for data I/O; set the inode size if file is extended. 
-+ * -+ * @inode: target inode -+ * @blocks: array of logic block number -+ * @nblocks: how many blocks need be alloced -+ * @newsize: new filesize we should set -+ * -+ * return: 0 success, otherwise failed -+ * (*blocks) contains physical block number alloced -+ * -+ * XXX this assume the fs block size == page size -+ */ -+int ext3_prep_san_write(struct inode *inode, long *blocks, -+ int nblocks, loff_t newsize) -+{ -+ handle_t *handle; -+ struct buffer_head bh_tmp; -+ int needed_blocks; -+ int i, ret = 0, ret2; -+ -+ needed_blocks = ext3_san_write_trans_blocks(nblocks); -+ -+ lock_kernel(); -+ handle = ext3_journal_start(inode, needed_blocks); -+ if (IS_ERR(handle)) { -+ unlock_kernel(); -+ return PTR_ERR(handle); -+ } -+ unlock_kernel(); -+ -+ /* alloc blocks one by one */ -+ for (i = 0; i < nblocks; i++) { -+ ret = ext3_get_block_handle(handle, inode, blocks[i], -+ &bh_tmp, 1, 1); -+ if (ret) -+ break; -+ -+ blocks[i] = bh_tmp.b_blocknr; -+ } -+ -+ /* set inode size if needed */ -+ if (!ret && (newsize > inode->i_size)) { -+ inode->i_size = newsize; -+ ext3_mark_inode_dirty(handle, inode); -+ } -+ -+ lock_kernel(); -+ ret2 = ext3_journal_stop(handle); -+ unlock_kernel(); -+ -+ if (!ret) -+ ret = ret2; -+ return ret; -+} ---- linux-2.5.73/fs/ext3/super.c~ext3-san-jdike-2.5.73 2003-06-22 12:33:16.000000000 -0600 -+++ linux-2.5.73-braam/fs/ext3/super.c 2003-06-30 12:16:36.000000000 -0600 -@@ -2080,6 +2080,10 @@ static void __exit exit_ext3_fs(void) - exit_ext3_xattr(); - } - -+int ext3_prep_san_write(struct inode *inode, long *blocks, -+ int nblocks, loff_t newsize); -+EXPORT_SYMBOL(ext3_prep_san_write); -+ - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); - MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); - MODULE_LICENSE("GPL"); - -_ diff --git a/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch 
deleted file mode 100644 index ef0f4a4..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.12.patch +++ /dev/null @@ -1,64 +0,0 @@ -Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem -From: Mingming Cao - - -If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e. -CONFIG_LBD not defined in the kernel), the calculation of the disk sector -will overflow. Add check at ext3_fill_super() and ext3_group_extend() to -prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4 -bytes. - -Verified this patch on a 32 bit platform without CONFIG_LBD defined -(sector_t is 32 bits long), mount refuse to mount a 10TB ext3. - -Signed-off-by: Mingming Cao -Acked-by: Andreas Dilger -Signed-off-by: Andrew Morton ---- - - fs/ext3/resize.c | 10 ++++++++++ - fs/ext3/super.c | 10 ++++++++++ - 2 files changed, 20 insertions(+) - -diff -puN fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/resize.c ---- devel/fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 -+++ devel-akpm/fs/ext3/resize.c 2006-05-22 14:10:56.000000000 -0700 -@@ -926,6 +926,16 @@ int ext3_group_extend(struct super_block - if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) - return 0; - -+ if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { -+ printk(KERN_ERR "EXT3-fs: filesystem on %s: " -+ "too large to resize to %lu blocks safely\n", -+ sb->s_id, n_blocks_count); -+ if (sizeof(sector_t) < 8) -+ ext3_warning(sb, __FUNCTION__, -+ "CONFIG_LBD not enabled\n"); -+ return -EINVAL; -+ } -+ - if (n_blocks_count < o_blocks_count) { - ext3_warning(sb, __FUNCTION__, - "can't shrink FS - resize aborted"); -diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c ---- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 -+++ devel-akpm/fs/ext3/super.c 2006-05-22 
14:11:10.000000000 -0700 -@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super - goto failed_mount; - } - -+ if (le32_to_cpu(es->s_blocks_count) > -+ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { -+ printk(KERN_ERR "EXT3-fs: filesystem on %s: " -+ "too large to mount safely - %u blocks\n", sb->s_id, -+ le32_to_cpu(es->s_blocks_count)); -+ if (sizeof(sector_t) < 8) -+ printk(KERN_WARNING -+ "EXT3-fs: CONFIG_LBD not enabled\n"); -+ goto failed_mount; -+ } -+ - if (EXT3_BLOCKS_PER_GROUP(sb) == 0) - goto cantfind_ext3; - sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - -_ diff --git a/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch b/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch deleted file mode 100644 index fe655da..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.5-suse.patch +++ /dev/null @@ -1,44 +0,0 @@ -Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem -From: Mingming Cao - - -If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e. -CONFIG_LBD not defined in the kernel), the calculation of the disk sector -will overflow. Add check at ext3_fill_super() and ext3_group_extend() to -prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4 -bytes. - -Verified this patch on a 32 bit platform without CONFIG_LBD defined -(sector_t is 32 bits long), mount refuse to mount a 10TB ext3. 
- -Signed-off-by: Mingming Cao -Acked-by: Andreas Dilger -Signed-off-by: Andrew Morton ---- - - fs/ext3/resize.c | 10 ++++++++++ - fs/ext3/super.c | 10 ++++++++++ - 2 files changed, 20 insertions(+) - -diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c ---- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 -+++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700 -@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super - goto failed_mount; - } - -+ if (le32_to_cpu(es->s_blocks_count) > -+ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { -+ printk(KERN_ERR "EXT3-fs: filesystem on %s: " -+ "too large to mount safely - %u blocks\n", sb->s_id, -+ le32_to_cpu(es->s_blocks_count)); -+ if (sizeof(sector_t) < 8) -+ printk(KERN_WARNING -+ "EXT3-fs: CONFIG_LBD not enabled\n"); -+ goto failed_mount; -+ } -+ - sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - - le32_to_cpu(es->s_first_data_block) + - EXT3_BLOCKS_PER_GROUP(sb) - 1) / -_ diff --git a/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch deleted file mode 100644 index 9bfdf80..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-sector_t-overflow-2.6.9-rhel4.patch +++ /dev/null @@ -1,64 +0,0 @@ -Subject: Avoid disk sector_t overflow for >2TB ext3 filesystem -From: Mingming Cao - - -If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e. -CONFIG_LBD not defined in the kernel), the calculation of the disk sector -will overflow. Add check at ext3_fill_super() and ext3_group_extend() to -prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4 -bytes. - -Verified this patch on a 32 bit platform without CONFIG_LBD defined -(sector_t is 32 bits long), mount refuse to mount a 10TB ext3. 
- -Signed-off-by: Mingming Cao -Acked-by: Andreas Dilger -Signed-off-by: Andrew Morton ---- - - fs/ext3/resize.c | 10 ++++++++++ - fs/ext3/super.c | 10 ++++++++++ - 2 files changed, 20 insertions(+) - -diff -puN fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/resize.c ---- devel/fs/ext3/resize.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 -+++ devel-akpm/fs/ext3/resize.c 2006-05-22 14:10:56.000000000 -0700 -@@ -926,6 +926,16 @@ int ext3_group_extend(struct super_block - if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) - return 0; - -+ if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { -+ printk(KERN_ERR "EXT3-fs: filesystem on %s: " -+ "too large to resize to %lu blocks safely\n", -+ sb->s_id, n_blocks_count); -+ if (sizeof(sector_t) < 8) -+ ext3_warning(sb, __FUNCTION__, -+ "CONFIG_LBD not enabled\n"); -+ return -EINVAL; -+ } -+ - if (n_blocks_count < o_blocks_count) { - ext3_warning(sb, __FUNCTION__, - "can't shrink FS - resize aborted"); -diff -puN fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem fs/ext3/super.c ---- devel/fs/ext3/super.c~avoid-disk-sector_t-overflow-for-2tb-ext3-filesystem 2006-05-22 14:09:53.000000000 -0700 -+++ devel-akpm/fs/ext3/super.c 2006-05-22 14:11:10.000000000 -0700 -@@ -1565,6 +1565,17 @@ static int ext3_fill_super (struct super - goto failed_mount; - } - -+ if (le32_to_cpu(es->s_blocks_count) > -+ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { -+ printk(KERN_ERR "EXT3-fs: filesystem on %s: " -+ "too large to mount safely - %u blocks\n", sb->s_id, -+ le32_to_cpu(es->s_blocks_count)); -+ if (sizeof(sector_t) < 8) -+ printk(KERN_WARNING -+ "EXT3-fs: CONFIG_LBD not enabled\n"); -+ goto failed_mount; -+ } -+ - sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - - le32_to_cpu(es->s_first_data_block) + - EXT3_BLOCKS_PER_GROUP(sb) - 1) / -_ diff --git 
a/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-rhel4.patch deleted file mode 100644 index f71e470005..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-rhel4.patch +++ /dev/null @@ -1,193 +0,0 @@ -diff -urp RH_2_6_9_42_0_3.orig/fs/ext3/ialloc.c RH_2_6_9_42_0_3/fs/ext3/ialloc.c ---- RH_2_6_9_42_0_3.orig/fs/ext3/ialloc.c 2006-10-23 13:32:46.000000000 +0300 -+++ RH_2_6_9_42_0_3/fs/ext3/ialloc.c 2007-02-16 07:22:28.000000000 +0200 -@@ -419,7 +419,8 @@ static int find_group_other(struct super - * For other inodes, search forward from the parent directory's block - * group to find a free inode. - */ --struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) -+struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode, -+ unsigned long goal) - { - struct super_block *sb; - struct buffer_head *bitmap_bh = NULL; -@@ -447,6 +448,41 @@ struct inode *ext3_new_inode(handle_t *h - - sbi = EXT3_SB(sb); - es = sbi->s_es; -+ if (goal) { -+ group = (goal - 1) / EXT3_INODES_PER_GROUP(sb); -+ ino = (goal - 1) % EXT3_INODES_PER_GROUP(sb); -+ err = -EIO; -+ -+ gdp = ext3_get_group_desc(sb, group, &bh2); -+ if (!gdp) -+ goto fail; -+ -+ bitmap_bh = read_inode_bitmap (sb, group); -+ if (!bitmap_bh) -+ goto fail; -+ -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) goto fail; -+ -+ if (ext3_set_bit_atomic(sb_bgl_lock(sbi, group), -+ ino, bitmap_bh->b_data)) { -+ printk(KERN_ERR "goal inode %lu unavailable\n", goal); -+ /* Oh well, we tried. */ -+ goto continue_allocation; -+ } -+ -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ if (err) goto fail; -+ -+ /* We've shortcircuited the allocation system successfully, -+ * now finish filling in the inode. 
-+ */ -+ goto got; -+ } -+ -+continue_allocation: - if (S_ISDIR(mode)) { - if (test_opt (sb, OLDALLOC)) - group = find_group_dir(sb, dir); -diff -urp RH_2_6_9_42_0_3.orig/fs/ext3/ioctl.c RH_2_6_9_42_0_3/fs/ext3/ioctl.c ---- RH_2_6_9_42_0_3.orig/fs/ext3/ioctl.c 2006-10-23 13:32:46.000000000 +0300 -+++ RH_2_6_9_42_0_3/fs/ext3/ioctl.c 2007-02-16 07:22:28.000000000 +0200 -@@ -25,6 +25,31 @@ int ext3_ioctl (struct inode * inode, st - ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); - - switch (cmd) { -+ case EXT3_IOC_CREATE_INUM: { -+ char name[32]; -+ struct dentry *dchild, *dparent; -+ int rc = 0; -+ -+ dparent = list_entry(inode->i_dentry.next, struct dentry, -+ d_alias); -+ snprintf(name, sizeof name, "%lu", arg); -+ dchild = lookup_one_len(name, dparent, strlen(name)); -+ if (dchild->d_inode) { -+ printk(KERN_ERR "%*s/%lu already exists (ino %lu)\n", -+ dparent->d_name.len, dparent->d_name.name, arg, -+ dchild->d_inode->i_ino); -+ rc = -EEXIST; -+ } else { -+ dchild->d_fsdata = (void *)arg; -+ rc = vfs_create(inode, dchild, 0644, NULL); -+ if (rc) -+ printk(KERN_ERR "vfs_create: %d\n", rc); -+ else if (dchild->d_inode->i_ino != arg) -+ rc = -EEXIST; -+ } -+ dput(dchild); -+ return rc; -+ } - case EXT3_IOC_GETFLAGS: - flags = ei->i_flags & EXT3_FL_USER_VISIBLE; - return put_user(flags, (int __user *) arg); -diff -urp RH_2_6_9_42_0_3.orig/fs/ext3/namei.c RH_2_6_9_42_0_3/fs/ext3/namei.c ---- RH_2_6_9_42_0_3.orig/fs/ext3/namei.c 2006-10-23 13:32:59.000000000 +0300 -+++ RH_2_6_9_42_0_3/fs/ext3/namei.c 2007-02-22 18:58:13.000000000 +0200 -@@ -97,6 +97,7 @@ struct dx_entry - __le32 block; - }; - -+ - /* - * dx_root_info is laid out so that if it should somehow get overlaid by a - * dirent the two low bits of the hash version will be zero. 
Therefore, the -@@ -141,6 +142,14 @@ struct dx_map_entry - u32 offs; - }; - -+#define LVFS_DENTRY_PARAM_MAGIC 20070216UL -+struct lvfs_dentry_params -+{ -+ unsigned long p_inum; -+ void *p_ptr; -+ u32 magic; -+}; -+ - #ifdef CONFIG_EXT3_INDEX - static inline unsigned dx_get_block (struct dx_entry *entry); - static void dx_set_block (struct dx_entry *entry, unsigned value); -@@ -1624,6 +1633,20 @@ static int ext3_add_nondir(handle_t *han - return err; - } - -+static struct inode * ext3_new_inode_wantedi(handle_t *handle, struct inode *dir, -+ int mode, struct dentry *dentry) -+{ -+ unsigned long inum = 0; -+ -+ if (dentry->d_fsdata != NULL) { -+ struct lvfs_dentry_params *param = dentry->d_fsdata; -+ -+ if (param->magic == LVFS_DENTRY_PARAM_MAGIC) -+ inum = param->p_inum; -+ } -+ return ext3_new_inode(handle, dir, mode, inum); -+} -+ - /* - * By the time this is called, we already have created - * the directory cache entry for the new file, but it -@@ -1649,7 +1672,7 @@ retry: - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, mode); -+ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ext3_file_inode_operations; -@@ -1683,7 +1706,7 @@ retry: - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, mode); -+ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - init_special_inode(inode, inode->i_mode, rdev); -@@ -1719,7 +1742,7 @@ retry: - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, S_IFDIR | mode); -+ inode = ext3_new_inode_wantedi (handle, dir, S_IFDIR | mode, dentry); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; -@@ -2124,7 +2147,7 @@ retry: - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); -+ inode = ext3_new_inode_wantedi (handle, dir, S_IFLNK|S_IRWXUGO, 
dentry); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; -diff -urp RH_2_6_9_42_0_3.orig/include/linux/ext3_fs.h RH_2_6_9_42_0_3/include/linux/ext3_fs.h ---- RH_2_6_9_42_0_3.orig/include/linux/ext3_fs.h 2006-10-23 13:32:46.000000000 +0300 -+++ RH_2_6_9_42_0_3/include/linux/ext3_fs.h 2007-02-16 07:22:28.000000000 +0200 -@@ -741,7 +741,8 @@ extern int ext3fs_dirhash(const char *na - dx_hash_info *hinfo); - - /* ialloc.c */ --extern struct inode * ext3_new_inode (handle_t *, struct inode *, int); -+extern struct inode * ext3_new_inode (handle_t *, struct inode *, int, -+ unsigned long); - extern void ext3_free_inode (handle_t *, struct inode *); - extern struct inode * ext3_orphan_get (struct super_block *, unsigned long); - extern unsigned long ext3_count_free_inodes (struct super_block *); -@@ -833,4 +834,6 @@ extern struct inode_operations ext3_fast - - #endif /* __KERNEL__ */ - -+/* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ -+#define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) - #endif /* _LINUX_EXT3_FS_H */ diff --git a/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-suse.patch deleted file mode 100644 index e38bedb..0000000 --- a/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-suse.patch +++ /dev/null @@ -1,192 +0,0 @@ -diff -urp linux-2.6.5-7.282.orig/fs/ext3/ialloc.c linux-2.6.5-7.282/fs/ext3/ialloc.c ---- linux-2.6.5-7.282.orig/fs/ext3/ialloc.c 2006-08-30 17:12:13.000000000 +0300 -+++ linux-2.6.5-7.282/fs/ext3/ialloc.c 2007-02-16 07:43:08.000000000 +0200 -@@ -420,7 +420,8 @@ static int find_group_other(struct super - * For other inodes, search forward from the parent directory's block - * group to find a free inode. 
- */ --struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) -+struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode, -+ unsigned long goal) - { - struct super_block *sb; - struct buffer_head *bitmap_bh = NULL; -@@ -448,6 +449,41 @@ struct inode *ext3_new_inode(handle_t *h - - sbi = EXT3_SB(sb); - es = sbi->s_es; -+ if (goal) { -+ group = (goal - 1) / EXT3_INODES_PER_GROUP(sb); -+ ino = (goal - 1) % EXT3_INODES_PER_GROUP(sb); -+ err = -EIO; -+ -+ gdp = ext3_get_group_desc(sb, group, &bh2); -+ if (!gdp) -+ goto fail; -+ -+ bitmap_bh = read_inode_bitmap (sb, group); -+ if (!bitmap_bh) -+ goto fail; -+ -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3_journal_get_write_access(handle, bitmap_bh); -+ if (err) goto fail; -+ -+ if (ext3_set_bit_atomic(sb_bgl_lock(sbi, group), -+ ino, bitmap_bh->b_data)) { -+ printk(KERN_ERR "goal inode %lu unavailable\n", goal); -+ /* Oh well, we tried. */ -+ goto continue_allocation; -+ } -+ -+ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -+ err = ext3_journal_dirty_metadata(handle, bitmap_bh); -+ if (err) goto fail; -+ -+ /* We've shortcircuited the allocation system successfully, -+ * now finish filling in the inode. 
-+ */ -+ goto got; -+ } -+ -+continue_allocation: - if (S_ISDIR(mode)) { - if (test_opt (sb, OLDALLOC)) - group = find_group_dir(sb, dir); -diff -urp linux-2.6.5-7.282.orig/fs/ext3/ioctl.c linux-2.6.5-7.282/fs/ext3/ioctl.c ---- linux-2.6.5-7.282.orig/fs/ext3/ioctl.c 2006-08-30 17:12:13.000000000 +0300 -+++ linux-2.6.5-7.282/fs/ext3/ioctl.c 2007-02-16 07:43:08.000000000 +0200 -@@ -25,6 +25,31 @@ int ext3_ioctl (struct inode * inode, st - ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); - - switch (cmd) { -+ case EXT3_IOC_CREATE_INUM: { -+ char name[32]; -+ struct dentry *dchild, *dparent; -+ int rc = 0; -+ -+ dparent = list_entry(inode->i_dentry.next, struct dentry, -+ d_alias); -+ snprintf(name, sizeof name, "%lu", arg); -+ dchild = lookup_one_len(name, dparent, strlen(name)); -+ if (dchild->d_inode) { -+ printk(KERN_ERR "%*s/%lu already exists (ino %lu)\n", -+ dparent->d_name.len, dparent->d_name.name, arg, -+ dchild->d_inode->i_ino); -+ rc = -EEXIST; -+ } else { -+ dchild->d_fsdata = (void *)arg; -+ rc = vfs_create(inode, dchild, 0644, NULL); -+ if (rc) -+ printk(KERN_ERR "vfs_create: %d\n", rc); -+ else if (dchild->d_inode->i_ino != arg) -+ rc = -EEXIST; -+ } -+ dput(dchild); -+ return rc; -+ } - case EXT3_IOC_GETFLAGS: - flags = ei->i_flags & EXT3_FL_USER_VISIBLE; - return put_user(flags, (int *) arg); -diff -urp linux-2.6.5-7.282.orig/fs/ext3/namei.c linux-2.6.5-7.282/fs/ext3/namei.c ---- linux-2.6.5-7.282.orig/fs/ext3/namei.c 2006-08-30 17:12:34.000000000 +0300 -+++ linux-2.6.5-7.282/fs/ext3/namei.c 2007-02-16 07:46:13.000000000 +0200 -@@ -144,6 +144,14 @@ struct dx_map_entry - u32 offs; - }; - -+#define LVFS_DENTRY_PARAM_MAGIC 20070216UL -+struct lvfs_dentry_params -+{ -+ unsigned long p_inum; -+ void *p_ptr; -+ u32 magic; -+}; -+ - #ifdef CONFIG_EXT3_INDEX - static inline unsigned dx_get_block (struct dx_entry *entry); - static void dx_set_block (struct dx_entry *entry, unsigned value); -@@ -1625,6 +1633,20 @@ static int ext3_add_nondir(handle_t *han - 
return err; - } - -+static struct inode * ext3_new_inode_wantedi(handle_t *handle, struct inode *dir, -+ int mode, struct dentry *dentry) -+{ -+ unsigned long inum = 0; -+ -+ if (dentry->d_fsdata != NULL) { -+ struct lvfs_dentry_params *param = dentry->d_fsdata; -+ -+ if (param->magic == LVFS_DENTRY_PARAM_MAGIC) -+ inum = param->p_inum; -+ } -+ return ext3_new_inode(handle, dir, mode, inum); -+} -+ - /* - * By the time this is called, we already have created - * the directory cache entry for the new file, but it -@@ -1649,7 +1671,7 @@ retry: - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, mode); -+ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ext3_file_inode_operations; -@@ -1682,7 +1704,7 @@ retry: - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, mode); -+ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - init_special_inode(inode, inode->i_mode, rdev); -@@ -1718,7 +1740,7 @@ retry: - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, S_IFDIR | mode); -+ inode = ext3_new_inode_wantedi (handle, dir, S_IFDIR | mode, dentry); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; -@@ -2113,7 +2135,7 @@ retry: - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); -+ inode = ext3_new_inode_wantedi (handle, dir, S_IFLNK|S_IRWXUGO, dentry); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; -diff -urp linux-2.6.5-7.282.orig/include/linux/ext3_fs.h linux-2.6.5-7.282/include/linux/ext3_fs.h ---- linux-2.6.5-7.282.orig/include/linux/ext3_fs.h 2006-08-30 17:12:13.000000000 +0300 -+++ linux-2.6.5-7.282/include/linux/ext3_fs.h 2007-02-16 07:43:08.000000000 +0200 -@@ -203,6 +203,7 @@ struct ext3_group_desc - #define EXT3_IOC_SETFLAGS _IOW('f', 2, long) - #define 
EXT3_IOC_GETVERSION _IOR('f', 3, long) - #define EXT3_IOC_SETVERSION _IOW('f', 4, long) -+/* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ - #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long) - #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long) - #ifdef CONFIG_JBD_DEBUG -@@ -712,7 +713,8 @@ extern int ext3fs_dirhash(const char *na - dx_hash_info *hinfo); - - /* ialloc.c */ --extern struct inode * ext3_new_inode (handle_t *, struct inode *, int); -+extern struct inode * ext3_new_inode (handle_t *, struct inode *, int, -+ unsigned long); - extern void ext3_free_inode (handle_t *, struct inode *); - extern struct inode * ext3_orphan_get (struct super_block *, unsigned long); - extern unsigned long ext3_count_free_inodes (struct super_block *); -@@ -797,4 +799,5 @@ extern struct inode_operations ext3_fast - - #endif /* __KERNEL__ */ - -+#define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) - #endif /* _LINUX_EXT3_FS_H */ diff --git a/ldiskfs/kernel_patches/patches/iopen-2.6-fc5.patch b/ldiskfs/kernel_patches/patches/iopen-2.6-fc5.patch deleted file mode 100644 index 6bbcec5..0000000 --- a/ldiskfs/kernel_patches/patches/iopen-2.6-fc5.patch +++ /dev/null @@ -1,448 +0,0 @@ -Index: linux-2.6.16.i686/fs/ext3/iopen.c -=================================================================== ---- linux-2.6.16.i686.orig/fs/ext3/iopen.c 2006-05-31 04:14:15.752410384 +0800 -+++ linux-2.6.16.i686/fs/ext3/iopen.c 2006-05-30 22:52:38.000000000 +0800 -@@ -0,0 +1,259 @@ -+/* -+ * linux/fs/ext3/iopen.c -+ * -+ * Special support for open by inode number -+ * -+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). -+ * -+ * This file may be redistributed under the terms of the GNU General -+ * Public License. -+ * -+ * -+ * Invariants: -+ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias -+ * for an inode at one time. -+ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry -+ * aliases on an inode at the same time. 
-+ * -+ * If we have any connected dentry aliases for an inode, use one of those -+ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED -+ * dentry for this inode, which thereafter will be found by the dcache -+ * when looking up this inode number in __iopen__, so we don't return here -+ * until it is gone. -+ * -+ * If we get an inode via a regular name lookup, then we "rename" the -+ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures -+ * existing users of the disconnected dentry will continue to use the same -+ * dentry as the connected users, and there will never be both kinds of -+ * dentry aliases at one time. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "iopen.h" -+ -+#ifndef assert -+#define assert(test) J_ASSERT(test) -+#endif -+ -+#define IOPEN_NAME_LEN 32 -+ -+/* -+ * This implements looking up an inode by number. -+ */ -+static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, -+ struct nameidata *nd) -+{ -+ struct inode *inode; -+ unsigned long ino; -+ struct list_head *lp; -+ struct dentry *alternate; -+ char buf[IOPEN_NAME_LEN]; -+ -+ if (dentry->d_name.len >= IOPEN_NAME_LEN) -+ return ERR_PTR(-ENAMETOOLONG); -+ -+ memcpy(buf, dentry->d_name.name, dentry->d_name.len); -+ buf[dentry->d_name.len] = 0; -+ -+ if (strcmp(buf, ".") == 0) -+ ino = dir->i_ino; -+ else if (strcmp(buf, "..") == 0) -+ ino = EXT3_ROOT_INO; -+ else -+ ino = simple_strtoul(buf, 0, 0); -+ -+ if ((ino != EXT3_ROOT_INO && -+ //ino != EXT3_ACL_IDX_INO && -+ //ino != EXT3_ACL_DATA_INO && -+ ino < EXT3_FIRST_INO(dir->i_sb)) || -+ ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) -+ return ERR_PTR(-ENOENT); -+ -+ inode = iget(dir->i_sb, ino); -+ if (!inode) -+ return ERR_PTR(-EACCES); -+ if (is_bad_inode(inode)) { -+ iput(inode); -+ return ERR_PTR(-ENOENT); -+ } -+ -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ 
assert(d_unhashed(dentry)); /* d_rehash */ -+ -+ /* preferrably return a connected dentry */ -+ spin_lock(&dcache_lock); -+ list_for_each(lp, &inode->i_dentry) { -+ alternate = list_entry(lp, struct dentry, d_alias); -+ assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); -+ } -+ -+ if (!list_empty(&inode->i_dentry)) { -+ alternate = list_entry(inode->i_dentry.next, -+ struct dentry, d_alias); -+ dget_locked(alternate); -+ spin_lock(&alternate->d_lock); -+ alternate->d_flags |= DCACHE_REFERENCED; -+ spin_unlock(&alternate->d_lock); -+ iput(inode); -+ spin_unlock(&dcache_lock); -+ return alternate; -+ } -+ dentry->d_flags |= DCACHE_DISCONNECTED; -+ -+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ -+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ -+ dentry->d_inode = inode; -+ spin_unlock(&dcache_lock); -+ -+ d_rehash(dentry); -+ -+ return NULL; -+} -+ -+/* This function is spliced into ext3_lookup and does the move of a -+ * disconnected dentry (if it exists) to a connected dentry. 
-+ */ -+struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, -+ int rehash) -+{ -+ struct dentry *tmp, *goal = NULL; -+ struct list_head *lp; -+ -+ /* verify this dentry is really new */ -+ assert(dentry->d_inode == NULL); -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ if (rehash) -+ assert(d_unhashed(dentry)); /* d_rehash */ -+ assert(list_empty(&dentry->d_subdirs)); -+ -+ spin_lock(&dcache_lock); -+ if (!inode) -+ goto do_rehash; -+ -+ if (!test_opt(inode->i_sb, IOPEN)) -+ goto do_instantiate; -+ -+ /* preferrably return a connected dentry */ -+ list_for_each(lp, &inode->i_dentry) { -+ tmp = list_entry(lp, struct dentry, d_alias); -+ if (tmp->d_flags & DCACHE_DISCONNECTED) { -+ assert(tmp->d_alias.next == &inode->i_dentry); -+ assert(tmp->d_alias.prev == &inode->i_dentry); -+ goal = tmp; -+ dget_locked(goal); -+ break; -+ } -+ } -+ -+ if (!goal) -+ goto do_instantiate; -+ -+ /* Move the goal to the de hash queue */ -+ goal->d_flags &= ~DCACHE_DISCONNECTED; -+ security_d_instantiate(goal, inode); -+ __d_drop(dentry); -+ spin_unlock(&dcache_lock); -+ d_rehash(dentry); -+ d_move(goal, dentry); -+ iput(inode); -+ -+ return goal; -+ -+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ -+do_instantiate: -+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ -+ dentry->d_inode = inode; -+do_rehash: -+ spin_unlock(&dcache_lock); -+ if (rehash) -+ d_rehash(dentry); -+ -+ return NULL; -+} -+ -+/* -+ * These are the special structures for the iopen pseudo directory. 
-+ */ -+ -+static struct inode_operations iopen_inode_operations = { -+ lookup: iopen_lookup, /* BKL held */ -+}; -+ -+static struct file_operations iopen_file_operations = { -+ read: generic_read_dir, -+}; -+ -+static int match_dentry(struct dentry *dentry, const char *name) -+{ -+ int len; -+ -+ len = strlen(name); -+ if (dentry->d_name.len != len) -+ return 0; -+ if (strncmp(dentry->d_name.name, name, len)) -+ return 0; -+ return 1; -+} -+ -+/* -+ * This function is spliced into ext3_lookup and returns 1 the file -+ * name is __iopen__ and dentry has been filled in appropriately. -+ */ -+int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) -+{ -+ struct inode *inode; -+ -+ if (dir->i_ino != EXT3_ROOT_INO || -+ !test_opt(dir->i_sb, IOPEN) || -+ !match_dentry(dentry, "__iopen__")) -+ return 0; -+ -+ inode = iget(dir->i_sb, EXT3_BAD_INO); -+ -+ if (!inode) -+ return 0; -+ d_add(dentry, inode); -+ return 1; -+} -+ -+/* -+ * This function is spliced into read_inode; it returns 1 if inode -+ * number is the one for /__iopen__, in which case the inode is filled -+ * in appropriately. Otherwise, this fuction returns 0. 
-+ */ -+int ext3_iopen_get_inode(struct inode *inode) -+{ -+ if (inode->i_ino != EXT3_BAD_INO) -+ return 0; -+ -+ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; -+ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) -+ inode->i_mode |= 0777; -+ inode->i_uid = 0; -+ inode->i_gid = 0; -+ inode->i_nlink = 1; -+ inode->i_size = 4096; -+ inode->i_atime = CURRENT_TIME; -+ inode->i_ctime = CURRENT_TIME; -+ inode->i_mtime = CURRENT_TIME; -+ EXT3_I(inode)->i_dtime = 0; -+ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size -+ * (for stat), not the fs block -+ * size */ -+ inode->i_blocks = 0; -+ inode->i_version = 1; -+ inode->i_generation = 0; -+ -+ inode->i_op = &iopen_inode_operations; -+ inode->i_fop = &iopen_file_operations; -+ inode->i_mapping->a_ops = 0; -+ -+ return 1; -+} -Index: linux-2.6.16.i686/fs/ext3/iopen.h -=================================================================== ---- linux-2.6.16.i686.orig/fs/ext3/iopen.h 2006-05-31 04:14:15.752410384 +0800 -+++ linux-2.6.16.i686/fs/ext3/iopen.h 2006-05-30 22:52:38.000000000 +0800 -@@ -0,0 +1,15 @@ -+/* -+ * iopen.h -+ * -+ * Special support for opening files by inode number. -+ * -+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). -+ * -+ * This file may be redistributed under the terms of the GNU General -+ * Public License. 
-+ */ -+ -+extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); -+extern int ext3_iopen_get_inode(struct inode *inode); -+extern struct dentry *iopen_connect_dentry(struct dentry *dentry, -+ struct inode *inode, int rehash); -Index: linux-2.6.16.i686/fs/ext3/inode.c -=================================================================== ---- linux-2.6.16.i686.orig/fs/ext3/inode.c 2006-05-30 22:52:03.000000000 +0800 -+++ linux-2.6.16.i686/fs/ext3/inode.c 2006-05-30 22:52:38.000000000 +0800 -@@ -37,6 +37,7 @@ - #include - #include - #include "xattr.h" -+#include "iopen.h" - #include "acl.h" - - static int ext3_writepage_trans_blocks(struct inode *inode); -@@ -2448,6 +2449,8 @@ - ei->i_default_acl = EXT3_ACL_NOT_CACHED; - #endif - ei->i_block_alloc_info = NULL; -+ if (ext3_iopen_get_inode(inode)) -+ return; - - if (__ext3_get_inode_loc(inode, &iloc, 0)) - goto bad_inode; -Index: linux-2.6.16.i686/fs/ext3/super.c -=================================================================== ---- linux-2.6.16.i686.orig/fs/ext3/super.c 2006-05-30 22:52:03.000000000 +0800 -+++ linux-2.6.16.i686/fs/ext3/super.c 2006-05-30 22:52:38.000000000 +0800 -@@ -634,6 +634,7 @@ - Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, -+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, - Opt_grpquota - }; - -@@ -682,6 +683,9 @@ - {Opt_noquota, "noquota"}, - {Opt_quota, "quota"}, - {Opt_usrquota, "usrquota"}, -+ {Opt_iopen, "iopen"}, -+ {Opt_noiopen, "noiopen"}, -+ {Opt_iopen_nopriv, "iopen_nopriv"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL}, - {Opt_resize, "resize"}, -@@ -996,6 +1000,18 @@ - else - clear_opt(sbi->s_mount_opt, BARRIER); - break; -+ case Opt_iopen: -+ set_opt (sbi->s_mount_opt, IOPEN); -+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ break; -+ case Opt_noiopen: -+ clear_opt (sbi->s_mount_opt, IOPEN); -+ clear_opt 
(sbi->s_mount_opt, IOPEN_NOPRIV); -+ break; -+ case Opt_iopen_nopriv: -+ set_opt (sbi->s_mount_opt, IOPEN); -+ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ break; - case Opt_ignore: - break; - case Opt_resize: -Index: linux-2.6.16.i686/fs/ext3/namei.c -=================================================================== ---- linux-2.6.16.i686.orig/fs/ext3/namei.c 2006-05-30 22:52:00.000000000 +0800 -+++ linux-2.6.16.i686/fs/ext3/namei.c 2006-05-30 22:55:19.000000000 +0800 -@@ -39,6 +39,7 @@ - - #include "namei.h" - #include "xattr.h" -+#include "iopen.h" - #include "acl.h" - - /* -@@ -995,6 +996,9 @@ - if (dentry->d_name.len > EXT3_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - -+ if (ext3_check_for_iopen(dir, dentry)) -+ return NULL; -+ - bh = ext3_find_entry(dentry, &de); - inode = NULL; - if (bh) { -@@ -1005,7 +1009,7 @@ - if (!inode) - return ERR_PTR(-EACCES); - } -- return d_splice_alias(inode, dentry); -+ return iopen_connect_dentry(dentry, inode, 1); - } - - -@@ -2046,10 +2050,6 @@ - inode->i_nlink); - inode->i_version++; - inode->i_nlink = 0; -- /* There's no need to set i_disksize: the fact that i_nlink is -- * zero will ensure that the right thing happens during any -- * recovery. 
*/ -- inode->i_size = 0; - ext3_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, inode); -@@ -2173,6 +2173,23 @@ - return err; - } - -+/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ -+static int ext3_add_link(handle_t *handle, struct dentry *dentry, -+ struct inode *inode) -+{ -+ int err = ext3_add_entry(handle, dentry, inode); -+ if (!err) { -+ err = ext3_mark_inode_dirty(handle, inode); -+ if (err == 0) { -+ dput(iopen_connect_dentry(dentry, inode, 0)); -+ return 0; -+ } -+ } -+ ext3_dec_count(handle, inode); -+ iput(inode); -+ return err; -+} -+ - static int ext3_link (struct dentry * old_dentry, - struct inode * dir, struct dentry *dentry) - { -@@ -2196,7 +2213,8 @@ - ext3_inc_count(handle, inode); - atomic_inc(&inode->i_count); - -- err = ext3_add_nondir(handle, dentry, inode); -+ err = ext3_add_link(handle, dentry, inode); -+ ext3_orphan_del(handle, inode); - ext3_journal_stop(handle); - if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) - goto retry; -Index: linux-2.6.16.i686/fs/ext3/Makefile -=================================================================== ---- linux-2.6.16.i686.orig/fs/ext3/Makefile 2006-03-20 13:53:29.000000000 +0800 -+++ linux-2.6.16.i686/fs/ext3/Makefile 2006-05-30 22:52:38.000000000 +0800 -@@ -4,7 +4,7 @@ - - obj-$(CONFIG_EXT3_FS) += ext3.o - --ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -+ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o -Index: linux-2.6.16.i686/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.16.i686.orig/include/linux/ext3_fs.h 2006-05-30 22:52:00.000000000 +0800 -+++ linux-2.6.16.i686/include/linux/ext3_fs.h 2006-05-30 22:52:38.000000000 +0800 -@@ -375,6 +375,8 @@ - 
#define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */ - #define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ - #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ -+#define EXT3_MOUNT_IOPEN 0x400000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H diff --git a/ldiskfs/kernel_patches/patches/iopen-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/iopen-2.6-rhel4.patch deleted file mode 100644 index 98dbca4..0000000 --- a/ldiskfs/kernel_patches/patches/iopen-2.6-rhel4.patch +++ /dev/null @@ -1,471 +0,0 @@ -Index: linux-stage/fs/ext3/Makefile -=================================================================== ---- linux-stage.orig/fs/ext3/Makefile 2005-02-25 14:31:53.151076368 +0200 -+++ linux-stage/fs/ext3/Makefile 2005-02-25 14:41:51.259150120 +0200 -@@ -4,7 +4,7 @@ - - obj-$(CONFIG_EXT3_FS) += ext3.o - --ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -+ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o -Index: linux-stage/fs/ext3/inode.c -=================================================================== ---- linux-stage.orig/fs/ext3/inode.c 2005-02-25 14:37:30.983718000 +0200 -+++ linux-stage/fs/ext3/inode.c 2005-02-25 14:47:42.069818792 +0200 -@@ -37,6 +37,7 @@ - #include - #include - #include "xattr.h" -+#include "iopen.h" - #include "acl.h" - - /* -@@ -2408,6 +2409,8 @@ - ei->i_default_acl = EXT3_ACL_NOT_CACHED; - #endif - ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; -+ if (ext3_iopen_get_inode(inode)) -+ return; - - if (ext3_get_inode_loc(inode, &iloc, 0)) - goto bad_inode; -Index: linux-stage/fs/ext3/iopen.c -=================================================================== ---- 
linux-stage.orig/fs/ext3/iopen.c 2005-02-25 14:41:01.017787968 +0200 -+++ linux-stage/fs/ext3/iopen.c 2005-02-25 14:41:01.045783712 +0200 -@@ -0,0 +1,278 @@ -+/* -+ * linux/fs/ext3/iopen.c -+ * -+ * Special support for open by inode number -+ * -+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). -+ * -+ * This file may be redistributed under the terms of the GNU General -+ * Public License. -+ * -+ * -+ * Invariants: -+ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias -+ * for an inode at one time. -+ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry -+ * aliases on an inode at the same time. -+ * -+ * If we have any connected dentry aliases for an inode, use one of those -+ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED -+ * dentry for this inode, which thereafter will be found by the dcache -+ * when looking up this inode number in __iopen__, so we don't return here -+ * until it is gone. -+ * -+ * If we get an inode via a regular name lookup, then we "rename" the -+ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures -+ * existing users of the disconnected dentry will continue to use the same -+ * dentry as the connected users, and there will never be both kinds of -+ * dentry aliases at one time. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "iopen.h" -+ -+#ifndef assert -+#define assert(test) J_ASSERT(test) -+#endif -+ -+#define IOPEN_NAME_LEN 32 -+ -+/* -+ * This implements looking up an inode by number. 
-+ */ -+static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, -+ struct nameidata *nd) -+{ -+ struct inode *inode; -+ unsigned long ino; -+ struct list_head *lp; -+ struct dentry *alternate; -+ char buf[IOPEN_NAME_LEN]; -+ -+ if (dentry->d_name.len >= IOPEN_NAME_LEN) -+ return ERR_PTR(-ENAMETOOLONG); -+ -+ memcpy(buf, dentry->d_name.name, dentry->d_name.len); -+ buf[dentry->d_name.len] = 0; -+ -+ if (strcmp(buf, ".") == 0) -+ ino = dir->i_ino; -+ else if (strcmp(buf, "..") == 0) -+ ino = EXT3_ROOT_INO; -+ else -+ ino = simple_strtoul(buf, 0, 0); -+ -+ if ((ino != EXT3_ROOT_INO && -+ //ino != EXT3_ACL_IDX_INO && -+ //ino != EXT3_ACL_DATA_INO && -+ ino < EXT3_FIRST_INO(dir->i_sb)) || -+ ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) -+ return ERR_PTR(-ENOENT); -+ -+ inode = iget(dir->i_sb, ino); -+ if (!inode) -+ return ERR_PTR(-EACCES); -+ if (is_bad_inode(inode)) { -+ iput(inode); -+ return ERR_PTR(-ENOENT); -+ } -+ -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(d_unhashed(dentry)); /* d_rehash */ -+ -+ /* preferrably return a connected dentry */ -+ spin_lock(&dcache_lock); -+ list_for_each(lp, &inode->i_dentry) { -+ alternate = list_entry(lp, struct dentry, d_alias); -+ assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); -+ } -+ -+ if (!list_empty(&inode->i_dentry)) { -+ alternate = list_entry(inode->i_dentry.next, -+ struct dentry, d_alias); -+ dget_locked(alternate); -+ spin_lock(&alternate->d_lock); -+ alternate->d_flags |= DCACHE_REFERENCED; -+ spin_unlock(&alternate->d_lock); -+ iput(inode); -+ spin_unlock(&dcache_lock); -+ return alternate; -+ } -+ dentry->d_flags |= DCACHE_DISCONNECTED; -+ -+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ -+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ -+ dentry->d_inode = inode; -+ -+ __d_rehash(dentry, 0); /* d_rehash */ -+ spin_unlock(&dcache_lock); -+ -+ return NULL; -+} -+ -+#define do_switch(x,y) do { \ -+ 
__typeof__ (x) __tmp = x; \ -+ x = y; y = __tmp; } while (0) -+ -+static inline void switch_names(struct dentry *dentry, struct dentry *target) -+{ -+ const unsigned char *old_name, *new_name; -+ -+ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN_MIN); -+ old_name = target->d_name.name; -+ new_name = dentry->d_name.name; -+ if (old_name == target->d_iname) -+ old_name = dentry->d_iname; -+ if (new_name == dentry->d_iname) -+ new_name = target->d_iname; -+ target->d_name.name = new_name; -+ dentry->d_name.name = old_name; -+} -+ -+/* This function is spliced into ext3_lookup and does the move of a -+ * disconnected dentry (if it exists) to a connected dentry. -+ */ -+struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, -+ int rehash) -+{ -+ struct dentry *tmp, *goal = NULL; -+ struct list_head *lp; -+ -+ /* verify this dentry is really new */ -+ assert(dentry->d_inode == NULL); -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ if (rehash) -+ assert(d_unhashed(dentry)); /* d_rehash */ -+ assert(list_empty(&dentry->d_subdirs)); -+ -+ spin_lock(&dcache_lock); -+ if (!inode) -+ goto do_rehash; -+ -+ if (!test_opt(inode->i_sb, IOPEN)) -+ goto do_instantiate; -+ -+ /* preferrably return a connected dentry */ -+ list_for_each(lp, &inode->i_dentry) { -+ tmp = list_entry(lp, struct dentry, d_alias); -+ if (tmp->d_flags & DCACHE_DISCONNECTED) { -+ assert(tmp->d_alias.next == &inode->i_dentry); -+ assert(tmp->d_alias.prev == &inode->i_dentry); -+ goal = tmp; -+ dget_locked(goal); -+ break; -+ } -+ } -+ -+ if (!goal) -+ goto do_instantiate; -+ -+ /* Move the goal to the de hash queue */ -+ goal->d_flags &= ~DCACHE_DISCONNECTED; -+ security_d_instantiate(goal, inode); -+ __d_drop(dentry); -+ __d_rehash(dentry, 0); -+ __d_move(goal, dentry); -+ spin_unlock(&dcache_lock); -+ iput(inode); -+ -+ return goal; -+ -+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ -+do_instantiate: -+ list_add(&dentry->d_alias, 
&inode->i_dentry); /* d_instantiate */ -+ dentry->d_inode = inode; -+do_rehash: -+ if (rehash) -+ __d_rehash(dentry, 0); /* d_rehash */ -+ spin_unlock(&dcache_lock); -+ -+ return NULL; -+} -+ -+/* -+ * These are the special structures for the iopen pseudo directory. -+ */ -+ -+static struct inode_operations iopen_inode_operations = { -+ lookup: iopen_lookup, /* BKL held */ -+}; -+ -+static struct file_operations iopen_file_operations = { -+ read: generic_read_dir, -+}; -+ -+static int match_dentry(struct dentry *dentry, const char *name) -+{ -+ int len; -+ -+ len = strlen(name); -+ if (dentry->d_name.len != len) -+ return 0; -+ if (strncmp(dentry->d_name.name, name, len)) -+ return 0; -+ return 1; -+} -+ -+/* -+ * This function is spliced into ext3_lookup and returns 1 the file -+ * name is __iopen__ and dentry has been filled in appropriately. -+ */ -+int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) -+{ -+ struct inode *inode; -+ -+ if (dir->i_ino != EXT3_ROOT_INO || -+ !test_opt(dir->i_sb, IOPEN) || -+ !match_dentry(dentry, "__iopen__")) -+ return 0; -+ -+ inode = iget(dir->i_sb, EXT3_BAD_INO); -+ -+ if (!inode) -+ return 0; -+ d_add(dentry, inode); -+ return 1; -+} -+ -+/* -+ * This function is spliced into read_inode; it returns 1 if inode -+ * number is the one for /__iopen__, in which case the inode is filled -+ * in appropriately. Otherwise, this fuction returns 0. 
-+ */ -+int ext3_iopen_get_inode(struct inode *inode) -+{ -+ if (inode->i_ino != EXT3_BAD_INO) -+ return 0; -+ -+ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; -+ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) -+ inode->i_mode |= 0777; -+ inode->i_uid = 0; -+ inode->i_gid = 0; -+ inode->i_nlink = 1; -+ inode->i_size = 4096; -+ inode->i_atime = CURRENT_TIME; -+ inode->i_ctime = CURRENT_TIME; -+ inode->i_mtime = CURRENT_TIME; -+ EXT3_I(inode)->i_dtime = 0; -+ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size -+ * (for stat), not the fs block -+ * size */ -+ inode->i_blocks = 0; -+ inode->i_version = 1; -+ inode->i_generation = 0; -+ -+ inode->i_op = &iopen_inode_operations; -+ inode->i_fop = &iopen_file_operations; -+ inode->i_mapping->a_ops = 0; -+ -+ return 1; -+} -Index: linux-stage/fs/ext3/iopen.h -=================================================================== ---- linux-stage.orig/fs/ext3/iopen.h 2005-02-25 14:41:01.017787968 +0200 -+++ linux-stage/fs/ext3/iopen.h 2005-02-25 14:41:01.045783712 +0200 -@@ -0,0 +1,15 @@ -+/* -+ * iopen.h -+ * -+ * Special support for opening files by inode number. -+ * -+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). -+ * -+ * This file may be redistributed under the terms of the GNU General -+ * Public License. 
-+ */ -+ -+extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); -+extern int ext3_iopen_get_inode(struct inode *inode); -+extern struct dentry *iopen_connect_dentry(struct dentry *dentry, -+ struct inode *inode, int rehash); -Index: linux-stage/fs/ext3/namei.c -=================================================================== ---- linux-stage.orig/fs/ext3/namei.c 2005-02-25 14:37:28.975023368 +0200 -+++ linux-stage/fs/ext3/namei.c 2005-02-25 14:46:43.090784968 +0200 -@@ -37,6 +37,7 @@ - #include - #include - #include "xattr.h" -+#include "iopen.h" - #include "acl.h" - - /* -@@ -980,6 +981,9 @@ - if (dentry->d_name.len > EXT3_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - -+ if (ext3_check_for_iopen(dir, dentry)) -+ return NULL; -+ - bh = ext3_find_entry(dentry, &de); - inode = NULL; - if (bh) { -@@ -990,10 +994,8 @@ - if (!inode) - return ERR_PTR(-EACCES); - } -- if (inode) -- return d_splice_alias(inode, dentry); -- d_add(dentry, inode); -- return NULL; -+ -+ return iopen_connect_dentry(dentry, inode, 1); - } - - -@@ -2037,10 +2039,6 @@ - inode->i_nlink); - inode->i_version++; - inode->i_nlink = 0; -- /* There's no need to set i_disksize: the fact that i_nlink is -- * zero will ensure that the right thing happens during any -- * recovery. 
*/ -- inode->i_size = 0; - ext3_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; - ext3_mark_inode_dirty(handle, inode); -@@ -2163,6 +2161,23 @@ - return err; - } - -+/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ -+static int ext3_add_link(handle_t *handle, struct dentry *dentry, -+ struct inode *inode) -+{ -+ int err = ext3_add_entry(handle, dentry, inode); -+ if (!err) { -+ err = ext3_mark_inode_dirty(handle, inode); -+ if (err == 0) { -+ dput(iopen_connect_dentry(dentry, inode, 0)); -+ return 0; -+ } -+ } -+ ext3_dec_count(handle, inode); -+ iput(inode); -+ return err; -+} -+ - static int ext3_link (struct dentry * old_dentry, - struct inode * dir, struct dentry *dentry) - { -@@ -2186,7 +2201,8 @@ - ext3_inc_count(handle, inode); - atomic_inc(&inode->i_count); - -- err = ext3_add_nondir(handle, dentry, inode); -+ err = ext3_add_link(handle, dentry, inode); -+ ext3_orphan_del(handle, inode); - ext3_journal_stop(handle); - if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) - goto retry; -Index: linux-stage/fs/ext3/super.c -=================================================================== ---- linux-stage.orig/fs/ext3/super.c 2005-02-25 14:37:30.987717392 +0200 -+++ linux-stage/fs/ext3/super.c 2005-02-25 14:44:50.495901992 +0200 -@@ -586,6 +586,7 @@ - Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, -+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, - }; - - static match_table_t tokens = { -@@ -633,6 +634,9 @@ - {Opt_ignore, "noquota"}, - {Opt_ignore, "quota"}, - {Opt_ignore, "usrquota"}, -+ {Opt_iopen, "iopen"}, -+ {Opt_noiopen, "noiopen"}, -+ {Opt_iopen_nopriv, "iopen_nopriv"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL}, - {Opt_resize, "resize"}, -@@ -914,6 +918,18 @@ - else - clear_opt(sbi->s_mount_opt, BARRIER); - break; -+ case Opt_iopen: -+ set_opt (sbi->s_mount_opt, IOPEN); -+ 
clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ break; -+ case Opt_noiopen: -+ clear_opt (sbi->s_mount_opt, IOPEN); -+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ break; -+ case Opt_iopen_nopriv: -+ set_opt (sbi->s_mount_opt, IOPEN); -+ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ break; - case Opt_ignore: - break; - case Opt_resize: -Index: linux-stage/include/linux/ext3_fs.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 14:37:28.977023064 +0200 -+++ linux-stage/include/linux/ext3_fs.h 2005-02-25 14:49:00.569884968 +0200 -@@ -355,6 +355,8 @@ - #define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ - #define EXT3_MOUNT_BARRIER 0x10000 /* Use block barriers */ - #define EXT3_MOUNT_RESERVATION 0x20000 /* Preallocation */ -+#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H diff --git a/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch b/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch deleted file mode 100644 index 1c5e900..0000000 --- a/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch +++ /dev/null @@ -1,472 +0,0 @@ -Index: linux-stage/fs/ext3/Makefile -=================================================================== ---- linux-stage.orig/fs/ext3/Makefile 2005-02-25 14:31:53.151076368 +0200 -+++ linux-stage/fs/ext3/Makefile 2005-02-25 14:41:51.259150120 +0200 -@@ -4,7 +4,7 @@ - - obj-$(CONFIG_EXT3_FS) += ext3.o - --ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -+ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o -Index: linux-stage/fs/ext3/inode.c 
-=================================================================== ---- linux-stage.orig/fs/ext3/inode.c 2005-02-25 14:37:30.983718000 +0200 -+++ linux-stage/fs/ext3/inode.c 2005-02-25 14:47:42.069818792 +0200 -@@ -37,6 +37,7 @@ - #include - #include - #include "xattr.h" -+#include "iopen.h" - #include "acl.h" - - /* -@@ -2408,6 +2409,9 @@ - #endif - ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - -+ if (ext3_iopen_get_inode(inode)) -+ return; -+ - if (ext3_get_inode_loc(inode, &iloc, 0)) - goto bad_inode; - bh = iloc.bh; -Index: linux-stage/fs/ext3/iopen.c -=================================================================== ---- linux-2.6.5-sles9.orig/fs/ext3/iopen.c 2003-01-30 13:24:37.000000000 +0300 -+++ linux-2.6.5-sles9/fs/ext3/iopen.c 2004-11-09 02:18:27.611913312 +0300 -@@ -0,0 +1,278 @@ -+/* -+ * linux/fs/ext3/iopen.c -+ * -+ * Special support for open by inode number -+ * -+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). -+ * -+ * This file may be redistributed under the terms of the GNU General -+ * Public License. -+ * -+ * -+ * Invariants: -+ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias -+ * for an inode at one time. -+ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry -+ * aliases on an inode at the same time. -+ * -+ * If we have any connected dentry aliases for an inode, use one of those -+ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED -+ * dentry for this inode, which thereafter will be found by the dcache -+ * when looking up this inode number in __iopen__, so we don't return here -+ * until it is gone. -+ * -+ * If we get an inode via a regular name lookup, then we "rename" the -+ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures -+ * existing users of the disconnected dentry will continue to use the same -+ * dentry as the connected users, and there will never be both kinds of -+ * dentry aliases at one time. 
-+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "iopen.h" -+ -+#ifndef assert -+#define assert(test) J_ASSERT(test) -+#endif -+ -+#define IOPEN_NAME_LEN 32 -+ -+/* -+ * This implements looking up an inode by number. -+ */ -+static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, -+ struct nameidata *nd) -+{ -+ struct inode *inode; -+ unsigned long ino; -+ struct list_head *lp; -+ struct dentry *alternate; -+ char buf[IOPEN_NAME_LEN]; -+ -+ if (dentry->d_name.len >= IOPEN_NAME_LEN) -+ return ERR_PTR(-ENAMETOOLONG); -+ -+ memcpy(buf, dentry->d_name.name, dentry->d_name.len); -+ buf[dentry->d_name.len] = 0; -+ -+ if (strcmp(buf, ".") == 0) -+ ino = dir->i_ino; -+ else if (strcmp(buf, "..") == 0) -+ ino = EXT3_ROOT_INO; -+ else -+ ino = simple_strtoul(buf, 0, 0); -+ -+ if ((ino != EXT3_ROOT_INO && -+ //ino != EXT3_ACL_IDX_INO && -+ //ino != EXT3_ACL_DATA_INO && -+ ino < EXT3_FIRST_INO(dir->i_sb)) || -+ ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) -+ return ERR_PTR(-ENOENT); -+ -+ inode = iget(dir->i_sb, ino); -+ if (!inode) -+ return ERR_PTR(-EACCES); -+ if (is_bad_inode(inode)) { -+ iput(inode); -+ return ERR_PTR(-ENOENT); -+ } -+ -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(d_unhashed(dentry)); /* d_rehash */ -+ -+ /* preferrably return a connected dentry */ -+ spin_lock(&dcache_lock); -+ list_for_each(lp, &inode->i_dentry) { -+ alternate = list_entry(lp, struct dentry, d_alias); -+ assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); -+ } -+ -+ if (!list_empty(&inode->i_dentry)) { -+ alternate = list_entry(inode->i_dentry.next, -+ struct dentry, d_alias); -+ dget_locked(alternate); -+ spin_lock(&alternate->d_lock); -+ alternate->d_vfs_flags |= DCACHE_REFERENCED; -+ spin_unlock(&alternate->d_lock); -+ iput(inode); -+ spin_unlock(&dcache_lock); -+ return alternate; -+ } -+ dentry->d_flags |= DCACHE_DISCONNECTED; -+ -+ /* d_add(), but don't drop 
dcache_lock before adding dentry to inode */ -+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ -+ dentry->d_inode = inode; -+ -+ __d_rehash(dentry, 0); /* d_rehash */ -+ spin_unlock(&dcache_lock); -+ -+ return NULL; -+} -+ -+#define do_switch(x,y) do { \ -+ __typeof__ (x) __tmp = x; \ -+ x = y; y = __tmp; } while (0) -+ -+static inline void switch_names(struct dentry *dentry, struct dentry *target) -+{ -+ const unsigned char *old_name, *new_name; -+ -+ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); -+ old_name = target->d_name.name; -+ new_name = dentry->d_name.name; -+ if (old_name == target->d_iname) -+ old_name = dentry->d_iname; -+ if (new_name == dentry->d_iname) -+ new_name = target->d_iname; -+ target->d_name.name = new_name; -+ dentry->d_name.name = old_name; -+} -+ -+/* This function is spliced into ext3_lookup and does the move of a -+ * disconnected dentry (if it exists) to a connected dentry. -+ */ -+struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, -+ int rehash) -+{ -+ struct dentry *tmp, *goal = NULL; -+ struct list_head *lp; -+ -+ /* verify this dentry is really new */ -+ assert(dentry->d_inode == NULL); -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ if (rehash) -+ assert(d_unhashed(dentry)); /* d_rehash */ -+ assert(list_empty(&dentry->d_subdirs)); -+ -+ spin_lock(&dcache_lock); -+ if (!inode) -+ goto do_rehash; -+ -+ if (!test_opt(inode->i_sb, IOPEN)) -+ goto do_instantiate; -+ -+ /* preferrably return a connected dentry */ -+ list_for_each(lp, &inode->i_dentry) { -+ tmp = list_entry(lp, struct dentry, d_alias); -+ if (tmp->d_flags & DCACHE_DISCONNECTED) { -+ assert(tmp->d_alias.next == &inode->i_dentry); -+ assert(tmp->d_alias.prev == &inode->i_dentry); -+ goal = tmp; -+ dget_locked(goal); -+ break; -+ } -+ } -+ -+ if (!goal) -+ goto do_instantiate; -+ -+ /* Move the goal to the de hash queue */ -+ goal->d_flags &= ~DCACHE_DISCONNECTED; -+ security_d_instantiate(goal, 
inode); -+ __d_drop(dentry); -+ __d_rehash(dentry, 0); -+ __d_move(goal, dentry); -+ spin_unlock(&dcache_lock); -+ iput(inode); -+ -+ return goal; -+ -+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ -+do_instantiate: -+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ -+ dentry->d_inode = inode; -+do_rehash: -+ if (rehash) -+ __d_rehash(dentry, 0); /* d_rehash */ -+ spin_unlock(&dcache_lock); -+ -+ return NULL; -+} -+ -+/* -+ * These are the special structures for the iopen pseudo directory. -+ */ -+ -+static struct inode_operations iopen_inode_operations = { -+ lookup: iopen_lookup, /* BKL held */ -+}; -+ -+static struct file_operations iopen_file_operations = { -+ read: generic_read_dir, -+}; -+ -+static int match_dentry(struct dentry *dentry, const char *name) -+{ -+ int len; -+ -+ len = strlen(name); -+ if (dentry->d_name.len != len) -+ return 0; -+ if (strncmp(dentry->d_name.name, name, len)) -+ return 0; -+ return 1; -+} -+ -+/* -+ * This function is spliced into ext3_lookup and returns 1 the file -+ * name is __iopen__ and dentry has been filled in appropriately. -+ */ -+int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) -+{ -+ struct inode *inode; -+ -+ if (dir->i_ino != EXT3_ROOT_INO || -+ !test_opt(dir->i_sb, IOPEN) || -+ !match_dentry(dentry, "__iopen__")) -+ return 0; -+ -+ inode = iget(dir->i_sb, EXT3_BAD_INO); -+ -+ if (!inode) -+ return 0; -+ d_add(dentry, inode); -+ return 1; -+} -+ -+/* -+ * This function is spliced into read_inode; it returns 1 if inode -+ * number is the one for /__iopen__, in which case the inode is filled -+ * in appropriately. Otherwise, this fuction returns 0. 
-+ */ -+int ext3_iopen_get_inode(struct inode *inode) -+{ -+ if (inode->i_ino != EXT3_BAD_INO) -+ return 0; -+ -+ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; -+ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) -+ inode->i_mode |= 0777; -+ inode->i_uid = 0; -+ inode->i_gid = 0; -+ inode->i_nlink = 1; -+ inode->i_size = 4096; -+ inode->i_atime = CURRENT_TIME; -+ inode->i_ctime = CURRENT_TIME; -+ inode->i_mtime = CURRENT_TIME; -+ EXT3_I(inode)->i_dtime = 0; -+ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size -+ * (for stat), not the fs block -+ * size */ -+ inode->i_blocks = 0; -+ inode->i_version = 1; -+ inode->i_generation = 0; -+ -+ inode->i_op = &iopen_inode_operations; -+ inode->i_fop = &iopen_file_operations; -+ inode->i_mapping->a_ops = 0; -+ -+ return 1; -+} -Index: linux-stage/fs/ext3/iopen.h -=================================================================== ---- linux-stage.orig/fs/ext3/iopen.h 2005-02-25 14:41:01.017787968 +0200 -+++ linux-stage/fs/ext3/iopen.h 2005-02-25 14:41:01.045783712 +0200 -@@ -0,0 +1,15 @@ -+/* -+ * iopen.h -+ * -+ * Special support for opening files by inode number. -+ * -+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). -+ * -+ * This file may be redistributed under the terms of the GNU General -+ * Public License. 
-+ */ -+ -+extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); -+extern int ext3_iopen_get_inode(struct inode *inode); -+extern struct dentry *iopen_connect_dentry(struct dentry *dentry, -+ struct inode *inode, int rehash); -Index: linux-stage/fs/ext3/namei.c -=================================================================== ---- linux-stage.orig/fs/ext3/namei.c 2005-02-25 14:37:28.975023368 +0200 -+++ linux-stage/fs/ext3/namei.c 2005-02-25 14:46:43.090784968 +0200 -@@ -37,6 +37,7 @@ - #include - #include - #include "xattr.h" -+#include "iopen.h" - #include "acl.h" - - /* -@@ -980,6 +981,9 @@ - if (dentry->d_name.len > EXT3_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - -+ if (ext3_check_for_iopen(dir, dentry)) -+ return NULL; -+ - bh = ext3_find_entry(dentry, &de); - inode = NULL; - if (bh) { -@@ -990,10 +994,8 @@ - if (!inode) - return ERR_PTR(-EACCES); - } -- if (inode) -- return d_splice_alias(inode, dentry); -- d_add(dentry, inode); -- return NULL; -+ -+ return iopen_connect_dentry(dentry, inode, 1); - } - - -@@ -2037,10 +2039,6 @@ - inode->i_nlink); - inode->i_version++; - inode->i_nlink = 0; -- /* There's no need to set i_disksize: the fact that i_nlink is -- * zero will ensure that the right thing happens during any -- * recovery. 
*/ -- inode->i_size = 0; - ext3_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; - ext3_mark_inode_dirty(handle, inode); -@@ -2163,6 +2161,23 @@ - return err; - } - -+/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ -+static int ext3_add_link(handle_t *handle, struct dentry *dentry, -+ struct inode *inode) -+{ -+ int err = ext3_add_entry(handle, dentry, inode); -+ if (!err) { -+ err = ext3_mark_inode_dirty(handle, inode); -+ if (err == 0) { -+ dput(iopen_connect_dentry(dentry, inode, 0)); -+ return 0; -+ } -+ } -+ ext3_dec_count(handle, inode); -+ iput(inode); -+ return err; -+} -+ - static int ext3_link (struct dentry * old_dentry, - struct inode * dir, struct dentry *dentry) - { -@@ -2186,7 +2201,8 @@ - ext3_inc_count(handle, inode); - atomic_inc(&inode->i_count); - -- err = ext3_add_nondir(handle, dentry, inode); -+ err = ext3_add_link(handle, dentry, inode); -+ ext3_orphan_del(handle, inode); - ext3_journal_stop(handle); - if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) - goto retry; -Index: linux-stage/fs/ext3/super.c -=================================================================== ---- linux-stage.orig/fs/ext3/super.c 2005-02-25 14:37:30.987717392 +0200 -+++ linux-stage/fs/ext3/super.c 2005-02-25 14:44:50.495901992 +0200 -@@ -586,6 +586,7 @@ - Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_ignore, Opt_barrier, - Opt_err, -+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, - }; - - static match_table_t tokens = { -@@ -633,6 +634,9 @@ - {Opt_ignore, "noquota"}, - {Opt_ignore, "quota"}, - {Opt_ignore, "usrquota"}, -+ {Opt_iopen, "iopen"}, -+ {Opt_noiopen, "noiopen"}, -+ {Opt_iopen_nopriv, "iopen_nopriv"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL} - }; -@@ -914,6 +918,18 @@ - else - clear_opt(sbi->s_mount_opt, BARRIER); - break; -+ case Opt_iopen: -+ set_opt (sbi->s_mount_opt, IOPEN); -+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ break; -+ case 
Opt_noiopen: -+ clear_opt (sbi->s_mount_opt, IOPEN); -+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ break; -+ case Opt_iopen_nopriv: -+ set_opt (sbi->s_mount_opt, IOPEN); -+ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ break; - case Opt_ignore: - break; - default: -Index: linux-stage/include/linux/ext3_fs.h -=================================================================== ---- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 14:37:28.977023064 +0200 -+++ linux-stage/include/linux/ext3_fs.h 2005-02-25 14:49:00.569884968 +0200 -@@ -355,6 +355,8 @@ - #define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ - #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ - #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ -+#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H diff --git a/ldiskfs/kernel_patches/patches/iopen-2.6.12.patch b/ldiskfs/kernel_patches/patches/iopen-2.6.12.patch deleted file mode 100644 index 8d456ac..0000000 --- a/ldiskfs/kernel_patches/patches/iopen-2.6.12.patch +++ /dev/null @@ -1,471 +0,0 @@ -Index: linux-2.6.12-rc6/fs/ext3/Makefile -=================================================================== ---- linux-2.6.12-rc6.orig/fs/ext3/Makefile 2005-06-14 16:00:45.206720992 +0200 -+++ linux-2.6.12-rc6/fs/ext3/Makefile 2005-06-14 16:14:33.595382720 +0200 -@@ -4,7 +4,7 @@ - - obj-$(CONFIG_EXT3_FS) += ext3.o - --ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -+ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o -Index: linux-2.6.12-rc6/fs/ext3/inode.c -=================================================================== ---- 
linux-2.6.12-rc6.orig/fs/ext3/inode.c 2005-06-14 16:01:16.272150299 +0200 -+++ linux-2.6.12-rc6/fs/ext3/inode.c 2005-06-14 16:24:55.686195412 +0200 -@@ -37,6 +37,7 @@ - #include - #include - #include "xattr.h" -+#include "iopen.h" - #include "acl.h" - - static int ext3_writepage_trans_blocks(struct inode *inode); -@@ -2437,6 +2438,8 @@ - ei->i_default_acl = EXT3_ACL_NOT_CACHED; - #endif - ei->i_block_alloc_info = NULL; -+ if (ext3_iopen_get_inode(inode)) -+ return; - - if (__ext3_get_inode_loc(inode, &iloc, 0)) - goto bad_inode; -Index: linux-2.6.12-rc6/fs/ext3/iopen.c -=================================================================== ---- linux-2.6.12-rc6.orig/fs/ext3/iopen.c 2005-06-14 16:14:33.530929595 +0200 -+++ linux-2.6.12-rc6/fs/ext3/iopen.c 2005-06-14 16:14:33.626632719 +0200 -@@ -0,0 +1,278 @@ -+/* -+ * linux/fs/ext3/iopen.c -+ * -+ * Special support for open by inode number -+ * -+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). -+ * -+ * This file may be redistributed under the terms of the GNU General -+ * Public License. -+ * -+ * -+ * Invariants: -+ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias -+ * for an inode at one time. -+ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry -+ * aliases on an inode at the same time. -+ * -+ * If we have any connected dentry aliases for an inode, use one of those -+ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED -+ * dentry for this inode, which thereafter will be found by the dcache -+ * when looking up this inode number in __iopen__, so we don't return here -+ * until it is gone. -+ * -+ * If we get an inode via a regular name lookup, then we "rename" the -+ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures -+ * existing users of the disconnected dentry will continue to use the same -+ * dentry as the connected users, and there will never be both kinds of -+ * dentry aliases at one time. 
-+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "iopen.h" -+ -+#ifndef assert -+#define assert(test) J_ASSERT(test) -+#endif -+ -+#define IOPEN_NAME_LEN 32 -+ -+/* -+ * This implements looking up an inode by number. -+ */ -+static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, -+ struct nameidata *nd) -+{ -+ struct inode *inode; -+ unsigned long ino; -+ struct list_head *lp; -+ struct dentry *alternate; -+ char buf[IOPEN_NAME_LEN]; -+ -+ if (dentry->d_name.len >= IOPEN_NAME_LEN) -+ return ERR_PTR(-ENAMETOOLONG); -+ -+ memcpy(buf, dentry->d_name.name, dentry->d_name.len); -+ buf[dentry->d_name.len] = 0; -+ -+ if (strcmp(buf, ".") == 0) -+ ino = dir->i_ino; -+ else if (strcmp(buf, "..") == 0) -+ ino = EXT3_ROOT_INO; -+ else -+ ino = simple_strtoul(buf, 0, 0); -+ -+ if ((ino != EXT3_ROOT_INO && -+ //ino != EXT3_ACL_IDX_INO && -+ //ino != EXT3_ACL_DATA_INO && -+ ino < EXT3_FIRST_INO(dir->i_sb)) || -+ ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) -+ return ERR_PTR(-ENOENT); -+ -+ inode = iget(dir->i_sb, ino); -+ if (!inode) -+ return ERR_PTR(-EACCES); -+ if (is_bad_inode(inode)) { -+ iput(inode); -+ return ERR_PTR(-ENOENT); -+ } -+ -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(d_unhashed(dentry)); /* d_rehash */ -+ -+ /* preferrably return a connected dentry */ -+ spin_lock(&dcache_lock); -+ list_for_each(lp, &inode->i_dentry) { -+ alternate = list_entry(lp, struct dentry, d_alias); -+ assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); -+ } -+ -+ if (!list_empty(&inode->i_dentry)) { -+ alternate = list_entry(inode->i_dentry.next, -+ struct dentry, d_alias); -+ dget_locked(alternate); -+ spin_lock(&alternate->d_lock); -+ alternate->d_flags |= DCACHE_REFERENCED; -+ spin_unlock(&alternate->d_lock); -+ iput(inode); -+ spin_unlock(&dcache_lock); -+ return alternate; -+ } -+ dentry->d_flags |= DCACHE_DISCONNECTED; -+ -+ /* d_add(), but don't drop 
dcache_lock before adding dentry to inode */ -+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ -+ dentry->d_inode = inode; -+ -+ d_rehash_cond(dentry, 0); /* d_rehash */ -+ spin_unlock(&dcache_lock); -+ -+ return NULL; -+} -+ -+#define do_switch(x,y) do { \ -+ __typeof__ (x) __tmp = x; \ -+ x = y; y = __tmp; } while (0) -+ -+static inline void switch_names(struct dentry *dentry, struct dentry *target) -+{ -+ const unsigned char *old_name, *new_name; -+ -+ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN_MIN); -+ old_name = target->d_name.name; -+ new_name = dentry->d_name.name; -+ if (old_name == target->d_iname) -+ old_name = dentry->d_iname; -+ if (new_name == dentry->d_iname) -+ new_name = target->d_iname; -+ target->d_name.name = new_name; -+ dentry->d_name.name = old_name; -+} -+ -+/* This function is spliced into ext3_lookup and does the move of a -+ * disconnected dentry (if it exists) to a connected dentry. -+ */ -+struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, -+ int rehash) -+{ -+ struct dentry *tmp, *goal = NULL; -+ struct list_head *lp; -+ -+ /* verify this dentry is really new */ -+ assert(dentry->d_inode == NULL); -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ if (rehash) -+ assert(d_unhashed(dentry)); /* d_rehash */ -+ assert(list_empty(&dentry->d_subdirs)); -+ -+ spin_lock(&dcache_lock); -+ if (!inode) -+ goto do_rehash; -+ -+ if (!test_opt(inode->i_sb, IOPEN)) -+ goto do_instantiate; -+ -+ /* preferrably return a connected dentry */ -+ list_for_each(lp, &inode->i_dentry) { -+ tmp = list_entry(lp, struct dentry, d_alias); -+ if (tmp->d_flags & DCACHE_DISCONNECTED) { -+ assert(tmp->d_alias.next == &inode->i_dentry); -+ assert(tmp->d_alias.prev == &inode->i_dentry); -+ goal = tmp; -+ dget_locked(goal); -+ break; -+ } -+ } -+ -+ if (!goal) -+ goto do_instantiate; -+ -+ /* Move the goal to the de hash queue */ -+ goal->d_flags &= ~DCACHE_DISCONNECTED; -+ 
security_d_instantiate(goal, inode); -+ __d_drop(dentry); -+ d_rehash_cond(dentry, 0); -+ __d_move(goal, dentry); -+ spin_unlock(&dcache_lock); -+ iput(inode); -+ -+ return goal; -+ -+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ -+do_instantiate: -+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ -+ dentry->d_inode = inode; -+do_rehash: -+ if (rehash) -+ d_rehash_cond(dentry, 0); /* d_rehash */ -+ spin_unlock(&dcache_lock); -+ -+ return NULL; -+} -+ -+/* -+ * These are the special structures for the iopen pseudo directory. -+ */ -+ -+static struct inode_operations iopen_inode_operations = { -+ lookup: iopen_lookup, /* BKL held */ -+}; -+ -+static struct file_operations iopen_file_operations = { -+ read: generic_read_dir, -+}; -+ -+static int match_dentry(struct dentry *dentry, const char *name) -+{ -+ int len; -+ -+ len = strlen(name); -+ if (dentry->d_name.len != len) -+ return 0; -+ if (strncmp(dentry->d_name.name, name, len)) -+ return 0; -+ return 1; -+} -+ -+/* -+ * This function is spliced into ext3_lookup and returns 1 the file -+ * name is __iopen__ and dentry has been filled in appropriately. -+ */ -+int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) -+{ -+ struct inode *inode; -+ -+ if (dir->i_ino != EXT3_ROOT_INO || -+ !test_opt(dir->i_sb, IOPEN) || -+ !match_dentry(dentry, "__iopen__")) -+ return 0; -+ -+ inode = iget(dir->i_sb, EXT3_BAD_INO); -+ -+ if (!inode) -+ return 0; -+ d_add(dentry, inode); -+ return 1; -+} -+ -+/* -+ * This function is spliced into read_inode; it returns 1 if inode -+ * number is the one for /__iopen__, in which case the inode is filled -+ * in appropriately. Otherwise, this fuction returns 0. 
-+ */ -+int ext3_iopen_get_inode(struct inode *inode) -+{ -+ if (inode->i_ino != EXT3_BAD_INO) -+ return 0; -+ -+ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; -+ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) -+ inode->i_mode |= 0777; -+ inode->i_uid = 0; -+ inode->i_gid = 0; -+ inode->i_nlink = 1; -+ inode->i_size = 4096; -+ inode->i_atime = CURRENT_TIME; -+ inode->i_ctime = CURRENT_TIME; -+ inode->i_mtime = CURRENT_TIME; -+ EXT3_I(inode)->i_dtime = 0; -+ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size -+ * (for stat), not the fs block -+ * size */ -+ inode->i_blocks = 0; -+ inode->i_version = 1; -+ inode->i_generation = 0; -+ -+ inode->i_op = &iopen_inode_operations; -+ inode->i_fop = &iopen_file_operations; -+ inode->i_mapping->a_ops = 0; -+ -+ return 1; -+} -Index: linux-2.6.12-rc6/fs/ext3/iopen.h -=================================================================== ---- linux-2.6.12-rc6.orig/fs/ext3/iopen.h 2005-06-14 16:14:33.534835845 +0200 -+++ linux-2.6.12-rc6/fs/ext3/iopen.h 2005-06-14 16:14:33.633468657 +0200 -@@ -0,0 +1,15 @@ -+/* -+ * iopen.h -+ * -+ * Special support for opening files by inode number. -+ * -+ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). -+ * -+ * This file may be redistributed under the terms of the GNU General -+ * Public License. 
-+ */ -+ -+extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); -+extern int ext3_iopen_get_inode(struct inode *inode); -+extern struct dentry *iopen_connect_dentry(struct dentry *dentry, -+ struct inode *inode, int rehash); -Index: linux-2.6.12-rc6/fs/ext3/namei.c -=================================================================== ---- linux-2.6.12-rc6.orig/fs/ext3/namei.c 2005-06-14 16:01:14.701837819 +0200 -+++ linux-2.6.12-rc6/fs/ext3/namei.c 2005-06-14 16:14:33.644210844 +0200 -@@ -37,6 +37,7 @@ - #include - #include - #include "xattr.h" -+#include "iopen.h" - #include "acl.h" - - /* -@@ -985,6 +986,9 @@ - if (dentry->d_name.len > EXT3_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - -+ if (ext3_check_for_iopen(dir, dentry)) -+ return NULL; -+ - bh = ext3_find_entry(dentry, &de); - inode = NULL; - if (bh) { -@@ -995,10 +999,8 @@ - if (!inode) - return ERR_PTR(-EACCES); - } -- if (inode) -- return d_splice_alias(inode, dentry); -- d_add(dentry, inode); -- return NULL; -+ -+ return iopen_connect_dentry(dentry, inode, 1); - } - - -@@ -2042,10 +2044,6 @@ - inode->i_nlink); - inode->i_version++; - inode->i_nlink = 0; -- /* There's no need to set i_disksize: the fact that i_nlink is -- * zero will ensure that the right thing happens during any -- * recovery. 
*/ -- inode->i_size = 0; - ext3_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, inode); -@@ -2168,6 +2166,23 @@ - return err; - } - -+/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ -+static int ext3_add_link(handle_t *handle, struct dentry *dentry, -+ struct inode *inode) -+{ -+ int err = ext3_add_entry(handle, dentry, inode); -+ if (!err) { -+ err = ext3_mark_inode_dirty(handle, inode); -+ if (err == 0) { -+ dput(iopen_connect_dentry(dentry, inode, 0)); -+ return 0; -+ } -+ } -+ ext3_dec_count(handle, inode); -+ iput(inode); -+ return err; -+} -+ - static int ext3_link (struct dentry * old_dentry, - struct inode * dir, struct dentry *dentry) - { -@@ -2191,7 +2206,8 @@ - ext3_inc_count(handle, inode); - atomic_inc(&inode->i_count); - -- err = ext3_add_nondir(handle, dentry, inode); -+ err = ext3_add_link(handle, dentry, inode); -+ ext3_orphan_del(handle, inode); - ext3_journal_stop(handle); - if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) - goto retry; -Index: linux-2.6.12-rc6/fs/ext3/super.c -=================================================================== ---- linux-2.6.12-rc6.orig/fs/ext3/super.c 2005-06-14 16:01:16.287775299 +0200 -+++ linux-2.6.12-rc6/fs/ext3/super.c 2005-06-14 16:14:33.656906156 +0200 -@@ -590,6 +590,7 @@ - Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, -+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, - }; - - static match_table_t tokens = { -@@ -638,6 +639,9 @@ - {Opt_ignore, "noquota"}, - {Opt_ignore, "quota"}, - {Opt_ignore, "usrquota"}, -+ {Opt_iopen, "iopen"}, -+ {Opt_noiopen, "noiopen"}, -+ {Opt_iopen_nopriv, "iopen_nopriv"}, - {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL}, - {Opt_resize, "resize"}, -@@ -921,6 +925,18 @@ - else - clear_opt(sbi->s_mount_opt, BARRIER); - break; -+ case Opt_iopen: -+ set_opt 
(sbi->s_mount_opt, IOPEN); -+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ break; -+ case Opt_noiopen: -+ clear_opt (sbi->s_mount_opt, IOPEN); -+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ break; -+ case Opt_iopen_nopriv: -+ set_opt (sbi->s_mount_opt, IOPEN); -+ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); -+ break; - case Opt_ignore: - break; - case Opt_resize: -Index: linux-2.6.12-rc6/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.12-rc6.orig/include/linux/ext3_fs.h 2005-06-14 16:01:14.709650318 +0200 -+++ linux-2.6.12-rc6/include/linux/ext3_fs.h 2005-06-14 16:28:38.452794245 +0200 -@@ -358,6 +358,8 @@ - #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ - #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ - #define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */ -+#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ -+#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc3.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc3.series deleted file mode 100644 index 3661023..0000000 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc3.series +++ /dev/null @@ -1,13 +0,0 @@ -ext3-wantedi-2.6-rhel4.patch -ext3-san-jdike-2.6-suse.patch -iopen-2.6-rhel4.patch -export_symbols-ext3-2.6-suse.patch -ext3-map_inode_page-2.6-suse.patch -ext3-ea-in-inode-2.6-rhel4.patch -export-ext3-2.6-rhel4.patch -ext3-include-fixes-2.6-rhel4.patch -ext3-extents-2.6.9-rhel4.patch -ext3-mballoc2-2.6.9-rhel4.patch -ext3-nlinks-2.6.9.patch -ext3-ialloc-2.6.patch -ext3-lookup-dotdot-2.6.9.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc5.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc5.series deleted file mode 100644 index 1c853bd..0000000 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-fc5.series +++ 
/dev/null @@ -1,12 +0,0 @@ -ext3-wantedi-2.6-rhel4.patch -ext3-san-jdike-2.6-suse.patch -iopen-2.6-fc5.patch -ext3-map_inode_page-2.6-suse.patch -export-ext3-2.6-rhel4.patch -ext3-include-fixes-2.6-rhel4.patch -ext3-extents-2.6.15.patch -ext3-mballoc2-2.6-fc5.patch -ext3-nlinks-2.6.9.patch -ext3-ialloc-2.6.patch -ext3-remove-cond_resched-calls-2.6.12.patch -ext3-filterdata-2.6.15.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series deleted file mode 100644 index ee07d11..0000000 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series +++ /dev/null @@ -1,17 +0,0 @@ -ext3-wantedi-2.6-rhel4.patch -ext3-san-jdike-2.6-suse.patch -iopen-2.6-rhel4.patch -export_symbols-ext3-2.6-suse.patch -ext3-map_inode_page-2.6-suse.patch -ext3-ea-in-inode-2.6-rhel4.patch -export-ext3-2.6-rhel4.patch -ext3-include-fixes-2.6-rhel4.patch -ext3-extents-2.6.9-rhel4.patch -ext3-mballoc2-2.6.9-rhel4.patch -ext3-nlinks-2.6.9.patch -ext3-ialloc-2.6.patch -ext3-lookup-dotdot-2.6.9.patch -ext3-sector_t-overflow-2.6.9-rhel4.patch -ext3-check-jbd-errors-2.6.9.patch -ext3-nanosecond-2.6-rhel4.patch -ext3-extents-bug11324.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles10.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles10.series deleted file mode 100644 index e9f3f1f..0000000 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles10.series +++ /dev/null @@ -1,15 +0,0 @@ -ext3-wantedi-2.6-rhel4.patch -ext3-san-jdike-2.6-suse.patch -iopen-2.6-fc5.patch -ext3-map_inode_page-2.6-suse.patch -export-ext3-2.6-rhel4.patch -ext3-include-fixes-2.6-rhel4.patch -ext3-extents-2.6.16-sles10.patch -ext3-mballoc2-2.6-fc5.patch -ext3-nlinks-2.6.9.patch -ext3-ialloc-2.6.patch -ext3-remove-cond_resched-calls-2.6.12.patch -ext3-filterdata-2.6.15.patch -ext3-disable-write-bar-by-default-2.6-sles10.patch -ext3-nanosecond-2.6-sles10.patch -ext3-inode-version-2.6-sles10.patch diff --git 
a/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series deleted file mode 100644 index e27e861..0000000 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series +++ /dev/null @@ -1,17 +0,0 @@ -ext3-wantedi-2.6-suse.patch -ext3-san-jdike-2.6-suse.patch -iopen-2.6-suse.patch -export_symbols-ext3-2.6-suse.patch -ext3-map_inode_page-2.6-suse.patch -ext3-ea-in-inode-2.6-suse.patch -export-ext3-2.6-suse.patch -ext3-include-fixes-2.6-suse.patch -ext3-extents-2.6.5.patch -ext3-mballoc2-2.6-suse.patch -ext3-nlinks-2.6.7.patch -ext3-rename-reserve-2.6-suse.patch -ext3-ialloc-2.6.patch -ext3-lookup-dotdot-2.6.9.patch -ext3-sector_t-overflow-2.6.5-suse.patch -ext3-check-jbd-errors-2.6.5.patch -ext3-nanosecond-2.6-suse.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series deleted file mode 100644 index 53c060b..0000000 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series +++ /dev/null @@ -1,15 +0,0 @@ -ext3-wantedi-2.6-rhel4.patch -ext3-san-jdike-2.6-suse.patch -iopen-2.6.12.patch -ext3-map_inode_page-2.6-suse.patch -export-ext3-2.6-rhel4.patch -ext3-include-fixes-2.6-rhel4.patch -ext3-extents-2.6.12.patch -ext3-mballoc2-2.6.12.patch -ext3-nlinks-2.6.9.patch -ext3-ialloc-2.6.patch -ext3-remove-cond_resched-calls-2.6.12.patch -ext3-htree-dot-2.6.patch -ext3-external-journal-2.6.12.patch -ext3-lookup-dotdot-2.6.9.patch -ext3-sector_t-overflow-2.6.12.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6.18-vanilla.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6.18-vanilla.series deleted file mode 100644 index 350067d..0000000 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6.18-vanilla.series +++ /dev/null @@ -1,14 +0,0 @@ -ext3-wantedi-2.6-rhel4.patch -ext3-san-jdike-2.6-suse.patch -iopen-2.6-fc5.patch -ext3-map_inode_page-2.6-suse.patch -export-ext3-2.6-rhel4.patch -ext3-include-fixes-2.6-rhel4.patch 
-ext3-extents-2.6.18-vanilla.patch -ext3-mballoc2-2.6.18-vanilla.patch -ext3-nlinks-2.6.9.patch -ext3-ialloc-2.6.patch -ext3-remove-cond_resched-calls-2.6.12.patch -ext3-filterdata-2.6.15.patch -ext3-nanosecond-2.6.18-vanilla.patch -ext3-inode-version-2.6.18-vanilla.patch diff --git a/ldiskfs/ldiskfs/Makefile.in b/ldiskfs/ldiskfs/Makefile.in deleted file mode 100644 index eeb1bed..0000000 --- a/ldiskfs/ldiskfs/Makefile.in +++ /dev/null @@ -1,21 +0,0 @@ -default: all - -MODULES := ldiskfs - -# copy makefile over to not break patches -ext3_extra := $(wildcard @LINUX@/fs/ext3/Makefile) - -ext3_headers := $(wildcard @LINUX@/fs/ext3/*.h) -linux_headers := $(wildcard @LINUX@/include/linux/ext3*.h) - -ext3_sources := $(filter-out %.mod.c,$(wildcard @LINUX@/fs/ext3/*.c)) -new_sources := iopen.c iopen.h extents.c mballoc.c -new_headers := ext3_extents.h -ldiskfs_patched_sources := $(notdir $(ext3_sources) $(ext3_headers)) $(new_sources) $(new_headers) -ldiskfs_sources := $(ldiskfs_patched_sources) - -ldiskfs-objs := $(filter %.o,$(ldiskfs_sources:.c=.o)) - -EXTRA_PRE_CFLAGS := -I@LINUX@/fs -I@LUSTRE@ -I@LUSTRE@/ldiskfs - -@INCLUDE_RULES@ diff --git a/ldiskfs/ldiskfs/autoMakefile.am b/ldiskfs/ldiskfs/autoMakefile.am deleted file mode 100644 index 8ac1b87..0000000 --- a/ldiskfs/ldiskfs/autoMakefile.am +++ /dev/null @@ -1,80 +0,0 @@ -if MODULES -if LDISKFS -modulefs_DATA = ldiskfs$(KMODEXT) -endif -endif - -ldiskfs_linux_headers := $(addprefix linux/,$(subst ext3,ldiskfs,$(notdir $(linux_headers)))) - -$(filter %.c,$(ldiskfs_patched_sources)): sources $(ldiskfs_linux_headers) $(filter %.h,$(ldiskfs_patched_sources)) - -ldiskfs_sed_flags = \ - -e "s/dx_hash_info/ext3_dx_hash_info/g" \ - -e "s/dir_private_info/ext3_dir_private_info/g" \ - -e "s/DX_HASH/EXT3_DX_HASH/g" \ - -e "s/reserve_window/ext3_reserve_window/g" \ - -e "s/rsv_window_add/ext3_rsv_window_add/g" \ - -e "s/EXT3/LDISKFS/g" -e "s/ext3/ldiskfs/g" - -%.c: linux-stage/fs/ext3/%.c - sed $(strip $(ldiskfs_sed_flags)) $< 
> $@ - -%.h: linux-stage/fs/ext3/%.h - sed $(strip $(ldiskfs_sed_flags)) $< > $@ - -linux/ldiskfs%.h: linux-stage/include/linux/ext3%.h - sed $(strip $(ldiskfs_sed_flags)) $< > $@ - -# -# FIXME: we need to grab the series in configure somehow -# (see bug 1679) -# -series := @top_srcdir@/lustre/kernel_patches/series/ldiskfs-$(LDISKFS_SERIES) -patches := @top_srcdir@/lustre/kernel_patches/patches - -sources: $(ext3_sources) $(ext3_headers) $(linux_headers) $(series) - rm -rf linux-stage linux sources $(ldiskfs_SOURCES) - mkdir -p linux-stage/fs/ext3 linux-stage/include/linux - cp $(ext3_sources) $(ext3_headers) $(ext3_extra) linux-stage/fs/ext3 - cp $(linux_headers) linux-stage/include/linux -if USE_QUILT - ln -s ../$(patches) linux-stage/patches - ln -s ../$(series) linux-stage/series - cd linux-stage && quilt push -a -q -else - @echo -n "Applying ext3 patches:" - @cd linux-stage && for i in $$(<../$(series)) ; do \ - echo -n " $$i" ; \ - patch -s -p1 < ../$(patches)/$$i || exit 1 ; \ - done - @echo -endif - mkdir linux - @echo -n "Replacing 'ext3' with 'ldiskfs':" - @for i in $(notdir $(ext3_headers) $(ext3_sources)) $(new_sources) ; do \ - echo -n " $$i" ; \ - sed $(strip $(ldiskfs_sed_flags)) \ - linux-stage/fs/ext3/$$i > $$i ; \ - done - @for i in $(subst ext3,,$(notdir $(linux_headers) $(new_headers))) ; do \ - echo -n " ext3$$i" ; \ - sed $(strip $(ldiskfs_sed_flags)) \ - linux-stage/include/linux/ext3$$i \ - > linux/ldiskfs$$i ; \ - done - @echo - touch sources - -foo-check: - @echo "ldiskfs_sources: $(ldiskfs_sources)" - @echo "ldiskfs_SOURCES: $(ldiskfs_SOURCES)" - @echo "ldiskfs_headers: $(ldiskfs_headers)" - @echo "ldiskfs_objects: $(ldiskfs_objects)" - @echo "ldiskfs_OBJECTS: $(ldiskfs_OBJECTS)" - @echo "ldiskfs_LDADD: $(ldiskfs_LDADD)" - -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -CLEANFILES = sources $(notdir $(linux_headers) $(ext3_headers) $(ext3_sources) $(new_sources) $(new_headers)) - -clean: clean-am - rm -rf linux linux-stage diff --git 
a/lustre/.cvsignore b/lustre/.cvsignore deleted file mode 100644 index 07a5c92..0000000 --- a/lustre/.cvsignore +++ /dev/null @@ -1,30 +0,0 @@ -.Xrefs -.Xrefs-2.5 -aclocal.m4 -config.log -config.status -config.cache -config.guess -config.sub -configure -Makefile -autoMakefile -autoMakefile.in -.deps -tags -TAGS -lustre*.tar.gz -cscope.files -cscope.out -autom4te-2.53.cache -autom4te.cache -depcomp -compile -.*.cmd -.mergeinfo-* -Rules -missing -mkinstalldirs -install-sh -.depend -.tmp_versions diff --git a/lustre/BUGS b/lustre/BUGS deleted file mode 100644 index ba84777..0000000 --- a/lustre/BUGS +++ /dev/null @@ -1 +0,0 @@ -To report bugs, please visit http://bugzilla.clusterfs.com/ diff --git a/lustre/BUILDING b/lustre/BUILDING deleted file mode 100644 index 1c69d3c..0000000 --- a/lustre/BUILDING +++ /dev/null @@ -1,30 +0,0 @@ -BUILDING LUSTRE ---------------- - -You must already have a Lustre-patched kernel, which is outside of the -scope of this document. For more information on this process, see the -web sites below. Also consider downloading a pre-packaged Lustre -kernel and utilities from http://www.lustre.org/downloads.html - -To build: - sh autogen.sh - ./configure --with-linux=/usr/src/lustre_patched_kernel_tree - make - -To play with Lustre: - cd tests - sh llmount.sh - -To clean up: - sh llmountcleanup.sh - -More information about Lustre: - http://www.lustre.org/ - -More information about Cluster File Systems: - http://www.clusterfs.com/ - -Feedback: - lustre-discuss@lists.clusterfs.com - -- The Lustre Team - diff --git a/lustre/ChangeLog b/lustre/ChangeLog deleted file mode 100644 index 044f51e..0000000 --- a/lustre/ChangeLog +++ /dev/null @@ -1,4689 +0,0 @@ -tbd Cluster File Systems, Inc. 
- * version 1.6.5 - * Support for kernels: - 2.6.5-7.286 (SLES 9), - 2.6.9-55.0.9.EL (RHEL 4), - 2.6.16.53-0.8 (SLES 10), - 2.6.18-8.1.14.el5 (RHEL 5), - 2.6.18.8 vanilla (kernel.org) - * Client support for unpatched kernels: - (see http://wiki.lustre.org/index.php?title=Patchless_Client) - 2.6.16 - 2.6.22 vanilla (kernel.org) - * Due to recently discovered recovery problems, we do not recommend - using patchless RHEL 4 clients with this or any earlier release. - * Recommended e2fsprogs version: 1.40.2-cfs1 - * Note that reiserfs quotas are disabled on SLES 10 in this kernel. - -Severity : enhancement -Bugzilla : 13690 -Description: Build SLES10 patchless client fails -Details : The configure was broken by run ./configure with - --with-linux-obj=.... argument for patchless client. When the - configure use --with-linux-obj, the LINUXINCLUDE= -Iinclude - can't search header adequately. Use absolute path such as - -I($LINUX)/include instead. - -Severity : enhancement -Bugzilla : 11622 -Description: Lustre Page Accounting -Details : New macros for page alloc and free which enable accounting - of page allocation of Lustre. Use percpu counters to store memory - and page statistics. - -Severity : normal -Bugzilla : 13497 -Description: LASSERT_{REQ,REP}SWAB macros are buggy -Details : If SWAB_PARANOIA is disabled, the LASSERT_REQSWAB and - LASSERT_REPSWAB macros become no-ops, which is incorrect. Drop - these macros and replace them with their difinitions instead. - -Severity : normal -Bugzilla : 13556 -Description: conf-sanity.sh test_33 failed with 1 -Details : change mgsnode - -Severity : normal -Bugzilla : 13888 -Description: interrupt oig_wait produce painc on resend. -Details : brw_redo_request can be used for resend requests from ptlrpcd and - private set, and this produce situation when rq_ptlrpcd_data not - copyed to new allocated request and triggered LBUG on assert - req->rq_ptlrpcd_data != NULL. 
But this member used only for wakeup - ptlrpcd set if request is changed and can be safety changed to use - rq_set directly. - -Severity : normal -Frequency : when using O_DIRECT and quotas -Bugzilla : 13930 -Description: Incorrect file ownership on O_DIRECT output files -Details : block usage reported by 'lfs quota' does not take into account - files that have been written with O_DIRECT. - --------------------------------------------------------------------------------- - -2007-10-26 Cluster File Systems, Inc. - * version 1.6.4 - * Support for kernels: - 2.6.5-7.286 (SLES 9), - 2.6.9-55.0.9.EL (RHEL 4), - 2.6.16.53-0.8 (SLES 10), - 2.6.18-8.1.14.el5 (RHEL 5), - 2.6.18.8 vanilla (kernel.org) - * Client support for unpatched kernels: - (see http://wiki.lustre.org/index.php?title=Patchless_Client) - 2.6.16 - 2.6.22 vanilla (kernel.org) - * Due to recently discovered recovery problems, we do not recommend - using patchless RHEL 4 clients with this or any earlier release. - * Recommended e2fsprogs version: 1.40.2-cfs1 - * Note that reiserfs quotas are disabled on SLES 10 in this kernel. - -Severity : normal -Bugzilla : 13730 -Description: Not fail import if we got -EAGAIN -Details : if osc_interpret_create got -EAGAIN his immediately exit and - wakeup oscc_waitq. After wakeup oscc_wait_for_objects call - oscc_has_objects and see osc has no objests and call - oscc_internal_create for resend create request. - -Severity : normal -Bugzilla : 13521 -Description: Update kernel patches for SLES10 2.6.16.53-0.8. -Details : Update which_patch & target file for SLES10 latest kernel. - -Severity : enhancement -Bugzilla : 13128 -Description: add --type and --size parameters to lfs find -Details : Enhance lfs find by adding filetype and filesize parameters. Also - multiple OBDs can now be specified for the --obd option. 
- -Severity : enhancement -Bugzilla : 11270 -Description: eliminate client locks in face of contention -Details : file contention detection and lockless i/o implementation - for contended files. - -Severity : normal -Bugzilla : 12411 -Description: Remove client patches from SLES 10 kernel. -Details : This causes SLES 10 clients to behave as patchless clients - even on a Lustre-patched (server) kernel. - -Severity : minor -Bugzilla : 2369 -Description: use i_size_read and i_size_write in 2.6 port -Details : replace inode->i_size access with i_size_read/write() - -Severity : normal -Frequency : when removing large files -Bugzilla : 13181 -Description: scheduling issue during removal of large Lustre files -Details : Don't take the BKL in fsfilt_ext3_setattr() for 2.6 kernels. - It causes scheduling issues when removing large files (17TB in the - present case). - -Severity : normal -Bugzilla : 13358 -Description: 1.4.11 Can't handle directories with stripe set and extended acls -Details : Impossible (EPROTO is returned) to access a directory that has a - non-default striping and ACLs. - -Severity : normal -Frequency : only on ppc -Bugzilla : 12234 -Description: /proc/fs/lustre/devices broken on ppc -Details : The patch as applied to 1.6.2 doesn't look correct for all arches. - We should make sure the type of 'index' is loff_t and then cast - explicitly as needed below. Do not assign an explicitly cast - loff_t to an int. - -Severity : normal -Frequency : only for rhel5 -Bugzilla : 13616 -Description: Kernel patches update for RHEL5 2.6.18-8.1.10.el5. -Details : Modify the target file & which_kernel. - -Severity : normal -Frequency : if the uninit_groups feature is enabled on ldiskfs -Bugzilla : 13706 -Description: e2fsck reports "invalid unused inodes count" -Details : If a new ldiskfs filesystem is created with the "uninit_groups" - feature and only a single inode is created in a group then the - "bg_unused_inodes" count is incorrectly updated. 
Creating a - second inode in that group would update it correctly. - -Severity : minor -Bugzilla : 12948 -Description: buffer overruns could theoretically occur -Details : llapi_semantic_traverse() modifies the "path" argument by - appending values to the end of the origin string, and a buffer - overrun may occur. Adding buffer overrun check in liblustreapi. - -Severity : normal -Bugzilla : 13454 -Description: Add jbd statistics patch for RHEL5 and 2.6.18-vanilla. - -Severity : normal -Bugzilla : 11673 -Description: handle "serious error: objid * already exists" more gracefully -Details : If LAST_ID value on disk is smaller than the objects existing in - the O/0/d* directories, it indicates disk corruption and causes an - LBUG(). If the object is 0-length, then we should use the existing - object. This will help to avoid a full fsck in most cases. - -Severity : normal -Bugzilla : 13518 -Description: Kernel patches update for RHEL4 2.6.9-55.0.6. -Details : Modify vm-tunables-rhel4.patch. - -Severity : normal -Bugzilla : 13452 -Description: Kernel config for 2.6.18-vanilla. -Details : Modify targets/2.6-vanilla.target.in. - Add config file kernel-2.6.18-2.6-vanilla-i686.config. - Add config file kernel-2.6.18-2.6-vanilla-i686-smp.config. - Add config file kernel-2.6.18-2.6-vanilla-x86_64.config. - Add config file kernel-2.6.18-2.6-vanilla-x86_64-smp.config. - -Severity : major -Bugzilla : 11710 -Description: improve handling recoverable errors -Details : if request processig with error which can be recoverable on server - request should be resend, otherwise page released from cache and - marked as error. - -Severity : critical -Bugzilla : 13751 -Description: Kernel patches update for RHEL5 2.6.18-8.1.14.el5. -Details : Modify target file & which_patch. - A flaw was found in the IA32 system call emulation provided - on AMD64 and Intel 64 platforms. 
An improperly validated 64-bit - value could be stored in the %RAX register, which could trigger an - out-of-bounds system call table access. An untrusted local user - could exploit this flaw to run code in the kernel - (ie a root privilege escalation). (CVE-2007-4573). - -Severity : minor -Bugzilla : 13732 -Description: change order of libsysio includes -Details : '#include sysio.h' should always come before '#include xtio.h' - -Severity : enhancement -Bugzilla : 13207 -Description: adapt the lustre_config script to support the upgrade case -Details : Add "-u" option for lustre_config script to support upgrading 1.4 - server targets to 1.6 in parallel. - -Severity : normal -Bugzilla : 13570 -Description: To avoid grant space > avaible space when the disk is almost - full. Without this patch you might see the error "grant XXXX > - available" or some LBUG about grant, when the disk is almost - full. -Details : In filter_check_grant, for non_grant cache write, we should - check the left space by if (*left > ungranted + bytes), instead - of (*left > ungranted), because only we are sure the left space - is enough for another "bytes", then the ungrant space should be - increase. In client, we should update cl_avail_grant only there is - OBD_MD_FLGRANT in the reply. - -Severity : critical -Bugzilla : 13748 -Description: Update RHEL 4 kernel to fix local root privilege escalation. -Details : Update to the latest RHEL 4 kernel to fix the vulnerability - described in CVE-2007-4573. This problem could allow untrusted - local users to gain root access. - --------------------------------------------------------------------------------- - -2007-09-27 Cluster File Systems, Inc. 
- * version 1.6.3 - * Support for kernels: - 2.6.5-7.286 (SLES 9), - 2.6.9-55.0.2.EL (RHEL 4), - 2.6.16.46-0.14 (SLES 10), - 2.6.18-8.1.8.el5 (RHEL 5), - 2.6.18.8 vanilla (kernel.org) - * Client support for unpatched kernels: - (see http://wiki.lustre.org/index.php?title=Patchless_Client) - 2.6.16 - 2.6.22 vanilla (kernel.org) - * Due to recently discovered recovery problems, we do not recommend - using patchless RHEL 4 clients with this or any earlier release. - * Recommended e2fsprogs version: 1.40.2-cfs1 - * Note that reiserfs quotas are disabled on SLES 10 in this kernel. - -Severity : minor -Bugzilla : 12186 -Description: Fix errors in lfs documentation -Details : Fixes man pages - -Severity : enhancement -Bugzilla : 3055 -Description: Adaptive timeouts -Details : RPC timeouts adapt to changing server load and network - conditions to reduce resend attempts and improve recovery time. - -Severity : enhancement -Bugzilla : 12192 -Description: llapi_file_create() does not allow some changes -Details : add llapi_file_open() that allows specifying the file creation - mode and open flags, and also returns an open file handle. - -Severity : enhancement -Bugzilla : 12743 -Description: df doesn't work properly if diskfs blocksize != 4K -Details : Choose biggest blocksize of OST's as the LOV's blocksize. - -Severity : normal -Frequency : always on directories with default striping set -Bugzilla : 12836 -Description: lfs find on -1 stripe looping in lsm_lmm_verify_common() -Details : Avoid lov_verify_lmm_common() on directory with -1 stripe count. - -Severity : enhancement -Bugzilla : 11248 -Description: merge and cleanup kernel patches. -Details : Remove mnt_lustre_list in vfs_intent-2.6-rhel4.patch. - -Severity : enhancement -Bugzilla : 13039 -Description: RedHat Update kernel for RHEL5 -Details : Kernel config file for RHEL5. 
- -Severity : enhancement -Bugzilla : 12446 -Description: OSS needs mutliple precreate threads -Details : Add ability to start more than one create thread per OSS. - -Severity : major -Frequency : only with quota on the root user -Bugzilla : 12223 -Description: mds_obd_create error creating tmp object -Details : When the user sets quota on root, llog will be affected and can't - create files and write files. - -Severity : normal -Frequency : Always on ia64 patchless client, and possibly others. -Bugzilla : 12826 -Description: Add EXPORT_SYMBOL check for node_to_cpumask symbol. -Details : This allows the patchless client to be loaded on architectures - without this export. - -Severity : normal -Frequency : rare -Bugzilla : 13142 -Description: disorder of journal start and llog_add cause deadlock. -Details : in llog_origin_connect, journal start should happen before - llog_add keep the same order as other functions to avoid - the deadlock. - -Severity : enhancement -Bugzilla : 13039 -Description: RedHat Update kernel for RHEL5 -Details : Modify the kernel config file more closer RHEL5. - -Severity : enhancement -Bugzilla : 13360 -Description: Build failure against Centos5 (RHEL5) -Details : Define PAGE_SIZE when it isn't present. - -Severity : normal -Frequency : occasionally when using NFS -Bugzilla : 13030 -Description: "ll_intent_file_open()) lock enqueue: err: -13" with nfs -Details : with NFS, the anon dentry's parent was set to itself in - d_alloc_anon(), so in MDS, we use rec->ur_fid1 to find the - corresponding dentry other than use rec->ur_name. - -Severity : enhancement -Bugzilla : 12398 -Description: enable data checksumming by default -Details : enable checksum by default, allow --disable-checksum - configure option and "-o nochecksum" mount option. Checksums - can also be disabled at runtime via $LPROC/osc/*/checksum_pages. 
- -Severity : normal -Frequency : Occasionally with failover -Bugzilla : 12459 -Description: Client eviction due to failover config -Details : after a connection loss, the lustre client should attempt to - reconnect to the last active server first before trying the - other potential connections. - -Severity : enhancement -Bugzilla : 11401 -Description: client-side metadata stat-ahead during readdir(directory readahead) -Details : perform client-side metadata stat-ahead when the client detects - readdir and sequential stat of dir entries therein - -Severity : minor -Frequency : rare -Bugzilla : 12588 -Description: when mds and osts use different quota unit(32bit and 64bit), - quota will be released repeatly. -Details : void sending multiple quota reqs to mds, which will keep the - status between the reqs. - -Severity : normal -Frequency : only with liblustre clients on XT3 -Bugzilla : 12418 -Description: evictions taking too long -Details : allow llrd to evict clients directly on OSTs - -Severity : normal -Bugzilla : 13125 -Description: osts not allocated evenly to files -Details : change the condition to increase offset_idx - -Severity : enhancement -Bugzilla : 2262 -Description: self-adjustable client's lru lists -Details : use adaptive algorithm for managing client cached locks lru - lists according to current server load, other client's work - pattern, memory activities, etc. Both, server and client - side namespaces provide number of proc tunables for controlling - things - -Severity : cleanup -Bugzilla : 13532 -Description: rewrite ext2-derived code in llite/dir.c and obdclass/uuid.c -Details : rewrite inherited code (uuid parsing code from ext2 utils and - readdir code from ext3) from scratch preserving functionality. - -Severity : normal -Bugzilla : 13436 -Description: Only those disconnect error should be returned by rq_status. 
-Details : In the open/enqueue process, some errors, which will cause the client - to be disconnected, should be returned by rq_status, while other - errors should still be returned by intent, then mdc or llite will - detect them. - -Severity : enhancement -Bugzilla : 11230 -Description: Tune the kernel for good SCSI performance. -Details : Set the value of /sys/block/{dev}/queue/max_sectors_kb - to the value of /sys/block/{dev}/queue/max_hw_sectors_kb - in mount_lustre. - -Severity : critical -Frequency : Always for filesystems larger than 2TB on 32-bit systems. -Bugzilla : 13547, 13627 -Description: Data corruption for OSTs that are formatted larger than 2TB - on 32-bit servers. -Details : When generating the bio request for lustre file writes the - sector number would overflow a temporary variable before being - used for the IO. The data reads correctly from Lustre (which - will overflow in a similar manner) but other file data or - filesystem metadata may be corrupted in some cases. - -Severity : normal -Bugzilla : 13600 -Description: "lfs find -obd UUID" prints directories -Details : "lfs find -obd UUID" will return all directory names instead - of just file names. It is incorrect because the directories - do not reside on the OSTs. - -Severity : normal -Bugzilla : 13596 -Description: MDS hang after unclean shutdown of lots of clients -Details : Never resend AST requests. - -Severity : normal -Bugzilla : 13304 -Frequency : Always, for kernels after 2.6.16 -Description: Fix warning idr_remove called for id=.. which is not allocated. -Details : Recent kernels save old s_dev before kill super and do not allow - restoring it from the callback - restore it before calling kill_anon_super. - -Severity : normal -Bugzilla : 13334 -Description: Fix error on 'ls .' at the top of the Lustre mount. -Details : Don't revalidate dentry if it is a root dentry. - --------------------------------------------------------------------------------- - -2007-08-27 Cluster File Systems, Inc.
- * version 1.6.2 - * Support for kernels: - 2.6.5-7.286 (SLES 9), - 2.6.9-55.0.2.EL (RHEL 4), - 2.6.16.46-0.14 (SLES 10), - 2.6.18-8.1.8.el5 (RHEL 5), - 2.6.18.8 vanilla (kernel.org) - * Client support for unpatched kernels: - (see http://wiki.lustre.org/index.php?title=Patchless_Client) - 2.6.16 - 2.6.22 vanilla (kernel.org) - * Due to recently discovered recovery problems, we do not recommend - using patchless RHEL 4 clients with this or any earlier release. - * Recommended e2fsprogs version: 1.39.cfs8 - * Note that reiserfs quotas are disabled on SLES 10 in this kernel. - -Severity : minor -Frequency : rare -Bugzilla : 13147 -Description: block reactivating mgc import until all deactivates complete -Details : Fix race when failing back MDT/MGS to itself (testing) - -Severity : enhancement -Bugzilla : 12786 -Description: lfs setstripe enhancement -Details : Make lfs setstripe understand 'k', 'm' and 'g' for stripe size. - -Severity : normal -Frequency : mds/oss recovery -Bugzilla : 10800 -Description: llog ctxt is refrenced after it has been freed. -Details : llog ctxt refcount was added to avoide the race between ctxt free - and llog recovery process. Each llog user must hold ctxt refcount - before it access the llog. And the llog ctxt can only be freed - when its refcount is zero. - -Severity : enhancement -Bugzilla : 12211 -Description: randomly memory allocation failure util -Details : Make lustre randomly failed allocating memory for testing purpose. - -Severity : normal -Frequency : only for SLES10 -Bugzilla : 12771 -Description: Update kernel patch for SLES10 SP1 -Details : Add patch blkdev_tunables-2.6-sles10.patch to 2.6-sles10.series. - -Severity : enhancement -Bugzilla : 10786 -Description: omit set fsid for export NFS -Details : fix set/restore device id for avoid EMFILE error and mark lustre fs - as FS_REQUIRES_DEV for avoid problems with generate fsid. 
- -Severity : major -Frequency : after network failures -Bugzilla : 12769 -Description: Add sync option to mount_lustre.c -Details : Client loses data written to lustre after a network interruption. - -Severity : enhancement -Bugzilla : 10595 -Description: Error message improvement. -Details : Merging of two LCONSOLE_ERROR_MSG into one. - -Severity : enhancement -Bugzilla : 12606 -Description: don't use GFP_* in generic Lustre code. -Details : Use cfs_alloc_* functions and CFS_* flags for code portability. - -Severity : enhancement -Bugzilla : 12333 -Description: obdclass is limited by single OBD_ALLOC(idarray) -Details : replace OBD_ALLOC/OBD_FREE with OBD_VMALLOC/OBD_VFREE - -Severity : enhancement -Bugzilla : 12415 -Description: updated patchess for new RHEL4 kernel -Details : Fixed ext3-unlink-race.patch per Kalpak's comment. - -Severity : enhancement -Bugzilla : 13006 -Description: warnings with build patchless client with vanila 2.6.19 and up -Details : change old ctl_table style and replace ctl_table/ctl_table_header - with cfs_sysctl_table_t/cfs_sysctl_table_header_t - -Severity : enhancement -Bugzilla : 13093 -Description: O_DIRECT bypasses client statistics. -Details : When running with O_DIRECT I/O, neither the client rpc_stats nor - read_ahead_stats were updated. Copied stats section from - osc_send_oap_rpc() into async_internal(). - -Severity : minor -Frequency : only for Cray XT3 -Bugzilla : 11706 -Description: peer credits not enough on many OST per OSS systems. -Details : Use new lnet way to add credits as we need those for pings and ASTs - -Severity : minor -Frequency : only with liblustre -Bugzilla : 12790 -Description: Liblustre is not releasing flock locks on file close. -Details : Release flock locks on file close. 
- -Severity : minor -Frequency : only for RHEL4 -Bugzilla : 12839 -Description: Update kernel patches for kernel-2.6.9-55.0.2.EL -Details : Remove inode-nr_unused-2.6.9-rhel4.patch from 2.6-rhel4.series - Update target file and kernel config. - -Severity : normal -Bugzilla : 11802 -Description: lustre support for RHEL5 -Details : Add support for RHEL5. - -Severity : minor -Bugzilla : 11327 -Frequency : rare -Description: ASSERTION(export != NULL) failed in target_handle_connect -Details : The assertion hit is the result of a rare race between disconnect and connect - to the same nid. target_handle_connect found the old connect cookie and - tried to reconnect, but can't find the export for this cookie. - -Severity : normal -Frequency : rare -Bugzilla : 11756 -Description: umount blocks forever on error -Details : As a result of incorrect use of the obd_no_recov and obd_force flags, the client - can hang if a cancel or some other request is lost. - -Severity : normal -Frequency : Only for SLES -Bugzilla : 13177 -Description: sanity_quota fail test_1 -Details : There are multiple occurrences of $TSTUSR in SLES's /etc/group - file, which makes TSTID[2] non-unique. - -Severity : enhancement -Bugzilla : 13249 -Description: Kernel patches for SLES9 2.6.5-7.286 kernel -Details : Update target/ChangeLog/which_patch . - -Severity : enhancement -Bugzilla : 12955 -Description: jbd statistics -Details : Port older jbd statistics patch for sles10 - -Severity : normal -Frequency : rare -Bugzilla : 9977 -Description: lvbo_init failed for resource with missing objects. -Details : Fix returning error if we do stat for file with missing/corrupted - objects and i_size set to the sum of the sizes of all available objects. - If we truncate/write to a missing object - it is recreated. - -Severity : minor -Frequency : rare -Bugzilla : 13276 -Description: Oops in read and write path when failing to allocate lock. -Details : Check if lock allocation failed and return error back. - -Severity : normal -Frequency : When flocks are used.
-Bugzilla : 13103 -Description: assertion failure in ldlm_cli_enquque_fini for non NULL lock. -Details : Flock locks might destroy just granted lock if it could be merged - with another existing flock, this is done in completion handler, - so teach ldlm_cli_enquque_fini that this is a valid case for - flock locks. - -Severity : normal -Frequency : Rare -Bugzilla : 11974 -Description: reply_lock_interpret crash due to race with it and lock cancel. -Details : Do not replay locks that are being cancelled. Do not reference - locks by their address during replay, just by their handle. - -Severity : normal -Frequency : only with deactivated OSTs -Bugzilla : 11679 -Description: lstripe command fails for valid OST index -Details : The stripe offset is compared to 'lov->desc.ld_tgt_count' - instead of lov->desc.ld_active_tgt_count. - -Severity : enhancement -Bugzilla : 13360 -Description: Build failure against Centos5 (RHEL5) -Details : Use getpagesize() instead of PAGE_SIZE. - --------------------------------------------------------------------------------- - -2007-07-30 Cluster File Systems, Inc. - * version 1.6.1 - * Support for kernels: - 2.6.5-7.283 (SLES 9), - 2.6.9-55.EL (RHEL 4), - 2.6.16.46-0.14 (SLES 10), - 2.6.18.8 vanilla (kernel.org) - * Client support for unpatched kernels: - (see http://wiki.lustre.org/index.php?title=Patchless_Client) - 2.6.16 - 2.6.22 vanilla (kernel.org) - * Due to recently discovered recovery problems, we do not recommend - using patchless RHEL 4 clients with this or any earlier release. - * Recommended e2fsprogs version: 1.39.cfs8 - * Note that reiserfs quotas are disabled on SLES 10 in this kernel. - * Starting with this release, the ldiskfs backing filesystem required - by Lustre is now in its own package, lustre-ldiskfs. This package - should be installed. It is versioned separately from Lustre and - may be released separately in future. 
- -Severity : normal -Frequency : rare -Bugzilla : 13129 -Description: server LBUG when shutting down -Details : Block umount forever until the mount refcount is zero rather - than giving up after an arbitrary timeout. - -Severity : enhancement -Bugzilla : 12194 -Description: add optional extra BUILD_VERSION info -Details : add a new environment variable (namely LUSTRE_VERS) which allows - to override the lustre version. - -Severity : normal -Frequency : 2.6.18 servers only -Bugzilla : 12546 -Description: ll_kern_mount() doesn't release the module reference -Details : The ldiskfs module reference count never drops down to 0 - because ll_kern_mount() doesn't release the module reference. - -Severity : normal -Frequency : rare -Bugzilla : 12470 -Description: server LBUG when using old ost_num_threads parameter -Details : Accept the old ost_num_threads parameter but warn that it - is deprecated, and fix an off-by-one error that caused an LBUG. - -Severity : normal -Frequency : rare -Bugzilla : 11722 -Description: Transient SCSI error results in persistent IO issue -Details : iobuf->dr_error is not reinitialized to 0 between two - uses. - -Severity : normal -Frequency : sometimes when underlying device returns I/O errors -Bugzilla : 11743 -Description: OSTs not going read-only during write failures -Details : OSTs are not remounted read-only when the journal commit threads - get I/O errors because fsfilt_ext3 calls journal_start/stop() - instead of the ext3 wrappers. - -Severity : minor -Frequency : only with 10000 clients or more -Bugzilla : 12364 -Description: poor connect scaling with increasing client count -Details : Don't run filter_grant_sanity_check for more than 100 exports - to improve scaling for large numbers of clients. - -Severity : normal -Frequency : SLES10 only -Bugzilla : 12538 -Description: sanity-quota.sh quotacheck failed: rc = -22 -Details : Quotas cannot be enabled on SLES10. 
- -Severity : normal -Frequency : liblustre clients only -Bugzilla : 12229 -Description: getdirentries does not give error when run on compute nodes -Details : getdirentries does not fail when the size specified as an argument - is too small to contain at least one entry - -Severity : enhancement -Bugzilla : 11548 -Description: Add LNET router traceability for debug purposes -Details : If a checksum failure occurs with a router as part of the - IO path, the NID of the last router that forwarded the bulk data - is printed so it can be identified. - -Severity : normal -Frequency : rare -Bugzilla : 11315 -Description: OST "spontaneously" evicts client; client has imp_pingable == 0 -Details : Due to a race condition, liblustre clients were occasionally - evicted incorrectly. - -Severity : enhancement -Bugzilla : 10997 -Description: lfs setstripe uses optional parameters instead of positional - parameters. - -Severity : enhancement -Bugzilla : 10651 -Description: Nanosecond timestamp support for ldiskfs -Details : The on-disk ldiskfs filesystem has added support for nanosecond - resolution timestamps. There is not yet support for this at - the Lustre filesystem level. - -Severity : normal -Frequency : during server recovery -Bugzilla : 11203 -Description: MDS failing to send precreate requests due to OSCC_FLAG_RECOVERING -Details : requests with the rq_no_resend flag do not wake l_wait_event if they get a - timeout. - -Severity : minor -Frequency : nfs export on patchless client -Bugzilla : 11970 -Description: connectathon hangs when testing nfs export over patchless client -Details : Disconnected dentry cannot be found with lookup, so we do not need - to unhash it or make it invalid - -Bugzilla : 11757 -Description: fix llapi_lov_get_uuids() to allow many OSTs to be returned -Details : Change llapi_lov_get_uuids() to read the UUIDs from /proc instead - of using an ioctl. This allows lfsck for > 160 OSTs to succeed.
- -Severity : minor -Frequency : rare -Bugzilla : 11546 -Description: open req refcounting wrong on reconnect -Details : If reconnect happened between getting open reply from server and - call to mdc_set_replay_data in ll_file_open, we will schedule - replay for unreferenced request that we are about to free. - Subsequent close will crash in variety of ways. - Check that request is still eligible for replay in - mdc_set_replay_data(). - -Severity : minor -Frequency : rare -Bugzilla : 11512 -Description: disable writes to filesystem when reading health_check file -Details : the default for reading the health_check proc file has changed - to NOT do a journal transaction and write to disk, because this - can cause reads of the /proc file to hang and block HA state - checking on a healthy but otherwise heavily loaded system. It - is possible to return to the previous behaviour during configure - with --enable-health-write. - -Severity : enhancement -Bugzilla : 10768 -Description: 64-bit inode version -Details: : Add a on-disk 64-bit inode version for ext3 to track changes made - to the inode. This will be required for version-based recovery. - -Severity : normal -Frequency : rare -Bugzilla : 11818 -Description: MDS fails to start if a duplicate client export is detected -Details : in some rare cases it was possible for a client to connect to - an MDS multiple times. Upon recovery the MDS would detect this - and fail during startup. Handle this more gracefully. - -Severity : enhancement -Bugzilla : 11563 -Description: Add -o localflock option to simulate old noflock behaviour. -Details : This will achieve local-only flock/fcntl locks coherentness. 
- -Severity : minor -Frequency : rare -Bugzilla : 11658 -Description: log_commit_thread vs filter_destroy race leads to crash -Details : Take import reference before releasing llog record semaphore - -Severity : normal -Frequency : rare -Bugzilla : 12477 -Description: Wrong request locking in request set processing -Details : ptlrpc_check_set wrongly uses req->rq_lock to protect adding to - imp_delayed_list; imp_lock should be used here. - -Severity : normal -Frequency : when reconnecting -Bugzilla : 11662 -Description: Grant leak when OSC reconnects to OST -Details : When the OSC reconnects to the OST, the OST (filter) should check whether it - should grant more space to client by comparing fed_grant and - cl_avail_grant, and return the granted space to client instead - of "new granted" space, because client will call osc_init_grant - to update the client grant space info. - -Severity : normal -Frequency : when client reconnects to OST -Bugzilla : 11662 -Description: Grant leak when OSC does a resend and replays bulk write -Details : When the OSC reconnects to the OST, the OST (filter) should clear the grant info of - bulk write requests, because the grant info will be synced between - OSC and OST on reconnect, and we should ignore the grant info - of these resend/replay write reqs. - -Severity : normal -Frequency : rare -Bugzilla : 11662 -Description: Granted space is sometimes more than available space. -Details : When the OST is about to be full and two bulk writes from - different clients arrive at the OST, then according to the available space of the - OST, the first req should be permitted, and the second one - should be denied with ENOSPC. But if the second arrives before - the first one is committed, the OST might wrongly permit the second - write, which will cause grant space > available space.
- -Severity : normal -Frequency : when client is evicted -Bugzilla : 12371 -Description: Grant might be wrongly erased when osc is evicted by OST -Details : when the import is evicted by server, it will fork another - thread ptlrpc_invalidate_import_thread to invalidate the - import, where the grant will be set to 0. While the original - thread will update the grant it got when connecting. So if - the former happened latter, the grant will be wrongly errased - because of this race. - -Severity : normal -Frequency : rare -Bugzilla : 12401 -Description: Checking Stale with correct fid -Details : ll_revalidate_it should uses de_inode instead of op_data.fid2 - to check whether it is stale, because sometimes, we want the - enqueue happened anyway, and op_data.fid2 will not be initialized. - -Severity : enhancement -Bugzilla : 11647 -Description: update patchless client -Details : Add support for patchless client with 2.6.20, 2.6.21 and RHEL 5 - -Severity : normal -Frequency : only with 2.4 kernel -Bugzilla : 12134 -Description: random memory corruption -Details : size of struct ll_inode_info is to big for union inode.u and this - can be cause of random memory corruption. - -Severity : normal -Frequency : rare -Bugzilla : 10818 -Description: Memory leak in recovery -Details : Lov_mds_md was not free in an error handler in mds_create_object. - It should also check obd_fail before fsfilt_start, otherwise if - fsfilt_start return -EROFS,(failover mds during mds recovery). - then the req will return with repmsg->transno = 0 and rc = EROFS. - and we met hit the assert LASSERT(req->rq_reqmsg->transno == - req->rq_repmsg->transno) in ptlrpc_replay_interpret. Fcc should - be freed no matter whether fsfilt_commit success or not. - -Severity : minor -Frequency : only with huge numbers of clients -Bugzilla : 11817 -Description: Prevents from taking the superblock lock in llap_from_page for - a soon died page. 
-Details : using LL_ORIGIN_REMOVEPAGE origin flag instead of LL_ORIGIN_UNKNOW - for llap_from_page call in ll_removepage() prevents from taking - the superblock lock for a soon died page. - -Severity : normal -Frequency : rare -Bugzilla : 11935 -Description: Not check open intent error before release open handle -Details : in some rare cases, the open intent error is not checked before - release open handle, which may cause - ASSERTION(open_req->rq_transno != 0), because it tries to release - the failed open handle. - -Severity : normal -Frequency : rare -Bugzilla : 12556 -Description: Set cat log bitmap only after create log success. -Details : in some rare cases, the cat log bitmap is set too early. and it - should be set only after create log success. - -Severity : major -Bugzilla : 11971 -Description: Accessing a block bevice can re-enable I/O when Lustre is - tearing down a device. -Details : dev_clear_rdonly(bdev) must be called in kill_bdev() instead of - blkdev_put(). - -Severity : minor -Bugzilla : 11706 -Description: service threads may hog cpus when there are a lot of requests -Details : Insert cond_resched to give other threads a chance to use some CPU - -Severity : normal -Frequency : rare -Bugzilla : 12086 -Description: the cat log was not initialized in recovery -Details : When mds(mgs) do recovery, the tgt_count might be zero, so the - unlink log on mds will not be initialized until mds post - recovery. And also in mds post recovery, the unlink log will - initialization will be done asynchronausly, so there will be race - between add unlink log and unlink log initialization. - -Severity : normal -Bugzilla : 12597 -Description: brw_stats were being printed incorrectly -Details : brw_stats were being printed as log2 but all of them were not - recorded as log2. Also remove some code duplication arising from - filter_tally_{read,write}. - -Severity : normal -Bugzilla : 11674 -Frequency : rare, only in recovery. 
-Description: ASSERTION(req->rq_type != LI_POISON) failed -Details : imp_lock should be held while iterating over imp_sending_list for - prevent destroy request after get timeout in ptlrpc_queue_wait. - -Severity : normal -Bugzilla : 12689 -Description: replay-single.sh test 52 fails -Details : A lock's skiplist need to be cleanup when it being unlinked - from its resource list. - -Severity : normal -Bugzilla : 11737 -Frequency : always -Description: Short directio read returns full requested size rather than - actual amount read. -Details : Direct I/O operations should return actual amount of bytes - transferred rather than requested size. - -Severity : enhancement -Bugzilla : 10589 -Description: metadata RPC reduction (e.g. for rm performance) -Details : decrease the amount of synchronous RPC between clients and servers - by canceling conflicing lock before the operation on the client - and packing thier handles into the main operation RPC to server. - -Severity : enhancement -Bugzilla : 12605 -Description: add #ifdef HAVE_KERNEL_CONFIG_H -Details : kernels from 2.6.19 not need include linux/config.h, but add - include linux/autoconf.h in commpiler command line. - -Severity : enhancement -Bugzilla : 12764 -Description: patchless client support for 2.6.22 kernel -Details : 2.6.22 has only one visble change, SLAB_CTOR_* constants is - removed. In this case we need drop using os depended interface to - kmem_cache and use cfs_mem_cache API. - -Severity : minor -Bugzilla : 12747 -Frequency : always -Description: fix mal-formatted messages -Details : fix some mal-formatted DEBUG_REQ and LCONSOLE_ERROR_MSG messages - -Severity : minor -Bugzilla : 11737 -Frequency : always in liblustre -Description: wrong IS_ERR implementation in liblustre.h -Details : fix IS_ERR implementation in liblustre.h for right detect errors. - -Severity : minor -Bugzilla : 10419 -Frequency : always -Description: Correct condition for output debug message. 
-Details : inode i_nlink equal zero is not enough for output message about - disk corruption, i_ctime and i_mode should be also checked. - -Severity : minor -Bugzilla : 12415 -Frequency : always in patchless client -Description: add configure check for truncate_complete_page -Details : improve checks for exported symbols. This allow run check without - sources, but with Module.symvers shipped with kernel distribution. - add check for truncate_complete_page used by patchless client. - -Severity : normal -Bugzilla : 12646 -Description: sanity.sh test_77h fails with "test_77h file compare failed" -Details : test_77h uses a file which was messed by other test case. - -Severity : normal -Bugzilla : 12576 -Description: Not Check whether lov_tgts is NULL in some lov functions -Details : Checking whether lov_tgts is NULL in some functions. - -Severity : normal -Bugzilla : 11815 -Description: replace obdo_alloc() with OBDO_ALLOC macro -Details : nothing special is done in obdo_alloc() function, and for - debugging purpose, it needs to be replaced with macros. - -Severity : normal -Bugzilla : 12784 -Description: bad return value and errno from fcntl call -Details : In liblustre API, errno should be a negative value if error - happens. - -Severity : normal -Bugzilla : 11544 -Description: ptlrpc_check_set() LBUG -Details : In case of positive reply from server and failed client bulk - callback after bulk transfer shouldn't LBUG, but process this - request as erroneous. - -Severity : enhancement -Bugzilla : 10968 -Description: VFS operations stats tool. -Details : Tool which collects stats by tracking value written in pid, - ppid, gid and uses llstat to generate output to plot graph using - plot-llstat - Updated lustre/utils/Makefile.am - Added lustre/utils/ltrack_stats.c - -Severity : enhancement -Bugzilla : 11039 -Description: 2.6.18 server support (lustre 1.6.1) -Details : Support for 2.6.18 kernels on the server side. 
- -Severity : normal -Frequency : rare -Bugzilla : 12696 -Description: ASSERTION(imp->imp_conn_current) failed -Details : an assertion failure is hit if a client node boots and attempts to - mount a lustre filesystem faster than RECONNECT_INTERVAL seconds. - -Severity : normal -Frequency : only for i686 -Bugzilla : 12695 -Description: 1.4.11 RC1 build fails for RHEL 4, i686 -Details : Fixed config variable for build. - -Severity : normal -Frequency : rare -Bugzilla : 12415 -Description: Updated patchess for new RHEL4 kernel -Details : Updated patch inode-nr_unused-2.6.9-rhel4.patch - Updated patch jbd-stats-2.6.9.patch - Updated patch qsnet-rhel4-2.6.patch - Updated patch quota-deadlock-on-pagelock-core.patch - Updated patch vfs_intent-2.6-rhel4.patch - Updated patch vfs_races-2.6-rhel4.patch - Updated series file 2.6-rhel4-titech.series - Updated series file 2.6-rhel4.series - Updated kernel config files - -Severity : normal -Frequency : rare -Bugzilla : 12374 -Description: lquota slave complains LBUG when reconnecting with mds - or failover in mds. -Details : quota slave depends qctxt->lqc_import to send its quota request. - This pointer will be invalid if mds did failover or broke its - connect to osts, which leads to LBUG. - -Severity : normal -Frequency : when qunit size is too small(less than 20M) -Bugzilla : 12588 -Description: write is stopped by improper -EDQUOT -Details : If the master is busy and qunit size is small enough(let's say 1M), - the slave can not get quota from master on time, which will lead - slave to trigger a -EQUOTA to client. - -Severity : normal -Frequency : rare -Bugzilla : 12629 -Description: Deadlock during metadata tests -Details : in prune_dir_dentries(), shrink_dcache_parent() should not be - called with the per-dentry lock held. - -Severity : normal -Frequency : SLES9 only -Bugzilla : 12744 -Description: Lustre patched kernel for SLES9 SP3 has NR_CPUS set to 8 -Details : set CONFIG_NR_CPUS to 128 instead of 8. 
- -Severity : enhancement -Bugzilla : 12678 -Description: remove fs_prep_san_write operation and related patches -Details : remove the ext3-san-jdike patches which are no longer useful. - -Severity : normal -Frequency : rare -Bugzilla : 11324 -Description: LDISKFS-fs error (device sdc): ldiskfs_free_blocks -Details : a disk corruption can cause the mballoc code to assert on a - double free or other extent corruptions. Handle these with - ext3_error() instead of assertions. - -Severity : major -Frequency : only with mballoc3 code and deep extent trees -Bugzilla : 12861 -Description: ldiskfs_ext_search_right: bad header in inode: unexpected eh_depth -Details : a wrong check of extent headers in ldiskfs_ext_search_right() - can cause the filesystem to be remounted read-only. - -Severity : normal -Frequency : rare -Bugzilla : 13063 -Description: lfsck built against 1.4.x cannot run against 1.6.0 lustre -Details : the definition for OBD_IOC_GETNAME changed in 1.6.0. One of the - few external users of this ioctl number is lfsck's call to - llapi_lov_get_uuids() and this caused lfsck to fail at startup. - Add the old ioctl number to the handler so both old and new - lfsck can work. - -Severity : normal -Bugzilla : 11301 -Description: parallel lock callbacks -Details : Instead of sending blocking and completion callbacks as separated - requests, adding them to a set and sending in parallel. - -Severity : normal -Bugzilla : 12417 -Description: Disable most debugging by default -Details : To improve performance, disable most logging (for debug purposes) - by default. VFSTRACE, RPCTRACE, and DLMTRACE are now off by - default, and HA includes fewer messages. - -Severity : minor -Bugzilla : 12858 -Frequency : only run on patchless client. 
-Description: use do_facet on sanity.sh for test handling recoverables errors -Details : use do_facet instead of direct use sysctl for set fail_loc on OST - -Severity : normal -Bugzilla : 11013 -Description: hash tables for lists of nids, connections and uuids -Details : Hash tables noticeably help when a lot of clients connect to a - server, to faster identify duplicate connections or reconnects, - also to faster find export to evict in manual eviction case. - -Severity : normal -Bugzilla : 11190 -Description: Sometimes, when the server evict a client, and the client will - not be evicted as soon as possible. -Details : In enqueue req, the error was returned by intent, instead of - rq_status which make ptlrpc layer not detect this error, and - does not evict the client. So enqueue error should be returned - by rq_status. - -Severity : minor -Frequency : only at startup -Bugzilla : 11778 -Description: Delay client connections to MDT until fist MDT->OST connect -Details : If a client tried to create a new file before the MDT had - connected to any OSTs, the create would return EIO. Now - the client will simply block until the MDT connects to the - first OST and the create can succeed. - -Severity : normal -Frequency : only for SLES9 -Bugzilla : 12543 -Description: Routinely utilize latest Quadrics drivers in CFS releases -Details : Update patch qsnet-suse-2.6.patch. - -Severity : normal -Frequency : only for sles10 -Bugzilla : 12771 -Description: Update patches for SLES 10 SP1 kernel. -Details : Update the patch vfs_intent-2.6-sles10.patch. - -Severity : normal -Frequency : rare -Bugzilla : 12543 -Description: Routinely utilize latest Quadrics drivers in CFS releases -Details : Update patch qsnet-rhel4-2.6.patch. - -Severity : minor -Frequency : at statup only -Bugzilla : 12860 -Description: mds_lov_synchronize race leads to various problems -Details : simultaneous MDT->OST connections at startup can cause the - sync to abort, leaving the OSC in a bad state. 
- -Severity : normal -Bugzilla : 12975 -Frequency : rare -Description: Using wrong pointer in osc_brw_prep_request -Details : Access to array[-1] can produce panic if kernel compiled with - CONFIG_PAGE_ALLOC enabled - -Severity : enhancement -Bugzilla : 4900 -Description: Async OSC create to avoid the blocking unnecessarily. -Details : If a OST has no remain object, system will block on the creating - when need to create a new object on this OST. Now, always use - pre-created objects when available, instead of blocking on an - empty osc while others are not empty. If we must block, we block - for the shortest possible period of time. - -Severity : normal -Bugzilla : 13148 -Frequency : only in recovery -Description: Mark OST as early accessible if its SYNC has started. -Details : osc_precreate return flag early accessible if oscc marked as - OSCC_FLAG_SYNC_IN_PROGRESS. - -Severity : normal -Bugzilla : 13196 -Frequency : rare -Description: Sometimes precreate code can trigger create object on wrong ost -Details : Wrongly protected or not restored variables after precreate loop - can produce creation object on wrong ost. - -Severity : normal -Frequency : oss recovery -Bugzilla : 10800 -Description: llog_commit_thread cleanup should sync with llog_commit_thread - start -Details : llog_commit_thread_count should be synced between llog_commit - start and cleanup, so new llog_commit thread should not be started - when llog_commit threads being stopped to avoid accessing some - freed stuff. - -Severity : enhancement -Bugzilla : 11721 -Description: Add printing inode info into message about error in writepage. - --------------------------------------------------------------------------------- - -2007-05-03 Cluster File Systems, Inc. 
- * version 1.6.0.1 - * bug fixes - -Severity : normal -Frequency : on some architectures -Bugzilla : 12404 -Description: 1.6 client sometimes fails to mount from a 1.4 MDT -Details : Uninitialized flags sometimes cause configuration commands to - be skipped. - -Severity : normal -Frequency : patchless clients only -Bugzilla : 12391 -Description: missing __iget() symbol export -Details : The __iget() symbol export is missing. To avoid the need for - this on patchless clients the deathrow inode reaper is turned - off, and we depend on the VM to clean up old inodes. This - dependency was during via the fix for bug 12181. - -Severity : normal -Frequency : always -Bugzilla : 12848 -Description: sanity.sh fail: test_52b -Details : The ll_inode_to_ext_flags() has a glitch which makes MDS return - incorrect inode's flags to client. - --------------------------------------------------------------------------------- - -2007-04-19 Cluster File Systems, Inc. - * version 1.6.0 - * CONFIGURATION CHANGE. This version of Lustre WILL NOT - INTEROPERATE with older versions automatically. In many cases a - special upgrade step is needed. Please read the - user documentation before upgrading any part of a 1.4.x system. - * WARNING: Lustre configuration and startup changes are required with - this release. See https://mail.clusterfs.com/wikis/lustre/MountConf - for details. - * Support for kernels: - 2.4.21-47.0.1.EL (RHEL 3), - 2.6.5-7.283 (SLES 9), - 2.6.9-42.0.10.EL (RHEL 4), - 2.6.12.6 vanilla (kernel.org), - 2.6.16.27-0.9 (SLES10) - * Client support for unpatched kernels: - (see https://mail.clusterfs.com/wikis/lustre/PatchlessClient) - 2.6.16 - 2.6.19 vanilla (kernel.org), - 2.6.9-42.0.8EL (RHEL 4) - * Recommended e2fsprogs version: 1.39.cfs6 - * Note that reiserfs quotas are disabled on SLES 10 in this kernel - * bug fixes - -Severity : enhancement -Bugzilla : 4900 -Description: Async OSC create to avoid the blocking unnecessarily. 
-Details : If a OST has no remain object, system will block on the creating - when need to create a new object on this OST. Now, ways use - pre-created objects when available, instead of blocking on an - empty osc while others are not empty. If we must block, we block - for the shortest possible period of time. - -Severity : enhancement -Bugzilla : 8007 -Description: MountConf -Details : Lustre configuration is now managed via mkfs and mount - commands instead of lmc and lconf. New obd types (MGS, MGC) - are added for dynamic configuration management. See - https://mail.clusterfs.com/wikis/lustre/MountConf for - details. - -Severity : enhancement -Bugzilla : 4482 -Description: dynamic OST addition -Details : OSTs can now be added to a live filesystem - -Severity : enhancement -Bugzilla : 9851 -Description: startup order invariance -Details : MDTs and OSTs can be started in any order. Clients only - require the MDT to complete startup. - -Severity : enhancement -Bugzilla : 4899 -Description: parallel, asynchronous orphan cleanup -Details : orphan cleanup is now performed in separate threads for each - OST, allowing parallel non-blocking operation. - -Severity : enhancement -Bugzilla : 9862 -Description: optimized stripe assignment -Details : stripe assignments are now made based on ost space available, - ost previous usage, and OSS previous usage, in order to try - to optimize storage space and networking resources. - -Severity : enhancement -Bugzilla : 4226 -Description: Permanently set tunables -Details : All writable /proc/fs/lustre tunables can now be permanently - set on a per-server basis, at mkfs time or on a live - system. - -Severity : enhancement -Bugzilla : 10547 -Description: Lustre message v2 -Details : Add lustre message format v2. - -Severity : enhancement -Bugzilla : 9866 -Description: client OST exclusion list -Details : Clients can be started with a list of OSTs that should be - declared "inactive" for known non-responsive OSTs. 
- -Severity : normal -Bugzilla : 12123 -Description: ENOENT returned for valid filehandle during dbench. -Details : Check if a directory has children when invalidating dentries - associated with an inode during lock cancellation. This fixes - an incorrect ENOENT sometimes seen for valid filehandles during - testing with dbench. - -Severity : minor -Frequency : SFS test only (otherwise harmless) -Bugzilla : 6062 -Description: SPEC SFS validation failure on NFS v2 over lustre. -Details : Changes the blocksize for regular files to be 2x RPC size, - and not depend on stripe size. - -Severity : enhancement -Bugzilla : 10088 -Description: fine-grained SMP locking inside DLM -Details : Improve DLM performance on SMP systems by removing the single - per-namespace lock and replace it with per-resource locks. - -Severity : enhancement -Bugzilla : 9332 -Description: don't hold multiple extent locks at one time -Details : To avoid client eviction during large writes, locks are not - held on multiple stripes at one time or for very large writes. - Otherwise, clients can block waiting for a lock on a failed OST - while holding locks on other OSTs and be evicted. - -Severity : enhancement -Bugzilla : 9293 -Description: Multiple MD RPCs in flight. -Details : Further unserialise some read-only MDT RPCs - learn about intents. - To avoid overly-overloading MDT, introduce a limit on number of - MDT RPCs in flight for a single client and add /proc controls - to adjust this limit. - -Severity : enhancement -Bugzilla : 22484 -Description: client read/write statistics -Details : Add client read/write call usage stats for performance - analysis of user processes. - /proc/fs/lustre/llite/*/offset_stats shows non-sequential - file access. extents_stats shows chunk size distribution. - extents_stats_per_process show chunk size distribution per - user process. 
- -Severity : enhancement -Bugzilla : 22485 -Description: per-client statistics on server -Details : Add ldlm and operations statistics for each client in - /proc/fs/lustre/mds|obdfilter/*/exports/ - -Severity : enhancement -Bugzilla : 22486 -Description: improved MDT statistics -Details : Add detailed MDT operations statistics in - /proc/fs/lustre/mds/*/stats - -Severity : enhancement -Bugzilla : 10968 -Description: VFS operations stats -Details : Add client VFS call stats, trackable by pid, ppid, or gid - /proc/fs/lustre/llite/*/stats_track_[pid|ppid|gid] - -Severity : minor -Frequency : always -Bugzilla : 6380 -Description: Fix client-side osc byte counters -Details : The osc read/write byte counters in - /proc/fs/lustre/osc/*/stats are now working - -Severity : minor -Frequency : always as root on SLES -Bugzilla : 10667 -Description: Failure of copying files with lustre special EAs. -Details : Client side always return success for setxattr call for lustre - special xattr (currently only "trusted.lov"). - -Severity : minor -Frequency : always -Bugzilla : 10345 -Description: Refcount LNET uuids -Details : The global LNET uuid list grew linearly with every startup; - refcount repeated list entries instead of always adding to - the list. - -Severity : enhancement -Bugzilla : 2258 -Description: Dynamic service threads -Details : Within a small range, start extra service threads - automatically when the request queue builds up. - -Severity : major -Frequency : mixed-endian client/server environments -Bugzilla : 11214 -Description: mixed-endian crashes -Details : The new msg_v2 system had some failures in mixed-endian - environments. - -Severity : enhancement -Bugzilla : 11229 -Description: Easy OST removal -Details : OSTs can be permanently deactivated with e.g. 
'lctl - conf_param lustre-OST0001.osc.active=0' - -Severity : enhancement -Bugzilla : 11335 -Description: MGS proc entries -Details : Added basic proc entries for the MGS showing what filesystems - are served. - -Severity : enhancement -Bugzilla : 10998 -Description: provide MGS failover -Details : Added config lock reacquisition after MGS server failover. - -Severity : enhancement -Bugzilla : 11461 -Description: add Linux 2.4 support -Details : Added support for RHEL 2.4.21 kernel for 1.6 servers and clients - -Severity : normal -Bugzilla : 11330 -Description: a large application tries to do I/O to the same resource and dies - in the middle of it. -Details : Check the req->rq_arrival time after the call to - ost_brw_lock_get(), but before we do anything about - processing it & sending the BULK transfer request. This - should help move old stale pending locks off the queue as - quickly as obd_timeout. - -Severity : major -Frequency : when an incorrect nid is specified during startup -Bugzilla : 10734 -Description: ptlrpc connect to non-existant node causes kernel crash -Details : LNET can't be re-entered from an event callback, which - happened when we expire a message after the export has been - cleaned up. Instead, hand the zombie cleanup off to another - thread. - -Severity : enhancement -Bugzilla : 10902 -Description: plain/inodebits lock performance improvement -Details : Grouping plain/inodebits in granted list by their request modes - and bits policy, thus improving the performance of search through - the granted list. - -Severity : major -Frequency : only if OST filesystem is corrupted -Bugzilla : 9829 -Description: client incorrectly hits assertion in ptlrpc_replay_req() -Details : for a short time RPCs with bulk IO are in the replay list, - but replay of bulk IOs is unimplemented. If the OST filesystem - is corrupted due to disk cache incoherency and then replay is - started it is possible to trip an assertion. 
Avoid putting - committed RPCs into the replay list at all to avoid this issue. - -Severity : major -Frequency : liblustre (e.g. catamount) on a large cluster with >= 8 OSTs/OSS -Bugzilla : 11684 -Description: System hang on startup -Details : This bug allowed the liblustre (e.g. catamount) client to - return to the app before handling all startup RPCs. This - could leave the node unresponsive to lustre network traffic - and manifested as a server ptllnd timeout. - -Severity : enhancement -Bugzilla : 11667 -Description: Add "/proc/sys/lustre/debug_peer_on_timeout" -Details : liblustre envirable: LIBLUSTRE_DEBUG_PEER_ON_TIMEOUT - boolean to control whether to print peer debug info when a - client's RPC times out. - -Severity : minor -Frequency : only for kernels with patches from Lustre below 1.4.3 -Bugzilla : 11248 -Description: Remove old rdonly API -Details : Remove old rdonly API which unused from at least lustre 1.4.3 - -Severity : major -Frequency : only for devices with external journals -Bugzilla : 10719 -Description: Set external device read-only also -Details : During a commanded failover stop, we set the disk device - read-only while the server shuts down. We now also set any - external journal device read-only at the same time. - -Severity : minor -Frequency : when upgrading from 1.4 while trying to change parameters -Bugzilla : 11692 -Description: The wrong (new) MDC name was used when setting parameters for - upgraded MDT's. Also allows changing of OSC (and MDC) - parameters if --writeconf is specified at tunefs upgrade time. - -Severity : major -Frequency : when setting specific ost indicies -Bugzilla : 11149 -Description: QOS code breaks on skipped indicies -Details : Add checks for missing OST indicies in the QOS code, so OSTs - created with --index need not be sequential. 
- -Severity : enhancement -Bugzilla : 11264 -Description: Add uninit_groups feature to ldiskfs2 to speed up e2fsck -Details : The uninit_groups feature works in conjunction with the kernel - filesystem code (ldiskfs2 only) and e2fsprogs-1.39-cfs6 to speed - up the pass1 processing of e2fsck. This is a read-only feature - in ldiskfs2 only, so older kernels and current ldiskfs cannot - mount filesystems that have had this feature enabled. - -Severity : enhancement -Bugzilla : 10816 -Description: Improve multi-block allocation algorithm to avoid fragmentation -Details : The mballoc3 code (ldiskfs2 only) adds new mechanisms to improve - allocation locality and avoid filesystem fragmentation. - ------------------------------------------------------------------------------- - -2007-04-01 Cluster File Systems, Inc. - * version 1.4.10 - * Support for kernels: - 2.4.21-47.0.1.EL (RHEL 3) - 2.6.5-7.283 (SLES 9) - 2.6.9-42.0.10.EL (RHEL 4) - 2.6.12.6 vanilla (kernel.org) - 2.6.16.27-0.9 (SLES 10) - * Recommended e2fsprogs version: 1.39.cfs5 - - * Note that reiserfs quotas are disabled on SLES 10 in this kernel - * bug fixes - -Severity : critical -Frequency : occasional, depends on client load and configuration -Bugzilla : 12181, 12203 -Description: data loss for recently-modified files -Introduced : 1.4.6 -Details : In some cases it is possible that recently written or created - files may not be written to disk in a timely manner (this should - normally be within 30s unless client IO load is very high). - The problem appears as zero-length files or files that are a - multiple of 1MB in size after a client crash or client eviction - that are missing data at the end of the file. 
- - This problem is more likely to be hit on clients where files are - repeatedly created and unlinked in the same directory, clients - have a large amount of RAM, have many CPUs, the filesystem has - many OSTs, the clients are rebooted frequently, and/or the files - are not accessed by other nodes after being written. - - The presence of the problem can be detected by looking at - /proc/sys/fs/inode-state. If the first number (nr_inodes) is - smaller than the second (nr_unused) then dirty files will not - be flushed automatically to disk. "sync; sleep 10" should be - run several times on the node before unmounting it to update - Lustre (this is also safe to run on nodes without this problem). - - There is also a related kernel bug in the RHEL4 4 2.6.9 kernel - that can cause this same problem, so customers using that kernel - also need to update the kernel in addition to Lustre. In order - to properly fix this bug, the RHEL3 2.4.21 kernel is also updated. - - It is normal that files written just before a client crash (less - than 30s) may not yet have been flushed to disk, even for local - filesystems. - -Severity : normal -Frequency : frequent on thin XT3 nodes -Bugzilla : 10802 -Description: UUID collision on thin XT3 Linux nodes -Details : UUIDs on Compute Node Linux XT3 nodes were not generated - randomly, since we relied on an insufficiently-seeded PRNG. - -Severity : normal -Frequency : rare -Bugzilla : 11693 -Description: OSS hangs after "All ost request buffers busy" -Details : A deadlock between quota and journal operations caused OSS - hangs after printing "All ost request buffers busy." - -Severity : minor -Frequency : always on liblustre builds -Bugzilla : 11175 -Description: Cleanup compiler warnings on liblustre - -Severity : minor -Frequency : always on liblustre builds on XT3 -Bugzilla : 12146 -Description: LC_CONFIG_CDEBUG don't run while build liblustre on XT3. 
- -Frequency : always -Bugzilla : 3244 -Description: Addition of EXT3_FEATURE_RO_COMPAT_DIR_NLINKS flag for - > 32000 subdirectories -Details : Add EXT3_FEATURE_RO_COMPAT_DIR_NLINK flag to - EXT3_FEATURE_RO_COMPAT_SUPP. This flag will be set whenever - subdirectory count crosses 32000. This will aid e2fsck to - correctly handle more than 32000 subdirectories. - -Severity : major -Frequency : liblustre (e.g. catamount) on a large cluster with >= 8 OSTs/OSS -Bugzilla : 11684 -Description: System hang on startup -Details : This bug allowed the liblustre (e.g. catamount) client to - return to the app before handling all startup RPCs. This - could leave the node unresponsive to lustre network traffic - and manifested as a server ptllnd timeout. - -Severity : enhancement -Bugzilla : 11667 -Description: Add "/proc/sys/lustre/debug_peer_on_timeout" - (liblustre envirable: LIBLUSTRE_DEBUG_PEER_ON_TIMEOUT) - boolean to control whether to print peer debug info when a - client's RPC times out. - -Severity : normal -Frequency : always -Bugzilla : 10214 -Description: make O_SYNC working on 2.6 kernels -Details : 2.6 kernels use different method for mark pages for write, - so need add a code to lustre for O_SYNC work. - -Severity : minor -Frequency : always -Bugzilla : 11110 -Description: Failure to close file and release space on NFS -Details : Put inode details into lock acquired in ll_intent_file_open. - Use mdc_intent_lock in ll_intent_open to properly - detect all kind of errors unhandled by mdc_enqueue. - -Severity : major -Frequency : rare -Bugzilla : 10866 -Description: proc file read during shutdown sometimes raced obd removal, - causing node crash -Details : Add lock to prevent obd access after proc file removal. - -Severity : normal -Frequency : Only for files larger than 4GB on 32-bit clients. -Bugzilla : 11237 -Description: improperly doing page alignment of locks -Details : Modify lustre core code to use CFS_PAGE_* defines instead of - PAGE_*. 
Make CFS_PAGE_MASK a 64-bit mask. - -Severity : normal -Frequency : rarely -Bugzilla : 11203 -Description: RPCs being resent when they shouldn't be -Details : Some RPCs that should not be resent are being resent. This - can cause inconsistencies in the RPC state machine. Do not - resend such requests. - -Severity : normal -Frequency : rare, only with NFS export -Bugzilla : 11669 -Description: Crash on NFS re-export node -Details : under very unusual load conditions an assertion is hit in - ll_intent_file_open() - -Severity : major -Frequency : only if OST filesystem is corrupted -Bugzilla : 9829 -Description: client incorrectly hits assertion in ptlrpc_replay_req() -Details : for a short time RPCs with bulk IO are in the replay list, - but replay of bulk IOs is unimplemented. If the OST filesystem - is corrupted due to disk cache incoherency and then replay is - started it is possible to trip an assertion. Avoid putting - committed RPCs into the replay list at all to avoid this issue. - -Severity : normal -Frequency : always -Bugzilla : 10901 -Description: large O_DIRECT requests fail under memory pressure/fragmentation -Details : Large single O_DIRECT read and write calls can fail to allocate - a sufficiently large buffer to process the request. In case of - allocation failure the allocation is retried with a smaller - buffer and broken into smaller requests. - -Severity : enhancement -Bugzilla : 11563 -Description: Add -o localflock option to simulate old noflock behaviour. -Details : This will achieve local-only flock/fcntl locks coherentness. - -Severity : normal -Frequency : always -Bugzilla : 11090 -Description: versioning check is incomplete -Details : Checking the version difference of client vs. server, report - error if the gap is too big. - -Severity : major -Bugzilla : 11710 -Frequency : always -Description: add support PG_writeback bit -Details : add support for PG_writeback bit for Lustre, for more carefull - work with page cache in 2.6 kernel. 
This also fix some deadlocks - and remove hack for work O_SYNC with 2.6 kernel. - -Severity : enhancement -Bugzilla : 11264 -Description: Add uninit_groups feature to ldiskfs2 to speed up e2fsck -Details : The uninit_groups feature works in conjunction with the kernel - filesystem code (ldiskfs2 only) and e2fsprogs-1.39-cfs6 to speed - up the pass1 processing of e2fsck. This is a read-only feature - in ldiskfs2 only, so older kernels and current ldiskfs cannot - mount filesystems that have had this feature enabled. - -Severity : enhancement -Bugzilla : 10816 -Description: Improve multi-block allocation algorithm to avoid fragmentation -Details : The mballoc3 code (ldiskfs2 only) adds new mechanisms to improve - allocation locality and avoid filesystem fragmentation. - ------------------------------------------------------------------------------- - -2007-02-09 Cluster File Systems, Inc. - * version 1.4.9 - * Support for kernels: - 2.6.9-42.0.3.EL (RHEL 4) - 2.6.5-7.276 (SLES 9) - 2.4.21-47.0.1.EL (RHEL 3) - 2.6.12.6 vanilla (kernel.org) - 2.6.16.21-0.8 (SLES10) - * Recommended e2fsprogs version: 1.39.cfs2-0 - - * The backwards-compatible /proc/sys/portals symlink has been removed - in this release. Before upgrading, please ensure that you change - any configuration scripts or /etc/sysctl.conf files that access - /proc/sys/portals/* or sysctl portals.* to use the corresponding - entry in /proc/sys/lnet or sysctl lnet.*. This change can be made - in advance of the upgrade on any system running Lustre 1.4.6 or - newer, since /proc/sys/lnet was added in that version. - * Note that reiserfs quotas are disabled on SLES 10 in this kernel - * bug fixes - -Severity : minor -Frequency : only when quota is used -Bugzilla : 11286 -Description: avoid scanning export list for quota master -Details : Change the algorithms to avoid scanning export list in order - to improve the efficiency. 
- -Severity : critical -Frequency : MDS failover only, very rarely -Bugzilla : 11125 -Description: "went back in time" messages on mds failover -Details : The greatest transno may be lost when the current operation - finishes with an error (transno==0) and the client's last_rcvd - record is over-written. Save the greatest transno in the - mds_last_transno for this case. - -Severity : minor -Frequency : always for specific kernels and striping counts -Bugzilla : 11042 -Description: client may get "Matching packet too big" without ACL support -Details : Clients compiled without CONFIG_FS_POSIX_ACL get an error message - when trying to access files in certain configurations. The - clients should in fact be denied when mounting because they do - not understand ACLs. - -Severity : major -Frequency : Cray XT3 with more than 4000 clients and multiple jobs -Bugzilla : 10906 -Description: many clients connecting with IO in progress causes connect timeouts -Details : Avoid synchronous journal commits to avoid delays caused by many - clients connecting/disconnecting when bulk IO is in progress. - Queue liblustre connect requests on OST_REQUEST_PORTAL instead of - OST_IO_PORTAL to avoid delays behind potentially many pending - slow IO requests. - -Severity : normal -Frequency : occasionally with multiple writers to a single file -Bugzilla : 11081 -Description: shared writes to file may result in wrong size reported by stat() -Details : Allow growing of kms when extent lock is cancelled - -Severity : minor -Frequency : always with random mmap IO to multi-striped file -Bugzilla : 10919 -Description: mmap write might be lost if we are writing to a 'hole' in stripe -Details : Only if the hole is at the end of OST object so that kms is too - small. Fix is to increase kms accordingly in ll_nopage. 
- -Severity : normal -Frequency : rare, only if OST filesystem is inconsistent with MDS filesystem -Bugzilla : 11211 -Description: writes to a missing object would leak memory on the OST -Details : If there is an inconsistency between the MDS and OST filesystems, - such that the MDS references an object that doesn't exist, writes - to that object will leak memory due to incorrect cleanup in the - error handling path, eventually running out of memory on the OST. - -Severity : minor -Frequency : rare -Bugzilla : 11040 -Description: Creating too long symlink causes lustre errors -Details : Check symlink and name lengths before sending requests to MDS. - -Severity : normal -Frequency : only if flock is enabled (not on by default) -Bugzilla : 11415 -Description: posix locks not released on fd closure on 2.6.9+ -Details : We failed to add posix locks to list of inode locks on 2.6.9+ - kernels, this caused such locks not to be released on fd close and - then assertions on fs unmount about still used locks. - -Severity : minor -Frequency : MDS failover only, very rarely -Bugzilla : 11277 -Description: clients may get ASSERTION(granted_lock != NULL) -Details : When request was taking a long time, and a client was resending - a getattr by name lock request. The were multiple lock requests - with the same client lock handle and - mds_getattr_name->fixup_handle_for_resent_request found one of the - lock handles but later failed with ASSERTION(granted_lock != NULL). - -Severity : major -Frequency : rare -Bugzilla : 10891 -Description: handle->h_buffer_credits > 0, assertion failure -Details : h_buffer_credits is zero after truncate, causing assertion - failure. This patch extends the transaction or creates a new - one after truncate. - -Severity : normal -Frequency : NFS re-export or patchless client -Bugzilla : 11179, 10796 -Description: Crash on NFS re-export node (__d_move) -Details : We do not want to hash the dentry if we don't have a lock. 
- But if this dentry is later used in d_move, we'd hit uninitialised - list head d_hash, so we just do this to init d_hash field but - leave dentry unhashed. - -Severity : normal -Frequency : NFS re-export or patchless client -Bugzilla : 11135 -Description: NFS exports has problem with symbolic link -Details : lustre client didn't properly install dentry when re-exported - to NFS or running patchless client. - -Severity : normal -Frequency : NFS re-export or patchless client -Bugzilla : 10796 -Description: Various nfs/patchless fixes. -Details : fixes reuse disconected alias for lookup process - this fixes - warning "find_exported_dentry: npd != pd", - fix permission error with open files at nfs. - fix apply umask when do revalidate. - -Severity : normal -Frequency : occasional -Bugzilla : 11191 -Description: Crash on NFS re-export node -Details : calling clear_page() on the wrong pointer triggered oops in - generic_mapping_read(). - -Severity : normal -Frequency : rarely, using O_DIRECT IO -Bugzilla : 10903 -Description: unaligned directio crashes client with LASSERT -Details : check for unaligned buffers before trying any requests. - -Severity : major -Frequency : rarely, using CFS RAID5 patches in non-standard kernel series -Bugzilla : 11313 -Description: stale data returned from RAID cache -Details : If only a small amount of IO is done to the RAID device before - reading it again it is possible to get stale data from the RAID - cache instead of reading it from disk. - -Severity : normal -Frequency : always for sles10 kernel -Bugzilla : 10947 -Description: sles10 support -Details : ll_follow_link: compile fixes and using of nd_set_link - under newer kernels. 
- -Severity : major -Frequency : depends on arch, kernel and compiler version, always on sles10 - kernel and x86_64 -Bugzilla : 11562 -Description: recursive or deep enough symlinks cause stack overflow -Details : getting rid of large stack-allocated variable in - __vfs_follow_link - -Severity : minor -Frequency : depends on hardware -Bugzilla : 11540 -Description: lustre write performance loss in the SLES10 kernel -Details : the performance loss is caused by using of write barriers in the - ext3 code. The SLES10 kernel turns barrier support on by - default. The fix is to undo that change for ldiskfs. - ------------------------------------------------------------------------------- - -2006-12-09 Cluster File Systems, Inc. - * version 1.4.8 - * Support for kernels: - 2.6.9-42.0.3EL (RHEL 4) - 2.6.5-7.276 (SLES 9) - 2.4.21-47.0.1.EL (RHEL 3) - 2.6.12.6 vanilla (kernel.org) - * bug fixes - -Severity : major -Frequency : quota enabled and large files being deleted -Bugzilla : 10707 -Description: releasing more than 4GB of quota at once hangs OST -Details : If a user deletes more than 4GB of files on a single OST it - will cause the OST to spin in an infinite loop. Release - quota in < 4GB chunks, or use a 64-bit value for 1.4.7.1+. - -Severity : minor -Frequency : rare -Bugzilla : 10845 -Description: statfs data retrieved from /proc may be stale or zero -Details : When reading per-device statfs data from /proc, in the - {kbytes,files}_{total,free,avail} files, it may appear - as zero or be out of date. - -Severity : minor -Frequency : systems with MD RAID1 external journal devices -Bugzilla : 10832 -Description: lconf's call to blkid is confused by RAID1 journal devices -Details : Use the "blkid -l" flag to locate the MD RAID device instead - of returning all block devices that match the journal UUID. 
- -Severity : normal -Frequency : always, for aggregate stripe size over 4GB -Bugzilla : 10725 -Description: "lfs setstripe" fails assertion when setting 4GB+ stripe width -Details : Using "lfs setstripe" to set stripe size * stripe count over 4GB - will fail the kernel with "ASSERTION(lsm->lsm_xfersize != 0)" - -Severity : minor -Frequency : always if "lfs find" used on a local file/directory -Bugzilla : 10864 -Description: "lfs find" segfaults if used on a local file/directory -Details : The case where a directory component was not specified wasn't - handled correctly. Handle this properly. - -Severity : normal -Frequency : always on ppc64 -Bugzilla : 10634 -Description: the write to an ext3 filesystem mounted with mballoc got stuck -Details : ext3_mb_generate_buddy() uses find_next_bit() which does not - perform endianness conversion. - -Severity : major -Frequency : rarely (truncate to non-zero file size after write under load) -Bugzilla : 10730, 10687 -Description: Files padded with zeros to next 4K multiple -Details : With filesystems mounted using the "extents" option (2.6 kernels) - it is possible that files that are truncated to a non-zero size - immediately after being written are filled with zero bytes beyond - the truncated size. No file data is lost. - -Severity : enhancement -Bugzilla : 10452 -Description: Allow recovery/failover for liblustre clients. -Details : liblustre clients were unaware of failover configurations until - now. - -Severity : enhancement -Bugzilla : 10743 -Description: user file locks should fail when not mounting with flock option -Details : Set up an error-returning stub in ll_file_operations.lock field - to prevent incorrect behaviour when client is mounted without - flock option. Also, set up properly f_op->flock field for - RHEL4 kernels. 
- -Severity : minor -Frequency : always on ia64 -Bugzilla : 10905 -Description: "lfs df" loops on printing out MDS statfs information -Details : The obd_ioctl_data was not initialized and in some systems - this caused a failure during the ioctl that did not return - an error. Initialize the struct and return an error on failure. - -Severity : minor -Frequency : SLES 9 only -Bugzilla : 10667 -Description: Error of copying files with lustre special EAs as root -Details : Client side always return success for setxattr call for lustre - special xattr (currently only "trusted.lov"). - -Severity : normal -Frequency : rarely on clusters with both ia64+i386 clients -Bugzilla : 10672 -Description: ia64+i686 clients doing shared IO on the same file may LBUG -Details : In rare cases when both ia64+i686 (or other mixed-PAGE_SIZE) - clients are doing concurrent writes to the same file it is - possible that the ia64 clients may LASSERT because the OST - extent locks are not PAGE_SIZE aligned. Ensure that grown - locks are always aligned on the request boundary. - -Severity : normal -Frequency : specific use, occasional -Bugzilla : 7040 -Description: Overwriting in use executable truncates on-disk binary image -Details : If one node attempts to overwrite an executable in use by - another node, we now correctly return ETXTBSY instead of - truncating the file. - -Severity : enhancement -Bugzilla : 4900 -Description: Async OSC create to avoid the blocking unnecessarily. -Details : If a OST has no remain object, system will block on the creating - when need to create a new object on this OST. Now, ways use - pre-created objects when available, instead of blocking on an - empty osc while others are not empty. If we must block, we block - for the shortest possible period of time. 
- -Severity : normal -Frequency : rare -Bugzilla : 2707 -Description: chmod on Lustre root is propagated to other clients -Details : Re-validate root's dentry in ll_lookup_it to avoid having it - invalid by the follow_mount time. - -Severity : minor -Frequency : liblustre clients only -Bugzilla : 10883 -Description: Race in 'instant cancel' lock handling could lead to such locks - never to be granted in case of SMP MDS -Details : Do not destroy not yet granted but cbpending locks in - handle_enqueue - -Severity : minor -Frequency : replay/resend of open -Bugzilla : 10991 -Description: non null lock assertion failure in mds_intent_policy -Details : Trying to replay/resend lockless open requests resulted in - mds_open() returning 0 with no lock. Now it sets a flag if - a lock is going to be returned. - -Severity : enhancement -Bugzilla : 10889 -Description: Checksum enhancements -Details : New checksum enhancements allow for resending RPCs that failed - checksum checks. - -Severity : enhancement -Bugzilla : 7376 -Description: Tunables on number of dirty pages in cache -Details : Allow to set limit on number of dirty pages cached. - -Severity : normal -Frequency : rare -Bugzilla : 10643 -Description: client crash on unmount - lock still has references -Details : In some error handling cases it was possible to leak a lock - reference on a client while accessing a file. This was not - harmful to the client during operation, but would cause the - client to crash when the filesystem is unmounted. - -Severity : normal -Frequency : specific case, rare -Bugzilla : 10921 -Description: ETXTBSY on mds though file not in use by client -Details : ETXTBSY is no longer incorrectly returned when attempting to - chmod or chown a directory that the user previously tried to - execute or a currently-executing binary. 
- -Severity : major -Frequency : extremely rare except on liblustre-based clients -Bugzilla : 10480 -Description: Lustre space not freed when files are deleted -Details : Clean up open-unlinked files after client eviction. Previously - the unlink was skipped and the files remained as orphans. - -Severity : normal -Frequency : rare -Bugzilla : 10999 -Description: OST failure "would be an LBUG" in waiting_locks_callback() -Details : In some cases it was possible to send a blocking callback to - a client doing a glimpse, even though that client didn't get - a lock granted. When the glimpse lock is cancelled on the OST - the freed lock is left on the waiting list and corrupted the list. - -Severity : major -Frequency : all core dumps -Bugzilla : 11103 -Description: Broke core dumps to lustre -Details : Negative dentry may be unhashed if parent does not have UPDATE - lock, but some callers, e.g. do_coredump, expect dentry to be - hashed after successful create, hash it in ll_create_it. - ------------------------------------------------------------------------------- - -2006-09-13 Cluster File Systems, Inc. - * version 1.4.7.1 - * Support for kernels: - 2.6.9-42.0.2.EL (RHEL 4) - 2.6.5-7.276 (SLES 9) - 2.4.21-40.EL (RHEL 3) - 2.6.12.6 vanilla (kernel.org) - * bug fix - -Severity : major -Frequency : always on RHEL 3 -Bugzilla : 10867 -Description: Number of open files grows over time -Details : The number of open files grows over time, whether or not - Lustre is started. This was due to a filp leak introduced - by one of our kernel patches. - ------------------------------------------------------------------------------- - -2006-08-20 Cluster File Systems, Inc. 
- * version 1.4.7 - * Support for kernels: - 2.6.9-42.EL (RHEL 4) - 2.6.5-7.267 (SLES 9) - 2.4.21-40.EL (RHEL 3) - 2.6.12.6 vanilla (kernel.org) - * bug fixes - -Severity : major -Frequency : rare -Bugzilla : 5719, 9635, 9792, 9684 -Description: OST (or MDS) trips assertions in (re)connection under heavy load -Details : If a server is under heavy load and cannot reply to new - connection requests before the client resends the (re)connect, - the connection handling code can behave badly if two service - threads are concurrently handling separate (re)connections from - the same client. Add better locking to the connection handling - code, and ensure that only a single connection will be processed - for a given client UUID, even if the lock is dropped. - -Severity : enhancement -Bugzilla : 3627 -Description: add TCP zero-copy support to kernel -Details : Add support to the kernel TCP stack to allow zero-copy bulk - sends if the hardware supports scatter-gather and checksumming. - This allows socklnd to do client-write and server-read more - efficiently and reduce CPU utilization from skbuf copying. - -Severity : minor -Frequency : only if NFS exporting from client -Bugzilla : 10258 -Description: NULL pointer deref in ll_iocontrol() if chattr mknod file -Details : If setting attributes on a file created under NFS that had - never been opened it would be possible to oops the client - if the file had no objects. - -Severity : major -Frequency : rare -Bugzilla : 9326, 10402, 10897 -Description: client crash in ptlrpcd_wake() thread when sending async RPC -Details : It is possible that ptlrpcd_wake() dereferences a freed async - RPC. In rare cases the ptlrpcd thread already processed the RPC - before ptlrpcd_wake() was called and the request was freed. 
- -Severity : minor -Frequency : always for liblustre -Bugzilla : 10290 -Description: liblustre client does MDS+OSTs setattr RPC for each write -Details : When doing a write from a liblustre client, the client - incorrectly issued an RPC to the MDS and each OST the file was - striped over in order to update the timestamps. When writing - with small chunks and many clients this could overwhelm the MDS - with RPCs. In all cases it would slow down the write because - these RPCs are unnecessary. - -Severity : enhancement -Bugzilla : 9340 -Description: allow number of MDS service threads to be changed at module load -Details : It is now possible to change the number of MDS service threads - running. Adding "options mds mds_num_threads={N}" to the MDS's - /etc/modprobe.conf will set the number of threads for the next - time Lustre is restarted (assuming the "mds" module is also - reloaded at that time). The default number of threads will - stay the same, 32 for most systems. - -Severity : major -Frequency : rare -Bugzilla : 10300 -Description: OST crash if filesystem is unformatted or corrupt -Details : If an OST is started on a device that has never been formatted - or if the filesystem is corrupt and cannot even mount then the - error handling cleanup routines would dereference a NULL pointer. - -Severity : normal -Frequency : rare -Bugzilla : 10047 -Description: NULL pointer deref in llap_from_page. -Details : get_cache_page_nowait can return a page with NULL (or otherwise - incorrect) mapping if the page was truncated/reclaimed while it was - searched for. Check for this condition and skip such pages when - doing readahead. Introduce extra check to llap_from_page() to - verify page->mapping->host is non-NULL (so page is not anonymous). - -Severity : minor -Frequency : Sometimes when using sys_sendfile -Bugzilla : 7020 -Description: "page not covered by a lock" warnings from ll_readpage -Details : sendfile called ll_readpage without right page locks present. 
- Now we introduced ll_file_sendfile that does necessary locking - around call to generic_file_sendfile() much like we do in - ll_file_read(). - -Severity : normal -Frequency : with certain MDS communication failures at client mount time -Bugzilla : 10268 -Description: NULL pointer deref after failed client mount -Details : a client connection request may delayed by the network layer - and not be sent until after the PTLRPC layer has timed out the - request. If the client fails the mount immediately it will try - to clean up before the network times out the request. Add a - reference from the request import to the obd device and delay - the cleanup until the network drops the request. - -Severity : normal -Frequency : occasionally during client (re)connect -Bugzilla : 9387 -Description: assertion failure during client (re)connect -Details : processing a client connection request may be delayed by the - client or server longer than the client connect timeout. This - causes the client to resend the connection request. If the - original connection request is replied in this interval, the - client may trip an assertion failure in ptlrpc_connect_interpret() - which thought it would be the only running connect process. - -Severity : normal -Frequency : only with obd_echo servers and clients that are rebooted -Bugzilla : 10140 -Description: kernel BUG accessing uninitialized data structure -Details : When running an obd_echo server it did not start the ping_evictor - thread, and when a client was evicted an uninitialized data - structure was accessed. Start the ping_evictor in the RPC - service startup instead of the OBD startup. - -Severity : enhancement -Bugzilla : 10193 (patchless) -Description: Remove dependency on various unexported kernel interfaces. -Details : No longer need reparent_to_init, exit_mm, exit_files, - sock_getsockopt, filemap_populate, FMODE_EXEC, put_filp. 
- -Severity : minor -Frequency : rare (only users of deprecated and unsupported LDAP config) -Bugzilla : 9337 -Description: write_conf for zeroconf mount queried LDAP incorrectly for client -Details : LDAP apparently contains 'lustreName' attributes instead of - 'name'. A simple remapping of the name is sufficient. - -Severity : major -Frequency : rare (only with non-default dump_on_timeout debug enabled) -Bugzilla : 10397 -Description: waiting_locks_callback trips kernel BUG if client is evicted -Details : Running with the dump_on_timeout debug flag turned on makes - it possible that the waiting_locks_callback() can try to dump - the Lustre kernel debug logs from an interrupt handler. Defer - this log dumping to the expired_lock_main() thread. - -Severity : enhancement -Bugzilla : 10420 -Description: Support NFS exporting on 2.6 kernels. -Details : Implement non-rawops metadata methods for NFS server to use without - changing NFS server code. - -Severity : normal -Frequency : very rare (synthetic metadata workload only) -Bugzilla : 9974 -Description: two racing renames might cause an MDS thread to deadlock -Details : Running the "racer" program may cause one MDS thread to rename - a file from being the source of a rename to being the target of - a rename at exactly the same time that another thread is doing - so, and the second thread has already enqueued these locks after - doing a lookup of the target and is trying to relock them in - order. Ensure that we don't try to re-lock the same resource. - -Severity : major -Frequency : only very large systems with liblustre clients -Bugzilla : 7304 -Description: slow eviction of liblustre clients with the "evict_by_nid" RPC -Details : Use asynchronous set_info RPCs to send the "evict_by_nid" to - all OSTs in parallel. This allows the eviction of stale liblustre - clients to proceed much faster than if they were done in series, - and also offers similar improvements for other set_info RPCs. 
- -Severity : minor -Frequency : common -Bugzilla : 10265 -Description: excessive CPU usage during initial read phase on client -Details : During the initial read phase on a client, it would aggressively - retry readahead on the file, consuming too much CPU and impacting - performance (since 1.4.5.8). Improve the readahead algorithm - to avoid this, and also improve some other common cases (read - of small files in particular, where "small" is files smaller than - /proc/fs/lustre/llite/*/max_read_ahead_whole_mb, 2MB by default). - -Severity : minor -Frequency : rare -Bugzilla : 10450 -Description: MDS crash when receiving packet with unknown intent. -Details : Do not LBUG in unknown intent case, just return -EFAULT - -Severity : enhancement -Bugzilla : 9293, 9385 -Description: MDS RPCs are serialised on client. This is unnecessary for some. -Details : Do not serialize getattr (non-intent version) and statfs. - -Severity : minor -Frequency : occasional, when OST network is overloaded/intermittent -Bugzilla : 10416 -Description: client evicted by OST after bulk IO timeout -Details : If a client sends a bulk IO request (read or write) the OST - may evict the client if it is unresponsive to its data GET/PUT - request. This is incorrect if the network is overloaded (takes - too long to transfer the RPC data) or dropped the OST GET/PUT - request. There is no need to evict the client at all, since - the pinger and/or lock callbacks will handle this, and the - client can restart the bulk request. - -Severity : minor -Frequency : Always when mmapping file with no objects -Bugzilla : 10438 -Description: client crashes when mmapping file with no objects -Details : Check that we actually have objects in a file before doing any - operations on objects in ll_vm_open, ll_vm_close and - ll_glimpse_size. 
- -Severity : minor -Frequency : Rare -Bugzilla : 10484 -Description: Request leak when working with deleted CWD -Details : Introduce advanced request refcount tracking for requests - referenced from lustre intent. - -Severity : Enhancement -Bugzilla : 10482 -Description: Cache open file handles on client. -Details : MDS now will return special lock along with openhandle, if - requested and client is allowed to hold openhandle, even if unused, - until such a lock is revoked. Helps NFS a lot, since NFS is opening - closing files for every read/write operation. - -Severity : Enhancement -Bugzilla : 9291 -Description: Cache open negative dentries on client when possible. -Details : Guard negative dentries with UPDATE lock on parent dir, drop - negative dentries on lock revocation. - -Severity : minor -Frequency : Always -Bugzilla : 10510 -Description: Remounting a client read-only wasn't possible with a zconf mount -Details : It wasn't possible to remount a client read-only with llmount. - -Severity : enhancement -Description: Include MPICH 1.2.6 Lustre ADIO interface patch -Details : In lustre/contrib/ or /usr/share/lustre in RPM a patch for - MPICH is included to add Lustre-specific ADIO interfaces. - This is based closely on the UFS ADIO layer and only differs - in file creation, in order to allow the OST striping to be set. - This is user-contributed code and not supported by CFS. - -Severity : minor -Frequency : Always -Bugzilla : 9486 -Description: extended inode attributes (immutable, append-only) work improperly - when 2.4 and 2.6 kernels are used on client/server or vice versa -Details : Introduce kernel-independent values for these flags. - -Severity : enhancement -Frequency : Always -Bugzilla : 10248 -Description: Allow fractional MB tunings for lustre in /proc/ filesystem. -Details : Many of the /proc/ tunables can only be tuned at a megabyte - granularity. Now, Fractional MB granularity is supported, - this is very useful for low memory system. 
- -Severity : enhancement -Bugzilla : 9292 -Description: Getattr by fid -Details : Getting a file attributes by its fid, obtaining UPDATE|LOOKUP - locks, avoids extra getattr rpc requests to MDS, allows '/' to - have locks and avoids getattr rpc requests for it on every stat. - -Severity : major -Frequency : Always, for filesystems larger than 2TB -Bugzilla : 6191 -Description: ldiskfs crash at mount for filesystem larger than 2TB with mballoc -Details : Kernel kmalloc limits allocations to 128kB and this prevents - filesystems larger than 2TB to be mounted with mballoc enabled. - -Severity : critical -Frequency : Always, for 32-bit kernel without CONFIG_LBD and filesystem > 2TB -Bugzilla : 6191 -Description: filesystem corruption for non-standard kernels and very large OSTs -Details : If a 32-bit kernel is compiled without CONFIG_LBD enabled and a - filesystem larger than 2TB is mounted then the kernel will - silently corrupt the start of the filesystem. CONFIG_LBD is - enabled for all CFS-supported kernels, but the possibility of - this happening with a modified kernel config exists. - -Severity : enhancement -Bugzilla : 10462 -Description: add client O_DIRECT support for 2.6 kernels -Details : It is now possible to do O_DIRECT reads and writes to files - in the Lustre client mountpoint on 2.6 kernel clients. - -Severity : enhancement -Bugzilla : 10446 -Description: parallel glimpse, setattr, statfs, punch, destroy requests -Details : Sends glimpse, setattr, statfs, punch, destroy requests to OSTs in - parallel, not waiting for response from every OST before sending - a rpc to the next OST. - -Severity : minor -Frequency : rare -Bugzilla : 10150 -Description: setattr vs write race when updating file timestamps -Details : Client processes that update a file timestamp into the past - right after writing to the file (e.g. 
tar) it is possible that - the updated file modification time can be reset to the current - time due to a race between processing the setattr and write RPC. - -Severity : enhancement -Bugzilla : 10318 -Description: Bring 'lfs find' closer in line with regular Linux find. -Details : lfs find util supports -atime, -mtime, -ctime, -maxdepth, -print, - -print0 options and obtains all the needed info through the lustre - ioctls. - -Severity : enhancement -Bugzilla : 6221 -Description: support up to 1024 configured devices on one node -Details : change obd_dev array from statically allocated to dynamically - allocated structs as they are first used to reduce memory usage - -Severity : minor -Frequency : rare -Bugzilla : 10437 -Description: Flush dirty partially truncated pages during truncate -Details : Immediatelly flush partially truncated pages in filter_setattr, - this way we completely avoid having any pages in page cache on OST - and can retire ugly workarounds during writes to flush such pages. - -Severity : minor -Frequency : rare -Bugzilla : 10409 -Description: i_sem vs transaction deadlock in mds_obd_destroy during unlink. -Details : protect inode from truncation within vfs_unlink() context - just take a reference before calling vfs_unlink() and release it - when parent's i_sem is free. - -Severity : minor -Frequency : always, if extents are used on OSTs -Bugzilla : 10703 -Description: index ei_leaf_hi (48-bit extension) is not zeroed in extent index -Details : OSTs using the extents format would not zero the high 16 bits of - the index physical block number. This is not a problem for any - OST filesystems smaller than 16TB, and no kernels support ext3 - filesystems larger than 16TB yet. This is fixed in 1.4.7 (all - new/modified files) and can be fixed for existing filesystems - with e2fsprogs-1.39-cfs1. 
- -Severity : minor -Frequency : rare -Bugzilla : 9387 -Description: import connection selection may be incorrect if timer wraps -Details : Using a 32-bit jiffies timer with HZ=1000 may cause backup - import connections to be ignored if the 32-bit jiffies counter - wraps. Use a 64-bit jiffies counter. - -Severity : major -Frequency : during server recovery -Bugzilla : 10479 -Description: crash after server is denying duplicate export -Details : If clients are resending connect requests to the server, the - server refuses to allow a client to connect multiple times. - Fixed a bug in the handling of this case. - -Severity : minor -Frequency : very large clusters immediately after boot -Bugzilla : 10083 -Description: LNET request buffers exhausted under heavy short-term load -Details : If a large number of client requests are generated on a service - that has previously never seen so many requests it is possible - that the request buffer growth cannot keep up with the spike in - demand. Instead of dropping incoming requests, they are held in - the LND until the RPC service can accept more requests. - -Severity : minor -Frequency : Sometimes during replay -Bugzilla : 9314 -Description: Assertion failure in ll_local_open after replay. -Details : If replay happened on an open request reply before we were able - to set replay handler, reply will become not swabbed tripping the - assertion in ll_local_open. Now we set the handler right after - recognising of open request - -Severity : minor -Frequency : very rare -Bugzilla : 10584 -Description: kernel reports "badness in vsnprintf" -Details : Reading from the "recovery_status" /proc file in small chunks - may cause a negative length in lprocfs_obd_rd_recovery_status() - call to vsnprintf() (which is otherwise harmless). Exit early - if there is no more space in the output buffer. 
- -Severity : enhancement -Bugzilla : 2259 -Description: clear OBD RPC statistics by writing to them -Details : It is now possible to clear the OBD RPC statistics by writing - to the "stats" file. - -Severity : minor -Frequency : rare -Bugzilla : 10641 -Description: Client mtime is not the same on different clients after utimes -Details : In some cases, the client was using the utimes() syscall on - a file cached on another node. The clients now validate the - ctime from the MDS + OSTs to determine which one is right. - -Severity : minor -Frequency : always -Bugzilla : 10611 -Description: Inability to activate failout mode -Details : lconf script incorrectly assumed that in python string's numeric - value is used in comparisons. - -Severity : minor -Frequency : always with multiple stripes per file -Bugzilla : 10671 -Description: Inefficient object allocation for multi-stripe files -Details : When selecting which OSTs to stripe files over, for files with - a stripe count that divides evenly into the number of OSTs, - the MDS is always picking the same starting OST for each file. - Return the OST selection heuristic to the original design. - -Severity : minor -Frequency : rare -Bugzilla : 10673 -Description: mount failures may take full timeout to return an error -Details : Under some heavy load conditions it is possible that a - failed mount can wait for the full obd_timeout interval, - possibly several minutes, before reporting an error. - Instead return an error as soon as the status is known. - ------------------------------------------------------------------------------- - -2006-02-14 Cluster File Systems, Inc. - * version 1.4.6 - * WIRE PROTOCOL CHANGE. This version of Lustre networking WILL NOT - INTEROPERATE with older versions automatically. Please read the - user documentation before upgrading any part of a live system. - * WARNING: Lustre networking configuration changes are required with - this release. 
See https://bugzilla.clusterfs.com/show_bug.cgi?id=10052 - for details. - * bug fixes - * Support for kernels: - 2.6.9-22.0.2.EL (RHEL 4) - 2.6.5-7.244 (SLES 9) - 2.6.12.6 vanilla (kernel.org) - - -Severity : enhancement -Bugzilla : 7981/8208 -Description: Introduced Lustre Networking (LNET) -Details : LNET is new networking infrastructure for Lustre, it includes - a reorganized network configuration mode (see the user - documentation for full details) as well as support for routing - between different network fabrics. Lustre Networking Devices - (LNDs) for the supported network fabrics have also been - created for this new infrastructure. - -Severity : enhancement -Description: Introduced Access control lists -Details : clients can set ACLs on files and directories in order to have - more fine-grained permissions than the standard Unix UGO+RWX. - The MDS must be started with the "-o acl" mount option. - -Severity : enhancement -Description: Introduced filesystem quotas -Details : Administrators may now establish per-user quotas on the - filesystem. - -Severity : enhancement -Bugzilla : 7982 -Description: Configuration change for the XT3 - The PTLLND is now used to run Lustre over Portals on the XT3 - The configure option(s) --with-cray-portals are no longer used. - Rather --with-portals= is used to - enable building on the XT3. In addition to enable XT3 specific - features the option --enable-cray-xt3 must be used. - -Severity : major -Frequency : rare -Bugzilla : 7407 -Description: Running on many-way SMP OSTs can trigger oops in llcd_send() -Details : A race between allocating a new llcd and re-getting the llcd_lock - allowed another thread to grab newly-allocated llcd. 
- -Severity : enhancement -Bugzilla : 7116 -Description: 2.6 OST async journal commit and locking fix to improve performance -Details : The filter_direct_io()+filter_commitrw_write() journal commits for - 2.6 kernels are now async as they already were in 2.4 kernels so - that they can commit concurrently with the network bulk transfer. - For block-allocated files the filter allocation semaphore is held - to avoid filesystem fragmentation during allocation. BKL lock - removed for 2.6 xattr operations where it is no longer needed. - -Severity : minor -Frequency : rare -Bugzilla : 8320 -Description: lconf incorrectly determined whether two IP networks could talk -Details : In some more complicated routing and multiple-network - configurations, lconf will avoid trying to make a network - connection to a disjoint part of the IP space. It was doing the - math incorrectly for one set of cases. - -Severity : major -Frequency : rare -Bugzilla : 7359 -Description: Fix for potential infinite loop processing records in an llog. -Details : If an llog record is corrupted/zeroed, it is possible to loop - forever in llog_process(). Validate the llog record length - and skip the remainder of the block on error. - -Severity : minor -Frequency : occasional (liblustre only) -Bugzilla : 6363 -Description: liblustre could not open files whose last component is a symlink -Details : sysio_path_walk() would incorrectly pass the open intent to - intermediate path components. 
- -Severity : minor -Frequency : rare (liblustre only with non-standard tuning) -Bugzilla : 7201 (7350) -Description: Tuning the MDC DLM LRU size to zero triggers client LASSERT -Details : llu_lookup_finish_locks() tries to set lock data on a lock - after it has been released, only do this for referenced locks - -Severity : enhancement -Bugzilla : 7328 -Description: specifying an (invalid) directory default stripe_size of -1 - would reset the directory default striping -Details : stripe_size -1 was used internally to signal directory stripe - removal, now use "all default" to signal dir stripe removal - as a directory striping of "all default" is not useful - -Severity : minor -Frequency : common for large clusters running liblustre clients -Bugzilla : 7198 -Description: doing an ls when liblustre clients are running is slow -Details : sending a glimpse AST to a liblustre client waits for every AST - to time out, as liblustre clients will not respond. Since they - cannot cache data we refresh the OST lock LVB from disk instead. - -Severity : enhancement -Bugzilla : 7198 -Description: doing an ls at the same time as file IO can be slow -Details : enqueue and other "small" requests can be blocked behind many - large IO requests. Create a new OST IO portal for non-IO - requests so they can be processed faster. - -Severity : minor -Frequency : rare (only HPUX clients mounting unsupported re-exported NFS vol) -Bugzilla : 5781 -Description: an HPUX NFS client would get -EACCESS when ftruncate()ing a newly - created file with mode 000 -Details : the Linux NFS server relies on an MDS_OPEN_OWNEROVERRIDE hack to - allow an ftruncate() as a non-root user to a file with mode 000. 
- Lustre now respects this flag to disable mode checks when - truncating a file owned by the user - -Severity : minor -Frequency : liblustre-only, when liblustre client dies unexpectedly or becomes - busy -Bugzilla : 7313 -Description: Revoking locks from clients that went dead or catatonic might take - a lot of time. -Details : New lock flags FL_CANCEL_ON_BLOCK used by liblustre makes - cancellation of such locks instant on servers without waiting for - any reply from clients. Clients drops these locks when cancel - notification from server is received without replying. - -Severity : minor -Frequency : liblustre-only, when liblustre client dies or becomes busy -Bugzilla : 7311 -Description: Doing ls on Linux clients can take a long time with active - liblustre clients -Details : Liblustre client cannot handle ASTs in timely manner, so avoid - granting such locks to it in the first place if possible. Locks - are taken by proxy on the OST during the read or write and - dropped immediately afterward. Add connect flags handling, do - not grant locks to liblustre clients for glimpse ASTs. - -Severity : enhancement -Bugzilla : 6252 -Description: Improve read-ahead algorithm to avoid excessive IO for random reads -Details : Existing read-ahead algorithm is tuned for the case of streamlined - sequential reads and behaves badly with applications doing random - reads. Improve it by reading ahead at least read region, and - avoiding excessive large RPC for small reads. - -Severity : enhancement -Bugzilla : 8330 -Description: Creating more than 1000 files for a single job may cause a load - imbalance on the OSTs if there are also a large number of OSTs. -Details : qos_prep_create() uses an OST index reseed value that is an - even multiple of the number of available OSTs so that if the - reseed happens in the middle of the object allocation it will - still utilize the OSTs as uniformly as possible. 
- -Severity : major -Frequency : rare -Bugzilla : 8322 -Description: OST or MDS may oops in ping_evictor_main() -Details : ping_evictor_main() drops obd_dev_lock if deleting a stale export - but doesn't restart at beginning of obd_exports_timed list - afterward. - -Severity : enhancement -Bugzilla : 7304 -Description: improve by-nid export eviction on the MDS and OST -Details : allow multiple exports with the same NID to be evicted at one - time without re-searching the exports list. - -Severity : major -Frequency : rare, only with supplementary groups enabled on SMP 2.6 kernels -Bugzilla : 7273 -Description: MDS may oops in groups_free() -Details : in rare race conditions a newly allocated group_info struct is - freed again, and this can be NULL. The 2.4 compatibility code - for groups_free() checked for a NULL pointer, but 2.6 did not. - -Severity : minor -Frequency : common for liblustre clients doing little filesystem IO -Bugzilla : 9352, 7313 -Description: server may evict liblustre clients accessing contended locks -Details : if a client is granted a lock or receives a completion AST - with a blocking AST already set it would not reply to the AST - for LDLM_FL_CANCEL_ON_BLOCK locks. It now replies to such ASTs. - -Severity : minor -Frequency : lfs setstripe, only systems with more than 160 OSTs -Bugzilla : 9440 -Description: unable to set striping with a starting offset beyond OST 160 -Details : llapi_create_file() incorrectly limited the starting stripe - index to the maximum single-file stripe count. 
- -Severity : minor -Frequency : LDAP users only -Bugzilla : 6163 -Description: lconf did not handle in-kernel recovery with LDAP properly -Details : lconf/LustreDB get_refs() is searching the wrong namespace - -Severity : enhancement -Bugzilla : 7342 -Description: bind OST threads to NUMA nodes to improve performance -Details : all OST threads are uniformly bound to CPUs on a single NUMA - node and do their allocations there to localize memory access - -Severity : enhancement -Bugzilla : 7979 -Description: llmount can determine client NID directly from Myrinet (GM) -Details : the client NID code from gmnalnid was moved directly into - llmount, removing the need to use this or specifying the - client NID explicitly when mounting GM clients with zeroconf - -Severity : minor -Frequency : if client is started with down MDS -Bugzilla : 7184 -Description: if client is started with down MDS mount hangs in ptlrpc_queue_wait -Details : Having an LWI_INTR() wait event (interruptible, but no timeout) - will wait indefinitely in ptlrpc_queue_wait->l_wait_event() after - ptlrpc_import_delayed_req() because we didn't check if the - request was interrupted, and we also didn't break out of the - event loop if there was no timeout - -Severity : major -Frequency : rare -Bugzilla : 5047 -Description: data loss during non-page-aligned writes to a single file from - both multiple nodes and multiple threads on one node at same time -Details : updates to KMS and lsm weren't protected by common lock. Resulting - inconsistency led to false short-reads, that were cached and later - used by ->prepare_write() to fill in partially written page, - leading to data loss. 
- -Severity : minor -Frequency : always, if lconf --abort_recovery used -Bugzilla : 7047 -Description: lconf --abort_recovery fails with 'Operation not supported' -Details : lconf was attempting to abort recovery on the MDT device and not - the MDS device - -Severity : enhancement -Bugzilla : 9445 -Description: remove cleanup logs -Details : replace lconf-generated cleanup logs with lustre internal - cleanup routines. Eliminates the need for client-cleanup and - mds-cleanup logs. - -Severity : enhancement -Bugzilla : 8592 -Description: add support for EAs (user and system) on lustre filesystems -Details : it is now possible to store extended attributes in the Lustre - client filesystem, and with the user_xattr mount option it - is possible to allow users to store EAs on their files also - -Severity : enhancement -Bugzilla : 7293 -Description: Add possibility (config option) to show minimal available OST free - space. -Details : When compiled with --enable-mindf configure option, statfs(2) - (and so, df) will return the minimum free space available among - all OSTs as the amount of free space on the FS, instead of the sum of - the free space of all OSTs. - -Severity : enhancement -Bugzilla : 7311 -Description: do not expand extent locks acquired on OST-side -Details : Modify ldlm_extent_policy() to not expand local locks, acquired - by server: they are not cached anyway. - -Severity : major -Frequency : when mmap is used/binaries executed from Lustre -Bugzilla : 9482 -Description: Unmap pages before throwing them away from read cache. -Details : llap_shrink cache now attempts to unmap pages before discarding - them (if unmapping failed - do not discard). SLES9 kernel has - extra checks that trigger if this unmapping is not done first.
- -Severity : minor -Frequency : rare -Bugzilla : 6034 -Description: lconf didn't resolve symlinks before checking to see whether a - given mountpoint was already in use - -Severity : minor -Frequency : when migrating failover services -Bugzilla : 6395, 9514 -Description: When migrating a subset of services from a node (e.g. failback - from a failover service node) the remaining services would - time out and evict clients. -Details : lconf --force (implied by --failover) sets the global obd_timeout - to 5 seconds in order to quickly disconnect, but this caused - other RPCs to time out too quickly. Do not change the global - obd_timeout for force cleanup, only set it for DISCONNECT RPCs. - -Severity : enhancement -Frequency : if MDS is started with down OST -Bugzilla : 9439,5706 -Description: Allow startup/shutdown of an MDS without depending on the - availability of the OSTs. -Details : Asynchronously call mds_lov_synchronize during MDS startup. - Add appropriate locking and lov-osc refcounts for safe - cleaning. Add osc abort_inflight calls in case the - synchronize never started. - -Severity : minor -Frequency : occasional (Cray XT3 only) -Bugzilla : 7305 -Description: root not authorized to access files in CRAY_PORTALS environment -Details : The client process capabilities were not honoured on the MDS in - a CRAY_PORTALS/CRAY_XT3 environment. If the file had previously - been accessed by an authorized user then root was able to access - the file on the local client also. The root user capabilities - are now allowed on the MDS, as this environment has secure UID. - -Severity : minor -Frequency : occasional -Bugzilla : 6449 -Description: ldiskfs "too long searching" message happens too often -Details : A debugging message (otherwise harmless) prints too often on - the OST console. This has been reduced to only happen when - there are fragmentation problems on the filesystem. 
- -Severity : minor -Frequency : rare -Bugzilla : 9598 -Description: Division by zero in statfs when all OSCs are inactive -Details : lov_get_stripecnt() returns zero due to incorrect order of checks, - lov_statfs divides by value returned by lov_get_stripecnt(). - -Severity : minor -Frequency : common -Bugzilla : 9489, 3273 -Description: First write from each client to each OST was only 4kB in size, - to initialize client writeback cache, which caused sub-optimal - RPCs and poor layout on disk for the first written file. -Details : Clients now request an initial cache grant at (re)connect time - so that they can start streaming writes to the cache right - away and always do full-sized RPCs if there is enough data. - If the OST is rebooted the client also re-establishes its grant - so that client cached writes will be honoured under the grant. - -Severity : minor -Frequency : common -Bugzilla : 7198 -Description: Slow ls (and stat(2) syscall) on files residing on IO-loaded OSTs -Details : Now I/O RPCs go to different portal number and (presumably) fast - lock requests (and glimpses) and other RPCs get their own service - threads pool that should be able to service those RPCs - immediately. - -Severity : enhancement -Bugzilla : 7417 -Description: Ability to exchange lustre version between client and servers and - issue warnings at client side if client is too old. Also for - liblustre clients there is ability to refuse connection of too old - clients. -Details : New 'version' field is added to connect data structure that is - filled with version info. That info is later checked by server and - by client. - -Severity : minor -Frequency : rare, liblustre only. -Bugzilla : 9296, 9581 -Description: Two simultaneous writes from liblustre at offset within same page - might proceed at the same time overwriting each other with stale - data. -Details : I/O lock within llu_file_prwv was released too early, before data - actually was hitting the wire.
Extended lock-holding time until - server acknowledges receiving data. - -Severity : minor -Frequency : extremely rare. Never observed in practice. -Bugzilla : 9652 -Description: avoid generating lustre_handle cookie of 0. -Details : class_handle_hash() generates handle cookies by incrementing - global counter, and can hit 0 occasionally (this is unlikely, but - not impossible, because initial value of cookie counter is - selected randomly). Value of 0 is used as a sentinel meaning - "unassigned handle" --- avoid it. Also coalesce two critical - sections in this function into one. - -Severity : enhancement -Bugzilla : 9528 -Description: allow liblustre clients to delegate truncate locking to OST -Details : To avoid overhead of locking, liblustre client instructs OST to - take extent lock in ost_punch() on client's behalf. New connection - flag is added to handle backward compatibility. - -Severity : enhancement -Bugzilla : 4928, 7341, 9758 -Description: allow number of OST service threads to be specified -Details : a module parameter allows the number of OST service threads - to be specified via "options ost ost_num_threads={N}" in the - OSS's /etc/modules.conf or /etc/modprobe.conf. - -Severity : major -Frequency : rare -Bugzilla : 6146, 9635, 9895 -Description: servers crash with bad pointer in target_handle_connect() -Details : In rare cases when a client is reconnecting it was possible that - the connection request was the last reference for that export. - We would temporarily drop the export reference and get a new - one, but this may have been the last reference and the export - was just destroyed. Get new reference before dropping old one. - -Severity : enhancement -Frequency : if client is started with failover MDS -Bugzilla : 9818 -Description: Allow multiple MDS hostnames in the mount command -Details : Try to read the configuration from all specified MDS - hostnames during a client mount in case the "primary" - MDS is down.
- -Severity : enhancement -Bugzilla : 9297 -Description: Stop sending data to evicted clients as soon as possible. -Details : Check if the client we are about to send or are sending data to - was evicted already. (Check is done every second of waiting, - for which l_wait_event interface was extended to allow checking - of exit condition at specified intervals). - -Severity : minor -Frequency : rare, normally only when NFS exporting is done from client -Bugzilla : 9301 -Description: 'bad disk LOV MAGIC: 0x00000000' error when chown'ing files - without objects -Details : Make mds_get_md() recognise empty md case and set lmm size to 0. - -Severity : minor -Frequency : always, if srand() is called before liblustre initialization -Bugzilla : 9794 -Description: Liblustre uses system PRNG disturbing its usage by user application -Details : Introduce internal to lustre fast and high-quality PRNG for - lustre usage and make liblustre and some other places in generic - lustre code to use it. - -Severity : enhancement -Bugzilla : 9477, 9557, 9870 -Description: Verify that the MDS configuration logs are updated when xml is -Details : Check if the .xml configuration logs are newer than the config - logs stored on the MDS and report an error if this is the case. - Request --write-conf, or allow starting with --old_conf. - -Severity : enhancement -Bugzilla : 6034 -Description: Handle symlinks in the path when checking if Lustre is mounted. -Details : Resolve intermediate symlinks when checking if a client has - mounted a filesystem to avoid duplicate client mounts. - -Severity : minor -Frequency : rare -Bugzilla : 9309 -Description: lconf can hit an error exception but still return success. -Details : The lconf command catches the Command error exception at the top - level script context and will exit with the associated exit - status, but doesn't ensure that this exit status is non-zero. 
- -Severity : minor -Frequency : rare -Bugzilla : 9493 -Description: failure of ptlrpc thread startup can cause oops -Details : Starting a ptlrpc service thread can fail if there are a large - number of threads or the server memory is very fragmented. - Handle this without oopsing. - -Severity : minor -Frequency : always, only if liblustre and non-default acceptor port was used -Bugzilla : 9933 -Description: liblustre cannot connect to servers with non-default acceptor port -Details : tcpnal_set_default_params() was not called and was therefore - ignoring the environment variable TCPNAL_PORT, as well as other - TCPNAL_ environment variables - -Severity : minor -Frequency : rare -Bugzilla : 9923 -Description: two objects could be created on the same OST for a single file -Details : If an OST is down, in some cases it was possible to create two - objects on a single OST for a single file. No problems other - than potential performance impact and spurious error messages. - -Severity : minor -Frequency : rare -Bugzilla : 5681, 9562 -Description: Client may oops in ll_unhash_aliases -Details : Client dcache may become inconsistent in race condition. - In some cases "getcwd" can fail if the current directory is - modified. - -Severity : minor -Frequency : always -Bugzilla : 9942 -Description: Inode refcounting problems in NFS export code -Details : link_raw functions used to call d_instantiate without obtaining - extra inode reference first. - -Severity : minor -Frequency : rare -Bugzilla : 9942, 9903 -Description: Referencing freed requests leading to crash, memleaks with NFS. -Details : We used to require that call to ll_revalidate_it was always - followed by ll_lookup_it. Also with revalidate_special() it is - possible to call ll_revalidate_it() twice for the same dentry - even if first occurrence returned success. This fix changes the semantics - of the DISP_ENQ_COMPLETE disposition flag to mean there is extra - reference on a request referred from the intent.
- ll_intent_release() then releases such a request. - -Severity : minor -Frequency : rare, normally benchmark loads only -Bugzilla : 1443 -Description: unlinked inodes were kept in memory on the client -Details : If a client is repeatedly creating and unlinking files it - can accumulate a lot of stale inodes in the inode slab cache. - If there is no other client load running this can cause the - client node to run out of memory. Instead flush old inodes - from client cache that have the same inode number as a new inode. - -Severity : minor -Frequency : SLES9 2.6.5 kernel and long filenames only -Bugzilla : 9969, 10379 -Description: utime reports stale NFS file handle -Details : SLES9 uses out-of-dentry names in some cases, which confused - the lustre dentry revalidation. Change it to always use the - in-dentry qstr. - -Severity : major -Frequency : rare, unless heavy write-truncate concurrency is continuous -Bugzilla : 4180, 6984, 7171, 9963, 9331 -Description: OST becomes very slow and/or deadlocked during object unlink -Details : filter_destroy() was holding onto the parent directory lock - while truncating+unlinking objects. For very large objects this - may block other threads for a long time and slow overall OST - responsiveness. It may also be possible to get a lock ordering - deadlock in this case, or run out of journal credits because of - the combined truncate+unlink. Solution is to do object truncate - first in one transaction without parent lock, and then do the - final unlink in a new transaction with the parent lock. This - reduces the lock hold time dramatically. - -Severity : major -Frequency : rare, 2.4 kernels only -Bugzilla : 9967 -Description: MDS or OST cleanup may trip kernel BUG when dropping kernel lock -Details : mds_cleanup() and filter_cleanup() need to drop the kernel lock - before unmounting their filesystem in order to avoid deadlock. 
- The kernel_locked() function in 2.4 kernels only checks whether - the kernel lock is held, not whether it is this process that is - holding it as 2.6 kernels do. - -Severity : major -Frequency : rare -Bugzilla : 9635 -Description: MDS or OST may oops/LBUG if a client is connecting multiple times -Details : The client ptlrpc code may be trying to reconnect to a down - server before a previous connection attempt has timed out. - Increase the reconnect interval to be longer than the connection - timeout interval to avoid sending duplicate connections to - servers. - -Severity : minor -Frequency : echo_client brw_test command -Bugzilla : 9919 -Description: fix echo_client to work with OST preallocated code -Details : OST preallocation code (5137) didn't take echo_client IO path - into account: echo_client calls filter methods outside of any - OST thread and, hence, there is no per-thread preallocated - pages and buffers to use. Solution: hijack pga pages for IO. As - a byproduct, this avoids unnecessary data copying. - -Severity : minor -Frequency : rare -Bugzilla : 3555, 5962, 6025, 6155, 6296, 9574 -Description: Client can oops in mdc_commit_close() after open replay -Details : It was possible for the MDS to return an open request with no - transaction number in mds_finish_transno() if the client was - evicted, but without actually returning an error. Clients - would later try to replay that open and may trip an assertion - Simplify the client close codepath, and always return an error - from the MDS in case the open is not successful. - -Severity : major -Frequency : rare, 2.6 OSTs only -Bugzilla : 10076 -Description: OST may deadlock under high load on fragmented files -Details : If there was a heavy load and highly-fragmented OST filesystems - it was possible to have all the OST threads deadlock waiting on - allocation of biovecs, because the biovecs were not released - until the entire RPC IO was completed. 
Instead, release biovecs - as soon as they are complete to ensure forward IO progress. - -Severity : enhancement -Bugzilla : 9578 -Description: Support for specifying external journal device at mount -Details : If an OST or MDS device is formatted with an external journal - device, this device major/minor is stored in the ext3 superblock - and may not be valid for failover. Allow detecting and - specifying the external journal at mount time. - -Severity : major -Frequency : rare -Bugzilla : 10235 -Description: Mounting an MDS with pending unlinked files may cause oops -Details : target_finish_recovery() calls mds_postrecov() which returned - the number of orphans unlinked. mds_lov_connect->mds_postsetup() - considers this an error and immediately begins cleaning up the - lov, just after starting the mds_lov process - -Severity : enhancement -Bugzilla : 9461 -Description: Implement 'lfs df' to report actual free space on per-OST basis -Details : Add sub-command 'df' on 'lfs' to report the disk space usage of - MDS/OSDs. Usage: lfs df [-i][-h]. Command Options: '-i' to report - usage of objects; '-h' to report in human readable format. - ------------------------------------------------------------------------------- - -2005-08-26 Cluster File Systems, Inc. - * version 1.4.5 - * bug fixes - -Severity : major -Frequency : rare -Bugzilla : 7264 -Description: Mounting an ldiskfs file system with mballoc may crash OST node. -Details : ldiskfs mballoc code may reference an uninitialized buddy struct - at startup during orphan unlinking. Instead, skip buddy update - before setup, as it will be regenerated after recovery is complete. - -Severity : minor -Frequency : rare -Bugzilla : 7039 -Description: If an OST is inactive, its locks might reference stale inodes. -Details : lov_change_cbdata() must iterate over all namespaces, even if - they are inactive to clear inode references from the lock. 
- -Severity : enhancement -Frequency : occasional, if non-standard max_dirty_mb used -Bugzilla : 7138 -Description: Client will block write RPCs if not enough grant -Details : If a client has max_dirty_mb smaller than max_rpcs_in_flight, - then the client will block writes while waiting for another RPC - to complete instead of consuming its dirty limit. With change - we get improved performance when max_dirty_mb is small. - -Severity : enhancement -Bugzilla : 3389, 6253 -Description: Add support for supplementary groups on the MDS. -Details : The MDS has an upcall /proc/fs/lustre/mds/{mds}/group_upcall - (set to /usr/sbin/l_getgroups if enabled) which will do MDS-side - lookups for user supplementary groups into a cache. - -Severity : minor -Bugzilla : 7278 -Description: O_CREAT|O_EXCL open flags in liblustre always return -EEXIST -Details : Make libsysio to not enforce O_EXCL by clearing the flag, - for liblustre O_EXCL is enforced by MDS. - -Severity : minor -Bugzilla : 6455 -Description: readdir never returns NULL in liblustre. -Details : Corrected llu_iop_getdirentries logic, to return offset of next - dentry in struct dirent. - -Severity : minor -Bugzilla : 7137 -Frequency : liblustre only, depends on application IO pattern -Description: liblustre clients evicted if not contacting servers -Details : Don't put liblustre clients into the ping_evictor list, so - they will not be evicted by the pinger ever. - -Severity : enhancement -Bugzilla : 6902 -Description: Add ability to evict clients by NID from MDS. -Details : By echoing "nid:$NID" string into - /proc/fs/lustre/mds/.../evict_client client with nid that equals to - $NID would be instantly evicted from this MDS and from all active - OSTs connected to it. - -Severity : minor -Bugzilla : 7198 -Description: Do not query file size twice, somewhat slowing stat(2) calls. -Details : lookup_it_finish() used to query file size from OSTs that was not - needed. 
- -Severity : minor -Bugzilla : 6237 -Description: service threads change working directory to that of init -Details : Starting lustre service threads may pin the working directory - of the parent thread, making that filesystem busy. Threads - now change to the working directory of init to avoid this. - -Severity : minor -Bugzilla : 6827 -Frequency : during shutdown only -Description: shutdown with a failed MDS or OST can cause unmount to hang -Details : Don't resend DISCONNECT messages in ptlrpc_disconnect_import() - if server is down. - -Severity : minor -Bugzilla : 7331 -Frequency : 2.6 only -Description: chmod/chown may include an extra supplementary group -Details : ll{,u}_mdc_pack_op_data() does not properly initialize the - supplementary group and if none is specified this is used. - -Severity : minor -Bugzilla : 5479 (6816) -Frequency : rare -Description: Racing open + rm can assert client in mdc_set_open_replay_data() -Details : If lookup is in progress on a file that is unlinked we might try - to revalidate the inode and fail in revalidate after lookup is - complete and ll_file_open() enqueues the open again but - it_open_error() was not checking DISP_OPEN_OPEN errors correctly. - -Severity : minor -Frequency : always, if lconf --abort_recovery used -Bugzilla : 7047 -Description: lconf --abort_recovery fails with 'Operation not supported' -Details : lconf was attempting to abort recovery on the MDT device and not - the MDS device - ------------------------------------------------------------------------------- - -2005-08-08 Cluster File Systems, Inc. 
- * version 1.4.4 - * bug fixes - -Severity : major -Frequency : rare (only unsupported configurations with a node running as an - OST and a client) -Bugzilla : 6514, 5137 -Description: Mounting a Lustre file system on a node running as an OST could - lead to deadlocks -Details : OSTs now preallocate memory needed to write out data at - startup, instead of when needed, to avoid having to - allocate memory in possibly low memory situations. - Specifically, if the file system is mounted on an OST, - memory pressure could force it to try to write out data, - which it needed to allocate memory to do. Due to the low - memory, it would be unable to do so and the node would - become unresponsive. - -Severity : enhancement -Bugzilla : 7015 -Description: Addition of lconf --service command line option -Details : lconf now accepts a '--service {arg}' option, which is - shorthand for 'lconf --group {arg} --select {arg}={hostname}' - -Severity : enhancement -Bugzilla : 6101 -Description: Failover mode is now the default for OSTs. -Details : By default, OSTs will now run in failover mode. To return to - the old behaviour, add '--failout' to the lmc line for OSTs. - -Severity : enhancement -Bugzilla : 1693 -Description: Health checks are now provided for MDS and OSTs -Details : Additional detailed health check information on MDS and OSTs - is now provided through the procfs health_check value. - -Severity : minor -Frequency : occasional, depends on IO load -Bugzilla : 4466 -Description: Disk fragmentation on the OSTs could eventually cause slowdowns - after numerous create/delete cycles -Details : The ext3 inode allocation policy would not allocate new inodes - very well on the OSTs because there are no new directories - being created. Instead we look for groups with free space if - the parent directories are nearly full. - -Severity : major -Bugzilla : 6302 -Frequency : rare -Description: Network or server problems during mount may cause partially - mounted clients instead of returning an error.
-Details : The config llog parsing code may overwrite the error return - code during mount error handling, returning success instead - of an error. - -Severity : minor -Bugzilla : 6422 -Frequency : rare -Description: MDS can fail to allocate large reply buffers -Details : After long uptimes the MDS can fail to allocate large reply - buffers (e.g. zconf client mount config records) due to memory - fragmentation or consumption by the buffer cache. Preallocate - some large reply buffers so that these replies can be sent even - under memory pressure. - -Severity : minor -Bugzilla : 6266 -Frequency : rare (liblustre) -Description: fsx running with liblustre complained that using truncate() to - extend the file doesn't work. This patch corrects that issue. -Details : This is the liblustre equivalent of the fix for bug 6196. Fixes - ATTR_SIZE and lsm use in llu_setattr_raw. - -Severity : critical -Bugzilla : 6866 -Frequency : rare, only 2.6 kernels -Description: Unusual file access patterns on the MDS may result in inode - data being lost in very rare circumstances. -Details : Bad interaction between the ea-in-inode patch and the "no-read" - code in the 2.6 kernel caused the inode and/or EA data not to - be read from disk, causing single-file corruption. - -Severity : critical -Bugzilla : 6998 -Frequency : rare, only 2.6 filesystems using extents -Description: Heavy concurrent write and delete load may cause data corruption. -Details : It was possible under high-load situations to have an extent - metadata block in the block device cache from a just-unlinked - file overwrite a newly-allocated data block. We now unmap any - metadata buffers that alias just-allocated data blocks. - -Severity : minor -Bugzilla : 7241 -Frequency : filesystems with default stripe_count larger than 77 -Description: lconf+mke2fs fail when formatting filesystem with > 77 stripes -Details : lconf specifies an inode size of 4096 bytes when the default - stripe_count is larger than 77. 
This conflicts with the default - inode density of 1 per 4096 bytes. Allocate smaller inodes in - this case to avoid pinning too much memory for large EAs. - ------------------------------------------------------------------------------- - -2005-07-07 Cluster File Systems, Inc. - * version 1.4.3 - * bug fixes - -Severity : minor -Frequency : rare (extremely heavy IO load with hundreds of clients) -Bugzilla : 6172 -Description: Client is evicted, gets IO error writing to file -Details : lock ordering changes for bug 5492 reintroduced bug 3267 and - caused clients to be evicted for AST timeouts. The fixes in - bug 5192 mean we no longer need to have such short AST timeouts - so ldlm_timeout has been increased. - -Severity : major -Frequency : occasional during --force or --failover shutdown under load -Bugzilla : 5949, 4834 -Description: Server oops/LBUG if stopped with --force or --failover under load -Details : a collection of import/export refcount and cleanup ordering - issues fixed for safer force cleanup - -Severity : major -Frequency : only filesystems larger than 120 OSTs -Bugzilla : 5990, 6223 -Description: lfs getstripe would oops on a very large filesystem -Details : lov_getconfig used kfree on vmalloc'd memory - -Severity : minor -Frequency : only filesystems exporting via NFS to Solaris 10 clients -Bugzilla : 6242, 6243 -Description: reading from files that had been truncated to a non-zero size - but never opened returned no data -Details : ll_file_read() reads zeros from no-object files to EOF - -Severity : major -Frequency : rare -Bugzilla : 6200 -Description: A bug in MDS/OSS recovery could cause the OSS to fail an assertion -Details : There's little harm in aborting MDS/OSS recovery and letting it - try again, so I removed the LASSERT and return an error instead. 
- -Severity : enhancement -Bugzilla : 5902 -Description: New debugging infrastructure for tracking down data corruption -Details : The I/O checksum code was replaced to: (a) control it at runtime, - (b) cover more of the client-side code path, and (c) try to narrow - down where problems occurred - -Severity : major -Frequency : rare -Bugzilla : 3819, 4364, 4397, 6313 -Description: Racing close and eviction on the MDS could cause assertion in mds_close -Details : It was possible to get multiple mfd references during close and - client eviction, leading to one thread referencing a freed mfd. - -Severity : enhancement -Bugzilla : 3262, 6359 -Description: Attempts to reconnect to servers are now more aggressive. -Details : This builds on the enhanced upcall-less recovery that was added - in 1.4.2. When trying to reconnect to servers, clients will - now try each server in the failover group every 10 seconds. By - default, clients would previously try one server every 25 seconds. - -Severity : major -Frequency : rare -Bugzilla : 6371 -Description: After recovery, certain operations trigger a failed - assertion on a client. -Details : Failing over an mds, using lconf -d --failover, while a - client was doing a readdir() call would cause the client to - LBUG after recovery completed and the readdir() was resent. - -Severity : enhancement -Bugzilla : 6296 -Description: Default groups are now added by lconf -Details : You can now run lconf --group without having to - manually add groups with lmc. - -Severity : major -Frequency : occasional -Bugzilla : 6412 -Description: Nodes with an elan id of 0 trigger a failed assertion - -Severity : minor -Frequency : always when accessing e.g. tty/console device nodes -Bugzilla : 3790 -Description: tty and some other devices nodes cannot be used on lustre -Details : file's private_data field is used by device data and lustre - values in there got lost. New field was added to struct file to - store fs-specific private data.
- -Severity : minor -Frequency : when exporting Lustre via NFS -Bugzilla : 5275 -Description: NFSD failed occasionally when looking up a path component -Details : NFSD is looking up ".." which was broken in ext3 directories - that had grown large enough to become hashed. - -Severity : minor -Frequency : Clusters with multiple interfaces not on the same subnet -Bugzilla : 5541 -Description: Nodes will repeatedly try to reconnect to an interface which it - cannot reach and report an error to the log. -Details : Extra peer list entries will be created by lconf with some peers - unreachable. lconf now validates the peer before adding it. - -Severity : major -Frequency : Only if a default stripe is set on the filesystem root. -Bugzilla : 6367 -Description: Setting a default stripe on the filesystem root prevented the - filesystem from being remounted. -Details : The client was sending extra request flags in the root getattr - request and did not allocate a reply buffer for the dir EA. - -Severity : major -Frequency : occasional, higher if lots of files are accessed by one client -Bugzilla : 6159, 6097 -Description: Client trips assertion regarding lsm mismatch/magic -Details : While revalidating inodes the VFS looks up inodes with ifind() - and in rare cases can find an inode that is being freed. - The ll_test_inode() code will free the lsm during ifind() - when it finds an existing inode and then the VFS later attaches - this free lsm to a new inode. - -Severity : major -Frequency : rare -Bugzilla : 6422, 7030 -Description: MDS deadlock between mkdir and client eviction -Details : Creating a new file via mkdir or mknod (starting a transaction - and getting the ns lock) can deadlock with client eviction - (gets ns lock and trying to finish a synchronous transaction). - -Severity : minor -Frequency : occasional -Description: While starting a server, the fsfilt_ext3 module could not be - loaded. -Details : CFS's improved ext3 filesystem is named ldiskfs for 2.6 - kernels. 
Previously, lconf would still use the ext3 name - when trying to load modules. Now, it will correctly use - ext3 on 2.4 and ldiskfs on 2.6. - -Severity : enhancement -Description: The default stripe count has been changed to 1 -Details : The interpretation of the default stripe count (0, to lfs - or lmc) has been changed to mean striping across a single - OST, rather than all available. For general usage we have - found a stripe count of 1 or 2 works best. - -Severity : enhancement -Description: Add support for compiling against Cray portals. -Details : Conditional compiling for some areas that are different - on Cray Portals. - -Severity : major -Frequency : occasional -Bugzilla : 6409, 6834 -Description: Creating files with an explicit stripe count may lead to - a failed assertion on the MDS -Details : If some OSTs are full or unavailable, creating files may - trigger a failed assertion on the MDS. Now, Lustre will - try to use other servers or return an error to the - client. - -Severity : minor -Frequency : occasional -Bugzilla : 6469 -Description: Multiple concurrent overlapping read+write on multiple SMP nodes - caused lock timeout during readahead (since 1.4.2). -Details : Processes doing readahead might match a lock that hasn't been - granted yet if there are overlapping and conflicting lock - requests. The readahead process waits on ungranted lock - (original lock is CBPENDING), while OST waits for that process - to cancel CBPENDING read lock and eventually evicts client. - -Severity : enhancement -Bugzilla : 6931 -Description: Initial enabling of flock support for clients -Details : Implements fcntl advisory locking and file status functions. - This feature is provided as an optional mount flag (default - off), and is NOT CURRENTLY SUPPORTED. Not all types of record - locking are implemented yet, and those that are are not guaranteed - to be completely correct in production environments. - mount -t lustre -o [flock|noflock] ... 
- -Severity : major -Frequency : occasional -Bugzilla : 6198 -Description: OSTs running 2.4 kernels but with extents enabled might trip an - assertion in the ext3 JBD (journaling) layer. -Details : The b_committed_data struct is protected by the big kernel lock - in 2.4 kernels, serializing journal_commit_transaction() and - ext3_get_block_handle->ext3_new_block->find_next_usable_block() - access to this struct. In 2.6 kernels there is finer grained - locking to improve SMP performance of the JBD layer. - -Severity : minor -Bugzilla : 6147 -Description: Changes the "SCSI I/O Stats" kernel patch to default to "enabled" - ------------------------------------------------------------------------------ - -2005-05-05 Cluster File Systems, Inc. - * version 1.4.2 - NOTE: Lustre 1.4.2 uses a network protocol that is incompatible with previous - versions of Lustre. Please update all servers and clients to - version 1.4.2 or later at the same time. You must also run - "lconf --write-conf {config}.xml" on the MDS while it is stopped - to update the configuration logs. 
- * bug fixes - - fix for HPUX NFS client breakage when NFS exporting Lustre (5781) - - mdc_enqueue does not need max_mds_easize request buffer on send (5707) - - swab llog records of type '0' so we get proper header size/idx (5861) - - send llog cancel req to DLM cancel portal instead of cb portal (5515) - - fix rename of one directory over another leaking an inode (5953) - - avoid SetPageDirty on 2.6 (5981) - - don't re-add just-being-destroyed locks to the waiting list (5653) - - when creating new directories, inherit the parent's custom - striping settings if present parent (3048) - - flush buffers from cache before direct IO in 2.6 obdfilter (4982) - - don't hold i_size_sem in ll_nopage() and ll_ap_refresh_count (6077) - - don't hold client locks on temporary worklist from l_lru (5666) - - handle IO errors in 2.6 obdfilter bio completion routine (6046) - - automatically evict dead clients (5921) - - Update file size properly in create+truncate+fstat case (6196) - - Do not unhash mountpoint dentries, do not allow removal of - mountpoints (5907) - - Avoid lock ordering deadlock issue with write/truncate (6203,5654) - - reserve enough journal credits in fsfilt_start_log for setattr (4554) - - ldlm_enqueue freed-export error path would always LBUG (6149,6184) - - don't reference lr_lvb_data until after we hold lr_lvb_sem (6170) - - don't overwrite last_rcvd if there is a *_client_add() error (6086) - - Correctly handle reads of files with no objects (6243) - - lctl recover will also mark a device active if deactivate used (5933) - * miscellania - - by default create 1 inode per 4kB space on MDS, per 16kB on OSTs - - allow --write-conf on an MDS with different nettype than client (5619) - - don't write config llogs to MDS for mounts not from that MDS (5617) - - lconf should create multiple TCP connections from a client (5201) - - init scripts are now turned off by default; run chkconfig --on - lustre and chkconfig --on lustrefs to use them - - upcalls are no longer 
needed for clients to recover to failover - servers (3262) - - add --abort-recovery option to lconf to abort recovery on device - startup (6017) - - add support for an arbitrary number of OSTs (3026) - - Quota support protocol changes. - - forward compatibility changes to wire structs (6007) - - rmmod NALs that might be loaded because of /etc/modules.conf (6133) - - support for mountfsoptions and clientoptions to the Lustre LDAP (5873) - - improved "lustre status" script - - initialize blocksize for non-regular files (6062) - - added --disable-server and --disable-client configure options (5782) - - introduce a lookup cache for lconf to avoid repeated DB scans (6204) - - Vanilla 2.4.29 support - - increase maximum number of obd devices to 520 (6242) - - remove the tcp-zero-copy patch from the suse-2.4 series (5902) - - Quadrics Elan drivers are now included for the RHEL 3 2.4.21 and - SLES 9 2.6.5 kernels - - limit stripes per file to 160 (the maximum EA size) (6093) - -2005-03-22 Cluster File Systems, Inc. - * version 1.4.1 - * bug fixes - - don't LASSERT in ll_release on NULL lld with NFS export (4655, 5760) - - hold NS lock when calling handle_ast_error->del_waiting_lock (5746) - - fix setattr mtime regression from lovcleanup merge (4829, 5669) - - workaround for 2.6 crash in ll_unhash_aliases (5687, 5210) - - small ext3 extents cleanups and fixes (5733) - - improved mballoc code, several small races and bugs fixed (5733, 5638) - - kernel version 43 - fix remove_suid bugs in both 2.4 and 2.6 (5695) - - avoid needless client->OST connect, fix handle mismatch (5317) - - fix DLM error path that led to out-of-sync client, long delays (5779) - - support common vfs-enforced mount options (nodev,nosuid,noexec) (5637) - - fix several locking issues related to i_size (5492,5624,5654,5672) - - don't move pending lock onto export if it is already evicted (5683) - - fix kernel oops when creating .foo in unlinked directory (5548) - - fix deadlock in obdfilter statistics vs. 
object create (5811) - - use time_{before,after} to avoid timer jiffies wrap (5882) - - shutdown --force/--failover stability (3607,3651,4797,5203,4834) - - Do not leak request if server was not able to process it (5154) - - If mds_open unable to find parent dir, make that negative lookup (5154) - - don't create new directories with extent-mapping (5909, 5936) - * miscellania - - fix lustre/lustrefs init scripts for SuSE (patch from Scali, 5702) - - don't hold the pinger_sem in ptlrpc_pinger_sending_on_import - - change obd_increase_kms to obd_adjust_kms (up or down) (5654) - - lconf, lmc search both /usr/lib and /usr/lib64 for Python libs (5800) - - support for RHEL4 kernel on i686 (5773) - - provide error messages when incompatible logs are encountered (5898) - -2005-02-18 Cluster File Systems, Inc. - * version 1.4.0.10 (1.4.1 release candidate 1) - * bug fixes - - don't keep a lock reference when lock is not granted (4238) - - unsafe list practices (rarely) led to infinite eviction loop (4908) - - add per-fs limit of Lustre pages in page cache, avoid OOM (4699) - - drop import inflight refcount on signal_completed_replay error (5255) - - unlock page after async write error during send (3677) - - handle missing objects in filter_preprw_read properly (5265) - - no transno return for symlink open, don't save no-transno open (3440) - - don't try to complete elan receive that already failed (4012) - - free RPC server reply state on error (5406) - - clean up thread from ptlrpc_start_thread() on error (5160) - - readahead could read extra page into cache that wasn't ejected (5388) - - prevent races in class_attach/setup/cleanup/detach (5260) - - don't dereference de->d_inode after l_dput of de (5458) - - use "int" for stripe value returned from lock_to_stripe (5544) - - mballoc allocation and error-checking fixes in 2.6 (5504) - - block device patches to fix I/O request sizes in 2.6 (5482) - - look up hostnames for IB nals (5602) - - 2.6 changed lock ordering of 2 
semaphores, caused deadlock (5654) - - don't start multiple acceptors for the same port (5277) - - fix incorrect LASSERT in mds_getattr_name (5635) - - export a proc file for general "ping" checking (5628) - - fix "lfs check" to not block when the MDS is down (5628) - * miscellania - - service request history (4965) - - put {ll,lov,osc}_async_page structs in a single slab (4699) - - create an "evict_client" /proc entry on OSTs, like the MDS has - - fix mount usage message, return errors per mount(8) (5168) - - change grep [] to grep "[]" in tests so they work in more UMLs - - fix ppc64/x86_64 spec to use %{_libdir} instead of /usr/lib (5389) - - remove ancient LOV_MAGIC_V0 EA support (5047) - - add "disk I/Os in flight" and "I/O req time" stats in obdfilter - - align r/w RPCs to PTLRPC_MAX_BRW_SIZE boundary for performance (3451) - - allow readahead allocations to fail when low on memory (5383) - - mmap locking landed again, after considerable improvement (2828) - - add get_hostaddr() to lustreDB.py for LDAP support (5459) - -2004-11-23 Cluster File Systems, Inc. - * version 1.4.0 - * bug fixes - - send OST transaction number in read/write reply to free req (4966) - - don't ASSERT in ptl_send_rpc() if we run out of memory (5119) - - lock /proc/sys/portals/routes internal state, avoiding oops (4827) - - the watchdog thread now runs as interruptible (5246) - - flock/lockf fixes (but it's still disabled, pending 5135) - - don't use EXT3 constants in llite code (5094) - - memory shortage at startup could cause assertion (5176) - * miscellania - - reorganization of lov code - - single portals codebase - - Infiniband NAL - - add extents/mballoc support (5025) - - direct I/O reads in the obdfilter (4048) - - kernel patches from LNXI for 2.6 (bluesmoke, perfctr, mtd, kexec) - -tbd Cluster File Systems, Inc. 
- * version 1.2.9 - * bug fixes - - send OST transaction number in read/write reply to free req (4966) - - don't ASSERT in ptl_send_rpc() if we run out of memory (5119) - - lock /proc/sys/portals/routes internal state, avoiding oops (4827) - - the watchdog thread now runs as interruptible (5246) - - handle missing objects in filter_preprw_read properly (5265) - - unsafe list practices (rarely) led to infinite eviction loop (4908) - - drop import inflight refcount on signal_completed_replay error (5255) - - unlock page after async write error during send (3677) - - return original error code on reconstructed replies (3761) - - no transno return for symlink open, don't save no-transno open (3440) - * miscellania - - add pid to ldlm debugging output (4922) - - bump the watchdog timeouts -- we can't handle 30sec yet - - extra debugging for orphan dentry/inode bug (5259) - -2004-11-16 Cluster File Systems, Inc. - * version 1.2.8 - * bug fixes - - fix TCP_NODELAY bug, which caused extreme perf regression (5134) - - allocate qswnal tx descriptors singly to avoid fragmentation (4504) - - don't LBUG on obdo_alloc() failure, use OBD_SLAB_ALLOC() (4800) - - fix NULL dereference in /proc/sys/portals/routes (4827) - - allow failed mdc_close() operations to be interrupted (4561) - - stop precreate on OST before MDS would time out on it (4778) - - don't send partial-page writes before EOF from client (4410) - - discard client grant for sub-page writes on large-page clients (4520) - - don't free dentries not owned by NFS code, check generation (4806) - - fix lsm leak if mds_create_objects() fails (4801) - - limit debug_daemon file size, always print CERROR messages (4789) - - use transno after validating reply (3892) - - process timed out requests if import state changes (3754) - - update mtime on OST during writes, return in glimpse (4829) - - add mkfsoptions to LDAP (4679) - - use ->max_readahead method instead of zapping global ra (5039) - - don't interrupt __l_wait_event() 
during strace - * miscellania - - add software watchdogs to catch hung threads quickly (4941) - - make lustrefs init script start after nfs is mounted - - fix CWARN/ERROR duplication (4930) - - return async write errors to application if possible (2248) - - add /proc/sys/portal/memused (bytes allocated by PORTALS_ALLOC) - - print NAL number in %x format (4645) - - update barely-supported suse-2.4.21-171 series (4842) - - support for sles 9 %post scripts - - support for building 2.6 kernel-source packages - - support for sles km_* packages - -2004-10-07 Cluster File Systems, Inc. - * version 1.2.7 - * bug fixes - - ignore -ENOENT errors in osc_destroy (3639) - - notify osc create thread that OSC is being cleaned up (4600) - - add nettype argument for llmount in #5d in conf-sanity.sh (3936) - - reconstruct ost_handle() like mds_handle() (4657) - - create a new thread to do import eviction to avoid deadlock (3969) - - let lconf resolve symlinked-to devices (4629) - - don't unlink "objects" from directory with default EA (4554) - - hold socknal file ref over connect in case target is down (4394) - - allow more than 32000 subdirectories in a single directory (3244) - - fix blocks count for O_DIRECT writes (3751) - - OST returns ENOSPC from object create when no space left (4539) - - don't send truncate RPC if file size isn't changing (4410) - - limit OSC precreate to 1/2 of value OST considers bogus (4778) - - bind to privileged port in socknal and tcpnal (3689) - * miscellania - - rate limit CERROR/CWARN console message to avoid overload (4519) - - GETFILEINFO dir ioctl returns LOV EA + MDS stat in 1 call (3327) - - basic mmap support (3918) - - kernel patch series update from b1_4 (4711) - -2004-09-16 Cluster File Systems, Inc. 
- * version 1.2.6 - * bug fixes - - avoid crash during MDS cleanup with OST shut down (2775) - - fix loi_list_lock/oig_lock inversion on interrupted IO (4136) - - don't use bad inodes on the MDS (3744) - - dynamic object preallocation to improve recovery speed (4236) - - don't hold spinlock over lock dumping or change debug flags (4401) - - don't zero obd_dev when it is force cleaned (3651) - - print grants to console if they go negative (4431) - - "lctl deactivate" will stop automatic recovery attempts (3406) - - look for existing locks in ldlm_handle_enqueue() (3764) - - don't resolve lock handle twice in recovery avoiding race (4401) - - revalidate should check working dir is a directory (4134) - * miscellania - - don't always mark "slow" obdfilter messages as errors (4418) - -2004-08-24 Cluster File Systems, Inc. - * version 1.2.5 - * bug fixes - - don't close LustreDB during write_conf until it is done (3860) - - fix typo in lconf for_each_profile (3821) - - allow dumping logs from multiple threads at one time (3820) - - don't allow multiple threads in OSC recovery (3812) - - fix debug_size parameters (3864) - - fix mds_postrecov to initialize import for llog ctxt (3121) - - replace config semaphore with spinlock (3306) - - be sure to send a reply for a CANCEL rpc with bad export (3863) - - don't allow enqueue to complete on a destroyed export (3822) - - down write_lock before checking llog header bitmap (3825) - - recover from lock replay timeout (3764) - - up llog sem before sending rpc (3652) - - reduce ns lock hold times when setting kms (3267) - - change a dlm LBUG to LASSERTF, to maybe learn something (4228) - - fix NULL deref and obd_dev leak on setup error (3312) - - replace some LBUG about llog ops with error handling (3841) - - don't match INVALID dentries from d_lookup and spin (3784) - - hold dcache_lock while marking dentries INVALID and hashing (4255) - - fix invalid assertion in ptlrpc_set_wait (3880) - * miscellania - - add libwrap support for 
the TCP acceptor (3996) - - add /proc/sys/portals/routes for non-root route listing (3994) - - allow setting MDS UUID in .xml (2580) - - print the stack of a process that LBUGs (4228) - -2004-07-14 Cluster File Systems, Inc. - * version 1.2.4 - * bug fixes - - don't cleanup request in ll_file_open() on failed MDS open (3430) - - make sure to unset replay flag from failed open requests (3440) - - if default stripe count is 0, use OST count for inode size (3636) - - update parent mtime/ctime on client for create/unlink (2611) - - drop dentry ref in ext3_add_link from open_connect_dentry (3266) - - free recovery state on server during a forced cleanup (3571) - - unregister_reply for resent reqs (3063) - - loop back devices mounting and status check on 2.6 (3563) - - fix resource-creation race that can provoke i_size == 0 (3513) - - don't try to use bad inodes returned from MDS/OST fs lookup (3688) - - more debugging for page-accounting assertion (3746) - - return -ENOENT instead of asserting if ost getattr+unlink race (3558) - - avoid deadlock after precreation failure (3758) - - fix race and lock order deadlock in orphan handling (3450, 3750) - - add validity checks when grabbing inodes from l_ast_data (3599) - * miscellania - - add /proc/.../recovery_status to obdfilter (3428) - - lightweight CDEBUG infrastructure, debug daemon (3668) - - change default OSC RPC parameters to be better on small clusters - - turn off OST read cache for files smaller than 32MB - - install man pages and include them in rpms (3100) - - add new init script for (un)mounting lustre filesystems (2593) - - run chkconfig in %post for init scripts (3701) - - drop scimac NAL (unmaintained) - -2004-06-17 Cluster File Systems, Inc. 
- * version 1.2.3 - * bug fixes - - clean kiobufs before and after use (3485) - - strip trailing '/'s before comparing paths with /proc/mounts (3486) - - remove assertions to work around "in-flight rpcs" recovery bug (3063) - - change init script to fail more clearly if not run as root (1528) - - allow clients to reconnect during replay (1742) - - fix ns_lock/i_sem lock ordering deadlock for kms update (3477) - - don't do DNS lookups on NIDs too small for IP addresses (3442) - - re-awaken ptlrpcd if new requests arrive during check_set (3554) - - fix cond_resched (3554) - - only evict unfinished clients after recovery (3515) - - allow bulk resend, prevent data loss (3570) - - dynamic ptlrpc request buffer allocation (2102) - - don't allow unlinking open directory if it isn't empty (2904) - - set MDS/OST threads to umask 0 to not clobber client modes (3359) - - remove extraneous obd dereference causing LASSERT failure (3334) - - don't use get_cycles() when creating temp. files on the mds (3156) - - hold i_sem when setting i_size in ll_extent_lock() (3564) - - handle EEXIST for set-stripe, set proper directory name (3336) - * miscellania - - servers can dump a log evicting a client - lustre.dump_on_timeout=1 - - fix ksocknal_fmb_callback() error messages (2918) - -2004-05-27 Cluster File Systems, Inc. 
- * version 1.2.2 - * bug fixes - - don't copy lvb into (possibly NULL) reply on error (2983) - - don't deref dentry after dput, don't free lvb on error (2922) - - use the kms to determine writeback rpc length (2947) - - increment oti_logcookies when osc is inactive (2948) - - update client's i_blocks count via lvb messages (2543) - - handle intent open/close of special files properly (1557) - - mount MDS with errors=remount-ro, like obdfilter (2009) - - initialize lock handle to avoid ASSERT on error cleanup (3057) - - don't use cancelling-locks' kms values (2947) - - use highest lock extent for kms, not last one (2925) - - don't dereference ERR_PTR() dentry in error handling path (3107) - - fix thread race in portals_debug_dumplog() (3122) - - create lprocfs device entries at setup instead of at attach (1519) - - common AST error handler, don't evict client on completion race (3145) - - zero nameidata in detach_mnt in 2.6 (3118) - - verify d_inode after revalidate_special is valid in 2.6 (3116) - - use lustre_put_super() to handle zconf unmounts in 2.6 (3064) - - initialize RPC timeout timer earlier for 2.6 (3219) - - don't dereference NULL reply buffer if mdc_close was never sent (2410) - - print nal/nid for unknown nid (3258) - - additional checks for oscc recovery before doing precreate (3284) - - fix ll_extent_lock() error return code for 64-bit systems (3043) - - don't crash in mdc_close for bad permissions on open (3285) - - zero i_rdev for non-device files (3147) - - clear page->private before handing to FS, better assertion (3119) - - tune the read pipeline (3236) - - fix incorrect decref of invalidated dentry (2350) - - provide read-ahead stats and refine rpc in flight stats (3328) - - don't hold journal transaction open across create RPC (3313) - - update atime on MDS at close time (3265) - - close LDAP connection when recovering to avoid server load (3315) - - update iopen-2.6 patch with fixes from 2399,2517,2904 (3301) - - don't leak open file on MDS 
after open resend (3325) - - serialize filter_precreate and filter_destroy_precreated (3329) - - loop device shouldn't call sync_dev() for nul device (3092) - - clear page cache after eviction (2766) - - resynchronize MDS->OST in background (2824) - - refuse to mount the same filesystem twice on same mountpoint (3394) - - allow llmount to create routes for mounting behind routers (3320) - - push lock cancellation to blocking thread for glimpse ASTs (3409) - - don't call osc_set_data_with_check() for TEST_LOCK matches (3159) - - fix rare problem with rename on htree directories (3417) - * miscellania - - allow default OST striping configuration per directory (1414) - - fix compilation for qswnal for 2.6 kernels (3125) - - increase maximum number of MDS request buffers for large systems - - change liblustreapi to be useful for external progs like lfsck (3098) - - increase local configuration timeout for slow disks (3353) - - allow configuring ldlm AST timeout - lustre.ldlm_timeout= - -2004-03-22 Cluster File Systems, Inc. 
- * version 1.2.1 - * bug fixes - - fixes for glimpse AST timeouts / incorrectly 0-sized files (2818) - - don't overwrite extent policy data in reply if lock was blocked (2901) - - drop filter export grants atomically with removal from device (2663) - - del obd_self_export from work_list in class_disconnect_exports (2908) - - don't LBUG if MDS recovery times out during orphan cleanup (2530) - - swab reply message in mdc_close, other PPC fixes (2464) - - fix destroying of named logs (2325) - - overwrite old logs when running lconf --write_conf (2264) - - bump LLOG_CHUNKSIZE to 8k to allow for larger clusters (2306) - - fix race in target_handle_connect (2898) - - mds_reint_create() should take same inode create lock (2926) - - correct journal credits calculated for CANCEL_UNLINK_LOG (2931) - - don't close files for self_export to avoid uninitialized obd (2936) - - allow MDS with the same name as client node (2939) - - hold dentry reference for closed log files for unlink (2325) - - reserve space for all logs during transactions (2059) - - don't evict page beyond end of stripe extent (2925) - - don't oops on a deleted current working directory (2399) - - handle hard links to targets without a parent properly (2517) - - don't dereference NULL lock when racing during eviction (2867) - - don't grow lock extents when lots of conflicting locks (2919) - -2004-03-04 Cluster File Systems, Inc. 
- * version 1.2.0 - * bug fixes - - account for cache space usage on clients to avoid data loss (974) - - lfsck support in lustre kernel code (2349) - - reduce journal credits needed for BRW writes (2370) - - orphan handling to avoid losing space on client/server crashes - - ptlrpcd can be blocked, stopping ALL progress (2477) - - use lock value blocks to assist in proper KMS, faster stat (1021) - - takes i_sem instead of DLM locks internally on obdfilter (2720) - - recovery for initial connections (2355) - - fixes for mds_cleanup_orphans (1934) - - abort_recovery crashes MDS in b_eq (mds_unlink_orphan) (2584) - - block all file creations until orphan recovery completes (1901) - - client remove rq_connection from request struct (2423) - - conf-sanity test_5, proper cleanup in umount log not available (2640) - - recovery timer race (2670) - - mdc_close recovery bug (2532) - - ptlrpc cleanup bug (2710) - - mds timeout on local locks (2588) - - namespace lock held during RPCs (2431) - - handle interrupted sync write properly (2503) - - don't try to handle a message that hasn't been replied to (2699) - - client assert failure during cleanup after abort recovery (2701) - - leak mdc device after failed mount (2712) - - ptlrpc_check_set allows timedout requests to complete (2714) - - wait for inflight reqs when ptlrpcd finishes (2710) - - make sure unregistered services are removed from the srv_list - - reset bulk XID's when resending them (caught by 1138 test) - - unregister_bulk after timeout - - fix lconf error (2694) - - handle write after unfinished setstripe, stripe-only getstripe (2388) - - readahead locks pages, leaves pending causing memory pressure (2673) - - increase OST request buffers to 4096 on large machines (2729) - - fix up permission of existing directories in simple_mkdir (2661) - - init deleted item, add assertions ptlrpc_abort_inflight() (2725) - - don't assign transno to errored transactions (2742) - - don't delete objects on OST if given a bogus objid 
from MDS (2751) - - handle large client PAGE_SIZE readdir on small PAGE_SIZE MDS (2777) - - if rq_no_resend, then timeout request after recovery (2432) - - fix MDS llog_logid record size, 64-bit array alignment (2733) - - don't call usermode_helper from ptlrpcd, DEFAULT upcall (2773) - - put magic in mount.lustre data, check for bad/NULL mount data (2529) - - MDS recovery shouldn't delete objects that it has given out (2730) - - if enqueue arrives after completion, don't clobber LVB (2819) - - don't unlock pages twice when trigger_group_io returns error (2814) - - don't deref NULL rq_repmsg if ldlm_handle_enqueue failed (2822) - - don't write pages to disk if there was an error (1450) - - don't ping imports that have recovery disabled (2676) - - take buffered bytes into account when balancing socknal conn (2817) - - hold a DLM lock over readdir always, use truncate_inode_pages (2706) - - reconnect unlink llog connection after MDS reconnects to OST (2816) - - remove little-endian swabbing of llog records (1987) - - set/limit i_blksize to LL_MAX_BLKSIZE on client (2884) - - retry reposting request buffers if they fail (1191) - - grow extent at grant time to avoid granting a revoked lock (2809) - - lock revoke doesn't evict page if covered by a second lock (2765) - - disable VM readahead to avoid reading outside lock extents (2805) - * miscellania - - return LL_SUPER_MAGIC from statfs for the filesystem type (1972) - - updated kernel patches for hp-2.4.20 kernel (2681) - -2004-02-07 Cluster File Systems, Inc. 
- * version 1.0.4 - * kernel patches - - fix truncated write corruption (2366) - - fix for failed assertion in iopen_connect_dentry (1792,2517) - * bug fixes - - don't flag the ptlrpcd thread with PF_MEMALLOC (2636) - - ensure len(uuid) < 37 in lmc (1171) - - fix ia64 OOPS in llog_test (2255) - - zero end of page at obdfilter for partial page writes (2648) - - don't leave stale dentries around after renames (bug 2428) - - fix timeouts when evicting a client with a single lock held (2642) - - set deadline for the initial HELLO message to drain (2634) - - print out dotted-quad IP addresses in the socknal (2302) - * miscellania - - additional debugging for MDS client eviction problem (2443) - - fix mkfsoptions support for osts (2603, 2604) - -2004-01-27 Cluster File Systems, Inc. - * version 1.0.3 - * kernel patches - - add series for the vanilla 2.6.0 kernel - - add series for the vanilla 2.4.24 kernel - - add series for a cray x86/64 UL kernel drop - - fix xattr patches for the vanilla 2.4.19 series - * bug fixes - - generate true UUIDs in lmc (1171) - - have portals stack dumping break in UML (2466) - - avoid bad dchild deref; avoid inum lock w/o creation (2362) - - allocate with _NOFS in ldlm to avoid deadlock (1933) - - wake callback waiting threads on client eviction (2460) - - Add --ptldebug and --subsystem to lmc (1719) - - update assertion to allow safe interrupt allocation - - set rq_no_resend for cancel requests (2432) - - recalculate ptlrpcd timeout after resend (2494) - - call vfs_rmdir when removing pending directories (2368) - - fix renaming a file to itself (2429) - - lmc creates a default one-stripe lov (2454) - - expand procfs space to handle large clusters (2326) - - increase UML stack to avoid overflow - - update lconf's list of debug and subsystem masks - - fix lfs find --obd (2510) - - /proc tunable for disabling filter read caching (2591) - - stop rpm packages from altering slapd.conf (2301) - - disable nagle in the socknal under 0conf (2578) - 
- choose mds inode size based on stripe count (2572) - - fix kernel-source rpm problems (2516) - * miscellania - - add --disable-doc to avoid pdf generation (2421) - - update documentation, tests, type-os, comments - - avoid format warnings on ia64 - - remove the TOE NAL - - tiny code cleanups by removing unused fields - -2004-01-07 Cluster File Systems, Inc. - * version 1.0.2 - * bug fixes - - fix obvious semaphore misuse in as-yet-unused setattr path (2348) - - remove the most blatant lies from BUILDING file (2371) - - change default debug level to reasonable production setting - - reduce client side cache size to reduce cache flush time - - reduce max RPCs in flight to avoid unnecessary file fragmentation - - make TCP zerocopy and pinger support enabled by default (2476) - - sync writes completed after process exits caused crashes (2319) - - maintain correct mount count on the MDS (2356) - - backout 1557, because 2316 wasn't really fixed - - better file I/O statistics gathering in /proc - - don't take unnecessary, deadlock-inducing bug in readpage (2383) - - another kernel patch to fix zero-copy TCP function export - - don't take duplicate lock when processing re-sent getattr (2420) - - lctl uses obd_self_export instead of creating new conn (2353) - - MDS/OST recovery case which requires object creation asserted (2425) - - move lfs from /usr/sbin to /usr/bin in packages - - fix race between mds_client_add and mds_client_free (2417) - - use kmalloc instead of slabs in portals (2430) - - don't create duplicate records when a failover MDS is present (2442) - - remove unnecessary mount age check (2332) - - don't remove directory inodes from locks prematurely (2451) - - don't break if MDS service name is the same as hostname (2103) - - fix races in client write RPC generation when cache full (2482) - -2003-12-13 Cluster File Systems, Inc. 
- * version 1.0.1 - * bug fixes - - remove now-unused request->rq_obd (278) - - if an allocation fails, print out how much memory we've used (1933) - - use PORTAL_SLAB_ALLOC for structures, to get GFP_MEMALLOC (1933) - - add the "configurable stack size" patch to most series files (1256) - - ability to write large log records, for 100+ OST configs (2306) - - fix NULL deref when filter_prep fails (2314) - - fix operator precedence error in filter_sync - - dynamic allocation of socknal TX descriptors (2315) - - fix a missed case in the GFP_MEMALLOC patch, can cause deadlock (2310) - - fix gcc 2.96 compilation problem in xattr kernel patch (2294) - - ensure that CWARN messages in Portals always get to the syslog - - __init/__exit are not for prototype decls (ldlm_init/exit) - - x86-64 compile warning fixes - - fix gateway LMC keyword conflict (2318) - - fix MDS lock inversions in getattr/reint paths (1844) - - fix a rare lock re-ordering bug, which caused deadlock (2322) - - fix i_sem/journal inversion in fsfilt_ext3_write_record (2306) - - DLM race condition prevented some lock evictions (2328) - - ENOMEM detection and retry on socknal sends (2230) - - use GFP_NOFS throughout Lustre, to combat ENOMEM (2230) - - move osc_rpcd into ptlrpc, for use in MDC and others (2329) - - protect MDS inode fsdata with stronger locking; fixes assertion (2313) - - better error messages when a client is rejected during recovery (1505) - - avoid cancelling locks which were never granted, after failure (2330) - - fix i_sem/journal inversion in mds_client_add (2333) - - fix truncate/getattr lock cycle deadlock (2334) - - use rpcd to send close; allows resend after timeout, avoid leak (1897) - - fix two rare exit paths which could leak an l_lock() ref (2321) - - fencepost error in MDS/OST orphan recovery (2226) - - make log record alignment 8 bytes (1988) - - lstripe now fails when requested offset > ost_count (2237) - - ensure that all kernel series have a complete list.h (1607) - - fix 
crashes in special-file operations (2316) - - lctl create/brw OID mismatch, caused by obsolete filter loop (2339) - * miscellania - - allow configurable automake binary, for testing new versions - - small update to the lfs documentation - -2003-12-03 Cluster File Systems, Inc. - * version 1.0.0 - * fix negative export reference count in fsfilt_sync (2312) - -2003-12-01 Cluster File Systems, Inc. - * release candidate 0.9.1 - * bug fixes - - orphans are moved into the PENDING directory for possible recovery - - replayed opens now open by fid for orphan/rename safety (1042) - - last close of an orphan inode generates a transno (683) - - chdir() and mount() now pin the directory entry (1020) - - avoid CERROR in normal ll_setattr_raw() error case (1500) - - discard very old requests without processing them (1502) - - remove some common, well-understood CERRORs (1505) - - require O_DIRECT I/O to be page-sized to workaround IA64 crash (1609) - - clear "grant" flags in OST replies until OST grant code lands (1644) - - fix read performance by not clobbering i_blksize on client (1598) - - fix __ldlm_handle2lock oops by not dereferencing lock after PUT (1625) - - make LRU size a /proc tunable, clears locks when reduced (707) - - fix some lprocfs rot that prevented ptlbd from loading (1732) - - server locks take references on exports now (1558) - - build fixes for 2.4.20-rh trees (1663) - - return an error from lov_create if all OSCs are inactive (1751) - - fix import levels when a reconnect happens without a timeout (1597) - - exit early from mds_open if we get a lookup error (1749) - - partial page read at EOF wouldn't wait for disk before sending (1642) - - avoid NULL deref in obdfilter when reading page past EOF (1592) - - avoid LASSERT in ll_intent_lock if server failed very early (1090) - - fix LBUG in ll_it_open_error with rc = -2 (1861) - - write/truncate lock inversion (1639) - - Don't auto-load obdclass, portals modules during cleanup (1495) - - fix timestamps from 
jumping to "now" (1763) - - extra journal assertions (1648) - - add an extra multiunlink test (1771) - - fix read_record/write_record API (1776) - - fix leak of offset_extent, possible incorrect i_size later (1772) - - fix lasserts in mis-matched transnos during open-unlink testing (1541) - - Debugging for the kqswnal_get_idle_tx problems (1820) - - Allow recovery to be attempted multiple times (1536) - - Write out MDS last_rcvd file after it is first created (1600) - - Fix tx_descriptor leak in failed transmit situations (1827) - - ext3 journaling fixes for assertion failure after IO error (1871) - - class_export_put() on freed export after completion AST error (1896) - - Fix revalidate looping in VFS (1322) - - Don't access a freed export during MDS_REINT timeout (1521) - - Add open-unlink recovery support on the MDS (1673,1764) - - Return an error if no MDS data was read from last_rcvd (1946) - - Fix for lookup "." or ".." crash on error (1932,1931,1935) - - Don't setup a disk device that doesn't match exported UUID (317) - - Reduce bulk RPC timeout to avoid cascading client/OST failures (1845) - - avoid committing NULL handle in force close - - local.sh is now a one-stripe LOV configuration - - POSIX utime.4 -EPERM on FIFO not owned by user (56) - - fix ext3 htree duplicate directory entry corruption (1516) - - POSIX creat.13, fstat.1, open.18, stat.3 new file atime/mtime (2020) - - update to new LOV EA format (2097) - - interoperability for different PAGE_SIZE/wordsize (686,1821,1343,2042) - -2003-06-15 Phil Schwan - * version v0_7 - * bug fixes - - imports and exports cleanup too early, need refcounts (349, 879, 1045) - - per-import/export recovery handling (958, 931, 959) - - multiple last-rcvd slots, for serving multiple FSes (949) - - connections are again shared between multiple imp/exports (963, 964) - - "umount -f" would hang if any requests needed to be sent (393, 978) - - avoid pinning large req buffer by copying for queued messages (989) - - add 
"uuid" to "lctl device" command to help upcalls (991) - - "open" RPCs with transnos would confuse recovery counters (1037) - - do proper endian conversion of all wire messages (288, 340, 891) - - remove OST bulk get LBUGs, fix ost_brw_write cleanup (1126) - - call waiting locks callback from LDLM recovery thread (1127, 1151) - - fix ptlrpc_connection leak in target_handle_connect (1174) - - fix import refcounting bug in OST and MDS cleanup (1134) - - if an invalid-at-open-time OSC returned before close(), LBUG (1150) - - fix very unlikely obd_types race condition (501) - - remove osc_open hack for echo_client (1187) - - we leaked exports/dlmimps for forcibly disconnected clients (1143) - - a failure in read_inode2 leads to deadlock (1139) - - cancel ack-locks as soon as transaction is committed (1072) - - fix major leaks and crashes in the bulk I/O path (937, 1057) - - make sure to commitrw after any preprw to avoid deadlock (1162) - - failing to execute a file in a lustre FS would lock inode (1203) - - small DEBUG_REQ fix to avoid dereferencing a NULL (1227) - - don't ASSERT while cleaning up an incompletely-setup obd (1248) - - obd_uuid2tgt would walk off the end of the list (1255) - - on IA64 the osc would give portals incorrect bulk size (1258) - - fix debug daemon ioctl interface; allows daemon on ia64 (1274) - - fix lock inversion caused by new llite matching code (1282) - - limit the number of dirty pages on a client to 10MB (1286) - - timed out locks were not being correctly cancelled (1289) - - fix O_DIRECT above 4GB on IA-32 (1292) - * major user-visible changes - - fail out/fail over policy now controlled by the upcall (993) - * protocol changes - - add OBD_PING to check server availability and failure (954) - - lustre messages are now sent in sending host order (288, 340, 891) - - add eadatalen to MDS getattr reply (340) - - OST read replies may contain second buffer, with per-page status (593) - -2003-03-11 Phil Schwan - * version v0_6 - * bug fixes - 
- LDLM_DEBUG macro fix, for gcc 3.2 (850) - - failed open()s could cause deadlock; fixed (867, 869) - - stop cancelling OST locks when files are closed (481) - - overlapping XID spaces caused network corruption (851, 853) - - fix unsafe fsfilt counter arithmetic; change to atomic_t - - setattr_raw added, to do single-RPC, server-side setattrs - - lmc/lconf syntax change for OST UUIDs - - fix crashy race condition between ptlrpc_free_req and osc_close - - don't use request in mdc_enqueue if we hit a timeout (889) - - don't set the inode i_size for regular files from the MDS (896) - - handle out of order completion AST (842) - - don't LBUG if a lock request times out after receiving AST (913) - - avoid d_rehash race in ll_find_alias by rehashing inside dcache_lock - - if a bad lock AST arrives, send an error instead of dropping entirely - - return 0 from revalidate2 if ll_intent_lock returns -EINTR (912) - - fix leak in bulk IO when only partially completed (899, 900, 926) - - fix O_DIRECT for ia64 (55) - - (almost) eliminate Lustre-kernel-thread effects on load average (722) - - C-z after timeout could hang a process forever; fixed (977) - * Features - - client-side I/O cache (678, 924, 929, 941, 970) - * protocol changes - - READPAGE and SETATTRs which don't take server-side locks get - their own portal - -2003-02-11 Phil Schwan - * version v0_5_20 - * bug fixes - - Fix ldlm_lock_match on the MDS to avoid matching remote locks (592) - - Fix fsfilt_extN_readpage() to read a full page of directory - entries, or fake the remainder if PAGE_SIZE != blocksize (500) - - Avoid extra mdc_getattr() in ll_intent_lock when possible (534, 604) - - Fix imbalanced LOV object allocation and out-of-bound access (469) - - Most intent operations were removed, in favour of a new RPC mode - that does a single RPC to the server and bypasses most of the VFS - - All LDLM resource ID arrays were removed in favour of ldlm_res_id - - Aggressively cancel local locks on DLM servers - - 
mds_reint_unlink sends EA to the client if it's the last nlink. - client uses that EA to unlink OST objects. - - mds_reint_{rename,unlink,link} were rewritten to take ordered locks - - recursive symlinks were fixed (439) - - fixed NULL deref in DEBUG_REQ - - filter_update_lastobjid no longer calls sync, which annoyed extN - - fixed multi-client small-writes to a single file problem (445) - - fixed mtime updates during file writes (607) - - fixed vector writes on obdfilter causing problems when ENOSPC (670) - - fixed bug in obd_brw_read/write() (under guise of testing 367) - - fixed Linux OST size reporting problem (444, 656) - - OST now updates object mtime with writes or setattr (607, 619) - - client verifies file size before zeroing page past EOF (445) - - OST now writes last allocated objid to disk with allocation (108) - - LOV on echo now works (409) - * protocol changes - - mds_reint_unlink sends a new buffer, with the EA included. this - buffer is only valid if body->valid & OBD_MD_FLEASIZE, which is only - set if a regular file was being unlinked, and it was the last link - - use PtlGet from the target for bulk writes (315) - - OST now updates object mtime with writes or setattr (607, 619) - - LDLM now has a grant-time callback to revalidate locked items, if - necessary (604) - - Many MDS operations were reorganized to combat race conditions - * other changes - - Merge b_intel branch (updated lprocfs code) - now at /proc/fs/lustre - - configure check to avoid gcc version 2.96 20000731-2.96-98) (606) - -2003-01-06 Andreas Dilger - * version v0_5_19 - * bug fixes - - Fully reactivate OST imports after reconnection (512, others) - - Make sure client sees our -ENOTCONN from mds_handle (513 - partial) - - More graceful error handling for truncating on dead OST (515) - - Don't error out unless we're actually accessing dead stripes (474) - - Fix garbage sizes when stripes are missing (410) - - LRU counters were broken, causing constant lock purge (433, 432) - - 
garbage on read from stripes with failed OSTs (441) - - mark OSCs as active before reconnecting during recovery (438) - - lov_enqueue and lov_cancel need to handle inactive OSTs (403) - - lfind did not preserve OST order in output (443) - - symlinks cause hung clients, incorrect data (439) - - stop dereferencing request after dropping refcount (457) - - don't LASSERT(spin_is_locked) on non-SMP (455) - - fixes for many rename() bugs - - fstat didn't correctly synchronize attributes (399) - - server must handle lock cancellation during blocking AST prep (487) - - bulk descriptors were free()d too soon (511) - - fix paths in lconf, which would load incorrect modules (451, 507) - - fix confusing lconf 'host not found' error message (386) - - fix lock order deadlock on OST (O/R i_sem before journal ops, 478) - - fix race condition in mdc_blocking_ast() for inode access (526) - - fix lov_unpackmd() unpacking wrong number of stripes (537) - - fix lov_set_osc_active() marking wrong OSC inactive (440) - - fix bad lstripe lov_unpackmd() assertion (fix layering too) (527) - - fix multiple writes of stripe MD to MDS (358, maybe 519) - - fix lstripe in several ways (kernel side) (527) - - fix request leak in ldlm_cli_enqueue (262) - - incorrect OSC was marked inactive after OST failure - - call mds_fs_cleanup before unmounting filesystem (524) - - fix races between taking ns_lock and ldlm_lock_change_resource - - fix races updating LOV export open file list - - fix lov_enqueue error path, avoid decref-ing bad lock handle - - fix recovery NULL deref in ldlm_cli_cancel_unused - - fix some DLM races by using new hash table for lock handles (419) - - permit the client to specify desired inodes, at replay - - duplicate requests when we queue them for replay reintegration - - fix last_rcvd offset calculation - - sync after each recovered transaction, so we always make progress - - never, not always, ERESTART requests without transnos - - store the lov_desc in the MDS, so we don't 
depend on getlovinfo to - set it - - skip replay if the MDS says that the client is already connected - - don't check for a recovery-enabled export to match lctl's UUID - - don't INC_USE_COUNT for phantom exports - - don't crash when cleaning up phantom exports (567) - - don't double-finish or set replay data for errored mdc_open requests - - abort requests when they time out, so we don't get old replies - - send/receive replies for AST messages again - - if the client says that it doesn't have the lock, cancel it on the - server - - if we timeout during I/O, don't try to cancel an in-use lock; instead - mark it as destroyed, it will all work out when decref is called - - fix module use counts (22, 581) - * protocol changes - - ASTs now expect a reply (server cancels lock on error reply) - -2002-12-02 Andreas Dilger - * version v0_5_18 - * bug fixes - - fix many simultaneous client startup (392) - - fix dentry->d_it clobbering - - credentials weren't being shipped for readdir/getattr operations - - remove invalid assertions triggered during some concurrent MD - updates - - proper Lustre versions added (336, 389) - - fix memory leak for create error case (398) - - fix LOV locking bug that would get cli/srv out of sync - - fix echo client over LOV (409) - - fix dbench 2, extN refcount problem (170, 258, 356, 418) - - fix double-O_EXCL intent crash (424) - - avoid sending multiple lock CANCELs (352) - * Features - - MDS can do multi-client recovery (modulo bugs in new code) - * Documentation - - many updates, edits, cleanups - -2002-11-18 Phil Schwan - * version v0_5_17 - * bug fixes - - fix null d_it dereference (346) - - fix full OST/dbench hang (333) - - fix permission problem with file removal (286) - - fix removal of OSCs from LOV when they fail - - fix NULL deref during bulk timeout (214) - - fix problems related to multiple filesystems on one MDS (241) - - fixed serious subtle metadata locking bugs - - free locks on clients when inodes are removed due to memory 
- pressure (201) - - fix inode pointer in lock data (285) - - partial support for multiple MDS on a single host (241) - - data locks weren't cancelled at clear_inode time (290, 311) - - intent locks could lead to unbounded lock growth (205) - - added a maximum lock count, an LRU list, and a flusher - - fix multiple rename (365) - - properly abstracted the echo client - - OSC locked 1 byte too many; fixed - - rewrote brw callback code: - - fixed recovery bugs related to LOVs (306) - - fixed too-many-pages-in-one-write crash (191) - - fixed (again) crash in sync_io_timeout (214) - - probably fixed callback-related race (385) - * protocol change - - Add capability to MDS protocol - - LDLM cancellations and callbacks on different portals - -2002-10-28 Andreas Dilger - * version v0_5_16 - * bug fixes: - - limit client IOV size to PTL_MD_MAX_IOV (611336, 191) - - defer open object destruction to close time (601981, 138) - - open/close OST file handle in obdo (OBD_MD_FLHANDLE) (601981, 138) - - move LDLM_ENQUEUE/CONVERT back to MDS portal (625069) - - abstract ll_lookup2, fix ll_revalidate2 to use abstraction (256) - - don't call obd_setattr in ll_file_release for destroyed objects - * protocol change to lustre_msg: move |version| and add |flags| - * protocol change to osc_punch: "start" in "o_size", "end" in "o_blocks" - * lock replay: for LDLM_FL_REPLAY trust client to do right thing - * added replay of create, unlink, link and rename operations during - MDS failover; recovery should be much more robust now - * remove failed OSCs from LOVs (only lov_create uses this so far) - * the lustre-HOWTO was brought (more) up to date (582544) - -2002-10-23 Phil Schwan - * version v0_5_15 - * bug fixes: - - in-use dentries weren't being reused properly (617851) - - prevent multiple LDLM setup (599178) - - fix LOV size calculations for truncate (617853) - - fix client handling of MDS intent errors (POSIX) - - fix permission bug in lovstripe.c test (624321) - - fix MDS thread 
deadlock - move LDLM handler to DLM portal (625069) - - truncate past end of file could corrupt data - - proper cleanup after timeouts, crashes, etc (592524, 550815) - - a race in recovery could return ETIMEDOUT to apps (623947) - - building outside the source directory was fixed - * the lustre-HOWTO was brought (more) up to date (582544) - * major progress was made on recovery functionality - -2002-10-10 Phil Schwan - * version v0_5_14 - * bug fixes: - - recovery deadlock fix - - rm -rf causes LBUG fix (617817) - - file open by multiple tasks fix (618962) - - directory permissions bugs (602707 and 620007) - - journal_stop fixed with locking (611313) - - O_APPEND failures resolved (618273, perhaps 614459) - - lconf PATH fix (619770) - - IA64 build fix (621450) - - RPC buffer sizes scale with amount of memory - -2002-10-01 Phil Schwan - * version v0_5_13 - * bug fixes: - - locks would be cancelled without throwing away data pages, - resulting in inconsistent data (605627) - - inode attributes were not always being refreshed (605627, 612449) - - lconf now continues to cleanup after lctl reports an error - - MDS now enforces user permissions (602707) - - lprocfs cleanup fixed, but not yet enabled (614157) - - fixed infinite server hang, should a client not respond to an AST - - avoid going into recovery if user calls readlink() with a buffer - that's too small (613941) - - AST RPCs no longer require replies (614867) -- this may be changed - - don't crash server if client sends an IOV that's too big (611336) - - fixed lock conversion deadlock (611892) - - fixed the following of symlinks (614622) - * recovery: the server can remove locks from a client that dies, other - clients can make progress - * more extN patch fixes - * compile-time configurable ptlrpc buffer allocations - * documentation - - collaborative read cache document - - Lustre Lite Performance CDR document-in-progress - -2002-09-20 Andreas Dilger - * version v0_5_12 - * bug fix - - fix typo in 
patch-2.4.18 - -2002-09-20 Andreas Dilger - * version v0_5_11 - * bug fixes - - clear ptlrpc request each time in handle_incoming_request() - - unlink of files now destroys the object on the OST - -2002-09-19 Peter Braam - * version 0_5_10 - * add hard link support - * change obdfile creation method - * kernel patch changed - -2002-09-19 Peter Braam - * version 0_5_9 - * bug fix - - stack overflow bug in extN fixed - -2002-09-18 Andreas Dilger - * version 0_5_8 - * documentation updates - - add man pages for config tools - - update tests/README to describe testing with new config tools - - finish metadata API descriptions - * bug fixes and cleanups - - statfs workaround for 16TB limit - - LOV stripe allocation improved, can stripe on subset of OSTs - - LOV file size/IO offset was wrong for files > 4GB in size - - object EA data was being dropped, caused files to be unreadable - - memory overflow with non-LOV OST caused memory corruption - - fixed regression tests to work with new config tools, obdfilter - - fixed bug when directory size became larger than 1 block - - fixed bug (for single client case) when PWD was deleted - - invalidate local directory pages when doing intent-based ops - - avoid LDLM oops when lock callback contained bad data - -2002-09-09 Andreas Dilger - * version 0_5_7 - * documentation updates - * bug fixes and cleanups - - configuration tools - - LOV - - imports/exports - - 64-bit compile warnings - - 64-bit internal statfs data - - many more - * test_brw on persistent OST devices - * MDS recovery - * lprocfs (disabled) - -2002-09-04 Andreas Dilger - * version 0_5_6 - * documentation updates - * bug fixes and cleanups - * configuration tools - -2002-08-30 Peter J. Braam - - * version v0_5_5 - * many small fixes to 0_5_4 - * io/network handling - * thinkos in MDS operations - -2002-08-24 Peter J. 
Braam - - * version v0_5_4 - * crucial basic fixes to 0.5.3 - * IOR, Iozone work over Elan - * EOF locks added - -2002-08-07 Phil Schwan - * version 0_5_3, our first alpha - * we use the new Portals iovs - * documentation updates - * bug fixes and cleanups - * small changes in the DLM wire protocol - -2002-07-25 Peter J. Braam - * version 0_5_1 with some initial stability, - * locking on MD and file I/O. - * documentation updates - * several bug fixes since 0.5.0 - * small changes in wire protocol - -2002-07-18 Phil Schwan - * version v0_4_5 - * delivered as Lustre Light Alpha - * fixed a crash after handling invalid MDS requests - * fixed directory pages for architectures with non-4k page sizes - -2002-07-11 Andreas Dilger - * release version v0_4_4 - * Moves TCP acceptor to be on port 2432 (unused Coda port) instead - of 1234. - * Fixes a number of interruption problems with OST operations. - * Update documentation for portals header changes - * Move all wire protocol structs/defines to lustre_idl.h - * Fixes symlink length bug. - * Add tcpdump to repository. - -2002-07-05 Andreas Dilger - * release version v0_4_3 - * Fixes statfs for inodes on extN. - * Fixes bug in runtests which would delete /etc/hosts. - * Use 64-bit object IDs wherever possible (not into VFS though) - Remove ost_get_info, which is unused by lustre, and out of date. - -2002-07-03 Peter Braam - * release version v0_4_2 Fixes a lookup error (type not passed) - * move forward to head of Portals - * move forward to latest Lustre kernel - -2002-06-25 Peter Braam - * release version v0_4_1. Hopefully stable on single node use. diff --git a/lustre/FDL b/lustre/FDL deleted file mode 100644 index b42936b..0000000 --- a/lustre/FDL +++ /dev/null @@ -1,355 +0,0 @@ - GNU Free Documentation License - Version 1.1, March 2000 - - Copyright (C) 2000 Free Software Foundation, Inc. 
- 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - -0. PREAMBLE - -The purpose of this License is to make a manual, textbook, or other -written document "free" in the sense of freedom: to assure everyone -the effective freedom to copy and redistribute it, with or without -modifying it, either commercially or noncommercially. Secondarily, -this License preserves for the author and publisher a way to get -credit for their work, while not being considered responsible for -modifications made by others. - -This License is a kind of "copyleft", which means that derivative -works of the document must themselves be free in the same sense. It -complements the GNU General Public License, which is a copyleft -license designed for free software. - -We have designed this License in order to use it for manuals for free -software, because free software needs free documentation: a free -program should come with manuals providing the same freedoms that the -software does. But this License is not limited to software manuals; -it can be used for any textual work, regardless of subject matter or -whether it is published as a printed book. We recommend this License -principally for works whose purpose is instruction or reference. - - -1. APPLICABILITY AND DEFINITIONS - -This License applies to any manual or other work that contains a -notice placed by the copyright holder saying it can be distributed -under the terms of this License. The "Document", below, refers to any -such manual or work. Any member of the public is a licensee, and is -addressed as "you". - -A "Modified Version" of the Document means any work containing the -Document or a portion of it, either copied verbatim, or with -modifications and/or translated into another language. 
- -A "Secondary Section" is a named appendix or a front-matter section of -the Document that deals exclusively with the relationship of the -publishers or authors of the Document to the Document's overall subject -(or to related matters) and contains nothing that could fall directly -within that overall subject. (For example, if the Document is in part a -textbook of mathematics, a Secondary Section may not explain any -mathematics.) The relationship could be a matter of historical -connection with the subject or with related matters, or of legal, -commercial, philosophical, ethical or political position regarding -them. - -The "Invariant Sections" are certain Secondary Sections whose titles -are designated, as being those of Invariant Sections, in the notice -that says that the Document is released under this License. - -The "Cover Texts" are certain short passages of text that are listed, -as Front-Cover Texts or Back-Cover Texts, in the notice that says that -the Document is released under this License. - -A "Transparent" copy of the Document means a machine-readable copy, -represented in a format whose specification is available to the -general public, whose contents can be viewed and edited directly and -straightforwardly with generic text editors or (for images composed of -pixels) generic paint programs or (for drawings) some widely available -drawing editor, and that is suitable for input to text formatters or -for automatic translation to a variety of formats suitable for input -to text formatters. A copy made in an otherwise Transparent file -format whose markup has been designed to thwart or discourage -subsequent modification by readers is not Transparent. A copy that is -not "Transparent" is called "Opaque". - -Examples of suitable formats for Transparent copies include plain -ASCII without markup, Texinfo input format, LaTeX input format, SGML -or XML using a publicly available DTD, and standard-conforming simple -HTML designed for human modification. 
Opaque formats include -PostScript, PDF, proprietary formats that can be read and edited only -by proprietary word processors, SGML or XML for which the DTD and/or -processing tools are not generally available, and the -machine-generated HTML produced by some word processors for output -purposes only. - -The "Title Page" means, for a printed book, the title page itself, -plus such following pages as are needed to hold, legibly, the material -this License requires to appear in the title page. For works in -formats which do not have any title page as such, "Title Page" means -the text near the most prominent appearance of the work's title, -preceding the beginning of the body of the text. - - -2. VERBATIM COPYING - -You may copy and distribute the Document in any medium, either -commercially or noncommercially, provided that this License, the -copyright notices, and the license notice saying this License applies -to the Document are reproduced in all copies, and that you add no other -conditions whatsoever to those of this License. You may not use -technical measures to obstruct or control the reading or further -copying of the copies you make or distribute. However, you may accept -compensation in exchange for copies. If you distribute a large enough -number of copies you must also follow the conditions in section 3. - -You may also lend copies, under the same conditions stated above, and -you may publicly display copies. - - -3. COPYING IN QUANTITY - -If you publish printed copies of the Document numbering more than 100, -and the Document's license notice requires Cover Texts, you must enclose -the copies in covers that carry, clearly and legibly, all these Cover -Texts: Front-Cover Texts on the front cover, and Back-Cover Texts on -the back cover. Both covers must also clearly and legibly identify -you as the publisher of these copies. The front cover must present -the full title with all words of the title equally prominent and -visible. 
You may add other material on the covers in addition. -Copying with changes limited to the covers, as long as they preserve -the title of the Document and satisfy these conditions, can be treated -as verbatim copying in other respects. - -If the required texts for either cover are too voluminous to fit -legibly, you should put the first ones listed (as many as fit -reasonably) on the actual cover, and continue the rest onto adjacent -pages. - -If you publish or distribute Opaque copies of the Document numbering -more than 100, you must either include a machine-readable Transparent -copy along with each Opaque copy, or state in or with each Opaque copy -a publicly-accessible computer-network location containing a complete -Transparent copy of the Document, free of added material, which the -general network-using public has access to download anonymously at no -charge using public-standard network protocols. If you use the latter -option, you must take reasonably prudent steps, when you begin -distribution of Opaque copies in quantity, to ensure that this -Transparent copy will remain thus accessible at the stated location -until at least one year after the last time you distribute an Opaque -copy (directly or through your agents or retailers) of that edition to -the public. - -It is requested, but not required, that you contact the authors of the -Document well before redistributing any large number of copies, to give -them a chance to provide you with an updated version of the Document. - - -4. MODIFICATIONS - -You may copy and distribute a Modified Version of the Document under -the conditions of sections 2 and 3 above, provided that you release -the Modified Version under precisely this License, with the Modified -Version filling the role of the Document, thus licensing distribution -and modification of the Modified Version to whoever possesses a copy -of it. In addition, you must do these things in the Modified Version: - -A. 
Use in the Title Page (and on the covers, if any) a title distinct - from that of the Document, and from those of previous versions - (which should, if there were any, be listed in the History section - of the Document). You may use the same title as a previous version - if the original publisher of that version gives permission. -B. List on the Title Page, as authors, one or more persons or entities - responsible for authorship of the modifications in the Modified - Version, together with at least five of the principal authors of the - Document (all of its principal authors, if it has less than five). -C. State on the Title page the name of the publisher of the - Modified Version, as the publisher. -D. Preserve all the copyright notices of the Document. -E. Add an appropriate copyright notice for your modifications - adjacent to the other copyright notices. -F. Include, immediately after the copyright notices, a license notice - giving the public permission to use the Modified Version under the - terms of this License, in the form shown in the Addendum below. -G. Preserve in that license notice the full lists of Invariant Sections - and required Cover Texts given in the Document's license notice. -H. Include an unaltered copy of this License. -I. Preserve the section entitled "History", and its title, and add to - it an item stating at least the title, year, new authors, and - publisher of the Modified Version as given on the Title Page. If - there is no section entitled "History" in the Document, create one - stating the title, year, authors, and publisher of the Document as - given on its Title Page, then add an item describing the Modified - Version as stated in the previous sentence. -J. Preserve the network location, if any, given in the Document for - public access to a Transparent copy of the Document, and likewise - the network locations given in the Document for previous versions - it was based on. These may be placed in the "History" section. 
- You may omit a network location for a work that was published at - least four years before the Document itself, or if the original - publisher of the version it refers to gives permission. -K. In any section entitled "Acknowledgements" or "Dedications", - preserve the section's title, and preserve in the section all the - substance and tone of each of the contributor acknowledgements - and/or dedications given therein. -L. Preserve all the Invariant Sections of the Document, - unaltered in their text and in their titles. Section numbers - or the equivalent are not considered part of the section titles. -M. Delete any section entitled "Endorsements". Such a section - may not be included in the Modified Version. -N. Do not retitle any existing section as "Endorsements" - or to conflict in title with any Invariant Section. - -If the Modified Version includes new front-matter sections or -appendices that qualify as Secondary Sections and contain no material -copied from the Document, you may at your option designate some or all -of these sections as invariant. To do this, add their titles to the -list of Invariant Sections in the Modified Version's license notice. -These titles must be distinct from any other section titles. - -You may add a section entitled "Endorsements", provided it contains -nothing but endorsements of your Modified Version by various -parties--for example, statements of peer review or that the text has -been approved by an organization as the authoritative definition of a -standard. - -You may add a passage of up to five words as a Front-Cover Text, and a -passage of up to 25 words as a Back-Cover Text, to the end of the list -of Cover Texts in the Modified Version. Only one passage of -Front-Cover Text and one of Back-Cover Text may be added by (or -through arrangements made by) any one entity. 
If the Document already -includes a cover text for the same cover, previously added by you or -by arrangement made by the same entity you are acting on behalf of, -you may not add another; but you may replace the old one, on explicit -permission from the previous publisher that added the old one. - -The author(s) and publisher(s) of the Document do not by this License -give permission to use their names for publicity for or to assert or -imply endorsement of any Modified Version. - - -5. COMBINING DOCUMENTS - -You may combine the Document with other documents released under this -License, under the terms defined in section 4 above for modified -versions, provided that you include in the combination all of the -Invariant Sections of all of the original documents, unmodified, and -list them all as Invariant Sections of your combined work in its -license notice. - -The combined work need only contain one copy of this License, and -multiple identical Invariant Sections may be replaced with a single -copy. If there are multiple Invariant Sections with the same name but -different contents, make the title of each such section unique by -adding at the end of it, in parentheses, the name of the original -author or publisher of that section if known, or else a unique number. -Make the same adjustment to the section titles in the list of -Invariant Sections in the license notice of the combined work. - -In the combination, you must combine any sections entitled "History" -in the various original documents, forming one section entitled -"History"; likewise combine any sections entitled "Acknowledgements", -and any sections entitled "Dedications". You must delete all sections -entitled "Endorsements." - - -6. 
COLLECTIONS OF DOCUMENTS - -You may make a collection consisting of the Document and other documents -released under this License, and replace the individual copies of this -License in the various documents with a single copy that is included in -the collection, provided that you follow the rules of this License for -verbatim copying of each of the documents in all other respects. - -You may extract a single document from such a collection, and distribute -it individually under this License, provided you insert a copy of this -License into the extracted document, and follow this License in all -other respects regarding verbatim copying of that document. - - -7. AGGREGATION WITH INDEPENDENT WORKS - -A compilation of the Document or its derivatives with other separate -and independent documents or works, in or on a volume of a storage or -distribution medium, does not as a whole count as a Modified Version -of the Document, provided no compilation copyright is claimed for the -compilation. Such a compilation is called an "aggregate", and this -License does not apply to the other self-contained works thus compiled -with the Document, on account of their being thus compiled, if they -are not themselves derivative works of the Document. - -If the Cover Text requirement of section 3 is applicable to these -copies of the Document, then if the Document is less than one quarter -of the entire aggregate, the Document's Cover Texts may be placed on -covers that surround only the Document within the aggregate. -Otherwise they must appear on covers around the whole aggregate. - - -8. TRANSLATION - -Translation is considered a kind of modification, so you may -distribute translations of the Document under the terms of section 4. -Replacing Invariant Sections with translations requires special -permission from their copyright holders, but you may include -translations of some or all Invariant Sections in addition to the -original versions of these Invariant Sections. 
You may include a -translation of this License provided that you also include the -original English version of this License. In case of a disagreement -between the translation and the original English version of this -License, the original English version will prevail. - - -9. TERMINATION - -You may not copy, modify, sublicense, or distribute the Document except -as expressly provided for under this License. Any other attempt to -copy, modify, sublicense or distribute the Document is void, and will -automatically terminate your rights under this License. However, -parties who have received copies, or rights, from you under this -License will not have their licenses terminated so long as such -parties remain in full compliance. - - -10. FUTURE REVISIONS OF THIS LICENSE - -The Free Software Foundation may publish new, revised versions -of the GNU Free Documentation License from time to time. Such new -versions will be similar in spirit to the present version, but may -differ in detail to address new problems or concerns. See -http://www.gnu.org/copyleft/. - -Each version of the License is given a distinguishing version number. -If the Document specifies that a particular numbered version of this -License "or any later version" applies to it, you have the option of -following the terms and conditions either of that specified version or -of any later version that has been published (not as a draft) by the -Free Software Foundation. If the Document does not specify a version -number of this License, you may choose any version ever published (not -as a draft) by the Free Software Foundation. - - -ADDENDUM: How to use this License for your documents - -To use this License in a document you have written, include a copy of -the License in the document and put the following copyright and -license notices just after the title page: - - Copyright (c) YEAR YOUR NAME. 
- Permission is granted to copy, distribute and/or modify this document - under the terms of the GNU Free Documentation License, Version 1.1 - or any later version published by the Free Software Foundation; - with the Invariant Sections being LIST THEIR TITLES, with the - Front-Cover Texts being LIST, and with the Back-Cover Texts being LIST. - A copy of the license is included in the section entitled "GNU - Free Documentation License". - -If you have no Invariant Sections, write "with no Invariant Sections" -instead of saying which ones are invariant. If you have no -Front-Cover Texts, write "no Front-Cover Texts" instead of -"Front-Cover Texts being LIST"; likewise for Back-Cover Texts. - -If your document contains nontrivial examples of program code, we -recommend releasing these examples in parallel under your choice of -free software license, such as the GNU General Public License, -to permit their use in free software. diff --git a/lustre/LICENSE b/lustre/LICENSE deleted file mode 100644 index edb73cd..0000000 --- a/lustre/LICENSE +++ /dev/null @@ -1,372 +0,0 @@ -Each file in this distribution contains a header stating the copyright -owner(s), and the licensing terms for that file. Some files are not -eligible for copyright protection, and contain neither. - -There are many files which may be covered by a separate license that -you signed or otherwise agreed to before downloading this software. -If you did not agree to such an agreement, or if the file does not -mention that license, then you can redistribute and/or modify it under -the terms of version 2 of the GNU General Public License. Each file -is very clear about which license is applicable. - -In any case, Lustre is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the license -text for more details. 
- -Reproduced below is the GNU General Public License version 2, and -Linus's clarifying statement from the Linux kernel source code: - ----------------------------------------- - - NOTE! This copyright does *not* cover user programs that use kernel - services by normal system calls - this is merely considered normal use - of the kernel, and does *not* fall under the heading of "derived work". - Also note that the GPL below is copyrighted by the Free Software - Foundation, but the instance of code that it refers to (the Linux - kernel) is copyrighted by me and others who actually wrote it. - - Linus Torvalds - ----------------------------------------- - - GNU GENERAL PUBLIC LICENSE - Version 2, June 1991 - - Copyright (C) 1989, 1991 Free Software Foundation, Inc. - 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -License is intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. (Some other Free Software Foundation software is covered by -the GNU Library General Public License instead.) You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. 
- - To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the rights. -These restrictions translate to certain responsibilities for you if you -distribute copies of the software, or if you modify it. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must give the recipients all the rights that -you have. You must make sure that they, too, receive or can get the -source code. And you must show them these terms so they know their -rights. - - We protect your rights with two steps: (1) copyright the software, and -(2) offer you this license which gives you legal permission to copy, -distribute and/or modify the software. - - Also, for each author's protection and ours, we want to make certain -that everyone understands that there is no warranty for this free -software. If the software is modified by someone else and passed on, we -want its recipients to know that what they have is not the original, so -that any problems introduced by others will not reflect on the original -authors' reputations. - - Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that redistributors of a free -program will individually obtain patent licenses, in effect making the -program proprietary. To prevent this, we have made it clear that any -patent must be licensed for everyone's free use or not licensed at all. - - The precise terms and conditions for copying, distribution and -modification follow. - - GNU GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License applies to any program or other work which contains -a notice placed by the copyright holder saying it may be distributed -under the terms of this General Public License. 
The "Program", below, -refers to any such program or work, and a "work based on the Program" -means either the Program or any derivative work under copyright law: -that is to say, a work containing the Program or a portion of it, -either verbatim or with modifications and/or translated into another -language. (Hereinafter, translation is included without limitation in -the term "modification".) Each licensee is addressed as "you". - -Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running the Program is not restricted, and the output from the Program -is covered only if its contents constitute a work based on the -Program (independent of having been made by running the Program). -Whether that is true depends on what the Program does. - - 1. You may copy and distribute verbatim copies of the Program's -source code as you receive it, in any medium, provided that you -conspicuously and appropriately publish on each copy an appropriate -copyright notice and disclaimer of warranty; keep intact all the -notices that refer to this License and to the absence of any warranty; -and give any other recipients of the Program a copy of this License -along with the Program. - -You may charge a fee for the physical act of transferring a copy, and -you may at your option offer warranty protection in exchange for a fee. - - 2. You may modify your copy or copies of the Program or any portion -of it, thus forming a work based on the Program, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) You must cause the modified files to carry prominent notices - stating that you changed the files and the date of any change. 
- - b) You must cause any work that you distribute or publish, that in - whole or in part contains or is derived from the Program or any - part thereof, to be licensed as a whole at no charge to all third - parties under the terms of this License. - - c) If the modified program normally reads commands interactively - when run, you must cause it, when started running for such - interactive use in the most ordinary way, to print or display an - announcement including an appropriate copyright notice and a - notice that there is no warranty (or else, saying that you provide - a warranty) and that users may redistribute the program under - these conditions, and telling the user how to view a copy of this - License. (Exception: if the Program itself is interactive but - does not normally print such an announcement, your work based on - the Program is not required to print an announcement.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Program, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Program, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Program. 
- -In addition, mere aggregation of another work not based on the Program -with the Program (or with a work based on the Program) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may copy and distribute the Program (or a work based on it, -under Section 2) in object code or executable form under the terms of -Sections 1 and 2 above provided that you also do one of the following: - - a) Accompany it with the complete corresponding machine-readable - source code, which must be distributed under the terms of Sections - 1 and 2 above on a medium customarily used for software interchange; or, - - b) Accompany it with a written offer, valid for at least three - years, to give any third party, for a charge no more than your - cost of physically performing source distribution, a complete - machine-readable copy of the corresponding source code, to be - distributed under the terms of Sections 1 and 2 above on a medium - customarily used for software interchange; or, - - c) Accompany it with the information you received as to the offer - to distribute corresponding source code. (This alternative is - allowed only for noncommercial distribution and only if you - received the program in object code or executable form with such - an offer, in accord with Subsection b above.) - -The source code for a work means the preferred form of the work for -making modifications to it. For an executable work, complete source -code means all the source code for all modules it contains, plus any -associated interface definition files, plus the scripts used to -control compilation and installation of the executable. 
However, as a -special exception, the source code distributed need not include -anything that is normally distributed (in either source or binary -form) with the major components (compiler, kernel, and so on) of the -operating system on which the executable runs, unless that component -itself accompanies the executable. - -If distribution of executable or object code is made by offering -access to copy from a designated place, then offering equivalent -access to copy the source code from the same place counts as -distribution of the source code, even though third parties are not -compelled to copy the source along with the object code. - - 4. You may not copy, modify, sublicense, or distribute the Program -except as expressly provided under this License. Any attempt -otherwise to copy, modify, sublicense or distribute the Program is -void, and will automatically terminate your rights under this License. -However, parties who have received copies, or rights, from you under -this License will not have their licenses terminated so long as such -parties remain in full compliance. - - 5. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Program or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Program (or any work based on the -Program), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Program or works based on it. - - 6. Each time you redistribute the Program (or any work based on the -Program), the recipient automatically receives a license from the -original licensor to copy, distribute or modify the Program subject to -these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. 
-You are not responsible for enforcing compliance by third parties to -this License. - - 7. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Program at all. For example, if a patent -license would not permit royalty-free redistribution of the Program by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Program. - -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply and the section as a whole is intended to apply in other -circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system, which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 8. 
If the distribution and/or use of the Program is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Program under this License -may add an explicit geographical distribution limitation excluding -those countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - - 9. The Free Software Foundation may publish revised and/or new versions -of the General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - -Each version is given a distinguishing version number. If the Program -specifies a version number of this License which applies to it and "any -later version", you have the option of following the terms and conditions -either of that version or of any later version published by the Free -Software Foundation. If the Program does not specify a version number of -this License, you may choose any version ever published by the Free Software -Foundation. - - 10. If you wish to incorporate parts of the Program into other free -programs whose distribution conditions are different, write to the author -to ask for permission. For software which is copyrighted by the Free -Software Foundation, write to the Free Software Foundation; we sometimes -make exceptions for this. Our decision will be guided by the two goals -of preserving the free status of all derivatives of our free software and -of promoting the sharing and reuse of software generally. - - NO WARRANTY - - 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY -FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN -OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES -PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED -OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS -TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE -PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, -REPAIR OR CORRECTION. - - 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR -REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, -INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING -OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED -TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY -YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER -PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE -POSSIBILITY OF SUCH DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - <one line to give the program's name and a brief idea of what it does.> - Copyright (C) 19yy <name of author> - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version.
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - -Also add information on how to contact you by electronic and paper mail. - -If the program is interactive, make it output a short notice like this -when it starts in an interactive mode: - - Gnomovision version 69, Copyright (C) 19yy name of author - Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, the commands you use may -be called something other than `show w' and `show c'; they could even be -mouse-clicks or menu items--whatever suits your program. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the program, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the program - `Gnomovision' (which makes passes at compilers) written by James Hacker. - - <signature of Ty Coon>, 1 April 1989 - Ty Coon, President of Vice - -This General Public License does not permit incorporating your program into -proprietary programs. If your program is a subroutine library, you may -consider it more useful to permit linking proprietary applications with the -library. If this is what you want to do, use the GNU Library General -Public License instead of this License.
diff --git a/lustre/Makefile.in b/lustre/Makefile.in deleted file mode 100644 index c06794a..0000000 --- a/lustre/Makefile.in +++ /dev/null @@ -1,13 +0,0 @@ -subdir-m += lvfs -subdir-m += obdclass -subdir-m += lov -subdir-m += ptlrpc -subdir-m += osc -subdir-m += obdecho -subdir-m += mgc - -@SERVER_TRUE@subdir-m += mds obdfilter ost mgs -@CLIENT_TRUE@subdir-m += mdc llite -@QUOTA_TRUE@subdir-m += quota - -@INCLUDE_RULES@ diff --git a/lustre/autoMakefile.am b/lustre/autoMakefile.am deleted file mode 100644 index 53ae48a..0000000 --- a/lustre/autoMakefile.am +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -AUTOMAKE_OPTIONS = foreign - -# also update lustre/autoconf/lustre-core.m4 AC_CONFIG_FILES -ALWAYS_SUBDIRS := include lvfs obdclass ldlm ptlrpc osc lov obdecho \ - mgc doc utils tests scripts autoconf contrib - -SERVER_SUBDIRS := obdfilter ost mds mgs - -CLIENT_SUBDIRS := mdc llite - -QUOTA_SUBDIRS := quota - -LIBLUSTRE_SUBDIRS := liblustre - -SUBDIRS := $(ALWAYS_SUBDIRS) - -if SERVER -SUBDIRS += $(SERVER_SUBDIRS) -endif - -if CLIENT -SUBDIRS += $(CLIENT_SUBDIRS) -endif - -if QUOTA -SUBDIRS += $(QUOTA_SUBDIRS) -endif - -# this needs to be after the client subdirs -if LIBLUSTRE -if !CLIENT -SUBDIRS += $(CLIENT_SUBDIRS) -endif -SUBDIRS += $(LIBLUSTRE_SUBDIRS) -endif - -DIST_SUBDIRS := $(ALWAYS_SUBDIRS) $(SERVER_SUBDIRS) $(CLIENT_SUBDIRS) \ - $(LIBLUSTRE_SUBDIRS) $(QUOTA_SUBDIRS) - -EXTRA_DIST = BUGS FDL kernel_patches - -lvfs-sources: - $(MAKE) sources -C lvfs -obdclass-sources: - $(MAKE) sources -C obdclass - -sources: $(LDISKFS) lvfs-sources obdclass-sources lustre_build_version - -all-recursive: lustre_build_version - -BUILD_VER_H=$(top_builddir)/lustre/include/lustre/lustre_build_version.h - -lustre_build_version: - perl $(top_builddir)/lustre/scripts/version_tag.pl $(top_srcdir) $(top_builddir) > tmpver - echo "#define 
LUSTRE_RELEASE @RELEASE@" >> tmpver - cmp -s $(BUILD_VER_H) tmpver > tmpdiff 2> /dev/null && \ - $(RM) tmpver tmpdiff || \ - mv -f tmpver $(BUILD_VER_H) diff --git a/lustre/autoconf/.cvsignore b/lustre/autoconf/.cvsignore deleted file mode 100644 index 282522d..0000000 --- a/lustre/autoconf/.cvsignore +++ /dev/null @@ -1,2 +0,0 @@ -Makefile -Makefile.in diff --git a/lustre/autoconf/Makefile.am b/lustre/autoconf/Makefile.am deleted file mode 100644 index 7a747da..0000000 --- a/lustre/autoconf/Makefile.am +++ /dev/null @@ -1 +0,0 @@ -EXTRA_DIST := lustre-core.m4 lustre-version.ac diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 deleted file mode 100644 index c0651fa..0000000 --- a/lustre/autoconf/lustre-core.m4 +++ /dev/null @@ -1,1542 +0,0 @@ -#* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- -#* vim:expandtab:shiftwidth=8:tabstop=8: -# -# LC_CONFIG_SRCDIR -# -# Wrapper for AC_CONFIG_SUBDIR -# -AC_DEFUN([LC_CONFIG_SRCDIR], -[AC_CONFIG_SRCDIR([lustre/obdclass/obdo.c]) -]) - -# -# LC_PATH_DEFAULTS -# -# lustre specific paths -# -AC_DEFUN([LC_PATH_DEFAULTS], -[# ptlrpc kernel build requires this -LUSTRE="$PWD/lustre" -AC_SUBST(LUSTRE) - -# mount.lustre -rootsbindir='/sbin' -AC_SUBST(rootsbindir) - -demodir='$(docdir)/demo' -AC_SUBST(demodir) - -pkgexampledir='${pkgdatadir}/examples' -AC_SUBST(pkgexampledir) -]) - -# -# LC_TARGET_SUPPORTED -# -# is the target os supported? 
-# -AC_DEFUN([LC_TARGET_SUPPORTED], -[case $target_os in - linux* | darwin*) -$1 - ;; - *) -$2 - ;; -esac -]) - -# -# LC_CONFIG_EXT3 -# -# that ext3 is enabled in the kernel -# -AC_DEFUN([LC_CONFIG_EXT3], -[LB_LINUX_CONFIG([EXT3_FS],[],[ - LB_LINUX_CONFIG([EXT3_FS_MODULE],[],[$2]) -]) -LB_LINUX_CONFIG([EXT3_FS_XATTR],[$1],[$3]) -]) - -# -# LC_FSHOOKS -# -# If we have (and can build) fshooks.h -# -AC_DEFUN([LC_FSHOOKS], -[LB_CHECK_FILE([$LINUX/include/linux/fshooks.h],[ - AC_MSG_CHECKING([if fshooks.h can be compiled]) - LB_LINUX_TRY_COMPILE([ - #include <linux/fshooks.h> - ],[],[ - AC_MSG_RESULT([yes]) - ],[ - AC_MSG_RESULT([no]) - AC_MSG_WARN([You might have better luck with gcc 3.3.x.]) - AC_MSG_WARN([You can set CC=gcc33 before running configure.]) - AC_MSG_ERROR([Your compiler cannot build fshooks.h.]) - ]) -$1 -],[ -$2 -]) -]) - -# -# LC_STRUCT_KIOBUF -# -# rh 2.4.18 has iobuf->dovary, but other kernels do not -# -AC_DEFUN([LC_STRUCT_KIOBUF], -[AC_MSG_CHECKING([if struct kiobuf has a dovary field]) -LB_LINUX_TRY_COMPILE([ - #include <linux/iobuf.h> -],[ - struct kiobuf iobuf; - iobuf.dovary = 1; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_KIOBUF_DOVARY, 1, [struct kiobuf has a dovary field]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_FUNC_COND_RESCHED -# -# cond_resched() was introduced in 2.4.20 -# -AC_DEFUN([LC_FUNC_COND_RESCHED], -[AC_MSG_CHECKING([if kernel offers cond_resched]) -LB_LINUX_TRY_COMPILE([ - #include <linux/sched.h> -],[ - cond_resched(); -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_COND_RESCHED, 1, [cond_resched found]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_FUNC_ZAP_PAGE_RANGE -# -# if zap_page_range() takes a vma arg -# -AC_DEFUN([LC_FUNC_ZAP_PAGE_RANGE], -[AC_MSG_CHECKING([if zap_page_range with vma parameter]) -ZAP_PAGE_RANGE_VMA="`grep -c 'zap_page_range.*struct vm_area_struct' $LINUX/include/linux/mm.h`" -if test "$ZAP_PAGE_RANGE_VMA" != 0 ; then - AC_DEFINE(ZAP_PAGE_RANGE_VMA, 1, [zap_page_range with vma parameter]) - AC_MSG_RESULT([yes]) -else - AC_MSG_RESULT([no]) -fi -])
- -# -# LC_FUNC_PDE -# -# if proc_fs.h defines PDE() -# -AC_DEFUN([LC_FUNC_PDE], -[AC_MSG_CHECKING([if kernel defines PDE]) -HAVE_PDE="`grep -c 'proc_dir_entry..PDE' $LINUX/include/linux/proc_fs.h`" -if test "$HAVE_PDE" != 0 ; then - AC_DEFINE(HAVE_PDE, 1, [the kernel defines PDE]) - AC_MSG_RESULT([yes]) -else - AC_MSG_RESULT([no]) -fi -]) - -# -# LC_FUNC_FILEMAP_FDATASYNC -# -# if filemap_fdatasync() exists -# -AC_DEFUN([LC_FUNC_FILEMAP_FDATAWRITE], -[AC_MSG_CHECKING([whether filemap_fdatawrite() is defined]) -LB_LINUX_TRY_COMPILE([ - #include <linux/fs.h> -],[ - int (*foo)(struct address_space *)= filemap_fdatawrite; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_FILEMAP_FDATAWRITE, 1, [filemap_fdatawrite() found]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_FUNC_DIRECT_IO -# -# if direct_IO takes a struct file argument -# -AC_DEFUN([LC_FUNC_DIRECT_IO], -[AC_MSG_CHECKING([if kernel passes struct file to direct_IO]) -HAVE_DIO_FILE="`grep -c 'direct_IO.*struct file' $LINUX/include/linux/fs.h`" -if test "$HAVE_DIO_FILE" != 0 ; then - AC_DEFINE(HAVE_DIO_FILE, 1, [the kernel passes struct file to direct_IO]) - AC_MSG_RESULT(yes) -else - AC_MSG_RESULT(no) -fi -]) - -# -# LC_HEADER_MM_INLINE -# -# RHEL kernels define page_count in mm_inline.h -# -AC_DEFUN([LC_HEADER_MM_INLINE], -[AC_MSG_CHECKING([if kernel has mm_inline.h header]) -LB_LINUX_TRY_COMPILE([ - #include <linux/mm_inline.h> -],[ - #ifndef page_count - #error mm_inline.h does not define page_count - #endif -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_MM_INLINE, 1, [mm_inline found]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_STRUCT_INODE -# -# if inode->i_alloc_sem exists -# -AC_DEFUN([LC_STRUCT_INODE], -[AC_MSG_CHECKING([if struct inode has i_alloc_sem]) -LB_LINUX_TRY_COMPILE([ - #include <linux/fs.h> - #include <linux/version.h> -],[ - #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,24)) - #error "down_read_trylock broken before 2.4.24" - #endif - struct inode i; - return (char *)&i.i_alloc_sem - (char *)&i; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_I_ALLOC_SEM, 1,
[struct inode has i_alloc_sem]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_FUNC_REGISTER_CACHE -# -# if register_cache() is defined by kernel -# -AC_DEFUN([LC_FUNC_REGISTER_CACHE], -[AC_MSG_CHECKING([if kernel defines register_cache()]) -LB_LINUX_TRY_COMPILE([ - #include <linux/list.h> - #include <linux/cache_def.h> -],[ - struct cache_definition cache; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_REGISTER_CACHE, 1, [register_cache found]) - AC_MSG_CHECKING([if kernel expects return from cache shrink function]) - HAVE_CACHE_RETURN_INT="`grep -c 'int.*shrink' $LINUX/include/linux/cache_def.h`" - if test "$HAVE_CACHE_RETURN_INT" != 0 ; then - AC_DEFINE(HAVE_CACHE_RETURN_INT, 1, [kernel expects return from shrink_cache]) - AC_MSG_RESULT(yes) - else - AC_MSG_RESULT(no) - fi -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP -# -# check for our patched grab_cache_page_nowait_gfp() function -# -AC_DEFUN([LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP], -[AC_MSG_CHECKING([if kernel defines grab_cache_page_nowait_gfp()]) -HAVE_GCPN_GFP="`grep -c 'grab_cache_page_nowait_gfp' $LINUX/include/linux/pagemap.h`" -if test "$HAVE_GCPN_GFP" != 0 ; then - AC_DEFINE(HAVE_GRAB_CACHE_PAGE_NOWAIT_GFP, 1, - [kernel has grab_cache_page_nowait_gfp()]) - AC_MSG_RESULT(yes) -else - AC_MSG_RESULT(no) -fi -]) - -# -# LC_FUNC_DEV_SET_RDONLY -# -# check for the old-style dev_set_rdonly which took an extra "devno" param -# and can only set a single device to discard writes at one time -# -AC_DEFUN([LC_FUNC_DEV_SET_RDONLY], -[AC_MSG_CHECKING([if kernel has new dev_set_rdonly]) -LB_LINUX_TRY_COMPILE([ - #include <linux/fs.h> -],[ - #ifndef HAVE_CLEAR_RDONLY_ON_PUT - #error needs to be patched by lustre kernel patches from Lustre version 1.4.3 or above.
- #endif -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_DEV_SET_RDONLY, 1, [kernel has new dev_set_rdonly]) -],[ - AC_MSG_RESULT([no, Linux kernel source needs to be patches by lustre -kernel patches from Lustre version 1.4.3 or above.]) -]) -]) - -# -# LC_CONFIG_BACKINGFS -# -# setup, check the backing filesystem -# -AC_DEFUN([LC_CONFIG_BACKINGFS], -[ -BACKINGFS="ldiskfs" - -if test x$with_ldiskfs = xno ; then - BACKINGFS="ext3" - - if test x$linux25$enable_server = xyesyes ; then - AC_MSG_ERROR([ldiskfs is required for 2.6-based servers.]) - fi - - # --- Check that ext3 and ext3 xattr are enabled in the kernel - LC_CONFIG_EXT3([],[ - AC_MSG_ERROR([Lustre requires that ext3 is enabled in the kernel]) - ],[ - AC_MSG_WARN([Lustre requires that extended attributes for ext3 are enabled in the kernel]) - AC_MSG_WARN([This build may fail.]) - ]) -else - # ldiskfs is enabled - LB_DEFINE_LDISKFS_OPTIONS -fi #ldiskfs - -AC_MSG_CHECKING([which backing filesystem to use]) -AC_MSG_RESULT([$BACKINGFS]) -AC_SUBST(BACKINGFS) -]) - -# -# LC_CONFIG_PINGER -# -# the pinger is temporary, until we have the recovery node in place -# -AC_DEFUN([LC_CONFIG_PINGER], -[AC_MSG_CHECKING([whether to enable pinger support]) -AC_ARG_ENABLE([pinger], - AC_HELP_STRING([--disable-pinger], - [disable recovery pinger support]), - [],[enable_pinger='yes']) -AC_MSG_RESULT([$enable_pinger]) -if test x$enable_pinger != xno ; then - AC_DEFINE(ENABLE_PINGER, 1, Use the Pinger) -fi -]) - -# -# LC_CONFIG_CHECKSUM -# -# do checksum of bulk data between client and OST -# -AC_DEFUN([LC_CONFIG_CHECKSUM], -[AC_MSG_CHECKING([whether to enable data checksum support]) -AC_ARG_ENABLE([checksum], - AC_HELP_STRING([--disable-checksum], - [disable data checksum support]), - [],[enable_checksum='yes']) -AC_MSG_RESULT([$enable_checksum]) -if test x$enable_checksum != xno ; then - AC_DEFINE(ENABLE_CHECKSUM, 1, do data checksums) -fi -]) - -# -# LC_CONFIG_HEALTH_CHECK_WRITE -# -# Turn on the actual write to the disk -# 
-AC_DEFUN([LC_CONFIG_HEALTH_CHECK_WRITE], -[AC_MSG_CHECKING([whether to enable a write with the health check]) -AC_ARG_ENABLE([health-write], - AC_HELP_STRING([--enable-health-write], - [enable disk writes when doing health check]), - [],[enable_health_write='no']) -AC_MSG_RESULT([$enable_health_write]) -if test x$enable_health_write == xyes ; then - AC_DEFINE(USE_HEALTH_CHECK_WRITE, 1, Write when Checking Health) -fi -]) - -# -# LC_CONFIG_LIBLUSTRE_RECOVERY -# -AC_DEFUN([LC_CONFIG_LIBLUSTRE_RECOVERY], -[AC_MSG_CHECKING([whether to enable liblustre recovery support]) -AC_ARG_ENABLE([liblustre-recovery], - AC_HELP_STRING([--disable-liblustre-recovery], - [disable liblustre recovery support]), - [],[enable_liblustre_recovery='yes']) -AC_MSG_RESULT([$enable_liblustre_recovery]) -if test x$enable_liblustre_recovery != xno ; then - AC_DEFINE(ENABLE_LIBLUSTRE_RECOVERY, 1, Liblustre Can Recover) -fi -]) - -# -# LC_CONFIG_OBD_BUFFER_SIZE -# -# the maximum buffer size of lctl ioctls -# -AC_DEFUN([LC_CONFIG_OBD_BUFFER_SIZE], -[AC_MSG_CHECKING([maximum OBD ioctl size]) -AC_ARG_WITH([obd-buffer-size], - AC_HELP_STRING([--with-obd-buffer-size=[size]], - [set lctl ioctl maximum bytes (default=8192)]), - [ - OBD_BUFFER_SIZE=$with_obd_buffer_size - ],[ - OBD_BUFFER_SIZE=8192 - ]) -AC_MSG_RESULT([$OBD_BUFFER_SIZE bytes]) -AC_DEFINE_UNQUOTED(OBD_MAX_IOCTL_BUFFER, $OBD_BUFFER_SIZE, [IOCTL Buffer Size]) -]) - -# -# LC_STRUCT_STATFS -# -# AIX does not have statfs.f_namelen -# -AC_DEFUN([LC_STRUCT_STATFS], -[AC_MSG_CHECKING([if struct statfs has a f_namelen field]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct statfs sfs; - sfs.f_namelen = 1; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_STATFS_NAMELEN, 1, [struct statfs has a namelen field]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_READLINK_SSIZE_T -# -AC_DEFUN([LC_READLINK_SSIZE_T], -[AC_MSG_CHECKING([if readlink returns ssize_t]) -AC_TRY_COMPILE([ - #include -],[ - ssize_t readlink(const char *, char *, size_t); -],[ - 
AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_POSIX_1003_READLINK, 1, [readlink returns ssize_t]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -AC_DEFUN([LC_FUNC_PAGE_MAPPED], -[AC_MSG_CHECKING([if kernel offers page_mapped]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - page_mapped(NULL); -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_PAGE_MAPPED, 1, [page_mapped found]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -AC_DEFUN([LC_STRUCT_FILE_OPS_UNLOCKED_IOCTL], -[AC_MSG_CHECKING([if struct file_operations has an unlocked_ioctl field]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct file_operations fops; - &fops.unlocked_ioctl; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_UNLOCKED_IOCTL, 1, [struct file_operations has an unlock ed_ioctl field]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -AC_DEFUN([LC_FILEMAP_POPULATE], -[AC_MSG_CHECKING([for exported filemap_populate]) -LB_LINUX_TRY_COMPILE([ - #include - #include -],[ - filemap_populate(NULL, 0, 0, __pgprot(0), 0, 0); -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_FILEMAP_POPULATE, 1, [Kernel exports filemap_populate]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -AC_DEFUN([LC_D_ADD_UNIQUE], -[AC_MSG_CHECKING([for d_add_unique]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - d_add_unique(NULL, NULL); -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_D_ADD_UNIQUE, 1, [Kernel has d_add_unique]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -AC_DEFUN([LC_BIT_SPINLOCK_H], -[LB_CHECK_FILE([$LINUX/include/linux/bit_spinlock.h],[ - AC_MSG_CHECKING([if bit_spinlock.h can be compiled]) - LB_LINUX_TRY_COMPILE([ - #include - #include - #include - ],[],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_BIT_SPINLOCK_H, 1, [Kernel has bit_spinlock.h]) - ],[ - AC_MSG_RESULT([no]) - ]) -], -[]) -]) - -# -# LC_POSIX_ACL_XATTR -# -# If we have xattr_acl.h -# -AC_DEFUN([LC_XATTR_ACL], -[LB_CHECK_FILE([$LINUX/include/linux/xattr_acl.h],[ - AC_MSG_CHECKING([if xattr_acl.h can be compiled]) - LB_LINUX_TRY_COMPILE([ - #include - ],[],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_XATTR_ACL, 1, [Kernel has 
xattr_acl]) - ],[ - AC_MSG_RESULT([no]) - ]) -], -[]) -]) - -AC_DEFUN([LC_STRUCT_INTENT_FILE], -[AC_MSG_CHECKING([if struct open_intent has a file field]) -LB_LINUX_TRY_COMPILE([ - #include - #include -],[ - struct open_intent intent; - &intent.file; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_FILE_IN_STRUCT_INTENT, 1, [struct open_intent has a file field]) -],[ - AC_MSG_RESULT([no]) -]) -]) - - -AC_DEFUN([LC_POSIX_ACL_XATTR_H], -[LB_CHECK_FILE([$LINUX/include/linux/posix_acl_xattr.h],[ - AC_MSG_CHECKING([if linux/posix_acl_xattr.h can be compiled]) - LB_LINUX_TRY_COMPILE([ - #include - ],[],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_LINUX_POSIX_ACL_XATTR_H, 1, [linux/posix_acl_xattr.h found]) - - ],[ - AC_MSG_RESULT([no]) - ]) -$1 -],[ -AC_MSG_RESULT([no]) -]) -]) - -# -# LC_EXPORT___IGET -# starting from 2.6.19 linux kernel exports __iget() -# -AC_DEFUN([LC_EXPORT___IGET], -[LB_CHECK_SYMBOL_EXPORT([__iget], -[fs/inode.c],[ - AC_DEFINE(HAVE_EXPORT___IGET, 1, [kernel exports __iget]) -],[ -]) -]) - - -AC_DEFUN([LC_LUSTRE_VERSION_H], -[LB_CHECK_FILE([$LINUX/include/linux/lustre_version.h],[ - rm -f "$LUSTRE/include/linux/lustre_version.h" -],[ - touch "$LUSTRE/include/linux/lustre_version.h" - if test x$enable_server = xyes ; then - AC_MSG_WARN([Unpatched kernel detected.]) - AC_MSG_WARN([Lustre servers cannot be built with an unpatched kernel;]) - AC_MSG_WARN([disabling server build]) - enable_server='no' - fi -]) -]) - -AC_DEFUN([LC_FUNC_SET_FS_PWD], -[LB_CHECK_SYMBOL_EXPORT([set_fs_pwd], -[fs/namespace.c],[ - AC_DEFINE(HAVE_SET_FS_PWD, 1, [set_fs_pwd is exported]) -],[ -]) -]) - - -# -# LC_FUNC_MS_FLOCK_LOCK -# -# SLES9 kernel has MS_FLOCK_LOCK sb flag -# -AC_DEFUN([LC_FUNC_MS_FLOCK_LOCK], -[AC_MSG_CHECKING([if kernel has MS_FLOCK_LOCK sb flag]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - int flags = MS_FLOCK_LOCK; -],[ - AC_DEFINE(HAVE_MS_FLOCK_LOCK, 1, - [kernel has MS_FLOCK_LOCK flag]) - AC_MSG_RESULT([yes]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# 
LC_FUNC_HAVE_CAN_SLEEP_ARG -# -# SLES9 kernel has third arg can_sleep -# in fs/locks.c: flock_lock_file_wait() -# -AC_DEFUN([LC_FUNC_HAVE_CAN_SLEEP_ARG], -[AC_MSG_CHECKING([if kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - int cansleep; - struct file *file; - struct file_lock *file_lock; - flock_lock_file_wait(file, file_lock, cansleep); -],[ - AC_DEFINE(HAVE_CAN_SLEEP_ARG, 1, - [kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()]) - AC_MSG_RESULT([yes]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_FUNC_F_OP_FLOCK -# -# rhel4.2 kernel has f_op->flock field -# -AC_DEFUN([LC_FUNC_F_OP_FLOCK], -[AC_MSG_CHECKING([if struct file_operations has flock field]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct file_operations ll_file_operations_flock; - ll_file_operations_flock.flock = NULL; -],[ - AC_DEFINE(HAVE_F_OP_FLOCK, 1, - [struct file_operations has flock field]) - AC_MSG_RESULT([yes]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_FUNC_MS_FLOCK_LOCK -# -# SLES9 kernel has MS_FLOCK_LOCK sb flag -# -AC_DEFUN([LC_FUNC_MS_FLOCK_LOCK], -[AC_MSG_CHECKING([if kernel has MS_FLOCK_LOCK sb flag]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - int flags = MS_FLOCK_LOCK; -],[ - AC_DEFINE(HAVE_MS_FLOCK_LOCK, 1, - [kernel has MS_FLOCK_LOCK flag]) - AC_MSG_RESULT([yes]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_FUNC_HAVE_CAN_SLEEP_ARG -# -# SLES9 kernel has third arg can_sleep -# in fs/locks.c: flock_lock_file_wait() -# -AC_DEFUN([LC_FUNC_HAVE_CAN_SLEEP_ARG], -[AC_MSG_CHECKING([if kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - int cansleep; - struct file *file; - struct file_lock *file_lock; - flock_lock_file_wait(file, file_lock, cansleep); -],[ - AC_DEFINE(HAVE_CAN_SLEEP_ARG, 1, - [kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()]) - AC_MSG_RESULT([yes]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_TASK_PPTR -# 
-# task struct has p_pptr instead of parent -# -AC_DEFUN([LC_TASK_PPTR], -[AC_MSG_CHECKING([task p_pptr found]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct task_struct *p; - - p = p->p_pptr; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_TASK_PPTR, 1, [task p_pptr found]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_FUNC_F_OP_FLOCK -# -# rhel4.2 kernel has f_op->flock field -# -AC_DEFUN([LC_FUNC_F_OP_FLOCK], -[AC_MSG_CHECKING([if struct file_operations has flock field]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct file_operations ll_file_operations_flock; - ll_file_operations_flock.flock = NULL; -],[ - AC_DEFINE(HAVE_F_OP_FLOCK, 1, - [struct file_operations has flock field]) - AC_MSG_RESULT([yes]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# LC_INODE_I_MUTEX -# after 2.6.15 inode have i_mutex intead of i_sem -AC_DEFUN([LC_INODE_I_MUTEX], -[AC_MSG_CHECKING([use inode have i_mutex ]) -LB_LINUX_TRY_COMPILE([ - #include - #include -],[ - struct inode i; - - mutex_unlock(&i.i_mutex); -],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_INODE_I_MUTEX, 1, - [after 2.6.15 inode have i_mutex intead of i_sem]) -],[ - AC_MSG_RESULT(NO) -]) -]) - - -# LC_DQUOTOFF_MUTEX -# after 2.6.17 dquote use mutex instead if semaphore -AC_DEFUN([LC_DQUOTOFF_MUTEX], -[AC_MSG_CHECKING([use dqonoff_mutex]) -LB_LINUX_TRY_COMPILE([ - #include - #include - #include -],[ - struct quota_info dq; - - mutex_unlock(&dq.dqonoff_mutex); -],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_DQUOTOFF_MUTEX, 1, - [after 2.6.17 dquote use mutex instead if semaphore]) -],[ - AC_MSG_RESULT(NO) -]) -]) - -# -# LC_STATFS_DENTRY_PARAM -# starting from 2.6.18 linux kernel uses dentry instead of -# super_block for first vfs_statfs argument -# -AC_DEFUN([LC_STATFS_DENTRY_PARAM], -[AC_MSG_CHECKING([first vfs_statfs parameter is dentry]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - int vfs_statfs(struct dentry *, struct kstatfs *); -],[ - AC_DEFINE(HAVE_STATFS_DENTRY_PARAM, 1, - [first parameter of vfs_statfs is dentry]) - 
AC_MSG_RESULT([yes]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_VFS_KERN_MOUNT -# starting from 2.6.18 kernel don't export do_kern_mount -# and want to use vfs_kern_mount instead. -# -AC_DEFUN([LC_VFS_KERN_MOUNT], -[AC_MSG_CHECKING([vfs_kern_mount exist in kernel]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - vfs_kern_mount(NULL, 0, NULL, NULL); -],[ - AC_DEFINE(HAVE_VFS_KERN_MOUNT, 1, - [vfs_kern_mount exist in kernel]) - AC_MSG_RESULT([yes]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_INVALIDATEPAGE_RETURN_INT -# more 2.6 api changes. return type for the invalidatepage -# address_space_operation is 'void' in new kernels but 'int' in old -# -AC_DEFUN([LC_INVALIDATEPAGE_RETURN_INT], -[AC_MSG_CHECKING([invalidatepage has return int]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - int rc = block_invalidatepage(NULL, 0); -],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_INVALIDATEPAGE_RETURN_INT, 1, - [Define if return type of invalidatepage should be int]) -],[ - AC_MSG_RESULT(NO) -]) -]) - -# LC_UMOUNTBEGIN_HAS_VFSMOUNT -# more 2.6 API changes. 
2.6.18 umount_begin has different parameters -AC_DEFUN([LC_UMOUNTBEGIN_HAS_VFSMOUNT], -[AC_MSG_CHECKING([if umount_begin needs vfsmount parameter instead of super_block]) -tmp_flags="$EXTRA_KCFLAGS" -EXTRA_KCFLAGS="-Werror" -LB_LINUX_TRY_COMPILE([ - #include - - struct vfsmount; - static void cfg_umount_begin (struct vfsmount *v, int flags) - { - ; - } - - static struct super_operations cfg_super_operations = { - .umount_begin = cfg_umount_begin, - }; -],[ - cfg_super_operations.umount_begin(NULL,0); -],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_UMOUNTBEGIN_VFSMOUNT, 1, - [Define umount_begin need second argument]) -],[ - AC_MSG_RESULT(NO) -]) -EXTRA_KCFLAGS="$tmp_flags" -]) - -# 2.6.19 API changes -# inode don't have i_blksize field -AC_DEFUN([LC_INODE_BLKSIZE], -[AC_MSG_CHECKING([inode has i_blksize field]) -LB_LINUX_TRY_COMPILE([ -#include -],[ - struct inode i; - i.i_blksize = 0; -],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_INODE_BLKSIZE, 1, - [struct inode has i_blksize field]) -],[ - AC_MSG_RESULT(NO) -]) -]) - -# LC_VFS_READDIR_U64_INO -# 2.6.19 use u64 for inode number instead of inode_t -AC_DEFUN([LC_VFS_READDIR_U64_INO], -[AC_MSG_CHECKING([check vfs_readdir need 64bit inode number]) -tmp_flags="$EXTRA_KCFLAGS" -EXTRA_KCFLAGS="-Werror" -LB_LINUX_TRY_COMPILE([ -#include - int fillonedir(void * __buf, const char * name, int namlen, loff_t offset, - u64 ino, unsigned int d_type) - { - return 0; - } -],[ - filldir_t filter; - - filter = fillonedir; - return 1; -],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_VFS_READDIR_U64_INO, 1, - [if vfs_readdir need 64bit inode number]) -],[ - AC_MSG_RESULT(NO) -]) -EXTRA_KCFLAGS="$tmp_flags" -]) - -# LC_GENERIC_FILE_WRITE -# 2.6.19 introduce do_sync_write instead of -# generic_file_write -AC_DEFUN([LC_GENERIC_FILE_WRITE], -[AC_MSG_CHECKING([use generic_file_write]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - int result = generic_file_read(NULL, NULL, 0, 0); -],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_GENERIC_FILE_WRITE, 1, - [use 
generic_file_write]) -],[ - AC_MSG_RESULT(NO) -]) -]) - -# LC_GENERIC_FILE_READ -# 2.6.19 need to use do_sync_read instead of -# generic_file_read -AC_DEFUN([LC_GENERIC_FILE_READ], -[AC_MSG_CHECKING([use generic_file_read]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - int result = generic_file_read(NULL, NULL, 0, 0); -],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_GENERIC_FILE_READ, 1, - [use generic_file_read]) -],[ - AC_MSG_RESULT(NO) -]) -]) - -# LC_NR_PAGECACHE -# 2.6.18 don't export nr_pagecahe -AC_DEFUN([LC_NR_PAGECACHE], -[AC_MSG_CHECKING([kernel export nr_pagecache]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - return atomic_read(&nr_pagecache); -],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_NR_PAGECACHE, 1, - [is kernel export nr_pagecache]) -],[ - AC_MSG_RESULT(NO) -]) -]) - -# LC_CANCEL_DIRTY_PAGE -# 2.6.20 introduse cancel_dirty_page instead of -# clear_page_dirty. -AC_DEFUN([LC_CANCEL_DIRTY_PAGE], -[AC_MSG_CHECKING([kernel has cancel_dirty_page]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - cancel_dirty_page(NULL, 0); -],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_CANCEL_DIRTY_PAGE, 1, - [kernel has cancel_dirty_page instead of clear_page_dirty]) -],[ - AC_MSG_RESULT(NO) -]) -]) - -# -# LC_PAGE_CONSTANT -# -# In order to support raid5 zerocopy patch, we have to patch the kernel to make -# it support constant page, which means the page won't be modified during the -# IO. 
-# -AC_DEFUN([LC_PAGE_CONSTANT], -[AC_MSG_CHECKING([if kernel have PageConstant defined]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - #ifndef PG_constant - #error "Have no raid5 zcopy patch" - #endif -],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_PAGE_CONSTANT, 1, [kernel have PageConstant supported]) -],[ - AC_MSG_RESULT(no); -]) -]) - -# RHEL5 in FS-cache patch rename PG_checked flag -# into PG_fs_misc -AC_DEFUN([LC_PG_FS_MISC], -[AC_MSG_CHECKING([kernel has PG_fs_misc]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - #ifndef PG_fs_misc - #error PG_fs_misc not defined in kernel - #endif -],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_PG_FS_MISC, 1, - [is kernel have PG_fs_misc]) -],[ - AC_MSG_RESULT(NO) -]) -]) - -AC_DEFUN([LC_EXPORT_TRUNCATE_COMPLETE], -[LB_CHECK_SYMBOL_EXPORT([truncate_complete_page], -[mm/truncate.c],[ -AC_DEFINE(HAVE_TRUNCATE_COMPLETE_PAGE, 1, - [kernel export truncate_complete_page]) -],[ -]) -]) - -AC_DEFUN([LC_EXPORT_D_REHASH_COND], -[LB_CHECK_SYMBOL_EXPORT([d_rehash_cond], -[fs/dcache.c],[ -AC_DEFINE(HAVE_D_REHASH_COND, 1, - [d_rehash_cond is exported by the kernel]) -],[ -]) -]) - -AC_DEFUN([LC_EXPORT___D_REHASH], -[LB_CHECK_SYMBOL_EXPORT([__d_rehash], -[fs/dcache.c],[ -AC_DEFINE(HAVE___D_REHASH, 1, - [__d_rehash is exported by the kernel]) -],[ -]) -]) - -# The actual symbol exported varies among architectures, so we need -# to check many symbols (but only in the current architecture.) No -# matter what symbol is exported, the kernel #defines node_to_cpumask -# to the appropriate function and that's what we use. 
-AC_DEFUN([LC_EXPORT_NODE_TO_CPUMASK], - [LB_LINUX_ARCH - LB_CHECK_SYMBOL_EXPORT([node_to_cpumask], - [arch/$LINUX_ARCH/mm/numa.c], - [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1, - [node_to_cpumask is exported by - the kernel])]) # x86_64 - LB_CHECK_SYMBOL_EXPORT([node_to_cpu_mask], - [arch/$LINUX_ARCH/kernel/smpboot.c], - [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1, - [node_to_cpumask is exported by - the kernel])]) # ia64 - LB_CHECK_SYMBOL_EXPORT([node_2_cpu_mask], - [arch/$LINUX_ARCH/kernel/smpboot.c], - [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1, - [node_to_cpumask is exported by - the kernel])]) # i386 - ]) - -# -# LC_VFS_INTENT_PATCHES -# -# check if the kernel has the VFS intent patches -AC_DEFUN([LC_VFS_INTENT_PATCHES], -[AC_MSG_CHECKING([if the kernel has the VFS intent patches]) -LB_LINUX_TRY_COMPILE([ - #include - #include -],[ - struct nameidata nd; - struct lookup_intent *it; - - it = &nd.intent; - intent_init(it, IT_OPEN); - it->d.lustre.it_disposition = 0; - it->d.lustre.it_data = NULL; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_VFS_INTENT_PATCHES, 1, [VFS intent patches are applied]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_PROG_LINUX -# -# Lustre linux kernel checks -# -AC_DEFUN([LC_PROG_LINUX], -[ LC_LUSTRE_VERSION_H -if test x$enable_server = xyes ; then - LC_CONFIG_BACKINGFS -fi -LC_CONFIG_PINGER -LC_CONFIG_CHECKSUM -LC_CONFIG_LIBLUSTRE_RECOVERY -LC_CONFIG_QUOTA -LC_CONFIG_HEALTH_CHECK_WRITE -LC_CONFIG_LRU_RESIZE - -LC_TASK_PPTR -# RHEL4 patches -LC_EXPORT_TRUNCATE_COMPLETE -LC_EXPORT_D_REHASH_COND -LC_EXPORT___D_REHASH -LC_EXPORT_NODE_TO_CPUMASK - -LC_STRUCT_KIOBUF -LC_FUNC_COND_RESCHED -LC_FUNC_ZAP_PAGE_RANGE -LC_FUNC_PDE -LC_FUNC_DIRECT_IO -LC_HEADER_MM_INLINE -LC_STRUCT_INODE -LC_FUNC_REGISTER_CACHE -LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP -LC_FUNC_DEV_SET_RDONLY -LC_FUNC_FILEMAP_FDATAWRITE -LC_STRUCT_STATFS -LC_FUNC_PAGE_MAPPED -LC_STRUCT_FILE_OPS_UNLOCKED_IOCTL -LC_FILEMAP_POPULATE -LC_D_ADD_UNIQUE -LC_BIT_SPINLOCK_H -LC_XATTR_ACL -LC_STRUCT_INTENT_FILE 
-LC_POSIX_ACL_XATTR_H -LC_EXPORT___IGET -LC_FUNC_SET_FS_PWD -LC_FUNC_MS_FLOCK_LOCK -LC_FUNC_HAVE_CAN_SLEEP_ARG -LC_FUNC_F_OP_FLOCK -LC_QUOTA_READ -LC_COOKIE_FOLLOW_LINK -LC_FUNC_RCU - -# does the kernel have VFS intent patches? -LC_VFS_INTENT_PATCHES - -# 2.6.15 -LC_INODE_I_MUTEX - -# 2.6.17 -LC_DQUOTOFF_MUTEX - -# 2.6.18 -LC_NR_PAGECACHE -LC_STATFS_DENTRY_PARAM -LC_VFS_KERN_MOUNT -LC_INVALIDATEPAGE_RETURN_INT -LC_UMOUNTBEGIN_HAS_VFSMOUNT - -#2.6.18 + RHEL5 (fc6) -LC_PG_FS_MISC - -# 2.6.19 -LC_INODE_BLKSIZE -LC_VFS_READDIR_U64_INO -LC_GENERIC_FILE_READ -LC_GENERIC_FILE_WRITE - -# 2.6.20 -LC_CANCEL_DIRTY_PAGE - -# raid5-zerocopy patch -LC_PAGE_CONSTANT -]) - -# -# LC_CONFIG_CLIENT_SERVER -# -# Build client/server sides of Lustre -# -AC_DEFUN([LC_CONFIG_CLIENT_SERVER], -[AC_MSG_CHECKING([whether to build Lustre server support]) -AC_ARG_ENABLE([server], - AC_HELP_STRING([--disable-server], - [disable Lustre server support]), - [],[enable_server='yes']) -AC_MSG_RESULT([$enable_server]) - -AC_MSG_CHECKING([whether to build Lustre client support]) -AC_ARG_ENABLE([client], - AC_HELP_STRING([--disable-client], - [disable Lustre client support]), - [],[enable_client='yes']) -AC_MSG_RESULT([$enable_client])]) - -# -# LC_CONFIG_LIBLUSTRE -# -# whether to build liblustre -# -AC_DEFUN([LC_CONFIG_LIBLUSTRE], -[AC_MSG_CHECKING([whether to build Lustre library]) -AC_ARG_ENABLE([liblustre], - AC_HELP_STRING([--disable-liblustre], - [disable building of Lustre library]), - [],[enable_liblustre=$with_sysio]) -AC_MSG_RESULT([$enable_liblustre]) -# only build sysio if liblustre is built -with_sysio="$enable_liblustre" - -AC_MSG_CHECKING([whether to build liblustre tests]) -AC_ARG_ENABLE([liblustre-tests], - AC_HELP_STRING([--enable-liblustre-tests], - [enable liblustre tests, if --disable-tests is used]), - [],[enable_liblustre_tests=$enable_tests]) -if test x$enable_liblustre != xyes ; then - enable_liblustre_tests='no' -fi -AC_MSG_RESULT([$enable_liblustre_tests]) - 
-AC_MSG_CHECKING([whether to enable liblustre acl]) -AC_ARG_ENABLE([liblustre-acl], - AC_HELP_STRING([--disable-liblustre-acl], - [disable ACL support for liblustre]), - [],[enable_liblustre_acl=yes]) -if test x$enable_liblustre != xyes ; then - enable_liblustre_acl='no' -fi -AC_MSG_RESULT([$enable_liblustre_acl]) -if test x$enable_liblustre_acl != xno ; then - AC_DEFINE(LIBLUSTRE_POSIX_ACL, 1, Liblustre Support ACL-enabled MDS) -fi - -AC_MSG_CHECKING([whether to build mpitests]) -AC_ARG_ENABLE([mpitests], - AC_HELP_STRING([--enable-mpitests], - [build liblustre mpi tests]), - [],[enable_mpitests=no]) -AC_MSG_RESULT([$enable_mpitests]) - -AC_MSG_NOTICE([Enabling Lustre configure options for libsysio]) -ac_configure_args="$ac_configure_args --with-lustre-hack --with-sockets" - -LC_CONFIG_PINGER -LC_CONFIG_LIBLUSTRE_RECOVERY -]) - -AC_DEFUN([LC_CONFIG_LRU_RESIZE], -[AC_MSG_CHECKING([whether to enable lru self-adjusting]) -AC_ARG_ENABLE([lru_resize], - AC_HELP_STRING([--enable-lru-resize], - [enable lru resize support]), - [],[enable_lru_resize='yes']) -AC_MSG_RESULT([$enable_lru_resize]) -if test x$enable_lru_resize != xno; then - AC_DEFINE(HAVE_LRU_RESIZE_SUPPORT, 1, [Enable lru resize support]) -fi -]) - -# -# LC_CONFIG_QUOTA -# -# whether to enable quota support -# -AC_DEFUN([LC_CONFIG_QUOTA], -[AC_MSG_CHECKING([whether to enable quota support]) -AC_ARG_ENABLE([quota], - AC_HELP_STRING([--enable-quota], - [enable quota support]), - [],[enable_quota='yes']) -AC_MSG_RESULT([$enable_quota]) -if test x$linux25 != xyes; then - enable_quota='no' -fi -if test x$enable_quota != xno; then - AC_DEFINE(HAVE_QUOTA_SUPPORT, 1, [Enable quota support]) -fi -]) - -AC_DEFUN([LC_QUOTA_READ], -[AC_MSG_CHECKING([if kernel supports quota_read]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct super_operations sp; - void *i = (void *)sp.quota_read; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(KERNEL_SUPPORTS_QUOTA_READ, 1, [quota_read found]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# 
LC_COOKIE_FOLLOW_LINK -# -# kernel 2.6.13+ ->follow_link returns a cookie -# - -AC_DEFUN([LC_COOKIE_FOLLOW_LINK], -[AC_MSG_CHECKING([if inode_operations->follow_link returns a cookie]) -LB_LINUX_TRY_COMPILE([ - #include - #include -],[ - struct dentry dentry; - struct nameidata nd; - - dentry.d_inode->i_op->put_link(&dentry, &nd, NULL); -],[ - AC_DEFINE(HAVE_COOKIE_FOLLOW_LINK, 1, [inode_operations->follow_link returns a cookie]) - AC_MSG_RESULT([yes]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_FUNC_RCU -# -# kernels prior than 2.6.0(?) have no RCU supported; in kernel 2.6.5(SUSE), -# call_rcu takes three parameters. -# -AC_DEFUN([LC_FUNC_RCU], -[AC_MSG_CHECKING([if kernel have RCU supported]) -LB_LINUX_TRY_COMPILE([ - #include -],[],[ - AC_DEFINE(HAVE_RCU, 1, [have RCU defined]) - AC_MSG_RESULT([yes]) - - AC_MSG_CHECKING([if call_rcu takes three parameters]) - LB_LINUX_TRY_COMPILE([ - #include - ],[ - struct rcu_head rh; - call_rcu(&rh, (void (*)(struct rcu_head *))1, NULL); - ],[ - AC_DEFINE(HAVE_CALL_RCU_PARAM, 1, [call_rcu takes three parameters]) - AC_MSG_RESULT([yes]) - ],[ - AC_MSG_RESULT([no]) - ]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_CONFIGURE -# -# other configure checks -# -AC_DEFUN([LC_CONFIGURE], -[LC_CONFIG_OBD_BUFFER_SIZE - -# include/liblustre.h -AC_CHECK_HEADERS([asm/page.h sys/user.h sys/vfs.h stdint.h blkid/blkid.h]) - -# include/lustre/lustre_user.h -# See note there re: __ASM_X86_64_PROCESSOR_H -AC_CHECK_HEADERS([linux/quota.h]) - -# liblustre/llite_lib.h -AC_CHECK_HEADERS([xtio.h file.h]) - -# liblustre/dir.c -AC_CHECK_HEADERS([linux/types.h sys/types.h linux/unistd.h unistd.h]) - -# liblustre/lutil.c -AC_CHECK_HEADERS([netinet/in.h arpa/inet.h catamount/data.h]) -AC_CHECK_FUNCS([inet_ntoa]) - -# libsysio/src/readlink.c -LC_READLINK_SSIZE_T - -# utils/llverfs.c -AC_CHECK_HEADERS([ext2fs/ext2fs.h]) - -# Super safe df -AC_ARG_ENABLE([mindf], - AC_HELP_STRING([--enable-mindf], - [Make statfs report the minimum available space on any 
single OST instead of the sum of free space on all OSTs]), - [],[]) -if test "$enable_mindf" = "yes" ; then - AC_DEFINE([MIN_DF], 1, [Report minimum OST free space]) -fi - -AC_ARG_ENABLE([fail_alloc], - AC_HELP_STRING([--disable-fail-alloc], - [disable randomly alloc failure]), - [],[enable_fail_alloc=yes]) -AC_MSG_CHECKING([whether to randomly failing memory alloc]) -AC_MSG_RESULT([$enable_fail_alloc]) -if test x$enable_fail_alloc != xno ; then - AC_DEFINE([RANDOM_FAIL_ALLOC], 1, [enable randomly alloc failure]) -fi - -]) - -# -# LC_CONDITIONALS -# -# AM_CONDITIONALS for lustre -# -AC_DEFUN([LC_CONDITIONALS], -[AM_CONDITIONAL(LIBLUSTRE, test x$enable_liblustre = xyes) -AM_CONDITIONAL(USE_QUILT, test x$QUILT != xno) -AM_CONDITIONAL(LIBLUSTRE_TESTS, test x$enable_liblustre_tests = xyes) -AM_CONDITIONAL(MPITESTS, test x$enable_mpitests = xyes, Build MPI Tests) -AM_CONDITIONAL(CLIENT, test x$enable_client = xyes) -AM_CONDITIONAL(SERVER, test x$enable_server = xyes) -AM_CONDITIONAL(QUOTA, test x$enable_quota = xyes) -AM_CONDITIONAL(BLKID, test x$ac_cv_header_blkid_blkid_h = xyes) -AM_CONDITIONAL(EXT2FS_DEVEL, test x$ac_cv_header_ext2fs_ext2fs_h = xyes) -AM_CONDITIONAL(LIBPTHREAD, test x$enable_libpthread = xyes) -]) - -# -# LC_CONFIG_FILES -# -# files that should be generated with AC_OUTPUT -# -AC_DEFUN([LC_CONFIG_FILES], -[AC_CONFIG_FILES([ -lustre/Makefile -lustre/autoMakefile -lustre/autoconf/Makefile -lustre/contrib/Makefile -lustre/doc/Makefile -lustre/include/Makefile -lustre/include/lustre_ver.h -lustre/include/linux/Makefile -lustre/include/lustre/Makefile -lustre/kernel_patches/targets/2.6-suse.target -lustre/kernel_patches/targets/2.6-vanilla.target -lustre/kernel_patches/targets/2.6-rhel4.target -lustre/kernel_patches/targets/2.6-rhel5.target -lustre/kernel_patches/targets/2.6-fc5.target -lustre/kernel_patches/targets/2.6-patchless.target -lustre/kernel_patches/targets/2.6-sles10.target -lustre/kernel_patches/targets/hp_pnnl-2.4.target 
-lustre/kernel_patches/targets/rh-2.4.target -lustre/kernel_patches/targets/rhel-2.4.target -lustre/kernel_patches/targets/suse-2.4.21-2.target -lustre/kernel_patches/targets/sles-2.4.target -lustre/ldlm/Makefile -lustre/liblustre/Makefile -lustre/liblustre/tests/Makefile -lustre/llite/Makefile -lustre/llite/autoMakefile -lustre/lov/Makefile -lustre/lov/autoMakefile -lustre/lvfs/Makefile -lustre/lvfs/autoMakefile -lustre/mdc/Makefile -lustre/mdc/autoMakefile -lustre/mds/Makefile -lustre/mds/autoMakefile -lustre/obdclass/Makefile -lustre/obdclass/autoMakefile -lustre/obdclass/linux/Makefile -lustre/obdecho/Makefile -lustre/obdecho/autoMakefile -lustre/obdfilter/Makefile -lustre/obdfilter/autoMakefile -lustre/osc/Makefile -lustre/osc/autoMakefile -lustre/ost/Makefile -lustre/ost/autoMakefile -lustre/mgc/Makefile -lustre/mgc/autoMakefile -lustre/mgs/Makefile -lustre/mgs/autoMakefile -lustre/ptlrpc/Makefile -lustre/ptlrpc/autoMakefile -lustre/quota/Makefile -lustre/quota/autoMakefile -lustre/scripts/Makefile -lustre/scripts/version_tag.pl -lustre/tests/Makefile -lustre/utils/Makefile -]) -case $lb_target_os in - darwin) - AC_CONFIG_FILES([ lustre/obdclass/darwin/Makefile ]) - ;; -esac - -]) diff --git a/lustre/autoconf/lustre-version.ac b/lustre/autoconf/lustre-version.ac deleted file mode 100644 index fda4109..0000000 --- a/lustre/autoconf/lustre-version.ac +++ /dev/null @@ -1,36 +0,0 @@ -m4_define([LUSTRE_MAJOR],[1]) -m4_define([LUSTRE_MINOR],[6]) -m4_define([LUSTRE_PATCH],[4]) -m4_define([LUSTRE_FIX],[50]) -# Note: we're starting prerelease versions at 50 this time. - -dnl # liblustre delta is 0.0.1.32 , next version with fixes is ok, but -dnl # after following release candidate/beta would spill this warning already. 
-m4_define([LUSTRE_VER_ALLOWED_OFFSET],["OBD_OCD_VERSION(0,0,1,32)"]) -m4_define([LUSTRE_LIB_VER_OFFSET_WARN],["OBD_OCD_VERSION(0,0,1,32)"]) - -dnl # linux lustre delta is 0.2.0.0 , next major release version is ok -m4_define([LUSTRE_CLI_VER_OFFSET_WARN],["OBD_OCD_VERSION(0,2,0,0)"]) - -dnl # User editable part ends here. ----------------------------------------- - -m4_pattern_allow(AC_LUSTRE) -m4_define([LUSTRE_VERSION],m4_if(LUSTRE_FIX,[0],LUSTRE_MAJOR.LUSTRE_MINOR.LUSTRE_PATCH,LUSTRE_MAJOR.LUSTRE_MINOR.LUSTRE_PATCH.LUSTRE_FIX)) - -[AC_LUSTRE_MAJOR]=LUSTRE_MAJOR -[AC_LUSTRE_MINOR]=LUSTRE_MINOR -[AC_LUSTRE_PATCH]=LUSTRE_PATCH -[AC_LUSTRE_FIX]=LUSTRE_FIX -[AC_LUSTRE_VERSION_STRING]=LUSTRE_VERSION -[AC_LUSTRE_VER_ALLOWED_OFFSET]=LUSTRE_VER_ALLOWED_OFFSET -[AC_LUSTRE_LIB_VER_OFFSET_WARN]=LUSTRE_LIB_VER_OFFSET_WARN -[AC_LUSTRE_CLI_VER_OFFSET_WARN]=LUSTRE_CLI_VER_OFFSET_WARN - -AC_SUBST([AC_LUSTRE_MAJOR]) -AC_SUBST([AC_LUSTRE_MINOR]) -AC_SUBST([AC_LUSTRE_PATCH]) -AC_SUBST([AC_LUSTRE_FIX]) -AC_SUBST([AC_LUSTRE_VERSION_STRING]) -AC_SUBST([AC_LUSTRE_VER_ALLOWED_OFFSET]) -AC_SUBST([AC_LUSTRE_LIB_VER_OFFSET_WARN]) -AC_SUBST([AC_LUSTRE_CLI_VER_OFFSET_WARN]) diff --git a/lustre/conf/.cvsignore b/lustre/conf/.cvsignore deleted file mode 100644 index 282522d..0000000 --- a/lustre/conf/.cvsignore +++ /dev/null @@ -1,2 +0,0 @@ -Makefile -Makefile.in diff --git a/lustre/conf/Makefile.am b/lustre/conf/Makefile.am deleted file mode 100644 index 978cf29..0000000 --- a/lustre/conf/Makefile.am +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. 
-# See the file COPYING in this distribution - -EXTRA_DIST = lustre.dtd lustre.schema slapd-lustre.conf lustre2ldif.xsl top.ldif -ldapconfdir = $(sysconfdir)/openldap -ldapschemadir = $(sysconfdir)/openldap/schema - -if UTILS -ldapconf_SCRIPTS = slapd-lustre.conf -ldapschema_SCRIPTS = lustre.schema -pkgdata_DATA = top.ldif lustre2ldif.xsl -endif diff --git a/lustre/conf/lustre.dtd b/lustre/conf/lustre.dtd deleted file mode 100644 index 360f4a0..0000000 --- a/lustre/conf/lustre.dtd +++ /dev/null @@ -1,145 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - filesystem_ref #REQUIRED > - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/lustre/conf/lustre2ldif.xsl b/lustre/conf/lustre2ldif.xsl deleted file mode 100644 index 58b0649..0000000 --- a/lustre/conf/lustre2ldif.xsl +++ /dev/null @@ -1,308 +0,0 @@ - - - - -fs=lustre -config=,fs=lustre - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/lustre/conf/modules.conf b/lustre/conf/modules.conf deleted file mode 100644 index a5bdefa..0000000 --- a/lustre/conf/modules.conf +++ /dev/null @@ -1,8 +0,0 @@ -# sample modules.conf for autoloading lustre modules on zeroconf clients - -add below kptlrouter portals -#add below ksocknal kptlrouter -#add below kqswnal kptlrouter -add below ptlrpc ksocknal -add below llite lov osc -alias lustre llite diff --git a/lustre/conf/slapd-lustre.conf b/lustre/conf/slapd-lustre.conf deleted file mode 100644 index b93b411..0000000 --- a/lustre/conf/slapd-lustre.conf +++ /dev/null @@ -1,11 +0,0 @@ -####################################################################### -# lustre ldap config database -####################################################################### - -database ldbm -suffix "fs=lustre" -rootdn "cn=Manager,fs=lustre" -include 
/etc/openldap/schema/lustre.schema -rootpw secret -directory /var/lib/ldap/lustre -index objectClass,uuid eq diff --git a/lustre/conf/top.ldif b/lustre/conf/top.ldif deleted file mode 100644 index d0cfdac..0000000 --- a/lustre/conf/top.ldif +++ /dev/null @@ -1,4 +0,0 @@ -dn: fs=lustre -fs:lustre -objectClass: lustre -lustreDesc: Lustre Config diff --git a/lustre/contrib/.cvsignore b/lustre/contrib/.cvsignore deleted file mode 100644 index 282522d..0000000 --- a/lustre/contrib/.cvsignore +++ /dev/null @@ -1,2 +0,0 @@ -Makefile -Makefile.in diff --git a/lustre/contrib/Makefile.am b/lustre/contrib/Makefile.am deleted file mode 100644 index 5a8e66c..0000000 --- a/lustre/contrib/Makefile.am +++ /dev/null @@ -1,5 +0,0 @@ -# Contributions Makefile - -EXTRA_DIST = mpich-*.patch -pkgdata_DATA = $(EXTRA_DIST) - diff --git a/lustre/contrib/README b/lustre/contrib/README deleted file mode 100644 index 73270f3..0000000 --- a/lustre/contrib/README +++ /dev/null @@ -1,2 +0,0 @@ -The files in this directory are user-contributed and are not supported by -CFS in any way. diff --git a/lustre/contrib/mpich-1.2.6-lustre.patch b/lustre/contrib/mpich-1.2.6-lustre.patch deleted file mode 100644 index d32fab9..0000000 --- a/lustre/contrib/mpich-1.2.6-lustre.patch +++ /dev/null @@ -1,1829 +0,0 @@ -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre.c ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre.c 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre.c 2005-12-06 11:54:37.883130927 -0500 -@@ -0,0 +1,37 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 2001 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+/* adioi.h has the ADIOI_Fns_struct define */ -+#include "adioi.h" -+ -+struct ADIOI_Fns_struct ADIO_LUSTRE_operations = { -+ ADIOI_LUSTRE_Open, /* Open */ -+ ADIOI_LUSTRE_ReadContig, /* ReadContig */ -+ ADIOI_LUSTRE_WriteContig, /* WriteContig */ -+ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */ -+ ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */ -+ ADIOI_GEN_SeekIndividual, /* SeekIndividual */ -+ ADIOI_LUSTRE_Fcntl, /* Fcntl */ -+ ADIOI_LUSTRE_SetInfo, /* SetInfo */ -+ ADIOI_GEN_ReadStrided, /* ReadStrided */ -+ ADIOI_GEN_WriteStrided, /* WriteStrided */ -+ ADIOI_LUSTRE_Close, /* Close */ -+ ADIOI_LUSTRE_IreadContig, /* IreadContig */ -+ ADIOI_LUSTRE_IwriteContig, /* IwriteContig */ -+ ADIOI_LUSTRE_ReadDone, /* ReadDone */ -+ ADIOI_LUSTRE_WriteDone, /* WriteDone */ -+ ADIOI_LUSTRE_ReadComplete, /* ReadComplete */ -+ ADIOI_LUSTRE_WriteComplete, /* WriteComplete */ -+ ADIOI_LUSTRE_IreadStrided, /* IreadStrided */ -+ ADIOI_LUSTRE_IwriteStrided, /* IwriteStrided */ -+ ADIOI_GEN_Flush, /* Flush */ -+ ADIOI_LUSTRE_Resize, /* Resize */ -+ ADIOI_GEN_Delete, /* Delete */ -+}; -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_close.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_close.c ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_close.c 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_close.c 2005-12-06 11:54:37.895129327 -0500 -@@ -0,0 +1,32 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_close.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code) -+{ -+ int err; -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_CLOSE"; -+#endif -+ -+ err = close(fd->fd_sys); -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error(fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+} -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_done.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_done.c ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_done.c 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_done.c 2005-12-06 11:54:37.898128927 -0500 -@@ -0,0 +1,188 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_done.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+int ADIOI_LUSTRE_ReadDone(ADIO_Request *request, ADIO_Status *status, int *error_code) -+{ -+#ifndef NO_AIO -+ int done=0; -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_READDONE"; -+#endif -+#ifdef AIO_SUN -+ aio_result_t *result=0, *tmp; -+#else -+ int err; -+#endif -+#ifdef AIO_HANDLE_IN_AIOCB -+ struct aiocb *tmp1; -+#endif -+#endif -+ -+ if (*request == ADIO_REQUEST_NULL) { -+ *error_code = MPI_SUCCESS; -+ return 1; -+ } -+ -+#ifdef NO_AIO -+/* HP, FreeBSD, Linux */ -+#ifdef HAVE_STATUS_SET_BYTES -+ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); -+#endif -+ (*request)->fd->async_count--; -+ ADIOI_Free_request((ADIOI_Req_node *) (*request)); -+ *request = ADIO_REQUEST_NULL; -+ *error_code = MPI_SUCCESS; -+ return 1; -+#endif -+ -+#ifdef AIO_SUN -+ if ((*request)->queued) { -+ tmp = (aio_result_t *) (*request)->handle; -+ if (tmp->aio_return == AIO_INPROGRESS) { -+ done = 0; -+ *error_code = MPI_SUCCESS; -+ } -+ else if (tmp->aio_return != -1) { -+ result = (aio_result_t *) aiowait(0); /* dequeue any one request */ -+ done = 1; -+ (*request)->nbytes = tmp->aio_return; -+ *error_code = MPI_SUCCESS; -+ } -+ else { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(tmp->aio_errno)); -+ return; -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(tmp->aio_errno)); -+ ADIOI_Error((*request)->fd, *error_code, myname); -+#endif -+ } -+ } /* if ((*request)->queued) ... */ -+ else { -+ /* ADIOI_Complete_Async completed this request, but request object -+ was not freed. 
*/ -+ done = 1; -+ *error_code = MPI_SUCCESS; -+ } -+#ifdef HAVE_STATUS_SET_BYTES -+ if (done && ((*request)->nbytes != -1)) -+ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); -+#endif -+ -+#endif -+ -+#ifdef AIO_HANDLE_IN_AIOCB -+/* IBM */ -+ if ((*request)->queued) { -+ tmp1 = (struct aiocb *) (*request)->handle; -+ errno = aio_error(tmp1->aio_handle); -+ if (errno == EINPROG) { -+ done = 0; -+ *error_code = MPI_SUCCESS; -+ } -+ else { -+ err = aio_return(tmp1->aio_handle); -+ (*request)->nbytes = err; -+ errno = aio_error(tmp1->aio_handle); -+ -+ done = 1; -+ -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+ return; -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error((*request)->fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+ } -+ } /* if ((*request)->queued) */ -+ else { -+ done = 1; -+ *error_code = MPI_SUCCESS; -+ } -+#ifdef HAVE_STATUS_SET_BYTES -+ if (done && ((*request)->nbytes != -1)) -+ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); -+#endif -+ -+#elif (!defined(NO_AIO) && !defined(AIO_SUN)) -+/* DEC, SGI IRIX 5 and 6 */ -+ if ((*request)->queued) { -+ errno = aio_error((const struct aiocb *) (*request)->handle); -+ if (errno == EINPROGRESS) { -+ done = 0; -+ *error_code = MPI_SUCCESS; -+ } -+ else { -+ err = aio_return((struct aiocb *) (*request)->handle); -+ (*request)->nbytes = err; -+ errno = aio_error((struct aiocb *) (*request)->handle); -+ -+ done = 1; -+ -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+ return; -+#elif defined(PRINT_ERR_MSG) -+ *error_code = 
MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error((*request)->fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+ } -+ } /* if ((*request)->queued) */ -+ else { -+ done = 1; -+ *error_code = MPI_SUCCESS; -+ } -+#ifdef HAVE_STATUS_SET_BYTES -+ if (done && ((*request)->nbytes != -1)) -+ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); -+#endif -+ -+#endif -+ -+#ifndef NO_AIO -+ if (done) { -+ /* if request is still queued in the system, it is also there -+ on ADIOI_Async_list. Delete it from there. */ -+ if ((*request)->queued) ADIOI_Del_req_from_list(request); -+ -+ (*request)->fd->async_count--; -+ if ((*request)->handle) ADIOI_Free((*request)->handle); -+ ADIOI_Free_request((ADIOI_Req_node *) (*request)); -+ *request = ADIO_REQUEST_NULL; -+ } -+ return done; -+#endif -+ -+} -+ -+ -+int ADIOI_LUSTRE_WriteDone(ADIO_Request *request, ADIO_Status *status, int *error_code) -+{ -+ return ADIOI_LUSTRE_ReadDone(request, status, error_code); -+} -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_fcntl.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_fcntl.c ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_fcntl.c 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_fcntl.c 2005-12-06 11:54:37.901128527 -0500 -@@ -0,0 +1,126 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_fcntl.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+#include "adio_extern.h" -+/* #ifdef MPISGI -+#include "mpisgi2.h" -+#endif */ -+ -+void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code) -+{ -+ int i, ntimes; -+ ADIO_Offset curr_fsize, alloc_size, size, len, done; -+ ADIO_Status status; -+ char *buf; -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_FCNTL"; -+#endif -+ -+ switch(flag) { -+ case ADIO_FCNTL_GET_FSIZE: -+ fcntl_struct->fsize = lseek(fd->fd_sys, 0, SEEK_END); -+ if (fd->fp_sys_posn != -1) -+ lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET); -+ if (fcntl_struct->fsize == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error(fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+ break; -+ -+ case ADIO_FCNTL_SET_DISKSPACE: -+ /* will be called by one process only */ -+ /* On file systems with no preallocation function, I have to -+ explicitly write -+ to allocate space. Since there could be holes in the file, -+ I need to read up to the current file size, write it back, -+ and then write beyond that depending on how much -+ preallocation is needed. 
-+ read/write in sizes of no more than ADIOI_PREALLOC_BUFSZ */ -+ -+ curr_fsize = lseek(fd->fd_sys, 0, SEEK_END); -+ alloc_size = fcntl_struct->diskspace; -+ -+ size = ADIOI_MIN(curr_fsize, alloc_size); -+ -+ ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ; -+ buf = (char *) ADIOI_Malloc(ADIOI_PREALLOC_BUFSZ); -+ done = 0; -+ -+ for (i=0; i curr_fsize) { -+ memset(buf, 0, ADIOI_PREALLOC_BUFSZ); -+ size = alloc_size - curr_fsize; -+ ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ; -+ for (i=0; ifp_sys_posn != -1) -+ lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET); -+ *error_code = MPI_SUCCESS; -+ break; -+ -+ case ADIO_FCNTL_SET_IOMODE: -+ /* for implementing PFS I/O modes. will not occur in MPI-IO -+ implementation.*/ -+ if (fd->iomode != fcntl_struct->iomode) { -+ fd->iomode = fcntl_struct->iomode; -+ MPI_Barrier(MPI_COMM_WORLD); -+ } -+ *error_code = MPI_SUCCESS; -+ break; -+ -+ case ADIO_FCNTL_SET_ATOMICITY: -+ fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1; -+ *error_code = MPI_SUCCESS; -+ break; -+ -+ default: -+ FPRINTF(stderr, "Unknown flag passed to ADIOI_LUSTRE_Fcntl\n"); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+} -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_flush.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_flush.c ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_flush.c 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_flush.c 2005-12-06 11:54:37.903128261 -0500 -@@ -0,0 +1,14 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_flush.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_Flush(ADIO_File fd, int *error_code) -+{ -+ ADIOI_GEN_Flush(fd, error_code); -+} -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre.h mpich-1.2.6/romio/adio/ad_lustre/ad_lustre.h ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre.h 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre.h 2005-12-06 11:54:37.891129861 -0500 -@@ -0,0 +1,36 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre.h,v 1.2 2005/07/07 14:38:17 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ */ -+ -+#ifndef AD_UNIX_INCLUDE -+#define AD_UNIX_INCLUDE -+ -+/* temp*/ -+#define HAVE_ASM_TYPES_H 1 -+ -+#include -+#include -+#include -+#include -+#include "lustre/lustre_user.h" -+#include "adio.h" -+ -+#ifndef NO_AIO -+#ifdef AIO_SUN -+#include -+#else -+#include -+#ifdef NEEDS_ADIOCB_T -+typedef struct adiocb adiocb_t; -+#endif -+#endif -+#endif -+ -+int ADIOI_LUSTRE_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset, -+ int wr, void *handle); -+ -+#endif -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_hints.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_hints.c ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_hints.c 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_hints.c 2005-12-06 11:54:37.904128127 -0500 -@@ -0,0 +1,130 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_hints.c,v 1.2 2005/07/07 14:38:17 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) -+{ -+ char *value, *value_in_fd; -+ int flag, tmp_val, str_factor=-1, str_unit=0, start_iodev=-1; -+ struct lov_user_md lum = { 0 }; -+ int err, myrank, fd_sys, perm, amode, old_mask; -+ -+ if ( (fd->info) == MPI_INFO_NULL) { -+ /* This must be part of the open call. can set striping parameters -+ if necessary. */ -+ MPI_Info_create(&(fd->info)); -+ -+ /* has user specified striping or server buffering parameters -+ and do they have the same value on all processes? */ -+ if (users_info != MPI_INFO_NULL) { -+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); -+ -+ MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag) { -+ str_factor=atoi(value); -+ tmp_val = str_factor; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != str_factor) { -+ FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: the value for key \"striping_factor\" must be the same on all processes\n"); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ -+ MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag) { -+ str_unit=atoi(value); -+ tmp_val = str_unit; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != str_unit) { -+ FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: the value for key \"striping_unit\" must be the same on all processes\n"); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ -+ MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag) { -+ start_iodev=atoi(value); -+ tmp_val = start_iodev; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != start_iodev) { -+ FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: the value for key \"start_iodevice\" must be the same on all processes\n"); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ -+ /* if user has specified striping info, process 0 tries to set it */ -+ if ((str_factor > 0) || (str_unit > 0) || 
(start_iodev >= 0)) { -+ MPI_Comm_rank(fd->comm, &myrank); -+ if (!myrank) { -+ if (fd->perm == ADIO_PERM_NULL) { -+ old_mask = umask(022); -+ umask(old_mask); -+ perm = old_mask ^ 0666; -+ } -+ else perm = fd->perm; -+ -+ amode = 0; -+ if (fd->access_mode & ADIO_CREATE) -+ amode = amode | O_CREAT; -+ if (fd->access_mode & ADIO_RDWR || -+ (fd->access_mode & ADIO_RDONLY && -+ fd->access_mode & ADIO_WRONLY)) -+ amode = amode | O_RDWR; -+ else if (fd->access_mode & ADIO_WRONLY) -+ amode = amode | O_WRONLY; -+ else if (fd->access_mode & ADIO_RDONLY) -+ amode = amode | O_RDONLY; -+ if (fd->access_mode & ADIO_EXCL) -+ amode = amode | O_EXCL; -+ -+ /* we need to create file so ensure this is set */ -+ amode = amode | O_LOV_DELAY_CREATE | O_CREAT; -+ -+ fd_sys = open(fd->filename, amode, perm); -+ if (fd_sys == -1) { -+ if (errno != EEXIST) -+ FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: Failure to open file %s %d %d\n",strerror(errno), amode, perm); -+ } else { -+ lum.lmm_magic = LOV_USER_MAGIC; -+ lum.lmm_pattern = 0; -+ lum.lmm_stripe_size = str_unit; -+ lum.lmm_stripe_count = str_factor; -+ lum.lmm_stripe_offset = start_iodev; -+ -+ err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum); -+ if (err == -1 && errno != EEXIST) { -+ FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: Failure to set stripe info %s \n",strerror(errno)); -+ } -+ -+ close(fd_sys); -+ } -+ -+ } -+ MPI_Barrier(fd->comm); -+ } -+ -+ ADIOI_Free(value); -+ } -+ -+ /* set the values for collective I/O and data sieving parameters */ -+ ADIOI_GEN_SetInfo(fd, users_info, error_code); -+ } -+ -+ else { -+ /* The file has been opened previously and fd->fd_sys is a valid -+ file descriptor. cannot set striping parameters now. 
*/ -+ -+ /* set the values for collective I/O and data sieving parameters */ -+ ADIOI_GEN_SetInfo(fd, users_info, error_code); -+ -+ } -+ -+ *error_code = MPI_SUCCESS; -+} -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_iread.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_iread.c ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_iread.c 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_iread.c 2005-12-06 11:54:37.904128127 -0500 -@@ -0,0 +1,106 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_iread.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_IreadContig(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Request *request, int *error_code) -+{ -+ int len, typesize; -+#ifdef NO_AIO -+ ADIO_Status status; -+#else -+ int err=-1; -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_IREADCONTIG"; -+#endif -+#endif -+ -+ (*request) = ADIOI_Malloc_request(); -+ (*request)->optype = ADIOI_READ; -+ (*request)->fd = fd; -+ (*request)->datatype = datatype; -+ -+ MPI_Type_size(datatype, &typesize); -+ len = count * typesize; -+ -+#ifdef NO_AIO -+ /* HP, FreeBSD, Linux */ -+ /* no support for nonblocking I/O. Use blocking I/O. 
*/ -+ -+ ADIOI_LUSTRE_ReadContig(fd, buf, len, MPI_BYTE, file_ptr_type, offset, -+ &status, error_code); -+ (*request)->queued = 0; -+#ifdef HAVE_STATUS_SET_BYTES -+ if (*error_code == MPI_SUCCESS) { -+ MPI_Get_elements(&status, MPI_BYTE, &len); -+ (*request)->nbytes = len; -+ } -+#endif -+ -+#else -+ if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind; -+ err = ADIOI_LUSTRE_aio(fd, buf, len, offset, 0, &((*request)->handle)); -+ if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind += len; -+ -+ (*request)->queued = 1; -+ ADIOI_Add_req_to_list(request); -+ -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+ return; -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error(fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+#endif /* NO_AIO */ -+ -+ fd->fp_sys_posn = -1; /* set it to null. */ -+ fd->async_count++; -+} -+ -+ -+ -+void ADIOI_LUSTRE_IreadStrided(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Request *request, int -+ *error_code) -+{ -+ ADIO_Status status; -+#ifdef HAVE_STATUS_SET_BYTES -+ int typesize; -+#endif -+ -+ *request = ADIOI_Malloc_request(); -+ (*request)->optype = ADIOI_READ; -+ (*request)->fd = fd; -+ (*request)->datatype = datatype; -+ (*request)->queued = 0; -+ (*request)->handle = 0; -+ -+/* call the blocking version. It is faster because it does data sieving. 
*/ -+ ADIOI_LUSTRE_ReadStrided(fd, buf, count, datatype, file_ptr_type, -+ offset, &status, error_code); -+ -+ fd->async_count++; -+ -+#ifdef HAVE_STATUS_SET_BYTES -+ if (*error_code == MPI_SUCCESS) { -+ MPI_Type_size(datatype, &typesize); -+ (*request)->nbytes = count * typesize; -+ } -+#endif -+} -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_iwrite.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_iwrite.c ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_iwrite.c 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_iwrite.c 2005-12-06 11:54:37.906127861 -0500 -@@ -0,0 +1,268 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_iwrite.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_IwriteContig(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Request *request, int *error_code) -+{ -+ int len, typesize; -+#ifdef NO_AIO -+ ADIO_Status status; -+#else -+ int err=-1; -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_IWRITECONTIG"; -+#endif -+#endif -+ -+ *request = ADIOI_Malloc_request(); -+ (*request)->optype = ADIOI_WRITE; -+ (*request)->fd = fd; -+ (*request)->datatype = datatype; -+ -+ MPI_Type_size(datatype, &typesize); -+ len = count * typesize; -+ -+#ifdef NO_AIO -+ /* HP, FreeBSD, Linux */ -+ /* no support for nonblocking I/O. Use blocking I/O. 
*/ -+ -+ ADIOI_LUSTRE_WriteContig(fd, buf, len, MPI_BYTE, file_ptr_type, offset, -+ &status, error_code); -+ (*request)->queued = 0; -+#ifdef HAVE_STATUS_SET_BYTES -+ if (*error_code == MPI_SUCCESS) { -+ MPI_Get_elements(&status, MPI_BYTE, &len); -+ (*request)->nbytes = len; -+ } -+#endif -+ -+#else -+ if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind; -+ err = ADIOI_LUSTRE_aio(fd, buf, len, offset, 1, &((*request)->handle)); -+ if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind += len; -+ -+ (*request)->queued = 1; -+ ADIOI_Add_req_to_list(request); -+ -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+ return; -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error(fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+#endif /* NO_AIO */ -+ -+ fd->fp_sys_posn = -1; /* set it to null. */ -+ fd->async_count++; -+} -+ -+ -+ -+ -+void ADIOI_LUSTRE_IwriteStrided(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Request *request, int -+ *error_code) -+{ -+ ADIO_Status status; -+#ifdef HAVE_STATUS_SET_BYTES -+ int typesize; -+#endif -+ -+ *request = ADIOI_Malloc_request(); -+ (*request)->optype = ADIOI_WRITE; -+ (*request)->fd = fd; -+ (*request)->datatype = datatype; -+ (*request)->queued = 0; -+ (*request)->handle = 0; -+ -+/* call the blocking version. It is faster because it does data sieving. 
*/ -+ ADIOI_LUSTRE_WriteStrided(fd, buf, count, datatype, file_ptr_type, -+ offset, &status, error_code); -+ -+ fd->async_count++; -+ -+#ifdef HAVE_STATUS_SET_BYTES -+ if (*error_code == MPI_SUCCESS) { -+ MPI_Type_size(datatype, &typesize); -+ (*request)->nbytes = count * typesize; -+ } -+#endif -+} -+ -+ -+/* This function is for implementation convenience. It is not user-visible. -+ It takes care of the differences in the interface for nonblocking I/O -+ on various Unix machines! If wr==1 write, wr==0 read. */ -+ -+int ADIOI_LUSTRE_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset, -+ int wr, void *handle) -+{ -+ int err=-1, fd_sys; -+ -+#ifndef NO_AIO -+ int error_code; -+#ifdef AIO_SUN -+ aio_result_t *result; -+#else -+ struct aiocb *aiocbp; -+#endif -+#endif -+ -+ fd_sys = fd->fd_sys; -+ -+#ifdef AIO_SUN -+ result = (aio_result_t *) ADIOI_Malloc(sizeof(aio_result_t)); -+ result->aio_return = AIO_INPROGRESS; -+ if (wr) err = aiowrite(fd_sys, buf, len, offset, SEEK_SET, result); -+ else err = aioread(fd_sys, buf, len, offset, SEEK_SET, result); -+ -+ if (err == -1) { -+ if (errno == EAGAIN) { -+ /* the man pages say EPROCLIM, but in reality errno is set to EAGAIN! */ -+ -+ /* exceeded the max. no. of outstanding requests. -+ complete all previous async. 
requests and try again.*/ -+ -+ ADIOI_Complete_async(&error_code); -+ if (wr) err = aiowrite(fd_sys, buf, len, offset, SEEK_SET, result); -+ else err = aioread(fd_sys, buf, len, offset, SEEK_SET, result); -+ -+ while (err == -1) { -+ if (errno == EAGAIN) { -+ /* sleep and try again */ -+ sleep(1); -+ if (wr) err = aiowrite(fd_sys, buf, len, offset, SEEK_SET, result); -+ else err = aioread(fd_sys, buf, len, offset, SEEK_SET, result); -+ } -+ else { -+ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ } -+ else { -+ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ -+ *((aio_result_t **) handle) = result; -+#endif -+ -+#ifdef NO_FD_IN_AIOCB -+/* IBM */ -+ aiocbp = (struct aiocb *) ADIOI_Malloc(sizeof(struct aiocb)); -+ aiocbp->aio_whence = SEEK_SET; -+ aiocbp->aio_offset = offset; -+ aiocbp->aio_buf = buf; -+ aiocbp->aio_nbytes = len; -+ if (wr) err = aio_write(fd_sys, aiocbp); -+ else err = aio_read(fd_sys, aiocbp); -+ -+ if (err == -1) { -+ if (errno == EAGAIN) { -+ /* exceeded the max. no. of outstanding requests. -+ complete all previous async. requests and try again. 
*/ -+ -+ ADIOI_Complete_async(&error_code); -+ if (wr) err = aio_write(fd_sys, aiocbp); -+ else err = aio_read(fd_sys, aiocbp); -+ -+ while (err == -1) { -+ if (errno == EAGAIN) { -+ /* sleep and try again */ -+ sleep(1); -+ if (wr) err = aio_write(fd_sys, aiocbp); -+ else err = aio_read(fd_sys, aiocbp); -+ } -+ else { -+ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ } -+ else { -+ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ -+ *((struct aiocb **) handle) = aiocbp; -+ -+#elif (!defined(NO_AIO) && !defined(AIO_SUN)) -+/* DEC, SGI IRIX 5 and 6 */ -+ -+ aiocbp = (struct aiocb *) ADIOI_Calloc(sizeof(struct aiocb), 1); -+ aiocbp->aio_fildes = fd_sys; -+ aiocbp->aio_offset = offset; -+ aiocbp->aio_buf = buf; -+ aiocbp->aio_nbytes = len; -+ -+#ifdef AIO_PRIORITY_DEFAULT -+/* DEC */ -+ aiocbp->aio_reqprio = AIO_PRIO_DFL; /* not needed in DEC Unix 4.0 */ -+ aiocbp->aio_sigevent.sigev_signo = 0; -+#else -+ aiocbp->aio_reqprio = 0; -+#endif -+ -+#ifdef AIO_SIGNOTIFY_NONE -+/* SGI IRIX 6 */ -+ aiocbp->aio_sigevent.sigev_notify = SIGEV_NONE; -+#else -+ aiocbp->aio_sigevent.sigev_signo = 0; -+#endif -+ -+ if (wr) err = aio_write(aiocbp); -+ else err = aio_read(aiocbp); -+ -+ if (err == -1) { -+ if (errno == EAGAIN) { -+ /* exceeded the max. no. of outstanding requests. -+ complete all previous async. requests and try again. 
*/ -+ -+ ADIOI_Complete_async(&error_code); -+ if (wr) err = aio_write(aiocbp); -+ else err = aio_read(aiocbp); -+ -+ while (err == -1) { -+ if (errno == EAGAIN) { -+ /* sleep and try again */ -+ sleep(1); -+ if (wr) err = aio_write(aiocbp); -+ else err = aio_read(aiocbp); -+ } -+ else { -+ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ } -+ else { -+ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ -+ *((struct aiocb **) handle) = aiocbp; -+#endif -+ -+ return err; -+} -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_open.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_open.c ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_open.c 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_open.c 2005-12-06 11:54:37.906127861 -0500 -@@ -0,0 +1,100 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_open.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code) -+{ -+ int perm, old_mask, amode; -+ struct lov_user_md lum = { 0 }; -+ char *value; -+ -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_OPEN"; -+#endif -+ -+ if (fd->perm == ADIO_PERM_NULL) { -+ old_mask = umask(022); -+ umask(old_mask); -+ perm = old_mask ^ 0666; -+ } -+ else perm = fd->perm; -+ -+ amode = 0; -+ if (fd->access_mode & ADIO_CREATE) -+ amode = amode | O_CREAT; -+ if (fd->access_mode & ADIO_RDONLY) -+ amode = amode | O_RDONLY; -+ if (fd->access_mode & ADIO_WRONLY) -+ amode = amode | O_WRONLY; -+ if (fd->access_mode & ADIO_RDWR) -+ amode = amode | O_RDWR; -+ if (fd->access_mode & ADIO_EXCL) -+ amode = amode | O_EXCL; -+ -+ fd->fd_sys = open(fd->filename, amode, perm); -+ -+ if (fd->fd_sys != -1) { -+ int err; -+ -+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); -+ -+ /* get file striping information and set it in info */ -+ lum.lmm_magic = LOV_USER_MAGIC; -+ err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum); -+ -+ if (!err) { -+ sprintf(value, "%d", lum.lmm_stripe_size); -+ MPI_Info_set(fd->info, "striping_unit", value); -+ -+ sprintf(value, "%d", lum.lmm_stripe_count); -+ MPI_Info_set(fd->info, "striping_factor", value); -+ -+ sprintf(value, "%d", lum.lmm_stripe_offset); -+ MPI_Info_set(fd->info, "start_iodevice", value); -+ } -+ ADIOI_Free(value); -+ -+ if (fd->access_mode & ADIO_APPEND) -+ fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); -+ } -+ -+ -+ if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND)) -+ fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); -+ -+ if (fd->fd_sys == -1) { -+#ifdef MPICH2 -+ if (errno == ENAMETOOLONG) -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_BAD_FILE, "**filenamelong", "**filenamelong %s %d", fd->filename, strlen(fd->filename) ); -+ else if (errno == ENOENT) -+ 
*error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_NO_SUCH_FILE, "**filenoexist", "**filenoexist %s", fd->filename ); -+ else if (errno == ENOTDIR || errno == ELOOP) -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_BAD_FILE, "**filenamedir", "**filenamedir %s", fd->filename ); -+ else if (errno == EACCES) { -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_ACCESS, "**fileaccess", "**fileaccess %s", -+ fd->filename ); -+ } -+ else if (errno == EROFS) { -+ /* Read only file or file system and write access requested */ -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_READ_ONLY, "**ioneedrd", 0 ); -+ } -+ else { -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+ } -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error(ADIO_FILE_NULL, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+} -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_rdcoll.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_rdcoll.c ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_rdcoll.c 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_rdcoll.c 2005-12-06 11:54:37.907127727 -0500 -@@ -0,0 +1,18 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_rdcoll.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code) -+{ -+ ADIOI_GEN_ReadStridedColl(fd, buf, count, datatype, file_ptr_type, -+ offset, status, error_code); -+} -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_read.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_read.c ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_read.c 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_read.c 2005-12-06 11:54:37.907127727 -0500 -@@ -0,0 +1,67 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_read.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int *error_code) -+{ -+ int err=-1, datatype_size, len; -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_READCONTIG"; -+#endif -+ -+ MPI_Type_size(datatype, &datatype_size); -+ len = datatype_size * count; -+ -+ if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { -+ if (fd->fp_sys_posn != offset) -+ lseek(fd->fd_sys, offset, SEEK_SET); -+ err = read(fd->fd_sys, buf, len); -+ fd->fp_sys_posn = offset + len; -+ /* individual file pointer not updated */ -+ } -+ else { /* read from curr. location of ind. 
file pointer */ -+ if (fd->fp_sys_posn != fd->fp_ind) -+ lseek(fd->fd_sys, fd->fp_ind, SEEK_SET); -+ err = read(fd->fd_sys, buf, len); -+ fd->fp_ind += err; -+ fd->fp_sys_posn = fd->fp_ind; -+ } -+ -+#ifdef HAVE_STATUS_SET_BYTES -+ if (err != -1) MPIR_Status_set_bytes(status, datatype, err); -+#endif -+ -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error(fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+} -+ -+ -+ -+ -+void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code) -+{ -+ ADIOI_GEN_ReadStrided(fd, buf, count, datatype, file_ptr_type, -+ offset, status, error_code); -+} -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_resize.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_resize.c ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_resize.c 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_resize.c 2005-12-06 11:54:37.909127460 -0500 -@@ -0,0 +1,32 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_resize.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_Resize(ADIO_File fd, ADIO_Offset size, int *error_code) -+{ -+ int err; -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_RESIZE"; -+#endif -+ -+ err = ftruncate(fd->fd_sys, size); -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error(fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+} -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_seek.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_seek.c ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_seek.c 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_seek.c 2005-12-06 11:54:37.911127194 -0500 -@@ -0,0 +1,15 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_seek.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+ADIO_Offset ADIOI_LUSTRE_SeekIndividual(ADIO_File fd, ADIO_Offset offset, -+ int whence, int *error_code) -+{ -+ return ADIOI_GEN_SeekIndividual(fd, offset, whence, error_code); -+} -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_wait.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_wait.c ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_wait.c 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_wait.c 2005-12-06 11:54:37.914126794 -0500 -@@ -0,0 +1,188 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_wait.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_ReadComplete(ADIO_Request *request, ADIO_Status *status, int *error_code) -+{ -+#ifndef NO_AIO -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_READCOMPLETE"; -+#endif -+#ifdef AIO_SUN -+ aio_result_t *result=0, *tmp; -+#else -+ int err; -+#endif -+#ifdef AIO_HANDLE_IN_AIOCB -+ struct aiocb *tmp1; -+#endif -+#endif -+ -+ if (*request == ADIO_REQUEST_NULL) { -+ *error_code = MPI_SUCCESS; -+ return; -+ } -+ -+#ifdef AIO_SUN -+ if ((*request)->queued) { /* dequeue it */ -+ tmp = (aio_result_t *) (*request)->handle; -+ while (tmp->aio_return == AIO_INPROGRESS) usleep(1000); -+ /* sleep for 1 ms., until done. Is 1 ms. a good number? 
*/ -+ /* when done, dequeue any one request */ -+ result = (aio_result_t *) aiowait(0); -+ -+ (*request)->nbytes = tmp->aio_return; -+ -+ if (tmp->aio_return == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(tmp->aio_errno)); -+ return; -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(tmp->aio_errno)); -+ ADIOI_Error((*request)->fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+ -+/* aiowait only dequeues a request. The completion of a request can be -+ checked by just checking the aio_return flag in the handle passed -+ to the original aioread()/aiowrite(). Therefore, I need to ensure -+ that aiowait() is called exactly once for each previous -+ aioread()/aiowrite(). This is also taken care of in ADIOI_xxxDone */ -+ } -+ else *error_code = MPI_SUCCESS; -+ -+#ifdef HAVE_STATUS_SET_BYTES -+ if ((*request)->nbytes != -1) -+ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); -+#endif -+ -+#endif -+ -+#ifdef AIO_HANDLE_IN_AIOCB -+/* IBM */ -+ if ((*request)->queued) { -+ do { -+ err = aio_suspend(1, (struct aiocb **) &((*request)->handle)); -+ } while ((err == -1) && (errno == EINTR)); -+ -+ tmp1 = (struct aiocb *) (*request)->handle; -+ if (err != -1) { -+ err = aio_return(tmp1->aio_handle); -+ (*request)->nbytes = err; -+ errno = aio_error(tmp1->aio_handle); -+ } -+ else (*request)->nbytes = -1; -+ -+/* on DEC, it is required to call aio_return to dequeue the request. -+ IBM man pages don't indicate what function to use for dequeue. -+ I'm assuming it is aio_return! POSIX says aio_return may be called -+ only once on a given handle. 
*/ -+ -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+ return; -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error((*request)->fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+ } /* if ((*request)->queued) */ -+ else *error_code = MPI_SUCCESS; -+ -+#ifdef HAVE_STATUS_SET_BYTES -+ if ((*request)->nbytes != -1) -+ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); -+#endif -+ -+#elif (!defined(NO_AIO) && !defined(AIO_SUN)) -+/* DEC, SGI IRIX 5 and 6 */ -+ if ((*request)->queued) { -+ do { -+ err = aio_suspend((const aiocb_t **) &((*request)->handle), 1, 0); -+ } while ((err == -1) && (errno == EINTR)); -+ -+ if (err != -1) { -+ err = aio_return((struct aiocb *) (*request)->handle); -+ (*request)->nbytes = err; -+ errno = aio_error((struct aiocb *) (*request)->handle); -+ } -+ else (*request)->nbytes = -1; -+ -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+ return; -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error((*request)->fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+ } /* if ((*request)->queued) */ -+ else *error_code = MPI_SUCCESS; -+#ifdef HAVE_STATUS_SET_BYTES -+ if ((*request)->nbytes != -1) -+ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); -+#endif -+#endif -+ -+#ifndef NO_AIO -+ if ((*request)->queued != -1) { -+ -+ /* queued = -1 is an internal hack used when the request 
must -+ be completed, but the request object should not be -+ freed. This is used in ADIOI_Complete_async, because the user -+ will call MPI_Wait later, which would require status to -+ be filled. Ugly but works. queued = -1 should be used only -+ in ADIOI_Complete_async. -+ This should not affect the user in any way. */ -+ -+ /* if request is still queued in the system, it is also there -+ on ADIOI_Async_list. Delete it from there. */ -+ if ((*request)->queued) ADIOI_Del_req_from_list(request); -+ -+ (*request)->fd->async_count--; -+ if ((*request)->handle) ADIOI_Free((*request)->handle); -+ ADIOI_Free_request((ADIOI_Req_node *) (*request)); -+ *request = ADIO_REQUEST_NULL; -+ } -+ -+#else -+/* HP, FreeBSD, Linux */ -+ -+#ifdef HAVE_STATUS_SET_BYTES -+ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); -+#endif -+ (*request)->fd->async_count--; -+ ADIOI_Free_request((ADIOI_Req_node *) (*request)); -+ *request = ADIO_REQUEST_NULL; -+ *error_code = MPI_SUCCESS; -+#endif -+} -+ -+ -+void ADIOI_LUSTRE_WriteComplete(ADIO_Request *request, ADIO_Status *status, int *error_code) -+{ -+ ADIOI_LUSTRE_ReadComplete(request, status, error_code); -+} -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_wrcoll.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_wrcoll.c ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_wrcoll.c 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_wrcoll.c 2005-12-06 11:54:37.914126794 -0500 -@@ -0,0 +1,18 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_wrcoll.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code) -+{ -+ ADIOI_GEN_WriteStridedColl(fd, buf, count, datatype, file_ptr_type, -+ offset, status, error_code); -+} -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_write.c mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_write.c ---- mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_write.c 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/ad_lustre_write.c 2005-12-06 11:54:37.914126794 -0500 -@@ -0,0 +1,66 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_write.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int *error_code) -+{ -+ int err=-1, datatype_size, len; -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_WRITECONTIG"; -+#endif -+ -+ MPI_Type_size(datatype, &datatype_size); -+ len = datatype_size * count; -+ -+ if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { -+ if (fd->fp_sys_posn != offset) -+ lseek(fd->fd_sys, offset, SEEK_SET); -+ err = write(fd->fd_sys, buf, len); -+ fd->fp_sys_posn = offset + err; -+ /* individual file pointer not updated */ -+ } -+ else { /* write from curr. location of ind. 
file pointer */ -+ if (fd->fp_sys_posn != fd->fp_ind) -+ lseek(fd->fd_sys, fd->fp_ind, SEEK_SET); -+ err = write(fd->fd_sys, buf, len); -+ fd->fp_ind += err; -+ fd->fp_sys_posn = fd->fp_ind; -+ } -+ -+#ifdef HAVE_STATUS_SET_BYTES -+ if (err != -1 && status) MPIR_Status_set_bytes(status, datatype, err); -+#endif -+ -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error(fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+} -+ -+ -+ -+void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code) -+{ -+ ADIOI_GEN_WriteStrided(fd, buf, count, datatype, file_ptr_type, -+ offset, status, error_code); -+} -diff -r -u --new-file mpich-1.2.6/romio/adio/ad_lustre/Makefile.in mpich-1.2.6/romio/adio/ad_lustre/Makefile.in ---- mpich-1.2.6/romio/adio/ad_lustre/Makefile.in 1969-12-31 19:00:00.000000000 -0500 -+++ mpich-1.2.6/romio/adio/ad_lustre/Makefile.in 2005-12-06 11:54:37.883130927 -0500 -@@ -0,0 +1,47 @@ -+CC = @CC@ -+AR = @AR@ -+LIBNAME = @LIBNAME@ -+srcdir = @srcdir@ -+CC_SHL = @CC_SHL@ -+SHLIBNAME = @SHLIBNAME@ -+ -+INCLUDE_DIR = -I@MPI_INCLUDE_DIR@ -I${srcdir}/../include -I../include -+CFLAGS = @CFLAGS@ $(INCLUDE_DIR) -+ -+C_COMPILE_SHL = $(CC_SHL) @CFLAGS@ $(INCLUDE_DIR) -+ -+@VPATH@ -+ -+AD_LUSTRE_OBJECTS = ad_lustre_close.o ad_lustre_read.o \ -+ ad_lustre_open.o ad_lustre_write.o ad_lustre_done.o \ -+ ad_lustre_fcntl.o ad_lustre_iread.o ad_lustre_iwrite.o ad_lustre_wait.o \ -+ ad_lustre_resize.o ad_lustre_hints.o \ -+ ad_lustre.o -+ -+ -+default: $(LIBNAME) -+ @if [ "@ENABLE_SHLIB@" != "none" ] ; then \ -+ $(MAKE) 
$(SHLIBNAME).la ;\ -+ fi -+ -+.SUFFIXES: $(SUFFIXES) .p .lo -+ -+.c.o: -+ $(CC) $(CFLAGS) -c $< -+.c.lo: -+ $(C_COMPILE_SHL) -c $< -+ @mv -f $*.o $*.lo -+ -+$(LIBNAME): $(AD_LUSTRE_OBJECTS) -+ $(AR) $(LIBNAME) $(AD_LUSTRE_OBJECTS) -+ -+AD_LUSTRE_LOOBJECTS=$(AD_LUSTRE_OBJECTS:.o=.lo) -+$(SHLIBNAME).la: $(AD_LUSTRE_LOOBJECTS) -+ $(AR) $(SHLIBNAME).la $(AD_LUSTRE_LOOBJECTS) -+ -+coverage: -+ -@for file in ${AD_LUSTRE_OBJECTS:.o=.c} ; do \ -+ gcov -b -f $$file ; done -+ -+clean: -+ @rm -f *.o *.lo ---- mpich-1.2.6/romio/Makefile.in 2004-01-27 18:27:35.000000000 -0500 -+++ mpich-1.2.6/romio/Makefile.in 2005-12-06 11:54:38.000000000 -0500 -@@ -14,7 +14,7 @@ DIRS = mpi-io adio/common - MPIO_DIRS = mpi-io - EXTRA_SRC_DIRS = @EXTRA_SRC_DIRS@ - FILE_SYS_DIRS = @FILE_SYS_DIRS@ --ALL_DIRS = mpi-io mpi-io/fortran mpi2-other/info mpi2-other/info/fortran mpi2-other/array mpi2-other/array/fortran adio/common adio/ad_pfs adio/ad_piofs adio/ad_nfs adio/ad_ufs adio/ad_xfs adio/ad_hfs adio/ad_sfs adio/ad_testfs adio/ad_pvfs adio/ad_pvfs2 test -+ALL_DIRS = mpi-io mpi-io/fortran mpi2-other/info mpi2-other/info/fortran mpi2-other/array mpi2-other/array/fortran adio/common adio/ad_pfs adio/ad_piofs adio/ad_nfs adio/ad_ufs adio/ad_xfs adio/ad_hfs adio/ad_sfs adio/ad_testfs adio/ad_pvfs adio/ad_pvfs2 adio/ad_lustre test - SHELL = /bin/sh - - @VPATH@ ---- mpich-1.2.6/romio/configure.in 2004-08-02 09:37:31.000000000 -0400 -+++ mpich-1.2.6/romio/configure.in 2005-12-06 11:54:38.000000000 -0500 -@@ -90,7 +90,7 @@ MPIO_REQ_REAL_POBJECTS="_iotest.o _iowai - # - have_aio=no - # --known_filesystems="nfs ufs pfs piofs pvfs pvfs2 testfs xfs hfs sfs" -+known_filesystems="nfs ufs pfs piofs pvfs pvfs2 testfs xfs hfs sfs lustre" - known_mpi_impls="mpich_mpi sgi_mpi hp_mpi cray_mpi lam_mpi" - # - # Defaults -@@ -1270,6 +1270,9 @@ fi - if test -n "$file_system_testfs"; then - AC_DEFINE(ROMIO_TESTFS,1,[Define for TESTFS]) - fi -+if test -n "$file_system_lustre"; then -+ AC_DEFINE(ROMIO_LUSTRE,1,[Define for 
LUSTRE]) -+fi - if test -n "$file_system_piofs"; then - AC_DEFINE(PIOFS,1,[Define for PIOFS]) - USER_CFLAGS="$USER_CFLAGS -bI:/usr/include/piofs/piofs.exp" -@@ -1634,7 +1637,7 @@ AC_OUTPUT(Makefile localdefs mpi-io/Make - adio/ad_nfs/Makefile adio/ad_ufs/Makefile \ - adio/ad_xfs/Makefile adio/ad_hfs/Makefile \ - adio/ad_sfs/Makefile adio/ad_pfs/Makefile \ -- adio/ad_testfs/Makefile adio/ad_pvfs/Makefile \ -+ adio/ad_testfs/Makefile adio/ad_lustre/Makefile adio/ad_pvfs/Makefile \ - adio/ad_pvfs2/Makefile adio/ad_piofs/Makefile \ - mpi-io/fortran/Makefile mpi2-other/info/fortran/Makefile \ - mpi2-other/array/fortran/Makefile test/fmisc.f \ ---- mpich-1.2.6/romio/configure 2004-08-04 12:08:28.000000000 -0400 -+++ mpich-1.2.6/romio/configure 2005-12-06 11:54:38.000000000 -0500 -@@ -623,7 +623,7 @@ MPIO_REQ_REAL_POBJECTS="_iotest.o _iowai - # - have_aio=no - # --known_filesystems="nfs ufs pfs piofs pvfs pvfs2 testfs xfs hfs sfs" -+known_filesystems="nfs ufs pfs piofs pvfs pvfs2 testfs lustre xfs hfs sfs" - known_mpi_impls="mpich_mpi sgi_mpi hp_mpi cray_mpi lam_mpi" - # - # Defaults -@@ -4022,6 +4022,13 @@ if test -n "$file_system_testfs"; then - EOF - - fi -+if test -n "$file_system_lustre"; then -+ cat >> confdefs.h <<\EOF -+#define LUSTRE 1 -+EOF -+ -+fi -+ - if test -n "$file_system_piofs"; then - cat >> confdefs.h <<\EOF - #define PIOFS 1 -@@ -4746,7 +4753,7 @@ trap 'rm -fr `echo "Makefile localdefs m - adio/ad_xfs/Makefile adio/ad_hfs/Makefile \ - adio/ad_sfs/Makefile adio/ad_pfs/Makefile \ - adio/ad_testfs/Makefile adio/ad_pvfs/Makefile \ -- adio/ad_pvfs2/Makefile adio/ad_piofs/Makefile \ -+ adio/ad_pvfs2/Makefile adio/ad_piofs/Makefile adio/ad_lustre/Makefile\ - mpi-io/fortran/Makefile mpi2-other/info/fortran/Makefile \ - mpi2-other/array/fortran/Makefile test/fmisc.f \ - test/fcoll_test.f test/pfcoll_test.f test/fperf.f adio/include/romioconf.h" | sed "s/:[^ ]*//g"` conftest*; exit 1' 1 2 15 -@@ -4912,7 +4919,7 @@ CONFIG_FILES=\${CONFIG_FILES-"Makefile l - 
adio/ad_nfs/Makefile adio/ad_ufs/Makefile \ - adio/ad_xfs/Makefile adio/ad_hfs/Makefile \ - adio/ad_sfs/Makefile adio/ad_pfs/Makefile \ -- adio/ad_testfs/Makefile adio/ad_pvfs/Makefile \ -+ adio/ad_testfs/Makefile adio/ad_lustre/Makefile adio/ad_pvfs/Makefile \ - adio/ad_pvfs2/Makefile adio/ad_piofs/Makefile \ - mpi-io/fortran/Makefile mpi2-other/info/fortran/Makefile \ - mpi2-other/array/fortran/Makefile test/fmisc.f \ ---- mpich-1.2.6/romio/adio/include/romioconf.h.in 2004-08-04 12:08:28.000000000 -0400 -+++ mpich-1.2.6/romio/adio/include/romioconf.h.in 2005-12-06 11:54:38.000000000 -0500 -@@ -192,6 +192,9 @@ - /* Define for TESTFS */ - #undef ROMIO_TESTFS - -+/* Define for LUSTRE */ -+#undef LUSTRE -+ - /* Define for PIOFS */ - #undef PIOFS - ---- mpich-1.2.6/romio/adio/include/mpio_error.h 2002-11-15 11:26:23.000000000 -0500 -+++ mpich-1.2.6/romio/adio/include/mpio_error.h 2005-12-06 11:54:38.000000000 -0500 -@@ -62,6 +62,7 @@ - #define MPIR_ERR_FILETYPE 33 - #define MPIR_ERR_NO_NTFS 35 - #define MPIR_ERR_NO_TESTFS 36 -+#define MPIR_ERR_NO_LUSTRE 37 - - /* MPI_ERR_COMM */ - #ifndef MPIR_ERR_COMM_NULL ---- mpich-1.2.6/romio/adio/include/adioi_fs_proto.h 2003-06-24 18:48:23.000000000 -0400 -+++ mpich-1.2.6/romio/adio/include/adioi_fs_proto.h 2005-12-06 11:54:38.000000000 -0500 -@@ -261,6 +261,68 @@ ADIO_Offset ADIOI_UFS_SeekIndividual(ADI - void ADIOI_UFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); - #endif - -+#ifdef LUSTRE -+extern struct ADIOI_Fns_struct ADIO_LUSTRE_operations; -+ -+void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code); -+void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code); -+void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code); -+void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code); 
-+void ADIOI_LUSTRE_IwriteContig(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Request *request, int -+ *error_code); -+void ADIOI_LUSTRE_IreadContig(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Request *request, int -+ *error_code); -+int ADIOI_LUSTRE_ReadDone(ADIO_Request *request, ADIO_Status *status, int -+ *error_code); -+int ADIOI_LUSTRE_WriteDone(ADIO_Request *request, ADIO_Status *status, int -+ *error_code); -+void ADIOI_LUSTRE_ReadComplete(ADIO_Request *request, ADIO_Status *status, int -+ *error_code); -+void ADIOI_LUSTRE_WriteComplete(ADIO_Request *request, ADIO_Status *status, -+ int *error_code); -+void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int -+ *error_code); -+void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code); -+void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code); -+void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code); -+void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code); -+void ADIOI_LUSTRE_IreadStrided(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Request *request, int -+ *error_code); -+void ADIOI_LUSTRE_IwriteStrided(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Request *request, int -+ *error_code); -+void ADIOI_LUSTRE_Flush(ADIO_File fd, int *error_code); -+void 
ADIOI_LUSTRE_Resize(ADIO_File fd, ADIO_Offset size, int *error_code); -+ADIO_Offset ADIOI_LUSTRE_SeekIndividual(ADIO_File fd, ADIO_Offset offset, -+ int whence, int *error_code); -+void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); -+#endif -+ - #ifdef ROMIO_NTFS - extern struct ADIOI_Fns_struct ADIO_NTFS_operations; - ---- mpich-1.2.6/romio/adio/include/adio.h 2004-06-07 13:59:57.000000000 -0400 -+++ mpich-1.2.6/romio/adio/include/adio.h 2005-12-06 11:54:38.000000000 -0500 -@@ -276,6 +276,7 @@ typedef struct { - #define ADIO_NTFS 158 /* NTFS for Windows NT */ - #define ADIO_TESTFS 159 /* fake file system for testing */ - #define ADIO_PVFS2 160 /* PVFS2: 2nd generation PVFS */ -+#define ADIO_LUSTRE 161 /* Lustre */ - - #define ADIO_SEEK_SET SEEK_SET - #define ADIO_SEEK_CUR SEEK_CUR ---- mpich-1.2.6/romio/adio/common/setfn.c 2003-06-24 18:48:18.000000000 -0400 -+++ mpich-1.2.6/romio/adio/common/setfn.c 2005-12-06 11:54:38.000000000 -0500 -@@ -114,6 +114,16 @@ void ADIOI_SetFunctions(ADIO_File fd) - #endif - break; - -+ case ADIO_LUSTRE: -+#ifdef LUSTRE -+ *(fd->fns) = ADIO_LUSTRE_operations; -+#else -+ FPRINTF(stderr, "ADIOI_SetFunctions: ROMIO has not been configured to use the LUSTRE file system\n"); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+#endif -+ break; -+ -+ - default: - FPRINTF(stderr, "ADIOI_SetFunctions: Unsupported file system type\n"); - MPI_Abort(MPI_COMM_WORLD, 1); ---- mpich-1.2.6/romio/adio/common/ad_fstype.c 2003-09-04 16:24:44.000000000 -0400 -+++ mpich-1.2.6/romio/adio/common/ad_fstype.c 2005-12-06 11:54:38.000000000 -0500 -@@ -204,6 +204,11 @@ static void ADIO_FileSysType_fncall(char - } - } - #elif defined(LINUX) -+#warning use correct include -+# if defined (LUSTRE) -+#define LL_SUPER_MAGIC 0x0BD00BD0 -+# endif -+ - do { - err = statfs(filename, &fsbuf); - } while (err && (errno == ESTALE)); -@@ -218,6 +223,9 @@ static void ADIO_FileSysType_fncall(char - else { - /* FPRINTF(stderr, "%d\n", fsbuf.f_type);*/ - if 
(fsbuf.f_type == NFS_SUPER_MAGIC) *fstype = ADIO_NFS; -+# if defined (LUSTRE) -+ else if (fsbuf.f_type == LL_SUPER_MAGIC) *fstype = ADIO_LUSTRE; -+#endif - # if defined(ROMIO_PVFS) - else if (fsbuf.f_type == PVFS_SUPER_MAGIC) *fstype = ADIO_PVFS; - # endif -@@ -359,6 +367,11 @@ static void ADIO_FileSysType_prefix(char - { - *fstype = ADIO_TESTFS; - } -+ else if (!strncmp(filename, "lustre:", 7) -+ || !strncmp(filename, "LUSTRE:", 7)) -+ { -+ *fstype = ADIO_LUSTRE; -+ } - else { - #ifdef ROMIO_NTFS - *fstype = ADIO_NTFS; -@@ -644,6 +657,24 @@ void ADIO_ResolveFileType(MPI_Comm comm, - *ops = &ADIO_TESTFS_operations; - #endif - } -+ if (file_system == ADIO_LUSTRE) { -+#ifndef LUSTRE -+# ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**iofstypeunsupported", 0); -+ return; -+# elif defined(PRINT_ERR_MSG) -+ FPRINTF(stderr, "ADIO_ResolveFileType: ROMIO has not been configured to use the LUSTRE file system\n"); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+# else /* MPICH-1 */ -+ myerrcode = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ERR_NO_LUSTRE, -+ myname, (char *) 0, (char *) 0); -+ *error_code = ADIOI_Error(MPI_FILE_NULL, myerrcode, myname); -+# endif -+ return; -+#else -+ *ops = &ADIO_LUSTRE_operations; -+#endif -+ } - *error_code = MPI_SUCCESS; - *fstype = file_system; - return; diff --git a/lustre/contrib/mpich2-1.0.3.patch b/lustre/contrib/mpich2-1.0.3.patch deleted file mode 100644 index 78dda9b..0000000 --- a/lustre/contrib/mpich2-1.0.3.patch +++ /dev/null @@ -1,1831 +0,0 @@ -Date: Fri, 08 Jun 2007 14:04:34 -0400 -From: Weikuan Yu -To: Weikuan Yu -Subject: Re: [Lustre-discuss] MPI-IO for Lustre -Cc: lustre-discuss@clusterfs.com - - -This is the MPICH2 patch I originally started as a base for some ROMIO -optimizations over Lustre. It should work fine for MPICH2-1.0.3 on -experimental systems. However, use it as your risk :) - -Given time, I will try to push out my optimizations after some cleanup. 
I -would very happy to hear feedbacks on what features people would need most -at the time. - --- -Weikuan - - -diff -ruN romio-orig/adio/ad_lustre/ad_lustre.c romio/adio/ad_lustre/ad_lustre.c ---- romio-orig/adio/ad_lustre/ad_lustre.c 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre.c 2006-09-06 18:40:56.000844619 -0400 -@@ -0,0 +1,37 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 2001 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ */ -+ -+#include "ad_lustre.h" -+ -+/* adioi.h has the ADIOI_Fns_struct define */ -+#include "adioi.h" -+ -+struct ADIOI_Fns_struct ADIO_LUSTRE_operations = { -+ ADIOI_LUSTRE_Open, /* Open */ -+ ADIOI_LUSTRE_ReadContig, /* ReadContig */ -+ ADIOI_LUSTRE_WriteContig, /* WriteContig */ -+ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */ -+ ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */ -+ ADIOI_GEN_SeekIndividual, /* SeekIndividual */ -+ ADIOI_LUSTRE_Fcntl, /* Fcntl */ -+ ADIOI_LUSTRE_SetInfo, /* SetInfo */ -+ ADIOI_GEN_ReadStrided, /* ReadStrided */ -+ ADIOI_GEN_WriteStrided, /* WriteStrided */ -+ ADIOI_LUSTRE_Close, /* Close */ -+ ADIOI_LUSTRE_IreadContig, /* IreadContig */ -+ ADIOI_LUSTRE_IwriteContig, /* IwriteContig */ -+ ADIOI_LUSTRE_ReadDone, /* ReadDone */ -+ ADIOI_LUSTRE_WriteDone, /* WriteDone */ -+ ADIOI_LUSTRE_ReadComplete, /* ReadComplete */ -+ ADIOI_LUSTRE_WriteComplete, /* WriteComplete */ -+ ADIOI_LUSTRE_IreadStrided, /* IreadStrided */ -+ ADIOI_LUSTRE_IwriteStrided, /* IwriteStrided */ -+ ADIOI_GEN_Flush, /* Flush */ -+ ADIOI_LUSTRE_Resize, /* Resize */ -+ ADIOI_GEN_Delete, /* Delete */ -+}; -diff -ruN romio-orig/adio/ad_lustre/ad_lustre_close.c romio/adio/ad_lustre/ad_lustre_close.c ---- romio-orig/adio/ad_lustre/ad_lustre_close.c 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre_close.c 2006-09-06 17:10:35.000683211 -0400 -@@ -0,0 +1,32 @@ -+/* -*- Mode: 
C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_close.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code) -+{ -+ int err; -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_CLOSE"; -+#endif -+ -+ err = close(fd->fd_sys); -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error(fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+} -diff -ruN romio-orig/adio/ad_lustre/ad_lustre_done.c romio/adio/ad_lustre/ad_lustre_done.c ---- romio-orig/adio/ad_lustre/ad_lustre_done.c 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre_done.c 2006-09-06 17:10:35.000692922 -0400 -@@ -0,0 +1,188 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_done.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+int ADIOI_LUSTRE_ReadDone(ADIO_Request *request, ADIO_Status *status, int *error_code) -+{ -+#ifndef NO_AIO -+ int done=0; -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_READDONE"; -+#endif -+#ifdef AIO_SUN -+ aio_result_t *result=0, *tmp; -+#else -+ int err; -+#endif -+#ifdef AIO_HANDLE_IN_AIOCB -+ struct aiocb *tmp1; -+#endif -+#endif -+ -+ if (*request == ADIO_REQUEST_NULL) { -+ *error_code = MPI_SUCCESS; -+ return 1; -+ } -+ -+#ifdef NO_AIO -+/* HP, FreeBSD, Linux */ -+#ifdef HAVE_STATUS_SET_BYTES -+ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); -+#endif -+ (*request)->fd->async_count--; -+ ADIOI_Free_request((ADIOI_Req_node *) (*request)); -+ *request = ADIO_REQUEST_NULL; -+ *error_code = MPI_SUCCESS; -+ return 1; -+#endif -+ -+#ifdef AIO_SUN -+ if ((*request)->queued) { -+ tmp = (aio_result_t *) (*request)->handle; -+ if (tmp->aio_return == AIO_INPROGRESS) { -+ done = 0; -+ *error_code = MPI_SUCCESS; -+ } -+ else if (tmp->aio_return != -1) { -+ result = (aio_result_t *) aiowait(0); /* dequeue any one request */ -+ done = 1; -+ (*request)->nbytes = tmp->aio_return; -+ *error_code = MPI_SUCCESS; -+ } -+ else { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(tmp->aio_errno)); -+ return; -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(tmp->aio_errno)); -+ ADIOI_Error((*request)->fd, *error_code, myname); -+#endif -+ } -+ } /* if ((*request)->queued) ... */ -+ else { -+ /* ADIOI_Complete_Async completed this request, but request object -+ was not freed. 
*/ -+ done = 1; -+ *error_code = MPI_SUCCESS; -+ } -+#ifdef HAVE_STATUS_SET_BYTES -+ if (done && ((*request)->nbytes != -1)) -+ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); -+#endif -+ -+#endif -+ -+#ifdef AIO_HANDLE_IN_AIOCB -+/* IBM */ -+ if ((*request)->queued) { -+ tmp1 = (struct aiocb *) (*request)->handle; -+ errno = aio_error(tmp1->aio_handle); -+ if (errno == EINPROG) { -+ done = 0; -+ *error_code = MPI_SUCCESS; -+ } -+ else { -+ err = aio_return(tmp1->aio_handle); -+ (*request)->nbytes = err; -+ errno = aio_error(tmp1->aio_handle); -+ -+ done = 1; -+ -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+ return; -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error((*request)->fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+ } -+ } /* if ((*request)->queued) */ -+ else { -+ done = 1; -+ *error_code = MPI_SUCCESS; -+ } -+#ifdef HAVE_STATUS_SET_BYTES -+ if (done && ((*request)->nbytes != -1)) -+ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); -+#endif -+ -+#elif (!defined(NO_AIO) && !defined(AIO_SUN)) -+/* DEC, SGI IRIX 5 and 6 */ -+ if ((*request)->queued) { -+ errno = aio_error((const struct aiocb *) (*request)->handle); -+ if (errno == EINPROGRESS) { -+ done = 0; -+ *error_code = MPI_SUCCESS; -+ } -+ else { -+ err = aio_return((struct aiocb *) (*request)->handle); -+ (*request)->nbytes = err; -+ errno = aio_error((struct aiocb *) (*request)->handle); -+ -+ done = 1; -+ -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+ return; -+#elif defined(PRINT_ERR_MSG) -+ *error_code = 
MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error((*request)->fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+ } -+ } /* if ((*request)->queued) */ -+ else { -+ done = 1; -+ *error_code = MPI_SUCCESS; -+ } -+#ifdef HAVE_STATUS_SET_BYTES -+ if (done && ((*request)->nbytes != -1)) -+ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); -+#endif -+ -+#endif -+ -+#ifndef NO_AIO -+ if (done) { -+ /* if request is still queued in the system, it is also there -+ on ADIOI_Async_list. Delete it from there. */ -+ if ((*request)->queued) ADIOI_Del_req_from_list(request); -+ -+ (*request)->fd->async_count--; -+ if ((*request)->handle) ADIOI_Free((*request)->handle); -+ ADIOI_Free_request((ADIOI_Req_node *) (*request)); -+ *request = ADIO_REQUEST_NULL; -+ } -+ return done; -+#endif -+ -+} -+ -+ -+int ADIOI_LUSTRE_WriteDone(ADIO_Request *request, ADIO_Status *status, int *error_code) -+{ -+ return ADIOI_LUSTRE_ReadDone(request, status, error_code); -+} -diff -ruN romio-orig/adio/ad_lustre/ad_lustre_fcntl.c romio/adio/ad_lustre/ad_lustre_fcntl.c ---- romio-orig/adio/ad_lustre/ad_lustre_fcntl.c 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre_fcntl.c 2006-09-06 18:43:11.000365177 -0400 -@@ -0,0 +1,127 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_fcntl.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+#include "adio_extern.h" -+/* #ifdef MPISGI -+#include "mpisgi2.h" -+#endif */ -+ -+void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code) -+{ -+ int i, ntimes; -+ ADIO_Offset curr_fsize, alloc_size, size, len, done; -+ ADIO_Status status; -+ char *buf; -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_FCNTL"; -+#endif -+ -+ switch(flag) { -+ case ADIO_FCNTL_GET_FSIZE: -+ fcntl_struct->fsize = lseek(fd->fd_sys, 0, SEEK_END); -+ if (fd->fp_sys_posn != -1) -+ lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET); -+ if (fcntl_struct->fsize == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error(fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+ break; -+ -+ case ADIO_FCNTL_SET_DISKSPACE: -+ /* will be called by one process only */ -+ /* On file systems with no preallocation function, I have to -+ explicitly write -+ to allocate space. Since there could be holes in the file, -+ I need to read up to the current file size, write it back, -+ and then write beyond that depending on how much -+ preallocation is needed. 
-+ read/write in sizes of no more than ADIOI_PREALLOC_BUFSZ */ -+ -+ curr_fsize = lseek(fd->fd_sys, 0, SEEK_END); -+ alloc_size = fcntl_struct->diskspace; -+ -+ size = ADIOI_MIN(curr_fsize, alloc_size); -+ -+ ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ; -+ buf = (char *) ADIOI_Malloc(ADIOI_PREALLOC_BUFSZ); -+ done = 0; -+ -+ for (i=0; i curr_fsize) { -+ memset(buf, 0, ADIOI_PREALLOC_BUFSZ); -+ size = alloc_size - curr_fsize; -+ ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ; -+ for (i=0; ifp_sys_posn != -1) -+ lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET); -+ *error_code = MPI_SUCCESS; -+ break; -+ -+#if 0 -+ case ADIO_FCNTL_SET_IOMODE: -+ /* for implementing PFS I/O modes. will not occur in MPI-IO -+ implementation.*/ -+ if (fd->iomode != fcntl_struct->iomode) { -+ fd->iomode = fcntl_struct->iomode; -+ MPI_Barrier(MPI_COMM_WORLD); -+ } -+ *error_code = MPI_SUCCESS; -+ break; -+#endif -+ -+ case ADIO_FCNTL_SET_ATOMICITY: -+ fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1; -+ *error_code = MPI_SUCCESS; -+ break; -+ -+ default: -+ FPRINTF(stderr, "Unknown flag passed to ADIOI_LUSTRE_Fcntl\n"); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+} -diff -ruN romio-orig/adio/ad_lustre/ad_lustre_flush.c romio/adio/ad_lustre/ad_lustre_flush.c ---- romio-orig/adio/ad_lustre/ad_lustre_flush.c 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre_flush.c 2006-09-06 17:10:35.000711888 -0400 -@@ -0,0 +1,14 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_flush.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_Flush(ADIO_File fd, int *error_code) -+{ -+ ADIOI_GEN_Flush(fd, error_code); -+} -diff -ruN romio-orig/adio/ad_lustre/ad_lustre.h romio/adio/ad_lustre/ad_lustre.h ---- romio-orig/adio/ad_lustre/ad_lustre.h 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre.h 2006-09-06 17:10:35.000722616 -0400 -@@ -0,0 +1,36 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre.h,v 1.2 2005/07/07 14:38:17 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ */ -+ -+#ifndef AD_UNIX_INCLUDE -+#define AD_UNIX_INCLUDE -+ -+/* temp*/ -+#define HAVE_ASM_TYPES_H 1 -+ -+#include -+#include -+#include -+#include -+#include "lustre/lustre_user.h" -+#include "adio.h" -+ -+#ifndef NO_AIO -+#ifdef AIO_SUN -+#include -+#else -+#include -+#ifdef NEEDS_ADIOCB_T -+typedef struct adiocb adiocb_t; -+#endif -+#endif -+#endif -+ -+int ADIOI_LUSTRE_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset, -+ int wr, void *handle); -+ -+#endif -diff -ruN romio-orig/adio/ad_lustre/ad_lustre_hints.c romio/adio/ad_lustre/ad_lustre_hints.c ---- romio-orig/adio/ad_lustre/ad_lustre_hints.c 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre_hints.c 2006-09-06 17:10:35.000741994 -0400 -@@ -0,0 +1,130 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_hints.c,v 1.2 2005/07/07 14:38:17 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) -+{ -+ char *value, *value_in_fd; -+ int flag, tmp_val, str_factor=-1, str_unit=0, start_iodev=-1; -+ struct lov_user_md lum = { 0 }; -+ int err, myrank, fd_sys, perm, amode, old_mask; -+ -+ if ( (fd->info) == MPI_INFO_NULL) { -+ /* This must be part of the open call. 
can set striping parameters -+ if necessary. */ -+ MPI_Info_create(&(fd->info)); -+ -+ /* has user specified striping or server buffering parameters -+ and do they have the same value on all processes? */ -+ if (users_info != MPI_INFO_NULL) { -+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); -+ -+ MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag) { -+ str_factor=atoi(value); -+ tmp_val = str_factor; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != str_factor) { -+ FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: the value for key \"striping_factor\" must be the same on all processes\n"); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ -+ MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag) { -+ str_unit=atoi(value); -+ tmp_val = str_unit; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != str_unit) { -+ FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: the value for key \"striping_unit\" must be the same on all processes\n"); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ -+ MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag) { -+ start_iodev=atoi(value); -+ tmp_val = start_iodev; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != start_iodev) { -+ FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: the value for key \"start_iodevice\" must be the same on all processes\n"); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ -+ /* if user has specified striping info, process 0 tries to set it */ -+ if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) { -+ MPI_Comm_rank(fd->comm, &myrank); -+ if (!myrank) { -+ if (fd->perm == ADIO_PERM_NULL) { -+ old_mask = umask(022); -+ umask(old_mask); -+ perm = old_mask ^ 0666; -+ } -+ else perm = fd->perm; -+ -+ amode = 0; -+ if (fd->access_mode & ADIO_CREATE) -+ amode = amode | O_CREAT; -+ if (fd->access_mode & ADIO_RDONLY) -+ amode = amode | O_RDONLY; -+ if 
(fd->access_mode & ADIO_WRONLY) -+ amode = amode | O_WRONLY; -+ if (fd->access_mode & ADIO_RDWR) -+ amode = amode | O_RDWR; -+ if (fd->access_mode & ADIO_EXCL) -+ amode = amode | O_EXCL; -+ -+ /* we need to create file so ensure this is set */ -+ amode = amode | O_LOV_DELAY_CREATE | O_CREAT; -+ -+ fd_sys = open(fd->filename, amode, perm); -+ if (fd_sys == -1) { -+ if (errno != EEXIST) -+ printf("Failure to open file %s %d %d\n",strerror(errno), amode, perm); -+ } else { -+ lum.lmm_magic = LOV_USER_MAGIC; -+ lum.lmm_pattern = 0; -+ lum.lmm_stripe_size = str_unit; -+ lum.lmm_stripe_count = str_factor; -+ lum.lmm_stripe_offset = start_iodev; -+ -+ err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum); -+ if (err == -1 && errno != EEXIST) { -+ printf("Failure to set stripe info %s \n",strerror(errno)); -+ } -+ -+ close(fd_sys); -+ } -+ -+ } -+ MPI_Barrier(fd->comm); -+ } -+ -+ ADIOI_Free(value); -+ } -+ -+ /* set the values for collective I/O and data sieving parameters */ -+ ADIOI_GEN_SetInfo(fd, users_info, error_code); -+ } -+ -+ else { -+ /* The file has been opened previously and fd->fd_sys is a valid -+ file descriptor. cannot set striping parameters now. */ -+ -+ /* set the values for collective I/O and data sieving parameters */ -+ ADIOI_GEN_SetInfo(fd, users_info, error_code); -+ -+ } -+ -+ *error_code = MPI_SUCCESS; -+} -diff -ruN romio-orig/adio/ad_lustre/ad_lustre_iread.c romio/adio/ad_lustre/ad_lustre_iread.c ---- romio-orig/adio/ad_lustre/ad_lustre_iread.c 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre_iread.c 2006-09-06 17:10:35.000751765 -0400 -@@ -0,0 +1,106 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_iread.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_IreadContig(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Request *request, int *error_code) -+{ -+ int len, typesize; -+#ifdef NO_AIO -+ ADIO_Status status; -+#else -+ int err=-1; -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_IREADCONTIG"; -+#endif -+#endif -+ -+ (*request) = ADIOI_Malloc_request(); -+ (*request)->optype = ADIOI_READ; -+ (*request)->fd = fd; -+ (*request)->datatype = datatype; -+ -+ MPI_Type_size(datatype, &typesize); -+ len = count * typesize; -+ -+#ifdef NO_AIO -+ /* HP, FreeBSD, Linux */ -+ /* no support for nonblocking I/O. Use blocking I/O. */ -+ -+ ADIOI_LUSTRE_ReadContig(fd, buf, len, MPI_BYTE, file_ptr_type, offset, -+ &status, error_code); -+ (*request)->queued = 0; -+#ifdef HAVE_STATUS_SET_BYTES -+ if (*error_code == MPI_SUCCESS) { -+ MPI_Get_elements(&status, MPI_BYTE, &len); -+ (*request)->nbytes = len; -+ } -+#endif -+ -+#else -+ if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind; -+ err = ADIOI_LUSTRE_aio(fd, buf, len, offset, 0, &((*request)->handle)); -+ if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind += len; -+ -+ (*request)->queued = 1; -+ ADIOI_Add_req_to_list(request); -+ -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+ return; -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error(fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+#endif /* NO_AIO */ -+ -+ fd->fp_sys_posn = -1; /* set it to null. 
*/ -+ fd->async_count++; -+} -+ -+ -+ -+void ADIOI_LUSTRE_IreadStrided(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Request *request, int -+ *error_code) -+{ -+ ADIO_Status status; -+#ifdef HAVE_STATUS_SET_BYTES -+ int typesize; -+#endif -+ -+ *request = ADIOI_Malloc_request(); -+ (*request)->optype = ADIOI_READ; -+ (*request)->fd = fd; -+ (*request)->datatype = datatype; -+ (*request)->queued = 0; -+ (*request)->handle = 0; -+ -+/* call the blocking version. It is faster because it does data sieving. */ -+ ADIOI_LUSTRE_ReadStrided(fd, buf, count, datatype, file_ptr_type, -+ offset, &status, error_code); -+ -+ fd->async_count++; -+ -+#ifdef HAVE_STATUS_SET_BYTES -+ if (*error_code == MPI_SUCCESS) { -+ MPI_Type_size(datatype, &typesize); -+ (*request)->nbytes = count * typesize; -+ } -+#endif -+} -diff -ruN romio-orig/adio/ad_lustre/ad_lustre_iwrite.c romio/adio/ad_lustre/ad_lustre_iwrite.c ---- romio-orig/adio/ad_lustre/ad_lustre_iwrite.c 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre_iwrite.c 2006-09-06 17:10:35.000761678 -0400 -@@ -0,0 +1,268 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_iwrite.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_IwriteContig(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Request *request, int *error_code) -+{ -+ int len, typesize; -+#ifdef NO_AIO -+ ADIO_Status status; -+#else -+ int err=-1; -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_IWRITECONTIG"; -+#endif -+#endif -+ -+ *request = ADIOI_Malloc_request(); -+ (*request)->optype = ADIOI_WRITE; -+ (*request)->fd = fd; -+ (*request)->datatype = datatype; -+ -+ MPI_Type_size(datatype, &typesize); -+ len = count * typesize; -+ -+#ifdef NO_AIO -+ /* HP, FreeBSD, Linux */ -+ /* no support for nonblocking I/O. Use blocking I/O. */ -+ -+ ADIOI_LUSTRE_WriteContig(fd, buf, len, MPI_BYTE, file_ptr_type, offset, -+ &status, error_code); -+ (*request)->queued = 0; -+#ifdef HAVE_STATUS_SET_BYTES -+ if (*error_code == MPI_SUCCESS) { -+ MPI_Get_elements(&status, MPI_BYTE, &len); -+ (*request)->nbytes = len; -+ } -+#endif -+ -+#else -+ if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind; -+ err = ADIOI_LUSTRE_aio(fd, buf, len, offset, 1, &((*request)->handle)); -+ if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind += len; -+ -+ (*request)->queued = 1; -+ ADIOI_Add_req_to_list(request); -+ -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+ return; -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error(fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+#endif /* NO_AIO */ -+ -+ fd->fp_sys_posn = -1; /* set it to null. 
*/ -+ fd->async_count++; -+} -+ -+ -+ -+ -+void ADIOI_LUSTRE_IwriteStrided(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Request *request, int -+ *error_code) -+{ -+ ADIO_Status status; -+#ifdef HAVE_STATUS_SET_BYTES -+ int typesize; -+#endif -+ -+ *request = ADIOI_Malloc_request(); -+ (*request)->optype = ADIOI_WRITE; -+ (*request)->fd = fd; -+ (*request)->datatype = datatype; -+ (*request)->queued = 0; -+ (*request)->handle = 0; -+ -+/* call the blocking version. It is faster because it does data sieving. */ -+ ADIOI_LUSTRE_WriteStrided(fd, buf, count, datatype, file_ptr_type, -+ offset, &status, error_code); -+ -+ fd->async_count++; -+ -+#ifdef HAVE_STATUS_SET_BYTES -+ if (*error_code == MPI_SUCCESS) { -+ MPI_Type_size(datatype, &typesize); -+ (*request)->nbytes = count * typesize; -+ } -+#endif -+} -+ -+ -+/* This function is for implementation convenience. It is not user-visible. -+ It takes care of the differences in the interface for nonblocking I/O -+ on various Unix machines! If wr==1 write, wr==0 read. */ -+ -+int ADIOI_LUSTRE_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset, -+ int wr, void *handle) -+{ -+ int err=-1, fd_sys; -+ -+#ifndef NO_AIO -+ int error_code; -+#ifdef AIO_SUN -+ aio_result_t *result; -+#else -+ struct aiocb *aiocbp; -+#endif -+#endif -+ -+ fd_sys = fd->fd_sys; -+ -+#ifdef AIO_SUN -+ result = (aio_result_t *) ADIOI_Malloc(sizeof(aio_result_t)); -+ result->aio_return = AIO_INPROGRESS; -+ if (wr) err = aiowrite(fd_sys, buf, len, offset, SEEK_SET, result); -+ else err = aioread(fd_sys, buf, len, offset, SEEK_SET, result); -+ -+ if (err == -1) { -+ if (errno == EAGAIN) { -+ /* the man pages say EPROCLIM, but in reality errno is set to EAGAIN! */ -+ -+ /* exceeded the max. no. of outstanding requests. -+ complete all previous async. 
requests and try again.*/ -+ -+ ADIOI_Complete_async(&error_code); -+ if (wr) err = aiowrite(fd_sys, buf, len, offset, SEEK_SET, result); -+ else err = aioread(fd_sys, buf, len, offset, SEEK_SET, result); -+ -+ while (err == -1) { -+ if (errno == EAGAIN) { -+ /* sleep and try again */ -+ sleep(1); -+ if (wr) err = aiowrite(fd_sys, buf, len, offset, SEEK_SET, result); -+ else err = aioread(fd_sys, buf, len, offset, SEEK_SET, result); -+ } -+ else { -+ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ } -+ else { -+ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ -+ *((aio_result_t **) handle) = result; -+#endif -+ -+#ifdef NO_FD_IN_AIOCB -+/* IBM */ -+ aiocbp = (struct aiocb *) ADIOI_Malloc(sizeof(struct aiocb)); -+ aiocbp->aio_whence = SEEK_SET; -+ aiocbp->aio_offset = offset; -+ aiocbp->aio_buf = buf; -+ aiocbp->aio_nbytes = len; -+ if (wr) err = aio_write(fd_sys, aiocbp); -+ else err = aio_read(fd_sys, aiocbp); -+ -+ if (err == -1) { -+ if (errno == EAGAIN) { -+ /* exceeded the max. no. of outstanding requests. -+ complete all previous async. requests and try again. 
*/ -+ -+ ADIOI_Complete_async(&error_code); -+ if (wr) err = aio_write(fd_sys, aiocbp); -+ else err = aio_read(fd_sys, aiocbp); -+ -+ while (err == -1) { -+ if (errno == EAGAIN) { -+ /* sleep and try again */ -+ sleep(1); -+ if (wr) err = aio_write(fd_sys, aiocbp); -+ else err = aio_read(fd_sys, aiocbp); -+ } -+ else { -+ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ } -+ else { -+ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ -+ *((struct aiocb **) handle) = aiocbp; -+ -+#elif (!defined(NO_AIO) && !defined(AIO_SUN)) -+/* DEC, SGI IRIX 5 and 6 */ -+ -+ aiocbp = (struct aiocb *) ADIOI_Calloc(sizeof(struct aiocb), 1); -+ aiocbp->aio_fildes = fd_sys; -+ aiocbp->aio_offset = offset; -+ aiocbp->aio_buf = buf; -+ aiocbp->aio_nbytes = len; -+ -+#ifdef AIO_PRIORITY_DEFAULT -+/* DEC */ -+ aiocbp->aio_reqprio = AIO_PRIO_DFL; /* not needed in DEC Unix 4.0 */ -+ aiocbp->aio_sigevent.sigev_signo = 0; -+#else -+ aiocbp->aio_reqprio = 0; -+#endif -+ -+#ifdef AIO_SIGNOTIFY_NONE -+/* SGI IRIX 6 */ -+ aiocbp->aio_sigevent.sigev_notify = SIGEV_NONE; -+#else -+ aiocbp->aio_sigevent.sigev_signo = 0; -+#endif -+ -+ if (wr) err = aio_write(aiocbp); -+ else err = aio_read(aiocbp); -+ -+ if (err == -1) { -+ if (errno == EAGAIN) { -+ /* exceeded the max. no. of outstanding requests. -+ complete all previous async. requests and try again. 
*/ -+ -+ ADIOI_Complete_async(&error_code); -+ if (wr) err = aio_write(aiocbp); -+ else err = aio_read(aiocbp); -+ -+ while (err == -1) { -+ if (errno == EAGAIN) { -+ /* sleep and try again */ -+ sleep(1); -+ if (wr) err = aio_write(aiocbp); -+ else err = aio_read(aiocbp); -+ } -+ else { -+ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ } -+ else { -+ FPRINTF(stderr, "Unknown errno %d in ADIOI_LUSTRE_aio\n", errno); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } -+ -+ *((struct aiocb **) handle) = aiocbp; -+#endif -+ -+ return err; -+} -diff -ruN romio-orig/adio/ad_lustre/ad_lustre_open.c romio/adio/ad_lustre/ad_lustre_open.c ---- romio-orig/adio/ad_lustre/ad_lustre_open.c 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre_open.c 2006-09-06 17:10:35.000771351 -0400 -@@ -0,0 +1,100 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_open.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code) -+{ -+ int perm, old_mask, amode; -+ struct lov_user_md lum = { 0 }; -+ char *value; -+ -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_OPEN"; -+#endif -+ -+ if (fd->perm == ADIO_PERM_NULL) { -+ old_mask = umask(022); -+ umask(old_mask); -+ perm = old_mask ^ 0666; -+ } -+ else perm = fd->perm; -+ -+ amode = 0; -+ if (fd->access_mode & ADIO_CREATE) -+ amode = amode | O_CREAT; -+ if (fd->access_mode & ADIO_RDONLY) -+ amode = amode | O_RDONLY; -+ if (fd->access_mode & ADIO_WRONLY) -+ amode = amode | O_WRONLY; -+ if (fd->access_mode & ADIO_RDWR) -+ amode = amode | O_RDWR; -+ if (fd->access_mode & ADIO_EXCL) -+ amode = amode | O_EXCL; -+ -+ fd->fd_sys = open(fd->filename, amode, perm); -+ -+ if (fd->fd_sys != -1) { -+ int err; -+ -+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); -+ -+ /* get file striping information and set it in info */ -+ lum.lmm_magic = LOV_USER_MAGIC; -+ err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum); -+ -+ if (!err) { -+ sprintf(value, "%d", lum.lmm_stripe_size); -+ MPI_Info_set(fd->info, "striping_unit", value); -+ -+ sprintf(value, "%d", lum.lmm_stripe_count); -+ MPI_Info_set(fd->info, "striping_factor", value); -+ -+ sprintf(value, "%d", lum.lmm_stripe_offset); -+ MPI_Info_set(fd->info, "start_iodevice", value); -+ } -+ ADIOI_Free(value); -+ -+ if (fd->access_mode & ADIO_APPEND) -+ fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); -+ } -+ -+ -+ if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND)) -+ fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); -+ -+ if (fd->fd_sys == -1) { -+#ifdef MPICH2 -+ if (errno == ENAMETOOLONG) -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_BAD_FILE, "**filenamelong", "**filenamelong %s %d", fd->filename, strlen(fd->filename)); -+ else if (errno == ENOENT) -+ 
*error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_NO_SUCH_FILE, "**filenoexist", "**filenoexist %s", fd->filename); -+ else if (errno == ENOTDIR || errno == ELOOP) -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_BAD_FILE, "**filenamedir", "**filenamedir %s", fd->filename); -+ else if (errno == EACCES) { -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_ACCESS, "**fileaccess", "**fileaccess %s", -+ fd->filename); -+ } -+ else if (errno == EROFS) { -+ /* Read only file or file system and write access requested */ -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_READ_ONLY, "**ioneedrd", 0); -+ } -+ else { -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+ } -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error(ADIO_FILE_NULL, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+} -diff -ruN romio-orig/adio/ad_lustre/ad_lustre_rdcoll.c romio/adio/ad_lustre/ad_lustre_rdcoll.c ---- romio-orig/adio/ad_lustre/ad_lustre_rdcoll.c 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre_rdcoll.c 2006-09-06 17:10:35.000780880 -0400 -@@ -0,0 +1,18 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_rdcoll.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code) -+{ -+ ADIOI_GEN_ReadStridedColl(fd, buf, count, datatype, file_ptr_type, -+ offset, status, error_code); -+} -diff -ruN romio-orig/adio/ad_lustre/ad_lustre_read.c romio/adio/ad_lustre/ad_lustre_read.c ---- romio-orig/adio/ad_lustre/ad_lustre_read.c 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre_read.c 2006-09-06 17:10:35.000790846 -0400 -@@ -0,0 +1,67 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_read.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int *error_code) -+{ -+ int err=-1, datatype_size, len; -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_READCONTIG"; -+#endif -+ -+ MPI_Type_size(datatype, &datatype_size); -+ len = datatype_size * count; -+ -+ if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { -+ if (fd->fp_sys_posn != offset) -+ lseek(fd->fd_sys, offset, SEEK_SET); -+ err = read(fd->fd_sys, buf, len); -+ fd->fp_sys_posn = offset + len; -+ /* individual file pointer not updated */ -+ } -+ else { /* read from curr. location of ind. 
file pointer */ -+ if (fd->fp_sys_posn != fd->fp_ind) -+ lseek(fd->fd_sys, fd->fp_ind, SEEK_SET); -+ err = read(fd->fd_sys, buf, len); -+ fd->fp_ind += err; -+ fd->fp_sys_posn = fd->fp_ind; -+ } -+ -+#ifdef HAVE_STATUS_SET_BYTES -+ if (err != -1) MPIR_Status_set_bytes(status, datatype, err); -+#endif -+ -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error(fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+} -+ -+ -+ -+ -+void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code) -+{ -+ ADIOI_GEN_ReadStrided(fd, buf, count, datatype, file_ptr_type, -+ offset, status, error_code); -+} -diff -ruN romio-orig/adio/ad_lustre/ad_lustre_resize.c romio/adio/ad_lustre/ad_lustre_resize.c ---- romio-orig/adio/ad_lustre/ad_lustre_resize.c 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre_resize.c 2006-09-06 17:10:35.000807397 -0400 -@@ -0,0 +1,32 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_resize.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_Resize(ADIO_File fd, ADIO_Offset size, int *error_code) -+{ -+ int err; -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_RESIZE"; -+#endif -+ -+ err = ftruncate(fd->fd_sys, size); -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error(fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+} -diff -ruN romio-orig/adio/ad_lustre/ad_lustre_seek.c romio/adio/ad_lustre/ad_lustre_seek.c ---- romio-orig/adio/ad_lustre/ad_lustre_seek.c 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre_seek.c 2006-09-06 17:10:35.000816583 -0400 -@@ -0,0 +1,15 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_seek.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ */ -+ -+#include "ad_lustre.h" -+ -+ADIO_Offset ADIOI_LUSTRE_SeekIndividual(ADIO_File fd, ADIO_Offset offset, -+ int whence, int *error_code) -+{ -+ return ADIOI_GEN_SeekIndividual(fd, offset, whence, error_code); -+} -diff -ruN romio-orig/adio/ad_lustre/ad_lustre_wait.c romio/adio/ad_lustre/ad_lustre_wait.c ---- romio-orig/adio/ad_lustre/ad_lustre_wait.c 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre_wait.c 2006-09-06 18:45:39.000190529 -0400 -@@ -0,0 +1,188 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_wait.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_ReadComplete(ADIO_Request *request, ADIO_Status *status, int *error_code) -+{ -+#ifndef NO_AIO -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_READCOMPLETE"; -+#endif -+#ifdef AIO_SUN -+ aio_result_t *result=0, *tmp; -+#else -+ int err; -+#endif -+#ifdef AIO_HANDLE_IN_AIOCB -+ struct aiocb *tmp1; -+#endif -+#endif -+ -+ if (*request == ADIO_REQUEST_NULL) { -+ *error_code = MPI_SUCCESS; -+ return; -+ } -+ -+#ifdef AIO_SUN -+ if ((*request)->queued) { /* dequeue it */ -+ tmp = (aio_result_t *) (*request)->handle; -+ while (tmp->aio_return == AIO_INPROGRESS) usleep(1000); -+ /* sleep for 1 ms., until done. Is 1 ms. a good number? */ -+ /* when done, dequeue any one request */ -+ result = (aio_result_t *) aiowait(0); -+ -+ (*request)->nbytes = tmp->aio_return; -+ -+ if (tmp->aio_return == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(tmp->aio_errno)); -+ return; -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(tmp->aio_errno)); -+ ADIOI_Error((*request)->fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+ -+/* aiowait only dequeues a request. The completion of a request can be -+ checked by just checking the aio_return flag in the handle passed -+ to the original aioread()/aiowrite(). Therefore, I need to ensure -+ that aiowait() is called exactly once for each previous -+ aioread()/aiowrite(). 
This is also taken care of in ADIOI_xxxDone */ -+ } -+ else *error_code = MPI_SUCCESS; -+ -+#ifdef HAVE_STATUS_SET_BYTES -+ if ((*request)->nbytes != -1) -+ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); -+#endif -+ -+#endif -+ -+#ifdef AIO_HANDLE_IN_AIOCB -+/* IBM */ -+ if ((*request)->queued) { -+ do { -+ err = aio_suspend(1, (struct aiocb **) &((*request)->handle)); -+ } while ((err == -1) && (errno == EINTR)); -+ -+ tmp1 = (struct aiocb *) (*request)->handle; -+ if (err != -1) { -+ err = aio_return(tmp1->aio_handle); -+ (*request)->nbytes = err; -+ errno = aio_error(tmp1->aio_handle); -+ } -+ else (*request)->nbytes = -1; -+ -+/* on DEC, it is required to call aio_return to dequeue the request. -+ IBM man pages don't indicate what function to use for dequeue. -+ I'm assuming it is aio_return! POSIX says aio_return may be called -+ only once on a given handle. */ -+ -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+ return; -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error((*request)->fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+ } /* if ((*request)->queued) */ -+ else *error_code = MPI_SUCCESS; -+ -+#ifdef HAVE_STATUS_SET_BYTES -+ if ((*request)->nbytes != -1) -+ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); -+#endif -+ -+#elif (!defined(NO_AIO) && !defined(AIO_SUN)) -+/* DEC, SGI IRIX 5 and 6 */ -+ if ((*request)->queued) { -+ do { -+ err = aio_suspend((const struct aiocb_t **) &((*request)->handle), 1, 0); -+ } while ((err == -1) && (errno == EINTR)); -+ -+ if (err != -1) { -+ err = aio_return((struct aiocb *) (*request)->handle); -+ (*request)->nbytes = err; -+ errno = aio_error((struct 
aiocb *) (*request)->handle); -+ } -+ else (*request)->nbytes = -1; -+ -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+ return; -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else /* MPICH-1 */ -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error((*request)->fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+ } /* if ((*request)->queued) */ -+ else *error_code = MPI_SUCCESS; -+#ifdef HAVE_STATUS_SET_BYTES -+ if ((*request)->nbytes != -1) -+ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); -+#endif -+#endif -+ -+#ifndef NO_AIO -+ if ((*request)->queued != -1) { -+ -+ /* queued = -1 is an internal hack used when the request must -+ be completed, but the request object should not be -+ freed. This is used in ADIOI_Complete_async, because the user -+ will call MPI_Wait later, which would require status to -+ be filled. Ugly but works. queued = -1 should be used only -+ in ADIOI_Complete_async. -+ This should not affect the user in any way. */ -+ -+ /* if request is still queued in the system, it is also there -+ on ADIOI_Async_list. Delete it from there. 
*/ -+ if ((*request)->queued) ADIOI_Del_req_from_list(request); -+ -+ (*request)->fd->async_count--; -+ if ((*request)->handle) ADIOI_Free((*request)->handle); -+ ADIOI_Free_request((ADIOI_Req_node *) (*request)); -+ *request = ADIO_REQUEST_NULL; -+ } -+ -+#else -+/* HP, FreeBSD, Linux */ -+ -+#ifdef HAVE_STATUS_SET_BYTES -+ MPIR_Status_set_bytes(status, (*request)->datatype, (*request)->nbytes); -+#endif -+ (*request)->fd->async_count--; -+ ADIOI_Free_request((ADIOI_Req_node *) (*request)); -+ *request = ADIO_REQUEST_NULL; -+ *error_code = MPI_SUCCESS; -+#endif -+} -+ -+ -+void ADIOI_LUSTRE_WriteComplete(ADIO_Request *request, ADIO_Status *status, int *error_code) -+{ -+ ADIOI_LUSTRE_ReadComplete(request, status, error_code); -+} -diff -ruN romio-orig/adio/ad_lustre/ad_lustre_wrcoll.c romio/adio/ad_lustre/ad_lustre_wrcoll.c ---- romio-orig/adio/ad_lustre/ad_lustre_wrcoll.c 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre_wrcoll.c 2006-09-06 17:10:35.000835460 -0400 -@@ -0,0 +1,18 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_wrcoll.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. 
-+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code) -+{ -+ ADIOI_GEN_WriteStridedColl(fd, buf, count, datatype, file_ptr_type, -+ offset, status, error_code); -+} -diff -ruN romio-orig/adio/ad_lustre/ad_lustre_write.c romio/adio/ad_lustre/ad_lustre_write.c ---- romio-orig/adio/ad_lustre/ad_lustre_write.c 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/ad_lustre_write.c 2006-09-06 17:10:35.000844658 -0400 -@@ -0,0 +1,66 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * $Id: ad_lustre_write.c,v 1.1.1.1 2004/11/04 11:03:38 liam Exp $ -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ */ -+ -+#include "ad_lustre.h" -+ -+void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int *error_code) -+{ -+ int err=-1, datatype_size, len; -+#if defined(MPICH2) || !defined(PRINT_ERR_MSG) -+ static char myname[] = "ADIOI_LUSTRE_WRITECONTIG"; -+#endif -+ -+ MPI_Type_size(datatype, &datatype_size); -+ len = datatype_size * count; -+ -+ if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { -+ if (fd->fp_sys_posn != offset) -+ lseek(fd->fd_sys, offset, SEEK_SET); -+ err = write(fd->fd_sys, buf, len); -+ fd->fp_sys_posn = offset + err; -+ /* individual file pointer not updated */ -+ } -+ else { /* write from curr. location of ind. 
file pointer */ -+ if (fd->fp_sys_posn != fd->fp_ind) -+ lseek(fd->fd_sys, fd->fp_ind, SEEK_SET); -+ err = write(fd->fd_sys, buf, len); -+ fd->fp_ind += err; -+ fd->fp_sys_posn = fd->fp_ind; -+ } -+ -+#ifdef HAVE_STATUS_SET_BYTES -+ if (err != -1 && status) MPIR_Status_set_bytes(status, datatype, err); -+#endif -+ -+ if (err == -1) { -+#ifdef MPICH2 -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**io", -+ "**io %s", strerror(errno)); -+#elif defined(PRINT_ERR_MSG) -+ *error_code = MPI_ERR_UNKNOWN; -+#else -+ *error_code = MPIR_Err_setmsg(MPI_ERR_IO, MPIR_ADIO_ERROR, -+ myname, "I/O Error", "%s", strerror(errno)); -+ ADIOI_Error(fd, *error_code, myname); -+#endif -+ } -+ else *error_code = MPI_SUCCESS; -+} -+ -+ -+ -+void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code) -+{ -+ ADIOI_GEN_WriteStrided(fd, buf, count, datatype, file_ptr_type, -+ offset, status, error_code); -+} -diff -ruN romio-orig/adio/ad_lustre/Makefile.in romio/adio/ad_lustre/Makefile.in ---- romio-orig/adio/ad_lustre/Makefile.in 1969-12-31 19:00:00.000000000 -0500 -+++ romio/adio/ad_lustre/Makefile.in 2006-09-06 18:48:56.000800829 -0400 -@@ -0,0 +1,51 @@ -+CC = @CC@ -+AR = @AR@ -+RANLIB = @RANLIB@ -+LIBNAME = @LIBNAME@ -+srcdir = @srcdir@ -+CC_SHL = @CC_SHL@ -+SHLIBNAME = @SHLIBNAME@ -+ -+INCLUDE_DIR = -I@MPI_INCLUDE_DIR@ -I${srcdir}/../include -I../include -I../../include -I${srcdir}/../../../../include -I../../../../include -+CFLAGS = @CPPFLAGS@ @CFLAGS@ $(INCLUDE_DIR) -+ -+top_builddir = @master_topbuild_dir@ -+LIBTOOL = @LIBTOOL@ -+C_COMPILE_SHL = $(CC_SHL) @CFLAGS@ $(INCLUDE_DIR) -+ -+@VPATH@ -+ -+AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_close.o ad_lustre_read.o \ -+ ad_lustre_open.o ad_lustre_write.o ad_lustre_done.o \ -+ ad_lustre_fcntl.o ad_lustre_iread.o ad_lustre_iwrite.o ad_lustre_wait.o \ -+ 
ad_lustre_resize.o ad_lustre_hints.o -+ -+default: $(LIBNAME) -+ @if [ "@ENABLE_SHLIB@" != "none" ] ; then \ -+ $(MAKE) $(SHLIBNAME).la ;\ -+ fi -+ -+.SUFFIXES: $(SUFFIXES) .p .lo -+ -+.c.o: -+ $(CC) $(CFLAGS) -c $< -+.c.lo: -+ $(C_COMPILE_SHL) -c $< -o _s$*.o -+ @mv -f _s$*.o $*.lo -+# $(C_COMPILE_SHL) -c $< -+# @mv -f $*.o $*.lo -+ -+$(LIBNAME): $(AD_LUSTRE_OBJECTS) -+ $(AR) $(LIBNAME) $(AD_LUSTRE_OBJECTS) -+ $(RANLIB) $(LIBNAME) -+ -+AD_LUSTRE_LOOBJECTS=$(AD_LUSTRE_OBJECTS:.o=.lo) -+$(SHLIBNAME).la: $(AD_LUSTRE_LOOBJECTS) -+ $(AR) $(SHLIBNAME).la $(AD_LUSTRE_LOOBJECTS) -+ -+coverage: -+ -@for file in ${AD_LUSTRE_OBJECTS:.o=.c} ; do \ -+ gcov -b -f $$file ; done -+ -+clean: -+ @rm -f *.o *.lo -diff -ruN romio-orig/adio/common/ad_fstype.c romio/adio/common/ad_fstype.c ---- romio-orig/adio/common/ad_fstype.c 2005-08-11 19:33:46.000000000 -0400 -+++ romio/adio/common/ad_fstype.c 2006-09-06 17:41:20.000830936 -0400 -@@ -265,6 +265,9 @@ - /* if UFS support is enabled, default to that */ - *fstype = ADIO_UFS; - return; -+# elif defined(LINUX) && defined(ROMIO_LUSTRE) -+# warning use correct include -+# define LL_SUPER_MAGIC 0x0BD00BD0 - # endif - - /* --BEGIN ERROR HANDLING-- */ -@@ -308,6 +311,13 @@ - } - # endif - -+# ifdef LL_SUPER_MAGIC -+ if (fsbuf.f_type == LL_SUPER_MAGIC) { -+ *fstype = ADIO_LUSTRE; -+ return; -+ } -+# endif -+ - # ifdef PAN_KERNEL_FS_CLIENT_SUPER_MAGIC - if (fsbuf.f_type == PAN_KERNEL_FS_CLIENT_SUPER_MAGIC) { - *fstype = ADIO_PANFS; -@@ -458,6 +468,11 @@ - { - *fstype = ADIO_GRIDFTP; - } -+ else if (!strncmp(filename, "lustre:", 7) -+ || !strncmp(filename, "LUSTRE:", 7)) -+ { -+ *fstype = ADIO_LUSTRE; -+ } - else { - #ifdef ROMIO_NTFS - *fstype = ADIO_NTFS; -@@ -657,6 +672,14 @@ - *ops = &ADIO_GRIDFTP_operations; - #endif - } -+ if (file_system == ADIO_LUSTRE) { -+#ifndef ROMIO_LUSTRE -+ *error_code = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**iofstypeunsupported", 0); -+ return; -+#else -+ *ops = 
&ADIO_LUSTRE_operations; -+#endif -+ } - *error_code = MPI_SUCCESS; - *fstype = file_system; - return; -diff -ruN romio-orig/adio/include/adio.h romio/adio/include/adio.h ---- romio-orig/adio/include/adio.h 2006-06-09 17:45:04.000000000 -0400 -+++ romio/adio/include/adio.h 2006-09-06 17:44:16.000614058 -0400 -@@ -302,6 +302,7 @@ - #define ADIO_PVFS2 160 /* PVFS2: 2nd generation PVFS */ - #define ADIO_PANFS 161 /* Panasas FS */ - #define ADIO_GRIDFTP 162 /* Globus GridFTP */ -+#define ADIO_LUSTRE 163 /* Lustre */ - - #define ADIO_SEEK_SET SEEK_SET - #define ADIO_SEEK_CUR SEEK_CUR -diff -ruN romio-orig/adio/include/adioi_fs_proto.h romio/adio/include/adioi_fs_proto.h ---- romio-orig/adio/include/adioi_fs_proto.h 2005-06-08 17:16:39.000000000 -0400 -+++ romio/adio/include/adioi_fs_proto.h 2006-09-06 17:48:11.000523566 -0400 -@@ -49,6 +49,68 @@ - /* prototypes are in adio/ad_sfs/ad_sfs.h */ - #endif - -+#ifdef ROMIO_LUSTRE -+extern struct ADIOI_Fns_struct ADIO_LUSTRE_operations; -+ -+void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code); -+void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code); -+void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code); -+void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code); -+void ADIOI_LUSTRE_IwriteContig(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Request *request, int -+ *error_code); -+void ADIOI_LUSTRE_IreadContig(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Request *request, int -+ *error_code); -+int ADIOI_LUSTRE_ReadDone(ADIO_Request *request, ADIO_Status *status, int -+ *error_code); -+int ADIOI_LUSTRE_WriteDone(ADIO_Request *request, ADIO_Status *status, int -+ 
*error_code); -+void ADIOI_LUSTRE_ReadComplete(ADIO_Request *request, ADIO_Status *status, int -+ *error_code); -+void ADIOI_LUSTRE_WriteComplete(ADIO_Request *request, ADIO_Status *status, -+ int *error_code); -+void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int -+ *error_code); -+void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code); -+void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code); -+void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code); -+void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Status *status, int -+ *error_code); -+void ADIOI_LUSTRE_IreadStrided(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Request *request, int -+ *error_code); -+void ADIOI_LUSTRE_IwriteStrided(ADIO_File fd, void *buf, int count, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, ADIO_Request *request, int -+ *error_code); -+void ADIOI_LUSTRE_Flush(ADIO_File fd, int *error_code); -+void ADIOI_LUSTRE_Resize(ADIO_File fd, ADIO_Offset size, int *error_code); -+ADIO_Offset ADIOI_LUSTRE_SeekIndividual(ADIO_File fd, ADIO_Offset offset, -+ int whence, int *error_code); -+void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); -+#endif -+ - #ifdef ROMIO_NTFS - extern struct ADIOI_Fns_struct ADIO_NTFS_operations; - /* prototypes are in adio/ad_ntfs/ad_ntfs.h */ -diff -ruN romio-orig/adio/include/mpio_error.h romio/adio/include/mpio_error.h ---- romio-orig/adio/include/mpio_error.h 2005-05-23 
19:27:50.000000000 -0400 -+++ romio/adio/include/mpio_error.h 2006-09-06 17:10:35.000984078 -0400 -@@ -63,6 +63,7 @@ - #define MPIR_ERR_FILETYPE 33 - #define MPIR_ERR_NO_NTFS 35 - #define MPIR_ERR_NO_TESTFS 36 -+#define MPIR_ERR_NO_LUSTRE 37 - - /* MPI_ERR_COMM */ - #ifndef MPIR_ERR_COMM_NULL -diff -ruN romio-orig/adio/include/romioconf.h.in romio/adio/include/romioconf.h.in ---- romio-orig/adio/include/romioconf.h.in 2006-08-11 09:48:44.000000000 -0400 -+++ romio/adio/include/romioconf.h.in 2006-09-06 17:43:08.000599274 -0400 -@@ -276,6 +276,9 @@ - /* Define for ROMIO with PVFS2 */ - #undef ROMIO_PVFS2 - -+/* Define for ROMIO with LUSTRE */ -+#undef ROMIO_LUSTRE -+ - /* Define if int64_t must be defined for PVFS */ - #undef ROMIO_PVFS_NEEDS_INT64_DEFINITION - -diff -ruN romio-orig/configure romio/configure ---- romio-orig/configure 2006-08-11 09:48:45.000000000 -0400 -+++ romio/configure 2006-09-06 17:20:57.000555513 -0400 -@@ -1400,7 +1400,7 @@ - # - have_aio=no - # --known_filesystems="nfs ufs pfs pvfs pvfs2 testfs xfs panfs gridftp" -+known_filesystems="nfs ufs pfs pvfs pvfs2 testfs xfs panfs gridftp lustre" - known_mpi_impls="mpich2_mpi mpich_mpi sgi_mpi hp_mpi cray_mpi lam_mpi" - # - # Defaults -@@ -7490,6 +7490,14 @@ - - fi - -+if test -n "$file_system_lustre"; then -+ -+cat >>confdefs.h <<\_ACEOF -+#define ROMIO_LUSTRE 1 -+_ACEOF -+ -+fi -+ - # - # Check for presence and characteristics of async. I/O calls if - # not disabled. 
-@@ -11977,7 +11985,7 @@ - # are active will be called by the top level ROMIO make - ac_config_commands="$ac_config_commands default-1" - -- ac_config_files="$ac_config_files Makefile localdefs mpi-io/Makefile mpi2-other/info/Makefile mpi2-other/array/Makefile adio/common/Makefile test/Makefile test/misc.c test/large_file.c test/runtests util/romioinstall include/mpio.h include/mpiof.h adio/ad_nfs/Makefile adio/ad_ufs/Makefile adio/ad_panfs/Makefile adio/ad_xfs/Makefile adio/ad_sfs/Makefile adio/ad_pfs/Makefile adio/ad_testfs/Makefile adio/ad_pvfs/Makefile adio/ad_pvfs2/Makefile adio/ad_gridftp/Makefile mpi-io/fortran/Makefile mpi2-other/info/fortran/Makefile mpi2-other/array/fortran/Makefile test/fmisc.f test/fcoll_test.f test/pfcoll_test.f test/fperf.f mpi-io/glue/mpich2/Makefile mpi-io/glue/mpich1/Makefile mpi-io/glue/default/Makefile" -+ ac_config_files="$ac_config_files Makefile localdefs mpi-io/Makefile mpi2-other/info/Makefile mpi2-other/array/Makefile adio/common/Makefile test/Makefile test/misc.c test/large_file.c test/runtests util/romioinstall include/mpio.h include/mpiof.h adio/ad_nfs/Makefile adio/ad_ufs/Makefile adio/ad_panfs/Makefile adio/ad_xfs/Makefile adio/ad_sfs/Makefile adio/ad_pfs/Makefile adio/ad_testfs/Makefile adio/ad_pvfs/Makefile adio/ad_pvfs2/Makefile adio/ad_gridftp/Makefile adio/ad_lustre/Makefile mpi-io/fortran/Makefile mpi2-other/info/fortran/Makefile mpi2-other/array/fortran/Makefile test/fmisc.f test/fcoll_test.f test/pfcoll_test.f test/fperf.f mpi-io/glue/mpich2/Makefile mpi-io/glue/mpich1/Makefile mpi-io/glue/default/Makefile" - cat >confcache <<\_ACEOF - # This file is a shell script that caches the results of configure - # tests run on this system so they can be shared between configure -@@ -12535,6 +12543,7 @@ - "adio/ad_pvfs/Makefile" ) CONFIG_FILES="$CONFIG_FILES adio/ad_pvfs/Makefile" ;; - "adio/ad_pvfs2/Makefile" ) CONFIG_FILES="$CONFIG_FILES adio/ad_pvfs2/Makefile" ;; - "adio/ad_gridftp/Makefile" ) 
CONFIG_FILES="$CONFIG_FILES adio/ad_gridftp/Makefile" ;; -+ "adio/ad_lustre/Makefile" ) CONFIG_FILES="$CONFIG_FILES adio/ad_lustre/Makefile" ;; - "mpi-io/fortran/Makefile" ) CONFIG_FILES="$CONFIG_FILES mpi-io/fortran/Makefile" ;; - "mpi2-other/info/fortran/Makefile" ) CONFIG_FILES="$CONFIG_FILES mpi2-other/info/fortran/Makefile" ;; - "mpi2-other/array/fortran/Makefile" ) CONFIG_FILES="$CONFIG_FILES mpi2-other/array/fortran/Makefile" ;; -diff -ruN romio-orig/configure.in romio/configure.in ---- romio-orig/configure.in 2006-07-24 17:55:57.000000000 -0400 -+++ romio/configure.in 2006-09-06 17:16:13.000525117 -0400 -@@ -93,7 +93,7 @@ - # - have_aio=no - # --known_filesystems="nfs ufs pfs pvfs pvfs2 testfs xfs panfs gridftp" -+known_filesystems="nfs ufs pfs pvfs pvfs2 testfs xfs panfs gridftp lustre" - known_mpi_impls="mpich2_mpi mpich_mpi sgi_mpi hp_mpi cray_mpi lam_mpi" - # - # Defaults -@@ -1062,6 +1062,9 @@ - if test -n "$file_system_testfs"; then - AC_DEFINE(ROMIO_TESTFS,1,[Define for ROMIO with TESTFS]) - fi -+if test -n "$file_system_lustre"; then -+ AC_DEFINE(ROMIO_LUSTRE,1,[Define for ROMIO with LUSTRE]) -+fi - - if test -n "$file_system_xfs"; then - AC_DEFINE(ROMIO_XFS,1,[Define for ROMIO with XFS]) -@@ -2024,6 +2027,7 @@ - adio/ad_testfs/Makefile adio/ad_pvfs/Makefile \ - adio/ad_pvfs2/Makefile \ - adio/ad_gridftp/Makefile \ -+ adio/ad_lustre/Makefile \ - mpi-io/fortran/Makefile mpi2-other/info/fortran/Makefile \ - mpi2-other/array/fortran/Makefile test/fmisc.f \ - test/fcoll_test.f test/pfcoll_test.f test/fperf.f \ -diff -ruN romio-orig/Makefile.in romio/Makefile.in ---- romio-orig/Makefile.in 2005-05-24 18:53:11.000000000 -0400 -+++ romio/Makefile.in 2006-09-06 17:13:25.000393429 -0400 -@@ -14,7 +14,7 @@ - MPIO_DIRS = mpi-io - EXTRA_SRC_DIRS = @EXTRA_SRC_DIRS@ - FILE_SYS_DIRS = @FILE_SYS_DIRS@ --ALL_DIRS = mpi-io mpi-io/fortran mpi2-other/info mpi2-other/info/fortran mpi2-other/array mpi2-other/array/fortran adio/common adio/ad_pfs adio/ad_piofs adio/ad_nfs 
adio/ad_ufs adio/ad_xfs adio/ad_hfs adio/ad_sfs adio/ad_testfs adio/ad_pvfs adio/ad_pvfs2 adio/ad_panfs adio/ad_gridftp test -+ALL_DIRS = mpi-io mpi-io/fortran mpi2-other/info mpi2-other/info/fortran mpi2-other/array mpi2-other/array/fortran adio/common adio/ad_pfs adio/ad_piofs adio/ad_nfs adio/ad_ufs adio/ad_xfs adio/ad_hfs adio/ad_sfs adio/ad_testfs adio/ad_pvfs adio/ad_pvfs2 adio/ad_panfs adio/ad_gridftp adio/ad_lustre test - SHELL = /bin/sh - - @VPATH@ diff --git a/lustre/doc/.cvsignore b/lustre/doc/.cvsignore deleted file mode 100644 index fdf1642..0000000 --- a/lustre/doc/.cvsignore +++ /dev/null @@ -1,23 +0,0 @@ -.Xrefs -config.log -config.status -configure -Makefile -Makefile.in -.deps -tags -TAGS -OBD-HOWTO.html -OBD-HOWTO.txt -lustre-HOWTO.lyx -lustre-HOWTO.txt -lustre-pdf.bbl -lustre-pdf.blg -lustre-pdf.log -lustre-pdf.out -lustre-pdf.toc -*.eps -lustre.lyx -*.tex -*.pdf -*.aux diff --git a/lustre/doc/Makefile.am b/lustre/doc/Makefile.am deleted file mode 100644 index 1d02c60..0000000 --- a/lustre/doc/Makefile.am +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (C) 2001, 2002 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. 
-# See the file COPYING in this distribution -LYX2PDF = GS_OPTIONS=-dCompatibilityLevel=1.1 $(srcdir)/tex2pdf -overwrite -TEX2PDF = GS_OPTIONS=-dCompatibilityLevel=1.1 $(srcdir)/tex2pdf -overwrite -LYX2PS = lyx --export ps -LYX2TEX = lyx --export latex -LYX2TXT = lyx --export text -LYX2HTML = lyx --export html -LATEX = latex -DVIPS = dvips -PS2PDF = ps2pdf -TEXEXPAND = texexpand -SUFFIXES = .lin .lyx .pdf .ps .sgml .html .txt .tex .fig .eps .dvi - -if UTILS -man_MANS = lustre.7 lfs.1 mount.lustre.8 mkfs.lustre.8 tunefs.lustre.8 lctl.8 -endif - -LYXFILES= $(filter-out $(patsubst %.lin,%.lyx,$(wildcard *.lin)),\ - $(wildcard *.lin *.lyx)) - -CLEANFILES = *.aux *.tex *.log *.pdf - -EXTRA_DIST = tex2pdf lustre.7 mount.lustre.8 mkfs.lustre.8 tunefs.lustre.8 \ - $(LYXFILES) lfs.1 lctl.8 - -all: - -# These variables are set by lbuild/check-build. -RPMRELEASE ?= RELEASE -KERNTYPE ?= chaos -KERNRPM ?= kernel-2.4.18lustre13-RELEASE.i386.rpm - -.lyx.pdf: - @echo $(LYX2PDF) $< && $(LYX2PDF) $< || printf "\n*** Warning: not creating PDF docs; install lyx to rectify this\n" - -.lyx.ps: - @echo $(LYX2PS) $< && $(LYX2PS) $< || printf "\n*** Warning: not creating PostScript docs; install lyx to rectify this\n" - -.lyx.tex: - @echo $(LYX2TEX) $< && $(LYX2TEX) $< || printf "\n*** Warning: not creating LaTeX docs; install lyx to rectify this\n" - -.lyx.txt: - @echo $(LYX2TXT) $< && $(LYX2TXT) $< || printf "\n*** Warning: not creating text docs; install lyx to rectify this\n" - -.lyx.html: - @echo $(LYX2HTML) $< && $(LYX2HTML) $< || printf "\n*** Warning: not creating HTML docs; install lyx to rectify this\n" - -.tex.pdf: - $(TEX2PDF) $< - -.tex.dvi: - $(LATEX) $< - $(LATEX) $< - -.dvi.ps: - $(DVIPS) $< -o $@ - -.ps.pdf: - $(PS2PDF) $< $@ diff --git a/lustre/doc/VERSIONING b/lustre/doc/VERSIONING deleted file mode 100644 index a719103..0000000 --- a/lustre/doc/VERSIONING +++ /dev/null @@ -1,90 +0,0 @@ -Lustre versioning -================= - -0.0.1 2/19/2002 -0.0.2 3/14/2002 describe 
branches / stable tag -0.0.3 6/10/2002 describe release mechanisms - -This document describes versioning of source and binaries for Lustre. - -Packages -======== - -RPM's that you build should get 3 figure versions, CVS versions will -be 4 digits, and can correspond to test RPM's, and lead up to the -package version. So let's plan on releasing - -So you'd build 2 sets of test rpms this week: - -1.6.8.91 -1.6.8.92 - -we decide it's fine then and we release: - -1.6.9 - -If there are critical hotfixes that need to be released to customers -(e.g. data corruption fixes) then point releases would be created: - -1.6.9.{1,2,3,...} - -We go on developing with - -1.6.9.{91,92,93,94,...} - -as test releases and then we release: - -1.6.10 - -The 1.7 sequence would be an unstable sequence, like 2.5 for the kernel -is. So we expect lots of 1.7.X releases leading up to a stable 1.8 (or -2.0) at the time of deployment. - -CVS -=== - -Versions will have 4 digits: - major.minor.patch.test - -Such versions will be tagged in CVS as: - v1_6_9_97 -and referred to as: - 1.6.9.97 -encoded as: - 0x01060961 - -Usage: ------- - -New numbers are used as follows: - -1. major: - - increased when major new functionality becomes available -2. minor: - - even: for each new release with new functionality - - odd : when a new development cycle starts after a release -3. patch: - - when a development snapshot or release update becomes available - - all these are announced on lustre-{announce,devel}@clusterfs.com -4. test: - - when developers feel it is time to exchange a named version - -What will run, what won't ? ---------------------------- - -1. If the test level is non-zero, i.e. there are 4 digits in the - version, no guarantees of any kind are made. - -2. For three digit releases/tags the code should perform - according to the announcement. 
- -Branches --------- - -Any and all development must be done on branches, and can only merge to the -HEAD if _at_least_ tests/acceptance-small.sh and IOR with 5 SMP nodes and -2 clients/node with 1GB file/client pass without any errors or cleanup -problems. Additional tests may be added in the future, so the tests in the -current CVS head must pass before a branch can be merged back to the trunk. - -See http://lustre.org/docs/branches.html for details on CVS branch usage. diff --git a/lustre/doc/chbar.sh b/lustre/doc/chbar.sh deleted file mode 100755 index 7825241..0000000 --- a/lustre/doc/chbar.sh +++ /dev/null @@ -1,243 +0,0 @@ -#!/bin/sh -# Gadget to take two LaTeX files and produce a third which -# has changebars highlighting the difference between them. -# -# Version 1.2 -# Author: -# Don Ward, Careful Computing (don@careful.co.uk) -# v1.0 April 1989 -# v1.1 Feb 93 Amended to use changebar.sty (v3.0) and dvips -# v1.2 Aug 95 Added support for LaTeX209/LaTeX2e -# Added RCS support to retrive old files - -CMD=`basename $0` - -SED=sed -RM="rm -f" -DIFF=diff -ED=ed -AWK=awk -GREP=grep -MV=mv -CAT=cat -MKDIR=mkdir -CO="co" - -TMPDIR=${TMP-/tmp}/$CMD.$$ -trap 'test $DEBUG = NO && rm -rf $TMPDIR' 0 1 2 3 6 7 13 15 -mkdir $TMPDIR || { echo "cannot create directory \`$TMPDIR'." >&2; exit 1; } -TMPFILE=${TMPDIR}/$CMD.$$ -SED_CMD_FILE=$TMPFILE.sed - -usage() -{ -$CAT << _END_ -Usage: - $CMD [-hgG] [-d dir] old new [output] - default output is stdout - - $CMD [-hgG] [-d dir] old - new file on stdin, output on stdout - - $CMD [-hgG] -d dir -r rev files - old file retrieved using RCS - - Gadget to take two LaTeX files and produce a third which - has changebars highlighting the difference between them. - Changebars are inserted for differences after '\begin{document}'. - - Feature: \`new' can not be named \`-'. - - Options are: - -d dir : Write the output to file \`dir/new', if \`new' is given or - to file \`dir/old'. - If \`dir' does not exist, it is created. 
- If \`output' is given, it is discarded. - - -r rev : If the LaTeX \`files' are kept under control of the - Revision Control System RCS, the old files of - the revision \`rev' can be retrived. - \`rev' is specified using the RCS conventions. - This option must be used together with the \`-d dir' option. - \`files' must be a nonempty list of files. - - -h : Print this info text. - -g : Print some debugging info. - -G : Even more debug info. - - Version 1.2: August 3. 1995 -_END_ -exit 1 -} - -# parse options and arguments -DEBUG="NO" -DIR= -REV= -# process options -while getopts d:r:gGh i $* -do - case $i in - d ) DIR=$OPTARG;; - r ) REV=$OPTARG;; - g ) DEBUG="YES" ;; - G ) set -x; DEBUG="YES";; - h | \ - * ) usage ;; - esac -done - -shift `expr $OPTIND - 1` - -case $# in - 1 ) OLD=$1; NEW="-"; OUT="" ;; - 2 ) OLD=$1; NEW=$2; OUT="" ;; - 3 ) OLD=$1; NEW=$2; OUT="$3" ;; - * ) usage ;; -esac - -# check correct options -if [ ! -z "$DIR" ] -then - [ -d $DIR ] || $MKDIR $DIR -fi - -if [ ! -z "$REV" ] -then - [ -z "$DIR" ] && usage - FILES=$* -else - FILES=$NEW -fi - -# do the work -for NEW in $FILES -do - if [ ! -z "$DIR" ] - then - if [ $NEW = "-" ] - then - OUT=$DIR/$OLD - else - OUT=$DIR/$NEW - fi - fi - if [ ! -z "$REV" ] - then - OLD=${TMPFILE}.old - $CO -p"$REV" -q $NEW > $OLD - fi - - [ $DEBUG = "YES" ] && echo "OLD=\`$OLD' NEW=\`$NEW' OUT=\`$OUT'" - - # gather some info about the file - # Since we have for sure only the name of the OLD file, ... - $GREP "^\\\\begin{document}" $OLD > /dev/null - if [ $? -eq 0 ] - then - [ $DEBUG = "YES" ] && echo "contains a \\begin{document}" - HAS_BEGIN_DOC="YES" - else - [ $DEBUG = "YES" ] && echo "contains no \\begin{document}" - HAS_BEGIN_DOC="NO" - fi - - # Method to do the work: - # 1 Use diff to get an ed script to go from file1 to file2. - # 2 Breath on it a bit (with sed) to insert changebar commands. - # 3 Apply modified ed script to produce (nearly) the output. 
- # 4 Use awk to insert the changebars option into the \documentstyle - # and to handle changebar commands inside verbatim environments. - # 5 Remove changebars before \begin{document} with sed - - # SED commands to edit ED commands to edit old file - $CAT > $SED_CMD_FILE <<\_END_ -/^\.$/i\ -\\cbend{}% -/^[0-9][0-9]*[ac]$/a\ -\\cbstart{}% -/^[0-9][0-9]*,[0-9][0-9]*[ac]$/a\ -\\cbstart{}% -/^[0-9][0-9]*d$/a\ -i\ -\\cbdelete{}%\ -. -/^[0-9][0-9]*,[0-9][0-9]*d$/a\ -i\ -\\cbdelete{}%\ -. -_END_ - - # note DIFF accepts `-' as stdin - $DIFF -b -e $OLD $NEW | \ - ( $SED -f $SED_CMD_FILE ; echo w ${TMPFILE}.1 ; echo q ) | \ - $ED - $OLD - - # AWK commands to insert Changebars style and to protect - # changebar commands in verbatim environments - # and to tell what driver is in use; we assume the `dvips' driver - - $AWK ' - BEGIN {kind=""; # we saw now \documentXXX[]{} - } - /^\\documentstyle/{ - kind = "209"; - if (index($0, "changebar") == 0 ) { - opts = index($0, "[") - if (opts > 0) - printf "%schangebar,%s\n",substr($0,1,opts),substr($0,opts+1) - else - printf "\\documentstyle[changebar]%s\n", substr($0,15) - next - } - } - /^\\documentclass/{ - kind = "2e"; - printf "%s\n", $0 - printf "\\usepackage[dvips]{changebar}\n" - next - } - /\\begin{document}/ {if (kind == "209" ) {print "\\driver{dvips}"}} - /\\begin{verbatim}/{++nesting} - /\\end{verbatim}/{--nesting} - /\\cbstart{}%|\\cbend{}%|\cbdelete{}%/ { - if ( nesting > 0) { - # changebar command in a verbatim environment: Temporarily exit, - # do the changebar command and reenter. - # - # The obvious ( printf "\\end{verbatim}%s\\begin{verbatim} , $0 ) - # leaves too much vertical space around the changed line(s). 
- # The following magic seeems to work - # - print "\\end{verbatim}\\nointerlineskip" - print "\\vskip -\\ht\\strutbox\\vskip -\\ht\\strutbox" - printf "\\vbox to 0pt{\\vskip \\ht\\strutbox%s\\vss}\n", $0 - print "\\begin{verbatim}" - next - } - } - { print $0 } - ' ${TMPFILE}.1 > ${TMPFILE}.2 - - # if a \begin{document} is contained in the file, - # remove the changebar commands before them - - if [ $HAS_BEGIN_DOC = "YES" ] - then - SED_CMD="1,/\\\\begin{document}/s/\(\\\\cb[sed][tne][adl][^{}]*{}%\)$/%%\1/" - $SED "$SED_CMD" ${TMPFILE}.2 > ${TMPFILE}.3 - else - $CAT ${TMPFILE}.2 > ${TMPFILE}.3 - fi - if [ -z "$OUT" ] - then - $CAT ${TMPFILE}.3 - else - $MV ${TMPFILE}.3 $OUT - fi - -done - -[ $DEBUG = "NO" ] && $RM ${TMPFILE}.* - -############################################################### diff --git a/lustre/doc/lconf.8 b/lustre/doc/lconf.8 deleted file mode 100644 index a6ca88a..0000000 --- a/lustre/doc/lconf.8 +++ /dev/null @@ -1,206 +0,0 @@ -.TH lconf 1 "2004 Sep 16" Lustre "configuration utilities" -.SH NAME -lconf \- Lustre filesystem configuration utility -.SH SYNOPSIS -.br -.B lconf -[OPTIONS] -.br -.SH DESCRIPTION -.B lconf -, when invoked configures a node following directives in the -.Can be used to control recovery and startup/shutdown -. There will be single configuration file for all the nodes in a -single cluster. This file should be distributed to all the nodes in -the cluster or kept in a location accessible to all the nodes. The XML file must be specified. When invoked with no options, lconf will attempt to configure the resources owned by the node it is invoked on -.PP -The arguments that can be used for lconf are: -.PP -.TP ---abort_recovery - Used to start Lustre when you are certian that -recovery will not succeed, as when an OST or MDS is disabled. 
-.TP ---acl Enable Access Control List support on the MDS -.TP ---allow_unprivileged_port Allows connections from unprivileged ports -.TP ---clientoptions -Additional options for mounting Lustre clients. Obsolete with -zeroconfig mounting.. -.TP ---client_uuid -The failed client (required for recovery). -.TP ---clumanager Generate a Red Hat Clumanager configuration file for this -node. -.TP ---config -Cluster configuration name used for LDAP query (depreciated) -.TP ---conn_uuid -The failed connection (required for recovery). -.TP --d|--cleanup -Unconfigure a node. The same config and --node argument used for configuration needs to be used for cleanup as well. This will attempt to undo all of the configuration steps done by lconf, including unloading the kernel modules. -.TP ---debug_path -Path to save debug dumps.(default is /tmp/lustre-log) -.TP ---dump -Dump the kernel debug log to the specified file before portals is unloaded during cleanup. -.TP ---failover -Used to shutdown without saving state. This will allow the node to give up service to another node for failover purposes. This will not be a clean shutdown. -.TP --f|--force -Forced unmounting and/or obd detach during cleanup. -.TP ---gdb -Causes lconf to create a gdb module script and pause 5 seconds before doing any Lustre configuration (the gdb module script is always created, however). -.TP ---gdb_script -Full name of gdb debug script. Default is /tmp/ogdb. -.TP ---group -The group of devices to cleanup/configure. -.TP ---group_upcall -Pathname to the MDS upcall to resolve secondary group membership. Defaults to NONE, meaning that the MDS will use whatever group the client supplies, but this is limited to a single supplementary group. -.TP --h,--help -Print help. -.TP ---inactive -The UUID of the service to be ignored by a client mounting Lustre. Allows the client to mount in the presence of some inactive services. (currently OST only). Multiple UUIDs can be specified by repeating the option. 
-.TP ---lctl-dump -Dump all ioctls to the specified file -.TP ---ldapurl -LDAP server URL. Depreciated -.TP ---lustre=src_dir -Specify the base directory for Lustre sources, this parameter will cause lconf to load the lustre modules from this source tree. -.TP ---lustre_upcall -Set the location of the Lustre upcall scripts used by the client for recovery -.TP ---make_service_scripts Create per-service symlinks for use with clumanager HA software -.TP ---mds_ost_conn -Open connections to OSTs on MDS. -.TP ---maxlevel -Perform configuration of devices and services up to level given. When -used in conjunction with cleanup, services are torn down up to a -certain level. -Levels are aproximatly like: -10 - network -20 - device, ldlm -30 - osd, mdd -40 - mds, ost -70 - mountpoint, echo_client, osc, mdc, lov -.TP ---minlevel -Specify the minimum level of services to configure/cleanup. Default is 0. -.TP ---mkfsoptions -Specify additional options for the mk*fs command line. -.TP ---mountfsoptions -Specify additional options for mount fs command line. Mount options will be passed by this argument. For example, extents are to be enabled by adding ",extents" to the --mountfsoptions option. "errors=remount-ro" and "asyncdel" can also be added to it. -.TP ---node node_name -Specify a specific node to configure. By default, lconf will search for nodes with the local hostname and 'localhost'. When --node is used, only node_name is searched for. If a matching node is not found in the config, then lconf exits with an error. -.TP ---noexec,-n -Print, but don't execute, the steps lconf will perform. This is useful for debugging a configuration, and when used with --node, can be run on any host. -.TP ---nomod -Only setup devices and services, do not load modules. -.TP ---nosetup -Only load modules, do not configure devices or services. -.TP ---old_conf Start up service even though config logs appear outdated. -.TP ---portals -Specify portals source directory. 
If this is a relative path, then it -is assumed to be relative to lustre. (Depreciated) -.TP ---portals_upcall -Specify the location of the Portals upcall scripts used by the client -for recovery (Depreciated) -.TP ---ptldebug debug-level -This options can be used to set the required debug level. -.TP ---quota -Enable quota support for client filesystem -.TP ---rawprimary For clumanager, device of the primary quorum -(default=/dev/raw/raw1) -.TP ---rawsecondary For clumanager, device of the secondary quorum (default=/dev/raw/raw2) -.TP ---record -Write config information on mds. -.TP ---record_device -Specify MDS device name that will record the config commands. -.TP ---record_log -Specify the name of config record log. -.TP ---recover -Recover a device. -.TP ---reformat -Reformat all the devices. This is essential on the first time the file system is brought up. -.TP ---select -Select a particular node for a service -.TP ---service -Shorthand for --group --select = -.TP ---service_scripts For clumanager, directory containing per-service scripts (default=/etc/lustre/services) -.TP ---single_socket The socknal option. Uses only one socket instead of a -bundle. -.TP ---subsystem -Set the portals debug subsystem. -.TP ---tgt_uuid -Specify the failed target (required for recovery). -.TP ---timeout -Set the recovery timeout period. -.TP ---upcall -Set the location of both Lustre and Portals upcall scripts used by the -client for recovery -.TP ---user_xattr Enable user_xattr support on MDS -.TP ---verbose,-v -Be verbose and show actions while going along. -.TP ---write_conf -Save all client configuration information on the MDS -.SH EXAMPLES -.TP -.B lconf --node client config.xml -This invokes lconf on the client node. -.TP -.B lconf --ptldebug "~(portals | malloc | trace)" -Used to set the required debug levels (all but these). -.TP -.B lconf --ptldebug "ldlm|ha" -Used to turn-on specific debug types. 
-.TP -.B lconf --inactive OST_ost1_UUID --inactive OST_ost2_UUID config.xml -A subset of failed OSTs can be ignored during Lustre mount on the clients by using this option. Here OST1 and OST2 have failed and need to be ignored. -.SH BUGS -None are known. diff --git a/lustre/doc/lconf.lyx b/lustre/doc/lconf.lyx deleted file mode 100644 index 2846f48..0000000 --- a/lustre/doc/lconf.lyx +++ /dev/null @@ -1,387 +0,0 @@ -#LyX 1.3 created this file. For more info see http://www.lyx.org/ -\lyxformat 221 -\textclass amsart -\language english -\inputencoding auto -\fontscheme times -\graphics default -\paperfontsize default -\spacing single -\papersize letterpaper -\paperpackage a4 -\use_geometry 0 -\use_amsmath 0 -\use_natbib 0 -\use_numerical_citations 0 -\paperorientation portrait -\secnumdepth 3 -\tocdepth 3 -\paragraph_separation skip -\defskip medskip -\quotes_language english -\quotes_times 2 -\papercolumns 1 -\papersides 1 -\paperpagestyle default - -\layout Section - -lconf -\layout Subsection - -NAME -\layout Description - -lconf Lustre filesystem configuration utility. -\layout Subsection - -SYNOPSIS -\layout Standard - - -\series bold -lconf\SpecialChar ~ -[--node ] [-d,--cleanup] [--noexec] [--gdb] [--nosetup] - [--nomod] [-n,--noexec] [-v,--verbose] [-h,--help] -\layout Subsection - -DESCRIPTION -\layout Standard - -This program configures a node following directives in the . - There will be single configuration file for all the nodes in a single cluster. - This file should be distributed to all the nodes in the cluster or kept - in a location accessible to all the nodes. - One option is to store the cluster configuration information in LDAP format - on an LDAP server that can be reached from all the cluster nodes. -\layout Description - ---client_uuid\SpecialChar ~ - The failed client (required for recovery). -\layout Description - ---clientoptions\SpecialChar ~ - Additional options for Lustre. 
-\layout Description - ---config\SpecialChar ~ - Cluster configuration name used for LDAP query -\layout Description - ---conn_uuid\SpecialChar ~ - The failed connection (required for recovery). -\layout Description - ---d|--cleanup Unconfigure a node. - The same config and -\emph on ---node -\emph default - argument used for configuration needs to be used for cleanup as well. - This will attempt to undo all of the configuration steps done by lconf, - including unloading the kernel modules. -\layout Description - ---debug_path\SpecialChar ~ - Path to save debug dumps. -\layout Description - ---dump\SpecialChar ~ - Dump the kernel debug log to the specified file before portals - is unloaded during cleanup. -\layout Description - ---dump_path\SpecialChar ~ - Path to save debug dumps. - Default is /tmp/lustre_log -\layout Description - ---failover Used to shutdown without saving state. - Default is 0. - This will allow the node to give up service to another node for failover - purposes. - This will not be a clean shutdown. -\layout Description - ---force Forced unmounting and/or obd detach during cleanup. - Default is 0. - -\layout Description - ---gdb Causes lconf to print a message and pause for 5 seconds after creating - a gdb module script and before doing any Lustre configuration (the gdb - module script is always created, however). -\layout Description - ---gdb_script\SpecialChar ~ - Full name of gdb debug script. - Default is /tmp/ogdb. -\layout Description - ---group\SpecialChar ~ - The group of devices to cleanup/configure. -\layout Description - ---group_upcall\SpecialChar ~ - Pathname to the MDS upcall to resolve secondary group membership. - Defaults to NONE, meaning that the MDS will use whatever group the client - supplies, but this is limited to a single supplementary group. -\layout Description - --h,--help Print help. -\layout Description - ---inactive\SpecialChar ~ - The UUID of the service to be ignored by a client mounting - Lustre. 
- Allows the client to mount in the presence of some inactive services. - (currently OST only). - Multiple UUIDs can be specified by repeating the option. - -\layout Description - ---lctl-dump\SpecialChar ~ - Dump all ioctls to the specified file -\layout Description - ---ldapurl\SpecialChar ~ - LDAP server URL -\layout Description - ---lustre_upcall\SpecialChar ~ - Set the location of the Lustre upcall scripts used - by the client for recovery -\layout Description - ---lustre=src_dir Specify the base directory for Lustre sources, this parameter - will cause lconf to load the lustre modules from this soure tree. -\layout Description - ---mds_ost_conn Open connections to OSTs on MDS. -\layout Description - ---maxlevel\SpecialChar ~ - Perform configuration of devices and services up to level - given. - -\emph on -level -\emph default - can take the values -\series bold -net, dev, svc, fs. - -\series default -When used in conjunction with cleanup, services are torn down up to a certain - level. - Default is 100. -\layout Description - ---minlevel\SpecialChar ~ - Specify the minimum level of services to configure/cleanup. - Default is 0. -\layout Description - ---mkfsoptions\SpecialChar ~ - Specify additional options for the mk*fs command - line. -\layout Description - ---mountfsoptions\SpecialChar ~ - Specify additional options for mount fs command - line. - Mount options will be passed by this argument. - For example, extents are to be enabled by adding -\begin_inset Quotes eld -\end_inset - -,extents -\begin_inset Quotes erd -\end_inset - - to the --mountfsoptions option. - -\begin_inset Quotes eld -\end_inset - -errors=remount-ro -\begin_inset Quotes erd -\end_inset - - and -\begin_inset Quotes eld -\end_inset - -asyncdel -\begin_inset Quotes erd -\end_inset - - can also be added to it. -\layout Description - ---node\SpecialChar ~ -node_name Specify a specific node to configure. - By default, lconf will search for nodes with the local hostname and 'localhost'. 
- When -\emph on - --node -\emph default - is used, only -\emph on -node_name -\emph default - is searched for. - If a matching node is not found in the config, then lconf exits with an - error. -\layout Description - ---noexec,-n Print, but don't execute, the steps lconf will perform. - This is useful for debugging a configuration, and when used with -\emph on ---node -\emph default -, can be run on any host. -\layout Description - ---nomod Only setup devices and services, do not load modules. -\layout Description - ---nosetup Only load modules, do not configure devices or services. -\layout Description - ---portals\SpecialChar ~ - Specify portals source directory. - If this is a relative path, then it is assumed to be relative to lustre. -\layout Description - ---portals_upcall\SpecialChar ~ - Specify the location of the Portals upcall scripts - used by the client for recovery -\layout Description - ---ptldebug\SpecialChar ~ -debug\SpecialChar ~ -level This options can be used to set the required debug - level. -\layout Description - ---record Write config information on mds. -\layout Description - ---record_log\SpecialChar ~ - Specify the name of config record log. -\layout Description - ---record_device\SpecialChar ~ - Specify MDS device name that will record the config - commands. -\layout Description - ---recover\SpecialChar ~ - Recover a device. -\layout Description - ---reformat Reformat all the devices. - This is essential on the first time the file system is brought up. -\layout Description - ---select\SpecialChar ~ - Select a particular node for a service -\layout Description - ---single_socket Specify socknal option: only use one socket instead of bundle. -\layout Description - ---subsystem\SpecialChar ~ - Set the portals debug subsystem. -\layout Description - ---tgt_uuid\SpecialChar ~ - Specify the failed target (required for recovery). -\layout Description - ---timeout\SpecialChar ~ - Set the recovery timeout period. 
-\layout Description - ---upcall\SpecialChar ~ - Set the location of both Lustre and Portals upcall scripts - used by the client for recovery -\layout Description - ---verbose,-v Be verbose and show actions while going along. -\layout Description - ---write_conf Save all client configuration information on the MDS -\layout Subsection - -EXAMPLES -\layout Standard - -On client nodes this is typically invoked as: -\layout LyX-Code - - -\size small -lconf --node client config.xml -\layout Standard - -in order to give clients, regardless of hostname, a single configuration. -\layout Standard - -Required debug levels can be set like this: -\layout LyX-Code - - -\size small - ## Everything but these -\layout LyX-Code - - -\size small -lconf --ptldebug -\begin_inset Quotes eld -\end_inset - -~(portals | malloc | trace) -\begin_inset Quotes erd -\end_inset - - -\layout LyX-Code - -\layout LyX-Code - - -\size small -## Only these debug types -\layout LyX-Code - - -\size small -lconf --ptldebug -\begin_inset Quotes eld -\end_inset - -ldlm|ha -\begin_inset Quotes erd -\end_inset - - -\layout Standard - -A subset of failed OSTs can be ignored during Lustre mount on the clients - by using the following option: -\layout LyX-Code - - lconf --inactive OST_ost1_UUID --inactive OST_ost2_UUID config.xml -\layout Standard - -where OST1 and OST2 have failed and need to be ignored. -\layout Subsection - -BUGS -\layout Standard - -None are known. -\the_end diff --git a/lustre/doc/lctl.8 b/lustre/doc/lctl.8 deleted file mode 100644 index 9243863..0000000 --- a/lustre/doc/lctl.8 +++ /dev/null @@ -1,190 +0,0 @@ -.TH lctl 1 "2003 Oct 8" Lustre "configuration utilities" -.SH NAME -lctl \- Low level Lustre filesystem configuration utility -.SH SYNOPSIS -.br -.B lctl -.br -.B lctl --device -.br -.SH DESCRIPTION -.B lctl -is used to directly control Lustre via an ioctl interface, allowing -various configuration, maintenance, and debugging features to be accessed. 
- -.B lctl -can be invoked in interactive mode by issuing lctl command. After that, commands are issued as below. The most common commands in lctl are -.B dl -, -.B device -, -.B network -.I -, -.B list_nids -, -.B ping -.I nid -, -.B help -, -.B quit. - -To get a complete listing of available commands, type -.B help -at the lctl prompt. To get basic help on the meaning and syntax of a -command, type -.B help -.I command -. Command completion is activated with the TAB key, and command history is available via the up- and down-arrow keys. - -For non-interactive use, one uses the second invocation, which runs command after connecting to the device. - -.SS Network Configuration -.TP -.BI network " |" -Start or stop LNET, or select a network type for other -.I lctl -LNET commands -.TP -.BI list_nids -Print all Network Identifiers on the local node. LNET must be running. -.TP -.BI which_nid " " -From a list of nids for a remote node, show which interface communication -will take place on. -.TP -.BI ping " " -Check LNET connectivity via an LNET ping. This will use the fabric -appropriate to the specified NID. -.TP -.BI interface_list -Print the network interface information for a given -.B network -type. -.TP -.BI peer_list -Print the known peers for a given -.B network -type. -.TP -.BI conn_list -Print all the connected remote NIDs for a given -.B network -type. -.TP -.BI active_tx -This command should print active transmits, and it is only used for elan network type. -.TP -.BI route_list -Print the complete routing table. -.PP -.SS Device Selection -.TP -.BI device " " -This will select the specified OBD device. All other commands depend on the device being set. -.TP -.BI device_list -Show all the local Lustre OBDs. AKA -.B dl -.PP -.SS Device Operations -.TP -.BI conf_param " " -Set a permanent configuration parameter for any device via the MGS. This -command must be run on the MGS node. -.TP -.BI activate -Reactivate an import after deactivating, below. 
-.TP -.BI deactivate -Deactivate an import, in particular meaning do not assign new file stripes -to an OSC. This command should be used on the OSC in the MDT LOV -corresponding to a failed OST device, to prevent further attempts at -communication with the failed OST. -.TP -.BI abort_recovery -Abort the recovery process on a restarting MDT or OST device -.PP -.SS Virtual Block Device Operation -Lustre is able to emulate a virtual block device upon regular file. It is necessary to be used when you are trying to setup a swap space via file. -.TP -.BI blockdev_attach " " -Attach the lustre regular file to a block device. If the device node is not existent, lctl will create it \- it is recommended to create it by lctl since the emulator uses a dynamical major number. -.TP -.BI blockdev_detach " " -Detach the virtual block device. -.TP -.BI blockdev_info " " -Acquire which lustre file was attached to the device node. -.PP -.SS Debug -.TP -.BI debug_daemon -Start and stop the debug daemon, and control the output filename and size. -.TP -.BI debug_kernel " [file] [raw]" -Dump the kernel debug buffer to stdout or file. -.TP -.BI debug_file " [output]" -Convert kernel-dumped debug log from binary to plain text format. -.TP -.BI clear -Clear the kernel debug buffer. -.TP -.BI mark " " -Insert marker text in the kernel debug buffer. -.TP -.BI filter " " -Filter kernel debug messages by subsystem or mask. -.TP -.BI show " " -Show specific type of messages. -.TP -.BI debug_list " " -List all the subsystem and debug types. -.TP -.BI modules " " -Provide gdb-friendly module information. - -.SH OPTIONS -The following options can be used to invoke lctl. -.TP -.B --device -The device to be used for the operation. This can be specified by name or -number. 
See -.B device_list -.TP -.B --ignore_errors | ignore_errors -Ignore errors during script processing -.TP -.SH EXAMPLES -# lctl -.br -lctl > dl - 0 UP mgc MGC192.168.0.20@tcp bfbb24e3-7deb-2ffa-eab0-44dffe00f692 5 - 1 UP ost OSS OSS_uuid 3 - 2 UP obdfilter testfs-OST0000 testfs-OST0000_UUID 3 -.br -lctl > dk /tmp/log -Debug log: 87 lines, 87 kept, 0 dropped. -.br -lctl > quit -.PP -# lctl conf_param testfs-MDT0000 sys.timeout=40 - -.SH BUGS -Please report all bugs to ClusterFileSystems, support@clusterfs.com -.SH AVAILABILITY -.B lctl -is part of the -.BR Lustre (7) -filesystem package and is available from CFS -.br -http://clusterfs.com -.SH SEE ALSO -.BR Lustre (7), -.BR mkfs.lustre (8), -.BR mount.lustre (8), -.BR lctl (8), -.BR lfs (1) diff --git a/lustre/doc/lctl.lyx b/lustre/doc/lctl.lyx deleted file mode 100644 index c3a769f..0000000 --- a/lustre/doc/lctl.lyx +++ /dev/null @@ -1,928 +0,0 @@ -#LyX 1.3 created this file. For more info see http://www.lyx.org/ -\lyxformat 221 -\textclass amsart-plain -\language english -\inputencoding auto -\fontscheme times -\graphics default -\paperfontsize default -\spacing single -\papersize letterpaper -\paperpackage a4 -\use_geometry 0 -\use_amsmath 0 -\use_natbib 0 -\use_numerical_citations 0 -\paperorientation portrait -\secnumdepth 3 -\tocdepth 3 -\paragraph_separation skip -\defskip medskip -\quotes_language english -\quotes_times 2 -\papercolumns 1 -\papersides 1 -\paperpagestyle default - -\layout Section - -lctl -\layout Subsection - -NAME -\layout Description - -lctl Low level Lustre filesystem configuration utility. 
-\layout Subsection - -SYNOPSIS -\layout Standard - - -\series bold -lctl -\layout Standard - - -\series bold -lctl\SpecialChar ~ ---device\SpecialChar ~ - -\layout Standard - - -\series bold -lctl\SpecialChar ~ ---threads\SpecialChar ~ -\SpecialChar ~ -\SpecialChar ~ -\SpecialChar ~ - -\layout Subsection - -DESCRIPTION -\layout Standard - -The program can be invoked in interactive mode by issuing -\series bold -lctl. - -\series default - After that, commands are issued as below. - The most common commands in lctl are (in matching pairs) -\family typewriter -\size small -device -\family default -\size default - and -\family typewriter -\size small -attach -\family default -\size default -, -\family typewriter -\size small -detach -\family default -\size default - and -\family typewriter -\size small -setup -\family default -\size default -, -\family typewriter -\size small -cleanup -\family default -\size default - and -\family typewriter -\size small -connect -\family default -\size default -, -\family typewriter -\size small -disconnect -\family default -\size default - and -\family typewriter -\size small -help -\family default -\size default -, and -\family typewriter -\size small -quit -\family default -\size default -. - To get a complete listing of available commands, type -\family typewriter -\size small -help -\family default -\size default - at the lctl prompt. - To get basic help on the meaning and syntax of a command, type -\family typewriter -\size small -help command -\family default -\size default -. - Command completion is activated with the -\family typewriter -\size small -TAB -\family default -\size default - key, and command history is available via the up- and down-arrow keys. - -\layout Standard - -For non-interactive single threaded use, one uses the second invocation, - which runs -\emph on -command -\emph default - after connecting to the device -\emph on -. 
- -\emph default - -\layout Description - ---device The device number to be used for the operation. - The value of devno is an integer, normally found by calling -\emph on -lctl device_list/dl -\emph default -. - -\layout Description - ---threads How many threads should be forked doing the command specified. - The numthreads variable is a strictly positive integer indicating how many - threads should be started. - The -\emph on -devno -\emph default -option is used as above. -\layout Description - ---ignore_errors\SpecialChar ~ -|\SpecialChar ~ -ignore_errors Ignore errors that occur during script processing. -\layout Description - -dump Save ioctl buffer to file. -\layout LyX-Code - -\layout Description - -Network\SpecialChar ~ -Configuration -\begin_deeper -\layout Description - ---net\SpecialChar ~ - -\series bold - -\series default -\SpecialChar ~ - -\series bold - -\series default -Indicate the network type to be used for the operation. -\layout Description - -network\SpecialChar ~ - Indicate what kind of network applies for the - configuration commands that follow. -\layout Description - -interface_list Print the interface entries. -\layout Description - -add_interface\SpecialChar ~ - -\series bold -< -\series default -ip>\SpecialChar ~ -[netmask] Add an interface entry. -\layout Description - -del_interface\SpecialChar ~ -[ip] Delete an interface entry. -\layout Description - -peer_list Print the peer entries. -\layout Description - -add_peer\SpecialChar ~ - -\series bold -< -\series default -nid>\SpecialChar ~ - -\series bold -< -\series default -host>\SpecialChar ~ - -\series bold -< -\series default -port> -\series bold -Add a peer entry. -\layout Description - -del_peer\SpecialChar ~ -[ -\series bold -< -\series default -nid>]\SpecialChar ~ -[ -\series bold -< -\series default -host>]\SpecialChar ~ -[ks] Remove a peer entry. 
-\layout Description - -connect\SpecialChar ~ -\SpecialChar ~ -\SpecialChar ~ -[iIOC] This will establish a connection to a remote - network, network -\emph on -id -\emph default - given by the hostname/port combination. -\layout Description - -disconnect\SpecialChar ~ - Disconnect from a remote -\emph on -nid -\emph default -. -\layout Description - -active_tx This command should print active transmits, and it is only used - for elan network type. -\layout Description - -mynid\SpecialChar ~ -[nid] Informs the socknal of the local -\emph on -nid -\emph default -. - It defaults to hostname for tcp networks and is automatically setup for - elan/myrinet networks. -\layout Description - -shownid Print the local NID. -\layout Description - -add_uuid\SpecialChar ~ -\SpecialChar ~ -\SpecialChar ~ - Associate a given UUID with an -\emph on -nid. -\layout Description - -close_uuid\SpecialChar ~ -\SpecialChar ~ - Disconnect a UUID. -\layout Description - -del_uuid\SpecialChar ~ - Delete a UUID association. -\layout Description - -add_route\SpecialChar ~ -\SpecialChar ~ -\SpecialChar ~ -[] Add an entry to the portals routing - table for the given target. - The arguments should be nid. - If only one is provided, this command should only add the route - for this target. - But, if both of the s are provides, this command should add the - route for all the target in the range specified by the two targets(from - low nid to high nid). -\layout Description - -del_route\SpecialChar ~ -\SpecialChar ~ -[]\SpecialChar ~ -[] Delete the route entry for the given - targets from the portals routing table. - The arguments should be nid. - -\layout Description - -set_route\SpecialChar ~ -\SpecialChar ~ -\SpecialChar ~ -[