From 053cc3651b455f4dcaa2cdac30b78d45f6b8d70f Mon Sep 17 00:00:00 2001
From: qiyong
Date: Sun, 16 Jul 2006 15:37:33 +0000
Subject: [PATCH] support for linux v2.6.18

---
 .../patches/ext3-extents-2.6.18-vanilla.patch      | 2935 ++++++++++++++++++++
 .../patches/ext3-mballoc2-2.6.18-vanilla.patch     | 2810 +++++++++++++++++++
 .../series/ldiskfs-2.6.18-vanilla.series           |   13 +
 .../patches/dev_read_only-2.6.18-vanilla.patch     |  145 +
 .../patches/export-2.6.18-vanilla.patch            |   24 +
 .../patches/export-show_task-2.6.18-vanilla.patch  |   25 +
 .../patches/export-truncate-2.6.18-vanilla.patch   |   39 +
 .../patches/export_symbols-2.6.18-vanilla.patch    |   64 +
 .../patches/ext3-extents-2.6.18-vanilla.patch      | 2935 ++++++++++++++++++++
 .../patches/ext3-mballoc2-2.6.18-vanilla.patch     | 2810 +++++++++++++++++++
 ...xt3-multi-mount-protection-2.6.18-vanilla.patch |  373 +++
 .../patches/ext3-wantedi-misc-2.6.18-vanilla.patch |   16 +
 .../patches/iopen-misc-2.6.18-vanilla.patch        |   82 +
 .../patches/jbd-jcberr-2.6.18-vanilla.patch        |  228 ++
 .../patches/nfs-cifs-intent-2.6.18-vanilla.patch   |  120 +
 .../patches/tcp-zero-copy-2.6.18-vanilla.patch     |  450 +++
 .../patches/vfs_intent-2.6.18-vanilla.patch        |  824 ++++++
 .../patches/vfs_nointent-2.6.18-vanilla.patch      |  451 +++
 .../patches/vfs_races-2.6.18-vanilla.patch         |   61 +
 lustre/kernel_patches/series/2.6.18-vanilla.series |   20 +
 .../series/ldiskfs-2.6.18-vanilla.series           |   13 +
 21 files changed, 14438 insertions(+)
 create mode 100644 ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch
 create mode 100644 ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch
 create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6.18-vanilla.series
 create mode 100644 lustre/kernel_patches/patches/dev_read_only-2.6.18-vanilla.patch
 create mode 100644 lustre/kernel_patches/patches/export-2.6.18-vanilla.patch
 create mode 100644 lustre/kernel_patches/patches/export-show_task-2.6.18-vanilla.patch
 create mode 100644 lustre/kernel_patches/patches/export-truncate-2.6.18-vanilla.patch
 create mode 100644 lustre/kernel_patches/patches/export_symbols-2.6.18-vanilla.patch
 create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch
 create mode 100644 lustre/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch
 create mode 100644 lustre/kernel_patches/patches/ext3-multi-mount-protection-2.6.18-vanilla.patch
 create mode 100644 lustre/kernel_patches/patches/ext3-wantedi-misc-2.6.18-vanilla.patch
 create mode 100644 lustre/kernel_patches/patches/iopen-misc-2.6.18-vanilla.patch
 create mode 100644 lustre/kernel_patches/patches/jbd-jcberr-2.6.18-vanilla.patch
 create mode 100644 lustre/kernel_patches/patches/nfs-cifs-intent-2.6.18-vanilla.patch
 create mode 100644 lustre/kernel_patches/patches/tcp-zero-copy-2.6.18-vanilla.patch
 create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6.18-vanilla.patch
 create mode 100644 lustre/kernel_patches/patches/vfs_nointent-2.6.18-vanilla.patch
 create mode 100644 lustre/kernel_patches/patches/vfs_races-2.6.18-vanilla.patch
 create mode 100644 lustre/kernel_patches/series/2.6.18-vanilla.series
 create mode 100644 lustre/kernel_patches/series/ldiskfs-2.6.18-vanilla.series
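Both copies of ext3-extents-2.6.18-vanilla.patch below (the ldiskfs and lustre trees carry the same file) teach ext3 an extent-mapped block format. For orientation, here is a minimal sketch of the on-disk records that the new extents.c manipulates. The field names are the ones the code below uses; the integer widths are an assumption on my part — the authoritative definitions live in the ext3_extents.h header that the code includes, which this excerpt does not show:

	/*
	 * Illustrative sketch only: field names match extents.c below;
	 * field widths are assumed, not taken from this commit.
	 */
	struct ext3_extent_header {
		__u16	eh_magic;	/* checked against EXT3_EXT_MAGIC */
		__u16	eh_entries;	/* number of valid entries */
		__u16	eh_max;		/* capacity of this node */
		__u16	eh_depth;	/* 0 for a leaf, > 0 for an index node */
		__u32	eh_generation;	/* bumped by ext3_ext_tree_changed() */
	};

	struct ext3_extent {		/* entry in a leaf node */
		__u32	ee_block;	/* first logical block of the extent */
		__u16	ee_len;		/* number of blocks covered */
		__u16	ee_start_hi;	/* high bits; zeroed below, no large-fs support yet */
		__u32	ee_start;	/* first physical block */
	};

	struct ext3_extent_idx {	/* entry in an index node */
		__u32	ei_block;	/* covers logical blocks from ei_block onward */
		__u32	ei_leaf;	/* physical block of the next-level node */
	};

The root header sits in the inode body (ext3_init_tree_desc() below points tree->root at EXT3_I(inode)->i_data), while every deeper level occupies a full filesystem block; that split is exactly what the ext3_ext_space_root*() and ext3_ext_space_block*() helpers compute.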
diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch
new file mode 100644
index 0000000..e89e8e7
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch
@@ -0,0 +1,2935 @@
+Index: linux-stage/fs/ext3/extents.c
+===================================================================
+--- /dev/null	1970-01-01 00:00:00.000000000 +0000
++++ linux-stage/fs/ext3/extents.c	2006-07-16 14:10:21.000000000 +0800
+@@ -0,0 +1,2347 @@
++/*
++ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA
++ */
++
++/*
++ * Extents support for EXT3
++ *
++ * TODO:
++ *   - ext3_ext_walk_space() should not use ext3_ext_find_extent()
++ *   - ext3_ext_calc_credits() could take 'mergable' into account
++ *   - ext3*_error() should be used in some situations
++ *   - find_goal() [to be tested and improved]
++ *   - smart tree reduction
++ *   - arch-independence
++ *     common on-disk format for big/little-endian arch
++ */
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/time.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/smp_lock.h>
++#include <linux/highuid.h>
++#include <linux/pagemap.h>
++#include <linux/quotaops.h>
++#include <linux/string.h>
++#include <linux/slab.h>
++#include <linux/ext3_extents.h>
++#include <asm/uaccess.h>
++
++
++static inline int ext3_ext_check_header(struct ext3_extent_header *eh)
++{
++	if (eh->eh_magic != EXT3_EXT_MAGIC) {
++		printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n",
++		       (unsigned)eh->eh_magic);
++		return -EIO;
++	}
++	if (eh->eh_max == 0) {
++		printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n",
++		       (unsigned)eh->eh_max);
++		return -EIO;
++	}
++	if (eh->eh_entries > eh->eh_max) {
++		printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n",
++		       (unsigned)eh->eh_entries);
++		return -EIO;
++	}
++	return 0;
++}
++
++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
++{
++	int err;
++
++	if (handle->h_buffer_credits > needed)
++		return handle;
++	if (!ext3_journal_extend(handle, needed))
++		return handle;
++	err = ext3_journal_restart(handle, needed);
++
++	return handle;
++}
++
++static int inline
++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree)
++{
++	if (tree->ops->get_write_access)
++		return tree->ops->get_write_access(h, tree->buffer);
++	else
++		return 0;
++}
++
++static int inline
++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree)
++{
++	if (tree->ops->mark_buffer_dirty)
++		return tree->ops->mark_buffer_dirty(h, tree->buffer);
++	else
++		return 0;
++}
++
++/*
++ * could return:
++ *  - EROFS
++ *  - ENOMEM
++ */
++static int ext3_ext_get_access(handle_t *handle,
++			       struct ext3_extents_tree *tree,
++			       struct ext3_ext_path *path)
++{
++	int err;
++
++	if (path->p_bh) {
++		/* path points to block */
++		err = ext3_journal_get_write_access(handle, path->p_bh);
++	} else {
++		/* path points to leaf/index in inode body */
++		err = ext3_ext_get_access_for_root(handle, tree);
++	}
++	return err;
++}
++
++/*
++ * could return:
++ *  - EROFS
++ *  - ENOMEM
++ *  - EIO
++ */
++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree,
++			  struct ext3_ext_path *path)
++{
++	int err;
++	if (path->p_bh) {
++		/* path points to block */
++		err = ext3_journal_dirty_metadata(handle, path->p_bh);
++	} else {
++		/* path points to leaf/index in inode body */
++		err = ext3_ext_mark_root_dirty(handle, tree);
++	}
++	return err;
++}
++
++static int inline
++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree,
++		   struct ext3_ext_path *path, struct ext3_extent *ex,
++		   int *err)
++{
++	int goal, depth, newblock;
++	struct inode *inode;
++
++	EXT_ASSERT(tree);
++	if (tree->ops->new_block)
++		return tree->ops->new_block(handle, tree, path, ex, err);
++
++	inode = tree->inode;
++	depth = EXT_DEPTH(tree);
++	if (path && depth > 0) {
++		goal = path[depth-1].p_block;
++	} else {
++		struct ext3_inode_info *ei = EXT3_I(inode);
++		unsigned long bg_start;
++		unsigned long colour;
++
++		bg_start = (ei->i_block_group *
++			    EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++			le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++		colour = (current->pid % 16) *
++			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++		goal = bg_start + colour;
++	}
++
++	newblock = ext3_new_block(handle, inode, goal, err);
++	return newblock;
++}
++
++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
++{
++	struct ext3_extent_header *neh;
++	neh = EXT_ROOT_HDR(tree);
++	neh->eh_generation++;
++}
++
++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
++{
++	int size;
++
++	size = (tree->inode->i_sb->s_blocksize -
++		sizeof(struct ext3_extent_header)) /
++		sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++	size = 6;
++#endif
++	return size;
++}
++
++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree)
++{
++	int size;
++
++	size = (tree->inode->i_sb->s_blocksize -
++		sizeof(struct ext3_extent_header)) /
++		sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++	size = 5;
++#endif
++	return size;
++}
++
++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree)
++{
++	int size;
++
++	size = (tree->buffer_len - sizeof(struct ext3_extent_header)) /
++		sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++	size = 3;
++#endif
++	return size;
++}
++
++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree)
++{
++	int size;
++
++	size = (tree->buffer_len - sizeof(struct ext3_extent_header)) /
++		sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++	size = 4;
++#endif
++	return size;
++}
++
++static void ext3_ext_show_path(struct ext3_extents_tree *tree,
++			       struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++	int k, l = path->p_depth;
++
++	ext_debug(tree, "path:");
++	for (k = 0; k <= l; k++, path++) {
++		if (path->p_idx) {
++			ext_debug(tree, " %d->%d", path->p_idx->ei_block,
++				  path->p_idx->ei_leaf);
++		} else if (path->p_ext) {
++			ext_debug(tree, " %d:%d:%d",
++				  path->p_ext->ee_block,
++				  path->p_ext->ee_len,
++				  path->p_ext->ee_start);
++		} else
++			ext_debug(tree, " []");
++	}
++	ext_debug(tree, "\n");
++#endif
++}
++
++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree,
++			       struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++	int depth = EXT_DEPTH(tree);
++	struct ext3_extent_header *eh;
++	struct ext3_extent *ex;
++	int i;
++
++	if (!path)
++		return;
++
++	eh = path[depth].p_hdr;
++	ex = EXT_FIRST_EXTENT(eh);
++
++	for (i = 0; i < eh->eh_entries; i++, ex++) {
++		ext_debug(tree, "%d:%d:%d ",
++			  ex->ee_block, ex->ee_len, ex->ee_start);
++	}
++	ext_debug(tree, "\n");
++#endif
++}
++
++static void ext3_ext_drop_refs(struct ext3_ext_path *path)
++{
++	int depth = path->p_depth;
++	int i;
++
++	for (i = 0; i <= depth; i++, path++) {
++		if (path->p_bh) {
++			brelse(path->p_bh);
++			path->p_bh = NULL;
++		}
++	}
++}
++
++/*
++ * binary search for closest index by given block
++ */
++static inline void
++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree,
++		       struct ext3_ext_path *path, int block)
++{
++	struct ext3_extent_header *eh = path->p_hdr;
++	struct ext3_extent_idx *ix;
++	int l = 0, k, r;
++
++	EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++	EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++	EXT_ASSERT(eh->eh_entries > 0);
++
++	ext_debug(tree, "binsearch for %d(idx): ", block);
++
++	path->p_idx = ix = EXT_FIRST_INDEX(eh);
++
++	r = k = eh->eh_entries;
++	while (k > 1) {
++		k = (r - l) / 2;
++		if (block < ix[l + k].ei_block)
++			r -= k;
++		else
++			l += k;
++		ext_debug(tree, "%d:%d:%d ", k, l, r);
++	}
++
++	ix += l;
++	path->p_idx = ix;
++	ext_debug(tree, " -> %d->%d ", path->p_idx->ei_block, path->p_idx->ei_leaf);
++
++	while (l++ < r) {
++		if (block < ix->ei_block)
++			break;
++		path->p_idx = ix++;
++	}
++	ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block,
++		  path->p_idx->ei_leaf);
++
++#ifdef CHECK_BINSEARCH
++	{
++		struct ext3_extent_idx *chix;
++
++		chix = ix = EXT_FIRST_INDEX(eh);
++		for (k = 0; k < eh->eh_entries; k++, ix++) {
++			if (k != 0 && ix->ei_block <= ix[-1].ei_block) {
++				printk("k=%d, ix=0x%p, first=0x%p\n", k,
++				       ix, EXT_FIRST_INDEX(eh));
++				printk("%u <= %u\n",
++				       ix->ei_block, ix[-1].ei_block);
++			}
++			EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block);
++			if (block < ix->ei_block)
++				break;
++			chix = ix;
++		}
++		EXT_ASSERT(chix == path->p_idx);
++	}
++#endif
++}
++
++/*
++ * binary search for closest extent by given block
++ */
++static inline void
++ext3_ext_binsearch(struct ext3_extents_tree *tree,
++		   struct ext3_ext_path *path, int block)
++{
++	struct ext3_extent_header *eh = path->p_hdr;
++	struct ext3_extent *ex;
++	int l = 0, k, r;
++
++	EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++	EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++
++	if (eh->eh_entries == 0) {
++		/*
++		 * this leaf is empty yet:
++		 * we get such a leaf in split/add case
++		 */
++		return;
++	}
++
++	ext_debug(tree, "binsearch for %d: ", block);
++
++	path->p_ext = ex = EXT_FIRST_EXTENT(eh);
++
++	r = k = eh->eh_entries;
++	while (k > 1) {
++		k = (r - l) / 2;
++		if (block < ex[l + k].ee_block)
++			r -= k;
++		else
++			l += k;
++		ext_debug(tree, "%d:%d:%d ", k, l, r);
++	}
++
++	ex += l;
++	path->p_ext = ex;
++	ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block,
++		  path->p_ext->ee_start, path->p_ext->ee_len);
++
++	while (l++ < r) {
++		if (block < ex->ee_block)
++			break;
++		path->p_ext = ex++;
++	}
++	ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block,
++		  path->p_ext->ee_start, path->p_ext->ee_len);
++
++#ifdef CHECK_BINSEARCH
++	{
++		struct ext3_extent *chex;
++
++		chex = ex = EXT_FIRST_EXTENT(eh);
++		for (k = 0; k < eh->eh_entries; k++, ex++) {
++			EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block);
++			if (block < ex->ee_block)
++				break;
++			chex = ex;
++		}
++		EXT_ASSERT(chex == path->p_ext);
++	}
++#endif
++}
++
++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree)
++{
++	struct ext3_extent_header *eh;
++
++	BUG_ON(tree->buffer_len == 0);
++	ext3_ext_get_access_for_root(handle, tree);
++	eh = EXT_ROOT_HDR(tree);
++	eh->eh_depth = 0;
++	eh->eh_entries = 0;
++	eh->eh_magic = EXT3_EXT_MAGIC;
++	eh->eh_max = ext3_ext_space_root(tree);
++	ext3_ext_mark_root_dirty(handle, tree);
++	ext3_ext_invalidate_cache(tree);
++	return 0;
++}
++
++struct ext3_ext_path *
++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block,
++		     struct ext3_ext_path *path)
++{
++	struct ext3_extent_header *eh;
++	struct buffer_head *bh;
++	int depth, i, ppos = 0;
++
++	EXT_ASSERT(tree);
++	EXT_ASSERT(tree->inode);
++	EXT_ASSERT(tree->root);
++
++	eh = EXT_ROOT_HDR(tree);
++	EXT_ASSERT(eh);
++	if (ext3_ext_check_header(eh))
++		goto err;
++
++	i = depth = EXT_DEPTH(tree);
++	EXT_ASSERT(eh->eh_max);
++	EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++
++	/* account possible depth increase */
++	if (!path) {
++		path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2),
++			       GFP_NOFS);
++		if (!path)
++			return ERR_PTR(-ENOMEM);
++	}
++	memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++	path[0].p_hdr = eh;
++
++	/* walk through the tree */
++	while (i) {
++		ext_debug(tree, "depth %d: num %d, max %d\n",
++			  ppos, eh->eh_entries, eh->eh_max);
++		ext3_ext_binsearch_idx(tree, path + ppos, block);
++		path[ppos].p_block = path[ppos].p_idx->ei_leaf;
++		path[ppos].p_depth = i;
++		path[ppos].p_ext = NULL;
++
++		bh = sb_bread(tree->inode->i_sb, path[ppos].p_block);
++		if (!bh)
++			goto err;
++
++		eh = EXT_BLOCK_HDR(bh);
++		ppos++;
++		EXT_ASSERT(ppos <= depth);
++		path[ppos].p_bh = bh;
++		path[ppos].p_hdr = eh;
++		i--;
++
++		if (ext3_ext_check_header(eh))
++			goto err;
++	}
++
++	path[ppos].p_depth = i;
++	path[ppos].p_hdr = eh;
++	path[ppos].p_ext = NULL;
++	path[ppos].p_idx = NULL;
++
++	if (ext3_ext_check_header(eh))
++		goto err;
++
++	/* find extent */
++	ext3_ext_binsearch(tree, path + ppos, block);
++
++	ext3_ext_show_path(tree, path);
++
++	return path;
++
++err:
++	printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
++	ext3_ext_drop_refs(path);
++	kfree(path);
++	return ERR_PTR(-EIO);
++}
++
++/*
++ * insert new index [logical;ptr] into the block at curp;
++ * it checks where to insert: before curp or after curp
++ */
++static int ext3_ext_insert_index(handle_t *handle,
++				 struct ext3_extents_tree *tree,
++				 struct ext3_ext_path *curp,
++				 int logical, int ptr)
++{
++	struct ext3_extent_idx *ix;
++	int len, err;
++
++	if ((err = ext3_ext_get_access(handle, tree, curp)))
++		return err;
++
++	EXT_ASSERT(logical != curp->p_idx->ei_block);
++	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
++	if (logical > curp->p_idx->ei_block) {
++		/* insert after */
++		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
++			len = (len - 1) * sizeof(struct ext3_extent_idx);
++			len = len < 0 ? 0 : len;
++			ext_debug(tree, "insert new index %d after: %d. "
++				  "move %d from 0x%p to 0x%p\n",
++				  logical, ptr, len,
++				  (curp->p_idx + 1), (curp->p_idx + 2));
++			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
++		}
++		ix = curp->p_idx + 1;
++	} else {
++		/* insert before */
++		len = len * sizeof(struct ext3_extent_idx);
++		len = len < 0 ? 0 : len;
++		ext_debug(tree, "insert new index %d before: %d. "
" ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->ei_block = logical; ++ ix->ei_leaf = ptr; ++ curp->p_hdr->eh_entries++; ++ ++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); ++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); ++ ++ err = ext3_ext_dirty(handle, tree, curp); ++ ext3_std_error(tree->inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at: ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extens and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ int i = at, k, m, a; ++ unsigned long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? */ ++ /* FIXME: now desicion is simplest: at current extent */ ++ ++ /* if current leaf will be splitted, then we should use ++ * border from split point */ ++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].ee_block; ++ ext_debug(tree, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->ee_block; ++ ext_debug(tree, "leaf will be added." ++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. so, index won't ++ * be inserted and tree will be in consistent ++ * state. 
++
++	/*
++	 * get array to track all allocated blocks
++	 * we need this to handle errors and free blocks
++	 * upon them
++	 */
++	ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS);
++	if (!ablocks)
++		return -ENOMEM;
++	memset(ablocks, 0, sizeof(unsigned long) * depth);
++
++	/* allocate all needed blocks */
++	ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at);
++	for (a = 0; a < depth - at; a++) {
++		newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++		if (newblock == 0)
++			goto cleanup;
++		ablocks[a] = newblock;
++	}
++
++	/* initialize new leaf */
++	newblock = ablocks[--a];
++	EXT_ASSERT(newblock);
++	bh = sb_getblk(tree->inode->i_sb, newblock);
++	if (!bh) {
++		err = -EIO;
++		goto cleanup;
++	}
++	lock_buffer(bh);
++
++	if ((err = ext3_journal_get_create_access(handle, bh)))
++		goto cleanup;
++
++	neh = EXT_BLOCK_HDR(bh);
++	neh->eh_entries = 0;
++	neh->eh_max = ext3_ext_space_block(tree);
++	neh->eh_magic = EXT3_EXT_MAGIC;
++	neh->eh_depth = 0;
++	ex = EXT_FIRST_EXTENT(neh);
++
++	/* move remainder of path[depth] to the new leaf */
++	EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max);
++	/* start copy from next extent */
++	/* TODO: we could do it by single memmove */
++	m = 0;
++	path[depth].p_ext++;
++	while (path[depth].p_ext <=
++	       EXT_MAX_EXTENT(path[depth].p_hdr)) {
++		ext_debug(tree, "move %d:%d:%d in new leaf %lu\n",
++			  path[depth].p_ext->ee_block,
++			  path[depth].p_ext->ee_start,
++			  path[depth].p_ext->ee_len,
++			  newblock);
++		memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent));
++		neh->eh_entries++;
++		m++;
++	}
++	set_buffer_uptodate(bh);
++	unlock_buffer(bh);
++
++	if ((err = ext3_journal_dirty_metadata(handle, bh)))
++		goto cleanup;
++	brelse(bh);
++	bh = NULL;
++
++	/* correct old leaf */
++	if (m) {
++		if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++			goto cleanup;
++		path[depth].p_hdr->eh_entries -= m;
++		if ((err = ext3_ext_dirty(handle, tree, path + depth)))
++			goto cleanup;
++
++	}
++
++	/* create intermediate indexes */
++	k = depth - at - 1;
++	EXT_ASSERT(k >= 0);
++	if (k)
++		ext_debug(tree, "create %d intermediate indices\n", k);
++	/* insert new index into current index block */
++	/* current depth stored in i var */
++	i = depth - 1;
++	while (k--) {
++		oldblock = newblock;
++		newblock = ablocks[--a];
++		bh = sb_getblk(tree->inode->i_sb, newblock);
++		if (!bh) {
++			err = -EIO;
++			goto cleanup;
++		}
++		lock_buffer(bh);
++
++		if ((err = ext3_journal_get_create_access(handle, bh)))
++			goto cleanup;
++
++		neh = EXT_BLOCK_HDR(bh);
++		neh->eh_entries = 1;
++		neh->eh_magic = EXT3_EXT_MAGIC;
++		neh->eh_max = ext3_ext_space_block_idx(tree);
++		neh->eh_depth = depth - i;
++		fidx = EXT_FIRST_INDEX(neh);
++		fidx->ei_block = border;
++		fidx->ei_leaf = oldblock;
++
++		ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n",
++			  i, newblock, border, oldblock);
++		/* copy indexes */
++		m = 0;
++		path[i].p_idx++;
++
++		ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx,
++			  EXT_MAX_INDEX(path[i].p_hdr));
++		EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) ==
++			   EXT_LAST_INDEX(path[i].p_hdr));
++		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
++			ext_debug(tree, "%d: move %d:%d in new index %lu\n",
++				  i, path[i].p_idx->ei_block,
++				  path[i].p_idx->ei_leaf, newblock);
++			memmove(++fidx, path[i].p_idx++,
++				sizeof(struct ext3_extent_idx));
++			neh->eh_entries++;
++			EXT_ASSERT(neh->eh_entries <= neh->eh_max);
++			m++;
++		}
++		set_buffer_uptodate(bh);
++		unlock_buffer(bh);
++
++		if ((err = ext3_journal_dirty_metadata(handle, bh)))
++			goto cleanup;
++		brelse(bh);
++		bh = NULL;
++
++		/* correct old index */
++		if (m) {
++			err = ext3_ext_get_access(handle, tree, path + i);
++			if (err)
++				goto cleanup;
++			path[i].p_hdr->eh_entries -= m;
++			err = ext3_ext_dirty(handle, tree, path + i);
++			if (err)
++				goto cleanup;
++		}
++
++		i--;
++	}
++
++	/* insert new index */
++	if (!err)
++		err = ext3_ext_insert_index(handle, tree, path + at,
++					    border, newblock);
++
++cleanup:
++	if (bh) {
++		if (buffer_locked(bh))
++			unlock_buffer(bh);
++		brelse(bh);
++	}
++
++	if (err) {
++		/* free all allocated blocks in error case */
++		for (i = 0; i < depth; i++) {
++			if (!ablocks[i])
++				continue;
++			ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
++		}
++	}
++	kfree(ablocks);
++
++	return err;
++}
++
++/*
++ * routine implements tree growing procedure:
++ * - allocates new block
++ * - moves top-level data (index block or leaf) into the new block
++ * - initializes new top-level, creating index that points to the
++ *   just created block
++ */
++static int ext3_ext_grow_indepth(handle_t *handle,
++				 struct ext3_extents_tree *tree,
++				 struct ext3_ext_path *path,
++				 struct ext3_extent *newext)
++{
++	struct ext3_ext_path *curp = path;
++	struct ext3_extent_header *neh;
++	struct ext3_extent_idx *fidx;
++	struct buffer_head *bh;
++	unsigned long newblock;
++	int err = 0;
++
++	newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++	if (newblock == 0)
++		return err;
++
++	bh = sb_getblk(tree->inode->i_sb, newblock);
++	if (!bh) {
++		err = -EIO;
++		ext3_std_error(tree->inode->i_sb, err);
++		return err;
++	}
++	lock_buffer(bh);
++
++	if ((err = ext3_journal_get_create_access(handle, bh))) {
++		unlock_buffer(bh);
++		goto out;
++	}
++
++	/* move top-level index/leaf into new block */
++	memmove(bh->b_data, curp->p_hdr, tree->buffer_len);
++
++	/* set size of new block */
++	neh = EXT_BLOCK_HDR(bh);
++	/* old root could have indexes or leaves
++	 * so calculate eh_max the right way */
++	if (EXT_DEPTH(tree))
++		neh->eh_max = ext3_ext_space_block_idx(tree);
++	else
++		neh->eh_max = ext3_ext_space_block(tree);
++	neh->eh_magic = EXT3_EXT_MAGIC;
++	set_buffer_uptodate(bh);
++	unlock_buffer(bh);
++
++	if ((err = ext3_journal_dirty_metadata(handle, bh)))
++		goto out;
++
++	/* create index in new top-level index: num,max,pointer */
++	if ((err = ext3_ext_get_access(handle, tree, curp)))
++		goto out;
++
++	curp->p_hdr->eh_magic = EXT3_EXT_MAGIC;
++	curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree);
++	curp->p_hdr->eh_entries = 1;
++	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
++	/* FIXME: it works, but actually path[0] can be index */
++	curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
++	curp->p_idx->ei_leaf = newblock;
++
++	neh = EXT_ROOT_HDR(tree);
++	fidx = EXT_FIRST_INDEX(neh);
++	ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n",
++		  neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf);
++
++	neh->eh_depth = path->p_depth + 1;
++	err = ext3_ext_dirty(handle, tree, curp);
++out:
++	brelse(bh);
++
++	return err;
++}
++
++/*
++ * routine finds empty index and adds new leaf. if no free index is found
++ * then it requests in-depth growing
++ */
++static int ext3_ext_create_new_leaf(handle_t *handle,
++				    struct ext3_extents_tree *tree,
++				    struct ext3_ext_path *path,
++				    struct ext3_extent *newext)
++{
++	struct ext3_ext_path *curp;
++	int depth, i, err = 0;
++
++repeat:
++	i = depth = EXT_DEPTH(tree);
++
++	/* walk up to the tree and look for free index entry */
++	curp = path + depth;
++	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
++		i--;
++		curp--;
++	}
++
++	/* we use already allocated block for index block
++	 * so, subsequent data blocks should be contiguous */
++	if (EXT_HAS_FREE_INDEX(curp)) {
++		/* if we found index with free entry, then use that
++		 * entry: create all needed subtree and add new leaf */
++		err = ext3_ext_split(handle, tree, path, newext, i);
++
++		/* refill path */
++		ext3_ext_drop_refs(path);
++		path = ext3_ext_find_extent(tree, newext->ee_block, path);
++		if (IS_ERR(path))
++			err = PTR_ERR(path);
++	} else {
++		/* tree is full, time to grow in depth */
++		err = ext3_ext_grow_indepth(handle, tree, path, newext);
++
++		/* refill path */
++		ext3_ext_drop_refs(path);
++		path = ext3_ext_find_extent(tree, newext->ee_block, path);
++		if (IS_ERR(path))
++			err = PTR_ERR(path);
++
++		/*
++		 * only first (depth 0 -> 1) produces free space
++		 * in all other cases we have to split the grown tree
++		 */
++		depth = EXT_DEPTH(tree);
++		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
++			/* now we need split */
++			goto repeat;
++		}
++	}
++
++	if (err)
++		return err;
++
++	return 0;
++}
++
++/*
++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK
++ * NOTE: it considers block number from index entry as
++ * allocated block. thus, index entries have to be consistent
++ * with leaves
++ */
++static unsigned long
++ext3_ext_next_allocated_block(struct ext3_ext_path *path)
++{
++	int depth;
++
++	EXT_ASSERT(path != NULL);
++	depth = path->p_depth;
++
++	if (depth == 0 && path->p_ext == NULL)
++		return EXT_MAX_BLOCK;
++
++	/* FIXME: what if index isn't full ?! */
++	while (depth >= 0) {
++		if (depth == path->p_depth) {
++			/* leaf */
++			if (path[depth].p_ext !=
++			    EXT_LAST_EXTENT(path[depth].p_hdr))
++				return path[depth].p_ext[1].ee_block;
++		} else {
++			/* index */
++			if (path[depth].p_idx !=
++			    EXT_LAST_INDEX(path[depth].p_hdr))
++				return path[depth].p_idx[1].ei_block;
++		}
++		depth--;
++	}
++
++	return EXT_MAX_BLOCK;
++}
++
++/*
++ * returns first allocated block from next leaf or EXT_MAX_BLOCK
++ */
++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree,
++					 struct ext3_ext_path *path)
++{
++	int depth;
++
++	EXT_ASSERT(path != NULL);
++	depth = path->p_depth;
++
++	/* zero-tree has no leaf blocks at all */
++	if (depth == 0)
++		return EXT_MAX_BLOCK;
++
++	/* go to index block */
++	depth--;
++
++	while (depth >= 0) {
++		if (path[depth].p_idx !=
++		    EXT_LAST_INDEX(path[depth].p_hdr))
++			return path[depth].p_idx[1].ei_block;
++		depth--;
++	}
++
++	return EXT_MAX_BLOCK;
++}
++
++/*
++ * if leaf gets modified and modified extent is first in the leaf
++ * then we have to correct all indexes above
++ * TODO: do we need to correct tree in all cases?
++ */
++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree,
++			     struct ext3_ext_path *path)
++{
++	struct ext3_extent_header *eh;
++	int depth = EXT_DEPTH(tree);
++	struct ext3_extent *ex;
++	unsigned long border;
++	int k, err = 0;
++
++	eh = path[depth].p_hdr;
++	ex = path[depth].p_ext;
++	EXT_ASSERT(ex);
++	EXT_ASSERT(eh);
++
++	if (depth == 0) {
++		/* there is no tree at all */
++		return 0;
++	}
++
++	if (ex != EXT_FIRST_EXTENT(eh)) {
++		/* we correct tree if first leaf got modified only */
++		return 0;
++	}
++
++	/*
++	 * TODO: we need correction if border is smaller than current one
++	 */
++	k = depth - 1;
++	border = path[depth].p_ext->ee_block;
++	if ((err = ext3_ext_get_access(handle, tree, path + k)))
++		return err;
++	path[k].p_idx->ei_block = border;
++	if ((err = ext3_ext_dirty(handle, tree, path + k)))
++		return err;
++
++	while (k--) {
++		/* change all left-side indexes */
++		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
++			break;
++		if ((err = ext3_ext_get_access(handle, tree, path + k)))
++			break;
++		path[k].p_idx->ei_block = border;
++		if ((err = ext3_ext_dirty(handle, tree, path + k)))
++			break;
++	}
++
++	return err;
++}
++
++static int inline
++ext3_can_extents_be_merged(struct ext3_extents_tree *tree,
++			   struct ext3_extent *ex1,
++			   struct ext3_extent *ex2)
++{
++	if (ex1->ee_block + ex1->ee_len != ex2->ee_block)
++		return 0;
++
++#ifdef AGRESSIVE_TEST
++	if (ex1->ee_len >= 4)
++		return 0;
++#endif
++
++	if (!tree->ops->mergable)
++		return 1;
++
++	return tree->ops->mergable(ex1, ex2);
++}
++
++/*
++ * this routine tries to merge requested extent into the existing
++ * extent or inserts requested extent as new one into the tree,
++ * creating new leaf in no-space case
++ */
++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree,
++			   struct ext3_ext_path *path,
++			   struct ext3_extent *newext)
++{
++	struct ext3_extent_header *eh;
++	struct ext3_extent *ex, *fex;
++	struct ext3_extent *nearex; /* nearest extent */
++	struct ext3_ext_path *npath = NULL;
++	int depth, len, err, next;
++
++	EXT_ASSERT(newext->ee_len > 0);
++	depth = EXT_DEPTH(tree);
++	ex = path[depth].p_ext;
++	EXT_ASSERT(path[depth].p_hdr);
++
++	/* try to insert block into found extent and return */
++	if (ex && ext3_can_extents_be_merged(tree, ex, newext)) {
++		ext_debug(tree, "append %d block to %d:%d (from %d)\n",
++			  newext->ee_len, ex->ee_block, ex->ee_len,
++			  ex->ee_start);
++		if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++			return err;
++		ex->ee_len += newext->ee_len;
++		eh = path[depth].p_hdr;
++		nearex = ex;
++		goto merge;
++	}
++
++repeat:
++	depth = EXT_DEPTH(tree);
++	eh = path[depth].p_hdr;
++	if (eh->eh_entries < eh->eh_max)
++		goto has_space;
++
++	/* probably next leaf has space for us? */
++	fex = EXT_LAST_EXTENT(eh);
++	next = ext3_ext_next_leaf_block(tree, path);
++	if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) {
++		ext_debug(tree, "next leaf block - %d\n", next);
++		EXT_ASSERT(!npath);
++		npath = ext3_ext_find_extent(tree, next, NULL);
++		if (IS_ERR(npath))
++			return PTR_ERR(npath);
++		EXT_ASSERT(npath->p_depth == path->p_depth);
++		eh = npath[depth].p_hdr;
++		if (eh->eh_entries < eh->eh_max) {
++			ext_debug(tree, "next leaf isn't full(%d)\n",
++				  eh->eh_entries);
++			path = npath;
++			goto repeat;
++		}
++		ext_debug(tree, "next leaf has no free space(%d,%d)\n",
++			  eh->eh_entries, eh->eh_max);
++	}
++
++	/*
++	 * there is no free space in found leaf
++	 * we're gonna add new leaf in the tree
++	 */
++	err = ext3_ext_create_new_leaf(handle, tree, path, newext);
++	if (err)
++		goto cleanup;
++	depth = EXT_DEPTH(tree);
++	eh = path[depth].p_hdr;
++
++has_space:
++	nearex = path[depth].p_ext;
++
++	if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++		goto cleanup;
++
++	if (!nearex) {
++		/* there is no extent in this leaf, create first one */
++		ext_debug(tree, "first extent in the leaf: %d:%d:%d\n",
++			  newext->ee_block, newext->ee_start,
++			  newext->ee_len);
++		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
++	} else if (newext->ee_block > nearex->ee_block) {
++		EXT_ASSERT(newext->ee_block != nearex->ee_block);
++		if (nearex != EXT_LAST_EXTENT(eh)) {
++			len = EXT_MAX_EXTENT(eh) - nearex;
++			len = (len - 1) * sizeof(struct ext3_extent);
++			len = len < 0 ? 0 : len;
++			ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, "
++				  "move %d from 0x%p to 0x%p\n",
++				  newext->ee_block, newext->ee_start,
++				  newext->ee_len,
++				  nearex, len, nearex + 1, nearex + 2);
++			memmove(nearex + 2, nearex + 1, len);
++		}
++		path[depth].p_ext = nearex + 1;
++	} else {
++		EXT_ASSERT(newext->ee_block != nearex->ee_block);
++		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
++		len = len < 0 ? 0 : len;
++		ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, "
++			  "move %d from 0x%p to 0x%p\n",
++			  newext->ee_block, newext->ee_start, newext->ee_len,
++			  nearex, len, nearex + 1, nearex + 2);
++		memmove(nearex + 1, nearex, len);
++		path[depth].p_ext = nearex;
++	}
++
++	eh->eh_entries++;
++	nearex = path[depth].p_ext;
++	nearex->ee_block = newext->ee_block;
++	nearex->ee_start = newext->ee_start;
++	nearex->ee_len = newext->ee_len;
++	/* FIXME: support for large fs */
++	nearex->ee_start_hi = 0;
++
++merge:
++	/* try to merge extents to the right */
++	while (nearex < EXT_LAST_EXTENT(eh)) {
++		if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1))
++			break;
++		/* merge with next extent! */
++		nearex->ee_len += nearex[1].ee_len;
++		if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
++			len = (EXT_LAST_EXTENT(eh) - nearex - 1) *
++				sizeof(struct ext3_extent);
++			memmove(nearex + 1, nearex + 2, len);
++		}
++		eh->eh_entries--;
++		EXT_ASSERT(eh->eh_entries > 0);
++	}
++
++	/* try to merge extents to the left */
++
++	/* time to correct all indexes above */
++	err = ext3_ext_correct_indexes(handle, tree, path);
++	if (err)
++		goto cleanup;
++
++	err = ext3_ext_dirty(handle, tree, path + depth);
++
++cleanup:
++	if (npath) {
++		ext3_ext_drop_refs(npath);
++		kfree(npath);
++	}
++	ext3_ext_tree_changed(tree);
++	ext3_ext_invalidate_cache(tree);
++	return err;
++}
++
++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block,
++			unsigned long num, ext_prepare_callback func)
++{
++	struct ext3_ext_path *path = NULL;
++	struct ext3_ext_cache cbex;
++	struct ext3_extent *ex;
++	unsigned long next, start = 0, end = 0;
++	unsigned long last = block + num;
++	int depth, exists, err = 0;
++
++	EXT_ASSERT(tree);
++	EXT_ASSERT(func);
++	EXT_ASSERT(tree->inode);
++	EXT_ASSERT(tree->root);
++
++	while (block < last && block != EXT_MAX_BLOCK) {
++		num = last - block;
++		/* find extent for this block */
++		path = ext3_ext_find_extent(tree, block, path);
++		if (IS_ERR(path)) {
++			err = PTR_ERR(path);
++			path = NULL;
++			break;
++		}
++
++		depth = EXT_DEPTH(tree);
++		EXT_ASSERT(path[depth].p_hdr);
++		ex = path[depth].p_ext;
++		next = ext3_ext_next_allocated_block(path);
++
++		exists = 0;
++		if (!ex) {
++			/* there is no extent yet, so try to allocate
++			 * all requested space */
++			start = block;
++			end = block + num;
++		} else if (ex->ee_block > block) {
++			/* need to allocate space before found extent */
++			start = block;
++			end = ex->ee_block;
++			if (block + num < end)
++				end = block + num;
++		} else if (block >= ex->ee_block + ex->ee_len) {
++			/* need to allocate space after found extent */
++			start = block;
++			end = block + num;
++			if (end >= next)
++				end = next;
++		} else if (block >= ex->ee_block) {
++			/*
++			 * some part of requested space is covered
++			 * by found extent
++			 */
++			start = block;
++			end = ex->ee_block + ex->ee_len;
++			if (block + num < end)
++				end = block + num;
++			exists = 1;
++		} else {
++			BUG();
++		}
++		EXT_ASSERT(end > start);
++
++		if (!exists) {
++			cbex.ec_block = start;
++			cbex.ec_len = end - start;
++			cbex.ec_start = 0;
++			cbex.ec_type = EXT3_EXT_CACHE_GAP;
++		} else {
++			cbex.ec_block = ex->ee_block;
++			cbex.ec_len = ex->ee_len;
++			cbex.ec_start = ex->ee_start;
++			cbex.ec_type = EXT3_EXT_CACHE_EXTENT;
++		}
++
++		EXT_ASSERT(cbex.ec_len > 0);
++		EXT_ASSERT(path[depth].p_hdr);
++		err = func(tree, path, &cbex);
++		ext3_ext_drop_refs(path);
++
++		if (err < 0)
++			break;
++		if (err == EXT_REPEAT)
++			continue;
++		else if (err == EXT_BREAK) {
++			err = 0;
++			break;
++		}
++
++		if (EXT_DEPTH(tree) != depth) {
++			/* depth was changed. we have to realloc path */
++			kfree(path);
++			path = NULL;
++		}
++
++		block = cbex.ec_block + cbex.ec_len;
++	}
++
++	if (path) {
++		ext3_ext_drop_refs(path);
++		kfree(path);
++	}
++
++	return err;
++}
++
++static inline void
++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block,
++		      __u32 len, __u32 start, int type)
++{
++	EXT_ASSERT(len > 0);
++	if (tree->cex) {
++		tree->cex->ec_type = type;
++		tree->cex->ec_block = block;
++		tree->cex->ec_len = len;
++		tree->cex->ec_start = start;
++	}
++}
++
++/*
++ * this routine calculates the boundaries of the gap the requested block
++ * fits into and caches this gap
++ */
++static inline void
++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree,
++			  struct ext3_ext_path *path,
++			  unsigned long block)
++{
++	int depth = EXT_DEPTH(tree);
++	unsigned long lblock, len;
++	struct ext3_extent *ex;
++
++	if (!tree->cex)
++		return;
++
++	ex = path[depth].p_ext;
++	if (ex == NULL) {
++		/* there is no extent yet, so gap is [0;-] */
++		lblock = 0;
++		len = EXT_MAX_BLOCK;
++		ext_debug(tree, "cache gap(whole file):");
++	} else if (block < ex->ee_block) {
++		lblock = block;
++		len = ex->ee_block - block;
++		ext_debug(tree, "cache gap(before): %lu [%lu:%lu]",
++			  (unsigned long) block,
++			  (unsigned long) ex->ee_block,
++			  (unsigned long) ex->ee_len);
++	} else if (block >= ex->ee_block + ex->ee_len) {
++		lblock = ex->ee_block + ex->ee_len;
++		len = ext3_ext_next_allocated_block(path);
++		ext_debug(tree, "cache gap(after): [%lu:%lu] %lu",
++			  (unsigned long) ex->ee_block,
++			  (unsigned long) ex->ee_len,
++			  (unsigned long) block);
++		EXT_ASSERT(len > lblock);
++		len = len - lblock;
++	} else {
++		lblock = len = 0;
++		BUG();
++	}
++
++	ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len);
++	ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP);
++}
++
++static inline int
++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block,
++		  struct ext3_extent *ex)
++{
++	struct ext3_ext_cache *cex = tree->cex;
++
++	/* is there cache storage at all? */
++	if (!cex)
++		return EXT3_EXT_CACHE_NO;
++
++	/* has cache valid data? */
++	if (cex->ec_type == EXT3_EXT_CACHE_NO)
++		return EXT3_EXT_CACHE_NO;
++
++	EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP ||
++		   cex->ec_type == EXT3_EXT_CACHE_EXTENT);
++	if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
++		ex->ee_block = cex->ec_block;
++		ex->ee_start = cex->ec_start;
++		ex->ee_len = cex->ec_len;
++		ext_debug(tree, "%lu cached by %lu:%lu:%lu\n",
++			  (unsigned long) block,
++			  (unsigned long) ex->ee_block,
++			  (unsigned long) ex->ee_len,
++			  (unsigned long) ex->ee_start);
++		return cex->ec_type;
++	}
++
++	/* not in cache */
++	return EXT3_EXT_CACHE_NO;
++}
++
++/*
++ * routine removes index from the index block
++ * it's used in truncate case only. thus all requests are for
++ * last index in the block only
++ */
++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree,
++		    struct ext3_ext_path *path)
++{
++	struct buffer_head *bh;
++	int err;
++
++	/* free index block */
++	path--;
++	EXT_ASSERT(path->p_hdr->eh_entries);
++	if ((err = ext3_ext_get_access(handle, tree, path)))
++		return err;
++	path->p_hdr->eh_entries--;
++	if ((err = ext3_ext_dirty(handle, tree, path)))
++		return err;
++	ext_debug(tree, "index is empty, remove it, free block %d\n",
++		  path->p_idx->ei_leaf);
++	bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
++	ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
++	ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
++	return err;
++}
++
++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree,
++				     struct ext3_ext_path *path)
++{
++	int depth = EXT_DEPTH(tree);
++	int needed;
++
++	if (path) {
++		/* probably there is space in leaf? */
++		if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max)
++			return 1;
++	}
++
++	/*
++	 * the worst case we're expecting is creation of the
++	 * new root (growing in depth) with index splitting
++	 * for splitting we have to consider depth + 1 because
++	 * previous growing could increase it
++	 */
++	depth = depth + 1;
++
++	/*
++	 * growing in depth:
++	 * block allocation + new root + old root
++	 */
++	needed = EXT3_ALLOC_NEEDED + 2;
++
++	/* index split. we may need:
++	 *   allocate intermediate indexes and new leaf
++	 *   change two blocks at each level, but root
++	 *   modify root block (inode)
++	 */
++	needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1;
++
++	return needed;
++}
++
++static int
++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree,
++		      struct ext3_ext_path *path, unsigned long start,
++		      unsigned long end)
++{
++	struct ext3_extent *ex, tex;
++	struct ext3_ext_path *npath;
++	int depth, creds, err;
++
++	depth = EXT_DEPTH(tree);
++	ex = path[depth].p_ext;
++	EXT_ASSERT(ex);
++	EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1);
++	EXT_ASSERT(ex->ee_block < start);
++
++	/* calculate tail extent */
++	tex.ee_block = end + 1;
++	EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len);
++	tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block;
++
++	creds = ext3_ext_calc_credits_for_insert(tree, path);
++	handle = ext3_ext_journal_restart(handle, creds);
++	if (IS_ERR(handle))
++		return PTR_ERR(handle);
++
++	/* calculate head extent. use primary extent */
++	err = ext3_ext_get_access(handle, tree, path + depth);
++	if (err)
++		return err;
++	ex->ee_len = start - ex->ee_block;
++	err = ext3_ext_dirty(handle, tree, path + depth);
++	if (err)
++		return err;
++
++	/* FIXME: some callback to free underlying resource
++	 * and correct ee_start? */
++	ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n",
++		  ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len);
++
++	npath = ext3_ext_find_extent(tree, ex->ee_block, NULL);
++	if (IS_ERR(npath))
++		return PTR_ERR(npath);
++	depth = EXT_DEPTH(tree);
++	EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block);
++	EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len);
++
++	err = ext3_ext_insert_extent(handle, tree, npath, &tex);
++	ext3_ext_drop_refs(npath);
++	kfree(npath);
++
++	return err;
++}
++
++static int
++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree,
++		 struct ext3_ext_path *path, unsigned long start,
++		 unsigned long end)
++{
++	struct ext3_extent *ex, *fu = NULL, *lu, *le;
++	int err = 0, correct_index = 0;
++	int depth = EXT_DEPTH(tree), credits;
++	struct ext3_extent_header *eh;
++	unsigned a, b, block, num;
++
++	ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end);
++	if (!path[depth].p_hdr)
++		path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh);
++	eh = path[depth].p_hdr;
++	EXT_ASSERT(eh);
++	EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++	EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++
++	/* find where to start removing */
++	le = ex = EXT_LAST_EXTENT(eh);
++	while (ex != EXT_FIRST_EXTENT(eh)) {
++		if (ex->ee_block <= end)
++			break;
++		ex--;
++	}
++
++	if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) {
++		/* removal of internal part of the extent requested;
++		 * tail and head must be placed in different extents,
++		 * so we have to insert one more extent */
++		path[depth].p_ext = ex;
++		return ext3_ext_split_for_rm(handle, tree, path, start, end);
++	}
++
++	lu = ex;
++	while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) {
++		ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len);
++		path[depth].p_ext = ex;
++
++		a = ex->ee_block > start ? ex->ee_block : start;
++		b = ex->ee_block + ex->ee_len - 1 < end ?
++			ex->ee_block + ex->ee_len - 1 : end;
++
++		ext_debug(tree, "  border %u:%u\n", a, b);
++
++		if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) {
++			block = 0;
++			num = 0;
++			BUG();
++		} else if (a != ex->ee_block) {
++			/* remove tail of the extent */
++			block = ex->ee_block;
++			num = a - block;
++		} else if (b != ex->ee_block + ex->ee_len - 1) {
++			/* remove head of the extent */
++			block = a;
++			num = b - a;
++		} else {
++			/* remove whole extent: excellent! */
++			block = ex->ee_block;
++			num = 0;
++			EXT_ASSERT(a == ex->ee_block &&
++				   b == ex->ee_block + ex->ee_len - 1);
++		}
++
++		if (ex == EXT_FIRST_EXTENT(eh))
++			correct_index = 1;
++
++		credits = 1;
++		if (correct_index)
++			credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1;
++		if (tree->ops->remove_extent_credits)
++			credits += tree->ops->remove_extent_credits(tree, ex, a, b);
++
++		handle = ext3_ext_journal_restart(handle, credits);
++		if (IS_ERR(handle)) {
++			err = PTR_ERR(handle);
++			goto out;
++		}
++
++		err = ext3_ext_get_access(handle, tree, path + depth);
++		if (err)
++			goto out;
++
++		if (tree->ops->remove_extent)
++			err = tree->ops->remove_extent(tree, ex, a, b);
++		if (err)
++			goto out;
++
++		if (num == 0) {
++			/* this extent is removed entirely; mark slot unused */
++			ex->ee_start = 0;
++			eh->eh_entries--;
++			fu = ex;
++		}
++
++		ex->ee_block = block;
++		ex->ee_len = num;
++
++		err = ext3_ext_dirty(handle, tree, path + depth);
++		if (err)
++			goto out;
++
++		ext_debug(tree, "new extent: %u:%u:%u\n",
++			  ex->ee_block, ex->ee_len, ex->ee_start);
++		ex--;
++	}
++
++	if (fu) {
++		/* reuse unused slots */
++		while (lu < le) {
++			if (lu->ee_start) {
++				*fu = *lu;
++				lu->ee_start = 0;
++				fu++;
++			}
++			lu++;
++		}
++	}
++
++	if (correct_index && eh->eh_entries)
++		err = ext3_ext_correct_indexes(handle, tree, path);
++
++	/* if this leaf is free, then we should
++	 * remove it from index block above */
++	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
++		err = ext3_ext_rm_idx(handle, tree, path + depth);
++
++out:
++	return err;
++}
++
++
++static struct ext3_extent_idx *
++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block)
++{
++	struct ext3_extent_idx *ix;
++
++	ix = EXT_LAST_INDEX(hdr);
++	while (ix != EXT_FIRST_INDEX(hdr)) {
++		if (ix->ei_block <= block)
++			break;
++		ix--;
++	}
++	return ix;
++}
++
++/*
++ * returns 1 if current index has to be freed (even partial)
++ */
++static int inline
++ext3_ext_more_to_rm(struct ext3_ext_path *path)
++{
++	EXT_ASSERT(path->p_idx);
++
++	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
++		return 0;
++
++	/*
++	 * if truncate on deeper level happened and it wasn't partial,
++	 * we have to consider current index for truncation
++	 */
++	if (path->p_hdr->eh_entries == path->p_block)
++		return 0;
++	return 1;
++}
++
++int ext3_ext_remove_space(struct ext3_extents_tree *tree,
++			  unsigned long start, unsigned long end)
++{
++	struct inode *inode = tree->inode;
++	struct super_block *sb = inode->i_sb;
++	int depth = EXT_DEPTH(tree);
++	struct ext3_ext_path *path;
++	handle_t *handle;
++	int i = 0, err = 0;
++
++	ext_debug(tree, "space to be removed: %lu:%lu\n", start, end);
++
++	/* probably first extent we're gonna free will be last in block */
++	handle = ext3_journal_start(inode, depth + 1);
++	if (IS_ERR(handle))
++		return PTR_ERR(handle);
++
++	ext3_ext_invalidate_cache(tree);
++
++	/*
++	 * we start scanning from right side, freeing all the blocks
++	 * after i_size and walking into the deep
++	 */
++	path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
++	if (IS_ERR(path)) {
++		ext3_error(sb, __FUNCTION__, "Can't allocate path array");
++		ext3_journal_stop(handle);
++		return -ENOMEM;
++	}
++	memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++	path[i].p_hdr = EXT_ROOT_HDR(tree);
++
++	while (i >= 0 && err == 0) {
++		if (i == depth) {
++			/* this is leaf block */
++			err = ext3_ext_rm_leaf(handle, tree, path, start, end);
++			/* root level has p_bh == NULL, brelse() eats this */
++			brelse(path[i].p_bh);
++			i--;
++			continue;
++		}
++
++		/* this is index block */
++		if (!path[i].p_hdr) {
++			ext_debug(tree, "initialize header\n");
++			path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh);
++		}
++
++		EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max);
++		EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC);
++
++		if (!path[i].p_idx) {
++			/* this level hasn't been touched yet */
++			path[i].p_idx =
++				ext3_ext_last_covered(path[i].p_hdr, end);
++			path[i].p_block = path[i].p_hdr->eh_entries + 1;
++			ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n",
++				  path[i].p_hdr, path[i].p_hdr->eh_entries);
++		} else {
++			/* we've already been here; look at the next index */
++			path[i].p_idx--;
++		}
++
++		ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n",
++			  i, EXT_FIRST_INDEX(path[i].p_hdr),
++			  path[i].p_idx);
++		if (ext3_ext_more_to_rm(path + i)) {
++			/* go to the next level */
++			ext_debug(tree, "move to level %d (block %d)\n",
++				  i + 1, path[i].p_idx->ei_leaf);
++			memset(path + i + 1, 0, sizeof(*path));
++			path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf);
++			if (!path[i+1].p_bh) {
++				/* should we reset i_size? */
++				err = -EIO;
++				break;
++			}
++			/* put actual number of indexes to know whether this
++			 * number changed at the next iteration */
++			path[i].p_block = path[i].p_hdr->eh_entries;
++			i++;
++		} else {
++			/* we finished processing this index, go up */
++			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
++				/* index is empty, remove it;
++				 * handle must be already prepared by the
++				 * truncatei_leaf() */
++				err = ext3_ext_rm_idx(handle, tree, path + i);
++			}
++			/* root level has p_bh == NULL, brelse() eats this */
++			brelse(path[i].p_bh);
++			i--;
++			ext_debug(tree, "return to level %d\n", i);
++		}
++	}
++
++	/* TODO: flexible tree reduction should be here */
++	if (path->p_hdr->eh_entries == 0) {
++		/*
++		 * truncate to zero freed all the tree,
++		 * so we need to correct eh_depth
++		 */
++		err = ext3_ext_get_access(handle, tree, path);
++		if (err == 0) {
++			EXT_ROOT_HDR(tree)->eh_depth = 0;
++			EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree);
++			err = ext3_ext_dirty(handle, tree, path);
++		}
++	}
++	ext3_ext_tree_changed(tree);
++
++	kfree(path);
++	ext3_journal_stop(handle);
++
++	return err;
++}
++
++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks)
++{
++	int lcap, icap, rcap, leafs, idxs, num;
++
++	rcap = ext3_ext_space_root(tree);
++	if (blocks <= rcap) {
++		/* all extents fit to the root */
++		return 0;
++	}
++
++	rcap = ext3_ext_space_root_idx(tree);
++	lcap = ext3_ext_space_block(tree);
++	icap = ext3_ext_space_block_idx(tree);
++
++	num = leafs = (blocks + lcap - 1) / lcap;
++	if (leafs <= rcap) {
++		/* all pointers to leaf blocks fit to the root */
++		return leafs;
++	}
++
++	/* ok. we need separate index block(s) to link all leaf blocks */
++	idxs = (leafs + icap - 1) / icap;
++	do {
++		num += idxs;
++		idxs = (idxs + icap - 1) / icap;
++	} while (idxs > rcap);
++
++	return num;
++}
++
++/*
++ * called at mount time
++ */
++void ext3_ext_init(struct super_block *sb)
++{
++	/*
++	 * possible initialization would be here
++	 */
++
++	if (test_opt(sb, EXTENTS)) {
++		printk("EXT3-fs: file extents enabled");
++#ifdef AGRESSIVE_TEST
++		printk(", agressive tests");
++#endif
++#ifdef CHECK_BINSEARCH
++		printk(", check binsearch");
++#endif
++		printk("\n");
++	}
++}
++
++/*
++ * called at umount time
++ */
++void ext3_ext_release(struct super_block *sb)
++{
++}
++
++/************************************************************************
++ * VFS related routines
++ ************************************************************************/
++
++static int ext3_get_inode_write_access(handle_t *handle, void *buffer)
++{
++	/* we use in-core data, not bh */
++	return 0;
++}
++
++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer)
++{
++	struct inode *inode = buffer;
++	return ext3_mark_inode_dirty(handle, inode);
++}
++
++static int ext3_ext_mergable(struct ext3_extent *ex1,
++			     struct ext3_extent *ex2)
++{
++	/* FIXME: support for large fs */
++	if (ex1->ee_start + ex1->ee_len == ex2->ee_start)
++		return 1;
++	return 0;
++}
++
++static int
++ext3_remove_blocks_credits(struct ext3_extents_tree *tree,
++			   struct ext3_extent *ex,
++			   unsigned long from, unsigned long to)
++{
++	int needed;
++
++	/* at present, extent can't cross block group */
++	needed = 4; /* bitmap + group desc + sb + inode */
++
++#ifdef CONFIG_QUOTA
++	needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++	return needed;
++}
++
++static int
++ext3_remove_blocks(struct ext3_extents_tree *tree,
++		   struct ext3_extent *ex,
++		   unsigned long from, unsigned long to)
++{
++	int needed = ext3_remove_blocks_credits(tree, ex, from, to);
++	handle_t *handle = ext3_journal_start(tree->inode, needed);
++	struct buffer_head *bh;
++	int i;
++
++	if (IS_ERR(handle))
++		return PTR_ERR(handle);
++	if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
++		/* tail removal */
++		unsigned long num, start;
++		num = ex->ee_block + ex->ee_len - from;
++		start = ex->ee_start + ex->ee_len - num;
++		ext_debug(tree, "free last %lu blocks starting %lu\n",
++			  num, start);
++		for (i = 0; i < num; i++) {
++			bh = sb_find_get_block(tree->inode->i_sb, start + i);
++			ext3_forget(handle, 0, tree->inode, bh, start + i);
++		}
++		ext3_free_blocks(handle, tree->inode, start, num);
++	} else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
++		printk("strange request: removal %lu-%lu from %u:%u\n",
++		       from, to, ex->ee_block, ex->ee_len);
++	} else {
++		printk("strange request: removal(2) %lu-%lu from %u:%u\n",
++		       from, to, ex->ee_block, ex->ee_len);
++	}
++	ext3_journal_stop(handle);
++	return 0;
++}
++
++static int ext3_ext_find_goal(struct inode *inode,
++			      struct ext3_ext_path *path, unsigned long block)
++{
++	struct ext3_inode_info *ei = EXT3_I(inode);
++	unsigned long bg_start;
++	unsigned long colour;
++	int depth;
++
++	if (path) {
++		struct ext3_extent *ex;
++		depth = path->p_depth;
++
++		/* try to predict block placement */
++		if ((ex = path[depth].p_ext))
++			return ex->ee_start + (block - ex->ee_block);
++
++		/* it looks like index is empty;
++		 * try to find starting from index itself */
++		if (path[depth].p_bh)
++			return path[depth].p_bh->b_blocknr;
++	}
++
++	/* OK. use inode's group */
++	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++		le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++	colour = (current->pid % 16) *
++		(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++	return bg_start + colour + block;
++}
++
++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree,
++			     struct ext3_ext_path *path,
++			     struct ext3_extent *ex, int *err)
++{
++	struct inode *inode = tree->inode;
++	int newblock, goal;
++
++	EXT_ASSERT(path);
++	EXT_ASSERT(ex);
++	EXT_ASSERT(ex->ee_start);
++	EXT_ASSERT(ex->ee_len);
++
++	/* reuse block from the extent to order data/metadata */
++	newblock = ex->ee_start++;
++	ex->ee_len--;
++	if (ex->ee_len == 0) {
++		ex->ee_len = 1;
++		/* allocate new block for the extent */
++		goal = ext3_ext_find_goal(inode, path, ex->ee_block);
++		ex->ee_start = ext3_new_block(handle, inode, goal, err);
++		if (ex->ee_start == 0) {
++			/* error occurred: restore old extent */
++			ex->ee_start = newblock;
++			return 0;
++		}
++	}
++	return newblock;
++}
++
++static struct ext3_extents_helpers ext3_blockmap_helpers = {
++	.get_write_access	= ext3_get_inode_write_access,
++	.mark_buffer_dirty	= ext3_mark_buffer_dirty,
++	.mergable		= ext3_ext_mergable,
++	.new_block		= ext3_new_block_cb,
++	.remove_extent		= ext3_remove_blocks,
++	.remove_extent_credits	= ext3_remove_blocks_credits,
++};
++
++void ext3_init_tree_desc(struct ext3_extents_tree *tree,
++			 struct inode *inode)
++{
++	tree->inode = inode;
++	tree->root = (void *) EXT3_I(inode)->i_data;
++	tree->buffer = (void *) inode;
++	tree->buffer_len = sizeof(EXT3_I(inode)->i_data);
++	tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent;
++	tree->ops = &ext3_blockmap_helpers;
++}
++
++int ext3_ext_get_block(handle_t *handle, struct inode *inode,
++		       long iblock, struct buffer_head *bh_result,
++		       int create, int extend_disksize)
++{
++	struct ext3_ext_path *path = NULL;
++	struct ext3_extent newex;
++	struct ext3_extent *ex;
++	int goal, newblock, err = 0, depth;
++	struct ext3_extents_tree tree;
++
++	clear_buffer_new(bh_result);
++	ext3_init_tree_desc(&tree, inode);
++	ext_debug(&tree, "block %d requested for inode %u\n",
++		  (int) iblock, (unsigned) inode->i_ino);
++	mutex_lock(&EXT3_I(inode)->truncate_mutex);
++
++	/* check in cache */
++	if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) {
++		if (goal == EXT3_EXT_CACHE_GAP) {
++			if (!create) {
++				/* block isn't allocated yet and
++				 * user doesn't want to allocate it */
++				goto out2;
++			}
++			/* we should allocate requested block */
++		} else if (goal == EXT3_EXT_CACHE_EXTENT) {
++			/* block is already allocated */
++			newblock = iblock - newex.ee_block + newex.ee_start;
++			goto out;
++		} else {
++			EXT_ASSERT(0);
++		}
++	}
++
++	/* find extent for this block */
++	path = ext3_ext_find_extent(&tree, iblock, NULL);
++	if (IS_ERR(path)) {
++		err = PTR_ERR(path);
++		path = NULL;
++		goto out2;
++	}
++
++	depth = EXT_DEPTH(&tree);
++
++	/*
++	 * consistent leaf must not be empty
++	 * this situation is possible, though, _during_ tree modification
++	 * this is why assert can't be put in ext3_ext_find_extent()
++	 */
++	EXT_ASSERT(path[depth].p_ext != NULL || depth == 0);
++
++	if ((ex = path[depth].p_ext)) {
++		/* if found extent covers block, simply return it */
++		if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) {
++			newblock = iblock - ex->ee_block + ex->ee_start;
++			ext_debug(&tree, "%d fit into %d:%d -> %d\n",
++				  (int) iblock, ex->ee_block, ex->ee_len,
++				  newblock);
ext3_ext_put_in_cache(&tree, ex->ee_block, ++ ex->ee_len, ex->ee_start, ++ EXT3_EXT_CACHE_EXTENT); ++ goto out; ++ } ++ } ++ ++ /* ++ * requested block isn't allocated yet ++ * we couldn't try to create block if create flag is zero ++ */ ++ if (!create) { ++ /* put just found gap into cache to speedup subsequest reqs */ ++ ext3_ext_put_gap_in_cache(&tree, path, iblock); ++ goto out2; ++ } ++ ++ /* allocate new block */ ++ goal = ext3_ext_find_goal(inode, path, iblock); ++ newblock = ext3_new_block(handle, inode, goal, &err); ++ if (!newblock) ++ goto out2; ++ ext_debug(&tree, "allocate new block: goal %d, found %d\n", ++ goal, newblock); ++ ++ /* try to insert new extent into found leaf and return */ ++ newex.ee_block = iblock; ++ newex.ee_start = newblock; ++ newex.ee_len = 1; ++ err = ext3_ext_insert_extent(handle, &tree, path, &newex); ++ if (err) ++ goto out2; ++ ++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ++ /* previous routine could use block we allocated */ ++ newblock = newex.ee_start; ++ set_buffer_new(bh_result); ++ ++ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, ++ newex.ee_start, EXT3_EXT_CACHE_EXTENT); ++out: ++ ext3_ext_show_leaf(&tree, path); ++ map_bh(bh_result, inode->i_sb, newblock); ++out2: ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ mutex_unlock(&EXT3_I(inode)->truncate_mutex); ++ ++ return err; ++} ++ ++void ext3_ext_truncate(struct inode * inode, struct page *page) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct super_block *sb = inode->i_sb; ++ struct ext3_extents_tree tree; ++ unsigned long last_block; ++ handle_t *handle; ++ int err = 0; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ err = ext3_writepage_trans_blocks(inode) + 3; ++ handle = ext3_journal_start(inode, err); ++ if (IS_ERR(handle)) { ++ if (page) { ++ clear_highpage(page); ++ flush_dcache_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ return; ++ } ++ ++ if (page) ++ ext3_block_truncate_page(handle, page, mapping, inode->i_size); ++ ++ mutex_lock(&EXT3_I(inode)->truncate_mutex); ++ ext3_ext_invalidate_cache(&tree); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need not scaning at all, ++ * because page truncation is enough ++ */ ++ if (ext3_orphan_add(handle, inode)) ++ goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ last_block = (inode->i_size + sb->s_blocksize - 1) >> ++ EXT3_BLOCK_SIZE_BITS(sb); ++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. 
++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ mutex_unlock(&EXT3_I(inode)->truncate_mutex); ++ ext3_journal_stop(handle); ++} ++ ++/* ++ * this routine calculate max number of blocks we could modify ++ * in order to allocate new block for an inode ++ */ ++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) ++{ ++ struct ext3_extents_tree tree; ++ int needed; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); ++ ++ /* caller want to allocate num blocks */ ++ needed *= num; ++ ++#ifdef CONFIG_QUOTA ++ /* ++ * FIXME: real calculation should be here ++ * it depends on blockmap format of qouta file ++ */ ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return needed; ++} ++ ++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ext3_extent_tree_init(handle, &tree); ++} ++ ++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ return ext3_ext_calc_metadata_amount(&tree, blocks); ++} ++ ++static int ++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *newex) ++{ ++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; ++ ++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ if (buf->err < 0) ++ return EXT_BREAK; ++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) ++ return EXT_BREAK; ++ ++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { ++ buf->err++; ++ buf->cur += sizeof(*newex); ++ } else { ++ buf->err = -EFAULT; ++ return EXT_BREAK; ++ } ++ return EXT_CONTINUE; ++} ++ ++static int ++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *ex) ++{ ++ struct ext3_extent_tree_stats *buf = ++ (struct ext3_extent_tree_stats *) tree->private; ++ int depth; ++ ++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ depth = EXT_DEPTH(tree); ++ buf->extents_num++; ++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) ++ buf->leaf_num++; ++ return EXT_CONTINUE; ++} ++ ++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err = 0; ++ ++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) ++ return -EINVAL; ++ ++ if (cmd == EXT3_IOC_GET_EXTENTS) { ++ struct ext3_extent_buf buf; ++ struct ext3_extents_tree tree; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ ++ ext3_init_tree_desc(&tree, inode); ++ buf.cur = buf.buffer; ++ buf.err = 0; ++ tree.private = &buf; ++ mutex_lock(&EXT3_I(inode)->truncate_mutex); ++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, ++ ext3_ext_store_extent_cb); ++ mutex_unlock(&EXT3_I(inode)->truncate_mutex); ++ if (err == 0) ++ err = buf.err; ++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { ++ struct ext3_extent_tree_stats buf; ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ mutex_lock(&EXT3_I(inode)->truncate_mutex); ++ buf.depth = EXT_DEPTH(&tree); ++ buf.extents_num = 0; ++ buf.leaf_num = 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, ++ ext3_ext_collect_stats_cb); ++ mutex_unlock(&EXT3_I(inode)->truncate_mutex); ++ if (!err) ++ err = copy_to_user((void *) arg, &buf, sizeof(buf)); ++ } else if (cmd == 
EXT3_IOC_GET_TREE_DEPTH) { ++ struct ext3_extents_tree tree; ++ ext3_init_tree_desc(&tree, inode); ++ mutex_lock(&EXT3_I(inode)->truncate_mutex); ++ err = EXT_DEPTH(&tree); ++ mutex_unlock(&EXT3_I(inode)->truncate_mutex); ++ } ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(ext3_init_tree_desc); ++EXPORT_SYMBOL(ext3_mark_inode_dirty); ++EXPORT_SYMBOL(ext3_ext_invalidate_cache); ++EXPORT_SYMBOL(ext3_ext_insert_extent); ++EXPORT_SYMBOL(ext3_ext_walk_space); ++EXPORT_SYMBOL(ext3_ext_find_goal); ++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); +Index: linux-stage/fs/ext3/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/ialloc.c 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/fs/ext3/ialloc.c 2006-07-16 14:10:20.000000000 +0800 +@@ -600,7 +600,7 @@ got: + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + +- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; ++ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); + if (S_ISLNK(mode)) + ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); + /* dirsync only applies to directories */ +@@ -644,6 +644,18 @@ got: + if (err) + goto fail_free_drop; + ++ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++ if (err) goto fail; ++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); ++ } ++ } ++ + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/fs/ext3/inode.c 2006-07-16 14:11:28.000000000 +0800 +@@ -40,7 +40,7 @@ + #include "iopen.h" + #include "acl.h" + +-static int ext3_writepage_trans_blocks(struct inode *inode); ++int ext3_writepage_trans_blocks(struct inode *inode); + + /* + * Test whether an inode is a fast symlink. 
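The next hunk funnels every block lookup through one dispatch point: inodes flagged EXT3_EXTENTS_FL are mapped via the extent tree, all others via the classic indirect-block path. Within an extent the logical-to-physical translation is plain arithmetic; a minimal sketch of the containment test and offset computation that ext3_ext_get_block() above performs (struct ext3_extent is added by this patch in ext3_extents.h; the helper names are illustrative, not part of the patch, and like the patch itself the sketch uses the raw on-disk fields without endian conversion):

	/* does [ee_block, ee_block + ee_len) cover logical block 'iblock'? */
	static inline int example_ext_covers(struct ext3_extent *ex, long iblock)
	{
		return iblock >= ex->ee_block &&
		       iblock < ex->ee_block + ex->ee_len;
	}

	/* translate a covered logical block to its physical block number */
	static inline unsigned long example_ext_phys(struct ext3_extent *ex, long iblock)
	{
		return ex->ee_start + (iblock - ex->ee_block);
	}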
+@@ -944,6 +944,17 @@ out: + + #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32) + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create, int extend_disksize) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create, ++ extend_disksize); ++ return ext3_get_blocks_handle(handle, inode, block, 1, bh, create, ++ extend_disksize); ++} ++ + static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) + { +@@ -984,8 +995,8 @@ static int ext3_get_block(struct inode * + + get_block: + if (ret == 0) { +- ret = ext3_get_blocks_handle(handle, inode, iblock, +- max_blocks, bh_result, create, 0); ++ ret = ext3_get_block_wrap(handle, inode, iblock, ++ bh_result, create, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; +@@ -1008,7 +1019,7 @@ struct buffer_head *ext3_getblk(handle_t + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- err = ext3_get_blocks_handle(handle, inode, block, 1, ++ err = ext3_get_block_wrap(handle, inode, block, + &dummy, create, 1); + if (err == 1) { + err = 0; +@@ -1756,7 +1767,7 @@ void ext3_set_aops(struct inode *inode) + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, struct page *page, ++int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) + { + ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; +@@ -2260,6 +2271,9 @@ void ext3_truncate(struct inode *inode) + return; + } + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode, page); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { +@@ -3004,12 +3018,15 @@ err_out: + * block and work out the exact number of indirects which are touched. Pah. + */ + +-static int ext3_writepage_trans_blocks(struct inode *inode) ++int ext3_writepage_trans_blocks(struct inode *inode) + { + int bpp = ext3_journal_blocks_per_page(inode); + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 
5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +@@ -3277,7 +3294,7 @@ int ext3_prep_san_write(struct inode *in + + /* alloc blocks one by one */ + for (i = 0; i < nblocks; i++) { +- ret = ext3_get_block_handle(handle, inode, blocks[i], ++ ret = ext3_get_blocks_handle(handle, inode, blocks[i], 1, + &bh_tmp, 1, 1); + if (ret) + break; +@@ -3337,7 +3354,7 @@ int ext3_map_inode_page(struct inode *in + if (blocks[i] != 0) + continue; + +- rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1, 1); ++ rc = ext3_get_blocks_handle(handle, inode, iblock, 1, &dummy, 1, 1); + if (rc) { + printk(KERN_INFO "ext3_map_inode_page: error reading " + "block %ld\n", iblock); +Index: linux-stage/fs/ext3/Makefile +=================================================================== +--- linux-stage.orig/fs/ext3/Makefile 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/fs/ext3/Makefile 2006-07-16 14:10:21.000000000 +0800 +@@ -5,7 +5,8 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ +- ioctl.o namei.o super.o symlink.o hash.o resize.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o \ ++ extents.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/fs/ext3/super.c 2006-07-16 14:10:21.000000000 +0800 +@@ -391,6 +391,7 @@ static void ext3_put_super (struct super + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -455,6 +456,8 @@ static struct inode *ext3_alloc_inode(st + #endif + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; ++ ++ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); + return &ei->vfs_inode; + } + +@@ -638,6 +641,7 @@ enum { + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, ++ Opt_extents, Opt_extdebug, + Opt_grpquota + }; + +@@ -690,6 +694,8 @@ static match_table_t tokens = { + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, ++ {Opt_extents, "extents"}, ++ {Opt_extdebug, "extdebug"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -1035,6 +1041,12 @@ clear_qf_name: + case Opt_bh: + clear_opt(sbi->s_mount_opt, NOBH); + break; ++ case Opt_extents: ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_extdebug: ++ set_opt (sbi->s_mount_opt, EXTDEBUG); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1760,6 +1772,7 @@ static int ext3_fill_super (struct super + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? 
"ordered": + "writeback"); + ++ ext3_ext_init(sb); + lock_kernel(); + return 0; + +Index: linux-stage/fs/ext3/ioctl.c +=================================================================== +--- linux-stage.orig/fs/ext3/ioctl.c 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/fs/ext3/ioctl.c 2006-07-16 13:55:31.000000000 +0800 +@@ -135,6 +135,10 @@ flags_err: + mutex_unlock(&inode->i_mutex); + return err; + } ++ case EXT3_IOC_GET_EXTENTS: ++ case EXT3_IOC_GET_TREE_STATS: ++ case EXT3_IOC_GET_TREE_DEPTH: ++ return ext3_ext_ioctl(inode, filp, cmd, arg); + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs.h 2006-07-16 14:10:21.000000000 +0800 +@@ -181,9 +181,10 @@ struct ext3_group_desc + #define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */ + #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ + #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +-#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ ++#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + + /* +@@ -233,6 +234,9 @@ struct ext3_new_group_data { + #endif + #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) + #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) ++#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) ++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) ++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) + + /* + * Mount options +@@ -373,6 +377,8 @@ struct ext3_inode { + #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ + #define EXT3_MOUNT_IOPEN 0x400000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -563,11 +569,13 @@ static inline struct ext3_inode_info *EX + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 ++#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ + + #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ +- EXT3_FEATURE_INCOMPAT_META_BG) ++ EXT3_FEATURE_INCOMPAT_META_BG| \ ++ EXT3_FEATURE_INCOMPAT_EXTENTS) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) +@@ -787,6 +795,8 @@ extern unsigned long ext3_count_free (st + + + /* inode.c */ ++extern int ext3_block_truncate_page(handle_t *, struct page *, ++ struct address_space *, loff_t); + int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t blocknr); + struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); +@@ -860,6 +870,16 @@ extern struct inode_operations 
ext3_spec + extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int, int); ++extern void ext3_ext_truncate(struct inode *, struct page *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); ++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); ++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); + + #endif /* __KERNEL__ */ + +Index: linux-stage/include/linux/ext3_extents.h +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-stage/include/linux/ext3_extents.h 2006-07-16 13:55:31.000000000 +0800 +@@ -0,0 +1,264 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++#ifndef _LINUX_EXT3_EXTENTS ++#define _LINUX_EXT3_EXTENTS ++ ++/* ++ * with AGRESSIVE_TEST defined, the capacity of index/leaf blocks ++ * becomes very small, so index splits, in-depth growing and ++ * other hard changes happen much more often; ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if CHECK_BINSEARCH defined, then results of binary search ++ * will be checked by linear search ++ */ ++#define CHECK_BINSEARCH_ ++ ++/* ++ * if EXT_DEBUG is defined you can use 'extdebug' mount option ++ * to get lots of info about what's going on ++ */ ++#define EXT_DEBUG_ ++#ifdef EXT_DEBUG ++#define ext_debug(tree,fmt,a...) \ ++do { \ ++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0); ++#else ++#define ext_debug(tree,fmt,a...) ++#endif ++ ++/* ++ * if EXT_STATS is defined then stats numbers are collected ++ * these numbers will be displayed at umount time ++ */ ++#define EXT_STATS_ ++ ++ ++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 means there is no tree yet; all extents are in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 ee_block; /* first logical block extent covers */ ++ __u16 ee_len; /* number of blocks covered by extent */ ++ __u16 ee_start_hi; /* high 16 bits of physical block */ ++ __u32 ee_start; /* low 32 bits of physical block */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 ei_block; /* index covers logical blocks from 'block' */ ++ __u32 ei_leaf; /* pointer to the physical block of the next * ++ * level. leaf or next index could be here */ ++ __u16 ei_leaf_hi; /* high 16 bits of physical block */ ++ __u16 ei_unused; ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored, has a header ++ */ ++struct ext3_extent_header { ++ __u16 eh_magic; /* probably will support different formats */ ++ __u16 eh_entries; /* number of valid entries */ ++ __u16 eh_max; /* capacity of store in entries */ ++ __u16 eh_depth; /* does the tree have real underlying blocks? */ ++ __u32 eh_generation; /* generation of the tree */ ++}; ++ ++#define EXT3_EXT_MAGIC 0xf30a ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++/* ++ * structure for external API ++ */ ++ ++/* ++ * storage for cached extent ++ */ ++struct ext3_ext_cache { ++ __u32 ec_start; ++ __u32 ec_block; ++ __u32 ec_len; ++ __u32 ec_type; ++}; ++ ++#define EXT3_EXT_CACHE_NO 0 ++#define EXT3_EXT_CACHE_GAP 1 ++#define EXT3_EXT_CACHE_EXTENT 2 ++ ++/* ++ * ext3_extents_tree is used to pass initial information ++ * to top-level extents API ++ */ ++struct ext3_extents_helpers; ++struct ext3_extents_tree { ++ struct inode *inode; /* inode which tree belongs to */ ++ void *root; /* ptr to data top of tree resides at */ ++ void *buffer; /* will be passed as arg to ^^ routines */ ++ int buffer_len; ++ void *private; ++ struct ext3_ext_cache *cex;/* last found extent */ ++ struct ext3_extents_helpers *ops; ++}; ++ ++struct ext3_extents_helpers { ++ int (*get_write_access)(handle_t *h, void *buffer); ++ int (*mark_buffer_dirty)(handle_t *h, void *buffer); ++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); ++ int (*remove_extent_credits)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*remove_extent)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*new_block)(handle_t *, struct ext3_extents_tree *, ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); ++}; ++ ++/* ++ * to be called by ext3_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext3_ext_walk_space(), see below ++ * callback must return valid extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, ++ struct ext3_ext_path *, ++ struct ext3_ext_cache *); ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++ ++#define EXT_MAX_BLOCK 0xffffffff ++ ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct 
ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++ ++#define EXT_ROOT_HDR(tree) \ ++ ((struct ext3_extent_header *) (tree)->root) ++#define EXT_BLOCK_HDR(bh) \ ++ ((struct ext3_extent_header *) (bh)->b_data) ++#define EXT_DEPTH(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) ++#define EXT_GENERATION(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++ ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++#define EXT_CHECK_PATH(tree,path) \ ++{ \ ++ int depth = EXT_DEPTH(tree); \ ++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_idx < \ ++ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_ext < \ ++ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ ++ && depth != 0); \ ++ BUG_ON((path)[0].p_depth != depth); \ ++} ++ ++ ++/* ++ * this structure is used to gather extents from the tree via ioctl ++ */ ++struct ext3_extent_buf { ++ unsigned long start; ++ int buflen; ++ void *buffer; ++ void *cur; ++ int err; ++}; ++ ++/* ++ * this structure is used to collect stats info about the tree ++ */ ++struct ext3_extent_tree_stats { ++ int depth; ++ int extents_num; ++ int leaf_num; ++}; ++ ++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); ++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); ++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); ++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); ++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); ++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); ++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); ++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); ++ ++static inline void ++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) ++{ ++ if (tree->cex) ++ tree->cex->ec_type = EXT3_EXT_CACHE_NO; ++} ++ ++ ++#endif /* _LINUX_EXT3_EXTENTS */ +Index: linux-stage/include/linux/ext3_fs_i.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_i.h 2006-07-16 13:55:30.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs_i.h 2006-07-16 14:10:20.000000000 +0800 +@@ -142,6 +142,8 @@ struct ext3_inode_info { + */ + struct mutex truncate_mutex; + struct inode vfs_inode; ++ ++ __u32 i_cached_extent[4]; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch new file mode 100644 index 0000000..0040a6f --- 
/dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch @@ -0,0 +1,2810 @@ +Index: linux-stage/fs/ext3/mballoc.c +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-stage/fs/ext3/mballoc.c 2006-07-16 02:29:49.000000000 +0800 +@@ -0,0 +1,2434 @@ ++/* ++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++ ++/* ++ * mballoc.c contains the multiblock allocation routines ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * TODO: ++ * - bitmap read-ahead (proposed by Oleg Drokin aka green) ++ * - track min/max extents in each group for better group selection ++ * - mb_mark_used() may allocate chunk right after splitting buddy ++ * - special flag to advise the allocator to look for requested + N blocks ++ * this may improve interaction between extents and mballoc ++ * - tree of groups sorted by number of free blocks ++ * - percpu reservation code (hotpath) ++ * - error handling ++ */ ++ ++/* ++ * with AGGRESSIVE_CHECK the allocator runs consistency checks over ++ * structures. these checks slow things down a lot ++ */ ++#define AGGRESSIVE_CHECK__ ++ ++/* ++ */ ++#define MB_DEBUG__ ++#ifdef MB_DEBUG ++#define mb_debug(fmt,a...) printk(fmt, ##a) ++#else ++#define mb_debug(fmt,a...) ++#endif ++ ++/* ++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory ++ * and you can monitor it in /proc/fs/ext3/<dev>/mb_history ++ */ ++#define EXT3_MB_HISTORY ++ ++/* ++ * How long mballoc can look for a best extent (in found extents) ++ */ ++long ext3_mb_max_to_scan = 500; ++ ++/* ++ * How long mballoc must look for a best extent ++ */ ++long ext3_mb_min_to_scan = 30; ++ ++/* ++ * with 'ext3_mb_stats' the allocator will collect stats that will be ++ * shown at umount. The collecting costs though! 
++ */ ++ ++long ext3_mb_stats = 1; ++ ++#ifdef EXT3_BB_MAX_BLOCKS ++#undef EXT3_BB_MAX_BLOCKS ++#endif ++#define EXT3_BB_MAX_BLOCKS 30 ++ ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++struct ext3_group_info { ++ unsigned long bb_state; ++ unsigned long bb_tid; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned short bb_fragments; ++ unsigned short bb_counters[]; ++}; ++ ++ ++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 ++#define EXT3_GROUP_INFO_LOCKED_BIT 1 ++ ++#define EXT3_MB_GRP_NEED_INIT(grp) \ ++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) ++ ++struct ext3_free_extent { ++ __u16 fe_start; ++ __u16 fe_len; ++ __u16 fe_group; ++}; ++ ++struct ext3_allocation_context { ++ struct super_block *ac_sb; ++ ++ /* search goals */ ++ struct ext3_free_extent ac_g_ex; ++ ++ /* the best found extent */ ++ struct ext3_free_extent ac_b_ex; ++ ++ /* number of iterations done. we have to track to limit searching */ ++ unsigned long ac_ex_scanned; ++ __u16 ac_groups_scanned; ++ __u16 ac_found; ++ __u16 ac_tail; ++ __u16 ac_buddy; ++ __u8 ac_status; ++ __u8 ac_flags; /* allocation hints */ ++ __u8 ac_criteria; ++ __u8 ac_repeats; ++ __u8 ac_2order; /* if request is to allocate 2^N blocks and ++ * N > 0, the field stores N, otherwise 0 */ ++}; ++ ++#define AC_STATUS_CONTINUE 1 ++#define AC_STATUS_FOUND 2 ++#define AC_STATUS_BREAK 3 ++ ++struct ext3_mb_history { ++ struct ext3_free_extent goal; /* goal allocation */ ++ struct ext3_free_extent result; /* result allocation */ ++ __u16 found; /* how many extents have been found */ ++ __u16 groups; /* how many groups have been scanned */ ++ __u16 tail; /* what tail broke some buddy */ ++ __u16 buddy; /* buddy the tail ^^^ broke */ ++ __u8 cr; /* which phase the result extent was found at */ ++ __u8 merged; ++}; ++ ++struct ext3_buddy { ++ struct page *bd_buddy_page; ++ void *bd_buddy; ++ struct page *bd_bitmap_page; ++ void *bd_bitmap; ++ struct ext3_group_info *bd_info; ++ struct super_block *bd_sb; ++ __u16 bd_blkbits; ++ __u16 bd_group; ++}; ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) ++ ++#ifndef EXT3_MB_HISTORY ++#define ext3_mb_store_history(sb,ac) ++#else ++static void ext3_mb_store_history(struct super_block *, ++ struct ext3_allocation_context *ac); ++#endif ++ ++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) ++ ++static struct proc_dir_entry *proc_root_ext3; ++ ++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); ++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); ++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); ++int ext3_mb_reserve_blocks(struct super_block *, int); ++void ext3_mb_release_blocks(struct super_block *, int); ++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); ++void ext3_mb_free_committed_blocks(struct super_block *); ++ ++#if BITS_PER_LONG == 64 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 7UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~7UL); \ ++} ++#elif BITS_PER_LONG == 32 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 3UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~3UL); \ ++} ++#else ++#error "how many bits you are?!" 
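/* A worked example of mb_correct_addr_and_bit() above, assuming
 * BITS_PER_LONG == 64: for addr = 0x1006 and bit = 5, the three low
 * address bits (0x1006 & 7 = 6) are folded into the bit index, giving
 * bit = 5 + (6 << 3) = 53 and addr = 0x1000, so the ext2_*_bit()
 * helpers always receive a long-aligned base address, as the atomic
 * bitops require on some architectures. mb_find_next_zero_bit() below
 * applies the same fixup and subtracts it from the result. */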
++#endif ++ ++static inline int mb_test_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ return ext2_test_bit(bit, addr); ++} ++ ++static inline void mb_set_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit(bit, addr); ++} ++ ++static inline void mb_set_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit_atomic(NULL, bit, addr); ++} ++ ++static inline void mb_clear_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit(bit, addr); ++} ++ ++static inline void mb_clear_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit_atomic(NULL, bit, addr); ++} ++ ++static inline int mb_find_next_zero_bit(void *addr, int max, int start) ++{ ++ int fix; ++#if BITS_PER_LONG == 64 ++ fix = ((unsigned long) addr & 7UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~7UL); ++#elif BITS_PER_LONG == 32 ++ fix = ((unsigned long) addr & 3UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~3UL); ++#else ++#error "how many bits you are?!" ++#endif ++ max += fix; ++ start += fix; ++ return ext2_find_next_zero_bit(addr, max, start) - fix; ++} ++ ++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) ++{ ++ char *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(max != NULL); ++ ++ if (order > e3b->bd_blkbits + 1) { ++ *max = 0; ++ return NULL; ++ } ++ ++ /* at order 0 we see each particular block */ ++ *max = 1 << (e3b->bd_blkbits + 3); ++ if (order == 0) ++ return EXT3_MB_BITMAP(e3b); ++ ++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; ++ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; ++ ++ return bb; ++} ++ ++#ifdef AGGRESSIVE_CHECK ++ ++static void mb_check_buddy(struct ext3_buddy *e3b) ++{ ++ int order = e3b->bd_blkbits + 1; ++ int max, max2, i, j, k, count; ++ int fragments = 0, fstart; ++ void *buddy, *buddy2; ++ ++ if (!test_opt(e3b->bd_sb, MBALLOC)) ++ return; ++ ++ { ++ static int mb_check_counter = 0; ++ if (mb_check_counter++ % 300 != 0) ++ return; ++ } ++ ++ while (order > 1) { ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ buddy2 = mb_find_buddy(e3b, order - 1, &max2); ++ J_ASSERT(buddy2); ++ J_ASSERT(buddy != buddy2); ++ J_ASSERT(max * 2 == max2); ++ ++ count = 0; ++ for (i = 0; i < max; i++) { ++ ++ if (mb_test_bit(i, buddy)) { ++ /* only single bit in buddy2 may be 1 */ ++ if (!mb_test_bit(i << 1, buddy2)) ++ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); ++ else if (!mb_test_bit((i << 1) + 1, buddy2)) ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ continue; ++ } ++ ++ /* both bits in buddy2 must be 0 */ ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); ++ ++ for (j = 0; j < (1 << order); j++) { ++ k = (i * (1 << order)) + j; ++ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); ++ } ++ count++; ++ } ++ J_ASSERT(e3b->bd_info->bb_counters[order] == count); ++ order--; ++ } ++ ++ fstart = -1; ++ buddy = mb_find_buddy(e3b, 0, &max); ++ for (i = 0; i < max; i++) { ++ if (!mb_test_bit(i, buddy)) { ++ J_ASSERT(i >= e3b->bd_info->bb_first_free); ++ if (fstart == -1) { ++ fragments++; ++ fstart = i; ++ } ++ continue; ++ } ++ fstart = -1; ++ /* check used bits only */ ++ for (j = 0; j < e3b->bd_blkbits + 1; j++) { ++ buddy2 = mb_find_buddy(e3b, j, &max2); ++ k = i >> j; ++ J_ASSERT(k < max2); ++ J_ASSERT(mb_test_bit(k, buddy2)); ++ } ++ } ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); ++ J_ASSERT(e3b->bd_info->bb_fragments 
== fragments); ++} ++ ++#else ++#define mb_check_buddy(e3b) ++#endif ++ ++/* find most significant bit */ ++static int inline fmsb(unsigned short word) ++{ ++ int order; ++ ++ if (word > 255) { ++ order = 7; ++ word >>= 8; ++ } else { ++ order = -1; ++ } ++ ++ do { ++ order++; ++ word >>= 1; ++ } while (word != 0); ++ ++ return order; ++} ++ ++static void inline ++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, ++ int len, struct ext3_group_info *grp) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned short min, max, chunk, border; ++ ++ mb_debug("mark %u/%u free\n", first, len); ++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ border = 2 << sb->s_blocksize_bits; ++ ++ while (len > 0) { ++ /* find how many blocks can be covered since this position */ ++ max = ffs(first | border) - 1; ++ ++ /* find how many blocks of power 2 we need to mark */ ++ min = fmsb(len); ++ ++ mb_debug(" %u/%u -> max %u, min %u\n", ++ first & ((2 << sb->s_blocksize_bits) - 1), ++ len, max, min); ++ ++ if (max < min) ++ min = max; ++ chunk = 1 << min; ++ ++ /* mark multiblock chunks only */ ++ grp->bb_counters[min]++; ++ if (min > 0) { ++ mb_debug(" set %u at %u \n", first >> min, ++ sbi->s_mb_offsets[min]); ++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); ++ } ++ ++ len -= chunk; ++ first += chunk; ++ } ++} ++ ++static void ++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ++ struct ext3_group_info *grp) ++{ ++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); ++ unsigned short i = 0, first, len; ++ unsigned free = 0, fragments = 0; ++ unsigned long long period = get_cycles(); ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ grp->bb_first_free = i; ++ while (i < max) { ++ fragments++; ++ first = i; ++ i = find_next_bit(bitmap, max, i); ++ len = i - first; ++ free += len; ++ if (len > 1) ++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); ++ else ++ grp->bb_counters[0]++; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ grp->bb_fragments = fragments; ++ ++ /* bb_state shouldn't be modified because all ++ * others wait for init completion on the page lock */ ++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); ++ if (free != grp->bb_free) { ++ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n", ++ free, grp->bb_free); ++ grp->bb_free = free; ++ } ++ ++ period = get_cycles() - period; ++ spin_lock(&EXT3_SB(sb)->s_bal_lock); ++ EXT3_SB(sb)->s_mb_buddies_generated++; ++ EXT3_SB(sb)->s_mb_generation_time += period; ++ spin_unlock(&EXT3_SB(sb)->s_bal_lock); ++} ++ ++static int ext3_mb_init_cache(struct page *page) ++{ ++ int blocksize, blocks_per_page, groups_per_page; ++ int err = 0, i, first_group, first_block; ++ struct super_block *sb; ++ struct buffer_head *bhs; ++ struct buffer_head **bh; ++ struct inode *inode; ++ char *data, *bitmap; ++ ++ mb_debug("init page %lu\n", page->index); ++ ++ inode = page->mapping->host; ++ sb = inode->i_sb; ++ blocksize = 1 << inode->i_blkbits; ++ blocks_per_page = PAGE_CACHE_SIZE / blocksize; ++ ++ groups_per_page = blocks_per_page >> 1; ++ if (groups_per_page == 0) ++ groups_per_page = 1; ++ ++ /* allocate buffer_heads to read bitmaps */ ++ if (groups_per_page > 1) { ++ err = -ENOMEM; ++ i = sizeof(struct buffer_head *) * groups_per_page; ++ bh = kmalloc(i, GFP_NOFS); ++ if (bh == NULL) ++ goto out; ++ memset(bh, 0, i); ++ } else ++ bh = &bhs; ++ ++ first_group = page->index * blocks_per_page / 2; ++ ++ /* read all groups the page covers into the cache */
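/* A sketch of the page layout this loop fills in, derived from the
 * arithmetic above and from ext3_mb_load_buddy() below: the buddy
 * cache inode stores two blocks per group, where block 2g is a copy
 * of group g's on-disk bitmap and block 2g + 1 is the generated buddy
 * data. With 4K pages and 1K blocks, blocks_per_page = 4 and one page
 * covers two groups; with 4K pages and 4K blocks, a group's bitmap
 * and buddy land in two consecutive pages (pnum = 2g and 2g + 1,
 * poff = 0). */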
++ for (i = 0; i < groups_per_page; i++) { ++ struct ext3_group_desc * desc; ++ ++ if (first_group + i >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ err = -EIO; ++ desc = ext3_get_group_desc(sb, first_group + i, NULL); ++ if (desc == NULL) ++ goto out; ++ ++ err = -ENOMEM; ++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); ++ if (bh[i] == NULL) ++ goto out; ++ ++ if (buffer_uptodate(bh[i])) ++ continue; ++ ++ lock_buffer(bh[i]); ++ if (buffer_uptodate(bh[i])) { ++ unlock_buffer(bh[i]); ++ continue; ++ } ++ ++ get_bh(bh[i]); ++ bh[i]->b_end_io = end_buffer_read_sync; ++ submit_bh(READ, bh[i]); ++ mb_debug("read bitmap for group %u\n", first_group + i); ++ } ++ ++ /* wait for I/O completion */ ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ wait_on_buffer(bh[i]); ++ ++ /* XXX: I/O error handling here */ ++ ++ first_block = page->index * blocks_per_page; ++ for (i = 0; i < blocks_per_page; i++) { ++ int group; ++ ++ group = (first_block + i) >> 1; ++ if (group >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ data = page_address(page) + (i * blocksize); ++ bitmap = bh[group - first_group]->b_data; ++ ++ if ((first_block + i) & 1) { ++ /* this is block of buddy */ ++ mb_debug("put buddy for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memset(data, 0xff, blocksize); ++ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0; ++ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0, ++ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); ++ ext3_mb_generate_buddy(sb, data, bitmap, ++ EXT3_SB(sb)->s_group_info[group]); ++ } else { ++ /* this is block of bitmap */ ++ mb_debug("put bitmap for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memcpy(data, bitmap, blocksize); ++ } ++ } ++ SetPageUptodate(page); ++ ++out: ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ brelse(bh[i]); ++ if (bh && bh != &bhs) ++ kfree(bh); ++ return err; ++} ++ ++static int ext3_mb_load_buddy(struct super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *inode = sbi->s_buddy_cache; ++ int blocks_per_page, block, pnum, poff; ++ struct page *page; ++ ++ mb_debug("load group %u\n", group); ++ ++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_info = sbi->s_group_info[group]; ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ e3b->bd_buddy_page = NULL; ++ e3b->bd_bitmap_page = NULL; ++ ++ block = group * 2; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_bitmap_page = page; ++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ block++; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_buddy_page = page; ++ 
e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ return 0; ++ ++err: ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++ e3b->bd_buddy = NULL; ++ e3b->bd_bitmap = NULL; ++ return -EIO; ++} ++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++} ++ ++ ++static inline void ++ext3_lock_group(struct super_block *sb, int group) ++{ ++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++} ++ ++static inline void ++ext3_unlock_group(struct super_block *sb, int group) ++{ ++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++} ++ ++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) ++{ ++ int order = 1; ++ void *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ while (order <= e3b->bd_blkbits + 1) { ++ block = block >> 1; ++ if (!mb_test_bit(block, bb)) { ++ /* this block is part of buddy of order 'order' */ ++ return order; ++ } ++ bb += 1 << (e3b->bd_blkbits - order); ++ order++; ++ } ++ return 0; ++} ++ ++static inline void mb_clear_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0; ++ cur += 32; ++ continue; ++ } ++ mb_clear_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static inline void mb_set_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: set whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0xffffffff; ++ cur += 32; ++ continue; ++ } ++ mb_set_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) ++{ ++ int block = 0, max = 0, order; ++ void *buddy, *buddy2; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_info->bb_free += count; ++ if (first < e3b->bd_info->bb_first_free) ++ e3b->bd_info->bb_first_free = first; ++ ++ /* let's maintain fragments counter */ ++ if (first != 0) ++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); ++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); ++ if (block && max) ++ e3b->bd_info->bb_fragments--; ++ else if (!block && !max) ++ e3b->bd_info->bb_fragments++; ++ ++ /* let's maintain buddy itself */ ++ while (count-- > 0) { ++ block = first++; ++ order = 0; ++ ++ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); ++ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); ++ e3b->bd_info->bb_counters[order]++; ++ ++ /* start of the buddy */ ++ buddy = mb_find_buddy(e3b, order, &max); ++ ++ do { ++ block &= ~1UL; ++ if (mb_test_bit(block, buddy) || ++ mb_test_bit(block + 1, buddy)) ++ break; ++ ++ /* both the buddies are free, try to coalesce them */ ++ buddy2 = mb_find_buddy(e3b, order + 1, &max); ++ ++ if (!buddy2) ++ break; ++ ++ if (order > 0) { ++ /* for special purposes, we don't set ++ * free bits in bitmap */ ++ mb_set_bit(block, buddy); ++ mb_set_bit(block + 1, buddy); ++ } ++ e3b->bd_info->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; ++ ++ block = block >> 1; ++ order++; ++ e3b->bd_info->bb_counters[order]++; ++ ++ mb_clear_bit(block, buddy2); ++ buddy = buddy2; ++ } while (1); ++ } ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, ++ int needed, struct ext3_free_extent *ex) ++{ ++ int next, max, ord; ++ void *buddy; ++ ++ J_ASSERT(ex != NULL); ++ ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ J_ASSERT(block < max); ++ if (mb_test_bit(block, buddy)) { ++ ex->fe_len = 0; ++ ex->fe_start = 0; ++ ex->fe_group = 0; ++ return 0; ++ } ++ ++ if (likely(order == 0)) { ++ /* find actual order */ ++ order = mb_find_order_for_block(e3b, block); ++ block = block >> order; ++ } ++ ++ ex->fe_len = 1 << order; ++ ex->fe_start = block << order; ++ ex->fe_group = e3b->bd_group; ++ ++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { ++ ++ if (block + 1 >= max) ++ break; ++ ++ next = (block + 1) * (1 << order); ++ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) ++ break; ++ ++ ord = mb_find_order_for_block(e3b, next); ++ ++ order = ord; ++ block = next >> order; ++ ex->fe_len += 1 << order; ++ } ++ ++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); ++ return ex->fe_len; ++} ++ ++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) ++{ ++ int ord, mlen = 0, max = 0, cur; ++ int start = ex->fe_start; ++ int len = ex->fe_len; ++ unsigned ret = 0; ++ int len0 = len; ++ void *buddy; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_info->bb_free -= len; ++ if (e3b->bd_info->bb_first_free == start) ++ e3b->bd_info->bb_first_free += len; ++ ++ /* let's maintain fragments counter */ ++ if (start != 0) ++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); ++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); ++ if (mlen && max) ++ e3b->bd_info->bb_fragments++; ++ else if (!mlen && !max) ++ e3b->bd_info->bb_fragments--; ++ ++ /* let's maintain buddy itself */ ++ while (len) { ++ ord = mb_find_order_for_block(e3b, start); ++ ++ if (((start >> ord) << ord) == start && len >= (1 << ord)) { ++ /* the whole chunk may be allocated at once! */ ++ mlen = 1 << ord; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ J_ASSERT((start >> ord) < max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_info->bb_counters[ord]--; ++ start += mlen; ++ len -= mlen; ++ J_ASSERT(len >= 0); ++ continue; ++ } ++ ++ /* store for history */ ++ if (ret == 0) ++ ret = len | (ord << 16); ++ ++ /* we have to split large buddy */ ++ J_ASSERT(ord > 0); ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_info->bb_counters[ord]--; ++ ++ ord--; ++ cur = (start >> ord) & ~1U; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_clear_bit(cur, buddy); ++ mb_clear_bit(cur + 1, buddy); ++ e3b->bd_info->bb_counters[ord]++; ++ e3b->bd_info->bb_counters[ord]++; ++ } ++ ++ /* now set all the bits in the bitmap */ ++ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); ++ ++ mb_check_buddy(e3b); ++ ++ return ret; ++} ++ ++/* ++ * Must be called under group lock! ++ */ ++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ unsigned long ret; ++ ++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ++ ret = mb_mark_used(e3b, &ac->ac_b_ex); ++ ++ ac->ac_status = AC_STATUS_FOUND; ++ ac->ac_tail = ret & 0xffff; ++ ac->ac_buddy = ret >> 16; ++} ++ ++/* ++ * The routine checks whether the found extent is good enough. If it is, ++ * then the extent gets marked used and a flag is set in the context ++ * to stop scanning. Otherwise, the extent is compared with the ++ * previously found extent and if the new one is better, then it's stored ++ * in the context. Later, the best found extent will be used, if ++ * mballoc can't find a good enough extent. ++ * ++ * FIXME: real allocation policy is to be designed yet! ++ */ ++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, ++ struct ext3_free_extent *ex, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent *bex = &ac->ac_b_ex; ++ struct ext3_free_extent *gex = &ac->ac_g_ex; ++ ++ J_ASSERT(ex->fe_len > 0); ++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ ++ ac->ac_found++; ++ ++ /* ++ * The special case - take what you catch first ++ */ ++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Let's check whether the chunk is good enough ++ */ ++ if (ex->fe_len == gex->fe_len) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If this is the first found extent, just store it in the context ++ */ ++ if (bex->fe_len == 0) { ++ *bex = *ex; ++ return; ++ } ++ ++ /* ++ * If the newly found extent is better, store it in the context ++ */ ++ if (bex->fe_len < gex->fe_len) { ++ /* if the request isn't satisfied, any found extent ++ * larger than previous best one is better */ ++ if (ex->fe_len > bex->fe_len) ++ *bex = *ex; ++ } else if (ex->fe_len > gex->fe_len) { ++ /* if the request is satisfied, then we try to find ++ * an extent that still satisfies the request, but is ++ * smaller than previous one */ ++ *bex = *ex; ++ } ++ ++ /* ++ * Let's scan at least a few extents and not pick up the first one ++ */ ++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++ ++ /* ++ * We don't want to scan for a whole year ++ */ ++ if (ac->ac_found > ext3_mb_max_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++} ++ ++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent ex = ac->ac_b_ex; ++ int group = ex.fe_group, max, err; ++ ++ J_ASSERT(ex.fe_len > 0); ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); ++ ++ if (max > 0) { ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_free_extent ex; ++ ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max > 0) { ++ J_ASSERT(ex.fe_len > 0); ++ 
J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++/* ++ * The routine scans buddy structures (not bitmap!) from given order ++ * to max order and tries to find big enough chunk to satisfy the req ++ */ ++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_group_info *grp = e3b->bd_info; ++ void *buddy; ++ int i, k, max; ++ ++ J_ASSERT(ac->ac_2order > 0); ++ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) { ++ if (grp->bb_counters[i] == 0) ++ continue; ++ ++ buddy = mb_find_buddy(e3b, i, &max); ++ if (buddy == NULL) { ++ printk(KERN_ALERT "looking for wrong order?\n"); ++ break; ++ } ++ ++ k = mb_find_next_zero_bit(buddy, max, 0); ++ J_ASSERT(k < max); ++ ++ ac->ac_found++; ++ ++ ac->ac_b_ex.fe_len = 1 << i; ++ ac->ac_b_ex.fe_start = k << i; ++ ac->ac_b_ex.fe_group = e3b->bd_group; ++ ++ ext3_mb_use_best_found(ac, e3b); ++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); ++ ++ if (unlikely(ext3_mb_stats)) ++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); ++ ++ break; ++ } ++} ++ ++/* ++ * The routine scans the group and measures all found extents. ++ * In order to optimize scanning, caller must pass number of ++ * free blocks in the group, so the routine can know upper limit. ++ */ ++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ int i, free; ++ ++ free = e3b->bd_info->bb_free; ++ J_ASSERT(free > 0); ++ ++ i = e3b->bd_info->bb_first_free; ++ ++ while (free && ac->ac_status == AC_STATUS_CONTINUE) { ++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ if (i >= sb->s_blocksize * 8) { ++ J_ASSERT(free == 0); ++ break; ++ } ++ ++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(free >= ex.fe_len); ++ ++ ext3_mb_measure_extent(ac, &ex, e3b); ++ ++ i += ex.fe_len; ++ free -= ex.fe_len; ++ } ++} ++ ++static int ext3_mb_good_group(struct ext3_allocation_context *ac, ++ int group, int cr) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_group_info *grp = sbi->s_group_info[group]; ++ unsigned free, fragments, i, bits; ++ ++ J_ASSERT(cr >= 0 && cr < 4); ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); ++ ++ free = grp->bb_free; ++ fragments = grp->bb_fragments; ++ if (free == 0) ++ return 0; ++ if (fragments == 0) ++ return 0; ++ ++ switch (cr) { ++ case 0: ++ J_ASSERT(ac->ac_2order != 0); ++ bits = ac->ac_sb->s_blocksize_bits + 1; ++ for (i = ac->ac_2order; i < bits; i++) ++ if (grp->bb_counters[i] > 0) ++ return 1; ++ case 1: ++ if ((free / fragments) >= ac->ac_g_ex.fe_len) ++ return 1; ++ case 2: ++ if (free >= ac->ac_g_ex.fe_len) ++ return 1; ++ case 3: ++ return 1; ++ default: ++ BUG(); ++ } ++ ++ return 0; ++} ++ ++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *len, int flags, int *errp) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_allocation_context ac; ++ int i, group, block, cr, err = 0; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ struct buffer_head *gdp_bh; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ ++ 
J_ASSERT(len != NULL);
++	J_ASSERT(*len > 0);
++
++	sb = inode->i_sb;
++	if (!sb) {
++		printk("ext3_mb_new_blocks: nonexistent device");
++		return 0;
++	}
++
++	if (!test_opt(sb, MBALLOC)) {
++		static int ext3_mballoc_warning = 0;
++		if (ext3_mballoc_warning == 0) {
++			printk(KERN_ERR "EXT3-fs: multiblock request with "
++				"mballoc disabled!\n");
++			ext3_mballoc_warning++;
++		}
++		*len = 1;
++		err = ext3_new_block_old(handle, inode, goal, errp);
++		return err;
++	}
++
++	ext3_mb_poll_new_transaction(sb, handle);
++
++	sbi = EXT3_SB(sb);
++	es = EXT3_SB(sb)->s_es;
++
++	/*
++	 * We can't allocate > group size
++	 */
++	if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10)
++		*len = EXT3_BLOCKS_PER_GROUP(sb) - 10;
++
++	if (!(flags & EXT3_MB_HINT_RESERVED)) {
++		/* someone asks for non-reserved blocks */
++		BUG_ON(*len > 1);
++		err = ext3_mb_reserve_blocks(sb, 1);
++		if (err) {
++			*errp = err;
++			return 0;
++		}
++	}
++
++	/*
++	 * Check quota for allocation of these blocks.
++	 */
++	while (*len && DQUOT_ALLOC_BLOCK(inode, *len))
++		*len -= 1;
++	if (*len == 0) {
++		*errp = -EDQUOT;
++		block = 0;
++		goto out;
++	}
++
++	/* start searching from the goal */
++	if (goal < le32_to_cpu(es->s_first_data_block) ||
++	    goal >= le32_to_cpu(es->s_blocks_count))
++		goal = le32_to_cpu(es->s_first_data_block);
++	group = (goal - le32_to_cpu(es->s_first_data_block)) /
++			EXT3_BLOCKS_PER_GROUP(sb);
++	block = ((goal - le32_to_cpu(es->s_first_data_block)) %
++			EXT3_BLOCKS_PER_GROUP(sb));
++
++	/* set up allocation goals */
++	ac.ac_b_ex.fe_group = 0;
++	ac.ac_b_ex.fe_start = 0;
++	ac.ac_b_ex.fe_len = 0;
++	ac.ac_status = AC_STATUS_CONTINUE;
++	ac.ac_groups_scanned = 0;
++	ac.ac_ex_scanned = 0;
++	ac.ac_found = 0;
++	ac.ac_sb = inode->i_sb;
++	ac.ac_g_ex.fe_group = group;
++	ac.ac_g_ex.fe_start = block;
++	ac.ac_g_ex.fe_len = *len;
++	ac.ac_flags = flags;
++	ac.ac_2order = 0;
++	ac.ac_criteria = 0;
++
++	/* probably, the request is for 2^8+ blocks (1/2/3/... MB) */
++	i = ffs(*len);
++	if (i >= 8) {
++		i--;
++		if ((*len & (~(1 << i))) == 0)
++			ac.ac_2order = i;
++	}
++
++	/* Sometimes, the caller may want to merge even a small
++	 * number of blocks into an existing extent */
++	if (ac.ac_flags & EXT3_MB_HINT_MERGE) {
++		err = ext3_mb_find_by_goal(&ac, &e3b);
++		if (err)
++			goto out_err;
++		if (ac.ac_status == AC_STATUS_FOUND)
++			goto found;
++	}
++
++	/* Let's just scan groups to find more or less suitable blocks */
++	cr = ac.ac_2order ? 0 : 1;
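As a reference for the power-of-two test above: ffs() returns the 1-based index of the lowest set bit, so ac_2order is set only when *len is exactly 2^k for some k >= 7. The following stand-alone user-space sketch (illustrative only, not part of the patch) mirrors that logic:

#include <stdio.h>
#include <strings.h>	/* ffs() */

/* Mirror of the ac_2order detection above: returns the order k when
 * len == 2^k and k >= 7, otherwise 0 ("not a big power-of-two request"). */
static int detect_2order(int len)
{
	int i = ffs(len);		/* 1-based index of the lowest set bit */

	if (i >= 8) {
		i--;			/* convert to a 0-based bit number */
		if ((len & ~(1 << i)) == 0)
			return i;	/* len is exactly 2^i */
	}
	return 0;
}

int main(void)
{
	int samples[] = { 1, 96, 128, 256, 384, 1024 };
	unsigned j;

	for (j = 0; j < sizeof(samples) / sizeof(samples[0]); j++)
		printf("len=%4d -> ac_2order=%d\n", samples[j],
		       detect_2order(samples[j]));
	return 0;
}

Running it shows 128, 256 and 1024 reporting their order, while 96 and 384 fall back to 0 and are handled by the linear criteria below.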
++repeat:
++	for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) {
++		ac.ac_criteria = cr;
++		for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) {
++			if (group == EXT3_SB(sb)->s_groups_count)
++				group = 0;
++
++			if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) {
++				/* we need full data about the group
++				 * to make a good selection */
++				err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
++				if (err)
++					goto out_err;
++				ext3_mb_release_desc(&e3b);
++			}
++
++			/* check whether the group is good for our criteria */
++			if (!ext3_mb_good_group(&ac, group, cr))
++				continue;
++
++			err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
++			if (err)
++				goto out_err;
++
++			ext3_lock_group(sb, group);
++			if (!ext3_mb_good_group(&ac, group, cr)) {
++				/* someone did allocation from this group */
++				ext3_unlock_group(sb, group);
++				ext3_mb_release_desc(&e3b);
++				continue;
++			}
++
++			ac.ac_groups_scanned++;
++			if (cr == 0)
++				ext3_mb_simple_scan_group(&ac, &e3b);
++			else
++				ext3_mb_complex_scan_group(&ac, &e3b);
++
++			ext3_unlock_group(sb, group);
++
++			ext3_mb_release_desc(&e3b);
++
++			if (err)
++				goto out_err;
++			if (ac.ac_status != AC_STATUS_CONTINUE)
++				break;
++		}
++	}
++
++	if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND &&
++	    !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
++		/*
++		 * We've been searching too long. Let's try to allocate
++		 * the best chunk we've found so far
++		 */
++
++		/*if (ac.ac_found > ext3_mb_max_to_scan)
++			printk(KERN_ERR "EXT3-fs: too long searching at "
++				"%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len,
++				ac.ac_g_ex.fe_len);*/
++		ext3_mb_try_best_found(&ac, &e3b);
++		if (ac.ac_status != AC_STATUS_FOUND) {
++			/*
++			 * Someone luckier has already allocated it.
++			 * The only thing we can do is just take the
++			 * first block(s) found
++			 */
++			printk(KERN_ERR "EXT3-fs: and someone won our chunk\n");
++			ac.ac_b_ex.fe_group = 0;
++			ac.ac_b_ex.fe_start = 0;
++			ac.ac_b_ex.fe_len = 0;
++			ac.ac_status = AC_STATUS_CONTINUE;
++			ac.ac_flags |= EXT3_MB_HINT_FIRST;
++			cr = 3;
++			goto repeat;
++		}
++	}
++
++	if (ac.ac_status != AC_STATUS_FOUND) {
++		/*
++		 * We definitely weren't lucky
++		 */
++		DQUOT_FREE_BLOCK(inode, *len);
++		*errp = -ENOSPC;
++		block = 0;
++#if 1
++		printk(KERN_ERR "EXT3-fs: can't allocate: status %d, flags %d\n",
++			ac.ac_status, ac.ac_flags);
++		printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n",
++			ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group,
++			ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr);
++		printk(KERN_ERR "EXT3-fs: %lu blocks reserved, %d found\n",
++			sbi->s_blocks_reserved, ac.ac_found);
++		printk("EXT3-fs: groups: ");
++		for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
++			printk("%d: %d ", i,
++				sbi->s_group_info[i]->bb_free);
++		printk("\n");
++#endif
++		goto out;
++	}
++
++found:
++	J_ASSERT(ac.ac_b_ex.fe_len > 0);
++
++	/* good news - free block(s) have been found. now it's time
++	 * to mark the block(s) in the good old journaled bitmap */
++	block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb)
++		+ ac.ac_b_ex.fe_start
++		+ le32_to_cpu(es->s_first_data_block);
++
++	/* we made a decision, now mark the found blocks in the good
++	 * old bitmap to be journaled */
++
++	bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group);
++	if (!bitmap_bh) {
++		*errp = -EIO;
++		goto out_err;
++	}
++
++	err = ext3_journal_get_write_access(handle, bitmap_bh);
++	if (err) {
++		*errp = err;
++		goto out_err;
++	}
++
++	gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh);
++	if (!gdp) {
++		*errp = -EIO;
++		goto out_err;
++	}
++
++	ext3_debug("using block group %u(%d)\n", ac.ac_b_ex.fe_group,
++		   le16_to_cpu(gdp->bg_free_blocks_count));
++
++	err = ext3_journal_get_write_access(handle, gdp_bh);
++	if (err)
++		goto out_err;
++
++	if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
++	    block == le32_to_cpu(gdp->bg_inode_bitmap) ||
++	    in_range(block, le32_to_cpu(gdp->bg_inode_table),
++		     EXT3_SB(sb)->s_itb_per_group))
++		ext3_error(sb, "ext3_new_block",
++			    "Allocating block in system zone - "
++			    "block = %u", block);
++#ifdef AGGRESSIVE_CHECK
++	for (i = 0; i < ac.ac_b_ex.fe_len; i++)
++		J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data));
++#endif
++	mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len);
++
++	spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
++	gdp->bg_free_blocks_count =
++		cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)
++				- ac.ac_b_ex.fe_len);
++	spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
++	percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len);
++
++	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++	if (err)
++		goto out_err;
++	err = ext3_journal_dirty_metadata(handle, gdp_bh);
++	if (err)
++		goto out_err;
++
++	sb->s_dirt = 1;
++	*errp = 0;
++	brelse(bitmap_bh);
++
++	/* drop the non-allocated, but quota-charged blocks */
++	J_ASSERT(*len >= ac.ac_b_ex.fe_len);
++	DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len);
++
++	*len = ac.ac_b_ex.fe_len;
++	J_ASSERT(*len > 0);
++	J_ASSERT(block != 0);
++	goto out;
++
++out_err:
++	/* if we've already allocated something, roll it back */
++	if (ac.ac_status == AC_STATUS_FOUND) {
++		/* FIXME: free blocks here */
++	}
++
++	DQUOT_FREE_BLOCK(inode, *len);
++	brelse(bitmap_bh);
++	*errp = err;
++	block = 0;
++out:
++	if (!(flags & EXT3_MB_HINT_RESERVED)) {
++		/* the block wasn't reserved before and we reserved it
++		 * at the beginning of allocation. it doesn't matter
++		 * whether we allocated anything or we failed: time
++		 * to release the reservation. 
NOTE: because I expect ++ * any multiblock request from delayed allocation ++ * path only, here is single block always */ ++ ext3_mb_release_blocks(sb, 1); ++ } ++ ++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { ++ atomic_inc(&sbi->s_bal_reqs); ++ atomic_add(*len, &sbi->s_bal_allocated); ++ if (*len >= ac.ac_g_ex.fe_len) ++ atomic_inc(&sbi->s_bal_success); ++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); ++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && ++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) ++ atomic_inc(&sbi->s_bal_goals); ++ if (ac.ac_found > ext3_mb_max_to_scan) ++ atomic_inc(&sbi->s_bal_breaks); ++ } ++ ++ ext3_mb_store_history(sb, &ac); ++ ++ return block; ++} ++EXPORT_SYMBOL(ext3_mb_new_blocks); ++ ++#ifdef EXT3_MB_HISTORY ++struct ext3_mb_proc_session { ++ struct ext3_mb_history *history; ++ struct super_block *sb; ++ int start; ++ int max; ++}; ++ ++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, ++ struct ext3_mb_history *hs, ++ int first) ++{ ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (!first && hs == s->history + s->start) ++ return NULL; ++ while (hs->goal.fe_len == 0) { ++ hs++; ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (hs == s->history + s->start) ++ return NULL; ++ } ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs; ++ int l = *pos; ++ ++ if (l == 0) ++ return SEQ_START_TOKEN; ++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ if (!hs) ++ return NULL; ++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs = v; ++ ++ ++*pos; ++ if (v == SEQ_START_TOKEN) ++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ else ++ return ext3_mb_history_skip_empty(s, ++hs, 0); ++} ++ ++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) ++{ ++ struct ext3_mb_history *hs = v; ++ char buf[20], buf2[20]; ++ ++ if (v == SEQ_START_TOKEN) { ++ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "goal", "result", "found", "grps", "cr", "merge", ++ "tail", "broken"); ++ return 0; ++ } ++ ++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, ++ hs->goal.fe_start, hs->goal.fe_len); ++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, ++ hs->result.fe_start, hs->result.fe_len); ++ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, ++ buf2, hs->found, hs->groups, hs->cr, ++ hs->merged ? "M" : "", hs->tail, ++ hs->buddy ? 
1 << hs->buddy : 0);
++	return 0;
++}
++
++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v)
++{
++}
++
++static struct seq_operations ext3_mb_seq_history_ops = {
++	.start	= ext3_mb_seq_history_start,
++	.next	= ext3_mb_seq_history_next,
++	.stop	= ext3_mb_seq_history_stop,
++	.show	= ext3_mb_seq_history_show,
++};
++
++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file)
++{
++	struct super_block *sb = PDE(inode)->data;
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	struct ext3_mb_proc_session *s;
++	int rc, size;
++
++	s = kmalloc(sizeof(*s), GFP_KERNEL);
++	if (s == NULL)
++		return -ENOMEM;
++	size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max;
++	s->history = kmalloc(size, GFP_KERNEL);
++	if (s->history == NULL) {
++		kfree(s);
++		return -ENOMEM;
++	}
++
++	spin_lock(&sbi->s_mb_history_lock);
++	memcpy(s->history, sbi->s_mb_history, size);
++	s->max = sbi->s_mb_history_max;
++	s->start = sbi->s_mb_history_cur % s->max;
++	spin_unlock(&sbi->s_mb_history_lock);
++
++	rc = seq_open(file, &ext3_mb_seq_history_ops);
++	if (rc == 0) {
++		struct seq_file *m = (struct seq_file *)file->private_data;
++		m->private = s;
++	} else {
++		kfree(s->history);
++		kfree(s);
++	}
++	return rc;
++}
++
++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file)
++{
++	struct seq_file *seq = (struct seq_file *)file->private_data;
++	struct ext3_mb_proc_session *s = seq->private;
++	kfree(s->history);
++	kfree(s);
++	return seq_release(inode, file);
++}
++
++static struct file_operations ext3_mb_seq_history_fops = {
++	.owner		= THIS_MODULE,
++	.open		= ext3_mb_seq_history_open,
++	.read		= seq_read,
++	.llseek		= seq_lseek,
++	.release	= ext3_mb_seq_history_release,
++};
++
++static void ext3_mb_history_release(struct super_block *sb)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	char name[64];
++
++	snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++	remove_proc_entry("mb_history", sbi->s_mb_proc);
++	remove_proc_entry(name, proc_root_ext3);
++
++	if (sbi->s_mb_history)
++		kfree(sbi->s_mb_history);
++}
++
++static void ext3_mb_history_init(struct super_block *sb)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	char name[64];
++	int i;
++
++	snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++	sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3);
++	if (sbi->s_mb_proc != NULL) {
++		struct proc_dir_entry *p;
++		p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc);
++		if (p) {
++			p->proc_fops = &ext3_mb_seq_history_fops;
++			p->data = sb;
++		}
++	}
++
++	sbi->s_mb_history_max = 1000;
++	sbi->s_mb_history_cur = 0;
++	spin_lock_init(&sbi->s_mb_history_lock);
++	i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history);
++	sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
++	if (sbi->s_mb_history != NULL)
++		memset(sbi->s_mb_history, 0, i);
++	/* if we can't allocate the history, we simply won't use it */
++}
++
++static void
++ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	struct ext3_mb_history h;
++
++	if (unlikely(sbi->s_mb_history == NULL))
++		return;
++
++	h.goal = ac->ac_g_ex;
++	h.result = ac->ac_b_ex;
++	h.found = ac->ac_found;
++	h.cr = ac->ac_criteria;
++	h.groups = ac->ac_groups_scanned;
++	h.tail = ac->ac_tail;
++	h.buddy = ac->ac_buddy;
++	h.merged = 0;
++	if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
++	    ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
++		h.merged = 1;
++
++	spin_lock(&sbi->s_mb_history_lock);
++	memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
++	if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
++		sbi->s_mb_history_cur = 0;
++	spin_unlock(&sbi->s_mb_history_lock);
++}
++
++#else
++#define ext3_mb_history_release(sb)
++#define ext3_mb_history_init(sb)
++#endif
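The history table above is a fixed-size ring: s_mb_history_cur wraps at s_mb_history_max, so the oldest record is silently overwritten once the table fills. A minimal user-space model of that ring (illustrative only; names are not from the patch):

#include <stdio.h>
#include <string.h>

/* Fixed-size ring of allocation records, like s_mb_history above. */
struct hist_ring {
	struct { int group, start, len; } rec[8];
	int cur;	/* next slot to overwrite */
	int max;	/* ring capacity */
};

static void hist_store(struct hist_ring *r, int group, int start, int len)
{
	r->rec[r->cur].group = group;
	r->rec[r->cur].start = start;
	r->rec[r->cur].len = len;
	if (++r->cur >= r->max)
		r->cur = 0;	/* wrap, overwriting the oldest entry */
}

int main(void)
{
	struct hist_ring r;
	int i;

	memset(&r, 0, sizeof(r));
	r.max = 8;
	for (i = 0; i < 10; i++)	/* the first two entries get overwritten */
		hist_store(&r, i, i * 4, 4);
	for (i = 0; i < r.max; i++)
		printf("slot %d: group %d\n", i, r.rec[i].group);
	return 0;
}

The seq_file reader above walks the same structure starting at s->start (the oldest slot) and skips empty records, which is why ext3_mb_history_skip_empty() exists.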
++
++int ext3_mb_init_backend(struct super_block *sb)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	int i, len;
++
++	len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count;
++	sbi->s_group_info = kmalloc(len, GFP_KERNEL);
++	if (sbi->s_group_info == NULL) {
++		printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
++		return -ENOMEM;
++	}
++	memset(sbi->s_group_info, 0, len);
++
++	sbi->s_buddy_cache = new_inode(sb);
++	if (sbi->s_buddy_cache == NULL) {
++		printk(KERN_ERR "EXT3-fs: can't get new inode\n");
++		kfree(sbi->s_group_info);
++		return -ENOMEM;
++	}
++
++	/*
++	 * calculate the needed size. if the bb_counters size changes,
++	 * don't forget about ext3_mb_generate_buddy()
++	 */
++	len = sizeof(struct ext3_group_info);
++	len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
++	for (i = 0; i < sbi->s_groups_count; i++) {
++		struct ext3_group_desc * desc;
++
++		sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL);
++		if (sbi->s_group_info[i] == NULL) {
++			printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
++			goto err_out;
++		}
++		desc = ext3_get_group_desc(sb, i, NULL);
++		if (desc == NULL) {
++			printk(KERN_ERR "EXT3-fs: can't read descriptor %u\n", i);
++			goto err_out;
++		}
++		memset(sbi->s_group_info[i], 0, len);
++		set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
++			&sbi->s_group_info[i]->bb_state);
++		sbi->s_group_info[i]->bb_free =
++			le16_to_cpu(desc->bg_free_blocks_count);
++	}
++
++	return 0;
++
++err_out:
++	while (--i >= 0)
++		kfree(sbi->s_group_info[i]);
++	kfree(sbi->s_group_info);
++	iput(sbi->s_buddy_cache);
++
++	return -ENOMEM;
++}
++
++int ext3_mb_init(struct super_block *sb, int needs_recovery)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	struct inode *root = sb->s_root->d_inode;
++	unsigned i, offset, max;
++	struct dentry *dentry;
++
++	if (!test_opt(sb, MBALLOC))
++		return 0;
++
++	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
++
++	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
++	if (sbi->s_mb_offsets == NULL) {
++		clear_opt(sbi->s_mount_opt, MBALLOC);
++		return -ENOMEM;
++	}
++	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
++	if (sbi->s_mb_maxs == NULL) {
++		clear_opt(sbi->s_mount_opt, MBALLOC);
++		kfree(sbi->s_mb_offsets);
++		return -ENOMEM;
++	}
++
++	/* order 0 is regular bitmap */
++	sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
++	sbi->s_mb_offsets[0] = 0;
++
++	i = 1;
++	offset = 0;
++	max = sb->s_blocksize << 2;
++	do {
++		sbi->s_mb_offsets[i] = offset;
++		sbi->s_mb_maxs[i] = max;
++		offset += 1 << (sb->s_blocksize_bits - i);
++		max = max >> 1;
++		i++;
++	} while (i <= sb->s_blocksize_bits + 1);
++
++
++	/* init file for buddy data */
++	if ((i = ext3_mb_init_backend(sb))) {
++		clear_opt(sbi->s_mount_opt, MBALLOC);
++		kfree(sbi->s_mb_offsets);
++		kfree(sbi->s_mb_maxs);
++		return i;
++	}
++
++	spin_lock_init(&sbi->s_reserve_lock);
++	spin_lock_init(&sbi->s_md_lock);
++	INIT_LIST_HEAD(&sbi->s_active_transaction);
++	INIT_LIST_HEAD(&sbi->s_closed_transaction);
++	INIT_LIST_HEAD(&sbi->s_committed_transaction);
++	spin_lock_init(&sbi->s_bal_lock);
++
++	/* remove the old on-disk buddy file */
++	mutex_lock(&root->i_mutex);
++	dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy"));
++	if (dentry->d_inode != NULL) {
++		i = vfs_unlink(root, dentry);
++		if (i != 0)
++			printk("EXT3-fs: can't remove .buddy file: %d\n", i);
++	}
++	dput(dentry);
++	mutex_unlock(&root->i_mutex);
++
++	ext3_mb_history_init(sb);
++
++	printk("EXT3-fs: mballoc enabled\n");
++	return 0;
++}
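The do/while loop in ext3_mb_init() packs one bitmap per buddy order into a single block: s_mb_offsets[] gives each order's starting bit and s_mb_maxs[] its width in bits. This user-space sketch (illustrative only; it assumes 4096-byte blocks and replaces the patch's `1 << (blocksize_bits - i)` increment with the equivalent `max >> 3` to avoid a negative shift on the final pass) prints the resulting layout:

#include <stdio.h>

int main(void)
{
	unsigned blocksize_bits = 12;		/* assumed: 4096-byte blocks */
	unsigned blocksize = 1 << blocksize_bits;
	unsigned offsets[16], maxs[16];
	unsigned i, offset, max;

	/* order 0 is the plain block bitmap, kept in its own page */
	offsets[0] = 0;
	maxs[0] = blocksize << 3;

	/* orders 1 .. blocksize_bits+1 share the buddy block */
	offset = 0;
	max = blocksize << 2;
	for (i = 1; i <= blocksize_bits + 1; i++) {
		offsets[i] = offset;
		maxs[i] = max;
		offset += max >> 3;	/* == 1 << (blocksize_bits - i) */
		max >>= 1;
	}

	for (i = 0; i <= blocksize_bits + 1; i++)
		printf("order %2u: offset %5u bits, %6u bits wide\n",
		       i, offsets[i], maxs[i]);
	return 0;
}

For 4 KB blocks this shows all higher-order bitmaps fitting comfortably into the 32768 bits of one buddy block, which is why mb_find_buddy() can index every order through these two small tables.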
++
++int ext3_mb_release(struct super_block *sb)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	int i;
++
++	if (!test_opt(sb, MBALLOC))
++		return 0;
++
++	/* release freed, non-committed blocks */
++	spin_lock(&sbi->s_md_lock);
++	list_splice_init(&sbi->s_closed_transaction,
++			&sbi->s_committed_transaction);
++	list_splice_init(&sbi->s_active_transaction,
++			&sbi->s_committed_transaction);
++	spin_unlock(&sbi->s_md_lock);
++	ext3_mb_free_committed_blocks(sb);
++
++	if (sbi->s_group_info) {
++		for (i = 0; i < sbi->s_groups_count; i++) {
++			if (sbi->s_group_info[i] == NULL)
++				continue;
++			kfree(sbi->s_group_info[i]);
++		}
++		kfree(sbi->s_group_info);
++	}
++	if (sbi->s_mb_offsets)
++		kfree(sbi->s_mb_offsets);
++	if (sbi->s_mb_maxs)
++		kfree(sbi->s_mb_maxs);
++	if (sbi->s_buddy_cache)
++		iput(sbi->s_buddy_cache);
++	if (sbi->s_blocks_reserved)
++		printk("ext3-fs: %ld blocks being reserved at umount!\n",
++				sbi->s_blocks_reserved);
++	if (ext3_mb_stats) {
++		printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n",
++			atomic_read(&sbi->s_bal_allocated),
++			atomic_read(&sbi->s_bal_reqs),
++			atomic_read(&sbi->s_bal_success));
++		printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, "
++			"%u 2^N hits, %u breaks\n",
++			atomic_read(&sbi->s_bal_ex_scanned),
++			atomic_read(&sbi->s_bal_goals),
++			atomic_read(&sbi->s_bal_2orders),
++			atomic_read(&sbi->s_bal_breaks));
++		printk("EXT3-fs: mballoc: %lu buddies generated, it took %Lu\n",
++			sbi->s_mb_buddies_generated,
++			sbi->s_mb_generation_time);
++	}
++
++	ext3_mb_history_release(sb);
++
++	return 0;
++}
++
++void ext3_mb_free_committed_blocks(struct super_block *sb)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	int err, i, count = 0, count2 = 0;
++	struct ext3_free_metadata *md;
++	struct ext3_buddy e3b;
++
++	if (list_empty(&sbi->s_committed_transaction))
++		return;
++
++	/* there are still committed blocks to be freed */
++	do {
++		/* get the next array of blocks */
++		md = NULL;
++		spin_lock(&sbi->s_md_lock);
++		if (!list_empty(&sbi->s_committed_transaction)) {
++			md = list_entry(sbi->s_committed_transaction.next,
++					struct ext3_free_metadata, list);
++			list_del(&md->list);
++		}
++		spin_unlock(&sbi->s_md_lock);
++
++		if (md == NULL)
++			break;
++
++		mb_debug("gonna free %u blocks in group %u (0x%p):",
++			 md->num, md->group, md);
++
++		err = ext3_mb_load_buddy(sb, md->group, &e3b);
++		BUG_ON(err != 0);
++
++		/* there are blocks to put in buddy to make them really free */
++		count += md->num;
++		count2++;
++		ext3_lock_group(sb, md->group);
++		for (i = 0; i < md->num; i++) {
++			mb_debug(" %u", md->blocks[i]);
++			mb_free_blocks(&e3b, md->blocks[i], 1);
++		}
++		mb_debug("\n");
++		ext3_unlock_group(sb, md->group);
++
++		/* balance refcounts from ext3_mb_free_metadata() */
++		page_cache_release(e3b.bd_buddy_page);
++		page_cache_release(e3b.bd_bitmap_page);
++
++		kfree(md);
++		ext3_mb_release_desc(&e3b);
++
++	} while (md);
++	mb_debug("freed %u blocks in %u structures\n", count, count2);
++}
++
++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++	if (sbi->s_last_transaction == handle->h_transaction->t_tid)
++		return;
++
++	/* a new transaction! time to close the last one and free blocks
++	 * for the committed transaction. we know that only one transaction
++	 * can be active, so the previous transaction may still be being
++	 * logged and we know that the transaction before the previous one
++	 * is already logged. this means that now we may free blocks freed
++	 * in all transactions before the previous one. hope I'm clear
++	 * enough ... */
we know that only transaction can be ++ * active, so previos transaction can be being logged and we ++ * know that transaction before previous is known to be already ++ * logged. this means that now we may free blocks freed in all ++ * transactions before previous one. hope I'm clear enough ... */ ++ ++ spin_lock(&sbi->s_md_lock); ++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { ++ mb_debug("new transaction %lu, old %lu\n", ++ (unsigned long) handle->h_transaction->t_tid, ++ (unsigned long) sbi->s_last_transaction); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_closed_transaction); ++ sbi->s_last_transaction = handle->h_transaction->t_tid; ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ ext3_mb_free_committed_blocks(sb); ++} ++ ++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, ++ int group, int block, int count) ++{ ++ struct ext3_group_info *db = e3b->bd_info; ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_free_metadata *md; ++ int i; ++ ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ ext3_lock_group(sb, group); ++ for (i = 0; i < count; i++) { ++ md = db->bb_md_cur; ++ if (md && db->bb_tid != handle->h_transaction->t_tid) { ++ db->bb_md_cur = NULL; ++ md = NULL; ++ } ++ ++ if (md == NULL) { ++ ext3_unlock_group(sb, group); ++ md = kmalloc(sizeof(*md), GFP_KERNEL); ++ if (md == NULL) ++ return -ENOMEM; ++ md->num = 0; ++ md->group = group; ++ ++ ext3_lock_group(sb, group); ++ if (db->bb_md_cur == NULL) { ++ spin_lock(&sbi->s_md_lock); ++ list_add(&md->list, &sbi->s_active_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ /* protect buddy cache from being freed, ++ * otherwise we'll refresh it from ++ * on-disk bitmap and lose not-yet-available ++ * blocks */ ++ page_cache_get(e3b->bd_buddy_page); ++ page_cache_get(e3b->bd_bitmap_page); ++ db->bb_md_cur = md; ++ db->bb_tid = handle->h_transaction->t_tid; ++ mb_debug("new md 0x%p for group %u\n", ++ md, md->group); ++ } else { ++ kfree(md); ++ md = db->bb_md_cur; ++ } ++ } ++ ++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); ++ md->blocks[md->num] = block + i; ++ md->num++; ++ if (md->num == EXT3_BB_MAX_BLOCKS) { ++ /* no more space, put full container on a sb's list */ ++ db->bb_md_cur = NULL; ++ } ++ } ++ ext3_unlock_group(sb, group); ++ return 0; ++} ++ ++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, ++ unsigned long block, unsigned long count, ++ int metadata, int *freed) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ unsigned long bit, overflow; ++ struct buffer_head *gd_bh; ++ unsigned long block_group; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ int err = 0, ret; ++ ++ *freed = 0; ++ sb = inode->i_sb; ++ if (!sb) { ++ printk ("ext3_free_blocks: nonexistent device"); ++ return; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ if (block < le32_to_cpu(es->s_first_data_block) || ++ block + count < block || ++ block + count > le32_to_cpu(es->s_blocks_count)) { ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks not in datazone - " ++ "block = %lu, count = %lu", block, count); ++ goto error_return; ++ } ++ ++ ext3_debug("freeing block %lu\n", block); ++ ++do_more: ++ overflow = 0; ++ block_group = (block - le32_to_cpu(es->s_first_data_block)) / 
++ EXT3_BLOCKS_PER_GROUP(sb); ++ bit = (block - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb); ++ /* ++ * Check to see if we are freeing blocks across a group ++ * boundary. ++ */ ++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { ++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); ++ count -= overflow; ++ } ++ brelse(bitmap_bh); ++ bitmap_bh = read_block_bitmap(sb, block_group); ++ if (!bitmap_bh) ++ goto error_return; ++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); ++ if (!gdp) ++ goto error_return; ++ ++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || ++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || ++ in_range (block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group) || ++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks in system zones - " ++ "Block = %lu, count = %lu", ++ block, count); ++ ++ BUFFER_TRACE(bitmap_bh, "getting write access"); ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) ++ goto error_return; ++ ++ /* ++ * We are about to modify some metadata. Call the journal APIs ++ * to unshare ->b_data if a currently-committing transaction is ++ * using it ++ */ ++ BUFFER_TRACE(gd_bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, gd_bh); ++ if (err) ++ goto error_return; ++ ++ err = ext3_mb_load_buddy(sb, block_group, &e3b); ++ if (err) ++ goto error_return; ++ ++#ifdef AGGRESSIVE_CHECK ++ { ++ int i; ++ for (i = 0; i < count; i++) ++ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); ++ } ++#endif ++ mb_clear_bits(bitmap_bh->b_data, bit, count); ++ ++ /* We dirtied the bitmap block */ ++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ ++ if (metadata) { ++ /* blocks being freed are metadata. 
these blocks shouldn't
++		 * be used until this transaction is committed */
++		ext3_mb_free_metadata(handle, &e3b, block_group, bit, count);
++	} else {
++		ext3_lock_group(sb, block_group);
++		mb_free_blocks(&e3b, bit, count);
++		ext3_unlock_group(sb, block_group);
++	}
++
++	spin_lock(sb_bgl_lock(sbi, block_group));
++	gdp->bg_free_blocks_count =
++		cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
++	spin_unlock(sb_bgl_lock(sbi, block_group));
++	percpu_counter_mod(&sbi->s_freeblocks_counter, count);
++
++	ext3_mb_release_desc(&e3b);
++
++	*freed = count;
++
++	/* And the group descriptor block */
++	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
++	ret = ext3_journal_dirty_metadata(handle, gd_bh);
++	if (!err) err = ret;
++
++	if (overflow && !err) {
++		block += count;
++		count = overflow;
++		goto do_more;
++	}
++	sb->s_dirt = 1;
++error_return:
++	brelse(bitmap_bh);
++	ext3_std_error(sb, err);
++	return;
++}
++
++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	int free, ret = -ENOSPC;
++
++	BUG_ON(blocks < 0);
++	spin_lock(&sbi->s_reserve_lock);
++	free = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
++	if (blocks <= free - sbi->s_blocks_reserved) {
++		sbi->s_blocks_reserved += blocks;
++		ret = 0;
++	}
++	spin_unlock(&sbi->s_reserve_lock);
++	return ret;
++}
++
++void ext3_mb_release_blocks(struct super_block *sb, int blocks)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++	BUG_ON(blocks < 0);
++	spin_lock(&sbi->s_reserve_lock);
++	sbi->s_blocks_reserved -= blocks;
++	WARN_ON(sbi->s_blocks_reserved < 0);
++	if (sbi->s_blocks_reserved < 0)
++		sbi->s_blocks_reserved = 0;
++	spin_unlock(&sbi->s_reserve_lock);
++}
++
++int ext3_new_block(handle_t *handle, struct inode *inode,
++		   unsigned long goal, int *errp)
++{
++	int ret, len;
++
++	if (!test_opt(inode->i_sb, MBALLOC)) {
++		ret = ext3_new_block_old(handle, inode, goal, errp);
++		goto out;
++	}
++	len = 1;
++	ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp);
++out:
++	return ret;
++}
++
++
++void ext3_free_blocks(handle_t *handle, struct inode * inode,
++			unsigned long block, unsigned long count, int metadata)
++{
++	struct super_block *sb;
++	int freed;
++
++	sb = inode->i_sb;
++	if (!test_opt(sb, MBALLOC))
++		ext3_free_blocks_sb(handle, sb, block, count, &freed);
++	else
++		ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
++	if (freed)
++		DQUOT_FREE_BLOCK(inode, freed);
++	return;
++}
++
++#define EXT3_ROOT			"ext3"
++#define EXT3_MB_STATS_NAME		"mb_stats"
++#define EXT3_MB_MAX_TO_SCAN_NAME	"mb_max_to_scan"
++#define EXT3_MB_MIN_TO_SCAN_NAME	"mb_min_to_scan"
++
++static int ext3_mb_stats_read(char *page, char **start, off_t off,
++			      int count, int *eof, void *data)
++{
++	int len;
++
++	*eof = 1;
++	if (off != 0)
++		return 0;
++
++	len = sprintf(page, "%ld\n", ext3_mb_stats);
++	*start = page;
++	return len;
++}
++
++static int ext3_mb_stats_write(struct file *file, const char *buffer,
++			       unsigned long count, void *data)
++{
++	char str[32];
++
++	if (count >= sizeof(str)) {
++		printk(KERN_ERR "EXT3: %s string too long, max %u bytes\n",
++		       EXT3_MB_STATS_NAME, (int)sizeof(str));
++		return -EOVERFLOW;
++	}
++
++	if (copy_from_user(str, buffer, count))
++		return -EFAULT;
++	str[count] = '\0';
++
++	/* Only set to 0 or 1 respectively; zero->0; non-zero->1 */
++	ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0);
++	return count;
++}
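All three tunables follow the same 2.6-era read_proc/write_proc pattern: a bounded copy_from_user(), NUL termination, then simple_strtol(). A condensed sketch of that pattern for a hypothetical mb_example tunable (illustrative only, not part of the patch):

static long ext3_mb_example;		/* hypothetical tunable */

static int ext3_mb_example_read(char *page, char **start, off_t off,
				int count, int *eof, void *data)
{
	*eof = 1;
	if (off != 0)			/* the value always fits in one read */
		return 0;
	*start = page;
	return sprintf(page, "%ld\n", ext3_mb_example);
}

static int ext3_mb_example_write(struct file *file, const char *buffer,
				 unsigned long count, void *data)
{
	char str[32];

	if (count >= sizeof(str))	/* refuse over-long input */
		return -EOVERFLOW;
	if (copy_from_user(str, buffer, count))
		return -EFAULT;
	str[count] = '\0';		/* simple_strtol() needs a terminated string */
	ext3_mb_example = simple_strtol(str, NULL, 0);
	return count;
}

Registration is the same create_proc_entry() plus read_proc/write_proc assignment used by init_ext3_proc() below; the NUL termination matters because copy_from_user() copies only the bytes the user wrote.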
++
++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off,
++				    int count, int *eof, void *data)
++{
++	int len;
++
++	*eof = 1;
++	if (off != 0)
++		return 0;
++
++	len = sprintf(page, "%ld\n", ext3_mb_max_to_scan);
++	*start = page;
++	return len;
++}
++
++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer,
++				     unsigned long count, void *data)
++{
++	char str[32];
++	long value;
++
++	if (count >= sizeof(str)) {
++		printk(KERN_ERR "EXT3: %s string too long, max %u bytes\n",
++		       EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str));
++		return -EOVERFLOW;
++	}
++
++	if (copy_from_user(str, buffer, count))
++		return -EFAULT;
++	str[count] = '\0';
++
++	/* the value must be a positive integer */
++	value = simple_strtol(str, NULL, 0);
++	if (value <= 0)
++		return -ERANGE;
++
++	ext3_mb_max_to_scan = value;
++
++	return count;
++}
++
++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off,
++				    int count, int *eof, void *data)
++{
++	int len;
++
++	*eof = 1;
++	if (off != 0)
++		return 0;
++
++	len = sprintf(page, "%ld\n", ext3_mb_min_to_scan);
++	*start = page;
++	return len;
++}
++
++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer,
++				     unsigned long count, void *data)
++{
++	char str[32];
++	long value;
++
++	if (count >= sizeof(str)) {
++		printk(KERN_ERR "EXT3: %s string too long, max %u bytes\n",
++		       EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str));
++		return -EOVERFLOW;
++	}
++
++	if (copy_from_user(str, buffer, count))
++		return -EFAULT;
++	str[count] = '\0';
++
++	/* the value must be a positive integer */
++	value = simple_strtol(str, NULL, 0);
++	if (value <= 0)
++		return -ERANGE;
++
++	ext3_mb_min_to_scan = value;
++
++	return count;
++}
++
++int __init init_ext3_proc(void)
++{
++	struct proc_dir_entry *proc_ext3_mb_stats;
++	struct proc_dir_entry *proc_ext3_mb_max_to_scan;
++	struct proc_dir_entry *proc_ext3_mb_min_to_scan;
++
++	proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs);
++	if (proc_root_ext3 == NULL) {
++		printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT);
++		return -EIO;
++	}
++
++	/* Initialize EXT3_MB_STATS_NAME */
++	proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME,
++				S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++	if (proc_ext3_mb_stats == NULL) {
++		printk(KERN_ERR "EXT3: Unable to create %s\n",
++		       EXT3_MB_STATS_NAME);
++		remove_proc_entry(EXT3_ROOT, proc_root_fs);
++		return -EIO;
++	}
++
++	proc_ext3_mb_stats->data = NULL;
++	proc_ext3_mb_stats->read_proc = ext3_mb_stats_read;
++	proc_ext3_mb_stats->write_proc = ext3_mb_stats_write;
++
++	/* Initialize EXT3_MAX_TO_SCAN_NAME */
++	proc_ext3_mb_max_to_scan = create_proc_entry(
++			EXT3_MB_MAX_TO_SCAN_NAME,
++			S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++	if (proc_ext3_mb_max_to_scan == NULL) {
++		printk(KERN_ERR "EXT3: Unable to create %s\n",
++		       EXT3_MB_MAX_TO_SCAN_NAME);
++		remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++		remove_proc_entry(EXT3_ROOT, proc_root_fs);
++		return -EIO;
++	}
++
++	proc_ext3_mb_max_to_scan->data = NULL;
++	proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read;
++	proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write;
++
++	/* Initialize EXT3_MIN_TO_SCAN_NAME */
++	proc_ext3_mb_min_to_scan = create_proc_entry(
++			EXT3_MB_MIN_TO_SCAN_NAME,
++			S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++	if (proc_ext3_mb_min_to_scan == NULL) {
++		printk(KERN_ERR "EXT3: Unable to create %s\n",
++		       EXT3_MB_MIN_TO_SCAN_NAME);
++		remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++		remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++		remove_proc_entry(EXT3_ROOT, proc_root_fs);
++		return -EIO;
++	}
++ ++ proc_ext3_mb_min_to_scan->data = NULL; ++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; ++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; ++ ++ return 0; ++} ++ ++void exit_ext3_proc(void) ++{ ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++} ++ +Index: linux-stage/fs/ext3/extents.c +=================================================================== +--- linux-stage.orig/fs/ext3/extents.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/extents.c 2006-07-16 02:29:49.000000000 +0800 +@@ -771,7 +771,7 @@ cleanup: + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st + path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + +@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t + bh = sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-stage/fs/ext3/xattr.c +=================================================================== +--- linux-stage.orig/fs/ext3/xattr.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/xattr.c 2006-07-16 02:29:49.000000000 +0800 +@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); +- ext3_free_blocks(handle, inode, bh->b_blocknr, 1); ++ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { +@@ -805,7 +805,7 @@ inserted: + new_bh = sb_getblk(sb, block); + if (!new_bh) { + getblk_failed: +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + error = -EIO; + goto cleanup; + } +Index: linux-stage/fs/ext3/balloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/balloc.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/balloc.c 2006-07-16 02:33:13.000000000 +0800 +@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ + * + * Return buffer_head on success or NULL in case of failure. 
+ */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -490,24 +490,6 @@ error_return: + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- ext3_fsblk_t block, unsigned long count) +-{ +- struct super_block * sb; +- unsigned long dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1463,7 +1445,7 @@ out: + return 0; + } + +-ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, ++ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int *errp) + { + unsigned long count = 1; +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/super.c 2006-07-16 02:29:49.000000000 +0800 +@@ -391,6 +391,7 @@ static void ext3_put_super (struct super + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -641,7 +642,7 @@ enum { + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, +- Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_extdebug, Opt_mballoc, + Opt_grpquota + }; + +@@ -696,6 +697,7 @@ static match_table_t tokens = { + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -1047,6 +1049,9 @@ clear_qf_name: + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt (sbi->s_mount_opt, MBALLOC); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1773,6 +1778,7 @@ static int ext3_fill_super (struct super + "writeback"); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + lock_kernel(); + return 0; + +@@ -2712,7 +2718,13 @@ static struct file_system_type ext3_fs_t + + static int __init init_ext3_fs(void) + { +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2734,6 +2746,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); + } + + int ext3_prep_san_write(struct inode *inode, long *blocks, +Index: linux-stage/fs/ext3/Makefile +=================================================================== +--- linux-stage.orig/fs/ext3/Makefile 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/Makefile 2006-07-16 02:29:49.000000000 +0800 +@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o ++ extents.o 
mballoc.o
+ 
+ ext3-$(CONFIG_EXT3_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+Index: linux-stage/include/linux/ext3_fs.h
+===================================================================
+--- linux-stage.orig/include/linux/ext3_fs.h	2006-07-16 02:29:43.000000000 +0800
++++ linux-stage/include/linux/ext3_fs.h	2006-07-16 02:29:49.000000000 +0800
+@@ -53,6 +53,14 @@
+ #define ext3_debug(f, a...)	do {} while (0)
+ #endif
+ 
++#define EXT3_MULTIBLOCK_ALLOCATOR	1
++
++#define EXT3_MB_HINT_MERGE		1
++#define EXT3_MB_HINT_RESERVED		2
++#define EXT3_MB_HINT_METADATA		4
++#define EXT3_MB_HINT_FIRST		8
++#define EXT3_MB_HINT_BEST		16
++
+ /*
+  * Special inodes numbers
+  */
+@@ -379,6 +387,7 @@ struct ext3_inode {
+ #define EXT3_MOUNT_IOPEN_NOPRIV	0x800000/* Make iopen world-readable */
+ #define EXT3_MOUNT_EXTENTS	0x1000000/* Extents support */
+ #define EXT3_MOUNT_EXTDEBUG	0x2000000/* Extents debug */
++#define EXT3_MOUNT_MBALLOC	0x4000000/* Buddy allocation support */
+ 
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+@@ -749,12 +758,12 @@ ext3_group_first_block_no(struct super_b
+ /* balloc.c */
+ extern int ext3_bg_has_super(struct super_block *sb, int group);
+ extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
+-extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
+-			ext3_fsblk_t goal, int *errp);
++//extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
++//			ext3_fsblk_t goal, int *errp);
+ extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
+ 			ext3_fsblk_t goal, unsigned long *count, int *errp);
+ extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
+-			ext3_fsblk_t block, unsigned long count);
++			ext3_fsblk_t block, unsigned long count, int metadata);
+ extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
+ 			ext3_fsblk_t block, unsigned long count,
+ 			unsigned long *pdquot_freed_blocks);
+@@ -881,6 +890,17 @@ extern void ext3_extents_initialize_bloc
+ extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
+ 			  unsigned int cmd, unsigned long arg);
+ 
++/* mballoc.c */
++extern long ext3_mb_stats;
++extern long ext3_mb_max_to_scan;
++extern int ext3_mb_init(struct super_block *, int);
++extern int ext3_mb_release(struct super_block *);
++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
++extern int ext3_mb_reserve_blocks(struct super_block *, int);
++extern void ext3_mb_release_blocks(struct super_block *, int);
++int __init init_ext3_proc(void);
++void exit_ext3_proc(void);
++
+ #endif	/* __KERNEL__ */
+ 
+ /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ +Index: linux-stage/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs_sb.h 2006-07-16 02:29:49.000000000 +0800 +@@ -21,8 +21,14 @@ + #include + #include + #include ++#include + #endif + #include ++#include ++ ++struct ext3_buddy_group_blocks; ++struct ext3_mb_history; ++#define EXT3_BB_MAX_BLOCKS + + /* + * third extended-fs super-block data in memory +@@ -78,6 +84,38 @@ struct ext3_sb_info { + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_group_info **s_group_info; ++ struct inode *s_buddy_cache; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ unsigned short *s_mb_offsets, *s_mb_maxs; ++ ++ /* history to debug policy */ ++ struct ext3_mb_history *s_mb_history; ++ int s_mb_history_cur; ++ int s_mb_history_max; ++ struct proc_dir_entry *s_mb_proc; ++ spinlock_t s_mb_history_lock; ++ ++ /* stats for buddy allocator */ ++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ++ atomic_t s_bal_success; /* we found long enough chunks */ ++ atomic_t s_bal_allocated; /* in blocks */ ++ atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_goals; /* goal hits */ ++ atomic_t s_bal_breaks; /* too long searches */ ++ atomic_t s_bal_2orders; /* 2^order hits */ ++ spinlock_t s_bal_lock; ++ unsigned long s_mb_buddies_generated; ++ unsigned long long s_mb_generation_time; + }; + + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/inode.c 2006-07-16 02:29:49.000000000 +0800 +@@ -562,7 +562,7 @@ static int ext3_alloc_blocks(handle_t *h + return ret; + failed_out: + for (i = 0; i bd_mutex); ++ dev_clear_rdonly(bdev); + bdput(bdev); + return ret; + } +Index: linux-2.6/block/ll_rw_blk.c +=================================================================== +--- linux-2.6.orig/block/ll_rw_blk.c 2006-07-10 22:30:08.000000000 +0800 ++++ linux-2.6/block/ll_rw_blk.c 2006-07-15 16:15:14.000000000 +0800 +@@ -2993,6 +2993,8 @@ static void handle_bad_sector(struct bio + set_bit(BIO_EOF, &bio->bi_flags); + } + ++int dev_check_rdonly(struct block_device *bdev); ++ + /** + * generic_make_request: hand a buffer to its device driver for I/O + * @bio: The bio describing the location in memory and on the device. +@@ -3076,6 +3078,12 @@ end_io: + + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) + goto end_io; ++ /* this is cfs's dev_rdonly check */ ++ if (bio->bi_rw == WRITE && ++ dev_check_rdonly(bio->bi_bdev)) { ++ bio_endio(bio, bio->bi_size, 0); ++ break; ++ } + + /* + * If this device has partitions, remap block n +@@ -3673,6 +3681,91 @@ void swap_io_context(struct io_context * + *ioc2 = temp; + } + EXPORT_SYMBOL(swap_io_context); ++ /* ++ * Debug code for turning block devices "read-only" (will discard writes ++ * silently). This is for filesystem crash/recovery testing. 
++ */ ++struct deventry { ++ dev_t dev; ++ struct deventry *next; ++}; ++ ++static struct deventry *devlist = NULL; ++static spinlock_t devlock = SPIN_LOCK_UNLOCKED; ++ ++int dev_check_rdonly(struct block_device *bdev) ++{ ++ struct deventry *cur; ++ if (!bdev) return 0; ++ spin_lock(&devlock); ++ cur = devlist; ++ while(cur) { ++ if (bdev->bd_dev == cur->dev) { ++ spin_unlock(&devlock); ++ return 1; ++ } ++ cur = cur->next; ++ } ++ spin_unlock(&devlock); ++ return 0; ++} ++ ++void dev_set_rdonly(struct block_device *bdev) ++{ ++ struct deventry *newdev, *cur; ++ ++ if (!bdev) ++ return; ++ newdev = kmalloc(sizeof(struct deventry), GFP_KERNEL); ++ if (!newdev) ++ return; ++ ++ spin_lock(&devlock); ++ cur = devlist; ++ while(cur) { ++ if (bdev->bd_dev == cur->dev) { ++ spin_unlock(&devlock); ++ kfree(newdev); ++ return; ++ } ++ cur = cur->next; ++ } ++ newdev->dev = bdev->bd_dev; ++ newdev->next = devlist; ++ devlist = newdev; ++ spin_unlock(&devlock); ++ printk(KERN_WARNING "Turning device %s (%#x) read-only\n", ++ bdev->bd_disk ? bdev->bd_disk->disk_name : "", bdev->bd_dev); ++} ++ ++void dev_clear_rdonly(struct block_device *bdev) ++{ ++ struct deventry *cur, *last = NULL; ++ if (!bdev) return; ++ spin_lock(&devlock); ++ cur = devlist; ++ while(cur) { ++ if (bdev->bd_dev == cur->dev) { ++ if (last) ++ last->next = cur->next; ++ else ++ devlist = cur->next; ++ spin_unlock(&devlock); ++ kfree(cur); ++ printk(KERN_WARNING "Removing read-only on %s (%#x)\n", ++ bdev->bd_disk ? bdev->bd_disk->disk_name : ++ "unknown block", bdev->bd_dev); ++ return; ++ } ++ last = cur; ++ cur = cur->next; ++ } ++ spin_unlock(&devlock); ++} ++ ++EXPORT_SYMBOL(dev_set_rdonly); ++EXPORT_SYMBOL(dev_clear_rdonly); ++EXPORT_SYMBOL(dev_check_rdonly); + + /* + * sysfs parts below +Index: linux-2.6/include/linux/fs.h +=================================================================== +--- linux-2.6.orig/include/linux/fs.h 2006-07-15 16:14:58.000000000 +0800 ++++ linux-2.6/include/linux/fs.h 2006-07-15 16:15:14.000000000 +0800 +@@ -1648,6 +1648,10 @@ extern void file_kill(struct file *f); + struct bio; + extern void submit_bio(int, struct bio *); + extern int bdev_read_only(struct block_device *); ++#define HAVE_CLEAR_RDONLY_ON_PUT ++void dev_set_rdonly(struct block_device *bdev); ++int dev_check_rdonly(struct block_device *bdev); ++void dev_clear_rdonly(struct block_device *bdev); + extern int set_blocksize(struct block_device *, int); + extern int sb_set_blocksize(struct super_block *, int); + extern int sb_min_blocksize(struct super_block *, int); diff --git a/lustre/kernel_patches/patches/export-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/export-2.6.18-vanilla.patch new file mode 100644 index 0000000..a6813e6 --- /dev/null +++ b/lustre/kernel_patches/patches/export-2.6.18-vanilla.patch @@ -0,0 +1,24 @@ +Index: linux-2.6/fs/open.c +=================================================================== +--- linux-2.6.orig/fs/open.c 2006-07-15 16:10:37.000000000 +0800 ++++ linux-2.6/fs/open.c 2006-07-15 16:22:04.000000000 +0800 +@@ -808,7 +808,6 @@ asmlinkage long sys_lchown(const char __ + return error; + } + +- + asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group) + { + struct file * file; +Index: linux-2.6/fs/jbd/journal.c +=================================================================== +--- linux-2.6.orig/fs/jbd/journal.c 2006-07-15 16:13:50.000000000 +0800 ++++ linux-2.6/fs/jbd/journal.c 2006-07-15 16:22:04.000000000 +0800 +@@ -74,6 +74,7 @@ EXPORT_SYMBOL(journal_abort); + 
EXPORT_SYMBOL(journal_errno); + EXPORT_SYMBOL(journal_ack_err); + EXPORT_SYMBOL(journal_clear_err); ++EXPORT_SYMBOL(log_start_commit); + EXPORT_SYMBOL(log_wait_commit); + EXPORT_SYMBOL(journal_start_commit); + EXPORT_SYMBOL(journal_force_commit_nested); diff --git a/lustre/kernel_patches/patches/export-show_task-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/export-show_task-2.6.18-vanilla.patch new file mode 100644 index 0000000..834c886 --- /dev/null +++ b/lustre/kernel_patches/patches/export-show_task-2.6.18-vanilla.patch @@ -0,0 +1,25 @@ +Index: linux-2.6/kernel/sched.c +=================================================================== +--- linux-2.6.orig/kernel/sched.c 2006-07-15 11:51:46.000000000 +0800 ++++ linux-2.6/kernel/sched.c 2006-07-15 16:24:35.000000000 +0800 +@@ -4652,7 +4652,7 @@ static inline struct task_struct *younge + + static const char stat_nam[] = "RSDTtZX"; + +-static void show_task(struct task_struct *p) ++void show_task(struct task_struct *p) + { + struct task_struct *relative; + unsigned long free = 0; +@@ -4698,9 +4698,10 @@ static void show_task(struct task_struct + else + printk(" (NOTLB)\n"); + +- if (state != TASK_RUNNING) ++ if (state != TASK_RUNNING || p == current) + show_stack(p, NULL); + } ++EXPORT_SYMBOL(show_task); + + void show_state(void) + { diff --git a/lustre/kernel_patches/patches/export-truncate-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/export-truncate-2.6.18-vanilla.patch new file mode 100644 index 0000000..f956ef4 --- /dev/null +++ b/lustre/kernel_patches/patches/export-truncate-2.6.18-vanilla.patch @@ -0,0 +1,39 @@ + include/linux/mm.h | 2 ++ + mm/truncate.c | 4 ++-- + 2 files changed, 4 insertions(+), 2 deletions(-) + +Index: linux-2.6/include/linux/mm.h +=================================================================== +--- linux-2.6.orig/include/linux/mm.h 2006-07-06 23:41:48.000000000 +0800 ++++ linux-2.6/include/linux/mm.h 2006-07-15 12:55:07.000000000 +0800 +@@ -529,6 +529,8 @@ static __always_inline void *lowmem_page + { + return __va(page_to_pfn(page) << PAGE_SHIFT); + } ++/* truncate.c */ ++extern void truncate_complete_page(struct address_space *mapping,struct page *); + + #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) + #define HASHED_PAGE_VIRTUAL +Index: linux-2.6/mm/truncate.c +=================================================================== +--- linux-2.6.orig/mm/truncate.c 2006-06-24 14:22:39.000000000 +0800 ++++ linux-2.6/mm/truncate.c 2006-07-15 12:55:07.000000000 +0800 +@@ -33,7 +33,7 @@ static inline void truncate_partial_page + * its lock, b) when a concurrent invalidate_inode_pages got there first and + * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. + */ +-static void ++void + truncate_complete_page(struct address_space *mapping, struct page *page) + { + if (page->mapping != mapping) +@@ -48,7 +48,7 @@ truncate_complete_page(struct address_sp + remove_from_page_cache(page); + page_cache_release(page); /* pagecache ref */ + } +- ++EXPORT_SYMBOL_GPL(truncate_complete_page); + /* + * This is for invalidate_inode_pages(). That function can be called at + * any time, and is not supposed to throw away dirty pages. 
But pages can diff --git a/lustre/kernel_patches/patches/export_symbols-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/export_symbols-2.6.18-vanilla.patch new file mode 100644 index 0000000..18a9815 --- /dev/null +++ b/lustre/kernel_patches/patches/export_symbols-2.6.18-vanilla.patch @@ -0,0 +1,64 @@ +Index: linux-2.6/fs/filesystems.c +=================================================================== +--- linux-2.6.orig/fs/filesystems.c 2006-07-15 16:08:35.000000000 +0800 ++++ linux-2.6/fs/filesystems.c 2006-07-15 16:14:19.000000000 +0800 +@@ -29,7 +29,9 @@ + */ + + static struct file_system_type *file_systems; +-static DEFINE_RWLOCK(file_systems_lock); ++DEFINE_RWLOCK(file_systems_lock); ++ ++EXPORT_SYMBOL(file_systems_lock); + + /* WARNING: This can be used only if we _already_ own a reference */ + void get_filesystem(struct file_system_type *fs) +Index: linux-2.6/include/linux/fs.h +=================================================================== +--- linux-2.6.orig/include/linux/fs.h 2006-07-15 16:10:37.000000000 +0800 ++++ linux-2.6/include/linux/fs.h 2006-07-15 16:14:19.000000000 +0800 +@@ -1768,6 +1768,7 @@ static inline ssize_t blockdev_direct_IO + + extern const struct file_operations generic_ro_fops; + ++extern rwlock_t file_systems_lock; + #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) + + extern int vfs_readlink(struct dentry *, char __user *, int, const char *); +Index: linux-2.6/fs/namespace.c +=================================================================== +--- linux-2.6.orig/fs/namespace.c 2006-07-15 16:10:33.000000000 +0800 ++++ linux-2.6/fs/namespace.c 2006-07-15 16:14:19.000000000 +0800 +@@ -1641,6 +1641,7 @@ void set_fs_pwd(struct fs_struct *fs, st + mntput(old_pwdmnt); + } + } ++EXPORT_SYMBOL(set_fs_pwd); + + static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd) + { +Index: linux-2.6/kernel/exit.c +=================================================================== +--- linux-2.6.orig/kernel/exit.c 2006-07-15 16:08:34.000000000 +0800 ++++ linux-2.6/kernel/exit.c 2006-07-15 16:14:19.000000000 +0800 +@@ -305,6 +305,8 @@ static void reparent_to_init(void) + switch_uid(INIT_USER); + } + ++EXPORT_SYMBOL(reparent_to_init); ++ + void __set_special_pids(pid_t session, pid_t pgrp) + { + struct task_struct *curr = current->group_leader; +Index: linux-2.6/fs/dcache.c +=================================================================== +--- linux-2.6.orig/fs/dcache.c 2006-07-15 16:14:00.000000000 +0800 ++++ linux-2.6/fs/dcache.c 2006-07-15 16:14:19.000000000 +0800 +@@ -1628,6 +1628,7 @@ int is_subdir(struct dentry * new_dentry + + return result; + } ++EXPORT_SYMBOL(is_subdir); + + void d_genocide(struct dentry *root) + { diff --git a/lustre/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch new file mode 100644 index 0000000..e89e8e7 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch @@ -0,0 +1,2935 @@ +Index: linux-stage/fs/ext3/extents.c +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-stage/fs/ext3/extents.c 2006-07-16 14:10:21.000000000 +0800 +@@ -0,0 +1,2347 @@ ++/* ++ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * 
published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++/* ++ * Extents support for EXT3 ++ * ++ * TODO: ++ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() ++ * - ext3_ext_calc_credits() could take 'mergable' into account ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - smart tree reduction ++ * - arch-independence ++ * common on-disk format for big/little-endian arch ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++static inline int ext3_ext_check_header(struct ext3_extent_header *eh) ++{ ++ if (eh->eh_magic != EXT3_EXT_MAGIC) { ++ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", ++ (unsigned)eh->eh_magic); ++ return -EIO; ++ } ++ if (eh->eh_max == 0) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n", ++ (unsigned)eh->eh_max); ++ return -EIO; ++ } ++ if (eh->eh_entries > eh->eh_max) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", ++ (unsigned)eh->eh_entries); ++ return -EIO; ++ } ++ return 0; ++} ++ ++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} ++ ++static int inline ++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->get_write_access) ++ return tree->ops->get_write_access(h,tree->buffer); ++ else ++ return 0; ++} ++ ++static int inline ++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->mark_buffer_dirty) ++ return tree->ops->mark_buffer_dirty(h,tree->buffer); ++ else ++ return 0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ ++ if (path->p_bh) { ++ /* path points to block */ ++ err = ext3_journal_get_write_access(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_get_access_for_root(handle, tree); ++ } ++ return err; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ if (path->p_bh) { ++ /* path points to block */ ++ err =ext3_journal_dirty_metadata(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_mark_root_dirty(handle, tree); ++ } ++ return err; ++} ++ ++static int inline ++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, struct ext3_extent *ex, ++ int *err) ++{ ++ int goal, depth, newblock; ++ struct inode *inode; ++ ++ EXT_ASSERT(tree); ++ if (tree->ops->new_block) ++ return tree->ops->new_block(handle, tree, path, ex, err); ++ ++ inode = tree->inode; ++ 
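++	/* choose the allocation goal for the new tree block: prefer a
++	 * block near the one the parent level of the path already points
++	 * at; otherwise fall back to the inode's block group, offset
++	 * per-PID so that concurrent allocators spread out */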
depth = EXT_DEPTH(tree); ++ if (path && depth > 0) { ++ goal = path[depth-1].p_block; ++ } else { ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ ++ bg_start = (ei->i_block_group * ++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ goal = bg_start + colour; ++ } ++ ++ newblock = ext3_new_block(handle, inode, goal, err); ++ return newblock; ++} ++ ++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *neh; ++ neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation++; ++} ++ ++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 5; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int k, l = path->p_depth; ++ ++ ext_debug(tree, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(tree, " %d->%d", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ } else if (path->p_ext) { ++ ext_debug(tree, " %d:%d:%d", ++ path->p_ext->ee_block, ++ path->p_ext->ee_len, ++ path->p_ext->ee_start); ++ } else ++ ext_debug(tree, " []"); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; ++ int i; ++ ++ if (!path) ++ return; ++ ++ eh = path[depth].p_hdr; ++ ex = EXT_FIRST_EXTENT(eh); ++ ++ for (i = 0; i < eh->eh_entries; i++, ex++) { ++ ext_debug(tree, "%d:%d:%d ", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_drop_refs(struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) { ++ if (path->p_bh) { ++ brelse(path->p_bh); ++ path->p_bh = NULL; ++ } ++ } ++} ++ ++/* ++ * binary search for closest index by given block ++ */ ++static inline void ++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent_idx *ix; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_entries > 0); ++ ++ ext_debug(tree, "binsearch for %d(idx): ", block); ++ ++ path->p_idx = 
ix = EXT_FIRST_INDEX(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ix[l + k].ei_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ix += l; ++ path->p_idx = ix; ++ ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf); ++ ++ while (l++ < r) { ++ if (block < ix->ei_block) ++ break; ++ path->p_idx = ix++; ++ } ++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent_idx *chix; ++ ++ chix = ix = EXT_FIRST_INDEX(eh); ++ for (k = 0; k < eh->eh_entries; k++, ix++) { ++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { ++ printk("k=%d, ix=0x%p, first=0x%p\n", k, ++ ix, EXT_FIRST_INDEX(eh)); ++ printk("%u <= %u\n", ++ ix->ei_block,ix[-1].ei_block); ++ } ++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); ++ if (block < ix->ei_block) ++ break; ++ chix = ix; ++ } ++ EXT_ASSERT(chix == path->p_idx); ++ } ++#endif ++} ++ ++/* ++ * binary search for closest extent by given block ++ */ ++static inline void ++ext3_ext_binsearch(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent *ex; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ ++ if (eh->eh_entries == 0) { ++ /* ++ * this leaf is empty yet: ++ * we get such a leaf in split/add case ++ */ ++ return; ++ } ++ ++ ext_debug(tree, "binsearch for %d: ", block); ++ ++ path->p_ext = ex = EXT_FIRST_EXTENT(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ex[l + k].ee_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ex += l; ++ path->p_ext = ex; ++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++ while (l++ < r) { ++ if (block < ex->ee_block) ++ break; ++ path->p_ext = ex++; ++ } ++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent *chex; ++ ++ chex = ex = EXT_FIRST_EXTENT(eh); ++ for (k = 0; k < eh->eh_entries; k++, ex++) { ++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); ++ if (block < ex->ee_block) ++ break; ++ chex = ex; ++ } ++ EXT_ASSERT(chex == path->p_ext); ++ } ++#endif ++} ++ ++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *eh; ++ ++ BUG_ON(tree->buffer_len == 0); ++ ext3_ext_get_access_for_root(handle, tree); ++ eh = EXT_ROOT_HDR(tree); ++ eh->eh_depth = 0; ++ eh->eh_entries = 0; ++ eh->eh_magic = EXT3_EXT_MAGIC; ++ eh->eh_max = ext3_ext_space_root(tree); ++ ext3_ext_mark_root_dirty(handle, tree); ++ ext3_ext_invalidate_cache(tree); ++ return 0; ++} ++ ++struct ext3_ext_path * ++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ struct buffer_head *bh; ++ int depth, i, ppos = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ eh = EXT_ROOT_HDR(tree); ++ EXT_ASSERT(eh); ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ ++ i = depth = EXT_DEPTH(tree); ++ EXT_ASSERT(eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* account possible depth increase */ ++ if (!path) { ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), ++ GFP_NOFS); ++ if (!path) 
++ return ERR_PTR(-ENOMEM); ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[0].p_hdr = eh; ++ ++ /* walk through the tree */ ++ while (i) { ++ ext_debug(tree, "depth %d: num %d, max %d\n", ++ ppos, eh->eh_entries, eh->eh_max); ++ ext3_ext_binsearch_idx(tree, path + ppos, block); ++ path[ppos].p_block = path[ppos].p_idx->ei_leaf; ++ path[ppos].p_depth = i; ++ path[ppos].p_ext = NULL; ++ ++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); ++ if (!bh) ++ goto err; ++ ++ eh = EXT_BLOCK_HDR(bh); ++ ppos++; ++ EXT_ASSERT(ppos <= depth); ++ path[ppos].p_bh = bh; ++ path[ppos].p_hdr = eh; ++ i--; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ } ++ ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ path[ppos].p_idx = NULL; ++ ++ if (ext3_ext_check_header(eh)) ++ goto err; ++ ++ /* find extent */ ++ ext3_ext_binsearch(tree, path + ppos, block); ++ ++ ext3_ext_show_path(tree, path); ++ ++ return path; ++ ++err: ++ printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ return ERR_PTR(-EIO); ++} ++ ++/* ++ * insert new index [logical;ptr] into the block at cupr ++ * it check where to insert: before curp or after curp ++ */ ++static int ext3_ext_insert_index(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *curp, ++ int logical, int ptr) ++{ ++ struct ext3_extent_idx *ix; ++ int len, err; ++ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ return err; ++ ++ EXT_ASSERT(logical != curp->p_idx->ei_block); ++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; ++ if (logical > curp->p_idx->ei_block) { ++ /* insert after */ ++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d after: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ } ++ ix = curp->p_idx + 1; ++ } else { ++ /* insert before */ ++ len = len * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d before: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->ei_block = logical; ++ ix->ei_leaf = ptr; ++ curp->p_hdr->eh_entries++; ++ ++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); ++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); ++ ++ err = ext3_ext_dirty(handle, tree, curp); ++ ext3_std_error(tree->inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at: ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extens and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ int i = at, k, m, a; ++ unsigned long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? 
*/ ++ /* FIXME: now desicion is simplest: at current extent */ ++ ++ /* if current leaf will be splitted, then we should use ++ * border from split point */ ++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].ee_block; ++ ext_debug(tree, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->ee_block; ++ ext_debug(tree, "leaf will be added." ++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. so, index won't ++ * be inserted and tree will be in consistent ++ * state. next mount will repair buffers too ++ */ ++ ++ /* ++ * get array to track all allocated blocks ++ * we need this to handle errors and free blocks ++ * upon them ++ */ ++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); ++ if (!ablocks) ++ return -ENOMEM; ++ memset(ablocks, 0, sizeof(unsigned long) * depth); ++ ++ /* allocate all needed blocks */ ++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); ++ for (a = 0; a < depth - at; a++) { ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ goto cleanup; ++ ablocks[a] = newblock; ++ } ++ ++ /* initialize new leaf */ ++ newblock = ablocks[--a]; ++ EXT_ASSERT(newblock); ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 0; ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_depth = 0; ++ ex = EXT_FIRST_EXTENT(neh); ++ ++ /* move remain of path[depth] to the new leaf */ ++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); ++ /* start copy from next extent */ ++ /* TODO: we could do it by single memmove */ ++ m = 0; ++ path[depth].p_ext++; ++ while (path[depth].p_ext <= ++ EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", ++ path[depth].p_ext->ee_block, ++ path[depth].p_ext->ee_start, ++ path[depth].p_ext->ee_len, ++ newblock); ++ memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent)); ++ neh->eh_entries++; ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old leaf */ ++ if (m) { ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ path[depth].p_hdr->eh_entries -= m; ++ if ((err = ext3_ext_dirty(handle, tree, path + depth))) ++ goto cleanup; ++ ++ } ++ ++ /* create intermediate indexes */ ++ k = depth - at - 1; ++ EXT_ASSERT(k >= 0); ++ if (k) ++ ext_debug(tree, "create %d intermediate indices\n", k); ++ /* insert new index into current index block */ ++ /* current depth stored in i var */ ++ i = depth - 1; ++ while (k--) { ++ oldblock = newblock; ++ newblock = ablocks[--a]; ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 1; ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ neh->eh_depth = depth - i; ++ fidx = EXT_FIRST_INDEX(neh); ++ fidx->ei_block = 
border; ++ fidx->ei_leaf = oldblock; ++ ++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", ++ i, newblock, border, oldblock); ++ /* copy indexes */ ++ m = 0; ++ path[i].p_idx++; ++ ++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); ++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == ++ EXT_LAST_INDEX(path[i].p_hdr)); ++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ++ ext_debug(tree, "%d: move %d:%d in new index %lu\n", ++ i, path[i].p_idx->ei_block, ++ path[i].p_idx->ei_leaf, newblock); ++ memmove(++fidx, path[i].p_idx++, ++ sizeof(struct ext3_extent_idx)); ++ neh->eh_entries++; ++ EXT_ASSERT(neh->eh_entries <= neh->eh_max); ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old index */ ++ if (m) { ++ err = ext3_ext_get_access(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ path[i].p_hdr->eh_entries -= m; ++ err = ext3_ext_dirty(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ } ++ ++ i--; ++ } ++ ++ /* insert new index */ ++ if (!err) ++ err = ext3_ext_insert_index(handle, tree, path + at, ++ border, newblock); ++ ++cleanup: ++ if (bh) { ++ if (buffer_locked(bh)) ++ unlock_buffer(bh); ++ brelse(bh); ++ } ++ ++ if (err) { ++ /* free all allocated blocks in error case */ ++ for (i = 0; i < depth; i++) { ++ if (!ablocks[i]) ++ continue; ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ } ++ } ++ kfree(ablocks); ++ ++ return err; ++} ++ ++/* ++ * routine implements tree growing procedure: ++ * - allocates new block ++ * - moves top-level data (index block or leaf) into the new block ++ * - initialize new top-level, creating index that points to the ++ * just created block ++ */ ++static int ext3_ext_grow_indepth(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp = path; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct buffer_head *bh; ++ unsigned long newblock; ++ int err = 0; ++ ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ return err; ++ ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ ext3_std_error(tree->inode->i_sb, err); ++ return err; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) { ++ unlock_buffer(bh); ++ goto out; ++ } ++ ++ /* move top-level index/leaf into new block */ ++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); ++ ++ /* set size of new block */ ++ neh = EXT_BLOCK_HDR(bh); ++ /* old root could have indexes or leaves ++ * so calculate eh_max right way */ ++ if (EXT_DEPTH(tree)) ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ else ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto out; ++ ++ /* create index in new top-level index: num,max,pointer */ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ goto out; ++ ++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; ++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); ++ curp->p_hdr->eh_entries = 1; ++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); ++ /* FIXME: it works, but actually path[0] can be index */ ++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; ++ curp->p_idx->ei_leaf = newblock; ++ ++ neh 
= EXT_ROOT_HDR(tree); ++ fidx = EXT_FIRST_INDEX(neh); ++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", ++ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); ++ ++ neh->eh_depth = path->p_depth + 1; ++ err = ext3_ext_dirty(handle, tree, curp); ++out: ++ brelse(bh); ++ ++ return err; ++} ++ ++/* ++ * routine finds empty index and adds new leaf. if no free index found ++ * then it requests in-depth growing ++ */ ++static int ext3_ext_create_new_leaf(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp; ++ int depth, i, err = 0; ++ ++repeat: ++ i = depth = EXT_DEPTH(tree); ++ ++ /* walk up to the tree and look for free index entry */ ++ curp = path + depth; ++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { ++ i--; ++ curp--; ++ } ++ ++ /* we use already allocated block for index block ++ * so, subsequent data blocks should be contigoues */ ++ if (EXT_HAS_FREE_INDEX(curp)) { ++ /* if we found index with free entry, then use that ++ * entry: create all needed subtree and add new leaf */ ++ err = ext3_ext_split(handle, tree, path, newext, i); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ } else { ++ /* tree is full, time to grow in depth */ ++ err = ext3_ext_grow_indepth(handle, tree, path, newext); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ ++ /* ++ * only first (depth 0 -> 1) produces free space ++ * in all other cases we have to split growed tree ++ */ ++ depth = EXT_DEPTH(tree); ++ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { ++ /* now we need split */ ++ goto repeat; ++ } ++ } ++ ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++/* ++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK ++ * NOTE: it consider block number from index entry as ++ * allocated block. thus, index entries have to be consistent ++ * with leafs ++ */ ++static unsigned long ++ext3_ext_next_allocated_block(struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ if (depth == 0 && path->p_ext == NULL) ++ return EXT_MAX_BLOCK; ++ ++ /* FIXME: what if index isn't full ?! */ ++ while (depth >= 0) { ++ if (depth == path->p_depth) { ++ /* leaf */ ++ if (path[depth].p_ext != ++ EXT_LAST_EXTENT(path[depth].p_hdr)) ++ return path[depth].p_ext[1].ee_block; ++ } else { ++ /* index */ ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ } ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * returns first allocated block from next leaf or EXT_MAX_BLOCK ++ */ ++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ /* zero-tree has no leaf blocks at all */ ++ if (depth == 0) ++ return EXT_MAX_BLOCK; ++ ++ /* go to index block */ ++ depth--; ++ ++ while (depth >= 0) { ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * if leaf gets modified and modified extent is first in the leaf ++ * then we have to correct all indexes above ++ * TODO: do we need to correct tree in all cases? 
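++ * (an index entry only caches the first block of its child, so the
++ * walk below may stop at the first level whose entry is not leftmost)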
++ */ ++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent *ex; ++ unsigned long border; ++ int k, err = 0; ++ ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(eh); ++ ++ if (depth == 0) { ++ /* there is no tree at all */ ++ return 0; ++ } ++ ++ if (ex != EXT_FIRST_EXTENT(eh)) { ++ /* we correct tree if first leaf got modified only */ ++ return 0; ++ } ++ ++ /* ++ * TODO: we need correction if border is smaller then current one ++ */ ++ k = depth - 1; ++ border = path[depth].p_ext->ee_block; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ return err; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ return err; ++ ++ while (k--) { ++ /* change all left-side indexes */ ++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) ++ break; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ break; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ break; ++ } ++ ++ return err; ++} ++ ++static int inline ++ext3_can_extents_be_merged(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block) ++ return 0; ++ ++#ifdef AGRESSIVE_TEST ++ if (ex1->ee_len >= 4) ++ return 0; ++#endif ++ ++ if (!tree->ops->mergable) ++ return 1; ++ ++ return tree->ops->mergable(ex1, ex2); ++} ++ ++/* ++ * this routine tries to merge requsted extent into the existing ++ * extent or inserts requested extent as new one into the tree, ++ * creating new leaf in no-space case ++ */ ++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_extent_header * eh; ++ struct ext3_extent *ex, *fex; ++ struct ext3_extent *nearex; /* nearest extent */ ++ struct ext3_ext_path *npath = NULL; ++ int depth, len, err, next; ++ ++ EXT_ASSERT(newext->ee_len > 0); ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(path[depth].p_hdr); ++ ++ /* try to insert block into found extent and return */ ++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { ++ ext_debug(tree, "append %d block to %d:%d (from %d)\n", ++ newext->ee_len, ex->ee_block, ex->ee_len, ++ ex->ee_start); ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ return err; ++ ex->ee_len += newext->ee_len; ++ eh = path[depth].p_hdr; ++ nearex = ex; ++ goto merge; ++ } ++ ++repeat: ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) ++ goto has_space; ++ ++ /* probably next leaf has space for us? 
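++ * (worth checking only when the new extent sorts past the last entry
++ * of this full leaf; otherwise we fall through and split or grow)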
*/ ++ fex = EXT_LAST_EXTENT(eh); ++ next = ext3_ext_next_leaf_block(tree, path); ++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { ++ ext_debug(tree, "next leaf block - %d\n", next); ++ EXT_ASSERT(!npath); ++ npath = ext3_ext_find_extent(tree, next, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ EXT_ASSERT(npath->p_depth == path->p_depth); ++ eh = npath[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) { ++ ext_debug(tree, "next leaf isnt full(%d)\n", ++ eh->eh_entries); ++ path = npath; ++ goto repeat; ++ } ++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", ++ eh->eh_entries, eh->eh_max); ++ } ++ ++ /* ++ * there is no free space in found leaf ++ * we're gonna add new leaf in the tree ++ */ ++ err = ext3_ext_create_new_leaf(handle, tree, path, newext); ++ if (err) ++ goto cleanup; ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ ++has_space: ++ nearex = path[depth].p_ext; ++ ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ ++ if (!nearex) { ++ /* there is no extent in this leaf, create first one */ ++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len); ++ path[depth].p_ext = EXT_FIRST_EXTENT(eh); ++ } else if (newext->ee_block > nearex->ee_block) { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ if (nearex != EXT_LAST_EXTENT(eh)) { ++ len = EXT_MAX_EXTENT(eh) - nearex; ++ len = (len - 1) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 2, nearex + 1, len); ++ } ++ path[depth].p_ext = nearex + 1; ++ } else { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 1, nearex, len); ++ path[depth].p_ext = nearex; ++ } ++ ++ eh->eh_entries++; ++ nearex = path[depth].p_ext; ++ nearex->ee_block = newext->ee_block; ++ nearex->ee_start = newext->ee_start; ++ nearex->ee_len = newext->ee_len; ++ /* FIXME: support for large fs */ ++ nearex->ee_start_hi = 0; ++ ++merge: ++ /* try to merge extents to the right */ ++ while (nearex < EXT_LAST_EXTENT(eh)) { ++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) ++ break; ++ /* merge with next extent! 
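++ * (absorb the neighbour's length, close the hole with memmove and
++ * drop one entry from the header count)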
*/ ++ nearex->ee_len += nearex[1].ee_len; ++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { ++ len = (EXT_LAST_EXTENT(eh) - nearex - 1) * ++ sizeof(struct ext3_extent); ++ memmove(nearex + 1, nearex + 2, len); ++ } ++ eh->eh_entries--; ++ EXT_ASSERT(eh->eh_entries > 0); ++ } ++ ++ /* try to merge extents to the left */ ++ ++ /* time to correct all indexes above */ ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ if (err) ++ goto cleanup; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ ++cleanup: ++ if (npath) { ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ } ++ ext3_ext_tree_changed(tree); ++ ext3_ext_invalidate_cache(tree); ++ return err; ++} ++ ++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, ++ unsigned long num, ext_prepare_callback func) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_ext_cache cbex; ++ struct ext3_extent *ex; ++ unsigned long next, start = 0, end = 0; ++ unsigned long last = block + num; ++ int depth, exists, err = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(func); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ while (block < last && block != EXT_MAX_BLOCK) { ++ num = last - block; ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(tree, block, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ break; ++ } ++ ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(path[depth].p_hdr); ++ ex = path[depth].p_ext; ++ next = ext3_ext_next_allocated_block(path); ++ ++ exists = 0; ++ if (!ex) { ++ /* there is no extent yet, so try to allocate ++ * all requested space */ ++ start = block; ++ end = block + num; ++ } else if (ex->ee_block > block) { ++ /* need to allocate space before found extent */ ++ start = block; ++ end = ex->ee_block; ++ if (block + num < end) ++ end = block + num; ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ /* need to allocate space after found extent */ ++ start = block; ++ end = block + num; ++ if (end >= next) ++ end = next; ++ } else if (block >= ex->ee_block) { ++ /* ++ * some part of requested space is covered ++ * by found extent ++ */ ++ start = block; ++ end = ex->ee_block + ex->ee_len; ++ if (block + num < end) ++ end = block + num; ++ exists = 1; ++ } else { ++ BUG(); ++ } ++ EXT_ASSERT(end > start); ++ ++ if (!exists) { ++ cbex.ec_block = start; ++ cbex.ec_len = end - start; ++ cbex.ec_start = 0; ++ cbex.ec_type = EXT3_EXT_CACHE_GAP; ++ } else { ++ cbex.ec_block = ex->ee_block; ++ cbex.ec_len = ex->ee_len; ++ cbex.ec_start = ex->ee_start; ++ cbex.ec_type = EXT3_EXT_CACHE_EXTENT; ++ } ++ ++ EXT_ASSERT(cbex.ec_len > 0); ++ EXT_ASSERT(path[depth].p_hdr); ++ err = func(tree, path, &cbex); ++ ext3_ext_drop_refs(path); ++ ++ if (err < 0) ++ break; ++ if (err == EXT_REPEAT) ++ continue; ++ else if (err == EXT_BREAK) { ++ err = 0; ++ break; ++ } ++ ++ if (EXT_DEPTH(tree) != depth) { ++ /* depth was changed. 
we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ block = cbex.ec_block + cbex.ec_len; ++ } ++ ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ return err; ++} ++ ++static inline void ++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, ++ __u32 len, __u32 start, int type) ++{ ++ EXT_ASSERT(len > 0); ++ if (tree->cex) { ++ tree->cex->ec_type = type; ++ tree->cex->ec_block = block; ++ tree->cex->ec_len = len; ++ tree->cex->ec_start = start; ++ } ++} ++ ++/* ++ * this routine calculate boundaries of the gap requested block fits into ++ * and cache this gap ++ */ ++static inline void ++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ unsigned long block) ++{ ++ int depth = EXT_DEPTH(tree); ++ unsigned long lblock, len; ++ struct ext3_extent *ex; ++ ++ if (!tree->cex) ++ return; ++ ++ ex = path[depth].p_ext; ++ if (ex == NULL) { ++ /* there is no extent yet, so gap is [0;-] */ ++ lblock = 0; ++ len = EXT_MAX_BLOCK; ++ ext_debug(tree, "cache gap(whole file):"); ++ } else if (block < ex->ee_block) { ++ lblock = block; ++ len = ex->ee_block - block; ++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len); ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ lblock = ex->ee_block + ex->ee_len; ++ len = ext3_ext_next_allocated_block(path); ++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) block); ++ EXT_ASSERT(len > lblock); ++ len = len - lblock; ++ } else { ++ lblock = len = 0; ++ BUG(); ++ } ++ ++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len); ++ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP); ++} ++ ++static inline int ++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, ++ struct ext3_extent *ex) ++{ ++ struct ext3_ext_cache *cex = tree->cex; ++ ++ /* is there cache storage at all? */ ++ if (!cex) ++ return EXT3_EXT_CACHE_NO; ++ ++ /* has cache valid data? */ ++ if (cex->ec_type == EXT3_EXT_CACHE_NO) ++ return EXT3_EXT_CACHE_NO; ++ ++ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || ++ cex->ec_type == EXT3_EXT_CACHE_EXTENT); ++ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { ++ ex->ee_block = cex->ec_block; ++ ex->ee_start = cex->ec_start; ++ ex->ee_len = cex->ec_len; ++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) ex->ee_start); ++ return cex->ec_type; ++ } ++ ++ /* not in cache */ ++ return EXT3_EXT_CACHE_NO; ++} ++ ++/* ++ * routine removes index from the index block ++ * it's used in truncate case only. 
thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->eh_entries); ++ if ((err = ext3_ext_get_access(handle, tree, path))) ++ return err; ++ path->p_hdr->eh_entries--; ++ if ((err = ext3_ext_dirty(handle, tree, path))) ++ return err; ++ ext_debug(tree, "index is empty, remove it, free block %d\n", ++ path->p_idx->ei_leaf); ++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ return err; ++} ++ ++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth = EXT_DEPTH(tree); ++ int needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) ++ return 1; ++ } ++ ++ /* ++ * the worste case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ depth = depth + 1; ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ return needed; ++} ++ ++static int ++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, tex; ++ struct ext3_ext_path *npath; ++ int depth, creds, err; ++ ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); ++ EXT_ASSERT(ex->ee_block < start); ++ ++ /* calculate tail extent */ ++ tex.ee_block = end + 1; ++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); ++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; ++ ++ creds = ext3_ext_calc_credits_for_insert(tree, path); ++ handle = ext3_ext_journal_restart(handle, creds); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ /* calculate head extent. use primary extent */ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ return err; ++ ex->ee_len = start - ex->ee_block; ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ return err; ++ ++ /* FIXME: some callback to free underlying resource ++ * and correct ee_start? 
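++ * (e.g. extent [10..29] with removal range 15..24: the head is cut to
++ * [10..14] in place, and the tail [25..29] is re-inserted as a new extent)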
*/ ++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", ++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); ++ ++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); ++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); ++ ++ err = ext3_ext_insert_extent(handle, tree, npath, &tex); ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ ++ return err; ++} ++ ++static int ++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, *fu = NULL, *lu, *le; ++ int err = 0, correct_index = 0; ++ int depth = EXT_DEPTH(tree), credits; ++ struct ext3_extent_header *eh; ++ unsigned a, b, block, num; ++ ++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); ++ if (!path[depth].p_hdr) ++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); ++ eh = path[depth].p_hdr; ++ EXT_ASSERT(eh); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* find where to start removing */ ++ le = ex = EXT_LAST_EXTENT(eh); ++ while (ex != EXT_FIRST_EXTENT(eh)) { ++ if (ex->ee_block <= end) ++ break; ++ ex--; ++ } ++ ++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { ++ /* removal of internal part of the extent requested ++ * tail and head must be placed in different extent ++ * so, we have to insert one more extent */ ++ path[depth].p_ext = ex; ++ return ext3_ext_split_for_rm(handle, tree, path, start, end); ++ } ++ ++ lu = ex; ++ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) { ++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); ++ path[depth].p_ext = ex; ++ ++ a = ex->ee_block > start ? ex->ee_block : start; ++ b = ex->ee_block + ex->ee_len - 1 < end ? ++ ex->ee_block + ex->ee_len - 1 : end; ++ ++ ext_debug(tree, " border %u:%u\n", a, b); ++ ++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { ++ block = 0; ++ num = 0; ++ BUG(); ++ } else if (a != ex->ee_block) { ++ /* remove tail of the extent */ ++ block = ex->ee_block; ++ num = a - block; ++ } else if (b != ex->ee_block + ex->ee_len - 1) { ++ /* remove head of the extent */ ++ block = a; ++ num = b - a; ++ } else { ++ /* remove whole extent: excelent! 
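++ * (for extent [10..19], removal 5..19 clamps to a = 10, b = 19 and the
++ * whole extent goes; removal 15..25 would instead cut it back to [10..14])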
*/ ++ block = ex->ee_block; ++ num = 0; ++ EXT_ASSERT(a == ex->ee_block && ++ b == ex->ee_block + ex->ee_len - 1); ++ } ++ ++ if (ex == EXT_FIRST_EXTENT(eh)) ++ correct_index = 1; ++ ++ credits = 1; ++ if (correct_index) ++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; ++ if (tree->ops->remove_extent_credits) ++ credits+=tree->ops->remove_extent_credits(tree,ex,a,b); ++ ++ handle = ext3_ext_journal_restart(handle, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ goto out; ++ } ++ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ if (tree->ops->remove_extent) ++ err = tree->ops->remove_extent(tree, ex, a, b); ++ if (err) ++ goto out; ++ ++ if (num == 0) { ++ /* this extent is removed entirely mark slot unused */ ++ ex->ee_start = 0; ++ eh->eh_entries--; ++ fu = ex; ++ } ++ ++ ex->ee_block = block; ++ ex->ee_len = num; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ ext_debug(tree, "new extent: %u:%u:%u\n", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ ex--; ++ } ++ ++ if (fu) { ++ /* reuse unused slots */ ++ while (lu < le) { ++ if (lu->ee_start) { ++ *fu = *lu; ++ lu->ee_start = 0; ++ fu++; ++ } ++ lu++; ++ } ++ } ++ ++ if (correct_index && eh->eh_entries) ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ ++ /* if this leaf is free, then we should ++ * remove it from index block above */ ++ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) ++ err = ext3_ext_rm_idx(handle, tree, path + depth); ++ ++out: ++ return err; ++} ++ ++ ++static struct ext3_extent_idx * ++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) ++{ ++ struct ext3_extent_idx *ix; ++ ++ ix = EXT_LAST_INDEX(hdr); ++ while (ix != EXT_FIRST_INDEX(hdr)) { ++ if (ix->ei_block <= block) ++ break; ++ ix--; ++ } ++ return ix; ++} ++ ++/* ++ * returns 1 if current index have to be freed (even partial) ++ */ ++static int inline ++ext3_ext_more_to_rm(struct ext3_ext_path *path) ++{ ++ EXT_ASSERT(path->p_idx); ++ ++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) ++ return 0; ++ ++ /* ++ * if truncate on deeper level happened it it wasn't partial ++ * so we have to consider current index for truncation ++ */ ++ if (path->p_hdr->eh_entries == path->p_block) ++ return 0; ++ return 1; ++} ++ ++int ext3_ext_remove_space(struct ext3_extents_tree *tree, ++ unsigned long start, unsigned long end) ++{ ++ struct inode *inode = tree->inode; ++ struct super_block *sb = inode->i_sb; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_ext_path *path; ++ handle_t *handle; ++ int i = 0, err = 0; ++ ++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); ++ ++ /* probably first extent we're gonna free will be last in block */ ++ handle = ext3_journal_start(inode, depth + 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ ext3_ext_invalidate_cache(tree); ++ ++ /* ++ * we start scanning from right side freeing all the blocks ++ * after i_size and walking into the deep ++ */ ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); ++ if (IS_ERR(path)) { ++ ext3_error(sb, __FUNCTION__, "Can't allocate path array"); ++ ext3_journal_stop(handle); ++ return -ENOMEM; ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[i].p_hdr = EXT_ROOT_HDR(tree); ++ ++ while (i >= 0 && err == 0) { ++ if (i == depth) { ++ /* this is leaf block */ ++ err = ext3_ext_rm_leaf(handle, tree, path, start, end); ++ /* root level have p_bh == NULL, brelse() eats this */ ++ 
brelse(path[i].p_bh); ++ i--; ++ continue; ++ } ++ ++ /* this is index block */ ++ if (!path[i].p_hdr) { ++ ext_debug(tree, "initialize header\n"); ++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); ++ } ++ ++ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); ++ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); ++ ++ if (!path[i].p_idx) { ++ /* this level hasn't touched yet */ ++ path[i].p_idx = ++ ext3_ext_last_covered(path[i].p_hdr, end); ++ path[i].p_block = path[i].p_hdr->eh_entries + 1; ++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", ++ path[i].p_hdr, path[i].p_hdr->eh_entries); ++ } else { ++ /* we've already was here, see at next index */ ++ path[i].p_idx--; ++ } ++ ++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", ++ i, EXT_FIRST_INDEX(path[i].p_hdr), ++ path[i].p_idx); ++ if (ext3_ext_more_to_rm(path + i)) { ++ /* go to the next level */ ++ ext_debug(tree, "move to level %d (block %d)\n", ++ i + 1, path[i].p_idx->ei_leaf); ++ memset(path + i + 1, 0, sizeof(*path)); ++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); ++ if (!path[i+1].p_bh) { ++ /* should we reset i_size? */ ++ err = -EIO; ++ break; ++ } ++ /* put actual number of indexes to know is this ++ * number got changed at the next iteration */ ++ path[i].p_block = path[i].p_hdr->eh_entries; ++ i++; ++ } else { ++ /* we finish processing this index, go up */ ++ if (path[i].p_hdr->eh_entries == 0 && i > 0) { ++ /* index is empty, remove it ++ * handle must be already prepared by the ++ * truncatei_leaf() */ ++ err = ext3_ext_rm_idx(handle, tree, path + i); ++ } ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ ext_debug(tree, "return to level %d\n", i); ++ } ++ } ++ ++ /* TODO: flexible tree reduction should be here */ ++ if (path->p_hdr->eh_entries == 0) { ++ /* ++ * truncate to zero freed all the tree ++ * so, we need to correct eh_depth ++ */ ++ err = ext3_ext_get_access(handle, tree, path); ++ if (err == 0) { ++ EXT_ROOT_HDR(tree)->eh_depth = 0; ++ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree); ++ err = ext3_ext_dirty(handle, tree, path); ++ } ++ } ++ ext3_ext_tree_changed(tree); ++ ++ kfree(path); ++ ext3_journal_stop(handle); ++ ++ return err; ++} ++ ++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks) ++{ ++ int lcap, icap, rcap, leafs, idxs, num; ++ ++ rcap = ext3_ext_space_root(tree); ++ if (blocks <= rcap) { ++ /* all extents fit to the root */ ++ return 0; ++ } ++ ++ rcap = ext3_ext_space_root_idx(tree); ++ lcap = ext3_ext_space_block(tree); ++ icap = ext3_ext_space_block_idx(tree); ++ ++ num = leafs = (blocks + lcap - 1) / lcap; ++ if (leafs <= rcap) { ++ /* all pointers to leafs fit to the root */ ++ return leafs; ++ } ++ ++ /* ok. 
we need separate index block(s) to link all leaf blocks */ ++ idxs = (leafs + icap - 1) / icap; ++ do { ++ num += idxs; ++ idxs = (idxs + icap - 1) / icap; ++ } while (idxs > rcap); ++ ++ return num; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) { ++ printk("EXT3-fs: file extents enabled"); ++#ifdef AGRESSIVE_TEST ++ printk(", agressive tests"); ++#endif ++#ifdef CHECK_BINSEARCH ++ printk(", check binsearch"); ++#endif ++ printk("\n"); ++ } ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++} ++ ++/************************************************************************ ++ * VFS related routines ++ ************************************************************************/ ++ ++static int ext3_get_inode_write_access(handle_t *handle, void *buffer) ++{ ++ /* we use in-core data, not bh */ ++ return 0; ++} ++ ++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct inode *inode = buffer; ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static int ext3_ext_mergable(struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ /* FIXME: support for large fs */ ++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) ++ return 1; ++ return 0; ++} ++ ++static int ++ext3_remove_blocks_credits(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed; ++ ++ /* at present, extent can't cross block group */; ++ needed = 4; /* bitmap + group desc + sb + inode */ ++ ++#ifdef CONFIG_QUOTA ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ return needed; ++} ++ ++static int ++ext3_remove_blocks(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed = ext3_remove_blocks_credits(tree, ex, from, to); ++ handle_t *handle = ext3_journal_start(tree->inode, needed); ++ struct buffer_head *bh; ++ int i; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { ++ /* tail removal */ ++ unsigned long num, start; ++ num = ex->ee_block + ex->ee_len - from; ++ start = ex->ee_start + ex->ee_len - num; ++ ext_debug(tree, "free last %lu blocks starting %lu\n", ++ num, start); ++ for (i = 0; i < num; i++) { ++ bh = sb_find_get_block(tree->inode->i_sb, start + i); ++ ext3_forget(handle, 0, tree->inode, bh, start + i); ++ } ++ ext3_free_blocks(handle, tree->inode, start, num); ++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { ++ printk("strange request: removal %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } else { ++ printk("strange request: removal(2) %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } ++ ext3_journal_stop(handle); ++ return 0; ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, ++ struct ext3_ext_path *path, unsigned long block) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ struct ext3_extent *ex; ++ depth = path->p_depth; ++ ++ /* try to predict block placement */ ++ if ((ex = path[depth].p_ext)) ++ return ex->ee_start + (block - ex->ee_block); ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. 
use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour + block; ++} ++ ++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int *err) ++{ ++ struct inode *inode = tree->inode; ++ int newblock, goal; ++ ++ EXT_ASSERT(path); ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->ee_start); ++ EXT_ASSERT(ex->ee_len); ++ ++ /* reuse block from the extent to order data/metadata */ ++ newblock = ex->ee_start++; ++ ex->ee_len--; ++ if (ex->ee_len == 0) { ++ ex->ee_len = 1; ++ /* allocate new block for the extent */ ++ goal = ext3_ext_find_goal(inode, path, ex->ee_block); ++ ex->ee_start = ext3_new_block(handle, inode, goal, err); ++ if (ex->ee_start == 0) { ++ /* error occured: restore old extent */ ++ ex->ee_start = newblock; ++ return 0; ++ } ++ } ++ return newblock; ++} ++ ++static struct ext3_extents_helpers ext3_blockmap_helpers = { ++ .get_write_access = ext3_get_inode_write_access, ++ .mark_buffer_dirty = ext3_mark_buffer_dirty, ++ .mergable = ext3_ext_mergable, ++ .new_block = ext3_new_block_cb, ++ .remove_extent = ext3_remove_blocks, ++ .remove_extent_credits = ext3_remove_blocks_credits, ++}; ++ ++void ext3_init_tree_desc(struct ext3_extents_tree *tree, ++ struct inode *inode) ++{ ++ tree->inode = inode; ++ tree->root = (void *) EXT3_I(inode)->i_data; ++ tree->buffer = (void *) inode; ++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); ++ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; ++ tree->ops = &ext3_blockmap_helpers; ++} ++ ++int ext3_ext_get_block(handle_t *handle, struct inode *inode, ++ long iblock, struct buffer_head *bh_result, ++ int create, int extend_disksize) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent newex; ++ struct ext3_extent *ex; ++ int goal, newblock, err = 0, depth; ++ struct ext3_extents_tree tree; ++ ++ clear_buffer_new(bh_result); ++ ext3_init_tree_desc(&tree, inode); ++ ext_debug(&tree, "block %d requested for inode %u\n", ++ (int) iblock, (unsigned) inode->i_ino); ++ mutex_lock(&EXT3_I(inode)->truncate_mutex); ++ ++ /* check in cache */ ++ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { ++ if (goal == EXT3_EXT_CACHE_GAP) { ++ if (!create) { ++ /* block isn't allocated yet and ++ * user don't want to allocate it */ ++ goto out2; ++ } ++ /* we should allocate requested block */ ++ } else if (goal == EXT3_EXT_CACHE_EXTENT) { ++ /* block is already allocated */ ++ newblock = iblock - newex.ee_block + newex.ee_start; ++ goto out; ++ } else { ++ EXT_ASSERT(0); ++ } ++ } ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(&tree, iblock, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ goto out2; ++ } ++ ++ depth = EXT_DEPTH(&tree); ++ ++ /* ++ * consistent leaf must not be empty ++ * this situations is possible, though, _during_ tree modification ++ * this is why assert can't be put in ext3_ext_find_extent() ++ */ ++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); ++ ++ if ((ex = path[depth].p_ext)) { ++ /* if found exent covers block, simple return it */ ++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { ++ newblock = iblock - ex->ee_block + ex->ee_start; ++ ext_debug(&tree, "%d fit into %d:%d -> %d\n", ++ (int) iblock, ex->ee_block, ex->ee_len, ++ newblock); ++ 
ext3_ext_put_in_cache(&tree, ex->ee_block, ++ ex->ee_len, ex->ee_start, ++ EXT3_EXT_CACHE_EXTENT); ++ goto out; ++ } ++ } ++ ++ /* ++ * requested block isn't allocated yet ++ * we couldn't try to create block if create flag is zero ++ */ ++ if (!create) { ++ /* put just found gap into cache to speedup subsequest reqs */ ++ ext3_ext_put_gap_in_cache(&tree, path, iblock); ++ goto out2; ++ } ++ ++ /* allocate new block */ ++ goal = ext3_ext_find_goal(inode, path, iblock); ++ newblock = ext3_new_block(handle, inode, goal, &err); ++ if (!newblock) ++ goto out2; ++ ext_debug(&tree, "allocate new block: goal %d, found %d\n", ++ goal, newblock); ++ ++ /* try to insert new extent into found leaf and return */ ++ newex.ee_block = iblock; ++ newex.ee_start = newblock; ++ newex.ee_len = 1; ++ err = ext3_ext_insert_extent(handle, &tree, path, &newex); ++ if (err) ++ goto out2; ++ ++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ++ /* previous routine could use block we allocated */ ++ newblock = newex.ee_start; ++ set_buffer_new(bh_result); ++ ++ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, ++ newex.ee_start, EXT3_EXT_CACHE_EXTENT); ++out: ++ ext3_ext_show_leaf(&tree, path); ++ map_bh(bh_result, inode->i_sb, newblock); ++out2: ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ mutex_unlock(&EXT3_I(inode)->truncate_mutex); ++ ++ return err; ++} ++ ++void ext3_ext_truncate(struct inode * inode, struct page *page) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct super_block *sb = inode->i_sb; ++ struct ext3_extents_tree tree; ++ unsigned long last_block; ++ handle_t *handle; ++ int err = 0; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ err = ext3_writepage_trans_blocks(inode) + 3; ++ handle = ext3_journal_start(inode, err); ++ if (IS_ERR(handle)) { ++ if (page) { ++ clear_highpage(page); ++ flush_dcache_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ return; ++ } ++ ++ if (page) ++ ext3_block_truncate_page(handle, page, mapping, inode->i_size); ++ ++ mutex_lock(&EXT3_I(inode)->truncate_mutex); ++ ext3_ext_invalidate_cache(&tree); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need not scaning at all, ++ * because page truncation is enough ++ */ ++ if (ext3_orphan_add(handle, inode)) ++ goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ last_block = (inode->i_size + sb->s_blocksize - 1) >> ++ EXT3_BLOCK_SIZE_BITS(sb); ++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. 
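++ * (i_nlink != 0 below means the file still exists, so the orphan
++ * record added at the start of the truncate is ours to remove here)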
++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ mutex_unlock(&EXT3_I(inode)->truncate_mutex); ++ ext3_journal_stop(handle); ++} ++ ++/* ++ * this routine calculate max number of blocks we could modify ++ * in order to allocate new block for an inode ++ */ ++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) ++{ ++ struct ext3_extents_tree tree; ++ int needed; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); ++ ++ /* caller want to allocate num blocks */ ++ needed *= num; ++ ++#ifdef CONFIG_QUOTA ++ /* ++ * FIXME: real calculation should be here ++ * it depends on blockmap format of qouta file ++ */ ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return needed; ++} ++ ++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ext3_extent_tree_init(handle, &tree); ++} ++ ++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ return ext3_ext_calc_metadata_amount(&tree, blocks); ++} ++ ++static int ++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *newex) ++{ ++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; ++ ++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ if (buf->err < 0) ++ return EXT_BREAK; ++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) ++ return EXT_BREAK; ++ ++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { ++ buf->err++; ++ buf->cur += sizeof(*newex); ++ } else { ++ buf->err = -EFAULT; ++ return EXT_BREAK; ++ } ++ return EXT_CONTINUE; ++} ++ ++static int ++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *ex) ++{ ++ struct ext3_extent_tree_stats *buf = ++ (struct ext3_extent_tree_stats *) tree->private; ++ int depth; ++ ++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ depth = EXT_DEPTH(tree); ++ buf->extents_num++; ++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) ++ buf->leaf_num++; ++ return EXT_CONTINUE; ++} ++ ++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err = 0; ++ ++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) ++ return -EINVAL; ++ ++ if (cmd == EXT3_IOC_GET_EXTENTS) { ++ struct ext3_extent_buf buf; ++ struct ext3_extents_tree tree; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ ++ ext3_init_tree_desc(&tree, inode); ++ buf.cur = buf.buffer; ++ buf.err = 0; ++ tree.private = &buf; ++ mutex_lock(&EXT3_I(inode)->truncate_mutex); ++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, ++ ext3_ext_store_extent_cb); ++ mutex_unlock(&EXT3_I(inode)->truncate_mutex); ++ if (err == 0) ++ err = buf.err; ++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { ++ struct ext3_extent_tree_stats buf; ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ mutex_lock(&EXT3_I(inode)->truncate_mutex); ++ buf.depth = EXT_DEPTH(&tree); ++ buf.extents_num = 0; ++ buf.leaf_num = 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, ++ ext3_ext_collect_stats_cb); ++ mutex_unlock(&EXT3_I(inode)->truncate_mutex); ++ if (!err) ++ err = copy_to_user((void *) arg, &buf, sizeof(buf)); ++ } else if (cmd == 
EXT3_IOC_GET_TREE_DEPTH) { ++ struct ext3_extents_tree tree; ++ ext3_init_tree_desc(&tree, inode); ++ mutex_lock(&EXT3_I(inode)->truncate_mutex); ++ err = EXT_DEPTH(&tree); ++ mutex_unlock(&EXT3_I(inode)->truncate_mutex); ++ } ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(ext3_init_tree_desc); ++EXPORT_SYMBOL(ext3_mark_inode_dirty); ++EXPORT_SYMBOL(ext3_ext_invalidate_cache); ++EXPORT_SYMBOL(ext3_ext_insert_extent); ++EXPORT_SYMBOL(ext3_ext_walk_space); ++EXPORT_SYMBOL(ext3_ext_find_goal); ++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); +Index: linux-stage/fs/ext3/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/ialloc.c 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/fs/ext3/ialloc.c 2006-07-16 14:10:20.000000000 +0800 +@@ -600,7 +600,7 @@ got: + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + +- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; ++ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); + if (S_ISLNK(mode)) + ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); + /* dirsync only applies to directories */ +@@ -644,6 +644,18 @@ got: + if (err) + goto fail_free_drop; + ++ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++ if (err) goto fail; ++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); ++ } ++ } ++ + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/fs/ext3/inode.c 2006-07-16 14:11:28.000000000 +0800 +@@ -40,7 +40,7 @@ + #include "iopen.h" + #include "acl.h" + +-static int ext3_writepage_trans_blocks(struct inode *inode); ++int ext3_writepage_trans_blocks(struct inode *inode); + + /* + * Test whether an inode is a fast symlink. 
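/*
 * The hunk below routes every block lookup through ext3_get_block_wrap(),
 * which picks the mapper from the per-inode EXT3_EXTENTS_FL flag. A
 * minimal stand-alone sketch of that dispatch, runnable in user space;
 * the toy_* names are illustrative only, the flag value is the one this
 * patch adds to ext3_fs.h:
 */
#include <stdio.h>

#define EXT3_EXTENTS_FL 0x00080000	/* inode uses extents */

struct toy_inode { unsigned int i_flags; };

static int toy_ext_get_block(void) { return 1; }	/* extent-mapped path */
static int toy_ind_get_block(void) { return 0; }	/* indirect-block path */

static int toy_get_block_wrap(struct toy_inode *inode)
{
	/* same shape as ext3_get_block_wrap(): one flag test, two mappers */
	if (inode->i_flags & EXT3_EXTENTS_FL)
		return toy_ext_get_block();
	return toy_ind_get_block();
}

int main(void)
{
	struct toy_inode ext = { EXT3_EXTENTS_FL }, ind = { 0 };

	printf("extent inode -> %d, indirect inode -> %d\n",
	       toy_get_block_wrap(&ext), toy_get_block_wrap(&ind));
	return 0;
}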
+@@ -944,6 +944,17 @@ out: + + #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32) + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create, int extend_disksize) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create, ++ extend_disksize); ++ return ext3_get_blocks_handle(handle, inode, block, 1, bh, create, ++ extend_disksize); ++} ++ + static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) + { +@@ -984,8 +995,8 @@ static int ext3_get_block(struct inode * + + get_block: + if (ret == 0) { +- ret = ext3_get_blocks_handle(handle, inode, iblock, +- max_blocks, bh_result, create, 0); ++ ret = ext3_get_block_wrap(handle, inode, iblock, ++ bh_result, create, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; +@@ -1008,7 +1019,7 @@ struct buffer_head *ext3_getblk(handle_t + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- err = ext3_get_blocks_handle(handle, inode, block, 1, ++ err = ext3_get_block_wrap(handle, inode, block, + &dummy, create, 1); + if (err == 1) { + err = 0; +@@ -1756,7 +1767,7 @@ void ext3_set_aops(struct inode *inode) + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, struct page *page, ++int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) + { + ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; +@@ -2260,6 +2271,9 @@ void ext3_truncate(struct inode *inode) + return; + } + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode, page); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { +@@ -3004,12 +3018,15 @@ err_out: + * block and work out the exact number of indirects which are touched. Pah. + */ + +-static int ext3_writepage_trans_blocks(struct inode *inode) ++int ext3_writepage_trans_blocks(struct inode *inode) + { + int bpp = ext3_journal_blocks_per_page(inode); + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 
5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +@@ -3277,7 +3294,7 @@ int ext3_prep_san_write(struct inode *in + + /* alloc blocks one by one */ + for (i = 0; i < nblocks; i++) { +- ret = ext3_get_block_handle(handle, inode, blocks[i], ++ ret = ext3_get_blocks_handle(handle, inode, blocks[i], 1, + &bh_tmp, 1, 1); + if (ret) + break; +@@ -3337,7 +3354,7 @@ int ext3_map_inode_page(struct inode *in + if (blocks[i] != 0) + continue; + +- rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1, 1); ++ rc = ext3_get_blocks_handle(handle, inode, iblock, 1, &dummy, 1, 1); + if (rc) { + printk(KERN_INFO "ext3_map_inode_page: error reading " + "block %ld\n", iblock); +Index: linux-stage/fs/ext3/Makefile +=================================================================== +--- linux-stage.orig/fs/ext3/Makefile 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/fs/ext3/Makefile 2006-07-16 14:10:21.000000000 +0800 +@@ -5,7 +5,8 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ +- ioctl.o namei.o super.o symlink.o hash.o resize.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o \ ++ extents.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/fs/ext3/super.c 2006-07-16 14:10:21.000000000 +0800 +@@ -391,6 +391,7 @@ static void ext3_put_super (struct super + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -455,6 +456,8 @@ static struct inode *ext3_alloc_inode(st + #endif + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; ++ ++ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); + return &ei->vfs_inode; + } + +@@ -638,6 +641,7 @@ enum { + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, ++ Opt_extents, Opt_extdebug, + Opt_grpquota + }; + +@@ -690,6 +694,8 @@ static match_table_t tokens = { + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, ++ {Opt_extents, "extents"}, ++ {Opt_extdebug, "extdebug"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -1035,6 +1041,12 @@ clear_qf_name: + case Opt_bh: + clear_opt(sbi->s_mount_opt, NOBH); + break; ++ case Opt_extents: ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_extdebug: ++ set_opt (sbi->s_mount_opt, EXTDEBUG); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1760,6 +1772,7 @@ static int ext3_fill_super (struct super + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? 
"ordered": + "writeback"); + ++ ext3_ext_init(sb); + lock_kernel(); + return 0; + +Index: linux-stage/fs/ext3/ioctl.c +=================================================================== +--- linux-stage.orig/fs/ext3/ioctl.c 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/fs/ext3/ioctl.c 2006-07-16 13:55:31.000000000 +0800 +@@ -135,6 +135,10 @@ flags_err: + mutex_unlock(&inode->i_mutex); + return err; + } ++ case EXT3_IOC_GET_EXTENTS: ++ case EXT3_IOC_GET_TREE_STATS: ++ case EXT3_IOC_GET_TREE_DEPTH: ++ return ext3_ext_ioctl(inode, filp, cmd, arg); + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2006-07-16 13:55:31.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs.h 2006-07-16 14:10:21.000000000 +0800 +@@ -181,9 +181,10 @@ struct ext3_group_desc + #define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */ + #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ + #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +-#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ ++#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + + /* +@@ -233,6 +234,9 @@ struct ext3_new_group_data { + #endif + #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) + #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) ++#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) ++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) ++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) + + /* + * Mount options +@@ -373,6 +377,8 @@ struct ext3_inode { + #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ + #define EXT3_MOUNT_IOPEN 0x400000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -563,11 +569,13 @@ static inline struct ext3_inode_info *EX + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 ++#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ + + #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ +- EXT3_FEATURE_INCOMPAT_META_BG) ++ EXT3_FEATURE_INCOMPAT_META_BG| \ ++ EXT3_FEATURE_INCOMPAT_EXTENTS) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) +@@ -787,6 +795,8 @@ extern unsigned long ext3_count_free (st + + + /* inode.c */ ++extern int ext3_block_truncate_page(handle_t *, struct page *, ++ struct address_space *, loff_t); + int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t blocknr); + struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); +@@ -860,6 +870,16 @@ extern struct inode_operations 
ext3_spec
+ extern struct inode_operations ext3_symlink_inode_operations;
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
+ 
++/* extents.c */
++extern int ext3_ext_writepage_trans_blocks(struct inode *, int);
++extern int ext3_ext_get_block(handle_t *, struct inode *, long,
++			      struct buffer_head *, int, int);
++extern void ext3_ext_truncate(struct inode *, struct page *);
++extern void ext3_ext_init(struct super_block *);
++extern void ext3_ext_release(struct super_block *);
++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *);
++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
++			  unsigned int cmd, unsigned long arg);
+ 
+ #endif	/* __KERNEL__ */
+ 
+Index: linux-stage/include/linux/ext3_extents.h
+===================================================================
+--- /dev/null	1970-01-01 00:00:00.000000000 +0000
++++ linux-stage/include/linux/ext3_extents.h	2006-07-16 13:55:31.000000000 +0800
+@@ -0,0 +1,264 @@
++/*
++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
++ */
++
++#ifndef _LINUX_EXT3_EXTENTS
++#define _LINUX_EXT3_EXTENTS
++
++/*
++ * with AGRESSIVE_TEST defined, the capacity of index/leaf blocks
++ * becomes very small, so index splits, in-depth growing and
++ * other hard changes happen much more often;
++ * this is for debug purposes only
++ */
++#define AGRESSIVE_TEST_
++
++/*
++ * if CHECK_BINSEARCH is defined, the results of the binary search
++ * will be checked by a linear search
++ */
++#define CHECK_BINSEARCH_
++
++/*
++ * if EXT_DEBUG is defined you can use the 'extdebug' mount option
++ * to get lots of info about what's going on
++ */
++#define EXT_DEBUG_
++#ifdef EXT_DEBUG
++#define ext_debug(tree,fmt,a...) \
++do { \
++	if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \
++		printk(fmt, ##a); \
++} while (0)
++#else
++#define ext_debug(tree,fmt,a...)
++#endif
++
++/*
++ * if EXT_STATS is defined then stats numbers are collected
++ * these numbers will be displayed at umount time
++ */
++#define EXT_STATS_
++
++
++#define EXT3_ALLOC_NEEDED 3	/* block bitmap + group desc. + sb */
++
++/*
++ * ext3_inode has the i_block array (60 bytes total)
++ * the first 4 bytes are used to store:
++ * - tree depth (0 means there is no tree yet.
all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 ee_block; /* first logical block extent covers */ ++ __u16 ee_len; /* number of blocks covered by extent */ ++ __u16 ee_start_hi; /* high 16 bits of physical block */ ++ __u32 ee_start; /* low 32 bigs of physical block */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 ei_block; /* index covers logical blocks from 'block' */ ++ __u32 ei_leaf; /* pointer to the physical block of the next * ++ * level. leaf or next index could bet here */ ++ __u16 ei_leaf_hi; /* high 16 bits of physical block */ ++ __u16 ei_unused; ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 eh_magic; /* probably will support different formats */ ++ __u16 eh_entries; /* number of valid entries */ ++ __u16 eh_max; /* capacity of store in entries */ ++ __u16 eh_depth; /* has tree real underlaying blocks? */ ++ __u32 eh_generation; /* generation of the tree */ ++}; ++ ++#define EXT3_EXT_MAGIC 0xf30a ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++/* ++ * structure for external API ++ */ ++ ++/* ++ * storage for cached extent ++ */ ++struct ext3_ext_cache { ++ __u32 ec_start; ++ __u32 ec_block; ++ __u32 ec_len; ++ __u32 ec_type; ++}; ++ ++#define EXT3_EXT_CACHE_NO 0 ++#define EXT3_EXT_CACHE_GAP 1 ++#define EXT3_EXT_CACHE_EXTENT 2 ++ ++/* ++ * ext3_extents_tree is used to pass initial information ++ * to top-level extents API ++ */ ++struct ext3_extents_helpers; ++struct ext3_extents_tree { ++ struct inode *inode; /* inode which tree belongs to */ ++ void *root; /* ptr to data top of tree resides at */ ++ void *buffer; /* will be passed as arg to ^^ routines */ ++ int buffer_len; ++ void *private; ++ struct ext3_ext_cache *cex;/* last found extent */ ++ struct ext3_extents_helpers *ops; ++}; ++ ++struct ext3_extents_helpers { ++ int (*get_write_access)(handle_t *h, void *buffer); ++ int (*mark_buffer_dirty)(handle_t *h, void *buffer); ++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); ++ int (*remove_extent_credits)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*remove_extent)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*new_block)(handle_t *, struct ext3_extents_tree *, ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); ++}; ++ ++/* ++ * to be called by ext3_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext3_ext_walk_space(), see below ++ * callback must return valid extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, ++ struct ext3_ext_path *, ++ struct ext3_ext_cache *); ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++ ++#define EXT_MAX_BLOCK 0xffffffff ++ ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct 
ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++ ++#define EXT_ROOT_HDR(tree) \ ++ ((struct ext3_extent_header *) (tree)->root) ++#define EXT_BLOCK_HDR(bh) \ ++ ((struct ext3_extent_header *) (bh)->b_data) ++#define EXT_DEPTH(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) ++#define EXT_GENERATION(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++ ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++#define EXT_CHECK_PATH(tree,path) \ ++{ \ ++ int depth = EXT_DEPTH(tree); \ ++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_idx < \ ++ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_ext < \ ++ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ ++ && depth != 0); \ ++ BUG_ON((path)[0].p_depth != depth); \ ++} ++ ++ ++/* ++ * this structure is used to gather extents from the tree via ioctl ++ */ ++struct ext3_extent_buf { ++ unsigned long start; ++ int buflen; ++ void *buffer; ++ void *cur; ++ int err; ++}; ++ ++/* ++ * this structure is used to collect stats info about the tree ++ */ ++struct ext3_extent_tree_stats { ++ int depth; ++ int extents_num; ++ int leaf_num; ++}; ++ ++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); ++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); ++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); ++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); ++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); ++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); ++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); ++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); ++ ++static inline void ++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) ++{ ++ if (tree->cex) ++ tree->cex->ec_type = EXT3_EXT_CACHE_NO; ++} ++ ++ ++#endif /* _LINUX_EXT3_EXTENTS */ +Index: linux-stage/include/linux/ext3_fs_i.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_i.h 2006-07-16 13:55:30.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs_i.h 2006-07-16 14:10:20.000000000 +0800 +@@ -142,6 +142,8 @@ struct ext3_inode_info { + */ + struct mutex truncate_mutex; + struct inode vfs_inode; ++ ++ __u32 i_cached_extent[4]; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch new file mode 100644 index 0000000..0040a6f --- 
/dev/null +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.18-vanilla.patch @@ -0,0 +1,2810 @@ +Index: linux-stage/fs/ext3/mballoc.c +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-stage/fs/ext3/mballoc.c 2006-07-16 02:29:49.000000000 +0800 +@@ -0,0 +1,2434 @@ ++/* ++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++ ++/* ++ * mballoc.c contains the multiblocks allocation routines ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * TODO: ++ * - bitmap read-ahead (proposed by Oleg Drokin aka green) ++ * - track min/max extents in each group for better group selection ++ * - mb_mark_used() may allocate chunk right after splitting buddy ++ * - special flag to advice allocator to look for requested + N blocks ++ * this may improve interaction between extents and mballoc ++ * - tree of groups sorted by number of free blocks ++ * - percpu reservation code (hotpath) ++ * - error handling ++ */ ++ ++/* ++ * with AGRESSIVE_CHECK allocator runs consistency checks over ++ * structures. these checks slow things down a lot ++ */ ++#define AGGRESSIVE_CHECK__ ++ ++/* ++ */ ++#define MB_DEBUG__ ++#ifdef MB_DEBUG ++#define mb_debug(fmt,a...) printk(fmt, ##a) ++#else ++#define mb_debug(fmt,a...) ++#endif ++ ++/* ++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory ++ * and you can monitor it in /proc/fs/ext3//mb_history ++ */ ++#define EXT3_MB_HISTORY ++ ++/* ++ * How long mballoc can look for a best extent (in found extents) ++ */ ++long ext3_mb_max_to_scan = 500; ++ ++/* ++ * How long mballoc must look for a best extent ++ */ ++long ext3_mb_min_to_scan = 30; ++ ++/* ++ * with 'ext3_mb_stats' allocator will collect stats that will be ++ * shown at umount. The collecting costs though! 
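++ * (collection defaults to on -- 'long ext3_mb_stats = 1' just below --
++ * and the hot-path checks are wrapped in unlikely(); setting it to 0
++ * skips the accounting.)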
++ */ ++ ++long ext3_mb_stats = 1; ++ ++#ifdef EXT3_BB_MAX_BLOCKS ++#undef EXT3_BB_MAX_BLOCKS ++#endif ++#define EXT3_BB_MAX_BLOCKS 30 ++ ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++struct ext3_group_info { ++ unsigned long bb_state; ++ unsigned long bb_tid; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned short bb_fragments; ++ unsigned short bb_counters[]; ++}; ++ ++ ++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0 ++#define EXT3_GROUP_INFO_LOCKED_BIT 1 ++ ++#define EXT3_MB_GRP_NEED_INIT(grp) \ ++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state)) ++ ++struct ext3_free_extent { ++ __u16 fe_start; ++ __u16 fe_len; ++ __u16 fe_group; ++}; ++ ++struct ext3_allocation_context { ++ struct super_block *ac_sb; ++ ++ /* search goals */ ++ struct ext3_free_extent ac_g_ex; ++ ++ /* the best found extent */ ++ struct ext3_free_extent ac_b_ex; ++ ++ /* number of iterations done. we have to track to limit searching */ ++ unsigned long ac_ex_scanned; ++ __u16 ac_groups_scanned; ++ __u16 ac_found; ++ __u16 ac_tail; ++ __u16 ac_buddy; ++ __u8 ac_status; ++ __u8 ac_flags; /* allocation hints */ ++ __u8 ac_criteria; ++ __u8 ac_repeats; ++ __u8 ac_2order; /* if request is to allocate 2^N blocks and ++ * N > 0, the field stores N, otherwise 0 */ ++}; ++ ++#define AC_STATUS_CONTINUE 1 ++#define AC_STATUS_FOUND 2 ++#define AC_STATUS_BREAK 3 ++ ++struct ext3_mb_history { ++ struct ext3_free_extent goal; /* goal allocation */ ++ struct ext3_free_extent result; /* result allocation */ ++ __u16 found; /* how many extents have been found */ ++ __u16 groups; /* how many groups have been scanned */ ++ __u16 tail; /* what tail broke some buddy */ ++ __u16 buddy; /* buddy the tail ^^^ broke */ ++ __u8 cr; /* which phase the result extent was found at */ ++ __u8 merged; ++}; ++ ++struct ext3_buddy { ++ struct page *bd_buddy_page; ++ void *bd_buddy; ++ struct page *bd_bitmap_page; ++ void *bd_bitmap; ++ struct ext3_group_info *bd_info; ++ struct super_block *bd_sb; ++ __u16 bd_blkbits; ++ __u16 bd_group; ++}; ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) ++ ++#ifndef EXT3_MB_HISTORY ++#define ext3_mb_store_history(sb,ac) ++#else ++static void ext3_mb_store_history(struct super_block *, ++ struct ext3_allocation_context *ac); ++#endif ++ ++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) ++ ++static struct proc_dir_entry *proc_root_ext3; ++ ++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); ++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); ++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); ++int ext3_mb_reserve_blocks(struct super_block *, int); ++void ext3_mb_release_blocks(struct super_block *, int); ++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); ++void ext3_mb_free_committed_blocks(struct super_block *); ++ ++#if BITS_PER_LONG == 64 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 7UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~7UL); \ ++} ++#elif BITS_PER_LONG == 32 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 3UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~3UL); \ ++} ++#else ++#error "how many bits you are?!" 
++#endif ++ ++static inline int mb_test_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ return ext2_test_bit(bit, addr); ++} ++ ++static inline void mb_set_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit(bit, addr); ++} ++ ++static inline void mb_set_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_set_bit_atomic(NULL, bit, addr); ++} ++ ++static inline void mb_clear_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit(bit, addr); ++} ++ ++static inline void mb_clear_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext2_clear_bit_atomic(NULL, bit, addr); ++} ++ ++static inline int mb_find_next_zero_bit(void *addr, int max, int start) ++{ ++ int fix; ++#if BITS_PER_LONG == 64 ++ fix = ((unsigned long) addr & 7UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~7UL); ++#elif BITS_PER_LONG == 32 ++ fix = ((unsigned long) addr & 3UL) << 3; ++ addr = (void *) ((unsigned long) addr & ~3UL); ++#else ++#error "how many bits you are?!" ++#endif ++ max += fix; ++ start += fix; ++ return ext2_find_next_zero_bit(addr, max, start) - fix; ++} ++ ++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) ++{ ++ char *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(max != NULL); ++ ++ if (order > e3b->bd_blkbits + 1) { ++ *max = 0; ++ return NULL; ++ } ++ ++ /* at order 0 we see each particular block */ ++ *max = 1 << (e3b->bd_blkbits + 3); ++ if (order == 0) ++ return EXT3_MB_BITMAP(e3b); ++ ++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order]; ++ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order]; ++ ++ return bb; ++} ++ ++#ifdef AGGRESSIVE_CHECK ++ ++static void mb_check_buddy(struct ext3_buddy *e3b) ++{ ++ int order = e3b->bd_blkbits + 1; ++ int max, max2, i, j, k, count; ++ int fragments = 0, fstart; ++ void *buddy, *buddy2; ++ ++ if (!test_opt(e3b->bd_sb, MBALLOC)) ++ return; ++ ++ { ++ static int mb_check_counter = 0; ++ if (mb_check_counter++ % 300 != 0) ++ return; ++ } ++ ++ while (order > 1) { ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ buddy2 = mb_find_buddy(e3b, order - 1, &max2); ++ J_ASSERT(buddy2); ++ J_ASSERT(buddy != buddy2); ++ J_ASSERT(max * 2 == max2); ++ ++ count = 0; ++ for (i = 0; i < max; i++) { ++ ++ if (mb_test_bit(i, buddy)) { ++ /* only single bit in buddy2 may be 1 */ ++ if (!mb_test_bit(i << 1, buddy2)) ++ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); ++ else if (!mb_test_bit((i << 1) + 1, buddy2)) ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ continue; ++ } ++ ++ /* both bits in buddy2 must be 0 */ ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); ++ ++ for (j = 0; j < (1 << order); j++) { ++ k = (i * (1 << order)) + j; ++ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); ++ } ++ count++; ++ } ++ J_ASSERT(e3b->bd_info->bb_counters[order] == count); ++ order--; ++ } ++ ++ fstart = -1; ++ buddy = mb_find_buddy(e3b, 0, &max); ++ for (i = 0; i < max; i++) { ++ if (!mb_test_bit(i, buddy)) { ++ J_ASSERT(i >= e3b->bd_info->bb_first_free); ++ if (fstart == -1) { ++ fragments++; ++ fstart = i; ++ } ++ continue; ++ } ++ fstart = -1; ++ /* check used bits only */ ++ for (j = 0; j < e3b->bd_blkbits + 1; j++) { ++ buddy2 = mb_find_buddy(e3b, j, &max2); ++ k = i >> j; ++ J_ASSERT(k < max2); ++ J_ASSERT(mb_test_bit(k, buddy2)); ++ } ++ } ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info)); ++ J_ASSERT(e3b->bd_info->bb_fragments 
== fragments); ++} ++ ++#else ++#define mb_check_buddy(e3b) ++#endif ++ ++/* find most significant bit */ ++static int inline fmsb(unsigned short word) ++{ ++ int order; ++ ++ if (word > 255) { ++ order = 7; ++ word >>= 8; ++ } else { ++ order = -1; ++ } ++ ++ do { ++ order++; ++ word >>= 1; ++ } while (word != 0); ++ ++ return order; ++} ++ ++static void inline ++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first, ++ int len, struct ext3_group_info *grp) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ unsigned short min, max, chunk, border; ++ ++ mb_debug("mark %u/%u free\n", first, len); ++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ border = 2 << sb->s_blocksize_bits; ++ ++ while (len > 0) { ++ /* find how many blocks can be covered since this position */ ++ max = ffs(first | border) - 1; ++ ++ /* find how many blocks of power 2 we need to mark */ ++ min = fmsb(len); ++ ++ mb_debug(" %u/%u -> max %u, min %u\n", ++ first & ((2 << sb->s_blocksize_bits) - 1), ++ len, max, min); ++ ++ if (max < min) ++ min = max; ++ chunk = 1 << min; ++ ++ /* mark multiblock chunks only */ ++ grp->bb_counters[min]++; ++ if (min > 0) { ++ mb_debug(" set %u at %u \n", first >> min, ++ sbi->s_mb_offsets[min]); ++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); ++ } ++ ++ len -= chunk; ++ first += chunk; ++ } ++} ++ ++static void ++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ++ struct ext3_group_info *grp) ++{ ++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); ++ unsigned short i = 0, first, len; ++ unsigned free = 0, fragments = 0; ++ unsigned long long period = get_cycles(); ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ grp->bb_first_free = i; ++ while (i < max) { ++ fragments++; ++ first = i; ++ i = find_next_bit(bitmap, max, i); ++ len = i - first; ++ free += len; ++ if (len > 1) ++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp); ++ else ++ grp->bb_counters[0]++; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ grp->bb_fragments = fragments; ++ ++ /* bb_state shouldn't being modified because all ++ * others waits for init completion on page lock */ ++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state); ++ if (free != grp->bb_free) { ++ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n", ++ free, grp->bb_free); ++ grp->bb_free = free; ++ } ++ ++ period = get_cycles() - period; ++ spin_lock(&EXT3_SB(sb)->s_bal_lock); ++ EXT3_SB(sb)->s_mb_buddies_generated++; ++ EXT3_SB(sb)->s_mb_generation_time += period; ++ spin_unlock(&EXT3_SB(sb)->s_bal_lock); ++} ++ ++static int ext3_mb_init_cache(struct page *page) ++{ ++ int blocksize, blocks_per_page, groups_per_page; ++ int err = 0, i, first_group, first_block; ++ struct super_block *sb; ++ struct buffer_head *bhs; ++ struct buffer_head **bh; ++ struct inode *inode; ++ char *data, *bitmap; ++ ++ mb_debug("init page %lu\n", page->index); ++ ++ inode = page->mapping->host; ++ sb = inode->i_sb; ++ blocksize = 1 << inode->i_blkbits; ++ blocks_per_page = PAGE_CACHE_SIZE / blocksize; ++ ++ groups_per_page = blocks_per_page >> 1; ++ if (groups_per_page == 0) ++ groups_per_page = 1; ++ ++ /* allocate buffer_heads to read bitmaps */ ++ if (groups_per_page > 1) { ++ err = -ENOMEM; ++ i = sizeof(struct buffer_head *) * groups_per_page; ++ bh = kmalloc(i, GFP_NOFS); ++ if (bh == NULL) ++ goto out; ++ memset(bh, 0, i); ++ } else ++ bh = &bhs; ++ ++ first_group = page->index * blocks_per_page / 2; ++ ++ /* read all groups the page covers into the cache */ 
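++	/*
++	 * layout recap: this inode's page cache interleaves the two
++	 * per-group blocks as [bitmap(g), buddy(g), bitmap(g+1), ...],
++	 * i.e. global block 2g is group g's bitmap copy and block 2g+1
++	 * its generated buddy. e.g. with 1k blocks on a 4k page
++	 * (blocks_per_page = 4), page->index 3 covers blocks 12..15:
++	 * first_group = 3*4/2 = 6, and the page holds bitmap(6),
++	 * buddy(6), bitmap(7), buddy(7).
++	 */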
++ for (i = 0; i < groups_per_page; i++) { ++ struct ext3_group_desc * desc; ++ ++ if (first_group + i >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ err = -EIO; ++ desc = ext3_get_group_desc(sb, first_group + i, NULL); ++ if (desc == NULL) ++ goto out; ++ ++ err = -ENOMEM; ++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); ++ if (bh[i] == NULL) ++ goto out; ++ ++ if (buffer_uptodate(bh[i])) ++ continue; ++ ++ lock_buffer(bh[i]); ++ if (buffer_uptodate(bh[i])) { ++ unlock_buffer(bh[i]); ++ continue; ++ } ++ ++ get_bh(bh[i]); ++ bh[i]->b_end_io = end_buffer_read_sync; ++ submit_bh(READ, bh[i]); ++ mb_debug("read bitmap for group %u\n", first_group + i); ++ } ++ ++ /* wait for I/O completion */ ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ wait_on_buffer(bh[i]); ++ ++ /* XXX: I/O error handling here */ ++ ++ first_block = page->index * blocks_per_page; ++ for (i = 0; i < blocks_per_page; i++) { ++ int group; ++ ++ group = (first_block + i) >> 1; ++ if (group >= EXT3_SB(sb)->s_groups_count) ++ break; ++ ++ data = page_address(page) + (i * blocksize); ++ bitmap = bh[group - first_group]->b_data; ++ ++ if ((first_block + i) & 1) { ++ /* this is block of buddy */ ++ mb_debug("put buddy for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memset(data, 0xff, blocksize); ++ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0; ++ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0, ++ sizeof(unsigned short)*(sb->s_blocksize_bits+2)); ++ ext3_mb_generate_buddy(sb, data, bitmap, ++ EXT3_SB(sb)->s_group_info[group]); ++ } else { ++ /* this is block of bitmap */ ++ mb_debug("put bitmap for group %u in page %lu/%x\n", ++ group, page->index, i * blocksize); ++ memcpy(data, bitmap, blocksize); ++ } ++ } ++ SetPageUptodate(page); ++ ++out: ++ for (i = 0; i < groups_per_page && bh[i]; i++) ++ brelse(bh[i]); ++ if (bh && bh != &bhs) ++ kfree(bh); ++ return err; ++} ++ ++static int ext3_mb_load_buddy(struct super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct inode *inode = sbi->s_buddy_cache; ++ int blocks_per_page, block, pnum, poff; ++ struct page *page; ++ ++ mb_debug("load group %u\n", group); ++ ++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_info = sbi->s_group_info[group]; ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ e3b->bd_buddy_page = NULL; ++ e3b->bd_bitmap_page = NULL; ++ ++ block = group * 2; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_bitmap_page = page; ++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ block++; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ ++ page = find_get_page(inode->i_mapping, pnum); ++ if (page == NULL || !PageUptodate(page)) { ++ if (page) ++ page_cache_release(page); ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ if (!PageUptodate(page)) ++ ext3_mb_init_cache(page); ++ unlock_page(page); ++ } ++ } ++ if (page == NULL || !PageUptodate(page)) ++ goto err; ++ e3b->bd_buddy_page = page; ++ 
e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); ++ mark_page_accessed(page); ++ ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ return 0; ++ ++err: ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++ e3b->bd_buddy = NULL; ++ e3b->bd_bitmap = NULL; ++ return -EIO; ++} ++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ if (e3b->bd_bitmap_page) ++ page_cache_release(e3b->bd_bitmap_page); ++ if (e3b->bd_buddy_page) ++ page_cache_release(e3b->bd_buddy_page); ++} ++ ++ ++static inline void ++ext3_lock_group(struct super_block *sb, int group) ++{ ++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++} ++ ++static inline void ++ext3_unlock_group(struct super_block *sb, int group) ++{ ++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT, ++ &EXT3_SB(sb)->s_group_info[group]->bb_state); ++} ++ ++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) ++{ ++ int order = 1; ++ void *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ while (order <= e3b->bd_blkbits + 1) { ++ block = block >> 1; ++ if (!mb_test_bit(block, bb)) { ++ /* this block is part of buddy of order 'order' */ ++ return order; ++ } ++ bb += 1 << (e3b->bd_blkbits - order); ++ order++; ++ } ++ return 0; ++} ++ ++static inline void mb_clear_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0; ++ cur += 32; ++ continue; ++ } ++ mb_clear_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static inline void mb_set_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0xffffffff; ++ cur += 32; ++ continue; ++ } ++ mb_set_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) ++{ ++ int block = 0, max = 0, order; ++ void *buddy, *buddy2; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_info->bb_free += count; ++ if (first < e3b->bd_info->bb_first_free) ++ e3b->bd_info->bb_first_free = first; ++ ++ /* let's maintain fragments counter */ ++ if (first != 0) ++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b)); ++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b)); ++ if (block && max) ++ e3b->bd_info->bb_fragments--; ++ else if (!block && !max) ++ e3b->bd_info->bb_fragments++; ++ ++ /* let's maintain buddy itself */ ++ while (count-- > 0) { ++ block = first++; ++ order = 0; ++ ++ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); ++ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); ++ e3b->bd_info->bb_counters[order]++; ++ ++ /* start of the buddy */ ++ buddy = mb_find_buddy(e3b, order, &max); ++ ++ do { ++ block &= ~1UL; ++ if (mb_test_bit(block, buddy) || ++ mb_test_bit(block + 1, buddy)) ++ break; ++ ++ /* both the buddies are free, try to coalesce them */ ++ buddy2 = mb_find_buddy(e3b, order + 1, &max); ++ ++ if (!buddy2) ++ break; ++ ++ if (order > 0) { ++ /* for special purposes, we don't set ++ * free bits in bitmap */ ++ mb_set_bit(block, buddy); ++ mb_set_bit(block + 1, 
buddy); ++ } ++ e3b->bd_info->bb_counters[order]--; ++ e3b->bd_info->bb_counters[order]--; ++ ++ block = block >> 1; ++ order++; ++ e3b->bd_info->bb_counters[order]++; ++ ++ mb_clear_bit(block, buddy2); ++ buddy = buddy2; ++ } while (1); ++ } ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, ++ int needed, struct ext3_free_extent *ex) ++{ ++ int next, max, ord; ++ void *buddy; ++ ++ J_ASSERT(ex != NULL); ++ ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ J_ASSERT(block < max); ++ if (mb_test_bit(block, buddy)) { ++ ex->fe_len = 0; ++ ex->fe_start = 0; ++ ex->fe_group = 0; ++ return 0; ++ } ++ ++ if (likely(order == 0)) { ++ /* find actual order */ ++ order = mb_find_order_for_block(e3b, block); ++ block = block >> order; ++ } ++ ++ ex->fe_len = 1 << order; ++ ex->fe_start = block << order; ++ ex->fe_group = e3b->bd_group; ++ ++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { ++ ++ if (block + 1 >= max) ++ break; ++ ++ next = (block + 1) * (1 << order); ++ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) ++ break; ++ ++ ord = mb_find_order_for_block(e3b, next); ++ ++ order = ord; ++ block = next >> order; ++ ex->fe_len += 1 << order; ++ } ++ ++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); ++ return ex->fe_len; ++} ++ ++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) ++{ ++ int ord, mlen = 0, max = 0, cur; ++ int start = ex->fe_start; ++ int len = ex->fe_len; ++ unsigned ret = 0; ++ int len0 = len; ++ void *buddy; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_info->bb_free -= len; ++ if (e3b->bd_info->bb_first_free == start) ++ e3b->bd_info->bb_first_free += len; ++ ++ /* let's maintain fragments counter */ ++ if (start != 0) ++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b)); ++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0]) ++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b)); ++ if (mlen && max) ++ e3b->bd_info->bb_fragments++; ++ else if (!mlen && !max) ++ e3b->bd_info->bb_fragments--; ++ ++ /* let's maintain buddy itself */ ++ while (len) { ++ ord = mb_find_order_for_block(e3b, start); ++ ++ if (((start >> ord) << ord) == start && len >= (1 << ord)) { ++ /* the whole chunk may be allocated at once! */ ++ mlen = 1 << ord; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ J_ASSERT((start >> ord) < max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_info->bb_counters[ord]--; ++ start += mlen; ++ len -= mlen; ++ J_ASSERT(len >= 0); ++ continue; ++ } ++ ++ /* store for history */ ++ if (ret == 0) ++ ret = len | (ord << 16); ++ ++ /* we have to split large buddy */ ++ J_ASSERT(ord > 0); ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_info->bb_counters[ord]--; ++ ++ ord--; ++ cur = (start >> ord) & ~1U; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_clear_bit(cur, buddy); ++ mb_clear_bit(cur + 1, buddy); ++ e3b->bd_info->bb_counters[ord]++; ++ e3b->bd_info->bb_counters[ord]++; ++ } ++ ++ /* now drop all the bits in bitmap */ ++ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); ++ ++ mb_check_buddy(e3b); ++ ++ return ret; ++} ++ ++/* ++ * Must be called under group lock! 
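++ * (it trims the best-found extent to the goal length and then marks
++ * it used in the buddy via mb_mark_used(); both steps must be atomic
++ * with respect to other allocators scanning the same group.)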
++ */ ++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ unsigned long ret; ++ ++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ++ ret = mb_mark_used(e3b, &ac->ac_b_ex); ++ ++ ac->ac_status = AC_STATUS_FOUND; ++ ac->ac_tail = ret & 0xffff; ++ ac->ac_buddy = ret >> 16; ++} ++ ++/* ++ * The routine checks whether found extent is good enough. If it is, ++ * then the extent gets marked used and flag is set to the context ++ * to stop scanning. Otherwise, the extent is compared with the ++ * previous found extent and if new one is better, then it's stored ++ * in the context. Later, the best found extent will be used, if ++ * mballoc can't find good enough extent. ++ * ++ * FIXME: real allocation policy is to be designed yet! ++ */ ++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, ++ struct ext3_free_extent *ex, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent *bex = &ac->ac_b_ex; ++ struct ext3_free_extent *gex = &ac->ac_g_ex; ++ ++ J_ASSERT(ex->fe_len > 0); ++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ ++ ac->ac_found++; ++ ++ /* ++ * The special case - take what you catch first ++ */ ++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Let's check whether the chuck is good enough ++ */ ++ if (ex->fe_len == gex->fe_len) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If this is first found extent, just store it in the context ++ */ ++ if (bex->fe_len == 0) { ++ *bex = *ex; ++ return; ++ } ++ ++ /* ++ * If new found extent is better, store it in the context ++ */ ++ if (bex->fe_len < gex->fe_len) { ++ /* if the request isn't satisfied, any found extent ++ * larger than previous best one is better */ ++ if (ex->fe_len > bex->fe_len) ++ *bex = *ex; ++ } else if (ex->fe_len > gex->fe_len) { ++ /* if the request is satisfied, then we try to find ++ * an extent that still satisfy the request, but is ++ * smaller than previous one */ ++ *bex = *ex; ++ } ++ ++ /* ++ * Let's scan at least few extents and don't pick up a first one ++ */ ++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++ ++ /* ++ * We don't want to scan for a whole year ++ */ ++ if (ac->ac_found > ext3_mb_max_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++} ++ ++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent ex = ac->ac_b_ex; ++ int group = ex.fe_group, max, err; ++ ++ J_ASSERT(ex.fe_len > 0); ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); ++ ++ if (max > 0) { ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_free_extent ex; ++ ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max > 0) { ++ J_ASSERT(ex.fe_len > 0); ++ 
J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++/* ++ * The routine scans buddy structures (not bitmap!) from given order ++ * to max order and tries to find big enough chunk to satisfy the req ++ */ ++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_group_info *grp = e3b->bd_info; ++ void *buddy; ++ int i, k, max; ++ ++ J_ASSERT(ac->ac_2order > 0); ++ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) { ++ if (grp->bb_counters[i] == 0) ++ continue; ++ ++ buddy = mb_find_buddy(e3b, i, &max); ++ if (buddy == NULL) { ++ printk(KERN_ALERT "looking for wrong order?\n"); ++ break; ++ } ++ ++ k = mb_find_next_zero_bit(buddy, max, 0); ++ J_ASSERT(k < max); ++ ++ ac->ac_found++; ++ ++ ac->ac_b_ex.fe_len = 1 << i; ++ ac->ac_b_ex.fe_start = k << i; ++ ac->ac_b_ex.fe_group = e3b->bd_group; ++ ++ ext3_mb_use_best_found(ac, e3b); ++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len); ++ ++ if (unlikely(ext3_mb_stats)) ++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders); ++ ++ break; ++ } ++} ++ ++/* ++ * The routine scans the group and measures all found extents. ++ * In order to optimize scanning, caller must pass number of ++ * free blocks in the group, so the routine can know upper limit. ++ */ ++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ int i, free; ++ ++ free = e3b->bd_info->bb_free; ++ J_ASSERT(free > 0); ++ ++ i = e3b->bd_info->bb_first_free; ++ ++ while (free && ac->ac_status == AC_STATUS_CONTINUE) { ++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ if (i >= sb->s_blocksize * 8) { ++ J_ASSERT(free == 0); ++ break; ++ } ++ ++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(free >= ex.fe_len); ++ ++ ext3_mb_measure_extent(ac, &ex, e3b); ++ ++ i += ex.fe_len; ++ free -= ex.fe_len; ++ } ++} ++ ++static int ext3_mb_good_group(struct ext3_allocation_context *ac, ++ int group, int cr) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_group_info *grp = sbi->s_group_info[group]; ++ unsigned free, fragments, i, bits; ++ ++ J_ASSERT(cr >= 0 && cr < 4); ++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp)); ++ ++ free = grp->bb_free; ++ fragments = grp->bb_fragments; ++ if (free == 0) ++ return 0; ++ if (fragments == 0) ++ return 0; ++ ++ switch (cr) { ++ case 0: ++ J_ASSERT(ac->ac_2order != 0); ++ bits = ac->ac_sb->s_blocksize_bits + 1; ++ for (i = ac->ac_2order; i < bits; i++) ++ if (grp->bb_counters[i] > 0) ++ return 1; ++ case 1: ++ if ((free / fragments) >= ac->ac_g_ex.fe_len) ++ return 1; ++ case 2: ++ if (free >= ac->ac_g_ex.fe_len) ++ return 1; ++ case 3: ++ return 1; ++ default: ++ BUG(); ++ } ++ ++ return 0; ++} ++ ++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *len, int flags, int *errp) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_allocation_context ac; ++ int i, group, block, cr, err = 0; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ struct buffer_head *gdp_bh; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ ++ 
J_ASSERT(len != NULL);
++	J_ASSERT(*len > 0);
++
++	sb = inode->i_sb;
++	if (!sb) {
++		printk("ext3_mb_new_blocks: nonexistent device");
++		return 0;
++	}
++
++	if (!test_opt(sb, MBALLOC)) {
++		static int ext3_mballoc_warning = 0;
++		if (ext3_mballoc_warning == 0) {
++			printk(KERN_ERR "EXT3-fs: multiblock request with "
++				"mballoc disabled!\n");
++			ext3_mballoc_warning++;
++		}
++		*len = 1;
++		err = ext3_new_block_old(handle, inode, goal, errp);
++		return err;
++	}
++
++	ext3_mb_poll_new_transaction(sb, handle);
++
++	sbi = EXT3_SB(sb);
++	es = EXT3_SB(sb)->s_es;
++
++	/*
++	 * We can't allocate > group size
++	 */
++	if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10)
++		*len = EXT3_BLOCKS_PER_GROUP(sb) - 10;
++
++	if (!(flags & EXT3_MB_HINT_RESERVED)) {
++		/* someone asks for non-reserved blocks */
++		BUG_ON(*len > 1);
++		err = ext3_mb_reserve_blocks(sb, 1);
++		if (err) {
++			*errp = err;
++			return 0;
++		}
++	}
++
++	/*
++	 * Check quota for allocation of these blocks.
++	 */
++	while (*len && DQUOT_ALLOC_BLOCK(inode, *len))
++		*len -= 1;
++	if (*len == 0) {
++		*errp = -EDQUOT;
++		block = 0;
++		goto out;
++	}
++
++	/* start searching from the goal */
++	if (goal < le32_to_cpu(es->s_first_data_block) ||
++	    goal >= le32_to_cpu(es->s_blocks_count))
++		goal = le32_to_cpu(es->s_first_data_block);
++	group = (goal - le32_to_cpu(es->s_first_data_block)) /
++			EXT3_BLOCKS_PER_GROUP(sb);
++	block = ((goal - le32_to_cpu(es->s_first_data_block)) %
++			EXT3_BLOCKS_PER_GROUP(sb));
++
++	/* set up allocation goals */
++	ac.ac_b_ex.fe_group = 0;
++	ac.ac_b_ex.fe_start = 0;
++	ac.ac_b_ex.fe_len = 0;
++	ac.ac_status = AC_STATUS_CONTINUE;
++	ac.ac_groups_scanned = 0;
++	ac.ac_ex_scanned = 0;
++	ac.ac_found = 0;
++	ac.ac_sb = inode->i_sb;
++	ac.ac_g_ex.fe_group = group;
++	ac.ac_g_ex.fe_start = block;
++	ac.ac_g_ex.fe_len = *len;
++	ac.ac_flags = flags;
++	ac.ac_2order = 0;
++	ac.ac_criteria = 0;
++
++	/* probably, the request is for 2^8+ blocks (1/2/3/... MB) */
++	i = ffs(*len);
++	if (i >= 8) {
++		i--;
++		if ((*len & (~(1 << i))) == 0)
++			ac.ac_2order = i;
++	}
++
++	/* Sometimes, the caller may want to merge even a small
++	 * number of blocks into an existing extent */
++	if (ac.ac_flags & EXT3_MB_HINT_MERGE) {
++		err = ext3_mb_find_by_goal(&ac, &e3b);
++		if (err)
++			goto out_err;
++		if (ac.ac_status == AC_STATUS_FOUND)
++			goto found;
++	}
++
++	/* Let's just scan groups to find more or less suitable blocks */
++	cr = ac.ac_2order ? 0 : 1;
++repeat:
++	for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) {
++		ac.ac_criteria = cr;
++		for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) {
++			if (group == EXT3_SB(sb)->s_groups_count)
++				group = 0;
++
++			if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) {
++				/* we need full data about the group
++				 * to make a good selection */
++				err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
++				if (err)
++					goto out_err;
++				ext3_mb_release_desc(&e3b);
++			}
++
++			/* check whether this group is good for our criteria */
++			if (!ext3_mb_good_group(&ac, group, cr))
++				continue;
++
++			err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
++			if (err)
++				goto out_err;
++
++			ext3_lock_group(sb, group);
++			if (!ext3_mb_good_group(&ac, group, cr)) {
++				/* someone did allocation from this group */
++				ext3_unlock_group(sb, group);
++				ext3_mb_release_desc(&e3b);
++				continue;
++			}
++
++			ac.ac_groups_scanned++;
++			if (cr == 0)
++				ext3_mb_simple_scan_group(&ac, &e3b);
++			else
++				ext3_mb_complex_scan_group(&ac, &e3b);
++
++			ext3_unlock_group(sb, group);
++
++			ext3_mb_release_desc(&e3b);
++
++			if (err)
++				goto out_err;
++			if (ac.ac_status != AC_STATUS_CONTINUE)
++				break;
++		}
++	}
++
++	if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND &&
++	    !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
++		/*
++		 * We've been searching too long. Let's try to allocate
++		 * the best chunk we've found so far
++		 */
++
++		/*if (ac.ac_found > ext3_mb_max_to_scan)
++			printk(KERN_ERR "EXT3-fs: too long searching at "
++				"%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len,
++				ac.ac_g_ex.fe_len);*/
++		ext3_mb_try_best_found(&ac, &e3b);
++		if (ac.ac_status != AC_STATUS_FOUND) {
++			/*
++			 * Someone luckier has already allocated it.
++			 * The only thing we can do is just take the first
++			 * found block(s)
++			 */
++			printk(KERN_ERR "EXT3-fs: and someone won our chunk\n");
++			ac.ac_b_ex.fe_group = 0;
++			ac.ac_b_ex.fe_start = 0;
++			ac.ac_b_ex.fe_len = 0;
++			ac.ac_status = AC_STATUS_CONTINUE;
++			ac.ac_flags |= EXT3_MB_HINT_FIRST;
++			cr = 3;
++			goto repeat;
++		}
++	}
++
++	if (ac.ac_status != AC_STATUS_FOUND) {
++		/*
++		 * We definitely weren't lucky
++		 */
++		DQUOT_FREE_BLOCK(inode, *len);
++		*errp = -ENOSPC;
++		block = 0;
++#if 1
++		printk(KERN_ERR "EXT3-fs: can't allocate: status %d, flags %d\n",
++			ac.ac_status, ac.ac_flags);
++		printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n",
++			ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group,
++			ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr);
++		printk(KERN_ERR "EXT3-fs: %lu blocks reserved, %d found\n",
++			sbi->s_blocks_reserved, ac.ac_found);
++		printk("EXT3-fs: groups: ");
++		for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
++			printk("%d: %d ", i,
++				sbi->s_group_info[i]->bb_free);
++		printk("\n");
++#endif
++		goto out;
++	}
++
++found:
++	J_ASSERT(ac.ac_b_ex.fe_len > 0);
++
++	/* good news - free block(s) have been found. now it's time to
++	 * mark the found block(s) in the good old journaled bitmap */
++	block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb)
++			+ ac.ac_b_ex.fe_start
++			+ le32_to_cpu(es->s_first_data_block);
++
++	ext3_debug("using block group %u\n", ac.ac_b_ex.fe_group);
++
++	bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group);
++	if (!bitmap_bh) {
++		*errp = -EIO;
++		goto out_err;
++	}
++
++	err = ext3_journal_get_write_access(handle, bitmap_bh);
++	if (err) {
++		*errp = err;
++		goto out_err;
++	}
++
++	gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh);
++	if (!gdp) {
++		*errp = -EIO;
++		goto out_err;
++	}
++
++	err = ext3_journal_get_write_access(handle, gdp_bh);
++	if (err)
++		goto out_err;
++
++	if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
++	    block == le32_to_cpu(gdp->bg_inode_bitmap) ||
++	    in_range(block, le32_to_cpu(gdp->bg_inode_table),
++		     EXT3_SB(sb)->s_itb_per_group))
++		ext3_error(sb, "ext3_new_block",
++			   "Allocating block in system zone - "
++			   "block = %u", block);
++#ifdef AGGRESSIVE_CHECK
++	for (i = 0; i < ac.ac_b_ex.fe_len; i++)
++		J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data));
++#endif
++	mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len);
++
++	spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
++	gdp->bg_free_blocks_count =
++		cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)
++				- ac.ac_b_ex.fe_len);
++	spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
++	percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len);
++
++	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++	if (err)
++		goto out_err;
++	err = ext3_journal_dirty_metadata(handle, gdp_bh);
++	if (err)
++		goto out_err;
++
++	sb->s_dirt = 1;
++	*errp = 0;
++	brelse(bitmap_bh);
++
++	/* drop non-allocated, but dquot'ed blocks */
++	J_ASSERT(*len >= ac.ac_b_ex.fe_len);
++	DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len);
++
++	*len = ac.ac_b_ex.fe_len;
++	J_ASSERT(*len > 0);
++	J_ASSERT(block != 0);
++	goto out;
++
++out_err:
++	/* if we've already allocated something, roll it back */
++	if (ac.ac_status == AC_STATUS_FOUND) {
++		/* FIXME: free blocks here */
++	}
++
++	DQUOT_FREE_BLOCK(inode, *len);
++	brelse(bitmap_bh);
++	*errp = err;
++	block = 0;
++out:
++	if (!(flags & EXT3_MB_HINT_RESERVED)) {
++		/* the block wasn't reserved before and we reserved it
++		 * at the beginning of the allocation. it doesn't matter
++		 * whether we allocated anything or we failed: time
++		 * to release the reservation.
NOTE: because I expect ++ * any multiblock request from delayed allocation ++ * path only, here is single block always */ ++ ext3_mb_release_blocks(sb, 1); ++ } ++ ++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) { ++ atomic_inc(&sbi->s_bal_reqs); ++ atomic_add(*len, &sbi->s_bal_allocated); ++ if (*len >= ac.ac_g_ex.fe_len) ++ atomic_inc(&sbi->s_bal_success); ++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned); ++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && ++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) ++ atomic_inc(&sbi->s_bal_goals); ++ if (ac.ac_found > ext3_mb_max_to_scan) ++ atomic_inc(&sbi->s_bal_breaks); ++ } ++ ++ ext3_mb_store_history(sb, &ac); ++ ++ return block; ++} ++EXPORT_SYMBOL(ext3_mb_new_blocks); ++ ++#ifdef EXT3_MB_HISTORY ++struct ext3_mb_proc_session { ++ struct ext3_mb_history *history; ++ struct super_block *sb; ++ int start; ++ int max; ++}; ++ ++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s, ++ struct ext3_mb_history *hs, ++ int first) ++{ ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (!first && hs == s->history + s->start) ++ return NULL; ++ while (hs->goal.fe_len == 0) { ++ hs++; ++ if (hs == s->history + s->max) ++ hs = s->history; ++ if (hs == s->history + s->start) ++ return NULL; ++ } ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs; ++ int l = *pos; ++ ++ if (l == 0) ++ return SEQ_START_TOKEN; ++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ if (!hs) ++ return NULL; ++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL); ++ return hs; ++} ++ ++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct ext3_mb_proc_session *s = seq->private; ++ struct ext3_mb_history *hs = v; ++ ++ ++*pos; ++ if (v == SEQ_START_TOKEN) ++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1); ++ else ++ return ext3_mb_history_skip_empty(s, ++hs, 0); ++} ++ ++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v) ++{ ++ struct ext3_mb_history *hs = v; ++ char buf[20], buf2[20]; ++ ++ if (v == SEQ_START_TOKEN) { ++ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "goal", "result", "found", "grps", "cr", "merge", ++ "tail", "broken"); ++ return 0; ++ } ++ ++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group, ++ hs->goal.fe_start, hs->goal.fe_len); ++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group, ++ hs->result.fe_start, hs->result.fe_len); ++ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, ++ buf2, hs->found, hs->groups, hs->cr, ++ hs->merged ? "M" : "", hs->tail, ++ hs->buddy ? 
1 << hs->buddy : 0);
++	return 0;
++}
++
++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v)
++{
++}
++
++static struct seq_operations ext3_mb_seq_history_ops = {
++	.start	= ext3_mb_seq_history_start,
++	.next	= ext3_mb_seq_history_next,
++	.stop	= ext3_mb_seq_history_stop,
++	.show	= ext3_mb_seq_history_show,
++};
++
++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file)
++{
++	struct super_block *sb = PDE(inode)->data;
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	struct ext3_mb_proc_session *s;
++	int rc, size;
++
++	s = kmalloc(sizeof(*s), GFP_KERNEL);
++	if (s == NULL)
++		return -ENOMEM;
++	size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max;
++	s->history = kmalloc(size, GFP_KERNEL);
++	if (s->history == NULL) {
++		kfree(s);
++		return -ENOMEM;
++	}
++
++	spin_lock(&sbi->s_mb_history_lock);
++	memcpy(s->history, sbi->s_mb_history, size);
++	s->max = sbi->s_mb_history_max;
++	s->start = sbi->s_mb_history_cur % s->max;
++	spin_unlock(&sbi->s_mb_history_lock);
++
++	rc = seq_open(file, &ext3_mb_seq_history_ops);
++	if (rc == 0) {
++		struct seq_file *m = (struct seq_file *)file->private_data;
++		m->private = s;
++	} else {
++		kfree(s->history);
++		kfree(s);
++	}
++	return rc;
++}
++
++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file)
++{
++	struct seq_file *seq = (struct seq_file *)file->private_data;
++	struct ext3_mb_proc_session *s = seq->private;
++	kfree(s->history);
++	kfree(s);
++	return seq_release(inode, file);
++}
++
++static struct file_operations ext3_mb_seq_history_fops = {
++	.owner		= THIS_MODULE,
++	.open		= ext3_mb_seq_history_open,
++	.read		= seq_read,
++	.llseek		= seq_lseek,
++	.release	= ext3_mb_seq_history_release,
++};
++
++static void ext3_mb_history_release(struct super_block *sb)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	char name[64];
++
++	snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++	remove_proc_entry("mb_history", sbi->s_mb_proc);
++	remove_proc_entry(name, proc_root_ext3);
++
++	if (sbi->s_mb_history)
++		kfree(sbi->s_mb_history);
++}
++
++static void ext3_mb_history_init(struct super_block *sb)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	char name[64];
++	int i;
++
++	snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++	sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3);
++	if (sbi->s_mb_proc != NULL) {
++		struct proc_dir_entry *p;
++		p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc);
++		if (p) {
++			p->proc_fops = &ext3_mb_seq_history_fops;
++			p->data = sb;
++		}
++	}
++
++	sbi->s_mb_history_max = 1000;
++	sbi->s_mb_history_cur = 0;
++	spin_lock_init(&sbi->s_mb_history_lock);
++	i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history);
++	sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
++	if (sbi->s_mb_history != NULL)
++		memset(sbi->s_mb_history, 0, i);
++	/* if we can't allocate history, then we simply won't use it */
++}
++
++static void
++ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	struct ext3_mb_history h;
++
++	if (unlikely(sbi->s_mb_history == NULL))
++		return;
++
++	h.goal = ac->ac_g_ex;
++	h.result = ac->ac_b_ex;
++	h.found = ac->ac_found;
++	h.cr = ac->ac_criteria;
++	h.groups = ac->ac_groups_scanned;
++	h.tail = ac->ac_tail;
++	h.buddy = ac->ac_buddy;
++	h.merged = 0;
++	if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
++	    ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
++		h.merged = 1;
++
++	spin_lock(&sbi->s_mb_history_lock);
++	memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
++	if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
++		sbi->s_mb_history_cur = 0;
++	spin_unlock(&sbi->s_mb_history_lock);
++}
++
++#else
++#define ext3_mb_history_release(sb)
++#define ext3_mb_history_init(sb)
++#endif
++
++int ext3_mb_init_backend(struct super_block *sb)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	int i, len;
++
++	len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count;
++	sbi->s_group_info = kmalloc(len, GFP_KERNEL);
++	if (sbi->s_group_info == NULL) {
++		printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
++		return -ENOMEM;
++	}
++	memset(sbi->s_group_info, 0, len);
++
++	sbi->s_buddy_cache = new_inode(sb);
++	if (sbi->s_buddy_cache == NULL) {
++		printk(KERN_ERR "EXT3-fs: can't get new inode\n");
++		kfree(sbi->s_group_info);
++		return -ENOMEM;
++	}
++
++	/*
++	 * calculate the needed size. if you change the bb_counters size,
++	 * don't forget about ext3_mb_generate_buddy()
++	 */
++	len = sizeof(struct ext3_group_info);
++	len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
++	for (i = 0; i < sbi->s_groups_count; i++) {
++		struct ext3_group_desc * desc;
++
++		sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL);
++		if (sbi->s_group_info[i] == NULL) {
++			printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
++			goto err_out;
++		}
++		desc = ext3_get_group_desc(sb, i, NULL);
++		if (desc == NULL) {
++			printk(KERN_ERR "EXT3-fs: can't read descriptor %u\n", i);
++			goto err_out;
++		}
++		memset(sbi->s_group_info[i], 0, len);
++		set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
++			&sbi->s_group_info[i]->bb_state);
++		sbi->s_group_info[i]->bb_free =
++			le16_to_cpu(desc->bg_free_blocks_count);
++	}
++
++	return 0;
++
++err_out:
++	while (--i >= 0)
++		kfree(sbi->s_group_info[i]);
++	kfree(sbi->s_group_info);
++	iput(sbi->s_buddy_cache);
++
++	return -ENOMEM;
++}
++
++int ext3_mb_init(struct super_block *sb, int needs_recovery)
++{
++	struct ext3_sb_info *sbi = EXT3_SB(sb);
++	struct inode *root = sb->s_root->d_inode;
++	unsigned i, offset, max;
++	struct dentry *dentry;
++
++	if (!test_opt(sb, MBALLOC))
++		return 0;
++
++	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
++
++	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
++	if (sbi->s_mb_offsets == NULL) {
++		clear_opt(sbi->s_mount_opt, MBALLOC);
++		return -ENOMEM;
++	}
++	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
++	if (sbi->s_mb_maxs == NULL) {
++		clear_opt(sbi->s_mount_opt, MBALLOC);
++		kfree(sbi->s_mb_offsets);
++		return -ENOMEM;
++	}
++
++	/* order 0 is regular bitmap */
++	sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
++	sbi->s_mb_offsets[0] = 0;
++
++	i = 1;
++	offset = 0;
++	max = sb->s_blocksize << 2;
++	do {
++		sbi->s_mb_offsets[i] = offset;
++		sbi->s_mb_maxs[i] = max;
++		offset += 1 << (sb->s_blocksize_bits - i);
++		max = max >> 1;
++		i++;
++	} while (i <= sb->s_blocksize_bits + 1);
++
++	/* init file for buddy data */
++	if ((i = ext3_mb_init_backend(sb))) {
++		clear_opt(sbi->s_mount_opt, MBALLOC);
++		kfree(sbi->s_mb_offsets);
++		kfree(sbi->s_mb_maxs);
++		return i;
++	}
++
++	spin_lock_init(&sbi->s_reserve_lock);
++	spin_lock_init(&sbi->s_md_lock);
++	INIT_LIST_HEAD(&sbi->s_active_transaction);
++	INIT_LIST_HEAD(&sbi->s_closed_transaction);
++	INIT_LIST_HEAD(&sbi->s_committed_transaction);
++	spin_lock_init(&sbi->s_bal_lock);
++
++	/* remove old on-disk buddy file */
++	mutex_lock(&root->i_mutex);
++	dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy"));
++	if (dentry->d_inode != NULL) {
++		i = vfs_unlink(root, dentry);
++		if (i != 0)
++			printk("EXT3-fs: can't remove .buddy file: 
%d\n", i); ++ } ++ dput(dentry); ++ mutex_unlock(&root->i_mutex); ++ ++ ext3_mb_history_init(sb); ++ ++ printk("EXT3-fs: mballoc enabled\n"); ++ return 0; ++} ++ ++int ext3_mb_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* release freed, non-committed blocks */ ++ spin_lock(&sbi->s_md_lock); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_committed_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ ext3_mb_free_committed_blocks(sb); ++ ++ if (sbi->s_group_info) { ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_group_info[i] == NULL) ++ continue; ++ kfree(sbi->s_group_info[i]); ++ } ++ kfree(sbi->s_group_info); ++ } ++ if (sbi->s_mb_offsets) ++ kfree(sbi->s_mb_offsets); ++ if (sbi->s_mb_maxs) ++ kfree(sbi->s_mb_maxs); ++ if (sbi->s_buddy_cache) ++ iput(sbi->s_buddy_cache); ++ if (sbi->s_blocks_reserved) ++ printk("ext3-fs: %ld blocks being reserved at umount!\n", ++ sbi->s_blocks_reserved); ++ if (ext3_mb_stats) { ++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n", ++ atomic_read(&sbi->s_bal_allocated), ++ atomic_read(&sbi->s_bal_reqs), ++ atomic_read(&sbi->s_bal_success)); ++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, " ++ "%u 2^N hits, %u breaks\n", ++ atomic_read(&sbi->s_bal_ex_scanned), ++ atomic_read(&sbi->s_bal_goals), ++ atomic_read(&sbi->s_bal_2orders), ++ atomic_read(&sbi->s_bal_breaks)); ++ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n", ++ sbi->s_mb_buddies_generated++, ++ sbi->s_mb_generation_time); ++ } ++ ++ ext3_mb_history_release(sb); ++ ++ return 0; ++} ++ ++void ext3_mb_free_committed_blocks(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int err, i, count = 0, count2 = 0; ++ struct ext3_free_metadata *md; ++ struct ext3_buddy e3b; ++ ++ if (list_empty(&sbi->s_committed_transaction)) ++ return; ++ ++ /* there is committed blocks to be freed yet */ ++ do { ++ /* get next array of blocks */ ++ md = NULL; ++ spin_lock(&sbi->s_md_lock); ++ if (!list_empty(&sbi->s_committed_transaction)) { ++ md = list_entry(sbi->s_committed_transaction.next, ++ struct ext3_free_metadata, list); ++ list_del(&md->list); ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ if (md == NULL) ++ break; ++ ++ mb_debug("gonna free %u blocks in group %u (0x%p):", ++ md->num, md->group, md); ++ ++ err = ext3_mb_load_buddy(sb, md->group, &e3b); ++ BUG_ON(err != 0); ++ ++ /* there are blocks to put in buddy to make them really free */ ++ count += md->num; ++ count2++; ++ ext3_lock_group(sb, md->group); ++ for (i = 0; i < md->num; i++) { ++ mb_debug(" %u", md->blocks[i]); ++ mb_free_blocks(&e3b, md->blocks[i], 1); ++ } ++ mb_debug("\n"); ++ ext3_unlock_group(sb, md->group); ++ ++ /* balance refcounts from ext3_mb_free_metadata() */ ++ page_cache_release(e3b.bd_buddy_page); ++ page_cache_release(e3b.bd_bitmap_page); ++ ++ kfree(md); ++ ext3_mb_release_desc(&e3b); ++ ++ } while (md); ++ mb_debug("freed %u blocks in %u structures\n", count, count2); ++} ++ ++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ if (sbi->s_last_transaction == handle->h_transaction->t_tid) ++ return; ++ ++ /* new transaction! time to close last one and free blocks for ++ * committed transaction. 
we know that only transaction can be ++ * active, so previos transaction can be being logged and we ++ * know that transaction before previous is known to be already ++ * logged. this means that now we may free blocks freed in all ++ * transactions before previous one. hope I'm clear enough ... */ ++ ++ spin_lock(&sbi->s_md_lock); ++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { ++ mb_debug("new transaction %lu, old %lu\n", ++ (unsigned long) handle->h_transaction->t_tid, ++ (unsigned long) sbi->s_last_transaction); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_closed_transaction); ++ sbi->s_last_transaction = handle->h_transaction->t_tid; ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ ext3_mb_free_committed_blocks(sb); ++} ++ ++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, ++ int group, int block, int count) ++{ ++ struct ext3_group_info *db = e3b->bd_info; ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_free_metadata *md; ++ int i; ++ ++ J_ASSERT(e3b->bd_bitmap_page != NULL); ++ J_ASSERT(e3b->bd_buddy_page != NULL); ++ ++ ext3_lock_group(sb, group); ++ for (i = 0; i < count; i++) { ++ md = db->bb_md_cur; ++ if (md && db->bb_tid != handle->h_transaction->t_tid) { ++ db->bb_md_cur = NULL; ++ md = NULL; ++ } ++ ++ if (md == NULL) { ++ ext3_unlock_group(sb, group); ++ md = kmalloc(sizeof(*md), GFP_KERNEL); ++ if (md == NULL) ++ return -ENOMEM; ++ md->num = 0; ++ md->group = group; ++ ++ ext3_lock_group(sb, group); ++ if (db->bb_md_cur == NULL) { ++ spin_lock(&sbi->s_md_lock); ++ list_add(&md->list, &sbi->s_active_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ /* protect buddy cache from being freed, ++ * otherwise we'll refresh it from ++ * on-disk bitmap and lose not-yet-available ++ * blocks */ ++ page_cache_get(e3b->bd_buddy_page); ++ page_cache_get(e3b->bd_bitmap_page); ++ db->bb_md_cur = md; ++ db->bb_tid = handle->h_transaction->t_tid; ++ mb_debug("new md 0x%p for group %u\n", ++ md, md->group); ++ } else { ++ kfree(md); ++ md = db->bb_md_cur; ++ } ++ } ++ ++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); ++ md->blocks[md->num] = block + i; ++ md->num++; ++ if (md->num == EXT3_BB_MAX_BLOCKS) { ++ /* no more space, put full container on a sb's list */ ++ db->bb_md_cur = NULL; ++ } ++ } ++ ext3_unlock_group(sb, group); ++ return 0; ++} ++ ++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, ++ unsigned long block, unsigned long count, ++ int metadata, int *freed) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ unsigned long bit, overflow; ++ struct buffer_head *gd_bh; ++ unsigned long block_group; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ int err = 0, ret; ++ ++ *freed = 0; ++ sb = inode->i_sb; ++ if (!sb) { ++ printk ("ext3_free_blocks: nonexistent device"); ++ return; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ if (block < le32_to_cpu(es->s_first_data_block) || ++ block + count < block || ++ block + count > le32_to_cpu(es->s_blocks_count)) { ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks not in datazone - " ++ "block = %lu, count = %lu", block, count); ++ goto error_return; ++ } ++ ++ ext3_debug("freeing block %lu\n", block); ++ ++do_more: ++ overflow = 0; ++ block_group = (block - le32_to_cpu(es->s_first_data_block)) / 
++ EXT3_BLOCKS_PER_GROUP(sb); ++ bit = (block - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb); ++ /* ++ * Check to see if we are freeing blocks across a group ++ * boundary. ++ */ ++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { ++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); ++ count -= overflow; ++ } ++ brelse(bitmap_bh); ++ bitmap_bh = read_block_bitmap(sb, block_group); ++ if (!bitmap_bh) ++ goto error_return; ++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); ++ if (!gdp) ++ goto error_return; ++ ++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || ++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || ++ in_range (block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group) || ++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks in system zones - " ++ "Block = %lu, count = %lu", ++ block, count); ++ ++ BUFFER_TRACE(bitmap_bh, "getting write access"); ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) ++ goto error_return; ++ ++ /* ++ * We are about to modify some metadata. Call the journal APIs ++ * to unshare ->b_data if a currently-committing transaction is ++ * using it ++ */ ++ BUFFER_TRACE(gd_bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, gd_bh); ++ if (err) ++ goto error_return; ++ ++ err = ext3_mb_load_buddy(sb, block_group, &e3b); ++ if (err) ++ goto error_return; ++ ++#ifdef AGGRESSIVE_CHECK ++ { ++ int i; ++ for (i = 0; i < count; i++) ++ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); ++ } ++#endif ++ mb_clear_bits(bitmap_bh->b_data, bit, count); ++ ++ /* We dirtied the bitmap block */ ++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ ++ if (metadata) { ++ /* blocks being freed are metadata. 
these blocks shouldn't ++ * be used until this transaction is committed */ ++ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); ++ } else { ++ ext3_lock_group(sb, block_group); ++ mb_free_blocks(&e3b, bit, count); ++ ext3_unlock_group(sb, block_group); ++ } ++ ++ spin_lock(sb_bgl_lock(sbi, block_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); ++ spin_unlock(sb_bgl_lock(sbi, block_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, count); ++ ++ ext3_mb_release_desc(&e3b); ++ ++ *freed = count; ++ ++ /* And the group descriptor block */ ++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ++ ret = ext3_journal_dirty_metadata(handle, gd_bh); ++ if (!err) err = ret; ++ ++ if (overflow && !err) { ++ block += count; ++ count = overflow; ++ goto do_more; ++ } ++ sb->s_dirt = 1; ++error_return: ++ brelse(bitmap_bh); ++ ext3_std_error(sb, err); ++ return; ++} ++ ++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int free, ret = -ENOSPC; ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); ++ if (blocks <= free - sbi->s_blocks_reserved) { ++ sbi->s_blocks_reserved += blocks; ++ ret = 0; ++ } ++ spin_unlock(&sbi->s_reserve_lock); ++ return ret; ++} ++ ++void ext3_mb_release_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ sbi->s_blocks_reserved -= blocks; ++ WARN_ON(sbi->s_blocks_reserved < 0); ++ if (sbi->s_blocks_reserved < 0) ++ sbi->s_blocks_reserved = 0; ++ spin_unlock(&sbi->s_reserve_lock); ++} ++ ++int ext3_new_block(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *errp) ++{ ++ int ret, len; ++ ++ if (!test_opt(inode->i_sb, MBALLOC)) { ++ ret = ext3_new_block_old(handle, inode, goal, errp); ++ goto out; ++ } ++ len = 1; ++ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); ++out: ++ return ret; ++} ++ ++ ++void ext3_free_blocks(handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count, int metadata) ++{ ++ struct super_block *sb; ++ int freed; ++ ++ sb = inode->i_sb; ++ if (!test_opt(sb, MBALLOC)) ++ ext3_free_blocks_sb(handle, sb, block, count, &freed); ++ else ++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); ++ if (freed) ++ DQUOT_FREE_BLOCK(inode, freed); ++ return; ++} ++ ++#define EXT3_ROOT "ext3" ++#define EXT3_MB_STATS_NAME "mb_stats" ++#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" ++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" ++ ++static int ext3_mb_stats_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_stats); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_stats_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", ++ EXT3_MB_STATS_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); ++ return count; ++} ++ ++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void 
*data)
++{
++	int len;
++
++	*eof = 1;
++	if (off != 0)
++		return 0;
++
++	len = sprintf(page, "%ld\n", ext3_mb_max_to_scan);
++	*start = page;
++	return len;
++}
++
++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer,
++				     unsigned long count, void *data)
++{
++	char str[32];
++	long value;
++
++	if (count >= sizeof(str)) {
++		printk(KERN_ERR "EXT3: %s string too long, max %u bytes\n",
++		       EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str));
++		return -EOVERFLOW;
++	}
++
++	if (copy_from_user(str, buffer, count))
++		return -EFAULT;
++
++	/* Accept only strictly positive values */
++	value = simple_strtol(str, NULL, 0);
++	if (value <= 0)
++		return -ERANGE;
++
++	ext3_mb_max_to_scan = value;
++
++	return count;
++}
++
++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off,
++				    int count, int *eof, void *data)
++{
++	int len;
++
++	*eof = 1;
++	if (off != 0)
++		return 0;
++
++	len = sprintf(page, "%ld\n", ext3_mb_min_to_scan);
++	*start = page;
++	return len;
++}
++
++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer,
++				     unsigned long count, void *data)
++{
++	char str[32];
++	long value;
++
++	if (count >= sizeof(str)) {
++		printk(KERN_ERR "EXT3: %s string too long, max %u bytes\n",
++		       EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str));
++		return -EOVERFLOW;
++	}
++
++	if (copy_from_user(str, buffer, count))
++		return -EFAULT;
++
++	/* Accept only strictly positive values */
++	value = simple_strtol(str, NULL, 0);
++	if (value <= 0)
++		return -ERANGE;
++
++	ext3_mb_min_to_scan = value;
++
++	return count;
++}
++
++int __init init_ext3_proc(void)
++{
++	struct proc_dir_entry *proc_ext3_mb_stats;
++	struct proc_dir_entry *proc_ext3_mb_max_to_scan;
++	struct proc_dir_entry *proc_ext3_mb_min_to_scan;
++
++	proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs);
++	if (proc_root_ext3 == NULL) {
++		printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT);
++		return -EIO;
++	}
++
++	/* Initialize EXT3_MB_STATS_NAME */
++	proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME,
++			S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++	if (proc_ext3_mb_stats == NULL) {
++		printk(KERN_ERR "EXT3: Unable to create %s\n",
++		       EXT3_MB_STATS_NAME);
++		remove_proc_entry(EXT3_ROOT, proc_root_fs);
++		return -EIO;
++	}
++
++	proc_ext3_mb_stats->data = NULL;
++	proc_ext3_mb_stats->read_proc = ext3_mb_stats_read;
++	proc_ext3_mb_stats->write_proc = ext3_mb_stats_write;
++
++	/* Initialize EXT3_MB_MAX_TO_SCAN_NAME */
++	proc_ext3_mb_max_to_scan = create_proc_entry(
++			EXT3_MB_MAX_TO_SCAN_NAME,
++			S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++	if (proc_ext3_mb_max_to_scan == NULL) {
++		printk(KERN_ERR "EXT3: Unable to create %s\n",
++		       EXT3_MB_MAX_TO_SCAN_NAME);
++		remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++		remove_proc_entry(EXT3_ROOT, proc_root_fs);
++		return -EIO;
++	}
++
++	proc_ext3_mb_max_to_scan->data = NULL;
++	proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read;
++	proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write;
++
++	/* Initialize EXT3_MB_MIN_TO_SCAN_NAME */
++	proc_ext3_mb_min_to_scan = create_proc_entry(
++			EXT3_MB_MIN_TO_SCAN_NAME,
++			S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++	if (proc_ext3_mb_min_to_scan == NULL) {
++		printk(KERN_ERR "EXT3: Unable to create %s\n",
++		       EXT3_MB_MIN_TO_SCAN_NAME);
++		remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++		remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++		remove_proc_entry(EXT3_ROOT, proc_root_fs);
++		return -EIO;
++	}
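++
++	/* A usage sketch for these tunables from userspace (hypothetical
++	 * shell session; paths assume procfs is mounted at /proc, so
++	 * proc_root_fs corresponds to /proc/fs):
++	 *
++	 *	cat /proc/fs/ext3/mb_stats
++	 *	echo 1 > /proc/fs/ext3/mb_stats
++	 *	echo 200 > /proc/fs/ext3/mb_max_to_scan
++	 *	echo 10 > /proc/fs/ext3/mb_min_to_scan
++	 */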
++ ++ proc_ext3_mb_min_to_scan->data = NULL; ++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; ++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; ++ ++ return 0; ++} ++ ++void exit_ext3_proc(void) ++{ ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++} ++ +Index: linux-stage/fs/ext3/extents.c +=================================================================== +--- linux-stage.orig/fs/ext3/extents.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/extents.c 2006-07-16 02:29:49.000000000 +0800 +@@ -771,7 +771,7 @@ cleanup: + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st + path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + +@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t + bh = sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-stage/fs/ext3/xattr.c +=================================================================== +--- linux-stage.orig/fs/ext3/xattr.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/xattr.c 2006-07-16 02:29:49.000000000 +0800 +@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); +- ext3_free_blocks(handle, inode, bh->b_blocknr, 1); ++ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { +@@ -805,7 +805,7 @@ inserted: + new_bh = sb_getblk(sb, block); + if (!new_bh) { + getblk_failed: +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + error = -EIO; + goto cleanup; + } +Index: linux-stage/fs/ext3/balloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/balloc.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/balloc.c 2006-07-16 02:33:13.000000000 +0800 +@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ + * + * Return buffer_head on success or NULL in case of failure. 
+ */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -490,24 +490,6 @@ error_return: + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- ext3_fsblk_t block, unsigned long count) +-{ +- struct super_block * sb; +- unsigned long dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1463,7 +1445,7 @@ out: + return 0; + } + +-ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, ++ext3_fsblk_t ext3_new_block_old(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int *errp) + { + unsigned long count = 1; +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/super.c 2006-07-16 02:29:49.000000000 +0800 +@@ -391,6 +391,7 @@ static void ext3_put_super (struct super + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -641,7 +642,7 @@ enum { + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, +- Opt_extents, Opt_extdebug, ++ Opt_extents, Opt_extdebug, Opt_mballoc, + Opt_grpquota + }; + +@@ -696,6 +697,7 @@ static match_table_t tokens = { + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -1047,6 +1049,9 @@ clear_qf_name: + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt (sbi->s_mount_opt, MBALLOC); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1773,6 +1778,7 @@ static int ext3_fill_super (struct super + "writeback"); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + lock_kernel(); + return 0; + +@@ -2712,7 +2718,13 @@ static struct file_system_type ext3_fs_t + + static int __init init_ext3_fs(void) + { +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2734,6 +2746,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); + } + + int ext3_prep_san_write(struct inode *inode, long *blocks, +Index: linux-stage/fs/ext3/Makefile +=================================================================== +--- linux-stage.orig/fs/ext3/Makefile 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/Makefile 2006-07-16 02:29:49.000000000 +0800 +@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o ++ extents.o 
mballoc.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs.h 2006-07-16 02:29:49.000000000 +0800 +@@ -53,6 +53,14 @@ + #define ext3_debug(f, a...) do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -379,6 +387,7 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x2000000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -749,12 +758,12 @@ ext3_group_first_block_no(struct super_b + /* balloc.c */ + extern int ext3_bg_has_super(struct super_block *sb, int group); + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); +-extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, +- ext3_fsblk_t goal, int *errp); ++//extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, ++// ext3_fsblk_t goal, int *errp); + extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, unsigned long *count, int *errp); + extern void ext3_free_blocks (handle_t *handle, struct inode *inode, +- ext3_fsblk_t block, unsigned long count); ++ ext3_fsblk_t block, unsigned long count, int metadata); + extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb, + ext3_fsblk_t block, unsigned long count, + unsigned long *pdquot_freed_blocks); +@@ -881,6 +890,17 @@ extern void ext3_extents_initialize_bloc + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern long ext3_mb_stats; ++extern long ext3_mb_max_to_scan; ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct super_block *, int); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); ++ + #endif /* __KERNEL__ */ + + /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ +Index: linux-stage/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs_sb.h 2006-07-16 02:29:49.000000000 +0800 +@@ -21,8 +21,14 @@ + #include + #include + #include ++#include + #endif + #include ++#include ++ ++struct ext3_buddy_group_blocks; ++struct ext3_mb_history; ++#define EXT3_BB_MAX_BLOCKS + + /* + * third extended-fs super-block data in memory +@@ -78,6 +84,38 @@ struct ext3_sb_info { + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_group_info **s_group_info; ++ struct inode *s_buddy_cache; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ unsigned short *s_mb_offsets, *s_mb_maxs; ++ ++ /* history to debug policy */ ++ struct ext3_mb_history *s_mb_history; ++ int s_mb_history_cur; ++ int s_mb_history_max; ++ struct proc_dir_entry *s_mb_proc; ++ spinlock_t s_mb_history_lock; ++ ++ /* stats for buddy allocator */ ++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ++ atomic_t s_bal_success; /* we found long enough chunks */ ++ atomic_t s_bal_allocated; /* in blocks */ ++ atomic_t s_bal_ex_scanned; /* total extents scanned */ ++ atomic_t s_bal_goals; /* goal hits */ ++ atomic_t s_bal_breaks; /* too long searches */ ++ atomic_t s_bal_2orders; /* 2^order hits */ ++ spinlock_t s_bal_lock; ++ unsigned long s_mb_buddies_generated; ++ unsigned long long s_mb_generation_time; + }; + + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2006-07-16 02:29:43.000000000 +0800 ++++ linux-stage/fs/ext3/inode.c 2006-07-16 02:29:49.000000000 +0800 +@@ -562,7 +562,7 @@ static int ext3_alloc_blocks(handle_t *h + return ret; + failed_out: + for (i = 0; i ++ */ ++ ++#define ALIVE_MAGIC 0xA1153C29 ++struct alive_struct { ++ __le32 al_magic; ++ __le32 al_seq; ++ __le32 al_time; ++ char al_nodename[65]; ++}; +Index: linux-stage/fs/ext3/namei.c +=================================================================== +--- linux-stage.orig/fs/ext3/namei.c 2006-07-16 00:49:58.000000000 +0800 ++++ linux-stage/fs/ext3/namei.c 2006-07-16 00:50:31.000000000 +0800 +@@ -805,7 +805,7 @@ static inline int search_dirblock(struct + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. 
+ */ +-static struct buffer_head * ext3_find_entry (struct dentry *dentry, ++struct buffer_head * ext3_find_entry (struct dentry *dentry, + struct ext3_dir_entry_2 ** res_dir) + { + struct super_block * sb; +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2006-07-16 00:50:06.000000000 +0800 ++++ linux-stage/fs/ext3/super.c 2006-07-16 00:50:31.000000000 +0800 +@@ -35,12 +35,14 @@ + #include + #include + #include ++#include + + #include + + #include "xattr.h" + #include "acl.h" + #include "namei.h" ++#include "al.h" + + static int ext3_load_journal(struct super_block *, struct ext3_super_block *, + unsigned long journal_devnum); +@@ -61,6 +63,8 @@ static int ext3_statfs (struct dentry * + static void ext3_unlockfs(struct super_block *sb); + static void ext3_write_super (struct super_block * sb); + static void ext3_write_super_lockfs(struct super_block *sb); ++struct buffer_head * ext3_find_entry (struct dentry *dentry, ++ struct ext3_dir_entry_2 ** res_dir); + + /* + * Wrappers for journal_start/end. +@@ -434,6 +438,10 @@ static void ext3_put_super (struct super + invalidate_bdev(sbi->journal_bdev, 0); + ext3_blkdev_remove(sbi); + } ++ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_ALIVE)) { ++ BUG_ON(!sbi->s_alive_tsk); ++ kthread_stop(sbi->s_alive_tsk); ++ } + sb->s_fs_info = NULL; + kfree(sbi); + return; +@@ -1374,6 +1382,261 @@ static ext3_fsblk_t descriptor_loc(struc + return (has_super + ext3_group_first_block_no(sb, bg)); + } + ++static int write_alive(struct buffer_head * bh) ++{ ++ lock_buffer(bh); ++ bh->b_end_io = end_buffer_write_sync; ++ get_bh(bh); ++ submit_bh(WRITE, bh); ++ wait_on_buffer(bh); ++ if (unlikely(!buffer_uptodate(bh))) ++ return 1; ++ return 0; ++} ++ ++static int read_alive_again(struct buffer_head * bh) ++{ ++ lock_buffer(bh); ++ bh->b_end_io = end_buffer_read_sync; ++ get_bh(bh); ++ submit_bh(READ, bh); ++ wait_on_buffer(bh); ++ if (!buffer_uptodate(bh)) { ++ brelse(bh); ++ return 1; ++ } ++ return 0; ++} ++ ++/* ++ * The caller must have a ref on the buffer_head. ++ */ ++static int kalived(void *data) ++{ ++ struct buffer_head * bh; ++ struct alive_struct * alive; ++ char b[BDEVNAME_SIZE]; ++ u32 seq = 0; ++ ++ bh = (struct buffer_head *)data; ++ bdevname(bh->b_bdev, b); ++ ++ alive = (struct alive_struct *)(bh->b_data); ++ alive->al_magic = cpu_to_le32(ALIVE_MAGIC); ++ alive->al_time = cpu_to_le32(get_seconds()); ++ ++ down_read(&uts_sem); ++ memcpy(alive->al_nodename, system_utsname.nodename, 65); ++ up_read(&uts_sem); ++ ++ while (!kthread_should_stop()) { ++ if (++seq == 0) ++ ++seq; ++ ++ alive->al_seq = cpu_to_le32(seq); ++ alive->al_time = cpu_to_le32(get_seconds()); ++ ++ if (unlikely(write_alive(bh))) { ++ /* panic here? 
*/ ++ printk(KERN_ERR "Alive (device %s): " ++ "can't write alive block\n", b); ++ continue; ++ } ++ ++ schedule_timeout_interruptible(5 * HZ); ++ } ++ ++ alive->al_seq = 0; ++ alive->al_time = cpu_to_le32(get_seconds()); ++ ++ if (unlikely(write_alive(bh))) ++ printk(KERN_ERR "Alive (device %s): " ++ "can't reset alive block\n", b); ++ brelse(bh); ++ return 0; ++} ++ ++static unsigned long get_alive_ino(struct super_block *sb) ++{ ++ unsigned long ino = 0; ++ struct dentry alive; ++ struct dentry * root; ++ struct inode * root_inode; ++ struct ext3_dir_entry_2 * de; ++ struct buffer_head * bh; ++ ++ root_inode = iget(sb, EXT3_ROOT_INO); ++ root = d_alloc_root(root_inode); ++ if (!root) { ++ printk(KERN_ERR "Alive (device %s): get root inode failed\n", ++ sb->s_id); ++ iput(root_inode); ++ goto out; ++ } ++ ++ alive.d_name.name = ".alive"; ++ alive.d_name.len = 6; ++ alive.d_parent = root; ++ ++ bh = ext3_find_entry(&alive, &de); ++ dput(root); ++ ++ if (!bh) { ++ printk(KERN_WARNING "Alive (device %s): alive lookup failed\n", ++ sb->s_id); ++ goto out; ++ } ++ ++ ino = le32_to_cpu(de->inode); ++ brelse (bh); ++ pr_debug("Alive (device %s): alive_ino=%lu\n", sb->s_id, ino); ++out: ++ return ino; ++} ++ ++/* check alive file */ ++static int check_alive(struct super_block *sb, struct ext3_sb_info *sbi) ++{ ++ unsigned long ino; ++ struct buffer_head * bh; ++ struct ext3_inode_info * ei; ++ struct inode * alive_inode; ++ struct alive_struct * alive; ++ u32 alive_block; ++ u32 seq; ++ ++ ino = get_alive_ino(sb); ++ if (!ino) ++ goto failed; ++ ++ alive_inode = iget(sb, ino); ++ if (!alive_inode) { ++ iput(alive_inode); ++ printk(KERN_ERR "Alive (device %s): get alive inode failed\n", ++ sb->s_id); ++ goto failed; ++ } ++ if (!alive_inode->i_nlink) { ++ make_bad_inode(alive_inode); ++ iput(alive_inode); ++ printk(KERN_ERR "Alive (device %s): alive inode is deleted\n", ++ sb->s_id); ++ goto failed; ++ } ++ if (!S_ISREG(alive_inode->i_mode)) { ++ iput(alive_inode); ++ printk(KERN_ERR "Alive (device %s): invalid alive inode\n", ++ sb->s_id); ++ goto failed; ++ } ++ if (EXT3_I(alive_inode)->i_flags & EXT3_EXTENTS_FL) { ++ iput(alive_inode); ++ printk(KERN_ERR "Alive (device %s): invalid alive inode, " ++ "in extents format\n", sb->s_id); ++ goto failed; ++ } ++ ++ ei = EXT3_I(alive_inode); ++ alive_block = ei->i_data[0]; ++ iput(alive_inode); ++ ++ pr_debug("Alive (device %s): read in alive block #%u\n", ++ sb->s_id, alive_block); ++ ++ /* first read */ ++ bh = sb_bread(sb, alive_block); ++ if (!bh) { ++ printk(KERN_ERR "Alive (device %s): " ++ "can't read alive block #%u\n", sb->s_id, alive_block); ++ goto failed; ++ } ++ ++ alive = (struct alive_struct *)(bh->b_data); ++ if (le32_to_cpu(alive->al_magic) != ALIVE_MAGIC) { ++ printk(KERN_ERR "Alive (device %s): " ++ "magic mismatch\n", sb->s_id); ++ brelse(bh); ++ goto failed; ++ } ++ ++ seq = le32_to_cpu(alive->al_seq); ++ pr_debug("Alive (device %s): seq=%u\n", sb->s_id, seq); ++ pr_info ("Alive (device %s): last touched by node: %s, " ++ "%li seconds ago\n", sb->s_id, alive->al_nodename, ++ get_seconds() - le32_to_cpu(alive->al_time)); ++ ++ if (seq == 0) ++ goto skip; ++ ++ /* wait 8s */ ++ pr_info("Alive (device %s): wait for 8 seconds...\n", sb->s_id); ++ schedule_timeout_uninterruptible(HZ * 8); ++ ++ /* read again */ ++ if (read_alive_again(bh)) { ++ printk(KERN_ERR "Alive (device %s): " ++ "can't read alive block #%u\n", ++ sb->s_id, alive_block); ++ goto failed; ++ } ++ ++ alive = (struct alive_struct *)(bh->b_data); ++ pr_debug("Alive 
(device %s): seq=%u\n", ++ sb->s_id, le32_to_cpu(alive->al_seq)); ++ ++ if (seq != le32_to_cpu(alive->al_seq)) { ++ printk(KERN_WARNING "Alive (device %s): " ++ "still active on node %s\n", ++ sb->s_id, alive->al_nodename); ++ brelse(bh); ++ goto failed; ++ } ++skip: ++ /* write a new random seq */ ++ get_random_bytes(&seq, sizeof(u32)); ++ alive->al_seq = cpu_to_le32(seq); ++ if (unlikely(write_alive(bh))) { ++ printk(KERN_ERR "Alive (device %s): " ++ "can't write alive block\n", sb->s_id); ++ goto failed; ++ } ++ pr_debug("Alive (device %s): write random seq=%u\n", sb->s_id, seq); ++ ++ /* wait 6s */ ++ pr_info("Alive (device %s): wait for 6 seconds...\n", sb->s_id); ++ schedule_timeout_uninterruptible(HZ * 6); ++ ++ /* read again */ ++ if (read_alive_again(bh)) { ++ printk(KERN_ERR "Alive (device %s): " ++ "can't read alive block #%u\n", ++ sb->s_id, alive_block); ++ goto failed; ++ } ++ ++ alive = (struct alive_struct *)(bh->b_data); ++ pr_debug("Alive (device %s): seq=%u\n", ++ sb->s_id, le32_to_cpu(alive->al_seq)); ++ ++ if (seq != le32_to_cpu(alive->al_seq)) { ++ printk(KERN_WARNING "Alive (device %s): " ++ "still active on node %s\n", ++ sb->s_id, alive->al_nodename); ++ brelse(bh); ++ goto failed; ++ } ++ ++ /* succeed */ ++ pr_info("Alive (device %s): alive check passed!\n", sb->s_id); ++ sbi->s_alive_tsk = kthread_run(kalived, bh, "kalived"); ++ return 0; ++ ++failed: ++ printk(KERN_WARNING "Alive (device %s): alive check failed!\n", ++ sb->s_id); ++ return 1; ++} ++ + + static int ext3_fill_super (struct super_block *sb, void *data, int silent) + { +@@ -1688,6 +1951,10 @@ static int ext3_fill_super (struct super + EXT3_HAS_INCOMPAT_FEATURE(sb, + EXT3_FEATURE_INCOMPAT_RECOVER)); + ++ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_ALIVE)) ++ if (check_alive(sb, sbi)) ++ goto failed_mount2; ++ + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal! 
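+
+The handshake above, as a sketch (intervals taken from the code:
+kalived() rewrites the block every 5 seconds, check_alive() uses 8 and
+6 second verification waits):
+
+    mount: read seq, wait 8s, re-read; seq changed => some node is
+           still heartbeating, fail the mount
+    then:  write a random seq, wait 6s, re-read; seq changed => a
+           concurrently mounting node won the race, fail the mount
+    else:  alive check passed; start kalived, which bumps al_seq and
+           al_time every 5s and zeroes al_seq on clean unmount
+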
+Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2006-07-16 00:49:58.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs.h 2006-07-16 00:50:31.000000000 +0800 +@@ -579,12 +579,14 @@ static inline struct ext3_inode_info *EX + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 + #define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ ++#define EXT3_FEATURE_INCOMPAT_ALIVE 0x0080 + + #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ + EXT3_FEATURE_INCOMPAT_META_BG| \ +- EXT3_FEATURE_INCOMPAT_EXTENTS) ++ EXT3_FEATURE_INCOMPAT_EXTENTS| \ ++ EXT3_FEATURE_INCOMPAT_ALIVE) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) +Index: linux-stage/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_sb.h 2006-07-16 00:50:02.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs_sb.h 2006-07-16 00:50:31.000000000 +0800 +@@ -86,6 +86,7 @@ struct ext3_sb_info { + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ struct task_struct * s_alive_tsk; + + /* for buddy allocator */ + struct ext3_group_info **s_group_info; diff --git a/lustre/kernel_patches/patches/ext3-wantedi-misc-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/ext3-wantedi-misc-2.6.18-vanilla.patch new file mode 100644 index 0000000..11c6ada --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-wantedi-misc-2.6.18-vanilla.patch @@ -0,0 +1,16 @@ +Index: linux-2.6/include/linux/dcache.h +=================================================================== +--- linux-2.6.orig/include/linux/dcache.h 2006-07-15 16:11:52.000000000 +0800 ++++ linux-2.6/include/linux/dcache.h 2006-07-15 16:12:04.000000000 +0800 +@@ -24,6 +24,11 @@ struct vfsmount; + + #define IS_ROOT(x) ((x) == (x)->d_parent) + ++struct dentry_params { ++ unsigned long p_inum; ++ void *p_ptr; ++}; ++ + /* + * "quick string" -- eases parameter passing, but more importantly + * saves "metadata" about the string (ie length and the hash). diff --git a/lustre/kernel_patches/patches/iopen-misc-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/iopen-misc-2.6.18-vanilla.patch new file mode 100644 index 0000000..89927c3 --- /dev/null +++ b/lustre/kernel_patches/patches/iopen-misc-2.6.18-vanilla.patch @@ -0,0 +1,82 @@ +Index: linux-2.6/Documentation/filesystems/ext2.txt +=================================================================== +--- linux-2.6.orig/Documentation/filesystems/ext2.txt 2006-04-03 22:46:38.000000000 +0800 ++++ linux-2.6/Documentation/filesystems/ext2.txt 2006-07-15 12:54:06.000000000 +0800 +@@ -58,6 +58,22 @@ nobh Do not attach buffer_heads to fi + + xip Use execute in place (no caching) if possible + ++iopen Makes an invisible pseudo-directory called ++ __iopen__ available in the root directory ++ of the filesystem. Allows open-by-inode- ++ number. i.e., inode 3145 can be accessed ++ via /mntpt/__iopen__/3145 ++ ++iopen_nopriv This option makes the iopen directory be ++ world-readable. 
This may be safer since it ++ allows daemons to run as an unprivileged user, ++ however it significantly changes the security ++ model of a Unix filesystem, since previously ++ all files under a mode 700 directory were not ++ generally avilable even if the ++ permissions on the file itself is ++ world-readable. ++ + grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. + + +Index: linux-2.6/fs/dcache.c +=================================================================== +--- linux-2.6.orig/fs/dcache.c 2006-07-15 12:48:18.000000000 +0800 ++++ linux-2.6/fs/dcache.c 2006-07-15 12:54:06.000000000 +0800 +@@ -1341,14 +1341,13 @@ static void switch_names(struct dentry * + * dcache entries should not be moved in this way. + */ + +-void d_move(struct dentry * dentry, struct dentry * target) ++void __d_move(struct dentry * dentry, struct dentry * target) + { + struct hlist_head *list; + + if (!dentry->d_inode) + printk(KERN_WARNING "VFS: moving negative dcache entry\n"); + +- spin_lock(&dcache_lock); + write_seqlock(&rename_lock); + /* + * XXXX: do we really need to take target->d_lock? +@@ -1399,6 +1398,14 @@ already_unhashed: + fsnotify_d_move(dentry); + spin_unlock(&dentry->d_lock); + write_sequnlock(&rename_lock); ++} ++ ++EXPORT_SYMBOL(__d_move); ++ ++void d_move(struct dentry *dentry, struct dentry *target) ++{ ++ spin_lock(&dcache_lock); ++ __d_move(dentry, target); + spin_unlock(&dcache_lock); + } + +Index: linux-2.6/include/linux/dcache.h +=================================================================== +--- linux-2.6.orig/include/linux/dcache.h 2006-07-15 12:48:41.000000000 +0800 ++++ linux-2.6/include/linux/dcache.h 2006-07-15 12:54:06.000000000 +0800 +@@ -257,6 +257,7 @@ extern int have_submounts(struct dentry + * This adds the entry to the hash queues. + */ + extern void d_rehash(struct dentry *); ++extern void d_rehash_cond(struct dentry *, int lock); + + /** + * d_add - add dentry to hash queues +@@ -292,6 +293,7 @@ static inline struct dentry *d_add_uniqu + + /* used for rename() and baskets */ + extern void d_move(struct dentry *, struct dentry *); ++extern void __d_move(struct dentry *, struct dentry *); + + /* appendix may either be NULL or be used for transname suffixes */ + extern struct dentry * d_lookup(struct dentry *, struct qstr *); diff --git a/lustre/kernel_patches/patches/jbd-jcberr-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/jbd-jcberr-2.6.18-vanilla.patch new file mode 100644 index 0000000..867d41c --- /dev/null +++ b/lustre/kernel_patches/patches/jbd-jcberr-2.6.18-vanilla.patch @@ -0,0 +1,228 @@ +Index: linux-2.6/include/linux/jbd.h +=================================================================== +--- linux-2.6.orig/include/linux/jbd.h 2006-07-15 16:08:35.000000000 +0800 ++++ linux-2.6/include/linux/jbd.h 2006-07-15 16:13:01.000000000 +0800 +@@ -356,6 +356,27 @@ static inline void jbd_unlock_bh_journal + bit_spin_unlock(BH_JournalHead, &bh->b_state); + } + ++#define HAVE_JOURNAL_CALLBACK_STATUS ++/** ++ * struct journal_callback - Base structure for callback information. ++ * @jcb_list: list information for other callbacks attached to the same handle. ++ * @jcb_func: Function to call with this callback structure. ++ * ++ * This struct is a 'seed' structure for a using with your own callback ++ * structs. If you are using callbacks you must allocate one of these ++ * or another struct of your own definition which has this struct ++ * as it's first element and pass it to journal_callback_set(). 
++ * ++ * This is used internally by jbd to maintain callback information. ++ * ++ * See journal_callback_set for more information. ++ **/ ++struct journal_callback { ++ struct list_head jcb_list; /* t_jcb_lock */ ++ void (*jcb_func)(struct journal_callback *jcb, int error); ++ /* user data goes here */ ++}; ++ + struct jbd_revoke_table_s; + + /** +@@ -364,6 +385,7 @@ struct jbd_revoke_table_s; + * @h_transaction: Which compound transaction is this update a part of? + * @h_buffer_credits: Number of remaining buffers we are allowed to dirty. + * @h_ref: Reference count on this handle ++ * @h_jcb: List of application registered callbacks for this handle. + * @h_err: Field for caller's use to track errors through large fs operations + * @h_sync: flag for sync-on-close + * @h_jdata: flag to force data journaling +@@ -389,6 +411,13 @@ struct handle_s + /* operations */ + int h_err; + ++ /* ++ * List of application registered callbacks for this handle. The ++ * function(s) will be called after the transaction that this handle is ++ * part of has been committed to disk. [t_jcb_lock] ++ */ ++ struct list_head h_jcb; ++ + /* Flags [no locking] */ + unsigned int h_sync: 1; /* sync-on-close */ + unsigned int h_jdata: 1; /* force data journaling */ +@@ -430,6 +459,8 @@ struct handle_s + * j_state_lock + * ->j_list_lock (journal_unmap_buffer) + * ++ * t_handle_lock ++ * ->t_jcb_lock + */ + + struct transaction_s +@@ -559,6 +590,15 @@ struct transaction_s + */ + int t_handle_count; + ++ /* ++ * Protects the callback list ++ */ ++ spinlock_t t_jcb_lock; ++ /* ++ * List of registered callback functions for this transaction. ++ * Called when the transaction is committed. [t_jcb_lock] ++ */ ++ struct list_head t_jcb; + }; + + /** +@@ -906,6 +946,10 @@ extern void journal_invalidatepage(jour + extern int journal_try_to_free_buffers(journal_t *, struct page *, gfp_t); + extern int journal_stop(handle_t *); + extern int journal_flush (journal_t *); ++extern void journal_callback_set(handle_t *handle, ++ void (*fn)(struct journal_callback *,int), ++ struct journal_callback *jcb); ++ + extern void journal_lock_updates (journal_t *); + extern void journal_unlock_updates (journal_t *); + +Index: linux-2.6/fs/jbd/checkpoint.c +=================================================================== +--- linux-2.6.orig/fs/jbd/checkpoint.c 2006-07-15 16:08:36.000000000 +0800 ++++ linux-2.6/fs/jbd/checkpoint.c 2006-07-15 16:13:01.000000000 +0800 +@@ -688,6 +688,7 @@ void __journal_drop_transaction(journal_ + J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(transaction->t_checkpoint_io_list == NULL); + J_ASSERT(transaction->t_updates == 0); ++ J_ASSERT(list_empty(&transaction->t_jcb)); + J_ASSERT(journal->j_committing_transaction != transaction); + J_ASSERT(journal->j_running_transaction != transaction); + +Index: linux-2.6/fs/jbd/commit.c +=================================================================== +--- linux-2.6.orig/fs/jbd/commit.c 2006-07-15 16:08:36.000000000 +0800 ++++ linux-2.6/fs/jbd/commit.c 2006-07-15 16:13:01.000000000 +0800 +@@ -708,6 +708,30 @@ wait_for_iobuf: + transaction can be removed from any checkpoint list it was on + before. */ + ++ /* ++ * Call any callbacks that had been registered for handles in this ++ * transaction. It is up to the callback to free any allocated ++ * memory. ++ * ++ * The spinlocking (t_jcb_lock) here is surely unnecessary... 
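++ *
++ * Editor's note (not in the original patch): the lock is dropped and
++ * retaken around each jcb_func() invocation below, so a callback is
++ * never actually entered with t_jcb_lock held.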
++ */ ++ spin_lock(&commit_transaction->t_jcb_lock); ++ if (!list_empty(&commit_transaction->t_jcb)) { ++ struct list_head *p, *n; ++ int error = is_journal_aborted(journal); ++ ++ list_for_each_safe(p, n, &commit_transaction->t_jcb) { ++ struct journal_callback *jcb; ++ ++ jcb = list_entry(p, struct journal_callback, jcb_list); ++ list_del(p); ++ spin_unlock(&commit_transaction->t_jcb_lock); ++ jcb->jcb_func(jcb, error); ++ spin_lock(&commit_transaction->t_jcb_lock); ++ } ++ } ++ spin_unlock(&commit_transaction->t_jcb_lock); ++ + jbd_debug(3, "JBD: commit phase 7\n"); + + J_ASSERT(commit_transaction->t_sync_datalist == NULL); +Index: linux-2.6/fs/jbd/journal.c +=================================================================== +--- linux-2.6.orig/fs/jbd/journal.c 2006-07-15 16:08:36.000000000 +0800 ++++ linux-2.6/fs/jbd/journal.c 2006-07-15 16:13:01.000000000 +0800 +@@ -58,6 +58,7 @@ EXPORT_SYMBOL(journal_sync_buffer); + #endif + EXPORT_SYMBOL(journal_flush); + EXPORT_SYMBOL(journal_revoke); ++EXPORT_SYMBOL(journal_callback_set); + + EXPORT_SYMBOL(journal_init_dev); + EXPORT_SYMBOL(journal_init_inode); +@@ -80,6 +81,7 @@ EXPORT_SYMBOL(journal_wipe); + EXPORT_SYMBOL(journal_blocks_per_page); + EXPORT_SYMBOL(journal_invalidatepage); + EXPORT_SYMBOL(journal_try_to_free_buffers); ++EXPORT_SYMBOL(journal_bmap); + EXPORT_SYMBOL(journal_force_commit); + + static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); +Index: linux-2.6/fs/jbd/transaction.c +=================================================================== +--- linux-2.6.orig/fs/jbd/transaction.c 2006-07-15 16:08:35.000000000 +0800 ++++ linux-2.6/fs/jbd/transaction.c 2006-07-15 16:13:01.000000000 +0800 +@@ -50,7 +50,9 @@ get_transaction(journal_t *journal, tran + transaction->t_state = T_RUNNING; + transaction->t_tid = journal->j_transaction_sequence++; + transaction->t_expires = jiffies + journal->j_commit_interval; ++ INIT_LIST_HEAD(&transaction->t_jcb); + spin_lock_init(&transaction->t_handle_lock); ++ spin_lock_init(&transaction->t_jcb_lock); + + /* Set up the commit timer for the new transaction. */ + journal->j_commit_timer.expires = transaction->t_expires; +@@ -241,6 +243,7 @@ static handle_t *new_handle(int nblocks) + memset(handle, 0, sizeof(*handle)); + handle->h_buffer_credits = nblocks; + handle->h_ref = 1; ++ INIT_LIST_HEAD(&handle->h_jcb); + + return handle; + } +@@ -1291,6 +1294,36 @@ drop: + } + + /** ++ * void journal_callback_set() - Register a callback function for this handle. ++ * @handle: handle to attach the callback to. ++ * @func: function to callback. ++ * @jcb: structure with additional information required by func() , and ++ * some space for jbd internal information. ++ * ++ * The function will be ++ * called when the transaction that this handle is part of has been ++ * committed to disk with the original callback data struct and the ++ * error status of the journal as parameters. There is no guarantee of ++ * ordering between handles within a single transaction, nor between ++ * callbacks registered on the same handle. ++ * ++ * The caller is responsible for allocating the journal_callback struct. ++ * This is to allow the caller to add as much extra data to the callback ++ * as needed, but reduce the overhead of multiple allocations. The caller ++ * allocated struct must start with a struct journal_callback at offset 0, ++ * and has the caller-specific data afterwards. 
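++ *
++ * Usage sketch (editor's illustration only; struct my_cb, my_commit_done
++ * and the GFP_NOFS allocation are assumptions, not code from this patch):
++ *
++ *	static void my_commit_done(struct journal_callback *jcb, int err)
++ *	{
++ *		struct my_cb *cb = (struct my_cb *)jcb;
++ *		(act on the commit here; err is nonzero if the journal aborted)
++ *		kfree(cb);	<- the callback frees its own allocation
++ *	}
++ *
++ *	cb = kmalloc(sizeof(*cb), GFP_NOFS);
++ *	journal_callback_set(handle, my_commit_done, &cb->jcb);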
++ */ ++void journal_callback_set(handle_t *handle, ++ void (*func)(struct journal_callback *jcb, int error), ++ struct journal_callback *jcb) ++{ ++ spin_lock(&handle->h_transaction->t_jcb_lock); ++ list_add_tail(&jcb->jcb_list, &handle->h_jcb); ++ spin_unlock(&handle->h_transaction->t_jcb_lock); ++ jcb->jcb_func = func; ++} ++ ++/** + * int journal_stop() - complete a transaction + * @handle: tranaction to complete. + * +@@ -1363,6 +1396,11 @@ int journal_stop(handle_t *handle) + wake_up(&journal->j_wait_transaction_locked); + } + ++ /* Move callbacks from the handle to the transaction. */ ++ spin_lock(&transaction->t_jcb_lock); ++ list_splice(&handle->h_jcb, &transaction->t_jcb); ++ spin_unlock(&transaction->t_jcb_lock); ++ + /* + * If the handle is marked SYNC, we need to set another commit + * going! We also want to force a commit if the current diff --git a/lustre/kernel_patches/patches/nfs-cifs-intent-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/nfs-cifs-intent-2.6.18-vanilla.patch new file mode 100644 index 0000000..5639fe4 --- /dev/null +++ b/lustre/kernel_patches/patches/nfs-cifs-intent-2.6.18-vanilla.patch @@ -0,0 +1,120 @@ +Index: linux-2.6/fs/cifs/dir.c +=================================================================== +--- linux-2.6.orig/fs/cifs/dir.c 2006-07-15 21:04:01.000000000 +0800 ++++ linux-2.6/fs/cifs/dir.c 2006-07-15 21:04:47.000000000 +0800 +@@ -146,7 +146,7 @@ cifs_create(struct inode *inode, struct + } + + if(nd && (nd->flags & LOOKUP_OPEN)) { +- int oflags = nd->intent.open.flags; ++ int oflags = nd->intent.flags; + + desiredAccess = 0; + if (oflags & FMODE_READ) +Index: linux-2.6/fs/nfs/dir.c +=================================================================== +--- linux-2.6.orig/fs/nfs/dir.c 2006-07-15 21:04:01.000000000 +0800 ++++ linux-2.6/fs/nfs/dir.c 2006-07-15 21:04:47.000000000 +0800 +@@ -867,7 +867,7 @@ int nfs_is_exclusive_create(struct inode + return 0; + if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0) + return 0; +- return (nd->intent.open.flags & O_EXCL) != 0; ++ return (nd->intent.it_flags & O_EXCL) != 0; + } + + static inline int nfs_reval_fsid(struct inode *dir, +@@ -955,7 +955,7 @@ static int is_atomic_open(struct inode * + if (nd->flags & LOOKUP_DIRECTORY) + return 0; + /* Are we trying to write to a read only partition? 
*/
+-	if (IS_RDONLY(dir) && (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE)))
++	if (IS_RDONLY(dir) && (nd->intent.it_flags & (O_CREAT|O_TRUNC|FMODE_WRITE)))
+ 		return 0;
+ 	return 1;
+ }
+@@ -979,7 +979,7 @@ static struct dentry *nfs_atomic_lookup(
+ 	dentry->d_op = NFS_PROTO(dir)->dentry_ops;
+ 
+ 	/* Let vfs_create() deal with O_EXCL */
+-	if (nd->intent.open.flags & O_EXCL) {
++	if (nd->intent.it_flags & O_EXCL) {
+ 		d_add(dentry, NULL);
+ 		goto out;
+ 	}
+@@ -994,7 +994,7 @@ static struct dentry *nfs_atomic_lookup(
+ 		goto out;
+ 	}
+ 
+-	if (nd->intent.open.flags & O_CREAT) {
++	if (nd->intent.it_flags & O_CREAT) {
+ 		nfs_begin_data_update(dir);
+ 		res = nfs4_atomic_open(dir, dentry, nd);
+ 		nfs_end_data_update(dir);
+@@ -1013,7 +1013,7 @@ static struct dentry *nfs_atomic_lookup(
+ 		case -ENOTDIR:
+ 			goto no_open;
+ 		case -ELOOP:
+-			if (!(nd->intent.open.flags & O_NOFOLLOW))
++			if (!(nd->intent.it_flags & O_NOFOLLOW))
+ 				goto no_open;
+ 		/* case -EINVAL: */
+ 		default:
+@@ -1049,7 +1049,7 @@ static int nfs_open_revalidate(struct de
+ 	/* NFS only supports OPEN on regular files */
+ 	if (!S_ISREG(inode->i_mode))
+ 		goto no_open;
+-	openflags = nd->intent.open.flags;
++	openflags = nd->intent.it_flags;
+ 	/* We cannot do exclusive creation on a positive dentry */
+ 	if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
+ 		goto no_open;
+@@ -1182,7 +1182,7 @@ static int nfs_create(struct inode *dir,
+ 	attr.ia_valid = ATTR_MODE;
+ 
+ 	if (nd && (nd->flags & LOOKUP_CREATE))
+-		open_flags = nd->intent.open.flags;
++		open_flags = nd->intent.it_flags;
+ 
+ 	lock_kernel();
+ 	nfs_begin_data_update(dir);
+Index: linux-2.6/fs/nfs/nfs4proc.c
+===================================================================
+--- linux-2.6.orig/fs/nfs/nfs4proc.c 2006-07-15 21:04:01.000000000 +0800
++++ linux-2.6/fs/nfs/nfs4proc.c 2006-07-15 21:09:29.000000000 +0800
+@@ -1246,7 +1246,7 @@ static int nfs4_intent_set_file(struct n
+ 	ctx->state = state;
+ 	return 0;
+ 	}
+-	nfs4_close_state(state, nd->intent.open.flags);
++	nfs4_close_state(state, nd->intent.flags);
+ 	return PTR_ERR(filp);
+ }
+ 
+@@ -1259,22 +1259,22 @@
+ nfs4_atomic_open(struct inode *dir, stru
+ 	struct dentry *res;
+ 
+ 	if (nd->flags & LOOKUP_CREATE) {
+-		attr.ia_mode = nd->intent.open.create_mode;
++		attr.ia_mode = nd->intent.create_mode;
+ 		attr.ia_valid = ATTR_MODE;
+ 		if (!IS_POSIXACL(dir))
+ 			attr.ia_mode &= ~current->fs->umask;
+ 	} else {
+ 		attr.ia_valid = 0;
+-		BUG_ON(nd->intent.open.flags & O_CREAT);
++		BUG_ON(nd->intent.flags & O_CREAT);
+ 	}
+ 
+ 	cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0);
+ 	if (IS_ERR(cred))
+ 		return (struct dentry *)cred;
+-	state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred);
++	state = nfs4_do_open(dir, dentry, nd->intent.flags, &attr, cred);
+ 	put_rpccred(cred);
+ 	if (IS_ERR(state)) {
+ 		if (PTR_ERR(state) == -ENOENT)
+ 			d_add(dentry, NULL);
+ 		return (struct dentry *)state;
+ 	}
diff --git a/lustre/kernel_patches/patches/tcp-zero-copy-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/tcp-zero-copy-2.6.18-vanilla.patch
new file mode 100644
index 0000000..cb33b04
--- /dev/null
+++ b/lustre/kernel_patches/patches/tcp-zero-copy-2.6.18-vanilla.patch
@@ -0,0 +1,450 @@
+Index: linux-2.6/net/core/skbuff.c
+===================================================================
+--- linux-2.6.orig/net/core/skbuff.c 2006-07-15 21:08:45.000000000 +0800
++++ linux-2.6/net/core/skbuff.c 2006-07-15 21:12:21.000000000 +0800
+@@ -183,7 +183,8 @@ struct sk_buff *__alloc_skb(unsigned int
+ shinfo->gso_type = 0; + shinfo->ip6_frag_id = 0; + shinfo->frag_list = NULL; +- ++ shinfo->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */ ++ shinfo->zccd2 = NULL; + if (fclone) { + struct sk_buff *child = skb + 1; + atomic_t *fclone_ref = (atomic_t *) (child + 1); +@@ -283,6 +284,10 @@ static void skb_release_data(struct sk_b + if (!skb->cloned || + !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, + &skb_shinfo(skb)->dataref)) { ++ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */ ++ zccd_put (skb_shinfo(skb)->zccd); /* release hold */ ++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */ ++ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */ + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +@@ -618,6 +623,14 @@ struct sk_buff *pskb_copy(struct sk_buff + n->data_len = skb->data_len; + n->len = skb->len; + ++ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */ ++ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */ ++ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd; ++ ++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? */ ++ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */ ++ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; ++ + if (skb_shinfo(skb)->nr_frags) { + int i; + +@@ -661,6 +674,9 @@ int pskb_expand_head(struct sk_buff *skb + u8 *data; + int size = nhead + (skb->end - skb->head) + ntail; + long off; ++ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */ ++ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */ ++ + + if (skb_shared(skb)) + BUG(); +@@ -682,6 +698,11 @@ int pskb_expand_head(struct sk_buff *skb + if (skb_shinfo(skb)->frag_list) + skb_clone_fraglist(skb); + ++ if (zccd != NULL) /* user zero copy descriptor? */ ++ zccd_get (zccd); /* extra ref (pages are shared) */ ++ if (zccd2 != NULL) /* 2nd user zero copy descriptor? 
*/ ++ zccd_get (zccd2); /* extra ref (pages are shared) */ ++ + skb_release_data(skb); + + off = (data + nhead) - skb->head; +@@ -696,6 +717,8 @@ int pskb_expand_head(struct sk_buff *skb + skb->cloned = 0; + skb->nohdr = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); ++ skb_shinfo(skb)->zccd = zccd; ++ skb_shinfo(skb)->zccd2 = zccd2; + return 0; + + nodata: +Index: linux-2.6/net/ipv4/tcp.c +=================================================================== +--- linux-2.6.orig/net/ipv4/tcp.c 2006-07-15 21:08:45.000000000 +0800 ++++ linux-2.6/net/ipv4/tcp.c 2006-07-15 22:32:12.000000000 +0800 +@@ -499,8 +499,10 @@ static inline void tcp_push(struct sock + } + } + ++/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */ + static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, +- size_t psize, int flags) ++ size_t psize, int flags, zccd_t *zccd) ++ + { + struct tcp_sock *tp = tcp_sk(sk); + int mss_now, size_goal; +@@ -548,6 +550,17 @@ new_segment: + copy = size; + + i = skb_shinfo(skb)->nr_frags; ++ ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */ ++ skb_shinfo(skb)->zccd2 != NULL && ++ skb_shinfo(skb)->zccd != zccd && /* not the same one */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ tcp_mark_push (tp, skb); ++ goto new_segment; ++ } ++ + can_coalesce = skb_can_coalesce(skb, i, page, offset); + if (!can_coalesce && i >= MAX_SKB_FRAGS) { + tcp_mark_push(tp, skb); +@@ -563,6 +576,20 @@ new_segment: + skb_fill_page_desc(skb, i, page, offset, copy); + } + ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ zccd_get (zccd); /* bump ref count */ ++ ++ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); ++ ++ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */ ++ skb_shinfo(skb)->zccd = zccd; ++ else ++ skb_shinfo(skb)->zccd2 = zccd; ++ } ++ + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; +@@ -628,12 +655,37 @@ ssize_t tcp_sendpage(struct socket *sock + + lock_sock(sk); + TCP_CHECK_TIMER(sk); +- res = do_tcp_sendpages(sk, &page, offset, size, flags); ++ res = do_tcp_sendpages(sk, &page, offset, size, flags,NULL); ++ TCP_CHECK_TIMER(sk); ++ release_sock(sk); ++ return res; ++} ++ ++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, zccd_t *zccd) ++{ ++ ssize_t res; ++ struct sock *sk = sock->sk; ++ ++#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) ++ ++ if (!(sk->sk_route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */ ++ !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */ ++ BUG (); ++ ++#undef TCP_ZC_CSUM_FLAGS ++ ++ lock_sock(sk); ++ TCP_CHECK_TIMER(sk); ++ ++ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd); ++ + TCP_CHECK_TIMER(sk); + release_sock(sk); + return res; + } + ++ + #define TCP_PAGE(sk) (sk->sk_sndmsg_page) + #define TCP_OFF(sk) (sk->sk_sndmsg_off) + +@@ -1477,6 +1529,202 @@ recv_urg: + goto out; + } + ++int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets, ++ int len, int nonblock) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ int copied; ++ long timeo; ++ ++ BUG_TRAP (len > 0); ++ /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/ ++ ++ lock_sock(sk); ++ ++ TCP_CHECK_TIMER(sk); ++ ++ copied = -ENOTCONN; ++ if (sk->sk_state == TCP_LISTEN) ++ goto out; ++ ++ copied = 0; ++ timeo = 
sock_rcvtimeo(sk, nonblock);
++
++	do {
++		struct sk_buff * skb;
++		u32 offset;
++		unsigned long used;
++		int exhausted;
++		int eaten;
++
++		/* Are we at urgent data? Stop if we have read anything. */
++		if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
++			break;
++
++		/* We need to check signals first, to get correct SIGURG
++		 * handling. FIXME: Need to check this doesn't impact 1003.1g
++		 * and move it down to the bottom of the loop
++		 */
++		if (signal_pending(current)) {
++			if (copied)
++				break;
++			copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
++			break;
++		}
++
++		/* Next get a buffer. */
++
++		skb = skb_peek(&sk->sk_receive_queue);
++
++		if (skb == NULL) /* nothing ready */
++		{
++			if (copied) {
++				if (sk->sk_err ||
++				    sk->sk_state == TCP_CLOSE ||
++				    (sk->sk_shutdown & RCV_SHUTDOWN) ||
++				    !timeo ||
++				    (0))
++					break;
++			} else {
++				if (sock_flag(sk, SOCK_DONE))
++					break;
++
++				if (sk->sk_err) {
++					copied = sock_error(sk);
++					break;
++				}
++
++				if (sk->sk_shutdown & RCV_SHUTDOWN)
++					break;
++
++				if (sk->sk_state == TCP_CLOSE) {
++					if (!(sock_flag(sk, SOCK_DONE))) {
++						/* This occurs when user tries to read
++						 * from never connected socket.
++						 */
++						copied = -ENOTCONN;
++						break;
++					}
++					break;
++				}
++
++				if (!timeo) {
++					copied = -EAGAIN;
++					break;
++				}
++			}
++
++			tcp_cleanup_rbuf(sk, copied);
++			sk_wait_data(sk, &timeo);
++			continue;
++		}
++
++		BUG_TRAP (atomic_read (&skb->users) == 1);
++
++		exhausted = eaten = 0;
++
++		offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
++		if (skb->h.th->syn)
++			offset--;
++
++		used = skb->len - offset;
++
++		if (tp->urg_data) {
++			u32 urg_offset = tp->urg_seq - tp->copied_seq;
++			if (urg_offset < used) {
++				if (!urg_offset) { /* at urgent data */
++					if (!(sock_flag(sk, SOCK_URGINLINE))) {
++						tp->copied_seq++; /* discard the single byte of urgent data */
++						offset++;
++						used--;
++					}
++				} else /* truncate read */
++					used = urg_offset;
++			}
++		}
++
++		BUG_TRAP (used >= 0);
++		if (len < used)
++			used = len;
++
++		if (used == 0)
++			exhausted = 1;
++		else
++		{
++			if (skb_is_nonlinear (skb))
++			{
++				int rc = skb_linearize (skb);
++
++				printk ("tcp_recvpackets(): linearising: %d\n", rc);
++
++				if (rc)
++				{
++					if (!copied)
++						copied = rc;
++					break;
++				}
++			}
++
++			if ((offset + used) == skb->len) /* consuming the whole packet */
++			{
++				__skb_unlink (skb, &sk->sk_receive_queue);
++				dst_release (skb->dst);
++				skb_orphan (skb);
++				__skb_pull (skb, offset);
++				__skb_queue_tail (packets, skb);
++				exhausted = eaten = 1;
++			}
++			else /* consuming only part of the packet */
++			{
++				struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
++
++				if (skb2 == NULL)
++				{
++					if (!copied)
++						copied = -ENOMEM;
++					break;
++				}
++
++				dst_release (skb2->dst);
++				__skb_pull (skb2, offset);
++				__skb_trim (skb2, used);
++				__skb_queue_tail (packets, skb2);
++			}
++
++			tp->copied_seq += used;
++			copied += used;
++			len -= used;
++		}
++
++		if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
++			tp->urg_data = 0;
++			tcp_fast_path_check(sk, tp);
++		}
++
++		if (!exhausted)
++			continue;
++
++		if (skb->h.th->fin)
++		{
++			tp->copied_seq++;
++			if (!eaten)
++				sk_eat_skb (sk, skb, 0);
++			break;
++		}
++
++		if (!eaten)
++			sk_eat_skb (sk, skb, 0);
++
++	} while (len > 0);
++
++ out:
++	/* Clean up data we have read: This will do ACK frames. */
++	tcp_cleanup_rbuf(sk, copied);
++	TCP_CHECK_TIMER(sk);
++	release_sock(sk);
++	return copied;
++}
++
+ /*
+ * State processing on a close. This implements the state shift for
+ * sending our FIN frame. 
Note that we only send a FIN for some
+@@ -2345,6 +2593,8 @@ EXPORT_SYMBOL(tcp_read_sock);
+ EXPORT_SYMBOL(tcp_recvmsg);
+ EXPORT_SYMBOL(tcp_sendmsg);
+ EXPORT_SYMBOL(tcp_sendpage);
++EXPORT_SYMBOL(tcp_sendpage_zccd);
++EXPORT_SYMBOL(tcp_recvpackets);
+ EXPORT_SYMBOL(tcp_setsockopt);
+ EXPORT_SYMBOL(tcp_shutdown);
+ EXPORT_SYMBOL(tcp_statistics);
+Index: linux-2.6/include/linux/skbuff.h
+===================================================================
+--- linux-2.6.orig/include/linux/skbuff.h 2006-07-15 21:08:45.000000000 +0800
++++ linux-2.6/include/linux/skbuff.h 2006-07-15 21:12:21.000000000 +0800
+@@ -128,6 +128,30 @@ struct skb_frag_struct {
+ 	__u16 size;
+ };
+ 
++/* Support for callback when skb data has been released */
++typedef struct zccd /* Zero Copy Callback Descriptor */
++{ /* (embed as first member of custom struct) */
++	atomic_t zccd_count; /* reference count */
++	void (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
++} zccd_t;
++
++static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
++{
++	atomic_set (&d->zccd_count, 1);
++	d->zccd_destructor = callback;
++}
++
++static inline void zccd_get (zccd_t *d) /* take a reference */
++{
++	atomic_inc (&d->zccd_count);
++}
++
++static inline void zccd_put (zccd_t *d) /* release a reference */
++{
++	if (atomic_dec_and_test (&d->zccd_count))
++		(d->zccd_destructor)(d);
++}
++
+ /* This data is invariant across clones and lives at
+ * the end of the header data, ie. at skb->end.
+ */
+@@ -140,6 +164,13 @@ struct skb_shared_info {
+ 	unsigned short gso_type;
+ 	unsigned int ip6_frag_id;
+ 	struct sk_buff *frag_list;
++	zccd_t *zccd; /* zero copy descriptor */
++	zccd_t *zccd2; /* 2nd zero copy descriptor */
++	/* NB we expect zero-copy data to be at least 1 packet, so
++	 * having 2 zccds means we don't unnecessarily split the packet
++	 * where consecutive zero-copy sends abut.
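++	 *
++	 * Lifecycle sketch (editor's illustration; desc and my_zcc_done
++	 * are made-up names, not part of the original patch):
++	 *
++	 *	zccd_init(&desc, my_zcc_done);	<- refcount starts at 1
++	 *	tcp_sendpage_zccd(sock, page, off, len, flags, &desc);
++	 *	zccd_put(&desc);	<- drop the sender's reference;
++	 *	   my_zcc_done() fires only after the last skb still
++	 *	   referencing the pages has been released by the stack.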
++ */ ++ + skb_frag_t frags[MAX_SKB_FRAGS]; + }; + +Index: linux-2.6/include/net/tcp.h +=================================================================== +--- linux-2.6.orig/include/net/tcp.h 2006-07-15 21:08:45.000000000 +0800 ++++ linux-2.6/include/net/tcp.h 2006-07-15 21:12:21.000000000 +0800 +@@ -278,6 +278,9 @@ extern int tcp_v4_tw_remember_stam + extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size); + extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); ++extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, zccd_t *zccd); ++ + + extern int tcp_ioctl(struct sock *sk, + int cmd, +@@ -368,6 +371,9 @@ extern int tcp_recvmsg(struct kiocb *i + struct msghdr *msg, + size_t len, int nonblock, + int flags, int *addr_len); ++extern int tcp_recvpackets(struct sock *sk, ++ struct sk_buff_head *packets, ++ int len, int nonblock); + + extern void tcp_parse_options(struct sk_buff *skb, + struct tcp_options_received *opt_rx, diff --git a/lustre/kernel_patches/patches/vfs_intent-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/vfs_intent-2.6.18-vanilla.patch new file mode 100644 index 0000000..6e86bde --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_intent-2.6.18-vanilla.patch @@ -0,0 +1,824 @@ +Index: linux-2.6/fs/inode.c +=================================================================== +--- linux-2.6.orig/fs/inode.c 2006-07-15 21:04:02.000000000 +0800 ++++ linux-2.6/fs/inode.c 2006-07-15 21:04:08.000000000 +0800 +@@ -234,6 +234,7 @@ void __iget(struct inode * inode) + inodes_stat.nr_unused--; + } + ++EXPORT_SYMBOL(__iget); + /** + * clear_inode - clear an inode + * @inode: inode to clear +Index: linux-2.6/fs/open.c +=================================================================== +--- linux-2.6.orig/fs/open.c 2006-07-15 21:04:02.000000000 +0800 ++++ linux-2.6/fs/open.c 2006-07-15 21:04:08.000000000 +0800 +@@ -225,12 +225,12 @@ static long do_sys_truncate(const char _ + struct nameidata nd; + struct inode * inode; + int error; +- ++ intent_init(&nd.intent, IT_GETATTR); + error = -EINVAL; + if (length < 0) /* sorry, but loff_t says... */ + goto out; + +- error = user_path_walk(path, &nd); ++ error = user_path_walk_it(path, &nd); + if (error) + goto out; + inode = nd.dentry->d_inode; +@@ -495,6 +495,7 @@ asmlinkage long sys_faccessat(int dfd, c + int old_fsuid, old_fsgid; + kernel_cap_t old_cap; + int res; ++ intent_init(&nd.intent, IT_GETATTR); + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? 
*/ + return -EINVAL; +@@ -519,7 +520,7 @@ asmlinkage long sys_faccessat(int dfd, c + else + current->cap_effective = current->cap_permitted; + +- res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); ++ res = __user_walk_fd_it(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); + if (!res) { + res = vfs_permission(&nd, mode); + /* SuS v2 requires we report a read only fs too */ +@@ -545,8 +546,9 @@ asmlinkage long sys_chdir(const char __u + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_GETATTR); + +- error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); ++ error = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); + if (error) + goto out; + +@@ -596,8 +598,9 @@ asmlinkage long sys_chroot(const char __ + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_GETATTR); + +- error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); ++ error = __user_walk_it(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + if (error) + goto out; + +@@ -823,6 +826,7 @@ static struct file *__dentry_open(struct + error = open(inode, f); + if (error) + goto cleanup_all; ++ intent_release(f->f_it); + } + + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); +@@ -849,6 +853,7 @@ cleanup_all: + f->f_dentry = NULL; + f->f_vfsmnt = NULL; + cleanup_file: ++ intent_release(f->f_it); + put_filp(f); + dput(dentry); + mntput(mnt); +@@ -874,6 +879,7 @@ static struct file *do_filp_open(int dfd + { + int namei_flags, error; + struct nameidata nd; ++ intent_init(&nd.intent, IT_OPEN); + + namei_flags = flags; + if ((namei_flags+1) & O_ACCMODE) +@@ -914,19 +920,19 @@ EXPORT_SYMBOL(filp_open); + struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, + int (*open)(struct inode *, struct file *)) + { +- if (IS_ERR(nd->intent.open.file)) ++ if (IS_ERR(nd->intent.file)) + goto out; + if (IS_ERR(dentry)) + goto out_err; +- nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->mnt), +- nd->intent.open.flags - 1, +- nd->intent.open.file, ++ nd->intent.file = __dentry_open(dget(dentry), mntget(nd->mnt), ++ nd->intent.flags - 1, ++ nd->intent.file, + open); + out: +- return nd->intent.open.file; ++ return nd->intent.file; + out_err: + release_open_intent(nd); +- nd->intent.open.file = (struct file *)dentry; ++ nd->intent.file = (struct file *)dentry; + goto out; + } + EXPORT_SYMBOL_GPL(lookup_instantiate_filp); +@@ -943,7 +949,8 @@ struct file *nameidata_to_filp(struct na + struct file *filp; + + /* Pick up the filp from the open intent */ +- filp = nd->intent.open.file; ++ filp = nd->intent.file; ++ filp->f_it = &nd->intent; + /* Has the filesystem initialised the file for us? 
*/ + if (filp->f_dentry == NULL) + filp = __dentry_open(nd->dentry, nd->mnt, flags, filp, NULL); +Index: linux-2.6/fs/nfsctl.c +=================================================================== +--- linux-2.6.orig/fs/nfsctl.c 2006-07-15 21:04:02.000000000 +0800 ++++ linux-2.6/fs/nfsctl.c 2006-07-15 21:04:08.000000000 +0800 +@@ -25,6 +25,7 @@ static struct file *do_open(char *name, + struct nameidata nd; + int error; + ++ intent_init(&nd.intent, IT_OPEN); + nd.mnt = do_kern_mount("nfsd", 0, "nfsd", NULL); + + if (IS_ERR(nd.mnt)) +Index: linux-2.6/fs/namei.c +=================================================================== +--- linux-2.6.orig/fs/namei.c 2006-07-15 21:04:02.000000000 +0800 ++++ linux-2.6/fs/namei.c 2006-07-15 21:04:36.000000000 +0800 +@@ -337,8 +337,19 @@ int deny_write_access(struct file * file + return 0; + } + ++void intent_release(struct lookup_intent *it) ++{ ++ if (!it) ++ return; ++ if (it->it_magic != INTENT_MAGIC) ++ return; ++ if (it->it_op_release) ++ it->it_op_release(it); ++} ++ + void path_release(struct nameidata *nd) + { ++ intent_release(&nd->intent); + dput(nd->dentry); + mntput(nd->mnt); + } +@@ -359,10 +370,10 @@ void path_release_on_umount(struct namei + */ + void release_open_intent(struct nameidata *nd) + { +- if (nd->intent.open.file->f_dentry == NULL) +- put_filp(nd->intent.open.file); ++ if (nd->intent.file->f_dentry == NULL) ++ put_filp(nd->intent.file); + else +- fput(nd->intent.open.file); ++ fput(nd->intent.file); + } + + /* +@@ -440,8 +451,12 @@ static struct dentry * real_lookup(struc + { + struct dentry * result; + struct inode *dir = parent->d_inode; ++ int counter = 0; + + mutex_lock(&dir->i_mutex); ++again: ++ counter++; ++ + /* + * First re-do the cached lookup just in case it was created + * while we waited for the directory semaphore.. +@@ -475,13 +490,16 @@ static struct dentry * real_lookup(struc + * Uhhuh! Nasty case: the cache was re-populated while + * we waited on the semaphore. Need to revalidate. 
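+	 * (Editor's note, not part of the original kernel comment: instead
+	 * of failing the lookup outright, the patched code below retries
+	 * the cached-lookup/revalidate cycle and gives up with -ESTALE
+	 * once its retry counter exceeds 10.)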
+ */ +- mutex_unlock(&dir->i_mutex); + if (result->d_op && result->d_op->d_revalidate) { + if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) { + dput(result); +- result = ERR_PTR(-ENOENT); ++ if (counter > 10) ++ result = ERR_PTR(-ESTALE); ++ if (!IS_ERR(result)) ++ goto again; + } + } ++ mutex_unlock(&dir->i_mutex); + return result; + } + +@@ -509,7 +527,9 @@ walk_init_root(const char *name, struct + static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) + { + int res = 0; ++ struct lookup_intent it = nd->intent; + char *name; ++ + if (IS_ERR(link)) + goto fail; + +@@ -519,6 +539,10 @@ static __always_inline int __vfs_follow_ + /* weird __emul_prefix() stuff did it */ + goto out; + } ++ intent_init(&nd->intent, it.it_op); ++ nd->intent.it_flags = it.it_flags; ++ nd->intent.it_create_mode = it.it_create_mode; ++ nd->intent.file = it.file; + res = link_path_walk(link, nd); + out: + if (nd->depth || res || nd->last_type!=LAST_NORM) +@@ -771,6 +795,33 @@ fail: + return PTR_ERR(dentry); + } + ++static int revalidate_special(struct nameidata *nd) ++{ ++ struct dentry *dentry = nd->dentry; ++ int err, counter = 0; ++ ++ revalidate_again: ++ if (!dentry->d_op || !dentry->d_op->d_revalidate) ++ return 0; ++ if (!dentry->d_op->d_revalidate(dentry, nd)) { ++ struct dentry *new; ++ if ((err = permission(dentry->d_parent->d_inode, MAY_EXEC, nd))) ++ return err; ++ new = real_lookup(dentry->d_parent, &dentry->d_name, nd); ++ if (IS_ERR(new)) ++ return PTR_ERR(new); ++ d_invalidate(dentry); ++ dput(dentry); ++ nd->dentry = dentry = new; ++ counter++; ++ if (counter < 10) ++ goto revalidate_again; ++ printk("excessive revalidate_it loops\n"); ++ return -ESTALE; ++ } ++ return 0; ++} ++ + /* + * Name resolution. + * This is the basic name resolution function, turning a pathname into +@@ -867,7 +918,11 @@ static fastcall int __link_path_walk(con + goto out_dput; + + if (inode->i_op->follow_link) { ++ int save_flags = nd->flags; ++ nd->flags |= LOOKUP_LINK_NOTLAST; + err = do_follow_link(&next, nd); ++ if (!(save_flags & LOOKUP_LINK_NOTLAST)) ++ nd->flags &= ~LOOKUP_LINK_NOTLAST; + if (err) + goto return_err; + err = -ENOENT; +@@ -902,6 +957,23 @@ last_component: + inode = nd->dentry->d_inode; + /* fallthrough */ + case 1: ++ nd->flags |= LOOKUP_LAST; ++ err = revalidate_special(nd); ++ nd->flags &= ~LOOKUP_LAST; ++ if (!nd->dentry->d_inode) ++ err = -ENOENT; ++ if (err) { ++ path_release(nd); ++ goto return_err; ++ } ++ if (lookup_flags & LOOKUP_DIRECTORY) { ++ err = -ENOTDIR; ++ if(!nd->dentry->d_inode->i_op || ++ !nd->dentry->d_inode->i_op->lookup) { ++ path_release(nd); ++ goto return_err; ++ } ++ } + goto return_reval; + } + if (nd->dentry->d_op && nd->dentry->d_op->d_hash) { +@@ -909,7 +981,9 @@ last_component: + if (err < 0) + break; + } ++ nd->flags |= LOOKUP_LAST; + err = do_lookup(nd, &this, &next); ++ nd->flags &= ~LOOKUP_LAST; + if (err) + break; + inode = next.dentry->d_inode; +@@ -1152,13 +1226,13 @@ static int __path_lookup_intent_open(int + + if (filp == NULL) + return -ENFILE; +- nd->intent.open.file = filp; +- nd->intent.open.flags = open_flags; +- nd->intent.open.create_mode = create_mode; ++ nd->intent.file = filp; ++ nd->intent.flags = open_flags; ++ nd->intent.create_mode = create_mode; + err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd); +- if (IS_ERR(nd->intent.open.file)) { ++ if (IS_ERR(nd->intent.file)) { + if (err == 0) { +- err = PTR_ERR(nd->intent.open.file); ++ err = PTR_ERR(nd->intent.file); + path_release(nd); + } 
+ } else if (err != 0) +@@ -1261,7 +1335,7 @@ static struct dentry *lookup_hash(struct + } + + /* SMP-safe */ +-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct nameidata *nd) + { + unsigned long hash; + struct qstr this; +@@ -1281,11 +1355,17 @@ struct dentry * lookup_one_len(const cha + } + this.hash = end_name_hash(hash); + +- return __lookup_hash(&this, base, NULL); ++ return __lookup_hash(&this, base, nd); + access: + return ERR_PTR(-EACCES); + } + ++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++{ ++ return lookup_one_len_it(name, base, len, NULL); ++} ++ ++ + /* + * namei() + * +@@ -1297,8 +1377,9 @@ access: + * that namei follows links, while lnamei does not. + * SMP-safe + */ +-int fastcall __user_walk_fd(int dfd, const char __user *name, unsigned flags, +- struct nameidata *nd) ++ ++int fastcall __user_walk_fd_it(int dfd, const char __user *name, unsigned flags, ++ struct nameidata *nd) + { + char *tmp = getname(name); + int err = PTR_ERR(tmp); +@@ -1310,9 +1391,22 @@ int fastcall __user_walk_fd(int dfd, con + return err; + } + ++int fastcall __user_walk_fd(int dfd, const char __user *name, unsigned flags, ++ struct nameidata *nd) ++{ ++ intent_init(&nd->intent, IT_LOOKUP); ++ return __user_walk_fd_it(dfd, name, flags, nd); ++} ++ ++int fastcall __user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd) ++{ ++ return __user_walk_fd_it(AT_FDCWD, name, flags, nd); ++} ++ + int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) + { +- return __user_walk_fd(AT_FDCWD, name, flags, nd); ++ intent_init(&nd->intent, IT_LOOKUP); ++ return __user_walk_it(name, flags, nd); + } + + /* +@@ -1593,6 +1687,8 @@ int open_namei(int dfd, const char *path + if (flag & O_APPEND) + acc_mode |= MAY_APPEND; + ++ nd->intent.it_flags = flag; ++ nd->intent.it_create_mode = mode; + /* + * The simplest case - just a plain lookup. + */ +@@ -1607,6 +1703,7 @@ int open_namei(int dfd, const char *path + /* + * Create - we need to know the parent. 
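+	 * (Editor's note: the patch additionally tags the lookup intent
+	 * with IT_CREAT just below, so an intent-aware filesystem can see
+	 * at lookup time that this open is allowed to create the file.)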
+ */ ++ nd->intent.it_op |= IT_CREAT; + error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode); + if (error) + return error; +@@ -1623,7 +1720,9 @@ int open_namei(int dfd, const char *path + dir = nd->dentry; + nd->flags &= ~LOOKUP_PARENT; + mutex_lock(&dir->d_inode->i_mutex); ++ nd->flags |= LOOKUP_LAST; + path.dentry = lookup_hash(nd); ++ nd->flags &= ~LOOKUP_LAST; + path.mnt = nd->mnt; + + do_last: +@@ -1633,9 +1732,9 @@ do_last: + goto exit; + } + +- if (IS_ERR(nd->intent.open.file)) { ++ if (IS_ERR(nd->intent.file)) { + mutex_unlock(&dir->d_inode->i_mutex); +- error = PTR_ERR(nd->intent.open.file); ++ error = PTR_ERR(nd->intent.file); + goto exit_dput; + } + +@@ -1688,7 +1787,7 @@ ok: + exit_dput: + dput_path(&path, nd); + exit: +- if (!IS_ERR(nd->intent.open.file)) ++ if (!IS_ERR(nd->intent.file)) + release_open_intent(nd); + path_release(nd); + return error; +@@ -1731,7 +1830,9 @@ do_link: + } + dir = nd->dentry; + mutex_lock(&dir->d_inode->i_mutex); ++ nd->flags |= LOOKUP_LAST; + path.dentry = lookup_hash(nd); ++ nd->flags &= ~LOOKUP_LAST; + path.mnt = nd->mnt; + __putname(nd->last.name); + goto do_last; +@@ -2243,6 +2344,9 @@ asmlinkage long sys_linkat(int olddfd, c + int error; + char * to; + ++ intent_init(&nd.intent, IT_LOOKUP); ++ intent_init(&old_nd.intent, IT_LOOKUP); ++ + if ((flags & ~AT_SYMLINK_FOLLOW) != 0) + return -EINVAL; + +@@ -2250,7 +2354,7 @@ asmlinkage long sys_linkat(int olddfd, c + if (IS_ERR(to)) + return PTR_ERR(to); + +- error = __user_walk_fd(olddfd, oldname, ++ error = __user_walk_fd_it(olddfd, oldname, + flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0, + &old_nd); + if (error) +Index: linux-2.6/fs/stat.c +=================================================================== +--- linux-2.6.orig/fs/stat.c 2006-07-15 21:04:02.000000000 +0800 ++++ linux-2.6/fs/stat.c 2006-07-15 21:04:08.000000000 +0800 +@@ -37,7 +37,7 @@ void generic_fillattr(struct inode *inod + + EXPORT_SYMBOL(generic_fillattr); + +-int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) ++int vfs_getattr_it(struct vfsmount *mnt, struct dentry *dentry, struct lookup_intent *it, struct kstat *stat) + { + struct inode *inode = dentry->d_inode; + int retval; +@@ -46,6 +46,8 @@ int vfs_getattr(struct vfsmount *mnt, st + if (retval) + return retval; + ++ if (inode->i_op->getattr_it) ++ return inode->i_op->getattr_it(mnt, dentry, it, stat); + if (inode->i_op->getattr) + return inode->i_op->getattr(mnt, dentry, stat); + +@@ -60,6 +62,11 @@ int vfs_getattr(struct vfsmount *mnt, st + return 0; + } + ++int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) ++{ ++ return vfs_getattr_it(mnt, dentry, NULL, stat); ++} ++ + EXPORT_SYMBOL(vfs_getattr); + + int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat) +@@ -67,9 +74,10 @@ int vfs_stat_fd(int dfd, char __user *na + struct nameidata nd; + int error; + +- error = __user_walk_fd(dfd, name, LOOKUP_FOLLOW, &nd); ++ intent_init(&nd.intent, IT_GETATTR); ++ error = __user_walk_fd_it(dfd, name, LOOKUP_FOLLOW, &nd); + if (!error) { +- error = vfs_getattr(nd.mnt, nd.dentry, stat); ++ error = vfs_getattr_it(nd.mnt, nd.dentry, &nd.intent, stat); + path_release(&nd); + } + return error; +@@ -87,9 +95,10 @@ int vfs_lstat_fd(int dfd, char __user *n + struct nameidata nd; + int error; + +- error = __user_walk_fd(dfd, name, 0, &nd); ++ intent_init(&nd.intent, IT_GETATTR); ++ error = __user_walk_fd_it(dfd, name, 0, &nd); + if (!error) { +- error = vfs_getattr(nd.mnt, nd.dentry, stat); ++ error = 
vfs_getattr_it(nd.mnt, nd.dentry, &nd.intent, stat); + path_release(&nd); + } + return error; +@@ -106,9 +115,12 @@ int vfs_fstat(unsigned int fd, struct ks + { + struct file *f = fget(fd); + int error = -EBADF; ++ struct nameidata nd; ++ intent_init(&nd.intent, IT_GETATTR); + + if (f) { +- error = vfs_getattr(f->f_vfsmnt, f->f_dentry, stat); ++ error = vfs_getattr_it(f->f_vfsmnt, f->f_dentry, &nd.intent, stat); ++ intent_release(&nd.intent); + fput(f); + } + return error; +Index: linux-2.6/fs/namespace.c +=================================================================== +--- linux-2.6.orig/fs/namespace.c 2006-07-15 21:04:02.000000000 +0800 ++++ linux-2.6/fs/namespace.c 2006-07-15 21:04:08.000000000 +0800 +@@ -73,6 +73,7 @@ struct vfsmount *alloc_vfsmnt(const char + INIT_LIST_HEAD(&mnt->mnt_share); + INIT_LIST_HEAD(&mnt->mnt_slave_list); + INIT_LIST_HEAD(&mnt->mnt_slave); ++ INIT_LIST_HEAD(&mnt->mnt_lustre_list); + if (name) { + int size = strlen(name) + 1; + char *newname = kmalloc(size, GFP_KERNEL); +@@ -162,6 +163,7 @@ static void __touch_namespace(struct nam + + static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd) + { ++ memset(old_nd, 0, sizeof(*old_nd)); + old_nd->dentry = mnt->mnt_mountpoint; + old_nd->mnt = mnt->mnt_parent; + mnt->mnt_parent = mnt; +@@ -280,6 +282,9 @@ static inline void __mntput(struct vfsmo + { + struct super_block *sb = mnt->mnt_sb; + dput(mnt->mnt_root); ++ spin_lock(&dcache_lock); ++ list_del(&mnt->mnt_lustre_list); ++ spin_unlock(&dcache_lock); + free_vfsmnt(mnt); + deactivate_super(sb); + } +@@ -582,6 +587,8 @@ static int do_umount(struct vfsmount *mn + */ + + lock_kernel(); ++ if (sb->s_op->umount_lustre) ++ sb->s_op->umount_lustre(sb); + if (sb->s_op->umount_begin) + sb->s_op->umount_begin(mnt, flags); + unlock_kernel(); +@@ -914,6 +921,7 @@ static int do_loopback(struct nameidata + return err; + if (!old_name || !*old_name) + return -EINVAL; ++ intent_init(&old_nd.intent, IT_LOOKUP); + err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); + if (err) + return err; +@@ -999,6 +1007,7 @@ static int do_move_mount(struct nameidat + return -EPERM; + if (!old_name || !*old_name) + return -EINVAL; ++ intent_init(&old_nd.intent, IT_LOOKUP); + err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); + if (err) + return err; +@@ -1388,6 +1397,7 @@ long do_mount(char *dev_name, char *dir_ + int retval = 0; + int mnt_flags = 0; + ++ intent_init(&nd.intent, IT_LOOKUP); + /* Discard magic */ + if ((flags & MS_MGC_MSK) == MS_MGC_VAL) + flags &= ~MS_MGC_MSK; +Index: linux-2.6/fs/exec.c +=================================================================== +--- linux-2.6.orig/fs/exec.c 2006-07-15 21:04:02.000000000 +0800 ++++ linux-2.6/fs/exec.c 2006-07-15 21:04:08.000000000 +0800 +@@ -127,6 +127,7 @@ asmlinkage long sys_uselib(const char __ + struct nameidata nd; + int error; + ++ intent_init(&nd.intent, IT_OPEN); + error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC); + if (error) + goto out; +@@ -477,6 +478,7 @@ struct file *open_exec(const char *name) + int err; + struct file *file; + ++ intent_init(&nd.intent, IT_OPEN); + err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC); + file = ERR_PTR(err); + +Index: linux-2.6/include/linux/dcache.h +=================================================================== +--- linux-2.6.orig/include/linux/dcache.h 2006-07-15 21:04:02.000000000 +0800 ++++ linux-2.6/include/linux/dcache.h 2006-07-15 21:04:08.000000000 +0800 +@@ -4,6 +4,7 @@ + #ifdef __KERNEL__ + 
+ #include ++#include + #include + #include + #include +@@ -36,6 +37,8 @@ struct qstr { + const unsigned char *name; + }; + ++#include ++ + struct dentry_stat_t { + int nr_dentry; + int nr_unused; +Index: linux-2.6/include/linux/fs.h +=================================================================== +--- linux-2.6.orig/include/linux/fs.h 2006-07-15 21:04:02.000000000 +0800 ++++ linux-2.6/include/linux/fs.h 2006-07-15 21:04:08.000000000 +0800 +@@ -280,6 +280,8 @@ typedef void (dio_iodone_t)(struct kiocb + #define ATTR_KILL_SUID 2048 + #define ATTR_KILL_SGID 4096 + #define ATTR_FILE 8192 ++#define ATTR_RAW 16384 /* file system, not vfs will massage attrs */ ++#define ATTR_FROM_OPEN 32768 /* called from open path, ie O_TRUNC */ + + /* + * This is the Inode Attributes structure, used for notify_change(). It +@@ -533,6 +535,7 @@ struct inode { + struct block_device *i_bdev; + struct cdev *i_cdev; + int i_cindex; ++ void *i_filterdata; + + __u32 i_generation; + +@@ -699,6 +702,7 @@ struct file { + spinlock_t f_ep_lock; + #endif /* #ifdef CONFIG_EPOLL */ + struct address_space *f_mapping; ++ struct lookup_intent *f_it; + }; + extern spinlock_t files_lock; + #define file_list_lock() spin_lock(&files_lock); +@@ -1099,7 +1103,9 @@ struct inode_operations { + void (*truncate) (struct inode *); + int (*permission) (struct inode *, int, struct nameidata *); + int (*setattr) (struct dentry *, struct iattr *); ++ int (*setattr_raw) (struct inode *, struct iattr *); + int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); ++ int (*getattr_it) (struct vfsmount *, struct dentry *, struct lookup_intent *, struct kstat *); + int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); + ssize_t (*listxattr) (struct dentry *, char *, size_t); +@@ -1140,6 +1146,7 @@ struct super_operations { + int (*remount_fs) (struct super_block *, int *, char *); + void (*clear_inode) (struct inode *); + void (*umount_begin) (struct vfsmount *, int); ++ void (*umount_lustre) (struct super_block *); + + int (*show_options)(struct seq_file *, struct vfsmount *); + int (*show_stats)(struct seq_file *, struct vfsmount *); +@@ -1362,6 +1369,7 @@ extern int may_umount_tree(struct vfsmou + extern int may_umount(struct vfsmount *); + extern void umount_tree(struct vfsmount *, int, struct list_head *); + extern void release_mounts(struct list_head *); ++struct vfsmount *do_kern_mount(const char *type, int flags, const char *name, void *data); + extern long do_mount(char *, char *, char *, unsigned long, void *); + extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); + extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, +@@ -1423,6 +1431,7 @@ extern long do_sys_open(int fdf, const c + int mode); + extern struct file *filp_open(const char *, int, int); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); ++extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *); + extern int filp_close(struct file *, fl_owner_t id); + extern char * getname(const char __user *); + +Index: linux-2.6/include/linux/namei.h +=================================================================== +--- linux-2.6.orig/include/linux/namei.h 2006-07-15 21:04:02.000000000 +0800 ++++ linux-2.6/include/linux/namei.h 2006-07-15 21:04:08.000000000 +0800 +@@ -5,10 +5,39 @@ + + struct vfsmount; + ++#define IT_OPEN (1) ++#define IT_CREAT (1<<1) ++#define IT_READDIR 
(1<<2) ++#define IT_GETATTR (1<<3) ++#define IT_LOOKUP (1<<4) ++#define IT_UNLINK (1<<5) ++#define IT_TRUNC (1<<6) ++#define IT_GETXATTR (1<<7) ++ ++struct lustre_intent_data { ++ int it_disposition; ++ int it_status; ++ __u64 it_lock_handle; ++ void *it_data; ++ int it_lock_mode; ++}; ++ ++#define INTENT_MAGIC 0x19620323 ++ ++#define it_flags flags ++#define it_create_mode create_mode ++#define lookup_intent open_intent ++ + struct open_intent { +- int flags; +- int create_mode; +- struct file *file; ++ int it_magic; ++ void (*it_op_release)(struct open_intent *); ++ int it_op; ++ int flags; ++ int create_mode; ++ struct file *file; ++ union { ++ struct lustre_intent_data lustre; ++ } d; + }; + + enum { MAX_NESTED_LINKS = 5 }; +@@ -22,12 +51,16 @@ struct nameidata { + unsigned depth; + char *saved_names[MAX_NESTED_LINKS + 1]; + +- /* Intent data */ +- union { +- struct open_intent open; +- } intent; ++ struct lookup_intent intent; + }; + ++static inline void intent_init(struct lookup_intent *it, int op) ++{ ++ memset(it, 0, sizeof(*it)); ++ it->it_magic = INTENT_MAGIC; ++ it->it_op = op; ++} ++ + /* + * Type of the last component on LOOKUP_PARENT + */ +@@ -48,6 +81,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA + #define LOOKUP_PARENT 16 + #define LOOKUP_NOALT 32 + #define LOOKUP_REVAL 64 ++#define LOOKUP_LAST (0x1000) ++#define LOOKUP_LINK_NOTLAST (0x2000) + /* + * Intent data + */ +@@ -57,10 +92,19 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA + + extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *)); + extern int FASTCALL(__user_walk_fd(int dfd, const char __user *, unsigned, struct nameidata *)); ++extern int FASTCALL(__user_walk_fd_it(int dfd, const char __user *, unsigned, struct nameidata *)); + #define user_path_walk(name,nd) \ + __user_walk_fd(AT_FDCWD, name, LOOKUP_FOLLOW, nd) + #define user_path_walk_link(name,nd) \ + __user_walk_fd(AT_FDCWD, name, 0, nd) ++ ++extern int FASTCALL(__user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd)); ++#define user_path_walk_it(name,nd) \ ++ __user_walk_it(name, LOOKUP_FOLLOW, nd) ++#define user_path_walk_link_it(name,nd) \ ++ __user_walk_it(name, 0, nd) ++extern void intent_release(struct lookup_intent *); ++ + extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *)); + extern int FASTCALL(path_walk(const char *, struct nameidata *)); + extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); +Index: linux-2.6/include/linux/mount.h +=================================================================== +--- linux-2.6.orig/include/linux/mount.h 2006-07-15 21:04:02.000000000 +0800 ++++ linux-2.6/include/linux/mount.h 2006-07-15 21:04:08.000000000 +0800 +@@ -53,6 +53,8 @@ struct vfsmount { + struct list_head mnt_slave; /* slave list entry */ + struct vfsmount *mnt_master; /* slave is on master->mnt_slave_list */ + struct namespace *mnt_namespace; /* containing namespace */ ++ struct list_head mnt_lustre_list; /* GNS mount list */ ++ unsigned long mnt_last_used; /* for GNS auto-umount (jiffies) */ + int mnt_pinned; + }; + diff --git a/lustre/kernel_patches/patches/vfs_nointent-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/vfs_nointent-2.6.18-vanilla.patch new file mode 100644 index 0000000..3caa54c --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_nointent-2.6.18-vanilla.patch @@ -0,0 +1,451 @@ +Index: linux-2.6/net/unix/af_unix.c +=================================================================== +--- linux-2.6.orig/net/unix/af_unix.c 2006-07-15 
21:01:06.000000000 +0800 ++++ linux-2.6/net/unix/af_unix.c 2006-07-15 21:01:13.000000000 +0800 +@@ -706,6 +706,7 @@ static struct sock *unix_find_other(stru + int err = 0; + + if (sunname->sun_path[0]) { ++ intent_init(&nd.intent, IT_LOOKUP); + err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd); + if (err) + goto fail; +Index: linux-2.6/fs/open.c +=================================================================== +--- linux-2.6.orig/fs/open.c 2006-07-15 21:01:10.000000000 +0800 ++++ linux-2.6/fs/open.c 2006-07-15 21:01:31.000000000 +0800 +@@ -198,9 +198,10 @@ out: + } + + int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, +- struct file *filp) ++ struct file *filp, int called_from_open) + { + int err; ++ struct inode_operations *op = dentry->d_inode->i_op; + struct iattr newattrs; + + /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ +@@ -215,7 +216,17 @@ int do_truncate(struct dentry *dentry, l + } + + mutex_lock(&dentry->d_inode->i_mutex); +- err = notify_change(dentry, &newattrs); ++ if (called_from_open) ++ newattrs.ia_valid |= ATTR_FROM_OPEN; ++ if (op->setattr_raw) { ++ newattrs.ia_valid |= ATTR_RAW; ++ newattrs.ia_ctime = CURRENT_TIME; ++ down_write(&dentry->d_inode->i_alloc_sem); ++ err = op->setattr_raw(dentry->d_inode, &newattrs); ++ up_write(&dentry->d_inode->i_alloc_sem); ++ } else ++ err = notify_change(dentry, &newattrs); ++ + mutex_unlock(&dentry->d_inode->i_mutex); + return err; + } +@@ -270,7 +281,7 @@ static long do_sys_truncate(const char _ + error = locks_verify_truncate(inode, NULL, length); + if (!error) { + DQUOT_INIT(inode); +- error = do_truncate(nd.dentry, length, 0, NULL); ++ error = do_truncate(nd.dentry, length, 0, NULL, 0); + } + put_write_access(inode); + +@@ -322,7 +333,7 @@ static long do_sys_ftruncate(unsigned in + + error = locks_verify_truncate(inode, file, length); + if (!error) +- error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); ++ error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file, 0); + out_putf: + fput(file); + out: +@@ -407,9 +418,20 @@ asmlinkage long sys_utime(char __user * + (error = vfs_permission(&nd, MAY_WRITE)) != 0) + goto dput_and_out; + } +- mutex_lock(&inode->i_mutex); +- error = notify_change(nd.dentry, &newattrs); +- mutex_unlock(&inode->i_mutex); ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } else { ++ mutex_lock(&inode->i_mutex); ++ error = notify_change(nd.dentry, &newattrs); ++ mutex_unlock(&inode->i_mutex); ++ } ++ + dput_and_out: + path_release(&nd); + out: +@@ -621,38 +643,53 @@ out: + return error; + } + +-asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) ++int chmod_common(struct dentry *dentry, mode_t mode) + { +- struct inode * inode; +- struct dentry * dentry; +- struct file * file; +- int err = -EBADF; ++ struct inode * inode = dentry->d_inode; + struct iattr newattrs; +- +- file = fget(fd); +- if (!file) ++ int error = -EROFS; ++ ++ if (IS_RDONLY(inode)) + goto out; + +- dentry = file->f_dentry; +- inode = dentry->d_inode; ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = dentry->d_inode->i_op; + +- audit_inode(NULL, inode); ++ newattrs.ia_mode = mode; ++ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, 
&newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out; ++ } + +- err = -EROFS; +- if (IS_RDONLY(inode)) +- goto out_putf; +- err = -EPERM; ++ error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) +- goto out_putf; ++ goto out; ++ + mutex_lock(&inode->i_mutex); + if (mode == (mode_t) -1) + mode = inode->i_mode; + newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; +- err = notify_change(dentry, &newattrs); ++ error = notify_change(dentry, &newattrs); + mutex_unlock(&inode->i_mutex); ++out: ++ return error; ++} + +-out_putf: ++ ++asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) ++{ ++ struct file * file; ++ int err = -EBADF; ++ ++ file = fget(fd); ++ if (!file) ++ goto out; ++ ++ err = chmod_common(file->f_dentry, mode); + fput(file); + out: + return err; +@@ -662,32 +699,12 @@ asmlinkage long sys_fchmodat(int dfd, co + mode_t mode) + { + struct nameidata nd; +- struct inode * inode; + int error; +- struct iattr newattrs; + + error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd); + if (error) + goto out; +- inode = nd.dentry->d_inode; +- +- error = -EROFS; +- if (IS_RDONLY(inode)) +- goto dput_and_out; +- +- error = -EPERM; +- if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) +- goto dput_and_out; +- +- mutex_lock(&inode->i_mutex); +- if (mode == (mode_t) -1) +- mode = inode->i_mode; +- newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); +- newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; +- error = notify_change(nd.dentry, &newattrs); +- mutex_unlock(&inode->i_mutex); +- +-dput_and_out: ++ error = chmod_common(nd.dentry, mode); + path_release(&nd); + out: + return error; +@@ -713,6 +730,18 @@ static int chown_common(struct dentry * + if (IS_RDONLY(inode)) + goto out; + error = -EPERM; ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = dentry->d_inode->i_op; ++ ++ newattrs.ia_uid = user; ++ newattrs.ia_gid = group; ++ newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_CTIME; ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ return error; ++ } + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out; + newattrs.ia_valid = ATTR_CTIME; +Index: linux-2.6/fs/namei.c +=================================================================== +--- linux-2.6.orig/fs/namei.c 2006-07-15 21:01:10.000000000 +0800 ++++ linux-2.6/fs/namei.c 2006-07-15 21:01:13.000000000 +0800 +@@ -1642,7 +1642,7 @@ int may_open(struct nameidata *nd, int a + if (!error) { + DQUOT_INIT(inode); + +- error = do_truncate(dentry, 0, ATTR_MTIME|ATTR_CTIME, NULL); ++ error = do_truncate(dentry, 0, ATTR_MTIME|ATTR_CTIME, NULL, 1); + } + put_write_access(inode); + if (error) +@@ -1916,6 +1916,7 @@ asmlinkage long sys_mknodat(int dfd, con + char * tmp; + struct dentry * dentry; + struct nameidata nd; ++ intent_init(&nd.intent, IT_LOOKUP); + + if (S_ISDIR(mode)) + return -EPERM; +@@ -1926,6 +1927,15 @@ asmlinkage long sys_mknodat(int dfd, con + error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ ++ if (nd.dentry->d_inode->i_op->mknod_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mknod_raw(&nd, mode, dev); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ + dentry = lookup_create(&nd, 0); + error = PTR_ERR(dentry); + +@@ -1952,6 +1962,7 @@ 
asmlinkage long sys_mknodat(int dfd, con + dput(dentry); + } + mutex_unlock(&nd.dentry->d_inode->i_mutex); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1997,9 +2008,18 @@ asmlinkage long sys_mkdirat(int dfd, con + struct dentry *dentry; + struct nameidata nd; + ++ intent_init(&nd.intent, IT_LOOKUP); + error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.dentry->d_inode->i_op->mkdir_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mkdir_raw(&nd, mode); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ + dentry = lookup_create(&nd, 1); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -2009,6 +2029,7 @@ asmlinkage long sys_mkdirat(int dfd, con + dput(dentry); + } + mutex_unlock(&nd.dentry->d_inode->i_mutex); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -2089,6 +2110,7 @@ static long do_rmdir(int dfd, const char + char * name; + struct dentry *dentry; + struct nameidata nd; ++ intent_init(&nd.intent, IT_LOOKUP); + + name = getname(pathname); + if(IS_ERR(name)) +@@ -2109,6 +2131,14 @@ static long do_rmdir(int dfd, const char + error = -EBUSY; + goto exit1; + } ++ if (nd.dentry->d_inode->i_op->rmdir_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ error = op->rmdir_raw(&nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } + mutex_lock_nested(&nd.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = lookup_hash(&nd); + error = PTR_ERR(dentry); +@@ -2172,6 +2202,7 @@ static long do_unlinkat(int dfd, const c + struct dentry *dentry; + struct nameidata nd; + struct inode *inode = NULL; ++ intent_init(&nd.intent, IT_LOOKUP); + + name = getname(pathname); + if(IS_ERR(name)) +@@ -2183,6 +2214,13 @@ static long do_unlinkat(int dfd, const c + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; ++ if (nd.dentry->d_inode->i_op->unlink_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->unlink_raw(&nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } + mutex_lock_nested(&nd.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = lookup_hash(&nd); + error = PTR_ERR(dentry); +@@ -2265,9 +2303,17 @@ asmlinkage long sys_symlinkat(const char + struct dentry *dentry; + struct nameidata nd; + ++ intent_init(&nd.intent, IT_LOOKUP); + error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.dentry->d_inode->i_op->symlink_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->symlink_raw(&nd, from); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } + dentry = lookup_create(&nd, 0); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -2275,6 +2321,7 @@ asmlinkage long sys_symlinkat(const char + dput(dentry); + } + mutex_unlock(&nd.dentry->d_inode->i_mutex); ++out2: + path_release(&nd); + out: + putname(to); +@@ -2365,6 +2412,13 @@ asmlinkage long sys_linkat(int olddfd, c + error = -EXDEV; + if (old_nd.mnt != nd.mnt) + goto out_release; ++ if (nd.dentry->d_inode->i_op->link_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->link_raw(&old_nd, &nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out_release; ++ } + new_dentry = lookup_create(&nd, 0); + error = PTR_ERR(new_dentry); + if 
(!IS_ERR(new_dentry)) { +@@ -2541,6 +2595,8 @@ static int do_rename(int olddfd, const c + struct dentry * old_dentry, *new_dentry; + struct dentry * trap; + struct nameidata oldnd, newnd; ++ intent_init(&oldnd.intent, IT_LOOKUP); ++ intent_init(&newnd.intent, IT_LOOKUP); + + error = do_path_lookup(olddfd, oldname, LOOKUP_PARENT, &oldnd); + if (error) +@@ -2563,6 +2619,13 @@ static int do_rename(int olddfd, const c + if (newnd.last_type != LAST_NORM) + goto exit2; + ++ if (old_dir->d_inode->i_op->rename_raw) { ++ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit2; ++ } ++ + trap = lock_rename(new_dir, old_dir); + + old_dentry = lookup_hash(&oldnd); +@@ -2594,8 +2657,7 @@ static int do_rename(int olddfd, const c + if (new_dentry == trap) + goto exit5; + +- error = vfs_rename(old_dir->d_inode, old_dentry, +- new_dir->d_inode, new_dentry); ++ error = vfs_rename(old_dir->d_inode, old_dentry, new_dir->d_inode, new_dentry); + exit5: + dput(new_dentry); + exit4: +Index: linux-2.6/fs/exec.c +=================================================================== +--- linux-2.6.orig/fs/exec.c 2006-07-15 21:01:10.000000000 +0800 ++++ linux-2.6/fs/exec.c 2006-07-15 21:01:13.000000000 +0800 +@@ -1533,7 +1533,7 @@ int do_coredump(long signr, int exit_cod + goto close_fail; + if (!file->f_op->write) + goto close_fail; +- if (do_truncate(file->f_dentry, 0, 0, file) != 0) ++ if (do_truncate(file->f_dentry, 0, 0, file, 0) != 0) + goto close_fail; + + retval = binfmt->core_dump(signr, regs, file); +Index: linux-2.6/include/linux/fs.h +=================================================================== +--- linux-2.6.orig/include/linux/fs.h 2006-07-15 21:01:10.000000000 +0800 ++++ linux-2.6/include/linux/fs.h 2006-07-15 21:01:13.000000000 +0800 +@@ -1090,13 +1090,20 @@ struct inode_operations { + int (*create) (struct inode *,struct dentry *,int, struct nameidata *); + struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); + int (*link) (struct dentry *,struct inode *,struct dentry *); ++ int (*link_raw) (struct nameidata *,struct nameidata *); + int (*unlink) (struct inode *,struct dentry *); ++ int (*unlink_raw) (struct nameidata *); + int (*symlink) (struct inode *,struct dentry *,const char *); ++ int (*symlink_raw) (struct nameidata *,const char *); + int (*mkdir) (struct inode *,struct dentry *,int); ++ int (*mkdir_raw) (struct nameidata *,int); + int (*rmdir) (struct inode *,struct dentry *); ++ int (*rmdir_raw) (struct nameidata *); + int (*mknod) (struct inode *,struct dentry *,int,dev_t); ++ int (*mknod_raw) (struct nameidata *,int,dev_t); + int (*rename) (struct inode *, struct dentry *, + struct inode *, struct dentry *); ++ int (*rename_raw) (struct nameidata *, struct nameidata *); + int (*readlink) (struct dentry *, char __user *,int); + void * (*follow_link) (struct dentry *, struct nameidata *); + void (*put_link) (struct dentry *, struct nameidata *, void *); +@@ -1426,7 +1433,7 @@ static inline int break_lease(struct ino + /* fs/open.c */ + + extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, +- struct file *filp); ++ struct file *filp, int called_from_open); + extern long do_sys_open(int fdf, const char __user *filename, int flags, + int mode); + extern struct file *filp_open(const char *, int, int); diff --git a/lustre/kernel_patches/patches/vfs_races-2.6.18-vanilla.patch 
b/lustre/kernel_patches/patches/vfs_races-2.6.18-vanilla.patch
new file mode 100644
index 0000000..36b2e65
--- /dev/null
+++ b/lustre/kernel_patches/patches/vfs_races-2.6.18-vanilla.patch
@@ -0,0 +1,61 @@
+Index: linux-2.6/fs/dcache.c
+===================================================================
+--- linux-2.6.orig/fs/dcache.c	2006-07-15 16:08:36.000000000 +0800
++++ linux-2.6/fs/dcache.c	2006-07-15 16:10:41.000000000 +0800
+@@ -226,6 +226,13 @@ int d_invalidate(struct dentry * dentry)
+ 		spin_unlock(&dcache_lock);
+ 		return 0;
+ 	}
++
++	/* network invalidation by Lustre */
++	if (dentry->d_flags & DCACHE_LUSTRE_INVALID) {
++		spin_unlock(&dcache_lock);
++		return 0;
++	}
++
+ 	/*
+ 	 * Check whether to do a partial shrink_dcache
+ 	 * to get rid of unused child entries.
+@@ -1242,17 +1249,26 @@ static void __d_rehash(struct dentry * e
+  * Adds a dentry to the hash according to its name.
+  */
+ 
+-void d_rehash(struct dentry * entry)
++void d_rehash_cond(struct dentry * entry, int lock)
+ {
+ 	struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash);
+ 
+-	spin_lock(&dcache_lock);
++	if (lock)
++		spin_lock(&dcache_lock);
+ 	spin_lock(&entry->d_lock);
+ 	__d_rehash(entry, list);
+ 	spin_unlock(&entry->d_lock);
+-	spin_unlock(&dcache_lock);
++	if (lock)
++		spin_unlock(&dcache_lock);
+ }
+ 
++EXPORT_SYMBOL(d_rehash_cond);
++
++void d_rehash(struct dentry * entry)
++{
++	d_rehash_cond(entry, 1);
++}
++
+ #define do_switch(x,y) do { \
+ 	__typeof__ (x) __tmp = x; \
+ 	x = y; y = __tmp; } while (0)
+Index: linux-2.6/include/linux/dcache.h
+===================================================================
+--- linux-2.6.orig/include/linux/dcache.h	2006-07-15 16:10:33.000000000 +0800
++++ linux-2.6/include/linux/dcache.h	2006-07-15 16:10:41.000000000 +0800
+@@ -176,6 +176,8 @@ d_iput:	no		no		no       yes
+ 
+ #define DCACHE_REFERENCED	0x0008  /* Recently used, don't discard. */
+ #define DCACHE_UNHASHED		0x0010
++#define DCACHE_LUSTRE_INVALID	0x0040	/* Lustre invalidated; must not clash with DCACHE_INOTIFY_PARENT_WATCHED */
++
+ 
+ #define DCACHE_INOTIFY_PARENT_WATCHED	0x0020 /* Parent inode is watched */
+ 
diff --git a/lustre/kernel_patches/series/2.6.18-vanilla.series b/lustre/kernel_patches/series/2.6.18-vanilla.series
new file mode 100644
index 0000000..0ac21df
--- /dev/null
+++ b/lustre/kernel_patches/series/2.6.18-vanilla.series
@@ -0,0 +1,20 @@
+lustre_version.patch
+vfs_intent-2.6.18-vanilla.patch
+vfs_nointent-2.6.18-vanilla.patch
+vfs_races-2.6.18-vanilla.patch
+ext3-wantedi-misc-2.6.18-vanilla.patch
+jbd-jcberr-2.6.18-vanilla.patch
+nfs-cifs-intent-2.6.18-vanilla.patch
+iopen-misc-2.6.18-vanilla.patch
+export-truncate-2.6.18-vanilla.patch
+export_symbols-2.6.18-vanilla.patch
+dev_read_only-2.6.18-vanilla.patch
+export-2.6.18-vanilla.patch
+lookup_bdev_init_intent.patch
+8kstack-2.6.12.patch
+remove-suid-2.6-suse.patch
+export-show_task-2.6.18-vanilla.patch
+sd_iostats-2.6-rhel4.patch
+export_symbol_numa-2.6-fc5.patch
+tcp-zero-copy-2.6.18-vanilla.patch
+vfs_intent-2.6-fc5-fix.patch
diff --git a/lustre/kernel_patches/series/ldiskfs-2.6.18-vanilla.series b/lustre/kernel_patches/series/ldiskfs-2.6.18-vanilla.series
new file mode 100644
index 0000000..f379cec
--- /dev/null
+++ b/lustre/kernel_patches/series/ldiskfs-2.6.18-vanilla.series
@@ -0,0 +1,13 @@
+ext3-wantedi-2.6-rhel4.patch
+ext3-san-jdike-2.6-suse.patch
+iopen-2.6-fc5.patch
+ext3-map_inode_page-2.6-suse.patch
+export-ext3-2.6-rhel4.patch
+ext3-include-fixes-2.6-rhel4.patch
+ext3-extents-2.6.18-vanilla.patch
+ext3-mballoc2-2.6.18-vanilla.patch
+ext3-nlinks-2.6.9.patch
+ext3-ialloc-2.6.patch
+ext3-remove-cond_resched-calls-2.6.12.patch
+ext3-filterdata-2.6.15.patch
+ext3-multi-mount-protection-2.6.18-vanilla.patch
-- 
1.8.3.1