--- /dev/null
+%patch
+Index: linux-2.6.7/fs/ext3/extents.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/extents.c 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.6.7/fs/ext3/extents.c 2004-08-19 08:53:49.000000000 +0400
+@@ -0,0 +1,2306 @@
++/*
++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ */
++
++/*
++ * Extents support for EXT3
++ *
++ * TODO:
++ * - ext3_ext_walk_space() should not use ext3_ext_find_extent()
++ * - ext3_ext_calc_credits() could take 'mergable' into account
++ * - ext3*_error() should be used in some situations
++ * - find_goal() [to be tested and improved]
++ * - smart tree reduction
++ * - arch-independence
++ * common on-disk format for big/little-endian arch
++ */
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/time.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/smp_lock.h>
++#include <linux/highuid.h>
++#include <linux/pagemap.h>
++#include <linux/quotaops.h>
++#include <linux/string.h>
++#include <linux/slab.h>
++#include <linux/ext3_extents.h>
++#include <asm/uaccess.h>
++
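++/*
++ * ensure the running handle has at least 'needed' credits:
++ * first try to extend the current transaction, and fall back
++ * to restarting it if the extension fails
++ */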
++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
++{
++ int err;
++
++ if (handle->h_buffer_credits > needed)
++ return handle;
++ if (!ext3_journal_extend(handle, needed))
++ return handle;
++ err = ext3_journal_restart(handle, needed);
++ if (err)
++ return ERR_PTR(err);
++
++ return handle;
++}
++
++static inline int
++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree)
++{
++ if (tree->ops->get_write_access)
++ return tree->ops->get_write_access(h, tree->buffer);
++ else
++ return 0;
++}
++
++static inline int
++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree)
++{
++ if (tree->ops->mark_buffer_dirty)
++ return tree->ops->mark_buffer_dirty(h, tree->buffer);
++ else
++ return 0;
++}
++
++/*
++ * could return:
++ * - EROFS
++ * - ENOMEM
++ */
++static int ext3_ext_get_access(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int err;
++
++ if (path->p_bh) {
++ /* path points to block */
++ err = ext3_journal_get_write_access(handle, path->p_bh);
++ } else {
++ /* path points to leaf/index in inode body */
++ err = ext3_ext_get_access_for_root(handle, tree);
++ }
++ return err;
++}
++
++/*
++ * could return:
++ * - EROFS
++ * - ENOMEM
++ * - EIO
++ */
++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int err;
++ if (path->p_bh) {
++ /* path points to block */
++ err = ext3_journal_dirty_metadata(handle, path->p_bh);
++ } else {
++ /* path points to leaf/index in inode body */
++ err = ext3_ext_mark_root_dirty(handle, tree);
++ }
++ return err;
++}
++
++static inline int
++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, struct ext3_extent *ex,
++ int *err)
++{
++ int goal, depth, newblock;
++ struct inode *inode;
++
++ EXT_ASSERT(tree);
++ if (tree->ops->new_block)
++ return tree->ops->new_block(handle, tree, path, ex, err);
++
++ inode = tree->inode;
++ depth = EXT_DEPTH(tree);
++ if (path && depth > 0) {
++ goal = path[depth-1].p_block;
++ } else {
++ struct ext3_inode_info *ei = EXT3_I(inode);
++ unsigned long bg_start;
++ unsigned long colour;
++
++ bg_start = (ei->i_block_group *
++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++ colour = (current->pid % 16) *
++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++ goal = bg_start + colour;
++ }
++
++ newblock = ext3_new_block(handle, inode, goal, 0, 0, err);
++ return newblock;
++}
++
++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
++{
++ struct ext3_extent_header *neh;
++ neh = EXT_ROOT_HDR(tree);
++ neh->eh_generation++;
++}
++
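++/*
++ * the helpers below return how many entries (extents or indexes)
++ * fit into a full block or into the root stored in the inode body;
++ * under AGRESSIVE_TEST the capacities are shrunk to force early
++ * splits and tree growth on small files
++ */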
++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->inode->i_sb->s_blocksize -
++ sizeof(struct ext3_extent_header))
++ / sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++ size = 6;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->inode->i_sb->s_blocksize -
++ sizeof(struct ext3_extent_header))
++ / sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++ size = 5;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->buffer_len - sizeof(struct ext3_extent_header))
++ / sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++ size = 3;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->buffer_len -
++ sizeof(struct ext3_extent_header))
++ / sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++ size = 4;
++#endif
++ return size;
++}
++
++static void ext3_ext_show_path(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++ int k, l = path->p_depth;
++
++ ext_debug(tree, "path:");
++ for (k = 0; k <= l; k++, path++) {
++ if (path->p_idx) {
++ ext_debug(tree, " %d->%d", path->p_idx->ei_block,
++ path->p_idx->ei_leaf);
++ } else if (path->p_ext) {
++ ext_debug(tree, " %d:%d:%d",
++ path->p_ext->ee_block,
++ path->p_ext->ee_len,
++ path->p_ext->ee_start);
++ } else
++ ext_debug(tree, " []");
++ }
++ ext_debug(tree, "\n");
++#endif
++}
++
++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent_header *eh;
++ struct ext3_extent *ex;
++ int i;
++
++ if (!path)
++ return;
++
++ eh = path[depth].p_hdr;
++ ex = EXT_FIRST_EXTENT(eh);
++
++ for (i = 0; i < eh->eh_entries; i++, ex++) {
++ ext_debug(tree, "%d:%d:%d ",
++ ex->ee_block, ex->ee_len, ex->ee_start);
++ }
++ ext_debug(tree, "\n");
++#endif
++}
++
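++/*
++ * release all buffer references held by the path
++ */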
++static void ext3_ext_drop_refs(struct ext3_ext_path *path)
++{
++ int depth = path->p_depth;
++ int i;
++
++ for (i = 0; i <= depth; i++, path++)
++ if (path->p_bh) {
++ brelse(path->p_bh);
++ path->p_bh = NULL;
++ }
++}
++
++/*
++ * binary search for closest index by given block
++ */
++static inline void
++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, int block)
++{
++ struct ext3_extent_header *eh = path->p_hdr;
++ struct ext3_extent_idx *ix;
++ int l = 0, k, r;
++
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++ EXT_ASSERT(eh->eh_entries > 0);
++
++ ext_debug(tree, "binsearch for %d(idx): ", block);
++
++ path->p_idx = ix = EXT_FIRST_INDEX(eh);
++
++ r = k = eh->eh_entries;
++ while (k > 1) {
++ k = (r - l) / 2;
++ if (block < ix[l + k].ei_block)
++ r -= k;
++ else
++ l += k;
++ ext_debug(tree, "%d:%d:%d ", k, l, r);
++ }
++
++ ix += l;
++ path->p_idx = ix;
++ ext_debug(tree, " -> %d->%d ", path->p_idx->ei_block, path->p_idx->ei_leaf);
++
++ while (l++ < r) {
++ if (block < ix->ei_block)
++ break;
++ path->p_idx = ix++;
++ }
++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block,
++ path->p_idx->ei_leaf);
++
++#ifdef CHECK_BINSEARCH
++ {
++ struct ext3_extent_idx *chix;
++
++ chix = ix = EXT_FIRST_INDEX(eh);
++ for (k = 0; k < eh->eh_entries; k++, ix++) {
++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) {
++ printk("k=%d, ix=0x%p, first=0x%p\n", k,
++ ix, EXT_FIRST_INDEX(eh));
++ printk("%u <= %u\n",
++ ix->ei_block,ix[-1].ei_block);
++ }
++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block);
++ if (block < ix->ei_block)
++ break;
++ chix = ix;
++ }
++ EXT_ASSERT(chix == path->p_idx);
++ }
++#endif
++
++}
++
++/*
++ * binary search for closest extent by given block
++ */
++static inline void
++ext3_ext_binsearch(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, int block)
++{
++ struct ext3_extent_header *eh = path->p_hdr;
++ struct ext3_extent *ex;
++ int l = 0, k, r;
++
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++
++ if (eh->eh_entries == 0) {
++ /*
++ * this leaf is empty yet:
++ * we get such a leaf in split/add case
++ */
++ return;
++ }
++
++ ext_debug(tree, "binsearch for %d: ", block);
++
++ path->p_ext = ex = EXT_FIRST_EXTENT(eh);
++
++ r = k = eh->eh_entries;
++ while (k > 1) {
++ k = (r - l) / 2;
++ if (block < ex[l + k].ee_block)
++ r -= k;
++ else
++ l += k;
++ ext_debug(tree, "%d:%d:%d ", k, l, r);
++ }
++
++ ex += l;
++ path->p_ext = ex;
++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block,
++ path->p_ext->ee_start, path->p_ext->ee_len);
++
++ while (l++ < r) {
++ if (block < ex->ee_block)
++ break;
++ path->p_ext = ex++;
++ }
++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block,
++ path->p_ext->ee_start, path->p_ext->ee_len);
++
++#ifdef CHECK_BINSEARCH
++ {
++ struct ext3_extent *chex;
++
++ chex = ex = EXT_FIRST_EXTENT(eh);
++ for (k = 0; k < eh->eh_entries; k++, ex++) {
++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block);
++ if (block < ex->ee_block)
++ break;
++ chex = ex;
++ }
++ EXT_ASSERT(chex == path->p_ext);
++ }
++#endif
++
++}
++
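++/*
++ * set up an empty extent tree in the root: header only,
++ * zero entries, depth 0
++ */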
++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree)
++{
++ struct ext3_extent_header *eh;
++
++ BUG_ON(tree->buffer_len == 0);
++ ext3_ext_get_access_for_root(handle, tree);
++ eh = EXT_ROOT_HDR(tree);
++ eh->eh_depth = 0;
++ eh->eh_entries = 0;
++ eh->eh_magic = EXT3_EXT_MAGIC;
++ eh->eh_max = ext3_ext_space_root(tree);
++ ext3_ext_mark_root_dirty(handle, tree);
++ ext3_ext_invalidate_cache(tree);
++ return 0;
++}
++
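++/*
++ * descend from the root to the leaf covering 'block', recording
++ * the visited index/leaf headers in the path array; the array is
++ * allocated here unless the caller supplies one for reuse
++ */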
++struct ext3_ext_path *
++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block,
++ struct ext3_ext_path *path)
++{
++ struct ext3_extent_header *eh;
++ struct buffer_head *bh;
++ int depth, i, ppos = 0;
++
++ EXT_ASSERT(tree);
++ EXT_ASSERT(tree->inode);
++ EXT_ASSERT(tree->root);
++
++ eh = EXT_ROOT_HDR(tree);
++ EXT_ASSERT(eh);
++ i = depth = EXT_DEPTH(tree);
++ EXT_ASSERT(eh->eh_max);
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++ EXT_ASSERT(i == 0 || eh->eh_entries > 0);
++
++ /* account possible depth increase */
++ if (!path) {
++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2),
++ GFP_NOFS);
++ if (!path)
++ return ERR_PTR(-ENOMEM);
++ }
++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++ path[0].p_hdr = eh;
++
++ /* walk through the tree */
++ while (i) {
++ ext_debug(tree, "depth %d: num %d, max %d\n",
++ ppos, eh->eh_entries, eh->eh_max);
++ ext3_ext_binsearch_idx(tree, path + ppos, block);
++ path[ppos].p_block = path[ppos].p_idx->ei_leaf;
++ path[ppos].p_depth = i;
++ path[ppos].p_ext = NULL;
++
++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block);
++ if (!bh) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ return ERR_PTR(-EIO);
++ }
++ eh = EXT_BLOCK_HDR(bh);
++ ppos++;
++ EXT_ASSERT(ppos <= depth);
++ path[ppos].p_bh = bh;
++ path[ppos].p_hdr = eh;
++ i--;
++ }
++
++ path[ppos].p_depth = i;
++ path[ppos].p_hdr = eh;
++ path[ppos].p_ext = NULL;
++
++ /* find extent */
++ ext3_ext_binsearch(tree, path + ppos, block);
++
++ ext3_ext_show_path(tree, path);
++
++ return path;
++}
++
++/*
++ * insert new index [logical;ptr] into the block at curp;
++ * it checks where to insert: before curp or after curp
++ */
++static int ext3_ext_insert_index(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *curp,
++ int logical, int ptr)
++{
++ struct ext3_extent_idx *ix;
++ int len, err;
++
++ if ((err = ext3_ext_get_access(handle, tree, curp)))
++ return err;
++
++ EXT_ASSERT(logical != curp->p_idx->ei_block);
++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
++ if (logical > curp->p_idx->ei_block) {
++ /* insert after */
++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
++ len = (len - 1) * sizeof(struct ext3_extent_idx);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert new index %d after: %d. "
++ "move %d from 0x%p to 0x%p\n",
++ logical, ptr, len,
++ (curp->p_idx + 1), (curp->p_idx + 2));
++ memmove(curp->p_idx + 2, curp->p_idx + 1, len);
++ }
++ ix = curp->p_idx + 1;
++ } else {
++ /* insert before */
++ len = len * sizeof(struct ext3_extent_idx);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert new index %d before: %d. "
++ "move %d from 0x%p to 0x%p\n",
++ logical, ptr, len,
++ curp->p_idx, (curp->p_idx + 1));
++ memmove(curp->p_idx + 1, curp->p_idx, len);
++ ix = curp->p_idx;
++ }
++
++ ix->ei_block = logical;
++ ix->ei_leaf = ptr;
++ curp->p_hdr->eh_entries++;
++
++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max);
++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr));
++
++ err = ext3_ext_dirty(handle, tree, curp);
++ ext3_std_error(tree->inode->i_sb, err);
++
++ return err;
++}
++
++/*
++ * routine inserts new subtree into the path, using free index entry
++ * at depth 'at':
++ * - allocates all needed blocks (new leaf and all intermediate index blocks)
++ * - makes decision where to split
++ * - moves remaining extents and index entries (right to the split point)
++ * into the newly allocated blocks
++ * - initializes the subtree
++ */
++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext, int at)
++{
++ struct buffer_head *bh = NULL;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent_header *neh;
++ struct ext3_extent_idx *fidx;
++ struct ext3_extent *ex;
++ int i = at, k, m, a;
++ unsigned long newblock, oldblock, border;
++ int *ablocks = NULL; /* array of allocated blocks */
++ int err = 0;
++
++ /* make decision: where to split? */
++ /* FIXME: now decision is simplest: at current extent */
++
++ /* if current leaf will be split, then we should use
++ * border from split point */
++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr));
++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
++ border = path[depth].p_ext[1].ee_block;
++ ext_debug(tree, "leaf will be splitted."
++ " next leaf starts at %d\n",
++ (int)border);
++ } else {
++ border = newext->ee_block;
++ ext_debug(tree, "leaf will be added."
++ " next leaf starts at %d\n",
++ (int)border);
++ }
++
++ /*
++ * if error occurs, then we break processing
++ * and turn filesystem read-only. so, index won't
++ * be inserted and tree will be in consistent
++ * state. next mount will repair buffers too
++ */
++
++ /*
++ * get array to track all allocated blocks
++ * we need this to handle errors and free blocks
++ * upon them
++ */
++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS);
++ if (!ablocks)
++ return -ENOMEM;
++ memset(ablocks, 0, sizeof(unsigned long) * depth);
++
++ /* allocate all needed blocks */
++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at);
++ for (a = 0; a < depth - at; a++) {
++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++ if (newblock == 0)
++ goto cleanup;
++ ablocks[a] = newblock;
++ }
++
++ /* initialize new leaf */
++ newblock = ablocks[--a];
++ EXT_ASSERT(newblock);
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ goto cleanup;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh)))
++ goto cleanup;
++
++ neh = EXT_BLOCK_HDR(bh);
++ neh->eh_entries = 0;
++ neh->eh_max = ext3_ext_space_block(tree);
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ neh->eh_depth = 0;
++ ex = EXT_FIRST_EXTENT(neh);
++
++ /* move remain of path[depth] to the new leaf */
++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max);
++ /* start copy from next extent */
++ /* TODO: we could do it by single memmove */
++ m = 0;
++ path[depth].p_ext++;
++ while (path[depth].p_ext <=
++ EXT_MAX_EXTENT(path[depth].p_hdr)) {
++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n",
++ path[depth].p_ext->ee_block,
++ path[depth].p_ext->ee_start,
++ path[depth].p_ext->ee_len,
++ newblock);
++ memmove(ex++, path[depth].p_ext++,
++ sizeof(struct ext3_extent));
++ neh->eh_entries++;
++ m++;
++ }
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto cleanup;
++ brelse(bh);
++ bh = NULL;
++
++ /* correct old leaf */
++ if (m) {
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ goto cleanup;
++ path[depth].p_hdr->eh_entries -= m;
++ if ((err = ext3_ext_dirty(handle, tree, path + depth)))
++ goto cleanup;
++
++ }
++
++ /* create intermediate indexes */
++ k = depth - at - 1;
++ EXT_ASSERT(k >= 0);
++ if (k)
++ ext_debug(tree, "create %d intermediate indices\n", k);
++ /* insert new index into current index block */
++ /* current depth stored in i var */
++ i = depth - 1;
++ while (k--) {
++ oldblock = newblock;
++ newblock = ablocks[--a];
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ goto cleanup;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh)))
++ goto cleanup;
++
++ neh = EXT_BLOCK_HDR(bh);
++ neh->eh_entries = 1;
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ neh->eh_max = ext3_ext_space_block_idx(tree);
++ neh->eh_depth = depth - i;
++ fidx = EXT_FIRST_INDEX(neh);
++ fidx->ei_block = border;
++ fidx->ei_leaf = oldblock;
++
++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n",
++ i, newblock, border, oldblock);
++ /* copy indexes */
++ m = 0;
++ path[i].p_idx++;
++
++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx,
++ EXT_MAX_INDEX(path[i].p_hdr));
++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) ==
++ EXT_LAST_INDEX(path[i].p_hdr));
++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
++ ext_debug(tree, "%d: move %d:%d in new index %lu\n",
++ i, path[i].p_idx->ei_block,
++ path[i].p_idx->ei_leaf, newblock);
++ memmove(++fidx, path[i].p_idx++,
++ sizeof(struct ext3_extent_idx));
++ neh->eh_entries++;
++ EXT_ASSERT(neh->eh_entries <= neh->eh_max);
++ m++;
++ }
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto cleanup;
++ brelse(bh);
++ bh = NULL;
++
++ /* correct old index */
++ if (m) {
++ err = ext3_ext_get_access(handle, tree, path + i);
++ if (err)
++ goto cleanup;
++ path[i].p_hdr->eh_entries -= m;
++ err = ext3_ext_dirty(handle, tree, path + i);
++ if (err)
++ goto cleanup;
++ }
++
++ i--;
++ }
++
++ /* insert new index */
++ if (!err)
++ err = ext3_ext_insert_index(handle, tree, path + at,
++ border, newblock);
++
++cleanup:
++ if (bh) {
++ if (buffer_locked(bh))
++ unlock_buffer(bh);
++ brelse(bh);
++ }
++
++ if (err) {
++ /* free all allocated blocks in error case */
++ for (i = 0; i < depth; i++) {
++ if (!ablocks[i])
++ continue;
++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
++ }
++ }
++ kfree(ablocks);
++
++ return err;
++}
++
++/*
++ * routine implements tree growing procedure:
++ * - allocates new block
++ * - moves top-level data (index block or leaf) into the new block
++ * - initialize new top-level, creating index that points to the
++ * just created block
++ */
++static int ext3_ext_grow_indepth(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_ext_path *curp = path;
++ struct ext3_extent_header *neh;
++ struct ext3_extent_idx *fidx;
++ struct buffer_head *bh;
++ unsigned long newblock;
++ int err = 0;
++
++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++ if (newblock == 0)
++ return err;
++
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ ext3_std_error(tree->inode->i_sb, err);
++ return err;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh))) {
++ unlock_buffer(bh);
++ goto out;
++ }
++
++ /* move top-level index/leaf into new block */
++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len);
++
++ /* set size of new block */
++ neh = EXT_BLOCK_HDR(bh);
++ /* old root could have indexes or leaves
++ * so calculate eh_max the right way */
++ if (EXT_DEPTH(tree))
++ neh->eh_max = ext3_ext_space_block_idx(tree);
++ else
++ neh->eh_max = ext3_ext_space_block(tree);
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto out;
++
++ /* create index in new top-level index: num,max,pointer */
++ if ((err = ext3_ext_get_access(handle, tree, curp)))
++ goto out;
++
++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC;
++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree);
++ curp->p_hdr->eh_entries = 1;
++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
++ /* FIXME: it works, but actually path[0] can be index */
++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
++ curp->p_idx->ei_leaf = newblock;
++
++ neh = EXT_ROOT_HDR(tree);
++ fidx = EXT_FIRST_INDEX(neh);
++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n",
++ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf);
++
++ neh->eh_depth = path->p_depth + 1;
++ err = ext3_ext_dirty(handle, tree, curp);
++out:
++ brelse(bh);
++
++ return err;
++}
++
++/*
++ * routine finds empty index and adds new leaf. if no free index found
++ * then it requests in-depth growing
++ */
++static int ext3_ext_create_new_leaf(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_ext_path *curp;
++ int depth, i, err = 0;
++
++repeat:
++ i = depth = EXT_DEPTH(tree);
++
++ /* walk up the tree and look for free index entry */
++ curp = path + depth;
++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
++ i--;
++ curp--;
++ }
++
++ /* we use already allocated block for index block
++ * so, subsequent data blocks should be contiguous */
++ if (EXT_HAS_FREE_INDEX(curp)) {
++ /* if we found index with free entry, then use that
++ * entry: create all needed subtree and add new leaf */
++ err = ext3_ext_split(handle, tree, path, newext, i);
++
++ /* refill path */
++ ext3_ext_drop_refs(path);
++ path = ext3_ext_find_extent(tree, newext->ee_block, path);
++ if (IS_ERR(path))
++ err = PTR_ERR(path);
++ } else {
++ /* tree is full, time to grow in depth */
++ err = ext3_ext_grow_indepth(handle, tree, path, newext);
++
++ /* refill path */
++ ext3_ext_drop_refs(path);
++ path = ext3_ext_find_extent(tree, newext->ee_block, path);
++ if (IS_ERR(path))
++ err = PTR_ERR(path);
++
++ /*
++ * only first (depth 0 -> 1) produces free space
++ * in all other cases we have to split the grown tree
++ */
++ depth = EXT_DEPTH(tree);
++ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
++ /* now we need split */
++ goto repeat;
++ }
++ }
++
++ if (err)
++ return err;
++
++ return 0;
++}
++
++/*
++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK
++ * NOTE: it considers the block number from the index entry as an
++ * allocated block. thus, index entries have to be consistent
++ * with leaves
++ */
++static unsigned long
++ext3_ext_next_allocated_block(struct ext3_ext_path *path)
++{
++ int depth;
++
++ EXT_ASSERT(path != NULL);
++ depth = path->p_depth;
++
++ if (depth == 0 && path->p_ext == NULL)
++ return EXT_MAX_BLOCK;
++
++ /* FIXME: what if index isn't full ?! */
++ while (depth >= 0) {
++ if (depth == path->p_depth) {
++ /* leaf */
++ if (path[depth].p_ext !=
++ EXT_LAST_EXTENT(path[depth].p_hdr))
++ return path[depth].p_ext[1].ee_block;
++ } else {
++ /* index */
++ if (path[depth].p_idx !=
++ EXT_LAST_INDEX(path[depth].p_hdr))
++ return path[depth].p_idx[1].ei_block;
++ }
++ depth--;
++ }
++
++ return EXT_MAX_BLOCK;
++}
++
++/*
++ * returns first allocated block from next leaf or EXT_MAX_BLOCK
++ */
++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int depth;
++
++ EXT_ASSERT(path != NULL);
++ depth = path->p_depth;
++
++ /* zero-tree has no leaf blocks at all */
++ if (depth == 0)
++ return EXT_MAX_BLOCK;
++
++ /* go to index block */
++ depth--;
++
++ while (depth >= 0) {
++ if (path[depth].p_idx !=
++ EXT_LAST_INDEX(path[depth].p_hdr))
++ return path[depth].p_idx[1].ei_block;
++ depth--;
++ }
++
++ return EXT_MAX_BLOCK;
++}
++
++/*
++ * if leaf gets modified and modified extent is first in the leaf
++ * then we have to correct all indexes above
++ * TODO: do we need to correct tree in all cases?
++ */
++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ struct ext3_extent_header *eh;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent *ex;
++ unsigned long border;
++ int k, err = 0;
++
++ eh = path[depth].p_hdr;
++ ex = path[depth].p_ext;
++ EXT_ASSERT(ex);
++ EXT_ASSERT(eh);
++
++ if (depth == 0) {
++ /* there is no tree at all */
++ return 0;
++ }
++
++ if (ex != EXT_FIRST_EXTENT(eh)) {
++ /* we correct tree only if first extent in the leaf got modified */
++ return 0;
++ }
++
++ /*
++ * TODO: we need correction if border is smaller than the current one
++ */
++ k = depth - 1;
++ border = path[depth].p_ext->ee_block;
++ if ((err = ext3_ext_get_access(handle, tree, path + k)))
++ return err;
++ path[k].p_idx->ei_block = border;
++ if ((err = ext3_ext_dirty(handle, tree, path + k)))
++ return err;
++
++ while (k--) {
++ /* change all left-side indexes */
++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
++ break;
++ if ((err = ext3_ext_get_access(handle, tree, path + k)))
++ break;
++ path[k].p_idx->ei_block = border;
++ if ((err = ext3_ext_dirty(handle, tree, path + k)))
++ break;
++ }
++
++ return err;
++}
++
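++/*
++ * two extents may be merged if they are logically adjacent;
++ * the tree owner can veto the merge via the ->mergable() callback
++ */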
++static inline int
++ext3_can_extents_be_merged(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex1,
++ struct ext3_extent *ex2)
++{
++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block)
++ return 0;
++
++#ifdef AGRESSIVE_TEST
++ if (ex1->ee_len >= 4)
++ return 0;
++#endif
++
++ if (!tree->ops->mergable)
++ return 1;
++
++ return tree->ops->mergable(ex1, ex2);
++}
++
++/*
++ * this routine tries to merge requested extent into the existing
++ * extent or inserts requested extent as new one into the tree,
++ * creating new leaf in no-space case
++ */
++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_extent_header * eh;
++ struct ext3_extent *ex, *fex;
++ struct ext3_extent *nearex; /* nearest extent */
++ struct ext3_ext_path *npath = NULL;
++ int depth, len, err, next;
++
++ EXT_ASSERT(newext->ee_len > 0);
++ EXT_ASSERT(newext->ee_len < EXT_CACHE_MARK);
++ depth = EXT_DEPTH(tree);
++ ex = path[depth].p_ext;
++ EXT_ASSERT(path[depth].p_hdr);
++
++ /* try to insert block into found extent and return */
++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) {
++ ext_debug(tree, "append %d block to %d:%d (from %d)\n",
++ newext->ee_len, ex->ee_block, ex->ee_len,
++ ex->ee_start);
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ return err;
++ ex->ee_len += newext->ee_len;
++ eh = path[depth].p_hdr;
++ nearex = ex;
++ goto merge;
++ }
++
++repeat:
++ depth = EXT_DEPTH(tree);
++ eh = path[depth].p_hdr;
++ if (eh->eh_entries < eh->eh_max)
++ goto has_space;
++
++ /* probably next leaf has space for us? */
++ fex = EXT_LAST_EXTENT(eh);
++ next = ext3_ext_next_leaf_block(tree, path);
++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) {
++ ext_debug(tree, "next leaf block - %d\n", next);
++ EXT_ASSERT(!npath);
++ npath = ext3_ext_find_extent(tree, next, NULL);
++ if (IS_ERR(npath))
++ return PTR_ERR(npath);
++ EXT_ASSERT(npath->p_depth == path->p_depth);
++ eh = npath[depth].p_hdr;
++ if (eh->eh_entries < eh->eh_max) {
++ ext_debug(tree, "next leaf isnt full(%d)\n",
++ eh->eh_entries);
++ path = npath;
++ goto repeat;
++ }
++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n",
++ eh->eh_entries, eh->eh_max);
++ }
++
++ /*
++ * there is no free space in the found leaf,
++ * so we have to add a new leaf to the tree
++ */
++ err = ext3_ext_create_new_leaf(handle, tree, path, newext);
++ if (err)
++ goto cleanup;
++ depth = EXT_DEPTH(tree);
++ eh = path[depth].p_hdr;
++
++has_space:
++ nearex = path[depth].p_ext;
++
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ goto cleanup;
++
++ if (!nearex) {
++ /* there is no extent in this leaf, create first one */
++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n",
++ newext->ee_block, newext->ee_start,
++ newext->ee_len);
++ path[depth].p_ext = EXT_FIRST_EXTENT(eh);
++ } else if (newext->ee_block > nearex->ee_block) {
++ EXT_ASSERT(newext->ee_block != nearex->ee_block);
++ if (nearex != EXT_LAST_EXTENT(eh)) {
++ len = EXT_MAX_EXTENT(eh) - nearex;
++ len = (len - 1) * sizeof(struct ext3_extent);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, "
++ "move %d from 0x%p to 0x%p\n",
++ newext->ee_block, newext->ee_start,
++ newext->ee_len,
++ nearex, len, nearex + 1, nearex + 2);
++ memmove(nearex + 2, nearex + 1, len);
++ }
++ path[depth].p_ext = nearex + 1;
++ } else {
++ EXT_ASSERT(newext->ee_block != nearex->ee_block);
++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, "
++ "move %d from 0x%p to 0x%p\n",
++ newext->ee_block, newext->ee_start, newext->ee_len,
++ nearex, len, nearex + 1, nearex + 2);
++ memmove(nearex + 1, nearex, len);
++ path[depth].p_ext = nearex;
++ }
++
++ eh->eh_entries++;
++ nearex = path[depth].p_ext;
++ nearex->ee_block = newext->ee_block;
++ nearex->ee_start = newext->ee_start;
++ nearex->ee_len = newext->ee_len;
++ /* FIXME: support for large fs */
++ nearex->ee_start_hi = 0;
++
++merge:
++ /* try to merge extents to the right */
++ while (nearex < EXT_LAST_EXTENT(eh)) {
++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1))
++ break;
++ /* merge with next extent! */
++ nearex->ee_len += nearex[1].ee_len;
++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
++ len = (EXT_LAST_EXTENT(eh) - nearex - 1)
++ * sizeof(struct ext3_extent);
++ memmove(nearex + 1, nearex + 2, len);
++ }
++ eh->eh_entries--;
++ EXT_ASSERT(eh->eh_entries > 0);
++ }
++
++ /* try to merge extents to the left */
++
++ /* time to correct all indexes above */
++ err = ext3_ext_correct_indexes(handle, tree, path);
++ if (err)
++ goto cleanup;
++
++ err = ext3_ext_dirty(handle, tree, path + depth);
++
++cleanup:
++ if (npath) {
++ ext3_ext_drop_refs(npath);
++ kfree(npath);
++ }
++ ext3_ext_tree_changed(tree);
++ ext3_ext_invalidate_cache(tree);
++ return err;
++}
++
++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block,
++ unsigned long num, ext_prepare_callback func)
++{
++ struct ext3_ext_path *path = NULL;
++ struct ext3_extent *ex, cbex;
++ unsigned long next, start = 0, end = 0;
++ unsigned long last = block + num;
++ int depth, exists, err = 0;
++
++ EXT_ASSERT(tree);
++ EXT_ASSERT(func);
++ EXT_ASSERT(tree->inode);
++ EXT_ASSERT(tree->root);
++
++ while (block < last && block != EXT_MAX_BLOCK) {
++ num = last - block;
++ /* find extent for this block */
++ path = ext3_ext_find_extent(tree, block, path);
++ if (IS_ERR(path)) {
++ err = PTR_ERR(path);
++ path = NULL;
++ break;
++ }
++
++ depth = EXT_DEPTH(tree);
++ EXT_ASSERT(path[depth].p_hdr);
++ ex = path[depth].p_ext;
++ next = ext3_ext_next_allocated_block(path);
++
++ exists = 0;
++ if (!ex) {
++ /* there is no extent yet, so try to allocate
++ * all requested space */
++ start = block;
++ end = block + num;
++ } else if (ex->ee_block > block) {
++ /* need to allocate space before found extent */
++ start = block;
++ end = ex->ee_block;
++ if (block + num < end)
++ end = block + num;
++ } else if (block >= ex->ee_block + ex->ee_len) {
++ /* need to allocate space after found extent */
++ start = block;
++ end = block + num;
++ if (end >= next)
++ end = next;
++ } else if (block >= ex->ee_block) {
++ /*
++ * some part of requested space is covered
++ * by found extent
++ */
++ start = block;
++ end = ex->ee_block + ex->ee_len;
++ if (block + num < end)
++ end = block + num;
++ exists = 1;
++ } else {
++ BUG();
++ }
++ EXT_ASSERT(end > start);
++
++ if (!exists) {
++ cbex.ee_block = start;
++ cbex.ee_len = end - start;
++ cbex.ee_start = 0;
++ } else
++ cbex = *ex;
++
++ EXT_ASSERT(path[depth].p_hdr);
++ err = func(tree, path, &cbex, exists);
++ ext3_ext_drop_refs(path);
++
++ if (err < 0)
++ break;
++ if (err == EXT_REPEAT)
++ continue;
++ else if (err == EXT_BREAK) {
++ err = 0;
++ break;
++ }
++
++ if (EXT_DEPTH(tree) != depth) {
++ /* depth was changed. we have to realloc path */
++ kfree(path);
++ path = NULL;
++ }
++
++ block = cbex.ee_block + cbex.ee_len;
++ }
++
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
++
++ return err;
++}
++
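++/*
++ * store the extent in the per-tree cache to short-cut
++ * repeated lookups in the same region of the file
++ */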
++static inline void
++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, struct ext3_extent *ex)
++{
++ if (tree->cex) {
++ EXT_ASSERT(ex);
++ EXT_ASSERT(ex->ee_len);
++ tree->cex->ee_block = ex->ee_block;
++ tree->cex->ee_start = ex->ee_start;
++ tree->cex->ee_len = ex->ee_len;
++ }
++}
++
++/*
++ * this routine calculates the boundaries of the gap the requested
++ * block fits into and caches this gap
++ */
++static inline void
++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ unsigned long block)
++{
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent *ex, gex;
++
++ if (!tree->cex)
++ return;
++
++ ex = path[depth].p_ext;
++ if (ex == NULL) {
++ /* there is no extent yet, so gap is [0;-] */
++ gex.ee_block = 0;
++ gex.ee_len = EXT_CACHE_MARK;
++ ext_debug(tree, "cache gap(whole file):");
++ } else if (block < ex->ee_block) {
++ gex.ee_block = block;
++ gex.ee_len = ex->ee_block - block;
++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]",
++ (unsigned long) block,
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len);
++ } else if (block >= ex->ee_block + ex->ee_len) {
++ gex.ee_block = ex->ee_block + ex->ee_len;
++ gex.ee_len = ext3_ext_next_allocated_block(path);
++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu",
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len,
++ (unsigned long) block);
++ EXT_ASSERT(gex.ee_len > gex.ee_block);
++ gex.ee_len = gex.ee_len - gex.ee_block;
++ } else {
++ BUG();
++ }
++
++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) gex.ee_block,
++ (unsigned long) gex.ee_len);
++ gex.ee_start = EXT_CACHE_MARK;
++ ext3_ext_put_in_cache(tree, &gex);
++}
++
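++/*
++ * check whether 'block' falls into the cached extent;
++ * returns 1 and fills *ex on a hit, 0 otherwise
++ */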
++static inline int
++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block,
++ struct ext3_extent *ex)
++{
++ struct ext3_extent *cex = tree->cex;
++
++ /* is there cache storage at all? */
++ if (!cex)
++ return 0;
++
++ /* does the cache hold valid data? */
++ if (cex->ee_len == 0)
++ return 0;
++
++ if (block >= cex->ee_block && block < cex->ee_block + cex->ee_len) {
++ ex->ee_block = cex->ee_block;
++ ex->ee_start = cex->ee_start;
++ ex->ee_len = cex->ee_len;
++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n",
++ (unsigned long) block,
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len,
++ (unsigned long) ex->ee_start);
++ return 1;
++ }
++
++ /* not in cache */
++ return 0;
++}
++
++/*
++ * routine removes index from the index block
++ * it's used in truncate case only. thus all requests are for
++ * the last index in the block only
++ */
++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ struct buffer_head *bh;
++ int err;
++
++ /* free index block */
++ path--;
++ EXT_ASSERT(path->p_hdr->eh_entries);
++ if ((err = ext3_ext_get_access(handle, tree, path)))
++ return err;
++ path->p_hdr->eh_entries--;
++ if ((err = ext3_ext_dirty(handle, tree, path)))
++ return err;
++ ext_debug(tree, "index is empty, remove it, free block %d\n",
++ path->p_idx->ei_leaf);
++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
++ return err;
++}
++
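++/*
++ * estimate journal credits for inserting one extent, assuming
++ * the worst case: growing in depth plus a full-depth index split
++ */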
++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int depth = EXT_DEPTH(tree);
++ int needed;
++
++ if (path) {
++ /* probably there is space in leaf? */
++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max)
++ return 1;
++ }
++
++ /*
++ * the worst case we're expecting is creation of the
++ * new root (growing in depth) with index splitting
++ * for splitting we have to consider depth + 1 because
++ * previous growing could increase it
++ */
++ depth = depth + 1;
++
++ /*
++ * growing in depth:
++ * block allocation + new root + old root
++ */
++ needed = EXT3_ALLOC_NEEDED + 2;
++
++ /* index split. we may need:
++ * allocate intermediate indexes and new leaf
++ * change two blocks at each level, but root
++ * modify root block (inode)
++ */
++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1;
++
++ return needed;
++}
++
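++/*
++ * removal of an internal part of an extent: shrink the primary
++ * extent down to the head and insert the remaining tail as a
++ * separate extent
++ */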
++static int
++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, unsigned long start,
++ unsigned long end)
++{
++ struct ext3_extent *ex, tex;
++ struct ext3_ext_path *npath;
++ int depth, creds, err;
++
++ depth = EXT_DEPTH(tree);
++ ex = path[depth].p_ext;
++ EXT_ASSERT(ex);
++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1);
++ EXT_ASSERT(ex->ee_block < start);
++
++ /* calculate tail extent */
++ tex.ee_block = end + 1;
++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len);
++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block;
++
++ creds = ext3_ext_calc_credits_for_insert(tree, path);
++ handle = ext3_ext_journal_restart(handle, creds);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ /* calculate head extent. use primary extent */
++ err = ext3_ext_get_access(handle, tree, path + depth);
++ if (err)
++ return err;
++ ex->ee_len = start - ex->ee_block;
++ err = ext3_ext_dirty(handle, tree, path + depth);
++ if (err)
++ return err;
++
++ /* FIXME: some callback to free underlying resource
++ * and correct ee_start? */
++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n",
++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len);
++
++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL);
++ if (IS_ERR(npath))
++ return PTR_ERR(npath);
++ depth = EXT_DEPTH(tree);
++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block);
++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len);
++
++ err = ext3_ext_insert_extent(handle, tree, npath, &tex);
++ ext3_ext_drop_refs(npath);
++ kfree(npath);
++
++ return err;
++
++}
++
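++/*
++ * remove all extents (whole or in part) covering [start;end]
++ * from the leaf, scanning from the last extent backwards
++ */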
++static int
++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, unsigned long start,
++ unsigned long end)
++{
++ struct ext3_extent *ex, *fu = NULL, *lu, *le;
++ int err = 0, correct_index = 0;
++ int depth = EXT_DEPTH(tree), credits;
++ struct ext3_extent_header *eh;
++ unsigned a, b, block, num;
++
++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end);
++ if (!path[depth].p_hdr)
++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh);
++ eh = path[depth].p_hdr;
++ EXT_ASSERT(eh);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++
++ /* find where to start removing */
++ le = ex = EXT_LAST_EXTENT(eh);
++ while (ex != EXT_FIRST_EXTENT(eh)) {
++ if (ex->ee_block <= end)
++ break;
++ ex--;
++ }
++
++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) {
++ /* removal of internal part of the extent requested
++ * tail and head must be placed in different extents,
++ * so, we have to insert one more extent */
++ path[depth].p_ext = ex;
++ return ext3_ext_split_for_rm(handle, tree, path, start, end);
++ }
++
++ lu = ex;
++ while (ex >= EXT_FIRST_EXTENT(eh) &&
++ ex->ee_block + ex->ee_len > start) {
++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len);
++ path[depth].p_ext = ex;
++
++ a = ex->ee_block > start ? ex->ee_block : start;
++ b = ex->ee_block + ex->ee_len - 1 < end ?
++ ex->ee_block + ex->ee_len - 1 : end;
++
++ ext_debug(tree, " border %u:%u\n", a, b);
++
++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) {
++ block = 0;
++ num = 0;
++ BUG();
++ } else if (a != ex->ee_block) {
++ /* remove tail of the extent */
++ block = ex->ee_block;
++ num = a - block;
++ } else if (b != ex->ee_block + ex->ee_len - 1) {
++ /* remove head of the extent */
++ block = a;
++ num = b - a;
++ } else {
++ /* remove whole extent: excellent! */
++ block = ex->ee_block;
++ num = 0;
++ EXT_ASSERT(a == ex->ee_block &&
++ b == ex->ee_block + ex->ee_len - 1);
++ }
++
++ if (ex == EXT_FIRST_EXTENT(eh))
++ correct_index = 1;
++
++ credits = 1;
++ if (correct_index)
++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1;
++ if (tree->ops->remove_extent_credits)
++ credits+=tree->ops->remove_extent_credits(tree,ex,a,b);
++
++ handle = ext3_ext_journal_restart(handle, credits);
++ if (IS_ERR(handle)) {
++ err = PTR_ERR(handle);
++ goto out;
++ }
++
++ err = ext3_ext_get_access(handle, tree, path + depth);
++ if (err)
++ goto out;
++
++ if (tree->ops->remove_extent)
++ err = tree->ops->remove_extent(tree, ex, a, b);
++ if (err)
++ goto out;
++
++ if (num == 0) {
++ /* this extent is removed entirely, mark the slot unused */
++ ex->ee_start = 0;
++ eh->eh_entries--;
++ fu = ex;
++ }
++
++ ex->ee_block = block;
++ ex->ee_len = num;
++
++ err = ext3_ext_dirty(handle, tree, path + depth);
++ if (err)
++ goto out;
++
++ ext_debug(tree, "new extent: %u:%u:%u\n",
++ ex->ee_block, ex->ee_len, ex->ee_start);
++ ex--;
++ }
++
++ if (fu) {
++ /* reuse unused slots */
++ while (lu < le) {
++ if (lu->ee_start) {
++ *fu = *lu;
++ lu->ee_start = 0;
++ fu++;
++ }
++ lu++;
++ }
++ }
++
++ if (correct_index && eh->eh_entries)
++ err = ext3_ext_correct_indexes(handle, tree, path);
++
++ /* if this leaf is free, then we should
++ * remove it from index block above */
++ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
++ err = ext3_ext_rm_idx(handle, tree, path + depth);
++
++out:
++ return err;
++}
++
++
++static struct ext3_extent_idx *
++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block)
++{
++ struct ext3_extent_idx *ix;
++
++ ix = EXT_LAST_INDEX(hdr);
++ while (ix != EXT_FIRST_INDEX(hdr)) {
++ if (ix->ei_block <= block)
++ break;
++ ix--;
++ }
++ return ix;
++}
++
++/*
++ * returns 1 if current index has to be freed (even partially)
++ */
++static inline int
++ext3_ext_more_to_rm(struct ext3_ext_path *path)
++{
++ EXT_ASSERT(path->p_idx);
++
++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
++ return 0;
++
++ /*
++ * if truncation on a deeper level happened, it wasn't partial,
++ * so we have to consider the current index for truncation
++ */
++ if (path->p_hdr->eh_entries == path->p_block)
++ return 0;
++ return 1;
++}
++
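++/*
++ * entry point for truncate: walks the tree depth-first from the
++ * right side, removing the blocks in [start;end] and freeing
++ * index blocks that become empty
++ */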
++int ext3_ext_remove_space(struct ext3_extents_tree *tree,
++ unsigned long start, unsigned long end)
++{
++ struct inode *inode = tree->inode;
++ struct super_block *sb = inode->i_sb;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_ext_path *path;
++ handle_t *handle;
++ int i = 0, err = 0;
++
++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end);
++
++ /* probably first extent we're gonna free will be last in block */
++ handle = ext3_journal_start(inode, depth + 1);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ ext3_ext_invalidate_cache(tree);
++
++ /*
++ * we start scanning from the right side, freeing all the blocks
++ * after i_size and walking into the depth
++ */
++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
++ if (path == NULL) {
++ ext3_error(sb, "ext3_ext_remove_space",
++ "Can't allocate path array");
++ ext3_journal_stop(handle);
++ return -ENOMEM;
++ }
++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++ path[i].p_hdr = EXT_ROOT_HDR(tree);
++
++ while (i >= 0 && err == 0) {
++ if (i == depth) {
++ /* this is leaf block */
++ err = ext3_ext_rm_leaf(handle, tree, path, start, end);
++ /* root level has p_bh == NULL, brelse() eats this */
++ brelse(path[i].p_bh);
++ i--;
++ continue;
++ }
++
++ /* this is index block */
++ if (!path[i].p_hdr) {
++ ext_debug(tree, "initialize header\n");
++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh);
++ }
++
++ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max);
++ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC);
++
++ if (!path[i].p_idx) {
++ /* this level hasn't been touched yet */
++ path[i].p_idx =
++ ext3_ext_last_covered(path[i].p_hdr, end);
++ path[i].p_block = path[i].p_hdr->eh_entries + 1;
++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n",
++ path[i].p_hdr, path[i].p_hdr->eh_entries);
++ } else {
++ /* we've already been here, go to the next index */
++ path[i].p_idx--;
++ }
++
++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n",
++ i, EXT_FIRST_INDEX(path[i].p_hdr),
++ path[i].p_idx);
++ if (ext3_ext_more_to_rm(path + i)) {
++ /* go to the next level */
++ ext_debug(tree, "move to level %d (block %d)\n",
++ i + 1, path[i].p_idx->ei_leaf);
++ memset(path + i + 1, 0, sizeof(*path));
++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf);
++ if (!path[i+1].p_bh) {
++ /* should we reset i_size? */
++ err = -EIO;
++ break;
++ }
++ /* store the actual number of indexes so we can tell whether
++ * it changes at the next iteration */
++ path[i].p_block = path[i].p_hdr->eh_entries;
++ i++;
++ } else {
++ /* we finish processing this index, go up */
++ if (path[i].p_hdr->eh_entries == 0 && i > 0) {
++ /* index is empty, remove it
++ * handle must already be prepared by the
++ * truncatei_leaf() */
++ err = ext3_ext_rm_idx(handle, tree, path + i);
++ }
++ /* root level has p_bh == NULL, brelse() eats this */
++ brelse(path[i].p_bh);
++ i--;
++ ext_debug(tree, "return to level %d\n", i);
++ }
++ }
++
++ /* TODO: flexible tree reduction should be here */
++ if (path->p_hdr->eh_entries == 0) {
++ /*
++ * truncation to zero freed the whole tree,
++ * so we need to correct eh_depth
++ */
++ err = ext3_ext_get_access(handle, tree, path);
++ if (err == 0) {
++ EXT_ROOT_HDR(tree)->eh_depth = 0;
++ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree);
++ err = ext3_ext_dirty(handle, tree, path);
++ }
++ }
++ ext3_ext_tree_changed(tree);
++
++ kfree(path);
++ ext3_journal_stop(handle);
++
++ return err;
++}
++
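++/*
++ * worst-case number of metadata blocks (leaves plus index blocks)
++ * needed to map 'blocks' blocks, assuming one extent per block
++ */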
++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks)
++{
++ int lcap, icap, rcap, leafs, idxs, num;
++
++ rcap = ext3_ext_space_root(tree);
++ if (blocks <= rcap) {
++ /* all extents fit to the root */
++ return 0;
++ }
++
++ rcap = ext3_ext_space_root_idx(tree);
++ lcap = ext3_ext_space_block(tree);
++ icap = ext3_ext_space_block_idx(tree);
++
++ num = leafs = (blocks + lcap - 1) / lcap;
++ if (leafs <= rcap) {
++ /* all pointers to leaves fit into the root */
++ return leafs;
++ }
++
++ /* ok. we need separate index block(s) to link all leaf blocks */
++ idxs = (leafs + icap - 1) / icap;
++ do {
++ num += idxs;
++ idxs = (idxs + icap - 1) / icap;
++ } while (idxs > rcap);
++
++ return num;
++}
++
++/*
++ * called at mount time
++ */
++void ext3_ext_init(struct super_block *sb)
++{
++ /*
++ * possible initialization would be here
++ */
++
++ if (test_opt(sb, EXTENTS)) {
++ printk("EXT3-fs: file extents enabled");
++#ifdef AGRESSIVE_TEST
++ printk(", agressive tests");
++#endif
++#ifdef CHECK_BINSEARCH
++ printk(", check binsearch");
++#endif
++ printk("\n");
++ }
++}
++
++/*
++ * called at umount time
++ */
++void ext3_ext_release(struct super_block *sb)
++{
++}
++
++/************************************************************************
++ * VFS related routines
++ ************************************************************************/
++
++static int ext3_get_inode_write_access(handle_t *handle, void *buffer)
++{
++ /* we use in-core data, not bh */
++ return 0;
++}
++
++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer)
++{
++ struct inode *inode = buffer;
++ return ext3_mark_inode_dirty(handle, inode);
++}
++
++static int ext3_ext_mergable(struct ext3_extent *ex1,
++ struct ext3_extent *ex2)
++{
++ /* FIXME: support for large fs */
++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start)
++ return 1;
++ return 0;
++}
++
++static int
++ext3_remove_blocks_credits(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex,
++ unsigned long from, unsigned long to)
++{
++ int needed;
++
++ /* at present, extent can't cross block group */
++ needed = 4; /* bitmap + group desc + sb + inode */
++
++#ifdef CONFIG_QUOTA
++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++ return needed;
++}
++
++static int
++ext3_remove_blocks(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex,
++ unsigned long from, unsigned long to)
++{
++ int needed = ext3_remove_blocks_credits(tree, ex, from, to);
++ handle_t *handle = ext3_journal_start(tree->inode, needed);
++ struct buffer_head *bh;
++ int i;
++
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
++ /* tail removal */
++ unsigned long num, start;
++ num = ex->ee_block + ex->ee_len - from;
++ start = ex->ee_start + ex->ee_len - num;
++ ext_debug(tree, "free last %lu blocks starting %lu\n",
++ num, start);
++ for (i = 0; i < num; i++) {
++ bh = sb_find_get_block(tree->inode->i_sb, start + i);
++ ext3_forget(handle, 0, tree->inode, bh, start + i);
++ }
++ ext3_free_blocks(handle, tree->inode, start, num);
++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
++ printk("strange request: removal %lu-%lu from %u:%u\n",
++ from, to, ex->ee_block, ex->ee_len);
++ } else {
++ printk("strange request: removal(2) %lu-%lu from %u:%u\n",
++ from, to, ex->ee_block, ex->ee_len);
++ }
++ ext3_journal_stop(handle);
++ return 0;
++}
++
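++/*
++ * pick a goal block for allocation: continue the extent found in
++ * the path if possible, otherwise stay near the index block or
++ * fall back to the inode's block group
++ */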
++static int ext3_ext_find_goal(struct inode *inode,
++ struct ext3_ext_path *path, unsigned long block)
++{
++ struct ext3_inode_info *ei = EXT3_I(inode);
++ unsigned long bg_start;
++ unsigned long colour;
++ int depth;
++
++ if (path) {
++ struct ext3_extent *ex;
++ depth = path->p_depth;
++
++ /* try to predict block placement */
++ if ((ex = path[depth].p_ext))
++ return ex->ee_start + (block - ex->ee_block);
++
++ /* it looks like the index is empty;
++ * try to find the goal starting from the index block itself */
++ if (path[depth].p_bh)
++ return path[depth].p_bh->b_blocknr;
++ }
++
++ /* OK. use inode's group */
++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++ colour = (current->pid % 16) *
++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++ return bg_start + colour + block;
++}
++
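++/*
++ * ->new_block callback for the block-map tree: hand out the first
++ * block of the extent itself (keeping metadata near the data) and,
++ * once the extent is exhausted, allocate a fresh starting block
++ * for it
++ */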
++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *ex, int *err)
++{
++ struct inode *inode = tree->inode;
++ int newblock, goal;
++
++ EXT_ASSERT(path);
++ EXT_ASSERT(ex);
++ EXT_ASSERT(ex->ee_start);
++ EXT_ASSERT(ex->ee_len);
++
++ /* reuse block from the extent to order data/metadata */
++ newblock = ex->ee_start++;
++ ex->ee_len--;
++ if (ex->ee_len == 0) {
++ ex->ee_len = 1;
++ /* allocate new block for the extent */
++ goal = ext3_ext_find_goal(inode, path, ex->ee_block);
++ ex->ee_start = ext3_new_block(handle, inode, goal, 0, 0, err);
++ if (ex->ee_start == 0) {
++ /* error occurred: restore old extent */
++ ex->ee_start = newblock;
++ return 0;
++ }
++ }
++ return newblock;
++}
++
++static struct ext3_extents_helpers ext3_blockmap_helpers = {
++ .get_write_access = ext3_get_inode_write_access,
++ .mark_buffer_dirty = ext3_mark_buffer_dirty,
++ .mergable = ext3_ext_mergable,
++ .new_block = ext3_new_block_cb,
++ .remove_extent = ext3_remove_blocks,
++ .remove_extent_credits = ext3_remove_blocks_credits,
++};
++
++void ext3_init_tree_desc(struct ext3_extents_tree *tree,
++ struct inode *inode)
++{
++ tree->inode = inode;
++ tree->root = (void *) EXT3_I(inode)->i_data;
++ tree->buffer = (void *) inode;
++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data);
++ tree->cex = (struct ext3_extent *) &EXT3_I(inode)->i_cached_extent;
++ tree->ops = &ext3_blockmap_helpers;
++}
++
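++/*
++ * the extents counterpart of ext3_get_block(): map 'iblock' to a
++ * physical block, consulting the extent cache first and allocating
++ * a new block when 'create' is set
++ */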
++int ext3_ext_get_block(handle_t *handle, struct inode *inode,
++ long iblock, struct buffer_head *bh_result,
++ int create, int extend_disksize)
++{
++ struct ext3_ext_path *path = NULL;
++ struct ext3_extent newex;
++ struct ext3_extent *ex;
++ int goal, newblock, err = 0, depth;
++ struct ext3_extents_tree tree;
++
++ clear_buffer_new(bh_result);
++ ext3_init_tree_desc(&tree, inode);
++ ext_debug(&tree, "block %d requested for inode %u\n",
++ (int) iblock, (unsigned) inode->i_ino);
++ down(&EXT3_I(inode)->truncate_sem);
++
++ /* check in cache */
++ if (ext3_ext_in_cache(&tree, iblock, &newex)) {
++ if (newex.ee_start == EXT_CACHE_MARK) {
++ /* this is cached gap */
++ if (!create) {
++ /* block isn't allocated yet and
++ * user doesn't want to allocate it */
++ goto out2;
++ }
++ /* we should allocate requested block */
++ } else if (newex.ee_start) {
++ /* block is already allocated */
++ newblock = iblock - newex.ee_block + newex.ee_start;
++ goto out;
++ }
++ }
++
++ /* find extent for this block */
++ path = ext3_ext_find_extent(&tree, iblock, NULL);
++ if (IS_ERR(path)) {
++ err = PTR_ERR(path);
++ path = NULL;
++ goto out2;
++ }
++
++ depth = EXT_DEPTH(&tree);
++
++ /*
++ * consistent leaf must not be empty
++ * this situation is possible, though, _during_ tree modification;
++ * this is why assert can't be put in ext3_ext_find_extent()
++ */
++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0);
++
++ if ((ex = path[depth].p_ext)) {
++ /* if found extent covers block, simply return it */
++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) {
++ newblock = iblock - ex->ee_block + ex->ee_start;
++ ext_debug(&tree, "%d fit into %d:%d -> %d\n",
++ (int) iblock, ex->ee_block, ex->ee_len,
++ newblock);
++ ext3_ext_put_in_cache(&tree, ex);
++ goto out;
++ }
++ }
++
++ /*
++ * requested block isn't allocated yet
++ * we must not create the block if the create flag is zero
++ */
++ if (!create) {
++ /* put just found gap into cache to speed up subsequent requests */
++ ext3_ext_put_gap_in_cache(&tree, path, iblock);
++ goto out2;
++ }
++
++ /* allocate new block */
++ goal = ext3_ext_find_goal(inode, path, iblock);
++ newblock = ext3_new_block(handle, inode, goal, 0, 0, &err);
++ if (!newblock)
++ goto out2;
++ ext_debug(&tree, "allocate new block: goal %d, found %d\n",
++ goal, newblock);
++
++ /* try to insert new extent into found leaf and return */
++ newex.ee_block = iblock;
++ newex.ee_start = newblock;
++ newex.ee_len = 1;
++ err = ext3_ext_insert_extent(handle, &tree, path, &newex);
++ if (err)
++ goto out2;
++
++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize)
++ EXT3_I(inode)->i_disksize = inode->i_size;
++
++ /* previous routine could have used the block we allocated */
++ newblock = newex.ee_start;
++ set_buffer_new(bh_result);
++
++ ext3_ext_put_in_cache(&tree, &newex);
++out:
++ ext3_ext_show_leaf(&tree, path);
++ map_bh(bh_result, inode->i_sb, newblock);
++out2:
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
++ up(&EXT3_I(inode)->truncate_sem);
++
++ return err;
++}
++
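++/*
++ * truncate the file down to i_size: zero the partial page and
++ * remove every extent beyond the new last block
++ */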
++void ext3_ext_truncate(struct inode * inode, struct page *page)
++{
++ struct address_space *mapping = inode->i_mapping;
++ struct super_block *sb = inode->i_sb;
++ struct ext3_extents_tree tree;
++ unsigned long last_block;
++ handle_t *handle;
++ int err = 0;
++
++ ext3_init_tree_desc(&tree, inode);
++
++ /*
++ * probably first extent we're gonna free will be last in block
++ */
++ err = ext3_writepage_trans_blocks(inode) + 3;
++ handle = ext3_journal_start(inode, err);
++ if (IS_ERR(handle)) {
++ if (page) {
++ clear_highpage(page);
++ flush_dcache_page(page);
++ unlock_page(page);
++ page_cache_release(page);
++ }
++ return;
++ }
++
++ if (page)
++ ext3_block_truncate_page(handle, page, mapping, inode->i_size);
++
++ down(&EXT3_I(inode)->truncate_sem);
++ ext3_ext_invalidate_cache(&tree);
++
++ /*
++ * TODO: optimization is possible here
++ * probably we don't need scanning at all,
++ * because page truncation is enough
++ */
++ if (ext3_orphan_add(handle, inode))
++ goto out_stop;
++
++ /* we have to know where to truncate from in crash case */
++ EXT3_I(inode)->i_disksize = inode->i_size;
++ ext3_mark_inode_dirty(handle, inode);
++
++ last_block = (inode->i_size + sb->s_blocksize - 1)
++ >> EXT3_BLOCK_SIZE_BITS(sb);
++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK);
++
++ /* In a multi-transaction truncate, we only make the final
++ * transaction synchronous */
++ if (IS_SYNC(inode))
++ handle->h_sync = 1;
++
++out_stop:
++ /*
++ * If this was a simple ftruncate(), and the file will remain alive
++ * then we need to clear up the orphan record which we created above.
++ * However, if this was a real unlink then we were called by
++ * ext3_delete_inode(), and we allow that function to clean up the
++ * orphan info for us.
++ */
++ if (inode->i_nlink)
++ ext3_orphan_del(handle, inode);
++
++ up(&EXT3_I(inode)->truncate_sem);
++ ext3_journal_stop(handle);
++}
++
++/*
++ * this routine calculates the max number of blocks we could modify
++ * in order to allocate a new block for an inode
++ */
++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num)
++{
++ struct ext3_extents_tree tree;
++ int needed;
++
++ ext3_init_tree_desc(&tree, inode);
++
++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL);
++
++ /* the caller wants to allocate 'num' blocks */
++ needed *= num;
++
++#ifdef CONFIG_QUOTA
++ /*
++ * FIXME: the real calculation should be here;
++ * it depends on the blockmap format of the quota file
++ */
++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++ return needed;
++}
++
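++/*
++ * initialize an empty extent tree in the inode body;
++ * called at inode creation time when the filesystem
++ * is mounted with the 'extents' option
++ */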
++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode)
++{
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ ext3_extent_tree_init(handle, &tree);
++}
++
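++/*
++ * estimate the number of metadata (index/leaf) blocks needed
++ * to map 'blocks' data blocks in the extent tree
++ */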
++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks)
++{
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ return ext3_ext_calc_metadata_amount(&tree, blocks);
++}
++
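++/*
++ * callback for EXT3_IOC_GET_EXTENTS: copies every existing extent
++ * into the user-supplied buffer; buf->err accumulates the number of
++ * extents copied, or is set to -EFAULT on a failed copy
++ */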
++static int
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newex, int exist)
++{
++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private;
++
++ if (!exist)
++ return EXT_CONTINUE;
++ if (buf->err < 0)
++ return EXT_BREAK;
++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen)
++ return EXT_BREAK;
++
++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) {
++ buf->err++;
++ buf->cur += sizeof(*newex);
++ } else {
++ buf->err = -EFAULT;
++ return EXT_BREAK;
++ }
++ return EXT_CONTINUE;
++}
++
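++/*
++ * callback for EXT3_IOC_GET_TREE_STATS: counts existing extents and
++ * leaf blocks (a leaf is counted when its first extent is visited)
++ */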
++static int
++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *ex, int exist)
++{
++ struct ext3_extent_tree_stats *buf =
++ (struct ext3_extent_tree_stats *) tree->private;
++ int depth;
++
++ if (!exist)
++ return EXT_CONTINUE;
++
++ depth = EXT_DEPTH(tree);
++ buf->extents_num++;
++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr))
++ buf->leaf_num++;
++ return EXT_CONTINUE;
++}
++
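++/*
++ * extent-tree ioctls; only inodes with EXT3_EXTENTS_FL are accepted.
++ * userspace usage of EXT3_IOC_GET_EXTENTS (sketch):
++ *
++ *	struct ext3_extent_buf buf;
++ *
++ *	buf.start = 0;			  first logical block to report
++ *	buf.buffer = extents;		  user array of struct ext3_extent
++ *	buf.buflen = sizeof(extents);
++ *	nr = ioctl(fd, EXT3_IOC_GET_EXTENTS, &buf);
++ *
++ * on success the return value is the number of extents copied
++ */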
++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
++ unsigned long arg)
++{
++ int err = 0;
++
++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL))
++ return -EINVAL;
++
++ if (cmd == EXT3_IOC_GET_EXTENTS) {
++ struct ext3_extent_buf buf;
++ struct ext3_extents_tree tree;
++
++ if (copy_from_user(&buf, (void *) arg, sizeof(buf)))
++ return -EFAULT;
++
++ ext3_init_tree_desc(&tree, inode);
++ buf.cur = buf.buffer;
++ buf.err = 0;
++ tree.private = &buf;
++ down(&EXT3_I(inode)->truncate_sem);
++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK,
++ ext3_ext_store_extent_cb);
++ up(&EXT3_I(inode)->truncate_sem);
++ if (err == 0)
++ err = buf.err;
++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) {
++ struct ext3_extent_tree_stats buf;
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ down(&EXT3_I(inode)->truncate_sem);
++ buf.depth = EXT_DEPTH(&tree);
++ buf.extents_num = 0;
++ buf.leaf_num = 0;
++ tree.private = &buf;
++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK,
++ ext3_ext_collect_stats_cb);
++ up(&EXT3_I(inode)->truncate_sem);
++ if (!err && copy_to_user((void *) arg, &buf, sizeof(buf)))
++ err = -EFAULT;
++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) {
++ struct ext3_extents_tree tree;
++ ext3_init_tree_desc(&tree, inode);
++ down(&EXT3_I(inode)->truncate_sem);
++ err = EXT_DEPTH(&tree);
++ up(&EXT3_I(inode)->truncate_sem);
++ }
++
++ return err;
++}
++
++EXPORT_SYMBOL(ext3_init_tree_desc);
++EXPORT_SYMBOL(ext3_mark_inode_dirty);
++EXPORT_SYMBOL(ext3_ext_invalidate_cache);
++EXPORT_SYMBOL(ext3_ext_insert_extent);
++EXPORT_SYMBOL(ext3_ext_walk_space);
++EXPORT_SYMBOL(ext3_ext_find_goal);
++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert);
++
+Index: linux-2.6.7/fs/ext3/ialloc.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/ialloc.c 2004-08-19 08:51:04.000000000 +0400
++++ linux-2.6.7/fs/ext3/ialloc.c 2004-08-19 08:53:49.000000000 +0400
+@@ -646,6 +646,10 @@
+ DQUOT_FREE_INODE(inode);
+ goto fail2;
+ }
++ if (test_opt(sb, EXTENTS)) {
++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL;
++ ext3_extents_initialize_blockmap(handle, inode);
++ }
+ err = ext3_mark_inode_dirty(handle, inode);
+ if (err) {
+ ext3_std_error(sb, err);
+Index: linux-2.6.7/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/inode.c 2004-08-19 08:51:04.000000000 +0400
++++ linux-2.6.7/fs/ext3/inode.c 2004-08-19 08:53:49.000000000 +0400
+@@ -857,6 +857,17 @@
+ goto reread;
+ }
+
++static inline int
++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block,
++ struct buffer_head *bh, int create, int extend_disksize)
++{
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_get_block(handle, inode, block, bh, create,
++ extend_disksize);
++ return ext3_get_block_handle(handle, inode, block, bh, create,
++ extend_disksize);
++}
++
+ static int ext3_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+ {
+@@ -867,8 +878,8 @@
+ handle = ext3_journal_current_handle();
+ J_ASSERT(handle != 0);
+ }
+- ret = ext3_get_block_handle(handle, inode, iblock,
+- bh_result, create, 1);
++ ret = ext3_get_block_wrap(handle, inode, iblock,
++ bh_result, create, 1);
+ return ret;
+ }
+
+@@ -894,8 +905,8 @@
+ }
+ }
+ if (ret == 0)
+- ret = ext3_get_block_handle(handle, inode, iblock,
+- bh_result, create, 0);
++ ret = ext3_get_block_wrap(handle, inode, iblock,
++ bh_result, create, 0);
+ if (ret == 0)
+ bh_result->b_size = (1 << inode->i_blkbits);
+ return ret;
+@@ -916,7 +927,7 @@
+ dummy.b_state = 0;
+ dummy.b_blocknr = -1000;
+ buffer_trace_init(&dummy.b_history);
+- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1);
+ if (!*errp && buffer_mapped(&dummy)) {
+ struct buffer_head *bh;
+ bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
+@@ -1669,7 +1680,7 @@
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+-static int ext3_block_truncate_page(handle_t *handle, struct page *page,
++int ext3_block_truncate_page(handle_t *handle, struct page *page,
+ struct address_space *mapping, loff_t from)
+ {
+ unsigned long index = from >> PAGE_CACHE_SHIFT;
+@@ -2165,6 +2176,9 @@
+ return;
+ }
+
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_truncate(inode, page);
++
+ handle = start_transaction(inode);
+ if (IS_ERR(handle)) {
+ if (page) {
+@@ -2888,6 +2902,9 @@
+ int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
+ int ret;
+
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_writepage_trans_blocks(inode, bpp);
++
+ if (ext3_should_journal_data(inode))
+ ret = 3 * (bpp + indirects) + 2;
+ else
+Index: linux-2.6.7/fs/ext3/Makefile
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/Makefile 2004-08-19 08:52:14.000000000 +0400
++++ linux-2.6.7/fs/ext3/Makefile 2004-08-19 08:53:49.000000000 +0400
+@@ -5,7 +5,7 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
+
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+- ioctl.o namei.o super.o symlink.o hash.o
++ ioctl.o namei.o super.o symlink.o hash.o extents.o
+
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+Index: linux-2.6.7/fs/ext3/super.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/super.c 2004-08-19 08:51:04.000000000 +0400
++++ linux-2.6.7/fs/ext3/super.c 2004-08-19 08:53:49.000000000 +0400
+@@ -392,6 +392,7 @@
+ struct ext3_super_block *es = sbi->s_es;
+ int i;
+
++ ext3_ext_release(sb);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+ if (!(sb->s_flags & MS_RDONLY)) {
+@@ -455,6 +456,9 @@
+ ei->i_default_acl = EXT3_ACL_NOT_CACHED;
+ #endif
+ ei->vfs_inode.i_version = 1;
++ ei->i_cached_extent[0] = 0;
++ ei->i_cached_extent[1] = 0;
++ ei->i_cached_extent[2] = 0;
+ return &ei->vfs_inode;
+ }
+
+@@ -590,7 +594,7 @@
+ Opt_commit, Opt_journal_update, Opt_journal_inum,
+ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+- Opt_err,
++ Opt_err, Opt_extents, Opt_extdebug
+ };
+
+ static match_table_t tokens = {
+@@ -638,6 +642,8 @@
+ {Opt_iopen, "iopen"},
+ {Opt_noiopen, "noiopen"},
+ {Opt_iopen_nopriv, "iopen_nopriv"},
++ {Opt_extents, "extents"},
++ {Opt_extdebug, "extdebug"},
+ {Opt_err, NULL}
+ };
+
+@@ -917,6 +923,12 @@
+ break;
+ case Opt_ignore:
+ break;
++ case Opt_extents:
++ set_opt (sbi->s_mount_opt, EXTENTS);
++ break;
++ case Opt_extdebug:
++ set_opt (sbi->s_mount_opt, EXTDEBUG);
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+@@ -1589,6 +1601,8 @@
+ percpu_counter_mod(&sbi->s_dirs_counter,
+ ext3_count_dirs(sb));
+
++ ext3_ext_init(sb);
++
+ return 0;
+
+ failed_mount3:
+Index: linux-2.6.7/fs/ext3/ioctl.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/ioctl.c 2004-08-19 08:51:03.000000000 +0400
++++ linux-2.6.7/fs/ext3/ioctl.c 2004-08-19 08:53:49.000000000 +0400
+@@ -176,6 +176,10 @@
+ return ret;
+ }
+ #endif
++ case EXT3_IOC_GET_EXTENTS:
++ case EXT3_IOC_GET_TREE_STATS:
++ case EXT3_IOC_GET_TREE_DEPTH:
++ return ext3_ext_ioctl(inode, filp, cmd, arg);
+ default:
+ return -ENOTTY;
+ }
+Index: linux-2.6.7/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/ext3_fs.h 2004-08-19 08:51:04.000000000 +0400
++++ linux-2.6.7/include/linux/ext3_fs.h 2004-08-19 08:53:49.000000000 +0400
+@@ -186,6 +186,7 @@
+ #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
+ #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
+ #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */
++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */
+
+ #define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
+ #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
+@@ -209,6 +210,9 @@
+ #ifdef CONFIG_JBD_DEBUG
+ #define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
+ #endif
++#define EXT3_IOC_GET_EXTENTS _IOR('f', 5, long)
++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 6, long)
++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 7, long)
+
+ /*
+ * Structure of an inode on the disk
+@@ -329,6 +333,8 @@
+ #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */
+ #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */
+ #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */
++#define EXT3_MOUNT_EXTENTS 0x10000 /* Extents support */
++#define EXT3_MOUNT_EXTDEBUG 0x20000 /* Extents debug */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+@@ -724,6 +730,7 @@
+
+
+ /* inode.c */
++extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
+ extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+@@ -796,6 +803,14 @@
+ extern struct inode_operations ext3_symlink_inode_operations;
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
+
++/* extents.c */
++extern int ext3_ext_writepage_trans_blocks(struct inode *, int);
++extern int ext3_ext_get_block(handle_t *, struct inode *, long,
++ struct buffer_head *, int, int);
++extern void ext3_ext_truncate(struct inode *, struct page *);
++extern void ext3_ext_init(struct super_block *);
++extern void ext3_ext_release(struct super_block *);
++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *);
+
+ #endif /* __KERNEL__ */
+
+Index: linux-2.6.7/include/linux/ext3_extents.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/ext3_extents.h 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.6.7/include/linux/ext3_extents.h 2004-08-19 08:53:49.000000000 +0400
+@@ -0,0 +1,238 @@
++/*
++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public Licens
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
++ */
++
++#ifndef _LINUX_EXT3_EXTENTS
++#define _LINUX_EXT3_EXTENTS
++
++/*
++ * with AGRESSIVE_TEST defined, the capacity of index/leaf blocks
++ * becomes very small, so index splits, in-depth growing and
++ * other hard changes happen much more often;
++ * this is for debug purposes only
++ */
++#define AGRESSIVE_TEST_
++
++/*
++ * if CHECK_BINSEARCH is defined, then the results of the binary
++ * search are verified by a linear search
++ */
++#define CHECK_BINSEARCH_
++
++/*
++ * if EXT_DEBUG is defined, you can use the 'extdebug' mount option
++ * to get lots of info about what's going on
++ */
++#define EXT_DEBUG
++#ifdef EXT_DEBUG
++#define ext_debug(tree,fmt,a...) \
++do { \
++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \
++ printk(fmt, ##a); \
++} while (0);
++#else
++#define ext_debug(tree,fmt,a...)
++#endif
++
++/*
++ * if EXT_STATS is defined, then stats numbers are collected;
++ * these numbers will be displayed at umount time
++ */
++#define EXT_STATS_
++
++
++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */
++
++/*
++ * ext3_inode has an i_block array (60 bytes total);
++ * the first 4 bytes are used to store:
++ * - tree depth (0 means there is no tree yet; all extents are in the inode)
++ * - the number of alive extents in the inode
++ */
++
++/*
++ * this is the extent on-disk structure;
++ * it's used at the bottom of the tree
++ */
++struct ext3_extent {
++ __u32 ee_block; /* first logical block extent covers */
++ __u16 ee_len; /* number of blocks covered by extent */
++ __u16 ee_start_hi; /* high 16 bits of physical block */
++ __u32 ee_start; /* low 32 bits of physical block */
++};
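++/* note: ee_start_hi is a placeholder for >32-bit block numbers;
++ * the current insert path always stores 0 there */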
++
++/*
++ * this is the index on-disk structure;
++ * it's used at all levels except the bottom
++ */
++struct ext3_extent_idx {
++ __u32 ei_block; /* index covers logical blocks from 'block' */
++ __u32 ei_leaf; /* pointer to the physical block of the next *
++ * level; a leaf or the next index could be here */
++ __u16 ei_leaf_hi; /* high 16 bits of physical block */
++ __u16 ei_unused;
++};
++
++/*
++ * every block (leaf or index), even the inode-stored one, has a header
++ */
++struct ext3_extent_header {
++ __u16 eh_magic; /* probably will support different formats */
++ __u16 eh_entries; /* number of valid entries */
++ __u16 eh_max; /* capacity of the store, in entries */
++ __u16 eh_depth; /* does the tree have real underlying blocks? */
++ __u32 eh_generation; /* generation of the tree */
++};
++
++#define EXT3_EXT_MAGIC 0xf30a
++
++/*
++ * an array of ext3_ext_path contains the path to some extent;
++ * creation/lookup routines use it for traversal/splitting/etc.,
++ * truncate uses it to simulate recursive walking
++ */
++struct ext3_ext_path {
++ __u32 p_block;
++ __u16 p_depth;
++ struct ext3_extent *p_ext;
++ struct ext3_extent_idx *p_idx;
++ struct ext3_extent_header *p_hdr;
++ struct buffer_head *p_bh;
++};
++
++/*
++ * structure for external API
++ */
++
++/*
++ * ext3_extents_tree is used to pass initial information
++ * to the top-level extents API
++ */
++struct ext3_extents_helpers;
++struct ext3_extents_tree {
++ struct inode *inode; /* the inode this tree belongs to */
++ void *root; /* ptr to the data the top of the tree resides at */
++ void *buffer; /* will be passed as an arg to the ops routines */
++ int buffer_len;
++ void *private;
++ struct ext3_extent *cex;/* last found extent */
++ struct ext3_extents_helpers *ops;
++};
++
++struct ext3_extents_helpers {
++ int (*get_write_access)(handle_t *h, void *buffer);
++ int (*mark_buffer_dirty)(handle_t *h, void *buffer);
++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2);
++ int (*remove_extent_credits)(struct ext3_extents_tree *,
++ struct ext3_extent *, unsigned long,
++ unsigned long);
++ int (*remove_extent)(struct ext3_extents_tree *,
++ struct ext3_extent *, unsigned long,
++ unsigned long);
++ int (*new_block)(handle_t *, struct ext3_extents_tree *,
++ struct ext3_ext_path *, struct ext3_extent *,
++ int *);
++};
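++/*
++ * several of these helpers may be left NULL: the generic code then
++ * falls back to a default, e.g. ext3_ext_new_block() uses
++ * ext3_new_block() when ->new_block is not set, and extents are
++ * considered mergable when ->mergable is not set
++ */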
++
++/*
++ * to be called by ext3_ext_walk_space();
++ * a negative retcode means an error,
++ * a positive retcode is a signal for ext3_ext_walk_space(), see below;
++ * the callback must return a valid extent (passed or newly created)
++ */
++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *,
++ struct ext3_ext_path *,
++ struct ext3_extent *, int);
++
++#define EXT_CONTINUE 0
++#define EXT_BREAK 1
++#define EXT_REPEAT 2
++
++
++#define EXT_MAX_BLOCK 0xffffffff
++#define EXT_CACHE_MARK 0xffff
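++/*
++ * a cached extent with ee_start == EXT_CACHE_MARK describes a gap:
++ * the range is known to have no blocks allocated
++ */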
++
++
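++/* navigation over the entries of a leaf/index block, given its header */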
++#define EXT_FIRST_EXTENT(__hdr__) \
++ ((struct ext3_extent *) (((char *) (__hdr__)) + \
++ sizeof(struct ext3_extent_header)))
++#define EXT_FIRST_INDEX(__hdr__) \
++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \
++ sizeof(struct ext3_extent_header)))
++#define EXT_HAS_FREE_INDEX(__path__) \
++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max)
++#define EXT_LAST_EXTENT(__hdr__) \
++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1)
++#define EXT_LAST_INDEX(__hdr__) \
++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1)
++#define EXT_MAX_EXTENT(__hdr__) \
++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_MAX_INDEX(__hdr__) \
++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++
++#define EXT_ROOT_HDR(tree) \
++ ((struct ext3_extent_header *) (tree)->root)
++#define EXT_BLOCK_HDR(bh) \
++ ((struct ext3_extent_header *) (bh)->b_data)
++#define EXT_DEPTH(_t_) \
++ (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
++#define EXT_GENERATION(_t_) \
++ (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++
++
++#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
++
++
++/*
++ * this structure is used to gather extents from the tree via ioctl;
++ * buffer and cur point into userspace, err returns the number of
++ * extents copied or a negative error
++ */
++struct ext3_extent_buf {
++ unsigned long start;
++ int buflen;
++ void *buffer;
++ void *cur;
++ int err;
++};
++
++/*
++ * this structure is used to collect stats info about the tree
++ */
++struct ext3_extent_tree_stats {
++ int depth;
++ int extents_num;
++ int leaf_num;
++};
++
++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *);
++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *);
++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *);
++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback);
++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long);
++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *);
++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *);
++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int);
++
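++/* forget the cached extent: a zero ee_len marks the cache as invalid */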
++static inline void
++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree)
++{
++ if (tree->cex)
++ tree->cex->ee_len = 0;
++}
++
++
++#endif /* _LINUX_EXT3_EXTENTS */
++
+Index: linux-2.6.7/include/linux/ext3_fs_i.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/ext3_fs_i.h 2004-08-19 08:51:04.000000000 +0400
++++ linux-2.6.7/include/linux/ext3_fs_i.h 2004-08-19 08:53:49.000000000 +0400
+@@ -111,6 +111,8 @@
+ */
+ struct semaphore truncate_sem;
+ struct inode vfs_inode;
++
++ __u32 i_cached_extent[3];
+ };
+
+ #endif /* _LINUX_EXT3_FS_I */
+
+%diffstat
+ fs/ext3/Makefile | 2
+ fs/ext3/extents.c | 2306 +++++++++++++++++++++++++++++++++++++++++++
+ fs/ext3/ialloc.c | 4
+ fs/ext3/inode.c | 29
+ fs/ext3/ioctl.c | 4
+ fs/ext3/super.c | 16
+ include/linux/ext3_extents.h | 238 ++++
+ include/linux/ext3_fs.h | 15
+ include/linux/ext3_fs_i.h | 2
+ 9 files changed, 2608 insertions(+), 8 deletions(-)
+
--- /dev/null
+%patch
+Index: linux-2.6.7/fs/ext3/extents.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/extents.c 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.6.7/fs/ext3/extents.c 2004-08-19 08:53:49.000000000 +0400
+@@ -0,0 +1,2306 @@
++/*
++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public Licens
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
++ */
++
++/*
++ * Extents support for EXT3
++ *
++ * TODO:
++ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent()
++ * - ext3_ext_calc_credits() could take 'mergable' into account
++ * - ext3*_error() should be used in some situations
++ * - find_goal() [to be tested and improved]
++ * - smart tree reduction
++ * - arch-independence
++ * common on-disk format for big/little-endian arch
++ */
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/time.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/smp_lock.h>
++#include <linux/highuid.h>
++#include <linux/pagemap.h>
++#include <linux/quotaops.h>
++#include <linux/string.h>
++#include <linux/slab.h>
++#include <linux/ext3_extents.h>
++#include <asm/uaccess.h>
++
++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
++{
++ int err;
++
++ if (handle->h_buffer_credits > needed)
++ return handle;
++ if (!ext3_journal_extend(handle, needed))
++ return handle;
++ err = ext3_journal_restart(handle, needed);
++
++ return handle;
++}
++
++static int inline
++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree)
++{
++ if (tree->ops->get_write_access)
++ return tree->ops->get_write_access(h,tree->buffer);
++ else
++ return 0;
++}
++
++static int inline
++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree)
++{
++ if (tree->ops->mark_buffer_dirty)
++ return tree->ops->mark_buffer_dirty(h,tree->buffer);
++ else
++ return 0;
++}
++
++/*
++ * could return:
++ * - EROFS
++ * - ENOMEM
++ */
++static int ext3_ext_get_access(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int err;
++
++ if (path->p_bh) {
++ /* path points to block */
++ err = ext3_journal_get_write_access(handle, path->p_bh);
++ } else {
++ /* path points to leaf/index in inode body */
++ err = ext3_ext_get_access_for_root(handle, tree);
++ }
++ return err;
++}
++
++/*
++ * could return:
++ * - EROFS
++ * - ENOMEM
++ * - EIO
++ */
++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int err;
++ if (path->p_bh) {
++ /* path points to block */
++ err =ext3_journal_dirty_metadata(handle, path->p_bh);
++ } else {
++ /* path points to leaf/index in inode body */
++ err = ext3_ext_mark_root_dirty(handle, tree);
++ }
++ return err;
++}
++
++static int inline
++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, struct ext3_extent *ex,
++ int *err)
++{
++ int goal, depth, newblock;
++ struct inode *inode;
++
++ EXT_ASSERT(tree);
++ if (tree->ops->new_block)
++ return tree->ops->new_block(handle, tree, path, ex, err);
++
++ inode = tree->inode;
++ depth = EXT_DEPTH(tree);
++ if (path && depth > 0) {
++ goal = path[depth-1].p_block;
++ } else {
++ struct ext3_inode_info *ei = EXT3_I(inode);
++ unsigned long bg_start;
++ unsigned long colour;
++
++ bg_start = (ei->i_block_group *
++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++ colour = (current->pid % 16) *
++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++ goal = bg_start + colour;
++ }
++
++ newblock = ext3_new_block(handle, inode, goal, 0, 0, err);
++ return newblock;
++}
++
++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
++{
++ struct ext3_extent_header *neh;
++ neh = EXT_ROOT_HDR(tree);
++ neh->eh_generation++;
++}
++
++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->inode->i_sb->s_blocksize -
++ sizeof(struct ext3_extent_header))
++ / sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++ size = 6;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->inode->i_sb->s_blocksize -
++ sizeof(struct ext3_extent_header))
++ / sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++ size = 5;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->buffer_len - sizeof(struct ext3_extent_header))
++ / sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++ size = 3;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->buffer_len -
++ sizeof(struct ext3_extent_header))
++ / sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++ size = 4;
++#endif
++ return size;
++}
++
++static void ext3_ext_show_path(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++ int k, l = path->p_depth;
++
++ ext_debug(tree, "path:");
++ for (k = 0; k <= l; k++, path++) {
++ if (path->p_idx) {
++ ext_debug(tree, " %d->%d", path->p_idx->ei_block,
++ path->p_idx->ei_leaf);
++ } else if (path->p_ext) {
++ ext_debug(tree, " %d:%d:%d",
++ path->p_ext->ee_block,
++ path->p_ext->ee_len,
++ path->p_ext->ee_start);
++ } else
++ ext_debug(tree, " []");
++ }
++ ext_debug(tree, "\n");
++#endif
++}
++
++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent_header *eh;
++ struct ext3_extent *ex;
++ int i;
++
++ if (!path)
++ return;
++
++ eh = path[depth].p_hdr;
++ ex = EXT_FIRST_EXTENT(eh);
++
++ for (i = 0; i < eh->eh_entries; i++, ex++) {
++ ext_debug(tree, "%d:%d:%d ",
++ ex->ee_block, ex->ee_len, ex->ee_start);
++ }
++ ext_debug(tree, "\n");
++#endif
++}
++
++static void ext3_ext_drop_refs(struct ext3_ext_path *path)
++{
++ int depth = path->p_depth;
++ int i;
++
++ for (i = 0; i <= depth; i++, path++)
++ if (path->p_bh) {
++ brelse(path->p_bh);
++ path->p_bh = NULL;
++ }
++}
++
++/*
++ * binary search for closest index by given block
++ */
++static inline void
++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, int block)
++{
++ struct ext3_extent_header *eh = path->p_hdr;
++ struct ext3_extent_idx *ix;
++ int l = 0, k, r;
++
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++ EXT_ASSERT(eh->eh_entries > 0);
++
++ ext_debug(tree, "binsearch for %d(idx): ", block);
++
++ path->p_idx = ix = EXT_FIRST_INDEX(eh);
++
++ r = k = eh->eh_entries;
++ while (k > 1) {
++ k = (r - l) / 2;
++ if (block < ix[l + k].ei_block)
++ r -= k;
++ else
++ l += k;
++ ext_debug(tree, "%d:%d:%d ", k, l, r);
++ }
++
++ ix += l;
++ path->p_idx = ix;
++ ext_debug(tree, " -> %d->%d ", path->p_idx->ei_block, path->p_idx->ei_leaf);
++
++ while (l++ < r) {
++ if (block < ix->ei_block)
++ break;
++ path->p_idx = ix++;
++ }
++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block,
++ path->p_idx->ei_leaf);
++
++#ifdef CHECK_BINSEARCH
++ {
++ struct ext3_extent_idx *chix;
++
++ chix = ix = EXT_FIRST_INDEX(eh);
++ for (k = 0; k < eh->eh_entries; k++, ix++) {
++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) {
++ printk("k=%d, ix=0x%p, first=0x%p\n", k,
++ ix, EXT_FIRST_INDEX(eh));
++ printk("%u <= %u\n",
++ ix->ei_block,ix[-1].ei_block);
++ }
++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block);
++ if (block < ix->ei_block)
++ break;
++ chix = ix;
++ }
++ EXT_ASSERT(chix == path->p_idx);
++ }
++#endif
++
++}
++
++/*
++ * binary search for closest extent by given block
++ */
++static inline void
++ext3_ext_binsearch(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, int block)
++{
++ struct ext3_extent_header *eh = path->p_hdr;
++ struct ext3_extent *ex;
++ int l = 0, k, r;
++
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++
++ if (eh->eh_entries == 0) {
++ /*
++ * this leaf is empty yet:
++ * we get such a leaf in split/add case
++ */
++ return;
++ }
++
++ ext_debug(tree, "binsearch for %d: ", block);
++
++ path->p_ext = ex = EXT_FIRST_EXTENT(eh);
++
++ r = k = eh->eh_entries;
++ while (k > 1) {
++ k = (r - l) / 2;
++ if (block < ex[l + k].ee_block)
++ r -= k;
++ else
++ l += k;
++ ext_debug(tree, "%d:%d:%d ", k, l, r);
++ }
++
++ ex += l;
++ path->p_ext = ex;
++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block,
++ path->p_ext->ee_start, path->p_ext->ee_len);
++
++ while (l++ < r) {
++ if (block < ex->ee_block)
++ break;
++ path->p_ext = ex++;
++ }
++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block,
++ path->p_ext->ee_start, path->p_ext->ee_len);
++
++#ifdef CHECK_BINSEARCH
++ {
++ struct ext3_extent *chex;
++
++ chex = ex = EXT_FIRST_EXTENT(eh);
++ for (k = 0; k < eh->eh_entries; k++, ex++) {
++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block);
++ if (block < ex->ee_block)
++ break;
++ chex = ex;
++ }
++ EXT_ASSERT(chex == path->p_ext);
++ }
++#endif
++
++}
++
++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree)
++{
++ struct ext3_extent_header *eh;
++
++ BUG_ON(tree->buffer_len == 0);
++ ext3_ext_get_access_for_root(handle, tree);
++ eh = EXT_ROOT_HDR(tree);
++ eh->eh_depth = 0;
++ eh->eh_entries = 0;
++ eh->eh_magic = EXT3_EXT_MAGIC;
++ eh->eh_max = ext3_ext_space_root(tree);
++ ext3_ext_mark_root_dirty(handle, tree);
++ ext3_ext_invalidate_cache(tree);
++ return 0;
++}
++
++struct ext3_ext_path *
++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block,
++ struct ext3_ext_path *path)
++{
++ struct ext3_extent_header *eh;
++ struct buffer_head *bh;
++ int depth, i, ppos = 0;
++
++ EXT_ASSERT(tree);
++ EXT_ASSERT(tree->inode);
++ EXT_ASSERT(tree->root);
++
++ eh = EXT_ROOT_HDR(tree);
++ EXT_ASSERT(eh);
++ i = depth = EXT_DEPTH(tree);
++ EXT_ASSERT(eh->eh_max);
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++ EXT_ASSERT(i == 0 || eh->eh_entries > 0);
++
++ /* account possible depth increase */
++ if (!path) {
++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2),
++ GFP_NOFS);
++ if (!path)
++ return ERR_PTR(-ENOMEM);
++ }
++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++ path[0].p_hdr = eh;
++
++ /* walk through the tree */
++ while (i) {
++ ext_debug(tree, "depth %d: num %d, max %d\n",
++ ppos, eh->eh_entries, eh->eh_max);
++ ext3_ext_binsearch_idx(tree, path + ppos, block);
++ path[ppos].p_block = path[ppos].p_idx->ei_leaf;
++ path[ppos].p_depth = i;
++ path[ppos].p_ext = NULL;
++
++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block);
++ if (!bh) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ return ERR_PTR(-EIO);
++ }
++ eh = EXT_BLOCK_HDR(bh);
++ ppos++;
++ EXT_ASSERT(ppos <= depth);
++ path[ppos].p_bh = bh;
++ path[ppos].p_hdr = eh;
++ i--;
++ }
++
++ path[ppos].p_depth = i;
++ path[ppos].p_hdr = eh;
++ path[ppos].p_ext = NULL;
++
++ /* find extent */
++ ext3_ext_binsearch(tree, path + ppos, block);
++
++ ext3_ext_show_path(tree, path);
++
++ return path;
++}
++
++/*
++ * insert new index [logical;ptr] into the block at cupr
++ * it check where to insert: before curp or after curp
++ */
++static int ext3_ext_insert_index(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *curp,
++ int logical, int ptr)
++{
++ struct ext3_extent_idx *ix;
++ int len, err;
++
++ if ((err = ext3_ext_get_access(handle, tree, curp)))
++ return err;
++
++ EXT_ASSERT(logical != curp->p_idx->ei_block);
++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
++ if (logical > curp->p_idx->ei_block) {
++ /* insert after */
++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
++ len = (len - 1) * sizeof(struct ext3_extent_idx);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert new index %d after: %d. "
++ "move %d from 0x%p to 0x%p\n",
++ logical, ptr, len,
++ (curp->p_idx + 1), (curp->p_idx + 2));
++ memmove(curp->p_idx + 2, curp->p_idx + 1, len);
++ }
++ ix = curp->p_idx + 1;
++ } else {
++ /* insert before */
++ len = len * sizeof(struct ext3_extent_idx);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert new index %d before: %d. "
++ "move %d from 0x%p to 0x%p\n",
++ logical, ptr, len,
++ curp->p_idx, (curp->p_idx + 1));
++ memmove(curp->p_idx + 1, curp->p_idx, len);
++ ix = curp->p_idx;
++ }
++
++ ix->ei_block = logical;
++ ix->ei_leaf = ptr;
++ curp->p_hdr->eh_entries++;
++
++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max);
++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr));
++
++ err = ext3_ext_dirty(handle, tree, curp);
++ ext3_std_error(tree->inode->i_sb, err);
++
++ return err;
++}
++
++/*
++ * routine inserts new subtree into the path, using free index entry
++ * at depth 'at:
++ * - allocates all needed blocks (new leaf and all intermediate index blocks)
++ * - makes decision where to split
++ * - moves remaining extens and index entries (right to the split point)
++ * into the newly allocated blocks
++ * - initialize subtree
++ */
++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext, int at)
++{
++ struct buffer_head *bh = NULL;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent_header *neh;
++ struct ext3_extent_idx *fidx;
++ struct ext3_extent *ex;
++ int i = at, k, m, a;
++ unsigned long newblock, oldblock, border;
++ int *ablocks = NULL; /* array of allocated blocks */
++ int err = 0;
++
++ /* make decision: where to split? */
++ /* FIXME: now desicion is simplest: at current extent */
++
++ /* if current leaf will be splitted, then we should use
++ * border from split point */
++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr));
++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
++ border = path[depth].p_ext[1].ee_block;
++ ext_debug(tree, "leaf will be splitted."
++ " next leaf starts at %d\n",
++ (int)border);
++ } else {
++ border = newext->ee_block;
++ ext_debug(tree, "leaf will be added."
++ " next leaf starts at %d\n",
++ (int)border);
++ }
++
++ /*
++ * if error occurs, then we break processing
++ * and turn filesystem read-only. so, index won't
++ * be inserted and tree will be in consistent
++ * state. next mount will repair buffers too
++ */
++
++ /*
++ * get array to track all allocated blocks
++ * we need this to handle errors and free blocks
++ * upon them
++ */
++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS);
++ if (!ablocks)
++ return -ENOMEM;
++ memset(ablocks, 0, sizeof(unsigned long) * depth);
++
++ /* allocate all needed blocks */
++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at);
++ for (a = 0; a < depth - at; a++) {
++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++ if (newblock == 0)
++ goto cleanup;
++ ablocks[a] = newblock;
++ }
++
++ /* initialize new leaf */
++ newblock = ablocks[--a];
++ EXT_ASSERT(newblock);
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ goto cleanup;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh)))
++ goto cleanup;
++
++ neh = EXT_BLOCK_HDR(bh);
++ neh->eh_entries = 0;
++ neh->eh_max = ext3_ext_space_block(tree);
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ neh->eh_depth = 0;
++ ex = EXT_FIRST_EXTENT(neh);
++
++ /* move remain of path[depth] to the new leaf */
++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max);
++ /* start copy from next extent */
++ /* TODO: we could do it by single memmove */
++ m = 0;
++ path[depth].p_ext++;
++ while (path[depth].p_ext <=
++ EXT_MAX_EXTENT(path[depth].p_hdr)) {
++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n",
++ path[depth].p_ext->ee_block,
++ path[depth].p_ext->ee_start,
++ path[depth].p_ext->ee_len,
++ newblock);
++ memmove(ex++, path[depth].p_ext++,
++ sizeof(struct ext3_extent));
++ neh->eh_entries++;
++ m++;
++ }
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto cleanup;
++ brelse(bh);
++ bh = NULL;
++
++ /* correct old leaf */
++ if (m) {
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ goto cleanup;
++ path[depth].p_hdr->eh_entries -= m;
++ if ((err = ext3_ext_dirty(handle, tree, path + depth)))
++ goto cleanup;
++
++ }
++
++ /* create intermediate indexes */
++ k = depth - at - 1;
++ EXT_ASSERT(k >= 0);
++ if (k)
++ ext_debug(tree, "create %d intermediate indices\n", k);
++ /* insert new index into current index block */
++ /* current depth stored in i var */
++ i = depth - 1;
++ while (k--) {
++ oldblock = newblock;
++ newblock = ablocks[--a];
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ goto cleanup;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh)))
++ goto cleanup;
++
++ neh = EXT_BLOCK_HDR(bh);
++ neh->eh_entries = 1;
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ neh->eh_max = ext3_ext_space_block_idx(tree);
++ neh->eh_depth = depth - i;
++ fidx = EXT_FIRST_INDEX(neh);
++ fidx->ei_block = border;
++ fidx->ei_leaf = oldblock;
++
++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n",
++ i, newblock, border, oldblock);
++ /* copy indexes */
++ m = 0;
++ path[i].p_idx++;
++
++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx,
++ EXT_MAX_INDEX(path[i].p_hdr));
++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) ==
++ EXT_LAST_INDEX(path[i].p_hdr));
++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
++ ext_debug(tree, "%d: move %d:%d in new index %lu\n",
++ i, path[i].p_idx->ei_block,
++ path[i].p_idx->ei_leaf, newblock);
++ memmove(++fidx, path[i].p_idx++,
++ sizeof(struct ext3_extent_idx));
++ neh->eh_entries++;
++ EXT_ASSERT(neh->eh_entries <= neh->eh_max);
++ m++;
++ }
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto cleanup;
++ brelse(bh);
++ bh = NULL;
++
++ /* correct old index */
++ if (m) {
++ err = ext3_ext_get_access(handle, tree, path + i);
++ if (err)
++ goto cleanup;
++ path[i].p_hdr->eh_entries -= m;
++ err = ext3_ext_dirty(handle, tree, path + i);
++ if (err)
++ goto cleanup;
++ }
++
++ i--;
++ }
++
++ /* insert new index */
++ if (!err)
++ err = ext3_ext_insert_index(handle, tree, path + at,
++ border, newblock);
++
++cleanup:
++ if (bh) {
++ if (buffer_locked(bh))
++ unlock_buffer(bh);
++ brelse(bh);
++ }
++
++ if (err) {
++ /* free all allocated blocks in error case */
++ for (i = 0; i < depth; i++) {
++ if (!ablocks[i])
++ continue;
++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
++ }
++ }
++ kfree(ablocks);
++
++ return err;
++}
++
++/*
++ * routine implements tree growing procedure:
++ * - allocates new block
++ * - moves top-level data (index block or leaf) into the new block
++ * - initialize new top-level, creating index that points to the
++ * just created block
++ */
++static int ext3_ext_grow_indepth(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_ext_path *curp = path;
++ struct ext3_extent_header *neh;
++ struct ext3_extent_idx *fidx;
++ struct buffer_head *bh;
++ unsigned long newblock;
++ int err = 0;
++
++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++ if (newblock == 0)
++ return err;
++
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ ext3_std_error(tree->inode->i_sb, err);
++ return err;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh))) {
++ unlock_buffer(bh);
++ goto out;
++ }
++
++ /* move top-level index/leaf into new block */
++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len);
++
++ /* set size of new block */
++ neh = EXT_BLOCK_HDR(bh);
++ /* old root could have indexes or leaves
++ * so calculate e_max right way */
++ if (EXT_DEPTH(tree))
++ neh->eh_max = ext3_ext_space_block_idx(tree);
++ else
++ neh->eh_max = ext3_ext_space_block(tree);
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto out;
++
++ /* create index in new top-level index: num,max,pointer */
++ if ((err = ext3_ext_get_access(handle, tree, curp)))
++ goto out;
++
++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC;
++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree);
++ curp->p_hdr->eh_entries = 1;
++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
++ /* FIXME: it works, but actually path[0] can be index */
++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
++ curp->p_idx->ei_leaf = newblock;
++
++ neh = EXT_ROOT_HDR(tree);
++ fidx = EXT_FIRST_INDEX(neh);
++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n",
++ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf);
++
++ neh->eh_depth = path->p_depth + 1;
++ err = ext3_ext_dirty(handle, tree, curp);
++out:
++ brelse(bh);
++
++ return err;
++}
++
++/*
++ * routine finds empty index and adds new leaf. if no free index found
++ * then it requests in-depth growing
++ */
++static int ext3_ext_create_new_leaf(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_ext_path *curp;
++ int depth, i, err = 0;
++
++repeat:
++ i = depth = EXT_DEPTH(tree);
++
++ /* walk up to the tree and look for free index entry */
++ curp = path + depth;
++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
++ i--;
++ curp--;
++ }
++
++ /* we use already allocated block for index block
++ * so, subsequent data blocks should be contigoues */
++ if (EXT_HAS_FREE_INDEX(curp)) {
++ /* if we found index with free entry, then use that
++ * entry: create all needed subtree and add new leaf */
++ err = ext3_ext_split(handle, tree, path, newext, i);
++
++ /* refill path */
++ ext3_ext_drop_refs(path);
++ path = ext3_ext_find_extent(tree, newext->ee_block, path);
++ if (IS_ERR(path))
++ err = PTR_ERR(path);
++ } else {
++ /* tree is full, time to grow in depth */
++ err = ext3_ext_grow_indepth(handle, tree, path, newext);
++
++ /* refill path */
++ ext3_ext_drop_refs(path);
++ path = ext3_ext_find_extent(tree, newext->ee_block, path);
++ if (IS_ERR(path))
++ err = PTR_ERR(path);
++
++ /*
++ * only first (depth 0 -> 1) produces free space
++ * in all other cases we have to split growed tree
++ */
++ depth = EXT_DEPTH(tree);
++ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
++ /* now we need split */
++ goto repeat;
++ }
++ }
++
++ if (err)
++ return err;
++
++ return 0;
++}
++
++/*
++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK
++ * NOTE: it consider block number from index entry as
++ * allocated block. thus, index entries have to be consistent
++ * with leafs
++ */
++static unsigned long
++ext3_ext_next_allocated_block(struct ext3_ext_path *path)
++{
++ int depth;
++
++ EXT_ASSERT(path != NULL);
++ depth = path->p_depth;
++
++ if (depth == 0 && path->p_ext == NULL)
++ return EXT_MAX_BLOCK;
++
++ /* FIXME: what if index isn't full ?! */
++ while (depth >= 0) {
++ if (depth == path->p_depth) {
++ /* leaf */
++ if (path[depth].p_ext !=
++ EXT_LAST_EXTENT(path[depth].p_hdr))
++ return path[depth].p_ext[1].ee_block;
++ } else {
++ /* index */
++ if (path[depth].p_idx !=
++ EXT_LAST_INDEX(path[depth].p_hdr))
++ return path[depth].p_idx[1].ei_block;
++ }
++ depth--;
++ }
++
++ return EXT_MAX_BLOCK;
++}
++
++/*
++ * returns first allocated block from next leaf or EXT_MAX_BLOCK
++ */
++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int depth;
++
++ EXT_ASSERT(path != NULL);
++ depth = path->p_depth;
++
++ /* zero-tree has no leaf blocks at all */
++ if (depth == 0)
++ return EXT_MAX_BLOCK;
++
++ /* go to index block */
++ depth--;
++
++ while (depth >= 0) {
++ if (path[depth].p_idx !=
++ EXT_LAST_INDEX(path[depth].p_hdr))
++ return path[depth].p_idx[1].ei_block;
++ depth--;
++ }
++
++ return EXT_MAX_BLOCK;
++}
++
++/*
++ * if leaf gets modified and modified extent is first in the leaf
++ * then we have to correct all indexes above
++ * TODO: do we need to correct tree in all cases?
++ */
++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ struct ext3_extent_header *eh;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent *ex;
++ unsigned long border;
++ int k, err = 0;
++
++ eh = path[depth].p_hdr;
++ ex = path[depth].p_ext;
++ EXT_ASSERT(ex);
++ EXT_ASSERT(eh);
++
++ if (depth == 0) {
++ /* there is no tree at all */
++ return 0;
++ }
++
++ if (ex != EXT_FIRST_EXTENT(eh)) {
++ /* we correct tree if first leaf got modified only */
++ return 0;
++ }
++
++ /*
++ * TODO: we need correction if border is smaller then current one
++ */
++ k = depth - 1;
++ border = path[depth].p_ext->ee_block;
++ if ((err = ext3_ext_get_access(handle, tree, path + k)))
++ return err;
++ path[k].p_idx->ei_block = border;
++ if ((err = ext3_ext_dirty(handle, tree, path + k)))
++ return err;
++
++ while (k--) {
++ /* change all left-side indexes */
++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
++ break;
++ if ((err = ext3_ext_get_access(handle, tree, path + k)))
++ break;
++ path[k].p_idx->ei_block = border;
++ if ((err = ext3_ext_dirty(handle, tree, path + k)))
++ break;
++ }
++
++ return err;
++}
++
++static int inline
++ext3_can_extents_be_merged(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex1,
++ struct ext3_extent *ex2)
++{
++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block)
++ return 0;
++
++#ifdef AGRESSIVE_TEST
++ if (ex1->ee_len >= 4)
++ return 0;
++#endif
++
++ if (!tree->ops->mergable)
++ return 1;
++
++ return tree->ops->mergable(ex1, ex2);
++}
++
++/*
++ * this routine tries to merge requsted extent into the existing
++ * extent or inserts requested extent as new one into the tree,
++ * creating new leaf in no-space case
++ */
++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_extent_header * eh;
++ struct ext3_extent *ex, *fex;
++ struct ext3_extent *nearex; /* nearest extent */
++ struct ext3_ext_path *npath = NULL;
++ int depth, len, err, next;
++
++ EXT_ASSERT(newext->ee_len > 0);
++ EXT_ASSERT(newext->ee_len < EXT_CACHE_MARK);
++ depth = EXT_DEPTH(tree);
++ ex = path[depth].p_ext;
++ EXT_ASSERT(path[depth].p_hdr);
++
++ /* try to insert block into found extent and return */
++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) {
++ ext_debug(tree, "append %d block to %d:%d (from %d)\n",
++ newext->ee_len, ex->ee_block, ex->ee_len,
++ ex->ee_start);
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ return err;
++ ex->ee_len += newext->ee_len;
++ eh = path[depth].p_hdr;
++ nearex = ex;
++ goto merge;
++ }
++
++repeat:
++ depth = EXT_DEPTH(tree);
++ eh = path[depth].p_hdr;
++ if (eh->eh_entries < eh->eh_max)
++ goto has_space;
++
++ /* probably next leaf has space for us? */
++ fex = EXT_LAST_EXTENT(eh);
++ next = ext3_ext_next_leaf_block(tree, path);
++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) {
++ ext_debug(tree, "next leaf block - %d\n", next);
++ EXT_ASSERT(!npath);
++ npath = ext3_ext_find_extent(tree, next, NULL);
++ if (IS_ERR(npath))
++ return PTR_ERR(npath);
++ EXT_ASSERT(npath->p_depth == path->p_depth);
++ eh = npath[depth].p_hdr;
++ if (eh->eh_entries < eh->eh_max) {
++ ext_debug(tree, "next leaf isnt full(%d)\n",
++ eh->eh_entries);
++ path = npath;
++ goto repeat;
++ }
++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n",
++ eh->eh_entries, eh->eh_max);
++ }
++
++ /*
++ * there is no free space in found leaf
++ * we're gonna add new leaf in the tree
++ */
++ err = ext3_ext_create_new_leaf(handle, tree, path, newext);
++ if (err)
++ goto cleanup;
++ depth = EXT_DEPTH(tree);
++ eh = path[depth].p_hdr;
++
++has_space:
++ nearex = path[depth].p_ext;
++
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ goto cleanup;
++
++ if (!nearex) {
++ /* there is no extent in this leaf, create first one */
++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n",
++ newext->ee_block, newext->ee_start,
++ newext->ee_len);
++ path[depth].p_ext = EXT_FIRST_EXTENT(eh);
++ } else if (newext->ee_block > nearex->ee_block) {
++ EXT_ASSERT(newext->ee_block != nearex->ee_block);
++ if (nearex != EXT_LAST_EXTENT(eh)) {
++ len = EXT_MAX_EXTENT(eh) - nearex;
++ len = (len - 1) * sizeof(struct ext3_extent);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, "
++ "move %d from 0x%p to 0x%p\n",
++ newext->ee_block, newext->ee_start,
++ newext->ee_len,
++ nearex, len, nearex + 1, nearex + 2);
++ memmove(nearex + 2, nearex + 1, len);
++ }
++ path[depth].p_ext = nearex + 1;
++ } else {
++ EXT_ASSERT(newext->ee_block != nearex->ee_block);
++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, "
++ "move %d from 0x%p to 0x%p\n",
++ newext->ee_block, newext->ee_start, newext->ee_len,
++ nearex, len, nearex + 1, nearex + 2);
++ memmove(nearex + 1, nearex, len);
++ path[depth].p_ext = nearex;
++ }
++
++ eh->eh_entries++;
++ nearex = path[depth].p_ext;
++ nearex->ee_block = newext->ee_block;
++ nearex->ee_start = newext->ee_start;
++ nearex->ee_len = newext->ee_len;
++ /* FIXME: support for large fs */
++ nearex->ee_start_hi = 0;
++
++merge:
++ /* try to merge extents to the right */
++ while (nearex < EXT_LAST_EXTENT(eh)) {
++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1))
++ break;
++ /* merge with next extent! */
++ nearex->ee_len += nearex[1].ee_len;
++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
++ len = (EXT_LAST_EXTENT(eh) - nearex - 1)
++ * sizeof(struct ext3_extent);
++ memmove(nearex + 1, nearex + 2, len);
++ }
++ eh->eh_entries--;
++ EXT_ASSERT(eh->eh_entries > 0);
++ }
++
++ /* try to merge extents to the left */
++
++ /* time to correct all indexes above */
++ err = ext3_ext_correct_indexes(handle, tree, path);
++ if (err)
++ goto cleanup;
++
++ err = ext3_ext_dirty(handle, tree, path + depth);
++
++cleanup:
++ if (npath) {
++ ext3_ext_drop_refs(npath);
++ kfree(npath);
++ }
++ ext3_ext_tree_changed(tree);
++ ext3_ext_invalidate_cache(tree);
++ return err;
++}
++
++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block,
++ unsigned long num, ext_prepare_callback func)
++{
++ struct ext3_ext_path *path = NULL;
++ struct ext3_extent *ex, cbex;
++ unsigned long next, start = 0, end = 0;
++ unsigned long last = block + num;
++ int depth, exists, err = 0;
++
++ EXT_ASSERT(tree);
++ EXT_ASSERT(func);
++ EXT_ASSERT(tree->inode);
++ EXT_ASSERT(tree->root);
++
++ while (block < last && block != EXT_MAX_BLOCK) {
++ num = last - block;
++ /* find extent for this block */
++ path = ext3_ext_find_extent(tree, block, path);
++ if (IS_ERR(path)) {
++ err = PTR_ERR(path);
++ path = NULL;
++ break;
++ }
++
++ depth = EXT_DEPTH(tree);
++ EXT_ASSERT(path[depth].p_hdr);
++ ex = path[depth].p_ext;
++ next = ext3_ext_next_allocated_block(path);
++
++ exists = 0;
++ if (!ex) {
++ /* there is no extent yet, so try to allocate
++ * all requested space */
++ start = block;
++ end = block + num;
++ } else if (ex->ee_block > block) {
++ /* need to allocate space before found extent */
++ start = block;
++ end = ex->ee_block;
++ if (block + num < end)
++ end = block + num;
++ } else if (block >= ex->ee_block + ex->ee_len) {
++ /* need to allocate space after found extent */
++ start = block;
++ end = block + num;
++ if (end >= next)
++ end = next;
++ } else if (block >= ex->ee_block) {
++ /*
++ * some part of requested space is covered
++ * by found extent
++ */
++ start = block;
++ end = ex->ee_block + ex->ee_len;
++ if (block + num < end)
++ end = block + num;
++ exists = 1;
++ } else {
++ BUG();
++ }
++ EXT_ASSERT(end > start);
++
++ if (!exists) {
++ cbex.ee_block = start;
++ cbex.ee_len = end - start;
++ cbex.ee_start = 0;
++ } else
++ cbex = *ex;
++
++ EXT_ASSERT(path[depth].p_hdr);
++ err = func(tree, path, &cbex, exists);
++ ext3_ext_drop_refs(path);
++
++ if (err < 0)
++ break;
++ if (err == EXT_REPEAT)
++ continue;
++ else if (err == EXT_BREAK) {
++ err = 0;
++ break;
++ }
++
++ if (EXT_DEPTH(tree) != depth) {
++ /* depth was changed. we have to realloc path */
++ kfree(path);
++ path = NULL;
++ }
++
++ block = cbex.ee_block + cbex.ee_len;
++ }
++
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
++
++ return err;
++}
++
++static inline void
++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, struct ext3_extent *ex)
++{
++ if (tree->cex) {
++ EXT_ASSERT(ex);
++ EXT_ASSERT(ex->ee_len);
++ tree->cex->ee_block = ex->ee_block;
++ tree->cex->ee_start = ex->ee_start;
++ tree->cex->ee_len = ex->ee_len;
++ }
++}
++
++/*
++ * this routine calculate boundaries of the gap requested block fits into
++ * and cache this gap
++ */
++static inline void
++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ unsigned long block)
++{
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent *ex, gex;
++
++ if (!tree->cex)
++ return;
++
++ ex = path[depth].p_ext;
++ if (ex == NULL) {
++ /* there is no extent yet, so gap is [0;-] */
++ gex.ee_block = 0;
++ gex.ee_len = EXT_CACHE_MARK;
++ ext_debug(tree, "cache gap(whole file):");
++ } else if (block < ex->ee_block) {
++ gex.ee_block = block;
++ gex.ee_len = ex->ee_block - block;
++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]",
++ (unsigned long) block,
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len);
++ } else if (block >= ex->ee_block + ex->ee_len) {
++ gex.ee_block = ex->ee_block + ex->ee_len;
++ gex.ee_len = ext3_ext_next_allocated_block(path);
++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu",
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len,
++ (unsigned long) block);
++ EXT_ASSERT(gex.ee_len > gex.ee_block);
++ gex.ee_len = gex.ee_len - gex.ee_block;
++ } else {
++ BUG();
++ }
++
++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) gex.ee_block,
++ (unsigned long) gex.ee_len);
++ gex.ee_start = EXT_CACHE_MARK;
++ ext3_ext_put_in_cache(tree, &gex);
++}
++
++static inline int
++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block,
++ struct ext3_extent *ex)
++{
++ struct ext3_extent *cex = tree->cex;
++
++ /* is there cache storage at all? */
++ if (!cex)
++ return 0;
++
++ /* has cache valid data? */
++ if (cex->ee_len == 0)
++ return 0;
++
++ if (block >= cex->ee_block && block < cex->ee_block + cex->ee_len) {
++ ex->ee_block = cex->ee_block;
++ ex->ee_start = cex->ee_start;
++ ex->ee_len = cex->ee_len;
++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n",
++ (unsigned long) block,
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len,
++ (unsigned long) ex->ee_start);
++ return 1;
++ }
++
++ /* not in cache */
++ return 0;
++}
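++
++/*
++ * for example (illustrative numbers): with a cached extent
++ * {ee_block = 100, ee_len = 50, ee_start = 1000}, a lookup of block
++ * 120 hits the cache and the caller maps it to 1000 + (120 - 100) =
++ * 1020; a cached gap is recognized by ee_start == EXT_CACHE_MARK
++ * and is never mapped to storage
++ */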
++
++/*
++ * this routine removes an index from the index block;
++ * it's used in the truncate case only: all requests are for the
++ * last index in the block
++ */
++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ struct buffer_head *bh;
++ int err;
++
++ /* free index block */
++ path--;
++ EXT_ASSERT(path->p_hdr->eh_entries);
++ if ((err = ext3_ext_get_access(handle, tree, path)))
++ return err;
++ path->p_hdr->eh_entries--;
++ if ((err = ext3_ext_dirty(handle, tree, path)))
++ return err;
++ ext_debug(tree, "index is empty, remove it, free block %d\n",
++ path->p_idx->ei_leaf);
++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
++ return err;
++}
++
++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int depth = EXT_DEPTH(tree);
++ int needed;
++
++ if (path) {
++ /* perhaps there is still room in the leaf */
++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max)
++ return 1;
++ }
++
++ /*
++ * the worst case we expect is creation of a new root
++ * (growing in depth) with index splitting;
++ * for the split we have to consider depth + 1, because the
++ * preceding growth could have increased the depth
++ */
++ depth = depth + 1;
++
++ /*
++ * growing in depth:
++ * block allocation + new root + old root
++ */
++ needed = EXT3_ALLOC_NEEDED + 2;
++
++ /* index split. we may need to:
++ * - allocate intermediate indexes and a new leaf
++ * - change two blocks at each level, except the root
++ * - modify the root block (inode)
++ */
++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1;
++
++ return needed;
++}
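++
++/*
++ * a worked example (illustrative, EXT3_ALLOC_NEEDED = 3): for a
++ * depth-1 tree with no path given, the math above uses depth + 1 = 2,
++ * so needed = (3 + 2) + (2 * 3 + 2 * 2 + 1) = 16 credits to cover
++ * the worst-case insert
++ */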
++
++static int
++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, unsigned long start,
++ unsigned long end)
++{
++ struct ext3_extent *ex, tex;
++ struct ext3_ext_path *npath;
++ int depth, creds, err;
++
++ depth = EXT_DEPTH(tree);
++ ex = path[depth].p_ext;
++ EXT_ASSERT(ex);
++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1);
++ EXT_ASSERT(ex->ee_block < start);
++
++ /* calculate tail extent */
++ tex.ee_block = end + 1;
++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len);
++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block;
++
++ creds = ext3_ext_calc_credits_for_insert(tree, path);
++ handle = ext3_ext_journal_restart(handle, creds);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ /* calculate head extent. use primary extent */
++ err = ext3_ext_get_access(handle, tree, path + depth);
++ if (err)
++ return err;
++ ex->ee_len = start - ex->ee_block;
++ err = ext3_ext_dirty(handle, tree, path + depth);
++ if (err)
++ return err;
++
++ /* FIXME: some callback to free underlying resource
++ * and correct ee_start? */
++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n",
++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len);
++
++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL);
++ if (IS_ERR(npath))
++ return PTR_ERR(npath);
++ depth = EXT_DEPTH(tree);
++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block);
++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len);
++
++ err = ext3_ext_insert_extent(handle, tree, npath, &tex);
++ ext3_ext_drop_refs(npath);
++ kfree(npath);
++
++ return err;
++
++}
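++
++/*
++ * for example (illustrative numbers): removing blocks 120-129 from
++ * the extent {ee_block = 100, ee_len = 50} trims the head to 100:20
++ * (blocks 100-119) and inserts the tail {ee_block = 130, ee_len = 20}
++ * (blocks 130-149), leaving the hole 120-129 uncovered
++ */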
++
++static int
++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, unsigned long start,
++ unsigned long end)
++{
++ struct ext3_extent *ex, *fu = NULL, *lu, *le;
++ int err = 0, correct_index = 0;
++ int depth = EXT_DEPTH(tree), credits;
++ struct ext3_extent_header *eh;
++ unsigned a, b, block, num;
++
++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end);
++ if (!path[depth].p_hdr)
++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh);
++ eh = path[depth].p_hdr;
++ EXT_ASSERT(eh);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++
++ /* find where to start removing */
++ le = ex = EXT_LAST_EXTENT(eh);
++ while (ex != EXT_FIRST_EXTENT(eh)) {
++ if (ex->ee_block <= end)
++ break;
++ ex--;
++ }
++
++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) {
++ /* removal of an internal part of the extent was requested;
++ * tail and head must be placed in different extents,
++ * so we have to insert one more extent */
++ path[depth].p_ext = ex;
++ return ext3_ext_split_for_rm(handle, tree, path, start, end);
++ }
++
++ lu = ex;
++ while (ex >= EXT_FIRST_EXTENT(eh) &&
++ ex->ee_block + ex->ee_len > start) {
++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len);
++ path[depth].p_ext = ex;
++
++ a = ex->ee_block > start ? ex->ee_block : start;
++ b = ex->ee_block + ex->ee_len - 1 < end ?
++ ex->ee_block + ex->ee_len - 1 : end;
++
++ ext_debug(tree, " border %u:%u\n", a, b);
++
++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) {
++ block = 0;
++ num = 0;
++ BUG();
++ } else if (a != ex->ee_block) {
++ /* remove tail of the extent */
++ block = ex->ee_block;
++ num = a - block;
++ } else if (b != ex->ee_block + ex->ee_len - 1) {
++ /* remove head of the extent */
++ block = a;
++ num = b - a;
++ } else {
++ /* remove the whole extent: excellent! */
++ block = ex->ee_block;
++ num = 0;
++ EXT_ASSERT(a == ex->ee_block &&
++ b == ex->ee_block + ex->ee_len - 1);
++ }
++
++ if (ex == EXT_FIRST_EXTENT(eh))
++ correct_index = 1;
++
++ credits = 1;
++ if (correct_index)
++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1;
++ if (tree->ops->remove_extent_credits)
++ credits += tree->ops->remove_extent_credits(tree, ex, a, b);
++
++ handle = ext3_ext_journal_restart(handle, credits);
++ if (IS_ERR(handle)) {
++ err = PTR_ERR(handle);
++ goto out;
++ }
++
++ err = ext3_ext_get_access(handle, tree, path + depth);
++ if (err)
++ goto out;
++
++ if (tree->ops->remove_extent)
++ err = tree->ops->remove_extent(tree, ex, a, b);
++ if (err)
++ goto out;
++
++ if (num == 0) {
++ /* this extent is removed entirely; mark the slot unused */
++ ex->ee_start = 0;
++ eh->eh_entries--;
++ fu = ex;
++ }
++
++ ex->ee_block = block;
++ ex->ee_len = num;
++
++ err = ext3_ext_dirty(handle, tree, path + depth);
++ if (err)
++ goto out;
++
++ ext_debug(tree, "new extent: %u:%u:%u\n",
++ ex->ee_block, ex->ee_len, ex->ee_start);
++ ex--;
++ }
++
++ if (fu) {
++ /* reuse unused slots */
++ while (lu < le) {
++ if (lu->ee_start) {
++ *fu = *lu;
++ lu->ee_start = 0;
++ fu++;
++ }
++ lu++;
++ }
++ }
++
++ if (correct_index && eh->eh_entries)
++ err = ext3_ext_correct_indexes(handle, tree, path);
++
++ /* if this leaf is now empty, we should
++ * remove it from the index block above */
++ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
++ err = ext3_ext_rm_idx(handle, tree, path + depth);
++
++out:
++ return err;
++}
++
++
++static struct ext3_extent_idx *
++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block)
++{
++ struct ext3_extent_idx *ix;
++
++ ix = EXT_LAST_INDEX(hdr);
++ while (ix != EXT_FIRST_INDEX(hdr)) {
++ if (ix->ei_block <= block)
++ break;
++ ix--;
++ }
++ return ix;
++}
++
++/*
++ * returns 1 if the current index has to be freed (even partially)
++ */
++static int inline
++ext3_ext_more_to_rm(struct ext3_ext_path *path)
++{
++ EXT_ASSERT(path->p_idx);
++
++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
++ return 0;
++
++ /*
++ * if truncation on a deeper level happened, it wasn't partial,
++ * so we have to consider the current index for truncation
++ */
++ if (path->p_hdr->eh_entries == path->p_block)
++ return 0;
++ return 1;
++}
++
++int ext3_ext_remove_space(struct ext3_extents_tree *tree,
++ unsigned long start, unsigned long end)
++{
++ struct inode *inode = tree->inode;
++ struct super_block *sb = inode->i_sb;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_ext_path *path;
++ handle_t *handle;
++ int i = 0, err = 0;
++
++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end);
++
++ /* most likely the first extent we're going to free will be the last in its block */
++ handle = ext3_journal_start(inode, depth + 1);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ ext3_ext_invalidate_cache(tree);
++
++ /*
++ * we start scanning from the right side, freeing all the blocks
++ * after i_size and walking down into the tree
++ */
++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
++ if (path == NULL) {
++ ext3_error(sb, "ext3_ext_remove_space",
++ "Can't allocate path array");
++ ext3_journal_stop(handle);
++ return -ENOMEM;
++ }
++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++ path[i].p_hdr = EXT_ROOT_HDR(tree);
++
++ while (i >= 0 && err == 0) {
++ if (i == depth) {
++ /* this is leaf block */
++ err = ext3_ext_rm_leaf(handle, tree, path, start, end);
++ /* the root level has p_bh == NULL; brelse() handles that */
++ brelse(path[i].p_bh);
++ i--;
++ continue;
++ }
++
++ /* this is index block */
++ if (!path[i].p_hdr) {
++ ext_debug(tree, "initialize header\n");
++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh);
++ }
++
++ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max);
++ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC);
++
++ if (!path[i].p_idx) {
++ /* this level hasn't been touched yet */
++ path[i].p_idx =
++ ext3_ext_last_covered(path[i].p_hdr, end);
++ path[i].p_block = path[i].p_hdr->eh_entries + 1;
++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n",
++ path[i].p_hdr, path[i].p_hdr->eh_entries);
++ } else {
++ /* we've already been here; move to the next index */
++ path[i].p_idx--;
++ }
++
++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n",
++ i, EXT_FIRST_INDEX(path[i].p_hdr),
++ path[i].p_idx);
++ if (ext3_ext_more_to_rm(path + i)) {
++ /* go to the next level */
++ ext_debug(tree, "move to level %d (block %d)\n",
++ i + 1, path[i].p_idx->ei_leaf);
++ memset(path + i + 1, 0, sizeof(*path));
++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf);
++ if (!path[i+1].p_bh) {
++ /* should we reset i_size? */
++ err = -EIO;
++ break;
++ }
++ /* record the actual number of indexes so we can tell
++ * whether it changes at the next iteration */
++ path[i].p_block = path[i].p_hdr->eh_entries;
++ i++;
++ } else {
++ /* we finish processing this index, go up */
++ if (path[i].p_hdr->eh_entries == 0 && i > 0) {
++ /* index is empty, remove it;
++ * the handle must already have been prepared
++ * when the leaf was removed (see ext3_ext_rm_leaf()) */
++ err = ext3_ext_rm_idx(handle, tree, path + i);
++ }
++ /* the root level has p_bh == NULL; brelse() handles that */
++ brelse(path[i].p_bh);
++ i--;
++ ext_debug(tree, "return to level %d\n", i);
++ }
++ }
++
++ /* TODO: flexible tree reduction should be here */
++ if (path->p_hdr->eh_entries == 0) {
++ /*
++ * truncation to zero freed the whole tree,
++ * so we need to correct eh_depth
++ */
++ err = ext3_ext_get_access(handle, tree, path);
++ if (err == 0) {
++ EXT_ROOT_HDR(tree)->eh_depth = 0;
++ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree);
++ err = ext3_ext_dirty(handle, tree, path);
++ }
++ }
++ ext3_ext_tree_changed(tree);
++
++ kfree(path);
++ ext3_journal_stop(handle);
++
++ return err;
++}
++
++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks)
++{
++ int lcap, icap, rcap, leafs, idxs, num;
++
++ rcap = ext3_ext_space_root(tree);
++ if (blocks <= rcap) {
++ /* all extents fit in the root */
++ return 0;
++ }
++
++ rcap = ext3_ext_space_root_idx(tree);
++ lcap = ext3_ext_space_block(tree);
++ icap = ext3_ext_space_block_idx(tree);
++
++ num = leafs = (blocks + lcap - 1) / lcap;
++ if (leafs <= rcap) {
++ /* all pointers to the leaves fit in the root */
++ return leafs;
++ }
++
++ /* ok. we need separate index block(s) to link all leaf blocks */
++ idxs = (leafs + icap - 1) / icap;
++ do {
++ num += idxs;
++ idxs = (idxs + icap - 1) / icap;
++ } while (idxs > rcap);
++
++ return num;
++}
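++
++/*
++ * a rough worked example (assuming 4KB blocks and the 12-byte entries,
++ * i.e. ~340 extents per leaf/index block and 4 in the inode root):
++ * 1000 blocks need ceil(1000/340) = 3 leaves, whose pointers still fit
++ * in the root -> 3 metadata blocks; 100000 blocks need 295 leaves
++ * plus 1 index block -> 296 metadata blocks
++ */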
++
++/*
++ * called at mount time
++ */
++void ext3_ext_init(struct super_block *sb)
++{
++ /*
++ * possible initialization would be here
++ */
++
++ if (test_opt(sb, EXTENTS)) {
++ printk("EXT3-fs: file extents enabled");
++#ifdef AGRESSIVE_TEST
++ printk(", agressive tests");
++#endif
++#ifdef CHECK_BINSEARCH
++ printk(", check binsearch");
++#endif
++ printk("\n");
++ }
++}
++
++/*
++ * called at umount time
++ */
++void ext3_ext_release(struct super_block *sb)
++{
++}
++
++/************************************************************************
++ * VFS related routines
++ ************************************************************************/
++
++static int ext3_get_inode_write_access(handle_t *handle, void *buffer)
++{
++ /* we use in-core data, not bh */
++ return 0;
++}
++
++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer)
++{
++ struct inode *inode = buffer;
++ return ext3_mark_inode_dirty(handle, inode);
++}
++
++static int ext3_ext_mergable(struct ext3_extent *ex1,
++ struct ext3_extent *ex2)
++{
++ /* FIXME: support for large fs */
++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start)
++ return 1;
++ return 0;
++}
++
++static int
++ext3_remove_blocks_credits(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex,
++ unsigned long from, unsigned long to)
++{
++ int needed;
++
++ /* at present, an extent can't cross a block group */
++ needed = 4; /* bitmap + group desc + sb + inode */
++
++#ifdef CONFIG_QUOTA
++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++ return needed;
++}
++
++static int
++ext3_remove_blocks(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex,
++ unsigned long from, unsigned long to)
++{
++ int needed = ext3_remove_blocks_credits(tree, ex, from, to);
++ handle_t *handle = ext3_journal_start(tree->inode, needed);
++ struct buffer_head *bh;
++ int i;
++
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
++ /* tail removal */
++ unsigned long num, start;
++ num = ex->ee_block + ex->ee_len - from;
++ start = ex->ee_start + ex->ee_len - num;
++ ext_debug(tree, "free last %lu blocks starting %lu\n",
++ num, start);
++ for (i = 0; i < num; i++) {
++ bh = sb_find_get_block(tree->inode->i_sb, start + i);
++ ext3_forget(handle, 0, tree->inode, bh, start + i);
++ }
++ ext3_free_blocks(handle, tree->inode, start, num);
++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
++ printk("strange request: removal %lu-%lu from %u:%u\n",
++ from, to, ex->ee_block, ex->ee_len);
++ } else {
++ printk("strange request: removal(2) %lu-%lu from %u:%u\n",
++ from, to, ex->ee_block, ex->ee_len);
++ }
++ ext3_journal_stop(handle);
++ return 0;
++}
++
++static int ext3_ext_find_goal(struct inode *inode,
++ struct ext3_ext_path *path, unsigned long block)
++{
++ struct ext3_inode_info *ei = EXT3_I(inode);
++ unsigned long bg_start;
++ unsigned long colour;
++ int depth;
++
++ if (path) {
++ struct ext3_extent *ex;
++ depth = path->p_depth;
++
++ /* try to predict block placement */
++ if ((ex = path[depth].p_ext))
++ return ex->ee_start + (block - ex->ee_block);
++
++ /* it looks like the index is empty;
++ * try to find a goal starting from the index block itself */
++ if (path[depth].p_bh)
++ return path[depth].p_bh->b_blocknr;
++ }
++
++ /* OK. use inode's group */
++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++ colour = (current->pid % 16) *
++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++ return bg_start + colour + block;
++}
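++
++/*
++ * for example (illustrative numbers): with no usable path, an inode
++ * whose group starts at bg_start, 32768 blocks per group and
++ * current->pid == 100, the colour is (100 % 16) * (32768 / 16) = 8192,
++ * so the goal becomes bg_start + 8192 + block
++ */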
++
++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *ex, int *err)
++{
++ struct inode *inode = tree->inode;
++ int newblock, goal;
++
++ EXT_ASSERT(path);
++ EXT_ASSERT(ex);
++ EXT_ASSERT(ex->ee_start);
++ EXT_ASSERT(ex->ee_len);
++
++ /* reuse a block from the extent to preserve data/metadata ordering */
++ newblock = ex->ee_start++;
++ ex->ee_len--;
++ if (ex->ee_len == 0) {
++ ex->ee_len = 1;
++ /* allocate new block for the extent */
++ goal = ext3_ext_find_goal(inode, path, ex->ee_block);
++ ex->ee_start = ext3_new_block(handle, inode, goal, 0, 0, err);
++ if (ex->ee_start == 0) {
++ /* an error occurred: restore the old extent */
++ ex->ee_start = newblock;
++ return 0;
++ }
++ }
++ return newblock;
++}
++
++static struct ext3_extents_helpers ext3_blockmap_helpers = {
++ .get_write_access = ext3_get_inode_write_access,
++ .mark_buffer_dirty = ext3_mark_buffer_dirty,
++ .mergable = ext3_ext_mergable,
++ .new_block = ext3_new_block_cb,
++ .remove_extent = ext3_remove_blocks,
++ .remove_extent_credits = ext3_remove_blocks_credits,
++};
++
++void ext3_init_tree_desc(struct ext3_extents_tree *tree,
++ struct inode *inode)
++{
++ tree->inode = inode;
++ tree->root = (void *) EXT3_I(inode)->i_data;
++ tree->buffer = (void *) inode;
++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data);
++ tree->cex = (struct ext3_extent *) &EXT3_I(inode)->i_cached_extent;
++ tree->ops = &ext3_blockmap_helpers;
++}
++
++int ext3_ext_get_block(handle_t *handle, struct inode *inode,
++ long iblock, struct buffer_head *bh_result,
++ int create, int extend_disksize)
++{
++ struct ext3_ext_path *path = NULL;
++ struct ext3_extent newex;
++ struct ext3_extent *ex;
++ int goal, newblock, err = 0, depth;
++ struct ext3_extents_tree tree;
++
++ clear_buffer_new(bh_result);
++ ext3_init_tree_desc(&tree, inode);
++ ext_debug(&tree, "block %d requested for inode %u\n",
++ (int) iblock, (unsigned) inode->i_ino);
++ down(&EXT3_I(inode)->truncate_sem);
++
++ /* check in cache */
++ if (ext3_ext_in_cache(&tree, iblock, &newex)) {
++ if (newex.ee_start == EXT_CACHE_MARK) {
++ /* this is a cached gap */
++ if (!create) {
++ /* the block isn't allocated yet and the
++ * caller doesn't want it allocated */
++ goto out2;
++ }
++ /* we should allocate requested block */
++ } else if (newex.ee_start) {
++ /* block is already allocated */
++ newblock = iblock - newex.ee_block + newex.ee_start;
++ goto out;
++ }
++ }
++
++ /* find extent for this block */
++ path = ext3_ext_find_extent(&tree, iblock, NULL);
++ if (IS_ERR(path)) {
++ err = PTR_ERR(path);
++ path = NULL;
++ goto out2;
++ }
++
++ depth = EXT_DEPTH(&tree);
++
++ /*
++ * a consistent leaf must not be empty;
++ * this situation is possible, though, _during_ tree modification;
++ * this is why the assert can't be put in ext3_ext_find_extent()
++ */
++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0);
++
++ if ((ex = path[depth].p_ext)) {
++ /* if the found extent covers the block, simply return it */
++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) {
++ newblock = iblock - ex->ee_block + ex->ee_start;
++ ext_debug(&tree, "%d fit into %d:%d -> %d\n",
++ (int) iblock, ex->ee_block, ex->ee_len,
++ newblock);
++ ext3_ext_put_in_cache(&tree, ex);
++ goto out;
++ }
++ }
++
++ /*
++ * the requested block isn't allocated yet;
++ * we must not try to create it if the create flag is zero
++ */
++ if (!create) {
++ /* put the just-found gap into the cache to speed up subsequent requests */
++ ext3_ext_put_gap_in_cache(&tree, path, iblock);
++ goto out2;
++ }
++
++ /* allocate new block */
++ goal = ext3_ext_find_goal(inode, path, iblock);
++ newblock = ext3_new_block(handle, inode, goal, 0, 0, &err);
++ if (!newblock)
++ goto out2;
++ ext_debug(&tree, "allocate new block: goal %d, found %d\n",
++ goal, newblock);
++
++ /* try to insert new extent into found leaf and return */
++ newex.ee_block = iblock;
++ newex.ee_start = newblock;
++ newex.ee_len = 1;
++ err = ext3_ext_insert_extent(handle, &tree, path, &newex);
++ if (err)
++ goto out2;
++
++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize)
++ EXT3_I(inode)->i_disksize = inode->i_size;
++
++ /* the routine above could have used the block we allocated */
++ newblock = newex.ee_start;
++ set_buffer_new(bh_result);
++
++ ext3_ext_put_in_cache(&tree, &newex);
++out:
++ ext3_ext_show_leaf(&tree, path);
++ map_bh(bh_result, inode->i_sb, newblock);
++out2:
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
++ up(&EXT3_I(inode)->truncate_sem);
++
++ return err;
++}
++
++void ext3_ext_truncate(struct inode * inode, struct page *page)
++{
++ struct address_space *mapping = inode->i_mapping;
++ struct super_block *sb = inode->i_sb;
++ struct ext3_extents_tree tree;
++ unsigned long last_block;
++ handle_t *handle;
++ int err = 0;
++
++ ext3_init_tree_desc(&tree, inode);
++
++ /*
++ * most likely the first extent we're going to free will be the last in its block
++ */
++ err = ext3_writepage_trans_blocks(inode) + 3;
++ handle = ext3_journal_start(inode, err);
++ if (IS_ERR(handle)) {
++ if (page) {
++ clear_highpage(page);
++ flush_dcache_page(page);
++ unlock_page(page);
++ page_cache_release(page);
++ }
++ return;
++ }
++
++ if (page)
++ ext3_block_truncate_page(handle, page, mapping, inode->i_size);
++
++ down(&EXT3_I(inode)->truncate_sem);
++ ext3_ext_invalidate_cache(&tree);
++
++ /*
++ * TODO: optimization is possible here;
++ * probably we don't need to scan at all,
++ * because the page truncation above may be enough
++ */
++ if (ext3_orphan_add(handle, inode))
++ goto out_stop;
++
++ /* in the crash case we have to know where to truncate from */
++ EXT3_I(inode)->i_disksize = inode->i_size;
++ ext3_mark_inode_dirty(handle, inode);
++
++ last_block = (inode->i_size + sb->s_blocksize - 1)
++ >> EXT3_BLOCK_SIZE_BITS(sb);
++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK);
++
++ /* In a multi-transaction truncate, we only make the final
++ * transaction synchronous */
++ if (IS_SYNC(inode))
++ handle->h_sync = 1;
++
++out_stop:
++ /*
++ * If this was a simple ftruncate(), and the file will remain alive
++ * then we need to clear up the orphan record which we created above.
++ * However, if this was a real unlink then we were called by
++ * ext3_delete_inode(), and we allow that function to clean up the
++ * orphan info for us.
++ */
++ if (inode->i_nlink)
++ ext3_orphan_del(handle, inode);
++
++ up(&EXT3_I(inode)->truncate_sem);
++ ext3_journal_stop(handle);
++}
++
++/*
++ * this routine calculates the maximum number of blocks we could modify
++ * in order to allocate a new block for an inode
++ */
++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num)
++{
++ struct ext3_extents_tree tree;
++ int needed;
++
++ ext3_init_tree_desc(&tree, inode);
++
++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL);
++
++ /* the caller wants to allocate num blocks */
++ needed *= num;
++
++#ifdef CONFIG_QUOTA
++ /*
++ * FIXME: the real calculation should be here;
++ * it depends on the blockmap format of the quota file
++ */
++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++ return needed;
++}
++
++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode)
++{
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ ext3_extent_tree_init(handle, &tree);
++}
++
++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks)
++{
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ return ext3_ext_calc_metadata_amount(&tree, blocks);
++}
++
++static int
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newex, int exist)
++{
++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private;
++
++ if (!exist)
++ return EXT_CONTINUE;
++ if (buf->err < 0)
++ return EXT_BREAK;
++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen)
++ return EXT_BREAK;
++
++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) {
++ buf->err++;
++ buf->cur += sizeof(*newex);
++ } else {
++ buf->err = -EFAULT;
++ return EXT_BREAK;
++ }
++ return EXT_CONTINUE;
++}
++
++static int
++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *ex, int exist)
++{
++ struct ext3_extent_tree_stats *buf =
++ (struct ext3_extent_tree_stats *) tree->private;
++ int depth;
++
++ if (!exist)
++ return EXT_CONTINUE;
++
++ depth = EXT_DEPTH(tree);
++ buf->extents_num++;
++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr))
++ buf->leaf_num++;
++ return EXT_CONTINUE;
++}
++
++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
++ unsigned long arg)
++{
++ int err = 0;
++
++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL))
++ return -EINVAL;
++
++ if (cmd == EXT3_IOC_GET_EXTENTS) {
++ struct ext3_extent_buf buf;
++ struct ext3_extents_tree tree;
++
++ if (copy_from_user(&buf, (void *) arg, sizeof(buf)))
++ return -EFAULT;
++
++ ext3_init_tree_desc(&tree, inode);
++ buf.cur = buf.buffer;
++ buf.err = 0;
++ tree.private = &buf;
++ down(&EXT3_I(inode)->truncate_sem);
++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK,
++ ext3_ext_store_extent_cb);
++ up(&EXT3_I(inode)->truncate_sem);
++ if (err == 0)
++ err = buf.err;
++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) {
++ struct ext3_extent_tree_stats buf;
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ down(&EXT3_I(inode)->truncate_sem);
++ buf.depth = EXT_DEPTH(&tree);
++ buf.extents_num = 0;
++ buf.leaf_num = 0;
++ tree.private = &buf;
++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK,
++ ext3_ext_collect_stats_cb);
++ up(&EXT3_I(inode)->truncate_sem);
++ if (!err && copy_to_user((void *) arg, &buf, sizeof(buf)))
++ err = -EFAULT;
++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) {
++ struct ext3_extents_tree tree;
++ ext3_init_tree_desc(&tree, inode);
++ down(&EXT3_I(inode)->truncate_sem);
++ err = EXT_DEPTH(&tree);
++ up(&EXT3_I(inode)->truncate_sem);
++ }
++
++ return err;
++}
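++
++/*
++ * a minimal sketch of userspace usage of EXT3_IOC_GET_EXTENTS
++ * (illustrative only, error handling omitted); on success the ioctl
++ * returns the number of extents copied into the buffer (buf.err):
++ *
++ *	struct ext3_extent ext[128];
++ *	struct ext3_extent_buf buf;
++ *
++ *	buf.start = 0;
++ *	buf.buffer = ext;
++ *	buf.buflen = sizeof(ext);
++ *	n = ioctl(fd, EXT3_IOC_GET_EXTENTS, &buf);
++ */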
++
++EXPORT_SYMBOL(ext3_init_tree_desc);
++EXPORT_SYMBOL(ext3_mark_inode_dirty);
++EXPORT_SYMBOL(ext3_ext_invalidate_cache);
++EXPORT_SYMBOL(ext3_ext_insert_extent);
++EXPORT_SYMBOL(ext3_ext_walk_space);
++EXPORT_SYMBOL(ext3_ext_find_goal);
++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert);
++
+Index: linux-2.6.7/fs/ext3/ialloc.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/ialloc.c 2004-08-19 08:51:04.000000000 +0400
++++ linux-2.6.7/fs/ext3/ialloc.c 2004-08-19 08:53:49.000000000 +0400
+@@ -646,6 +646,10 @@
+ DQUOT_FREE_INODE(inode);
+ goto fail2;
+ }
++ if (test_opt(sb, EXTENTS)) {
++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL;
++ ext3_extents_initialize_blockmap(handle, inode);
++ }
+ err = ext3_mark_inode_dirty(handle, inode);
+ if (err) {
+ ext3_std_error(sb, err);
+Index: linux-2.6.7/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/inode.c 2004-08-19 08:51:04.000000000 +0400
++++ linux-2.6.7/fs/ext3/inode.c 2004-08-19 08:53:49.000000000 +0400
+@@ -857,6 +857,17 @@
+ goto reread;
+ }
+
++static inline int
++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block,
++ struct buffer_head *bh, int create, int extend_disksize)
++{
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_get_block(handle, inode, block, bh, create,
++ extend_disksize);
++ return ext3_get_block_handle(handle, inode, block, bh, create,
++ extend_disksize);
++}
++
+ static int ext3_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+ {
+@@ -867,8 +878,8 @@
+ handle = ext3_journal_current_handle();
+ J_ASSERT(handle != 0);
+ }
+- ret = ext3_get_block_handle(handle, inode, iblock,
+- bh_result, create, 1);
++ ret = ext3_get_block_wrap(handle, inode, iblock,
++ bh_result, create, 1);
+ return ret;
+ }
+
+@@ -894,8 +905,8 @@
+ }
+ }
+ if (ret == 0)
+- ret = ext3_get_block_handle(handle, inode, iblock,
+- bh_result, create, 0);
++ ret = ext3_get_block_wrap(handle, inode, iblock,
++ bh_result, create, 0);
+ if (ret == 0)
+ bh_result->b_size = (1 << inode->i_blkbits);
+ return ret;
+@@ -916,7 +927,7 @@
+ dummy.b_state = 0;
+ dummy.b_blocknr = -1000;
+ buffer_trace_init(&dummy.b_history);
+- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1);
+ if (!*errp && buffer_mapped(&dummy)) {
+ struct buffer_head *bh;
+ bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
+@@ -1669,7 +1680,7 @@
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+-static int ext3_block_truncate_page(handle_t *handle, struct page *page,
++int ext3_block_truncate_page(handle_t *handle, struct page *page,
+ struct address_space *mapping, loff_t from)
+ {
+ unsigned long index = from >> PAGE_CACHE_SHIFT;
+@@ -2165,6 +2176,9 @@
+ return;
+ }
+
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_truncate(inode, page);
++
+ handle = start_transaction(inode);
+ if (IS_ERR(handle)) {
+ if (page) {
+@@ -2888,6 +2902,9 @@
+ int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
+ int ret;
+
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_writepage_trans_blocks(inode, bpp);
++
+ if (ext3_should_journal_data(inode))
+ ret = 3 * (bpp + indirects) + 2;
+ else
+Index: linux-2.6.7/fs/ext3/Makefile
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/Makefile 2004-08-19 08:52:14.000000000 +0400
++++ linux-2.6.7/fs/ext3/Makefile 2004-08-19 08:53:49.000000000 +0400
+@@ -5,7 +5,7 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
+
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+- ioctl.o namei.o super.o symlink.o hash.o
++ ioctl.o namei.o super.o symlink.o hash.o extents.o
+
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+Index: linux-2.6.7/fs/ext3/super.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/super.c 2004-08-19 08:51:04.000000000 +0400
++++ linux-2.6.7/fs/ext3/super.c 2004-08-19 08:53:49.000000000 +0400
+@@ -392,6 +392,7 @@
+ struct ext3_super_block *es = sbi->s_es;
+ int i;
+
++ ext3_ext_release(sb);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+ if (!(sb->s_flags & MS_RDONLY)) {
+@@ -455,6 +456,9 @@
+ ei->i_default_acl = EXT3_ACL_NOT_CACHED;
+ #endif
+ ei->vfs_inode.i_version = 1;
++ ei->i_cached_extent[0] = 0;
++ ei->i_cached_extent[1] = 0;
++ ei->i_cached_extent[2] = 0;
+ return &ei->vfs_inode;
+ }
+
+@@ -590,7 +594,7 @@
+ Opt_commit, Opt_journal_update, Opt_journal_inum,
+ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+- Opt_err,
++ Opt_err, Opt_extents, Opt_extdebug
+ };
+
+ static match_table_t tokens = {
+@@ -638,6 +642,8 @@
+ {Opt_iopen, "iopen"},
+ {Opt_noiopen, "noiopen"},
+ {Opt_iopen_nopriv, "iopen_nopriv"},
++ {Opt_extents, "extents"},
++ {Opt_extdebug, "extdebug"},
+ {Opt_err, NULL}
+ };
+
+@@ -917,6 +923,12 @@
+ break;
+ case Opt_ignore:
+ break;
++ case Opt_extents:
++ set_opt (sbi->s_mount_opt, EXTENTS);
++ break;
++ case Opt_extdebug:
++ set_opt (sbi->s_mount_opt, EXTDEBUG);
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+@@ -1589,6 +1601,8 @@
+ percpu_counter_mod(&sbi->s_dirs_counter,
+ ext3_count_dirs(sb));
+
++ ext3_ext_init(sb);
++
+ return 0;
+
+ failed_mount3:
+Index: linux-2.6.7/fs/ext3/ioctl.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/ioctl.c 2004-08-19 08:51:03.000000000 +0400
++++ linux-2.6.7/fs/ext3/ioctl.c 2004-08-19 08:53:49.000000000 +0400
+@@ -176,6 +176,10 @@
+ return ret;
+ }
+ #endif
++ case EXT3_IOC_GET_EXTENTS:
++ case EXT3_IOC_GET_TREE_STATS:
++ case EXT3_IOC_GET_TREE_DEPTH:
++ return ext3_ext_ioctl(inode, filp, cmd, arg);
+ default:
+ return -ENOTTY;
+ }
+Index: linux-2.6.7/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/ext3_fs.h 2004-08-19 08:51:04.000000000 +0400
++++ linux-2.6.7/include/linux/ext3_fs.h 2004-08-19 08:53:49.000000000 +0400
+@@ -186,6 +186,7 @@
+ #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
+ #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
+ #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */
++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */
+
+ #define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
+ #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
+@@ -209,6 +210,9 @@
+ #ifdef CONFIG_JBD_DEBUG
+ #define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
+ #endif
++#define EXT3_IOC_GET_EXTENTS _IOR('f', 5, long)
++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 6, long)
++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 7, long)
+
+ /*
+ * Structure of an inode on the disk
+@@ -329,6 +333,8 @@
+ #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */
+ #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */
+ #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */
++#define EXT3_MOUNT_EXTENTS 0x10000 /* Extents support */
++#define EXT3_MOUNT_EXTDEBUG 0x20000 /* Extents debug */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+@@ -724,6 +730,7 @@
+
+
+ /* inode.c */
++extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
+ extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+@@ -796,6 +803,14 @@
+ extern struct inode_operations ext3_symlink_inode_operations;
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
+
++/* extents.c */
++extern int ext3_ext_writepage_trans_blocks(struct inode *, int);
++extern int ext3_ext_get_block(handle_t *, struct inode *, long,
++ struct buffer_head *, int, int);
++extern void ext3_ext_truncate(struct inode *, struct page *);
++extern void ext3_ext_init(struct super_block *);
++extern void ext3_ext_release(struct super_block *);
++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *);
+
+ #endif /* __KERNEL__ */
+
+Index: linux-2.6.7/include/linux/ext3_extents.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/ext3_extents.h 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.6.7/include/linux/ext3_extents.h 2004-08-19 08:53:49.000000000 +0400
+@@ -0,0 +1,238 @@
++/*
++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public Licens
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
++ */
++
++#ifndef _LINUX_EXT3_EXTENTS
++#define _LINUX_EXT3_EXTENTS
++
++/*
++ * with AGRESSIVE_TEST defined, the capacity of index/leaf blocks
++ * becomes very small, so index splits, in-depth growth and
++ * other hard changes happen much more often;
++ * this is for debugging purposes only
++ */
++#define AGRESSIVE_TEST_
++
++/*
++ * if CHECK_BINSEARCH is defined, the results of the binary search
++ * will be checked by a linear search
++ */
++#define CHECK_BINSEARCH_
++
++/*
++ * if EXT_DEBUG is defined you can use the 'extdebug' mount option
++ * to get lots of info about what's going on
++ */
++#define EXT_DEBUG
++#ifdef EXT_DEBUG
++#define ext_debug(tree,fmt,a...) \
++do { \
++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \
++ printk(fmt, ##a); \
++} while (0)
++#else
++#define ext_debug(tree,fmt,a...)
++#endif
++
++/*
++ * if EXT_STATS is defined then stats numbers are collected;
++ * these numbers will be displayed at umount time
++ */
++#define EXT_STATS_
++
++
++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */
++
++/*
++ * ext3_inode has an i_block array (60 bytes total);
++ * the first 4 bytes are used to store:
++ * - tree depth (0 means there is no tree yet; all extents are in the inode)
++ * - number of alive extents in the inode
++ */
++
++/*
++ * this is extent on-disk structure
++ * it's used at the bottom of the tree
++ */
++struct ext3_extent {
++ __u32 ee_block; /* first logical block extent covers */
++ __u16 ee_len; /* number of blocks covered by extent */
++ __u16 ee_start_hi; /* high 16 bits of physical block */
++ __u32 ee_start; /* low 32 bits of physical block */
++};
++
++/*
++ * this is index on-disk structure
++ * it's used at all the levels, but the bottom
++ */
++struct ext3_extent_idx {
++ __u32 ei_block; /* index covers logical blocks from 'block' */
++ __u32 ei_leaf; /* pointer to the physical block of the next *
++ * level. leaf or next index could be here */
++ __u16 ei_leaf_hi; /* high 16 bits of physical block */
++ __u16 ei_unused;
++};
++
++/*
++ * each block (leaves and indexes), even inode-stored has header
++ */
++struct ext3_extent_header {
++ __u16 eh_magic; /* probably will support different formats */
++ __u16 eh_entries; /* number of valid entries */
++ __u16 eh_max; /* capacity of store in entries */
++ __u16 eh_depth; /* has tree real underlying blocks? */
++ __u32 eh_generation; /* generation of the tree */
++};
++
++#define EXT3_EXT_MAGIC 0xf30a
++
++/*
++ * array of ext3_ext_path contains path to some extent
++ * creation/lookup routines use it for traversal/splitting/etc
++ * truncate uses it to simulate recursive walking
++ */
++struct ext3_ext_path {
++ __u32 p_block;
++ __u16 p_depth;
++ struct ext3_extent *p_ext;
++ struct ext3_extent_idx *p_idx;
++ struct ext3_extent_header *p_hdr;
++ struct buffer_head *p_bh;
++};
++
++/*
++ * structure for external API
++ */
++
++/*
++ * ext3_extents_tree is used to pass initial information
++ * to top-level extents API
++ */
++struct ext3_extents_helpers;
++struct ext3_extents_tree {
++ struct inode *inode; /* inode which tree belongs to */
++ void *root; /* ptr to the data where the top of the tree resides */
++ void *buffer; /* will be passed as arg to ^^ routines */
++ int buffer_len;
++ void *private;
++ struct ext3_extent *cex;/* last found extent */
++ struct ext3_extents_helpers *ops;
++};
++
++struct ext3_extents_helpers {
++ int (*get_write_access)(handle_t *h, void *buffer);
++ int (*mark_buffer_dirty)(handle_t *h, void *buffer);
++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2);
++ int (*remove_extent_credits)(struct ext3_extents_tree *,
++ struct ext3_extent *, unsigned long,
++ unsigned long);
++ int (*remove_extent)(struct ext3_extents_tree *,
++ struct ext3_extent *, unsigned long,
++ unsigned long);
++ int (*new_block)(handle_t *, struct ext3_extents_tree *,
++ struct ext3_ext_path *, struct ext3_extent *,
++ int *);
++};
++
++/*
++ * to be called by ext3_ext_walk_space()
++ * negative retcode - error
++ * positive retcode - signal for ext3_ext_walk_space(), see below
++ * callback must return valid extent (passed or newly created)
++ */
++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *,
++ struct ext3_ext_path *,
++ struct ext3_extent *, int);
++
++#define EXT_CONTINUE 0
++#define EXT_BREAK 1
++#define EXT_REPEAT 2
++
++
++#define EXT_MAX_BLOCK 0xffffffff
++#define EXT_CACHE_MARK 0xffff
++
++
++#define EXT_FIRST_EXTENT(__hdr__) \
++ ((struct ext3_extent *) (((char *) (__hdr__)) + \
++ sizeof(struct ext3_extent_header)))
++#define EXT_FIRST_INDEX(__hdr__) \
++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \
++ sizeof(struct ext3_extent_header)))
++#define EXT_HAS_FREE_INDEX(__path__) \
++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max)
++#define EXT_LAST_EXTENT(__hdr__) \
++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1)
++#define EXT_LAST_INDEX(__hdr__) \
++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1)
++#define EXT_MAX_EXTENT(__hdr__) \
++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_MAX_INDEX(__hdr__) \
++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++
++#define EXT_ROOT_HDR(tree) \
++ ((struct ext3_extent_header *) (tree)->root)
++#define EXT_BLOCK_HDR(bh) \
++ ((struct ext3_extent_header *) (bh)->b_data)
++#define EXT_DEPTH(_t_) \
++ (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
++#define EXT_GENERATION(_t_) \
++ (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++
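++/*
++ * for example (illustrative, assuming a 4KB block): a leaf or index
++ * block holds (4096 - sizeof(struct ext3_extent_header)) / 12 = 340
++ * entries, while the 60-byte inode root holds (60 - 12) / 12 = 4
++ */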
++
++#define EXT_ASSERT(__x__) do { if (!(__x__)) BUG(); } while (0)
++
++
++/*
++ * this structure is used to gather extents from the tree via ioctl
++ */
++struct ext3_extent_buf {
++ unsigned long start;
++ int buflen;
++ void *buffer;
++ void *cur;
++ int err;
++};
++
++/*
++ * this structure is used to collect stats info about the tree
++ */
++struct ext3_extent_tree_stats {
++ int depth;
++ int extents_num;
++ int leaf_num;
++};
++
++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *);
++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *);
++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *);
++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback);
++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long);
++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *);
++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *);
++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int);
++
++static inline void
++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree)
++{
++ if (tree->cex)
++ tree->cex->ee_len = 0;
++}
++
++
++#endif /* _LINUX_EXT3_EXTENTS */
++
+Index: linux-2.6.7/include/linux/ext3_fs_i.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/ext3_fs_i.h 2004-08-19 08:51:04.000000000 +0400
++++ linux-2.6.7/include/linux/ext3_fs_i.h 2004-08-19 08:53:49.000000000 +0400
+@@ -111,6 +111,8 @@
+ */
+ struct semaphore truncate_sem;
+ struct inode vfs_inode;
++
++ __u32 i_cached_extent[3];
+ };
+
+ #endif /* _LINUX_EXT3_FS_I */
+
+%diffstat
+ fs/ext3/Makefile | 2
+ fs/ext3/extents.c | 2306 +++++++++++++++++++++++++++++++++++++++++++
+ fs/ext3/ialloc.c | 4
+ fs/ext3/inode.c | 29
+ fs/ext3/ioctl.c | 4
+ fs/ext3/super.c | 16
+ include/linux/ext3_extents.h | 238 ++++
+ include/linux/ext3_fs.h | 15
+ include/linux/ext3_fs_i.h | 2
+ 9 files changed, 2608 insertions(+), 8 deletions(-)
+
--- /dev/null
+Index: linux-2.6.7/fs/ext3/mballoc.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/mballoc.c 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.6.7/fs/ext3/mballoc.c 2004-09-03 09:48:40.000000000 +0400
+@@ -0,0 +1,1401 @@
++/*
++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public Licens
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
++ */
++
++
++/*
++ * mballoc.c contains the multiblocks allocation routines
++ */
++
++#include <linux/config.h>
++#include <linux/time.h>
++#include <linux/fs.h>
++#include <linux/namei.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/quotaops.h>
++#include <linux/buffer_head.h>
++#include <linux/module.h>
++
++/*
++ * TODO:
++ * - do not scan from the beginning, try to remember first free block
++ * - mb_mark_used_* may allocate chunk right after splitting buddy
++ * - special flag to advise the allocator to look for requested + N blocks
++ * this may improve interaction between extents and mballoc
++ */
++
++/*
++ * with AGGRESSIVE_CHECK the allocator runs consistency checks over
++ * its structures; these checks slow things down a lot
++ */
++#define AGGRESSIVE_CHECK__
++
++/*
++ */
++#define MB_DEBUG__
++#ifdef MB_DEBUG
++#define mb_debug(fmt,a...) printk(fmt, ##a)
++#else
++#define mb_debug(fmt,a...)
++#endif
++
++/*
++ * where to save buddy structures between umount/mount (clean case only)
++ */
++#define EXT3_BUDDY_FILE ".buddy"
++
++/*
++ * max. number of chunks to be tracked in ext3_free_extent struct
++ */
++#define MB_ARR_SIZE 32
++
++struct ext3_allocation_context {
++ struct super_block *ac_sb;
++
++ /* search goals */
++ int ac_g_group;
++ int ac_g_start;
++ int ac_g_len;
++ int ac_g_flags;
++
++ /* the best found extent */
++ int ac_b_group;
++ int ac_b_start;
++ int ac_b_len;
++
++ /* number of iterations done. we have to track it to limit the search */
++ int ac_repeats;
++ int ac_groups_scanned;
++ int ac_status;
++};
++
++#define AC_STATUS_CONTINUE 1
++#define AC_STATUS_FOUND 2
++
++
++struct ext3_buddy {
++ void *bd_bitmap;
++ void *bd_buddy;
++ int bd_blkbits;
++ struct buffer_head *bd_bh;
++ struct buffer_head *bd_bh2;
++ struct ext3_buddy_group_blocks *bd_bd;
++ struct super_block *bd_sb;
++};
++
++struct ext3_free_extent {
++ int fe_start;
++ int fe_len;
++ unsigned char fe_orders[MB_ARR_SIZE];
++ unsigned char fe_nums;
++ unsigned char fe_back;
++};
++
++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
++
++
++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *);
++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int);
++void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long, unsigned long);
++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, u32 *, u32 *, int *);
++int ext3_mb_reserve_blocks(struct super_block *, int);
++void ext3_mb_release_blocks(struct super_block *, int);
++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *);
++void ext3_mb_free_committed_blocks(struct super_block *);
++
++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
++{
++ int i = 1;
++ void *bb;
++
++ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy);
++ J_ASSERT(max != NULL);
++
++ if (order > e3b->bd_blkbits + 1)
++ return NULL;
++
++ /* at order 0 we see each particular block */
++ *max = 1 << (e3b->bd_blkbits + 3);
++ if (order == 0)
++ return e3b->bd_bitmap;
++
++ bb = e3b->bd_buddy;
++ *max = *max >> 1;
++ while (i < order) {
++ bb += 1 << (e3b->bd_blkbits - i);
++ i++;
++ *max = *max >> 1;
++ }
++ return bb;
++}
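++
++/*
++ * for example (illustrative, 4KB blocks so bd_blkbits = 12): order 0
++ * is the plain bitmap with 1 << 15 = 32768 bits; order 1 starts at
++ * bd_buddy with 16384 bits; order 2 lies 1 << 11 = 2048 bytes further
++ * with 8192 bits, and so on, halving at each order
++ */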
++
++static int ext3_mb_load_desc(struct super_block *sb, int group,
++ struct ext3_buddy *e3b)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++ J_ASSERT(sbi->s_buddy_blocks[group].bb_bitmap);
++ J_ASSERT(sbi->s_buddy_blocks[group].bb_buddy);
++
++ /* load bitmap */
++ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group].bb_bitmap);
++ if (e3b->bd_bh == NULL) {
++ ext3_error(sb, "ext3_mb_load_desc",
++ "can't get block for buddy bitmap\n");
++ goto out;
++ }
++ if (!buffer_uptodate(e3b->bd_bh)) {
++ ll_rw_block(READ, 1, &e3b->bd_bh);
++ wait_on_buffer(e3b->bd_bh);
++ }
++ J_ASSERT(buffer_uptodate(e3b->bd_bh));
++
++ /* load buddy */
++ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group].bb_buddy);
++ if (e3b->bd_bh2 == NULL) {
++ ext3_error(sb, "ext3_mb_load_desc",
++ "can't get block for buddy bitmap\n");
++ goto out;
++ }
++ if (!buffer_uptodate(e3b->bd_bh2)) {
++ ll_rw_block(READ, 1, &e3b->bd_bh2);
++ wait_on_buffer(e3b->bd_bh2);
++ }
++ J_ASSERT(buffer_uptodate(e3b->bd_bh2));
++
++ e3b->bd_bitmap = e3b->bd_bh->b_data;
++ e3b->bd_buddy = e3b->bd_bh2->b_data;
++ e3b->bd_blkbits = sb->s_blocksize_bits;
++ e3b->bd_bd = sbi->s_buddy_blocks + group;
++ e3b->bd_sb = sb;
++
++ return 0;
++out:
++ brelse(e3b->bd_bh);
++ brelse(e3b->bd_bh2);
++ e3b->bd_bh = NULL;
++ e3b->bd_bh2 = NULL;
++ return -EIO;
++}
++
++static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b)
++{
++ mark_buffer_dirty(e3b->bd_bh);
++ mark_buffer_dirty(e3b->bd_bh2);
++}
++
++static void ext3_mb_release_desc(struct ext3_buddy *e3b)
++{
++ brelse(e3b->bd_bh);
++ brelse(e3b->bd_bh2);
++}
++
++#ifdef AGGRESSIVE_CHECK
++static void mb_check_buddy(struct ext3_buddy *e3b)
++{
++ int order = e3b->bd_blkbits + 1;
++ int max, max2, i, j, k, count;
++ void *buddy, *buddy2;
++
++ if (!test_opt(e3b->bd_sb, MBALLOC))
++ return;
++
++ while (order > 1) {
++ buddy = mb_find_buddy(e3b, order, &max);
++ J_ASSERT(buddy);
++ buddy2 = mb_find_buddy(e3b, order - 1, &max2);
++ J_ASSERT(buddy2);
++ J_ASSERT(buddy != buddy2);
++ J_ASSERT(max * 2 == max2);
++
++ count = 0;
++ for (i = 0; i < max; i++) {
++
++ if (!test_bit(i, buddy)) {
++ /* only one bit of the pair in buddy2 may be set */
++ if (test_bit(i << 1, buddy2))
++ J_ASSERT(!test_bit((i<<1)+1, buddy2));
++ else if (test_bit((i << 1) + 1, buddy2))
++ J_ASSERT(!test_bit(i << 1, buddy2));
++ continue;
++ }
++
++ /* both bits in buddy2 must be 0 */
++ J_ASSERT(!test_bit(i << 1, buddy2));
++ J_ASSERT(!test_bit((i << 1) + 1, buddy2));
++
++ for (j = 0; j < (1 << order); j++) {
++ k = (i * (1 << order)) + j;
++ J_ASSERT(test_bit(k, e3b->bd_bitmap));
++ }
++ count++;
++ }
++ J_ASSERT(e3b->bd_bd->bb_counters[order] == count);
++ order--;
++ }
++
++ buddy = mb_find_buddy(e3b, 0, &max);
++ for (i = 0; i < max; i++) {
++ if (test_bit(i, buddy))
++ continue;
++ /* check used bits only */
++ for (j = 0; j < e3b->bd_blkbits + 1; j++) {
++ buddy2 = mb_find_buddy(e3b, j, &max2);
++ k = i >> j;
++ J_ASSERT(k < max2);
++ J_ASSERT(!test_bit(k, buddy2));
++ }
++ }
++}
++#else
++#define mb_check_buddy(e3b)
++#endif
++
++static inline void
++ext3_lock_group(struct super_block *sb, int group)
++{
++ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group].bb_lock);
++}
++
++static inline void
++ext3_unlock_group(struct super_block *sb, int group)
++{
++ spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group].bb_lock);
++}
++
++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
++{
++ int order = 1;
++ void *bb;
++
++ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy);
++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3)));
++
++ bb = e3b->bd_buddy;
++ while (order <= e3b->bd_blkbits + 1) {
++ block = block >> 1;
++ if (test_bit(block, bb)) {
++ /* this block is part of buddy of order 'order' */
++ return order;
++ }
++ bb += 1 << (e3b->bd_blkbits - order);
++ order++;
++ }
++ return 0;
++}
++
++static inline void mb_clear_bits(void *bm, int cur, int len)
++{
++ __u32 *addr;
++
++ len = cur + len;
++ while (cur < len) {
++ if ((cur & 31) == 0 && (len - cur) >= 32) {
++ /* fast path: clear whole word at once */
++ addr = bm + (cur >> 3);
++ *addr = 0;
++ cur += 32;
++ continue;
++ }
++ clear_bit(cur, bm);
++ cur++;
++ }
++}
++
++static inline void mb_set_bits(void *bm, int cur, int len)
++{
++ __u32 *addr;
++
++ len = cur + len;
++ while (cur < len) {
++ if ((cur & 31) == 0 && (len - cur) >= 32) {
++ /* fast path: set whole word at once */
++ addr = bm + (cur >> 3);
++ *addr = 0xffffffff;
++ cur += 32;
++ continue;
++ }
++ set_bit(cur, bm);
++ cur++;
++ }
++}
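++
++/*
++ * for example (illustrative): mb_set_bits(bm, 0, 40) stores one full
++ * 32-bit word and then sets the remaining 8 bits one by one, while
++ * mb_set_bits(bm, 3, 40) never takes the fast path: bits 3..31 go one
++ * by one and only 11 bits remain at the next aligned boundary
++ */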
++
++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count)
++{
++ int block, max, order;
++ void *buddy, *buddy2;
++
++ mb_check_buddy(e3b);
++ while (count-- > 0) {
++ block = first++;
++ order = 0;
++
++ J_ASSERT(!test_bit(block, e3b->bd_bitmap));
++ set_bit(block, e3b->bd_bitmap);
++ e3b->bd_bd->bb_counters[order]++;
++
++ /* start of the buddy */
++ buddy = mb_find_buddy(e3b, order, &max);
++
++ do {
++ block &= ~1UL;
++ if (!test_bit(block, buddy) ||
++ !test_bit(block + 1, buddy))
++ break;
++
++ /* both the buddies are free, try to coalesce them */
++ buddy2 = mb_find_buddy(e3b, order + 1, &max);
++
++ if (!buddy2)
++ break;
++
++ if (order > 0) {
++ /* for special purposes, we don't clear
++ * free bits in bitmap */
++ clear_bit(block, buddy);
++ clear_bit(block + 1, buddy);
++ }
++ e3b->bd_bd->bb_counters[order]--;
++ e3b->bd_bd->bb_counters[order]--;
++
++ block = block >> 1;
++ order++;
++ e3b->bd_bd->bb_counters[order]++;
++
++ set_bit(block, buddy2);
++ buddy = buddy2;
++ } while (1);
++ }
++ mb_check_buddy(e3b);
++
++ return 0;
++}
++
++/*
++ * returns 1 if out extent is enough to fill needed space
++ */
++int mb_make_backward_extent(struct ext3_free_extent *in,
++ struct ext3_free_extent *out, int needed)
++{
++ int i;
++
++ J_ASSERT(in);
++ J_ASSERT(out);
++ J_ASSERT(in->fe_nums < MB_ARR_SIZE);
++
++ out->fe_len = 0;
++ out->fe_start = in->fe_start + in->fe_len;
++ out->fe_nums = 0;
++
++ /* for a single-chunk extent we don't need the backward order;
++ * also, if an extent doesn't fill the needed space,
++ * it makes no sense to try the backward order, because
++ * if we select this extent it will be used as is */
++ if (in->fe_nums < 2 || in->fe_len < needed)
++ return 0;
++
++ i = in->fe_nums - 1;
++ while (i >= 0 && out->fe_len < needed) {
++ out->fe_len += (1 << in->fe_orders[i]);
++ out->fe_start -= (1 << in->fe_orders[i]);
++ i--;
++ }
++ /* FIXME: in some situation fe_orders may be too small to hold
++ * all the buddies */
++ J_ASSERT(out->fe_len >= needed);
++
++ for (i++; i < in->fe_nums; i++)
++ out->fe_orders[out->fe_nums++] = in->fe_orders[i];
++ J_ASSERT(out->fe_nums < MB_ARR_SIZE);
++ out->fe_back = 1;
++
++ return 1;
++}
++
++int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
++ int needed, struct ext3_free_extent *ex)
++{
++ int space = needed;
++ int next, max, ord;
++ void *buddy;
++
++ J_ASSERT(ex != NULL);
++
++ ex->fe_nums = 0;
++ ex->fe_len = 0;
++
++ buddy = mb_find_buddy(e3b, order, &max);
++ J_ASSERT(buddy);
++ J_ASSERT(block < max);
++ if (!test_bit(block, buddy))
++ goto nofree;
++
++ if (order == 0) {
++ /* find actual order */
++ order = mb_find_order_for_block(e3b, block);
++ block = block >> order;
++ }
++
++ ex->fe_orders[ex->fe_nums++] = order;
++ ex->fe_len = 1 << order;
++ ex->fe_start = block << order;
++ ex->fe_back = 0;
++
++ while ((space = space - (1 << order)) > 0) {
++
++ buddy = mb_find_buddy(e3b, order, &max);
++ J_ASSERT(buddy);
++
++ if (block + 1 >= max)
++ break;
++
++ next = (block + 1) * (1 << order);
++ if (!test_bit(next, e3b->bd_bitmap))
++ break;
++
++ ord = mb_find_order_for_block(e3b, next);
++
++ if ((1 << ord) >= needed) {
++ /* we don't want to coalesce with self-sufficient buddies */
++ break;
++ }
++ order = ord;
++ block = next >> order;
++ ex->fe_len += 1 << order;
++
++ if (ex->fe_nums < MB_ARR_SIZE)
++ ex->fe_orders[ex->fe_nums++] = order;
++ }
++
++nofree:
++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3)));
++ return ex->fe_len;
++}
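++
++/*
++ * a worked example (illustrative numbers): if blocks 8-15 form a free
++ * order-3 buddy and blocks 16-17 a free order-1 buddy, then
++ * mb_find_extent(e3b, 0, 8, 10, &ex) assembles fe_start = 8,
++ * fe_len = 10 from the chunks {order 3, order 1}; the order-1
++ * neighbour is coalesced because 1 << 1 is less than the needed 10
++ */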
++
++static int mb_mark_used_backward(struct ext3_buddy *e3b,
++ struct ext3_free_extent *ex, int len)
++{
++ int start, len0 = len;
++ int ord, mlen, max, cur;
++ void *buddy;
++
++ start = ex->fe_start + ex->fe_len - 1;
++ while (len) {
++ ord = mb_find_order_for_block(e3b, start);
++ if (((start >> ord) << ord) == (start - (1 << ord) + 1) &&
++ len >= (1 << ord)) {
++ /* the whole chunk may be allocated at once! */
++ mlen = 1 << ord;
++ buddy = mb_find_buddy(e3b, ord, &max);
++ J_ASSERT((start >> ord) < max);
++ clear_bit(start >> ord, buddy);
++ e3b->bd_bd->bb_counters[ord]--;
++ start -= mlen;
++ len -= mlen;
++ J_ASSERT(len >= 0);
++ J_ASSERT(start >= 0);
++ continue;
++ }
++
++ /* we have to split large buddy */
++ J_ASSERT(ord > 0);
++ buddy = mb_find_buddy(e3b, ord, &max);
++ clear_bit(start >> ord, buddy);
++ e3b->bd_bd->bb_counters[ord]--;
++
++ ord--;
++ cur = (start >> ord) & ~1U;
++ buddy = mb_find_buddy(e3b, ord, &max);
++ set_bit(cur, buddy);
++ set_bit(cur + 1, buddy);
++ e3b->bd_bd->bb_counters[ord]++;
++ e3b->bd_bd->bb_counters[ord]++;
++ }
++
++ /* now drop all the bits in bitmap */
++ mb_clear_bits(e3b->bd_bitmap, ex->fe_start + ex->fe_len - len0, len0);
++
++ mb_check_buddy(e3b);
++
++ return 0;
++}
++
++static int mb_mark_used_forward(struct ext3_buddy *e3b,
++ struct ext3_free_extent *ex, int len)
++{
++ int start = ex->fe_start, len0 = len;
++ int ord, mlen, max, cur;
++ void *buddy;
++
++ while (len) {
++ ord = mb_find_order_for_block(e3b, start);
++
++ if (((start >> ord) << ord) == start && len >= (1 << ord)) {
++ /* the whole chunk may be allocated at once! */
++ mlen = 1 << ord;
++ buddy = mb_find_buddy(e3b, ord, &max);
++ J_ASSERT((start >> ord) < max);
++ clear_bit(start >> ord, buddy);
++ e3b->bd_bd->bb_counters[ord]--;
++ start += mlen;
++ len -= mlen;
++ J_ASSERT(len >= 0);
++ continue;
++ }
++
++ /* we have to split large buddy */
++ J_ASSERT(ord > 0);
++ buddy = mb_find_buddy(e3b, ord, &max);
++ clear_bit(start >> ord, buddy);
++ e3b->bd_bd->bb_counters[ord]--;
++
++ ord--;
++ cur = (start >> ord) & ~1U;
++ buddy = mb_find_buddy(e3b, ord, &max);
++ set_bit(cur, buddy);
++ set_bit(cur + 1, buddy);
++ e3b->bd_bd->bb_counters[ord]++;
++ e3b->bd_bd->bb_counters[ord]++;
++ }
++
++ /* now drop all the bits in bitmap */
++ mb_clear_bits(e3b->bd_bitmap, ex->fe_start, len0);
++
++ mb_check_buddy(e3b);
++
++ return 0;
++}
++
++int inline mb_mark_used(struct ext3_buddy *e3b,
++ struct ext3_free_extent *ex, int len)
++{
++ int err;
++
++ J_ASSERT(ex);
++ if (ex->fe_back == 0)
++ err = mb_mark_used_forward(e3b, ex, len);
++ else
++ err = mb_mark_used_backward(e3b, ex, len);
++ return err;
++}
++
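++/*
++ * try to allocate ac_g_len blocks within @group. if the caller asked
++ * for a specific block (bit 0 of ac_g_flags) and @group is the goal
++ * group, the goal block is tried first so the allocation can extend
++ * an existing extent. otherwise the group bitmap is scanned from the
++ * beginning and the first extent long enough is taken (first fit).
++ */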
++int ext3_mb_new_in_group(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b, int group)
++{
++ struct super_block *sb = ac->ac_sb;
++ int err, max, i;
++ struct ext3_free_extent curex;
++
++
++ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) {
++ /* the caller asked for space at this specific block,
++ * probably to merge it into an existing extent */
++ if (test_bit(ac->ac_g_start, e3b->bd_bitmap)) {
++ /* good. at least one block is free */
++ max = mb_find_extent(e3b, 0, ac->ac_g_start,
++ ac->ac_g_len, &curex);
++ max = min(curex.fe_len, ac->ac_g_len);
++ mb_mark_used(e3b, &curex, max);
++
++ ac->ac_b_group = group;
++ ac->ac_b_start = curex.fe_start;
++ ac->ac_b_len = max;
++ ac->ac_status = AC_STATUS_FOUND;
++ err = 0;
++ goto out;
++ }
++ /* don't try to find goal anymore */
++ ac->ac_g_flags &= ~1;
++ }
++
++ i = 0;
++ while (1) {
++ i = find_next_bit(e3b->bd_bitmap, sb->s_blocksize * 8, i);
++ if (i >= sb->s_blocksize * 8)
++ break;
++
++ max = mb_find_extent(e3b, 0, i, ac->ac_g_len, &curex);
++ if (max >= ac->ac_g_len) {
++ max = min(curex.fe_len, ac->ac_g_len);
++ mb_mark_used(e3b, &curex, max);
++
++ ac->ac_b_group = group;
++ ac->ac_b_start = curex.fe_start;
++ ac->ac_b_len = max;
++ ac->ac_status = AC_STATUS_FOUND;
++ break;
++ }
++ i += max;
++ }
++
++ return 0;
++
++out:
++ return err;
++}
++
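++/*
++ * quick check whether @group is worth scanning at criteria level @cr.
++ * the levels relax progressively: cr == 0 requires at least half of
++ * the requested length free in the group, cr == 1 a quarter, and
++ * cr == 2 accepts any group that has free blocks at all.
++ */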
++int mb_good_group(struct ext3_allocation_context *ac, int group, int cr)
++{
++ struct ext3_group_desc *gdp;
++ int free_blocks;
++
++ gdp = ext3_get_group_desc(ac->ac_sb, group, NULL);
++ if (!gdp)
++ return 0;
++ free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
++ if (free_blocks == 0)
++ return 0;
++
++ /* someone wants this block very much */
++ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group)
++ return 1;
++
++ /* FIXME: I'd like to take fragmentation into account here */
++ if (cr == 0) {
++ if (free_blocks >= ac->ac_g_len >> 1)
++ return 1;
++ } else if (cr == 1) {
++ if (free_blocks >= ac->ac_g_len >> 2)
++ return 1;
++ } else if (cr == 2) {
++ return 1;
++ } else {
++ BUG();
++ }
++ return 0;
++}
++
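++/*
++ * main entry point for multiblock allocation: reserve blocks and
++ * charge quota, map the goal block to a (group, offset) pair, then
++ * scan the groups in up to three passes with progressively weaker
++ * criteria (see mb_good_group()). once an extent has been taken from
++ * the buddy data, the classic on-disk bitmap and group descriptor
++ * are updated under the journal, just like in the old allocator.
++ */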
++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode,
++ unsigned long goal, int *len, int flags, int *errp)
++{
++ struct buffer_head *bitmap_bh = NULL;
++ struct ext3_allocation_context ac;
++ int i, group, block, cr, err = 0;
++ struct ext3_group_desc *gdp;
++ struct ext3_super_block *es;
++ struct buffer_head *gdp_bh;
++ struct ext3_sb_info *sbi;
++ struct super_block *sb;
++ struct ext3_buddy e3b;
++
++ J_ASSERT(len != NULL);
++ J_ASSERT(*len > 0);
++
++ sb = inode->i_sb;
++ if (!sb) {
++ printk("ext3_mb_new_nblocks: nonexistent device");
++ return 0;
++ }
++
++ if (!test_opt(sb, MBALLOC)) {
++ static int ext3_mballoc_warning = 0;
++ if (ext3_mballoc_warning == 0) {
++ printk(KERN_ERR "EXT3-fs: multiblock request with "
++ "mballoc disabled!\n");
++ ext3_mballoc_warning++;
++ }
++ *len = 1;
++ err = ext3_new_block_old(handle, inode, goal, NULL, NULL, errp);
++ return err;
++ }
++
++ ext3_mb_poll_new_transaction(sb, handle);
++
++ sbi = EXT3_SB(sb);
++ es = EXT3_SB(sb)->s_es;
++
++ if (!(flags & 2)) {
++ /* someone asks for non-reserved blocks */
++ BUG_ON(*len > 1);
++ err = ext3_mb_reserve_blocks(sb, 1);
++ if (err) {
++ *errp = err;
++ return 0;
++ }
++ }
++
++ /*
++ * Check quota for allocation of these blocks.
++ */
++ while (*len && DQUOT_ALLOC_BLOCK(inode, *len))
++ *len -= 1;
++ if (*len == 0) {
++ *errp = -EDQUOT;
++ block = 0;
++ goto out;
++ }
++
++ /* start searching from the goal */
++ if (goal < le32_to_cpu(es->s_first_data_block) ||
++ goal >= le32_to_cpu(es->s_blocks_count))
++ goal = le32_to_cpu(es->s_first_data_block);
++ group = (goal - le32_to_cpu(es->s_first_data_block)) /
++ EXT3_BLOCKS_PER_GROUP(sb);
++ block = ((goal - le32_to_cpu(es->s_first_data_block)) %
++ EXT3_BLOCKS_PER_GROUP(sb));
++
++ /* set up allocation goals */
++ ac.ac_b_group = ac.ac_b_start = ac.ac_b_len = 0;
++ ac.ac_status = 0;
++ ac.ac_groups_scanned = 0;
++ ac.ac_sb = inode->i_sb;
++ ac.ac_g_group = group;
++ ac.ac_g_start = block;
++ ac.ac_g_len = *len;
++ ac.ac_g_flags = flags;
++
++ /* loop over the groups */
++ for (cr = 0; cr < 3 && ac.ac_status != AC_STATUS_FOUND; cr++) {
++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) {
++ if (group == EXT3_SB(sb)->s_groups_count)
++ group = 0;
++
++ /* check whether the group suits our criteria */
++ if (!mb_good_group(&ac, group, cr))
++ continue;
++
++ err = ext3_mb_load_desc(ac.ac_sb, group, &e3b);
++ if (err)
++ goto out_err;
++
++ ext3_lock_group(sb, group);
++ if (!mb_good_group(&ac, group, cr)) {
++ /* someone did allocation from this group */
++ ext3_unlock_group(sb, group);
++ ext3_mb_release_desc(&e3b);
++ continue;
++ }
++
++ err = ext3_mb_new_in_group(&ac, &e3b, group);
++ ext3_unlock_group(sb, group);
++ if (ac.ac_status == AC_STATUS_FOUND)
++ ext3_mb_dirty_buddy(&e3b);
++ ext3_mb_release_desc(&e3b);
++ if (err)
++ goto out_err;
++ if (ac.ac_status == AC_STATUS_FOUND)
++ break;
++ }
++ }
++
++ if (ac.ac_status != AC_STATUS_FOUND) {
++ /* unfortunately, we can't satisfy this request */
++ J_ASSERT(ac.ac_b_len == 0);
++ DQUOT_FREE_BLOCK(inode, *len);
++ *errp = -ENOSPC;
++ block = 0;
++ goto out;
++ }
++
++ /* good news - free block(s) have been found. now it's time
++ * to mark the block(s) in the good old journaled bitmap */
++ ext3_debug("using block group %d(%d)\n",
++ ac.ac_b_group, ac.ac_b_len);
++
++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_group);
++ if (!bitmap_bh) {
++ *errp = -EIO;
++ goto out_err;
++ }
++
++ err = ext3_journal_get_write_access(handle, bitmap_bh);
++ if (err) {
++ *errp = err;
++ goto out_err;
++ }
++
++ gdp = ext3_get_group_desc(sb, ac.ac_b_group, &gdp_bh);
++ if (!gdp) {
++ *errp = -EIO;
++ goto out_err;
++ }
++
++ err = ext3_journal_get_write_access(handle, gdp_bh);
++ if (err)
++ goto out_err;
++
++ block = ac.ac_b_start + ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb)
++ + le32_to_cpu(es->s_first_data_block);
++
++ if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
++ block == le32_to_cpu(gdp->bg_inode_bitmap) ||
++ in_range(block, le32_to_cpu(gdp->bg_inode_table),
++ EXT3_SB(sb)->s_itb_per_group))
++ ext3_error(sb, "ext3_new_block",
++ "Allocating block in system zone - "
++ "block = %u", block);
++#if 0
++ for (i = 0; i < ac.ac_b_len; i++)
++ J_ASSERT(!test_bit(ac.ac_b_start + i, bitmap_bh->b_data));
++#endif
++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_start, ac.ac_b_len);
++
++ ext3_lock_group(sb, ac.ac_b_group);
++ gdp->bg_free_blocks_count =
++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -
++ ac.ac_b_len);
++ ext3_unlock_group(sb, ac.ac_b_group);
++ percpu_counter_mod(&sbi->s_freeblocks_counter, -ac.ac_b_len);
++
++ err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++ if (err)
++ goto out_err;
++ err = ext3_journal_dirty_metadata(handle, gdp_bh);
++ if (err)
++ goto out_err;
++
++ sb->s_dirt = 1;
++ *errp = 0;
++ brelse(bitmap_bh);
++
++ /* release quota charged for the blocks we did not allocate */
++ J_ASSERT(*len >= ac.ac_b_len);
++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_len);
++
++ *len = ac.ac_b_len;
++ J_ASSERT(block != 0);
++ goto out;
++
++out_err:
++ /* if we've already allocated something, roll it back */
++ if (ac.ac_status == AC_STATUS_FOUND) {
++ /* FIXME: free blocks here */
++ }
++
++ DQUOT_FREE_BLOCK(inode, *len);
++ brelse(bitmap_bh);
++ *errp = err;
++ block = 0;
++out:
++ if (!(flags & 2)) {
++ /* the block wasn't reserved before, so we reserved it
++ * at the beginning of the allocation. it doesn't matter
++ * whether we allocated anything or failed: it is time
++ * to release the reservation. NOTE: since multiblock
++ * requests are expected from the delayed allocation
++ * path only, this is always a single block */
++ ext3_mb_release_blocks(sb, 1);
++ }
++ return block;
++}
++
++int ext3_mb_generate_buddy(struct super_block *sb, int group)
++{
++ struct buffer_head *bh;
++ int i, err, count = 0;
++ struct ext3_buddy e3b;
++
++ err = ext3_mb_load_desc(sb, group, &e3b);
++ if (err)
++ goto out;
++ memset(e3b.bd_bh->b_data, 0, sb->s_blocksize);
++ memset(e3b.bd_bh2->b_data, 0, sb->s_blocksize);
++
++ bh = read_block_bitmap(sb, group);
++ if (bh == NULL) {
++ err = -EIO;
++ goto out2;
++ }
++
++ /* loop over the blocks, and create buddies for the free ones */
++ for (i = 0; i < sb->s_blocksize * 8; i++) {
++ if (!test_bit(i, (void *) bh->b_data)) {
++ mb_free_blocks(&e3b, i, 1);
++ count++;
++ }
++ }
++ brelse(bh);
++ mb_check_buddy(&e3b);
++ ext3_mb_dirty_buddy(&e3b);
++
++out2:
++ ext3_mb_release_desc(&e3b);
++out:
++ return err;
++}
++
++EXPORT_SYMBOL(ext3_mb_new_blocks);
++
++#ifndef EXT3_QUOTA_INIT_BLOCKS
++#define EXT3_QUOTA_INIT_BLOCKS 0
++#endif
++
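++/* journal credits for one iteration of ext3_mb_init_backend(): a data
++ * transaction plus index and quota-init overhead, a conservative
++ * worst-case estimate */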
++#define MB_CREDITS \
++ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \
++ 2 * EXT3_QUOTA_INIT_BLOCKS)
++
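++/*
++ * buddy data lives in a special EXT3_BUDDY_FILE in the root directory:
++ * for each group i, logical block 2 * i of the file holds the group's
++ * free-block bitmap and block 2 * i + 1 holds the buddy data itself.
++ * this function looks up (or creates) that file, allocates both blocks
++ * for every group and remembers their physical block numbers in
++ * sbi->s_buddy_blocks[].
++ */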
++int ext3_mb_init_backend(struct super_block *sb)
++{
++ struct inode *root = sb->s_root->d_inode;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct dentry *db;
++ tid_t target;
++ int err = 0, i;
++
++ sbi->s_buddy_blocks = kmalloc(sizeof(struct ext3_buddy_group_blocks) *
++ sbi->s_groups_count, GFP_KERNEL);
++ if (sbi->s_buddy_blocks == NULL) {
++ printk("can't allocate mem for buddy maps\n");
++ return -ENOMEM;
++ }
++ memset(sbi->s_buddy_blocks, 0,
++ sizeof(struct ext3_buddy_group_blocks) * sbi->s_groups_count);
++ sbi->s_buddy = NULL;
++
++ down(&root->i_sem);
++ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root,
++ strlen(EXT3_BUDDY_FILE));
++ if (IS_ERR(db)) {
++ err = PTR_ERR(db);
++ printk("can't lookup buddy file: %d\n", err);
++ goto out;
++ }
++
++ if (db->d_inode != NULL) {
++ sbi->s_buddy = igrab(db->d_inode);
++ goto map;
++ }
++
++ err = ext3_create(root, db, S_IFREG, NULL);
++ if (err) {
++ printk("error while creating buddy file: %d\n", err);
++ goto out2;
++ }
++ sbi->s_buddy = igrab(db->d_inode);
++
++map:
++ for (i = 0; i < sbi->s_groups_count; i++) {
++ struct buffer_head *bh = NULL;
++ handle_t *handle;
++
++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
++ if (IS_ERR(handle)) {
++ err = PTR_ERR(handle);
++ goto out2;
++ }
++
++ /* allocate block for bitmap */
++ bh = ext3_getblk(handle, sbi->s_buddy, i * 2, 1, &err);
++ if (bh == NULL) {
++ printk("can't get block for buddy bitmap: %d\n", err);
++ ext3_journal_stop(handle);
++ goto out2;
++ }
++ sbi->s_buddy_blocks[i].bb_bitmap = bh->b_blocknr;
++ brelse(bh);
++
++ /* allocate block for buddy */
++ bh = ext3_getblk(handle, sbi->s_buddy, i * 2 + 1, 1, &err);
++ if (bh == NULL) {
++ printk("can't get block for buddy: %d\n", err);
++ ext3_journal_stop(handle);
++ goto out2;
++ }
++ sbi->s_buddy_blocks[i].bb_buddy = bh->b_blocknr;
++ brelse(bh);
++ ext3_journal_stop(handle);
++ spin_lock_init(&sbi->s_buddy_blocks[i].bb_lock);
++ sbi->s_buddy_blocks[i].bb_md_cur = NULL;
++ sbi->s_buddy_blocks[i].bb_tid = 0;
++ }
++
++ if (journal_start_commit(sbi->s_journal, &target))
++ log_wait_commit(sbi->s_journal, target);
++
++out2:
++ dput(db);
++out:
++ up(&root->i_sem);
++ return err;
++}
++
++int ext3_mb_release(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++ if (!test_opt(sb, MBALLOC))
++ return 0;
++
++ /* release freed, non-committed blocks */
++ spin_lock(&sbi->s_md_lock);
++ list_splice_init(&sbi->s_closed_transaction,
++ &sbi->s_committed_transaction);
++ list_splice_init(&sbi->s_active_transaction,
++ &sbi->s_committed_transaction);
++ spin_unlock(&sbi->s_md_lock);
++ ext3_mb_free_committed_blocks(sb);
++
++ if (sbi->s_buddy_blocks)
++ kfree(sbi->s_buddy_blocks);
++ if (sbi->s_buddy)
++ iput(sbi->s_buddy);
++ if (sbi->s_blocks_reserved)
++ printk("ext3-fs: %ld blocks being reserved at umount!\n",
++ sbi->s_blocks_reserved);
++ return 0;
++}
++
++int ext3_mb_init(struct super_block *sb)
++{
++ int i, err;
++
++ if (!test_opt(sb, MBALLOC))
++ return 0;
++
++ /* init file for buddy data */
++ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
++ err = ext3_mb_init_backend(sb);
++ if (err)
++ return err;
++
++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
++ ext3_mb_generate_buddy(sb, i);
++ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock);
++ spin_lock_init(&EXT3_SB(sb)->s_md_lock);
++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction);
++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction);
++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction);
++ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
++ printk("EXT3-fs: mballoc enabled\n");
++ return 0;
++}
++
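++/*
++ * metadata blocks freed in a transaction travel through three per-sb
++ * lists: they are gathered on s_active_transaction, moved to
++ * s_closed_transaction when the next transaction starts and on to
++ * s_committed_transaction one transaction later (see
++ * ext3_mb_poll_new_transaction()). only then are they returned to the
++ * buddy data here, so a block is never reused before the transaction
++ * that freed it has committed.
++ */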
++void ext3_mb_free_committed_blocks(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ int err, i, count = 0, count2 = 0;
++ struct ext3_free_metadata *md;
++ struct ext3_buddy e3b;
++
++ if (list_empty(&sbi->s_committed_transaction))
++ return;
++
++ /* there are committed blocks yet to be freed */
++ do {
++ /* get next array of blocks */
++ md = NULL;
++ spin_lock(&sbi->s_md_lock);
++ if (!list_empty(&sbi->s_committed_transaction)) {
++ md = list_entry(sbi->s_committed_transaction.next,
++ struct ext3_free_metadata, list);
++ list_del(&md->list);
++ }
++ spin_unlock(&sbi->s_md_lock);
++
++ if (md == NULL)
++ break;
++
++ mb_debug("gonna free %u blocks in group %u (0x%p):",
++ md->num, md->group, md);
++
++ err = ext3_mb_load_desc(sb, md->group, &e3b);
++ BUG_ON(err != 0);
++
++ /* there are blocks to put in buddy to make them really free */
++ count += md->num;
++ count2++;
++ ext3_lock_group(sb, md->group);
++ for (i = 0; i < md->num; i++) {
++ mb_debug(" %u", md->blocks[i]);
++ mb_free_blocks(&e3b, md->blocks[i], 1);
++ }
++ mb_debug("\n");
++ ext3_unlock_group(sb, md->group);
++
++ kfree(md);
++ ext3_mb_dirty_buddy(&e3b);
++ ext3_mb_release_desc(&e3b);
++
++ } while (md);
++ mb_debug("freed %u blocks in %u structures\n", count, count2);
++}
++
++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++ if (sbi->s_last_transaction == handle->h_transaction->t_tid)
++ return;
++
++ /* new transaction! time to close the last one and free blocks
++ * from the committed transaction. only one transaction can be
++ * active at a time, so the previous transaction may still be
++ * committing, while the transaction before the previous one is
++ * known to be already logged. this means we may now free blocks
++ * freed in all transactions before the previous one */
++
++ spin_lock(&sbi->s_md_lock);
++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
++ mb_debug("new transaction %lu, old %lu\n",
++ (unsigned long) handle->h_transaction->t_tid,
++ (unsigned long) sbi->s_last_transaction);
++ list_splice_init(&sbi->s_closed_transaction,
++ &sbi->s_committed_transaction);
++ list_splice_init(&sbi->s_active_transaction,
++ &sbi->s_closed_transaction);
++ sbi->s_last_transaction = handle->h_transaction->t_tid;
++ }
++ spin_unlock(&sbi->s_md_lock);
++
++ ext3_mb_free_committed_blocks(sb);
++}
++
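++/*
++ * queue @count metadata blocks of @group for deferred freeing. blocks
++ * are appended to the group's current ext3_free_metadata container;
++ * a new container is allocated when there is none, when the old one
++ * is full, or when a new transaction has started. containers sit on
++ * the sb's s_active_transaction list until the commit machinery above
++ * releases them.
++ */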
++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b,
++ int group, int block, int count)
++{
++ struct ext3_buddy_group_blocks *db = e3b->bd_bd;
++ struct super_block *sb = e3b->bd_sb;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_free_metadata *md;
++ int i;
++
++ ext3_lock_group(sb, group);
++ for (i = 0; i < count; i++) {
++ md = db->bb_md_cur;
++ if (md && db->bb_tid != handle->h_transaction->t_tid) {
++ db->bb_md_cur = NULL;
++ md = NULL;
++ }
++
++ if (md == NULL) {
++ ext3_unlock_group(sb, group);
++ md = kmalloc(sizeof(*md), GFP_KERNEL);
++ if (md == NULL)
++ return -ENOMEM;
++ md->num = 0;
++ md->group = group;
++
++ ext3_lock_group(sb, group);
++ if (db->bb_md_cur == NULL) {
++ spin_lock(&sbi->s_md_lock);
++ list_add(&md->list, &sbi->s_active_transaction);
++ spin_unlock(&sbi->s_md_lock);
++ db->bb_md_cur = md;
++ db->bb_tid = handle->h_transaction->t_tid;
++ mb_debug("new md 0x%p for group %u\n",
++ md, md->group);
++ } else {
++ kfree(md);
++ md = db->bb_md_cur;
++ }
++ }
++
++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS);
++ md->blocks[md->num] = block + i;
++ md->num++;
++ if (md->num == EXT3_BB_MAX_BLOCKS) {
++ /* no more space, put the full container on the sb's list */
++ db->bb_md_cur = NULL;
++ }
++ }
++ ext3_unlock_group(sb, group);
++ return 0;
++}
++
++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode,
++ unsigned long block, unsigned long count, int metadata)
++{
++ struct buffer_head *bitmap_bh = NULL;
++ struct ext3_group_desc *gdp;
++ struct ext3_super_block *es;
++ unsigned long bit, overflow;
++ struct buffer_head *gd_bh;
++ unsigned long block_group;
++ struct ext3_sb_info *sbi;
++ struct super_block *sb;
++ struct ext3_buddy e3b;
++ int err = 0, ret;
++
++ sb = inode->i_sb;
++ if (!sb) {
++ printk ("ext3_free_blocks: nonexistent device");
++ return;
++ }
++
++ ext3_mb_poll_new_transaction(sb, handle);
++
++ sbi = EXT3_SB(sb);
++ es = EXT3_SB(sb)->s_es;
++ if (block < le32_to_cpu(es->s_first_data_block) ||
++ block + count < block ||
++ block + count > le32_to_cpu(es->s_blocks_count)) {
++ ext3_error (sb, "ext3_free_blocks",
++ "Freeing blocks not in datazone - "
++ "block = %lu, count = %lu", block, count);
++ goto error_return;
++ }
++
++ ext3_debug("freeing block %lu\n", block);
++
++do_more:
++ overflow = 0;
++ block_group = (block - le32_to_cpu(es->s_first_data_block)) /
++ EXT3_BLOCKS_PER_GROUP(sb);
++ bit = (block - le32_to_cpu(es->s_first_data_block)) %
++ EXT3_BLOCKS_PER_GROUP(sb);
++ /*
++ * Check to see if we are freeing blocks across a group
++ * boundary.
++ */
++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
++ count -= overflow;
++ }
++ brelse(bitmap_bh);
++ bitmap_bh = read_block_bitmap(sb, block_group);
++ if (!bitmap_bh)
++ goto error_return;
++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh);
++ if (!gdp)
++ goto error_return;
++
++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
++ in_range (block, le32_to_cpu(gdp->bg_inode_table),
++ EXT3_SB(sb)->s_itb_per_group) ||
++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
++ EXT3_SB(sb)->s_itb_per_group))
++ ext3_error (sb, "ext3_free_blocks",
++ "Freeing blocks in system zones - "
++ "Block = %lu, count = %lu",
++ block, count);
++
++ BUFFER_TRACE(bitmap_bh, "getting write access");
++ err = ext3_journal_get_write_access(handle, bitmap_bh);
++ if (err)
++ goto error_return;
++
++ /*
++ * We are about to modify some metadata. Call the journal APIs
++ * to unshare ->b_data if a currently-committing transaction is
++ * using it
++ */
++ BUFFER_TRACE(gd_bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, gd_bh);
++ if (err)
++ goto error_return;
++
++ err = ext3_mb_load_desc(sb, block_group, &e3b);
++ if (err)
++ goto error_return;
++
++ if (metadata) {
++ /* blocks being freed are metadata. these blocks shouldn't
++ * be used until this transaction is committed */
++ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count);
++ } else {
++ ext3_lock_group(sb, block_group);
++ mb_free_blocks(&e3b, bit, count);
++ gdp->bg_free_blocks_count =
++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
++ ext3_unlock_group(sb, block_group);
++ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
++ }
++
++ ext3_mb_dirty_buddy(&e3b);
++ ext3_mb_release_desc(&e3b);
++
++ /* FIXME: undo logic will be implemented later, in a different way */
++ mb_clear_bits(bitmap_bh->b_data, bit, count);
++ DQUOT_FREE_BLOCK(inode, count);
++
++ /* We dirtied the bitmap block */
++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
++ err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++
++ /* And the group descriptor block */
++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
++ ret = ext3_journal_dirty_metadata(handle, gd_bh);
++ if (!err) err = ret;
++
++ if (overflow && !err) {
++ block += count;
++ count = overflow;
++ goto do_more;
++ }
++ sb->s_dirt = 1;
++error_return:
++ brelse(bitmap_bh);
++ ext3_std_error(sb, err);
++ return;
++}
++
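++/*
++ * non-blocking reservation against the free-blocks counter: succeed
++ * only if @blocks fit on top of what is already reserved, so that
++ * concurrent non-reserved allocations cannot overcommit the
++ * filesystem.
++ */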
++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ int free, ret = -ENOSPC;
++
++ BUG_ON(blocks < 0);
++ spin_lock(&sbi->s_reserve_lock);
++ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
++ if (blocks <= free - sbi->s_blocks_reserved) {
++ sbi->s_blocks_reserved += blocks;
++ ret = 0;
++ }
++ spin_unlock(&sbi->s_reserve_lock);
++ return ret;
++}
++
++void ext3_mb_release_blocks(struct super_block *sb, int blocks)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++ BUG_ON(blocks < 0);
++ spin_lock(&sbi->s_reserve_lock);
++ sbi->s_blocks_reserved -= blocks;
++ WARN_ON(sbi->s_blocks_reserved < 0);
++ if (sbi->s_blocks_reserved < 0)
++ sbi->s_blocks_reserved = 0;
++ spin_unlock(&sbi->s_reserve_lock);
++}
++
++int ext3_new_block(handle_t *handle, struct inode *inode,
++ unsigned long goal, u32 *pc, u32 *pb, int *errp)
++{
++ int ret, len;
++
++ if (!test_opt(inode->i_sb, MBALLOC)) {
++ ret = ext3_new_block_old(handle, inode, goal, pc, pb, errp);
++ goto out;
++ }
++ len = 1;
++ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp);
++out:
++ return ret;
++}
++
++void ext3_free_blocks(handle_t *handle, struct inode * inode,
++ unsigned long block, unsigned long count, int metadata)
++{
++ if (!test_opt(inode->i_sb, MBALLOC))
++ ext3_free_blocks_old(handle, inode, block, count);
++ else
++ ext3_mb_free_blocks(handle, inode, block, count, metadata);
++ return;
++}
++
+Index: linux-2.6.7/fs/ext3/super.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/super.c 2004-09-03 08:46:59.000000000 +0400
++++ linux-2.6.7/fs/ext3/super.c 2004-09-03 08:46:59.000000000 +0400
+@@ -392,6 +392,7 @@
+ struct ext3_super_block *es = sbi->s_es;
+ int i;
+
++ ext3_mb_release(sb);
+ ext3_ext_release(sb);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+@@ -594,7 +595,7 @@
+ Opt_commit, Opt_journal_update, Opt_journal_inum,
+ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+- Opt_err, Opt_extents, Opt_extdebug
++ Opt_err, Opt_extents, Opt_extdebug, Opt_mballoc,
+ };
+
+ static match_table_t tokens = {
+@@ -644,6 +645,7 @@
+ {Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_extents, "extents"},
+ {Opt_extdebug, "extdebug"},
++ {Opt_mballoc, "mballoc"},
+ {Opt_err, NULL}
+ };
+
+@@ -929,6 +931,9 @@
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
++ case Opt_mballoc:
++ set_opt (sbi->s_mount_opt, MBALLOC);
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+@@ -1602,7 +1607,8 @@
+ ext3_count_dirs(sb));
+
+ ext3_ext_init(sb);
+-
++ ext3_mb_init(sb);
++
+ return 0;
+
+ failed_mount3:
+Index: linux-2.6.7/fs/ext3/Makefile
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/Makefile 2004-09-03 08:46:59.000000000 +0400
++++ linux-2.6.7/fs/ext3/Makefile 2004-09-03 08:46:59.000000000 +0400
+@@ -5,7 +5,7 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
+
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+- ioctl.o namei.o super.o symlink.o hash.o extents.o
++ ioctl.o namei.o super.o symlink.o hash.o extents.o mballoc.o
+
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+Index: linux-2.6.7/fs/ext3/balloc.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/balloc.c 2004-08-26 17:11:16.000000000 +0400
++++ linux-2.6.7/fs/ext3/balloc.c 2004-09-03 08:46:59.000000000 +0400
+@@ -78,7 +78,7 @@
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+-static struct buffer_head *
++struct buffer_head *
+ read_block_bitmap(struct super_block *sb, unsigned int block_group)
+ {
+ struct ext3_group_desc * desc;
+@@ -98,8 +98,8 @@
+ }
+
+ /* Free given blocks, update quota and i_blocks field */
+-void ext3_free_blocks (handle_t *handle, struct inode * inode,
+- unsigned long block, unsigned long count)
++void ext3_free_blocks_old (handle_t *handle, struct inode * inode,
++ unsigned long block, unsigned long count)
+ {
+ struct buffer_head *bitmap_bh = NULL;
+ struct buffer_head *gd_bh;
+@@ -474,8 +474,8 @@
+ * This function also updates quota and i_blocks field.
+ */
+ int
+-ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
+- u32 *prealloc_count, u32 *prealloc_block, int *errp)
++ext3_new_block_old(handle_t *handle, struct inode *inode, unsigned long goal,
++ u32 *prealloc_count, u32 *prealloc_block, int *errp)
+ {
+ struct buffer_head *bitmap_bh = NULL; /* bh */
+ struct buffer_head *gdp_bh; /* bh2 */
+Index: linux-2.6.7/fs/ext3/namei.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/namei.c 2004-09-03 08:46:59.000000000 +0400
++++ linux-2.6.7/fs/ext3/namei.c 2004-09-03 08:46:59.000000000 +0400
+@@ -1640,7 +1640,7 @@
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ */
+-static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
++int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
+ struct nameidata *nd)
+ {
+ handle_t *handle;
+Index: linux-2.6.7/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/inode.c 2004-09-03 08:46:59.000000000 +0400
++++ linux-2.6.7/fs/ext3/inode.c 2004-09-03 08:46:59.000000000 +0400
+@@ -254,7 +254,7 @@
+ ei->i_prealloc_count = 0;
+ ei->i_prealloc_block = 0;
+ /* Writer: end */
+- ext3_free_blocks (inode, block, total);
++ ext3_free_blocks (inode, block, total, 1);
+ }
+ #endif
+ }
+@@ -633,7 +633,7 @@
+ ext3_journal_forget(handle, branch[i].bh);
+ }
+ for (i = 0; i < keys; i++)
+- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
+ return err;
+ }
+
+@@ -734,7 +734,7 @@
+ if (err == -EAGAIN)
+ for (i = 0; i < num; i++)
+ ext3_free_blocks(handle, inode,
+- le32_to_cpu(where[i].key), 1);
++ le32_to_cpu(where[i].key), 1, 1);
+ return err;
+ }
+
+@@ -1911,7 +1911,7 @@
+ }
+ }
+
+- ext3_free_blocks(handle, inode, block_to_free, count);
++ ext3_free_blocks(handle, inode, block_to_free, count, 1);
+ }
+
+ /**
+@@ -2082,7 +2082,7 @@
+ ext3_journal_test_restart(handle, inode);
+ }
+
+- ext3_free_blocks(handle, inode, nr, 1);
++ ext3_free_blocks(handle, inode, nr, 1, 1);
+
+ if (parent_bh) {
+ /*
+Index: linux-2.6.7/fs/ext3/extents.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/extents.c 2004-09-03 08:46:59.000000000 +0400
++++ linux-2.6.7/fs/ext3/extents.c 2004-09-03 08:46:59.000000000 +0400
+@@ -740,7 +740,7 @@
+ for (i = 0; i < depth; i++) {
+ if (!ablocks[i])
+ continue;
+- ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1);
+ }
+ }
+ kfree(ablocks);
+@@ -1388,7 +1388,7 @@
+ path->p_idx->ei_leaf);
+ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
+ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
+- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1);
+ return err;
+ }
+
+@@ -1876,10 +1876,12 @@
+ int needed = ext3_remove_blocks_credits(tree, ex, from, to);
+ handle_t *handle = ext3_journal_start(tree->inode, needed);
+ struct buffer_head *bh;
+- int i;
++ int i, metadata = 0;
+
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
++ if (S_ISDIR(tree->inode->i_mode))
++ metadata = 1;
+ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
+ /* tail removal */
+ unsigned long num, start;
+@@ -1891,7 +1893,7 @@
+ bh = sb_find_get_block(tree->inode->i_sb, start + i);
+ ext3_forget(handle, 0, tree->inode, bh, start + i);
+ }
+- ext3_free_blocks(handle, tree->inode, start, num);
++ ext3_free_blocks(handle, tree->inode, start, num, metadata);
+ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
+ printk("strange request: removal %lu-%lu from %u:%u\n",
+ from, to, ex->ee_block, ex->ee_len);
+Index: linux-2.6.7/fs/ext3/xattr.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/xattr.c 2004-09-03 08:46:59.000000000 +0400
++++ linux-2.6.7/fs/ext3/xattr.c 2004-09-03 08:46:59.000000000 +0400
+@@ -1366,7 +1366,7 @@
+ new_bh = sb_getblk(sb, block);
+ if (!new_bh) {
+ getblk_failed:
+- ext3_free_blocks(handle, inode, block, 1);
++ ext3_free_blocks(handle, inode, block, 1, 1);
+ error = -EIO;
+ goto cleanup;
+ }
+@@ -1408,7 +1408,7 @@
+ if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
+ /* Free the old block. */
+ ea_bdebug(old_bh, "freeing");
+- ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1);
++ ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1, 1);
+
+ /* ext3_forget() calls bforget() for us, but we
+ let our caller release old_bh, so we need to
+@@ -1497,7 +1497,7 @@
+ lock_buffer(bh);
+ if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
+ ext3_xattr_cache_remove(bh);
+- ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1);
++ ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1, 1);
+ get_bh(bh);
+ ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl);
+ } else {
+Index: linux-2.6.7/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/ext3_fs.h 2004-09-03 08:46:59.000000000 +0400
++++ linux-2.6.7/include/linux/ext3_fs.h 2004-09-03 08:47:35.000000000 +0400
+@@ -57,6 +57,8 @@
+ #define ext3_debug(f, a...) do {} while (0)
+ #endif
+
++#define EXT3_MULTIBLOCK_ALLOCATOR 1
++
+ /*
+ * Special inodes numbers
+ */
+@@ -335,6 +337,7 @@
+ #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */
+ #define EXT3_MOUNT_EXTENTS 0x10000 /* Extents support */
+ #define EXT3_MOUNT_EXTDEBUG 0x20000 /* Extents debug */
++#define EXT3_MOUNT_MBALLOC 0x100000 /* Buddy allocation support */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+@@ -695,7 +698,7 @@
+ extern int ext3_new_block (handle_t *, struct inode *, unsigned long,
+ __u32 *, __u32 *, int *);
+ extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
+- unsigned long);
++ unsigned long, int);
+ extern unsigned long ext3_count_free_blocks (struct super_block *);
+ extern void ext3_check_blocks_bitmap (struct super_block *);
+ extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
+Index: linux-2.6.7/include/linux/ext3_fs_sb.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/ext3_fs_sb.h 2004-09-03 08:46:59.000000000 +0400
++++ linux-2.6.7/include/linux/ext3_fs_sb.h 2004-09-03 08:46:59.000000000 +0400
+@@ -23,9 +23,29 @@
+ #define EXT_INCLUDE
+ #include <linux/blockgroup_lock.h>
+ #include <linux/percpu_counter.h>
++#include <linux/list.h>
+ #endif
+ #endif
+
++#define EXT3_BB_MAX_BLOCKS 30
++struct ext3_free_metadata {
++ unsigned short group;
++ unsigned short num;
++ unsigned short blocks[EXT3_BB_MAX_BLOCKS];
++ struct list_head list;
++};
++
++#define EXT3_BB_MAX_ORDER 14
++
++struct ext3_buddy_group_blocks {
++ sector_t bb_bitmap;
++ sector_t bb_buddy;
++ spinlock_t bb_lock;
++ unsigned bb_counters[EXT3_BB_MAX_ORDER];
++ struct ext3_free_metadata *bb_md_cur;
++ unsigned long bb_tid;
++};
++
+ /*
+ * third extended-fs super-block data in memory
+ */
+@@ -76,6 +96,17 @@
+ char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ int s_jquota_fmt; /* Format of quota to use */
+ #endif
++
++ /* for buddy allocator */
++ struct ext3_buddy_group_blocks *s_buddy_blocks;
++ struct inode *s_buddy;
++ long s_blocks_reserved;
++ spinlock_t s_reserve_lock;
++ struct list_head s_active_transaction;
++ struct list_head s_closed_transaction;
++ struct list_head s_committed_transaction;
++ spinlock_t s_md_lock;
++ tid_t s_last_transaction;
+ };
+
+ #endif /* _LINUX_EXT3_FS_SB */