--- /dev/null
+%patch
+Index: linux-2.6.5-sles9/fs/ext3/extents.c
+===================================================================
+--- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.6.5-sles9/fs/ext3/extents.c 2004-11-09 02:25:56.143726112 +0300
+@@ -0,0 +1,2313 @@
++/*
++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ */
++
++/*
++ * Extents support for EXT3
++ *
++ * TODO:
++ * - ext3_ext_walk_space() should not use ext3_ext_find_extent()
++ * - ext3_ext_calc_credits() could take 'mergable' into account
++ * - ext3*_error() should be used in some situations
++ * - find_goal() [to be tested and improved]
++ * - smart tree reduction
++ * - arch-independence
++ * common on-disk format for big/little-endian arch
++ */
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/time.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/smp_lock.h>
++#include <linux/highuid.h>
++#include <linux/pagemap.h>
++#include <linux/quotaops.h>
++#include <linux/string.h>
++#include <linux/slab.h>
++#include <linux/ext3_extents.h>
++#include <asm/uaccess.h>
++
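++/*
++ * make sure the handle has at least 'needed' credits: try to extend
++ * the running transaction and, failing that, restart it
++ */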
++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
++{
++ int err;
++
++ if (handle->h_buffer_credits > needed)
++ return handle;
++ if (!ext3_journal_extend(handle, needed))
++ return handle;
++ err = ext3_journal_restart(handle, needed);
++ if (err)
++ return ERR_PTR(err);
++
++ return handle;
++}
++
++static inline int
++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree)
++{
++ if (tree->ops->get_write_access)
++ return tree->ops->get_write_access(h, tree->buffer);
++ else
++ return 0;
++}
++
++static inline int
++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree)
++{
++ if (tree->ops->mark_buffer_dirty)
++ return tree->ops->mark_buffer_dirty(h, tree->buffer);
++ else
++ return 0;
++}
++
++/*
++ * could return:
++ * - EROFS
++ * - ENOMEM
++ */
++static int ext3_ext_get_access(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int err;
++
++ if (path->p_bh) {
++ /* path points to block */
++ err = ext3_journal_get_write_access(handle, path->p_bh);
++ } else {
++ /* path points to leaf/index in inode body */
++ err = ext3_ext_get_access_for_root(handle, tree);
++ }
++ return err;
++}
++
++/*
++ * could return:
++ * - EROFS
++ * - ENOMEM
++ * - EIO
++ */
++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int err;
++ if (path->p_bh) {
++ /* path points to block */
++ err = ext3_journal_dirty_metadata(handle, path->p_bh);
++ } else {
++ /* path points to leaf/index in inode body */
++ err = ext3_ext_mark_root_dirty(handle, tree);
++ }
++ return err;
++}
++
++static inline int
++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, struct ext3_extent *ex,
++ int *err)
++{
++ int goal, depth, newblock;
++ struct inode *inode;
++
++ EXT_ASSERT(tree);
++ if (tree->ops->new_block)
++ return tree->ops->new_block(handle, tree, path, ex, err);
++
++ inode = tree->inode;
++ depth = EXT_DEPTH(tree);
++ if (path && depth > 0) {
++ goal = path[depth-1].p_block;
++ } else {
++ struct ext3_inode_info *ei = EXT3_I(inode);
++ unsigned long bg_start;
++ unsigned long colour;
++
++ bg_start = (ei->i_block_group *
++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++ colour = (current->pid % 16) *
++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++ goal = bg_start + colour;
++ }
++
++ newblock = ext3_new_block(handle, inode, goal, err);
++ return newblock;
++}
++
++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
++{
++ struct ext3_extent_header *neh;
++ neh = EXT_ROOT_HDR(tree);
++ neh->eh_generation++;
++}
++
++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->inode->i_sb->s_blocksize -
++ sizeof(struct ext3_extent_header))
++ / sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++ size = 6;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->inode->i_sb->s_blocksize -
++ sizeof(struct ext3_extent_header))
++ / sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++ size = 5;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->buffer_len - sizeof(struct ext3_extent_header))
++ / sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++ size = 3;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->buffer_len -
++ sizeof(struct ext3_extent_header))
++ / sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++ size = 4;
++#endif
++ return size;
++}
++
++static void ext3_ext_show_path(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++ int k, l = path->p_depth;
++
++ ext_debug(tree, "path:");
++ for (k = 0; k <= l; k++, path++) {
++ if (path->p_idx) {
++ ext_debug(tree, " %d->%d", path->p_idx->ei_block,
++ path->p_idx->ei_leaf);
++ } else if (path->p_ext) {
++ ext_debug(tree, " %d:%d:%d",
++ path->p_ext->ee_block,
++ path->p_ext->ee_len,
++ path->p_ext->ee_start);
++ } else
++ ext_debug(tree, " []");
++ }
++ ext_debug(tree, "\n");
++#endif
++}
++
++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent_header *eh;
++ struct ext3_extent *ex;
++ int i;
++
++ if (!path)
++ return;
++
++ eh = path[depth].p_hdr;
++ ex = EXT_FIRST_EXTENT(eh);
++
++ for (i = 0; i < eh->eh_entries; i++, ex++) {
++ ext_debug(tree, "%d:%d:%d ",
++ ex->ee_block, ex->ee_len, ex->ee_start);
++ }
++ ext_debug(tree, "\n");
++#endif
++}
++
++static void ext3_ext_drop_refs(struct ext3_ext_path *path)
++{
++ int depth = path->p_depth;
++ int i;
++
++ for (i = 0; i <= depth; i++, path++)
++ if (path->p_bh) {
++ brelse(path->p_bh);
++ path->p_bh = NULL;
++ }
++}
++
++/*
++ * binary search for closest index by given block
++ */
++static inline void
++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, int block)
++{
++ struct ext3_extent_header *eh = path->p_hdr;
++ struct ext3_extent_idx *ix;
++ int l = 0, k, r;
++
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++ EXT_ASSERT(eh->eh_entries > 0);
++
++ ext_debug(tree, "binsearch for %d(idx): ", block);
++
++ path->p_idx = ix = EXT_FIRST_INDEX(eh);
++
++ r = k = eh->eh_entries;
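++ /* halve the window [l,r) each step; the linear scan below then
++ * settles on the last index with ei_block <= block */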
++ while (k > 1) {
++ k = (r - l) / 2;
++ if (block < ix[l + k].ei_block)
++ r -= k;
++ else
++ l += k;
++ ext_debug(tree, "%d:%d:%d ", k, l, r);
++ }
++
++ ix += l;
++ path->p_idx = ix;
++ ext_debug(tree, " -> %d->%d ", path->p_idx->ei_block, path->p_idx->ei_leaf);
++
++ while (l++ < r) {
++ if (block < ix->ei_block)
++ break;
++ path->p_idx = ix++;
++ }
++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block,
++ path->p_idx->ei_leaf);
++
++#ifdef CHECK_BINSEARCH
++ {
++ struct ext3_extent_idx *chix;
++
++ chix = ix = EXT_FIRST_INDEX(eh);
++ for (k = 0; k < eh->eh_entries; k++, ix++) {
++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) {
++ printk("k=%d, ix=0x%p, first=0x%p\n", k,
++ ix, EXT_FIRST_INDEX(eh));
++ printk("%u <= %u\n",
++ ix->ei_block, ix[-1].ei_block);
++ }
++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block);
++ if (block < ix->ei_block)
++ break;
++ chix = ix;
++ }
++ EXT_ASSERT(chix == path->p_idx);
++ }
++#endif
++
++}
++
++/*
++ * binary search for closest extent by given block
++ */
++static inline void
++ext3_ext_binsearch(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, int block)
++{
++ struct ext3_extent_header *eh = path->p_hdr;
++ struct ext3_extent *ex;
++ int l = 0, k, r;
++
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++
++ if (eh->eh_entries == 0) {
++ /*
++ * this leaf is still empty:
++ * we get such a leaf in the split/add case
++ */
++ return;
++ }
++
++ ext_debug(tree, "binsearch for %d: ", block);
++
++ path->p_ext = ex = EXT_FIRST_EXTENT(eh);
++
++ r = k = eh->eh_entries;
++ while (k > 1) {
++ k = (r - l) / 2;
++ if (block < ex[l + k].ee_block)
++ r -= k;
++ else
++ l += k;
++ ext_debug(tree, "%d:%d:%d ", k, l, r);
++ }
++
++ ex += l;
++ path->p_ext = ex;
++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block,
++ path->p_ext->ee_start, path->p_ext->ee_len);
++
++ while (l++ < r) {
++ if (block < ex->ee_block)
++ break;
++ path->p_ext = ex++;
++ }
++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block,
++ path->p_ext->ee_start, path->p_ext->ee_len);
++
++#ifdef CHECK_BINSEARCH
++ {
++ struct ext3_extent *chex;
++
++ chex = ex = EXT_FIRST_EXTENT(eh);
++ for (k = 0; k < eh->eh_entries; k++, ex++) {
++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block);
++ if (block < ex->ee_block)
++ break;
++ chex = ex;
++ }
++ EXT_ASSERT(chex == path->p_ext);
++ }
++#endif
++
++}
++
++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree)
++{
++ struct ext3_extent_header *eh;
++
++ BUG_ON(tree->buffer_len == 0);
++ ext3_ext_get_access_for_root(handle, tree);
++ eh = EXT_ROOT_HDR(tree);
++ eh->eh_depth = 0;
++ eh->eh_entries = 0;
++ eh->eh_magic = EXT3_EXT_MAGIC;
++ eh->eh_max = ext3_ext_space_root(tree);
++ ext3_ext_mark_root_dirty(handle, tree);
++ ext3_ext_invalidate_cache(tree);
++ return 0;
++}
++
++struct ext3_ext_path *
++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block,
++ struct ext3_ext_path *path)
++{
++ struct ext3_extent_header *eh;
++ struct buffer_head *bh;
++ int depth, i, ppos = 0;
++
++ EXT_ASSERT(tree);
++ EXT_ASSERT(tree->inode);
++ EXT_ASSERT(tree->root);
++
++ eh = EXT_ROOT_HDR(tree);
++ EXT_ASSERT(eh);
++ i = depth = EXT_DEPTH(tree);
++ EXT_ASSERT(eh->eh_max);
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++ EXT_ASSERT(i == 0 || eh->eh_entries > 0);
++
++ /* account possible depth increase */
++ if (!path) {
++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2),
++ GFP_NOFS);
++ if (!path)
++ return ERR_PTR(-ENOMEM);
++ }
++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++ path[0].p_hdr = eh;
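++ /* path[0] describes the root; path[depth] will describe the leaf */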
++
++ /* walk through the tree */
++ while (i) {
++ ext_debug(tree, "depth %d: num %d, max %d\n",
++ ppos, eh->eh_entries, eh->eh_max);
++ ext3_ext_binsearch_idx(tree, path + ppos, block);
++ path[ppos].p_block = path[ppos].p_idx->ei_leaf;
++ path[ppos].p_depth = i;
++ path[ppos].p_ext = NULL;
++
++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block);
++ if (!bh) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ return ERR_PTR(-EIO);
++ }
++ eh = EXT_BLOCK_HDR(bh);
++ ppos++;
++ EXT_ASSERT(ppos <= depth);
++ path[ppos].p_bh = bh;
++ path[ppos].p_hdr = eh;
++ i--;
++ }
++
++ path[ppos].p_depth = i;
++ path[ppos].p_hdr = eh;
++ path[ppos].p_ext = NULL;
++
++ /* find extent */
++ ext3_ext_binsearch(tree, path + ppos, block);
++
++ ext3_ext_show_path(tree, path);
++
++ return path;
++}
++
++/*
++ * insert new index [logical;ptr] into the block at curp;
++ * it checks where to insert: before curp or after curp
++ */
++static int ext3_ext_insert_index(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *curp,
++ int logical, int ptr)
++{
++ struct ext3_extent_idx *ix;
++ int len, err;
++
++ if ((err = ext3_ext_get_access(handle, tree, curp)))
++ return err;
++
++ EXT_ASSERT(logical != curp->p_idx->ei_block);
++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
++ if (logical > curp->p_idx->ei_block) {
++ /* insert after */
++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
++ len = (len - 1) * sizeof(struct ext3_extent_idx);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert new index %d after: %d. "
++ "move %d from 0x%p to 0x%p\n",
++ logical, ptr, len,
++ (curp->p_idx + 1), (curp->p_idx + 2));
++ memmove(curp->p_idx + 2, curp->p_idx + 1, len);
++ }
++ ix = curp->p_idx + 1;
++ } else {
++ /* insert before */
++ len = len * sizeof(struct ext3_extent_idx);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert new index %d before: %d. "
++ "move %d from 0x%p to 0x%p\n",
++ logical, ptr, len,
++ curp->p_idx, (curp->p_idx + 1));
++ memmove(curp->p_idx + 1, curp->p_idx, len);
++ ix = curp->p_idx;
++ }
++
++ ix->ei_block = logical;
++ ix->ei_leaf = ptr;
++ curp->p_hdr->eh_entries++;
++
++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max);
++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr));
++
++ err = ext3_ext_dirty(handle, tree, curp);
++ ext3_std_error(tree->inode->i_sb, err);
++
++ return err;
++}
++
++/*
++ * routine inserts new subtree into the path, using free index entry
++ * at depth 'at':
++ * - allocates all needed blocks (new leaf and all intermediate index blocks)
++ * - makes decision where to split
++ * - moves remaining extents and index entries (right to the split point)
++ * into the newly allocated blocks
++ * - initializes subtree
++ */
++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext, int at)
++{
++ struct buffer_head *bh = NULL;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent_header *neh;
++ struct ext3_extent_idx *fidx;
++ struct ext3_extent *ex;
++ int i = at, k, m, a;
++ unsigned long newblock, oldblock, border;
++ int *ablocks = NULL; /* array of allocated blocks */
++ int err = 0;
++
++ /* make decision: where to split? */
++ /* FIXME: now decision is the simplest: at current extent */
++
++ /* if current leaf will be split, then we should use
++ * border from split point */
++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr));
++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
++ border = path[depth].p_ext[1].ee_block;
++ ext_debug(tree, "leaf will be split."
++ " next leaf starts at %d\n",
++ (int)border);
++ } else {
++ border = newext->ee_block;
++ ext_debug(tree, "leaf will be added."
++ " next leaf starts at %d\n",
++ (int)border);
++ }
++
++ /*
++ * if error occurs, then we break processing
++ * and turn filesystem read-only. so, index won't
++ * be inserted and tree will be in consistent
++ * state. next mount will repair buffers too
++ */
++
++ /*
++ * get array to track all allocated blocks;
++ * we need this to handle errors and free these
++ * blocks upon failure
++ */
++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS);
++ if (!ablocks)
++ return -ENOMEM;
++ memset(ablocks, 0, sizeof(unsigned long) * depth);
++
++ /* allocate all needed blocks */
++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at);
++ for (a = 0; a < depth - at; a++) {
++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++ if (newblock == 0)
++ goto cleanup;
++ ablocks[a] = newblock;
++ }
++
++ /* initialize new leaf */
++ newblock = ablocks[--a];
++ EXT_ASSERT(newblock);
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ goto cleanup;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh)))
++ goto cleanup;
++
++ neh = EXT_BLOCK_HDR(bh);
++ neh->eh_entries = 0;
++ neh->eh_max = ext3_ext_space_block(tree);
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ neh->eh_depth = 0;
++ ex = EXT_FIRST_EXTENT(neh);
++
++ /* move remainder of path[depth] to the new leaf */
++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max);
++ /* start copy from next extent */
++ /* TODO: we could do it by single memmove */
++ m = 0;
++ path[depth].p_ext++;
++ while (path[depth].p_ext <=
++ EXT_MAX_EXTENT(path[depth].p_hdr)) {
++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n",
++ path[depth].p_ext->ee_block,
++ path[depth].p_ext->ee_start,
++ path[depth].p_ext->ee_len,
++ newblock);
++ memmove(ex++, path[depth].p_ext++,
++ sizeof(struct ext3_extent));
++ neh->eh_entries++;
++ m++;
++ }
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto cleanup;
++ brelse(bh);
++ bh = NULL;
++
++ /* correct old leaf */
++ if (m) {
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ goto cleanup;
++ path[depth].p_hdr->eh_entries -= m;
++ if ((err = ext3_ext_dirty(handle, tree, path + depth)))
++ goto cleanup;
++
++ }
++
++ /* create intermediate indexes */
++ k = depth - at - 1;
++ EXT_ASSERT(k >= 0);
++ if (k)
++ ext_debug(tree, "create %d intermediate indices\n", k);
++ /* insert new index into current index block */
++ /* current depth stored in i var */
++ i = depth - 1;
++ while (k--) {
++ oldblock = newblock;
++ newblock = ablocks[--a];
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ goto cleanup;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh)))
++ goto cleanup;
++
++ neh = EXT_BLOCK_HDR(bh);
++ neh->eh_entries = 1;
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ neh->eh_max = ext3_ext_space_block_idx(tree);
++ neh->eh_depth = depth - i;
++ fidx = EXT_FIRST_INDEX(neh);
++ fidx->ei_block = border;
++ fidx->ei_leaf = oldblock;
++
++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n",
++ i, newblock, border, oldblock);
++ /* copy indexes */
++ m = 0;
++ path[i].p_idx++;
++
++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx,
++ EXT_MAX_INDEX(path[i].p_hdr));
++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) ==
++ EXT_LAST_INDEX(path[i].p_hdr));
++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
++ ext_debug(tree, "%d: move %d:%d in new index %lu\n",
++ i, path[i].p_idx->ei_block,
++ path[i].p_idx->ei_leaf, newblock);
++ memmove(++fidx, path[i].p_idx++,
++ sizeof(struct ext3_extent_idx));
++ neh->eh_entries++;
++ EXT_ASSERT(neh->eh_entries <= neh->eh_max);
++ m++;
++ }
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto cleanup;
++ brelse(bh);
++ bh = NULL;
++
++ /* correct old index */
++ if (m) {
++ err = ext3_ext_get_access(handle, tree, path + i);
++ if (err)
++ goto cleanup;
++ path[i].p_hdr->eh_entries -= m;
++ err = ext3_ext_dirty(handle, tree, path + i);
++ if (err)
++ goto cleanup;
++ }
++
++ i--;
++ }
++
++ /* insert new index */
++ if (!err)
++ err = ext3_ext_insert_index(handle, tree, path + at,
++ border, newblock);
++
++cleanup:
++ if (bh) {
++ if (buffer_locked(bh))
++ unlock_buffer(bh);
++ brelse(bh);
++ }
++
++ if (err) {
++ /* free all allocated blocks in error case */
++ for (i = 0; i < depth; i++) {
++ if (!ablocks[i])
++ continue;
++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
++ }
++ }
++ kfree(ablocks);
++
++ return err;
++}
++
++/*
++ * routine implements tree growing procedure:
++ * - allocates new block
++ * - moves top-level data (index block or leaf) into the new block
++ * - initializes new top-level, creating index that points to the
++ * just created block
++ */
++static int ext3_ext_grow_indepth(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_ext_path *curp = path;
++ struct ext3_extent_header *neh;
++ struct ext3_extent_idx *fidx;
++ struct buffer_head *bh;
++ unsigned long newblock;
++ int err = 0;
++
++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++ if (newblock == 0)
++ return err;
++
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ ext3_std_error(tree->inode->i_sb, err);
++ return err;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh))) {
++ unlock_buffer(bh);
++ goto out;
++ }
++
++ /* move top-level index/leaf into new block */
++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len);
++
++ /* set size of new block */
++ neh = EXT_BLOCK_HDR(bh);
++ /* old root could have indexes or leaves,
++ * so calculate eh_max the right way */
++ if (EXT_DEPTH(tree))
++ neh->eh_max = ext3_ext_space_block_idx(tree);
++ else
++ neh->eh_max = ext3_ext_space_block(tree);
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto out;
++
++ /* create index in new top-level index: num,max,pointer */
++ if ((err = ext3_ext_get_access(handle, tree, curp)))
++ goto out;
++
++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC;
++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree);
++ curp->p_hdr->eh_entries = 1;
++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
++ /* FIXME: it works, but actually path[0] can be index */
++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
++ curp->p_idx->ei_leaf = newblock;
++
++ neh = EXT_ROOT_HDR(tree);
++ fidx = EXT_FIRST_INDEX(neh);
++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n",
++ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf);
++
++ neh->eh_depth = path->p_depth + 1;
++ err = ext3_ext_dirty(handle, tree, curp);
++out:
++ brelse(bh);
++
++ return err;
++}
++
++/*
++ * routine finds empty index and adds new leaf. if no free index is
++ * found, then it requests in-depth growing
++ */
++static int ext3_ext_create_new_leaf(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_ext_path *curp;
++ int depth, i, err = 0;
++
++repeat:
++ i = depth = EXT_DEPTH(tree);
++
++ /* walk up the tree and look for free index entry */
++ curp = path + depth;
++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
++ i--;
++ curp--;
++ }
++
++ /* we use already allocated block for index block,
++ * so subsequent data blocks should be contiguous */
++ if (EXT_HAS_FREE_INDEX(curp)) {
++ /* if we found index with free entry, then use that
++ * entry: create all needed subtree and add new leaf */
++ err = ext3_ext_split(handle, tree, path, newext, i);
++
++ /* refill path */
++ ext3_ext_drop_refs(path);
++ path = ext3_ext_find_extent(tree, newext->ee_block, path);
++ if (IS_ERR(path))
++ err = PTR_ERR(path);
++ } else {
++ /* tree is full, time to grow in depth */
++ err = ext3_ext_grow_indepth(handle, tree, path, newext);
++
++ /* refill path */
++ ext3_ext_drop_refs(path);
++ path = ext3_ext_find_extent(tree, newext->ee_block, path);
++ if (IS_ERR(path))
++ err = PTR_ERR(path);
++
++ /*
++ * only the first grow (depth 0 -> 1) produces free space;
++ * in all other cases we have to split the grown tree
++ */
++ depth = EXT_DEPTH(tree);
++ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
++ /* now we need split */
++ goto repeat;
++ }
++ }
++
++ if (err)
++ return err;
++
++ return 0;
++}
++
++/*
++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK
++ * NOTE: it considers block number from index entry as
++ * allocated block. thus, index entries have to be consistent
++ * with leaves
++ */
++static unsigned long
++ext3_ext_next_allocated_block(struct ext3_ext_path *path)
++{
++ int depth;
++
++ EXT_ASSERT(path != NULL);
++ depth = path->p_depth;
++
++ if (depth == 0 && path->p_ext == NULL)
++ return EXT_MAX_BLOCK;
++
++ /* FIXME: what if index isn't full ?! */
++ while (depth >= 0) {
++ if (depth == path->p_depth) {
++ /* leaf */
++ if (path[depth].p_ext !=
++ EXT_LAST_EXTENT(path[depth].p_hdr))
++ return path[depth].p_ext[1].ee_block;
++ } else {
++ /* index */
++ if (path[depth].p_idx !=
++ EXT_LAST_INDEX(path[depth].p_hdr))
++ return path[depth].p_idx[1].ei_block;
++ }
++ depth--;
++ }
++
++ return EXT_MAX_BLOCK;
++}
++
++/*
++ * returns first allocated block from next leaf or EXT_MAX_BLOCK
++ */
++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int depth;
++
++ EXT_ASSERT(path != NULL);
++ depth = path->p_depth;
++
++ /* zero-depth tree has no leaf blocks at all */
++ if (depth == 0)
++ return EXT_MAX_BLOCK;
++
++ /* go to index block */
++ depth--;
++
++ while (depth >= 0) {
++ if (path[depth].p_idx !=
++ EXT_LAST_INDEX(path[depth].p_hdr))
++ return path[depth].p_idx[1].ei_block;
++ depth--;
++ }
++
++ return EXT_MAX_BLOCK;
++}
++
++/*
++ * if leaf gets modified and modified extent is first in the leaf
++ * then we have to correct all indexes above
++ * TODO: do we need to correct tree in all cases?
++ */
++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ struct ext3_extent_header *eh;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent *ex;
++ unsigned long border;
++ int k, err = 0;
++
++ eh = path[depth].p_hdr;
++ ex = path[depth].p_ext;
++ EXT_ASSERT(ex);
++ EXT_ASSERT(eh);
++
++ if (depth == 0) {
++ /* there is no tree at all */
++ return 0;
++ }
++
++ if (ex != EXT_FIRST_EXTENT(eh)) {
++ /* we correct the tree only if the first extent in the leaf got modified */
++ return 0;
++ }
++
++ /*
++ * TODO: we need correction if border is smaller than current one
++ */
++ k = depth - 1;
++ border = path[depth].p_ext->ee_block;
++ if ((err = ext3_ext_get_access(handle, tree, path + k)))
++ return err;
++ path[k].p_idx->ei_block = border;
++ if ((err = ext3_ext_dirty(handle, tree, path + k)))
++ return err;
++
++ while (k--) {
++ /* change all left-side indexes */
++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
++ break;
++ if ((err = ext3_ext_get_access(handle, tree, path + k)))
++ break;
++ path[k].p_idx->ei_block = border;
++ if ((err = ext3_ext_dirty(handle, tree, path + k)))
++ break;
++ }
++
++ return err;
++}
++
++static inline int
++ext3_can_extents_be_merged(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex1,
++ struct ext3_extent *ex2)
++{
++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block)
++ return 0;
++
++#ifdef AGRESSIVE_TEST
++ if (ex1->ee_len >= 4)
++ return 0;
++#endif
++
++ if (!tree->ops->mergable)
++ return 1;
++
++ return tree->ops->mergable(ex1, ex2);
++}
++
++/*
++ * this routine tries to merge requested extent into the existing
++ * extent or inserts requested extent as new one into the tree,
++ * creating new leaf in no-space case
++ */
++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_extent_header *eh;
++ struct ext3_extent *ex, *fex;
++ struct ext3_extent *nearex; /* nearest extent */
++ struct ext3_ext_path *npath = NULL;
++ int depth, len, err, next;
++
++ EXT_ASSERT(newext->ee_len > 0);
++ EXT_ASSERT(newext->ee_len < EXT_CACHE_MARK);
++ depth = EXT_DEPTH(tree);
++ ex = path[depth].p_ext;
++ EXT_ASSERT(path[depth].p_hdr);
++
++ /* try to insert block into found extent and return */
++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) {
++ ext_debug(tree, "append %d block to %d:%d (from %d)\n",
++ newext->ee_len, ex->ee_block, ex->ee_len,
++ ex->ee_start);
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ return err;
++ ex->ee_len += newext->ee_len;
++ eh = path[depth].p_hdr;
++ nearex = ex;
++ goto merge;
++ }
++
++repeat:
++ depth = EXT_DEPTH(tree);
++ eh = path[depth].p_hdr;
++ if (eh->eh_entries < eh->eh_max)
++ goto has_space;
++
++ /* probably next leaf has space for us? */
++ fex = EXT_LAST_EXTENT(eh);
++ next = ext3_ext_next_leaf_block(tree, path);
++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) {
++ ext_debug(tree, "next leaf block - %d\n", next);
++ EXT_ASSERT(!npath);
++ npath = ext3_ext_find_extent(tree, next, NULL);
++ if (IS_ERR(npath))
++ return PTR_ERR(npath);
++ EXT_ASSERT(npath->p_depth == path->p_depth);
++ eh = npath[depth].p_hdr;
++ if (eh->eh_entries < eh->eh_max) {
++ ext_debug(tree, "next leaf isn't full (%d)\n",
++ eh->eh_entries);
++ path = npath;
++ goto repeat;
++ }
++ ext_debug(tree, "next leaf has no free space (%d,%d)\n",
++ eh->eh_entries, eh->eh_max);
++ }
++
++ /*
++ * there is no free space in found leaf
++ * we're gonna add new leaf in the tree
++ */
++ err = ext3_ext_create_new_leaf(handle, tree, path, newext);
++ if (err)
++ goto cleanup;
++ depth = EXT_DEPTH(tree);
++ eh = path[depth].p_hdr;
++
++has_space:
++ nearex = path[depth].p_ext;
++
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ goto cleanup;
++
++ if (!nearex) {
++ /* there is no extent in this leaf, create first one */
++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n",
++ newext->ee_block, newext->ee_start,
++ newext->ee_len);
++ path[depth].p_ext = EXT_FIRST_EXTENT(eh);
++ } else if (newext->ee_block > nearex->ee_block) {
++ EXT_ASSERT(newext->ee_block != nearex->ee_block);
++ if (nearex != EXT_LAST_EXTENT(eh)) {
++ len = EXT_MAX_EXTENT(eh) - nearex;
++ len = (len - 1) * sizeof(struct ext3_extent);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, "
++ "move %d from 0x%p to 0x%p\n",
++ newext->ee_block, newext->ee_start,
++ newext->ee_len,
++ nearex, len, nearex + 1, nearex + 2);
++ memmove(nearex + 2, nearex + 1, len);
++ }
++ path[depth].p_ext = nearex + 1;
++ } else {
++ EXT_ASSERT(newext->ee_block != nearex->ee_block);
++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, "
++ "move %d from 0x%p to 0x%p\n",
++ newext->ee_block, newext->ee_start, newext->ee_len,
++ nearex, len, nearex + 1, nearex + 2);
++ memmove(nearex + 1, nearex, len);
++ path[depth].p_ext = nearex;
++ }
++
++ eh->eh_entries++;
++ nearex = path[depth].p_ext;
++ nearex->ee_block = newext->ee_block;
++ nearex->ee_start = newext->ee_start;
++ nearex->ee_len = newext->ee_len;
++ /* FIXME: support for large fs */
++ nearex->ee_start_hi = 0;
++
++merge:
++ /* try to merge extents to the right */
++ while (nearex < EXT_LAST_EXTENT(eh)) {
++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1))
++ break;
++ /* merge with next extent! */
++ nearex->ee_len += nearex[1].ee_len;
++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
++ len = (EXT_LAST_EXTENT(eh) - nearex - 1)
++ * sizeof(struct ext3_extent);
++ memmove(nearex + 1, nearex + 2, len);
++ }
++ eh->eh_entries--;
++ EXT_ASSERT(eh->eh_entries > 0);
++ }
++
++ /* try to merge extents to the left */
++
++ /* time to correct all indexes above */
++ err = ext3_ext_correct_indexes(handle, tree, path);
++ if (err)
++ goto cleanup;
++
++ err = ext3_ext_dirty(handle, tree, path + depth);
++
++cleanup:
++ if (npath) {
++ ext3_ext_drop_refs(npath);
++ kfree(npath);
++ }
++ ext3_ext_tree_changed(tree);
++ ext3_ext_invalidate_cache(tree);
++ return err;
++}
++
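++/*
++ * walk the range [block, block+num): for every extent or gap found,
++ * call 'func'; the callback returns EXT_CONTINUE, EXT_BREAK or
++ * EXT_REPEAT to drive the scan
++ */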
++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block,
++ unsigned long num, ext_prepare_callback func)
++{
++ struct ext3_ext_path *path = NULL;
++ struct ext3_extent *ex, cbex;
++ unsigned long next, start = 0, end = 0;
++ unsigned long last = block + num;
++ int depth, exists, err = 0;
++
++ EXT_ASSERT(tree);
++ EXT_ASSERT(func);
++ EXT_ASSERT(tree->inode);
++ EXT_ASSERT(tree->root);
++
++ while (block < last && block != EXT_MAX_BLOCK) {
++ num = last - block;
++ /* find extent for this block */
++ path = ext3_ext_find_extent(tree, block, path);
++ if (IS_ERR(path)) {
++ err = PTR_ERR(path);
++ path = NULL;
++ break;
++ }
++
++ depth = EXT_DEPTH(tree);
++ EXT_ASSERT(path[depth].p_hdr);
++ ex = path[depth].p_ext;
++ next = ext3_ext_next_allocated_block(path);
++
++ exists = 0;
++ if (!ex) {
++ /* there is no extent yet, so try to allocate
++ * all requested space */
++ start = block;
++ end = block + num;
++ } else if (ex->ee_block > block) {
++ /* need to allocate space before found extent */
++ start = block;
++ end = ex->ee_block;
++ if (block + num < end)
++ end = block + num;
++ } else if (block >= ex->ee_block + ex->ee_len) {
++ /* need to allocate space after found extent */
++ start = block;
++ end = block + num;
++ if (end >= next)
++ end = next;
++ } else if (block >= ex->ee_block) {
++ /*
++ * some part of requested space is covered
++ * by found extent
++ */
++ start = block;
++ end = ex->ee_block + ex->ee_len;
++ if (block + num < end)
++ end = block + num;
++ exists = 1;
++ } else {
++ BUG();
++ }
++ EXT_ASSERT(end > start);
++
++ if (!exists) {
++ cbex.ee_block = start;
++ cbex.ee_len = end - start;
++ cbex.ee_start = 0;
++ } else
++ cbex = *ex;
++
++ EXT_ASSERT(path[depth].p_hdr);
++ err = func(tree, path, &cbex, exists);
++ ext3_ext_drop_refs(path);
++
++ if (err < 0)
++ break;
++ if (err == EXT_REPEAT)
++ continue;
++ else if (err == EXT_BREAK) {
++ err = 0;
++ break;
++ }
++
++ if (EXT_DEPTH(tree) != depth) {
++ /* depth was changed. we have to realloc path */
++ kfree(path);
++ path = NULL;
++ }
++
++ block = cbex.ee_block + cbex.ee_len;
++ }
++
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
++
++ return err;
++}
++
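++/*
++ * single-slot cache: remembers the last mapped extent or gap so that
++ * repeated lookups in the same area can skip the tree walk
++ */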
++static inline void
++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block,
++ __u32 len, __u32 start, int type)
++{
++ EXT_ASSERT(len > 0);
++ if (tree->cex) {
++ tree->cex->ec_type = type;
++ tree->cex->ec_block = block;
++ tree->cex->ec_len = len;
++ tree->cex->ec_start = start;
++ }
++}
++
++/*
++ * this routine calculates boundaries of the gap the requested block
++ * fits into, and caches this gap
++ */
++static inline void
++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ unsigned long block)
++{
++ int depth = EXT_DEPTH(tree);
++ unsigned long lblock, len;
++ struct ext3_extent *ex;
++
++ if (!tree->cex)
++ return;
++
++ ex = path[depth].p_ext;
++ if (ex == NULL) {
++ /* there is no extent yet, so gap is [0;-] */
++ lblock = 0;
++ len = EXT_MAX_BLOCK;
++ ext_debug(tree, "cache gap(whole file):");
++ } else if (block < ex->ee_block) {
++ lblock = block;
++ len = ex->ee_block - block;
++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]",
++ (unsigned long) block,
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len);
++ } else if (block >= ex->ee_block + ex->ee_len) {
++ lblock = ex->ee_block + ex->ee_len;
++ len = ext3_ext_next_allocated_block(path);
++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu",
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len,
++ (unsigned long) block);
++ EXT_ASSERT(len > lblock);
++ len = len - lblock;
++ } else {
++ lblock = len = 0;
++ BUG();
++ }
++
++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len);
++ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP);
++}
++
++static inline int
++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block,
++ struct ext3_extent *ex)
++{
++ struct ext3_ext_cache *cex = tree->cex;
++
++ /* is there cache storage at all? */
++ if (!cex)
++ return EXT3_EXT_CACHE_NO;
++
++ /* does the cache have valid data? */
++ if (cex->ec_type == EXT3_EXT_CACHE_NO)
++ return EXT3_EXT_CACHE_NO;
++
++ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP ||
++ cex->ec_type == EXT3_EXT_CACHE_EXTENT);
++ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
++ ex->ee_block = cex->ec_block;
++ ex->ee_start = cex->ec_start;
++ ex->ee_len = cex->ec_len;
++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n",
++ (unsigned long) block,
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len,
++ (unsigned long) ex->ee_start);
++ return cex->ec_type;
++ }
++
++ /* not in cache */
++ return EXT3_EXT_CACHE_NO;
++}
++
++/*
++ * routine removes index from the index block.
++ * it's used in truncate case only; thus all requests are for the
++ * last index in the block only
++ */
++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ struct buffer_head *bh;
++ int err;
++
++ /* free index block */
++ path--;
++ EXT_ASSERT(path->p_hdr->eh_entries);
++ if ((err = ext3_ext_get_access(handle, tree, path)))
++ return err;
++ path->p_hdr->eh_entries--;
++ if ((err = ext3_ext_dirty(handle, tree, path)))
++ return err;
++ ext_debug(tree, "index is empty, remove it, free block %d\n",
++ path->p_idx->ei_leaf);
++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
++ return err;
++}
++
++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int depth = EXT_DEPTH(tree);
++ int needed;
++
++ if (path) {
++ /* probably there is space in leaf? */
++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max)
++ return 1;
++ }
++
++ /*
++ * the worst case we're expecting is creation of the
++ * new root (growing in depth) with index splitting.
++ * for splitting we have to consider depth + 1, because
++ * previous growing could increase it
++ */
++ depth = depth + 1;
++
++ /*
++ * growing in depth:
++ * block allocation + new root + old root
++ */
++ needed = EXT3_ALLOC_NEEDED + 2;
++
++ /* index split. we may need to:
++ * - allocate intermediate indexes and new leaf,
++ * - change two blocks at each level (but root),
++ * - modify root block (inode)
++ */
++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1;
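++ /* e.g. for a depth-1 tree this yields 3 + 2 + 2*3 + 2*2 + 1 = 16
++ * credits, assuming EXT3_ALLOC_NEEDED is 3 (bitmap + group
++ * descriptor + superblock); a rough upper bound, not an exact count */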
++
++ return needed;
++}
++
++static int
++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, unsigned long start,
++ unsigned long end)
++{
++ struct ext3_extent *ex, tex;
++ struct ext3_ext_path *npath;
++ int depth, creds, err;
++
++ depth = EXT_DEPTH(tree);
++ ex = path[depth].p_ext;
++ EXT_ASSERT(ex);
++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1);
++ EXT_ASSERT(ex->ee_block < start);
++
++ /* calculate tail extent */
++ tex.ee_block = end + 1;
++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len);
++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block;
++
++ creds = ext3_ext_calc_credits_for_insert(tree, path);
++ handle = ext3_ext_journal_restart(handle, creds);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ /* calculate head extent. use primary extent */
++ err = ext3_ext_get_access(handle, tree, path + depth);
++ if (err)
++ return err;
++ ex->ee_len = start - ex->ee_block;
++ err = ext3_ext_dirty(handle, tree, path + depth);
++ if (err)
++ return err;
++
++ /* FIXME: some callback to free underlying resource
++ * and correct ee_start? */
++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n",
++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len);
++
++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL);
++ if (IS_ERR(npath))
++ return PTR_ERR(npath);
++ depth = EXT_DEPTH(tree);
++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block);
++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len);
++
++ err = ext3_ext_insert_extent(handle, tree, npath, &tex);
++ ext3_ext_drop_refs(npath);
++ kfree(npath);
++
++ return err;
++}
++
++static int
++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, unsigned long start,
++ unsigned long end)
++{
++ struct ext3_extent *ex, *fu = NULL, *lu, *le;
++ int err = 0, correct_index = 0;
++ int depth = EXT_DEPTH(tree), credits;
++ struct ext3_extent_header *eh;
++ unsigned a, b, block, num;
++
++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end);
++ if (!path[depth].p_hdr)
++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh);
++ eh = path[depth].p_hdr;
++ EXT_ASSERT(eh);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++
++ /* find where to start removing */
++ le = ex = EXT_LAST_EXTENT(eh);
++ while (ex != EXT_FIRST_EXTENT(eh)) {
++ if (ex->ee_block <= end)
++ break;
++ ex--;
++ }
++
++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) {
++ /* removal of an internal part of the extent was requested;
++ * tail and head must be placed in different extents,
++ * so we have to insert one more extent */
++ path[depth].p_ext = ex;
++ return ext3_ext_split_for_rm(handle, tree, path, start, end);
++ }
++
++ lu = ex;
++ while (ex >= EXT_FIRST_EXTENT(eh) &&
++ ex->ee_block + ex->ee_len > start) {
++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len);
++ path[depth].p_ext = ex;
++
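++ /* [a, b] is the part of this extent that overlaps [start, end] */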
++ a = ex->ee_block > start ? ex->ee_block : start;
++ b = ex->ee_block + ex->ee_len - 1 < end ?
++ ex->ee_block + ex->ee_len - 1 : end;
++
++ ext_debug(tree, " border %u:%u\n", a, b);
++
++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) {
++ block = 0;
++ num = 0;
++ BUG();
++ } else if (a != ex->ee_block) {
++ /* remove tail of the extent */
++ block = ex->ee_block;
++ num = a - block;
++ } else if (b != ex->ee_block + ex->ee_len - 1) {
++ /* remove head of the extent */
++ block = a;
++ num = b - a;
++ } else {
++ /* remove whole extent: excellent! */
++ block = ex->ee_block;
++ num = 0;
++ EXT_ASSERT(a == ex->ee_block &&
++ b == ex->ee_block + ex->ee_len - 1);
++ }
++
++ if (ex == EXT_FIRST_EXTENT(eh))
++ correct_index = 1;
++
++ credits = 1;
++ if (correct_index)
++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1;
++ if (tree->ops->remove_extent_credits)
++ credits += tree->ops->remove_extent_credits(tree, ex, a, b);
++
++ handle = ext3_ext_journal_restart(handle, credits);
++ if (IS_ERR(handle)) {
++ err = PTR_ERR(handle);
++ goto out;
++ }
++
++ err = ext3_ext_get_access(handle, tree, path + depth);
++ if (err)
++ goto out;
++
++ if (tree->ops->remove_extent)
++ err = tree->ops->remove_extent(tree, ex, a, b);
++ if (err)
++ goto out;
++
++ if (num == 0) {
++ /* this extent is removed entirely; mark slot unused */
++ ex->ee_start = 0;
++ eh->eh_entries--;
++ fu = ex;
++ }
++
++ ex->ee_block = block;
++ ex->ee_len = num;
++
++ err = ext3_ext_dirty(handle, tree, path + depth);
++ if (err)
++ goto out;
++
++ ext_debug(tree, "new extent: %u:%u:%u\n",
++ ex->ee_block, ex->ee_len, ex->ee_start);
++ ex--;
++ }
++
++ if (fu) {
++ /* reuse unused slots */
++ while (lu < le) {
++ if (lu->ee_start) {
++ *fu = *lu;
++ lu->ee_start = 0;
++ fu++;
++ }
++ lu++;
++ }
++ }
++
++ if (correct_index && eh->eh_entries)
++ err = ext3_ext_correct_indexes(handle, tree, path);
++
++ /* if this leaf is free, then we should
++ * remove it from index block above */
++ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
++ err = ext3_ext_rm_idx(handle, tree, path + depth);
++
++out:
++ return err;
++}
++
++
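++/* find the last index in 'hdr' whose subtree may cover 'block' */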
++static struct ext3_extent_idx *
++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block)
++{
++ struct ext3_extent_idx *ix;
++
++ ix = EXT_LAST_INDEX(hdr);
++ while (ix != EXT_FIRST_INDEX(hdr)) {
++ if (ix->ei_block <= block)
++ break;
++ ix--;
++ }
++ return ix;
++}
++
++/*
++ * returns 1 if current index has to be freed (even partially)
++ */
++static inline int
++ext3_ext_more_to_rm(struct ext3_ext_path *path)
++{
++ EXT_ASSERT(path->p_idx);
++
++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
++ return 0;
++
++ /*
++ * if truncate on a deeper level happened, it wasn't partial,
++ * so we have to consider the current index for truncation
++ */
++ if (path->p_hdr->eh_entries == path->p_block)
++ return 0;
++ return 1;
++}
++
++int ext3_ext_remove_space(struct ext3_extents_tree *tree,
++ unsigned long start, unsigned long end)
++{
++ struct inode *inode = tree->inode;
++ struct super_block *sb = inode->i_sb;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_ext_path *path;
++ handle_t *handle;
++ int i = 0, err = 0;
++
++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end);
++
++ /* probably first extent we're gonna free will be last in block */
++ handle = ext3_journal_start(inode, depth + 1);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ ext3_ext_invalidate_cache(tree);
++
++ /*
++ * we start scanning from right side freeing all the blocks
++ * after i_size and walking into the deep
++ */
++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
++ if (path == NULL) {
++ ext3_error(sb, "ext3_ext_remove_space",
++ "Can't allocate path array");
++ ext3_journal_stop(handle);
++ return -ENOMEM;
++ }
++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++ path[i].p_hdr = EXT_ROOT_HDR(tree);
++
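++ /*
++ * iterative depth-first walk: i is the current level, p_idx the
++ * cursor within it; leaves are emptied first, then indexes that
++ * became empty are removed on the way back up
++ */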
++ while (i >= 0 && err == 0) {
++ if (i == depth) {
++ /* this is leaf block */
++ err = ext3_ext_rm_leaf(handle, tree, path, start, end);
++ /* root level has p_bh == NULL, brelse() eats this */
++ brelse(path[i].p_bh);
++ i--;
++ continue;
++ }
++
++ /* this is index block */
++ if (!path[i].p_hdr) {
++ ext_debug(tree, "initialize header\n");
++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh);
++ }
++
++ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max);
++ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC);
++
++ if (!path[i].p_idx) {
++ /* this level hasn't been touched yet */
++ path[i].p_idx =
++ ext3_ext_last_covered(path[i].p_hdr, end);
++ path[i].p_block = path[i].p_hdr->eh_entries + 1;
++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n",
++ path[i].p_hdr, path[i].p_hdr->eh_entries);
++ } else {
++ /* we've already been here; look at the next index */
++ path[i].p_idx--;
++ }
++
++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n",
++ i, EXT_FIRST_INDEX(path[i].p_hdr),
++ path[i].p_idx);
++ if (ext3_ext_more_to_rm(path + i)) {
++ /* go to the next level */
++ ext_debug(tree, "move to level %d (block %d)\n",
++ i + 1, path[i].p_idx->ei_leaf);
++ memset(path + i + 1, 0, sizeof(*path));
++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf);
++ if (!path[i+1].p_bh) {
++ /* should we reset i_size? */
++ err = -EIO;
++ break;
++ }
++ /* record the actual number of indexes so we can tell
++ * at the next iteration whether it changed */
++ path[i].p_block = path[i].p_hdr->eh_entries;
++ i++;
++ } else {
++ /* we finish processing this index, go up */
++ if (path[i].p_hdr->eh_entries == 0 && i > 0) {
++ /* index is empty, remove it;
++ * the handle must already be prepared by the
++ * leaf removal above */
++ err = ext3_ext_rm_idx(handle, tree, path + i);
++ }
++ /* root level has p_bh == NULL, brelse() eats this */
++ brelse(path[i].p_bh);
++ i--;
++ ext_debug(tree, "return to level %d\n", i);
++ }
++ }
++
++ /* TODO: flexible tree reduction should be here */
++ if (path->p_hdr->eh_entries == 0) {
++ /*
++ * truncate to zero freed the whole tree,
++ * so we need to correct eh_depth
++ */
++ err = ext3_ext_get_access(handle, tree, path);
++ if (err == 0) {
++ EXT_ROOT_HDR(tree)->eh_depth = 0;
++ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree);
++ err = ext3_ext_dirty(handle, tree, path);
++ }
++ }
++ ext3_ext_tree_changed(tree);
++
++ kfree(path);
++ ext3_journal_stop(handle);
++
++ return err;
++}
++
++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks)
++{
++ int lcap, icap, rcap, leafs, idxs, num;
++
++ rcap = ext3_ext_space_root(tree);
++ if (blocks <= rcap) {
++ /* all extents fit to the root */
++ return 0;
++ }
++
++ rcap = ext3_ext_space_root_idx(tree);
++ lcap = ext3_ext_space_block(tree);
++ icap = ext3_ext_space_block_idx(tree);
++
++ num = leafs = (blocks + lcap - 1) / lcap;
++ if (leafs <= rcap) {
++ /* all pointers to leafs fit to the root */
++ return leafs;
++ }
++
++ /* ok. we need separate index block(s) to link all leaf blocks */
++ idxs = (leafs + icap - 1) / icap;
++ do {
++ num += idxs;
++ idxs = (idxs + icap - 1) / icap;
++ } while (idxs > rcap);
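++ /* e.g. with 4k blocks (~340 records per block, assuming 12-byte
++ * extent/index records and a 60-byte root in i_data), mapping
++ * 100000 blocks takes 295 leafs plus one index block: num = 296 */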
++
++ return num;
++}
++
++/*
++ * called at mount time
++ */
++void ext3_ext_init(struct super_block *sb)
++{
++ /*
++ * possible initialization would be here
++ */
++
++ if (test_opt(sb, EXTENTS)) {
++ printk("EXT3-fs: file extents enabled");
++#ifdef AGRESSIVE_TEST
++ printk(", agressive tests");
++#endif
++#ifdef CHECK_BINSEARCH
++ printk(", check binsearch");
++#endif
++ printk("\n");
++ }
++}
++
++/*
++ * called at umount time
++ */
++void ext3_ext_release(struct super_block *sb)
++{
++}
++
++/************************************************************************
++ * VFS related routines
++ ************************************************************************/
++
++static int ext3_get_inode_write_access(handle_t *handle, void *buffer)
++{
++ /* we use in-core data, not bh */
++ return 0;
++}
++
++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer)
++{
++ struct inode *inode = buffer;
++ return ext3_mark_inode_dirty(handle, inode);
++}
++
++static int ext3_ext_mergable(struct ext3_extent *ex1,
++ struct ext3_extent *ex2)
++{
++ /* FIXME: support for large fs */
++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start)
++ return 1;
++ return 0;
++}
++
++static int
++ext3_remove_blocks_credits(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex,
++ unsigned long from, unsigned long to)
++{
++ int needed;
++
++ /* at present, extent can't cross block group */
++ needed = 4; /* bitmap + group desc + sb + inode */
++
++#ifdef CONFIG_QUOTA
++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++ return needed;
++}
++
++static int
++ext3_remove_blocks(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex,
++ unsigned long from, unsigned long to)
++{
++ int needed = ext3_remove_blocks_credits(tree, ex, from, to);
++ handle_t *handle = ext3_journal_start(tree->inode, needed);
++ struct buffer_head *bh;
++ int i;
++
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
++ /* tail removal */
++ unsigned long num, start;
++ num = ex->ee_block + ex->ee_len - from;
++ start = ex->ee_start + ex->ee_len - num;
++ ext_debug(tree, "free last %lu blocks starting %lu\n",
++ num, start);
++ for (i = 0; i < num; i++) {
++ bh = sb_find_get_block(tree->inode->i_sb, start + i);
++ ext3_forget(handle, 0, tree->inode, bh, start + i);
++ }
++ ext3_free_blocks(handle, tree->inode, start, num);
++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
++ printk("strange request: removal %lu-%lu from %u:%u\n",
++ from, to, ex->ee_block, ex->ee_len);
++ } else {
++ printk("strange request: removal(2) %lu-%lu from %u:%u\n",
++ from, to, ex->ee_block, ex->ee_len);
++ }
++ ext3_journal_stop(handle);
++ return 0;
++}
++
++static int ext3_ext_find_goal(struct inode *inode,
++ struct ext3_ext_path *path, unsigned long block)
++{
++ struct ext3_inode_info *ei = EXT3_I(inode);
++ unsigned long bg_start;
++ unsigned long colour;
++ int depth;
++
++ if (path) {
++ struct ext3_extent *ex;
++ depth = path->p_depth;
++
++ /* try to predict block placement */
++ if ((ex = path[depth].p_ext))
++ return ex->ee_start + (block - ex->ee_block);
++
++ /* it looks like the index is empty;
++ * try to find starting from the index itself */
++ if (path[depth].p_bh)
++ return path[depth].p_bh->b_blocknr;
++ }
++
++ /* OK. use inode's group */
++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++ colour = (current->pid % 16) *
++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++ return bg_start + colour + block;
++}
++
++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *ex, int *err)
++{
++ struct inode *inode = tree->inode;
++ int newblock, goal;
++
++ EXT_ASSERT(path);
++ EXT_ASSERT(ex);
++ EXT_ASSERT(ex->ee_start);
++ EXT_ASSERT(ex->ee_len);
++
++ /* reuse block from the extent to order data/metadata */
++ newblock = ex->ee_start++;
++ ex->ee_len--;
++ if (ex->ee_len == 0) {
++ ex->ee_len = 1;
++ /* allocate new block for the extent */
++ goal = ext3_ext_find_goal(inode, path, ex->ee_block);
++ ex->ee_start = ext3_new_block(handle, inode, goal, err);
++ if (ex->ee_start == 0) {
++ /* error occurred: restore old extent */
++ ex->ee_start = newblock;
++ return 0;
++ }
++ }
++ return newblock;
++}
++
++static struct ext3_extents_helpers ext3_blockmap_helpers = {
++ .get_write_access = ext3_get_inode_write_access,
++ .mark_buffer_dirty = ext3_mark_buffer_dirty,
++ .mergable = ext3_ext_mergable,
++ .new_block = ext3_new_block_cb,
++ .remove_extent = ext3_remove_blocks,
++ .remove_extent_credits = ext3_remove_blocks_credits,
++};
++
++void ext3_init_tree_desc(struct ext3_extents_tree *tree,
++ struct inode *inode)
++{
++ tree->inode = inode;
++ tree->root = (void *) EXT3_I(inode)->i_data;
++ tree->buffer = (void *) inode;
++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data);
++ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent;
++ tree->ops = &ext3_blockmap_helpers;
++}
++
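++/*
++ * main mapping entry: consult the extent cache first, then the tree;
++ * on a miss with 'create' set, allocate a block, insert the new extent
++ * and cache it. truncate_sem serializes us against truncate
++ */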
++int ext3_ext_get_block(handle_t *handle, struct inode *inode,
++ long iblock, struct buffer_head *bh_result,
++ int create, int extend_disksize)
++{
++ struct ext3_ext_path *path = NULL;
++ struct ext3_extent newex;
++ struct ext3_extent *ex;
++ int goal, newblock, err = 0, depth;
++ struct ext3_extents_tree tree;
++
++ clear_buffer_new(bh_result);
++ ext3_init_tree_desc(&tree, inode);
++ ext_debug(&tree, "block %d requested for inode %u\n",
++ (int) iblock, (unsigned) inode->i_ino);
++ down(&EXT3_I(inode)->truncate_sem);
++
++ /* check in cache */
++ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) {
++ if (goal == EXT3_EXT_CACHE_GAP) {
++ if (!create) {
++ /* block isn't allocated yet and
++ * user doesn't want to allocate it */
++ goto out2;
++ }
++ /* we should allocate requested block */
++ } else if (goal == EXT3_EXT_CACHE_EXTENT) {
++ /* block is already allocated */
++ newblock = iblock - newex.ee_block + newex.ee_start;
++ goto out;
++ } else {
++ EXT_ASSERT(0);
++ }
++ }
++
++ /* find extent for this block */
++ path = ext3_ext_find_extent(&tree, iblock, NULL);
++ if (IS_ERR(path)) {
++ err = PTR_ERR(path);
++ path = NULL;
++ goto out2;
++ }
++
++ depth = EXT_DEPTH(&tree);
++
++ /*
++ * consistent leaf must not be empty;
++ * this situation is possible, though, _during_ tree modification;
++ * this is why assert can't be put in ext3_ext_find_extent()
++ */
++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0);
++
++ if ((ex = path[depth].p_ext)) {
++ /* if found extent covers block, simply return it */
++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) {
++ newblock = iblock - ex->ee_block + ex->ee_start;
++ ext_debug(&tree, "%d fit into %d:%d -> %d\n",
++ (int) iblock, ex->ee_block, ex->ee_len,
++ newblock);
++ ext3_ext_put_in_cache(&tree, ex->ee_block,
++ ex->ee_len, ex->ee_start,
++ EXT3_EXT_CACHE_EXTENT);
++ goto out;
++ }
++ }
++
++ /*
++ * requested block isn't allocated yet;
++ * we can't try to create the block if the create flag is zero
++ if (!create) {
++ /* put just found gap into cache to speed up subsequent requests */
++ ext3_ext_put_gap_in_cache(&tree, path, iblock);
++ goto out2;
++ }
++
++ /* allocate new block */
++ goal = ext3_ext_find_goal(inode, path, iblock);
++ newblock = ext3_new_block(handle, inode, goal, &err);
++ if (!newblock)
++ goto out2;
++ ext_debug(&tree, "allocate new block: goal %d, found %d\n",
++ goal, newblock);
++
++ /* try to insert new extent into found leaf and return */
++ newex.ee_block = iblock;
++ newex.ee_start = newblock;
++ newex.ee_len = 1;
++ err = ext3_ext_insert_extent(handle, &tree, path, &newex);
++ if (err)
++ goto out2;
++
++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize)
++ EXT3_I(inode)->i_disksize = inode->i_size;
++
++ /* previous routine could have used the block we allocated */
++ newblock = newex.ee_start;
++ set_buffer_new(bh_result);
++
++ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len,
++ newex.ee_start, EXT3_EXT_CACHE_EXTENT);
++out:
++ ext3_ext_show_leaf(&tree, path);
++ map_bh(bh_result, inode->i_sb, newblock);
++out2:
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
++ up(&EXT3_I(inode)->truncate_sem);
++
++ return err;
++}
++
++void ext3_ext_truncate(struct inode *inode, struct page *page)
++{
++ struct address_space *mapping = inode->i_mapping;
++ struct super_block *sb = inode->i_sb;
++ struct ext3_extents_tree tree;
++ unsigned long last_block;
++ handle_t *handle;
++ int err = 0;
++
++ ext3_init_tree_desc(&tree, inode);
++
++ /*
++ * probably first extent we're gonna free will be last in block
++ */
++ err = ext3_writepage_trans_blocks(inode) + 3;
++ handle = ext3_journal_start(inode, err);
++ if (IS_ERR(handle)) {
++ if (page) {
++ clear_highpage(page);
++ flush_dcache_page(page);
++ unlock_page(page);
++ page_cache_release(page);
++ }
++ return;
++ }
++
++ if (page)
++ ext3_block_truncate_page(handle, page, mapping, inode->i_size);
++
++ down(&EXT3_I(inode)->truncate_sem);
++ ext3_ext_invalidate_cache(&tree);
++
++ /*
++ * TODO: optimization is possible here;
++ * we may not need scanning at all,
++ * because page truncation is enough
++ */
++ if (ext3_orphan_add(handle, inode))
++ goto out_stop;
++
++ /* we have to know where to truncate from in the crash case */
++ EXT3_I(inode)->i_disksize = inode->i_size;
++ ext3_mark_inode_dirty(handle, inode);
++
++ last_block = (inode->i_size + sb->s_blocksize - 1)
++ >> EXT3_BLOCK_SIZE_BITS(sb);
++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK);
++
++ /* In a multi-transaction truncate, we only make the final
++ * transaction synchronous */
++ if (IS_SYNC(inode))
++ handle->h_sync = 1;
++
++out_stop:
++ /*
++ * If this was a simple ftruncate(), and the file will remain alive
++ * then we need to clear up the orphan record which we created above.
++ * However, if this was a real unlink then we were called by
++ * ext3_delete_inode(), and we allow that function to clean up the
++ * orphan info for us.
++ */
++ if (inode->i_nlink)
++ ext3_orphan_del(handle, inode);
++
++ up(&EXT3_I(inode)->truncate_sem);
++ ext3_journal_stop(handle);
++}
++
++/*
++ * this routine calculates the max number of blocks we could modify
++ * in order to allocate a new block for an inode
++ */
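++/*
++ * rough sketch of the math (illustrative, not normative): with a
++ * depth-1 tree and 4KB blocks, one insert may touch the leaf, the
++ * root index, the block bitmap, the group descriptor and the
++ * superblock, i.e. about 5 credits, so a request for num = 16
++ * blocks would reserve around 5 * 16 = 80 credits (plus the quota
++ * blocks below when CONFIG_QUOTA is set).
++ */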
++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num)
++{
++ struct ext3_extents_tree tree;
++ int needed;
++
++ ext3_init_tree_desc(&tree, inode);
++
++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL);
++
++ /* the caller wants to allocate num blocks */
++ needed *= num;
++
++#ifdef CONFIG_QUOTA
++ /*
++ * FIXME: the real calculation should be here;
++ * it depends on the blockmap format of the quota file
++ */
++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++ return needed;
++}
++
++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode)
++{
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ ext3_extent_tree_init(handle, &tree);
++}
++
++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks)
++{
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ return ext3_ext_calc_metadata_amount(&tree, blocks);
++}
++
++static int
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newex, int exist)
++{
++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private;
++
++ if (!exist)
++ return EXT_CONTINUE;
++ if (buf->err < 0)
++ return EXT_BREAK;
++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen)
++ return EXT_BREAK;
++
++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) {
++ buf->err++;
++ buf->cur += sizeof(*newex);
++ } else {
++ buf->err = -EFAULT;
++ return EXT_BREAK;
++ }
++ return EXT_CONTINUE;
++}
++
++static int
++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *ex, int exist)
++{
++ struct ext3_extent_tree_stats *buf =
++ (struct ext3_extent_tree_stats *) tree->private;
++ int depth;
++
++ if (!exist)
++ return EXT_CONTINUE;
++
++ depth = EXT_DEPTH(tree);
++ buf->extents_num++;
++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr))
++ buf->leaf_num++;
++ return EXT_CONTINUE;
++}
++
++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
++ unsigned long arg)
++{
++ int err = 0;
++
++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL))
++ return -EINVAL;
++
++ if (cmd == EXT3_IOC_GET_EXTENTS) {
++ struct ext3_extent_buf buf;
++ struct ext3_extents_tree tree;
++
++ if (copy_from_user(&buf, (void *) arg, sizeof(buf)))
++ return -EFAULT;
++
++ ext3_init_tree_desc(&tree, inode);
++ buf.cur = buf.buffer;
++ buf.err = 0;
++ tree.private = &buf;
++ down(&EXT3_I(inode)->truncate_sem);
++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK,
++ ext3_ext_store_extent_cb);
++ up(&EXT3_I(inode)->truncate_sem);
++ if (err == 0)
++ err = buf.err;
++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) {
++ struct ext3_extent_tree_stats buf;
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ down(&EXT3_I(inode)->truncate_sem);
++ buf.depth = EXT_DEPTH(&tree);
++ buf.extents_num = 0;
++ buf.leaf_num = 0;
++ tree.private = &buf;
++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK,
++ ext3_ext_collect_stats_cb);
++ up(&EXT3_I(inode)->truncate_sem);
++ if (!err && copy_to_user((void *) arg, &buf, sizeof(buf)))
++ err = -EFAULT;
++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) {
++ struct ext3_extents_tree tree;
++ ext3_init_tree_desc(&tree, inode);
++ down(&EXT3_I(inode)->truncate_sem);
++ err = EXT_DEPTH(&tree);
++ up(&EXT3_I(inode)->truncate_sem);
++ }
++
++ return err;
++}
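++/*
++ * userspace usage sketch (hypothetical, for illustration): fill a
++ * buffer with the extents of a file starting at logical block 0.
++ *
++ * struct ext3_extent extents[32];
++ * struct ext3_extent_buf buf = {
++ * .start = 0, .buflen = sizeof(extents),
++ * .buffer = extents, .err = 0,
++ * };
++ * ioctl(fd, EXT3_IOC_GET_EXTENTS, &buf);
++ */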
++
++EXPORT_SYMBOL(ext3_init_tree_desc);
++EXPORT_SYMBOL(ext3_mark_inode_dirty);
++EXPORT_SYMBOL(ext3_ext_invalidate_cache);
++EXPORT_SYMBOL(ext3_ext_insert_extent);
++EXPORT_SYMBOL(ext3_ext_walk_space);
++EXPORT_SYMBOL(ext3_ext_find_goal);
++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert);
++
+Index: linux-2.6.5-sles9/fs/ext3/ialloc.c
+===================================================================
+--- linux-2.6.5-sles9.orig/fs/ext3/ialloc.c 2004-11-09 02:22:55.763148128 +0300
++++ linux-2.6.5-sles9/fs/ext3/ialloc.c 2004-11-09 02:23:21.587222272 +0300
+@@ -647,6 +647,10 @@
+ DQUOT_FREE_INODE(inode);
+ goto fail2;
+ }
++ if (test_opt(sb, EXTENTS)) {
++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL;
++ ext3_extents_initialize_blockmap(handle, inode);
++ }
+ err = ext3_mark_inode_dirty(handle, inode);
+ if (err) {
+ ext3_std_error(sb, err);
+Index: linux-2.6.5-sles9/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2004-11-09 02:22:55.767147520 +0300
++++ linux-2.6.5-sles9/fs/ext3/inode.c 2004-11-09 02:23:21.592221512 +0300
+@@ -796,6 +796,17 @@
+ goto reread;
+ }
+
++static inline int
++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block,
++ struct buffer_head *bh, int create, int extend_disksize)
++{
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_get_block(handle, inode, block, bh, create,
++ extend_disksize);
++ return ext3_get_block_handle(handle, inode, block, bh, create,
++ extend_disksize);
++}
++
+ static int ext3_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+ {
+@@ -806,8 +817,8 @@
+ handle = ext3_journal_current_handle();
+ J_ASSERT(handle != 0);
+ }
+- ret = ext3_get_block_handle(handle, inode, iblock,
+- bh_result, create, 1);
++ ret = ext3_get_block_wrap(handle, inode, iblock,
++ bh_result, create, 1);
+ return ret;
+ }
+
+@@ -833,8 +844,8 @@
+ }
+ }
+ if (ret == 0)
+- ret = ext3_get_block_handle(handle, inode, iblock,
+- bh_result, create, 0);
++ ret = ext3_get_block_wrap(handle, inode, iblock,
++ bh_result, create, 0);
+ if (ret == 0)
+ bh_result->b_size = (1 << inode->i_blkbits);
+ return ret;
+@@ -855,7 +866,7 @@
+ dummy.b_state = 0;
+ dummy.b_blocknr = -1000;
+ buffer_trace_init(&dummy.b_history);
+- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1);
+ if (!*errp && buffer_mapped(&dummy)) {
+ struct buffer_head *bh;
+ bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
+@@ -1587,7 +1598,7 @@
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+-static int ext3_block_truncate_page(handle_t *handle, struct page *page,
++int ext3_block_truncate_page(handle_t *handle, struct page *page,
+ struct address_space *mapping, loff_t from)
+ {
+ unsigned long index = from >> PAGE_CACHE_SHIFT;
+@@ -2083,6 +2094,9 @@
+ return;
+ }
+
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_truncate(inode, page);
++
+ handle = start_transaction(inode);
+ if (IS_ERR(handle)) {
+ if (page) {
+@@ -2789,6 +2803,9 @@
+ int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
+ int ret;
+
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_writepage_trans_blocks(inode, bpp);
++
+ if (ext3_should_journal_data(inode))
+ ret = 3 * (bpp + indirects) + 2;
+ else
+Index: linux-2.6.5-sles9/fs/ext3/Makefile
+===================================================================
+--- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2004-11-09 02:18:27.604914376 +0300
++++ linux-2.6.5-sles9/fs/ext3/Makefile 2004-11-09 02:23:21.593221360 +0300
+@@ -5,7 +5,7 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
+
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+- ioctl.o namei.o super.o symlink.o hash.o
++ ioctl.o namei.o super.o symlink.o hash.o extents.o
+
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+Index: linux-2.6.5-sles9/fs/ext3/super.c
+===================================================================
+--- linux-2.6.5-sles9.orig/fs/ext3/super.c 2004-11-09 02:22:56.450043704 +0300
++++ linux-2.6.5-sles9/fs/ext3/super.c 2004-11-09 02:23:21.597220752 +0300
+@@ -389,6 +389,7 @@
+ struct ext3_super_block *es = sbi->s_es;
+ int i;
+
++ ext3_ext_release(sb);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+ if (!(sb->s_flags & MS_RDONLY)) {
+@@ -447,6 +448,10 @@
+ #endif
+ ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
+ ei->vfs_inode.i_version = 1;
++ ei->i_cached_extent[0] = 0;
++ ei->i_cached_extent[1] = 0;
++ ei->i_cached_extent[2] = 0;
++ ei->i_cached_extent[3] = 0;
+ return &ei->vfs_inode;
+ }
+
+@@ -537,7 +542,7 @@
+ Opt_commit, Opt_journal_update, Opt_journal_inum,
+ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+- Opt_err,
++ Opt_err, Opt_extents, Opt_extdebug
+ };
+
+ static match_table_t tokens = {
+@@ -582,6 +587,8 @@
+ {Opt_iopen, "iopen"},
+ {Opt_noiopen, "noiopen"},
+ {Opt_iopen_nopriv, "iopen_nopriv"},
++ {Opt_extents, "extents"},
++ {Opt_extdebug, "extdebug"},
+ {Opt_err, NULL}
+ };
+
+@@ -797,6 +804,12 @@
+ break;
+ case Opt_ignore:
+ break;
++ case Opt_extents:
++ set_opt (sbi->s_mount_opt, EXTENTS);
++ break;
++ case Opt_extdebug:
++ set_opt (sbi->s_mount_opt, EXTDEBUG);
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+@@ -1449,6 +1462,8 @@
+ percpu_counter_mod(&sbi->s_dirs_counter,
+ ext3_count_dirs(sb));
+
++ ext3_ext_init(sb);
++
+ return 0;
+
+ failed_mount3:
+Index: linux-2.6.5-sles9/fs/ext3/ioctl.c
+===================================================================
+--- linux-2.6.5-sles9.orig/fs/ext3/ioctl.c 2004-11-09 02:15:44.610693264 +0300
++++ linux-2.6.5-sles9/fs/ext3/ioctl.c 2004-11-09 02:23:52.991448104 +0300
+@@ -124,6 +124,10 @@
+ err = ext3_change_inode_journal_flag(inode, jflag);
+ return err;
+ }
++ case EXT3_IOC_GET_EXTENTS:
++ case EXT3_IOC_GET_TREE_STATS:
++ case EXT3_IOC_GET_TREE_DEPTH:
++ return ext3_ext_ioctl(inode, filp, cmd, arg);
+ case EXT3_IOC_GETVERSION:
+ case EXT3_IOC_GETVERSION_OLD:
+ return put_user(inode->i_generation, (int *) arg);
+Index: linux-2.6.5-sles9/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:22:58.767691368 +0300
++++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:25:17.238640584 +0300
+@@ -186,6 +186,7 @@
+ #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
+ #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
+ #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */
++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */
+
+ #define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
+ #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
+@@ -211,6 +212,9 @@
+ #endif
+ #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long)
+ #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long)
++#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long)
++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long)
++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long)
+
+ /*
+ * Structure of an inode on the disk
+@@ -333,6 +337,8 @@
+ #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */
+ #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */
+ #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */
++#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */
++#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+@@ -729,6 +735,7 @@
+
+
+ /* inode.c */
++extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
+ extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+@@ -802,6 +809,14 @@
+ extern struct inode_operations ext3_symlink_inode_operations;
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
+
++/* extents.c */
++extern int ext3_ext_writepage_trans_blocks(struct inode *, int);
++extern int ext3_ext_get_block(handle_t *, struct inode *, long,
++ struct buffer_head *, int, int);
++extern void ext3_ext_truncate(struct inode *, struct page *);
++extern void ext3_ext_init(struct super_block *);
++extern void ext3_ext_release(struct super_block *);
++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *);
+
+ #endif /* __KERNEL__ */
+
+Index: linux-2.6.5-sles9/include/linux/ext3_extents.h
+===================================================================
+--- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.6.5-sles9/include/linux/ext3_extents.h 2004-11-09 02:23:21.606219384 +0300
+@@ -0,0 +1,252 @@
++/*
++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public Licens
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
++ */
++
++#ifndef _LINUX_EXT3_EXTENTS
++#define _LINUX_EXT3_EXTENTS
++
++/*
++ * with AGRESSIVE_TEST defined, the capacity of index/leaf blocks
++ * becomes very small, so index splits, in-depth growing and
++ * other hard changes happen much more often;
++ * this is for debug purposes only
++ */
++#define AGRESSIVE_TEST_
++
++/*
++ * if CHECK_BINSEARCH defined, then results of binary search
++ * will be checked by linear search
++ */
++#define CHECK_BINSEARCH_
++
++/*
++ * if EXT_DEBUG is defined you can use 'extdebug' mount option
++ * to get lots of info about what's going on
++ */
++#define EXT_DEBUG_
++#ifdef EXT_DEBUG
++#define ext_debug(tree,fmt,a...) \
++do { \
++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \
++ printk(fmt, ##a); \
++} while (0)
++#else
++#define ext_debug(tree,fmt,a...)
++#endif
++
++/*
++ * if EXT_STATS is defined then stats numbers are collected
++ * these numbers will be displayed at umount time
++ */
++#define EXT_STATS_
++
++
++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */
++
++/*
++ * ext3_inode has i_block array (total 60 bytes)
++ * the first 12 bytes hold a struct ext3_extent_header, which stores:
++ * - the tree depth (0 means there is no tree yet; all extents are in the inode)
++ * - the number of live extents in the inode
++ */
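++/*
++ * illustrative layout: the header takes 12 of the 60 bytes, so a
++ * depth-0 root stored in i_block can hold up to four 12-byte
++ * struct ext3_extent entries (48 / 12), or four ext3_extent_idx
++ * entries once the tree has grown a level.
++ */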
++
++/*
++ * this is extent on-disk structure
++ * it's used at the bottom of the tree
++ */
++struct ext3_extent {
++ __u32 ee_block; /* first logical block extent covers */
++ __u16 ee_len; /* number of blocks covered by extent */
++ __u16 ee_start_hi; /* high 16 bits of physical block */
++ __u32 ee_start; /* low 32 bits of physical block */
++};
++
++/*
++ * this is index on-disk structure
++ * it's used at all the levels, but the bottom
++ */
++struct ext3_extent_idx {
++ __u32 ei_block; /* index covers logical blocks from 'block' */
++ __u32 ei_leaf; /* pointer to the physical block of the next *
++ * level; a leaf or the next index could be here */
++ __u16 ei_leaf_hi; /* high 16 bits of physical block */
++ __u16 ei_unused;
++};
++
++/*
++ * each block (leaf and index), even the inode-stored one, has a header
++ */
++struct ext3_extent_header {
++ __u16 eh_magic; /* probably will support different formats */
++ __u16 eh_entries; /* number of valid entries */
++ __u16 eh_max; /* capacity of the store, in entries */
++ __u16 eh_depth; /* has the tree real underlying blocks? */
++ __u32 eh_generation; /* generation of the tree */
++};
++
++#define EXT3_EXT_MAGIC 0xf30a
++
++/*
++ * array of ext3_ext_path contains path to some extent
++ * creation/lookup routines use it for traversal/splitting/etc
++ * truncate uses it to simulate recursive walking
++ */
++struct ext3_ext_path {
++ __u32 p_block;
++ __u16 p_depth;
++ struct ext3_extent *p_ext;
++ struct ext3_extent_idx *p_idx;
++ struct ext3_extent_header *p_hdr;
++ struct buffer_head *p_bh;
++};
++
++/*
++ * structure for external API
++ */
++
++/*
++ * storage for cached extent
++ */
++struct ext3_ext_cache {
++ __u32 ec_start;
++ __u32 ec_block;
++ __u32 ec_len;
++ __u32 ec_type;
++};
++
++#define EXT3_EXT_CACHE_NO 0
++#define EXT3_EXT_CACHE_GAP 1
++#define EXT3_EXT_CACHE_EXTENT 2
++
++/*
++ * ext3_extents_tree is used to pass initial information
++ * to top-level extents API
++ */
++struct ext3_extents_helpers;
++struct ext3_extents_tree {
++ struct inode *inode; /* inode which tree belongs to */
++ void *root; /* ptr to the data the top of the tree resides at */
++ void *buffer; /* will be passed as arg to ^^ routines */
++ int buffer_len;
++ void *private;
++ struct ext3_ext_cache *cex;/* last found extent */
++ struct ext3_extents_helpers *ops;
++};
++
++struct ext3_extents_helpers {
++ int (*get_write_access)(handle_t *h, void *buffer);
++ int (*mark_buffer_dirty)(handle_t *h, void *buffer);
++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2);
++ int (*remove_extent_credits)(struct ext3_extents_tree *,
++ struct ext3_extent *, unsigned long,
++ unsigned long);
++ int (*remove_extent)(struct ext3_extents_tree *,
++ struct ext3_extent *, unsigned long,
++ unsigned long);
++ int (*new_block)(handle_t *, struct ext3_extents_tree *,
++ struct ext3_ext_path *, struct ext3_extent *,
++ int *);
++};
++
++/*
++ * to be called by ext3_ext_walk_space()
++ * negative retcode - error
++ * positive retcode - signal for ext3_ext_walk_space(), see below
++ * callback must return valid extent (passed or newly created)
++ */
++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *,
++ struct ext3_ext_path *,
++ struct ext3_extent *, int);
++
++#define EXT_CONTINUE 0
++#define EXT_BREAK 1
++#define EXT_REPEAT 2
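++/*
++ * minimal callback sketch (hypothetical, for illustration): count
++ * the real extents in a range via tree->private.
++ *
++ * static int count_cb(struct ext3_extents_tree *tree,
++ * struct ext3_ext_path *path,
++ * struct ext3_extent *ex, int exist)
++ * {
++ * if (exist)
++ * (*(int *) tree->private)++;
++ * return EXT_CONTINUE;
++ * }
++ */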
++
++
++#define EXT_MAX_BLOCK 0xffffffff
++#define EXT_CACHE_MARK 0xffff
++
++
++#define EXT_FIRST_EXTENT(__hdr__) \
++ ((struct ext3_extent *) (((char *) (__hdr__)) + \
++ sizeof(struct ext3_extent_header)))
++#define EXT_FIRST_INDEX(__hdr__) \
++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \
++ sizeof(struct ext3_extent_header)))
++#define EXT_HAS_FREE_INDEX(__path__) \
++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max)
++#define EXT_LAST_EXTENT(__hdr__) \
++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1)
++#define EXT_LAST_INDEX(__hdr__) \
++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1)
++#define EXT_MAX_EXTENT(__hdr__) \
++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_MAX_INDEX(__hdr__) \
++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
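++/*
++ * typical traversal sketch (assumes a valid header pointer 'eh'):
++ *
++ * struct ext3_extent *ex;
++ * for (ex = EXT_FIRST_EXTENT(eh); ex <= EXT_LAST_EXTENT(eh); ex++)
++ * process blocks ex->ee_block .. ex->ee_block + ex->ee_len - 1;
++ */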
++
++#define EXT_ROOT_HDR(tree) \
++ ((struct ext3_extent_header *) (tree)->root)
++#define EXT_BLOCK_HDR(bh) \
++ ((struct ext3_extent_header *) (bh)->b_data)
++#define EXT_DEPTH(_t_) \
++ (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
++#define EXT_GENERATION(_t_) \
++ (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++
++
++#define EXT_ASSERT(__x__) do { if (!(__x__)) BUG(); } while (0)
++
++
++/*
++ * this structure is used to gather extents from the tree via ioctl
++ */
++struct ext3_extent_buf {
++ unsigned long start;
++ int buflen;
++ void *buffer;
++ void *cur;
++ int err;
++};
++
++/*
++ * this structure is used to collect stats info about the tree
++ */
++struct ext3_extent_tree_stats {
++ int depth;
++ int extents_num;
++ int leaf_num;
++};
++
++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *);
++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *);
++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *);
++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback);
++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long);
++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *);
++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *);
++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int);
++
++static inline void
++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree)
++{
++ if (tree->cex)
++ tree->cex->ec_type = EXT3_EXT_CACHE_NO;
++}
++
++
++#endif /* _LINUX_EXT3_EXTENTS */
++
+Index: linux-2.6.5-sles9/include/linux/ext3_fs_i.h
+===================================================================
+--- linux-2.6.5-sles9.orig/include/linux/ext3_fs_i.h 2004-11-09 02:22:55.780145544 +0300
++++ linux-2.6.5-sles9/include/linux/ext3_fs_i.h 2004-11-09 02:23:21.606219384 +0300
+@@ -128,6 +128,8 @@
+ */
+ struct semaphore truncate_sem;
+ struct inode vfs_inode;
++
++ __u32 i_cached_extent[4];
+ };
+
+ #endif /* _LINUX_EXT3_FS_I */
+
+%diffstat
+ fs/ext3/Makefile | 2
+ fs/ext3/extents.c | 2313 +++++++++++++++++++++++++++++++++++++++++++
+ fs/ext3/ialloc.c | 4
+ fs/ext3/inode.c | 29
+ fs/ext3/ioctl.c | 4
+ fs/ext3/super.c | 17
+ include/linux/ext3_extents.h | 252 ++++
+ include/linux/ext3_fs.h | 15
+ include/linux/ext3_fs_i.h | 2
+ 9 files changed, 2630 insertions(+), 8 deletions(-)
+
--- /dev/null
+Index: linux-2.6.5-sles9/fs/ext3/mballoc.c
+===================================================================
+--- linux-2.6.5-sles9.orig/fs/ext3/mballoc.c 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.6.5-sles9/fs/ext3/mballoc.c 2004-11-09 02:34:25.181340632 +0300
+@@ -0,0 +1,1428 @@
++/*
++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public Licens
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
++ */
++
++
++/*
++ * mballoc.c contains the multiblocks allocation routines
++ */
++
++#include <linux/config.h>
++#include <linux/time.h>
++#include <linux/fs.h>
++#include <linux/namei.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/quotaops.h>
++#include <linux/buffer_head.h>
++#include <linux/module.h>
++
++/*
++ * TODO:
++ * - do not scan from the beginning, try to remember first free block
++ * - mb_mark_used_* may allocate chunk right after splitting buddy
++ * - special flag to advise the allocator to look for requested + N blocks
++ * this may improve interaction between extents and mballoc
++ */
++
++/*
++ * with AGGRESSIVE_CHECK the allocator runs consistency checks over
++ * structures. these checks slow things down a lot
++ */
++#define AGGRESSIVE_CHECK__
++
++/*
++ */
++#define MB_DEBUG__
++#ifdef MB_DEBUG
++#define mb_debug(fmt,a...) printk(fmt, ##a)
++#else
++#define mb_debug(fmt,a...)
++#endif
++
++/*
++ * where to save buddy structures between umount/mount (clean case only)
++ */
++#define EXT3_BUDDY_FILE ".buddy"
++
++/*
++ * max. number of chunks to be tracked in ext3_free_extent struct
++ */
++#define MB_ARR_SIZE 32
++
++struct ext3_allocation_context {
++ struct super_block *ac_sb;
++
++ /* search goals */
++ int ac_g_group;
++ int ac_g_start;
++ int ac_g_len;
++ int ac_g_flags;
++
++ /* the best found extent */
++ int ac_b_group;
++ int ac_b_start;
++ int ac_b_len;
++
++ /* number of iterations done. we have to track it to limit searching */
++ int ac_repeats;
++ int ac_groups_scanned;
++ int ac_status;
++};
++
++#define AC_STATUS_CONTINUE 1
++#define AC_STATUS_FOUND 2
++
++
++struct ext3_buddy {
++ void *bd_bitmap;
++ void *bd_buddy;
++ int bd_blkbits;
++ struct buffer_head *bd_bh;
++ struct buffer_head *bd_bh2;
++ struct ext3_buddy_group_blocks *bd_bd;
++ struct super_block *bd_sb;
++};
++
++struct ext3_free_extent {
++ int fe_start;
++ int fe_len;
++ unsigned char fe_orders[MB_ARR_SIZE];
++ unsigned char fe_nums;
++ unsigned char fe_back;
++};
++
++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
++
++
++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *);
++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int);
++void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long, unsigned long);
++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *);
++int ext3_mb_reserve_blocks(struct super_block *, int);
++void ext3_mb_release_blocks(struct super_block *, int);
++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *);
++void ext3_mb_free_committed_blocks(struct super_block *);
++
++#define mb_correct_addr_and_bit(bit,addr) \
++{ \
++ if ((unsigned) addr & 1) { \
++ bit += 8; \
++ addr--; \
++ } \
++ if ((unsigned) addr & 2) { \
++ bit += 16; \
++ addr--; \
++ addr--; \
++ } \
++}
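++/*
++ * the atomic bitops expect a 32-bit-aligned address; the helper
++ * above rounds 'addr' down and moves the bit offset forward to
++ * compensate. e.g. addr = base + 3, bit = 0 becomes addr = base,
++ * bit = 24, naming the very same bit of memory.
++ */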
++
++static inline int mb_test_bit(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
++ return test_bit(bit, addr);
++}
++
++static inline void mb_set_bit(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
++ set_bit(bit, addr);
++}
++
++static inline void mb_clear_bit(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
++ clear_bit(bit, addr);
++}
++
++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
++{
++ int i = 1;
++ void *bb;
++
++ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy);
++ J_ASSERT(max != NULL);
++
++ if (order > e3b->bd_blkbits + 1)
++ return NULL;
++
++ /* at order 0 we see each particular block */
++ *max = 1 << (e3b->bd_blkbits + 3);
++ if (order == 0)
++ return e3b->bd_bitmap;
++
++ bb = e3b->bd_buddy;
++ *max = *max >> 1;
++ while (i < order) {
++ bb += 1 << (e3b->bd_blkbits - i);
++ i++;
++ *max = *max >> 1;
++ }
++ return bb;
++}
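++/*
++ * worked example (assuming 4KB blocks, bd_blkbits = 12): order 0
++ * is the block bitmap itself with 32768 bits; order 1 starts at
++ * bd_buddy with 16384 bits; order 2 starts 2048 bytes further in
++ * with 8192 bits, each level half the size of the one below.
++ */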
++
++static int ext3_mb_load_desc(struct super_block *sb, int group,
++ struct ext3_buddy *e3b)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++ J_ASSERT(sbi->s_buddy_blocks[group].bb_bitmap);
++ J_ASSERT(sbi->s_buddy_blocks[group].bb_buddy);
++
++ /* load bitmap */
++ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group].bb_bitmap);
++ if (e3b->bd_bh == NULL) {
++ ext3_error(sb, "ext3_mb_load_desc",
++ "can't get block for buddy bitmap\n");
++ goto out;
++ }
++ if (!buffer_uptodate(e3b->bd_bh)) {
++ ll_rw_block(READ, 1, &e3b->bd_bh);
++ wait_on_buffer(e3b->bd_bh);
++ }
++ J_ASSERT(buffer_uptodate(e3b->bd_bh));
++
++ /* load buddy */
++ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group].bb_buddy);
++ if (e3b->bd_bh2 == NULL) {
++ ext3_error(sb, "ext3_mb_load_desc",
++ "can't get block for buddy bitmap\n");
++ goto out;
++ }
++ if (!buffer_uptodate(e3b->bd_bh2)) {
++ ll_rw_block(READ, 1, &e3b->bd_bh2);
++ wait_on_buffer(e3b->bd_bh2);
++ }
++ J_ASSERT(buffer_uptodate(e3b->bd_bh2));
++
++ e3b->bd_bitmap = e3b->bd_bh->b_data;
++ e3b->bd_buddy = e3b->bd_bh2->b_data;
++ e3b->bd_blkbits = sb->s_blocksize_bits;
++ e3b->bd_bd = sbi->s_buddy_blocks + group;
++ e3b->bd_sb = sb;
++
++ return 0;
++out:
++ brelse(e3b->bd_bh);
++ brelse(e3b->bd_bh2);
++ e3b->bd_bh = NULL;
++ e3b->bd_bh2 = NULL;
++ return -EIO;
++}
++
++static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b)
++{
++ mark_buffer_dirty(e3b->bd_bh);
++ mark_buffer_dirty(e3b->bd_bh2);
++}
++
++static void ext3_mb_release_desc(struct ext3_buddy *e3b)
++{
++ brelse(e3b->bd_bh);
++ brelse(e3b->bd_bh2);
++}
++
++#ifdef AGGRESSIVE_CHECK
++static void mb_check_buddy(struct ext3_buddy *e3b)
++{
++ int order = e3b->bd_blkbits + 1;
++ int max, max2, i, j, k, count;
++ void *buddy, *buddy2;
++
++ if (!test_opt(e3b->bd_sb, MBALLOC))
++ return;
++
++ while (order > 1) {
++ buddy = mb_find_buddy(e3b, order, &max);
++ J_ASSERT(buddy);
++ buddy2 = mb_find_buddy(e3b, order - 1, &max2);
++ J_ASSERT(buddy2);
++ J_ASSERT(buddy != buddy2);
++ J_ASSERT(max * 2 == max2);
++
++ count = 0;
++ for (i = 0; i < max; i++) {
++
++ if (!mb_test_bit(i, buddy)) {
++ /* only single bit in buddy2 may be 1 */
++ if (mb_test_bit(i << 1, buddy2))
++ J_ASSERT(!mb_test_bit((i<<1)+1, buddy2));
++ else if (mb_test_bit((i << 1) + 1, buddy2))
++ J_ASSERT(!mb_test_bit(i << 1, buddy2));
++ continue;
++ }
++
++ /* both bits in buddy2 must be 0 */
++ J_ASSERT(!mb_test_bit(i << 1, buddy2));
++ J_ASSERT(!mb_test_bit((i << 1) + 1, buddy2));
++
++ for (j = 0; j < (1 << order); j++) {
++ k = (i * (1 << order)) + j;
++ J_ASSERT(mb_test_bit(k, e3b->bd_bitmap));
++ }
++ count++;
++ }
++ J_ASSERT(e3b->bd_bd->bb_counters[order] == count);
++ order--;
++ }
++
++ buddy = mb_find_buddy(e3b, 0, &max);
++ for (i = 0; i < max; i++) {
++ if (mb_test_bit(i, buddy))
++ continue;
++ /* check used bits only */
++ for (j = 0; j < e3b->bd_blkbits + 1; j++) {
++ buddy2 = mb_find_buddy(e3b, j, &max2);
++ k = i >> j;
++ J_ASSERT(k < max2);
++ J_ASSERT(!mb_test_bit(k, buddy2));
++ }
++ }
++}
++#else
++#define mb_check_buddy(e3b)
++#endif
++
++static inline void
++ext3_lock_group(struct super_block *sb, int group)
++{
++ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group].bb_lock);
++}
++
++static inline void
++ext3_unlock_group(struct super_block *sb, int group)
++{
++ spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group].bb_lock);
++}
++
++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
++{
++ int order = 1;
++ void *bb;
++
++ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy);
++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3)));
++
++ bb = e3b->bd_buddy;
++ while (order <= e3b->bd_blkbits + 1) {
++ block = block >> 1;
++ if (mb_test_bit(block, bb)) {
++ /* this block is part of buddy of order 'order' */
++ return order;
++ }
++ bb += 1 << (e3b->bd_blkbits - order);
++ order++;
++ }
++ return 0;
++}
++
++static inline void mb_clear_bits(void *bm, int cur, int len)
++{
++ __u32 *addr;
++
++ len = cur + len;
++ while (cur < len) {
++ if ((cur & 31) == 0 && (len - cur) >= 32) {
++ /* fast path: clear whole word at once */
++ addr = bm + (cur >> 3);
++ *addr = 0;
++ cur += 32;
++ continue;
++ }
++ mb_clear_bit(cur, bm);
++ cur++;
++ }
++}
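++/*
++ * e.g. mb_clear_bits(bm, 5, 100): bits 5..31 take the per-bit
++ * path, bits 32..95 are cleared as two whole 32-bit words, and
++ * the tail bits 96..104 fall back to the per-bit path again.
++ */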
++
++static inline void mb_set_bits(void *bm, int cur, int len)
++{
++ __u32 *addr;
++
++ len = cur + len;
++ while (cur < len) {
++ if ((cur & 31) == 0 && (len - cur) >= 32) {
++ /* fast path: set whole word at once */
++ addr = bm + (cur >> 3);
++ *addr = 0xffffffff;
++ cur += 32;
++ continue;
++ }
++ mb_set_bit(cur, bm);
++ cur++;
++ }
++}
++
++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count)
++{
++ int block, max, order;
++ void *buddy, *buddy2;
++
++ mb_check_buddy(e3b);
++ while (count-- > 0) {
++ block = first++;
++ order = 0;
++
++ J_ASSERT(!mb_test_bit(block, e3b->bd_bitmap));
++ mb_set_bit(block, e3b->bd_bitmap);
++ e3b->bd_bd->bb_counters[order]++;
++
++ /* start of the buddy */
++ buddy = mb_find_buddy(e3b, order, &max);
++
++ do {
++ block &= ~1UL;
++ if (!mb_test_bit(block, buddy) ||
++ !mb_test_bit(block + 1, buddy))
++ break;
++
++ /* both the buddies are free, try to coalesce them */
++ buddy2 = mb_find_buddy(e3b, order + 1, &max);
++
++ if (!buddy2)
++ break;
++
++ if (order > 0) {
++ /* the order-0 "buddy" is the bitmap itself;
++ * we keep its free bits set on purpose */
++ mb_clear_bit(block, buddy);
++ mb_clear_bit(block + 1, buddy);
++ }
++ e3b->bd_bd->bb_counters[order]--;
++ e3b->bd_bd->bb_counters[order]--;
++
++ block = block >> 1;
++ order++;
++ e3b->bd_bd->bb_counters[order]++;
++
++ mb_set_bit(block, buddy2);
++ buddy = buddy2;
++ } while (1);
++ }
++ mb_check_buddy(e3b);
++
++ return 0;
++}
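++/*
++ * coalescing sketch: freeing block 5 while block 4 is already free
++ * sets bit 5 at order 0, sees the 4/5 pair free, and sets bit 2 in
++ * the order-1 buddy; if blocks 6-7 are free too, the pass repeats
++ * and bit 1 of the order-2 buddy ends up covering blocks 4-7.
++ */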
++
++/*
++ * returns 1 if the out extent is enough to fill the needed space
++ */
++int mb_make_backward_extent(struct ext3_free_extent *in,
++ struct ext3_free_extent *out, int needed)
++{
++ int i;
++
++ J_ASSERT(in);
++ J_ASSERT(out);
++ J_ASSERT(in->fe_nums < MB_ARR_SIZE);
++
++ out->fe_len = 0;
++ out->fe_start = in->fe_start + in->fe_len;
++ out->fe_nums = 0;
++
++ /* for a single-chunk extent we don't need the back order;
++ * also, if an extent doesn't fill the needed space
++ * then it makes no sense to try the back order, because
++ * if we select this extent it'll be used as is */
++ if (in->fe_nums < 2 || in->fe_len < needed)
++ return 0;
++
++ i = in->fe_nums - 1;
++ while (i >= 0 && out->fe_len < needed) {
++ out->fe_len += (1 << in->fe_orders[i]);
++ out->fe_start -= (1 << in->fe_orders[i]);
++ i--;
++ }
++ /* FIXME: in some situations fe_orders may be too small to hold
++ * all the buddies */
++ J_ASSERT(out->fe_len >= needed);
++
++ for (i++; i < in->fe_nums; i++)
++ out->fe_orders[out->fe_nums++] = in->fe_orders[i];
++ J_ASSERT(out->fe_nums < MB_ARR_SIZE);
++ out->fe_back = 1;
++
++ return 1;
++}
++
++int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
++ int needed, struct ext3_free_extent *ex)
++{
++ int space = needed;
++ int next, max, ord;
++ void *buddy;
++
++ J_ASSERT(ex != NULL);
++
++ ex->fe_nums = 0;
++ ex->fe_len = 0;
++
++ buddy = mb_find_buddy(e3b, order, &max);
++ J_ASSERT(buddy);
++ J_ASSERT(block < max);
++ if (!mb_test_bit(block, buddy))
++ goto nofree;
++
++ if (order == 0) {
++ /* find actual order */
++ order = mb_find_order_for_block(e3b, block);
++ block = block >> order;
++ }
++
++ ex->fe_orders[ex->fe_nums++] = order;
++ ex->fe_len = 1 << order;
++ ex->fe_start = block << order;
++ ex->fe_back = 0;
++
++ while ((space = space - (1 << order)) > 0) {
++
++ buddy = mb_find_buddy(e3b, order, &max);
++ J_ASSERT(buddy);
++
++ if (block + 1 >= max)
++ break;
++
++ next = (block + 1) * (1 << order);
++ if (!mb_test_bit(next, e3b->bd_bitmap))
++ break;
++
++ ord = mb_find_order_for_block(e3b, next);
++
++ if ((1 << ord) >= needed) {
++ /* we don't want to coalesce with self-sufficient buddies */
++ break;
++ }
++ order = ord;
++ block = next >> order;
++ ex->fe_len += 1 << order;
++
++ if (ex->fe_nums < MB_ARR_SIZE)
++ ex->fe_orders[ex->fe_nums++] = order;
++ }
++
++nofree:
++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3)));
++ return ex->fe_len;
++}
++
++static int mb_mark_used_backward(struct ext3_buddy *e3b,
++ struct ext3_free_extent *ex, int len)
++{
++ int start = ex->fe_start, len0 = len;
++ int ord, mlen, max, cur;
++ void *buddy;
++
++ start = ex->fe_start + ex->fe_len - 1;
++ while (len) {
++ ord = mb_find_order_for_block(e3b, start);
++ if (((start >> ord) << ord) == (start - (1 << ord) + 1) &&
++ len >= (1 << ord)) {
++ /* the whole chunk may be allocated at once! */
++ mlen = 1 << ord;
++ buddy = mb_find_buddy(e3b, ord, &max);
++ J_ASSERT((start >> ord) < max);
++ mb_clear_bit(start >> ord, buddy);
++ e3b->bd_bd->bb_counters[ord]--;
++ start -= mlen;
++ len -= mlen;
++ J_ASSERT(len >= 0);
++ J_ASSERT(start >= 0);
++ continue;
++ }
++
++ /* we have to split large buddy */
++ J_ASSERT(ord > 0);
++ buddy = mb_find_buddy(e3b, ord, &max);
++ mb_clear_bit(start >> ord, buddy);
++ e3b->bd_bd->bb_counters[ord]--;
++
++ ord--;
++ cur = (start >> ord) & ~1U;
++ buddy = mb_find_buddy(e3b, ord, &max);
++ mb_set_bit(cur, buddy);
++ mb_set_bit(cur + 1, buddy);
++ e3b->bd_bd->bb_counters[ord]++;
++ e3b->bd_bd->bb_counters[ord]++;
++ }
++
++ /* now drop all the bits in bitmap */
++ mb_clear_bits(e3b->bd_bitmap, ex->fe_start + ex->fe_len - len0, len0);
++
++ mb_check_buddy(e3b);
++
++ return 0;
++}
++
++static int mb_mark_used_forward(struct ext3_buddy *e3b,
++ struct ext3_free_extent *ex, int len)
++{
++ int start = ex->fe_start, len0 = len;
++ int ord, mlen, max, cur;
++ void *buddy;
++
++ while (len) {
++ ord = mb_find_order_for_block(e3b, start);
++
++ if (((start >> ord) << ord) == start && len >= (1 << ord)) {
++ /* the whole chunk may be allocated at once! */
++ mlen = 1 << ord;
++ buddy = mb_find_buddy(e3b, ord, &max);
++ J_ASSERT((start >> ord) < max);
++ mb_clear_bit(start >> ord, buddy);
++ e3b->bd_bd->bb_counters[ord]--;
++ start += mlen;
++ len -= mlen;
++ J_ASSERT(len >= 0);
++ continue;
++ }
++
++ /* we have to split large buddy */
++ J_ASSERT(ord > 0);
++ buddy = mb_find_buddy(e3b, ord, &max);
++ mb_clear_bit(start >> ord, buddy);
++ e3b->bd_bd->bb_counters[ord]--;
++
++ ord--;
++ cur = (start >> ord) & ~1U;
++ buddy = mb_find_buddy(e3b, ord, &max);
++ mb_set_bit(cur, buddy);
++ mb_set_bit(cur + 1, buddy);
++ e3b->bd_bd->bb_counters[ord]++;
++ e3b->bd_bd->bb_counters[ord]++;
++ }
++
++ /* now drop all the bits in bitmap */
++ mb_clear_bits(e3b->bd_bitmap, ex->fe_start, len0);
++
++ mb_check_buddy(e3b);
++
++ return 0;
++}
++
++int inline mb_mark_used(struct ext3_buddy *e3b,
++ struct ext3_free_extent *ex, int len)
++{
++ int err;
++
++ J_ASSERT(ex);
++ if (ex->fe_back == 0)
++ err = mb_mark_used_forward(e3b, ex, len);
++ else
++ err = mb_mark_used_backward(e3b, ex, len);
++ return err;
++}
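++/*
++ * splitting sketch: marking blocks 8-10 used inside a free order-2
++ * chunk (blocks 8-11) clears the order-2 bit, sets both order-1
++ * halves, consumes 8-9 at order 1, then splits 10-11 down to
++ * order 0 and takes block 10, leaving block 11 free.
++ */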
++
++int ext3_mb_new_in_group(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b, int group)
++{
++ struct super_block *sb = ac->ac_sb;
++ int err, gorder, max, i;
++ struct ext3_free_extent curex;
++
++ /* determine the order of the allocation */
++ gorder = 0;
++ while (ac->ac_g_len > (1 << gorder))
++ gorder++;
++
++ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) {
++ /* someone asks for space at this specific block;
++ * probably they want to merge it into an existing extent */
++ if (mb_test_bit(ac->ac_g_start, e3b->bd_bitmap)) {
++ /* good. at least one block is free */
++ max = mb_find_extent(e3b, 0, ac->ac_g_start,
++ ac->ac_g_len, &curex);
++ max = min(curex.fe_len, ac->ac_g_len);
++ mb_mark_used(e3b, &curex, max);
++
++ ac->ac_b_group = group;
++ ac->ac_b_start = curex.fe_start;
++ ac->ac_b_len = max;
++ ac->ac_status = AC_STATUS_FOUND;
++ err = 0;
++ goto out;
++ }
++ /* don't try to find goal anymore */
++ ac->ac_g_flags &= ~1;
++ }
++
++ i = 0;
++ while (1) {
++ i = find_next_bit(e3b->bd_bitmap, sb->s_blocksize * 8, i);
++ if (i >= sb->s_blocksize * 8)
++ break;
++
++ max = mb_find_extent(e3b, 0, i, ac->ac_g_len, &curex);
++ if (max >= ac->ac_g_len) {
++ max = min(curex.fe_len, ac->ac_g_len);
++ mb_mark_used(e3b, &curex, max);
++
++ ac->ac_b_group = group;
++ ac->ac_b_start = curex.fe_start;
++ ac->ac_b_len = max;
++ ac->ac_status = AC_STATUS_FOUND;
++ break;
++ }
++ i += max;
++ }
++
++ return 0;
++
++out:
++ return err;
++}
++
++int mb_good_group(struct ext3_allocation_context *ac, int group, int cr)
++{
++ struct ext3_group_desc *gdp;
++ int free_blocks;
++
++ gdp = ext3_get_group_desc(ac->ac_sb, group, NULL);
++ if (!gdp)
++ return 0;
++ free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
++ if (free_blocks == 0)
++ return 0;
++
++ /* someone wants this block very much */
++ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group)
++ return 1;
++
++ /* FIXME: I'd like to take fragmentation into account here */
++ if (cr == 0) {
++ if (free_blocks >= ac->ac_g_len >> 1)
++ return 1;
++ } else if (cr == 1) {
++ if (free_blocks >= ac->ac_g_len >> 2)
++ return 1;
++ } else if (cr == 2) {
++ return 1;
++ } else {
++ BUG();
++ }
++ return 0;
++}
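++/*
++ * e.g. for a request of 64 blocks: on the cr=0 pass a group
++ * qualifies with >= 32 free blocks, on cr=1 with >= 16, and on
++ * the final cr=2 pass any group with free blocks is accepted.
++ */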
++
++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode,
++ unsigned long goal, int *len, int flags, int *errp)
++{
++ struct buffer_head *bitmap_bh = NULL;
++ struct ext3_allocation_context ac;
++ int i, group, block, cr, err = 0;
++ struct ext3_group_desc *gdp;
++ struct ext3_super_block *es;
++ struct buffer_head *gdp_bh;
++ struct ext3_sb_info *sbi;
++ struct super_block *sb;
++ struct ext3_buddy e3b;
++
++ J_ASSERT(len != NULL);
++ J_ASSERT(*len > 0);
++
++ sb = inode->i_sb;
++ if (!sb) {
++ printk("ext3_mb_new_nblocks: nonexistent device");
++ return 0;
++ }
++
++ if (!test_opt(sb, MBALLOC)) {
++ static int ext3_mballoc_warning = 0;
++ if (ext3_mballoc_warning == 0) {
++ printk(KERN_ERR "EXT3-fs: multiblock request with "
++ "mballoc disabled!\n");
++ ext3_mballoc_warning++;
++ }
++ *len = 1;
++ err = ext3_new_block_old(handle, inode, goal, errp);
++ return err;
++ }
++
++ ext3_mb_poll_new_transaction(sb, handle);
++
++ sbi = EXT3_SB(sb);
++ es = EXT3_SB(sb)->s_es;
++
++ if (!(flags & 2)) {
++ /* someone asks for non-reserved blocks */
++ BUG_ON(*len > 1);
++ err = ext3_mb_reserve_blocks(sb, 1);
++ if (err) {
++ *errp = err;
++ return 0;
++ }
++ }
++
++ /*
++ * Check quota for the allocation of these blocks.
++ */
++ while (*len && DQUOT_ALLOC_BLOCK(inode, *len))
++ *len -= 1;
++ if (*len == 0) {
++ *errp = -EDQUOT;
++ block = 0;
++ goto out;
++ }
++
++ /* start searching from the goal */
++ if (goal < le32_to_cpu(es->s_first_data_block) ||
++ goal >= le32_to_cpu(es->s_blocks_count))
++ goal = le32_to_cpu(es->s_first_data_block);
++ group = (goal - le32_to_cpu(es->s_first_data_block)) /
++ EXT3_BLOCKS_PER_GROUP(sb);
++ block = ((goal - le32_to_cpu(es->s_first_data_block)) %
++ EXT3_BLOCKS_PER_GROUP(sb));
++
++ /* set up allocation goals */
++ ac.ac_b_group = ac.ac_b_start = ac.ac_b_len = 0;
++ ac.ac_status = 0;
++ ac.ac_groups_scanned = 0;
++ ac.ac_sb = inode->i_sb;
++ ac.ac_g_group = group;
++ ac.ac_g_start = block;
++ ac.ac_g_len = *len;
++ ac.ac_g_flags = flags;
++
++ /* loop over the groups */
++ for (cr = 0; cr < 3 && ac.ac_status != AC_STATUS_FOUND; cr++) {
++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) {
++ if (group == EXT3_SB(sb)->s_groups_count)
++ group = 0;
++
++ /* check whether the group is good for our criteria */
++ if (!mb_good_group(&ac, group, cr))
++ continue;
++
++ err = ext3_mb_load_desc(ac.ac_sb, group, &e3b);
++ if (err)
++ goto out_err;
++
++ ext3_lock_group(sb, group);
++ if (!mb_good_group(&ac, group, cr)) {
++ /* someone did allocation from this group */
++ ext3_unlock_group(sb, group);
++ ext3_mb_release_desc(&e3b);
++ continue;
++ }
++
++ err = ext3_mb_new_in_group(&ac, &e3b, group);
++ ext3_unlock_group(sb, group);
++ if (ac.ac_status == AC_STATUS_FOUND)
++ ext3_mb_dirty_buddy(&e3b);
++ ext3_mb_release_desc(&e3b);
++ if (err)
++ goto out_err;
++ if (ac.ac_status == AC_STATUS_FOUND)
++ break;
++ }
++ }
++
++ if (ac.ac_status != AC_STATUS_FOUND) {
++ /* unfortunately, we can't satisfy this request */
++ J_ASSERT(ac.ac_b_len == 0);
++ DQUOT_FREE_BLOCK(inode, *len);
++ *errp = -ENOSPC;
++ block = 0;
++ goto out;
++ }
++
++ /* good news - free block(s) have been found. now it's time
++ * to mark the block(s) in the good old journaled bitmap */
++ block = ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb)
++ + ac.ac_b_start + le32_to_cpu(es->s_first_data_block);
++
++ ext3_debug("using block group %d\n", ac.ac_b_group);
++
++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_group);
++ if (!bitmap_bh) {
++ *errp = -EIO;
++ goto out_err;
++ }
++
++ err = ext3_journal_get_write_access(handle, bitmap_bh);
++ if (err) {
++ *errp = err;
++ goto out_err;
++ }
++
++ gdp = ext3_get_group_desc(sb, ac.ac_b_group, &gdp_bh);
++ if (!gdp) {
++ *errp = -EIO;
++ goto out_err;
++ }
++
++ err = ext3_journal_get_write_access(handle, gdp_bh);
++ if (err)
++ goto out_err;
++
++ block = ac.ac_b_start + ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb)
++ + le32_to_cpu(es->s_first_data_block);
++
++ if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
++ block == le32_to_cpu(gdp->bg_inode_bitmap) ||
++ in_range(block, le32_to_cpu(gdp->bg_inode_table),
++ EXT3_SB(sb)->s_itb_per_group))
++ ext3_error(sb, "ext3_new_block",
++ "Allocating block in system zone - "
++ "block = %u", block);
++#if 0
++ for (i = 0; i < ac.ac_b_len; i++)
++ J_ASSERT(!mb_test_bit(ac.ac_b_start + i, bitmap_bh->b_data));
++#endif
++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_start, ac.ac_b_len);
++
++ ext3_lock_group(sb, ac.ac_b_group);
++ gdp->bg_free_blocks_count =
++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -
++ ac.ac_b_len);
++ ext3_unlock_group(sb, ac.ac_b_group);
++ percpu_counter_mod(&sbi->s_freeblocks_counter, -ac.ac_b_len);
++
++ err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++ if (err)
++ goto out_err;
++ err = ext3_journal_dirty_metadata(handle, gdp_bh);
++ if (err)
++ goto out_err;
++
++ sb->s_dirt = 1;
++ *errp = 0;
++ brelse(bitmap_bh);
++
++ /* drop non-allocated, but DQUOT-charged blocks */
++ J_ASSERT(*len >= ac.ac_b_len);
++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_len);
++
++ *len = ac.ac_b_len;
++ J_ASSERT(block != 0);
++ goto out;
++
++out_err:
++ /* if we've already allocated something, roll it back */
++ if (ac.ac_status == AC_STATUS_FOUND) {
++ /* FIXME: free blocks here */
++ }
++
++ DQUOT_FREE_BLOCK(inode, *len);
++ brelse(bitmap_bh);
++ *errp = err;
++ block = 0;
++out:
++ if (!(flags & 2)) {
++ /* the block wasn't reserved before and we reserved it
++ * at the beginning of the allocation. it doesn't matter
++ * whether we allocated anything or we failed: it's time
++ * to release the reservation. NOTE: since I expect any
++ * multiblock request to come from the delayed allocation
++ * path only, this is always a single block */
++ ext3_mb_release_blocks(sb, 1);
++ }
++ return block;
++}
++
++int ext3_mb_generate_buddy(struct super_block *sb, int group)
++{
++ struct buffer_head *bh;
++ int i, err, count = 0;
++ struct ext3_buddy e3b;
++
++ err = ext3_mb_load_desc(sb, group, &e3b);
++ if (err)
++ goto out;
++ memset(e3b.bd_bh->b_data, 0, sb->s_blocksize);
++ memset(e3b.bd_bh2->b_data, 0, sb->s_blocksize);
++
++ bh = read_block_bitmap(sb, group);
++ if (bh == NULL) {
++ err = -EIO;
++ goto out2;
++ }
++
++ /* loop over the blocks, and create buddies for the free ones */
++ for (i = 0; i < sb->s_blocksize * 8; i++) {
++ if (!mb_test_bit(i, (void *) bh->b_data)) {
++ mb_free_blocks(&e3b, i, 1);
++ count++;
++ }
++ }
++ brelse(bh);
++ mb_check_buddy(&e3b);
++ ext3_mb_dirty_buddy(&e3b);
++
++out2:
++ ext3_mb_release_desc(&e3b);
++out:
++ return err;
++}
++
++EXPORT_SYMBOL(ext3_mb_new_blocks);
++
++#define MB_CREDITS \
++ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \
++ + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS)
++
++int ext3_mb_init_backend(struct super_block *sb)
++{
++ struct inode *root = sb->s_root->d_inode;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct dentry *db;
++ tid_t target;
++ int err, i;
++
++ sbi->s_buddy_blocks = kmalloc(sizeof(struct ext3_buddy_group_blocks) *
++ sbi->s_groups_count, GFP_KERNEL);
++ if (sbi->s_buddy_blocks == NULL) {
++ printk("can't allocate mem for buddy maps\n");
++ return -ENOMEM;
++ }
++ memset(sbi->s_buddy_blocks, 0,
++ sizeof(struct ext3_buddy_group_blocks) * sbi->s_groups_count);
++ sbi->s_buddy = NULL;
++
++ down(&root->i_sem);
++ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root,
++ strlen(EXT3_BUDDY_FILE));
++ if (IS_ERR(db)) {
++ err = PTR_ERR(db);
++ printk("can't lookup buddy file: %d\n", err);
++ goto out;
++ }
++
++ if (db->d_inode != NULL) {
++ sbi->s_buddy = igrab(db->d_inode);
++ goto map;
++ }
++
++ err = ext3_create(root, db, S_IFREG, NULL);
++ if (err) {
++ printk("error while creation buddy file: %d\n", err);
++ } else {
++ sbi->s_buddy = igrab(db->d_inode);
++ }
++
++map:
++ for (i = 0; i < sbi->s_groups_count; i++) {
++ struct buffer_head *bh = NULL;
++ handle_t *handle;
++
++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
++ if (IS_ERR(handle)) {
++ err = PTR_ERR(handle);
++ goto out2;
++ }
++
++ /* allocate block for bitmap */
++ bh = ext3_getblk(handle, sbi->s_buddy, i * 2, 1, &err);
++ if (bh == NULL) {
++ printk("can't get block for buddy bitmap: %d\n", err);
++ goto out2;
++ }
++ sbi->s_buddy_blocks[i].bb_bitmap = bh->b_blocknr;
++ brelse(bh);
++
++ /* allocate block for buddy */
++ bh = ext3_getblk(handle, sbi->s_buddy, i * 2 + 1, 1, &err);
++ if (bh == NULL) {
++ printk("can't get block for buddy: %d\n", err);
++ goto out2;
++ }
++ sbi->s_buddy_blocks[i].bb_buddy = bh->b_blocknr;
++ brelse(bh);
++ ext3_journal_stop(handle);
++ spin_lock_init(&sbi->s_buddy_blocks[i].bb_lock);
++ sbi->s_buddy_blocks[i].bb_md_cur = NULL;
++ sbi->s_buddy_blocks[i].bb_tid = 0;
++ }
++
++ if (journal_start_commit(sbi->s_journal, &target))
++ log_wait_commit(sbi->s_journal, target);
++
++out2:
++ dput(db);
++out:
++ up(&root->i_sem);
++ return err;
++}
++
++int ext3_mb_release(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++ if (!test_opt(sb, MBALLOC))
++ return 0;
++
++ /* release freed, non-committed blocks */
++ spin_lock(&sbi->s_md_lock);
++ list_splice_init(&sbi->s_closed_transaction,
++ &sbi->s_committed_transaction);
++ list_splice_init(&sbi->s_active_transaction,
++ &sbi->s_committed_transaction);
++ spin_unlock(&sbi->s_md_lock);
++ ext3_mb_free_committed_blocks(sb);
++
++ if (sbi->s_buddy_blocks)
++ kfree(sbi->s_buddy_blocks);
++ if (sbi->s_buddy)
++ iput(sbi->s_buddy);
++ if (sbi->s_blocks_reserved)
++ printk("ext3-fs: %ld blocks being reserved at umount!\n",
++ sbi->s_blocks_reserved);
++ return 0;
++}
++
++int ext3_mb_init(struct super_block *sb)
++{
++ struct ext3_super_block *es;
++ int i;
++
++ if (!test_opt(sb, MBALLOC))
++ return 0;
++
++ /* init file for buddy data */
++ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
++ ext3_mb_init_backend(sb);
++
++ es = EXT3_SB(sb)->s_es;
++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
++ ext3_mb_generate_buddy(sb, i);
++ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock);
++ spin_lock_init(&EXT3_SB(sb)->s_md_lock);
++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction);
++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction);
++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction);
++ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
++ printk("EXT3-fs: mballoc enabled\n");
++ return 0;
++}
++
++void ext3_mb_free_committed_blocks(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ int err, i, count = 0, count2 = 0;
++ struct ext3_free_metadata *md;
++ struct ext3_buddy e3b;
++
++ if (list_empty(&sbi->s_committed_transaction))
++ return;
++
++ /* there are committed blocks still to be freed */
++ do {
++ /* get next array of blocks */
++ md = NULL;
++ spin_lock(&sbi->s_md_lock);
++ if (!list_empty(&sbi->s_committed_transaction)) {
++ md = list_entry(sbi->s_committed_transaction.next,
++ struct ext3_free_metadata, list);
++ list_del(&md->list);
++ }
++ spin_unlock(&sbi->s_md_lock);
++
++ if (md == NULL)
++ break;
++
++ mb_debug("gonna free %u blocks in group %u (0x%p):",
++ md->num, md->group, md);
++
++ err = ext3_mb_load_desc(sb, md->group, &e3b);
++ BUG_ON(err != 0);
++
++ /* there are blocks to put in buddy to make them really free */
++ count += md->num;
++ count2++;
++ ext3_lock_group(sb, md->group);
++ for (i = 0; i < md->num; i++) {
++ mb_debug(" %u", md->blocks[i]);
++ mb_free_blocks(&e3b, md->blocks[i], 1);
++ }
++ mb_debug("\n");
++ ext3_unlock_group(sb, md->group);
++
++ kfree(md);
++ ext3_mb_dirty_buddy(&e3b);
++ ext3_mb_release_desc(&e3b);
++
++ } while (md);
++ mb_debug("freed %u blocks in %u structures\n", count, count2);
++}
++
++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++ if (sbi->s_last_transaction == handle->h_transaction->t_tid)
++ return;
++
++ /* new transaction! time to close the last one and free blocks
++ * for the committed transaction. we know that only one
++ * transaction can be active, so the previous transaction may
++ * still be being logged, while the transaction before the
++ * previous one is known to be already logged. this means we may
++ * now free blocks freed in all transactions before the previous
++ * one. hope I'm clear enough ... */
++
++ spin_lock(&sbi->s_md_lock);
++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
++ mb_debug("new transaction %lu, old %lu\n",
++ (unsigned long) handle->h_transaction->t_tid,
++ (unsigned long) sbi->s_last_transaction);
++ list_splice_init(&sbi->s_closed_transaction,
++ &sbi->s_committed_transaction);
++ list_splice_init(&sbi->s_active_transaction,
++ &sbi->s_closed_transaction);
++ sbi->s_last_transaction = handle->h_transaction->t_tid;
++ }
++ spin_unlock(&sbi->s_md_lock);
++
++ ext3_mb_free_committed_blocks(sb);
++}
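++/*
++ * timeline sketch: with transactions T1, T2, T3 in sequence, while
++ * T3 is active the lists hold: active = blocks freed under T3,
++ * closed = blocks freed under T2 (possibly still committing), and
++ * committed = blocks freed under T1, now safe to hand out again.
++ */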
++
++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b,
++ int group, int block, int count)
++{
++ struct ext3_buddy_group_blocks *db = e3b->bd_bd;
++ struct super_block *sb = e3b->bd_sb;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_free_metadata *md;
++ int i;
++
++ ext3_lock_group(sb, group);
++ for (i = 0; i < count; i++) {
++ md = db->bb_md_cur;
++ if (md && db->bb_tid != handle->h_transaction->t_tid) {
++ db->bb_md_cur = NULL;
++ md = NULL;
++ }
++
++ if (md == NULL) {
++ ext3_unlock_group(sb, group);
++ md = kmalloc(sizeof(*md), GFP_KERNEL);
++ if (md == NULL)
++ return -ENOMEM;
++ md->num = 0;
++ md->group = group;
++
++ ext3_lock_group(sb, group);
++ if (db->bb_md_cur == NULL) {
++ spin_lock(&sbi->s_md_lock);
++ list_add(&md->list, &sbi->s_active_transaction);
++ spin_unlock(&sbi->s_md_lock);
++ db->bb_md_cur = md;
++ db->bb_tid = handle->h_transaction->t_tid;
++ mb_debug("new md 0x%p for group %u\n",
++ md, md->group);
++ } else {
++ kfree(md);
++ md = db->bb_md_cur;
++ }
++ }
++
++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS);
++ md->blocks[md->num] = block + i;
++ md->num++;
++ if (md->num == EXT3_BB_MAX_BLOCKS) {
++ /* no more space, put full container on a sb's list */
++ db->bb_md_cur = NULL;
++ }
++ }
++ ext3_unlock_group(sb, group);
++ return 0;
++}
++
++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode,
++ unsigned long block, unsigned long count, int metadata)
++{
++ struct buffer_head *bitmap_bh = NULL;
++ struct ext3_group_desc *gdp;
++ struct ext3_super_block *es;
++ unsigned long bit, overflow;
++ struct buffer_head *gd_bh;
++ unsigned long block_group;
++ struct ext3_sb_info *sbi;
++ struct super_block *sb;
++ struct ext3_buddy e3b;
++ int err = 0, ret;
++
++ sb = inode->i_sb;
++ if (!sb) {
++ printk ("ext3_free_blocks: nonexistent device");
++ return;
++ }
++
++ ext3_mb_poll_new_transaction(sb, handle);
++
++ sbi = EXT3_SB(sb);
++ es = EXT3_SB(sb)->s_es;
++ if (block < le32_to_cpu(es->s_first_data_block) ||
++ block + count < block ||
++ block + count > le32_to_cpu(es->s_blocks_count)) {
++ ext3_error (sb, "ext3_free_blocks",
++ "Freeing blocks not in datazone - "
++ "block = %lu, count = %lu", block, count);
++ goto error_return;
++ }
++
++ ext3_debug("freeing block %lu\n", block);
++
++do_more:
++ overflow = 0;
++ block_group = (block - le32_to_cpu(es->s_first_data_block)) /
++ EXT3_BLOCKS_PER_GROUP(sb);
++ bit = (block - le32_to_cpu(es->s_first_data_block)) %
++ EXT3_BLOCKS_PER_GROUP(sb);
++ /*
++ * Check to see if we are freeing blocks across a group
++ * boundary.
++ */
++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
++ count -= overflow;
++ }
++ brelse(bitmap_bh);
++ bitmap_bh = read_block_bitmap(sb, block_group);
++ if (!bitmap_bh)
++ goto error_return;
++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh);
++ if (!gdp)
++ goto error_return;
++
++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
++ in_range (block, le32_to_cpu(gdp->bg_inode_table),
++ EXT3_SB(sb)->s_itb_per_group) ||
++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
++ EXT3_SB(sb)->s_itb_per_group))
++ ext3_error (sb, "ext3_free_blocks",
++ "Freeing blocks in system zones - "
++ "Block = %lu, count = %lu",
++ block, count);
++
++ BUFFER_TRACE(bitmap_bh, "getting write access");
++ err = ext3_journal_get_write_access(handle, bitmap_bh);
++ if (err)
++ goto error_return;
++
++ /*
++ * We are about to modify some metadata. Call the journal APIs
++ * to unshare ->b_data if a currently-committing transaction is
++ * using it
++ */
++ BUFFER_TRACE(gd_bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, gd_bh);
++ if (err)
++ goto error_return;
++
++ err = ext3_mb_load_desc(sb, block_group, &e3b);
++ if (err)
++ goto error_return;
++
++ if (metadata) {
++ /* the blocks being freed are metadata; they must not be
++  * reused until this transaction is committed */
++ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count);
++ } else {
++ ext3_lock_group(sb, block_group);
++ mb_free_blocks(&e3b, bit, count);
++ gdp->bg_free_blocks_count =
++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
++ ext3_unlock_group(sb, block_group);
++ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
++ }
++
++ ext3_mb_dirty_buddy(&e3b);
++ ext3_mb_release_desc(&e3b);
++
++ /* FIXME: undo logic will be implemented later and another way */
++ mb_clear_bits(bitmap_bh->b_data, bit, count);
++ DQUOT_FREE_BLOCK(inode, count);
++
++ /* We dirtied the bitmap block */
++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
++ err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++
++ /* And the group descriptor block */
++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
++ ret = ext3_journal_dirty_metadata(handle, gd_bh);
++ if (!err) err = ret;
++
++ if (overflow && !err) {
++ block += count;
++ count = overflow;
++ goto do_more;
++ }
++ sb->s_dirt = 1;
++error_return:
++ brelse(bitmap_bh);
++ ext3_std_error(sb, err);
++ return;
++}
++
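++/*
++ * Set aside @blocks from the free-block count so a later allocation
++ * cannot run the fs out of space; paired with ext3_mb_release_blocks().
++ */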
++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ int free, ret = -ENOSPC;
++
++ BUG_ON(blocks < 0);
++ spin_lock(&sbi->s_reserve_lock);
++ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
++ if (blocks <= free - sbi->s_blocks_reserved) {
++ sbi->s_blocks_reserved += blocks;
++ ret = 0;
++ }
++ spin_unlock(&sbi->s_reserve_lock);
++ return ret;
++}
++
++void ext3_mb_release_blocks(struct super_block *sb, int blocks)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++ BUG_ON(blocks < 0);
++ spin_lock(&sbi->s_reserve_lock);
++ sbi->s_blocks_reserved -= blocks;
++ WARN_ON(sbi->s_blocks_reserved < 0);
++ if (sbi->s_blocks_reserved < 0)
++ sbi->s_blocks_reserved = 0;
++ spin_unlock(&sbi->s_reserve_lock);
++}
++
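++/* the entry points below dispatch on the "mballoc" mount option: with it
++ * set, block allocation and freeing go through the buddy allocator */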
++int ext3_new_block(handle_t *handle, struct inode *inode,
++ unsigned long goal, int *errp)
++{
++ int ret, len;
++
++ if (!test_opt(inode->i_sb, MBALLOC)) {
++ ret = ext3_new_block_old(handle, inode, goal, errp);
++ goto out;
++ }
++ len = 1;
++ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp);
++out:
++ return ret;
++}
++
++
++void ext3_free_blocks(handle_t *handle, struct inode * inode,
++ unsigned long block, unsigned long count, int metadata)
++{
++ if (!test_opt(inode->i_sb, MBALLOC))
++ ext3_free_blocks_old(handle, inode, block, count);
++ else
++ ext3_mb_free_blocks(handle, inode, block, count, metadata);
++ return;
++}
++
+Index: linux-2.6.5-sles9/fs/ext3/super.c
+===================================================================
+--- linux-2.6.5-sles9.orig/fs/ext3/super.c 2004-11-09 02:23:21.597220752 +0300
++++ linux-2.6.5-sles9/fs/ext3/super.c 2004-11-09 02:26:12.572228600 +0300
+@@ -389,6 +389,7 @@
+ struct ext3_super_block *es = sbi->s_es;
+ int i;
+
++ ext3_mb_release(sb);
+ ext3_ext_release(sb);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+@@ -542,7 +543,7 @@
+ Opt_commit, Opt_journal_update, Opt_journal_inum,
+ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+- Opt_err, Opt_extents, Opt_extdebug
++ Opt_err, Opt_extents, Opt_extdebug, Opt_mballoc,
+ };
+
+ static match_table_t tokens = {
+@@ -589,6 +590,7 @@
+ {Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_extents, "extents"},
+ {Opt_extdebug, "extdebug"},
++ {Opt_mballoc, "mballoc"},
+ {Opt_err, NULL}
+ };
+
+@@ -810,6 +812,9 @@
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
++ case Opt_mballoc:
++ set_opt (sbi->s_mount_opt, MBALLOC);
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+@@ -1463,7 +1468,8 @@
+ ext3_count_dirs(sb));
+
+ ext3_ext_init(sb);
+-
++ ext3_mb_init(sb);
++
+ return 0;
+
+ failed_mount3:
+Index: linux-2.6.5-sles9/fs/ext3/Makefile
+===================================================================
+--- linux-2.6.5-sles9.orig/fs/ext3/Makefile 2004-11-09 02:23:21.593221360 +0300
++++ linux-2.6.5-sles9/fs/ext3/Makefile 2004-11-09 02:26:12.572228600 +0300
+@@ -5,7 +5,7 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
+
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+- ioctl.o namei.o super.o symlink.o hash.o extents.o
++ ioctl.o namei.o super.o symlink.o hash.o extents.o mballoc.o
+
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+Index: linux-2.6.5-sles9/fs/ext3/balloc.c
+===================================================================
+--- linux-2.6.5-sles9.orig/fs/ext3/balloc.c 2004-11-03 08:36:51.000000000 +0300
++++ linux-2.6.5-sles9/fs/ext3/balloc.c 2004-11-09 02:26:53.078070776 +0300
+@@ -78,7 +78,7 @@
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+-static struct buffer_head *
++struct buffer_head *
+ read_block_bitmap(struct super_block *sb, unsigned int block_group)
+ {
+ struct ext3_group_desc * desc;
+@@ -274,7 +274,7 @@
+ }
+
+ /* Free given blocks, update quota and i_blocks field */
+-void ext3_free_blocks(handle_t *handle, struct inode *inode,
++void ext3_free_blocks_old(handle_t *handle, struct inode *inode,
+ unsigned long block, unsigned long count)
+ {
+ struct buffer_head *bitmap_bh = NULL;
+@@ -1142,7 +1142,7 @@
+ * bitmap, and then for any free bit if that fails.
+ * This function also updates quota and i_blocks field.
+ */
+-int ext3_new_block(handle_t *handle, struct inode *inode,
++int ext3_new_block_old(handle_t *handle, struct inode *inode,
+ unsigned long goal, int *errp)
+ {
+ struct buffer_head *bitmap_bh = NULL;
+Index: linux-2.6.5-sles9/fs/ext3/namei.c
+===================================================================
+--- linux-2.6.5-sles9.orig/fs/ext3/namei.c 2004-11-09 02:18:27.616912552 +0300
++++ linux-2.6.5-sles9/fs/ext3/namei.c 2004-11-09 02:26:12.580227384 +0300
+@@ -1640,7 +1640,7 @@
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ */
+-static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
++int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
+ struct nameidata *nd)
+ {
+ handle_t *handle;
+Index: linux-2.6.5-sles9/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.5-sles9.orig/fs/ext3/inode.c 2004-11-09 02:23:21.592221512 +0300
++++ linux-2.6.5-sles9/fs/ext3/inode.c 2004-11-09 02:26:12.587226320 +0300
+@@ -572,7 +572,7 @@
+ ext3_journal_forget(handle, branch[i].bh);
+ }
+ for (i = 0; i < keys; i++)
+- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
+ return err;
+ }
+
+@@ -673,7 +673,7 @@
+ if (err == -EAGAIN)
+ for (i = 0; i < num; i++)
+ ext3_free_blocks(handle, inode,
+- le32_to_cpu(where[i].key), 1);
++ le32_to_cpu(where[i].key), 1, 1);
+ return err;
+ }
+
+@@ -1829,7 +1829,7 @@
+ }
+ }
+
+- ext3_free_blocks(handle, inode, block_to_free, count);
++ ext3_free_blocks(handle, inode, block_to_free, count, 1);
+ }
+
+ /**
+@@ -2000,7 +2000,7 @@
+ ext3_journal_test_restart(handle, inode);
+ }
+
+- ext3_free_blocks(handle, inode, nr, 1);
++ ext3_free_blocks(handle, inode, nr, 1, 1);
+
+ if (parent_bh) {
+ /*
+Index: linux-2.6.5-sles9/fs/ext3/extents.c
+===================================================================
+--- linux-2.6.5-sles9.orig/fs/ext3/extents.c 2004-11-09 02:25:56.143726112 +0300
++++ linux-2.6.5-sles9/fs/ext3/extents.c 2004-11-09 02:26:12.591225712 +0300
+@@ -740,7 +740,7 @@
+ for (i = 0; i < depth; i++) {
+ if (!ablocks[i])
+ continue;
+- ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1);
+ }
+ }
+ kfree(ablocks);
+@@ -1391,7 +1391,7 @@
+ path->p_idx->ei_leaf);
+ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
+ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
+- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1);
+ return err;
+ }
+
+@@ -1879,10 +1879,12 @@
+ int needed = ext3_remove_blocks_credits(tree, ex, from, to);
+ handle_t *handle = ext3_journal_start(tree->inode, needed);
+ struct buffer_head *bh;
+- int i;
++ int i, metadata = 0;
+
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
++ if (S_ISDIR(tree->inode->i_mode))
++ metadata = 1;
+ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
+ /* tail removal */
+ unsigned long num, start;
+@@ -1894,7 +1896,7 @@
+ bh = sb_find_get_block(tree->inode->i_sb, start + i);
+ ext3_forget(handle, 0, tree->inode, bh, start + i);
+ }
+- ext3_free_blocks(handle, tree->inode, start, num);
++ ext3_free_blocks(handle, tree->inode, start, num, metadata);
+ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
+ printk("strange request: removal %lu-%lu from %u:%u\n",
+ from, to, ex->ee_block, ex->ee_len);
+Index: linux-2.6.5-sles9/fs/ext3/xattr.c
+===================================================================
+--- linux-2.6.5-sles9.orig/fs/ext3/xattr.c 2004-11-09 02:22:55.777146000 +0300
++++ linux-2.6.5-sles9/fs/ext3/xattr.c 2004-11-09 02:26:12.593225408 +0300
+@@ -1366,7 +1366,7 @@
+ new_bh = sb_getblk(sb, block);
+ if (!new_bh) {
+ getblk_failed:
+- ext3_free_blocks(handle, inode, block, 1);
++ ext3_free_blocks(handle, inode, block, 1, 1);
+ error = -EIO;
+ goto cleanup;
+ }
+@@ -1408,7 +1408,7 @@
+ if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
+ /* Free the old block. */
+ ea_bdebug(old_bh, "freeing");
+- ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1);
++ ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1, 1);
+
+ /* ext3_forget() calls bforget() for us, but we
+ let our caller release old_bh, so we need to
+@@ -1504,7 +1504,7 @@
+ lock_buffer(bh);
+ if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
+ ext3_xattr_cache_remove(bh);
+- ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1);
++ ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1, 1);
+ get_bh(bh);
+ ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl);
+ } else {
+Index: linux-2.6.5-sles9/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.5-sles9.orig/include/linux/ext3_fs.h 2004-11-09 02:25:17.238640584 +0300
++++ linux-2.6.5-sles9/include/linux/ext3_fs.h 2004-11-09 02:26:12.596224952 +0300
+@@ -57,6 +57,8 @@
+ #define ext3_debug(f, a...) do {} while (0)
+ #endif
+
++#define EXT3_MULTIBLOCK_ALLOCATOR 1
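++/* feature-test macro: lets dependent code detect that the multiblock
++ * allocator patch is applied to these headers */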
++
+ /*
+ * Special inodes numbers
+ */
+@@ -339,6 +341,7 @@
+ #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */
+ #define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */
+ #define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */
++#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+@@ -698,7 +701,7 @@
+ extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
+ extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
+ extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
+- unsigned long);
++ unsigned long, int);
+ extern unsigned long ext3_count_free_blocks (struct super_block *);
+ extern void ext3_check_blocks_bitmap (struct super_block *);
+ extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
+Index: linux-2.6.5-sles9/include/linux/ext3_fs_sb.h
+===================================================================
+--- linux-2.6.5-sles9.orig/include/linux/ext3_fs_sb.h 2004-11-09 02:20:51.598024096 +0300
++++ linux-2.6.5-sles9/include/linux/ext3_fs_sb.h 2004-11-09 02:28:18.753046200 +0300
+@@ -23,10 +23,30 @@
+ #define EXT_INCLUDE
+ #include <linux/blockgroup_lock.h>
+ #include <linux/percpu_counter.h>
++#include <linux/list.h>
+ #endif
+ #endif
+ #include <linux/rbtree.h>
+
++#define EXT3_BB_MAX_BLOCKS 30
++struct ext3_free_metadata {
++ unsigned short group;
++ unsigned short num;
++ unsigned short blocks[EXT3_BB_MAX_BLOCKS];
++ struct list_head list;
++};
++
++#define EXT3_BB_MAX_ORDER 14
++
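++/* per-group buddy state: disk locations of the block and buddy bitmaps,
++ * plus free-chunk counters for orders 0..EXT3_BB_MAX_ORDER-1 */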
++struct ext3_buddy_group_blocks {
++ sector_t bb_bitmap;
++ sector_t bb_buddy;
++ spinlock_t bb_lock;
++ unsigned bb_counters[EXT3_BB_MAX_ORDER];
++ struct ext3_free_metadata *bb_md_cur;
++ unsigned long bb_tid;
++};
++
+ /*
+ * third extended-fs super-block data in memory
+ */
+@@ -78,6 +98,17 @@
+ struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
+ wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
+ #endif
++
++ /* for buddy allocator */
++ struct ext3_buddy_group_blocks *s_buddy_blocks;
++ struct inode *s_buddy;
++ long s_blocks_reserved;
++ spinlock_t s_reserve_lock;
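++ /* freed-metadata containers move between these lists as their
++  * owning journal transaction goes active -> closed -> committed */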
++ struct list_head s_active_transaction;
++ struct list_head s_closed_transaction;
++ struct list_head s_committed_transaction;
++ spinlock_t s_md_lock;
++ tid_t s_last_transaction;
+ };
+
+ #endif /* _LINUX_EXT3_FS_SB */
--- /dev/null
+Index: linux-2.6.7/fs/ext3/namei.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/namei.c 2004-06-15 23:19:36.000000000 -0600
++++ linux-2.6.7/fs/ext3/namei.c 2004-08-20 17:48:54.000000000 -0600
+@@ -1596,11 +1596,17 @@ static int ext3_delete_entry (handle_t *
+ static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
+ {
+ inode->i_nlink++;
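++ /* on an indexed (htree) directory, i_nlink == 1 means "link count
++  * unknown"; this is what allows more than EXT3_LINK_MAX subdirs */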
++ if (is_dx(inode) && inode->i_nlink > 1) {
++ /* limit is 16-bit i_links_count */
++ if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2)
++ inode->i_nlink = 1;
++ }
+ }
+
+ static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
+ {
+- inode->i_nlink--;
++ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
++ inode->i_nlink--;
+ }
+
+ static int ext3_add_nondir(handle_t *handle,
+@@ -1693,7 +1698,7 @@ static int ext3_mkdir(struct inode * dir
+ struct ext3_dir_entry_2 * de;
+ int err;
+
+- if (dir->i_nlink >= EXT3_LINK_MAX)
++ if (EXT3_DIR_LINK_MAXED(dir))
+ return -EMLINK;
+
+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
+@@ -1715,7 +1720,7 @@ static int ext3_mkdir(struct inode * dir
+ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+ dir_block = ext3_bread (handle, inode, 0, 1, &err);
+ if (!dir_block) {
+- inode->i_nlink--; /* is this nlink == 0? */
++ ext3_dec_count(handle, inode); /* is this nlink == 0? */
+ ext3_mark_inode_dirty(handle, inode);
+ iput (inode);
+ goto out_stop;
+@@ -1747,7 +1752,7 @@ static int ext3_mkdir(struct inode * dir
+ iput (inode);
+ goto out_stop;
+ }
+- dir->i_nlink++;
++ ext3_inc_count(handle, dir);
+ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+ d_instantiate(dentry, inode);
+@@ -2010,10 +2015,10 @@ static int ext3_rmdir (struct inode * di
+ retval = ext3_delete_entry(handle, dir, de, bh);
+ if (retval)
+ goto end_rmdir;
+- if (inode->i_nlink != 2)
+- ext3_warning (inode->i_sb, "ext3_rmdir",
+- "empty directory has nlink!=2 (%d)",
+- inode->i_nlink);
++ if (!EXT3_DIR_LINK_EMPTY(inode))
++ ext3_warning(inode->i_sb, "ext3_rmdir",
++ "empty directory has too many links (%d)",
++ inode->i_nlink);
+ inode->i_version++;
+ inode->i_nlink = 0;
+ /* There's no need to set i_disksize: the fact that i_nlink is
+@@ -2023,7 +2028,7 @@ static int ext3_rmdir (struct inode * di
+ ext3_orphan_add(handle, inode);
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ ext3_mark_inode_dirty(handle, inode);
+- dir->i_nlink--;
++ ext3_dec_count(handle, dir);
+ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+
+@@ -2074,7 +2079,7 @@ static int ext3_unlink(struct inode * di
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+- inode->i_nlink--;
++ ext3_dec_count(handle, inode);
+ if (!inode->i_nlink)
+ ext3_orphan_add(handle, inode);
+ inode->i_ctime = dir->i_ctime;
+@@ -2146,7 +2151,7 @@ static int ext3_link (struct dentry * ol
+ struct inode *inode = old_dentry->d_inode;
+ int err;
+
+- if (inode->i_nlink >= EXT3_LINK_MAX)
++ if (EXT3_DIR_LINK_MAXED(inode))
+ return -EMLINK;
+
+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
+@@ -2230,8 +2235,8 @@ static int ext3_rename (struct inode * o
+ if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+ goto end_rename;
+ retval = -EMLINK;
+- if (!new_inode && new_dir!=old_dir &&
+- new_dir->i_nlink >= EXT3_LINK_MAX)
++ if (!new_inode && new_dir != old_dir &&
++ EXT3_DIR_LINK_MAXED(new_dir))
+ goto end_rename;
+ }
+ if (!new_bh) {
+@@ -2288,7 +2293,7 @@ static int ext3_rename (struct inode * o
+ }
+
+ if (new_inode) {
+- new_inode->i_nlink--;
++ ext3_dec_count(handle, new_inode);
+ new_inode->i_ctime = CURRENT_TIME;
+ }
+ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+@@ -2299,11 +2304,11 @@ static int ext3_rename (struct inode * o
+ PARENT_INO(dir_bh->b_data) = le32_to_cpu(new_dir->i_ino);
+ BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
+ ext3_journal_dirty_metadata(handle, dir_bh);
+- old_dir->i_nlink--;
++ ext3_dec_count(handle, old_dir);
+ if (new_inode) {
+- new_inode->i_nlink--;
++ ext3_dec_count(handle, new_inode);
+ } else {
+- new_dir->i_nlink++;
++ ext3_inc_count(handle, new_dir);
+ ext3_update_dx_flag(new_dir);
+ ext3_mark_inode_dirty(handle, new_dir);
+ }
+Index: linux-2.6.7/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/ext3_fs.h 2004-06-15 23:19:36.000000000 -0600
++++ linux-2.6.7/include/linux/ext3_fs.h 2004-08-20 17:41:27.000000000 -0600
+@@ -41,7 +41,7 @@ struct statfs;
+ /*
+ * Always enable hashed directories
+ */
+-#define CONFIG_EXT3_INDEX
++#define CONFIG_EXT3_INDEX 1
+
+ /*
+ * Debug code
+@@ -79,7 +81,7 @@
+ /*
+ * Maximal count of links to a file
+ */
+-#define EXT3_LINK_MAX 32000
++#define EXT3_LINK_MAX 65000
+
+ /*
+ * Macro-instructions used to manage several block sizes
+@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 {
+ */
+
+ #ifdef CONFIG_EXT3_INDEX
+- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
+- EXT3_FEATURE_COMPAT_DIR_INDEX) && \
++#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
++ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
+ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
+-#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
+-#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
++#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
++ (is_dx(dir) && (dir)->i_nlink == 1))
+ #else
+ #define is_dx(dir) 0
+-#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
+ #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
+ #endif
+
Index: linux-stage/fs/ext3/Makefile
===================================================================
---- linux-stage.orig/fs/ext3/Makefile 2004-05-11 17:21:20.000000000 -0400
-+++ linux-stage/fs/ext3/Makefile 2004-05-11 17:21:21.000000000 -0400
+--- linux-stage.orig/fs/ext3/Makefile 2004-11-03 14:41:24.747805262 -0500
++++ linux-stage/fs/ext3/Makefile 2004-11-03 14:41:25.123696274 -0500
@@ -4,7 +4,7 @@
obj-$(CONFIG_EXT3_FS) += ext3.o
ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
Index: linux-stage/fs/ext3/inode.c
===================================================================
---- linux-stage.orig/fs/ext3/inode.c 2004-05-11 17:21:21.000000000 -0400
-+++ linux-stage/fs/ext3/inode.c 2004-05-11 17:21:21.000000000 -0400
+--- linux-stage.orig/fs/ext3/inode.c 2004-11-03 14:41:25.040720333 -0500
++++ linux-stage/fs/ext3/inode.c 2004-11-03 14:46:08.458515670 -0500
@@ -37,6 +37,7 @@
#include <linux/mpage.h>
#include <linux/uio.h>
#include "acl.h"
/*
-@@ -2472,6 +2473,9 @@
- ei->i_acl = EXT3_ACL_NOT_CACHED;
+@@ -2401,6 +2402,9 @@
ei->i_default_acl = EXT3_ACL_NOT_CACHED;
#endif
-+ if (ext3_iopen_get_inode(inode))
-+ return;
-+
+ ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
++
++ if (ext3_iopen_get_inode(inode))
++ return;
+
if (ext3_get_inode_loc(inode, &iloc, 0))
goto bad_inode;
- bh = iloc.bh;
Index: linux-stage/fs/ext3/iopen.c
===================================================================
--- linux-stage.orig/fs/ext3/iopen.c 1969-12-31 19:00:00.000000000 -0500
-+++ linux-stage/fs/ext3/iopen.c 2004-05-11 17:21:21.000000000 -0400
++++ linux-stage/fs/ext3/iopen.c 2004-11-03 14:41:25.125695694 -0500
@@ -0,0 +1,272 @@
+/*
+ * linux/fs/ext3/iopen.c
Index: linux-stage/fs/ext3/iopen.h
===================================================================
--- linux-stage.orig/fs/ext3/iopen.h 1969-12-31 19:00:00.000000000 -0500
-+++ linux-stage/fs/ext3/iopen.h 2004-05-11 17:21:21.000000000 -0400
++++ linux-stage/fs/ext3/iopen.h 2004-11-03 14:41:25.126695404 -0500
@@ -0,0 +1,15 @@
+/*
+ * iopen.h
+ struct inode *inode, int rehash);
Index: linux-stage/fs/ext3/namei.c
===================================================================
---- linux-stage.orig/fs/ext3/namei.c 2004-05-11 17:21:20.000000000 -0400
-+++ linux-stage/fs/ext3/namei.c 2004-05-11 17:21:21.000000000 -0400
+--- linux-stage.orig/fs/ext3/namei.c 2004-11-03 14:41:24.957744391 -0500
++++ linux-stage/fs/ext3/namei.c 2004-11-03 14:41:25.127695114 -0500
@@ -37,6 +37,7 @@
#include <linux/buffer_head.h>
#include <linux/smp_lock.h>
}
-@@ -2019,10 +2021,6 @@
+@@ -2029,10 +2031,6 @@
inode->i_nlink);
inode->i_version++;
inode->i_nlink = 0;
ext3_orphan_add(handle, inode);
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
ext3_mark_inode_dirty(handle, inode);
-@@ -2139,6 +2137,23 @@
+@@ -2152,6 +2150,23 @@
return err;
}
static int ext3_link (struct dentry * old_dentry,
struct inode * dir, struct dentry *dentry)
{
-@@ -2161,7 +2176,8 @@
+@@ -2175,7 +2190,8 @@
ext3_inc_count(handle, inode);
atomic_inc(&inode->i_count);
+ err = ext3_add_link(handle, dentry, inode);
+ ext3_orphan_del(handle, inode);
ext3_journal_stop(handle);
- return err;
- }
+ if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
Index: linux-stage/fs/ext3/super.c
===================================================================
---- linux-stage.orig/fs/ext3/super.c 2004-05-11 17:21:21.000000000 -0400
-+++ linux-stage/fs/ext3/super.c 2004-05-11 17:44:53.000000000 -0400
-@@ -536,7 +536,7 @@
- Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_noload,
+--- linux-stage.orig/fs/ext3/super.c 2004-11-03 14:41:25.043719463 -0500
++++ linux-stage/fs/ext3/super.c 2004-11-03 14:41:25.129694535 -0500
+@@ -534,7 +534,7 @@
+ Opt_reservation, Opt_noreservation, Opt_noload,
Opt_commit, Opt_journal_update, Opt_journal_inum,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
- Opt_ignore, Opt_barrier,
{Opt_err, NULL}
};
-@@ -772,6 +775,18 @@
+@@ -778,6 +781,18 @@
else
clear_opt(sbi->s_mount_opt, BARRIER);
break;
-tbd Cluster File Systems, Inc. <info@clusterfs.com>
- * version 1.2.x
+tbd Cluster File Systems, Inc. <info@clusterfs.com>
+ * version 1.3.4
+ * bug fixes
+ - fixes from lustre 1.2.8
+ - print NAL number in %x format (4645)
+ - the watchdog thread now runs as interruptible (5246)
+ - drop import inflight refcount on signal_completed_replay error (5255)
+ * miscellania
+ - add pid to ldlm debugging output (4922)
+
+2004-10-08 Cluster File Systems, Inc. <info@clusterfs.com>
+ * version 1.3.3
+ * bug fixes
+ - properly handle portals process identifiers in messages (4165)
+ - finish default directory EA handling (3048)
+ - fixes from lustre 1.2.7
+ - removed PTL_MD_KIOV usage under CRAY_PORTALS (4420)
+ - allow EADDRNOTAVAIL as retry for connect in liblustre tcpnal (4822)
+
+2004-09-16 Cluster File Systems, Inc. <info@clusterfs.com>
+ * version 1.3.2
+ * bug fixes
+ - many liblustre fixes
+ - fixes from lustre 1.2.6
+ * miscellania
+ - update to new libsysio-head-0806
+ - reorganization of lov code
+
+2004-08-30 Cluster File Systems, Inc. <info@clusterfs.com>
+ * version 1.3.1
+ * bug fixes
+ - add locking for mmapped files (2828)
+ - lmc/lconf changes to support multiple interfaces (3376)
+ - fixes from lustre 1.2.5
+
+2004-08-14 Cluster File Systems, Inc. <info@clusterfs.com>
+ * version 1.3.0
* bug fixes
- don't dereference NULL peer_ni in ldlm_handle_ast_error (3258)
- don't allow unlinking open directory if it isn't empty (2904)
 - choose better nal ids in liblustre (3292)
- initialize liblustre with uid/group membership (2862)
- let lconf resolve symlinked-to devices (4629)
+ - balance journal closure when 2.6 filter write fails (3401)
+ - add second rpc_lock and last_rcvd info for close reqs (3462)
+ - don't hold llog sem during network request (3652)
+ - update server last transno after client disconnects (2525)
+ - replace config semaphore with spinlock (3306)
+ - ext3 exents and multi-block allocation (3024)
+ - service time statistics in /proc
+ - minor fixes to liblustre build (3317)
+ - client recovery without upcall (3262)
+ - use transno after validating reply (3892)
+ - use different name for 2nd ptlrpcd thread (3887)
+ - get a client lock in ll_inode_revalidate_it (3597)
+ - direct IO reads on OST (4048)
+ - process timed out requests if import state changes (3754)
+ - ignore -ENOENT errors in osc_destroy (3639)
+ - fixes from lustre 1.2.0-1.2.4
+ * miscellania
+ - use "CATALOGS" for the llog catalogs, not "CATLIST" (old) (b=2841)
+ - added kernel patch for /dev/sd I/O stats (4385)
+
+2004-11-16 Cluster File Systems, Inc. <info@clusterfs.com>
+ * version 1.2.8
+ * bug fixes
+ - fix TCP_NODELAY bug, which caused extreme perf regression (5134)
+ - allocate qswnal tx descriptors singly to avoid fragmentation (4504)
+ - don't LBUG on obdo_alloc() failure, use OBD_SLAB_ALLOC() (4800)
+ - fix NULL dereference in /proc/sys/portals/routes (4827)
+ - allow failed mdc_close() operations to be interrupted (4561)
+ - stop precreate on OST before MDS would time out on it (4778)
+ - don't free dentries not owned by NFS code, check generation (4806)
+ - fix lsm leak if mds_create_objects() fails (4801)
+ - limit debug_daemon file size, always print CERROR messages (4789)
+ - use transno after validating reply (3892)
+ - process timed out requests if import state changes (3754)
+ - update mtime on OST during writes, return in glimpse (4829)
+ - add mkfsoptions to LDAP (4679)
+ - use ->max_readahead method instead of zapping global ra (5039)
+ - don't interrupt __l_wait_event() during strace
+ * miscellania
+ - add software watchdogs to catch hung threads quickly (4941)
+ - make lustrefs init script start after nfs is mounted
+ - fix CWARN/ERROR duplication (4930)
+ - return async write errors to application if possible (2248)
+ - update barely-supported suse-2.4.21-171 series (4842)
+ - support for sles 9 %post scripts
+ - support for building 2.6 kernel-source packages
+ - support for sles km_* packages
+
+2004-10-07 Cluster File Systems, Inc. <info@clusterfs.com>
+ * version 1.2.7
+ * bug fixes
+ - ignore -ENOENT errors in osc_destroy (3639)
+ - notify osc create thread that OSC is being cleaned up (4600)
+ - add nettype argument for llmount in #5d in conf-sanity.sh (3936)
+ - reconstruct ost_handle() like mds_handle() (4657)
+ - create a new thread to do import eviction to avoid deadlock (3969)
+ - let lconf resolve symlinked-to devices (4629)
+ - don't unlink "objects" from directory with default EA (4554)
+ - hold socknal file ref over connect in case target is down (4394)
+ - allow more than 32000 subdirectories in a single directory (3244)
+ - OST returns ENOSPC from object create when no space left (4539)
+ - don't send truncate RPC if file size isn't changing (4410)
+ - limit OSC precreate to 1/2 of value OST considers bogus (4778)
+ - bind to privileged port in socknal and tcpnal (3689)
+ * miscellania
+ - rate limit CERROR/CWARN console message to avoid overload (4519)
+ - basic mmap support (3918)
+ - kernel patch series update from b1_4 (4711)
+
+2004-09-16 Cluster File Systems, Inc. <info@clusterfs.com>
+ * version 1.2.6
+ * bug fixes
+ - avoid crash during MDS cleanup with OST shut down (2775)
+ - fix loi_list_lock/oig_lock inversion on interrupted IO (4136)
+ - don't use bad inodes on the MDS (3744)
+ - dynamic object preallocation to improve recovery speed (4236)
+ - don't hold spinlock over lock dumping or change debug flags (4401)
+ - don't zero obd_dev when it is force cleaned (3651)
+ - "lctl deactivate" will stop automatic recovery attempts (3406)
+ - look for existing replayed locks to avoid duplicates (3764)
+ - don't resolve lock handle twice in recovery avoiding race (4401)
+ - revalidate should check working dir is a directory (4134)
+ * miscellania
+ - don't always mark "slow" obdfilter messages as errors (4418)
+
+2004-08-24 Cluster File Systems, Inc. <info@clusterfs.com>
+ * version 1.2.5
+ * bug fixes
+ - don't close LustreDB during write_conf until it is done (3860)
+ - fix typo in lconf for_each_profile (3821)
+ - allow dumping logs from multiple threads at one time (3820)
+ - don't allow multiple threads in OSC recovery (3812)
+ - fix debug_size parameters (3864)
+ - fix mds_postrecov to initialize import for llog ctxt (3121)
+ - replace config semaphore with spinlock (3306)
+ - be sure to send a reply for a CANCEL rpc with bad export (3863)
+ - don't allow enqueue to complete on a destroyed export (3822)
+ - down write_lock before checking llog header bitmap (3825)
+ - recover from lock replay timeout (3764)
+ - up llog sem before sending rpc (3652)
+ - reduce ns lock hold times when setting kms (3267)
+ - change a dlm LBUG to LASSERTF, to maybe learn something (4228)
+ - fix NULL deref and obd_dev leak on setup error (3312)
+ - replace some LBUG about llog ops with error handling (3841)
+ - don't match INVALID dentries from d_lookup and spin (3784)
+ - hold dcache_lock while marking dentries INVALID and hashing (4255)
+ - fix invalid assertion in ptlrpc_set_wait (3880)
+ * miscellania
+ - add libwrap support for the TCP acceptor (3996)
+ - add /proc/sys/portals/routes for non-root route listing (3994)
+ - allow setting MDS UUID in .xml (2580)
+ - print the stack of a process that LBUGs (4228)
+
+2004-07-14 Cluster File Systems, Inc. <info@clusterfs.com>
+ * version 1.2.4
+ * bug fixes
+ - don't cleanup request in ll_file_open() on failed MDS open (3430)
+ - make sure to unset replay flag from failed open requests (3440)
+ - if default stripe count is 0, use OST count for inode size (3636)
+ - update parent mtime/ctime on client for create/unlink (2611)
+ - drop dentry ref in ext3_add_link from open_connect_dentry (3266)
+ - free recovery state on server during a forced cleanup (3571)
+ - unregister_reply for resent reqs (3063)
+ - loop back devices mounting and status check on 2.6 (3563)
+ - fix resource-creation race that can provoke i_size == 0 (3513)
+ - don't try to use bad inodes returned from MDS/OST fs lookup (3688)
+ - more debugging for page-accounting assertion (3746)
+ - return -ENOENT instead of asserting if ost getattr+unlink race (3558)
+ - avoid deadlock after precreation failure (3758)
+ - fix race and lock order deadlock in orphan handling (3450, 3750)
+ - add validity checks when grabbing inodes from l_ast_data (3599)
* miscellania
- drop scimac NAL (unmaintained)
AM_CONDITIONAL(SMFS, test x$enable_smfs = xyes)
AM_CONDITIONAL(GSS, test x$enable_gss = xyes)
AM_CONDITIONAL(LIBLUSTRE, test x$enable_liblustre = xyes)
+AM_CONDITIONAL(LIBLUSTRE_TESTS, test x$enable_liblustre_tests = xyes)
AM_CONDITIONAL(MPITESTS, test x$enable_mpitests = xyes, Build MPI Tests)
])
<!ATTLIST ptlrpc %object.attr;>
<!ELEMENT osd (fstype | devpath | devsize | autoformat |
- target_ref | node_ref | journalsize )*>
+ target_ref | node_ref | journalsize | mkfsoptions)*>
<!ATTLIST osd %object.attr;
osdtype (obdfilter | obdecho) 'obdfilter'>
failover ( 1 | 0 ) #IMPLIED>
<!ELEMENT mdsdev (fstype | devpath | devsize | autoformat |
- target_ref | node_ref | journalsize )*>
+ target_ref | node_ref | journalsize | mkfsoptions)*>
<!ATTLIST mdsdev %object.attr;>
<!ELEMENT lov (mds_ref |(obd_ref)+)*>
<!ELEMENT ptldebug %object.content;>
<!ELEMENT subsystem %object.content;>
<!ELEMENT journalsize %object.content;>
+<!ELEMENT mkfsoptions %object.content;>
<!ELEMENT fstype %object.content;>
<!ELEMENT nid %object.content;>
<!ELEMENT port %object.content;>
<if test="journalsize">
journalsize: <value-of select="journalsize"/>
</if>
+<if test="mkfsoptions">
+mkfsoptions: <value-of select="mkfsoptions"/>
+</if>
nodeRef: <value-of select="node_ref/@uuidref"/>
targetRef: <value-of select="target_ref/@uuidref"/>
<text>
<if test="journalsize">
journalsize: <value-of select="journalsize"/>
</if>
+<if test="mkfsoptions">
+mkfsoptions: <value-of select="mkfsoptions"/>
+</if>
<text>
</text>
</template>
#define LIBLUSTRE_H__
#include <sys/mman.h>
-#ifndef __CYGWIN__
-#include <stdint.h>
-#include <asm/page.h>
-#else
-#include <sys/types.h>
-#include "ioctl.h"
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
+#ifdef HAVE_ASM_PAGE_H
+# include <asm/page.h>
+#endif
+#ifdef HAVE_SYS_USER_H
+# include <sys/user.h>
#endif
+
+#include "ioctl.h"
+
#include <stdio.h>
#include <sys/ioctl.h>
#include <stdlib.h>
}
#define lock_kernel() do {} while (0)
+#define unlock_kernel() do {} while (0)
#define daemonize(l) do {} while (0)
#define sigfillset(l) do {} while (0)
#define recalc_sigpending(l) do {} while (0)
#include <linux/lustre_export.h>
#include <linux/lustre_net.h>
-
#endif
const char *name,
struct lprocfs_stats *stats);
-#define LPROCFS_INIT_MULTI_VARS(array, size) \
-void lprocfs_init_multi_vars(unsigned int idx, \
- struct lprocfs_static_vars *x) \
-{ \
- struct lprocfs_static_vars *glob = (struct lprocfs_static_vars*)array; \
- LASSERT(glob != 0); \
- LASSERT(idx < (unsigned int)(size)); \
- x->module_vars = glob[idx].module_vars; \
- x->obd_vars = glob[idx].obd_vars; \
-} \
-
-#define LPROCFS_INIT_VARS(name, vclass, vinstance) \
-void lprocfs_##name##_init_vars(struct lprocfs_static_vars *x) \
-{ \
- x->module_vars = vclass; \
- x->obd_vars = vinstance; \
-} \
-
-#define lprocfs_init_vars(NAME, VAR) \
-do { \
+#define LPROCFS_INIT_MULTI_VARS(array, size) \
+void lprocfs_init_multi_vars(unsigned int idx, \
+ struct lprocfs_static_vars *x) \
+{ \
+ struct lprocfs_static_vars *glob = (struct lprocfs_static_vars*)array; \
+ LASSERT(glob != 0); \
+ LASSERT(idx < (unsigned int)(size)); \
+ x->module_vars = glob[idx].module_vars; \
+ x->obd_vars = glob[idx].obd_vars; \
+} \
+
+#define LPROCFS_INIT_VARS(name, vclass, vinstance) \
+void lprocfs_##name##_init_vars(struct lprocfs_static_vars *x) \
+{ \
+ x->module_vars = vclass; \
+ x->obd_vars = vinstance; \
+} \
+
+#define lprocfs_init_vars(NAME, VAR) \
+do { \
extern void lprocfs_##NAME##_init_vars(struct lprocfs_static_vars *); \
lprocfs_##NAME##_init_vars(VAR); \
} while (0)
uint32_t lmd_nal;
uint32_t lmd_server_ipaddr;
uint32_t lmd_port;
+ uint32_t lmd_async;
uint32_t lmd_nllu;
uint32_t lmd_nllg;
char lmd_security[16];
/* ptlrpc/recov_thread.c */
int llog_start_commit_thread(void);
+int llog_cleanup_commit_master(int force);
struct llog_canceld_ctxt *llcd_grab(void);
void llcd_send(struct llog_canceld_ctxt *llcd);
} while (0)
#define groups_sort(gi) do {} while (0)
-
#define GROUP_AT(gi, i) ((gi)->small_block[(i)])
static inline int cleanup_group_info(void)
page->private = 0; \
} while(0)
+#ifndef smp_num_cpus
+#define smp_num_cpus num_online_cpus()
+#endif
+
#define kiobuf bio
#include <linux/proc_fs.h>
static inline int mapping_mapped(struct address_space *mapping)
{
- return mapping->i_mmap_shared ? 1 : 0;
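+	/* a file may be on either the shared or the private mmap list;
+	 * either one means it is mapped */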
+ if (mapping->i_mmap_shared)
+ return 1;
+ if (mapping->i_mmap)
+ return 1;
+ return 0;
}
/* to find proc_dir_entry from inode. 2.6 has native one -bzzz */
#define ll_vfs_symlink(dir, dentry, path, mode) vfs_symlink(dir, dentry, path, mode)
#endif
+#ifndef container_of
+#define container_of(ptr, type, member) ({ \
+ const typeof( ((type *)0)->member ) *__mptr = (ptr); \
+ (type *)( (char *)__mptr - offsetof(type,member) );})
+#endif
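+/* usage: given a pointer to a member, container_of() recovers the
+ * enclosing object, e.g. container_of(&obj->field, struct obj_type, field) */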
+
#ifdef HAVE_I_ALLOC_SEM
#define UP_WRITE_I_ALLOC_SEM(i) do { up_write(&(i)->i_alloc_sem); } while (0)
#define DOWN_WRITE_I_ALLOC_SEM(i) do { down_write(&(i)->i_alloc_sem); } while(0)
OP; \
}} while(0)
+#define LL_CDEBUG_PAGE(mask, page, fmt, arg...) \
+ CDEBUG(mask, "page %p map %p ind %lu priv %0lx: " fmt, \
+ page, page->mapping, page->index, page->private, ## arg)
+
/* lib/debug.c */
int dump_lniobuf(struct niobuf_local *lnb);
int dump_rniobuf(struct niobuf_remote *rnb);
struct list_head ns_unused_list; /* all root resources in ns */
int ns_nr_unused;
unsigned int ns_max_unused;
+ unsigned long ns_next_dump; /* next dump time */
spinlock_t ns_counter_lock;
__u64 ns_locks;
/* Server-side-only members */
struct list_head l_pending_chain; /* callbacks pending */
unsigned long l_callback_timeout;
+
+ __u32 l_pid; /* pid which created this lock */
};
#define LDLM_PLAIN 10
CDEBUG(level, "### " format \
" ns: \?\? lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "\
"res: \?\? rrc=\?\? type: \?\?\? flags: %x remote: " \
- LPX64" expref: %d\n" , ## a, lock, \
+ LPX64" expref: %d pid: %u\n" , ## a, lock, \
lock->l_handle.h_cookie, atomic_read(&lock->l_refc), \
lock->l_readers, lock->l_writers, \
ldlm_lockname[lock->l_granted_mode], \
ldlm_lockname[lock->l_req_mode], \
lock->l_flags, lock->l_remote_handle.cookie, \
lock->l_export ? \
- atomic_read(&lock->l_export->exp_refcount) : -99); \
+ atomic_read(&lock->l_export->exp_refcount) : -99, \
+ lock->l_pid); \
break; \
} \
if (lock->l_resource->lr_type == LDLM_EXTENT) { \
CDEBUG(level, "### " format \
" ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " \
"res: "LPU64"/"LPU64"/"LPU64" rrc: %d type: %s ["LPU64 \
- "->"LPU64"] (req "LPU64"->"LPU64") flags: %x remote: " \
- LPX64" expref: %d\n" , ## a, \
+ "->"LPU64"] (req "LPU64"->"LPU64") flags: %x remote: " \
+ LPX64" expref: %d pid: %u\n" , ## a, \
lock->l_resource->lr_namespace->ns_name, lock, \
lock->l_handle.h_cookie, atomic_read(&lock->l_refc), \
lock->l_readers, lock->l_writers, \
lock->l_req_extent.start, lock->l_req_extent.end, \
lock->l_flags, lock->l_remote_handle.cookie, \
lock->l_export ? \
- atomic_read(&lock->l_export->exp_refcount) : -99); \
+ atomic_read(&lock->l_export->exp_refcount) : -99, \
+ lock->l_pid); \
break; \
} \
if (lock->l_resource->lr_type == LDLM_FLOCK) { \
CDEBUG(level, "### " format \
" ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " \
"res: "LPU64"/"LPU64"/"LPU64" rrc: %d type: %s pid: " \
- LPU64" ["LPU64"->"LPU64"] flags: %x remote: "LPX64 \
- " expref: %d\n" , ## a, \
+ LPU64" " "["LPU64"->"LPU64"] flags: %x remote: "LPX64 \
+ " expref: %d pid: %u\n" , ## a, \
lock->l_resource->lr_namespace->ns_name, lock, \
lock->l_handle.h_cookie, atomic_read(&lock->l_refc), \
lock->l_readers, lock->l_writers, \
lock->l_policy_data.l_flock.end, \
lock->l_flags, lock->l_remote_handle.cookie, \
lock->l_export ? \
- atomic_read(&lock->l_export->exp_refcount) : -99); \
+ atomic_read(&lock->l_export->exp_refcount) : -99, \
+ lock->l_pid); \
break; \
} \
if (lock->l_resource->lr_type == LDLM_IBITS) { \
CDEBUG(level, "### " format \
" ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " \
"res: "LPU64"/"LPU64"/"LPU64" bits "LPX64" rrc: %d " \
- "type: %s flags: %x remote: "LPX64" expref: %d\n" , ## a,\
+ "type: %s flags: %x remote: "LPX64" expref: %d " \
+ "pid %u\n" , ## a, \
lock->l_resource->lr_namespace->ns_name, \
lock, lock->l_handle.h_cookie, \
atomic_read (&lock->l_refc), \
ldlm_typename[lock->l_resource->lr_type], \
lock->l_flags, lock->l_remote_handle.cookie, \
lock->l_export ? \
- atomic_read(&lock->l_export->exp_refcount) : -99); \
+ atomic_read(&lock->l_export->exp_refcount) : -99, \
+ lock->l_pid); \
break; \
} \
{ \
CDEBUG(level, "### " format \
" ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " \
- "res: "LPU64"/"LPU64"/"LPU64"/"LPU64" rrc: %d type: %s "\
- "flags: %x remote: "LPX64" expref: %d\n" , ## a, \
+ "res: "LPU64"/"LPU64"/"LPU64"/"LPU64" rrc: %d type: %s " \
+ "flags: %x remote: "LPX64" expref: %d " \
+ "pid: %u\n" , ## a, \
lock->l_resource->lr_namespace->ns_name, \
lock, lock->l_handle.h_cookie, \
atomic_read (&lock->l_refc), \
ldlm_typename[lock->l_resource->lr_type], \
lock->l_flags, lock->l_remote_handle.cookie, \
lock->l_export ? \
- atomic_read(&lock->l_export->exp_refcount) : -99); \
+ atomic_read(&lock->l_export->exp_refcount) : -99, \
+ lock->l_pid); \
} \
} while (0)
struct ldlm_lock *lock);
void ldlm_resource_unlink_lock(struct ldlm_lock *lock);
void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc);
-void ldlm_dump_all_namespaces(void);
-void ldlm_namespace_dump(struct ldlm_namespace *);
-void ldlm_resource_dump(struct ldlm_resource *);
+void ldlm_dump_all_namespaces(int level);
+void ldlm_namespace_dump(int level, struct ldlm_namespace *);
+void ldlm_resource_dump(int level, struct ldlm_resource *);
int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *,
struct ldlm_res_id);
struct filter_client_data *fed_fcd;
loff_t fed_lr_off;
int fed_lr_idx;
- unsigned long fed_dirty; /* in bytes */
- unsigned long fed_grant; /* in bytes */
- unsigned long fed_pending; /* bytes just being written */
+ long fed_dirty; /* in bytes */
+ long fed_grant; /* in bytes */
+ long fed_pending; /* bytes just being written */
};
struct obd_export {
unsigned long exp_flags;
int exp_failed:1,
exp_replay_needed:1,
- exp_libclient:1; /* liblustre client? */
+ exp_libclient:1, /* liblustre client? */
+ exp_sync:1;
union {
struct mds_export_data eu_mds_data;
struct filter_export_data eu_filter_data;
int fso_bufcnt;
};
+/* lustre EA type (MEA, LOV, etc.) */
+enum ea_type {
+ EA_LOV = (1 << 0),
+ EA_MEA = (1 << 1),
+ EA_SID = (1 << 2),
+ EA_MID = (1 << 3)
+};
+
struct fsfilt_operations {
struct list_head fs_list;
struct module *fs_owner;
char *fs_type;
+
void *(* fs_start)(struct inode *inode, int op, void *desc_private,
int logs);
void *(* fs_brw_start)(int objcount, struct fsfilt_objinfo *fso,
struct iattr *iattr, int do_trunc);
int (* fs_iocontrol)(struct inode *inode, struct file *file,
unsigned int cmd, unsigned long arg);
-
- /* two methods for getting lov EA and setting it back to inode xattr. */
- int (* fs_set_md)(struct inode *inode, void *handle, void *md,
- int size);
- int (* fs_get_md)(struct inode *inode, void *md, int size);
-
- /* two methods for getting MID (master id) EA and setting it back to
- * inode xattr. */
- int (* fs_set_mid)(struct inode *inode, void *handle, void *fid,
- int size);
- int (* fs_get_mid)(struct inode *inode, void *fid, int size);
- /* two methods for getting self id EA and setting it back to inode
- * xattr. */
- int (* fs_set_sid)(struct inode *inode, void *handle, void *sid,
- int size);
- int (* fs_get_sid)(struct inode *inode, void *sid, int size);
+ /* two methods for setting getting diff. kind of EAs from inode. */
+ int (* fs_set_md)(struct inode *inode, void *handle, void *md,
+ int size, enum ea_type type);
+ int (* fs_get_md)(struct inode *inode, void *md, int size,
+ enum ea_type type);
int (* fs_send_bio)(int rw, struct inode *inode, void *bio);
ssize_t (* fs_readpage)(struct file *file, char *buf, size_t count,
loff_t *offset);
- int (* fs_add_journal_cb)(struct obd_device *obd,
+ int (* fs_add_journal_cb)(struct obd_device *obd,
struct super_block *sb,
- __u64 last_rcvd, void *handle,
- fsfilt_cb_t cb_func,
- void *cb_data);
+ __u64 last_rcvd, void *handle,
+ fsfilt_cb_t cb_func, void *cb_data);
int (* fs_statfs)(struct super_block *sb, struct obd_statfs *osfs);
int (* fs_sync)(struct super_block *sb);
int (* fs_map_inode_pages)(struct inode *inode, struct page **page,
#define LMV_EA 1
#define LOV_EA 0
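+/* silent for the first 15s, CWARN until half the timeout, CERROR beyond */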
+#define fsfilt_check_slow(start, timeout, msg) \
+do { \
+ if (time_before(jiffies, start + 15 * HZ)) \
+ break; \
+ else if (time_before(jiffies, start + timeout / 2 * HZ)) \
+ CWARN("slow %s %lus\n", msg, (jiffies - start) / HZ); \
+ else \
+ CERROR("slow %s %lus\n", msg, (jiffies - start) / HZ); \
+} while (0)
+
static inline void *
fsfilt_start_ops(struct fsfilt_operations *ops, struct inode *inode,
int op, struct obd_trans_info *oti, int logs)
LBUG();
}
}
- if (time_after(jiffies, now + 15 * HZ))
- CERROR("long journal start time %lus\n", (jiffies - now) / HZ);
+ fsfilt_check_slow(now, 60, "journal start");
return handle;
}
int rc = ops->fs_commit(sb, inode, handle, force_sync);
CDEBUG(D_INFO, "committing handle %p\n", handle);
- if (time_after(jiffies, now + 15 * HZ))
- CERROR("long journal start time %lus\n", (jiffies - now) / HZ);
+ fsfilt_check_slow(now, 60, "journal start");
return rc;
}
LBUG();
}
}
- if (time_after(jiffies, now + 15 * HZ))
- CERROR("long journal start time %lus\n", (jiffies - now) / HZ);
+ fsfilt_check_slow(now, obd_timeout, "journal start");
return handle;
}
int rc = obd->obd_fsops->fs_commit_async(inode, handle, wait_handle);
CDEBUG(D_INFO, "committing handle %p (async)\n", *wait_handle);
- if (time_after(jiffies, now + 15 * HZ))
- CERROR("long journal start time %lus\n", (jiffies - now) / HZ);
+ fsfilt_check_slow(now, obd_timeout, "journal start");
return rc;
}
unsigned long now = jiffies;
int rc = obd->obd_fsops->fs_commit_wait(inode, handle);
CDEBUG(D_INFO, "waiting for completion %p\n", handle);
- if (time_after(jiffies, now + 15 * HZ))
- CERROR("long journal start time %lus\n", (jiffies - now) / HZ);
+ fsfilt_check_slow(now, obd_timeout, "journal start");
return rc;
}
unsigned long now = jiffies;
int rc;
rc = obd->obd_fsops->fs_setattr(dentry, handle, iattr, do_trunc);
- if (time_after(jiffies, now + 15 * HZ))
- CERROR("long setattr time %lus\n", (jiffies - now) / HZ);
+ fsfilt_check_slow(now, obd_timeout, "setattr");
return rc;
}
static inline int
fsfilt_set_md(struct obd_device *obd, struct inode *inode,
- void *handle, void *md, int size)
+ void *handle, void *md, int size, enum ea_type type)
{
- return obd->obd_fsops->fs_set_md(inode, handle, md, size);
+ if (!obd->obd_fsops->fs_set_md)
+ return -ENOSYS;
+
+ return obd->obd_fsops->fs_set_md(inode, handle, md,
+ size, type);
}
static inline int
fsfilt_get_md(struct obd_device *obd, struct inode *inode,
- void *md, int size)
-{
- return obd->obd_fsops->fs_get_md(inode, md, size);
-}
-
-static inline int
-fsfilt_set_mid(struct obd_device *obd, struct inode *inode,
- void *handle, void *mid, int size)
+ void *md, int size, enum ea_type type)
{
- return obd->obd_fsops->fs_set_mid(inode, handle, mid, size);
-}
-
-static inline int
-fsfilt_get_mid(struct obd_device *obd, struct inode *inode,
- void *mid, int size)
-{
- return obd->obd_fsops->fs_get_mid(inode, mid, size);
-}
-
-static inline int
-fsfilt_set_sid(struct obd_device *obd, struct inode *inode,
- void *handle, void *sid, int size)
-{
- return obd->obd_fsops->fs_set_sid(inode, handle, sid, size);
-}
-
-static inline int
-fsfilt_get_sid(struct obd_device *obd, struct inode *inode,
- void *sid, int size)
-{
- return obd->obd_fsops->fs_get_sid(inode, sid, size);
+ if (!obd->obd_fsops->fs_get_md)
+ return -ENOSYS;
+
+ return obd->obd_fsops->fs_get_md(inode, md, size,
+ type);
}
static inline int fsfilt_send_bio(int rw, struct obd_device *obd,
rc = obd->obd_fsops->fs_putpage(inode, page);
- if (time_after(jiffies, now + 15 * HZ))
- CERROR("long putpage time %lus\n", (jiffies - now) / HZ);
+ fsfilt_check_slow(now, obd_timeout, "putpage");
return rc;
}
page = obd->obd_fsops->fs_getpage(inode, index);
- if (time_after(jiffies, now + 15 * HZ))
- CERROR("long getpage time %lus\n", (jiffies - now) / HZ);
+ fsfilt_check_slow(now, obd_timeout, "getpage");
return page;
}
#ifndef _LUSTRE_IDL_H_
#define _LUSTRE_IDL_H_
+#ifdef HAVE_ASM_TYPES_H
+#include <asm/types.h>
+#else
+#include "types.h"
+#endif
+
+
#ifdef __KERNEL__
# include <linux/ioctl.h>
-# include <asm/types.h>
# include <linux/types.h>
# include <linux/list.h>
# include <linux/string.h> /* for strncpy, below */
#ifdef __CYGWIN__
# include <sys/types.h>
#else
-# include <asm/types.h>
# include <stdint.h>
#endif
# include <libcfs/list.h>
#define MSG_CONNECT_RECOVERING 0x1
#define MSG_CONNECT_RECONNECT 0x2
#define MSG_CONNECT_REPLAYABLE 0x4
-//#define MSG_CONNECT_PEER 0x8
+#define MSG_CONNECT_PEER 0x8
#define MSG_CONNECT_LIBCLIENT 0x10
#define MSG_CONNECT_INITIAL 0x20
+#define MSG_CONNECT_ASYNC 0x40
/*
* OST requests: OBDO & OBD request records
#define OBD_MD_FLDIREA (0x0000000020000000LL) /* dir's extended attribute data */
#define OBD_MD_REINT (0x0000000040000000LL) /* reintegrate oa */
#define OBD_MD_FID (0x0000000080000000LL) /* lustre_id data */
-#define OBD_MD_FLEALIST (0x0000000100000000LL) /* list extended attributes */
-#define OBD_MD_FLACL_ACCESS (0x0000000200000000LL) /*access acl*/
+#define OBD_MD_MEA (0x0000000100000000LL) /* shows we are interested in MEA */
+#define OBD_MD_FLEALIST (0x0000000200000000LL) /* list extended attributes */
+#define OBD_MD_FLACL_ACCESS (0x0000000400000000LL) /*access acl*/
#define OBD_MD_FLNOTOBD (~(OBD_MD_FLBLOCKS | OBD_MD_LINKNAME | \
OBD_MD_FLEASIZE | OBD_MD_FLHANDLE | \
#define OBD_OBJECT_EOF 0xffffffffffffffffULL
+#define OST_MIN_PRECREATE 32
+#define OST_MAX_PRECREATE 20000
+
struct obd_ioobj {
obd_id ioo_id;
obd_gr ioo_gr;
/* INODE LOCK PARTS */
#define MDS_INODELOCK_LOOKUP 0x000001 /* dentry, mode, owner, group */
#define MDS_INODELOCK_UPDATE 0x000002 /* size, links, timestamps */
+#define MDS_INODELOCK_OPEN 0x000004 /* for opened files */
+
+/* do not forget to increase MDS_INODELOCK_MAXSHIFT when adding new bits */
+#define MDS_INODELOCK_MAXSHIFT 2
+
+/* this FULL lock is useful to take on unlink sort of operations */
+#define MDS_INODELOCK_FULL ((1 << (MDS_INODELOCK_MAXSHIFT + 1)) - 1)
/* lustre store cookie */
struct lustre_stc {
static inline char * ptlrpc_import_state_name(enum lustre_imp_state state)
{
-
static char* import_state_names[] = {
- "<UNKNOWN>", "CLOSED", "NEW", "DISCONN",
- "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT",
+ "<UNKNOWN>", "CLOSED", "NEW", "DISCONN",
+ "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT",
"RECOVER", "FULL", "EVICTED",
};
int imp_invalid:1, imp_replayable:1,
imp_dlm_fake:1, imp_server_timeout:1,
imp_initial_recov:1, imp_force_verify:1,
- imp_pingable:1, imp_resend_replay:1;
+ imp_pingable:1, imp_resend_replay:1,
+ imp_deactive:1;
__u32 imp_connect_op;
__u32 imp_connect_flags;
};
#define OBD_IOC_PROCESS_CFG _IOWR('f', 184, long)
#define OBD_IOC_DUMP_LOG _IOWR('f', 185, long)
#define OBD_IOC_CLEAR_LOG _IOWR('f', 186, long)
+#define OBD_IOC_START _IOWR('f', 187, long)
#define OBD_IOC_CATLOGLIST _IOWR('f', 190, long)
#define OBD_IOC_LLOG_INFO _IOWR('f', 191, long)
int __timed_out = 0; \
unsigned long irqflags; \
sigset_t blocked; \
+ signed long timeout_remaining; \
\
init_waitqueue_entry(&__wait, current); \
if (excl) \
else \
blocked = l_w_e_set_sigs(0); \
\
+ timeout_remaining = info->lwi_timeout; \
+ \
for (;;) { \
set_current_state(TASK_INTERRUPTIBLE); \
if (condition) \
break; \
- if (signal_pending(current)) { \
- if (info->lwi_on_signal) \
- info->lwi_on_signal(info->lwi_cb_data); \
- ret = -EINTR; \
- break; \
- } \
if (info->lwi_timeout && !__timed_out) { \
- if (schedule_timeout(info->lwi_timeout) == 0) { \
+ timeout_remaining = schedule_timeout(timeout_remaining); \
+ if (timeout_remaining == 0) { \
__timed_out = 1; \
if (!info->lwi_on_timeout || \
info->lwi_on_timeout(info->lwi_cb_data)) { \
} else { \
schedule(); \
} \
+ if (condition) \
+ break; \
+ if (signal_pending(current)) { \
+ if (__timed_out) { \
+ break; \
+ } else { \
+ /* We have to do this here because some signals */ \
+ /* are not blockable - i.e. from strace(1). */ \
+ /* In these cases we want to schedule_timeout() */ \
+ /* again, because we don't want that to return */ \
+ /* -EINTR when the RPC actually succeeded. */ \
+ /* the RECALC_SIGPENDING below will deliver the */ \
+ /* signal properly. */ \
+ SIGNAL_MASK_LOCK(current, irqflags); \
+ CLEAR_SIGPENDING; \
+ SIGNAL_MASK_UNLOCK(current, irqflags); \
+ } \
+ } \
} \
\
SIGNAL_MASK_LOCK(current, irqflags); \
RECALC_SIGPENDING; \
SIGNAL_MASK_UNLOCK(current, irqflags); \
\
+ if (__timed_out && signal_pending(current)) { \
+ if (info->lwi_on_signal) \
+ info->lwi_on_signal(info->lwi_cb_data); \
+ ret = -EINTR; \
+ } \
+ \
current->state = TASK_RUNNING; \
remove_wait_queue(&wq, &__wait); \
} while(0)
#else /* !__KERNEL__ */
#define __l_wait_event(wq, condition, info, ret, excl) \
do { \
- int timeout = info->lwi_timeout, elapse; \
+ long timeout = info->lwi_timeout, elapse, last = 0; \
int __timed_out = 0; \
- long last; \
\
- last = time(NULL); \
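+ /* lwi_timeout == 0 means wait forever: substitute a huge timeout */ \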
+ if (info->lwi_timeout == 0) \
+ timeout = 1000000000; \
+ else \
+ last = time(NULL); \
+ \
for (;;) { \
if (condition) \
break; \
if (liblustre_wait_event(timeout)) { \
- if (timeout == 0) \
+ if (timeout == 0 || info->lwi_timeout == 0) \
continue; \
- elapse = (int) (time(NULL) - last); \
+ elapse = time(NULL) - last; \
if (elapse) { \
last += elapse; \
timeout -= elapse; \
struct lustre_id lli_id; /* full lustre_id */
char *lli_symlink_name;
struct semaphore lli_open_sem;
+ struct semaphore lli_size_sem;
__u64 lli_maxbytes;
__u64 lli_io_epoch;
unsigned long lli_flags;
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
struct inode lli_vfs_inode;
#endif
+ struct semaphore lli_och_sem; /* Protects access to och pointers
+ and their usage counters */
+ /* We need all three because every inode may be opened in different
+ modes */
+ struct obd_client_handle *lli_mds_read_och;
+ __u64 lli_open_fd_read_count;
+ struct obd_client_handle *lli_mds_write_och;
+ __u64 lli_open_fd_write_count;
+ struct obd_client_handle *lli_mds_exec_och;
+ __u64 lli_open_fd_exec_count;
struct posix_acl *lli_acl_access;
};
#include <linux/lustre_idl.h>
#endif /* __KERNEL__ */
+#define LLAP_FROM_COOKIE(c) \
+ (LASSERT(((struct ll_async_page *)(c))->llap_magic == LLAP_MAGIC), \
+ (struct ll_async_page *)(c))
+
#include <lustre/lustre_user.h>
#endif
int llog_cat_id2handle(struct llog_handle *cathandle, struct llog_handle **res,
struct llog_logid *logid);
+int class_config_dump_handler(struct llog_handle * handle,
+ struct llog_rec_hdr *rec, void *data);
int llog_cat_put(struct llog_handle *cathandle);
int llog_cat_add_rec(struct llog_handle *cathandle, struct llog_rec_hdr *rec,
struct llog_cookie *reccookie, void *buf,
size = offset + sizeof(struct rw_semaphore *) * lcl->lcl_count;
OBD_FREE(lcl, size);
}
-
#endif
__u64 mcd_last_xid; /* xid for the last transaction */
__u32 mcd_last_result; /* result from last RPC */
__u32 mcd_last_data; /* per-op data (disposition for open &c.) */
- __u8 mcd_padding[MDS_LR_CLIENT_SIZE - 64];
+ /* for MDS_CLOSE requests */
+ __u64 mcd_last_close_transno; /* last completed transaction ID */
+ __u64 mcd_last_close_xid; /* xid for the last transaction */
+ __u32 mcd_last_close_result; /* result from last RPC */
+ __u32 mcd_last_close_data; /* per-op data (disposition for open &c.) */
+ __u8 mcd_padding[MDS_LR_CLIENT_SIZE - 88];
};
/* simple uid/gid mapping hash table */
#define IOC_REQUEST_CLOSE _IOWR('f', 35, long)
#define IOC_REQUEST_MAX_NR 35
-#define MDS_CHECK_RESENT(req, reconstruct) \
-{ \
- if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) { \
- struct mds_client_data *mcd = \
- req->rq_export->exp_mds_data.med_mcd; \
- if (mcd->mcd_last_xid == req->rq_xid) { \
- reconstruct; \
- RETURN(req->rq_repmsg->status); \
- } \
- DEBUG_REQ(D_HA, req, "no reply for RESENT req (have "LPD64")", \
- mcd->mcd_last_xid); \
- } \
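+/* a resent request may duplicate either the last regular or the last close
+ * transaction; check both xids (the on-disk fields are little-endian,
+ * hence the le64_to_cpu/le32_to_cpu conversions) */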
+#define MDS_CHECK_RESENT(req, reconstruct) \
+{ \
+ if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) { \
+ struct mds_client_data *mcd = \
+ req->rq_export->exp_mds_data.med_mcd; \
+ \
+ if (le64_to_cpu(mcd->mcd_last_xid) == req->rq_xid) { \
+ reconstruct; \
+ RETURN(le32_to_cpu(mcd->mcd_last_result)); \
+ } \
+ if (le64_to_cpu(mcd->mcd_last_close_xid) == req->rq_xid) { \
+ reconstruct; \
+ RETURN(le32_to_cpu(mcd->mcd_last_close_result));\
+ } \
+ DEBUG_REQ(D_HA, req, "no reply for RESENT req" \
+ "(have "LPD64", and "LPD64")", \
+ mcd->mcd_last_xid, mcd->mcd_last_close_xid); \
+ } \
}
#endif
#define MDT_MAX_THREADS 32UL
#define MDT_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \
MDT_MAX_THREADS), 2UL)
-#define MDS_NBUFS (64 * smp_num_cpus)
+#define MDS_NBUFS (64 * smp_num_cpus)
#define MDS_BUFSIZE (8 * 1024)
/* Assume file name length = FNAME_MAX = 256 (true for extN).
* path name length = PATH_MAX = 4096
#define OST_MAX_THREADS 36UL
#define OST_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \
OST_MAX_THREADS), 2UL)
-#define OST_NBUFS (64 * smp_num_cpus)
+#define OST_NBUFS (64 * smp_num_cpus)
#define OST_BUFSIZE (8 * 1024)
/* OST_MAXREQSIZE ~= 1640 bytes =
* lustre_msg + obdo + 16 * obd_ioobj + 64 * niobuf_remote
#define PTLBD_MAXREQSIZE 1024
struct ptlrpc_peer {
-/* bugfix #4615
- */
- ptl_process_id_t peer_id;
+ ptl_process_id_t peer_id;
struct ptlrpc_ni *peer_ni;
};
wait_queue_head_t *set_wakeup_ptr;
struct list_head set_requests;
set_interpreter_func set_interpret; /* completion callback */
- union ptlrpc_async_args set_args; /* completion context */
+ void *set_arg; /* completion context */
/* locked so that any old caller can communicate requests to
* the set holder who can then fold them into the lock-free set */
spinlock_t set_new_req_lock;
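
Replacing the fixed-size union with an opaque set_arg lets the completion callback receive arbitrary caller context. A minimal sketch of the pattern, assuming the usual set_interpreter_func signature (the context struct and names are illustrative):

struct my_ctx { int done; };

static int my_set_interpret(struct ptlrpc_request_set *set, void *arg, int rc)
{
        struct my_ctx *ctx = arg;
        ctx->done = 1;          /* record completion for the waiter */
        return rc;
}

/* registration in the caller:
 *      set->set_interpret = my_set_interpret;
 *      set->set_arg = &ctx;
 */
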
struct timeval rq_arrival_time; /* request arrival time */
struct ptlrpc_reply_state *rq_reply_state; /* separated reply state */
struct ptlrpc_request_buffer_desc *rq_rqbd; /* incoming request buffer */
+#if CRAY_PORTALS
+ ptl_uid_t rq_uid; /* peer uid, used in MDS only */
+#endif
/* client-only incoming reply */
ptl_handle_md_t rq_reply_md_h;
struct ptlrpc_cb_id rq_reply_cbid;
struct ptlrpc_peer rq_peer; /* XXX see service.c can this be factored away? */
+ char rq_peerstr[PTL_NALFMT_SIZE];
struct obd_export *rq_export;
struct obd_import *rq_import;
int srv_n_difficult_replies; /* # 'difficult' replies */
int srv_n_active_reqs; /* # reqs being served */
int srv_rqbd_timeout; /* timeout before re-posting reqs */
-
+ int srv_watchdog_timeout; /* soft watchdog timeout, in ms */
+
__u32 srv_req_portal;
__u32 srv_rep_portal;
struct proc_dir_entry *srv_procroot;
struct lprocfs_stats *srv_stats;
-
+
struct ptlrpc_srv_ni srv_interfaces[0];
};
return (portals_nid2str(p->peer_ni->pni_number, p->peer_id.nid, str));
}
-/* For bug #4615 */
static inline char *ptlrpc_id2str(struct ptlrpc_peer *p, char *str)
{
LASSERT(p->peer_ni != NULL);
void ptlrpc_commit_replies (struct obd_device *obd);
void ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs);
struct ptlrpc_service *ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size,
- int req_portal, int rep_portal,
+ int req_portal, int rep_portal,
+ int watchdog_timeout, /* in ms */
svc_handler_t, char *name,
struct proc_dir_entry *proc_entry);
void ptlrpc_stop_all_threads(struct ptlrpc_service *svc);
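
Every ptlrpc_init_svc() caller must now supply the watchdog timeout in milliseconds. A hedged sketch of an updated call site, using the MDS constants above (the timeout constant and handler name are illustrative):

svc = ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
                      MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL,
                      MDS_SERVICE_WATCHDOG_TIMEOUT /* ms */,
                      mds_handle, "mds", obd->obd_proc_entry);
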
#ifndef __LUSTRE_SMFS_H
#define __LUSTRE_SMFS_H
+#include <linux/lustre_fsfilt.h>
#include <linux/namei.h>
+
struct snap_inode_info {
int sn_flags; /* flags indicating the inode type */
int sn_gen; /* the inode generation */
unsigned long from, unsigned long num);
extern int smfs_rec_setattr(struct inode *dir, struct dentry *dentry,
struct iattr *attr);
-extern int smfs_rec_precreate(struct dentry *dentry, int *num, struct obdo *oa);
-extern int smfs_rec_md(struct inode *inode, void * lmm, int lmm_size);
+extern int smfs_rec_precreate(struct dentry *dentry, int *num,
+ struct obdo *oa);
+extern int smfs_rec_md(struct inode *inode, void *lmm, int lmm_size,
+ enum ea_type type);
extern int smfs_rec_unpack(struct smfs_proc_args *args, char *record,
char **pbuf, int *opcode);
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
- */
+ *
+ * Copyright (C) 2001, 2002, 2003, 2004 Cluster File Systems, Inc.
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * lustre VFS/process permission interface
+ */
+
#ifndef __LVFS_H__
#define __LVFS_H__
#include <linux/namei.h>
#include <linux/lustre_compat25.h>
#include <linux/lvfs_linux.h>
-#endif
+#endif
#ifdef LIBLUSTRE
#include <lvfs_user_fs.h>
#endif
};
+struct lvfs_obd_ctxt {
+ struct vfsmount *loc_mnt;
+ atomic_t loc_refcount;
+ char *loc_name;
+ struct list_head loc_list;
+};
+
#ifdef OBD_CTXT_DEBUG
#define OBD_SET_CTXT_MAGIC(ctxt) (ctxt)->magic = OBD_RUN_CTXT_MAGIC
#else
int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off);
int lustre_fsync(struct file *file);
long l_readdir(struct file * file, struct list_head *dentry_list);
-
+int lvfs_mount_fs(char *name, char *fstype, char *options, int flags,
+ struct lvfs_obd_ctxt **lvfs_ctxt);
+void lvfs_umount_fs(struct lvfs_obd_ctxt *lvfs_ctxt);
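
lvfs_mount_fs()/lvfs_umount_fs() pair up around the refcounted lvfs_obd_ctxt above, so several OBDs naming the same device can share one vfsmount. A hedged usage sketch (device, fstype, and options are illustrative):

struct lvfs_obd_ctxt *ctxt = NULL;
int rc;

rc = lvfs_mount_fs("/dev/sda1", "ext3", NULL, 0, &ctxt);
if (rc == 0) {
        /* ... use ctxt->loc_mnt as the obd's vfsmount ... */
        lvfs_umount_fs(ctxt);   /* drops loc_refcount; unmounts on last put */
}
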
static inline void l_dput(struct dentry *de)
{
if (!de || IS_ERR(de))
#else
up(&dparent->d_inode->i_sem);
#endif
+
+ if (IS_ERR(dchild) || dchild->d_inode == NULL)
+ return dchild;
+
+ if (is_bad_inode(dchild->d_inode)) {
+ CERROR("bad inode returned %lu/%u\n",
+ dchild->d_inode->i_ino, dchild->d_inode->i_generation);
+ dput(dchild);
+ dchild = ERR_PTR(-ENOENT);
+ }
+
return dchild;
}
const char *fo_fstype;
struct super_block *fo_sb;
struct vfsmount *fo_vfsmnt;
+ struct lvfs_obd_ctxt *fo_lvfs_ctxt;
int fo_group_count;
struct dentry *fo_dentry_O; /* the "O"bject directory dentry */
struct obd_service_time cl_enter_stime;
struct mdc_rpc_lock *cl_rpc_lock;
- struct mdc_rpc_lock *cl_setattr_lock;
+ struct mdc_rpc_lock *cl_setattr_lock;
+ struct mdc_rpc_lock *cl_close_lock;
struct osc_creator cl_oscc;
+ unsigned int cl_async:1; /* 1-bit field must be unsigned to hold 1 */
};
/* Like a client, with some hangers-on. Keep mc_client_obd first so that we
struct super_block *mds_sb;
struct vfsmount *mds_vfsmnt;
struct dentry *mds_id_de;
+ struct lvfs_obd_ctxt *mds_lvfs_ctxt;
int mds_max_mdsize;
int mds_max_cookiesize;
struct file *mds_rcvd_filp;
gid_t mds_squash_gid;
ptl_nid_t mds_nosquash_nid;
atomic_t mds_real_clients;
+ atomic_t mds_open_count;
struct dentry *mds_id_dir;
int mds_obd_type;
struct dentry *mds_unnamed_dir; /* for mdt_obd_create only */
int master_group;
struct cmobd_write_service *write_srv;
};
-
+
+struct conf_obd {
+