Index: linux-2.4.29/fs/ext3/extents.c
===================================================================
--- linux-2.4.29.orig/fs/ext3/extents.c 2005-05-03 16:52:08.723069952 +0300
+++ linux-2.4.29/fs/ext3/extents.c 2005-05-03 16:52:08.802057944 +0300
@@ -0,0 +1,2308 @@
+/*
+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
+ */
+
+/*
+ * Extents support for EXT3
+ *
+ * TODO:
+ *   - ext3_ext_walk_space() should not use ext3_ext_find_extent()
+ *   - ext3_ext_calc_credits() could take 'mergable' into account
+ *   - ext3*_error() should be used in some situations
+ *   - find_goal() [to be tested and improved]
+ *   - smart tree reduction
+ *   - arch-independence
+ *     common on-disk format for big/little-endian arch
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+static inline int ext3_ext_check_header(struct ext3_extent_header *eh)
+{
+        if (eh->eh_magic != EXT3_EXT_MAGIC) {
+                printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n",
+                       (unsigned)eh->eh_magic);
+                return -EIO;
+        }
+        if (eh->eh_max == 0) {
+                printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n",
+                       (unsigned)eh->eh_max);
+                return -EIO;
+        }
+        if (eh->eh_entries > eh->eh_max) {
+                printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n",
+                       (unsigned)eh->eh_entries);
+                return -EIO;
+        }
+        return 0;
+}
+
+static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
+{
+        int err;
+
+        if (handle->h_buffer_credits > needed)
+                return handle;
+        if (!ext3_journal_extend(handle, needed))
+                return handle;
+        err = ext3_journal_restart(handle, needed);
+
+        return handle;
+}
+
+static int inline
+ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree)
+{
+        if (tree->ops->get_write_access)
+                return tree->ops->get_write_access(h, tree->buffer);
+        else
+                return 0;
+}
+
+static int inline
+ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree)
+{
+        if (tree->ops->mark_buffer_dirty)
+                return tree->ops->mark_buffer_dirty(h, tree->buffer);
+        else
+                return 0;
+}
+
+/*
+ * could return:
+ *  - EROFS
+ *  - ENOMEM
+ */
+static int ext3_ext_get_access(handle_t *handle,
+                               struct ext3_extents_tree *tree,
+                               struct ext3_ext_path *path)
+{
+        int err;
+
+        if (path->p_bh) {
+                /* path points to block */
+                err = ext3_journal_get_write_access(handle, path->p_bh);
+        } else {
+                /* path points to leaf/index in inode body */
+                err = ext3_ext_get_access_for_root(handle, tree);
+        }
+        return err;
+}
+
+/*
+ * could return:
+ *  - EROFS
+ *  - ENOMEM
+ *  - EIO
+ */
+static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree,
+                          struct ext3_ext_path *path)
+{
+        int err;
+        if (path->p_bh) {
+                /* path points to block */
+                err = ext3_journal_dirty_metadata(handle, path->p_bh);
+        } else {
+                /* path points to leaf/index in inode body */
+                err =
ext3_ext_mark_root_dirty(handle, tree); + } + return err; +} + +static int inline +ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, + struct ext3_ext_path *path, struct ext3_extent *ex, + int *err) +{ + int goal, depth, newblock; + struct inode *inode; + + EXT_ASSERT(tree); + if (tree->ops->new_block) + return tree->ops->new_block(handle, tree, path, ex, err); + + inode = tree->inode; + depth = EXT_DEPTH(tree); + if (path && depth > 0) { + goal = path[depth-1].p_block; + } else { + struct ext3_inode_info *ei = EXT3_I(inode); + unsigned long bg_start; + unsigned long colour; + + bg_start = (ei->i_block_group * + EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + + le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); + colour = (current->pid % 16) * + (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); + goal = bg_start + colour; + } + + lock_kernel(); + newblock = ext3_new_block(handle, inode, goal, 0, 0, err); + unlock_kernel(); + return newblock; +} + +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) +{ + struct ext3_extent_header *neh = EXT_ROOT_HDR(tree); + neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) | + (EXT_HDR_GEN(neh) + 1); +} + +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) +{ + int size; + + size = (tree->inode->i_sb->s_blocksize - + sizeof(struct ext3_extent_header)) / + sizeof(struct ext3_extent); +#ifdef AGRESSIVE_TEST + size = 6; +#endif + return size; +} + +static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) +{ + int size; + + size = (tree->inode->i_sb->s_blocksize - + sizeof(struct ext3_extent_header)) / + sizeof(struct ext3_extent_idx); +#ifdef AGRESSIVE_TEST + size = 5; +#endif + return size; +} + +static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) +{ + int size; + + size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / + sizeof(struct ext3_extent); +#ifdef AGRESSIVE_TEST + size = 3; +#endif + return size; +} + +static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) +{ + int size; + + size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / + sizeof(struct ext3_extent_idx); +#ifdef AGRESSIVE_TEST + size = 4; +#endif + return size; +} + +static void ext3_ext_show_path(struct ext3_extents_tree *tree, + struct ext3_ext_path *path) +{ +#ifdef EXT_DEBUG + int k, l = path->p_depth; + + ext_debug(tree, "path:"); + for (k = 0; k <= l; k++, path++) { + if (path->p_idx) { + ext_debug(tree, " %d->%d", path->p_idx->ei_block, + path->p_idx->ei_leaf); + } else if (path->p_ext) { + ext_debug(tree, " %d:%d:%d", + path->p_ext->ee_block, + path->p_ext->ee_len, + path->p_ext->ee_start); + } else + ext_debug(tree, " []"); + } + ext_debug(tree, "\n"); +#endif +} + +static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, + struct ext3_ext_path *path) +{ +#ifdef EXT_DEBUG + int depth = EXT_DEPTH(tree); + struct ext3_extent_header *eh; + struct ext3_extent *ex; + int i; + + if (!path) + return; + + eh = path[depth].p_hdr; + ex = EXT_FIRST_EXTENT(eh); + + for (i = 0; i < eh->eh_entries; i++, ex++) { + ext_debug(tree, "%d:%d:%d ", + ex->ee_block, ex->ee_len, ex->ee_start); + } + ext_debug(tree, "\n"); +#endif +} + +static void ext3_ext_drop_refs(struct ext3_ext_path *path) +{ + int depth = path->p_depth; + int i; + + for (i = 0; i <= depth; i++, path++) { + if (path->p_bh) { + brelse(path->p_bh); + path->p_bh = NULL; + } + } +} + +/* + * binary search for closest index by given block + */ +static inline void 
+ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, + struct ext3_ext_path *path, int block) +{ + struct ext3_extent_header *eh = path->p_hdr; + struct ext3_extent_idx *ix; + int l = 0, k, r; + + EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); + EXT_ASSERT(eh->eh_entries <= eh->eh_max); + EXT_ASSERT(eh->eh_entries > 0); + + ext_debug(tree, "binsearch for %d(idx): ", block); + + path->p_idx = ix = EXT_FIRST_INDEX(eh); + + r = k = eh->eh_entries; + while (k > 1) { + k = (r - l) / 2; + if (block < ix[l + k].ei_block) + r -= k; + else + l += k; + ext_debug(tree, "%d:%d:%d ", k, l, r); + } + + ix += l; + path->p_idx = ix; + ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf); + + while (l++ < r) { + if (block < ix->ei_block) + break; + path->p_idx = ix++; + } + ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, + path->p_idx->ei_leaf); + +#ifdef CHECK_BINSEARCH + { + struct ext3_extent_idx *chix; + + chix = ix = EXT_FIRST_INDEX(eh); + for (k = 0; k < eh->eh_entries; k++, ix++) { + if (k != 0 && ix->ei_block <= ix[-1].ei_block) { + printk("k=%d, ix=0x%p, first=0x%p\n", k, + ix, EXT_FIRST_INDEX(eh)); + printk("%u <= %u\n", + ix->ei_block,ix[-1].ei_block); + } + EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); + if (block < ix->ei_block) + break; + chix = ix; + } + EXT_ASSERT(chix == path->p_idx); + } +#endif +} + +/* + * binary search for closest extent by given block + */ +static inline void +ext3_ext_binsearch(struct ext3_extents_tree *tree, + struct ext3_ext_path *path, int block) +{ + struct ext3_extent_header *eh = path->p_hdr; + struct ext3_extent *ex; + int l = 0, k, r; + + EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); + EXT_ASSERT(eh->eh_entries <= eh->eh_max); + + if (eh->eh_entries == 0) { + /* + * this leaf is empty yet: + * we get such a leaf in split/add case + */ + return; + } + + ext_debug(tree, "binsearch for %d: ", block); + + path->p_ext = ex = EXT_FIRST_EXTENT(eh); + + r = k = eh->eh_entries; + while (k > 1) { + k = (r - l) / 2; + if (block < ex[l + k].ee_block) + r -= k; + else + l += k; + ext_debug(tree, "%d:%d:%d ", k, l, r); + } + + ex += l; + path->p_ext = ex; + ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, + path->p_ext->ee_start, path->p_ext->ee_len); + + while (l++ < r) { + if (block < ex->ee_block) + break; + path->p_ext = ex++; + } + ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, + path->p_ext->ee_start, path->p_ext->ee_len); + +#ifdef CHECK_BINSEARCH + { + struct ext3_extent *chex; + + chex = ex = EXT_FIRST_EXTENT(eh); + for (k = 0; k < eh->eh_entries; k++, ex++) { + EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); + if (block < ex->ee_block) + break; + chex = ex; + } + EXT_ASSERT(chex == path->p_ext); + } +#endif +} + +int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) +{ + struct ext3_extent_header *eh; + + BUG_ON(tree->buffer_len == 0); + ext3_ext_get_access_for_root(handle, tree); + eh = EXT_ROOT_HDR(tree); + eh->eh_depth = 0; + eh->eh_entries = 0; + eh->eh_magic = EXT3_EXT_MAGIC; + eh->eh_max = ext3_ext_space_root(tree); + ext3_ext_mark_root_dirty(handle, tree); + ext3_ext_invalidate_cache(tree); + return 0; +} + +struct ext3_ext_path * +ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, + struct ext3_ext_path *path) +{ + struct ext3_extent_header *eh; + struct buffer_head *bh; + int depth, i, ppos = 0; + + EXT_ASSERT(tree); + EXT_ASSERT(tree->inode); + EXT_ASSERT(tree->root); + + eh = EXT_ROOT_HDR(tree); + EXT_ASSERT(eh); + if (ext3_ext_check_header(eh)) + goto err; + + i 
= depth = EXT_DEPTH(tree); + EXT_ASSERT(eh->eh_max); + EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); + + /* account possible depth increase */ + if (!path) { + path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), + GFP_NOFS); + if (!path) + return ERR_PTR(-ENOMEM); + } + memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); + path[0].p_hdr = eh; + + /* walk through the tree */ + while (i) { + ext_debug(tree, "depth %d: num %d, max %d\n", + ppos, eh->eh_entries, eh->eh_max); + ext3_ext_binsearch_idx(tree, path + ppos, block); + path[ppos].p_block = path[ppos].p_idx->ei_leaf; + path[ppos].p_depth = i; + path[ppos].p_ext = NULL; + + bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); + if (!bh) + goto err; + eh = EXT_BLOCK_HDR(bh); + ppos++; + EXT_ASSERT(ppos <= depth); + path[ppos].p_bh = bh; + path[ppos].p_hdr = eh; + i--; + + if (ext3_ext_check_header(eh)) + goto err; + } + + path[ppos].p_depth = i; + path[ppos].p_hdr = eh; + path[ppos].p_ext = NULL; + path[ppos].p_idx = NULL; + + if (ext3_ext_check_header(eh)) + goto err; + + /* find extent */ + ext3_ext_binsearch(tree, path + ppos, block); + + ext3_ext_show_path(tree, path); + + return path; + +err: + printk(KERN_ERR "EXT3-fs: header is corrupted!\n"); + ext3_ext_drop_refs(path); + kfree(path); + return ERR_PTR(-EIO); +} + +/* + * insert new index [logical;ptr] into the block at cupr + * it check where to insert: before curp or after curp + */ +static int ext3_ext_insert_index(handle_t *handle, + struct ext3_extents_tree *tree, + struct ext3_ext_path *curp, + int logical, int ptr) +{ + struct ext3_extent_idx *ix; + int len, err; + + if ((err = ext3_ext_get_access(handle, tree, curp))) + return err; + + EXT_ASSERT(logical != curp->p_idx->ei_block); + len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; + if (logical > curp->p_idx->ei_block) { + /* insert after */ + if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { + len = (len - 1) * sizeof(struct ext3_extent_idx); + len = len < 0 ? 0 : len; + ext_debug(tree, "insert new index %d after: %d. " + "move %d from 0x%p to 0x%p\n", + logical, ptr, len, + (curp->p_idx + 1), (curp->p_idx + 2)); + memmove(curp->p_idx + 2, curp->p_idx + 1, len); + } + ix = curp->p_idx + 1; + } else { + /* insert before */ + len = len * sizeof(struct ext3_extent_idx); + len = len < 0 ? 0 : len; + ext_debug(tree, "insert new index %d before: %d. 
" + "move %d from 0x%p to 0x%p\n", + logical, ptr, len, + curp->p_idx, (curp->p_idx + 1)); + memmove(curp->p_idx + 1, curp->p_idx, len); + ix = curp->p_idx; + } + + ix->ei_block = logical; + ix->ei_leaf = ptr; + curp->p_hdr->eh_entries++; + + EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); + EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); + + err = ext3_ext_dirty(handle, tree, curp); + ext3_std_error(tree->inode->i_sb, err); + + return err; +} + +/* + * routine inserts new subtree into the path, using free index entry + * at depth 'at: + * - allocates all needed blocks (new leaf and all intermediate index blocks) + * - makes decision where to split + * - moves remaining extens and index entries (right to the split point) + * into the newly allocated blocks + * - initialize subtree + */ +static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, + struct ext3_ext_path *path, + struct ext3_extent *newext, int at) +{ + struct buffer_head *bh = NULL; + int depth = EXT_DEPTH(tree); + struct ext3_extent_header *neh; + struct ext3_extent_idx *fidx; + struct ext3_extent *ex; + int i = at, k, m, a; + unsigned long newblock, oldblock, border; + int *ablocks = NULL; /* array of allocated blocks */ + int err = 0; + + /* make decision: where to split? */ + /* FIXME: now desicion is simplest: at current extent */ + + /* if current leaf will be splitted, then we should use + * border from split point */ + EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); + if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { + border = path[depth].p_ext[1].ee_block; + ext_debug(tree, "leaf will be splitted." + " next leaf starts at %d\n", + (int)border); + } else { + border = newext->ee_block; + ext_debug(tree, "leaf will be added." + " next leaf starts at %d\n", + (int)border); + } + + /* + * if error occurs, then we break processing + * and turn filesystem read-only. so, index won't + * be inserted and tree will be in consistent + * state. 
next mount will repair buffers too + */ + + /* + * get array to track all allocated blocks + * we need this to handle errors and free blocks + * upon them + */ + ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); + if (!ablocks) + return -ENOMEM; + memset(ablocks, 0, sizeof(unsigned long) * depth); + + /* allocate all needed blocks */ + ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); + for (a = 0; a < depth - at; a++) { + newblock = ext3_ext_new_block(handle, tree, path, newext, &err); + if (newblock == 0) + goto cleanup; + ablocks[a] = newblock; + } + + /* initialize new leaf */ + newblock = ablocks[--a]; + EXT_ASSERT(newblock); + bh = sb_getblk(tree->inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; + } + lock_buffer(bh); + + if ((err = ext3_journal_get_create_access(handle, bh))) + goto cleanup; + + neh = EXT_BLOCK_HDR(bh); + neh->eh_entries = 0; + neh->eh_max = ext3_ext_space_block(tree); + neh->eh_magic = EXT3_EXT_MAGIC; + neh->eh_depth = 0; + ex = EXT_FIRST_EXTENT(neh); + + /* move remain of path[depth] to the new leaf */ + EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); + /* start copy from next extent */ + /* TODO: we could do it by single memmove */ + m = 0; + path[depth].p_ext++; + while (path[depth].p_ext <= + EXT_MAX_EXTENT(path[depth].p_hdr)) { + ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", + path[depth].p_ext->ee_block, + path[depth].p_ext->ee_start, + path[depth].p_ext->ee_len, + newblock); + memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent)); + neh->eh_entries++; + m++; + } + mark_buffer_uptodate(bh, 1); + unlock_buffer(bh); + + if ((err = ext3_journal_dirty_metadata(handle, bh))) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old leaf */ + if (m) { + if ((err = ext3_ext_get_access(handle, tree, path + depth))) + goto cleanup; + path[depth].p_hdr->eh_entries -= m; + if ((err = ext3_ext_dirty(handle, tree, path + depth))) + goto cleanup; + + } + + /* create intermediate indexes */ + k = depth - at - 1; + EXT_ASSERT(k >= 0); + if (k) + ext_debug(tree, "create %d intermediate indices\n", k); + /* insert new index into current index block */ + /* current depth stored in i var */ + i = depth - 1; + while (k--) { + oldblock = newblock; + newblock = ablocks[--a]; + bh = sb_getblk(tree->inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; + } + lock_buffer(bh); + + if ((err = ext3_journal_get_create_access(handle, bh))) + goto cleanup; + + neh = EXT_BLOCK_HDR(bh); + neh->eh_entries = 1; + neh->eh_magic = EXT3_EXT_MAGIC; + neh->eh_max = ext3_ext_space_block_idx(tree); + neh->eh_depth = depth - i; + fidx = EXT_FIRST_INDEX(neh); + fidx->ei_block = border; + fidx->ei_leaf = oldblock; + + ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", + i, newblock, border, oldblock); + /* copy indexes */ + m = 0; + path[i].p_idx++; + + ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx, + EXT_MAX_INDEX(path[i].p_hdr)); + EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == + EXT_LAST_INDEX(path[i].p_hdr)); + while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { + ext_debug(tree, "%d: move %d:%d in new index %lu\n", + i, path[i].p_idx->ei_block, + path[i].p_idx->ei_leaf, newblock); + memmove(++fidx, path[i].p_idx++, + sizeof(struct ext3_extent_idx)); + neh->eh_entries++; + EXT_ASSERT(neh->eh_entries <= neh->eh_max); + m++; + } + mark_buffer_uptodate(bh, 1); + unlock_buffer(bh); + + if ((err = ext3_journal_dirty_metadata(handle, bh))) + goto cleanup; + brelse(bh); + bh = NULL; + + 
/* correct old index */ + if (m) { + err = ext3_ext_get_access(handle, tree, path + i); + if (err) + goto cleanup; + path[i].p_hdr->eh_entries -= m; + err = ext3_ext_dirty(handle, tree, path + i); + if (err) + goto cleanup; + } + + i--; + } + + /* insert new index */ + if (!err) + err = ext3_ext_insert_index(handle, tree, path + at, + border, newblock); + +cleanup: + if (bh) { + if (buffer_locked(bh)) + unlock_buffer(bh); + brelse(bh); + } + + if (err) { + /* free all allocated blocks in error case */ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; + ext3_free_blocks(handle, tree->inode, ablocks[i], 1); + } + } + kfree(ablocks); + + return err; +} + +/* + * routine implements tree growing procedure: + * - allocates new block + * - moves top-level data (index block or leaf) into the new block + * - initialize new top-level, creating index that points to the + * just created block + */ +static int ext3_ext_grow_indepth(handle_t *handle, + struct ext3_extents_tree *tree, + struct ext3_ext_path *path, + struct ext3_extent *newext) +{ + struct ext3_ext_path *curp = path; + struct ext3_extent_header *neh; + struct ext3_extent_idx *fidx; + struct buffer_head *bh; + unsigned long newblock; + int err = 0; + + newblock = ext3_ext_new_block(handle, tree, path, newext, &err); + if (newblock == 0) + return err; + + bh = sb_getblk(tree->inode->i_sb, newblock); + if (!bh) { + err = -EIO; + ext3_std_error(tree->inode->i_sb, err); + return err; + } + lock_buffer(bh); + + if ((err = ext3_journal_get_create_access(handle, bh))) { + unlock_buffer(bh); + goto out; + } + + /* move top-level index/leaf into new block */ + memmove(bh->b_data, curp->p_hdr, tree->buffer_len); + + /* set size of new block */ + neh = EXT_BLOCK_HDR(bh); + /* old root could have indexes or leaves + * so calculate eh_max right way */ + if (EXT_DEPTH(tree)) + neh->eh_max = ext3_ext_space_block_idx(tree); + else + neh->eh_max = ext3_ext_space_block(tree); + neh->eh_magic = EXT3_EXT_MAGIC; + mark_buffer_uptodate(bh, 1); + unlock_buffer(bh); + + if ((err = ext3_journal_dirty_metadata(handle, bh))) + goto out; + + /* create index in new top-level index: num,max,pointer */ + if ((err = ext3_ext_get_access(handle, tree, curp))) + goto out; + + curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; + curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); + curp->p_hdr->eh_entries = 1; + curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); + /* FIXME: it works, but actually path[0] can be index */ + curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; + curp->p_idx->ei_leaf = newblock; + + neh = EXT_ROOT_HDR(tree); + fidx = EXT_FIRST_INDEX(neh); + ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", + neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); + + neh->eh_depth = path->p_depth + 1; + err = ext3_ext_dirty(handle, tree, curp); +out: + brelse(bh); + + return err; +} + +/* + * routine finds empty index and adds new leaf. 
if no free index found + * then it requests in-depth growing + */ +static int ext3_ext_create_new_leaf(handle_t *handle, + struct ext3_extents_tree *tree, + struct ext3_ext_path *path, + struct ext3_extent *newext) +{ + struct ext3_ext_path *curp; + int depth, i, err = 0; + +repeat: + i = depth = EXT_DEPTH(tree); + + /* walk up to the tree and look for free index entry */ + curp = path + depth; + while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { + i--; + curp--; + } + + /* we use already allocated block for index block + * so, subsequent data blocks should be contigoues */ + if (EXT_HAS_FREE_INDEX(curp)) { + /* if we found index with free entry, then use that + * entry: create all needed subtree and add new leaf */ + err = ext3_ext_split(handle, tree, path, newext, i); + + /* refill path */ + ext3_ext_drop_refs(path); + path = ext3_ext_find_extent(tree, newext->ee_block, path); + if (IS_ERR(path)) + err = PTR_ERR(path); + } else { + /* tree is full, time to grow in depth */ + err = ext3_ext_grow_indepth(handle, tree, path, newext); + + /* refill path */ + ext3_ext_drop_refs(path); + path = ext3_ext_find_extent(tree, newext->ee_block, path); + if (IS_ERR(path)) + err = PTR_ERR(path); + + /* + * only first (depth 0 -> 1) produces free space + * in all other cases we have to split growed tree + */ + depth = EXT_DEPTH(tree); + if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { + /* now we need split */ + goto repeat; + } + } + + if (err) + return err; + + return 0; +} + +/* + * returns allocated block in subsequent extent or EXT_MAX_BLOCK + * NOTE: it consider block number from index entry as + * allocated block. thus, index entries have to be consistent + * with leafs + */ +static unsigned long +ext3_ext_next_allocated_block(struct ext3_ext_path *path) +{ + int depth; + + EXT_ASSERT(path != NULL); + depth = path->p_depth; + + if (depth == 0 && path->p_ext == NULL) + return EXT_MAX_BLOCK; + + /* FIXME: what if index isn't full ?! */ + while (depth >= 0) { + if (depth == path->p_depth) { + /* leaf */ + if (path[depth].p_ext != + EXT_LAST_EXTENT(path[depth].p_hdr)) + return path[depth].p_ext[1].ee_block; + } else { + /* index */ + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return path[depth].p_idx[1].ei_block; + } + depth--; + } + + return EXT_MAX_BLOCK; +} + +/* + * returns first allocated block from next leaf or EXT_MAX_BLOCK + */ +static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, + struct ext3_ext_path *path) +{ + int depth; + + EXT_ASSERT(path != NULL); + depth = path->p_depth; + + /* zero-tree has no leaf blocks at all */ + if (depth == 0) + return EXT_MAX_BLOCK; + + /* go to index block */ + depth--; + + while (depth >= 0) { + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return path[depth].p_idx[1].ei_block; + depth--; + } + + return EXT_MAX_BLOCK; +} + +/* + * if leaf gets modified and modified extent is first in the leaf + * then we have to correct all indexes above + * TODO: do we need to correct tree in all cases? 
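
/*
 * Illustrative userspace sketch (not from the patch): the "climb towards the
 * root" walk used by ext3_ext_next_allocated_block() above.  A level that is
 * not positioned on its rightmost entry knows the first block of its right
 * neighbour; if every level is exhausted there is nothing to the right.
 * "struct level" is a simplified stand-in for struct ext3_ext_path.
 */
#include <stdio.h>

#define MAX_BLOCK 0xffffffffu

struct level { int cur; int last; unsigned next_block; };

static unsigned next_allocated(struct level *path, int depth)
{
        int i;

        for (i = depth; i >= 0; i--)
                if (path[i].cur != path[i].last)
                        return path[i].next_block;
        return MAX_BLOCK;
}

int main(void)
{
        /* leaf is on its last extent, but the root index has a right sibling */
        struct level path[2] = {
                { 3, 7, 9000 },         /* root index level      */
                { 5, 5, 0 },            /* leaf level, exhausted */
        };

        printf("next allocated block: %u\n", next_allocated(path, 1));
        return 0;
}
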
+ */ +int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, + struct ext3_ext_path *path) +{ + struct ext3_extent_header *eh; + int depth = EXT_DEPTH(tree); + struct ext3_extent *ex; + unsigned long border; + int k, err = 0; + + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + EXT_ASSERT(ex); + EXT_ASSERT(eh); + + if (depth == 0) { + /* there is no tree at all */ + return 0; + } + + if (ex != EXT_FIRST_EXTENT(eh)) { + /* we correct tree if first leaf got modified only */ + return 0; + } + + /* + * TODO: we need correction if border is smaller then current one + */ + k = depth - 1; + border = path[depth].p_ext->ee_block; + if ((err = ext3_ext_get_access(handle, tree, path + k))) + return err; + path[k].p_idx->ei_block = border; + if ((err = ext3_ext_dirty(handle, tree, path + k))) + return err; + + while (k--) { + /* change all left-side indexes */ + if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) + break; + if ((err = ext3_ext_get_access(handle, tree, path + k))) + break; + path[k].p_idx->ei_block = border; + if ((err = ext3_ext_dirty(handle, tree, path + k))) + break; + } + + return err; +} + +static int inline +ext3_can_extents_be_merged(struct ext3_extents_tree *tree, + struct ext3_extent *ex1, + struct ext3_extent *ex2) +{ + if (ex1->ee_block + ex1->ee_len != ex2->ee_block) + return 0; + +#ifdef AGRESSIVE_TEST + if (ex1->ee_len >= 4) + return 0; +#endif + + if (!tree->ops->mergable) + return 1; + + return tree->ops->mergable(ex1, ex2); +} + +/* + * this routine tries to merge requsted extent into the existing + * extent or inserts requested extent as new one into the tree, + * creating new leaf in no-space case + */ +int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, + struct ext3_ext_path *path, + struct ext3_extent *newext) +{ + struct ext3_extent_header * eh; + struct ext3_extent *ex, *fex; + struct ext3_extent *nearex; /* nearest extent */ + struct ext3_ext_path *npath = NULL; + int depth, len, err, next; + + EXT_ASSERT(newext->ee_len > 0); + depth = EXT_DEPTH(tree); + ex = path[depth].p_ext; + EXT_ASSERT(path[depth].p_hdr); + + /* try to insert block into found extent and return */ + if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { + ext_debug(tree, "append %d block to %d:%d (from %d)\n", + newext->ee_len, ex->ee_block, ex->ee_len, + ex->ee_start); + if ((err = ext3_ext_get_access(handle, tree, path + depth))) + return err; + ex->ee_len += newext->ee_len; + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } + +repeat: + depth = EXT_DEPTH(tree); + eh = path[depth].p_hdr; + if (eh->eh_entries < eh->eh_max) + goto has_space; + + /* probably next leaf has space for us? 
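
/*
 * Illustrative userspace sketch (not from the patch): the merge test used on
 * the insert path.  ext3_can_extents_be_merged() above checks logical
 * adjacency, and the block-map helper ext3_ext_mergable() later in this file
 * adds the physical-adjacency check; two extents that pass both can be glued
 * by simply extending the length.  "struct ext" is a simplified stand-in for
 * struct ext3_extent.
 */
#include <stdio.h>
#include <stdint.h>

struct ext { uint32_t block; uint32_t start; uint16_t len; };

static int mergeable(const struct ext *a, const struct ext *b)
{
        return a->block + a->len == b->block &&         /* logically adjacent  */
               a->start + a->len == b->start;           /* physically adjacent */
}

int main(void)
{
        struct ext a = { 10, 500, 4 };          /* logical 10..13 -> phys 500..503 */
        struct ext b = { 14, 504, 2 };          /* logical 14..15 -> phys 504..505 */

        if (mergeable(&a, &b)) {
                a.len += b.len;                 /* what the insert path does */
                printf("merged extent %u:%u:%u\n", a.block, a.start, a.len);
        }
        return 0;
}
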
*/ + fex = EXT_LAST_EXTENT(eh); + next = ext3_ext_next_leaf_block(tree, path); + if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { + ext_debug(tree, "next leaf block - %d\n", next); + EXT_ASSERT(!npath); + npath = ext3_ext_find_extent(tree, next, NULL); + if (IS_ERR(npath)) + return PTR_ERR(npath); + EXT_ASSERT(npath->p_depth == path->p_depth); + eh = npath[depth].p_hdr; + if (eh->eh_entries < eh->eh_max) { + ext_debug(tree, "next leaf isnt full(%d)\n", + eh->eh_entries); + path = npath; + goto repeat; + } + ext_debug(tree, "next leaf hasno free space(%d,%d)\n", + eh->eh_entries, eh->eh_max); + } + + /* + * there is no free space in found leaf + * we're gonna add new leaf in the tree + */ + err = ext3_ext_create_new_leaf(handle, tree, path, newext); + if (err) + goto cleanup; + depth = EXT_DEPTH(tree); + eh = path[depth].p_hdr; + +has_space: + nearex = path[depth].p_ext; + + if ((err = ext3_ext_get_access(handle, tree, path + depth))) + goto cleanup; + + if (!nearex) { + /* there is no extent in this leaf, create first one */ + ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", + newext->ee_block, newext->ee_start, + newext->ee_len); + path[depth].p_ext = EXT_FIRST_EXTENT(eh); + } else if (newext->ee_block > nearex->ee_block) { + EXT_ASSERT(newext->ee_block != nearex->ee_block); + if (nearex != EXT_LAST_EXTENT(eh)) { + len = EXT_MAX_EXTENT(eh) - nearex; + len = (len - 1) * sizeof(struct ext3_extent); + len = len < 0 ? 0 : len; + ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " + "move %d from 0x%p to 0x%p\n", + newext->ee_block, newext->ee_start, + newext->ee_len, + nearex, len, nearex + 1, nearex + 2); + memmove(nearex + 2, nearex + 1, len); + } + path[depth].p_ext = nearex + 1; + } else { + EXT_ASSERT(newext->ee_block != nearex->ee_block); + len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); + len = len < 0 ? 0 : len; + ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " + "move %d from 0x%p to 0x%p\n", + newext->ee_block, newext->ee_start, newext->ee_len, + nearex, len, nearex + 1, nearex + 2); + memmove(nearex + 1, nearex, len); + path[depth].p_ext = nearex; + } + + eh->eh_entries++; + nearex = path[depth].p_ext; + nearex->ee_block = newext->ee_block; + nearex->ee_start = newext->ee_start; + nearex->ee_len = newext->ee_len; + /* FIXME: support for large fs */ + nearex->ee_start_hi = 0; + +merge: + /* try to merge extents to the right */ + while (nearex < EXT_LAST_EXTENT(eh)) { + if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) + break; + /* merge with next extent! 
*/ + nearex->ee_len += nearex[1].ee_len; + if (nearex + 1 < EXT_LAST_EXTENT(eh)) { + len = (EXT_LAST_EXTENT(eh) - nearex - 1) * + sizeof(struct ext3_extent); + memmove(nearex + 1, nearex + 2, len); + } + eh->eh_entries--; + EXT_ASSERT(eh->eh_entries > 0); + } + + /* try to merge extents to the left */ + + /* time to correct all indexes above */ + err = ext3_ext_correct_indexes(handle, tree, path); + if (err) + goto cleanup; + + err = ext3_ext_dirty(handle, tree, path + depth); + +cleanup: + if (npath) { + ext3_ext_drop_refs(npath); + kfree(npath); + } + ext3_ext_tree_changed(tree); + ext3_ext_invalidate_cache(tree); + return err; +} + +int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, + unsigned long num, ext_prepare_callback func) +{ + struct ext3_ext_path *path = NULL; + struct ext3_ext_cache cbex; + struct ext3_extent *ex; + unsigned long next, start = 0, end = 0; + unsigned long last = block + num; + int depth, exists, err = 0; + + EXT_ASSERT(tree); + EXT_ASSERT(func); + EXT_ASSERT(tree->inode); + EXT_ASSERT(tree->root); + + while (block < last && block != EXT_MAX_BLOCK) { + num = last - block; + /* find extent for this block */ + path = ext3_ext_find_extent(tree, block, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + break; + } + + depth = EXT_DEPTH(tree); + EXT_ASSERT(path[depth].p_hdr); + ex = path[depth].p_ext; + next = ext3_ext_next_allocated_block(path); + + exists = 0; + if (!ex) { + /* there is no extent yet, so try to allocate + * all requested space */ + start = block; + end = block + num; + } else if (ex->ee_block > block) { + /* need to allocate space before found extent */ + start = block; + end = ex->ee_block; + if (block + num < end) + end = block + num; + } else if (block >= ex->ee_block + ex->ee_len) { + /* need to allocate space after found extent */ + start = block; + end = block + num; + if (end >= next) + end = next; + } else if (block >= ex->ee_block) { + /* + * some part of requested space is covered + * by found extent + */ + start = block; + end = ex->ee_block + ex->ee_len; + if (block + num < end) + end = block + num; + exists = 1; + } else { + BUG(); + } + EXT_ASSERT(end > start); + + if (!exists) { + cbex.ec_block = start; + cbex.ec_len = end - start; + cbex.ec_start = 0; + cbex.ec_type = EXT3_EXT_CACHE_GAP; + } else { + cbex.ec_block = ex->ee_block; + cbex.ec_len = ex->ee_len; + cbex.ec_start = ex->ee_start; + cbex.ec_type = EXT3_EXT_CACHE_EXTENT; + } + + EXT_ASSERT(cbex.ec_len > 0); + EXT_ASSERT(path[depth].p_hdr); + err = func(tree, path, &cbex); + ext3_ext_drop_refs(path); + + if (err < 0) + break; + if (err == EXT_REPEAT) + continue; + else if (err == EXT_BREAK) { + err = 0; + break; + } + + if (EXT_DEPTH(tree) != depth) { + /* depth was changed. 
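
/*
 * Illustrative userspace sketch (not from the patch): the range
 * classification inside ext3_ext_walk_space() above.  For the block the walk
 * is currently at, the found extent either covers it, or the uncovered part
 * up to that extent (or up to the next allocated block) is reported as a gap.
 * Types and names are simplified stand-ins.
 */
#include <stdio.h>

struct ext { unsigned block, len; };
struct chunk { unsigned start, len; int mapped; };

static struct chunk classify(const struct ext *ex, unsigned block,
                             unsigned num, unsigned next)
{
        struct chunk c = { block, 0, 0 };

        if (!ex) {                                      /* empty tree: all gap */
                c.len = num;
        } else if (block < ex->block) {                 /* gap before extent   */
                c.len = ex->block - block;
                if (c.len > num)
                        c.len = num;
        } else if (block >= ex->block + ex->len) {      /* gap after extent    */
                c.len = num;
                if (block + num > next)
                        c.len = next - block;
        } else {                                        /* covered by extent   */
                c.start = ex->block;
                c.len = ex->len;
                c.mapped = 1;
        }
        return c;
}

int main(void)
{
        struct ext ex = { 100, 8 };
        struct chunk c = classify(&ex, 96, 50, 0xffffffffu);

        printf("%s at %u, len %u\n", c.mapped ? "extent" : "gap", c.start, c.len);
        return 0;
}
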
we have to realloc path */ + kfree(path); + path = NULL; + } + + block = cbex.ec_block + cbex.ec_len; + } + + if (path) { + ext3_ext_drop_refs(path); + kfree(path); + } + + return err; +} + +static inline void +ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, + __u32 len, __u32 start, int type) +{ + EXT_ASSERT(len > 0); + if (tree->cex) { + tree->cex->ec_type = type; + tree->cex->ec_block = block; + tree->cex->ec_len = len; + tree->cex->ec_start = start; + } +} + +/* + * this routine calculate boundaries of the gap requested block fits into + * and cache this gap + */ +static inline void +ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, + struct ext3_ext_path *path, + unsigned long block) +{ + int depth = EXT_DEPTH(tree); + unsigned long lblock, len; + struct ext3_extent *ex; + + if (!tree->cex) + return; + + ex = path[depth].p_ext; + if (ex == NULL) { + /* there is no extent yet, so gap is [0;-] */ + lblock = 0; + len = EXT_MAX_BLOCK; + ext_debug(tree, "cache gap(whole file):"); + } else if (block < ex->ee_block) { + lblock = block; + len = ex->ee_block - block; + ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", + (unsigned long) block, + (unsigned long) ex->ee_block, + (unsigned long) ex->ee_len); + } else if (block >= ex->ee_block + ex->ee_len) { + lblock = ex->ee_block + ex->ee_len; + len = ext3_ext_next_allocated_block(path); + ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", + (unsigned long) ex->ee_block, + (unsigned long) ex->ee_len, + (unsigned long) block); + EXT_ASSERT(len > lblock); + len = len - lblock; + } else { + lblock = len = 0; + BUG(); + } + + ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len); + ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP); +} + +static inline int +ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, + struct ext3_extent *ex) +{ + struct ext3_ext_cache *cex = tree->cex; + + /* is there cache storage at all? */ + if (!cex) + return EXT3_EXT_CACHE_NO; + + /* has cache valid data? */ + if (cex->ec_type == EXT3_EXT_CACHE_NO) + return EXT3_EXT_CACHE_NO; + + EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || + cex->ec_type == EXT3_EXT_CACHE_EXTENT); + if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { + ex->ee_block = cex->ec_block; + ex->ee_start = cex->ec_start; + ex->ee_len = cex->ec_len; + ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", + (unsigned long) block, + (unsigned long) ex->ee_block, + (unsigned long) ex->ee_len, + (unsigned long) ex->ee_start); + return cex->ec_type; + } + + /* not in cache */ + return EXT3_EXT_CACHE_NO; +} + +/* + * routine removes index from the index block + * it's used in truncate case only. 
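
/*
 * Illustrative userspace sketch (not from the patch): the single-slot lookup
 * done by ext3_ext_in_cache() above.  The inode remembers one range, either a
 * real extent or a known gap, and the cache answers only when the queried
 * block falls inside that range; ext3_ext_get_block() then maps logical to
 * physical exactly as below.  Simplified stand-in types.
 */
#include <stdio.h>
#include <stdint.h>

enum { CACHE_NO, CACHE_GAP, CACHE_EXTENT };

struct cache { int type; uint32_t block, len, start; };

static int cache_lookup(const struct cache *c, uint32_t block, uint32_t *phys)
{
        if (c->type == CACHE_NO)
                return CACHE_NO;
        if (block < c->block || block >= c->block + c->len)
                return CACHE_NO;                        /* outside cached range */
        if (c->type == CACHE_EXTENT)
                *phys = c->start + (block - c->block);  /* logical -> physical  */
        return c->type;
}

int main(void)
{
        struct cache c = { CACHE_EXTENT, 100, 8, 5000 };
        uint32_t phys = 0;

        if (cache_lookup(&c, 103, &phys) == CACHE_EXTENT)
                printf("block 103 -> %u\n", phys);      /* prints 5003 */
        return 0;
}
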
thus all requests are for + * last index in the block only + */ +int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, + struct ext3_ext_path *path) +{ + struct buffer_head *bh; + int err; + + /* free index block */ + path--; + EXT_ASSERT(path->p_hdr->eh_entries); + if ((err = ext3_ext_get_access(handle, tree, path))) + return err; + path->p_hdr->eh_entries--; + if ((err = ext3_ext_dirty(handle, tree, path))) + return err; + ext_debug(tree, "index is empty, remove it, free block %d\n", + path->p_idx->ei_leaf); + bh = sb_get_hash_table(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); + ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); + return err; +} + +int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, + struct ext3_ext_path *path) +{ + int depth = EXT_DEPTH(tree); + int needed; + + if (path) { + /* probably there is space in leaf? */ + if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) + return 1; + } + + /* + * the worste case we're expecting is creation of the + * new root (growing in depth) with index splitting + * for splitting we have to consider depth + 1 because + * previous growing could increase it + */ + depth = depth + 1; + + /* + * growing in depth: + * block allocation + new root + old root + */ + needed = EXT3_ALLOC_NEEDED + 2; + + /* index split. we may need: + * allocate intermediate indexes and new leaf + * change two blocks at each level, but root + * modify root block (inode) + */ + needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; + + return needed; +} + +static int +ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, + struct ext3_ext_path *path, unsigned long start, + unsigned long end) +{ + struct ext3_extent *ex, tex; + struct ext3_ext_path *npath; + int depth, creds, err; + + depth = EXT_DEPTH(tree); + ex = path[depth].p_ext; + EXT_ASSERT(ex); + EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); + EXT_ASSERT(ex->ee_block < start); + + /* calculate tail extent */ + tex.ee_block = end + 1; + EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); + tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; + + creds = ext3_ext_calc_credits_for_insert(tree, path); + handle = ext3_ext_journal_restart(handle, creds); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + /* calculate head extent. use primary extent */ + err = ext3_ext_get_access(handle, tree, path + depth); + if (err) + return err; + ex->ee_len = start - ex->ee_block; + err = ext3_ext_dirty(handle, tree, path + depth); + if (err) + return err; + + /* FIXME: some callback to free underlying resource + * and correct ee_start? 
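
/*
 * Illustrative worked example (not from the patch): the head/tail arithmetic
 * in ext3_ext_split_for_rm() above when [start,end] is punched out of the
 * middle of an extent.  Values and the struct are made up for illustration.
 */
#include <stdio.h>

struct ext { unsigned block, len; };

int main(void)
{
        struct ext ex = { 100, 50 };            /* covers logical 100..149 */
        unsigned start = 120, end = 129;        /* remove 120..129         */

        struct ext tail = {
                end + 1,                        /* 130                     */
                ex.block + ex.len - (end + 1),  /* 150 - 130 = 20          */
        };
        ex.len = start - ex.block;              /* head shrinks to 100..119 */

        printf("head %u:%u, tail %u:%u\n", ex.block, ex.len, tail.block, tail.len);
        return 0;
}
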
*/ + ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", + ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); + + npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); + if (IS_ERR(npath)) + return PTR_ERR(npath); + depth = EXT_DEPTH(tree); + EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); + EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); + + err = ext3_ext_insert_extent(handle, tree, npath, &tex); + ext3_ext_drop_refs(npath); + kfree(npath); + + return err; +} + +static int +ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, + struct ext3_ext_path *path, unsigned long start, + unsigned long end) +{ + struct ext3_extent *ex, *fu = NULL, *lu, *le; + int err = 0, correct_index = 0; + int depth = EXT_DEPTH(tree), credits; + struct ext3_extent_header *eh; + unsigned a, b, block, num; + + ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); + if (!path[depth].p_hdr) + path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); + eh = path[depth].p_hdr; + EXT_ASSERT(eh); + EXT_ASSERT(eh->eh_entries <= eh->eh_max); + EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); + + /* find where to start removing */ + le = ex = EXT_LAST_EXTENT(eh); + while (ex != EXT_FIRST_EXTENT(eh)) { + if (ex->ee_block <= end) + break; + ex--; + } + + if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { + /* removal of internal part of the extent requested + * tail and head must be placed in different extent + * so, we have to insert one more extent */ + path[depth].p_ext = ex; + return ext3_ext_split_for_rm(handle, tree, path, start, end); + } + + lu = ex; + while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) { + ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); + path[depth].p_ext = ex; + + a = ex->ee_block > start ? ex->ee_block : start; + b = ex->ee_block + ex->ee_len - 1 < end ? + ex->ee_block + ex->ee_len - 1 : end; + + ext_debug(tree, " border %u:%u\n", a, b); + + if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { + block = 0; + num = 0; + BUG(); + } else if (a != ex->ee_block) { + /* remove tail of the extent */ + block = ex->ee_block; + num = a - block; + } else if (b != ex->ee_block + ex->ee_len - 1) { + /* remove head of the extent */ + block = a; + num = b - a; + } else { + /* remove whole extent: excelent! 
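
/*
 * Illustrative userspace sketch (not from the patch): the per-extent clamping
 * done by ext3_ext_rm_leaf() above.  The removal range [start,end] is clipped
 * to the extent, and the clipped range tells whether the tail, the head, or
 * the whole extent is being freed.  Simplified stand-in types.
 */
#include <stdio.h>

struct ext { unsigned block, len; };

int main(void)
{
        struct ext ex = { 100, 10 };            /* covers logical 100..109 */
        unsigned start = 105, end = 200;

        unsigned a = start > ex.block ? start : ex.block;
        unsigned b = end < ex.block + ex.len - 1 ? end : ex.block + ex.len - 1;

        if (a == ex.block && b == ex.block + ex.len - 1)
                printf("whole extent %u..%u removed\n", a, b);
        else if (a != ex.block)
                printf("tail %u..%u removed, head %u:%u stays\n",
                       a, b, ex.block, a - ex.block);
        else
                printf("head %u..%u removed\n", a, b);
        return 0;
}
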
*/ + block = ex->ee_block; + num = 0; + EXT_ASSERT(a == ex->ee_block && + b == ex->ee_block + ex->ee_len - 1); + } + + if (ex == EXT_FIRST_EXTENT(eh)) + correct_index = 1; + + credits = 1; + if (correct_index) + credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; + if (tree->ops->remove_extent_credits) + credits+=tree->ops->remove_extent_credits(tree,ex,a,b); + + handle = ext3_ext_journal_restart(handle, credits); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out; + } + + err = ext3_ext_get_access(handle, tree, path + depth); + if (err) + goto out; + + if (tree->ops->remove_extent) + err = tree->ops->remove_extent(tree, ex, a, b); + if (err) + goto out; + + if (num == 0) { + /* this extent is removed entirely mark slot unused */ + ex->ee_start = 0; + eh->eh_entries--; + fu = ex; + } + + ex->ee_block = block; + ex->ee_len = num; + + err = ext3_ext_dirty(handle, tree, path + depth); + if (err) + goto out; + + ext_debug(tree, "new extent: %u:%u:%u\n", + ex->ee_block, ex->ee_len, ex->ee_start); + ex--; + } + + if (fu) { + /* reuse unused slots */ + while (lu < le) { + if (lu->ee_start) { + *fu = *lu; + lu->ee_start = 0; + fu++; + } + lu++; + } + } + + if (correct_index && eh->eh_entries) + err = ext3_ext_correct_indexes(handle, tree, path); + + /* if this leaf is free, then we should + * remove it from index block above */ + if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) + err = ext3_ext_rm_idx(handle, tree, path + depth); + +out: + return err; +} + + +static struct ext3_extent_idx * +ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) +{ + struct ext3_extent_idx *ix; + + ix = EXT_LAST_INDEX(hdr); + while (ix != EXT_FIRST_INDEX(hdr)) { + if (ix->ei_block <= block) + break; + ix--; + } + return ix; +} + +/* + * returns 1 if current index have to be freed (even partial) + */ +static int inline +ext3_ext_more_to_rm(struct ext3_ext_path *path) +{ + EXT_ASSERT(path->p_idx); + + if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) + return 0; + + /* + * if truncate on deeper level happened it it wasn't partial + * so we have to consider current index for truncation + */ + if (path->p_hdr->eh_entries == path->p_block) + return 0; + return 1; +} + +int ext3_ext_remove_space(struct ext3_extents_tree *tree, + unsigned long start, unsigned long end) +{ + struct inode *inode = tree->inode; + struct super_block *sb = inode->i_sb; + int depth = EXT_DEPTH(tree); + struct ext3_ext_path *path; + handle_t *handle; + int i = 0, err = 0; + + ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); + + /* probably first extent we're gonna free will be last in block */ + handle = ext3_journal_start(inode, depth + 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ext3_ext_invalidate_cache(tree); + + /* + * we start scanning from right side freeing all the blocks + * after i_size and walking into the deep + */ + path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); + if (IS_ERR(path)) { + ext3_error(sb, __FUNCTION__, "Can't allocate path array"); + ext3_journal_stop(handle, inode); + return -ENOMEM; + } + memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); + path[i].p_hdr = EXT_ROOT_HDR(tree); + + while (i >= 0 && err == 0) { + if (i == depth) { + /* this is leaf block */ + err = ext3_ext_rm_leaf(handle, tree, path, start, end); + /* root level have p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + i--; + continue; + } + + /* this is index block */ + if (!path[i].p_hdr) { + ext_debug(tree, "initialize header\n"); + 
path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); + } + + EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); + EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); + + if (!path[i].p_idx) { + /* this level hasn't touched yet */ + path[i].p_idx = + ext3_ext_last_covered(path[i].p_hdr, end); + path[i].p_block = path[i].p_hdr->eh_entries + 1; + ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", + path[i].p_hdr, path[i].p_hdr->eh_entries); + } else { + /* we've already was here, see at next index */ + path[i].p_idx--; + } + + ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", + i, EXT_FIRST_INDEX(path[i].p_hdr), + path[i].p_idx); + if (ext3_ext_more_to_rm(path + i)) { + /* go to the next level */ + ext_debug(tree, "move to level %d (block %d)\n", + i + 1, path[i].p_idx->ei_leaf); + memset(path + i + 1, 0, sizeof(*path)); + path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); + if (!path[i+1].p_bh) { + /* should we reset i_size? */ + err = -EIO; + break; + } + /* put actual number of indexes to know is this + * number got changed at the next iteration */ + path[i].p_block = path[i].p_hdr->eh_entries; + i++; + } else { + /* we finish processing this index, go up */ + if (path[i].p_hdr->eh_entries == 0 && i > 0) { + /* index is empty, remove it + * handle must be already prepared by the + * truncatei_leaf() */ + err = ext3_ext_rm_idx(handle, tree, path + i); + } + /* root level have p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + i--; + ext_debug(tree, "return to level %d\n", i); + } + } + + /* TODO: flexible tree reduction should be here */ + if (path->p_hdr->eh_entries == 0) { + /* + * truncate to zero freed all the tree + * so, we need to correct eh_depth + */ + err = ext3_ext_get_access(handle, tree, path); + if (err == 0) { + EXT_ROOT_HDR(tree)->eh_depth = 0; + EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree); + err = ext3_ext_dirty(handle, tree, path); + } + } + ext3_ext_tree_changed(tree); + + kfree(path); + ext3_journal_stop(handle, inode); + + return err; +} + +/* + * called at mount time + */ +void ext3_ext_init(struct super_block *sb) +{ + /* + * possible initialization would be here + */ + + if (test_opt(sb, EXTENTS)) { + printk("EXT3-fs: file extents enabled"); +#ifdef AGRESSIVE_TEST + printk(", agressive tests"); +#endif +#ifdef CHECK_BINSEARCH + printk(", check binsearch"); +#endif + printk("\n"); + } +} + +/* + * called at umount time + */ +void ext3_ext_release(struct super_block *sb) +{ +} + +/************************************************************************ + * VFS related routines + ************************************************************************/ + +static int ext3_get_inode_write_access(handle_t *handle, void *buffer) +{ + /* we use in-core data, not bh */ + return 0; +} + +static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) +{ + struct inode *inode = buffer; + return ext3_mark_inode_dirty(handle, inode); +} + +static int ext3_ext_mergable(struct ext3_extent *ex1, + struct ext3_extent *ex2) +{ + /* FIXME: support for large fs */ + if (ex1->ee_start + ex1->ee_len == ex2->ee_start) + return 1; + return 0; +} + +static int +ext3_remove_blocks_credits(struct ext3_extents_tree *tree, + struct ext3_extent *ex, + unsigned long from, unsigned long to) +{ + int needed; + + /* at present, extent can't cross block group */; + needed = 4; /* bitmap + group desc + sb + inode */ + +#ifdef CONFIG_QUOTA + needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; +#endif + return needed; +} + +static int +ext3_remove_blocks(struct 
ext3_extents_tree *tree, + struct ext3_extent *ex, + unsigned long from, unsigned long to) +{ + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; + int i; + + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; + num = ex->ee_block + ex->ee_len - from; + start = ex->ee_start + ex->ee_len - num; + ext_debug(tree, "free last %lu blocks starting %lu\n", + num, start); + for (i = 0; i < num; i++) { + bh = sb_get_hash_table(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } + ext3_free_blocks(handle, tree->inode, start, num); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); + } else { + printk("strange request: removal(2) %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); + } + ext3_journal_stop(handle, tree->inode); + return 0; +} + +int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path, + unsigned long block) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + unsigned long bg_start; + unsigned long colour; + int depth; + + if (path) { + struct ext3_extent *ex; + depth = path->p_depth; + + /* try to predict block placement */ + if ((ex = path[depth].p_ext)) + return ex->ee_start + (block - ex->ee_block); + + /* it looks index is empty + * try to find starting from index itself */ + if (path[depth].p_bh) + return path[depth].p_bh->b_blocknr; + } + + /* OK. use inode's group */ + bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + + le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); + colour = (current->pid % 16) * + (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour + block; +} + +static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, + struct ext3_ext_path *path, + struct ext3_extent *ex, int *err) +{ + struct inode *inode = tree->inode; + int newblock, goal; + + EXT_ASSERT(path); + EXT_ASSERT(ex); + EXT_ASSERT(ex->ee_start); + EXT_ASSERT(ex->ee_len); + + /* reuse block from the extent to order data/metadata */ + newblock = ex->ee_start++; + ex->ee_len--; + if (ex->ee_len == 0) { + ex->ee_len = 1; + /* allocate new block for the extent */ + goal = ext3_ext_find_goal(inode, path, ex->ee_block); + lock_kernel(); + ex->ee_start = ext3_new_block(handle, inode, goal, 0, 0, err); + unlock_kernel(); + if (ex->ee_start == 0) { + /* error occured: restore old extent */ + ex->ee_start = newblock; + return 0; + } + } + return newblock; +} + +static struct ext3_extents_helpers ext3_blockmap_helpers = { + .get_write_access = ext3_get_inode_write_access, + .mark_buffer_dirty = ext3_mark_buffer_dirty, + .mergable = ext3_ext_mergable, + .new_block = ext3_new_block_cb, + .remove_extent = ext3_remove_blocks, + .remove_extent_credits = ext3_remove_blocks_credits, +}; + +void ext3_init_tree_desc(struct ext3_extents_tree *tree, + struct inode *inode) +{ + tree->inode = inode; + tree->root = (void *) EXT3_I(inode)->i_data; + tree->buffer = (void *) inode; + tree->buffer_len = sizeof(EXT3_I(inode)->i_data); + tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; + tree->ops = &ext3_blockmap_helpers; +} + +int ext3_ext_get_block(handle_t *handle, struct inode *inode, + long iblock, struct buffer_head *bh_result, int create) +{ + struct 
ext3_ext_path *path = NULL; + struct ext3_extent newex; + struct ext3_extent *ex; + int goal, newblock, err = 0, depth; + struct ext3_extents_tree tree; + + clear_bit(BH_New, &bh_result->b_state); + ext3_init_tree_desc(&tree, inode); + ext_debug(&tree, "block %d requested for inode %u\n", + (int) iblock, (unsigned) inode->i_ino); + down_write(&EXT3_I(inode)->truncate_sem); + + /* check in cache */ + if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { + if (goal == EXT3_EXT_CACHE_GAP) { + if (!create) { + /* block isn't allocated yet and + * user don't want to allocate it */ + goto out2; + } + /* we should allocate requested block */ + } else if (goal == EXT3_EXT_CACHE_EXTENT) { + /* block is already allocated */ + newblock = iblock - newex.ee_block + newex.ee_start; + goto out; + } else { + EXT_ASSERT(0); + } + } + + /* find extent for this block */ + path = ext3_ext_find_extent(&tree, iblock, NULL); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out2; + } + + depth = EXT_DEPTH(&tree); + + /* + * consistent leaf must not be empty + * this situations is possible, though, _during_ tree modification + * this is why assert can't be put in ext3_ext_find_extent() + */ + EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); + + if ((ex = path[depth].p_ext)) { + /* if found exent covers block, simple return it */ + if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { + newblock = iblock - ex->ee_block + ex->ee_start; + ext_debug(&tree, "%d fit into %d:%d -> %d\n", + (int) iblock, ex->ee_block, ex->ee_len, + newblock); + ext3_ext_put_in_cache(&tree, ex->ee_block, + ex->ee_len, ex->ee_start, + EXT3_EXT_CACHE_EXTENT); + goto out; + } + } + + /* + * requested block isn't allocated yet + * we couldn't try to create block if create flag is zero + */ + if (!create) { + /* put just found gap into cache to speedup subsequest reqs */ + ext3_ext_put_gap_in_cache(&tree, path, iblock); + goto out2; + } + + /* allocate new block */ + goal = ext3_ext_find_goal(inode, path, iblock); + lock_kernel(); + newblock = ext3_new_block(handle, inode, goal, 0, 0, &err); + unlock_kernel(); + if (!newblock) + goto out2; + ext_debug(&tree, "allocate new block: goal %d, found %d\n", + goal, newblock); + + /* try to insert new extent into found leaf and return */ + newex.ee_block = iblock; + newex.ee_start = newblock; + newex.ee_len = 1; + err = ext3_ext_insert_extent(handle, &tree, path, &newex); + if (err) + goto out2; + + if (inode->i_size > EXT3_I(inode)->i_disksize) + EXT3_I(inode)->i_disksize = inode->i_size; + + /* previous routine could use block we allocated */ + newblock = newex.ee_start; + set_bit(BH_New, &bh_result->b_state); + + ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, + newex.ee_start, EXT3_EXT_CACHE_EXTENT); +out: + ext3_ext_show_leaf(&tree, path); + set_bit(BH_Mapped, &bh_result->b_state); + bh_result->b_dev = inode->i_sb->s_dev; + bh_result->b_blocknr = newblock; +out2: + if (path) { + ext3_ext_drop_refs(path); + kfree(path); + } + up_write(&EXT3_I(inode)->truncate_sem); + + return err; +} + +void ext3_ext_truncate(struct inode * inode) +{ + struct address_space *mapping = inode->i_mapping; + struct super_block *sb = inode->i_sb; + struct ext3_extents_tree tree; + unsigned long last_block; + handle_t *handle; + int err = 0; + + ext3_init_tree_desc(&tree, inode); + + /* + * probably first extent we're gonna free will be last in block + */ + err = ext3_writepage_trans_blocks(inode) + 3; + handle = ext3_journal_start(inode, err); + if (IS_ERR(handle)) + 
return; + + ext3_block_truncate_page(handle, mapping, inode->i_size); + + down_write(&EXT3_I(inode)->truncate_sem); + ext3_ext_invalidate_cache(&tree); + + /* + * TODO: optimization is possible here + * probably we need not scaning at all, + * because page truncation is enough + */ + if (ext3_orphan_add(handle, inode)) + goto out_stop; + + /* we have to know where to truncate from in crash case */ + EXT3_I(inode)->i_disksize = inode->i_size; + ext3_mark_inode_dirty(handle, inode); + + last_block = (inode->i_size + sb->s_blocksize - 1) >> + EXT3_BLOCK_SIZE_BITS(sb); + err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); + + /* In a multi-transaction truncate, we only make the final + * transaction synchronous */ + if (IS_SYNC(inode)) + handle->h_sync = 1; + +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext3_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext3_orphan_del(handle, inode); + + up_write(&EXT3_I(inode)->truncate_sem); + ext3_journal_stop(handle, inode); +} + +/* + * this routine calculate max number of blocks we could modify + * in order to allocate new block for an inode + */ +int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) +{ + struct ext3_extents_tree tree; + int needed; + + ext3_init_tree_desc(&tree, inode); + + needed = ext3_ext_calc_credits_for_insert(&tree, NULL); + + /* caller want to allocate num blocks */ + needed *= num; + +#ifdef CONFIG_QUOTA + /* + * FIXME: real calculation should be here + * it depends on blockmap format of qouta file + */ + needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; +#endif + + return needed; +} + +void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) +{ + struct ext3_extents_tree tree; + + ext3_init_tree_desc(&tree, inode); + ext3_extent_tree_init(handle, &tree); +} + +static int +ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, + struct ext3_ext_path *path, + struct ext3_ext_cache *newex) +{ + struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; + + if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) + return EXT_CONTINUE; + + if (buf->err < 0) + return EXT_BREAK; + if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) + return EXT_BREAK; + + if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { + buf->err++; + buf->cur += sizeof(*newex); + } else { + buf->err = -EFAULT; + return EXT_BREAK; + } + return EXT_CONTINUE; +} + +static int +ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, + struct ext3_ext_path *path, + struct ext3_ext_cache *ex) +{ + struct ext3_extent_tree_stats *buf = + (struct ext3_extent_tree_stats *) tree->private; + int depth; + + if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) + return EXT_CONTINUE; + + depth = EXT_DEPTH(tree); + buf->extents_num++; + if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) + buf->leaf_num++; + return EXT_CONTINUE; +} + +int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + + if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) + return -EINVAL; + + if (cmd == EXT3_IOC_GET_EXTENTS) { + struct ext3_extent_buf buf; + struct ext3_extents_tree tree; + + if (copy_from_user(&buf, (void *) arg, sizeof(buf))) + return -EFAULT; + + ext3_init_tree_desc(&tree, inode); + buf.cur = buf.buffer; + buf.err = 0; + tree.private = 
+		tree.private = &buf;
+		down_write(&EXT3_I(inode)->truncate_sem);
+		err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK,
+					  ext3_ext_store_extent_cb);
+		up_write(&EXT3_I(inode)->truncate_sem);
+		if (err == 0)
+			err = buf.err;
+	} else if (cmd == EXT3_IOC_GET_TREE_STATS) {
+		struct ext3_extent_tree_stats buf;
+		struct ext3_extents_tree tree;
+
+		ext3_init_tree_desc(&tree, inode);
+		down_write(&EXT3_I(inode)->truncate_sem);
+		buf.depth = EXT_DEPTH(&tree);
+		buf.extents_num = 0;
+		buf.leaf_num = 0;
+		tree.private = &buf;
+		err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK,
+					  ext3_ext_collect_stats_cb);
+		up_write(&EXT3_I(inode)->truncate_sem);
+		if (!err)
+			err = copy_to_user((void *) arg, &buf, sizeof(buf));
+	} else if (cmd == EXT3_IOC_GET_TREE_DEPTH) {
+		struct ext3_extents_tree tree;
+		ext3_init_tree_desc(&tree, inode);
+		down_write(&EXT3_I(inode)->truncate_sem);
+		err = EXT_DEPTH(&tree);
+		up_write(&EXT3_I(inode)->truncate_sem);
+	}
+
+	return err;
+}
+
+EXPORT_SYMBOL(ext3_init_tree_desc);
+EXPORT_SYMBOL(ext3_mark_inode_dirty);
+EXPORT_SYMBOL(ext3_ext_invalidate_cache);
+EXPORT_SYMBOL(ext3_ext_insert_extent);
+EXPORT_SYMBOL(ext3_ext_walk_space);
+EXPORT_SYMBOL(ext3_ext_find_goal);
+EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert);
Index: linux-2.4.29/fs/ext3/ialloc.c
===================================================================
--- linux-2.4.29.orig/fs/ext3/ialloc.c	2005-05-03 16:50:30.216045296 +0300
+++ linux-2.4.29/fs/ext3/ialloc.c	2005-05-03 16:52:08.804057640 +0300
@@ -553,7 +553,8 @@
 	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	inode->u.ext3_i.i_flags = dir->u.ext3_i.i_flags & ~EXT3_INDEX_FL;
+	inode->u.ext3_i.i_flags = dir->u.ext3_i.i_flags &
+		~(EXT3_INDEX_FL | EXT3_EXTENTS_FL);
 	if (S_ISLNK(mode))
 		inode->u.ext3_i.i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
 #ifdef EXT3_FRAGMENTS
@@ -592,6 +593,19 @@
 		iloc.bh = NULL;
 		goto fail;
 	}
+	if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) {
+		EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL;
+		memset(&EXT3_I(inode)->i_cached_extent, 0, sizeof(__u32) * 4);
+		ext3_extents_initialize_blockmap(handle, inode);
+		if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) {
+			err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
+			if (err) goto fail;
+			EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS);
+			BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata");
+			err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+		}
+	}
+
 	err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 	if (err)
 		goto fail;
Index: linux-2.4.29/fs/ext3/inode.c
===================================================================
--- linux-2.4.29.orig/fs/ext3/inode.c	2005-05-03 16:51:50.331865840 +0300
+++ linux-2.4.29/fs/ext3/inode.c	2005-05-03 16:52:08.808057032 +0300
@@ -861,6 +861,15 @@
 	goto reread;
 }
 
+static inline int
+ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block,
+		    struct buffer_head *bh, int create)
+{
+	if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
+		return ext3_ext_get_block(handle, inode, block, bh, create);
+	return ext3_get_block_handle(handle, inode, block, bh, create);
+}
+
 /*
  * The BKL is not held on entry here.
  */
@@ -874,7 +883,7 @@
 		handle = ext3_journal_current_handle();
 		J_ASSERT(handle != 0);
 	}
-	ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create);
+	ret = ext3_get_block_wrap(handle, inode, iblock, bh_result, create);
 	return ret;
 }
 
@@ -892,7 +901,7 @@
 	dummy.b_state = 0;
 	dummy.b_blocknr = -1000;
 	buffer_trace_init(&dummy.b_history);
-	*errp = ext3_get_block_handle(handle, inode, block, &dummy, create);
+	*errp = ext3_get_block_wrap(handle, inode, block, &dummy, create);
 	if (!*errp && buffer_mapped(&dummy)) {
 		struct buffer_head *bh;
 		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
@@ -1416,7 +1425,7 @@
  * This required during truncate. We need to physically zero the tail end
  * of that block so it doesn't yield old data if the file is later grown.
  */
-static int ext3_block_truncate_page(handle_t *handle,
+int ext3_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from)
 {
 	unsigned long index = from >> PAGE_CACHE_SHIFT;
@@ -1904,6 +1913,9 @@
 
 	ext3_discard_prealloc(inode);
 
+	if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
+		return ext3_ext_truncate(inode);
+
 	handle = start_transaction(inode);
 	if (IS_ERR(handle))
 		return;		/* AKPM: return what? */
@@ -2240,6 +2252,7 @@
 	for (block = 0; block < EXT3_N_BLOCKS; block++)
 		inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block];
 	INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
+	memset(&EXT3_I(inode)->i_cached_extent, 0, sizeof(__u32) * 4);
 
 	if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE)
 		inode->u.ext3_i.i_extra_isize =
@@ -2546,6 +2559,9 @@
 	int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
 	int ret;
 
+	if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
+		return ext3_ext_writepage_trans_blocks(inode, bpp);
+
 	if (ext3_should_journal_data(inode))
 		ret = 3 * (bpp + indirects) + 2;
 	else
@@ -2982,7 +2998,7 @@
 
 	/* alloc blocks one by one */
 	for (i = 0; i < nblocks; i++) {
-		ret = ext3_get_block_handle(handle, inode, blocks[i],
+		ret = ext3_get_block_wrap(handle, inode, blocks[i],
 					    &bh_tmp, 1);
 		if (ret)
 			break;
@@ -3058,7 +3074,7 @@
 		if (blocks[i] != 0)
 			continue;
 
-		rc = ext3_get_block_handle(handle, inode, iblock, &bh, 1);
+		rc = ext3_get_block_wrap(handle, inode, iblock, &bh, 1);
 		if (rc) {
 			printk(KERN_INFO "ext3_map_inode_page: error %d "
 					"allocating block %ld\n", rc, iblock);
Index: linux-2.4.29/fs/ext3/Makefile
===================================================================
--- linux-2.4.29.orig/fs/ext3/Makefile	2005-05-03 16:51:32.127633304 +0300
+++ linux-2.4.29/fs/ext3/Makefile	2005-05-03 16:53:38.634401352 +0300
@@ -13,7 +13,9 @@
 
 obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
 		ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o \
-		xattr_trusted.o
+		xattr_trusted.o extents.o
+export-objs += extents.o
+
 obj-m    := $(O_TARGET)
 
 export-objs += xattr.o
Index: linux-2.4.29/fs/ext3/super.c
===================================================================
--- linux-2.4.29.orig/fs/ext3/super.c	2005-05-03 16:50:14.750396432 +0300
+++ linux-2.4.29/fs/ext3/super.c	2005-05-03 16:52:08.813056272 +0300
@@ -531,6 +531,7 @@
 #ifdef EXT3_DELETE_THREAD
 	J_ASSERT(sbi->s_delete_inodes == 0);
 #endif
+	ext3_ext_release(sb);
 	ext3_xattr_put_super(sb);
 	journal_destroy(sbi->s_journal);
 	if (!(sb->s_flags & MS_RDONLY)) {
@@ -702,6 +703,12 @@
 				return 0;
 			}
 		}
+		else if (!strcmp (this_char, "extents"))
+			set_opt (*mount_options, EXTENTS);
+		else if (!strcmp (this_char, "noextents"))
+			clear_opt (*mount_options, EXTENTS);
+		else if (!strcmp (this_char, "extdebug"))
+			set_opt (*mount_options, EXTDEBUG);
 		else if (!strcmp (this_char, "grpid") ||
 			 !strcmp (this_char, "bsdgroups"))
 			set_opt (*mount_options, GRPID);
@@ -1405,6 +1410,8 @@
 			test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
 			"writeback");
 
+	ext3_ext_init(sb);
+
 	return sb;
 
 failed_mount3:
Index: linux-2.4.29/fs/ext3/ioctl.c
===================================================================
--- linux-2.4.29.orig/fs/ext3/ioctl.c	2005-05-03 16:49:36.825161944 +0300
+++ linux-2.4.29/fs/ext3/ioctl.c	2005-05-03 16:52:08.814056120 +0300
@@ -174,6 +174,10 @@
 		return ret;
 	}
 #endif
+	case EXT3_IOC_GET_EXTENTS:
+	case EXT3_IOC_GET_TREE_STATS:
+	case EXT3_IOC_GET_TREE_DEPTH:
+		return ext3_ext_ioctl(inode, filp, cmd, arg);
 	default:
 		return -ENOTTY;
 	}
Index: linux-2.4.29/include/linux/ext3_fs.h
===================================================================
--- linux-2.4.29.orig/include/linux/ext3_fs.h	2005-05-03 16:50:30.228043472 +0300
+++ linux-2.4.29/include/linux/ext3_fs.h	2005-05-03 16:52:08.817055664 +0300
@@ -184,8 +184,9 @@
 #define EXT3_IMAGIC_FL			0x00002000 /* AFS directory */
 #define EXT3_JOURNAL_DATA_FL		0x00004000 /* file data should be journaled */
 #define EXT3_RESERVED_FL		0x80000000 /* reserved for ext3 lib */
+#define EXT3_EXTENTS_FL			0x00080000 /* Inode uses extents */
 
-#define EXT3_FL_USER_VISIBLE		0x00005FFF /* User visible flags */
+#define EXT3_FL_USER_VISIBLE		0x00085FFF /* User visible flags */
 #define EXT3_FL_USER_MODIFIABLE		0x000000FF /* User modifiable flags */
 
 /*
@@ -208,6 +209,9 @@
 #ifdef CONFIG_JBD_DEBUG
 #define EXT3_IOC_WAIT_FOR_READONLY	_IOR('f', 99, long)
 #endif
+#define EXT3_IOC_GET_EXTENTS		_IOR('f', 7, long)
+#define EXT3_IOC_GET_TREE_DEPTH		_IOR('f', 8, long)
+#define EXT3_IOC_GET_TREE_STATS		_IOR('f', 9, long)
 
 /*
  * Structure of an inode on the disk
@@ -327,6 +331,8 @@
 #define EXT3_MOUNT_ASYNCDEL		0x20000 /* Delayed deletion */
 #define EXT3_MOUNT_IOPEN		0x80000	/* Allow access via iopen */
 #define EXT3_MOUNT_IOPEN_NOPRIV		0x100000/* Make iopen world-readable */
+#define EXT3_MOUNT_EXTENTS		0x200000/* Extents support */
+#define EXT3_MOUNT_EXTDEBUG		0x400000/* Extents debug */
 
 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -506,11 +512,13 @@
 #define EXT3_FEATURE_INCOMPAT_RECOVER		0x0004 /* Needs recovery */
 #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV	0x0008 /* Journal device */
 #define EXT3_FEATURE_INCOMPAT_META_BG		0x0010
+#define EXT3_FEATURE_INCOMPAT_EXTENTS		0x0040 /* extents support */
 
 #define EXT3_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT3_FEATURE_INCOMPAT_SUPP	(EXT3_FEATURE_INCOMPAT_FILETYPE| \
 					 EXT3_FEATURE_INCOMPAT_RECOVER| \
-					 EXT3_FEATURE_INCOMPAT_META_BG)
+					 EXT3_FEATURE_INCOMPAT_META_BG| \
+					 EXT3_FEATURE_INCOMPAT_EXTENTS)
 #define EXT3_FEATURE_RO_COMPAT_SUPP	(EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
@@ -702,6 +711,8 @@
 extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
 
 /* inode.c */
+extern int ext3_block_truncate_page(handle_t *, struct address_space *, loff_t);
+extern int ext3_writepage_trans_blocks(struct inode *inode);
 extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
 extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
 extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
@@ -783,6 +793,16 @@
 extern struct inode_operations ext3_symlink_inode_operations;
 extern struct inode_operations ext3_fast_symlink_inode_operations;
 
+/* extents.c */
+extern int ext3_ext_writepage_trans_blocks(struct inode *, int);
+extern int ext3_ext_get_block(handle_t *, struct inode *, long,
+			      struct buffer_head *, int);
+extern void ext3_ext_truncate(struct inode *);
+extern void ext3_ext_init(struct super_block *);
+extern void ext3_ext_release(struct super_block *);
+extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *);
+extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
+			  unsigned int cmd, unsigned long arg);
 
 #endif	/* __KERNEL__ */
 
Index: linux-2.4.29/include/linux/ext3_extents.h
===================================================================
--- linux-2.4.29.orig/include/linux/ext3_extents.h	2005-05-03 16:52:08.724069800 +0300
+++ linux-2.4.29/include/linux/ext3_extents.h	2005-05-03 16:52:08.819055360 +0300
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public Licens
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
+ */
+
+#ifndef _LINUX_EXT3_EXTENTS
+#define _LINUX_EXT3_EXTENTS
+
+/*
+ * with AGRESSIVE_TEST defined, the capacity of index/leaf blocks
+ * becomes very small, so index splits, in-depth growing and
+ * other hard changes happen much more often;
+ * this is for debug purposes only
+ */
+#define AGRESSIVE_TEST_
+
+/*
+ * if CHECK_BINSEARCH is defined, then the results of the binary search
+ * will be checked by a linear search
+ */
+#define CHECK_BINSEARCH_
+
+/*
+ * if EXT_DEBUG is defined you can use the 'extdebug' mount option
+ * to get lots of info about what's going on
+ */
+#define EXT_DEBUG_
+#ifdef EXT_DEBUG
+#define ext_debug(tree,fmt,a...)			\
+do {							\
+	if (test_opt((tree)->inode->i_sb, EXTDEBUG))	\
+		printk(fmt, ##a);			\
+} while (0);
+#else
+#define ext_debug(tree,fmt,a...)
+#endif
+
+/*
+ * if EXT_STATS is defined then stats numbers are collected;
+ * these numbers will be displayed at umount time
+ */
+#define EXT_STATS_
+
+
+#define EXT3_ALLOC_NEEDED	3	/* block bitmap + group desc. + sb */
+
+/*
+ * ext3_inode has the i_block array (60 bytes total);
+ * the first 4 bytes are used to store:
+ * - tree depth (0 means there is no tree yet; all extents are in the inode)
+ * - number of alive extents in the inode
+ */
+
+/*
+ * this is the extent on-disk structure;
+ * it's used at the bottom of the tree
+ */
+struct ext3_extent {
+	__u32	ee_block;	/* first logical block extent covers */
+	__u16	ee_len;		/* number of blocks covered by extent */
+	__u16	ee_start_hi;	/* high 16 bits of physical block */
+	__u32	ee_start;	/* low 32 bits of physical block */
+};
+
+/*
+ * this is the index on-disk structure;
+ * it's used at all levels but the bottom
+ */
+struct ext3_extent_idx {
+	__u32	ei_block;	/* index covers logical blocks from 'block' */
+	__u32	ei_leaf;	/* pointer to the physical block of the next *
+				 * level; a leaf or the next index could be here */
+	__u16	ei_leaf_hi;	/* high 16 bits of physical block */
+	__u16	ei_unused;
+};
+
+/*
+ * each block (leaves and indexes), even the inode-stored one, has a header
+ */
+struct ext3_extent_header {
+	__u16	eh_magic;	/* probably will support different formats */
+	__u16	eh_entries;	/* number of valid entries */
+	__u16	eh_max;		/* capacity of the store, in entries */
+	__u16	eh_depth;	/* does the tree have real underlying blocks? */
+	__u32	eh_generation;	/* flags(8 bits) | generation of the tree */
+};
+
+#define EXT3_EXT_MAGIC		0xf30a
+
+/*
+ * an array of ext3_ext_path contains the path to some extent;
+ * creation/lookup routines use it for traversal/splitting/etc;
+ * truncate uses it to simulate recursive walking
+ */
+struct ext3_ext_path {
+	__u32				p_block;
+	__u16				p_depth;
+	struct ext3_extent		*p_ext;
+	struct ext3_extent_idx		*p_idx;
+	struct ext3_extent_header	*p_hdr;
+	struct buffer_head		*p_bh;
+};
+
+/*
+ * structures for the external API
+ */
+
+/*
+ * storage for a cached extent
+ */
+struct ext3_ext_cache {
+	__u32	ec_start;
+	__u32	ec_block;
+	__u32	ec_len;
+	__u32	ec_type;
+};
+
+#define EXT3_EXT_CACHE_NO	0
+#define EXT3_EXT_CACHE_GAP	1
+#define EXT3_EXT_CACHE_EXTENT	2
+
+/*
+ * ext3_extents_tree is used to pass initial information
+ * to the top-level extents API
+ */
+struct ext3_extents_helpers;
+struct ext3_extents_tree {
+	struct inode *inode;	/* inode the tree belongs to */
+	void *root;		/* ptr to the data the top of the tree resides at */
+	void *buffer;		/* will be passed as an arg to the ^^ routines */
+	int buffer_len;
+	void *private;
+	struct ext3_ext_cache *cex;/* last found extent */
+	struct ext3_extents_helpers *ops;
+};
+
+struct ext3_extents_helpers {
+	int (*get_write_access)(handle_t *h, void *buffer);
+	int (*mark_buffer_dirty)(handle_t *h, void *buffer);
+	int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2);
+	int (*remove_extent_credits)(struct ext3_extents_tree *,
+				     struct ext3_extent *, unsigned long,
+				     unsigned long);
+	int (*remove_extent)(struct ext3_extents_tree *,
+			     struct ext3_extent *, unsigned long,
+			     unsigned long);
+	int (*new_block)(handle_t *, struct ext3_extents_tree *,
+			 struct ext3_ext_path *, struct ext3_extent *,
+			 int *);
+};
+
+/*
+ * to be called by ext3_ext_walk_space():
+ * negative retcode - error
+ * positive retcode - signal for ext3_ext_walk_space(), see below
+ * the callback must return a valid extent (passed or newly created)
+ */
+typedef int (*ext_prepare_callback)(struct ext3_extents_tree *,
+				    struct ext3_ext_path *,
+				    struct ext3_ext_cache *);
+
+#define EXT_CONTINUE	0
+#define EXT_BREAK	1
+#define EXT_REPEAT	2
+
+
+#define EXT_MAX_BLOCK	0xffffffff
+
+
+#define EXT_FIRST_EXTENT(__hdr__) \
+	((struct ext3_extent *) (((char *) (__hdr__)) +		\
+				 sizeof(struct ext3_extent_header)))
+#define EXT_FIRST_INDEX(__hdr__) \
+	((struct ext3_extent_idx *) (((char *) (__hdr__)) +	\
+				     sizeof(struct ext3_extent_header)))
+#define EXT_HAS_FREE_INDEX(__path__) \
+	((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max)
+#define EXT_LAST_EXTENT(__hdr__) \
+	(EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1)
+#define EXT_LAST_INDEX(__hdr__) \
+	(EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1)
+#define EXT_MAX_EXTENT(__hdr__) \
+	(EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+	(EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
+#define EXT_HDR_GEN(__hdr__)	((__hdr__)->eh_generation & 0x00ffffff)
+#define EXT_FLAGS(__hdr__)	((__hdr__)->eh_generation >> 24)
+#define EXT_FLAGS_CLR_UNKNOWN	0x7	/* Flags cleared on modification */
+
+#define EXT_BLOCK_HDR(__bh__)	((struct ext3_extent_header *)(__bh__)->b_data)
+#define EXT_ROOT_HDR(__tree__)	((struct ext3_extent_header *)(__tree__)->root)
+#define EXT_DEPTH(__tree__)	(EXT_ROOT_HDR(__tree__)->eh_depth)
+#define EXT_GENERATION(__tree__) EXT_HDR_GEN(EXT_ROOT_HDR(__tree__))
+
+#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
+
+#define EXT_CHECK_PATH(tree,path)					\
+{									\
+	int depth = EXT_DEPTH(tree);					\
+	BUG_ON((unsigned long) (path) < __PAGE_OFFSET);			\
+	BUG_ON((unsigned long) (path)[depth].p_idx <			\
+			__PAGE_OFFSET && (path)[depth].p_idx != NULL);	\
+	BUG_ON((unsigned long) (path)[depth].p_ext <			\
+			__PAGE_OFFSET && (path)[depth].p_ext != NULL);	\
+	BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET);	\
+	BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET	\
+			&& depth != 0);					\
+	BUG_ON((path)[0].p_depth != depth);				\
+}
+
+
+/*
+ * this structure is used to gather extents from the tree via ioctl
+ */
+struct ext3_extent_buf {
+	unsigned long start;
+	int buflen;
+	void *buffer;
+	void *cur;
+	int err;
+};
+
+/*
+ * this structure is used to collect stats info about the tree
+ */
+struct ext3_extent_tree_stats {
+	int depth;
+	int extents_num;
+	int leaf_num;
+};
+
+extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *);
+extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *);
+extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *);
+extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *);
+extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback);
+extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long);
+extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *);
+
+static inline void
+ext3_ext_invalidate_cache(struct ext3_extents_tree *tree)
+{
+	if (tree->cex)
+		tree->cex->ec_type = EXT3_EXT_CACHE_NO;
+}
+
+
+#endif /* _LINUX_EXT3_EXTENTS */
Index: linux-2.4.29/include/linux/ext3_fs_i.h
===================================================================
--- linux-2.4.29.orig/include/linux/ext3_fs_i.h	2005-05-03 16:50:30.229043320 +0300
+++ linux-2.4.29/include/linux/ext3_fs_i.h	2005-05-03 16:52:08.823054752 +0300
@@ -76,6 +76,8 @@
 	 * by other means, so we have truncate_sem.
 	 */
 	struct rw_semaphore truncate_sem;
+
+	__u32 i_cached_extent[4];
 };
 
 #endif	/* _LINUX_EXT3_FS_I */
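
Usage note (illustration only, not part of the patch): the sketch below shows how the EXT3_IOC_GET_EXTENTS ioctl added above could be driven from userspace. The ioctl number and the ext3_extent_buf/ext3_ext_cache layouts are taken from the ext3_fs.h and ext3_extents.h hunks in this patch; redeclaring them locally, the 256-record buffer, and the command-line file argument are assumptions made only so the example is self-contained. Per ext3_ext_ioctl() and ext3_ext_store_extent_cb() above, the kernel is expected to fill the user buffer with ext3_ext_cache records and return how many were stored (or a negative errno); the structure layout must match the kernel's, so this is only meaningful where userspace and the 2.4 kernel agree on word size.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

#define EXT3_IOC_GET_EXTENTS	_IOR('f', 7, long)	/* from ext3_fs.h above */

struct ext3_ext_cache {			/* one record per extent returned */
	__u32 ec_start;			/* first physical block */
	__u32 ec_block;			/* first logical block */
	__u32 ec_len;			/* number of blocks */
	__u32 ec_type;			/* EXT3_EXT_CACHE_EXTENT for real extents */
};

struct ext3_extent_buf {		/* argument passed to the ioctl */
	unsigned long start;		/* logical block to start walking from */
	int buflen;			/* size of 'buffer' in bytes */
	void *buffer;			/* where the kernel stores the records */
	void *cur;			/* used by the kernel while walking */
	int err;			/* record count / error, filled by the kernel */
};

int main(int argc, char **argv)
{
	struct ext3_ext_cache cache[256];
	struct ext3_extent_buf buf;
	int fd, n, i;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file on ext3 mounted with -o extents>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	buf.start = 0;			/* walk from the first logical block */
	buf.buflen = sizeof(cache);
	buf.buffer = cache;
	buf.cur = NULL;
	buf.err = 0;

	/* returns the number of extents copied, inferred from ext3_ext_ioctl();
	 * fails with EINVAL if the inode does not carry EXT3_EXTENTS_FL */
	n = ioctl(fd, EXT3_IOC_GET_EXTENTS, &buf);
	if (n < 0) {
		perror("EXT3_IOC_GET_EXTENTS");
		close(fd);
		return 1;
	}

	for (i = 0; i < n; i++)
		printf("logical %u..%u -> physical %u (+%u blocks)\n",
		       cache[i].ec_block, cache[i].ec_block + cache[i].ec_len - 1,
		       cache[i].ec_start, cache[i].ec_len);

	close(fd);
	return 0;
}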