From d08374da3b2122d33b724194fb5166698af51359 Mon Sep 17 00:00:00 2001 From: alex Date: Sat, 24 Jan 2004 12:23:54 +0000 Subject: [PATCH] - reworked extents patch against 2.4.20 - support for extents in EA - access to extents stored in EA via ioctl --- .../patches/ext3-extents-2.4.20.patch | 2526 +++++++++++++------- .../patches/ext3-extents-in-ea-2.4.20.patch | 343 +++ .../patches/ext3-extents-in-ea-ioctl-2.4.20.patch | 154 ++ 3 files changed, 2178 insertions(+), 845 deletions(-) create mode 100644 lustre/kernel_patches/patches/ext3-extents-in-ea-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-in-ea-ioctl-2.4.20.patch diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.20.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.20.patch index 7c63464..1da5f7c 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.20.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.20.patch @@ -1,33 +1,36 @@ - fs/ext3/Makefile | 3 - fs/ext3/extents.c | 1570 +++++++++++++++++++++++++++++++++++++++++++++ - fs/ext3/ialloc.c | 4 - fs/ext3/inode.c | 28 - fs/ext3/super.c | 6 - include/linux/ext3_fs.h | 16 - include/linux/ext3_fs_i.h | 4 - include/linux/ext3_fs_sb.h | 10 - 8 files changed, 1634 insertions(+), 7 deletions(-) - ---- /dev/null 2003-01-30 13:24:37.000000000 +0300 -+++ linux-2.4.20-vanilla-alexey/fs/ext3/extents.c 2003-09-15 19:57:29.000000000 +0400 -@@ -0,0 +1,1570 @@ +Index: linux-2.4.20/fs/ext3/extents.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/extents.c 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.20/fs/ext3/extents.c 2004-01-24 14:19:29.000000000 +0300 +@@ -0,0 +1,2224 @@ +/* ++ * Copyright (C) 2003 Alex Tomas + * -+ * linux/fs/ext3/extents.c ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. + * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++/* + * Extents support for EXT3 + * -+ * 07/08/2003 Alex Tomas -+ * + * TODO: ++ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() ++ * - ext3_ext_calc_credits() could take 'mergable' into account + * - ext3*_error() should be used in some situations + * - find_goal() [to be tested and improved] -+ * - error handling -+ * - we could leak allocated block in some error cases -+ * - quick search for index/leaf in ext3_ext_find_extent() -+ * - tree reduction -+ * - cache last found extent -+ * - arch-independent ++ * - smart tree reduction ++ * - arch-independence ++ * common on-disk format for big/little-endian arch + */ + +#include @@ -42,107 +45,47 @@ +#include +#include +#include ++#include ++#include + -+/* -+ * with AGRESSIVE_TEST defined capacity of index/leaf blocks -+ * become very little, so index split, in-depth growing and -+ * other hard changes happens much more often -+ * this is for debug purposes only -+ */ -+#define AGRESSIVE_TEST_ -+ -+/* -+ * if EXT_DEBUG defined you can use 'extdebug' mount option -+ * to get lots of info what's going on -+ */ -+#define EXT_DEBUG -+#ifdef EXT_DEBUG -+#define ext_debug(inode,fmt,a...) \ -+do { \ -+ if (test_opt((inode)->i_sb, EXTDEBUG)) \ -+ printk(fmt, ##a); \ -+} while (0); -+#else -+#define ext_debug(inode,fmt,a...) -+#endif -+ -+#define EXT3_ALLOC_NEEDED 2 /* block bitmap + group descriptor */ -+ -+/* -+ * ext3_inode has i_block array (total 60 bytes) -+ * first 4 bytes are used to store: -+ * - tree depth (0 mean there is no tree yet. all extents in the inode) -+ * - number of alive extents in the inode -+ */ -+ -+/* -+ * this is extent on-disk structure -+ * it's used at the bottom of the tree -+ */ -+struct ext3_extent { -+ __u32 e_block; /* first logical block extent covers */ -+ __u32 e_start; /* first physical block extents lives */ -+ __u32 e_num; /* number of blocks covered by extent */ -+}; -+ -+/* -+ * this is index on-disk structure -+ * it's used at all the levels, but the bottom -+ */ -+struct ext3_extent_idx { -+ __u32 e_block; /* index covers logical blocks from 'block' */ -+ __u32 e_leaf; /* pointer to the physical block of the next * -+ * level. leaf or next index could bet here */ -+}; -+ -+/* -+ * each block (leaves and indexes), even inode-stored has header -+ */ -+struct ext3_extent_header { -+ __u16 e_num; /* number of valid entries */ -+ __u16 e_max; /* capacity of store in entries */ -+}; -+ -+/* -+ * array of ext3_ext_path contains path to some extent -+ * creation/lookup routines use it for traversal/splitting/etc -+ * truncate uses it to simulate recursive walking -+ */ -+struct ext3_ext_path { -+ __u32 p_block; -+ __u16 p_depth; -+ struct ext3_extent *p_ext; -+ struct ext3_extent_idx *p_idx; -+ struct ext3_extent_header *p_hdr; -+ struct buffer_head *p_bh; -+}; ++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; + -+#define EXT_FIRST_EXTENT(__hdr__) \ -+ ((struct ext3_extent *) (((char *) (__hdr__)) + \ -+ sizeof(struct ext3_extent_header))) -+#define EXT_FIRST_INDEX(__hdr__) \ -+ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ -+ sizeof(struct ext3_extent_header))) -+#define EXT_HAS_FREE_INDEX(__path__) \ -+ ((__path__)->p_hdr->e_num < (__path__)->p_hdr->e_max) -+#define EXT_LAST_EXTENT(__hdr__) \ -+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_num - 1) -+#define EXT_LAST_INDEX(__hdr__) \ -+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_num - 1) -+#define EXT_MAX_EXTENT(__hdr__) \ -+ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_max - 1) -+#define EXT_MAX_INDEX(__hdr__) \ -+ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_max - 1) ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} + ++static int inline ++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->get_write_access) ++ return tree->get_write_access(h,tree->buffer); ++ else ++ return 0; ++} + -+#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++static int inline ++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->mark_buffer_dirty) ++ return tree->mark_buffer_dirty(h,tree->buffer); ++ else ++ return 0; ++} + +/* + * could return: + * - EROFS + * - ENOMEM + */ -+static int ext3_ext_get_access(handle_t *handle, struct inode *inode, ++static int ext3_ext_get_access(handle_t *handle, ++ struct ext3_extents_tree *tree, + struct ext3_ext_path *path) +{ + if (path->p_bh) { @@ -151,7 +94,7 @@ + } + + /* path points to leaf/index in inode body */ -+ return 0; ++ return ext3_ext_get_access_for_root(handle, tree); +} + +/* @@ -160,7 +103,7 @@ + * - ENOMEM + * - EIO + */ -+static int ext3_ext_dirty(handle_t *handle, struct inode *inode, ++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, + struct ext3_ext_path *path) +{ + if (path->p_bh) { @@ -169,82 +112,140 @@ + } + + /* path points to leaf/index in inode body */ -+ return ext3_mark_inode_dirty(handle, inode); ++ return ext3_ext_mark_root_dirty(handle, tree); ++} ++ ++static int inline ++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, struct ext3_extent *ex, ++ int *err) ++{ ++ int goal, depth, newblock; ++ struct inode *inode; ++ ++ EXT_ASSERT(tree); ++ if (tree->new_block) ++ return tree->new_block(handle, tree, path, ex, err); ++ ++ inode = tree->inode; ++ depth = EXT_DEPTH(tree); ++ if (path && depth > 0) { ++ goal = path[depth-1].p_block; ++ } else { ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ ++ bg_start = (ei->i_block_group * ++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ goal = bg_start + colour; ++ } ++ ++ newblock = ext3_new_block(handle, inode, goal, 0, 0, err); ++ return newblock; +} + -+static inline int ext3_ext_space_block(struct inode *inode) ++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) +{ + int size; + -+ size = (inode->i_sb->s_blocksize - sizeof(struct ext3_extent_header)) -+ / sizeof(struct ext3_extent); ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); +#ifdef AGRESSIVE_TEST -+ size = 6; /* FIXME: for debug, remove this line */ ++ size = 6; +#endif + return size; +} + -+static inline int ext3_ext_space_inode(struct inode *inode) ++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) +{ + int size; + -+ size = (sizeof(EXT3_I(inode)->i_data) - ++ size = (tree->inode->i_sb->s_blocksize - + sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 5; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) + / sizeof(struct ext3_extent); +#ifdef AGRESSIVE_TEST -+ size = 3; /* FIXME: for debug, remove this line */ ++ size = 3; +#endif + return size; +} + -+static inline int ext3_ext_space_inode_idx(struct inode *inode) ++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) +{ + int size; + -+ size = (sizeof(EXT3_I(inode)->i_data) - ++ size = (tree->buffer_len - + sizeof(struct ext3_extent_header)) + / sizeof(struct ext3_extent_idx); +#ifdef AGRESSIVE_TEST -+ size = 4; /* FIXME: for debug, remove this line */ ++ size = 4; +#endif + return size; +} + -+static void ext3_ext_show_path(struct inode *inode, struct ext3_ext_path *path) ++static void ext3_ext_show_path(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) +{ ++#ifdef EXT_DEBUG + int k, l = path->p_depth; + -+ ext_debug(inode, "path:"); ++ ext_debug(tree, "path:"); + for (k = 0; k <= l; k++, path++) { + if (path->p_idx) { -+ ext_debug(inode, " %d->%d", path->p_idx->e_block, ++ ext_debug(tree, " %d->%d", path->p_idx->e_block, + path->p_idx->e_leaf); + } else if (path->p_ext) { -+ ext_debug(inode, " %d:%d:%d", ++ ext_debug(tree, " %d:%d:%d", + path->p_ext->e_block, -+ path->p_ext->e_start, -+ path->p_ext->e_num); ++ path->p_ext->e_num, ++ path->p_ext->e_start); + } else -+ ext_debug(inode, " []"); ++ ext_debug(tree, " []"); + } -+ ext_debug(inode, "\n"); ++ ext_debug(tree, "\n"); ++#endif +} + -+static void ext3_ext_show_leaf(struct inode *inode, struct ext3_ext_path *path) ++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) +{ -+ int depth = EXT3_I(inode)->i_depth; -+ struct ext3_extent_header *eh = path[depth].p_hdr; -+ struct ext3_extent *ex = EXT_FIRST_EXTENT(eh); ++#ifdef EXT_DEBUG ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; + int i; + ++ if (!path) ++ return; ++ ++ eh = path[depth].p_hdr; ++ ex = EXT_FIRST_EXTENT(eh); ++ + for (i = 0; i < eh->e_num; i++, ex++) { -+ ext_debug(inode, "%d:%d:%d ", -+ ex->e_block, ex->e_start, ex->e_num); ++ ext_debug(tree, "%d:%d:%d ", ++ ex->e_block, ex->e_num, ex->e_start); + } -+ ext_debug(inode, "\n"); ++ ext_debug(tree, "\n"); ++#endif +} + -+static void ext3_ext_drop_refs(struct inode *inode, struct ext3_ext_path *path) ++static void ext3_ext_drop_refs(struct ext3_ext_path *path) +{ + int depth = path->p_depth; + int i; @@ -256,50 +257,157 @@ + } +} + -+static int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path) ++/* ++ * binary search for closest index by given block ++ */ ++static inline void ++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) +{ -+ struct ext3_inode_info *ei = EXT3_I(inode); -+ unsigned long bg_start; -+ unsigned long colour; -+ int depth; -+ -+ if (path) { -+ depth = path->p_depth; -+ /* try to find previous block */ -+ if (path[depth].p_ext) -+ return path[depth].p_ext->e_start + -+ path[depth].p_ext->e_num - 1; -+ -+ /* it looks index is empty -+ * try to find starting from index itself */ -+ if (path[depth].p_bh) -+ return path[depth].p_bh->b_blocknr; ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent_idx *ix; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->e_num <= eh->e_max); ++ EXT_ASSERT(eh->e_num > 0); ++ ++ ext_debug(tree, "binsearch for %d(idx): ", block); ++ ++ path->p_idx = ix = EXT_FIRST_INDEX(eh); ++ ++ r = k = eh->e_num; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ix[l + k].e_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); + } + -+ /* OK. use inode's group */ -+ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + -+ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); -+ colour = (current->pid % 16) * -+ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); -+ return bg_start + colour; ++ ix += l; ++ path->p_idx = ix; ++ ext_debug(tree, " -> %d->%d ", path->p_idx->e_block, path->p_idx->e_leaf); ++ ++ while (l++ < r) { ++ if (block < ix->e_block) ++ break; ++ path->p_idx = ix++; ++ } ++ ext_debug(tree, " -> %d->%d\n", path->p_idx->e_block, ++ path->p_idx->e_leaf); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent_idx *chix; ++ ++ chix = ix = EXT_FIRST_INDEX(eh); ++ for (k = 0; k < eh->e_num; k++, ix++) { ++ EXT_ASSERT(k == 0 || ix->e_block > ix[-1].e_block); ++ if (block < ix->e_block) ++ break; ++ chix = ix; ++ } ++ EXT_ASSERT(chix == path->p_idx); ++ } ++#endif ++ +} + -+static struct ext3_ext_path * -+ext3_ext_find_extent(struct inode *inode, int block, struct ext3_ext_path *path) ++/* ++ * binary search for closest extent by given block ++ */ ++static inline void ++ext3_ext_binsearch(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) +{ -+ struct ext3_inode_info *ei = EXT3_I(inode); -+ struct ext3_extent_header *eh = (void *) ei->i_data; -+ struct ext3_extent_idx *ix; -+ struct buffer_head *bh; ++ struct ext3_extent_header *eh = path->p_hdr; + struct ext3_extent *ex; -+ int depth, i, k, ppos = 0; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->e_num <= eh->e_max); ++ ++ if (eh->e_num == 0) { ++ /* ++ * this leaf is empty yet: ++ * we get such a leaf in split/add case ++ */ ++ return; ++ } + -+ eh = (struct ext3_extent_header *) ei->i_data; ++ ext_debug(tree, "binsearch for %d: ", block); ++ ++ path->p_ext = ex = EXT_FIRST_EXTENT(eh); ++ ++ r = k = eh->e_num; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ex[l + k].e_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ex += l; ++ path->p_ext = ex; ++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->e_block, ++ path->p_ext->e_start, path->p_ext->e_num); ++ ++ while (l++ < r) { ++ if (block < ex->e_block) ++ break; ++ path->p_ext = ex++; ++ } ++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->e_block, ++ path->p_ext->e_start, path->p_ext->e_num); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent *chex; ++ ++ chex = ex = EXT_FIRST_EXTENT(eh); ++ for (k = 0; k < eh->e_num; k++, ex++) { ++ EXT_ASSERT(k == 0 || ex->e_block > ex[-1].e_block); ++ if (block < ex->e_block) ++ break; ++ chex = ex; ++ } ++ EXT_ASSERT(chex == path->p_ext); ++ } ++#endif ++ ++} ++ ++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *eh; ++ ++ BUG_ON(tree->buffer_len == 0); ++ ext3_ext_get_access_for_root(handle, tree); ++ eh = EXT_ROOT_HDR(tree); ++ eh->e_depth = 0; ++ eh->e_num = 0; ++ eh->e_max = ext3_ext_space_root(tree); ++ ext3_ext_mark_root_dirty(handle, tree); ++ return 0; ++} ++ ++struct ext3_ext_path * ++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ struct buffer_head *bh; ++ int depth, i, ppos = 0; + -+ /* initialize capacity of leaf in inode for first time */ -+ if (eh->e_max == 0) -+ eh->e_max = ext3_ext_space_inode(inode); -+ i = depth = ei->i_depth; ++ EXT_ASSERT(tree); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ eh = EXT_ROOT_HDR(tree); ++ i = depth = EXT_DEPTH(tree); ++ EXT_ASSERT(eh->e_max); + EXT_ASSERT(i == 0 || eh->e_num > 0); + + /* account possible depth increase */ @@ -310,120 +418,80 @@ + return ERR_PTR(-ENOMEM); + } + memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[0].p_hdr = eh; + + /* walk through the tree */ + while (i) { -+ ext_debug(inode, "depth %d: num %d, max %d\n", ++ ext_debug(tree, "depth %d: num %d, max %d\n", + ppos, eh->e_num, eh->e_max); -+ ix = EXT_FIRST_INDEX(eh); -+ if (eh->e_num) -+ path[ppos].p_idx = ix; -+ EXT_ASSERT(eh->e_num <= eh->e_max); -+ for (k = 0; k < eh->e_num; k++, ix++) { -+ ext_debug(inode, "index: %d -> %d\n", -+ ix->e_block, ix->e_leaf); -+ if (block < ix->e_block) -+ break; -+ path[ppos].p_idx = ix; -+ } ++ ext3_ext_binsearch_idx(tree, path + ppos, block); + path[ppos].p_block = path[ppos].p_idx->e_leaf; + path[ppos].p_depth = i; -+ path[ppos].p_hdr = eh; + path[ppos].p_ext = NULL; + -+ bh = sb_bread(inode->i_sb, path[ppos].p_block); ++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); + if (!bh) { -+ ext3_ext_drop_refs(inode, path); ++ ext3_ext_drop_refs(path); + kfree(path); + return ERR_PTR(-EIO); + } -+ eh = (struct ext3_extent_header *) bh->b_data; ++ eh = EXT_BLOCK_HDR(bh); + ppos++; + EXT_ASSERT(ppos <= depth); + path[ppos].p_bh = bh; ++ path[ppos].p_hdr = eh; + i--; + } + + path[ppos].p_depth = i; + path[ppos].p_hdr = eh; + path[ppos].p_ext = NULL; -+ ++ + /* find extent */ -+ ex = EXT_FIRST_EXTENT(eh); -+ if (eh->e_num) -+ path[ppos].p_ext = ex; -+ EXT_ASSERT(eh->e_num <= eh->e_max); -+ for (k = 0; k < eh->e_num; k++, ex++) { -+ if (block < ex->e_block) -+ break; -+ path[ppos].p_ext = ex; -+ } ++ ext3_ext_binsearch(tree, path + ppos, block); + -+ ext3_ext_show_path(inode, path); ++ ext3_ext_show_path(tree, path); + + return path; +} + -+static void ext3_ext_check_boundary(struct inode *inode, -+ struct ext3_ext_path *curp, -+ void *addr, int len) -+{ -+ void *end; -+ -+ if (!len) -+ return; -+ if (curp->p_bh) -+ end = (void *) curp->p_hdr + inode->i_sb->s_blocksize; -+ else -+ end = (void *) curp->p_hdr + sizeof(EXT3_I(inode)->i_data); -+ if (((unsigned long) addr) + len > (unsigned long) end) { -+ printk("overflow! 0x%p > 0x%p\n", addr + len, end); -+ BUG(); -+ } -+ if ((unsigned long) addr < (unsigned long) curp->p_hdr) { -+ printk("underflow! 0x%p < 0x%p\n", addr, curp->p_hdr); -+ BUG(); -+ } -+} -+ +/* + * insert new index [logical;ptr] into the block at cupr + * it check where to insert: before curp or after curp + */ -+static int ext3_ext_insert_index(handle_t *handle, struct inode *inode, -+ struct ext3_ext_path *curp, int logical, -+ int ptr) ++static int ext3_ext_insert_index(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *curp, ++ int logical, int ptr) +{ + struct ext3_extent_idx *ix; + int len, err; + -+ if ((err = ext3_ext_get_access(handle, inode, curp))) ++ if ((err = ext3_ext_get_access(handle, tree, curp))) + return err; + + EXT_ASSERT(logical != curp->p_idx->e_block); + len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; + if (logical > curp->p_idx->e_block) { + /* insert after */ -+ len = (len - 1) * sizeof(struct ext3_extent_idx); -+ len = len < 0 ? 0 : len; -+ ext_debug(inode, "insert new index %d after: %d. " -+ "move %d from 0x%p to 0x%p\n", -+ logical, ptr, len, -+ (curp->p_idx + 1), (curp->p_idx + 2)); -+ -+ ext3_ext_check_boundary(inode, curp, curp->p_idx + 2, len); -+ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d after: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ } + ix = curp->p_idx + 1; + } else { + /* insert before */ + len = len * sizeof(struct ext3_extent_idx); + len = len < 0 ? 0 : len; -+ ext_debug(inode, "insert new index %d before: %d. " ++ ext_debug(tree, "insert new index %d before: %d. " + "move %d from 0x%p to 0x%p\n", + logical, ptr, len, + curp->p_idx, (curp->p_idx + 1)); -+ -+ ext3_ext_check_boundary(inode, curp, curp->p_idx + 1, len); + memmove(curp->p_idx + 1, curp->p_idx, len); + ix = curp->p_idx; + } @@ -432,8 +500,8 @@ + ix->e_leaf = ptr; + curp->p_hdr->e_num++; + -+ err = ext3_ext_dirty(handle, inode, curp); -+ ext3_std_error(inode->i_sb, err); ++ err = ext3_ext_dirty(handle, tree, curp); ++ ext3_std_error(tree->inode->i_sb, err); + + return err; +} @@ -447,17 +515,17 @@ + * into the newly allocated blocks + * - initialize subtree + */ -+static int ext3_ext_split(handle_t *handle, struct inode *inode, ++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, + struct ext3_ext_path *path, + struct ext3_extent *newext, int at) +{ + struct buffer_head *bh = NULL; -+ int depth = EXT3_I(inode)->i_depth; ++ int depth = EXT_DEPTH(tree); + struct ext3_extent_header *neh; + struct ext3_extent_idx *fidx; + struct ext3_extent *ex; + int i = at, k, m, a; -+ long newblock, oldblock, border; ++ unsigned long newblock, oldblock, border; + int *ablocks = NULL; /* array of allocated blocks */ + int err = 0; + @@ -466,14 +534,15 @@ + + /* if current leaf will be splitted, then we should use + * border from split point */ ++ + if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { + border = path[depth].p_ext[1].e_block; -+ ext_debug(inode, "leaf will be splitted." ++ ext_debug(tree, "leaf will be splitted." + " next leaf starts at %d\n", + (int)border); + } else { + border = newext->e_block; -+ ext_debug(inode, "leaf will be added." ++ ext_debug(tree, "leaf will be added." + " next leaf starts at %d\n", + (int)border); + } @@ -490,19 +559,15 @@ + * we need this to handle errors and free blocks + * upon them + */ -+ ablocks = kmalloc(sizeof(long) * depth, GFP_NOFS); ++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); + if (!ablocks) + return -ENOMEM; -+ memset(ablocks, 0, sizeof(long) * depth); ++ memset(ablocks, 0, sizeof(unsigned long) * depth); + + /* allocate all needed blocks */ -+ ext_debug(inode, "allocate %d blocks for indexes and leaf\n", -+ depth - at); -+ ablocks[0] = newext->e_start++; -+ newext->e_num--; -+ for (a = 1; a < depth - at; a++) { -+ newblock = ext3_new_block(handle, inode, newext->e_start, -+ 0, 0, &err); ++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); ++ for (a = 0; a < depth - at; a++) { ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); + if (newblock == 0) + goto cleanup; + ablocks[a] = newblock; @@ -511,7 +576,7 @@ + /* initialize new leaf */ + newblock = ablocks[--a]; + EXT_ASSERT(newblock); -+ bh = sb_getblk(inode->i_sb, newblock); ++ bh = sb_getblk(tree->inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; @@ -521,21 +586,20 @@ + if ((err = ext3_journal_get_create_access(handle, bh))) + goto cleanup; + -+ neh = (struct ext3_extent_header *) bh->b_data; ++ neh = EXT_BLOCK_HDR(bh); + neh->e_num = 0; -+ neh->e_max = ext3_ext_space_block(inode); ++ neh->e_max = ext3_ext_space_block(tree); + ex = EXT_FIRST_EXTENT(neh); + + /* move remain of path[depth] to the new leaf */ -+ EXT_ASSERT(path[depth].p_hdr->e_num == -+ path[depth].p_hdr->e_max); ++ EXT_ASSERT(path[depth].p_hdr->e_num == path[depth].p_hdr->e_max); + /* start copy from next extent */ + /* TODO: we could do it by single memmove */ + m = 0; + path[depth].p_ext++; + while (path[depth].p_ext <= + EXT_MAX_EXTENT(path[depth].p_hdr)) { -+ ext_debug(inode, "move %d:%d:%d in new leaf\n", ++ ext_debug(tree, "move %d:%d:%d in new leaf\n", + path[depth].p_ext->e_block, + path[depth].p_ext->e_start, + path[depth].p_ext->e_num); @@ -554,10 +618,10 @@ + + /* correct old leaf */ + if (m) { -+ if ((err = ext3_ext_get_access(handle, inode, path))) ++ if ((err = ext3_ext_get_access(handle, tree, path))) + goto cleanup; + path[depth].p_hdr->e_num -= m; -+ if ((err = ext3_ext_dirty(handle, inode, path))) ++ if ((err = ext3_ext_dirty(handle, tree, path))) + goto cleanup; + + } @@ -566,15 +630,14 @@ + k = depth - at - 1; + EXT_ASSERT(k >= 0); + if (k) -+ ext_debug(inode, -+ "create %d intermediate indices\n", k); ++ ext_debug(tree, "create %d intermediate indices\n", k); + /* insert new index into current index block */ + /* current depth stored in i var */ + i = depth - 1; + while (k--) { + oldblock = newblock; + newblock = ablocks[--a]; -+ bh = sb_getblk(inode->i_sb, newblock); ++ bh = sb_getblk(tree->inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; @@ -584,28 +647,27 @@ + if ((err = ext3_journal_get_create_access(handle, bh))) + goto cleanup; + -+ neh = (struct ext3_extent_header *) bh->b_data; ++ neh = EXT_BLOCK_HDR(bh); + neh->e_num = 1; -+ neh->e_max = ext3_ext_space_block(inode); ++ neh->e_max = ext3_ext_space_block_idx(tree); + fidx = EXT_FIRST_INDEX(neh); + fidx->e_block = border; + fidx->e_leaf = oldblock; + -+ ext_debug(inode, -+ "int.index at %d (block %u): %d -> %d\n", ++ ext_debug(tree, "int.index at %d (block %u): %d -> %d\n", + i, (unsigned) newblock, + (int) border, + (int) oldblock); + /* copy indexes */ + m = 0; + path[i].p_idx++; ++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); + EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == + EXT_LAST_INDEX(path[i].p_hdr)); -+ ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, -+ EXT_MAX_INDEX(path[i].p_hdr)); + while (path[i].p_idx <= + EXT_MAX_INDEX(path[i].p_hdr)) { -+ ext_debug(inode, "%d: move %d:%d in new index\n", ++ ext_debug(tree, "%d: move %d:%d in new index\n", + i, path[i].p_idx->e_block, + path[i].p_idx->e_leaf); + memmove(++fidx, path[i].p_idx++, @@ -624,11 +686,11 @@ + + /* correct old index */ + if (m) { -+ err = ext3_ext_get_access(handle,inode,path+i); ++ err = ext3_ext_get_access(handle, tree, path + i); + if (err) + goto cleanup; + path[i].p_hdr->e_num -= m; -+ err = ext3_ext_dirty(handle, inode, path + i); ++ err = ext3_ext_dirty(handle, tree, path + i); + if (err) + goto cleanup; + } @@ -637,8 +699,8 @@ + } + + /* insert new index */ -+ if (!err) -+ err = ext3_ext_insert_index(handle, inode, path + at, ++ if (!err) ++ err = ext3_ext_insert_index(handle, tree, path + at, + border, newblock); + +cleanup: @@ -653,7 +715,7 @@ + for (i = 0; i < depth; i++) + if (!ablocks[i]) + continue; -+ ext3_free_blocks(handle, inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); + } + kfree(ablocks); + @@ -667,7 +729,8 @@ + * - initialize new top-level, creating index that points to the + * just created block + */ -+static int ext3_ext_grow_indepth(handle_t *handle, struct inode *inode, ++static int ext3_ext_grow_indepth(handle_t *handle, ++ struct ext3_extents_tree *tree, + struct ext3_ext_path *path, + struct ext3_extent *newext) +{ @@ -676,18 +739,16 @@ + struct ext3_extent_header *neh; + struct ext3_extent_idx *fidx; + int len, err = 0; -+ long newblock; ++ unsigned long newblock; + -+ /* -+ * use already allocated by the called block for new root block -+ */ -+ newblock = newext->e_start++; -+ newext->e_num--; -+ -+ bh = sb_getblk(inode->i_sb, newblock); ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ return err; ++ ++ bh = sb_getblk(tree->inode->i_sb, newblock); + if (!bh) { + err = -EIO; -+ ext3_std_error(inode->i_sb, err); ++ ext3_std_error(tree->inode->i_sb, err); + return err; + } + lock_buffer(bh); @@ -704,8 +765,8 @@ + memmove(bh->b_data, curp->p_hdr, len); + + /* set size of new block */ -+ neh = (struct ext3_extent_header *) bh->b_data; -+ neh->e_max = ext3_ext_space_block(inode); ++ neh = EXT_BLOCK_HDR(bh); ++ neh->e_max = ext3_ext_space_block(tree); + mark_buffer_uptodate(bh, 1); + unlock_buffer(bh); + @@ -713,22 +774,22 @@ + goto out; + + /* create index in new top-level index: num,max,pointer */ -+ if ((err = ext3_ext_get_access(handle, inode, curp))) ++ if ((err = ext3_ext_get_access(handle, tree, curp))) + goto out; + -+ curp->p_hdr->e_max = ext3_ext_space_inode_idx(inode); ++ curp->p_hdr->e_max = ext3_ext_space_root_idx(tree); + curp->p_hdr->e_num = 1; + curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); + curp->p_idx->e_block = EXT_FIRST_EXTENT(path[0].p_hdr)->e_block; + curp->p_idx->e_leaf = newblock; + -+ neh = (struct ext3_extent_header *) EXT3_I(inode)->i_data; ++ neh = EXT_ROOT_HDR(tree); + fidx = EXT_FIRST_INDEX(neh); -+ ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %d\n", ++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", + neh->e_num, neh->e_max, fidx->e_block, fidx->e_leaf); + -+ EXT3_I(inode)->i_depth++; -+ err = ext3_ext_dirty(handle, inode, curp); ++ neh->e_depth = path->p_depth + 1; ++ err = ext3_ext_dirty(handle, tree, curp); +out: + brelse(bh); + @@ -739,15 +800,17 @@ + * routine finds empty index and adds new leaf. if no free index found + * then it requests in-depth growing + */ -+static int ext3_ext_create_new_leaf(handle_t *handle, struct inode *inode, ++static int ext3_ext_create_new_leaf(handle_t *handle, ++ struct ext3_extents_tree *tree, + struct ext3_ext_path *path, + struct ext3_extent *newext) +{ -+ int depth = EXT3_I(inode)->i_depth; + struct ext3_ext_path *curp; -+ int i = depth, err = 0; -+ long newblock = newext->e_start; ++ int depth, i, err = 0; + ++repeat: ++ i = depth = EXT_DEPTH(tree); ++ + /* walk up to the tree and look for free index entry */ + curp = path + depth; + while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { @@ -760,42 +823,48 @@ + if (EXT_HAS_FREE_INDEX(curp)) { + /* if we found index with free entry, then use that + * entry: create all needed subtree and add new leaf */ -+ err = ext3_ext_split(handle, inode, path, newext, i); -+ } else { -+ /* tree is full, time to grow in depth */ -+ err = ext3_ext_grow_indepth(handle, inode, path, newext); -+ } ++ err = ext3_ext_split(handle, tree, path, newext, i); + -+ if (!err) { + /* refill path */ -+ ext3_ext_drop_refs(inode, path); -+ path = ext3_ext_find_extent(inode, newext->e_block, path); ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->e_block, path); + if (IS_ERR(path)) + err = PTR_ERR(path); ++ } else { ++ /* tree is full, time to grow in depth */ ++ err = ext3_ext_grow_indepth(handle, tree, path, newext); + ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->e_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ + /* -+ * probably we've used some blocks from extent -+ * let's allocate new block for it ++ * only first (depth 0 -> 1) produces free space ++ * in all other cases we have to split growed tree + */ -+ if (newext->e_num == 0 && !err) { -+ newext->e_start = -+ ext3_new_block(handle, inode, newblock, -+ 0, 0, &err); -+ newext->e_num = 1; ++ depth = EXT_DEPTH(tree); ++ if (path[depth].p_hdr->e_num == path[depth].p_hdr->e_max) { ++ /* now we need split */ ++ goto repeat; + } + } + -+ return err; ++ if (err) ++ return err; ++ ++ return 0; +} + +/* -+ * returns next allocated block or 0xffffffff ++ * returns allocated block in subsequent extent or 0xffffffff + * NOTE: it consider block number from index entry as + * allocated block. thus, index entries have to be consistent + * with leafs + */ -+static inline unsigned ext3_ext_next_allocated_block(struct inode *inode, -+ struct ext3_ext_path *path) ++static unsigned long ++ext3_ext_next_allocated_block(struct ext3_ext_path *path) +{ + int depth; + @@ -827,7 +896,7 @@ +/* + * returns first allocated block from next leaf or 0xffffffff + */ -+static unsigned ext3_ext_next_leaf_block(struct inode *inode, ++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, + struct ext3_ext_path *path) +{ + int depth; @@ -857,18 +926,17 @@ + * then we have to correct all indexes above + * TODO: do we need to correct tree in all cases? + */ -+int ext3_ext_correct_indexes(handle_t *handle, struct inode *inode, ++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, + struct ext3_ext_path *path) +{ -+ int depth = EXT3_I(inode)->i_depth; + struct ext3_extent_header *eh; ++ int depth = EXT_DEPTH(tree); + struct ext3_extent *ex; -+ long border; ++ unsigned long border; + int k, err = 0; + + eh = path[depth].p_hdr; + ex = path[depth].p_ext; -+ + EXT_ASSERT(ex); + EXT_ASSERT(eh); + @@ -882,35 +950,56 @@ + return 0; + } + ++ /* ++ * TODO: we need correction if border is smaller then current one ++ */ + k = depth - 1; + border = path[depth].p_ext->e_block; -+ if ((err = ext3_ext_get_access(handle, inode, path + k))) ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) + return err; + path[k].p_idx->e_block = border; -+ if ((err = ext3_ext_dirty(handle, inode, path + k))) ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) + return err; + + while (k--) { + /* change all left-side indexes */ -+ if (path[k].p_idx != EXT_FIRST_INDEX(path[k].p_hdr) -+ && k != 0) ++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) + break; -+ if ((err = ext3_ext_get_access(handle, inode, path + k))) ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) + break; + path[k].p_idx->e_block = border; -+ if ((err = ext3_ext_dirty(handle, inode, path + k))) ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) + break; + } + + return err; +} + ++static int inline ++ext3_can_extents_be_merged(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ if (ex1->e_block + ex1->e_num != ex2->e_block) ++ return 0; ++ ++#ifdef AGRESSIVE_TEST ++ if (ex1->e_num >= 4) ++ return 0; ++#endif ++ ++ if (!tree->mergable) ++ return 1; ++ ++ return tree->mergable(ex1, ex2); ++} ++ +/* + * this routine tries to merge requsted extent into the existing + * extent or inserts requested extent as new one into the tree, + * creating new leaf in no-space case + */ -+int ext3_ext_insert_extent(handle_t *handle, struct inode *inode, ++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, + struct ext3_ext_path *path, + struct ext3_extent *newext) +{ @@ -921,56 +1010,49 @@ + struct ext3_ext_path *npath = NULL; + int err; + -+ depth = EXT3_I(inode)->i_depth; -+ if ((ex = path[depth].p_ext)) { -+ /* try to insert block into found extent and return */ -+ if (ex->e_block + ex->e_num == newext->e_block && -+ ex->e_start + ex->e_num == newext->e_start) { -+#ifdef AGRESSIVE_TEST -+ if (ex->e_num >= 2) -+ goto repeat; -+#endif -+ if ((err = ext3_ext_get_access(handle, inode, -+ path + depth))) -+ return err; -+ ext_debug(inode, "append %d block to %d:%d (from %d)\n", -+ newext->e_num, ex->e_block, ex->e_num, -+ ex->e_start); -+ ex->e_num += newext->e_num; -+ err = ext3_ext_dirty(handle, inode, path + depth); ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ ++ /* try to insert block into found extent and return */ ++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { ++ ext_debug(tree, "append %d block to %d:%d (from %d)\n", ++ newext->e_num, ex->e_block, ex->e_num, ++ ex->e_start); ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) + return err; -+ } ++ ex->e_num += newext->e_num; ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ return err; + } + +repeat: -+ depth = EXT3_I(inode)->i_depth; ++ depth = EXT_DEPTH(tree); + eh = path[depth].p_hdr; + if (eh->e_num == eh->e_max) { + /* probably next leaf has space for us? */ -+ int next = ext3_ext_next_leaf_block(inode, path); ++ int next = ext3_ext_next_leaf_block(tree, path); + if (next != 0xffffffff) { -+ ext_debug(inode, "next leaf block - %d\n", next); ++ ext_debug(tree, "next leaf block - %d\n", next); + EXT_ASSERT(!npath); -+ npath = ext3_ext_find_extent(inode, next, NULL); ++ npath = ext3_ext_find_extent(tree, next, NULL); + if (IS_ERR(npath)) + return PTR_ERR(npath); + EXT_ASSERT(npath->p_depth == path->p_depth); + eh = npath[depth].p_hdr; + if (eh->e_num < eh->e_max) { -+ ext_debug(inode, -+ "next leaf has free ext(%d)\n", ++ ext_debug(tree, "next leaf isnt full(%d)\n", + eh->e_num); + path = npath; + goto repeat; + } -+ ext_debug(inode, "next leaf hasno free space(%d,%d)\n", ++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", + eh->e_num, eh->e_max); + } + /* + * there is no free space in found leaf + * we're gonna add new leaf in the tree + */ -+ err = ext3_ext_create_new_leaf(handle, inode, path, newext); ++ err = ext3_ext_create_new_leaf(handle, tree, path, newext); + if (err) + goto cleanup; + goto repeat; @@ -978,161 +1060,247 @@ + + nearex = path[depth].p_ext; + -+ if ((err = ext3_ext_get_access(handle, inode, path + depth))) ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) + goto cleanup; + + if (!nearex) { + /* there is no extent in this leaf, create first one */ -+ ext_debug(inode, "first extent in the leaf: %d:%d:%d\n", ++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", + newext->e_block, newext->e_start, + newext->e_num); -+ eh->e_num++; + path[depth].p_ext = EXT_FIRST_EXTENT(eh); -+ + } else if (newext->e_block > nearex->e_block) { + EXT_ASSERT(newext->e_block != nearex->e_block); -+ len = EXT_MAX_EXTENT(eh) - nearex; -+ len = (len - 1) * sizeof(struct ext3_extent); -+ len = len < 0 ? 0 : len; -+ ext_debug(inode, "insert %d:%d:%d after: nearest 0x%p, " -+ "move %d from 0x%p to 0x%p\n", -+ newext->e_block, newext->e_start, newext->e_num, -+ nearex, len, nearex + 1, nearex + 2); -+ ext3_ext_check_boundary(inode, path + depth, nearex + 2, len); -+ memmove(nearex + 2, nearex + 1, len); ++ if (nearex != EXT_LAST_EXTENT(eh)) { ++ len = EXT_MAX_EXTENT(eh) - nearex; ++ len = (len - 1) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->e_block, newext->e_start, ++ newext->e_num, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 2, nearex + 1, len); ++ } + path[depth].p_ext = nearex + 1; -+ eh->e_num++; + } else { + EXT_ASSERT(newext->e_block != nearex->e_block); + len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); + len = len < 0 ? 0 : len; -+ ext_debug(inode, "insert %d:%d:%d before: nearest 0x%p, " ++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " + "move %d from 0x%p to 0x%p\n", + newext->e_block, newext->e_start, newext->e_num, + nearex, len, nearex + 1, nearex + 2); ++ + memmove(nearex + 1, nearex, len); + path[depth].p_ext = nearex; -+ eh->e_num++; -+ -+ /* time to correct all indexes above */ -+ err = ext3_ext_correct_indexes(handle, inode, path); + } + + if (!err) { ++ eh->e_num++; + nearex = path[depth].p_ext; + nearex->e_block = newext->e_block; + nearex->e_start = newext->e_start; + nearex->e_num = newext->e_num; ++ ++ /* time to correct all indexes above */ ++ err = ext3_ext_correct_indexes(handle, tree, path); + } + -+ err = ext3_ext_dirty(handle, inode, path + depth); ++ err = ext3_ext_dirty(handle, tree, path + depth); + +cleanup: + if (npath) { -+ ext3_ext_drop_refs(inode, npath); ++ ext3_ext_drop_refs(npath); + kfree(npath); + } + + return err; +} + -+int ext3_ext_get_block(handle_t *handle, struct inode *inode, long iblock, -+ struct buffer_head *bh_result, int create) ++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, ++ unsigned long num, ext_prepare_callback func) +{ -+ struct ext3_ext_path *path; -+ int depth = EXT3_I(inode)->i_depth; -+ struct ext3_extent newex; -+ struct ext3_extent *ex; -+ int goal, newblock, err = 0; ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent *ex, cbex; ++ unsigned long next, start = 0, end = 0; ++ int depth, exists, err = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(func); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ while (num > 0 && block != 0xfffffffff) { ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(tree, block, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ break; ++ } + -+ ext_debug(inode, "block %d requested for inode %u, bh_result 0x%p\n", -+ (int) iblock, (unsigned) inode->i_ino, bh_result); -+ bh_result->b_state &= ~(1UL << BH_New); ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ next = ext3_ext_next_allocated_block(path); ++ ++ exists = 0; ++ if (!ex) { ++ /* there is no extent yet, so try to allocate ++ * all requested space */ ++ start = block; ++ end = block + num - 1; ++ } else if (ex->e_block > block) { ++ /* need to allocate space before found extent */ ++ start = block; ++ end = ex->e_block - 1; ++ if (block + num - 1 < end) ++ end = block + num - 1; ++ } else if (block >= ex->e_block + ex->e_num) { ++ /* need to allocate space after found extent */ ++ start = block; ++ end = block + num - 1; ++ if (end >= next) ++ end = next - 1; ++ } else if (block >= ex->e_block) { ++ /* ++ * some part of requested space is covered ++ * by found extent ++ */ ++ start = block; ++ end = ex->e_block + ex->e_num - 1; ++ if (block + num - 1 < end) ++ end = block + num - 1; ++ exists = 1; ++ } else { ++ BUG(); ++ } + -+ down(&EXT3_I(inode)->i_ext_sem); ++ if (!exists) { ++ cbex.e_block = start; ++ cbex.e_num = end - start + 1; ++ cbex.e_start = 0; ++ } else ++ cbex = *ex; + -+ /* find extent for this block */ -+ path = ext3_ext_find_extent(inode, iblock, NULL); -+ if (IS_ERR(path)) { -+ err = PTR_ERR(path); -+ goto out2; -+ } ++ err = func(tree, path, &cbex, exists); ++ if (err < 0) ++ break; + -+ if ((ex = path[depth].p_ext)) { -+ /* if found exent covers block, simple return it */ -+ if (iblock >= ex->e_block && iblock < ex->e_block + ex->e_num) { -+ newblock = iblock - ex->e_block + ex->e_start; -+ ext_debug(inode, "%d fit into %d:%d -> %d\n", -+ (int) iblock, ex->e_block, ex->e_num, -+ newblock); -+ goto out; ++ if (err == EXT_BREAK) { ++ err = 0; ++ break; + } -+ } + -+ /* -+ * we couldn't try to create block if create flag is zero -+ */ -+ if (!create) -+ goto out2; ++ if (EXT_DEPTH(tree) != depth) { ++ /* depth was changed. we have to realloc path */ ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ path = NULL; ++ } + -+ /* allocate new block */ -+ goal = ext3_ext_find_goal(inode, path); -+ newblock = ext3_new_block(handle, inode, goal, 0, 0, &err); -+ if (!newblock) -+ goto out2; -+ ext_debug(inode, "allocate new block: goal %d, found %d\n", -+ goal, newblock); ++ block += cbex.e_num; ++ num -= cbex.e_num; ++ } + -+ /* try to insert new extent into found leaf and return */ -+ newex.e_block = iblock; -+ newex.e_start = newblock; -+ newex.e_num = 1; -+ err = ext3_ext_insert_extent(handle, inode, path, &newex); -+ if (err) -+ goto out2; -+ -+ /* previous routine could use block we allocated */ -+ newblock = newex.e_start; -+ bh_result->b_state |= (1UL << BH_New); ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } + -+out: -+ ext3_ext_show_leaf(inode, path); -+ bh_result->b_dev = inode->i_dev; -+ bh_result->b_blocknr = newblock; -+ bh_result->b_state |= (1UL << BH_Mapped); -+out2: -+ ext3_ext_drop_refs(inode, path); -+ kfree(path); -+ up(&EXT3_I(inode)->i_ext_sem); ++ return err; ++} + -+ return err; ++static inline void ++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) ++{ ++ if (tree->cex) ++ tree->cex->e_num = 0; ++} ++ ++static inline void ++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, struct ext3_extent *ex) ++{ ++ if (tree->cex) { ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->e_num); ++ tree->cex->e_block = ex->e_block; ++ tree->cex->e_start = ex->e_start; ++ tree->cex->e_num = ex->e_num; ++ } +} + +/* -+ * returns 1 if current index have to be freed (even partial) ++ * this routine calculate boundaries of the gap requested block fits into ++ * and cache this gap + */ -+static int ext3_ext_more_to_truncate(struct inode *inode, -+ struct ext3_ext_path *path) ++static inline void ++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ unsigned long block) +{ -+ EXT_ASSERT(path->p_idx); ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent *ex, gex; + -+ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) ++ ex = path[depth].p_ext; ++ if (ex == NULL) { ++ /* there is no extent yet, so gap is [0;-] */ ++ gex.e_block = 0; ++ gex.e_num = 0xffffffff; ++ ext_debug(tree, "cache gap(whole file):"); ++ } else if (block < ex->e_block) { ++ gex.e_block = block; ++ gex.e_num = ex->e_block - block; ++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", ++ (unsigned long) block, ++ (unsigned long) ex->e_block, ++ (unsigned long) ex->e_num); ++ } else if (block >= ex->e_block + ex->e_num) { ++ gex.e_block = ex->e_block + ex->e_num; ++ gex.e_num = ext3_ext_next_allocated_block(path); ++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", ++ (unsigned long) ex->e_block, ++ (unsigned long) ex->e_num, ++ (unsigned long) block); ++ EXT_ASSERT(gex.e_num > gex.e_block); ++ gex.e_num = gex.e_num - gex.e_block; ++ } else { ++ BUG(); ++ } ++ ++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) gex.e_block, ++ (unsigned long) gex.e_num); ++ gex.e_start = 0xffffffff; ++ ext3_ext_put_in_cache(tree, &gex); ++} ++ ++static inline int ++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, ++ struct ext3_extent *ex) ++{ ++ struct ext3_extent *cex = tree->cex; ++ ++ /* is there cache storage at all? */ ++ if (!cex) + return 0; + -+ /* -+ * if truncate on deeper level happened it it wasn't partial -+ * so we have to consider current index for truncation -+ */ -+ if (path->p_hdr->e_num == path->p_block) ++ /* has cache valid data? */ ++ if (cex->e_num == 0) + return 0; + -+ /* -+ * put actual number of indexes to know is this number got -+ * changed at the next iteration -+ */ -+ path->p_block = path->p_hdr->e_num; -+ -+ return 1; ++ if (block >= cex->e_block && block < cex->e_block + cex->e_num) { ++ ex->e_block = cex->e_block; ++ ex->e_start = cex->e_start; ++ ex->e_num = cex->e_num; ++ ext_debug(tree, "%lu cached by %lu:%lu:%lu(gap)\n", ++ (unsigned long) block, ++ (unsigned long) ex->e_block, ++ (unsigned long) ex->e_num, ++ (unsigned long) ex->e_start); ++ return 1; ++ } ++ ++ /* not in cache */ ++ return 0; +} + +/* @@ -1140,8 +1308,8 @@ + * it's used in truncate case only. thus all requests are for + * last index in the block only + */ -+int ext3_ext_remove_index(handle_t *handle, struct inode *inode, -+ struct ext3_ext_path *path) ++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) +{ + struct buffer_head *bh; + int err; @@ -1149,271 +1317,314 @@ + /* free index block */ + path--; + EXT_ASSERT(path->p_hdr->e_num); -+ if ((err = ext3_ext_get_access(handle, inode, path))) ++ if ((err = ext3_ext_get_access(handle, tree, path))) + return err; + path->p_hdr->e_num--; -+ if ((err = ext3_ext_dirty(handle, inode, path))) ++ if ((err = ext3_ext_dirty(handle, tree, path))) + return err; -+ bh = sb_get_hash_table(inode->i_sb, path->p_idx->e_leaf); -+ ext3_forget(handle, 0, inode, bh, path->p_idx->e_leaf); -+ ext3_free_blocks(handle, inode, path->p_idx->e_leaf, 1); -+ -+ ext_debug(inode, "index is empty, remove it, free block %d\n", ++ ext_debug(tree, "index is empty, remove it, free block %d\n", + path->p_idx->e_leaf); ++ bh = sb_get_hash_table(tree->inode->i_sb, path->p_idx->e_leaf); ++ ext3_forget(handle, 0, tree->inode, bh, path->p_idx->e_leaf); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->e_leaf, 1); + return err; +} + -+/* -+ * returns 1 if current extent needs to be freed (even partial) -+ * instead, returns 0 -+ */ -+int ext3_ext_more_leaves_to_truncate(struct inode *inode, ++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, + struct ext3_ext_path *path) +{ -+ unsigned blocksize = inode->i_sb->s_blocksize; -+ struct ext3_extent *ex = path->p_ext; -+ int last_block; -+ -+ EXT_ASSERT(ex); ++ int depth = EXT_DEPTH(tree); ++ int needed; + -+ /* is there leave in the current leaf? */ -+ if (ex < EXT_FIRST_EXTENT(path->p_hdr)) -+ return 0; ++ if (path) { ++ /* probably there is space in leaf? */ ++ if (path[depth].p_hdr->e_num < path[depth].p_hdr->e_max) ++ return 1; ++ } + -+ last_block = (inode->i_size + blocksize-1) -+ >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); ++ /* ++ * the worste case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ depth = depth + 1; + -+ if (last_block >= ex->e_block + ex->e_num) -+ return 0; ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; + -+ /* seems it extent have to be freed */ -+ return 1; ++ /* index split. we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ return needed; +} + -+handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++static int ++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) +{ -+ int err; ++ struct ext3_extent *ex, tex; ++ struct ext3_ext_path *npath; ++ int depth, creds, err; + -+ if (handle->h_buffer_credits > needed) -+ return handle; -+ if (!ext3_journal_extend(handle, needed)) -+ return handle; -+ err = ext3_journal_restart(handle, needed); ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(end < ex->e_block + ex->e_num - 1); ++ EXT_ASSERT(ex->e_block < start); ++ ++ /* calculate tail extent */ ++ tex.e_block = end + 1; ++ EXT_ASSERT(tex.e_block < ex->e_block + ex->e_num); ++ tex.e_num = ex->e_block + ex->e_num - tex.e_block; ++ ++ creds = ext3_ext_calc_credits_for_insert(tree, path); ++ handle = ext3_ext_journal_restart(handle, creds); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); + -+ return handle; -+} ++ /* calculate head extent. use primary extent */ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ return err; ++ ex->e_num = start - ex->e_block; ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ return err; + -+/* -+ * this routine calculate max number of blocks to be modified -+ * while freeing extent and is intended to be used in truncate path -+ */ -+static int ext3_ext_calc_credits(struct inode *inode, -+ struct ext3_ext_path *path, -+ int num) -+{ -+ int depth = EXT3_I(inode)->i_depth; -+ int needed; -+ -+ /* -+ * extent couldn't cross group, so we will modify -+ * single bitmap block and single group descriptor -+ */ -+ needed = 2; ++ /* FIXME: some callback to free underlying resource ++ * and correct e_start? */ ++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", ++ ex->e_block, ex->e_num, tex.e_block, tex.e_num); + -+ /* -+ * if this is last extent in a leaf, then we have to -+ * free leaf block and remove pointer from index above. -+ * that pointer could be last in index block, so we'll -+ * have to remove it too. this way we could modify/free -+ * the whole path + root index (inode stored) will be -+ * modified -+ */ -+ if (!path || (num == path->p_ext->e_num && -+ path->p_ext == EXT_FIRST_EXTENT(path->p_hdr))) -+ needed += (depth * EXT3_ALLOC_NEEDED) + 1; ++ npath = ext3_ext_find_extent(tree, ex->e_block, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(npath[depth].p_ext->e_block == ex->e_block); ++ EXT_ASSERT(npath[depth].p_ext->e_num == ex->e_num); + -+ return needed; ++ err = ext3_ext_insert_extent(handle, tree, npath, &tex); ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ ++ return err; ++ +} + -+/* -+ * core of the truncate procedure: -+ * - calculated what part of each extent in the requested leaf -+ * need to be freed -+ * - frees and forgets these blocks -+ * -+ * TODO: we could optimize and free several extents during -+ * single journal_restart()-journal_restart() cycle -+ */ -+static int ext3_ext_truncate_leaf(handle_t *handle, -+ struct inode *inode, -+ struct ext3_ext_path *path, -+ int depth) ++static int ++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) +{ -+ unsigned blocksize = inode->i_sb->s_blocksize; -+ int last_block; -+ int i, err = 0, sf, num; ++ struct ext3_extent *ex, *fu = NULL, *lu, *le; ++ int err = 0, correct_index = 0; ++ int depth = EXT_DEPTH(tree), credits; ++ struct ext3_extent_header *eh; ++ unsigned a, b, block, num; + -+ ext_debug(inode, "level %d - leaf\n", depth); -+ if (!path->p_hdr) -+ path->p_hdr = -+ (struct ext3_extent_header *) path->p_bh->b_data; ++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); ++ if (!path[depth].p_hdr) ++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); ++ eh = path[depth].p_hdr; ++ EXT_ASSERT(eh); ++ EXT_ASSERT(eh->e_num <= eh->e_max); ++ ++ /* find where to start removing */ ++ le = ex = EXT_LAST_EXTENT(eh); ++ while (ex != EXT_FIRST_EXTENT(eh)) { ++ if (ex->e_block <= end) ++ break; ++ ex--; ++ } + -+ EXT_ASSERT(path->p_hdr->e_num <= path->p_hdr->e_max); ++ if (start > ex->e_block && end < ex->e_block + ex->e_num - 1) { ++ /* removal of internal part of the extent requested ++ * tail and head must be placed in different extent ++ * so, we have to insert one more extent */ ++ path[depth].p_ext = ex; ++ return ext3_ext_split_for_rm(handle, tree, path, start, end); ++ } + -+ last_block = (inode->i_size + blocksize-1) -+ >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); -+ path->p_ext = EXT_LAST_EXTENT(path->p_hdr); -+ while (ext3_ext_more_leaves_to_truncate(inode, path)) { ++ lu = ex; ++ while (ex >= EXT_FIRST_EXTENT(eh) && ++ ex->e_block + ex->e_num > start) { ++ ext_debug(tree, "remove ext %u:%u\n", ex->e_block, ex->e_num); ++ path[depth].p_ext = ex; ++ ++ a = ex->e_block > start ? ex->e_block : start; ++ b = ex->e_block + ex->e_num - 1 < end ? ++ ex->e_block + ex->e_num - 1 : end; ++ ++ ext_debug(tree, " border %u:%u\n", a, b); ++ ++ if (a != ex->e_block && b != ex->e_block + ex->e_num - 1) { ++ block = 0; ++ num = 0; ++ BUG(); ++ } else if (a != ex->e_block) { ++ /* remove tail of the extent */ ++ block = ex->e_block; ++ num = a - block; ++ } else if (b != ex->e_block + ex->e_num - 1) { ++ /* remove head of the extent */ ++ block = a; ++ num = b - a; ++ } else { ++ /* remove whole extent: excelent! */ ++ block = ex->e_block; ++ num = 0; ++ EXT_ASSERT(a == ex->e_block && ++ b == ex->e_block + ex->e_num - 1); ++ } + -+ /* what part of extent have to be freed? */ -+ sf = last_block > path->p_ext->e_block ? -+ last_block : path->p_ext->e_block; ++ if (ex == EXT_FIRST_EXTENT(eh)) ++ correct_index = 1; ++ ++ credits = 1; ++ if (correct_index) ++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; ++ if (tree->remove_extent_credits) ++ credits += tree->remove_extent_credits(tree, ex, a, b); ++ ++ handle = ext3_ext_journal_restart(handle, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ goto out; ++ } + -+ /* number of blocks from extent to be freed */ -+ num = path->p_ext->e_block + path->p_ext->e_num - sf; ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ goto out; + -+ /* calc physical first physical block to be freed */ -+ sf = path->p_ext->e_start + (sf - path->p_ext->e_block); ++ if (tree->remove_extent) ++ err = tree->remove_extent(tree, ex, a, b); ++ if (err) ++ goto out; + -+ i = ext3_ext_calc_credits(inode, path, num); -+ handle = ext3_ext_journal_restart(handle, i); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ ext_debug(inode, "free extent %d:%d:%d -> free %d:%d\n", -+ path->p_ext->e_block, path->p_ext->e_start, -+ path->p_ext->e_num, sf, num); -+ for (i = 0; i < num; i++) { -+ struct buffer_head *bh = -+ sb_get_hash_table(inode->i_sb, sf + i); -+ ext3_forget(handle, 0, inode, bh, sf + i); ++ if (num == 0) { ++ /* this extent is removed entirely mark slot unused */ ++ ex->e_start = 0; ++ eh->e_num--; ++ fu = ex; + } -+ ext3_free_blocks(handle, inode, sf, num); + -+ /* collect extents usage stats */ -+ spin_lock(&EXT3_SB(inode->i_sb)->s_ext_lock); -+ EXT3_SB(inode->i_sb)->s_ext_extents++; -+ EXT3_SB(inode->i_sb)->s_ext_blocks += num; -+ spin_unlock(&EXT3_SB(inode->i_sb)->s_ext_lock); ++ ex->e_block = block; ++ ex->e_num = num; + -+ /* reduce extent */ -+ if ((err = ext3_ext_get_access(handle, inode, path))) -+ return err; -+ path->p_ext->e_num -= num; -+ if (path->p_ext->e_num == 0) -+ path->p_hdr->e_num--; -+ if ((err = ext3_ext_dirty(handle, inode, path))) -+ return err; ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ goto out; + -+ path->p_ext--; ++ ext_debug(tree, "new extent: %u:%u:%u\n", ++ ex->e_block, ex->e_num, ex->e_start); ++ ex--; + } -+ ++ ++ if (fu) { ++ /* reuse unused slots */ ++ while (lu < le) { ++ if (lu->e_start) { ++ *fu = *lu; ++ lu->e_start = 0; ++ fu++; ++ } ++ lu++; ++ } ++ } ++ ++ if (correct_index && eh->e_num) ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ + /* if this leaf is free, then we should + * remove it from index block above */ -+ if (path->p_hdr->e_num == 0 && depth > 0) -+ err = ext3_ext_remove_index(handle, inode, path); ++ if (err == 0 && eh->e_num == 0 && path[depth].p_bh != NULL) ++ err = ext3_ext_rm_idx(handle, tree, path + depth); + ++out: + return err; +} + -+static void ext3_ext_collect_stats(struct inode *inode) -+{ -+ int depth; -+ -+ /* skip inodes with old good bitmap */ -+ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) -+ return; -+ -+ /* collect on full truncate only */ -+ if (inode->i_size) -+ return; + -+ depth = EXT3_I(inode)->i_depth; -+ if (depth < EXT3_SB(inode->i_sb)->s_ext_mindepth) -+ EXT3_SB(inode->i_sb)->s_ext_mindepth = depth; -+ if (depth > EXT3_SB(inode->i_sb)->s_ext_maxdepth) -+ EXT3_SB(inode->i_sb)->s_ext_maxdepth = depth; -+ EXT3_SB(inode->i_sb)->s_ext_sum += depth; -+ EXT3_SB(inode->i_sb)->s_ext_count++; ++static struct ext3_extent_idx * ++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) ++{ ++ struct ext3_extent_idx *ix; + ++ ix = EXT_LAST_INDEX(hdr); ++ while (ix != EXT_FIRST_INDEX(hdr)) { ++ if (ix->e_block <= block) ++ break; ++ ix--; ++ } ++ return ix; +} + -+void ext3_ext_truncate(struct inode * inode) ++/* ++ * returns 1 if current index have to be freed (even partial) ++ */ ++static int inline ++ext3_ext_more_to_rm(struct ext3_ext_path *path) +{ -+ struct address_space *mapping = inode->i_mapping; -+ struct ext3_ext_path *path; -+ struct page * page; -+ handle_t *handle; -+ int i, depth, err = 0; ++ EXT_ASSERT(path->p_idx); + -+ ext3_ext_collect_stats(inode); ++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) ++ return 0; + + /* -+ * We have to lock the EOF page here, because lock_page() nests -+ * outside journal_start(). ++ * if truncate on deeper level happened it it wasn't partial ++ * so we have to consider current index for truncation + */ -+ if ((inode->i_size & (inode->i_sb->s_blocksize - 1)) == 0) { -+ /* Block boundary? Nothing to do */ -+ page = NULL; -+ } else { -+ page = grab_cache_page(mapping, -+ inode->i_size >> PAGE_CACHE_SHIFT); -+ if (!page) -+ return; -+ } -+ -+ /* -+ * probably first extent we're gonna free will be last in block -+ */ -+ i = ext3_ext_calc_credits(inode, NULL, 0); -+ handle = ext3_journal_start(inode, i); -+ if (IS_ERR(handle)) { -+ if (page) { -+ clear_highpage(page); -+ flush_dcache_page(page); -+ unlock_page(page); -+ page_cache_release(page); -+ } -+ return; -+ } ++ if (path->p_hdr->e_num == path->p_block) ++ return 0; ++ return 1; ++} + -+ if (page) -+ ext3_block_truncate_page(handle, mapping, inode->i_size); ++int ext3_ext_remove_space(struct ext3_extents_tree *tree, ++ unsigned long start, unsigned long end) ++{ ++ struct inode *inode = tree->inode; ++ struct super_block *sb = inode->i_sb; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_ext_path *path; ++ handle_t *handle; ++ int i = 0, err = 0; + -+ down(&EXT3_I(inode)->i_ext_sem); ++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); + -+ /* -+ * TODO: optimization is possible here -+ * probably we need not scaning at all, -+ * because page truncation is enough -+ */ -+ if (ext3_orphan_add(handle, inode)) -+ goto out_stop; ++ /* probably first extent we're gonna free will be last in block */ ++ handle = ext3_journal_start(inode, depth + 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); + -+ /* we have to know where to truncate from in crash case */ -+ EXT3_I(inode)->i_disksize = inode->i_size; -+ ext3_mark_inode_dirty(handle, inode); ++ ext3_ext_invalidate_cache(tree); + + /* + * we start scanning from right side freeing all the blocks + * after i_size and walking into the deep + */ -+ i = 0; -+ depth = EXT3_I(inode)->i_depth; + path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); + if (IS_ERR(path)) { -+ ext3_error(inode->i_sb, "ext3_ext_truncate", ++ ext3_error(sb, "ext3_ext_remove_space", + "Can't allocate path array"); -+ goto out_stop; ++ ext3_journal_stop(handle, inode); ++ return -ENOMEM; + } + memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); -+ -+ path[i].p_hdr = (struct ext3_extent_header *) EXT3_I(inode)->i_data; ++ path[i].p_hdr = EXT_ROOT_HDR(tree); ++ + while (i >= 0 && err == 0) { + if (i == depth) { + /* this is leaf block */ -+ err = ext3_ext_truncate_leaf(handle, inode, -+ path + i, i); ++ err = ext3_ext_rm_leaf(handle, tree, path, start, end); + /* root level have p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + i--; @@ -1422,54 +1633,54 @@ + + /* this is index block */ + if (!path[i].p_hdr) { -+ path[i].p_hdr = -+ (struct ext3_extent_header *) path[i].p_bh->b_data; -+ ext_debug(inode, "initialize header\n"); ++ ext_debug(tree, "initialize header\n"); ++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); + } + + EXT_ASSERT(path[i].p_hdr->e_num <= path[i].p_hdr->e_max); + + if (!path[i].p_idx) { + /* this level hasn't touched yet */ -+ path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); ++ path[i].p_idx = ++ ext3_ext_last_covered(path[i].p_hdr, end); + path[i].p_block = path[i].p_hdr->e_num + 1; -+ ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", ++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", + path[i].p_hdr, path[i].p_hdr->e_num); + } else { + /* we've already was here, see at next index */ + path[i].p_idx--; + } + -+ ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", ++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", + i, EXT_FIRST_INDEX(path[i].p_hdr), + path[i].p_idx); -+ if (ext3_ext_more_to_truncate(inode, path + i)) { ++ if (ext3_ext_more_to_rm(path + i)) { + /* go to the next level */ -+ ext_debug(inode, "move to level %d (block %d)\n", i+1, -+ path[i].p_idx->e_leaf); ++ ext_debug(tree, "move to level %d (block %d)\n", ++ i + 1, path[i].p_idx->e_leaf); + memset(path + i + 1, 0, sizeof(*path)); -+ path[i+1].p_bh = sb_bread(inode->i_sb, -+ path[i].p_idx->e_leaf); ++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->e_leaf); + if (!path[i+1].p_bh) { + /* should we reset i_size? */ + err = -EIO; + break; + } ++ /* put actual number of indexes to know is this ++ * number got changed at the next iteration */ ++ path[i].p_block = path[i].p_hdr->e_num; + i++; + } else { + /* we finish processing this index, go up */ + if (path[i].p_hdr->e_num == 0 && i > 0) { + /* index is empty, remove it + * handle must be already prepared by the -+ * truncate_leaf() -+ */ -+ err = ext3_ext_remove_index(handle, inode, -+ path + i); ++ * truncate_leaf() */ ++ err = ext3_ext_rm_idx(handle, tree, path + i); + } + /* root level have p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + i--; -+ ext_debug(inode, "return to level %d\n", i); ++ ext_debug(tree, "return to level %d\n", i); + } + } + @@ -1477,15 +1688,405 @@ + if (path->p_hdr->e_num == 0) { + /* + * truncate to zero freed all the tree -+ * so, we need to correct i_depth ++ * so, we need to correct e_depth + */ -+ EXT3_I(inode)->i_depth = 0; -+ path->p_hdr->e_max = 0; -+ ext3_mark_inode_dirty(handle, inode); ++ err = ext3_ext_get_access(handle, tree, path); ++ if (err == 0) { ++ EXT_ROOT_HDR(tree)->e_depth = 0; ++ err = ext3_ext_dirty(handle, tree, path); ++ } + } + + kfree(path); ++ ext3_journal_stop(handle, inode); ++ ++ return err; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) ++ printk("EXT3-fs: file extents enabled\n"); ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++} ++ ++/************************************************************************ ++ * VFS related routines ++ ************************************************************************/ ++ ++static int ext3_get_inode_write_access(handle_t *handle, void *buffer) ++{ ++ /* we use in-core data, not bh */ ++ return 0; ++} ++ ++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct inode *inode = buffer; ++ ext3_mark_inode_dirty(handle, inode); ++ return 0; ++} ++ ++static int ext3_ext_mergable(struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ if (ex1->e_start + ex1->e_num == ex2->e_start) ++ return 1; ++ return 0; ++} ++ ++static int ++ext3_remove_blocks_credits(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed; ++ ++ /* at present, extent can't cross block group */; ++ needed = 3; /* bitmap + group desc + sb */ ++ ++#ifdef CONFIG_QUOTA ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ return needed; ++} ++ ++static int ++ext3_remove_blocks(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed = ext3_remove_blocks_credits(tree, ex, from, to); ++ handle_t *handle = ext3_journal_start(tree->inode, needed); ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (from >= ex->e_block && to == ex->e_block + ex->e_num - 1) { ++ /* tail removal */ ++ unsigned long num, start; ++ num = ex->e_block + ex->e_num - from; ++ start = ex->e_start + ex->e_num - num; ++ ext_debug(tree, "free last %lu blocks starting %lu\n", ++ num, start); ++ ext3_free_blocks(handle, tree->inode, start, num); ++ } else if (from == ex->e_block && to <= ex->e_block + ex->e_num - 1) { ++ printk("strange request: removal %lu-%lu from %u:%u\n", ++ from, to, ex->e_block, ex->e_num); ++ } else { ++ printk("strange request: removal(2) %lu-%lu from %u:%u\n", ++ from, to, ex->e_block, ex->e_num); ++ } ++ ext3_journal_stop(handle, tree->inode); ++ return 0; ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ depth = path->p_depth; ++ /* try to find previous block */ ++ if (path[depth].p_ext) ++ return path[depth].p_ext->e_start + ++ path[depth].p_ext->e_num - 1; ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour; ++} ++ ++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int *err) ++{ ++ struct inode *inode = tree->inode; ++ int newblock, goal; ++ ++ EXT_ASSERT(path); ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->e_start); ++ EXT_ASSERT(ex->e_num); ++ ++ /* reuse block from the extent to order data/metadata */ ++ newblock = ex->e_start++; ++ ex->e_num--; ++ if (ex->e_num == 0) { ++ ex->e_num = 1; ++ /* allocate new block for the extent */ ++ goal = ext3_ext_find_goal(inode, path); ++ ex->e_start = ext3_new_block(handle, inode, goal, 0, 0, err); ++ if (ex->e_start == 0) { ++ /* error occured: restore old extent */ ++ ex->e_start = newblock; ++ return 0; ++ } ++ } ++ return newblock; ++} ++ ++static void ext3_init_tree_desc(struct ext3_extents_tree *tree, ++ struct inode *inode) ++{ ++ tree->inode = inode; ++ tree->root = (void *) EXT3_I(inode)->i_data; ++ tree->get_write_access = ext3_get_inode_write_access; ++ tree->mark_buffer_dirty = ext3_mark_buffer_dirty; ++ tree->mergable = ext3_ext_mergable; ++ tree->new_block = ext3_new_block_cb; ++ tree->remove_extent = ext3_remove_blocks; ++ tree->remove_extent_credits = ext3_remove_blocks_credits; ++ tree->buffer = (void *) inode; ++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); ++ tree->cex = NULL; /* FIXME: add cache store later */ ++} ++ ++#if 0 ++static int ++ext3_ext_new_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newex, int exist) ++{ ++ struct inode *inode = tree->inode; ++ int count, err, goal; ++ loff_t new_i_size; ++ handle_t *handle; ++ unsigned long pblock; ++ ++ if (exist) ++ return EXT_CONTINUE; ++ ++ count = ext3_ext_calc_credits_for_insert(tree, path); ++ handle = ext3_journal_start(inode, count + EXT3_ALLOC_NEEDED + 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ goal = ext3_ext_find_goal(inode, path); ++ count = newex->e_num; ++#ifdef EXT3_MULTIBLOCK_ALLOCATOR ++ pblock = ext3_new_block(handle, inode, goal, &count, NULL, &err); ++ EXT_ASSERT(count <= num); ++ /* FIXME: error handling here */ ++ EXT_ASSERT(err == 0); ++#else ++ pblock = 0; ++#endif ++ ++ /* insert new extent */ ++ newex->e_start = pblock; ++ newex->e_num = count; ++ err = ext3_ext_insert_extent(handle, tree, path, newex); ++ if (err) ++ goto out; ++ ++ /* correct on-disk inode size */ ++ if (newex->e_num > 0) { ++ new_i_size = (loff_t) newex->e_block + newex->e_num; ++ new_i_size = new_i_size << inode->i_blkbits; ++ if (new_i_size > i_size_read(inode)) ++ new_i_size = i_size_read(inode); ++ if (new_i_size > EXT3_I(inode)->i_disksize) { ++ EXT3_I(inode)->i_disksize = new_i_size; ++ err = ext3_mark_inode_dirty(handle, inode); ++ } ++ } ++ ++out: ++ ext3_journal_stop(handle, inode); ++ return err; ++} ++ ++ ++int ext3_ext_allocate_nblocks(struct inode *inode, unsigned long block, ++ unsigned long num) ++{ ++ struct ext3_extents_tree tree; ++ int err; ++ ++ ext_debug(&tree, "blocks %lu-%lu requested for inode %u\n", ++ block, block + num,(unsigned) inode->i_ino); ++ ++ ext3_init_tree_desc(&tree, inode); ++ down(&EXT3_I(inode)->truncate_sem); ++ err = ext3_ext_walk_space(&tree, block, num, ext3_ext_new_extent_cb); ++ ext3_ext_invalidate_cache(&tree); ++ up(&EXT3_I(inode)->truncate_sem); ++ ++ return err; ++} ++#endif ++ ++int ext3_ext_get_block(handle_t *handle, struct inode *inode, ++ long iblock, struct buffer_head *bh_result, int create) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent newex; ++ struct ext3_extent *ex; ++ int goal, newblock, err = 0, depth; ++ struct ext3_extents_tree tree; ++ ++ clear_bit(BH_New, &bh_result->b_state); ++ ext3_init_tree_desc(&tree, inode); ++ ext_debug(&tree, "block %d requested for inode %u\n", ++ (int) iblock, (unsigned) inode->i_ino); ++ down_write(&EXT3_I(inode)->truncate_sem); ++ ++ /* check in cache */ ++ if (ext3_ext_in_cache(&tree, iblock, &newex)) { ++ if (newex.e_start == 0xffffffff && !create) { ++ /* block isn't allocated yet and ++ * user don't want to allocate it */ ++ goto out2; ++ } else if (newex.e_start) { ++ /* block is already allocated */ ++ newblock = iblock - newex.e_block + newex.e_start; ++ goto out; ++ } ++ } ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(&tree, iblock, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ goto out2; ++ } ++ ++ depth = EXT_DEPTH(&tree); ++ ++ /* ++ * consistent leaf must not be empty ++ * this situations is possible, though, _during_ tree modification ++ * this is why assert can't be put in ext3_ext_find_extent() ++ */ ++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); ++ ++ if ((ex = path[depth].p_ext)) { ++ /* if found exent covers block, simple return it */ ++ if (iblock >= ex->e_block && iblock < ex->e_block + ex->e_num) { ++ newblock = iblock - ex->e_block + ex->e_start; ++ ext_debug(&tree, "%d fit into %d:%d -> %d\n", ++ (int) iblock, ex->e_block, ex->e_num, ++ newblock); ++ ext3_ext_put_in_cache(&tree, ex); ++ goto out; ++ } ++ } ++ ++ /* ++ * requested block isn't allocated yet ++ * we couldn't try to create block if create flag is zero ++ */ ++ if (!create) { ++ /* put just found gap into cache to speedup subsequest reqs */ ++ ext3_ext_put_gap_in_cache(&tree, path, iblock); ++ goto out2; ++ } ++ ++ /* allocate new block */ ++ goal = ext3_ext_find_goal(inode, path); ++ newblock = ext3_new_block(handle, inode, goal, 0, 0, &err); ++ if (!newblock) ++ goto out2; ++ ext_debug(&tree, "allocate new block: goal %d, found %d\n", ++ goal, newblock); ++ ++ /* try to insert new extent into found leaf and return */ ++ newex.e_block = iblock; ++ newex.e_start = newblock; ++ newex.e_num = 1; ++ err = ext3_ext_insert_extent(handle, &tree, path, &newex); ++ if (err) ++ goto out2; ++ ++ if (inode->i_size > EXT3_I(inode)->i_disksize) ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ++ /* previous routine could use block we allocated */ ++ newblock = newex.e_start; ++ set_bit(BH_New, &bh_result->b_state); ++ ++ ext3_ext_put_in_cache(&tree, &newex); ++out: ++ ext3_ext_show_leaf(&tree, path); ++ set_bit(BH_Mapped, &bh_result->b_state); ++ bh_result->b_dev = inode->i_sb->s_dev; ++ bh_result->b_blocknr = newblock; ++out2: ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ up_write(&EXT3_I(inode)->truncate_sem); ++ ++ return err; ++} ++ ++void ext3_ext_truncate(struct inode * inode) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct super_block *sb = inode->i_sb; ++ struct ext3_extents_tree tree; ++ unsigned long last_block; ++ handle_t *handle; ++ int err = 0; + ++ ext3_init_tree_desc(&tree, inode); ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ err = ext3_writepage_trans_blocks(inode) + 3; ++ handle = ext3_journal_start(inode, err); ++ if (IS_ERR(handle)) ++ return; ++ ++ ext3_block_truncate_page(handle, mapping, inode->i_size); ++ ++ down_write(&EXT3_I(inode)->truncate_sem); ++ ext3_ext_invalidate_cache(&tree); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need not scaning at all, ++ * because page truncation is enough ++ */ ++ if (ext3_orphan_add(handle, inode)) ++ goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ last_block = (inode->i_size + sb->s_blocksize - 1) ++ >> EXT3_BLOCK_SIZE_BITS(sb); ++ err = ext3_ext_remove_space(&tree, last_block, 0xffffffff); ++ + /* In a multi-transaction truncate, we only make the final + * transaction synchronous */ + if (IS_SYNC(inode)) @@ -1502,7 +2103,7 @@ + if (inode->i_nlink) + ext3_orphan_del(handle, inode); + -+ up(&EXT3_I(inode)->i_ext_sem); ++ up_write(&EXT3_I(inode)->truncate_sem); + ext3_journal_stop(handle, inode); +} + @@ -1512,29 +2113,12 @@ + */ +int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) +{ -+ struct ext3_inode_info *ei = EXT3_I(inode); -+ int depth = ei->i_depth + 1; ++ struct ext3_extents_tree tree; + int needed; + -+ /* -+ * the worste case we're expecting is creation of the -+ * new root (growing in depth) with index splitting -+ * for splitting we have to consider depth + 1 because -+ * previous growing could increase it -+ */ -+ -+ /* -+ * growing in depth: -+ * block allocation + new root + old root -+ */ -+ needed = EXT3_ALLOC_NEEDED + 2; -+ -+ /* index split. we may need: -+ * allocate intermediate indexes and new leaf -+ * change two blocks at each level, but root -+ * modify root block (inode) -+ */ -+ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ext3_init_tree_desc(&tree, inode); ++ ++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); + + /* caller want to allocate num blocks */ + needed *= num; @@ -1550,53 +2134,124 @@ + return needed; +} + -+/* -+ * called at mount time -+ */ -+void ext3_ext_init(struct super_block *sb) ++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) +{ -+ /* -+ * possible initialization would be here -+ */ ++ struct ext3_extents_tree tree; + -+ if (test_opt(sb, EXTENTS)) -+ printk("EXT3-fs: file extents enabled\n"); -+ spin_lock_init(&EXT3_SB(sb)->s_ext_lock); ++ ext3_init_tree_desc(&tree, inode); ++ ext3_extent_tree_init(handle, &tree); +} + -+/* -+ * called at umount time -+ */ -+void ext3_ext_release(struct super_block *sb) ++static int ++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newex, int exist) +{ -+ struct ext3_sb_info *sbi = EXT3_SB(sb); -+ -+ /* show collected stats */ -+ if (sbi->s_ext_count && sbi->s_ext_extents) -+ printk("EXT3-fs: min depth - %d, max depth - %d, " -+ "ave. depth - %d, ave. blocks/extent - %d\n", -+ sbi->s_ext_mindepth, -+ sbi->s_ext_maxdepth, -+ sbi->s_ext_sum / sbi->s_ext_count, -+ sbi->s_ext_blocks / sbi->s_ext_extents); ++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; ++ ++ if (!exist) ++ return EXT_CONTINUE; ++ if (buf->err < 0) ++ return EXT_BREAK; ++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) ++ return EXT_BREAK; ++ ++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { ++ buf->err++; ++ buf->cur += sizeof(*newex); ++ } else { ++ buf->err = -EFAULT; ++ return EXT_BREAK; ++ } ++ return EXT_CONTINUE; +} + ---- linux-2.4.20-vanilla/fs/ext3/ialloc.c~ext3-extents-2.4.20 2003-09-15 18:54:58.000000000 +0400 -+++ linux-2.4.20-vanilla-alexey/fs/ext3/ialloc.c 2003-09-15 19:31:40.000000000 +0400 -@@ -569,6 +569,10 @@ repeat: - inode->u.ext3_i.i_prealloc_count = 0; - #endif - inode->u.ext3_i.i_block_group = i; -+ if (test_opt(sb, EXTENTS)) -+ inode->u.ext3_i.i_flags |= EXT3_EXTENTS_FL; -+ inode->u.ext3_i.i_depth = 0; -+ sema_init(&inode->u.ext3_i.i_ext_sem, 1); - - if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) - inode->i_flags |= S_SYNC; ---- linux-2.4.20-vanilla/fs/ext3/inode.c~ext3-extents-2.4.20 2003-09-15 18:54:58.000000000 +0400 -+++ linux-2.4.20-vanilla-alexey/fs/ext3/inode.c 2003-09-15 19:53:10.000000000 +0400 -@@ -848,6 +848,15 @@ changed: ++static int ++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int exist) ++{ ++ struct ext3_extent_tree_stats *buf = ++ (struct ext3_extent_tree_stats *) tree->private; ++ int depth; ++ ++ if (!exist) ++ return EXT_CONTINUE; ++ ++ depth = EXT_DEPTH(tree); ++ buf->extents_num++; ++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) ++ buf->leaf_num++; ++ return EXT_CONTINUE; ++} ++ ++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err = 0; ++ ++ if (cmd == EXT3_IOC_GET_EXTENTS) { ++ struct ext3_extent_buf buf; ++ struct ext3_extents_tree tree; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ ++ ext3_init_tree_desc(&tree, inode); ++ buf.cur = buf.buffer; ++ buf.err = 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, buf.start, 0xffffffff, ++ ext3_ext_store_extent_cb); ++ if (err == 0) ++ err = buf.err; ++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { ++ struct ext3_extent_tree_stats buf; ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ buf.depth = EXT_DEPTH(&tree); ++ buf.extents_num = 0; ++ buf.leaf_num = 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, 0, 0xffffffff, ++ ext3_ext_collect_stats_cb); ++ if (!err) ++ err = copy_to_user((void *) arg, &buf, sizeof(buf)); ++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { ++ struct ext3_extents_tree tree; ++ ext3_init_tree_desc(&tree, inode); ++ err = EXT_DEPTH(&tree); ++ } ++ ++ return err; ++} ++ +Index: linux-2.4.20/fs/ext3/ialloc.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/ialloc.c 2004-01-23 19:00:25.000000000 +0300 ++++ linux-2.4.20/fs/ext3/ialloc.c 2004-01-24 00:45:20.000000000 +0300 +@@ -593,11 +593,13 @@ + iloc.bh = NULL; + goto fail; + } ++ if (test_opt(sb, EXTENTS)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ } + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (err) goto fail; + +- +- + unlock_super (sb); + if(DQUOT_ALLOC_INODE(inode)) { + DQUOT_DROP(inode); +Index: linux-2.4.20/fs/ext3/inode.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/inode.c 2004-01-23 19:00:25.000000000 +0300 ++++ linux-2.4.20/fs/ext3/inode.c 2004-01-24 04:34:04.000000000 +0300 +@@ -848,6 +848,15 @@ goto reread; } @@ -1612,7 +2267,7 @@ /* * The BKL is not held on entry here. */ -@@ -861,7 +870,7 @@ static int ext3_get_block(struct inode * +@@ -861,7 +870,7 @@ handle = ext3_journal_current_handle(); J_ASSERT(handle != 0); } @@ -1621,7 +2276,7 @@ return ret; } -@@ -879,7 +888,7 @@ struct buffer_head *ext3_getblk(handle_t +@@ -879,7 +888,7 @@ dummy.b_state = 0; dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); @@ -1630,7 +2285,7 @@ if (!*errp && buffer_mapped(&dummy)) { struct buffer_head *bh; bh = sb_getblk(inode->i_sb, dummy.b_blocknr); -@@ -1403,7 +1412,7 @@ struct address_space_operations ext3_aop +@@ -1403,7 +1412,7 @@ * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ @@ -1639,7 +2294,7 @@ struct address_space *mapping, loff_t from) { unsigned long index = from >> PAGE_CACHE_SHIFT; -@@ -1888,6 +1897,9 @@ void ext3_truncate(struct inode * inode) +@@ -1888,6 +1897,9 @@ ext3_discard_prealloc(inode); @@ -1649,24 +2304,7 @@ handle = start_transaction(inode); if (IS_ERR(handle)) return; /* AKPM: return what? */ -@@ -2200,6 +2212,8 @@ void ext3_read_inode(struct inode * inod - inode->u.ext3_i.i_prealloc_count = 0; - #endif - inode->u.ext3_i.i_block_group = iloc.block_group; -+ inode->u.ext3_i.i_depth = raw_inode->osd2.linux2.l_i_depth; -+ sema_init(&inode->u.ext3_i.i_ext_sem, 1); - - /* - * NOTE! The in-memory inode i_data array is in little-endian order -@@ -2321,6 +2335,7 @@ static int ext3_do_update_inode(handle_t - raw_inode->i_fsize = 0; - } - #endif -+ raw_inode->osd2.linux2.l_i_depth = inode->u.ext3_i.i_depth; - raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl); - if (!S_ISREG(inode->i_mode)) { - raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl); -@@ -2525,6 +2540,9 @@ int ext3_writepage_trans_blocks(struct i +@@ -2537,6 +2549,9 @@ int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; int ret; @@ -1676,7 +2314,7 @@ if (ext3_should_journal_data(inode)) ret = 3 * (bpp + indirects) + 2; else -@@ -2961,7 +2979,7 @@ int ext3_prep_san_write(struct inode *in +@@ -2973,7 +2988,7 @@ /* alloc blocks one by one */ for (i = 0; i < nblocks; i++) { @@ -1685,7 +2323,7 @@ &bh_tmp, 1); if (ret) break; -@@ -3022,7 +3040,7 @@ int ext3_map_inode_page(struct inode *in +@@ -3049,7 +3064,7 @@ if (blocks[i] != 0) continue; @@ -1694,50 +2332,71 @@ if (rc) { printk(KERN_INFO "ext3_map_inode_page: error %d " "allocating block %ld\n", rc, iblock); ---- linux-2.4.20-vanilla/fs/ext3/Makefile~ext3-extents-2.4.20 2003-09-15 18:54:58.000000000 +0400 -+++ linux-2.4.20-vanilla-alexey/fs/ext3/Makefile 2003-09-15 19:41:08.000000000 +0400 -@@ -12,7 +12,8 @@ O_TARGET := ext3.o - export-objs := ext3-exports.o +Index: linux-2.4.20/fs/ext3/Makefile +=================================================================== +--- linux-2.4.20.orig/fs/ext3/Makefile 2004-01-23 19:00:42.000000000 +0300 ++++ linux-2.4.20/fs/ext3/Makefile 2004-01-24 00:45:20.000000000 +0300 +@@ -13,7 +13,7 @@ obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ -- ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o -+ ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o \ -+ extents.o + ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o \ +- xattr_trusted.o ++ xattr_trusted.o extents.o obj-m := $(O_TARGET) export-objs += xattr.o ---- linux-2.4.20-vanilla/fs/ext3/super.c~ext3-extents-2.4.20 2003-09-15 18:54:59.000000000 +0400 -+++ linux-2.4.20-vanilla-alexey/fs/ext3/super.c 2003-09-15 19:42:57.000000000 +0400 -@@ -619,6 +619,7 @@ void ext3_put_super (struct super_block - kdev_t j_dev = sbi->s_journal->j_dev; +Index: linux-2.4.20/fs/ext3/super.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/super.c 2004-01-23 19:00:25.000000000 +0300 ++++ linux-2.4.20/fs/ext3/super.c 2004-01-24 04:30:14.000000000 +0300 +@@ -623,6 +623,7 @@ int i; + J_ASSERT(sbi->s_delete_inodes == 0); + ext3_ext_release(sb); - ext3_stop_delete_thread(sbi); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); -@@ -765,6 +766,10 @@ static int parse_options (char * options - "EXT3 Check option not supported\n"); - #endif + if (!(sb->s_flags & MS_RDONLY)) { +@@ -796,6 +797,10 @@ + return 0; + } } + else if (!strcmp (this_char, "extents")) -+ set_opt (sbi->s_mount_opt, EXTENTS); -+ else if (!strcmp (this_char, "extdebug")) -+ set_opt (sbi->s_mount_opt, EXTDEBUG); - else if (!strcmp (this_char, "debug")) - set_opt (*mount_options, DEBUG); - else if (!strcmp (this_char, "errors")) { -@@ -1478,6 +1483,7 @@ struct super_block * ext3_read_super (st - test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": ++ set_opt (*mount_options, EXTENTS); ++ else if (!strcmp (this_char, "extdebug")) ++ set_opt (*mount_options, EXTDEBUG); + else if (!strcmp (this_char, "grpid") || + !strcmp (this_char, "bsdgroups")) + set_opt (*mount_options, GRPID); +@@ -1485,6 +1490,8 @@ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": "writeback"); -+ ext3_ext_init(sb); ++ ext3_ext_init(sb); ++ return sb; ---- linux-2.4.20-vanilla/include/linux/ext3_fs.h~ext3-extents-2.4.20 2003-09-15 18:54:58.000000000 +0400 -+++ linux-2.4.20-vanilla-alexey/include/linux/ext3_fs.h 2003-09-15 20:15:52.000000000 +0400 -@@ -184,6 +184,7 @@ struct ext3_group_desc + failed_mount3: +Index: linux-2.4.20/fs/ext3/ioctl.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/ioctl.c 2004-01-13 17:00:09.000000000 +0300 ++++ linux-2.4.20/fs/ext3/ioctl.c 2004-01-24 14:54:31.000000000 +0300 +@@ -189,6 +189,10 @@ + return ret; + } + #endif ++ case EXT3_IOC_GET_EXTENTS: ++ case EXT3_IOC_GET_TREE_STATS: ++ case EXT3_IOC_GET_TREE_DEPTH: ++ return ext3_ext_ioctl(inode, filp, cmd, arg); + default: + return -ENOTTY; + } +Index: linux-2.4.20/include/linux/ext3_fs.h +=================================================================== +--- linux-2.4.20.orig/include/linux/ext3_fs.h 2004-01-23 19:00:25.000000000 +0300 ++++ linux-2.4.20/include/linux/ext3_fs.h 2004-01-24 01:28:06.000000000 +0300 +@@ -184,6 +184,7 @@ #define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ #define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ @@ -1745,36 +2404,34 @@ #define EXT3_FL_USER_VISIBLE 0x00005FFF /* User visible flags */ #define EXT3_FL_USER_MODIFIABLE 0x000000FF /* User modifiable flags */ -@@ -244,7 +245,7 @@ struct ext3_inode { - struct { - __u8 l_i_frag; /* Fragment number */ - __u8 l_i_fsize; /* Fragment size */ -- __u16 i_pad1; -+ __u16 l_i_depth; - __u16 l_i_uid_high; /* these 2 fields */ - __u16 l_i_gid_high; /* were reserved2[0] */ - __u32 l_i_reserved2; -@@ -325,6 +326,8 @@ struct ext3_inode { +@@ -208,6 +209,9 @@ + #ifdef CONFIG_JBD_DEBUG + #define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) + #endif ++#define EXT3_IOC_GET_EXTENTS _IOR('f', 5, long) ++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 6, long) ++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 7, long) + + /* + * Structure of an inode on the disk +@@ -327,6 +331,8 @@ #define EXT3_MOUNT_IOPEN 0x8000 /* Allow access via iopen */ #define EXT3_MOUNT_IOPEN_NOPRIV 0x10000 /* Make iopen world-readable */ #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ -+#define EXT3_MOUNT_EXTENTS 0x40000 /* Extents support */ -+#define EXT3_MOUNT_EXTDEBUG 0x80000 /* Extents debug */ ++#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H -@@ -702,6 +705,10 @@ extern void ext3_discard_prealloc (struc - extern void ext3_dirty_inode(struct inode *); - extern int ext3_change_inode_journal_flag(struct inode *, int); - extern void ext3_truncate (struct inode *); +@@ -687,6 +693,7 @@ + extern unsigned long ext3_count_free (struct buffer_head *, unsigned); + + /* inode.c */ +extern int ext3_block_truncate_page(handle_t *, struct address_space *, loff_t); -+extern int ext3_forget(handle_t *handle, int is_metadata, -+ struct inode *inode, struct buffer_head *bh, -+ int blocknr); - #ifdef EXT3_DELETE_THREAD - extern void ext3_truncate_thread(struct inode *inode); - #endif -@@ -765,6 +772,13 @@ extern struct inode_operations ext3_spec + extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +@@ -767,6 +774,14 @@ extern struct inode_operations ext3_symlink_inode_operations; extern struct inode_operations ext3_fast_symlink_inode_operations; @@ -1785,40 +2442,219 @@ +extern void ext3_ext_truncate(struct inode *); +extern void ext3_ext_init(struct super_block *); +extern void ext3_ext_release(struct super_block *); ++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); #endif /* __KERNEL__ */ ---- linux-2.4.20-vanilla/include/linux/ext3_fs_i.h~ext3-extents-2.4.20 2003-09-15 10:16:38.000000000 +0400 -+++ linux-2.4.20-vanilla-alexey/include/linux/ext3_fs_i.h 2003-09-15 20:14:40.000000000 +0400 -@@ -73,6 +73,10 @@ struct ext3_inode_info { - * by other means, so we have truncate_sem. - */ - struct rw_semaphore truncate_sem; -+ -+ /* extents-related data */ -+ struct semaphore i_ext_sem; -+ __u16 i_depth; - }; - - #endif /* _LINUX_EXT3_FS_I */ ---- linux-2.4.20-vanilla/include/linux/ext3_fs_sb.h~ext3-extents-2.4.20 2003-09-15 18:54:57.000000000 +0400 -+++ linux-2.4.20-vanilla-alexey/include/linux/ext3_fs_sb.h 2003-09-15 20:14:40.000000000 +0400 -@@ -86,6 +86,16 @@ struct ext3_sb_info { - wait_queue_head_t s_delete_thread_queue; - wait_queue_head_t s_delete_waiter_queue; - #endif +Index: linux-2.4.20/include/linux/ext3_extents.h +=================================================================== +--- linux-2.4.20.orig/include/linux/ext3_extents.h 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.20/include/linux/ext3_extents.h 2004-01-24 15:15:11.000000000 +0300 +@@ -0,0 +1,207 @@ ++/* ++ * Copyright (C) 2003 Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if CHECK_BINSEARCH defined, then results of binary search ++ * will be checked by linear search ++ */ ++#define CHECK_BINSEARCH_ ++ ++/* ++ * if EXT_DEBUG is defined you can use 'extdebug' mount option ++ * to get lots of info what's going on ++ */ ++#define EXT_DEBUG ++#ifdef EXT_DEBUG ++#define ext_debug(tree,fmt,a...) \ ++do { \ ++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0); ++#else ++#define ext_debug(tree,fmt,a...) ++#endif ++ ++/* ++ * if EXT_STATS is defined then stats numbers are collected ++ * these number will be displayed at umount time ++ */ ++#define EXT_STATS_ ++ ++ ++#define EXT3_ALLOC_NEEDED 2 /* block bitmap + group descriptor */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 mean there is no tree yet. all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 e_block; /* first logical block extent covers */ ++ __u32 e_start; /* first physical block extents lives */ ++ __u32 e_num; /* number of blocks covered by extent */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 e_block; /* index covers logical blocks from 'block' */ ++ __u32 e_leaf; /* pointer to the physical block of the next * ++ * level. leaf or next index could bet here */ ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 e_magic; /* probably will support different formats */ ++ __u16 e_num; /* number of valid entries */ ++ __u16 e_max; /* capacity of store in entries */ ++ __u16 e_depth; /* has tree real underlaying blocks? */ ++}; ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++/* ++ * structure for external API ++ */ ++ ++ ++/* ++ * ext3_extents_tree is used to pass initial information ++ * to top-level extents API ++ */ ++struct ext3_extents_tree { ++ struct inode *inode; /* inode which tree belongs to */ ++ void *root; /* ptr to data top of tree resides at */ ++ void *buffer; /* will be passed as arg to ^^ routines */ ++ int buffer_len; ++ void *private; ++ struct ext3_extent *cex;/* last found extent */ ++ int (*get_write_access)(handle_t *h, void *buffer); ++ int (*mark_buffer_dirty)(handle_t *h, void *buffer); ++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); ++ int (*remove_extent_credits)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*remove_extent)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*new_block)(handle_t *, struct ext3_extents_tree *, ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); ++}; ++ ++/* ++ * to be called by ext3_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext3_ext_walk_space(), see below ++ * callback must return valid extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, ++ struct ext3_ext_path *, ++ struct ext3_extent *, int); ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++ ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->e_num < (__path__)->p_hdr->e_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_num - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_num - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_max - 1) ++ ++#define EXT_ROOT_HDR(tree) \ ++ ((struct ext3_extent_header *) (tree)->root) ++#define EXT_BLOCK_HDR(bh) \ ++ ((struct ext3_extent_header *) (bh)->b_data) ++#define EXT_DEPTH(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->e_depth) ++ ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++ ++/* ++ * this structure is used to gather extents from the tree via ioctl ++ */ ++struct ext3_extent_buf { ++ unsigned long start; ++ int buflen; ++ void *buffer; ++ void *cur; ++ int err; ++}; ++ ++/* ++ * this structure is used to collect stats info about the tree ++ */ ++struct ext3_extent_tree_stats { ++ int depth; ++ int extents_num; ++ int leaf_num; ++}; ++ ++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); ++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); ++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); ++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); ++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); ++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); ++ + -+ /* extents */ -+ int s_ext_debug; -+ int s_ext_mindepth; -+ int s_ext_maxdepth; -+ int s_ext_sum; -+ int s_ext_count; -+ spinlock_t s_ext_lock; -+ int s_ext_extents; -+ int s_ext_blocks; - }; - - #endif /* _LINUX_EXT3_FS_SB */ - -_ diff --git a/lustre/kernel_patches/patches/ext3-extents-in-ea-2.4.20.patch b/lustre/kernel_patches/patches/ext3-extents-in-ea-2.4.20.patch new file mode 100644 index 0000000..bff003b --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-in-ea-2.4.20.patch @@ -0,0 +1,343 @@ +Index: linux-2.4.20/fs/ext3/extents-in-ea.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/extents-in-ea.c 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.20/fs/ext3/extents-in-ea.c 2004-01-24 14:54:04.000000000 +0300 +@@ -0,0 +1,202 @@ ++/* ++ * Copyright (C) 2003 Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static int ext3_get_ea_write_access(handle_t *handle, void *buffer) ++{ ++ struct buffer_head *bh = (struct buffer_head *) buffer; ++ return ext3_journal_get_write_access(handle, bh); ++} ++ ++static int ext3_mark_ea_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct buffer_head *bh = (struct buffer_head *) buffer; ++ ext3_journal_dirty_metadata(handle, bh); ++ return 0; ++} ++ ++int ext3_init_tree_in_ea_desc(struct ext3_extents_tree *tree, ++ struct inode *inode, int name_index, ++ const char *eaname) ++{ ++ struct buffer_head *bh; ++ int offset, err, size; ++ ++ err = ext3_xattr_get_ea_loc(inode, name_index, eaname, ++ &bh, &offset, &size); ++ if (err) ++ return err; ++ ++ EXT_ASSERT(bh); ++ EXT_ASSERT(size >= sizeof(struct ext3_extent_header) ++ + sizeof(struct ext3_extent)); ++ tree->inode = inode; ++ tree->root = (void *) bh->b_data + offset; ++ tree->buffer_len = size; ++ tree->buffer = (void *) bh; ++ tree->get_write_access = ext3_get_ea_write_access; ++ tree->mark_buffer_dirty = ext3_mark_ea_buffer_dirty; ++ tree->mergable = NULL; ++ tree->remove_extent = NULL; ++ tree->remove_extent_credits = NULL; ++ tree->new_block = NULL; ++ tree->cex = NULL; /* FIXME: add cache store later */ ++ return 0; ++} ++ ++void ext3_release_tree_in_ea_desc(struct ext3_extents_tree *tree) ++{ ++ struct buffer_head *bh; ++ ++ bh = (struct buffer_head *) tree->buffer; ++ EXT_ASSERT(bh); ++ brelse(bh); ++} ++ ++int ext3_init_tree_in_ea(struct inode *inode, int name_index, ++ const char *eaname, int size) ++{ ++ struct ext3_extents_tree tree; ++ handle_t *handle; ++ char *root; ++ int err; ++ ++ root = kmalloc(size, GFP_USER); ++ if (!root) ++ return -ENOMEM; ++ memset(root, 0, size); ++ ++ /* first, create ea to store root of the tree */ ++ handle = ext3_journal_start(inode, EXT3_ALLOC_NEEDED + 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if ((err = ext3_xattr_set(handle, inode, name_index, ++ eaname, root, size, 0))) ++ goto out; ++ if ((err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname))) ++ goto out; ++ err = ext3_extent_tree_init(handle, &tree); ++ ext3_release_tree_in_ea_desc(&tree); ++out: ++ ext3_journal_stop(handle, inode); ++ kfree(root); ++ return err; ++} ++ ++static int ++ext3_ext_in_ea_new_extent(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newex, int exist) ++{ ++ handle_t *handle; ++ int needed, err; ++ ++ if (exist) ++ return EXT_CONTINUE; ++ ++ needed = ext3_ext_calc_credits_for_insert(tree, path); ++ handle = ext3_journal_start(tree->inode, needed); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ /* insert new extent */ ++ newex->e_start = 0; ++ err = ext3_ext_insert_extent(handle, tree, path, newex); ++ if (!err) ++ ext3_journal_stop(handle, tree->inode); ++ ++ return err; ++} ++ ++int ext3_ext_in_ea_alloc_space(struct inode *inode, int name_index, ++ const char *eaname, unsigned long from, ++ unsigned long num) ++{ ++ struct ext3_extents_tree tree; ++ int err; ++ ++ err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname); ++ if (err == 0) { ++ err = ext3_ext_walk_space(&tree, from, num, ++ ext3_ext_in_ea_new_extent); ++ ext3_release_tree_in_ea_desc(&tree); ++ } ++ return err; ++} ++ ++int ext3_ext_in_ea_remove_space(struct inode *inode, int name_index, ++ const char *eaname, unsigned long from, ++ unsigned long num) ++{ ++ struct ext3_extents_tree tree; ++ int err; ++ ++ err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname); ++ if (err == 0) { ++ err = ext3_ext_remove_space(&tree, from, num); ++ ext3_release_tree_in_ea_desc(&tree); ++ } ++ return err; ++} ++ ++int ext3_ext_in_ea_presence(struct inode *inode, int name_index, ++ const char *eaname, unsigned long block) ++{ ++ struct ext3_extents_tree tree; ++ struct ext3_ext_path *path; ++ struct ext3_extent *ex; ++ int err, depth; ++ ++ err = ext3_init_tree_in_ea_desc(&tree, inode, name_index, eaname); ++ if (err) ++ return err; ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(&tree, block, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ goto out; ++ } ++ ++ depth = EXT_DEPTH(&tree); ++ ex = path[depth].p_ext; ++ if (!ex) { ++ /* there is no extent yet */ ++ goto out; ++ } ++ ++ if (block >= ex->e_block && block < ex->e_block + ex->e_num) ++ err = 1; ++out: ++ ext3_release_tree_in_ea_desc(&tree); ++ return err; ++} ++ +Index: linux-2.4.20/fs/ext3/Makefile +=================================================================== +--- linux-2.4.20.orig/fs/ext3/Makefile 2004-01-24 00:45:20.000000000 +0300 ++++ linux-2.4.20/fs/ext3/Makefile 2004-01-24 14:34:27.000000000 +0300 +@@ -17,7 +17,7 @@ + obj-m := $(O_TARGET) + + export-objs += xattr.o +-obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o ++obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o extents-in-ea.o + obj-$(CONFIG_EXT3_FS_XATTR_USER) += xattr_user.o + + include $(TOPDIR)/Rules.make +Index: linux-2.4.20/fs/ext3/xattr.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/xattr.c 2004-01-23 19:00:43.000000000 +0300 ++++ linux-2.4.20/fs/ext3/xattr.c 2004-01-24 14:34:27.000000000 +0300 +@@ -771,7 +771,8 @@ + */ + int + ext3_xattr_ibody_find(struct inode *inode, int name_index, +- const char *name, struct ext3_xattr_entry *rentry, int *free) ++ const char *name, struct ext3_xattr_entry *rentry, int *free, ++ struct buffer_head **bh, int *offset) + { + struct ext3_xattr_entry *last; + struct ext3_inode *raw_inode; +@@ -818,6 +819,15 @@ + name_len == last->e_name_len && + !memcmp(name, last->e_name, name_len)) { + memcpy(rentry, last, sizeof(struct ext3_xattr_entry)); ++ if (offset) { ++ void *voff; ++ voff = start + le16_to_cpu(last->e_value_offs); ++ *offset = voff - (void *) iloc.bh->b_data; ++ } ++ if (bh) { ++ get_bh(iloc.bh); ++ *bh = iloc.bh; ++ } + ret = 0; + } else { + *free -= EXT3_XATTR_LEN(last->e_name_len); +@@ -838,7 +848,8 @@ + */ + int + ext3_xattr_block_find(struct inode *inode, int name_index, const char *name, +- struct ext3_xattr_entry *rentry, int *free) ++ struct ext3_xattr_entry *rentry, int *free, ++ struct buffer_head **tbh, int *offset) + { + struct buffer_head *bh = NULL; + struct ext3_xattr_entry *entry; +@@ -881,6 +892,12 @@ + memcmp(name, entry->e_name, name_len) == 0) { + memcpy(rentry, entry, sizeof(struct ext3_xattr_entry)); + error = 0; ++ if (offset) ++ *offset = le16_to_cpu(entry->e_value_offs); ++ if (tbh) { ++ get_bh(bh); ++ *tbh = bh; ++ } + } else { + *free -= EXT3_XATTR_LEN(entry->e_name_len); + *free -= le32_to_cpu(entry->e_value_size); +@@ -1073,7 +1090,8 @@ + return -ERANGE; + + /* try to find attribute in inode body */ +- err = ext3_xattr_ibody_find(inode, name_index, name, &entry, &free1); ++ err = ext3_xattr_ibody_find(inode, name_index, name, ++ &entry, &free1, NULL, NULL); + if (err == 0) { + /* found EA in inode */ + found = 1; +@@ -1082,7 +1100,7 @@ + /* there is no such attribute in inode body */ + /* try to find attribute in dedicated block */ + err = ext3_xattr_block_find(inode, name_index, name, +- &entry, &free2); ++ &entry, &free2, NULL, NULL); + if (err != 0 && err != -ENOENT) { + /* not found EA in block */ + goto finish; +@@ -1138,6 +1156,38 @@ + return err; + } + ++int ext3_xattr_get_ea_loc(struct inode *inode, int name_index, ++ const char *name, struct buffer_head **bh, ++ int *offset, int *size) ++{ ++ int free1 = -1, free2 = -1, err, name_len; ++ struct ext3_xattr_entry entry; ++ ++ ea_idebug(inode, "name=%d.%s", name_index, name); ++ ++ if (name == NULL) ++ return -EINVAL; ++ name_len = strlen(name); ++ if (name_len > 255) ++ return -ERANGE; ++ ++ down(&ext3_xattr_sem); ++ ++ /* try to find attribute in inode body */ ++ err = ext3_xattr_ibody_find(inode, name_index, name, ++ &entry, &free1, bh, offset); ++ if (err == -ENOENT) { ++ /* there is no such attribute in inode body */ ++ /* try to find attribute in dedicated block */ ++ err = ext3_xattr_block_find(inode, name_index, name, ++ &entry, &free2, bh, offset); ++ } ++ if (err == 0 && size) ++ *size = le32_to_cpu(entry.e_value_size); ++ up(&ext3_xattr_sem); ++ return err; ++} ++ + /* + * ext3_xattr_block_set() + * +Index: linux-2.4.20/include/linux/ext3_xattr.h +=================================================================== +--- linux-2.4.20.orig/include/linux/ext3_xattr.h 2004-01-24 14:22:28.000000000 +0300 ++++ linux-2.4.20/include/linux/ext3_xattr.h 2004-01-24 14:34:27.000000000 +0300 +@@ -80,6 +80,7 @@ + extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); + extern int ext3_xattr_list(struct inode *, char *, size_t); + extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, const void *, size_t, int); ++extern int ext3_xattr_get_ea_loc(struct inode *, int, const char *, struct buffer_head **, int *, int *); + + extern void ext3_xattr_delete_inode(handle_t *, struct inode *); + extern void ext3_xattr_put_super(struct super_block *); diff --git a/lustre/kernel_patches/patches/ext3-extents-in-ea-ioctl-2.4.20.patch b/lustre/kernel_patches/patches/ext3-extents-in-ea-ioctl-2.4.20.patch new file mode 100644 index 0000000..027b0ea --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-in-ea-ioctl-2.4.20.patch @@ -0,0 +1,154 @@ +Index: linux-2.4.20/fs/ext3/extents-in-ea.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/extents-in-ea.c 2004-01-24 14:54:37.000000000 +0300 ++++ linux-2.4.20/fs/ext3/extents-in-ea.c 2004-01-24 14:55:10.000000000 +0300 +@@ -200,3 +200,112 @@ + return err; + } + ++static int ++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newex, int exist) ++{ ++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; ++ ++ if (!exist) ++ return EXT_CONTINUE; ++ if (buf->err < 0) ++ return EXT_BREAK; ++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) ++ return EXT_BREAK; ++ ++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { ++ buf->err++; ++ buf->cur += sizeof(*newex); ++ } else { ++ buf->err = -EFAULT; ++ return EXT_BREAK; ++ } ++ return EXT_CONTINUE; ++} ++ ++static int ++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int exist) ++{ ++ struct ext3_extent_tree_stats *buf = ++ (struct ext3_extent_tree_stats *) tree->private; ++ int depth; ++ ++ if (!exist) ++ return EXT_CONTINUE; ++ ++ depth = EXT_DEPTH(tree); ++ buf->extents_num++; ++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) ++ buf->leaf_num++; ++ return EXT_CONTINUE; ++} ++ ++struct ea_tree_desc { ++ int name_index; ++ char eaname[256]; ++}; ++ ++int ext3_ext_in_ea_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg) ++{ ++ int err = 0; ++ ++ if (cmd == EXT3_IOC_EA_TREE_INIT) { ++ struct ea_tree_desc desc; ++ ++ if (copy_from_user(&desc, (void *) arg, sizeof(desc))) ++ return -EFAULT; ++ err = ext3_init_tree_in_ea(inode, desc.name_index, ++ desc.eaname, 64); ++ } else if (cmd == EXT3_IOC_GET_EA_EXTENTS) { ++ struct ext3_extents_tree tree; ++ struct ext3_extent_buf buf; ++ struct ea_tree_desc desc; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ if (copy_from_user(&desc, buf.cur, sizeof(desc))) ++ return -EFAULT; ++ err = ext3_init_tree_in_ea_desc(&tree, inode, ++ desc.name_index, desc.eaname); ++ if (err) ++ goto out; ++ buf.cur = buf.buffer; ++ buf.err = 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, buf.start, 0xffffffff, ++ ext3_ext_store_extent_cb); ++ if (err == 0) ++ err = buf.err; ++ ext3_release_tree_in_ea_desc(&tree); ++ } else if (cmd == EXT3_IOC_EA_TREE_ALLOCATE) { ++ struct ext3_extent_buf buf; ++ struct ea_tree_desc desc; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ if (copy_from_user(&desc, buf.cur, sizeof(desc))) ++ return -EFAULT; ++ err = ext3_ext_in_ea_alloc_space(inode, desc.name_index, ++ desc.eaname, buf.start, ++ buf.err); ++ } else if (cmd == EXT3_IOC_EA_TREE_REMOVE) { ++ struct ext3_extent_buf buf; ++ struct ea_tree_desc desc; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ if (copy_from_user(&desc, buf.cur, sizeof(desc))) ++ return -EFAULT; ++ err = ext3_ext_in_ea_remove_space(inode, desc.name_index, ++ desc.eaname, buf.start, ++ buf.err); ++ } ++ ++out: ++ return err; ++} ++ +Index: linux-2.4.20/fs/ext3/ioctl.c +=================================================================== +--- linux-2.4.20.orig/fs/ext3/ioctl.c 2004-01-24 14:54:31.000000000 +0300 ++++ linux-2.4.20/fs/ext3/ioctl.c 2004-01-24 14:55:04.000000000 +0300 +@@ -193,6 +193,13 @@ + case EXT3_IOC_GET_TREE_STATS: + case EXT3_IOC_GET_TREE_DEPTH: + return ext3_ext_ioctl(inode, filp, cmd, arg); ++ case EXT3_IOC_GET_EA_EXTENTS: ++ case EXT3_IOC_GET_EA_TREE_DEPTH: ++ case EXT3_IOC_GET_EA_TREE_STATS: ++ case EXT3_IOC_EA_TREE_INIT: ++ case EXT3_IOC_EA_TREE_ALLOCATE: ++ case EXT3_IOC_EA_TREE_REMOVE: ++ return ext3_ext_in_ea_ioctl(inode, filp, cmd, arg); + default: + return -ENOTTY; + } +Index: linux-2.4.20/include/linux/ext3_fs.h +=================================================================== +--- linux-2.4.20.orig/include/linux/ext3_fs.h 2004-01-24 01:28:06.000000000 +0300 ++++ linux-2.4.20/include/linux/ext3_fs.h 2004-01-24 14:55:04.000000000 +0300 +@@ -213,6 +213,14 @@ + #define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 6, long) + #define EXT3_IOC_GET_TREE_STATS _IOR('f', 7, long) + ++#define EXT3_IOC_GET_EA_EXTENTS _IOR('f', 10, long) ++#define EXT3_IOC_GET_EA_TREE_DEPTH _IOR('f', 11, long) ++#define EXT3_IOC_GET_EA_TREE_STATS _IOR('f', 12, long) ++#define EXT3_IOC_EA_TREE_INIT _IOW('f', 13, long) ++#define EXT3_IOC_EA_TREE_ALLOCATE _IOW('f', 14, long) ++#define EXT3_IOC_EA_TREE_REMOVE _IOW('f', 15, long) ++ ++ + /* + * Structure of an inode on the disk + */ -- 1.8.3.1