From a1d1a0c45b12ca0d176acafc92f997dc82355a97 Mon Sep 17 00:00:00 2001 From: alex Date: Mon, 15 Sep 2003 16:29:17 +0000 Subject: [PATCH] - port of extents onto 2.4.20-vanilla NOTE: tested by dbench 4/8 in UML only --- .../patches/ext3-extents-2.4.20.patch | 1824 ++++++++++++++++++++ lustre/kernel_patches/pc/ext3-extents-2.4.20.pc | 8 + 2 files changed, 1832 insertions(+) create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.4.20.patch create mode 100644 lustre/kernel_patches/pc/ext3-extents-2.4.20.pc diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.20.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.20.patch new file mode 100644 index 0000000..0daad4a --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.20.patch @@ -0,0 +1,1824 @@ + fs/ext3/Makefile | 3 + fs/ext3/extents.c | 1570 +++++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/ialloc.c | 4 + fs/ext3/inode.c | 28 + fs/ext3/super.c | 6 + include/linux/ext3_fs.h | 16 + include/linux/ext3_fs_i.h | 4 + include/linux/ext3_fs_sb.h | 10 + 8 files changed, 1634 insertions(+), 7 deletions(-) + +--- /dev/null 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.20-vanilla-alexey/fs/ext3/extents.c 2003-09-15 19:57:29.000000000 +0400 +@@ -0,0 +1,1570 @@ ++/* ++ * ++ * linux/fs/ext3/extents.c ++ * ++ * Extents support for EXT3 ++ * ++ * 07/08/2003 Alex Tomas ++ * ++ * TODO: ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - error handling ++ * - we could leak allocated block in some error cases ++ * - quick search for index/leaf in ext3_ext_find_extent() ++ * - tree reduction ++ * - cache last found extent ++ * - arch-independent ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if EXT_DEBUG defined you can use 'extdebug' mount option ++ * to get lots of info what's going on ++ */ ++#define EXT_DEBUG ++#ifdef EXT_DEBUG ++#define ext_debug(inode,fmt,a...) \ ++do { \ ++ if (test_opt((inode)->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0); ++#else ++#define ext_debug(inode,fmt,a...) ++#endif ++ ++#define EXT3_ALLOC_NEEDED 2 /* block bitmap + group descriptor */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 mean there is no tree yet. all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 e_block; /* first logical block extent covers */ ++ __u32 e_start; /* first physical block extents lives */ ++ __u32 e_num; /* number of blocks covered by extent */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 e_block; /* index covers logical blocks from 'block' */ ++ __u32 e_leaf; /* pointer to the physical block of the next * ++ * level. leaf or next index could bet here */ ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 e_num; /* number of valid entries */ ++ __u16 e_max; /* capacity of store in entries */ ++}; ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->e_num < (__path__)->p_hdr->e_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_num - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_num - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_max - 1) ++ ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ if (path->p_bh) { ++ /* path points to block */ ++ return ext3_journal_get_write_access(handle, path->p_bh); ++ } ++ ++ /* path points to leaf/index in inode body */ ++ return 0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ if (path->p_bh) { ++ /* path points to block */ ++ return ext3_journal_dirty_metadata(handle, path->p_bh); ++ } ++ ++ /* path points to leaf/index in inode body */ ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static inline int ext3_ext_space_block(struct inode *inode) ++{ ++ int size; ++ ++ size = (inode->i_sb->s_blocksize - sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; /* FIXME: for debug, remove this line */ ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_inode(struct inode *inode) ++{ ++ int size; ++ ++ size = (sizeof(EXT3_I(inode)->i_data) - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; /* FIXME: for debug, remove this line */ ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_inode_idx(struct inode *inode) ++{ ++ int size; ++ ++ size = (sizeof(EXT3_I(inode)->i_data) - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; /* FIXME: for debug, remove this line */ ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct inode *inode, struct ext3_ext_path *path) ++{ ++ int k, l = path->p_depth; ++ ++ ext_debug(inode, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(inode, " %d->%d", path->p_idx->e_block, ++ path->p_idx->e_leaf); ++ } else if (path->p_ext) { ++ ext_debug(inode, " %d:%d:%d", ++ path->p_ext->e_block, ++ path->p_ext->e_start, ++ path->p_ext->e_num); ++ } else ++ ext_debug(inode, " []"); ++ } ++ ext_debug(inode, "\n"); ++} ++ ++static void ext3_ext_show_leaf(struct inode *inode, struct ext3_ext_path *path) ++{ ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_extent_header *eh = path[depth].p_hdr; ++ struct ext3_extent *ex = EXT_FIRST_EXTENT(eh); ++ int i; ++ ++ for (i = 0; i < eh->e_num; i++, ex++) { ++ ext_debug(inode, "%d:%d:%d ", ++ ex->e_block, ex->e_start, ex->e_num); ++ } ++ ext_debug(inode, "\n"); ++} ++ ++static void ext3_ext_drop_refs(struct inode *inode, struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) ++ if (path->p_bh) { ++ brelse(path->p_bh); ++ path->p_bh = NULL; ++ } ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ depth = path->p_depth; ++ /* try to find previous block */ ++ if (path[depth].p_ext) ++ return path[depth].p_ext->e_start + ++ path[depth].p_ext->e_num - 1; ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour; ++} ++ ++static struct ext3_ext_path * ++ext3_ext_find_extent(struct inode *inode, int block, struct ext3_ext_path *path) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ struct ext3_extent_header *eh = (void *) ei->i_data; ++ struct ext3_extent_idx *ix; ++ struct buffer_head *bh; ++ struct ext3_extent *ex; ++ int depth, i, k, ppos = 0; ++ ++ eh = (struct ext3_extent_header *) ei->i_data; ++ ++ /* initialize capacity of leaf in inode for first time */ ++ if (eh->e_max == 0) ++ eh->e_max = ext3_ext_space_inode(inode); ++ i = depth = ei->i_depth; ++ EXT_ASSERT(i == 0 || eh->e_num > 0); ++ ++ /* account possible depth increase */ ++ if (!path) { ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), ++ GFP_NOFS); ++ if (!path) ++ return ERR_PTR(-ENOMEM); ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ ++ /* walk through the tree */ ++ while (i) { ++ ext_debug(inode, "depth %d: num %d, max %d\n", ++ ppos, eh->e_num, eh->e_max); ++ ix = EXT_FIRST_INDEX(eh); ++ if (eh->e_num) ++ path[ppos].p_idx = ix; ++ EXT_ASSERT(eh->e_num <= eh->e_max); ++ for (k = 0; k < eh->e_num; k++, ix++) { ++ ext_debug(inode, "index: %d -> %d\n", ++ ix->e_block, ix->e_leaf); ++ if (block < ix->e_block) ++ break; ++ path[ppos].p_idx = ix; ++ } ++ path[ppos].p_block = path[ppos].p_idx->e_leaf; ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ ++ bh = sb_bread(inode->i_sb, path[ppos].p_block); ++ if (!bh) { ++ ext3_ext_drop_refs(inode, path); ++ kfree(path); ++ return ERR_PTR(-EIO); ++ } ++ eh = (struct ext3_extent_header *) bh->b_data; ++ ppos++; ++ EXT_ASSERT(ppos <= depth); ++ path[ppos].p_bh = bh; ++ i--; ++ } ++ ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ ++ /* find extent */ ++ ex = EXT_FIRST_EXTENT(eh); ++ if (eh->e_num) ++ path[ppos].p_ext = ex; ++ EXT_ASSERT(eh->e_num <= eh->e_max); ++ for (k = 0; k < eh->e_num; k++, ex++) { ++ if (block < ex->e_block) ++ break; ++ path[ppos].p_ext = ex; ++ } ++ ++ ext3_ext_show_path(inode, path); ++ ++ return path; ++} ++ ++static void ext3_ext_check_boundary(struct inode *inode, ++ struct ext3_ext_path *curp, ++ void *addr, int len) ++{ ++ void *end; ++ ++ if (!len) ++ return; ++ if (curp->p_bh) ++ end = (void *) curp->p_hdr + inode->i_sb->s_blocksize; ++ else ++ end = (void *) curp->p_hdr + sizeof(EXT3_I(inode)->i_data); ++ if (((unsigned long) addr) + len > (unsigned long) end) { ++ printk("overflow! 0x%p > 0x%p\n", addr + len, end); ++ BUG(); ++ } ++ if ((unsigned long) addr < (unsigned long) curp->p_hdr) { ++ printk("underflow! 0x%p < 0x%p\n", addr, curp->p_hdr); ++ BUG(); ++ } ++} ++ ++/* ++ * insert new index [logical;ptr] into the block at cupr ++ * it check where to insert: before curp or after curp ++ */ ++static int ext3_ext_insert_index(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *curp, int logical, ++ int ptr) ++{ ++ struct ext3_extent_idx *ix; ++ int len, err; ++ ++ if ((err = ext3_ext_get_access(handle, inode, curp))) ++ return err; ++ ++ EXT_ASSERT(logical != curp->p_idx->e_block); ++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; ++ if (logical > curp->p_idx->e_block) { ++ /* insert after */ ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(inode, "insert new index %d after: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ ++ ext3_ext_check_boundary(inode, curp, curp->p_idx + 2, len); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ ix = curp->p_idx + 1; ++ } else { ++ /* insert before */ ++ len = len * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(inode, "insert new index %d before: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ ++ ext3_ext_check_boundary(inode, curp, curp->p_idx + 1, len); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->e_block = logical; ++ ix->e_leaf = ptr; ++ curp->p_hdr->e_num++; ++ ++ err = ext3_ext_dirty(handle, inode, curp); ++ ext3_std_error(inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at: ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extens and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ int i = at, k, m, a; ++ long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? */ ++ /* FIXME: now desicion is simplest: at current extent */ ++ ++ /* if current leaf will be splitted, then we should use ++ * border from split point */ ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].e_block; ++ ext_debug(inode, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->e_block; ++ ext_debug(inode, "leaf will be added." ++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. so, index won't ++ * be inserted and tree will be in consistent ++ * state. next mount will repair buffers too ++ */ ++ ++ /* ++ * get array to track all allocated blocks ++ * we need this to handle errors and free blocks ++ * upon them ++ */ ++ ablocks = kmalloc(sizeof(long) * depth, GFP_NOFS); ++ if (!ablocks) ++ return -ENOMEM; ++ memset(ablocks, 0, sizeof(long) * depth); ++ ++ /* allocate all needed blocks */ ++ ext_debug(inode, "allocate %d blocks for indexes and leaf\n", ++ depth - at); ++ ablocks[0] = newext->e_start++; ++ newext->e_num--; ++ for (a = 1; a < depth - at; a++) { ++ newblock = ext3_new_block(handle, inode, newext->e_start, ++ 0, 0, &err); ++ if (newblock == 0) ++ goto cleanup; ++ ablocks[a] = newblock; ++ } ++ ++ /* initialize new leaf */ ++ newblock = ablocks[--a]; ++ EXT_ASSERT(newblock); ++ bh = sb_getblk(inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = (struct ext3_extent_header *) bh->b_data; ++ neh->e_num = 0; ++ neh->e_max = ext3_ext_space_block(inode); ++ ex = EXT_FIRST_EXTENT(neh); ++ ++ /* move remain of path[depth] to the new leaf */ ++ EXT_ASSERT(path[depth].p_hdr->e_num == ++ path[depth].p_hdr->e_max); ++ /* start copy from next extent */ ++ /* TODO: we could do it by single memmove */ ++ m = 0; ++ path[depth].p_ext++; ++ while (path[depth].p_ext <= ++ EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ ext_debug(inode, "move %d:%d:%d in new leaf\n", ++ path[depth].p_ext->e_block, ++ path[depth].p_ext->e_start, ++ path[depth].p_ext->e_num); ++ memmove(ex++, path[depth].p_ext++, ++ sizeof(struct ext3_extent)); ++ neh->e_num++; ++ m++; ++ } ++ mark_buffer_uptodate(bh, 1); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old leaf */ ++ if (m) { ++ if ((err = ext3_ext_get_access(handle, inode, path))) ++ goto cleanup; ++ path[depth].p_hdr->e_num -= m; ++ if ((err = ext3_ext_dirty(handle, inode, path))) ++ goto cleanup; ++ ++ } ++ ++ /* create intermediate indexes */ ++ k = depth - at - 1; ++ EXT_ASSERT(k >= 0); ++ if (k) ++ ext_debug(inode, ++ "create %d intermediate indices\n", k); ++ /* insert new index into current index block */ ++ /* current depth stored in i var */ ++ i = depth - 1; ++ while (k--) { ++ oldblock = newblock; ++ newblock = ablocks[--a]; ++ bh = sb_getblk(inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = (struct ext3_extent_header *) bh->b_data; ++ neh->e_num = 1; ++ neh->e_max = ext3_ext_space_block(inode); ++ fidx = EXT_FIRST_INDEX(neh); ++ fidx->e_block = border; ++ fidx->e_leaf = oldblock; ++ ++ ext_debug(inode, ++ "int.index at %d (block %u): %d -> %d\n", ++ i, (unsigned) newblock, ++ (int) border, ++ (int) oldblock); ++ /* copy indexes */ ++ m = 0; ++ path[i].p_idx++; ++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == ++ EXT_LAST_INDEX(path[i].p_hdr)); ++ ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); ++ while (path[i].p_idx <= ++ EXT_MAX_INDEX(path[i].p_hdr)) { ++ ext_debug(inode, "%d: move %d:%d in new index\n", ++ i, path[i].p_idx->e_block, ++ path[i].p_idx->e_leaf); ++ memmove(++fidx, path[i].p_idx++, ++ sizeof(struct ext3_extent_idx)); ++ neh->e_num++; ++ m++; ++ } ++ ++ mark_buffer_uptodate(bh, 1); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old index */ ++ if (m) { ++ err = ext3_ext_get_access(handle,inode,path+i); ++ if (err) ++ goto cleanup; ++ path[i].p_hdr->e_num -= m; ++ err = ext3_ext_dirty(handle, inode, path + i); ++ if (err) ++ goto cleanup; ++ } ++ ++ i--; ++ } ++ ++ /* insert new index */ ++ if (!err) ++ err = ext3_ext_insert_index(handle, inode, path + at, ++ border, newblock); ++ ++cleanup: ++ if (bh) { ++ if (buffer_locked(bh)) ++ unlock_buffer(bh); ++ brelse(bh); ++ } ++ ++ if (err) { ++ /* free all allocated blocks in error case */ ++ for (i = 0; i < depth; i++) ++ if (!ablocks[i]) ++ continue; ++ ext3_free_blocks(handle, inode, ablocks[i], 1); ++ } ++ kfree(ablocks); ++ ++ return err; ++} ++ ++/* ++ * routine implements tree growing procedure: ++ * - allocates new block ++ * - moves top-level data (index block or leaf) into the new block ++ * - initialize new top-level, creating index that points to the ++ * just created block ++ */ ++static int ext3_ext_grow_indepth(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct buffer_head *bh; ++ struct ext3_ext_path *curp = path; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ int len, err = 0; ++ long newblock; ++ ++ /* ++ * use already allocated by the called block for new root block ++ */ ++ newblock = newext->e_start++; ++ newext->e_num--; ++ ++ bh = sb_getblk(inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ ext3_std_error(inode->i_sb, err); ++ return err; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) { ++ unlock_buffer(bh); ++ goto out; ++ } ++ ++ /* move top-level index/leaf into new block */ ++ len = sizeof(struct ext3_extent_header) + ++ sizeof(struct ext3_extent) * curp->p_hdr->e_max; ++ EXT_ASSERT(len >= 0 && len < 4096); ++ memmove(bh->b_data, curp->p_hdr, len); ++ ++ /* set size of new block */ ++ neh = (struct ext3_extent_header *) bh->b_data; ++ neh->e_max = ext3_ext_space_block(inode); ++ mark_buffer_uptodate(bh, 1); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto out; ++ ++ /* create index in new top-level index: num,max,pointer */ ++ if ((err = ext3_ext_get_access(handle, inode, curp))) ++ goto out; ++ ++ curp->p_hdr->e_max = ext3_ext_space_inode_idx(inode); ++ curp->p_hdr->e_num = 1; ++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); ++ curp->p_idx->e_block = EXT_FIRST_EXTENT(path[0].p_hdr)->e_block; ++ curp->p_idx->e_leaf = newblock; ++ ++ neh = (struct ext3_extent_header *) EXT3_I(inode)->i_data; ++ fidx = EXT_FIRST_INDEX(neh); ++ ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %d\n", ++ neh->e_num, neh->e_max, fidx->e_block, fidx->e_leaf); ++ ++ EXT3_I(inode)->i_depth++; ++ err = ext3_ext_dirty(handle, inode, curp); ++out: ++ brelse(bh); ++ ++ return err; ++} ++ ++/* ++ * routine finds empty index and adds new leaf. if no free index found ++ * then it requests in-depth growing ++ */ ++static int ext3_ext_create_new_leaf(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_ext_path *curp; ++ int i = depth, err = 0; ++ long newblock = newext->e_start; ++ ++ /* walk up to the tree and look for free index entry */ ++ curp = path + depth; ++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { ++ i--; ++ curp--; ++ } ++ ++ /* we use already allocated block for index block ++ * so, subsequent data blocks should be contigoues */ ++ if (EXT_HAS_FREE_INDEX(curp)) { ++ /* if we found index with free entry, then use that ++ * entry: create all needed subtree and add new leaf */ ++ err = ext3_ext_split(handle, inode, path, newext, i); ++ } else { ++ /* tree is full, time to grow in depth */ ++ err = ext3_ext_grow_indepth(handle, inode, path, newext); ++ } ++ ++ if (!err) { ++ /* refill path */ ++ ext3_ext_drop_refs(inode, path); ++ path = ext3_ext_find_extent(inode, newext->e_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ ++ /* ++ * probably we've used some blocks from extent ++ * let's allocate new block for it ++ */ ++ if (newext->e_num == 0 && !err) { ++ newext->e_start = ++ ext3_new_block(handle, inode, newblock, ++ 0, 0, &err); ++ newext->e_num = 1; ++ } ++ } ++ ++ return err; ++} ++ ++/* ++ * returns next allocated block or 0xffffffff ++ * NOTE: it consider block number from index entry as ++ * allocated block. thus, index entries have to be consistent ++ * with leafs ++ */ ++static inline unsigned ext3_ext_next_allocated_block(struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ if (depth == 0 && path->p_ext == NULL) ++ return 0xffffffff; ++ ++ /* FIXME: what if index isn't full ?! */ ++ while (depth >= 0) { ++ if (depth == path->p_depth) { ++ /* leaf */ ++ if (path[depth].p_ext != ++ EXT_LAST_EXTENT(path[depth].p_hdr)) ++ return path[depth].p_ext[1].e_block; ++ } else { ++ /* index */ ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].e_block; ++ } ++ depth--; ++ } ++ ++ return 0xffffffff; ++} ++ ++/* ++ * returns first allocated block from next leaf or 0xffffffff ++ */ ++static unsigned ext3_ext_next_leaf_block(struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ /* zero-tree has no leaf blocks at all */ ++ if (depth == 0) ++ return 0xffffffff; ++ ++ /* go to index block */ ++ depth--; ++ ++ while (depth >= 0) { ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].e_block; ++ depth--; ++ } ++ ++ return 0xffffffff; ++} ++ ++/* ++ * if leaf gets modified and modified extent is first in the leaf ++ * then we have to correct all indexes above ++ * TODO: do we need to correct tree in all cases? ++ */ ++int ext3_ext_correct_indexes(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; ++ long border; ++ int k, err = 0; ++ ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ ++ EXT_ASSERT(ex); ++ EXT_ASSERT(eh); ++ ++ if (depth == 0) { ++ /* there is no tree at all */ ++ return 0; ++ } ++ ++ if (ex != EXT_FIRST_EXTENT(eh)) { ++ /* we correct tree if first leaf got modified only */ ++ return 0; ++ } ++ ++ k = depth - 1; ++ border = path[depth].p_ext->e_block; ++ if ((err = ext3_ext_get_access(handle, inode, path + k))) ++ return err; ++ path[k].p_idx->e_block = border; ++ if ((err = ext3_ext_dirty(handle, inode, path + k))) ++ return err; ++ ++ while (k--) { ++ /* change all left-side indexes */ ++ if (path[k].p_idx != EXT_FIRST_INDEX(path[k].p_hdr) ++ && k != 0) ++ break; ++ if ((err = ext3_ext_get_access(handle, inode, path + k))) ++ break; ++ path[k].p_idx->e_block = border; ++ if ((err = ext3_ext_dirty(handle, inode, path + k))) ++ break; ++ } ++ ++ return err; ++} ++ ++/* ++ * this routine tries to merge requsted extent into the existing ++ * extent or inserts requested extent as new one into the tree, ++ * creating new leaf in no-space case ++ */ ++int ext3_ext_insert_extent(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ int depth, len; ++ struct ext3_extent_header * eh; ++ struct ext3_extent *ex; ++ struct ext3_extent *nearex; /* nearest extent */ ++ struct ext3_ext_path *npath = NULL; ++ int err; ++ ++ depth = EXT3_I(inode)->i_depth; ++ if ((ex = path[depth].p_ext)) { ++ /* try to insert block into found extent and return */ ++ if (ex->e_block + ex->e_num == newext->e_block && ++ ex->e_start + ex->e_num == newext->e_start) { ++#ifdef AGRESSIVE_TEST ++ if (ex->e_num >= 2) ++ goto repeat; ++#endif ++ if ((err = ext3_ext_get_access(handle, inode, ++ path + depth))) ++ return err; ++ ext_debug(inode, "append %d block to %d:%d (from %d)\n", ++ newext->e_num, ex->e_block, ex->e_num, ++ ex->e_start); ++ ex->e_num += newext->e_num; ++ err = ext3_ext_dirty(handle, inode, path + depth); ++ return err; ++ } ++ } ++ ++repeat: ++ depth = EXT3_I(inode)->i_depth; ++ eh = path[depth].p_hdr; ++ if (eh->e_num == eh->e_max) { ++ /* probably next leaf has space for us? */ ++ int next = ext3_ext_next_leaf_block(inode, path); ++ if (next != 0xffffffff) { ++ ext_debug(inode, "next leaf block - %d\n", next); ++ EXT_ASSERT(!npath); ++ npath = ext3_ext_find_extent(inode, next, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ EXT_ASSERT(npath->p_depth == path->p_depth); ++ eh = npath[depth].p_hdr; ++ if (eh->e_num < eh->e_max) { ++ ext_debug(inode, ++ "next leaf has free ext(%d)\n", ++ eh->e_num); ++ path = npath; ++ goto repeat; ++ } ++ ext_debug(inode, "next leaf hasno free space(%d,%d)\n", ++ eh->e_num, eh->e_max); ++ } ++ /* ++ * there is no free space in found leaf ++ * we're gonna add new leaf in the tree ++ */ ++ err = ext3_ext_create_new_leaf(handle, inode, path, newext); ++ if (err) ++ goto cleanup; ++ goto repeat; ++ } ++ ++ nearex = path[depth].p_ext; ++ ++ if ((err = ext3_ext_get_access(handle, inode, path + depth))) ++ goto cleanup; ++ ++ if (!nearex) { ++ /* there is no extent in this leaf, create first one */ ++ ext_debug(inode, "first extent in the leaf: %d:%d:%d\n", ++ newext->e_block, newext->e_start, ++ newext->e_num); ++ eh->e_num++; ++ path[depth].p_ext = EXT_FIRST_EXTENT(eh); ++ ++ } else if (newext->e_block > nearex->e_block) { ++ EXT_ASSERT(newext->e_block != nearex->e_block); ++ len = EXT_MAX_EXTENT(eh) - nearex; ++ len = (len - 1) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(inode, "insert %d:%d:%d after: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->e_block, newext->e_start, newext->e_num, ++ nearex, len, nearex + 1, nearex + 2); ++ ext3_ext_check_boundary(inode, path + depth, nearex + 2, len); ++ memmove(nearex + 2, nearex + 1, len); ++ path[depth].p_ext = nearex + 1; ++ eh->e_num++; ++ } else { ++ EXT_ASSERT(newext->e_block != nearex->e_block); ++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(inode, "insert %d:%d:%d before: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->e_block, newext->e_start, newext->e_num, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 1, nearex, len); ++ path[depth].p_ext = nearex; ++ eh->e_num++; ++ ++ /* time to correct all indexes above */ ++ err = ext3_ext_correct_indexes(handle, inode, path); ++ } ++ ++ if (!err) { ++ nearex = path[depth].p_ext; ++ nearex->e_block = newext->e_block; ++ nearex->e_start = newext->e_start; ++ nearex->e_num = newext->e_num; ++ } ++ ++ err = ext3_ext_dirty(handle, inode, path + depth); ++ ++cleanup: ++ if (npath) { ++ ext3_ext_drop_refs(inode, npath); ++ kfree(npath); ++ } ++ ++ return err; ++} ++ ++int ext3_ext_get_block(handle_t *handle, struct inode *inode, long iblock, ++ struct buffer_head *bh_result, int create) ++{ ++ struct ext3_ext_path *path; ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_extent newex; ++ struct ext3_extent *ex; ++ int goal, newblock, err = 0; ++ ++ ext_debug(inode, "block %d requested for inode %u, bh_result 0x%p\n", ++ (int) iblock, (unsigned) inode->i_ino, bh_result); ++ bh_result->b_state &= ~(1UL << BH_New); ++ ++ down(&EXT3_I(inode)->i_ext_sem); ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(inode, iblock, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ goto out2; ++ } ++ ++ if ((ex = path[depth].p_ext)) { ++ /* if found exent covers block, simple return it */ ++ if (iblock >= ex->e_block && iblock < ex->e_block + ex->e_num) { ++ newblock = iblock - ex->e_block + ex->e_start; ++ ext_debug(inode, "%d fit into %d:%d -> %d\n", ++ (int) iblock, ex->e_block, ex->e_num, ++ newblock); ++ goto out; ++ } ++ } ++ ++ /* ++ * we couldn't try to create block if create flag is zero ++ */ ++ if (!create) ++ goto out2; ++ ++ /* allocate new block */ ++ goal = ext3_ext_find_goal(inode, path); ++ newblock = ext3_new_block(handle, inode, goal, 0, 0, &err); ++ if (!newblock) ++ goto out2; ++ ext_debug(inode, "allocate new block: goal %d, found %d\n", ++ goal, newblock); ++ ++ /* try to insert new extent into found leaf and return */ ++ newex.e_block = iblock; ++ newex.e_start = newblock; ++ newex.e_num = 1; ++ err = ext3_ext_insert_extent(handle, inode, path, &newex); ++ if (err) ++ goto out2; ++ ++ /* previous routine could use block we allocated */ ++ newblock = newex.e_start; ++ bh_result->b_state |= (1UL << BH_New); ++ ++out: ++ ext3_ext_show_leaf(inode, path); ++ bh_result->b_dev = inode->i_dev; ++ bh_result->b_blocknr = newblock; ++ bh_result->b_state |= (1UL << BH_Mapped); ++out2: ++ ext3_ext_drop_refs(inode, path); ++ kfree(path); ++ up(&EXT3_I(inode)->i_ext_sem); ++ ++ return err; ++} ++ ++/* ++ * returns 1 if current index have to be freed (even partial) ++ */ ++static int ext3_ext_more_to_truncate(struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ EXT_ASSERT(path->p_idx); ++ ++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) ++ return 0; ++ ++ /* ++ * if truncate on deeper level happened it it wasn't partial ++ * so we have to consider current index for truncation ++ */ ++ if (path->p_hdr->e_num == path->p_block) ++ return 0; ++ ++ /* ++ * put actual number of indexes to know is this number got ++ * changed at the next iteration ++ */ ++ path->p_block = path->p_hdr->e_num; ++ ++ return 1; ++} ++ ++/* ++ * routine removes index from the index block ++ * it's used in truncate case only. thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_remove_index(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->e_num); ++ if ((err = ext3_ext_get_access(handle, inode, path))) ++ return err; ++ path->p_hdr->e_num--; ++ if ((err = ext3_ext_dirty(handle, inode, path))) ++ return err; ++ bh = sb_get_hash_table(inode->i_sb, path->p_idx->e_leaf); ++ ext3_forget(handle, 0, inode, bh, path->p_idx->e_leaf); ++ ext3_free_blocks(handle, inode, path->p_idx->e_leaf, 1); ++ ++ ext_debug(inode, "index is empty, remove it, free block %d\n", ++ path->p_idx->e_leaf); ++ return err; ++} ++ ++/* ++ * returns 1 if current extent needs to be freed (even partial) ++ * instead, returns 0 ++ */ ++int ext3_ext_more_leaves_to_truncate(struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ unsigned blocksize = inode->i_sb->s_blocksize; ++ struct ext3_extent *ex = path->p_ext; ++ int last_block; ++ ++ EXT_ASSERT(ex); ++ ++ /* is there leave in the current leaf? */ ++ if (ex < EXT_FIRST_EXTENT(path->p_hdr)) ++ return 0; ++ ++ last_block = (inode->i_size + blocksize-1) ++ >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); ++ ++ if (last_block >= ex->e_block + ex->e_num) ++ return 0; ++ ++ /* seems it extent have to be freed */ ++ return 1; ++} ++ ++handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} ++ ++/* ++ * this routine calculate max number of blocks to be modified ++ * while freeing extent and is intended to be used in truncate path ++ */ ++static int ext3_ext_calc_credits(struct inode *inode, ++ struct ext3_ext_path *path, ++ int num) ++{ ++ int depth = EXT3_I(inode)->i_depth; ++ int needed; ++ ++ /* ++ * extent couldn't cross group, so we will modify ++ * single bitmap block and single group descriptor ++ */ ++ needed = 2; ++ ++ /* ++ * if this is last extent in a leaf, then we have to ++ * free leaf block and remove pointer from index above. ++ * that pointer could be last in index block, so we'll ++ * have to remove it too. this way we could modify/free ++ * the whole path + root index (inode stored) will be ++ * modified ++ */ ++ if (!path || (num == path->p_ext->e_num && ++ path->p_ext == EXT_FIRST_EXTENT(path->p_hdr))) ++ needed += (depth * EXT3_ALLOC_NEEDED) + 1; ++ ++ return needed; ++} ++ ++/* ++ * core of the truncate procedure: ++ * - calculated what part of each extent in the requested leaf ++ * need to be freed ++ * - frees and forgets these blocks ++ * ++ * TODO: we could optimize and free several extents during ++ * single journal_restart()-journal_restart() cycle ++ */ ++static int ext3_ext_truncate_leaf(handle_t *handle, ++ struct inode *inode, ++ struct ext3_ext_path *path, ++ int depth) ++{ ++ unsigned blocksize = inode->i_sb->s_blocksize; ++ int last_block; ++ int i, err = 0, sf, num; ++ ++ ext_debug(inode, "level %d - leaf\n", depth); ++ if (!path->p_hdr) ++ path->p_hdr = ++ (struct ext3_extent_header *) path->p_bh->b_data; ++ ++ EXT_ASSERT(path->p_hdr->e_num <= path->p_hdr->e_max); ++ ++ last_block = (inode->i_size + blocksize-1) ++ >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); ++ path->p_ext = EXT_LAST_EXTENT(path->p_hdr); ++ while (ext3_ext_more_leaves_to_truncate(inode, path)) { ++ ++ /* what part of extent have to be freed? */ ++ sf = last_block > path->p_ext->e_block ? ++ last_block : path->p_ext->e_block; ++ ++ /* number of blocks from extent to be freed */ ++ num = path->p_ext->e_block + path->p_ext->e_num - sf; ++ ++ /* calc physical first physical block to be freed */ ++ sf = path->p_ext->e_start + (sf - path->p_ext->e_block); ++ ++ i = ext3_ext_calc_credits(inode, path, num); ++ handle = ext3_ext_journal_restart(handle, i); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ ext_debug(inode, "free extent %d:%d:%d -> free %d:%d\n", ++ path->p_ext->e_block, path->p_ext->e_start, ++ path->p_ext->e_num, sf, num); ++ for (i = 0; i < num; i++) { ++ struct buffer_head *bh = ++ sb_get_hash_table(inode->i_sb, sf + i); ++ ext3_forget(handle, 0, inode, bh, sf + i); ++ } ++ ext3_free_blocks(handle, inode, sf, num); ++ ++ /* collect extents usage stats */ ++ spin_lock(&EXT3_SB(inode->i_sb)->s_ext_lock); ++ EXT3_SB(inode->i_sb)->s_ext_extents++; ++ EXT3_SB(inode->i_sb)->s_ext_blocks += num; ++ spin_unlock(&EXT3_SB(inode->i_sb)->s_ext_lock); ++ ++ /* reduce extent */ ++ if ((err = ext3_ext_get_access(handle, inode, path))) ++ return err; ++ path->p_ext->e_num -= num; ++ if (path->p_ext->e_num == 0) ++ path->p_hdr->e_num--; ++ if ((err = ext3_ext_dirty(handle, inode, path))) ++ return err; ++ ++ path->p_ext--; ++ } ++ ++ /* if this leaf is free, then we should ++ * remove it from index block above */ ++ if (path->p_hdr->e_num == 0 && depth > 0) ++ err = ext3_ext_remove_index(handle, inode, path); ++ ++ return err; ++} ++ ++static void ext3_ext_collect_stats(struct inode *inode) ++{ ++ int depth; ++ ++ /* skip inodes with old good bitmap */ ++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) ++ return; ++ ++ /* collect on full truncate only */ ++ if (inode->i_size) ++ return; ++ ++ depth = EXT3_I(inode)->i_depth; ++ if (depth < EXT3_SB(inode->i_sb)->s_ext_mindepth) ++ EXT3_SB(inode->i_sb)->s_ext_mindepth = depth; ++ if (depth > EXT3_SB(inode->i_sb)->s_ext_maxdepth) ++ EXT3_SB(inode->i_sb)->s_ext_maxdepth = depth; ++ EXT3_SB(inode->i_sb)->s_ext_sum += depth; ++ EXT3_SB(inode->i_sb)->s_ext_count++; ++ ++} ++ ++void ext3_ext_truncate(struct inode * inode) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct ext3_ext_path *path; ++ struct page * page; ++ handle_t *handle; ++ int i, depth, err = 0; ++ ++ ext3_ext_collect_stats(inode); ++ ++ /* ++ * We have to lock the EOF page here, because lock_page() nests ++ * outside journal_start(). ++ */ ++ if ((inode->i_size & (inode->i_sb->s_blocksize - 1)) == 0) { ++ /* Block boundary? Nothing to do */ ++ page = NULL; ++ } else { ++ page = grab_cache_page(mapping, ++ inode->i_size >> PAGE_CACHE_SHIFT); ++ if (!page) ++ return; ++ } ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ i = ext3_ext_calc_credits(inode, NULL, 0); ++ handle = ext3_journal_start(inode, i); ++ if (IS_ERR(handle)) { ++ if (page) { ++ clear_highpage(page); ++ flush_dcache_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ return; ++ } ++ ++ if (page) ++ ext3_block_truncate_page(handle, mapping, inode->i_size); ++ ++ down(&EXT3_I(inode)->i_ext_sem); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need not scaning at all, ++ * because page truncation is enough ++ */ ++ if (ext3_orphan_add(handle, inode)) ++ goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ /* ++ * we start scanning from right side freeing all the blocks ++ * after i_size and walking into the deep ++ */ ++ i = 0; ++ depth = EXT3_I(inode)->i_depth; ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); ++ if (IS_ERR(path)) { ++ ext3_error(inode->i_sb, "ext3_ext_truncate", ++ "Can't allocate path array"); ++ goto out_stop; ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ ++ path[i].p_hdr = (struct ext3_extent_header *) EXT3_I(inode)->i_data; ++ while (i >= 0 && err == 0) { ++ if (i == depth) { ++ /* this is leaf block */ ++ err = ext3_ext_truncate_leaf(handle, inode, ++ path + i, i); ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ continue; ++ } ++ ++ /* this is index block */ ++ if (!path[i].p_hdr) { ++ path[i].p_hdr = ++ (struct ext3_extent_header *) path[i].p_bh->b_data; ++ ext_debug(inode, "initialize header\n"); ++ } ++ ++ EXT_ASSERT(path[i].p_hdr->e_num <= path[i].p_hdr->e_max); ++ ++ if (!path[i].p_idx) { ++ /* this level hasn't touched yet */ ++ path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); ++ path[i].p_block = path[i].p_hdr->e_num + 1; ++ ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", ++ path[i].p_hdr, path[i].p_hdr->e_num); ++ } else { ++ /* we've already was here, see at next index */ ++ path[i].p_idx--; ++ } ++ ++ ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", ++ i, EXT_FIRST_INDEX(path[i].p_hdr), ++ path[i].p_idx); ++ if (ext3_ext_more_to_truncate(inode, path + i)) { ++ /* go to the next level */ ++ ext_debug(inode, "move to level %d (block %d)\n", i+1, ++ path[i].p_idx->e_leaf); ++ memset(path + i + 1, 0, sizeof(*path)); ++ path[i+1].p_bh = sb_bread(inode->i_sb, ++ path[i].p_idx->e_leaf); ++ if (!path[i+1].p_bh) { ++ /* should we reset i_size? */ ++ err = -EIO; ++ break; ++ } ++ i++; ++ } else { ++ /* we finish processing this index, go up */ ++ if (path[i].p_hdr->e_num == 0 && i > 0) { ++ /* index is empty, remove it ++ * handle must be already prepared by the ++ * truncate_leaf() ++ */ ++ err = ext3_ext_remove_index(handle, inode, ++ path + i); ++ } ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ ext_debug(inode, "return to level %d\n", i); ++ } ++ } ++ ++ /* TODO: flexible tree reduction should be here */ ++ if (path->p_hdr->e_num == 0) { ++ /* ++ * truncate to zero freed all the tree ++ * so, we need to correct i_depth ++ */ ++ EXT3_I(inode)->i_depth = 0; ++ path->p_hdr->e_max = 0; ++ ext3_mark_inode_dirty(handle, inode); ++ } ++ ++ kfree(path); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. ++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ up(&EXT3_I(inode)->i_ext_sem); ++ ext3_journal_stop(handle, inode); ++} ++ ++/* ++ * this routine calculate max number of blocks we could modify ++ * in order to allocate new block for an inode ++ */ ++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ int depth = ei->i_depth + 1; ++ int needed; ++ ++ /* ++ * the worste case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ /* caller want to allocate num blocks */ ++ needed *= num; ++ ++#ifdef CONFIG_QUOTA ++ /* ++ * FIXME: real calculation should be here ++ * it depends on blockmap format of qouta file ++ */ ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return needed; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) ++ printk("EXT3-fs: file extents enabled\n"); ++ spin_lock_init(&EXT3_SB(sb)->s_ext_lock); ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ /* show collected stats */ ++ if (sbi->s_ext_count && sbi->s_ext_extents) ++ printk("EXT3-fs: min depth - %d, max depth - %d, " ++ "ave. depth - %d, ave. blocks/extent - %d\n", ++ sbi->s_ext_mindepth, ++ sbi->s_ext_maxdepth, ++ sbi->s_ext_sum / sbi->s_ext_count, ++ sbi->s_ext_blocks / sbi->s_ext_extents); ++} ++ +--- linux-2.4.20-vanilla/fs/ext3/ialloc.c~ext3-extents-2.4.20 2003-09-15 18:54:58.000000000 +0400 ++++ linux-2.4.20-vanilla-alexey/fs/ext3/ialloc.c 2003-09-15 19:31:40.000000000 +0400 +@@ -569,6 +569,10 @@ repeat: + inode->u.ext3_i.i_prealloc_count = 0; + #endif + inode->u.ext3_i.i_block_group = i; ++ if (test_opt(sb, EXTENTS)) ++ inode->u.ext3_i.i_flags |= EXT3_EXTENTS_FL; ++ inode->u.ext3_i.i_depth = 0; ++ sema_init(&inode->u.ext3_i.i_ext_sem, 1); + + if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; +--- linux-2.4.20-vanilla/fs/ext3/inode.c~ext3-extents-2.4.20 2003-09-15 18:54:58.000000000 +0400 ++++ linux-2.4.20-vanilla-alexey/fs/ext3/inode.c 2003-09-15 19:53:10.000000000 +0400 +@@ -848,6 +848,15 @@ changed: + goto reread; + } + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create); ++ return ext3_get_block_handle(handle, inode, block, bh, create); ++} ++ + /* + * The BKL is not held on entry here. + */ +@@ -861,7 +870,7 @@ static int ext3_get_block(struct inode * + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create); ++ ret = ext3_get_block_wrap(handle, inode, iblock, bh_result, create); + return ret; + } + +@@ -879,7 +888,7 @@ struct buffer_head *ext3_getblk(handle_t + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create); ++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1403,7 +1412,7 @@ struct address_space_operations ext3_aop + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, ++int ext3_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) + { + unsigned long index = from >> PAGE_CACHE_SHIFT; +@@ -1888,6 +1897,9 @@ void ext3_truncate(struct inode * inode) + + ext3_discard_prealloc(inode); + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) + return; /* AKPM: return what? */ +@@ -2200,6 +2212,8 @@ void ext3_read_inode(struct inode * inod + inode->u.ext3_i.i_prealloc_count = 0; + #endif + inode->u.ext3_i.i_block_group = iloc.block_group; ++ inode->u.ext3_i.i_depth = raw_inode->osd2.linux2.l_i_depth; ++ sema_init(&inode->u.ext3_i.i_ext_sem, 1); + + /* + * NOTE! The in-memory inode i_data array is in little-endian order +@@ -2321,6 +2335,7 @@ static int ext3_do_update_inode(handle_t + raw_inode->i_fsize = 0; + } + #endif ++ raw_inode->osd2.linux2.l_i_depth = inode->u.ext3_i.i_depth; + raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl); + if (!S_ISREG(inode->i_mode)) { + raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl); +@@ -2525,6 +2540,9 @@ int ext3_writepage_trans_blocks(struct i + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +@@ -2961,7 +2979,7 @@ int ext3_prep_san_write(struct inode *in + + /* alloc blocks one by one */ + for (i = 0; i < nblocks; i++) { +- ret = ext3_get_block_handle(handle, inode, blocks[i], ++ ret = ext3_get_block_wrap(handle, inode, blocks[i], + &bh_tmp, 1); + if (ret) + break; +@@ -3022,7 +3040,7 @@ int ext3_map_inode_page(struct inode *in + if (blocks[i] != 0) + continue; + +- rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1); ++ rc = ext3_get_block_wrap(handle, inode, iblock, &dummy, 1); + if (rc) { + printk(KERN_INFO "ext3_map_inode_page: error reading " + "block %ld\n", iblock); +--- linux-2.4.20-vanilla/fs/ext3/Makefile~ext3-extents-2.4.20 2003-09-15 18:54:58.000000000 +0400 ++++ linux-2.4.20-vanilla-alexey/fs/ext3/Makefile 2003-09-15 19:41:08.000000000 +0400 +@@ -12,7 +12,8 @@ O_TARGET := ext3.o + export-objs := ext3-exports.o + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ +- ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o ++ ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o \ ++ extents.o + obj-m := $(O_TARGET) + + export-objs += xattr.o +--- linux-2.4.20-vanilla/fs/ext3/super.c~ext3-extents-2.4.20 2003-09-15 18:54:59.000000000 +0400 ++++ linux-2.4.20-vanilla-alexey/fs/ext3/super.c 2003-09-15 19:42:57.000000000 +0400 +@@ -619,6 +619,7 @@ void ext3_put_super (struct super_block + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_ext_release(sb); + ext3_stop_delete_thread(sbi); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -765,6 +766,10 @@ static int parse_options (char * options + "EXT3 Check option not supported\n"); + #endif + } ++ else if (!strcmp (this_char, "extents")) ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ else if (!strcmp (this_char, "extdebug")) ++ set_opt (sbi->s_mount_opt, EXTDEBUG); + else if (!strcmp (this_char, "debug")) + set_opt (*mount_options, DEBUG); + else if (!strcmp (this_char, "errors")) { +@@ -1478,6 +1483,7 @@ struct super_block * ext3_read_super (st + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": + "writeback"); ++ ext3_ext_init(sb); + + return sb; + +--- linux-2.4.20-vanilla/include/linux/ext3_fs.h~ext3-extents-2.4.20 2003-09-15 18:54:58.000000000 +0400 ++++ linux-2.4.20-vanilla-alexey/include/linux/ext3_fs.h 2003-09-15 20:15:52.000000000 +0400 +@@ -184,6 +184,7 @@ struct ext3_group_desc + #define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ + #define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + + #define EXT3_FL_USER_VISIBLE 0x00005FFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000000FF /* User modifiable flags */ +@@ -244,7 +245,7 @@ struct ext3_inode { + struct { + __u8 l_i_frag; /* Fragment number */ + __u8 l_i_fsize; /* Fragment size */ +- __u16 i_pad1; ++ __u16 l_i_depth; + __u16 l_i_uid_high; /* these 2 fields */ + __u16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; +@@ -325,6 +326,8 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN 0x8000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x10000 /* Make iopen world-readable */ + #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ ++#define EXT3_MOUNT_EXTENTS 0x40000 /* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x80000 /* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -702,6 +705,10 @@ extern void ext3_discard_prealloc (struc + extern void ext3_dirty_inode(struct inode *); + extern int ext3_change_inode_journal_flag(struct inode *, int); + extern void ext3_truncate (struct inode *); ++extern int ext3_block_truncate_page(handle_t *, struct address_space *, loff_t); ++extern int ext3_forget(handle_t *handle, int is_metadata, ++ struct inode *inode, struct buffer_head *bh, ++ int blocknr); + #ifdef EXT3_DELETE_THREAD + extern void ext3_truncate_thread(struct inode *inode); + #endif +@@ -765,6 +772,13 @@ extern struct inode_operations ext3_spec + extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int); ++extern void ext3_ext_truncate(struct inode *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); + + #endif /* __KERNEL__ */ + +--- linux-2.4.20-vanilla/include/linux/ext3_fs_i.h~ext3-extents-2.4.20 2003-09-15 10:16:38.000000000 +0400 ++++ linux-2.4.20-vanilla-alexey/include/linux/ext3_fs_i.h 2003-09-15 20:14:40.000000000 +0400 +@@ -73,6 +73,10 @@ struct ext3_inode_info { + * by other means, so we have truncate_sem. + */ + struct rw_semaphore truncate_sem; ++ ++ /* extents-related data */ ++ struct semaphore i_ext_sem; ++ __u16 i_depth; + }; + + #endif /* _LINUX_EXT3_FS_I */ +--- linux-2.4.20-vanilla/include/linux/ext3_fs_sb.h~ext3-extents-2.4.20 2003-09-15 18:54:57.000000000 +0400 ++++ linux-2.4.20-vanilla-alexey/include/linux/ext3_fs_sb.h 2003-09-15 20:14:40.000000000 +0400 +@@ -86,6 +86,16 @@ struct ext3_sb_info { + wait_queue_head_t s_delete_thread_queue; + wait_queue_head_t s_delete_waiter_queue; + #endif ++ ++ /* extents */ ++ int s_ext_debug; ++ int s_ext_mindepth; ++ int s_ext_maxdepth; ++ int s_ext_sum; ++ int s_ext_count; ++ spinlock_t s_ext_lock; ++ int s_ext_extents; ++ int s_ext_blocks; + }; + + #endif /* _LINUX_EXT3_FS_SB */ + +_ diff --git a/lustre/kernel_patches/pc/ext3-extents-2.4.20.pc b/lustre/kernel_patches/pc/ext3-extents-2.4.20.pc new file mode 100644 index 0000000..f408025 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-extents-2.4.20.pc @@ -0,0 +1,8 @@ +fs/ext3/extents.c +fs/ext3/ialloc.c +fs/ext3/inode.c +fs/ext3/Makefile +fs/ext3/super.c +include/linux/ext3_fs.h +include/linux/ext3_fs_i.h +include/linux/ext3_fs_sb.h -- 1.8.3.1