fs/ext3/Makefile | 3 fs/ext3/extents.c | 1615 +++++++++++++++++++++++++++++++++++++++++++++ fs/ext3/ialloc.c | 4 fs/ext3/inode.c | 30 fs/ext3/super.c | 8 include/linux/ext3_fs.h | 18 include/linux/ext3_fs_i.h | 4 include/linux/ext3_fs_sb.h | 10 8 files changed, 1684 insertions(+), 8 deletions(-) Index: linux-2.4.18-chaos/fs/ext3/extents.c =================================================================== --- linux-2.4.18-chaos.orig/fs/ext3/extents.c 2003-01-30 13:24:37.000000000 +0300 +++ linux-2.4.18-chaos/fs/ext3/extents.c 2004-01-13 16:11:00.000000000 +0300 @@ -0,0 +1,1614 @@ +/* + * + * linux/fs/ext3/extents.c + * + * Extents support for EXT3 + * + * 07/08/2003 Alex Tomas + * + * TODO: + * - ext3*_error() should be used in some situations + * - find_goal() [to be tested and improved] + * - error handling + * - we could leak allocated block in some error cases + * - quick search for index/leaf in ext3_ext_find_extent() + * - tree reduction + * - cache last found extent + * - arch-independent + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * with AGRESSIVE_TEST defined capacity of index/leaf blocks + * become very little, so index split, in-depth growing and + * other hard changes happens much more often + * this is for debug purposes only + */ +#define AGRESSIVE_TEST_ + +/* + * if EXT_DEBUG defined you can use 'extdebug' mount option + * to get lots of info what's going on + */ +#define EXT_DEBUG +#ifdef EXT_DEBUG +#define ext_debug(inode,fmt,a...) \ +do { \ + if (test_opt((inode)->i_sb, EXTDEBUG)) \ + printk(fmt, ##a); \ +} while (0); +#else +#define ext_debug(inode,fmt,a...) +#endif + +#define EXT3_ALLOC_NEEDED 2 /* block bitmap + group descriptor */ + +/* + * ext3_inode has i_block array (total 60 bytes) + * first 4 bytes are used to store: + * - tree depth (0 mean there is no tree yet. all extents in the inode) + * - number of alive extents in the inode + */ + +/* + * this is extent on-disk structure + * it's used at the bottom of the tree + */ +struct ext3_extent { + __u32 e_block; /* first logical block extent covers */ + __u32 e_start; /* first physical block extents lives */ + __u32 e_num; /* number of blocks covered by extent */ +}; + +/* + * this is index on-disk structure + * it's used at all the levels, but the bottom + */ +struct ext3_extent_idx { + __u32 e_block; /* index covers logical blocks from 'block' */ + __u32 e_leaf; /* pointer to the physical block of the next * + * level. leaf or next index could bet here */ +}; + +/* + * each block (leaves and indexes), even inode-stored has header + */ +struct ext3_extent_header { + __u16 e_num; /* number of valid entries */ + __u16 e_max; /* capacity of store in entries */ +}; + +/* + * array of ext3_ext_path contains path to some extent + * creation/lookup routines use it for traversal/splitting/etc + * truncate uses it to simulate recursive walking + */ +struct ext3_ext_path { + __u32 p_block; + __u16 p_depth; + struct ext3_extent *p_ext; + struct ext3_extent_idx *p_idx; + struct ext3_extent_header *p_hdr; + struct buffer_head *p_bh; +}; + +#define EXT_FIRST_EXTENT(__hdr__) \ + ((struct ext3_extent *) (((char *) (__hdr__)) + \ + sizeof(struct ext3_extent_header))) +#define EXT_FIRST_INDEX(__hdr__) \ + ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ + sizeof(struct ext3_extent_header))) +#define EXT_HAS_FREE_INDEX(__path__) \ + ((__path__)->p_hdr->e_num < (__path__)->p_hdr->e_max) +#define EXT_LAST_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_num - 1) +#define EXT_LAST_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_num - 1) +#define EXT_MAX_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_max - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_max - 1) + + +#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); + +/* + * could return: + * - EROFS + * - ENOMEM + */ +static int ext3_ext_get_access(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path) +{ + if (path->p_bh) { + /* path points to block */ + return ext3_journal_get_write_access(handle, path->p_bh); + } + + /* path points to leaf/index in inode body */ + return 0; +} + +/* + * could return: + * - EROFS + * - ENOMEM + * - EIO + */ +static int ext3_ext_dirty(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path) +{ + if (path->p_bh) { + /* path points to block */ + return ext3_journal_dirty_metadata(handle, path->p_bh); + } + + /* path points to leaf/index in inode body */ + return ext3_mark_inode_dirty(handle, inode); +} + +static inline int ext3_ext_space_block(struct inode *inode) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext3_extent_header)) + / sizeof(struct ext3_extent); +#ifdef AGRESSIVE_TEST + size = 6; /* FIXME: for debug, remove this line */ +#endif + return size; +} + +static inline int ext3_ext_space_inode(struct inode *inode) +{ + int size; + + size = (sizeof(EXT3_I(inode)->i_data) - + sizeof(struct ext3_extent_header)) + / sizeof(struct ext3_extent); +#ifdef AGRESSIVE_TEST + size = 3; /* FIXME: for debug, remove this line */ +#endif + return size; +} + +static inline int ext3_ext_space_inode_idx(struct inode *inode) +{ + int size; + + size = (sizeof(EXT3_I(inode)->i_data) - + sizeof(struct ext3_extent_header)) + / sizeof(struct ext3_extent_idx); +#ifdef AGRESSIVE_TEST + size = 4; /* FIXME: for debug, remove this line */ +#endif + return size; +} + +static void ext3_ext_show_path(struct inode *inode, struct ext3_ext_path *path) +{ + int k, l = path->p_depth; + + ext_debug(inode, "path:"); + for (k = 0; k <= l; k++, path++) { + if (path->p_idx) { + ext_debug(inode, " %d->%d", path->p_idx->e_block, + path->p_idx->e_leaf); + } else if (path->p_ext) { + ext_debug(inode, " %d:%d:%d", + path->p_ext->e_block, + path->p_ext->e_start, + path->p_ext->e_num); + } else + ext_debug(inode, " []"); + } + ext_debug(inode, "\n"); +} + +static void ext3_ext_show_leaf(struct inode *inode, struct ext3_ext_path *path) +{ + int depth = EXT3_I(inode)->i_depth; + struct ext3_extent_header *eh = path[depth].p_hdr; + struct ext3_extent *ex = EXT_FIRST_EXTENT(eh); + int i; + + for (i = 0; i < eh->e_num; i++, ex++) { + ext_debug(inode, "%d:%d:%d ", + ex->e_block, ex->e_start, ex->e_num); + } + ext_debug(inode, "\n"); +} + +static void ext3_ext_drop_refs(struct inode *inode, struct ext3_ext_path *path) +{ + int depth = path->p_depth; + int i; + + for (i = 0; i <= depth; i++, path++) + if (path->p_bh) { + brelse(path->p_bh); + path->p_bh = NULL; + } +} + +static int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + unsigned long bg_start; + unsigned long colour; + int depth; + + if (path) { + depth = path->p_depth; + /* try to find previous block */ + if (path[depth].p_ext) + return path[depth].p_ext->e_start + + path[depth].p_ext->e_num - 1; + + /* it looks index is empty + * try to find starting from index itself */ + if (path[depth].p_bh) + return path[depth].p_bh->b_blocknr; + } + + /* OK. use inode's group */ + bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + + le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); + colour = (current->pid % 16) * + (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour; +} + +static struct ext3_ext_path * +ext3_ext_find_extent(struct inode *inode, int block, struct ext3_ext_path *path) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_extent_header *eh = (void *) ei->i_data; + struct ext3_extent_idx *ix; + struct buffer_head *bh; + struct ext3_extent *ex; + int depth, i, k, ppos = 0, prev = 0; + + eh = (struct ext3_extent_header *) ei->i_data; + + /* initialize capacity of leaf in inode for first time */ + if (eh->e_max == 0) + eh->e_max = ext3_ext_space_inode(inode); + i = depth = ei->i_depth; + EXT_ASSERT(i == 0 || eh->e_num > 0); + + /* account possible depth increase */ + if (!path) { + path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), + GFP_NOFS); + if (!path) + return ERR_PTR(-ENOMEM); + } + memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); + + /* walk through the tree */ + while (i) { + ext_debug(inode, "depth %d: num %d, max %d\n", + ppos, eh->e_num, eh->e_max); + ix = EXT_FIRST_INDEX(eh); + if (eh->e_num) { + EXT_ASSERT(prev == 0 || ix->e_block == prev); + path[ppos].p_idx = ix; + } + EXT_ASSERT(eh->e_num <= eh->e_max); + for (k = 0; k < eh->e_num; k++, ix++) { + ext_debug(inode, "index: %d -> %d\n", + ix->e_block, ix->e_leaf); + EXT_ASSERT((k == 0 && prev <= (int)ix->e_block) || + (k > 0 && prev < (int)ix->e_block)); + if (block < ix->e_block) + break; + prev = ix->e_block; + path[ppos].p_idx = ix; + } + path[ppos].p_block = path[ppos].p_idx->e_leaf; + path[ppos].p_depth = i; + path[ppos].p_hdr = eh; + path[ppos].p_ext = NULL; + + bh = sb_bread(inode->i_sb, path[ppos].p_block); + if (!bh) { + ext3_ext_drop_refs(inode, path); + kfree(path); + return ERR_PTR(-EIO); + } + eh = (struct ext3_extent_header *) bh->b_data; + ppos++; + EXT_ASSERT(ppos <= depth); + path[ppos].p_bh = bh; + i--; + } + + path[ppos].p_depth = i; + path[ppos].p_hdr = eh; + path[ppos].p_ext = NULL; + + /* find extent */ + ex = EXT_FIRST_EXTENT(eh); + if (eh->e_num) + path[ppos].p_ext = ex; + EXT_ASSERT(eh->e_num <= eh->e_max); + for (k = 0; k < eh->e_num; k++, ex++) { + EXT_ASSERT(ex->e_num < EXT3_BLOCKS_PER_GROUP(inode->i_sb)); + EXT_ASSERT((k == 0 && prev <= (int)ex->e_block) || + (k > 0 && prev < (int)ex->e_block)); + if (block < ex->e_block) + break; + prev = ex->e_block; + path[ppos].p_ext = ex; + } + + ext3_ext_show_path(inode, path); + + return path; +} + +static void ext3_ext_check_boundary(struct inode *inode, + struct ext3_ext_path *curp, + void *addr, int len) +{ + void *end; + + if (!len) + return; + if (curp->p_bh) + end = (void *) curp->p_hdr + inode->i_sb->s_blocksize; + else + end = (void *) curp->p_hdr + sizeof(EXT3_I(inode)->i_data); + if (((unsigned long) addr) + len > (unsigned long) end) { + printk("overflow! 0x%p > 0x%p\n", addr + len, end); + BUG(); + } + if ((unsigned long) addr < (unsigned long) curp->p_hdr) { + printk("underflow! 0x%p < 0x%p\n", addr, curp->p_hdr); + BUG(); + } +} + +/* + * insert new index [logical;ptr] into the block at cupr + * it check where to insert: before curp or after curp + */ +static int ext3_ext_insert_index(handle_t *handle, struct inode *inode, + struct ext3_ext_path *curp, int logical, + int ptr) +{ + struct ext3_extent_idx *ix; + int len, err; + + if ((err = ext3_ext_get_access(handle, inode, curp))) + return err; + + EXT_ASSERT(logical != curp->p_idx->e_block); + len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; + if (logical > curp->p_idx->e_block) { + /* insert after */ + len = (len - 1) * sizeof(struct ext3_extent_idx); + len = len < 0 ? 0 : len; + ext_debug(inode, "insert new index %d after: %d. " + "move %d from 0x%p to 0x%p\n", + logical, ptr, len, + (curp->p_idx + 1), (curp->p_idx + 2)); + + ext3_ext_check_boundary(inode, curp, curp->p_idx + 2, len); + memmove(curp->p_idx + 2, curp->p_idx + 1, len); + ix = curp->p_idx + 1; + } else { + /* insert before */ + len = len * sizeof(struct ext3_extent_idx); + len = len < 0 ? 0 : len; + ext_debug(inode, "insert new index %d before: %d. " + "move %d from 0x%p to 0x%p\n", + logical, ptr, len, + curp->p_idx, (curp->p_idx + 1)); + + ext3_ext_check_boundary(inode, curp, curp->p_idx + 1, len); + memmove(curp->p_idx + 1, curp->p_idx, len); + ix = curp->p_idx; + } + + ix->e_block = logical; + ix->e_leaf = ptr; + curp->p_hdr->e_num++; + + err = ext3_ext_dirty(handle, inode, curp); + ext3_std_error(inode->i_sb, err); + + return err; +} + +/* + * routine inserts new subtree into the path, using free index entry + * at depth 'at: + * - allocates all needed blocks (new leaf and all intermediate index blocks) + * - makes decision where to split + * - moves remaining extens and index entries (right to the split point) + * into the newly allocated blocks + * - initialize subtree + */ +static int ext3_ext_split(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path, + struct ext3_extent *newext, int at) +{ + struct buffer_head *bh = NULL; + int depth = EXT3_I(inode)->i_depth; + struct ext3_extent_header *neh; + struct ext3_extent_idx *fidx; + struct ext3_extent *ex; + int i = at, k, m, a; + long newblock, oldblock, border; + int *ablocks = NULL; /* array of allocated blocks */ + int err = 0; + + /* make decision: where to split? */ + /* FIXME: now desicion is simplest: at current extent */ + + /* if current leaf will be splitted, then we should use + * border from split point */ + + if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { + border = path[depth].p_ext[1].e_block; + ext_debug(inode, "leaf will be splitted." + " next leaf starts at %d\n", + (int)border); + } else { + border = newext->e_block; + ext_debug(inode, "leaf will be added." + " next leaf starts at %d\n", + (int)border); + } + + /* + * if error occurs, then we break processing + * and turn filesystem read-only. so, index won't + * be inserted and tree will be in consistent + * state. next mount will repair buffers too + */ + + /* + * get array to track all allocated blocks + * we need this to handle errors and free blocks + * upon them + */ + ablocks = kmalloc(sizeof(long) * depth, GFP_NOFS); + if (!ablocks) + return -ENOMEM; + memset(ablocks, 0, sizeof(long) * depth); + + /* allocate all needed blocks */ + ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at); + newblock = 0; /* FIXME: something more sophisticated needed here */ + for (a = 0; newext->e_num > 0 && a < depth - at; a++) { + newblock = ablocks[a] = newext->e_start++; + newext->e_num--; + } + for (; a < depth - at; a++) { + newblock = ext3_new_block(handle, inode, + newblock + 1, 0, 0, &err); + if (newblock == 0) + goto cleanup; + ablocks[a] = newblock; + } + + /* initialize new leaf */ + newblock = ablocks[--a]; + EXT_ASSERT(newblock); + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; + } + lock_buffer(bh); + + if ((err = ext3_journal_get_create_access(handle, bh))) + goto cleanup; + + neh = (struct ext3_extent_header *) bh->b_data; + neh->e_num = 0; + neh->e_max = ext3_ext_space_block(inode); + ex = EXT_FIRST_EXTENT(neh); + + /* move remain of path[depth] to the new leaf */ + EXT_ASSERT(path[depth].p_hdr->e_num == + path[depth].p_hdr->e_max); + /* start copy from next extent */ + /* TODO: we could do it by single memmove */ + m = 0; + path[depth].p_ext++; + while (path[depth].p_ext <= + EXT_MAX_EXTENT(path[depth].p_hdr)) { + ext_debug(inode, "move %d:%d:%d in new leaf\n", + path[depth].p_ext->e_block, + path[depth].p_ext->e_start, + path[depth].p_ext->e_num); + memmove(ex++, path[depth].p_ext++, + sizeof(struct ext3_extent)); + neh->e_num++; + m++; + } + mark_buffer_uptodate(bh, 1); + unlock_buffer(bh); + + if ((err = ext3_journal_dirty_metadata(handle, bh))) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old leaf */ + if (m) { + if ((err = ext3_ext_get_access(handle, inode, path))) + goto cleanup; + path[depth].p_hdr->e_num -= m; + if ((err = ext3_ext_dirty(handle, inode, path))) + goto cleanup; + + } + + /* create intermediate indexes */ + k = depth - at - 1; + EXT_ASSERT(k >= 0); + if (k) + ext_debug(inode, + "create %d intermediate indices\n", k); + /* insert new index into current index block */ + /* current depth stored in i var */ + i = depth - 1; + while (k--) { + oldblock = newblock; + newblock = ablocks[--a]; + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; + } + lock_buffer(bh); + + if ((err = ext3_journal_get_create_access(handle, bh))) + goto cleanup; + + neh = (struct ext3_extent_header *) bh->b_data; + neh->e_num = 1; + neh->e_max = ext3_ext_space_block(inode); + fidx = EXT_FIRST_INDEX(neh); + fidx->e_block = border; + fidx->e_leaf = oldblock; + + ext_debug(inode, + "int.index at %d (block %u): %d -> %d\n", + i, (unsigned) newblock, + (int) border, + (int) oldblock); + /* copy indexes */ + m = 0; + path[i].p_idx++; + ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, + EXT_MAX_INDEX(path[i].p_hdr)); + EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == + EXT_LAST_INDEX(path[i].p_hdr)); + while (path[i].p_idx <= + EXT_MAX_INDEX(path[i].p_hdr)) { + ext_debug(inode, "%d: move %d:%d in new index\n", + i, path[i].p_idx->e_block, + path[i].p_idx->e_leaf); + memmove(++fidx, path[i].p_idx++, + sizeof(struct ext3_extent_idx)); + neh->e_num++; + m++; + } + + mark_buffer_uptodate(bh, 1); + unlock_buffer(bh); + + if ((err = ext3_journal_dirty_metadata(handle, bh))) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old index */ + if (m) { + err = ext3_ext_get_access(handle,inode,path+i); + if (err) + goto cleanup; + path[i].p_hdr->e_num -= m; + err = ext3_ext_dirty(handle, inode, path + i); + if (err) + goto cleanup; + } + + i--; + } + + /* insert new index */ + if (!err) + err = ext3_ext_insert_index(handle, inode, path + at, + border, newblock); + +cleanup: + if (bh) { + if (buffer_locked(bh)) + unlock_buffer(bh); + brelse(bh); + } + + if (err) { + /* free all allocated blocks in error case */ + for (i = 0; i < depth; i++) + if (!ablocks[i]) + continue; + ext3_free_blocks(handle, inode, ablocks[i], 1); + } + kfree(ablocks); + + return err; +} + +/* + * routine implements tree growing procedure: + * - allocates new block + * - moves top-level data (index block or leaf) into the new block + * - initialize new top-level, creating index that points to the + * just created block + */ +static int ext3_ext_grow_indepth(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path, + struct ext3_extent *newext) +{ + struct buffer_head *bh; + struct ext3_ext_path *curp = path; + struct ext3_extent_header *neh; + struct ext3_extent_idx *fidx; + int len, err = 0; + long newblock; + + /* + * use already allocated by the called block for new root block + */ + newblock = newext->e_start++; + if (newext->e_num == 0) { + /* + * FIXME: if this may happen, then we have to handle + * possible error and free allocated block + */ + printk("grow_indepth with zero blocks\n"); + newblock = ext3_new_block(handle, inode, + newblock, 0, 0, &err); + } else + newext->e_num--; + + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + ext3_std_error(inode->i_sb, err); + return err; + } + lock_buffer(bh); + + if ((err = ext3_journal_get_create_access(handle, bh))) { + unlock_buffer(bh); + goto out; + } + + /* move top-level index/leaf into new block */ + len = sizeof(struct ext3_extent_header) + + sizeof(struct ext3_extent) * curp->p_hdr->e_max; + EXT_ASSERT(len >= 0 && len < 4096); + memmove(bh->b_data, curp->p_hdr, len); + + /* set size of new block */ + neh = (struct ext3_extent_header *) bh->b_data; + neh->e_max = ext3_ext_space_block(inode); + mark_buffer_uptodate(bh, 1); + unlock_buffer(bh); + + if ((err = ext3_journal_dirty_metadata(handle, bh))) + goto out; + + /* create index in new top-level index: num,max,pointer */ + if ((err = ext3_ext_get_access(handle, inode, curp))) + goto out; + + curp->p_hdr->e_max = ext3_ext_space_inode_idx(inode); + curp->p_hdr->e_num = 1; + curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); + curp->p_idx->e_block = EXT_FIRST_EXTENT(path[0].p_hdr)->e_block; + curp->p_idx->e_leaf = newblock; + + neh = (struct ext3_extent_header *) EXT3_I(inode)->i_data; + fidx = EXT_FIRST_INDEX(neh); + ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %d\n", + neh->e_num, neh->e_max, fidx->e_block, fidx->e_leaf); + + EXT3_I(inode)->i_depth++; + err = ext3_ext_dirty(handle, inode, curp); +out: + brelse(bh); + + return err; +} + +/* + * routine finds empty index and adds new leaf. if no free index found + * then it requests in-depth growing + */ +static int ext3_ext_create_new_leaf(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path, + struct ext3_extent *newext) +{ + long newblock = newext->e_start; + struct ext3_ext_path *curp; + int depth, i, err = 0; + +repeat: + i = depth = EXT3_I(inode)->i_depth; + + /* walk up to the tree and look for free index entry */ + curp = path + depth; + while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { + i--; + curp--; + } + + /* we use already allocated block for index block + * so, subsequent data blocks should be contigoues */ + if (EXT_HAS_FREE_INDEX(curp)) { + /* if we found index with free entry, then use that + * entry: create all needed subtree and add new leaf */ + err = ext3_ext_split(handle, inode, path, newext, i); + + /* refill path */ + ext3_ext_drop_refs(inode, path); + path = ext3_ext_find_extent(inode, newext->e_block, path); + if (IS_ERR(path)) + err = PTR_ERR(path); + } else { + /* tree is full, time to grow in depth */ + err = ext3_ext_grow_indepth(handle, inode, path, newext); + + /* refill path */ + ext3_ext_drop_refs(inode, path); + path = ext3_ext_find_extent(inode, newext->e_block, path); + if (IS_ERR(path)) + err = PTR_ERR(path); + + /* + * only first (depth 0 -> 1) produces free space + * in all other cases we have to split growed tree + */ + depth = EXT3_I(inode)->i_depth; + if (path[depth].p_hdr->e_num == path[depth].p_hdr->e_max) { + /* now we need split */ + goto repeat; + } + } + + if (err) + return err; + + /* + * probably we've used some blocks from extent + * let's allocate new block for it + */ + if (newext->e_num == 0 && !err) { + newext->e_start = + ext3_new_block(handle, inode, newblock, + 0, 0, &err); + if (newext->e_start != 0) + newext->e_num = 1; + } + + return 0; +} + +/* + * returns next allocated block or 0xffffffff + * NOTE: it consider block number from index entry as + * allocated block. thus, index entries have to be consistent + * with leafs + */ +static inline unsigned ext3_ext_next_allocated_block(struct inode *inode, + struct ext3_ext_path *path) +{ + int depth; + + EXT_ASSERT(path != NULL); + depth = path->p_depth; + + if (depth == 0 && path->p_ext == NULL) + return 0xffffffff; + + /* FIXME: what if index isn't full ?! */ + while (depth >= 0) { + if (depth == path->p_depth) { + /* leaf */ + if (path[depth].p_ext != + EXT_LAST_EXTENT(path[depth].p_hdr)) + return path[depth].p_ext[1].e_block; + } else { + /* index */ + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return path[depth].p_idx[1].e_block; + } + depth--; + } + + return 0xffffffff; +} + +/* + * returns first allocated block from next leaf or 0xffffffff + */ +static unsigned ext3_ext_next_leaf_block(struct inode *inode, + struct ext3_ext_path *path) +{ + int depth; + + EXT_ASSERT(path != NULL); + depth = path->p_depth; + + /* zero-tree has no leaf blocks at all */ + if (depth == 0) + return 0xffffffff; + + /* go to index block */ + depth--; + + while (depth >= 0) { + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return path[depth].p_idx[1].e_block; + depth--; + } + + return 0xffffffff; +} + +/* + * if leaf gets modified and modified extent is first in the leaf + * then we have to correct all indexes above + * TODO: do we need to correct tree in all cases? + */ +int ext3_ext_correct_indexes(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path) +{ + int depth = EXT3_I(inode)->i_depth; + struct ext3_extent_header *eh; + struct ext3_extent *ex; + long border; + int k, err = 0; + + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + + EXT_ASSERT(ex); + EXT_ASSERT(eh); + + if (depth == 0) { + /* there is no tree at all */ + return 0; + } + + if (ex != EXT_FIRST_EXTENT(eh)) { + /* we correct tree if first leaf got modified only */ + return 0; + } + + /* + * TODO: we need correction if border is smaller then current one + */ + k = depth - 1; + border = path[depth].p_ext->e_block; + if ((err = ext3_ext_get_access(handle, inode, path + k))) + return err; + path[k].p_idx->e_block = border; + if ((err = ext3_ext_dirty(handle, inode, path + k))) + return err; + + while (k--) { + /* change all left-side indexes */ + if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) + break; + if ((err = ext3_ext_get_access(handle, inode, path + k))) + break; + path[k].p_idx->e_block = border; + if ((err = ext3_ext_dirty(handle, inode, path + k))) + break; + } + + return err; +} + +/* + * this routine tries to merge requsted extent into the existing + * extent or inserts requested extent as new one into the tree, + * creating new leaf in no-space case + */ +int ext3_ext_insert_extent(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path, + struct ext3_extent *newext) +{ + int depth, len; + struct ext3_extent_header * eh; + struct ext3_extent *ex; + struct ext3_extent *nearex; /* nearest extent */ + struct ext3_ext_path *npath = NULL; + int err; + + depth = EXT3_I(inode)->i_depth; + if ((ex = path[depth].p_ext)) { + /* try to insert block into found extent and return */ + if (ex->e_block + ex->e_num == newext->e_block && + ex->e_start + ex->e_num == newext->e_start) { +#ifdef AGRESSIVE_TEST + if (ex->e_num >= 2) + goto repeat; +#endif + if ((err = ext3_ext_get_access(handle, inode, + path + depth))) + return err; + ext_debug(inode, "append %d block to %d:%d (from %d)\n", + newext->e_num, ex->e_block, ex->e_num, + ex->e_start); + ex->e_num += newext->e_num; + err = ext3_ext_dirty(handle, inode, path + depth); + return err; + } + } + +repeat: + depth = EXT3_I(inode)->i_depth; + eh = path[depth].p_hdr; + if (eh->e_num == eh->e_max) { + /* probably next leaf has space for us? */ + int next = ext3_ext_next_leaf_block(inode, path); + if (next != 0xffffffff) { + ext_debug(inode, "next leaf block - %d\n", next); + EXT_ASSERT(!npath); + npath = ext3_ext_find_extent(inode, next, NULL); + if (IS_ERR(npath)) + return PTR_ERR(npath); + EXT_ASSERT(npath->p_depth == path->p_depth); + eh = npath[depth].p_hdr; + if (eh->e_num < eh->e_max) { + ext_debug(inode, + "next leaf has free ext(%d)\n", + eh->e_num); + path = npath; + goto repeat; + } + ext_debug(inode, "next leaf hasno free space(%d,%d)\n", + eh->e_num, eh->e_max); + } + /* + * there is no free space in found leaf + * we're gonna add new leaf in the tree + */ + err = ext3_ext_create_new_leaf(handle, inode, path, newext); + if (err) + goto cleanup; + goto repeat; + } + + nearex = path[depth].p_ext; + + if ((err = ext3_ext_get_access(handle, inode, path + depth))) + goto cleanup; + + if (!nearex) { + /* there is no extent in this leaf, create first one */ + ext_debug(inode, "first extent in the leaf: %d:%d:%d\n", + newext->e_block, newext->e_start, + newext->e_num); + path[depth].p_ext = EXT_FIRST_EXTENT(eh); + } else if (newext->e_block > nearex->e_block) { + EXT_ASSERT(newext->e_block != nearex->e_block); + len = EXT_MAX_EXTENT(eh) - nearex; + len = (len - 1) * sizeof(struct ext3_extent); + len = len < 0 ? 0 : len; + ext_debug(inode, "insert %d:%d:%d after: nearest 0x%p, " + "move %d from 0x%p to 0x%p\n", + newext->e_block, newext->e_start, newext->e_num, + nearex, len, nearex + 1, nearex + 2); + ext3_ext_check_boundary(inode, path + depth, nearex + 2, len); + memmove(nearex + 2, nearex + 1, len); + path[depth].p_ext = nearex + 1; + } else { + EXT_ASSERT(newext->e_block != nearex->e_block); + len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); + len = len < 0 ? 0 : len; + ext_debug(inode, "insert %d:%d:%d before: nearest 0x%p, " + "move %d from 0x%p to 0x%p\n", + newext->e_block, newext->e_start, newext->e_num, + nearex, len, nearex + 1, nearex + 2); + + memmove(nearex + 1, nearex, len); + path[depth].p_ext = nearex; + } + + if (!err) { + eh->e_num++; + nearex = path[depth].p_ext; + nearex->e_block = newext->e_block; + nearex->e_start = newext->e_start; + nearex->e_num = newext->e_num; + EXT_ASSERT(nearex->e_num < EXT3_BLOCKS_PER_GROUP(inode->i_sb) && + nearex->e_num > 0); + + /* time to correct all indexes above */ + err = ext3_ext_correct_indexes(handle, inode, path); + } + + err = ext3_ext_dirty(handle, inode, path + depth); + +cleanup: + if (npath) { + ext3_ext_drop_refs(inode, npath); + kfree(npath); + } + + return err; +} + +int ext3_ext_get_block(handle_t *handle, struct inode *inode, long iblock, + struct buffer_head *bh_result, int create, + int extend_disksize) +{ + struct ext3_ext_path *path; + int depth = EXT3_I(inode)->i_depth; + struct ext3_extent newex; + struct ext3_extent *ex; + int goal, newblock, err = 0; + + ext_debug(inode, "block %d requested for inode %u, bh_result 0x%p\n", + (int) iblock, (unsigned) inode->i_ino, bh_result); + bh_result->b_state &= ~(1UL << BH_New); + + down(&EXT3_I(inode)->i_ext_sem); + + /* find extent for this block */ + path = ext3_ext_find_extent(inode, iblock, NULL); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out2; + } + + if ((ex = path[depth].p_ext)) { + /* if found exent covers block, simple return it */ + if (iblock >= ex->e_block && iblock < ex->e_block + ex->e_num) { + newblock = iblock - ex->e_block + ex->e_start; + ext_debug(inode, "%d fit into %d:%d -> %d\n", + (int) iblock, ex->e_block, ex->e_num, + newblock); + goto out; + } + } + + /* + * we couldn't try to create block if create flag is zero + */ + if (!create) + goto out2; + + /* allocate new block */ + goal = ext3_ext_find_goal(inode, path); + newblock = ext3_new_block(handle, inode, goal, 0, 0, &err); + if (!newblock) + goto out2; + ext_debug(inode, "allocate new block: goal %d, found %d\n", + goal, newblock); + + /* try to insert new extent into found leaf and return */ + newex.e_block = iblock; + newex.e_start = newblock; + newex.e_num = 1; + err = ext3_ext_insert_extent(handle, inode, path, &newex); + if (err) + goto out2; + + /* previous routine could use block we allocated */ + newblock = newex.e_start; + bh_result->b_state |= (1UL << BH_New); + +out: + ext3_ext_show_leaf(inode, path); + bh_result->b_dev = inode->i_dev; + bh_result->b_blocknr = newblock; + bh_result->b_state |= (1UL << BH_Mapped); +out2: + ext3_ext_drop_refs(inode, path); + kfree(path); + up(&EXT3_I(inode)->i_ext_sem); + + return err; +} + +/* + * returns 1 if current index have to be freed (even partial) + */ +static int ext3_ext_more_to_truncate(struct inode *inode, + struct ext3_ext_path *path) +{ + EXT_ASSERT(path->p_idx); + + if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) + return 0; + + /* + * if truncate on deeper level happened it it wasn't partial + * so we have to consider current index for truncation + */ + if (path->p_hdr->e_num == path->p_block) + return 0; + + /* + * put actual number of indexes to know is this number got + * changed at the next iteration + */ + path->p_block = path->p_hdr->e_num; + + return 1; +} + +/* + * routine removes index from the index block + * it's used in truncate case only. thus all requests are for + * last index in the block only + */ +int ext3_ext_remove_index(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path) +{ + struct buffer_head *bh; + int err; + + /* free index block */ + path--; + EXT_ASSERT(path->p_hdr->e_num); + if ((err = ext3_ext_get_access(handle, inode, path))) + return err; + path->p_hdr->e_num--; + if ((err = ext3_ext_dirty(handle, inode, path))) + return err; + bh = sb_get_hash_table(inode->i_sb, path->p_idx->e_leaf); + ext3_forget(handle, 0, inode, bh, path->p_idx->e_leaf); + ext3_free_blocks(handle, inode, path->p_idx->e_leaf, 1); + + ext_debug(inode, "index is empty, remove it, free block %d\n", + path->p_idx->e_leaf); + return err; +} + +/* + * returns 1 if current extent needs to be freed (even partial) + * instead, returns 0 + */ +int ext3_ext_more_leaves_to_truncate(struct inode *inode, + struct ext3_ext_path *path) +{ + unsigned blocksize = inode->i_sb->s_blocksize; + struct ext3_extent *ex = path->p_ext; + int last_block; + + EXT_ASSERT(ex); + + /* is there leave in the current leaf? */ + if (ex < EXT_FIRST_EXTENT(path->p_hdr)) + return 0; + + last_block = (inode->i_size + blocksize-1) + >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); + + if (last_block >= ex->e_block + ex->e_num) + return 0; + + /* seems it extent have to be freed */ + return 1; +} + +handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) +{ + int err; + + if (handle->h_buffer_credits > needed) + return handle; + if (!ext3_journal_extend(handle, needed)) + return handle; + err = ext3_journal_restart(handle, needed); + + return handle; +} + +/* + * this routine calculate max number of blocks to be modified + * while freeing extent and is intended to be used in truncate path + */ +static int ext3_ext_calc_credits(struct inode *inode, + struct ext3_ext_path *path, + int num) +{ + int depth = EXT3_I(inode)->i_depth; + int needed; + + /* + * extent couldn't cross group, so we will modify + * single bitmap block and single group descriptor + */ + needed = 3; + + /* + * if this is last extent in a leaf, then we have to + * free leaf block and remove pointer from index above. + * that pointer could be last in index block, so we'll + * have to remove it too. this way we could modify/free + * the whole path + root index (inode stored) will be + * modified + */ + if (!path || (num == path->p_ext->e_num && + path->p_ext == EXT_FIRST_EXTENT(path->p_hdr))) + needed += (depth * (EXT3_ALLOC_NEEDED + 1)) + 1; + + return needed; +} + +/* + * core of the truncate procedure: + * - calculated what part of each extent in the requested leaf + * need to be freed + * - frees and forgets these blocks + * + * TODO: we could optimize and free several extents during + * single journal_restart()-journal_restart() cycle + */ +static int ext3_ext_truncate_leaf(handle_t *handle, + struct inode *inode, + struct ext3_ext_path *path, + int depth) +{ + unsigned blocksize = inode->i_sb->s_blocksize; + int last_block; + int i, err = 0, sf, num; + + ext_debug(inode, "level %d - leaf\n", depth); + if (!path->p_hdr) + path->p_hdr = + (struct ext3_extent_header *) path->p_bh->b_data; + + EXT_ASSERT(path->p_hdr->e_num <= path->p_hdr->e_max); + + last_block = (inode->i_size + blocksize-1) + >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); + path->p_ext = EXT_LAST_EXTENT(path->p_hdr); + while (ext3_ext_more_leaves_to_truncate(inode, path)) { + + /* what part of extent have to be freed? */ + sf = last_block > path->p_ext->e_block ? + last_block : path->p_ext->e_block; + + /* number of blocks from extent to be freed */ + num = path->p_ext->e_block + path->p_ext->e_num - sf; + + /* calc physical first physical block to be freed */ + sf = path->p_ext->e_start + (sf - path->p_ext->e_block); + + i = ext3_ext_calc_credits(inode, path, num); + handle = ext3_ext_journal_restart(handle, i); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ext_debug(inode, "free extent %d:%d:%d -> free %d:%d\n", + path->p_ext->e_block, path->p_ext->e_start, + path->p_ext->e_num, sf, num); + for (i = 0; i < num; i++) { + struct buffer_head *bh = + sb_get_hash_table(inode->i_sb, sf + i); + ext3_forget(handle, 0, inode, bh, sf + i); + } + ext3_free_blocks(handle, inode, sf, num); + + /* collect extents usage stats */ + spin_lock(&EXT3_SB(inode->i_sb)->s_ext_lock); + EXT3_SB(inode->i_sb)->s_ext_extents++; + EXT3_SB(inode->i_sb)->s_ext_blocks += num; + spin_unlock(&EXT3_SB(inode->i_sb)->s_ext_lock); + + /* reduce extent */ + if ((err = ext3_ext_get_access(handle, inode, path))) + return err; + path->p_ext->e_num -= num; + if (path->p_ext->e_num == 0) + path->p_hdr->e_num--; + if ((err = ext3_ext_dirty(handle, inode, path))) + return err; + + path->p_ext--; + } + + /* if this leaf is free, then we should + * remove it from index block above */ + if (path->p_hdr->e_num == 0 && depth > 0) + err = ext3_ext_remove_index(handle, inode, path); + + return err; +} + +static void ext3_ext_collect_stats(struct inode *inode) +{ + int depth; + + /* skip inodes with old good bitmap */ + if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) + return; + + /* collect on full truncate only */ + if (inode->i_size) + return; + + depth = EXT3_I(inode)->i_depth; + if (depth < EXT3_SB(inode->i_sb)->s_ext_mindepth) + EXT3_SB(inode->i_sb)->s_ext_mindepth = depth; + if (depth > EXT3_SB(inode->i_sb)->s_ext_maxdepth) + EXT3_SB(inode->i_sb)->s_ext_maxdepth = depth; + EXT3_SB(inode->i_sb)->s_ext_sum += depth; + EXT3_SB(inode->i_sb)->s_ext_count++; + +} + +void ext3_ext_truncate(struct inode * inode) +{ + struct address_space *mapping = inode->i_mapping; + struct ext3_ext_path *path; + struct page * page; + handle_t *handle; + int i, depth, err = 0; + + ext3_ext_collect_stats(inode); + + /* + * We have to lock the EOF page here, because lock_page() nests + * outside journal_start(). + */ + if ((inode->i_size & (inode->i_sb->s_blocksize - 1)) == 0) { + /* Block boundary? Nothing to do */ + page = NULL; + } else { + page = grab_cache_page(mapping, + inode->i_size >> PAGE_CACHE_SHIFT); + if (!page) + return; + } + + /* + * probably first extent we're gonna free will be last in block + */ + i = ext3_ext_calc_credits(inode, NULL, 0); + handle = ext3_journal_start(inode, i); + if (IS_ERR(handle)) { + if (page) { + clear_highpage(page); + flush_dcache_page(page); + unlock_page(page); + page_cache_release(page); + } + return; + } + + if (page) + ext3_block_truncate_page(handle, mapping, inode->i_size, page, + inode->i_sb->s_blocksize); + + down(&EXT3_I(inode)->i_ext_sem); + + /* + * TODO: optimization is possible here + * probably we need not scaning at all, + * because page truncation is enough + */ + if (ext3_orphan_add(handle, inode)) + goto out_stop; + + /* we have to know where to truncate from in crash case */ + EXT3_I(inode)->i_disksize = inode->i_size; + ext3_mark_inode_dirty(handle, inode); + + /* + * we start scanning from right side freeing all the blocks + * after i_size and walking into the deep + */ + i = 0; + depth = EXT3_I(inode)->i_depth; + path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); + if (IS_ERR(path)) { + ext3_error(inode->i_sb, "ext3_ext_truncate", + "Can't allocate path array"); + goto out_stop; + } + memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); + + path[i].p_hdr = (struct ext3_extent_header *) EXT3_I(inode)->i_data; + while (i >= 0 && err == 0) { + if (i == depth) { + /* this is leaf block */ + err = ext3_ext_truncate_leaf(handle, inode, + path + i, i); + /* root level have p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + i--; + continue; + } + + /* this is index block */ + if (!path[i].p_hdr) { + path[i].p_hdr = + (struct ext3_extent_header *) path[i].p_bh->b_data; + ext_debug(inode, "initialize header\n"); + } + + EXT_ASSERT(path[i].p_hdr->e_num <= path[i].p_hdr->e_max); + + if (!path[i].p_idx) { + /* this level hasn't touched yet */ + path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); + path[i].p_block = path[i].p_hdr->e_num + 1; + ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", + path[i].p_hdr, path[i].p_hdr->e_num); + } else { + /* we've already was here, see at next index */ + path[i].p_idx--; + } + + ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", + i, EXT_FIRST_INDEX(path[i].p_hdr), + path[i].p_idx); + if (ext3_ext_more_to_truncate(inode, path + i)) { + /* go to the next level */ + ext_debug(inode, "move to level %d (block %d)\n", i+1, + path[i].p_idx->e_leaf); + memset(path + i + 1, 0, sizeof(*path)); + path[i+1].p_bh = sb_bread(inode->i_sb, + path[i].p_idx->e_leaf); + if (!path[i+1].p_bh) { + /* should we reset i_size? */ + err = -EIO; + break; + } + i++; + } else { + /* we finish processing this index, go up */ + if (path[i].p_hdr->e_num == 0 && i > 0) { + /* index is empty, remove it + * handle must be already prepared by the + * truncate_leaf() + */ + err = ext3_ext_remove_index(handle, inode, + path + i); + } + /* root level have p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + i--; + ext_debug(inode, "return to level %d\n", i); + } + } + + /* TODO: flexible tree reduction should be here */ + if (path->p_hdr->e_num == 0) { + /* + * truncate to zero freed all the tree + * so, we need to correct i_depth + */ + EXT3_I(inode)->i_depth = 0; + path->p_hdr->e_max = 0; + ext3_mark_inode_dirty(handle, inode); + } + + kfree(path); + + /* In a multi-transaction truncate, we only make the final + * transaction synchronous */ + if (IS_SYNC(inode)) + handle->h_sync = 1; + +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext3_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext3_orphan_del(handle, inode); + + up(&EXT3_I(inode)->i_ext_sem); + ext3_journal_stop(handle, inode); +} + +/* + * this routine calculate max number of blocks we could modify + * in order to allocate new block for an inode + */ +int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + int depth = ei->i_depth + 1; + int needed; + + /* + * the worste case we're expecting is creation of the + * new root (growing in depth) with index splitting + * for splitting we have to consider depth + 1 because + * previous growing could increase it + */ + + /* + * growing in depth: + * block allocation + new root + old root + */ + needed = EXT3_ALLOC_NEEDED + 2; + + /* index split. we may need: + * allocate intermediate indexes and new leaf + * change two blocks at each level, but root + * modify root block (inode) + */ + needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; + + /* caller want to allocate num blocks */ + needed *= num; + +#ifdef CONFIG_QUOTA + /* + * FIXME: real calculation should be here + * it depends on blockmap format of qouta file + */ + needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; +#endif + + return needed; +} + +/* + * called at mount time + */ +void ext3_ext_init(struct super_block *sb) +{ + /* + * possible initialization would be here + */ + + if (test_opt(sb, EXTENTS)) + printk("EXT3-fs: file extents enabled\n"); + spin_lock_init(&EXT3_SB(sb)->s_ext_lock); +} + +/* + * called at umount time + */ +void ext3_ext_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + + /* show collected stats */ + if (sbi->s_ext_count && sbi->s_ext_extents) + printk("EXT3-fs: min depth - %d, max depth - %d, " + "ave. depth - %d, ave. blocks/extent - %d\n", + sbi->s_ext_mindepth, + sbi->s_ext_maxdepth, + sbi->s_ext_sum / sbi->s_ext_count, + sbi->s_ext_blocks / sbi->s_ext_extents); +} Index: linux-2.4.18-chaos/fs/ext3/ialloc.c =================================================================== --- linux-2.4.18-chaos.orig/fs/ext3/ialloc.c 2004-01-13 16:10:23.000000000 +0300 +++ linux-2.4.18-chaos/fs/ext3/ialloc.c 2004-01-13 16:11:00.000000000 +0300 @@ -573,6 +573,10 @@ ei->i_prealloc_count = 0; #endif ei->i_block_group = i; + if (test_opt(sb, EXTENTS)) + EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; + ei->i_depth = 0; + sema_init(&ei->i_ext_sem, 1); if (ei->i_flags & EXT3_SYNC_FL) inode->i_flags |= S_SYNC; Index: linux-2.4.18-chaos/fs/ext3/inode.c =================================================================== --- linux-2.4.18-chaos.orig/fs/ext3/inode.c 2004-01-13 16:10:23.000000000 +0300 +++ linux-2.4.18-chaos/fs/ext3/inode.c 2004-01-13 16:11:00.000000000 +0300 @@ -842,6 +842,15 @@ goto reread; } +static inline int +ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, + struct buffer_head *bh, int create, int extend_disksize) +{ + if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) + return ext3_ext_get_block(handle, inode, block, bh, create, 1); + return ext3_get_block_handle(handle, inode, block, bh, create, 1); +} + /* * The BKL is not held on entry here. */ @@ -855,7 +864,7 @@ handle = ext3_journal_current_handle(); J_ASSERT(handle != 0); } - ret = ext3_get_block_handle(handle, inode, iblock, + ret = ext3_get_block_wrap(handle, inode, iblock, bh_result, create, 1); return ret; } @@ -882,7 +891,7 @@ } } if (ret == 0) - ret = ext3_get_block_handle(handle, inode, iblock, + ret = ext3_get_block_wrap(handle, inode, iblock, bh_result, create, 0); if (ret == 0) bh_result->b_size = (1 << inode->i_blkbits); @@ -904,7 +913,7 @@ dummy.b_state = 0; dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); - *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); + *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); if (!*errp && buffer_mapped(&dummy)) { struct buffer_head *bh; bh = sb_getblk(inode->i_sb, dummy.b_blocknr); @@ -1520,7 +1529,7 @@ * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -static int ext3_block_truncate_page(handle_t *handle, +int ext3_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from, struct page *page, unsigned blocksize) { @@ -1998,6 +2007,9 @@ ext3_discard_prealloc(inode); + if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) + return ext3_ext_truncate(inode); + blocksize = inode->i_sb->s_blocksize; last_block = (inode->i_size + blocksize-1) >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); @@ -2436,6 +2448,8 @@ ei->i_prealloc_count = 0; #endif ei->i_block_group = iloc.block_group; + ei->i_depth = raw_inode->osd2.linux2.l_i_depth; + sema_init(&ei->i_ext_sem, 1); /* * NOTE! The in-memory inode i_data array is in little-endian order @@ -2556,6 +2570,7 @@ raw_inode->i_fsize = 0; } #endif + raw_inode->osd2.linux2.l_i_depth = ei->i_depth; raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); if (!S_ISREG(inode->i_mode)) { raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); @@ -2759,6 +2774,9 @@ int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; int ret; + if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) + return ext3_ext_writepage_trans_blocks(inode, bpp); + if (ext3_should_journal_data(inode)) ret = 3 * (bpp + indirects) + 2; else @@ -3082,7 +3100,7 @@ /* alloc blocks one by one */ for (i = 0; i < nblocks; i++) { - ret = ext3_get_block_handle(handle, inode, blocks[i], + ret = ext3_get_block_wrap(handle, inode, blocks[i], &bh_tmp, 1, 1); if (ret) break; @@ -3158,7 +3176,7 @@ if (blocks[i] != 0) continue; - rc = ext3_get_block_handle(handle, inode, iblock, &bh, 1, 1); + rc = ext3_get_block_wrap(handle, inode, iblock, &bh, 1, 1); if (rc) { printk(KERN_INFO "ext3_map_inode_page: error %d " "allocating block %ld\n", rc, iblock); Index: linux-2.4.18-chaos/fs/ext3/Makefile =================================================================== --- linux-2.4.18-chaos.orig/fs/ext3/Makefile 2004-01-13 16:10:23.000000000 +0300 +++ linux-2.4.18-chaos/fs/ext3/Makefile 2004-01-13 16:11:00.000000000 +0300 @@ -12,7 +12,8 @@ export-objs := ext3-exports.o obj-y := balloc.o iopen.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o xattr.o ext3-exports.o + ioctl.o namei.o super.o symlink.o xattr.o ext3-exports.o \ + extents.o obj-m := $(O_TARGET) include $(TOPDIR)/Rules.make Index: linux-2.4.18-chaos/fs/ext3/super.c =================================================================== --- linux-2.4.18-chaos.orig/fs/ext3/super.c 2004-01-13 16:10:23.000000000 +0300 +++ linux-2.4.18-chaos/fs/ext3/super.c 2004-01-13 16:11:23.000000000 +0300 @@ -622,6 +622,7 @@ J_ASSERT(sbi->s_delete_inodes == 0); + ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); if (!(sb->s_flags & MS_RDONLY)) { @@ -743,6 +744,12 @@ else #endif + if (!strcmp (this_char, "extents")) + set_opt (sbi->s_mount_opt, EXTENTS); + else + if (!strcmp (this_char, "extdebug")) + set_opt (sbi->s_mount_opt, EXTDEBUG); + else if (!strcmp (this_char, "bsddf")) clear_opt (*mount_options, MINIX_DF); else if (!strcmp (this_char, "nouid32")) { @@ -1470,6 +1477,7 @@ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": "writeback"); + ext3_ext_init(sb); return sb; Index: linux-2.4.18-chaos/include/linux/ext3_fs.h =================================================================== --- linux-2.4.18-chaos.orig/include/linux/ext3_fs.h 2004-01-13 16:10:23.000000000 +0300 +++ linux-2.4.18-chaos/include/linux/ext3_fs.h 2004-01-13 16:11:00.000000000 +0300 @@ -183,6 +183,7 @@ #define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ #define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ +#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define EXT3_FL_USER_VISIBLE 0x00005FFF /* User visible flags */ #define EXT3_FL_USER_MODIFIABLE 0x000000FF /* User modifiable flags */ @@ -243,7 +244,7 @@ struct { __u8 l_i_frag; /* Fragment number */ __u8 l_i_fsize; /* Fragment size */ - __u16 i_pad1; + __u16 l_i_depth; __u16 l_i_uid_high; /* these 2 fields */ __u16 l_i_gid_high; /* were reserved2[0] */ __u32 l_i_reserved2; @@ -324,6 +325,8 @@ #define EXT3_MOUNT_IOPEN 0x8000 /* Allow access via iopen */ #define EXT3_MOUNT_IOPEN_NOPRIV 0x10000 /* Make iopen world-readable */ #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ +#define EXT3_MOUNT_EXTENTS 0x40000 /* Extents support */ +#define EXT3_MOUNT_EXTDEBUG 0x80000 /* Extents debug */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H @@ -663,6 +666,12 @@ extern void ext3_dirty_inode(struct inode *); extern int ext3_change_inode_journal_flag(struct inode *, int); extern void ext3_truncate (struct inode *); +extern int ext3_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from, + struct page *page, unsigned blocksize); +extern int ext3_forget(handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + int blocknr); #ifdef EXT3_DELETE_THREAD extern void ext3_truncate_thread(struct inode *inode); #endif @@ -722,6 +731,13 @@ /* symlink.c */ extern struct inode_operations ext3_fast_symlink_inode_operations; +/* extents.c */ +extern int ext3_ext_writepage_trans_blocks(struct inode *, int); +extern int ext3_ext_get_block(handle_t *, struct inode *, long, + struct buffer_head *, int, int); +extern void ext3_ext_truncate(struct inode *); +extern void ext3_ext_init(struct super_block *); +extern void ext3_ext_release(struct super_block *); #endif /* __KERNEL__ */ Index: linux-2.4.18-chaos/include/linux/ext3_fs_i.h =================================================================== --- linux-2.4.18-chaos.orig/include/linux/ext3_fs_i.h 2001-11-22 22:46:19.000000000 +0300 +++ linux-2.4.18-chaos/include/linux/ext3_fs_i.h 2004-01-13 16:11:00.000000000 +0300 @@ -73,6 +73,10 @@ * by other means, so we have truncate_sem. */ struct rw_semaphore truncate_sem; + + /* extents-related data */ + struct semaphore i_ext_sem; + __u16 i_depth; }; #endif /* _LINUX_EXT3_FS_I */ Index: linux-2.4.18-chaos/include/linux/ext3_fs_sb.h =================================================================== --- linux-2.4.18-chaos.orig/include/linux/ext3_fs_sb.h 2004-01-13 16:10:21.000000000 +0300 +++ linux-2.4.18-chaos/include/linux/ext3_fs_sb.h 2004-01-13 16:11:00.000000000 +0300 @@ -84,6 +84,16 @@ wait_queue_head_t s_delete_thread_queue; wait_queue_head_t s_delete_waiter_queue; #endif + + /* extents */ + int s_ext_debug; + int s_ext_mindepth; + int s_ext_maxdepth; + int s_ext_sum; + int s_ext_count; + spinlock_t s_ext_lock; + int s_ext_extents; + int s_ext_blocks; }; #endif /* _LINUX_EXT3_FS_SB */