From 3a5d92cd580c057fcafa5e76ef98ada9677bd619 Mon Sep 17 00:00:00 2001 From: adilger Date: Fri, 25 Nov 2005 13:27:45 +0000 Subject: [PATCH] Branch b_release_1_4_6 Add patches from Bull for 2.6.12 kernel into CVS. r=pbojanic --- .../patches/ext3-extents-2.6.12.patch | 2924 ++++++++++++++++++++ .../patches/ext3-external-journal-2.6.12.patch | 148 + .../patches/ext3-mballoc2-2.6.12.patch | 2463 +++++++++++++++++ .../kernel_patches/patches/ext3-nlinks-2.6.9.patch | 4 +- ldiskfs/kernel_patches/patches/iopen-2.6.12.patch | 470 ++++ .../series/ldiskfs-2.6.12-vanilla.series | 13 + lustre/kernel_patches/patches/8kstack-2.6.12.patch | 13 + .../patches/export_symbols-2.6-rhel4.patch | 18 + .../patches/export_symbols-2.6-suse.patch | 22 + .../patches/export_symbols-2.6.12.patch | 114 + .../patches/ext3-extents-2.6.12.patch | 2924 ++++++++++++++++++++ .../patches/ext3-external-journal-2.6.12.patch | 148 + .../patches/ext3-mballoc2-2.6.12.patch | 2463 +++++++++++++++++ .../patches/ext3-nlinks-2.6.12.patch | 161 ++ .../kernel_patches/patches/ext3-nlinks-2.6.9.patch | 4 +- .../ext3-remove-cond-resched-calls-2.6.12.patch | 29 + .../patches/ext3-statfs-2.6.12.patch | 171 ++ lustre/kernel_patches/patches/iopen-2.6.12.patch | 470 ++++ .../kernel_patches/patches/iopen-misc-2.6.12.patch | 82 + .../patches/nfs-cifs-intent-2.6.12.patch | 110 + .../patches/uml-exprt-clearuser-2.6.12.patch | 11 + .../patches/vfs_intent-2.6-rhel4.patch | 34 +- .../patches/vfs_intent-2.6-suse.patch | 22 - .../kernel_patches/patches/vfs_intent-2.6.12.patch | 819 ++++++ .../patches/vfs_nointent-2.6-rhel4.patch | 64 +- .../patches/vfs_nointent-2.6.12.patch | 490 ++++ .../kernel_patches/patches/vfs_races-2.6.12.patch | 63 + lustre/kernel_patches/series/2.6.12-vanilla.series | 19 + .../series/ldiskfs-2.6.12-vanilla.series | 13 + 29 files changed, 14191 insertions(+), 95 deletions(-) create mode 100644 ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch create mode 100644 
ldiskfs/kernel_patches/patches/ext3-external-journal-2.6.12.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch create mode 100644 ldiskfs/kernel_patches/patches/iopen-2.6.12.patch create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series create mode 100644 lustre/kernel_patches/patches/8kstack-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/export_symbols-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/ext3-external-journal-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/ext3-nlinks-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/ext3-remove-cond-resched-calls-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/ext3-statfs-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/iopen-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/iopen-misc-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/nfs-cifs-intent-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/uml-exprt-clearuser-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/vfs_nointent-2.6.12.patch create mode 100644 lustre/kernel_patches/patches/vfs_races-2.6.12.patch create mode 100644 lustre/kernel_patches/series/2.6.12-vanilla.series create mode 100644 lustre/kernel_patches/series/ldiskfs-2.6.12-vanilla.series diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch new file mode 100644 index 0000000..b6d0c57 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch @@ -0,0 +1,2924 @@ +Index: linux-2.6.12-rc6/fs/ext3/extents.c +=================================================================== +--- 
linux-2.6.12-rc6.orig/fs/ext3/extents.c 2005-06-14 16:31:25.756503133 +0200
++++ linux-2.6.12-rc6/fs/ext3/extents.c 2005-06-14 16:31:25.836581257 +0200
+@@ -0,0 +1,2347 @@
++/*
++ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ */
++
++/*
++ * Extents support for EXT3
++ *
++ * TODO:
++ * - ext3_ext_walk_space() should not use ext3_ext_find_extent()
++ * - ext3_ext_calc_credits() could take 'mergable' into account
++ * - ext3*_error() should be used in some situations
++ * - find_goal() [to be tested and improved]
++ * - smart tree reduction
++ * - arch-independence
++ * common on-disk format for big/little-endian arch
++ */
++
++#include /* NOTE(review): the header names of all includes below are missing in this copy - restore them from the upstream patch */
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++/* validate an extent header: 0 if magic, eh_max and eh_entries are sane, -EIO otherwise */
++static inline int ext3_ext_check_header(struct ext3_extent_header *eh)
++{
++	if (eh->eh_magic != EXT3_EXT_MAGIC) {
++		printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n",
++			(unsigned)eh->eh_magic);
++		return -EIO;
++	}
++	if (eh->eh_max == 0) {
++		printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n",
++			(unsigned)eh->eh_max);
++		return -EIO;
++	}
++	if (eh->eh_entries > eh->eh_max) {
++		printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n",
++			(unsigned)eh->eh_entries);
++		return -EIO;
++	}
++	return 0;
++}
++/* keep 'handle' usable for 'needed' more credits: return as is, else extend, else restart */
++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
++{
++	int err;
++
++	if (handle->h_buffer_credits > needed)
++		return handle;
++	if (!ext3_journal_extend(handle, needed))
++		return handle;
++	err = ext3_journal_restart(handle, needed);
++	/* NOTE(review): err is discarded, so callers cannot see a failed restart */
++	return handle;
++}
++/* write access to the tree root through the optional ops->get_write_access hook; 0 without a hook */
++static int inline
++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree)
++{
++	if (tree->ops->get_write_access)
++		return tree->ops->get_write_access(h,tree->buffer);
++	else
++		return 0;
++}
++/* mark the tree root dirty through the optional ops->mark_buffer_dirty hook; 0 without a hook */
++static int inline
++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree)
++{
++	if (tree->ops->mark_buffer_dirty)
++		return tree->ops->mark_buffer_dirty(h,tree->buffer);
++	else
++		return 0;
++}
++
++/*
++ * get write access to the node 'path' points to (its block, or the root)
++ * could return:
++ * - EROFS
++ * - ENOMEM
++ */
++static int ext3_ext_get_access(handle_t *handle,
++				struct ext3_extents_tree *tree,
++				struct ext3_ext_path *path)
++{
++	int err;
++
++	if (path->p_bh) {
++		/* path points to block */
++		err = ext3_journal_get_write_access(handle, path->p_bh);
++	} else {
++		/* path points to leaf/index in inode body */
++		err = ext3_ext_get_access_for_root(handle, tree);
++	}
++	return err;
++}
++
++/*
++ * mark the node 'path' points to (its block, or the root) dirty; could return:
++ * - EROFS
++ * - ENOMEM
++ * - EIO
++ */
++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree,
++				struct ext3_ext_path *path)
++{
++	int err;
++	if (path->p_bh) {
++		/* path points to block */
++		err =ext3_journal_dirty_metadata(handle, path->p_bh);
++	} else {
++		/* path points to leaf/index in inode body */
++		err = ext3_ext_mark_root_dirty(handle, tree);
++	}
++	return err;
++}
++/* allocate one block for the tree: ops->new_block if set, else ext3_new_block() at a computed goal */
++static int inline
++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree,
++			struct ext3_ext_path *path, struct ext3_extent *ex,
++			int *err)
++{
++	int goal, depth, newblock;
++	struct inode *inode;
++
++	EXT_ASSERT(tree);
++	if (tree->ops->new_block)
++		return tree->ops->new_block(handle, tree, path, ex, err);
++
++	inode = tree->inode;
++	depth = EXT_DEPTH(tree);
++	if (path && depth > 0) {
++		goal = path[depth-1].p_block;
++	} else {
++		struct ext3_inode_info *ei = EXT3_I(inode);
++		unsigned long bg_start;
++		unsigned long colour;
++
++		bg_start = (ei->i_block_group *
++			EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++			le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++		colour = (current->pid % 16) *
++			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++		goal = bg_start + colour;
++	}
++
++	newblock = ext3_new_block(handle, inode, goal, err);
++	return newblock;
++}
++/* bump the generation counter kept in the root header */
++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
++{
++	struct ext3_extent_header *neh;
++	neh = EXT_ROOT_HDR(tree);
++	neh->eh_generation++;
++}
++/* number of extents that fit into a leaf block (forced to 6 under AGRESSIVE_TEST) */
++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
++{
++	int size;
++
++	size = (tree->inode->i_sb->s_blocksize -
++		sizeof(struct ext3_extent_header)) /
++		sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++	size = 6;
++#endif
++	return size;
++}
++/* number of index entries that fit into an index block (forced to 5 under AGRESSIVE_TEST) */
++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree)
++{
++	int size;
++
++	size = (tree->inode->i_sb->s_blocksize -
++		sizeof(struct ext3_extent_header)) /
++		sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++	size = 5;
++#endif
++	return size;
++}
++/* number of extents that fit into the root buffer (forced to 3 under AGRESSIVE_TEST) */
++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree)
++{
++	int size;
++
++	size = (tree->buffer_len - sizeof(struct ext3_extent_header)) /
++		sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++	size = 3;
++#endif
++	return size;
++}
++/* number of index entries that fit into the root buffer (forced to 4 under AGRESSIVE_TEST) */
++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree)
++{
++	int size;
++
++	size = (tree->buffer_len - sizeof(struct ext3_extent_header)) /
++		sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++	size = 4;
++#endif
++	return size;
++}
++/* EXT_DEBUG only: print the index entry or extent recorded at every level of 'path' */
++static void ext3_ext_show_path(struct ext3_extents_tree *tree,
++				struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++	int k, l = path->p_depth;
++
++	ext_debug(tree, "path:");
++	for (k = 0; k <= l; k++, path++) {
++		if (path->p_idx) {
++			ext_debug(tree, " %d->%d", path->p_idx->ei_block,
++				path->p_idx->ei_leaf);
++		} else if (path->p_ext) {
++			ext_debug(tree, " %d:%d:%d",
++				path->p_ext->ee_block,
++				path->p_ext->ee_len,
++				path->p_ext->ee_start);
++		} else
++			ext_debug(tree, " []");
++	}
++	ext_debug(tree, "\n");
++#endif
++}
++/* EXT_DEBUG only: print every extent of the leaf at the bottom of 'path' */
++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree,
++				struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++	int depth = EXT_DEPTH(tree);
++	struct ext3_extent_header *eh;
++	struct ext3_extent *ex;
++	int i;
++
++	if (!path)
++		return;
++
++	eh = path[depth].p_hdr;
++	ex = EXT_FIRST_EXTENT(eh);
++
++	for (i = 0; i < eh->eh_entries; i++, ex++) {
++		ext_debug(tree, "%d:%d:%d ",
++			ex->ee_block, ex->ee_len, ex->ee_start);
++	}
++	ext_debug(tree, "\n");
++#endif
++}
++/* release every buffer_head held in path[0..p_depth]; 'path' must not be NULL */
++static void ext3_ext_drop_refs(struct ext3_ext_path *path)
++{
++	int depth = path->p_depth;
++	int i;
++
++	for (i = 0; i <= depth; i++, path++) {
++		if (path->p_bh) {
++			brelse(path->p_bh);
++			path->p_bh = NULL;
++		}
++	}
++}
++
++/*
++ * binary search for closest index by given block
++ */
++static inline void
++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree,
++			struct ext3_ext_path *path, int block)
++{
++	struct ext3_extent_header *eh = path->p_hdr;
++	struct ext3_extent_idx *ix;
++	int l = 0, k, r;
++
++	EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++	EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++	EXT_ASSERT(eh->eh_entries > 0);
++
++	ext_debug(tree, "binsearch for %d(idx): ", block);
++
++	path->p_idx = ix = EXT_FIRST_INDEX(eh);
++
++	r = k = eh->eh_entries;
++	while (k > 1) {
++		k = (r - l) / 2;
++		if (block < ix[l + k].ei_block)
++			r -= k;
++		else
++			l += k;
++		ext_debug(tree, "%d:%d:%d ", k, l, r);
++	}
++
++	ix += l;
++	path->p_idx = ix;
++	ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf);
++
++	while (l++ < r) { /* linear scan over what the halving loop left undecided */
++		if (block < ix->ei_block)
++			break;
++		path->p_idx = ix++;
++	}
++	ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block,
++		path->p_idx->ei_leaf);
++
++#ifdef CHECK_BINSEARCH
++	{
++		struct ext3_extent_idx *chix;
++
++		chix = ix = EXT_FIRST_INDEX(eh);
++		for (k = 0; k < eh->eh_entries; k++, ix++) {
++			if (k != 0 && ix->ei_block <= ix[-1].ei_block) {
++				printk("k=%d, ix=0x%p, first=0x%p\n", k,
++					ix, EXT_FIRST_INDEX(eh));
++				printk("%u <= %u\n",
++					ix->ei_block,ix[-1].ei_block);
++			}
++			EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block);
++			if (block < ix->ei_block)
++				break;
++			chix = ix;
++		}
++		EXT_ASSERT(chix == path->p_idx);
++	}
++#endif
++}
++
++/*
++ * binary search for closest extent by given block
++ */
++static inline void
++ext3_ext_binsearch(struct ext3_extents_tree *tree,
++			struct ext3_ext_path *path, int block)
++{
++	struct ext3_extent_header *eh = path->p_hdr;
++	struct ext3_extent *ex;
++	int l = 0, k, r;
++
++	EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++	EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++
++	if (eh->eh_entries == 0) {
++		/*
++		 * this leaf is empty yet:
++		 * we get such a leaf in split/add case
++		 */
++		return;
++	}
++
++	ext_debug(tree, "binsearch for %d: ", block);
++
++	path->p_ext = ex = EXT_FIRST_EXTENT(eh);
++
++	r = k = eh->eh_entries;
++	while (k > 1) {
++		k = (r - l) / 2;
++		if (block < ex[l + k].ee_block)
++			r -= k;
++		else
++			l += k;
++		ext_debug(tree, "%d:%d:%d ", k, l, r);
++	}
++
++	ex += l;
++	path->p_ext = ex;
++	ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block,
++		path->p_ext->ee_start, path->p_ext->ee_len);
++
++	while (l++ < r) { /* linear scan over what the halving loop left undecided */
++		if (block < ex->ee_block)
++			break;
++		path->p_ext = ex++;
++	}
++	ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block,
++		path->p_ext->ee_start, path->p_ext->ee_len);
++
++#ifdef CHECK_BINSEARCH
++	{
++		struct ext3_extent *chex;
++
++		chex = ex = EXT_FIRST_EXTENT(eh);
++		for (k = 0; k < eh->eh_entries; k++, ex++) {
++			EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block);
++			if (block < ex->ee_block)
++				break;
++			chex = ex;
++		}
++		EXT_ASSERT(chex == path->p_ext);
++	}
++#endif
++}
++/* set up an empty depth-0 root header in the tree's buffer; always returns 0 */
++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree)
++{
++	struct ext3_extent_header *eh;
++
++	BUG_ON(tree->buffer_len == 0);
++	ext3_ext_get_access_for_root(handle, tree); /* NOTE(review): return value ignored */
++	eh = EXT_ROOT_HDR(tree);
++	eh->eh_depth = 0;
++	eh->eh_entries = 0;
++	eh->eh_magic = EXT3_EXT_MAGIC;
++	eh->eh_max = ext3_ext_space_root(tree);
++	ext3_ext_mark_root_dirty(handle, tree); /* NOTE(review): return value ignored */
++	ext3_ext_invalidate_cache(tree);
++	return 0;
++}
++/* walk root->leaf for 'block'; path may be NULL (allocated here). On failure path is released, freed and an ERR_PTR returned */
++struct ext3_ext_path *
++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block,
++			struct ext3_ext_path *path)
++{
++	struct ext3_extent_header *eh;
++	struct buffer_head *bh;
++	int depth, i, ppos = 0;
++
++	EXT_ASSERT(tree);
++	EXT_ASSERT(tree->inode);
++	EXT_ASSERT(tree->root);
++
++	eh = EXT_ROOT_HDR(tree);
++	EXT_ASSERT(eh);
++	if (ext3_ext_check_header(eh))
++		goto err;
++
++	i = depth = EXT_DEPTH(tree);
++	EXT_ASSERT(eh->eh_max);
++	EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++
++	/* account possible depth increase */
++	if (!path) {
++		path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2),
++				GFP_NOFS);
++		if (!path)
++			return ERR_PTR(-ENOMEM);
++	}
++	memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++	path[0].p_hdr = eh;
++
++	/* walk through the tree */
++	while (i) {
++		ext_debug(tree, "depth %d: num %d, max %d\n",
++			ppos, eh->eh_entries, eh->eh_max);
++		ext3_ext_binsearch_idx(tree, path + ppos, block);
++		path[ppos].p_block = path[ppos].p_idx->ei_leaf;
++		path[ppos].p_depth = i;
++		path[ppos].p_ext = NULL;
++
++		bh = sb_bread(tree->inode->i_sb, path[ppos].p_block);
++		if (!bh)
++			goto err;
++
++		eh = EXT_BLOCK_HDR(bh);
++		ppos++;
++		EXT_ASSERT(ppos <= depth);
++		path[ppos].p_bh = bh;
++		path[ppos].p_hdr = eh;
++		i--;
++
++		if (ext3_ext_check_header(eh))
++			goto err;
++	}
++
++	path[ppos].p_depth = i;
++	path[ppos].p_hdr = eh;
++	path[ppos].p_ext = NULL;
++	path[ppos].p_idx = NULL;
++
++	if (ext3_ext_check_header(eh))
++		goto err;
++
++	/* find extent */
++	ext3_ext_binsearch(tree, path + ppos, block);
++
++	ext3_ext_show_path(tree, path);
++
++	return path;
++
++err:
++	printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
++	if (path) ext3_ext_drop_refs(path); /* path is still NULL when the root header check fails */
++	kfree(path);
++	return ERR_PTR(-EIO);
++}
++
++/*
++ * insert new index [logical;ptr] into the block at curp
++ * it checks where to insert: before curp or after curp
++ */
++static int ext3_ext_insert_index(handle_t *handle,
++				struct ext3_extents_tree *tree,
++				struct ext3_ext_path *curp,
++				int logical, int ptr)
++{
++	struct ext3_extent_idx *ix;
++	int len, err;
++
++	if ((err = ext3_ext_get_access(handle, tree, curp)))
++		return err;
++
++	EXT_ASSERT(logical != curp->p_idx->ei_block);
++	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
++	if (logical > curp->p_idx->ei_block) {
++		/* insert after */
++		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
++			len = (len - 1) * sizeof(struct ext3_extent_idx);
++			len = len < 0 ? 0 : len;
++			ext_debug(tree, "insert new index %d after: %d. "
++				"move %d from 0x%p to 0x%p\n",
++				logical, ptr, len,
++				(curp->p_idx + 1), (curp->p_idx + 2));
++			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
++		}
++		ix = curp->p_idx + 1;
++	} else {
++		/* insert before */
++		len = len * sizeof(struct ext3_extent_idx);
++		len = len < 0 ? 0 : len;
++		ext_debug(tree, "insert new index %d before: %d. "
++			"move %d from 0x%p to 0x%p\n",
++			logical, ptr, len,
++			curp->p_idx, (curp->p_idx + 1));
++		memmove(curp->p_idx + 1, curp->p_idx, len);
++		ix = curp->p_idx;
++	}
++
++	ix->ei_block = logical;
++	ix->ei_leaf = ptr;
++	curp->p_hdr->eh_entries++;
++
++	EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max);
++	EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr));
++
++	err = ext3_ext_dirty(handle, tree, curp);
++	ext3_std_error(tree->inode->i_sb, err);
++
++	return err;
++}
++
++/*
++ * routine inserts new subtree into the path, using free index entry
++ * at depth 'at':
++ * - allocates all needed blocks (new leaf and all intermediate index blocks)
++ * - makes decision where to split
++ * - moves remaining extents and index entries (right to the split point)
++ * into the newly allocated blocks
++ * - initializes subtree
++ */
++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree,
++				struct ext3_ext_path *path,
++				struct ext3_extent *newext, int at)
++{
++	struct buffer_head *bh = NULL;
++	int depth = EXT_DEPTH(tree);
++	struct ext3_extent_header *neh;
++	struct ext3_extent_idx *fidx;
++	struct ext3_extent *ex;
++	int i = at, k, m, a;
++	unsigned long newblock, oldblock, border;
++	unsigned long *ablocks = NULL; /* array of allocated blocks */
++	int err = 0;
++
++	/* make decision: where to split? */
++	/* FIXME: now decision is simplest: at current extent */
++
++	/* if current leaf will be split, then we should use
++	 * border from split point */
++	EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr));
++	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
++		border = path[depth].p_ext[1].ee_block;
++		ext_debug(tree, "leaf will be splitted."
++			" next leaf starts at %d\n",
++			(int)border);
++	} else {
++		border = newext->ee_block;
++		ext_debug(tree, "leaf will be added."
++			" next leaf starts at %d\n",
++			(int)border);
++	}
++
++	/*
++	 * if error occurs, then we break processing
++	 * and turn filesystem read-only. so, index won't
++	 * be inserted and tree will be in consistent
++	 * state. next mount will repair buffers too
++	 */
++
++	/*
++	 * get array to track all allocated blocks
++	 * we need this to handle errors and free blocks
++	 * upon them
++	 */
++	ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS);
++	if (!ablocks)
++		return -ENOMEM;
++	memset(ablocks, 0, sizeof(unsigned long) * depth);
++
++	/* allocate all needed blocks */
++	ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at);
++	for (a = 0; a < depth - at; a++) {
++		newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++		if (newblock == 0)
++			goto cleanup;
++		ablocks[a] = newblock;
++	}
++
++	/* initialize new leaf */
++	newblock = ablocks[--a];
++	EXT_ASSERT(newblock);
++	bh = sb_getblk(tree->inode->i_sb, newblock);
++	if (!bh) {
++		err = -EIO;
++		goto cleanup;
++	}
++	lock_buffer(bh);
++
++	if ((err = ext3_journal_get_create_access(handle, bh)))
++		goto cleanup;
++
++	neh = EXT_BLOCK_HDR(bh);
++	neh->eh_entries = 0;
++	neh->eh_max = ext3_ext_space_block(tree);
++	neh->eh_magic = EXT3_EXT_MAGIC;
++	neh->eh_depth = 0;
++	ex = EXT_FIRST_EXTENT(neh);
++
++	/* move remain of path[depth] to the new leaf */
++	EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max);
++	/* start copy from next extent */
++	/* TODO: we could do it by single memmove */
++	m = 0;
++	path[depth].p_ext++;
++	while (path[depth].p_ext <=
++			EXT_MAX_EXTENT(path[depth].p_hdr)) {
++		ext_debug(tree, "move %d:%d:%d in new leaf %lu\n",
++			path[depth].p_ext->ee_block,
++			path[depth].p_ext->ee_start,
++			path[depth].p_ext->ee_len,
++			newblock);
++		memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent));
++		neh->eh_entries++;
++		m++;
++	}
++	set_buffer_uptodate(bh);
++	unlock_buffer(bh);
++
++	if ((err = ext3_journal_dirty_metadata(handle, bh)))
++		goto cleanup;
++	brelse(bh);
++	bh = NULL;
++
++	/* correct old leaf */
++	if (m) {
++		if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++			goto cleanup;
++		path[depth].p_hdr->eh_entries -= m;
++		if ((err = ext3_ext_dirty(handle, tree, path + depth)))
++			goto cleanup;
++
++	}
++
++	/* create intermediate indexes */
++	k = depth - at - 1;
++	EXT_ASSERT(k >= 0);
++	if (k)
++		ext_debug(tree, "create %d intermediate indices\n", k);
++	/* insert new index into current index block */
++	/* current depth stored in i var */
++	i = depth - 1;
++	while (k--) {
++		oldblock = newblock;
++		newblock = ablocks[--a];
++		bh = sb_getblk(tree->inode->i_sb, newblock);
++		if (!bh) {
++			err = -EIO;
++			goto cleanup;
++		}
++		lock_buffer(bh);
++
++		if ((err = ext3_journal_get_create_access(handle, bh)))
++			goto cleanup;
++
++		neh = EXT_BLOCK_HDR(bh);
++		neh->eh_entries = 1;
++		neh->eh_magic = EXT3_EXT_MAGIC;
++		neh->eh_max = ext3_ext_space_block_idx(tree);
++		neh->eh_depth = depth - i;
++		fidx = EXT_FIRST_INDEX(neh);
++		fidx->ei_block = border;
++		fidx->ei_leaf = oldblock;
++
++		ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n",
++			i, newblock, border, oldblock);
++		/* copy indexes */
++		m = 0;
++		path[i].p_idx++;
++
++		ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx,
++			EXT_MAX_INDEX(path[i].p_hdr));
++		EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) ==
++			EXT_LAST_INDEX(path[i].p_hdr));
++		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
++			ext_debug(tree, "%d: move %d:%d in new index %lu\n",
++				i, path[i].p_idx->ei_block,
++				path[i].p_idx->ei_leaf, newblock);
++			memmove(++fidx, path[i].p_idx++,
++				sizeof(struct ext3_extent_idx));
++			neh->eh_entries++;
++			EXT_ASSERT(neh->eh_entries <= neh->eh_max);
++			m++;
++		}
++		set_buffer_uptodate(bh);
++		unlock_buffer(bh);
++
++		if ((err = ext3_journal_dirty_metadata(handle, bh)))
++			goto cleanup;
++		brelse(bh);
++		bh = NULL;
++
++		/* correct old index */
++		if (m) {
++			err = ext3_ext_get_access(handle, tree, path + i);
++			if (err)
++				goto cleanup;
++			path[i].p_hdr->eh_entries -= m;
++			err = ext3_ext_dirty(handle, tree, path + i);
++			if (err)
++				goto cleanup;
++		}
++
++		i--;
++	}
++
++	/* insert new index */
++	if (!err)
++		err = ext3_ext_insert_index(handle, tree, path + at,
++						border, newblock);
++
++cleanup:
++	if (bh) {
++		if (buffer_locked(bh))
++			unlock_buffer(bh);
++		brelse(bh);
++	}
++
++	if (err) {
++		/* free all allocated blocks in error case */
++		for (i = 0; i < depth; i++) {
++			if (!ablocks[i])
++				continue;
++			ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
++		}
++	}
++	kfree(ablocks);
++
++	return err;
++}
++
++/*
++ * routine implements tree growing procedure:
++ * - allocates new block
++ * - moves top-level data (index block or leaf) into the new block
++ * - initializes new top-level, creating index that points to the
++ * just created block
++ */
++static int ext3_ext_grow_indepth(handle_t *handle,
++					struct ext3_extents_tree *tree,
++					struct ext3_ext_path *path,
++					struct ext3_extent *newext)
++{
++	struct ext3_ext_path *curp = path;
++	struct ext3_extent_header *neh;
++	struct ext3_extent_idx *fidx;
++	struct buffer_head *bh;
++	unsigned long newblock;
++	int err = 0;
++
++	newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++	if (newblock == 0)
++		return err;
++
++	bh = sb_getblk(tree->inode->i_sb, newblock);
++	if (!bh) {
++		err = -EIO;
++		ext3_std_error(tree->inode->i_sb, err);
++		return err;
++	}
++	lock_buffer(bh);
++
++	if ((err = ext3_journal_get_create_access(handle, bh))) {
++		unlock_buffer(bh);
++		goto out;
++	}
++
++	/* move top-level index/leaf into new block */
++	memmove(bh->b_data, curp->p_hdr, tree->buffer_len);
++
++	/* set size of new block */
++	neh = EXT_BLOCK_HDR(bh);
++	/* old root could have indexes or leaves
++	 * so calculate eh_max right way */
++	if (EXT_DEPTH(tree))
++		neh->eh_max = ext3_ext_space_block_idx(tree);
++	else
++		neh->eh_max = ext3_ext_space_block(tree);
++	neh->eh_magic = EXT3_EXT_MAGIC;
++	set_buffer_uptodate(bh);
++	unlock_buffer(bh);
++
++	if ((err = ext3_journal_dirty_metadata(handle, bh)))
++		goto out;
++
++	/* create index in new top-level index: num,max,pointer */
++	if ((err = ext3_ext_get_access(handle, tree, curp)))
++		goto out;
++
++	curp->p_hdr->eh_magic = EXT3_EXT_MAGIC;
++	curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree);
++	curp->p_hdr->eh_entries = 1;
++	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
++	/* FIXME: it works, but actually path[0] can be index */
++	curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
++	curp->p_idx->ei_leaf = newblock;
++
++	neh = EXT_ROOT_HDR(tree);
++	fidx = EXT_FIRST_INDEX(neh);
++	ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n",
++		neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf);
++
++	neh->eh_depth = path->p_depth + 1;
++	err = ext3_ext_dirty(handle, tree, curp);
++out: /* NOTE(review): the error paths that reach here do not free newblock */
++	brelse(bh);
++
++	return err;
++}
++
++/*
++ * routine finds empty index and adds new leaf. if no free index found
++ * then it requests in-depth growing
++ */
++static int ext3_ext_create_new_leaf(handle_t *handle,
++					struct ext3_extents_tree *tree,
++					struct ext3_ext_path *path,
++					struct ext3_extent *newext)
++{
++	struct ext3_ext_path *curp;
++	int depth, i, err = 0;
++
++repeat:
++	i = depth = EXT_DEPTH(tree);
++
++	/* walk up to the tree and look for free index entry */
++	curp = path + depth;
++	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
++		i--;
++		curp--;
++	}
++
++	/* we use already allocated block for index block
++	 * so, subsequent data blocks should be contiguous */
++	if (EXT_HAS_FREE_INDEX(curp)) {
++		/* if we found index with free entry, then use that
++		 * entry: create all needed subtree and add new leaf */
++		err = ext3_ext_split(handle, tree, path, newext, i);
++
++		/* refill path */
++		ext3_ext_drop_refs(path);
++		path = ext3_ext_find_extent(tree, newext->ee_block, path);
++		if (IS_ERR(path))
++			return PTR_ERR(path);
++	} else {
++		/* tree is full, time to grow in depth */
++		err = ext3_ext_grow_indepth(handle, tree, path, newext);
++
++		/* refill path */
++		ext3_ext_drop_refs(path);
++		path = ext3_ext_find_extent(tree, newext->ee_block, path);
++		if (IS_ERR(path))
++			return PTR_ERR(path); /* never index an ERR_PTR path below */
++
++		/*
++		 * only first (depth 0 -> 1) produces free space
++		 * in all other cases we have to split grown tree
++		 */
++		depth = EXT_DEPTH(tree);
++		if (!err && path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
++			/* now we need split (never retried after a failed grow) */
++			goto repeat;
++		}
++	}
++
++	if (err)
++		return err;
++
++	return 0;
++}
++
++/*
++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK
++ * NOTE: it considers block number from index entry as
++ * allocated block. thus, index entries have to be consistent
++ * with leaves
++ */
++static unsigned long
++ext3_ext_next_allocated_block(struct ext3_ext_path *path)
++{
++	int depth;
++
++	EXT_ASSERT(path != NULL);
++	depth = path->p_depth;
++
++	if (depth == 0 && path->p_ext == NULL)
++		return EXT_MAX_BLOCK;
++
++	/* FIXME: what if index isn't full ?! */
++	while (depth >= 0) {
++		if (depth == path->p_depth) {
++			/* leaf */
++			if (path[depth].p_ext !=
++					EXT_LAST_EXTENT(path[depth].p_hdr))
++				return path[depth].p_ext[1].ee_block;
++		} else {
++			/* index */
++			if (path[depth].p_idx !=
++					EXT_LAST_INDEX(path[depth].p_hdr))
++				return path[depth].p_idx[1].ei_block;
++		}
++		depth--;
++	}
++
++	return EXT_MAX_BLOCK;
++}
++
++/*
++ * returns first allocated block from next leaf or EXT_MAX_BLOCK
++ */
++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree,
++						struct ext3_ext_path *path)
++{
++	int depth;
++
++	EXT_ASSERT(path != NULL);
++	depth = path->p_depth;
++
++	/* zero-tree has no leaf blocks at all */
++	if (depth == 0)
++		return EXT_MAX_BLOCK;
++
++	/* go to index block */
++	depth--;
++
++	while (depth >= 0) {
++		if (path[depth].p_idx !=
++				EXT_LAST_INDEX(path[depth].p_hdr))
++			return path[depth].p_idx[1].ei_block;
++		depth--;
++	}
++
++	return EXT_MAX_BLOCK;
++}
++
++/*
++ * if leaf gets modified and modified extent is first in the leaf
++ * then we have to correct all indexes above
++ * TODO: do we need to correct tree in all
cases?
++ */
++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree,
++				struct ext3_ext_path *path)
++{
++	struct ext3_extent_header *eh;
++	int depth = EXT_DEPTH(tree);
++	struct ext3_extent *ex;
++	unsigned long border;
++	int k, err = 0;
++
++	eh = path[depth].p_hdr;
++	ex = path[depth].p_ext;
++	EXT_ASSERT(ex);
++	EXT_ASSERT(eh);
++
++	if (depth == 0) {
++		/* there is no tree at all */
++		return 0;
++	}
++
++	if (ex != EXT_FIRST_EXTENT(eh)) {
++		/* we correct tree if first leaf got modified only */
++		return 0;
++	}
++
++	/*
++	 * TODO: we need correction if border is smaller than current one
++	 */
++	k = depth - 1;
++	border = path[depth].p_ext->ee_block;
++	if ((err = ext3_ext_get_access(handle, tree, path + k)))
++		return err;
++	path[k].p_idx->ei_block = border;
++	if ((err = ext3_ext_dirty(handle, tree, path + k)))
++		return err;
++
++	while (k--) {
++		/* change all left-side indexes */
++		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
++			break;
++		if ((err = ext3_ext_get_access(handle, tree, path + k)))
++			break;
++		path[k].p_idx->ei_block = border;
++		if ((err = ext3_ext_dirty(handle, tree, path + k)))
++			break;
++	}
++
++	return err;
++}
++/* 1 if ex2 starts exactly where ex1 ends and the optional ops->mergable hook agrees, else 0 */
++static int inline
++ext3_can_extents_be_merged(struct ext3_extents_tree *tree,
++				struct ext3_extent *ex1,
++				struct ext3_extent *ex2)
++{
++	if (ex1->ee_block + ex1->ee_len != ex2->ee_block)
++		return 0;
++
++#ifdef AGRESSIVE_TEST
++	if (ex1->ee_len >= 4)
++		return 0;
++#endif
++
++	if (!tree->ops->mergable)
++		return 1;
++
++	return tree->ops->mergable(ex1, ex2);
++}
++
++/*
++ * this routine tries to merge requested extent into the existing
++ * extent or inserts requested extent as new one into the tree,
++ * creating new leaf in no-space case
++ */
++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree,
++				struct ext3_ext_path *path,
++				struct ext3_extent *newext)
++{
++	struct ext3_extent_header * eh;
++	struct ext3_extent *ex, *fex;
++	struct ext3_extent *nearex; /* nearest extent */
++	struct ext3_ext_path *npath = NULL;
++	int depth, len, err, next;
++
++	EXT_ASSERT(newext->ee_len > 0);
++	depth = EXT_DEPTH(tree);
++	ex = path[depth].p_ext;
++	EXT_ASSERT(path[depth].p_hdr);
++
++	/* try to insert block into found extent and return */
++	if (ex && ext3_can_extents_be_merged(tree, ex, newext)) {
++		ext_debug(tree, "append %d block to %d:%d (from %d)\n",
++			newext->ee_len, ex->ee_block, ex->ee_len,
++			ex->ee_start);
++		if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++			return err;
++		ex->ee_len += newext->ee_len;
++		eh = path[depth].p_hdr;
++		nearex = ex;
++		goto merge;
++	}
++
++repeat:
++	depth = EXT_DEPTH(tree);
++	eh = path[depth].p_hdr;
++	if (eh->eh_entries < eh->eh_max)
++		goto has_space;
++
++	/* probably next leaf has space for us? */
++	fex = EXT_LAST_EXTENT(eh);
++	next = ext3_ext_next_leaf_block(tree, path);
++	if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) {
++		ext_debug(tree, "next leaf block - %d\n", next);
++		EXT_ASSERT(!npath);
++		npath = ext3_ext_find_extent(tree, next, NULL);
++		if (IS_ERR(npath))
++			return PTR_ERR(npath);
++		EXT_ASSERT(npath->p_depth == path->p_depth);
++		eh = npath[depth].p_hdr;
++		if (eh->eh_entries < eh->eh_max) {
++			ext_debug(tree, "next leaf isnt full(%d)\n",
++				eh->eh_entries);
++			path = npath;
++			goto repeat;
++		}
++		ext_debug(tree, "next leaf hasno free space(%d,%d)\n",
++			eh->eh_entries, eh->eh_max);
++	}
++
++	/*
++	 * there is no free space in found leaf
++	 * we're gonna add new leaf in the tree
++	 */
++	err = ext3_ext_create_new_leaf(handle, tree, path, newext);
++	if (err)
++		goto cleanup;
++	depth = EXT_DEPTH(tree);
++	eh = path[depth].p_hdr;
++
++has_space:
++	nearex = path[depth].p_ext;
++
++	if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++		goto cleanup;
++
++	if (!nearex) {
++		/* there is no extent in this leaf, create first one */
++		ext_debug(tree, "first extent in the leaf: %d:%d:%d\n",
++			newext->ee_block, newext->ee_start,
++			newext->ee_len);
++		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
++	} else if (newext->ee_block > nearex->ee_block) {
++		EXT_ASSERT(newext->ee_block != nearex->ee_block);
++		if (nearex != EXT_LAST_EXTENT(eh)) {
++			len = EXT_MAX_EXTENT(eh) - nearex;
++			len = (len - 1) * sizeof(struct ext3_extent);
++			len = len < 0 ? 0 : len;
++			ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, "
++				"move %d from 0x%p to 0x%p\n",
++				newext->ee_block, newext->ee_start,
++				newext->ee_len,
++				nearex, len, nearex + 1, nearex + 2);
++			memmove(nearex + 2, nearex + 1, len);
++		}
++		path[depth].p_ext = nearex + 1;
++	} else {
++		EXT_ASSERT(newext->ee_block != nearex->ee_block);
++		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
++		len = len < 0 ? 0 : len;
++		ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, "
++			"move %d from 0x%p to 0x%p\n",
++			newext->ee_block, newext->ee_start, newext->ee_len,
++			nearex, len, nearex, nearex + 1); /* trace the pointers memmove() below really uses */
++		memmove(nearex + 1, nearex, len);
++		path[depth].p_ext = nearex;
++	}
++
++	eh->eh_entries++;
++	nearex = path[depth].p_ext;
++	nearex->ee_block = newext->ee_block;
++	nearex->ee_start = newext->ee_start;
++	nearex->ee_len = newext->ee_len;
++	/* FIXME: support for large fs */
++	nearex->ee_start_hi = 0;
++
++merge:
++	/* try to merge extents to the right */
++	while (nearex < EXT_LAST_EXTENT(eh)) {
++		if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1))
++			break;
++		/* merge with next extent! */
++		nearex->ee_len += nearex[1].ee_len;
++		if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
++			len = (EXT_LAST_EXTENT(eh) - nearex - 1) *
++				sizeof(struct ext3_extent);
++			memmove(nearex + 1, nearex + 2, len);
++		}
++		eh->eh_entries--;
++		EXT_ASSERT(eh->eh_entries > 0);
++	}
++
++	/* try to merge extents to the left */
++
++	/* time to correct all indexes above */
++	err = ext3_ext_correct_indexes(handle, tree, path);
++	if (err)
++		goto cleanup;
++
++	err = ext3_ext_dirty(handle, tree, path + depth);
++
++cleanup:
++	if (npath) {
++		ext3_ext_drop_refs(npath);
++		kfree(npath);
++	}
++	ext3_ext_tree_changed(tree);
++	ext3_ext_invalidate_cache(tree);
++	return err;
++}
++
++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block,
++			unsigned long num, ext_prepare_callback func)
++{
++	struct ext3_ext_path *path = NULL;
++	struct ext3_ext_cache cbex;
++	struct ext3_extent *ex;
++	unsigned long next, start = 0, end = 0;
++	unsigned long last = block + num;
++	int depth, exists, err = 0;
++
++	EXT_ASSERT(tree);
++	EXT_ASSERT(func);
++	EXT_ASSERT(tree->inode);
++	EXT_ASSERT(tree->root);
++
++	while (block < last && block != EXT_MAX_BLOCK) {
++		num = last - block;
++		/* find extent for this block */
++		path = ext3_ext_find_extent(tree, block, path);
++		if (IS_ERR(path)) {
++			err = PTR_ERR(path);
++			path = NULL;
++			break;
++		}
++
++		depth = EXT_DEPTH(tree);
++		EXT_ASSERT(path[depth].p_hdr);
++		ex = path[depth].p_ext;
++		next = ext3_ext_next_allocated_block(path);
++
++		exists = 0;
++		if (!ex) {
++			/* there is no extent yet, so try to allocate
++			 * all requested space */
++			start = block;
++			end = block + num;
++		} else if (ex->ee_block > block) {
++			/* need to allocate space before found extent */
++			start = block;
++			end = ex->ee_block;
++			if (block + num < end)
++				end = block + num;
++		} else if (block >= ex->ee_block + ex->ee_len) {
++			/* need to allocate space after found extent */
++			start = block;
++			end = block + num;
++			if (end >= next)
++				end = next;
++ } else if (block >= ex->ee_block) { ++ /* ++ * some part of requested space is covered ++ * by found extent ++ */ ++ start = block; ++ end = ex->ee_block + ex->ee_len; ++ if (block + num < end) ++ end = block + num; ++ exists = 1; ++ } else { ++ BUG(); ++ } ++ EXT_ASSERT(end > start); ++ ++ if (!exists) { ++ cbex.ec_block = start; ++ cbex.ec_len = end - start; ++ cbex.ec_start = 0; ++ cbex.ec_type = EXT3_EXT_CACHE_GAP; ++ } else { ++ cbex.ec_block = ex->ee_block; ++ cbex.ec_len = ex->ee_len; ++ cbex.ec_start = ex->ee_start; ++ cbex.ec_type = EXT3_EXT_CACHE_EXTENT; ++ } ++ ++ EXT_ASSERT(cbex.ec_len > 0); ++ EXT_ASSERT(path[depth].p_hdr); ++ err = func(tree, path, &cbex); ++ ext3_ext_drop_refs(path); ++ ++ if (err < 0) ++ break; ++ if (err == EXT_REPEAT) ++ continue; ++ else if (err == EXT_BREAK) { ++ err = 0; ++ break; ++ } ++ ++ if (EXT_DEPTH(tree) != depth) { ++ /* depth was changed. we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ block = cbex.ec_block + cbex.ec_len; ++ } ++ ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ return err; ++} ++ ++static inline void ++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, ++ __u32 len, __u32 start, int type) ++{ ++ EXT_ASSERT(len > 0); ++ if (tree->cex) { ++ tree->cex->ec_type = type; ++ tree->cex->ec_block = block; ++ tree->cex->ec_len = len; ++ tree->cex->ec_start = start; ++ } ++} ++ ++/* ++ * this routine calculate boundaries of the gap requested block fits into ++ * and cache this gap ++ */ ++static inline void ++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ unsigned long block) ++{ ++ int depth = EXT_DEPTH(tree); ++ unsigned long lblock, len; ++ struct ext3_extent *ex; ++ ++ if (!tree->cex) ++ return; ++ ++ ex = path[depth].p_ext; ++ if (ex == NULL) { ++ /* there is no extent yet, so gap is [0;-] */ ++ lblock = 0; ++ len = EXT_MAX_BLOCK; ++ ext_debug(tree, "cache gap(whole file):"); ++ } else if (block < 
ex->ee_block) { ++ lblock = block; ++ len = ex->ee_block - block; ++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len); ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ lblock = ex->ee_block + ex->ee_len; ++ len = ext3_ext_next_allocated_block(path); ++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) block); ++ EXT_ASSERT(len > lblock); ++ len = len - lblock; ++ } else { ++ lblock = len = 0; ++ BUG(); ++ } ++ ++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len); ++ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP); ++} ++ ++static inline int ++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, ++ struct ext3_extent *ex) ++{ ++ struct ext3_ext_cache *cex = tree->cex; ++ ++ /* is there cache storage at all? */ ++ if (!cex) ++ return EXT3_EXT_CACHE_NO; ++ ++ /* has cache valid data? */ ++ if (cex->ec_type == EXT3_EXT_CACHE_NO) ++ return EXT3_EXT_CACHE_NO; ++ ++ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || ++ cex->ec_type == EXT3_EXT_CACHE_EXTENT); ++ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { ++ ex->ee_block = cex->ec_block; ++ ex->ee_start = cex->ec_start; ++ ex->ee_len = cex->ec_len; ++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) ex->ee_start); ++ return cex->ec_type; ++ } ++ ++ /* not in cache */ ++ return EXT3_EXT_CACHE_NO; ++} ++ ++/* ++ * routine removes index from the index block ++ * it's used in truncate case only. 
thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->eh_entries); ++ if ((err = ext3_ext_get_access(handle, tree, path))) ++ return err; ++ path->p_hdr->eh_entries--; ++ if ((err = ext3_ext_dirty(handle, tree, path))) ++ return err; ++ ext_debug(tree, "index is empty, remove it, free block %d\n", ++ path->p_idx->ei_leaf); ++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ return err; ++} ++ ++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth = EXT_DEPTH(tree); ++ int needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) ++ return 1; ++ } ++ ++ /* ++ * the worste case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ depth = depth + 1; ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. 
we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ return needed; ++} ++ ++static int ++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, tex; ++ struct ext3_ext_path *npath; ++ int depth, creds, err; ++ ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); ++ EXT_ASSERT(ex->ee_block < start); ++ ++ /* calculate tail extent */ ++ tex.ee_block = end + 1; ++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); ++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; ++ ++ creds = ext3_ext_calc_credits_for_insert(tree, path); ++ handle = ext3_ext_journal_restart(handle, creds); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ /* calculate head extent. use primary extent */ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ return err; ++ ex->ee_len = start - ex->ee_block; ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ return err; ++ ++ /* FIXME: some callback to free underlying resource ++ * and correct ee_start? 
*/ ++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", ++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); ++ ++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); ++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); ++ ++ err = ext3_ext_insert_extent(handle, tree, npath, &tex); ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ ++ return err; ++} ++ ++static int ++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, *fu = NULL, *lu, *le; ++ int err = 0, correct_index = 0; ++ int depth = EXT_DEPTH(tree), credits; ++ struct ext3_extent_header *eh; ++ unsigned a, b, block, num; ++ ++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); ++ if (!path[depth].p_hdr) ++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); ++ eh = path[depth].p_hdr; ++ EXT_ASSERT(eh); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* find where to start removing */ ++ le = ex = EXT_LAST_EXTENT(eh); ++ while (ex != EXT_FIRST_EXTENT(eh)) { ++ if (ex->ee_block <= end) ++ break; ++ ex--; ++ } ++ ++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { ++ /* removal of internal part of the extent requested ++ * tail and head must be placed in different extent ++ * so, we have to insert one more extent */ ++ path[depth].p_ext = ex; ++ return ext3_ext_split_for_rm(handle, tree, path, start, end); ++ } ++ ++ lu = ex; ++ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) { ++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); ++ path[depth].p_ext = ex; ++ ++ a = ex->ee_block > start ? ex->ee_block : start; ++ b = ex->ee_block + ex->ee_len - 1 < end ? 
++ ex->ee_block + ex->ee_len - 1 : end; ++ ++ ext_debug(tree, " border %u:%u\n", a, b); ++ ++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { ++ block = 0; ++ num = 0; ++ BUG(); ++ } else if (a != ex->ee_block) { ++ /* remove tail of the extent */ ++ block = ex->ee_block; ++ num = a - block; ++ } else if (b != ex->ee_block + ex->ee_len - 1) { ++ /* remove head of the extent */ ++ block = a; ++ num = b - a; ++ } else { ++ /* remove whole extent: excelent! */ ++ block = ex->ee_block; ++ num = 0; ++ EXT_ASSERT(a == ex->ee_block && ++ b == ex->ee_block + ex->ee_len - 1); ++ } ++ ++ if (ex == EXT_FIRST_EXTENT(eh)) ++ correct_index = 1; ++ ++ credits = 1; ++ if (correct_index) ++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; ++ if (tree->ops->remove_extent_credits) ++ credits+=tree->ops->remove_extent_credits(tree,ex,a,b); ++ ++ handle = ext3_ext_journal_restart(handle, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ goto out; ++ } ++ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ if (tree->ops->remove_extent) ++ err = tree->ops->remove_extent(tree, ex, a, b); ++ if (err) ++ goto out; ++ ++ if (num == 0) { ++ /* this extent is removed entirely mark slot unused */ ++ ex->ee_start = 0; ++ eh->eh_entries--; ++ fu = ex; ++ } ++ ++ ex->ee_block = block; ++ ex->ee_len = num; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ ext_debug(tree, "new extent: %u:%u:%u\n", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ ex--; ++ } ++ ++ if (fu) { ++ /* reuse unused slots */ ++ while (lu < le) { ++ if (lu->ee_start) { ++ *fu = *lu; ++ lu->ee_start = 0; ++ fu++; ++ } ++ lu++; ++ } ++ } ++ ++ if (correct_index && eh->eh_entries) ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ ++ /* if this leaf is free, then we should ++ * remove it from index block above */ ++ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) ++ err = ext3_ext_rm_idx(handle, tree, 
path + depth);
++
++out:
++	return err;
++}
++
++
++/*
++ * scan the index entries of @hdr from the last one towards the first
++ * and return the first entry met whose ei_block is <= @block.
++ * if no entry qualifies, the first index of the block is returned
++ */
++static struct ext3_extent_idx *
++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block)
++{
++	struct ext3_extent_idx *ix;
++
++	ix = EXT_LAST_INDEX(hdr);
++	while (ix != EXT_FIRST_INDEX(hdr)) {
++		if (ix->ei_block <= block)
++			break;
++		ix--;
++	}
++	return ix;
++}
++
++/*
++ * returns 1 if current index have to be freed (even partial)
++ * returns 0 when p_idx has moved in front of the first index of the
++ * block, or when the number of entries still equals the value saved
++ * in p_block
++ */
++static int inline
++ext3_ext_more_to_rm(struct ext3_ext_path *path)
++{
++	EXT_ASSERT(path->p_idx);
++
++	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
++		return 0;
++
++	/*
++	 * ext3_ext_remove_space() stores eh_entries in p_block right
++	 * before it descends. if the count is unchanged on return, the
++	 * subtree below was not removed completely (partial truncate),
++	 * so there is nothing more to remove on this level
++	 */
++	if (path->p_hdr->eh_entries == path->p_block)
++		return 0;
++	return 1;
++}
++
++/*
++ * remove logical blocks [start, end] from the tree
++ *
++ * the tree is walked without recursion: path[i] describes the block
++ * being processed on level i (level 0 is the root stored in the inode).
++ * on every index level we begin at the last index covering @end and
++ * move to the left; leaves are handled by ext3_ext_rm_leaf(), index
++ * blocks left without entries are removed by ext3_ext_rm_idx().
++ *
++ * the routine starts its own journal handle and stops it before
++ * returning. returns 0 on success or negative error code
++ */
++int ext3_ext_remove_space(struct ext3_extents_tree *tree,
++				unsigned long start, unsigned long end)
++{
++	struct inode *inode = tree->inode;
++	struct super_block *sb = inode->i_sb;
++	int depth = EXT_DEPTH(tree);
++	struct ext3_ext_path *path;
++	handle_t *handle;
++	int i = 0, err = 0;
++
++	ext_debug(tree, "space to be removed: %lu:%lu\n", start, end);
++
++	/* probably first extent we're gonna free will be last in block */
++	handle = ext3_journal_start(inode, depth + 1);
++	if (IS_ERR(handle))
++		return PTR_ERR(handle);
++
++	ext3_ext_invalidate_cache(tree);
++
++	/*
++	 * we start scanning from right side freeing all the blocks
++	 * after i_size and walking into the deep
++	 */
++	path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
++	/* kmalloc() reports failure with NULL, never with ERR_PTR() */
++	if (path == NULL) {
++		ext3_error(sb, __FUNCTION__, "Can't allocate path array");
++		ext3_journal_stop(handle);
++		return -ENOMEM;
++	}
++	memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++	path[i].p_hdr = EXT_ROOT_HDR(tree);
++
++	while (i >= 0 && err == 0) {
++		if (i == depth) {
++			/* this is leaf block */
++			err = ext3_ext_rm_leaf(handle, tree, path, start, end);
++			/* root level have p_bh == NULL, brelse() eats this */
++			brelse(path[i].p_bh);
++			i--;
++			continue;
++		}
++
++		/* this is index block */
++		if (!path[i].p_hdr) {
++			ext_debug(tree, "initialize header\n");
++			path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh);
++		}
++
++		EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max);
++		EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC);
++
++		if (!path[i].p_idx) {
++			/* this level hasn't touched yet */
++			path[i].p_idx =
++				ext3_ext_last_covered(path[i].p_hdr, end);
++			/* eh_entries + 1 can't match eh_entries, so the
++			 * first check in ext3_ext_more_to_rm() descends */
++			path[i].p_block = path[i].p_hdr->eh_entries + 1;
++			ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n",
++				  path[i].p_hdr, path[i].p_hdr->eh_entries);
++		} else {
++			/* we've already was here, see at next index */
++			path[i].p_idx--;
++		}
++
++		ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n",
++			  i, EXT_FIRST_INDEX(path[i].p_hdr),
++			  path[i].p_idx);
++		if (ext3_ext_more_to_rm(path + i)) {
++			/* go to the next level */
++			ext_debug(tree, "move to level %d (block %d)\n",
++				  i + 1, path[i].p_idx->ei_leaf);
++			memset(path + i + 1, 0, sizeof(*path));
++			path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf);
++			if (!path[i+1].p_bh) {
++				/* should we reset i_size? */
++				err = -EIO;
++				break;
++			}
++			/* put actual number of indexes to know is this
++			 * number got changed at the next iteration */
++			path[i].p_block = path[i].p_hdr->eh_entries;
++			i++;
++		} else {
++			/* we finish processing this index, go up */
++			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
++				/* index is empty, remove it
++				 * handle must be already prepared by the
++				 * truncatei_leaf() */
++				err = ext3_ext_rm_idx(handle, tree, path + i);
++			}
++			/* root level have p_bh == NULL, brelse() eats this */
++			brelse(path[i].p_bh);
++			i--;
++			ext_debug(tree, "return to level %d\n", i);
++		}
++	}
++
++	/*
++	 * if the walk was stopped by an error, levels 1..i still hold
++	 * the buffers read by sb_bread() on the way down: every level
++	 * drops its buffer only at the moment we go up from it.
++	 * release them here (level 0 is the in-inode root, no buffer).
++	 * after a complete walk i is -1 and the loop does nothing
++	 */
++	for (; i > 0; i--)
++		brelse(path[i].p_bh);
++
++	/* TODO: flexible tree reduction should be here */
++	if (path->p_hdr->eh_entries == 0) {
++		/*
++		 * truncate to zero freed all the tree
++		 * so, we need to correct eh_depth
++		 */
++		err = ext3_ext_get_access(handle, tree, path);
++		if (err == 0) {
++			EXT_ROOT_HDR(tree)->eh_depth = 0;
++			EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree);
++			err = ext3_ext_dirty(handle, tree, path);
++		}
++	}
++	ext3_ext_tree_changed(tree);
++
++	kfree(path);
++	ext3_journal_stop(handle);
++
++	return err;
++}
++
++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks)
++{
++	int lcap, icap, rcap, leafs, idxs, num;
++
++	rcap = ext3_ext_space_root(tree);
++	if (blocks <= rcap) {
++		/* all extents fit to the root */
++		return 0;
++	}
++
++	rcap = ext3_ext_space_root_idx(tree);
++	lcap = ext3_ext_space_block(tree);
++	icap = ext3_ext_space_block_idx(tree);
++
++	num = leafs = (blocks + lcap - 1) / lcap;
++	if (leafs <= rcap) {
++		/* all pointers to leafs fit to the root */
++		return leafs;
++	}
++
++	/* ok.
we need separate index block(s) to link all leaf blocks */ ++ idxs = (leafs + icap - 1) / icap; ++ do { ++ num += idxs; ++ idxs = (idxs + icap - 1) / icap; ++ } while (idxs > rcap); ++ ++ return num; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) { ++ printk("EXT3-fs: file extents enabled"); ++#ifdef AGRESSIVE_TEST ++ printk(", agressive tests"); ++#endif ++#ifdef CHECK_BINSEARCH ++ printk(", check binsearch"); ++#endif ++ printk("\n"); ++ } ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++} ++ ++/************************************************************************ ++ * VFS related routines ++ ************************************************************************/ ++ ++static int ext3_get_inode_write_access(handle_t *handle, void *buffer) ++{ ++ /* we use in-core data, not bh */ ++ return 0; ++} ++ ++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct inode *inode = buffer; ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static int ext3_ext_mergable(struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ /* FIXME: support for large fs */ ++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) ++ return 1; ++ return 0; ++} ++ ++static int ++ext3_remove_blocks_credits(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed; ++ ++ /* at present, extent can't cross block group */; ++ needed = 4; /* bitmap + group desc + sb + inode */ ++ ++#ifdef CONFIG_QUOTA ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ return needed; ++} ++ ++static int ++ext3_remove_blocks(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed = ext3_remove_blocks_credits(tree, ex, from, to); ++ handle_t *handle = 
ext3_journal_start(tree->inode, needed); ++ struct buffer_head *bh; ++ int i; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { ++ /* tail removal */ ++ unsigned long num, start; ++ num = ex->ee_block + ex->ee_len - from; ++ start = ex->ee_start + ex->ee_len - num; ++ ext_debug(tree, "free last %lu blocks starting %lu\n", ++ num, start); ++ for (i = 0; i < num; i++) { ++ bh = sb_find_get_block(tree->inode->i_sb, start + i); ++ ext3_forget(handle, 0, tree->inode, bh, start + i); ++ } ++ ext3_free_blocks(handle, tree->inode, start, num); ++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { ++ printk("strange request: removal %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } else { ++ printk("strange request: removal(2) %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } ++ ext3_journal_stop(handle); ++ return 0; ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, ++ struct ext3_ext_path *path, unsigned long block) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ struct ext3_extent *ex; ++ depth = path->p_depth; ++ ++ /* try to predict block placement */ ++ if ((ex = path[depth].p_ext)) ++ return ex->ee_start + (block - ex->ee_block); ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. 
use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour + block; ++} ++ ++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int *err) ++{ ++ struct inode *inode = tree->inode; ++ int newblock, goal; ++ ++ EXT_ASSERT(path); ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->ee_start); ++ EXT_ASSERT(ex->ee_len); ++ ++ /* reuse block from the extent to order data/metadata */ ++ newblock = ex->ee_start++; ++ ex->ee_len--; ++ if (ex->ee_len == 0) { ++ ex->ee_len = 1; ++ /* allocate new block for the extent */ ++ goal = ext3_ext_find_goal(inode, path, ex->ee_block); ++ ex->ee_start = ext3_new_block(handle, inode, goal, err); ++ if (ex->ee_start == 0) { ++ /* error occured: restore old extent */ ++ ex->ee_start = newblock; ++ return 0; ++ } ++ } ++ return newblock; ++} ++ ++static struct ext3_extents_helpers ext3_blockmap_helpers = { ++ .get_write_access = ext3_get_inode_write_access, ++ .mark_buffer_dirty = ext3_mark_buffer_dirty, ++ .mergable = ext3_ext_mergable, ++ .new_block = ext3_new_block_cb, ++ .remove_extent = ext3_remove_blocks, ++ .remove_extent_credits = ext3_remove_blocks_credits, ++}; ++ ++void ext3_init_tree_desc(struct ext3_extents_tree *tree, ++ struct inode *inode) ++{ ++ tree->inode = inode; ++ tree->root = (void *) EXT3_I(inode)->i_data; ++ tree->buffer = (void *) inode; ++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); ++ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; ++ tree->ops = &ext3_blockmap_helpers; ++} ++ ++int ext3_ext_get_block(handle_t *handle, struct inode *inode, ++ long iblock, struct buffer_head *bh_result, ++ int create, int extend_disksize) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent newex; ++ struct 
ext3_extent *ex; ++ int goal, newblock, err = 0, depth; ++ struct ext3_extents_tree tree; ++ ++ clear_buffer_new(bh_result); ++ ext3_init_tree_desc(&tree, inode); ++ ext_debug(&tree, "block %d requested for inode %u\n", ++ (int) iblock, (unsigned) inode->i_ino); ++ down(&EXT3_I(inode)->truncate_sem); ++ ++ /* check in cache */ ++ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { ++ if (goal == EXT3_EXT_CACHE_GAP) { ++ if (!create) { ++ /* block isn't allocated yet and ++ * user don't want to allocate it */ ++ goto out2; ++ } ++ /* we should allocate requested block */ ++ } else if (goal == EXT3_EXT_CACHE_EXTENT) { ++ /* block is already allocated */ ++ newblock = iblock - newex.ee_block + newex.ee_start; ++ goto out; ++ } else { ++ EXT_ASSERT(0); ++ } ++ } ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(&tree, iblock, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ goto out2; ++ } ++ ++ depth = EXT_DEPTH(&tree); ++ ++ /* ++ * consistent leaf must not be empty ++ * this situations is possible, though, _during_ tree modification ++ * this is why assert can't be put in ext3_ext_find_extent() ++ */ ++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); ++ ++ if ((ex = path[depth].p_ext)) { ++ /* if found exent covers block, simple return it */ ++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { ++ newblock = iblock - ex->ee_block + ex->ee_start; ++ ext_debug(&tree, "%d fit into %d:%d -> %d\n", ++ (int) iblock, ex->ee_block, ex->ee_len, ++ newblock); ++ ext3_ext_put_in_cache(&tree, ex->ee_block, ++ ex->ee_len, ex->ee_start, ++ EXT3_EXT_CACHE_EXTENT); ++ goto out; ++ } ++ } ++ ++ /* ++ * requested block isn't allocated yet ++ * we couldn't try to create block if create flag is zero ++ */ ++ if (!create) { ++ /* put just found gap into cache to speedup subsequest reqs */ ++ ext3_ext_put_gap_in_cache(&tree, path, iblock); ++ goto out2; ++ } ++ ++ /* allocate new block */ ++ goal = 
ext3_ext_find_goal(inode, path, iblock); ++ newblock = ext3_new_block(handle, inode, goal, &err); ++ if (!newblock) ++ goto out2; ++ ext_debug(&tree, "allocate new block: goal %d, found %d\n", ++ goal, newblock); ++ ++ /* try to insert new extent into found leaf and return */ ++ newex.ee_block = iblock; ++ newex.ee_start = newblock; ++ newex.ee_len = 1; ++ err = ext3_ext_insert_extent(handle, &tree, path, &newex); ++ if (err) ++ goto out2; ++ ++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ++ /* previous routine could use block we allocated */ ++ newblock = newex.ee_start; ++ set_buffer_new(bh_result); ++ ++ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, ++ newex.ee_start, EXT3_EXT_CACHE_EXTENT); ++out: ++ ext3_ext_show_leaf(&tree, path); ++ map_bh(bh_result, inode->i_sb, newblock); ++out2: ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ up(&EXT3_I(inode)->truncate_sem); ++ ++ return err; ++} ++ ++void ext3_ext_truncate(struct inode * inode, struct page *page) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct super_block *sb = inode->i_sb; ++ struct ext3_extents_tree tree; ++ unsigned long last_block; ++ handle_t *handle; ++ int err = 0; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ err = ext3_writepage_trans_blocks(inode) + 3; ++ handle = ext3_journal_start(inode, err); ++ if (IS_ERR(handle)) { ++ if (page) { ++ clear_highpage(page); ++ flush_dcache_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ return; ++ } ++ ++ if (page) ++ ext3_block_truncate_page(handle, page, mapping, inode->i_size); ++ ++ down(&EXT3_I(inode)->truncate_sem); ++ ext3_ext_invalidate_cache(&tree); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need not scaning at all, ++ * because page truncation is enough ++ */ ++ if (ext3_orphan_add(handle, inode)) ++ 
goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ last_block = (inode->i_size + sb->s_blocksize - 1) >> ++ EXT3_BLOCK_SIZE_BITS(sb); ++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. ++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ up(&EXT3_I(inode)->truncate_sem); ++ ext3_journal_stop(handle); ++} ++ ++/* ++ * this routine calculate max number of blocks we could modify ++ * in order to allocate new block for an inode ++ */ ++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) ++{ ++ struct ext3_extents_tree tree; ++ int needed; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); ++ ++ /* caller want to allocate num blocks */ ++ needed *= num; ++ ++#ifdef CONFIG_QUOTA ++ /* ++ * FIXME: real calculation should be here ++ * it depends on blockmap format of qouta file ++ */ ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return needed; ++} ++ ++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ext3_extent_tree_init(handle, &tree); ++} ++ ++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ return ext3_ext_calc_metadata_amount(&tree, blocks); ++} ++ ++static int 
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *newex) ++{ ++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; ++ ++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ if (buf->err < 0) ++ return EXT_BREAK; ++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) ++ return EXT_BREAK; ++ ++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { ++ buf->err++; ++ buf->cur += sizeof(*newex); ++ } else { ++ buf->err = -EFAULT; ++ return EXT_BREAK; ++ } ++ return EXT_CONTINUE; ++} ++ ++static int ++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *ex) ++{ ++ struct ext3_extent_tree_stats *buf = ++ (struct ext3_extent_tree_stats *) tree->private; ++ int depth; ++ ++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ depth = EXT_DEPTH(tree); ++ buf->extents_num++; ++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) ++ buf->leaf_num++; ++ return EXT_CONTINUE; ++} ++ ++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err = 0; ++ ++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) ++ return -EINVAL; ++ ++ if (cmd == EXT3_IOC_GET_EXTENTS) { ++ struct ext3_extent_buf buf; ++ struct ext3_extents_tree tree; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ ++ ext3_init_tree_desc(&tree, inode); ++ buf.cur = buf.buffer; ++ buf.err = 0; ++ tree.private = &buf; ++ down(&EXT3_I(inode)->truncate_sem); ++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, ++ ext3_ext_store_extent_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (err == 0) ++ err = buf.err; ++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { ++ struct ext3_extent_tree_stats buf; ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ down(&EXT3_I(inode)->truncate_sem); ++ buf.depth =
EXT_DEPTH(&tree); ++ buf.extents_num = 0; ++ buf.leaf_num = 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, ++ ext3_ext_collect_stats_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (!err && copy_to_user((void *) arg, &buf, sizeof(buf))) ++ err = -EFAULT; /* copy_to_user() returns the uncopied byte count */ ++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { ++ struct ext3_extents_tree tree; ++ ext3_init_tree_desc(&tree, inode); ++ down(&EXT3_I(inode)->truncate_sem); ++ err = EXT_DEPTH(&tree); ++ up(&EXT3_I(inode)->truncate_sem); ++ } ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(ext3_init_tree_desc); ++EXPORT_SYMBOL(ext3_mark_inode_dirty); ++EXPORT_SYMBOL(ext3_ext_invalidate_cache); ++EXPORT_SYMBOL(ext3_ext_insert_extent); ++EXPORT_SYMBOL(ext3_ext_walk_space); ++EXPORT_SYMBOL(ext3_ext_find_goal); ++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); +Index: linux-2.6.12-rc6/fs/ext3/ialloc.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/ialloc.c 2005-06-14 16:31:08.634433030 +0200 ++++ linux-2.6.12-rc6/fs/ext3/ialloc.c 2005-06-14 16:31:25.846346882 +0200 +@@ -598,7 +598,7 @@ + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + +- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; ++ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); + if (S_ISLNK(mode)) + ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); + /* dirsync only applies to directories */ +@@ -639,6 +639,18 @@ + DQUOT_FREE_INODE(inode); + goto fail2; + } ++ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++ if (err) goto fail; ++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
++ } ++ } ++ + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); +Index: linux-2.6.12-rc6/fs/ext3/inode.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/inode.c 2005-06-14 16:31:09.701815830 +0200 ++++ linux-2.6.12-rc6/fs/ext3/inode.c 2005-06-14 16:31:25.861971882 +0200 +@@ -40,7 +40,7 @@ + #include "iopen.h" + #include "acl.h" + +-static int ext3_writepage_trans_blocks(struct inode *inode); ++int ext3_writepage_trans_blocks(struct inode *inode); + + /* + * Test whether an inode is a fast symlink. +@@ -784,6 +784,17 @@ + return err; + } + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create, int extend_disksize) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create, ++ extend_disksize); ++ return ext3_get_block_handle(handle, inode, block, bh, create, ++ extend_disksize); ++} ++ + static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) + { +@@ -794,8 +805,8 @@ + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, +- bh_result, create, 1); ++ ret = ext3_get_block_wrap(handle, inode, iblock, ++ bh_result, create, 1); + return ret; + } + +@@ -839,7 +850,7 @@ + + get_block: + if (ret == 0) +- ret = ext3_get_block_handle(handle, inode, iblock, ++ ret = ext3_get_block_wrap(handle, inode, iblock, + bh_result, create, 0); + bh_result->b_size = (1 << inode->i_blkbits); + return ret; +@@ -859,7 +870,7 @@ + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); ++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = 
sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1593,7 +1604,7 @@ + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, struct page *page, ++int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) + { + unsigned long index = from >> PAGE_CACHE_SHIFT; +@@ -2104,6 +2115,9 @@ + return; + } + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode, page); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { +@@ -2850,12 +2864,15 @@ + * block and work out the exact number of indirects which are touched. Pah. + */ + +-static int ext3_writepage_trans_blocks(struct inode *inode) ++int ext3_writepage_trans_blocks(struct inode *inode) + { + int bpp = ext3_journal_blocks_per_page(inode); + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +Index: linux-2.6.12-rc6/fs/ext3/Makefile +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/Makefile 2005-06-14 16:31:09.179354899 +0200 ++++ linux-2.6.12-rc6/fs/ext3/Makefile 2005-06-14 16:31:25.872714069 +0200 +@@ -5,7 +5,7 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\ +- ioctl.o namei.o super.o symlink.o hash.o resize.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-2.6.12-rc6/fs/ext3/super.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/super.c 2005-06-14 
16:31:09.950839264 +0200 ++++ linux-2.6.12-rc6/fs/ext3/super.c 2005-06-14 16:31:25.886385944 +0200 +@@ -387,6 +387,7 @@ + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -451,6 +452,8 @@ + #endif + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; ++ ++ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); + return &ei->vfs_inode; + } + +@@ -593,7 +596,7 @@ + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, +- Opt_ignore, Opt_barrier, Opt_err, Opt_resize, ++ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug, + }; + + static match_table_t tokens = { +@@ -644,6 +647,8 @@ + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, ++ {Opt_extents, "extents"}, ++ {Opt_extdebug, "extdebug"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -953,6 +958,12 @@ + case Opt_nobh: + set_opt(sbi->s_mount_opt, NOBH); + break; ++ case Opt_extents: ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_extdebug: ++ set_opt (sbi->s_mount_opt, EXTDEBUG); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1668,6 +1681,7 @@ + percpu_counter_mod(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + ++ ext3_ext_init(sb); + lock_kernel(); + return 0; + +Index: linux-2.6.12-rc6/fs/ext3/ioctl.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/ioctl.c 2005-06-14 16:31:08.646151780 +0200 ++++ linux-2.6.12-rc6/fs/ext3/ioctl.c 2005-06-14 16:31:25.897128131 +0200 +@@ -124,6 +124,10 @@ + err = ext3_change_inode_journal_flag(inode, jflag); + return err; + } ++ case EXT3_IOC_GET_EXTENTS: ++ case EXT3_IOC_GET_TREE_STATS: ++ case EXT3_IOC_GET_TREE_DEPTH: ++ return 
ext3_ext_ioctl(inode, filp, cmd, arg); + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); +Index: linux-2.6.12-rc6/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_fs.h 2005-06-14 16:31:10.185214261 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_fs.h 2005-06-14 16:31:52.859041864 +0200 +@@ -186,8 +186,9 @@ + #define EXT3_NOTAIL_FL 0x00008000 /* don't merge file tail */ + #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ + #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +-#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ ++#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + +@@ -237,6 +238,9 @@ + #endif + #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) + #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) ++#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) ++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) ++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) + + /* + * Structure of an inode on the disk +@@ -360,6 +364,8 @@ + #define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */ + #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000 /* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -548,11 +554,13 @@ + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 ++#define 
EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ + + #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ +- EXT3_FEATURE_INCOMPAT_META_BG) ++ EXT3_FEATURE_INCOMPAT_META_BG| \ ++ EXT3_FEATURE_INCOMPAT_EXTENTS) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) +@@ -759,6 +767,7 @@ + + + /* inode.c */ ++extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); + extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +@@ -828,6 +837,16 @@ + extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int, int); ++extern void ext3_ext_truncate(struct inode *, struct page *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); ++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); ++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); + + #endif /* __KERNEL__ */ + +Index: linux-2.6.12-rc6/include/linux/ext3_extents.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_extents.h 2005-06-14 16:31:25.780917195 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_extents.h 2005-06-14 16:31:25.932284381 +0200 +@@ -0,0 +1,264 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, 
info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#ifndef _LINUX_EXT3_EXTENTS ++#define _LINUX_EXT3_EXTENTS ++ ++/* ++ * with AGRESSIVE_TEST defined, capacity of index/leaf blocks ++ * becomes very small, so index split, in-depth growing and ++ * other hard changes happen much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if CHECK_BINSEARCH is defined, then results of binary search ++ * will be checked by linear search ++ */ ++#define CHECK_BINSEARCH_ ++ ++/* ++ * if EXT_DEBUG is defined you can use 'extdebug' mount option ++ * to get lots of info about what's going on ++ */ ++#define EXT_DEBUG_ ++#ifdef EXT_DEBUG ++#define ext_debug(tree,fmt,a...) \ ++do { \ ++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0); ++#else ++#define ext_debug(tree,fmt,a...) ++#endif ++ ++/* ++ * if EXT_STATS is defined then stats numbers are collected ++ * these numbers will be displayed at umount time ++ */ ++#define EXT_STATS_ ++ ++ ++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 means there is no tree yet.
all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 ee_block; /* first logical block extent covers */ ++ __u16 ee_len; /* number of blocks covered by extent */ ++ __u16 ee_start_hi; /* high 16 bits of physical block */ ++ __u32 ee_start; /* low 32 bits of physical block */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 ei_block; /* index covers logical blocks from 'block' */ ++ __u32 ei_leaf; /* pointer to the physical block of the next * ++ * level. leaf or next index could be here */ ++ __u16 ei_leaf_hi; /* high 16 bits of physical block */ ++ __u16 ei_unused; ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 eh_magic; /* probably will support different formats */ ++ __u16 eh_entries; /* number of valid entries */ ++ __u16 eh_max; /* capacity of store in entries */ ++ __u16 eh_depth; /* has tree real underlying blocks?
*/ ++ __u32 eh_generation; /* generation of the tree */ ++}; ++ ++#define EXT3_EXT_MAGIC 0xf30a ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++/* ++ * structure for external API ++ */ ++ ++/* ++ * storage for cached extent ++ */ ++struct ext3_ext_cache { ++ __u32 ec_start; ++ __u32 ec_block; ++ __u32 ec_len; ++ __u32 ec_type; ++}; ++ ++#define EXT3_EXT_CACHE_NO 0 ++#define EXT3_EXT_CACHE_GAP 1 ++#define EXT3_EXT_CACHE_EXTENT 2 ++ ++/* ++ * ext3_extents_tree is used to pass initial information ++ * to top-level extents API ++ */ ++struct ext3_extents_helpers; ++struct ext3_extents_tree { ++ struct inode *inode; /* inode which tree belongs to */ ++ void *root; /* ptr to data top of tree resides at */ ++ void *buffer; /* will be passed as arg to ^^ routines */ ++ int buffer_len; ++ void *private; ++ struct ext3_ext_cache *cex;/* last found extent */ ++ struct ext3_extents_helpers *ops; ++}; ++ ++struct ext3_extents_helpers { ++ int (*get_write_access)(handle_t *h, void *buffer); ++ int (*mark_buffer_dirty)(handle_t *h, void *buffer); ++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); ++ int (*remove_extent_credits)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*remove_extent)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*new_block)(handle_t *, struct ext3_extents_tree *, ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); ++}; ++ ++/* ++ * to be called by ext3_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext3_ext_walk_space(), see below ++ * callback must return valid 
extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, ++ struct ext3_ext_path *, ++ struct ext3_ext_cache *); ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++ ++#define EXT_MAX_BLOCK 0xffffffff ++ ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++ ++#define EXT_ROOT_HDR(tree) \ ++ ((struct ext3_extent_header *) (tree)->root) ++#define EXT_BLOCK_HDR(bh) \ ++ ((struct ext3_extent_header *) (bh)->b_data) ++#define EXT_DEPTH(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) ++#define EXT_GENERATION(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++ ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++#define EXT_CHECK_PATH(tree,path) \ ++{ \ ++ int depth = EXT_DEPTH(tree); \ ++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_idx < \ ++ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_ext < \ ++ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ ++ && depth != 0); \ ++ BUG_ON((path)[0].p_depth != depth); \ ++} ++ ++ ++/* ++ * this structure is used to 
gather extents from the tree via ioctl ++ */ ++struct ext3_extent_buf { ++ unsigned long start; ++ int buflen; ++ void *buffer; ++ void *cur; ++ int err; ++}; ++ ++/* ++ * this structure is used to collect stats info about the tree ++ */ ++struct ext3_extent_tree_stats { ++ int depth; ++ int extents_num; ++ int leaf_num; ++}; ++ ++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); ++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); ++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); ++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); ++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); ++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); ++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); ++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); ++ ++static inline void ++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) ++{ ++ if (tree->cex) ++ tree->cex->ec_type = EXT3_EXT_CACHE_NO; ++} ++ ++ ++#endif /* _LINUX_EXT3_EXTENTS */ +Index: linux-2.6.12-rc6/include/linux/ext3_fs_i.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_fs_i.h 2005-06-06 17:22:29.000000000 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_fs_i.h 2005-06-14 16:31:25.941073443 +0200 +@@ -133,6 +133,8 @@ + */ + struct semaphore truncate_sem; + struct inode vfs_inode; ++ ++ __u32 i_cached_extent[4]; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff --git a/ldiskfs/kernel_patches/patches/ext3-external-journal-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-external-journal-2.6.12.patch new file mode 100644 index 0000000..bcfdae2 --- /dev/null +++ 
b/ldiskfs/kernel_patches/patches/ext3-external-journal-2.6.12.patch @@ -0,0 +1,148 @@ +Signed-off-by: Johann Lombardi + +--- linux-2.6.12.orig/fs/ext3/super.c 2005-06-17 21:48:29.000000000 +0200 ++++ linux-2.6.12/fs/ext3/super.c 2005-11-07 13:37:30.000000000 +0100 +@@ -39,7 +39,8 @@ + #include "xattr.h" + #include "acl.h" + +-static int ext3_load_journal(struct super_block *, struct ext3_super_block *); ++static int ext3_load_journal(struct super_block *, struct ext3_super_block *, ++ unsigned long journal_devnum); + static int ext3_create_journal(struct super_block *, struct ext3_super_block *, + int); + static void ext3_commit_super (struct super_block * sb, +@@ -586,7 +587,7 @@ enum { + Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, +- Opt_commit, Opt_journal_update, Opt_journal_inum, ++ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, +@@ -624,6 +625,7 @@ static match_table_t tokens = { + {Opt_commit, "commit=%u"}, + {Opt_journal_update, "journal=update"}, + {Opt_journal_inum, "journal=%u"}, ++ {Opt_journal_dev, "journal_dev=%u"}, + {Opt_abort, "abort"}, + {Opt_data_journal, "data=journal"}, + {Opt_data_ordered, "data=ordered"}, +@@ -663,8 +665,9 @@ static unsigned long get_sb_block(void * + return sb_block; + } + +-static int parse_options (char * options, struct super_block *sb, +- unsigned long * inum, unsigned long *n_blocks_count, int is_remount) ++static int parse_options (char *options, struct super_block *sb, ++ unsigned long *inum, unsigned long *journal_devnum, ++ unsigned long *n_blocks_count, int is_remount) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); + char * p; +@@ -805,6 +808,16 @@ static int parse_options (char * 
options + return 0; + *inum = option; + break; ++ case Opt_journal_dev: ++ if (is_remount) { ++ printk(KERN_ERR "EXT3-fs: cannot specify " ++ "journal on remount\n"); ++ return 0; ++ } ++ if (match_int(&args[0], &option)) ++ return 0; ++ *journal_devnum = option; ++ break; + case Opt_noload: + set_opt (sbi->s_mount_opt, NOLOAD); + break; +@@ -1250,6 +1263,7 @@ static int ext3_fill_super (struct super + unsigned long logic_sb_block; + unsigned long offset = 0; + unsigned long journal_inum = 0; ++ unsigned long journal_devnum = 0; + unsigned long def_mount_opts; + struct inode *root; + int blocksize; +@@ -1330,7 +1344,8 @@ static int ext3_fill_super (struct super + + set_opt(sbi->s_mount_opt, RESERVATION); + +- if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0)) ++ if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, ++ NULL, 0)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | +@@ -1541,7 +1556,7 @@ static int ext3_fill_super (struct super + */ + if (!test_opt(sb, NOLOAD) && + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { +- if (ext3_load_journal(sb, es)) ++ if (ext3_load_journal(sb, es, journal_devnum)) + goto failed_mount2; + } else if (journal_inum) { + if (ext3_create_journal(sb, es, journal_inum)) +@@ -1821,15 +1836,24 @@ out_bdev: + return NULL; + } + +-static int ext3_load_journal(struct super_block * sb, +- struct ext3_super_block * es) ++static int ext3_load_journal(struct super_block *sb, ++ struct ext3_super_block *es, ++ unsigned long journal_devnum) + { + journal_t *journal; + int journal_inum = le32_to_cpu(es->s_journal_inum); +- dev_t journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); ++ dev_t journal_dev; + int err = 0; + int really_read_only; + ++ if (journal_devnum && ++ journal_devnum != le32_to_cpu(es->s_journal_dev)) { ++ printk(KERN_INFO "EXT3-fs: external journal device major/minor " ++ "numbers have changed\n"); ++ journal_dev = new_decode_dev(journal_devnum); ++ 
} else ++ journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); ++ + really_read_only = bdev_read_only(sb->s_bdev); + + /* +@@ -1888,6 +1912,16 @@ static int ext3_load_journal(struct supe + + EXT3_SB(sb)->s_journal = journal; + ext3_clear_journal_err(sb, es); ++ ++ if (journal_devnum && ++ journal_devnum != le32_to_cpu(es->s_journal_dev)) { ++ es->s_journal_dev = cpu_to_le32(journal_devnum); ++ sb->s_dirt = 1; ++ ++ /* Make sure we flush the recovery flag to disk. */ ++ ext3_commit_super(sb, es, 1); ++ } ++ + return 0; + } + +@@ -2093,13 +2127,13 @@ static int ext3_remount (struct super_bl + { + struct ext3_super_block * es; + struct ext3_sb_info *sbi = EXT3_SB(sb); +- unsigned long tmp; ++ unsigned long tmp1, tmp2; + unsigned long n_blocks_count = 0; + + /* + * Allow the "check" option to be passed as a remount option. + */ +- if (!parse_options(data, sb, &tmp, &n_blocks_count, 1)) ++ if (!parse_options(data, sb, &tmp1, &tmp2, &n_blocks_count, 1)) + return -EINVAL; + + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch new file mode 100644 index 0000000..c4c9d0b --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch @@ -0,0 +1,2463 @@ +Index: linux-2.6.12/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.6.12.orig/include/linux/ext3_fs_sb.h 2005-06-17 21:48:29.000000000 +0200 ++++ linux-2.6.12/include/linux/ext3_fs_sb.h 2005-06-21 13:59:09.186627289 +0200 +@@ -21,9 +21,29 @@ + #include + #include + #include ++#include + #endif + #include + ++#define EXT3_BB_MAX_BLOCKS 30 ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++struct ext3_buddy_group_blocks { ++ __u32 bb_bitmap; ++ __u32 bb_buddy; ++ spinlock_t bb_lock; ++ unsigned long bb_tid; ++ struct 
ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned bb_counters[]; ++}; ++ + /* + * third extended-fs super-block data in memory + */ +@@ -78,6 +98,27 @@ + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_buddy_group_blocks **s_buddy_blocks; ++ struct inode *s_buddy; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ ++ /* stats for buddy allocator */ ++ spinlock_t s_bal_lock; ++ unsigned long s_bal_reqs; /* number of reqs with len > 1 */ ++ unsigned long s_bal_success; /* we found long enough chunks */ ++ unsigned long s_bal_allocated; /* in blocks */ ++ unsigned long s_bal_ex_scanned; /* total extents scanned */ ++ unsigned long s_bal_goals; /* goal hits */ ++ unsigned long s_bal_breaks; /* too long searches */ + }; + + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-2.6.12/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.12.orig/include/linux/ext3_fs.h 2005-06-21 13:57:16.542097419 +0200 ++++ linux-2.6.12/include/linux/ext3_fs.h 2005-06-21 13:57:25.862409805 +0200 +@@ -57,6 +57,14 @@ + #define ext3_debug(f, a...) 
do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -366,6 +374,7 @@ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000 /* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -727,7 +736,7 @@ + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, +- unsigned long); ++ unsigned long, int); + extern void ext3_free_blocks_sb (handle_t *, struct super_block *, + unsigned long, unsigned long, int *); + extern unsigned long ext3_count_free_blocks (struct super_block *); +@@ -848,6 +857,44 @@ + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern long ext3_mb_aggressive; ++extern long ext3_mb_stats; ++extern long ext3_mb_max_to_scan; ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct super_block *, int); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++ ++/* writeback.c */ ++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); ++extern int ext3_wb_prepare_write(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, 
unsigned); ++extern int ext3_wb_writepage(struct page *, struct writeback_control *); ++extern int ext3_wb_invalidatepage(struct page *, unsigned long); ++extern int ext3_wb_releasepage(struct page *, int); ++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); ++extern void ext3_wb_init(struct super_block *); ++extern void ext3_wb_release(struct super_block *); ++ ++/* writeback.c */ ++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); ++extern int ext3_wb_prepare_write(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); ++extern int ext3_wb_writepage(struct page *, struct writeback_control *); ++extern int ext3_wb_invalidatepage(struct page *, unsigned long); ++extern int ext3_wb_releasepage(struct page *, int); ++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); ++extern void ext3_wb_init(struct super_block *); ++extern void ext3_wb_release(struct super_block *); ++ ++/* proc.c */ ++extern int init_ext3_proc(void); ++extern void exit_ext3_proc(void); ++ + #endif /* __KERNEL__ */ + + /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ +Index: linux-2.6.12/fs/ext3/balloc.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/balloc.c 2005-06-17 21:48:29.000000000 +0200 ++++ linux-2.6.12/fs/ext3/balloc.c 2005-06-21 13:57:25.820417618 +0200 +@@ -79,7 +79,7 @@ + * + * Return buffer_head on success or NULL in case of failure. 
+ */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -490,24 +490,6 @@ + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- unsigned long block, unsigned long count) +-{ +- struct super_block * sb; +- int dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1162,7 +1144,7 @@ + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. + */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-2.6.12/fs/ext3/extents.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/extents.c 2005-06-21 13:57:16.493269295 +0200 ++++ linux-2.6.12/fs/ext3/extents.c 2005-06-21 13:57:25.847761367 +0200 +@@ -771,7 +771,7 @@ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1428,7 +1428,7 @@ + path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + 
+@@ -1913,10 +1913,12 @@ + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1928,7 +1930,7 @@ + bh = sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-2.6.12/fs/ext3/namei.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/namei.c 2005-06-21 13:57:11.984480287 +0200 ++++ linux-2.6.12/fs/ext3/namei.c 2005-06-21 13:57:25.828230118 +0200 +@@ -1644,7 +1644,7 @@ + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
+ */ +-static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, ++int ext3_create (struct inode * dir, struct dentry * dentry, int mode, + struct nameidata *nd) + { + handle_t *handle; +Index: linux-2.6.12/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/xattr.c 2005-06-17 21:48:29.000000000 +0200 ++++ linux-2.6.12/fs/ext3/xattr.c 2005-06-21 13:57:25.854597305 +0200 +@@ -484,7 +484,7 @@ + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); +- ext3_free_blocks(handle, inode, bh->b_blocknr, 1); ++ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { +@@ -804,7 +804,7 @@ + new_bh = sb_getblk(sb, block); + if (!new_bh) { + getblk_failed: +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + error = -EIO; + goto cleanup; + } +Index: linux-2.6.12/fs/ext3/Makefile +=================================================================== +--- linux-2.6.12.orig/fs/ext3/Makefile 2005-06-21 13:57:16.514753669 +0200 ++++ linux-2.6.12/fs/ext3/Makefile 2005-06-21 13:57:25.812605118 +0200 +@@ -5,7 +5,8 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\ +- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ++ mballoc.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-2.6.12/fs/ext3/mballoc.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/mballoc.c 2005-06-21 13:57:25.736433244 +0200 ++++ linux-2.6.12/fs/ext3/mballoc.c 2005-06-21 13:57:25.795026993 +0200 +@@ -0,0 +1,1865 @@ ++/* ++ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas 
++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++ ++/* ++ * mballoc.c contains the multiblocks allocation routines ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * TODO: ++ * - bitmap/buddy read-ahead (proposed by Oleg Drokin aka green) ++ * - track min/max extents in each group for better group selection ++ * - is it worthwhile to use buddies directly if req is 2^N blocks? ++ * - mb_mark_used() may allocate chunk right after splitting buddy ++ * - special flag to advise allocator to look for requested + N blocks ++ * this may improve interaction between extents and mballoc ++ * - tree of groups sorted by number of free blocks ++ * - percpu reservation code (hotpath) ++ * - error handling ++ */ ++ ++/* ++ * with 'ext3_mb_aggressive' set the allocator runs consistency checks over ++ * structures. these checks slow things down a lot ++ */ ++long ext3_mb_aggressive = 0; ++ ++ ++/* ++ * with 'ext3_mb_stats' allocator will collect stats that will be ++ * shown at umount. The collecting costs though! ++ */ ++long ext3_mb_stats = 1; ++ ++/* ++ */ ++#define MB_DEBUG__ ++#ifdef MB_DEBUG ++#define mb_debug(fmt,a...) printk(fmt, ##a) ++#else ++#define mb_debug(fmt,a...) 
++#endif ++ ++/* ++ * where to save buddy structures between umount/mount (clean case only) ++ */ ++#define EXT3_BUDDY_FILE ".buddy" ++ ++/* ++ * How long mballoc can look for a best extent (in found extents) ++ */ ++long ext3_mb_max_to_scan = 100; ++ ++/* ++ * This structure is on-disk description of a group for mballoc ++ */ ++struct ext3_mb_group_descr { ++ __u16 mgd_first_free; /* first free block in the group */ ++ __u16 mgd_free; /* number of free blocks in the group */ ++ __u16 mgd_counters[16]; /* number of free blocks by order */ ++}; ++ ++/* ++ * This structure is header of mballoc's file ++ */ ++struct ext3_mb_grp_header { ++ __u32 mh_magic; ++}; ++ ++#define EXT3_MB_MAGIC_V1 0xbabd16fd ++ ++ ++struct ext3_free_extent { ++ __u16 fe_start; ++ __u16 fe_len; ++ __u16 fe_group; ++}; ++ ++struct ext3_allocation_context { ++ struct super_block *ac_sb; ++ ++ /* search goals */ ++ struct ext3_free_extent ac_g_ex; ++ ++ /* the best found extent */ ++ struct ext3_free_extent ac_b_ex; ++ ++ /* number of iterations done. 
we have to track to limit searching */ ++ unsigned long ac_ex_scanned; ++ __u16 ac_groups_scanned; ++ __u16 ac_found; ++ __u8 ac_status; ++ __u8 ac_flags; /* allocation hints */ ++ __u8 ac_repeats; ++}; ++ ++#define AC_STATUS_CONTINUE 1 ++#define AC_STATUS_FOUND 2 ++#define AC_STATUS_BREAK 3 ++ ++struct ext3_buddy { ++ struct buffer_head *bd_bh; ++ struct buffer_head *bd_bh2; ++ struct ext3_buddy_group_blocks *bd_bd; ++ struct super_block *bd_sb; ++ __u16 bd_blkbits; ++ __u16 bd_group; ++}; ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data) ++ ++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) ++ ++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); ++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); ++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); ++int ext3_mb_reserve_blocks(struct super_block *, int); ++void ext3_mb_release_blocks(struct super_block *, int); ++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); ++void ext3_mb_free_committed_blocks(struct super_block *); ++ ++#if BITS_PER_LONG == 64 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 7UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~7UL); \ ++} ++#elif BITS_PER_LONG == 32 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 3UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~3UL); \ ++} ++#else ++#error "how many bits you are?!" 
++#endif ++ ++static inline int mb_test_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ return ext3_test_bit(bit, addr); ++} ++ ++static inline void mb_set_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext3_set_bit(bit, addr); ++} ++ ++static inline void mb_set_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext3_set_bit_atomic(NULL, bit, addr); ++} ++ ++static inline void mb_clear_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext3_clear_bit(bit, addr); ++} ++ ++static inline void mb_clear_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext3_clear_bit_atomic(NULL, bit, addr); ++} ++ ++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) ++{ ++ int i = 1; ++ char *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(max != NULL); ++ ++ if (order > e3b->bd_blkbits + 1) { ++ *max = 0; ++ return NULL; ++ } ++ ++ /* at order 0 we see each particular block */ ++ *max = 1 << (e3b->bd_blkbits + 3); ++ if (order == 0) ++ return EXT3_MB_BITMAP(e3b); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ *max = *max >> 1; ++ while (i < order) { ++ bb += 1 << (e3b->bd_blkbits - i); ++ i++; ++ *max = *max >> 1; ++ } ++ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) < ++ e3b->bd_sb->s_blocksize); ++ return bb; ++} ++ ++static int ext3_mb_load_buddy(struct super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap); ++ J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy); ++ ++ /* load bitmap */ ++ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap); ++ if (e3b->bd_bh == NULL) { ++ ext3_error(sb, "ext3_mb_load_buddy", ++ "can't get block for buddy bitmap\n"); ++ goto out; ++ } ++ /* load buddy */ ++ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy); ++ if (e3b->bd_bh2 == NULL) { ++ ext3_error(sb, 
"ext3_mb_load_buddy", ++ "can't get block for buddy bitmap\n"); ++ goto out; ++ } ++ ++ if (!buffer_uptodate(e3b->bd_bh)) ++ ll_rw_block(READ, 1, &e3b->bd_bh); ++ if (!buffer_uptodate(e3b->bd_bh2)) ++ ll_rw_block(READ, 1, &e3b->bd_bh2); ++ ++ wait_on_buffer(e3b->bd_bh); ++ J_ASSERT(buffer_uptodate(e3b->bd_bh)); ++ wait_on_buffer(e3b->bd_bh2); ++ J_ASSERT(buffer_uptodate(e3b->bd_bh2)); ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_bd = sbi->s_buddy_blocks[group]; ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ ++ return 0; ++out: ++ brelse(e3b->bd_bh); ++ brelse(e3b->bd_bh2); ++ e3b->bd_bh = NULL; ++ e3b->bd_bh2 = NULL; ++ return -EIO; ++} ++ ++static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b) ++{ ++ mark_buffer_dirty(e3b->bd_bh); ++ mark_buffer_dirty(e3b->bd_bh2); ++} ++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ brelse(e3b->bd_bh); ++ brelse(e3b->bd_bh2); ++} ++ ++static void mb_check_buddy(struct ext3_buddy *e3b) ++{ ++ int order = e3b->bd_blkbits + 1; ++ int max, max2, i, j, k, count; ++ void *buddy, *buddy2; ++ ++ if (likely(!ext3_mb_aggressive)) ++ return; ++ ++ if (!test_opt(e3b->bd_sb, MBALLOC)) ++ return; ++ ++ while (order > 1) { ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ buddy2 = mb_find_buddy(e3b, order - 1, &max2); ++ J_ASSERT(buddy2); ++ J_ASSERT(buddy != buddy2); ++ J_ASSERT(max * 2 == max2); ++ ++ count = 0; ++ for (i = 0; i < max; i++) { ++ ++ if (mb_test_bit(i, buddy)) { ++ /* only single bit in buddy2 may be 1 */ ++ if (!mb_test_bit(i << 1, buddy2)) ++ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); ++ else if (!mb_test_bit((i << 1) + 1, buddy2)) ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ continue; ++ } ++ ++ /* both bits in buddy2 must be 0 */ ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); ++ ++ for (j = 0; j < (1 << order); j++) { ++ k = (i * (1 << order)) + j; ++ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); ++ } ++ count++; ++ } ++ 
J_ASSERT(e3b->bd_bd->bb_counters[order] == count); ++ order--; ++ } ++ ++ buddy = mb_find_buddy(e3b, 0, &max); ++ for (i = 0; i < max; i++) { ++ if (!mb_test_bit(i, buddy)) ++ continue; ++ /* check used bits only */ ++ for (j = 0; j < e3b->bd_blkbits + 1; j++) { ++ buddy2 = mb_find_buddy(e3b, j, &max2); ++ k = i >> j; ++ J_ASSERT(k < max2); ++ J_ASSERT(mb_test_bit(k, buddy2)); ++ } ++ } ++} ++ ++static inline void ++ext3_lock_group(struct super_block *sb, int group) ++{ ++ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++} ++ ++static inline void ++ext3_unlock_group(struct super_block *sb, int group) ++{ ++ spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++} ++ ++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) ++{ ++ int order = 1; ++ void *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ while (order <= e3b->bd_blkbits + 1) { ++ block = block >> 1; ++ if (!mb_test_bit(block, bb)) { ++ /* this block is part of buddy of order 'order' */ ++ return order; ++ } ++ bb += 1 << (e3b->bd_blkbits - order); ++ order++; ++ } ++ return 0; ++} ++ ++static inline void mb_clear_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0; ++ cur += 32; ++ continue; ++ } ++ mb_clear_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static inline void mb_set_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: set whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0xffffffff; ++ cur += 32; ++ continue; ++ } ++ mb_set_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) ++{ ++ int block, max, order; 
++ void *buddy, *buddy2; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_bd->bb_free += count; ++ if (first < e3b->bd_bd->bb_first_free) ++ e3b->bd_bd->bb_first_free = first; ++ ++ while (count-- > 0) { ++ block = first++; ++ order = 0; ++ ++ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); ++ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); ++ e3b->bd_bd->bb_counters[order]++; ++ ++ /* start of the buddy */ ++ buddy = mb_find_buddy(e3b, order, &max); ++ ++ do { ++ block &= ~1UL; ++ if (mb_test_bit(block, buddy) || ++ mb_test_bit(block + 1, buddy)) ++ break; ++ ++ /* both the buddies are free, try to coalesce them */ ++ buddy2 = mb_find_buddy(e3b, order + 1, &max); ++ ++ if (!buddy2) ++ break; ++ ++ if (order > 0) { ++ /* for special purposes, we don't set ++ * free bits in bitmap */ ++ mb_set_bit(block, buddy); ++ mb_set_bit(block + 1, buddy); ++ } ++ e3b->bd_bd->bb_counters[order]--; ++ e3b->bd_bd->bb_counters[order]--; ++ ++ block = block >> 1; ++ order++; ++ e3b->bd_bd->bb_counters[order]++; ++ ++ mb_clear_bit(block, buddy2); ++ buddy = buddy2; ++ } while (1); ++ } ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, ++ int needed, struct ext3_free_extent *ex) ++{ ++ int next, max, ord; ++ void *buddy; ++ ++ J_ASSERT(ex != NULL); ++ ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ J_ASSERT(block < max); ++ if (mb_test_bit(block, buddy)) { ++ ex->fe_len = 0; ++ ex->fe_start = 0; ++ ex->fe_group = 0; ++ return 0; ++ } ++ ++ if (order == 0) { ++ /* find actual order */ ++ order = mb_find_order_for_block(e3b, block); ++ block = block >> order; ++ } ++ ++ ex->fe_len = 1 << order; ++ ex->fe_start = block << order; ++ ex->fe_group = e3b->bd_group; ++ ++ while ((buddy = mb_find_buddy(e3b, order, &max))) { ++ ++ if (block + 1 >= max) ++ break; ++ ++ next = (block + 1) * (1 << order); ++ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) ++ break; ++ ++ ord = mb_find_order_for_block(e3b, next); ++ ++ 
order = ord; ++ block = next >> order; ++ ex->fe_len += 1 << order; ++ } ++ ++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); ++ return ex->fe_len; ++} ++ ++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) ++{ ++ int start = ex->fe_start; ++ int len = ex->fe_len; ++ int ord, mlen, max, cur; ++ int len0 = len; ++ void *buddy; ++ ++ e3b->bd_bd->bb_free -= len; ++ if (e3b->bd_bd->bb_first_free == start) ++ e3b->bd_bd->bb_first_free += len; ++ ++ while (len) { ++ ord = mb_find_order_for_block(e3b, start); ++ ++ if (((start >> ord) << ord) == start && len >= (1 << ord)) { ++ /* the whole chunk may be allocated at once! */ ++ mlen = 1 << ord; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ J_ASSERT((start >> ord) < max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_bd->bb_counters[ord]--; ++ start += mlen; ++ len -= mlen; ++ J_ASSERT(len >= 0); ++ continue; ++ } ++ ++ /* we have to split large buddy */ ++ J_ASSERT(ord > 0); ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_bd->bb_counters[ord]--; ++ ++ ord--; ++ cur = (start >> ord) & ~1U; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_clear_bit(cur, buddy); ++ mb_clear_bit(cur + 1, buddy); ++ e3b->bd_bd->bb_counters[ord]++; ++ e3b->bd_bd->bb_counters[ord]++; ++ } ++ ++ /* now drop all the bits in bitmap */ ++ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); ++ ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++/* ++ * Must be called under group lock! ++ */ ++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ++ mb_mark_used(e3b, &ac->ac_b_ex); ++ ac->ac_status = AC_STATUS_FOUND; ++} ++ ++/* ++ * The routine checks whether found extent is good enough. If it is, ++ * then the extent gets marked used and flag is set to the context ++ * to stop scanning. 
Otherwise, the extent is compared with the ++ * previous found extent and if new one is better, then it's stored ++ * in the context. Later, the best found extent will be used, if ++ * mballoc can't find good enough extent. ++ * ++ * FIXME: real allocation policy is to be designed yet! ++ */ ++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, ++ struct ext3_free_extent *ex, ++ struct ext3_buddy *e3b) ++{ ++ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor; ++ struct ext3_free_extent *bex = &ac->ac_b_ex; ++ int diff = ac->ac_g_ex.fe_len - ex->fe_len; ++ ++ J_ASSERT(ex->fe_len > 0); ++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ ++ ac->ac_found++; ++ ++ /* ++ * The special case - take what you catch first ++ */ ++ if (ac->ac_flags & EXT3_MB_HINT_FIRST) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Let's check whether the chunk is good enough ++ */ ++ if (ex->fe_len >= ac->ac_g_ex.fe_len) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If the request is very large, then it makes sense to use large ++ * chunks for it. Even if they don't satisfy whole request. ++ */ ++ if (ex->fe_len > 1000) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Sometimes it's worth it to take close chunk ++ */ ++ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If this is first found extent, just store it in the context ++ */ ++ if (bex->fe_len == 0) { ++ *bex = *ex; ++ return; ++ } ++ ++ /* ++ * If new found extent is better, store it in the context ++ * FIXME: possible the policy should be more complex? 
++ */ ++ if (ex->fe_len > bex->fe_len) { ++ *bex = *ex; ++ } ++ ++ /* ++ * We don't want to scan for a whole year ++ */ ++ if (ac->ac_found > ext3_mb_max_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++} ++ ++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent ex = ac->ac_b_ex; ++ int group = ex.fe_group, max, err; ++ ++ J_ASSERT(ex.fe_len > 0); ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); ++ ++ if (max > 0) ++ ext3_mb_use_best_found(ac, e3b); ++ ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ if (ac->ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(e3b); ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_free_extent ex; ++ ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max > 0) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ if (ac->ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(e3b); ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++/* ++ * The routine scans the group and measures all found extents. ++ * In order to optimize scanning, caller must pass number of ++ * free blocks in the group, so the routine can upper limit. 
++ */ ++static void ext3_mb_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ int i, free; ++ ++ free = e3b->bd_bd->bb_free; ++ J_ASSERT(free > 0); ++ ++ i = e3b->bd_bd->bb_first_free; ++ ++ while (free && ac->ac_status != AC_STATUS_FOUND) { ++ i = ext3_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ if (i >= sb->s_blocksize * 8) { ++ J_ASSERT(free == 0); ++ break; ++ } ++ ++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(free >= ex.fe_len); ++ ++ ext3_mb_measure_extent(ac, &ex, e3b); ++ ++ i += ex.fe_len; ++ free -= ex.fe_len; ++ } ++} ++ ++static int ext3_mb_good_group(struct ext3_allocation_context *ac, ++ int group, int cr) ++{ ++ int free; ++ ++ J_ASSERT(cr >= 0 && cr < 3); ++ ++ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free; ++ if (free == 0) ++ return 0; ++ ++ if (cr == 0) { ++ if (free >= ac->ac_g_ex.fe_len >> 1) ++ return 1; ++ } else if (cr == 1) { ++ if (free >= ac->ac_g_ex.fe_len >> 2) ++ return 1; ++ } else if (cr == 2) { ++ return 1; ++ } ++ return 0; ++} ++ ++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *len, int flags, int *errp) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_allocation_context ac; ++ int i, group, block, cr, err = 0; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ struct buffer_head *gdp_bh; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ ++ J_ASSERT(len != NULL); ++ J_ASSERT(*len > 0); ++ ++ sb = inode->i_sb; ++ if (!sb) { ++ printk("ext3_mb_new_nblocks: nonexistent device"); ++ return 0; ++ } ++ ++ if (!test_opt(sb, MBALLOC)) { ++ static int ext3_mballoc_warning = 0; ++ if (ext3_mballoc_warning == 0) { ++ printk(KERN_ERR "EXT3-fs: multiblock request with " ++ "mballoc disabled!\n"); ++ ext3_mballoc_warning++; ++ } ++ 
*len = 1; ++ err = ext3_new_block_old(handle, inode, goal, errp); ++ return err; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ ++ /* ++ * We can't allocate > group size ++ */ ++ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) ++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* someone asks for non-reserved blocks */ ++ BUG_ON(*len > 1); ++ err = ext3_mb_reserve_blocks(sb, 1); ++ if (err) { ++ *errp = err; ++ return 0; ++ } ++ } ++ ++ /* ++ * Check quota for allocation of this blocks. ++ */ ++ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) ++ *len -= 1; ++ if (*len == 0) { ++ *errp = -EDQUOT; ++ block = 0; ++ goto out; ++ } ++ ++ /* start searching from the goal */ ++ if (goal < le32_to_cpu(es->s_first_data_block) || ++ goal >= le32_to_cpu(es->s_blocks_count)) ++ goal = le32_to_cpu(es->s_first_data_block); ++ group = (goal - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ block = ((goal - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ /* set up allocation goals */ ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_groups_scanned = 0; ++ ac.ac_ex_scanned = 0; ++ ac.ac_found = 0; ++ ac.ac_sb = inode->i_sb; ++ ac.ac_g_ex.fe_group = group; ++ ac.ac_g_ex.fe_start = block; ++ ac.ac_g_ex.fe_len = *len; ++ ac.ac_flags = flags; ++ ++ /* ++ * Sometimes, caller may want to merge even small number ++ * of blocks to an existing extent ++ */ ++ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; ++ } ++ ++ /* ++ * FIXME ++ * If requested chunk is power of 2 length, we can try ++ * to exploit buddy nature to speed allocation up ++ */ ++ ++ ++ /* ++ * Let's just scan groups to find more-less suitable blocks ++ */ ++ cr = 0; ++repeat: ++ for (; 
cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { ++ if (group == EXT3_SB(sb)->s_groups_count) ++ group = 0; ++ ++ /* check is group good for our criteries */ ++ if (!ext3_mb_good_group(&ac, group, cr)) ++ continue; ++ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ++ ext3_lock_group(sb, group); ++ if (!ext3_mb_good_group(&ac, group, cr)) { ++ /* someone did allocation from this group */ ++ ext3_unlock_group(sb, group); ++ ext3_mb_release_desc(&e3b); ++ continue; ++ } ++ ++ ext3_mb_scan_group(&ac, &e3b); ++ ext3_unlock_group(sb, group); ++ ++ if (ac.ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ if (err) ++ goto out_err; ++ if (ac.ac_status != AC_STATUS_CONTINUE) ++ break; ++ } ++ } ++ ++ if (ac.ac_status == AC_STATUS_BREAK && ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ /* We've been searching too long. Let's try to allocate ++ * the best chunk we've found so far. */ ++ if (ac.ac_g_ex.fe_len >= 128 && ++ ac.ac_b_ex.fe_len < ac.ac_g_ex.fe_len / 4) ++ ext3_warning(inode->i_sb, __FUNCTION__, ++ "too long searching: got %d want %d\n", ++ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); ++ ext3_mb_try_best_found(&ac, &e3b); ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * Someone more lucky has already allocated it. 
++ * The only thing we can do is just take first ++ * found block(s) ++ */ ++ mb_debug(KERN_ERR "EXT3-fs: and someone won our chunk\n"); ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_flags |= EXT3_MB_HINT_FIRST; ++ cr = 2; ++ goto repeat; ++ } ++ } ++ ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * We aren't lucky definitely ++ */ ++ DQUOT_FREE_BLOCK(inode, *len); ++ *errp = -ENOSPC; ++ block = 0; ++#if 1 ++ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n", ++ ac.ac_status, ac.ac_flags); ++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n", ++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, ++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); ++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", ++ sbi->s_blocks_reserved, ac.ac_found); ++ printk("EXT3-fs: groups: "); ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) ++ printk("%d: %d ", i, ++ sbi->s_buddy_blocks[i]->bb_free); ++ printk("\n"); ++#endif ++ goto out; ++ } ++ ++found: ++ J_ASSERT(ac.ac_b_ex.fe_len > 0); ++ ++ /* good news - free block(s) have been found. 
now it's time ++ * to mark block(s) in good old journaled bitmap */ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ /* we made a desicion, now mark found blocks in good old ++ * bitmap to be journaled */ ++ ++ ext3_debug("using block group %d(%d)\n", ++ ac.ac_b_group.group, gdp->bg_free_blocks_count); ++ ++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); ++ if (!bitmap_bh) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) { ++ *errp = err; ++ goto out_err; ++ } ++ ++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); ++ if (!gdp) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ if (block == le32_to_cpu(gdp->bg_block_bitmap) || ++ block == le32_to_cpu(gdp->bg_inode_bitmap) || ++ in_range(block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error(sb, "ext3_new_block", ++ "Allocating block in system zone - " ++ "block = %u", block); ++ if (unlikely(ext3_mb_aggressive)) { ++ for (i = 0; i < ac.ac_b_ex.fe_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, ++ bitmap_bh->b_data)); ++ } ++ ++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); ++ ++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) ++ - ac.ac_b_ex.fe_len); ++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); ++ ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ if (err) ++ goto out_err; ++ err = ext3_journal_dirty_metadata(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ sb->s_dirt = 1; ++ *errp = 0; ++ 
brelse(bitmap_bh); ++ ++ /* drop non-allocated, but dquote'd blocks */ ++ J_ASSERT(*len >= ac.ac_b_ex.fe_len); ++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); ++ ++ *len = ac.ac_b_ex.fe_len; ++ J_ASSERT(*len > 0); ++ J_ASSERT(block != 0); ++ goto out; ++ ++out_err: ++ /* if we've already allocated something, roll it back */ ++ if (ac.ac_status == AC_STATUS_FOUND) { ++ /* FIXME: free blocks here */ ++ } ++ ++ DQUOT_FREE_BLOCK(inode, *len); ++ brelse(bitmap_bh); ++ *errp = err; ++ block = 0; ++out: ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* block wasn't reserved before and we reserved it ++ * at the beginning of allocation. it doesn't matter ++ * whether we allocated anything or we failed: time ++ * to release reservation. NOTE: because I expect ++ * any multiblock request from delayed allocation ++ * path only, here is single block always */ ++ ext3_mb_release_blocks(sb, 1); ++ } ++ ++ if ((ext3_mb_stats) && (ac.ac_g_ex.fe_len > 1)) { ++ spin_lock(&sbi->s_bal_lock); ++ sbi->s_bal_reqs++; ++ sbi->s_bal_allocated += *len; ++ if (*len >= ac.ac_g_ex.fe_len) ++ sbi->s_bal_success++; ++ sbi->s_bal_ex_scanned += ac.ac_found; ++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && ++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) ++ sbi->s_bal_goals++; ++ if (ac.ac_found > ext3_mb_max_to_scan) ++ sbi->s_bal_breaks++; ++ spin_unlock(&sbi->s_bal_lock); ++ } ++ ++ return block; ++} ++ ++int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh, ++ struct ext3_mb_group_descr **grp) ++{ ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int descr_per_block, err, offset; ++ struct ext3_mb_grp_header *hdr; ++ unsigned long block; ++ ++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) ++ / sizeof(struct ext3_mb_group_descr); ++ block = e3b->bd_group / descr_per_block; ++ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err); ++ if (*bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk descr for group 
%d: %d\n", ++ e3b->bd_group, err); ++ return err; ++ } ++ ++ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data; ++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { ++ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n", ++ e3b->bd_group); ++ brelse(*bh); ++ *bh = NULL; ++ return -EIO; ++ } ++ ++ offset = e3b->bd_group % descr_per_block ++ * sizeof(struct ext3_mb_group_descr) ++ + sizeof(struct ext3_mb_grp_header); ++ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset); ++ ++ return 0; ++} ++ ++int ext3_mb_load_descr(struct ext3_buddy *e3b) ++{ ++ struct ext3_mb_group_descr *grp; ++ struct ext3_group_desc *gdp; ++ struct buffer_head *bh; ++ int err, i; ++ ++ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); ++ if (err) ++ return err; ++ ++ e3b->bd_bd->bb_first_free = grp->mgd_first_free; ++ e3b->bd_bd->bb_free = grp->mgd_free; ++ for (i = 0; i <= e3b->bd_blkbits + 1; i++) { ++ J_ASSERT(i < 16); ++ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i]; ++ } ++ brelse(bh); ++ ++ /* additional checks against old group descriptor */ ++ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); ++ if (!gdp) ++ return -EIO; ++ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { ++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", ++ e3b->bd_group, e3b->bd_bd->bb_free, ++ le16_to_cpu(gdp->bg_free_blocks_count)); ++ return -ENODATA; ++ } ++ ++ return 0; ++} ++ ++ ++int ext3_mb_update_descr(struct ext3_buddy *e3b) ++{ ++ struct ext3_mb_group_descr *grp; ++ struct ext3_group_desc *gdp; ++ struct buffer_head *bh; ++ handle_t *handle; ++ int err, i; ++ ++ /* additional checks against old group descriptor */ ++ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); ++ if (!gdp) ++ return -EIO; ++ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { ++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", ++ e3b->bd_group, e3b->bd_bd->bb_free, ++ le16_to_cpu(gdp->bg_free_blocks_count)); ++ return -ENODATA; ++ } ++ ++ err = 
ext3_mb_get_descr_loc(e3b, &bh, &grp); ++ if (err) ++ return err; ++ ++ handle = ext3_journal_start_sb(e3b->bd_sb, 1); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ handle = NULL; ++ goto out; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto out; ++ grp->mgd_first_free = e3b->bd_bd->bb_first_free; ++ grp->mgd_free = e3b->bd_bd->bb_free; ++ for (i = 0; i <= e3b->bd_blkbits + 1; i++) { ++ J_ASSERT(i < 16); ++ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i]; ++ } ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto out; ++ err = 0; ++out: ++ brelse(bh); ++ if (handle) ++ ext3_journal_stop(handle); ++ return err; ++} ++ ++int ext3_mb_generate_buddy(struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = e3b->bd_sb; ++ struct buffer_head *bh; ++ int i, count = 0; ++ ++ mb_debug("generate buddy for group %d\n", e3b->bd_group); ++ memset(e3b->bd_bh->b_data, 0xff, sb->s_blocksize); ++ memset(e3b->bd_bh2->b_data, 0xff, sb->s_blocksize); ++ ++ bh = read_block_bitmap(sb, e3b->bd_group); ++ if (bh == NULL) ++ return -EIO; ++ ++ /* mb_free_blocks will set real free */ ++ e3b->bd_bd->bb_free = 0; ++ e3b->bd_bd->bb_first_free = 1 << 15; ++ /* ++ * if change bb_counters size, don't forget about ++ * ext3_mb_init_backend() -bzzz ++ */ ++ memset(e3b->bd_bd->bb_counters, 0, ++ sizeof(unsigned) * (sb->s_blocksize_bits + 2)); ++ ++ /* loop over the blocks, and create buddies for free ones */ ++ for (i = 0; i < sb->s_blocksize * 8; i++) { ++ if (!mb_test_bit(i, (void *) bh->b_data)) { ++ mb_free_blocks(e3b, i, 1); ++ count++; ++ } ++ } ++ brelse(bh); ++ mb_check_buddy(e3b); ++ ext3_mb_dirty_buddy(e3b); ++ ++ return 0; ++} ++ ++EXPORT_SYMBOL(ext3_mb_new_blocks); ++ ++#define MB_CREDITS \ ++ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \ ++ 2 * EXT3_SINGLEDATA_TRANS_BLOCKS) ++ ++int ext3_mb_init_backend(struct super_block *sb, int *created) ++{ ++ int err, i, len, descr_per_block, buddy_offset, size; ++ struct 
inode *root = sb->s_root->d_inode; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_grp_header *hdr; ++ struct buffer_head *bh = NULL; ++ unsigned long block; ++ struct dentry *db; ++ handle_t *handle; ++ tid_t target; ++ ++ *created = 0; ++ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; ++ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_buddy_blocks == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ return -ENOMEM; ++ } ++ memset(sbi->s_buddy_blocks, 0, len); ++ sbi->s_buddy = NULL; ++ ++ down(&root->i_sem); ++ len = strlen(EXT3_BUDDY_FILE); ++ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len); ++ if (IS_ERR(db)) { ++ err = PTR_ERR(db); ++ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err); ++ up(&root->i_sem); ++ goto out; ++ } ++ ++ if (db->d_inode == NULL) { ++ err = ext3_create(root, db, S_IFREG, NULL); ++ if (err) { ++ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err); ++ up(&root->i_sem); ++ goto out; ++ } ++ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME; ++ *created = 1; ++ mb_debug("no buddy file, regenerate\n"); ++ } ++ up(&root->i_sem); ++ sbi->s_buddy = igrab(db->d_inode); ++ ++ /* calculate needed size */ ++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) ++ / sizeof(struct ext3_mb_group_descr); ++ buddy_offset = (sbi->s_groups_count + descr_per_block - 1) ++ / descr_per_block; ++ len = sbi->s_groups_count * sb->s_blocksize * 2 + ++ buddy_offset * sb->s_blocksize; ++ if (len != i_size_read(sbi->s_buddy)) { ++ if (*created == 0) ++ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n", ++ (unsigned) len, ++ (unsigned) i_size_read(sbi->s_buddy)); ++ *created = 1; ++ } ++ ++ /* read/create mb group descriptors */ ++ for (i = 0; i < buddy_offset; i++) { ++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); ++ if (IS_ERR(handle)) { ++ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); ++ err = PTR_ERR(handle); ++ goto err_out; ++ } 
++ ++ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err); ++ if (bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err); ++ goto err_out; ++ } ++ hdr = (struct ext3_mb_grp_header *) bh->b_data; ++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto err_out; ++ if (*created == 0) ++ printk(KERN_ERR ++ "EXT3-fs: invalid header 0x%x in %d," ++ "regenerate\n", hdr->mh_magic, i); ++ *created = 1; ++ hdr->mh_magic = EXT3_MB_MAGIC_V1; ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto err_out; ++ } ++ brelse(bh); ++ ext3_journal_stop(handle); ++ } ++ ++ /* ++ * if change bb_counters size, don't forget about ext3_mb_generate_buddy() ++ */ ++ len = sizeof(struct ext3_buddy_group_blocks); ++ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ ++ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_buddy_blocks[i] == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ err = -ENOMEM; ++ goto out2; ++ } ++ memset(sbi->s_buddy_blocks[i], 0, len); ++ ++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); ++ if (IS_ERR(handle)) { ++ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); ++ err = PTR_ERR(handle); ++ goto out2; ++ } ++ ++ /* allocate block for bitmap */ ++ block = buddy_offset + i * 2; ++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); ++ if (bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err); ++ goto out2; ++ } ++ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; ++ brelse(bh); ++ ++ /* allocate block for buddy */ ++ block = buddy_offset + i * 2 + 1; ++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); ++ if (bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err); ++ goto out2; ++ } ++ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; ++ brelse(bh); ++ ++ size = (block + 1) << sbi->s_buddy->i_blkbits; ++ if (size > 
sbi->s_buddy->i_size) { ++ *created = 1; ++ EXT3_I(sbi->s_buddy)->i_disksize = size; ++ i_size_write(sbi->s_buddy, size); ++ mark_inode_dirty(sbi->s_buddy); ++ } ++ ext3_journal_stop(handle); ++ ++ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock); ++ sbi->s_buddy_blocks[i]->bb_md_cur = NULL; ++ sbi->s_buddy_blocks[i]->bb_tid = 0; ++ } ++ ++ if (journal_start_commit(sbi->s_journal, &target)) ++ log_wait_commit(sbi->s_journal, target); ++ ++out2: ++ dput(db); ++out: ++ return err; ++ ++err_out: ++ return err; ++} ++ ++int ext3_mb_write_descriptors(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_buddy e3b; ++ int ret = 0, i, err; ++ ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_buddy_blocks[i] == NULL) ++ continue; ++ ++ err = ext3_mb_load_buddy(sb, i, &e3b); ++ if (err == 0) { ++ ext3_mb_update_descr(&e3b); ++ ext3_mb_release_desc(&e3b); ++ } else ++ ret = err; ++ } ++ return ret; ++} ++ ++int ext3_mb_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* release freed, non-committed blocks */ ++ spin_lock(&sbi->s_md_lock); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_committed_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ ext3_mb_free_committed_blocks(sb); ++ ++ if (sbi->s_buddy_blocks) { ++ ext3_mb_write_descriptors(sb); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_buddy_blocks[i] == NULL) ++ continue; ++ kfree(sbi->s_buddy_blocks[i]); ++ } ++ kfree(sbi->s_buddy_blocks); ++ } ++ if (sbi->s_buddy) ++ iput(sbi->s_buddy); ++ if (sbi->s_blocks_reserved) ++ printk("ext3-fs: %ld blocks being reserved at umount!\n", ++ sbi->s_blocks_reserved); ++ if (ext3_mb_stats) { ++ printk("EXT3-fs: mballoc: %lu blocks %lu reqs " ++ "(%lu success)\n", sbi->s_bal_allocated, ++ sbi->s_bal_reqs, sbi->s_bal_success); ++ 
printk("EXT3-fs: mballoc: %lu extents scanned, " ++ "%lu goal hits, %lu breaks\n", sbi->s_bal_ex_scanned, ++ sbi->s_bal_goals, sbi->s_bal_breaks); ++ } ++ ++ return 0; ++} ++ ++int ext3_mb_init(struct super_block *sb, int needs_recovery) ++{ ++ struct ext3_buddy e3b; ++ int i, err, created; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* init file for buddy data */ ++ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); ++ if ((err = ext3_mb_init_backend(sb, &created))) ++ return err; ++ ++repeat: ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { ++ err = ext3_mb_load_buddy(sb, i, &e3b); ++ if (err) { ++ /* FIXME: release backend */ ++ return err; ++ } ++ if (created || needs_recovery) ++ ext3_mb_generate_buddy(&e3b); ++ else ++ err = ext3_mb_load_descr(&e3b); ++ ext3_mb_release_desc(&e3b); ++ if (err == -ENODATA) { ++ created = 1; ++ goto repeat; ++ } ++ } ++ if (created || needs_recovery) ++ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n", ++ EXT3_SB(sb)->s_groups_count); ++ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); ++ spin_lock_init(&EXT3_SB(sb)->s_md_lock); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); ++ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); ++ ++ spin_lock_init(&EXT3_SB(sb)->s_bal_lock); ++ if (ext3_mb_stats) { ++ printk("EXT3-fs: mballoc enabled (stats)\n"); ++ } else { ++ printk("EXT3-fs: mballoc enabled\n"); ++ } ++ ++ return 0; ++} ++ ++void ext3_mb_free_committed_blocks(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int err, i, count = 0, count2 = 0; ++ struct ext3_free_metadata *md; ++ struct ext3_buddy e3b; ++ ++ if (list_empty(&sbi->s_committed_transaction)) ++ return; ++ ++ /* there is committed blocks to be freed yet */ ++ do { ++ /* get next array of blocks */ ++ md = NULL; ++ spin_lock(&sbi->s_md_lock); ++ if (!list_empty(&sbi->s_committed_transaction)) { ++ md = 
list_entry(sbi->s_committed_transaction.next, ++ struct ext3_free_metadata, list); ++ list_del(&md->list); ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ if (md == NULL) ++ break; ++ ++ mb_debug("gonna free %u blocks in group %u (0x%p):", ++ md->num, md->group, md); ++ ++ err = ext3_mb_load_buddy(sb, md->group, &e3b); ++ BUG_ON(err != 0); ++ ++ /* there are blocks to put in buddy to make them really free */ ++ count += md->num; ++ count2++; ++ ext3_lock_group(sb, md->group); ++ for (i = 0; i < md->num; i++) { ++ mb_debug(" %u", md->blocks[i]); ++ mb_free_blocks(&e3b, md->blocks[i], 1); ++ } ++ mb_debug("\n"); ++ ext3_unlock_group(sb, md->group); ++ ++ kfree(md); ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ } while (md); ++ mb_debug("freed %u blocks in %u structures\n", count, count2); ++} ++ ++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ if (sbi->s_last_transaction == handle->h_transaction->t_tid) ++ return; ++ ++ /* new transaction! time to close last one and free blocks for ++ * committed transaction. we know that only transaction can be ++ * active, so previos transaction can be being logged and we ++ * know that transaction before previous is known to be alreade ++ * logged. this means that now we may free blocks freed in all ++ * transactions before previous one. hope I'm clear enough ... 
*/ ++ ++ spin_lock(&sbi->s_md_lock); ++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { ++ mb_debug("new transaction %lu, old %lu\n", ++ (unsigned long) handle->h_transaction->t_tid, ++ (unsigned long) sbi->s_last_transaction); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_closed_transaction); ++ sbi->s_last_transaction = handle->h_transaction->t_tid; ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ ext3_mb_free_committed_blocks(sb); ++} ++ ++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, ++ int group, int block, int count) ++{ ++ struct ext3_buddy_group_blocks *db = e3b->bd_bd; ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_free_metadata *md; ++ int i; ++ ++ ext3_lock_group(sb, group); ++ for (i = 0; i < count; i++) { ++ md = db->bb_md_cur; ++ if (md && db->bb_tid != handle->h_transaction->t_tid) { ++ db->bb_md_cur = NULL; ++ md = NULL; ++ } ++ ++ if (md == NULL) { ++ ext3_unlock_group(sb, group); ++ md = kmalloc(sizeof(*md), GFP_KERNEL); ++ if (md == NULL) ++ return -ENOMEM; ++ md->num = 0; ++ md->group = group; ++ ++ ext3_lock_group(sb, group); ++ if (db->bb_md_cur == NULL) { ++ spin_lock(&sbi->s_md_lock); ++ list_add(&md->list, &sbi->s_active_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ db->bb_md_cur = md; ++ db->bb_tid = handle->h_transaction->t_tid; ++ mb_debug("new md 0x%p for group %u\n", ++ md, md->group); ++ } else { ++ kfree(md); ++ md = db->bb_md_cur; ++ } ++ } ++ ++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); ++ md->blocks[md->num] = block + i; ++ md->num++; ++ if (md->num == EXT3_BB_MAX_BLOCKS) { ++ /* no more space, put full container on a sb's list */ ++ db->bb_md_cur = NULL; ++ } ++ } ++ ext3_unlock_group(sb, group); ++ return 0; ++} ++ ++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, ++ unsigned long block, unsigned long count, ++ int metadata, int *freed) 
++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ unsigned long bit, overflow; ++ struct buffer_head *gd_bh; ++ unsigned long block_group; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ int err = 0, ret; ++ ++ *freed = 0; ++ sb = inode->i_sb; ++ if (!sb) { ++ printk ("ext3_free_blocks: nonexistent device"); ++ return; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ if (block < le32_to_cpu(es->s_first_data_block) || ++ block + count < block || ++ block + count > le32_to_cpu(es->s_blocks_count)) { ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks not in datazone - " ++ "block = %lu, count = %lu", block, count); ++ goto error_return; ++ } ++ ++ ext3_debug("freeing block %lu\n", block); ++ ++do_more: ++ overflow = 0; ++ block_group = (block - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ bit = (block - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb); ++ /* ++ * Check to see if we are freeing blocks across a group ++ * boundary. 
++ */ ++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { ++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); ++ count -= overflow; ++ } ++ brelse(bitmap_bh); ++ bitmap_bh = read_block_bitmap(sb, block_group); ++ if (!bitmap_bh) ++ goto error_return; ++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); ++ if (!gdp) ++ goto error_return; ++ ++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || ++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || ++ in_range (block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group) || ++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks in system zones - " ++ "Block = %lu, count = %lu", ++ block, count); ++ ++ BUFFER_TRACE(bitmap_bh, "getting write access"); ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) ++ goto error_return; ++ ++ /* ++ * We are about to modify some metadata. Call the journal APIs ++ * to unshare ->b_data if a currently-committing transaction is ++ * using it ++ */ ++ BUFFER_TRACE(gd_bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, gd_bh); ++ if (err) ++ goto error_return; ++ ++ err = ext3_mb_load_buddy(sb, block_group, &e3b); ++ if (err) ++ goto error_return; ++ ++ if (unlikely(ext3_mb_aggressive)) { ++ int i; ++ for (i = 0; i < count; i++) ++ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); ++ } ++ ++ mb_clear_bits(bitmap_bh->b_data, bit, count); ++ ++ /* We dirtied the bitmap block */ ++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ ++ if (metadata) { ++ /* blocks being freed are metadata. 
these blocks shouldn't ++ * be used until this transaction is committed */ ++ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); ++ } else { ++ ext3_lock_group(sb, block_group); ++ mb_free_blocks(&e3b, bit, count); ++ ext3_unlock_group(sb, block_group); ++ } ++ ++ spin_lock(sb_bgl_lock(sbi, block_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); ++ spin_unlock(sb_bgl_lock(sbi, block_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, count); ++ ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ *freed = count; ++ ++ /* And the group descriptor block */ ++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ++ ret = ext3_journal_dirty_metadata(handle, gd_bh); ++ if (!err) err = ret; ++ ++ if (overflow && !err) { ++ block += count; ++ count = overflow; ++ goto do_more; ++ } ++ sb->s_dirt = 1; ++error_return: ++ brelse(bitmap_bh); ++ ext3_std_error(sb, err); ++ return; ++} ++ ++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int free, ret = -ENOSPC; ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); ++ if (blocks <= free - sbi->s_blocks_reserved) { ++ sbi->s_blocks_reserved += blocks; ++ ret = 0; ++ } ++ spin_unlock(&sbi->s_reserve_lock); ++ return ret; ++} ++ ++void ext3_mb_release_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ sbi->s_blocks_reserved -= blocks; ++ WARN_ON(sbi->s_blocks_reserved < 0); ++ if (sbi->s_blocks_reserved < 0) ++ sbi->s_blocks_reserved = 0; ++ spin_unlock(&sbi->s_reserve_lock); ++} ++ ++int ext3_new_block(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *errp) ++{ ++ int ret, len; ++ ++ if (!test_opt(inode->i_sb, MBALLOC)) { ++ ret = ext3_new_block_old(handle, inode, goal, errp); ++ goto 
out; ++ } ++ len = 1; ++ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); ++out: ++ return ret; ++} ++ ++void ext3_free_blocks(handle_t *handle, struct inode *inode, ++ unsigned long block, unsigned long count, int metadata) ++{ ++ int freed; ++ ++ if (!test_opt(inode->i_sb, MBALLOC) || ++ EXT3_SB(inode->i_sb)->s_buddy_blocks == NULL) ++ ext3_free_blocks_sb(handle, inode->i_sb, block, count, &freed); ++ else ++ ext3_mb_free_blocks(handle, inode, block,count,metadata,&freed); ++ ++ if (freed) ++ DQUOT_FREE_BLOCK(inode, freed); ++ return; ++} +Index: linux-2.6.12/fs/ext3/proc.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/proc.c 2005-10-13 19:40:57.851699336 +0400 ++++ linux-2.6.12/fs/ext3/proc.c 2005-10-14 09:10:31.000000000 +0400 +@@ -0,0 +1,195 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++#define EXT3_ROOT "ext3" ++#define EXT3_MB_AGGRESSIVE_NAME "mb_aggressive" ++#define EXT3_MB_STATS_NAME "mb_stats" ++#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" ++ ++ ++static struct proc_dir_entry *proc_root_ext3; ++ ++ ++static int ext3_mb_aggressive_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_aggressive); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_aggressive_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", ++ EXT3_MB_AGGRESSIVE_NAME, sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ ext3_mb_aggressive = (simple_strtol(str, NULL, 0) != 0); ++ return count; ++} ++ ++static int 
ext3_mb_stats_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_stats); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_stats_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", ++ EXT3_MB_STATS_NAME, sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); ++ return count; ++} ++ ++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n", ++ EXT3_MB_MAX_TO_SCAN_NAME, sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_max_to_scan = value; ++ ++ return count; ++} ++ ++int __init init_ext3_proc(void) ++{ ++ struct proc_dir_entry *proc_ext3_mb_aggressive; ++ struct proc_dir_entry *proc_ext3_mb_stats; ++ struct proc_dir_entry *proc_ext3_mb_max_to_scan; ++ ++ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); ++ if (proc_root_ext3 == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT); ++ return -EIO; ++ } ++ ++ /* Initialize 
EXT3_MB_AGGRESSIVE_NAME */ ++ proc_ext3_mb_aggressive = create_proc_entry(EXT3_MB_AGGRESSIVE_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_aggressive == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_AGGRESSIVE_NAME); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_aggressive->data = NULL; ++ proc_ext3_mb_aggressive->read_proc = ext3_mb_aggressive_read; ++ proc_ext3_mb_aggressive->write_proc = ext3_mb_aggressive_write; ++ ++ /* Initialize EXT3_MB_STATS_NAME */ ++ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_stats == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_STATS_NAME); ++ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_stats->data = NULL; ++ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; ++ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; ++ ++ /* Initialize EXT3_MAX_TO_SCAN_NAME */ ++ proc_ext3_mb_max_to_scan = create_proc_entry( ++ EXT3_MB_MAX_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_max_to_scan == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_STATS_NAME); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_max_to_scan->data = NULL; ++ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; ++ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; ++ ++ return 0; ++} ++ ++void exit_ext3_proc(void) ++{ ++ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, 
proc_root_fs); ++} +Index: linux-2.6.12/fs/ext3/inode.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/inode.c 2005-06-21 13:57:16.507917732 +0200 ++++ linux-2.6.12/fs/ext3/inode.c 2005-06-21 13:57:25.837019180 +0200 +@@ -564,7 +564,7 @@ + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -1850,7 +1850,7 @@ + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2023,7 +2023,7 @@ + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-2.6.12/fs/ext3/super.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/super.c 2005-06-21 13:57:16.526472419 +0200 ++++ linux-2.6.12/fs/ext3/super.c 2005-06-21 13:57:25.802839493 +0200 +@@ -387,6 +387,7 @@ + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -594,7 +595,7 @@ + Opt_commit, Opt_journal_update, Opt_journal_inum, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, +- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, ++ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_mballoc, Opt_mbfactor, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug, + }; +@@ -649,6 +650,8 @@ + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, ++ {Opt_mbfactor, "mbfactor=%u"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, 
"resize"}, +@@ -964,6 +967,16 @@ + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt (sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_mbfactor: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_mb_factor = option; ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1668,6 +1681,7 @@ + ext3_count_dirs(sb)); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + lock_kernel(); + return 0; + +@@ -2450,7 +2464,13 @@ + + static int __init init_ext3_fs(void) + { +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2441,6 +2461,7 @@ + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); + } + + int ext3_prep_san_write(struct inode *inode, long *blocks, diff --git a/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch b/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch index e75373a..62bf156 100644 --- a/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch +++ b/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch @@ -9,7 +9,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c + /* limit is 16-bit i_links_count */ + if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2) + inode->i_nlink = 1; -+ } ++ } } static inline void ext3_dec_count(handle_t *handle, struct inode *inode) @@ -105,7 +105,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c if (new_inode) { - new_inode->i_nlink--; -+ ext3_dec_count(handle, new_inode); ++ ext3_dec_count(handle, new_inode); new_inode->i_ctime = CURRENT_TIME_SEC; } old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; diff --git a/ldiskfs/kernel_patches/patches/iopen-2.6.12.patch b/ldiskfs/kernel_patches/patches/iopen-2.6.12.patch new file mode 100644 index 0000000..41e4c05 --- /dev/null +++ 
b/ldiskfs/kernel_patches/patches/iopen-2.6.12.patch @@ -0,0 +1,470 @@ +Index: linux-2.6.12-rc6/fs/ext3/Makefile +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/Makefile 2005-06-14 16:00:45.206720992 +0200 ++++ linux-2.6.12-rc6/fs/ext3/Makefile 2005-06-14 16:14:33.595382720 +0200 +@@ -4,7 +4,7 @@ + + obj-$(CONFIG_EXT3_FS) += ext3.o + +-ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\ + ioctl.o namei.o super.o symlink.o hash.o resize.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +Index: linux-2.6.12-rc6/fs/ext3/inode.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/inode.c 2005-06-14 16:01:16.272150299 +0200 ++++ linux-2.6.12-rc6/fs/ext3/inode.c 2005-06-14 16:24:55.686195412 +0200 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + static int ext3_writepage_trans_blocks(struct inode *inode); +@@ -2437,6 +2438,8 @@ + ei->i_default_acl = EXT3_ACL_NOT_CACHED; + #endif + ei->i_block_alloc_info = NULL; ++ if (ext3_iopen_get_inode(inode)) ++ return; + + if (__ext3_get_inode_loc(inode, &iloc, 0)) + goto bad_inode; +Index: linux-2.6.12-rc6/fs/ext3/iopen.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/iopen.c 2005-06-14 16:14:33.530929595 +0200 ++++ linux-2.6.12-rc6/fs/ext3/iopen.c 2005-06-14 16:14:33.626632719 +0200 +@@ -0,0 +1,277 @@ ++/* ++ * linux/fs/ext3/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. 
++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. ++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. ++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "iopen.h" ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#define IOPEN_NAME_LEN 32 ++ ++/* ++ * This implements looking up an inode by number. 
++ */ ++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct inode *inode; ++ unsigned long ino; ++ struct list_head *lp; ++ struct dentry *alternate; ++ char buf[IOPEN_NAME_LEN]; ++ ++ if (dentry->d_name.len >= IOPEN_NAME_LEN) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ memcpy(buf, dentry->d_name.name, dentry->d_name.len); ++ buf[dentry->d_name.len] = 0; ++ ++ if (strcmp(buf, ".") == 0) ++ ino = dir->i_ino; ++ else if (strcmp(buf, "..") == 0) ++ ino = EXT3_ROOT_INO; ++ else ++ ino = simple_strtoul(buf, 0, 0); ++ ++ if ((ino != EXT3_ROOT_INO && ++ //ino != EXT3_ACL_IDX_INO && ++ //ino != EXT3_ACL_DATA_INO && ++ ino < EXT3_FIRST_INO(dir->i_sb)) || ++ ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) ++ return ERR_PTR(-ENOENT); ++ ++ inode = iget(dir->i_sb, ino); ++ if (!inode) ++ return ERR_PTR(-EACCES); ++ if (is_bad_inode(inode)) { ++ iput(inode); ++ return ERR_PTR(-ENOENT); ++ } ++ ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ ++ /* preferrably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, struct dentry, d_alias); ++ assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); ++ } ++ ++ if (!list_empty(&inode->i_dentry)) { ++ alternate = list_entry(inode->i_dentry.next, ++ struct dentry, d_alias); ++ dget_locked(alternate); ++ spin_lock(&alternate->d_lock); ++ alternate->d_flags |= DCACHE_REFERENCED; ++ spin_unlock(&alternate->d_lock); ++ iput(inode); ++ spin_unlock(&dcache_lock); ++ return alternate; ++ } ++ dentry->d_flags |= DCACHE_DISCONNECTED; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++ ++ d_rehash_cond(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; ++} ++ ++#define do_switch(x,y) do { \ ++ 
__typeof__ (x) __tmp = x; \ ++ x = y; y = __tmp; } while (0) ++ ++static inline void switch_names(struct dentry *dentry, struct dentry *target) ++{ ++ const unsigned char *old_name, *new_name; ++ ++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN_MIN); ++ old_name = target->d_name.name; ++ new_name = dentry->d_name.name; ++ if (old_name == target->d_iname) ++ old_name = dentry->d_iname; ++ if (new_name == dentry->d_iname) ++ new_name = target->d_iname; ++ target->d_name.name = new_name; ++ dentry->d_name.name = old_name; ++} ++ ++/* This function is spliced into ext3_lookup and does the move of a ++ * disconnected dentry (if it exists) to a connected dentry. ++ */ ++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, ++ int rehash) ++{ ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* verify this dentry is really new */ ++ assert(dentry->d_inode == NULL); ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ if (rehash) ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ assert(list_empty(&dentry->d_subdirs)); ++ ++ spin_lock(&dcache_lock); ++ if (!inode) ++ goto do_rehash; ++ ++ if (!test_opt(inode->i_sb, IOPEN)) ++ goto do_instantiate; ++ ++ /* preferrably return a connected dentry */ ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ if (tmp->d_flags & DCACHE_DISCONNECTED) { ++ assert(tmp->d_alias.next == &inode->i_dentry); ++ assert(tmp->d_alias.prev == &inode->i_dentry); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ } ++ ++ if (!goal) ++ goto do_instantiate; ++ ++ /* Move the goal to the de hash queue */ ++ goal->d_flags &= ~ DCACHE_DISCONNECTED; ++ security_d_instantiate(goal, inode); ++ d_rehash_cond(dentry, 0); ++ __d_move(goal, dentry); ++ spin_unlock(&dcache_lock); ++ iput(inode); ++ ++ return goal; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++do_instantiate: ++ list_add(&dentry->d_alias, 
&inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++do_rehash: ++ if (rehash) ++ d_rehash_cond(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; ++} ++ ++/* ++ * These are the special structures for the iopen pseudo directory. ++ */ ++ ++static struct inode_operations iopen_inode_operations = { ++ lookup: iopen_lookup, /* BKL held */ ++}; ++ ++static struct file_operations iopen_file_operations = { ++ read: generic_read_dir, ++}; ++ ++static int match_dentry(struct dentry *dentry, const char *name) ++{ ++ int len; ++ ++ len = strlen(name); ++ if (dentry->d_name.len != len) ++ return 0; ++ if (strncmp(dentry->d_name.name, name, len)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * This function is spliced into ext3_lookup and returns 1 the file ++ * name is __iopen__ and dentry has been filled in appropriately. ++ */ ++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ ++ if (dir->i_ino != EXT3_ROOT_INO || ++ !test_opt(dir->i_sb, IOPEN) || ++ !match_dentry(dentry, "__iopen__")) ++ return 0; ++ ++ inode = iget(dir->i_sb, EXT3_BAD_INO); ++ ++ if (!inode) ++ return 0; ++ d_add(dentry, inode); ++ return 1; ++} ++ ++/* ++ * This function is spliced into read_inode; it returns 1 if inode ++ * number is the one for /__iopen__, in which case the inode is filled ++ * in appropriately. Otherwise, this fuction returns 0. 
++ */ ++int ext3_iopen_get_inode(struct inode *inode) ++{ ++ if (inode->i_ino != EXT3_BAD_INO) ++ return 0; ++ ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) ++ inode->i_mode |= 0777; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_size = 4096; ++ inode->i_atime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = CURRENT_TIME; ++ EXT3_I(inode)->i_dtime = 0; ++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size ++ * (for stat), not the fs block ++ * size */ ++ inode->i_blocks = 0; ++ inode->i_version = 1; ++ inode->i_generation = 0; ++ ++ inode->i_op = &iopen_inode_operations; ++ inode->i_fop = &iopen_file_operations; ++ inode->i_mapping->a_ops = 0; ++ ++ return 1; ++} +Index: linux-2.6.12-rc6/fs/ext3/iopen.h +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/iopen.h 2005-06-14 16:14:33.534835845 +0200 ++++ linux-2.6.12-rc6/fs/ext3/iopen.h 2005-06-14 16:14:33.633468657 +0200 +@@ -0,0 +1,15 @@ ++/* ++ * iopen.h ++ * ++ * Special support for opening files by inode number. ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. 
++ */ ++ ++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext3_iopen_get_inode(struct inode *inode); ++extern struct dentry *iopen_connect_dentry(struct dentry *dentry, ++ struct inode *inode, int rehash); +Index: linux-2.6.12-rc6/fs/ext3/namei.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/namei.c 2005-06-14 16:01:14.701837819 +0200 ++++ linux-2.6.12-rc6/fs/ext3/namei.c 2005-06-14 16:14:33.644210844 +0200 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + /* +@@ -985,6 +986,9 @@ + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + ++ if (ext3_check_for_iopen(dir, dentry)) ++ return NULL; ++ + bh = ext3_find_entry(dentry, &de); + inode = NULL; + if (bh) { +@@ -995,10 +999,8 @@ + if (!inode) + return ERR_PTR(-EACCES); + } +- if (inode) +- return d_splice_alias(inode, dentry); +- d_add(dentry, inode); +- return NULL; ++ ++ return iopen_connect_dentry(dentry, inode, 1); + } + + +@@ -2042,10 +2044,6 @@ + inode->i_nlink); + inode->i_version++; + inode->i_nlink = 0; +- /* There's no need to set i_disksize: the fact that i_nlink is +- * zero will ensure that the right thing happens during any +- * recovery. 
*/ +- inode->i_size = 0; + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); +@@ -2168,6 +2166,23 @@ + return err; + } + ++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ ++static int ext3_add_link(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ int err = ext3_add_entry(handle, dentry, inode); ++ if (!err) { ++ err = ext3_mark_inode_dirty(handle, inode); ++ if (err == 0) { ++ dput(iopen_connect_dentry(dentry, inode, 0)); ++ return 0; ++ } ++ } ++ ext3_dec_count(handle, inode); ++ iput(inode); ++ return err; ++} ++ + static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) + { +@@ -2191,7 +2206,8 @@ + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); + +- err = ext3_add_nondir(handle, dentry, inode); ++ err = ext3_add_link(handle, dentry, inode); ++ ext3_orphan_del(handle,inode); + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; +Index: linux-2.6.12-rc6/fs/ext3/super.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/super.c 2005-06-14 16:01:16.287775299 +0200 ++++ linux-2.6.12-rc6/fs/ext3/super.c 2005-06-14 16:14:33.656906156 +0200 +@@ -590,6 +590,7 @@ + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + }; + +@@ -638,6 +639,9 @@ + {Opt_ignore, "noquota"}, + {Opt_ignore, "quota"}, + {Opt_ignore, "usrquota"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -921,6 +925,18 @@ + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; ++ 
case Opt_iopen: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_noiopen: ++ clear_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_iopen_nopriv: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; + case Opt_ignore: + break; + case Opt_resize: +Index: linux-2.6.12-rc6/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_fs.h 2005-06-14 16:01:14.709650318 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_fs.h 2005-06-14 16:28:38.452794245 +0200 +@@ -358,6 +358,8 @@ + #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ + #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ + #define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */ ++#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000 /* Make iopen world-readable */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series new file mode 100644 index 0000000..7d0a383 --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6.12-vanilla.series @@ -0,0 +1,13 @@ +ext3-wantedi-2.6-rhel4.patch +ext3-san-jdike-2.6-suse.patch +iopen-2.6.12.patch +ext3-map_inode_page-2.6-suse.patch +export-ext3-2.6-rhel4.patch +ext3-include-fixes-2.6-rhel4.patch +ext3-extents-2.6.12.patch +ext3-mballoc2-2.6.12.patch +ext3-nlinks-2.6.9.patch +ext3-ialloc-2.6.patch +ext3-remove-cond_resched-calls-2.6.12.patch +ext3-htree-dot-2.6.patch +ext3-external-journal-2.6.12.patch diff --git a/lustre/kernel_patches/patches/8kstack-2.6.12.patch b/lustre/kernel_patches/patches/8kstack-2.6.12.patch new file mode 100644 index 0000000..f3a2160 --- /dev/null +++ 
b/lustre/kernel_patches/patches/8kstack-2.6.12.patch @@ -0,0 +1,13 @@ +Index: linux-2.6.9-5.0.3.EL/include/asm-i386/thread_info.h +=================================================================== +--- linux-2.6.9-5.0.3.EL.orig/include/asm-i386/thread_info.h 2005-02-25 10:25:33.000000000 +0200 ++++ linux-2.6.9-5.0.3.EL/include/asm-i386/thread_info.h 2005-02-25 20:19:11.676139032 +0200 +@@ -54,7 +54,7 @@ + + #define PREEMPT_ACTIVE 0x10000000 + #ifdef CONFIG_4KSTACKS +-#define THREAD_SIZE (4096) ++#define THREAD_SIZE (8192) + #else + #define THREAD_SIZE (8192) + #endif diff --git a/lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch b/lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch index 16f26b0..8d9ab40 100644 --- a/lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch +++ b/lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch @@ -79,6 +79,24 @@ Index: linux-2.6.9-5.0.3.EL/kernel/exit.c =================================================================== --- linux-2.6.9-5.0.3.EL.orig/kernel/exit.c 2005-02-26 13:47:31.300655280 +0200 +++ linux-2.6.9-5.0.3.EL/kernel/exit.c 2005-02-26 13:53:13.805586616 +0200 +@@ -244,6 +244,8 @@ + write_unlock_irq(&tasklist_lock); + } + ++EXPORT_SYMBOL(reparent_to_init); ++ + void __set_special_pids(pid_t session, pid_t pgrp) + { + struct task_struct *curr = current; +@@ -428,6 +430,8 @@ + __exit_files(tsk); + } + ++EXPORT_SYMBOL(exit_files); ++ + static inline void __put_fs_struct(struct fs_struct *fs) + { + /* No need to hold fs->lock if we are killing it */ @@ -516,6 +516,7 @@ { __exit_mm(tsk); diff --git a/lustre/kernel_patches/patches/export_symbols-2.6-suse.patch b/lustre/kernel_patches/patches/export_symbols-2.6-suse.patch index b22d925..de1bf20 100644 --- a/lustre/kernel_patches/patches/export_symbols-2.6-suse.patch +++ b/lustre/kernel_patches/patches/export_symbols-2.6-suse.patch @@ -42,6 +42,28 @@ Index: linux-2.6.4-51.0/include/linux/ext2_fs_sb.h /* * second extended-fs super-block data 
in memory */ +Index: linux-2.6.5-12.1/kernel/exit.c +=================================================================== +--- linux-2.6.5-12.1.orig/kernel/exit.c 2004-05-10 12:21:56.000000000 -0400 ++++ linux-2.6.5-12.1/kernel/exit.c 2004-06-03 18:31:28.000000000 -0400 +@@ -260,6 +260,8 @@ + write_unlock_irq(&tasklist_lock); + } + ++EXPORT_SYMBOL(reparent_to_init); ++ + void __set_special_pids(pid_t session, pid_t pgrp) + { + struct task_struct *curr = current; +@@ -429,6 +431,8 @@ + __exit_files(tsk); + } + ++EXPORT_SYMBOL(exit_files); ++ + static inline void __put_fs_struct(struct fs_struct *fs) + { + /* No need to hold fs->lock if we are killing it */ Index: linux-2.6.4-51.0/kernel/kallsyms.c =================================================================== --- linux-2.6.4-51.0.orig/kernel/kallsyms.c 2004-04-05 12:42:08.000000000 -0400 diff --git a/lustre/kernel_patches/patches/export_symbols-2.6.12.patch b/lustre/kernel_patches/patches/export_symbols-2.6.12.patch new file mode 100644 index 0000000..be1a602 --- /dev/null +++ b/lustre/kernel_patches/patches/export_symbols-2.6.12.patch @@ -0,0 +1,114 @@ +Index: linux-2.6.12-rc6/fs/filesystems.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/filesystems.c 2005-06-06 17:22:29.000000000 +0200 ++++ linux-2.6.12-rc6/fs/filesystems.c 2005-06-14 15:53:58.298522852 +0200 +@@ -28,7 +28,9 @@ + */ + + static struct file_system_type *file_systems; +-static DEFINE_RWLOCK(file_systems_lock); ++DEFINE_RWLOCK(file_systems_lock); ++ ++EXPORT_SYMBOL(file_systems_lock); + + /* WARNING: This can be used only if we _already_ own a reference */ + void get_filesystem(struct file_system_type *fs) +Index: linux-2.6.12-rc6/include/linux/fs.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/fs.h 2005-06-14 15:53:18.356140529 +0200 ++++ linux-2.6.12-rc6/include/linux/fs.h 2005-06-14 15:53:58.309265039 +0200 +@@ -1563,6 +1563,7 
@@ + + extern struct file_operations generic_ro_fops; + ++extern rwlock_t file_systems_lock; + #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) + + extern int vfs_readlink(struct dentry *, char __user *, int, const char *); +Index: linux-2.6.12-rc6/kernel/kallsyms.c +=================================================================== +--- linux-2.6.12-rc6.orig/kernel/kallsyms.c 2005-06-06 17:22:29.000000000 +0200 ++++ linux-2.6.12-rc6/kernel/kallsyms.c 2005-06-14 15:54:30.293639648 +0200 +@@ -418,3 +418,4 @@ + __initcall(kallsyms_init); + + EXPORT_SYMBOL(__print_symbol); ++EXPORT_SYMBOL(kernel_text_address); +Index: linux-2.6.12-rc6/net/core/sock.c +=================================================================== +--- linux-2.6.12-rc6.orig/net/core/sock.c 2005-06-06 17:22:29.000000000 +0200 ++++ linux-2.6.12-rc6/net/core/sock.c 2005-06-14 15:53:58.349304101 +0200 +@@ -613,6 +613,7 @@ + return -EFAULT; + return 0; + } ++EXPORT_SYMBOL(sock_getsockopt); + + /** + * sk_alloc - All socket objects are allocated here +Index: linux-2.6.12-rc6/fs/namespace.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/namespace.c 2005-06-14 15:53:17.868835847 +0200 ++++ linux-2.6.12-rc6/fs/namespace.c 2005-06-14 15:53:58.361022851 +0200 +@@ -1240,6 +1240,7 @@ + mntput(old_pwdmnt); + } + } ++EXPORT_SYMBOL(set_fs_pwd); + + static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd) + { +Index: linux-2.6.12.5/kernel/exit.c +=================================================================== +--- linux-2.6.12.5.orig/kernel/exit.c 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/kernel/exit.c 2005-08-17 17:51:44.000000000 +0200 +@@ -250,6 +250,8 @@ + switch_uid(INIT_USER); + } + ++EXPORT_SYMBOL(reparent_to_init); ++ + void __set_special_pids(pid_t session, pid_t pgrp) + { + struct task_struct *curr = current; +@@ -432,6 +434,8 @@ + __exit_files(tsk); + } + ++EXPORT_SYMBOL(exit_files); ++ 
+ static inline void __put_fs_struct(struct fs_struct *fs) + { + /* No need to hold fs->lock if we are killing it */ +@@ -515,6 +515,7 @@ + task_unlock(tsk); + mmput(mm); + } ++EXPORT_SYMBOL(exit_mm); + + static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) + { +Index: linux-2.6.12-rc6/fs/dcache.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/dcache.c 2005-06-14 15:53:19.812195198 +0200 ++++ linux-2.6.12-rc6/fs/dcache.c 2005-06-14 15:53:58.385436913 +0200 +@@ -1581,6 +1581,7 @@ + + return result; + } ++EXPORT_SYMBOL(is_subdir); + + void d_genocide(struct dentry *root) + { +Index: linux-2.6.12-rc6/fs/file_table.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/file_table.c 2005-06-06 17:22:29.000000000 +0200 ++++ linux-2.6.12-rc6/fs/file_table.c 2005-06-14 15:53:58.396179101 +0200 +@@ -197,6 +197,7 @@ + file_free(file); + } + } ++EXPORT_SYMBOL(put_filp); + + void file_move(struct file *file, struct list_head *list) + { diff --git a/lustre/kernel_patches/patches/ext3-extents-2.6.12.patch b/lustre/kernel_patches/patches/ext3-extents-2.6.12.patch new file mode 100644 index 0000000..b6d0c57 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-2.6.12.patch @@ -0,0 +1,2924 @@ +Index: linux-2.6.12-rc6/fs/ext3/extents.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/extents.c 2005-06-14 16:31:25.756503133 +0200 ++++ linux-2.6.12-rc6/fs/ext3/extents.c 2005-06-14 16:31:25.836581257 +0200 +@@ -0,0 +1,2347 @@ ++/* ++ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++/* ++ * Extents support for EXT3 ++ * ++ * TODO: ++ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() ++ * - ext3_ext_calc_credits() could take 'mergable' into account ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - smart tree reduction ++ * - arch-independence ++ * common on-disk format for big/little-endian arch ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++static inline int ext3_ext_check_header(struct ext3_extent_header *eh) ++{ ++ if (eh->eh_magic != EXT3_EXT_MAGIC) { ++ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n", ++ (unsigned)eh->eh_magic); ++ return -EIO; ++ } ++ if (eh->eh_max == 0) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n", ++ (unsigned)eh->eh_max); ++ return -EIO; ++ } ++ if (eh->eh_entries > eh->eh_max) { ++ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n", ++ (unsigned)eh->eh_entries); ++ return -EIO; ++ } ++ return 0; ++} ++ ++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} ++ ++static int inline ++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->get_write_access) ++ return 
tree->ops->get_write_access(h,tree->buffer); ++ else ++ return 0; ++} ++ ++static int inline ++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->mark_buffer_dirty) ++ return tree->ops->mark_buffer_dirty(h,tree->buffer); ++ else ++ return 0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ ++ if (path->p_bh) { ++ /* path points to block */ ++ err = ext3_journal_get_write_access(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_get_access_for_root(handle, tree); ++ } ++ return err; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ if (path->p_bh) { ++ /* path points to block */ ++ err =ext3_journal_dirty_metadata(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_mark_root_dirty(handle, tree); ++ } ++ return err; ++} ++ ++static int inline ++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, struct ext3_extent *ex, ++ int *err) ++{ ++ int goal, depth, newblock; ++ struct inode *inode; ++ ++ EXT_ASSERT(tree); ++ if (tree->ops->new_block) ++ return tree->ops->new_block(handle, tree, path, ex, err); ++ ++ inode = tree->inode; ++ depth = EXT_DEPTH(tree); ++ if (path && depth > 0) { ++ goal = path[depth-1].p_block; ++ } else { ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ ++ bg_start = (ei->i_block_group * ++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ goal = bg_start + colour; ++ } ++ ++ 
newblock = ext3_new_block(handle, inode, goal, err); ++ return newblock; ++} ++ ++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *neh; ++ neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation++; ++} ++ ++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 5; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) / ++ sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int k, l = path->p_depth; ++ ++ ext_debug(tree, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(tree, " %d->%d", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ } else if (path->p_ext) { ++ ext_debug(tree, " %d:%d:%d", ++ path->p_ext->ee_block, ++ path->p_ext->ee_len, ++ path->p_ext->ee_start); ++ } else ++ ext_debug(tree, " []"); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef 
EXT_DEBUG ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; ++ int i; ++ ++ if (!path) ++ return; ++ ++ eh = path[depth].p_hdr; ++ ex = EXT_FIRST_EXTENT(eh); ++ ++ for (i = 0; i < eh->eh_entries; i++, ex++) { ++ ext_debug(tree, "%d:%d:%d ", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_drop_refs(struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) { ++ if (path->p_bh) { ++ brelse(path->p_bh); ++ path->p_bh = NULL; ++ } ++ } ++} ++ ++/* ++ * binary search for closest index by given block ++ */ ++static inline void ++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent_idx *ix; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_entries > 0); ++ ++ ext_debug(tree, "binsearch for %d(idx): ", block); ++ ++ path->p_idx = ix = EXT_FIRST_INDEX(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ix[l + k].ei_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ix += l; ++ path->p_idx = ix; ++ ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf); ++ ++ while (l++ < r) { ++ if (block < ix->ei_block) ++ break; ++ path->p_idx = ix++; ++ } ++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent_idx *chix; ++ ++ chix = ix = EXT_FIRST_INDEX(eh); ++ for (k = 0; k < eh->eh_entries; k++, ix++) { ++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { ++ printk("k=%d, ix=0x%p, first=0x%p\n", k, ++ ix, EXT_FIRST_INDEX(eh)); ++ printk("%u <= %u\n", ++ ix->ei_block,ix[-1].ei_block); ++ } ++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); ++ if 
(block < ix->ei_block) ++ break; ++ chix = ix; ++ } ++ EXT_ASSERT(chix == path->p_idx); ++ } ++#endif ++} ++ ++/* ++ * binary search for closest extent by given block ++ */ ++static inline void ++ext3_ext_binsearch(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent *ex; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ ++ if (eh->eh_entries == 0) { ++ /* ++ * this leaf is empty yet: ++ * we get such a leaf in split/add case ++ */ ++ return; ++ } ++ ++ ext_debug(tree, "binsearch for %d: ", block); ++ ++ path->p_ext = ex = EXT_FIRST_EXTENT(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ex[l + k].ee_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ex += l; ++ path->p_ext = ex; ++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++ while (l++ < r) { ++ if (block < ex->ee_block) ++ break; ++ path->p_ext = ex++; ++ } ++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent *chex; ++ ++ chex = ex = EXT_FIRST_EXTENT(eh); ++ for (k = 0; k < eh->eh_entries; k++, ex++) { ++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); ++ if (block < ex->ee_block) ++ break; ++ chex = ex; ++ } ++ EXT_ASSERT(chex == path->p_ext); ++ } ++#endif ++} ++ ++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *eh; ++ ++ BUG_ON(tree->buffer_len == 0); ++ ext3_ext_get_access_for_root(handle, tree); ++ eh = EXT_ROOT_HDR(tree); ++ eh->eh_depth = 0; ++ eh->eh_entries = 0; ++ eh->eh_magic = EXT3_EXT_MAGIC; ++ eh->eh_max = ext3_ext_space_root(tree); ++ ext3_ext_mark_root_dirty(handle, tree); ++ ext3_ext_invalidate_cache(tree); ++ return 0; 
++}
++
++/*
++ * looks up 'block' in the tree and records the walk in 'path':
++ * path[0] describes the root header, path[depth] the leaf.
++ * if 'path' is NULL a new array of (depth + 2) entries is allocated;
++ * the spare entry accounts for a possible depth increase.
++ * returns the path, ERR_PTR(-ENOMEM) if the array can't be allocated
++ * or ERR_PTR(-EIO) if a block can't be read or a header check fails.
++ * NOTE: on -EIO the path is released and freed even if it was
++ * supplied by the caller, so the caller must not touch it afterwards
++ */
++struct ext3_ext_path *
++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block,
++		     struct ext3_ext_path *path)
++{
++	struct ext3_extent_header *eh;
++	struct buffer_head *bh;
++	int depth, i, ppos = 0;
++
++	EXT_ASSERT(tree);
++	EXT_ASSERT(tree->inode);
++	EXT_ASSERT(tree->root);
++
++	eh = EXT_ROOT_HDR(tree);
++	EXT_ASSERT(eh);
++	if (ext3_ext_check_header(eh))
++		goto err;
++
++	i = depth = EXT_DEPTH(tree);
++	EXT_ASSERT(eh->eh_max);
++	EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++
++	/* account possible depth increase */
++	if (!path) {
++		path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2),
++			       GFP_NOFS);
++		if (!path)
++			return ERR_PTR(-ENOMEM);
++	}
++	memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++	path[0].p_hdr = eh;
++
++	/* walk through the tree */
++	while (i) {
++		ext_debug(tree, "depth %d: num %d, max %d\n",
++			  ppos, eh->eh_entries, eh->eh_max);
++		ext3_ext_binsearch_idx(tree, path + ppos, block);
++		path[ppos].p_block = path[ppos].p_idx->ei_leaf;
++		path[ppos].p_depth = i;
++		path[ppos].p_ext = NULL;
++
++		bh = sb_bread(tree->inode->i_sb, path[ppos].p_block);
++		if (!bh)
++			goto err;
++
++		eh = EXT_BLOCK_HDR(bh);
++		ppos++;
++		EXT_ASSERT(ppos <= depth);
++		/* the buffer is owned by the path from now on */
++		path[ppos].p_bh = bh;
++		path[ppos].p_hdr = eh;
++		i--;
++
++		if (ext3_ext_check_header(eh))
++			goto err;
++	}
++
++	path[ppos].p_depth = i;
++	path[ppos].p_hdr = eh;
++	path[ppos].p_ext = NULL;
++	path[ppos].p_idx = NULL;
++
++	if (ext3_ext_check_header(eh))
++		goto err;
++
++	/* find extent */
++	ext3_ext_binsearch(tree, path + ppos, block);
++
++	ext3_ext_show_path(tree, path);
++
++	return path;
++
++err:
++	printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
++	/* the root header check may fail before any path exists */
++	if (path) {
++		ext3_ext_drop_refs(path);
++		kfree(path);
++	}
++	return ERR_PTR(-EIO);
++}
++
++/*
++ * insert new index [logical;ptr] into the block at curp
++ * it checks where to insert: before curp or after curp
++ * returns 0 or the error reported by journal access/dirtying
++ */
++static int ext3_ext_insert_index(handle_t *handle,
++				 struct ext3_extents_tree *tree,
++				 struct ext3_ext_path *curp,
++				 int logical, int ptr)
++{
++	struct ext3_extent_idx *ix;
++	int len, err;
++
++	if ((err = ext3_ext_get_access(handle, tree, curp)))
++		return err;
++
++	EXT_ASSERT(logical != curp->p_idx->ei_block);
++	/* number of slots from the current index up to the maximal one */
++	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
++	if (logical > curp->p_idx->ei_block) {
++		/* insert after */
++		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
++			len = (len - 1) * sizeof(struct ext3_extent_idx);
++			len = len < 0 ? 0 : len;
++			ext_debug(tree, "insert new index %d after: %d. "
++				  "move %d from 0x%p to 0x%p\n",
++				  logical, ptr, len,
++				  (curp->p_idx + 1), (curp->p_idx + 2));
++			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
++		}
++		ix = curp->p_idx + 1;
++	} else {
++		/* insert before */
++		len = len * sizeof(struct ext3_extent_idx);
++		len = len < 0 ? 0 : len;
++		ext_debug(tree, "insert new index %d before: %d. "
++			  "move %d from 0x%p to 0x%p\n",
++			  logical, ptr, len,
++			  curp->p_idx, (curp->p_idx + 1));
++		memmove(curp->p_idx + 1, curp->p_idx, len);
++		ix = curp->p_idx;
++	}
++
++	ix->ei_block = logical;
++	ix->ei_leaf = ptr;
++	curp->p_hdr->eh_entries++;
++
++	EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max);
++	EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr));
++
++	err = ext3_ext_dirty(handle, tree, curp);
++	ext3_std_error(tree->inode->i_sb, err);
++
++	return err;
++}
++
++/*
++ * routine inserts new subtree into the path, using free index entry
++ * at depth 'at':
++ *  - allocates all needed blocks (new leaf and all intermediate index blocks)
++ *  - makes decision where to split
++ *  - moves remaining extents and index entries (right to the split point)
++ *    into the newly allocated blocks
++ *  - initialize subtree
++ * returns 0 or a negative error; on error every block allocated here
++ * is freed again
++ */
++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree,
++			  struct ext3_ext_path *path,
++			  struct ext3_extent *newext, int at)
++{
++	struct buffer_head *bh = NULL;
++	int depth = EXT_DEPTH(tree);
++	struct ext3_extent_header *neh;
++	struct ext3_extent_idx *fidx;
++	struct ext3_extent *ex;
++	int i = at, k, m, a;
++	unsigned long newblock, oldblock, border;
++	/* array of allocated blocks; element type matches the
++	 * sizeof(unsigned long) used for its allocation below */
++	unsigned long *ablocks = NULL;
++	int err = 0;
++
++	/* make decision: where to split? */
++	/* FIXME: now decision is simplest: at current extent */
++
++	/* if current leaf will be split, then we should use
++	 * border from split point */
++	EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr));
++	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
++		border = path[depth].p_ext[1].ee_block;
++		ext_debug(tree, "leaf will be splitted."
++			  " next leaf starts at %d\n",
++			  (int)border);
++	} else {
++		border = newext->ee_block;
++		ext_debug(tree, "leaf will be added."
++			  " next leaf starts at %d\n",
++			  (int)border);
++	}
++
++	/*
++	 * if error occurs, then we break processing
++	 * and turn filesystem read-only. so, index won't
++	 * be inserted and tree will be in consistent
++	 * state. next mount will repair buffers too
++	 */
++
++	/*
++	 * get array to track all allocated blocks
++	 * we need this to handle errors and free blocks
++	 * upon them
++	 */
++	ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS);
++	if (!ablocks)
++		return -ENOMEM;
++	memset(ablocks, 0, sizeof(unsigned long) * depth);
++
++	/* allocate all needed blocks */
++	ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at);
++	for (a = 0; a < depth - at; a++) {
++		newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++		if (newblock == 0)
++			goto cleanup;
++		ablocks[a] = newblock;
++	}
++
++	/* initialize new leaf */
++	newblock = ablocks[--a];
++	EXT_ASSERT(newblock);
++	bh = sb_getblk(tree->inode->i_sb, newblock);
++	if (!bh) {
++		err = -EIO;
++		goto cleanup;
++	}
++	lock_buffer(bh);
++
++	if ((err = ext3_journal_get_create_access(handle, bh)))
++		goto cleanup;
++
++	neh = EXT_BLOCK_HDR(bh);
++	neh->eh_entries = 0;
++	neh->eh_max = ext3_ext_space_block(tree);
++	neh->eh_magic = EXT3_EXT_MAGIC;
++	neh->eh_depth = 0;
++	ex = EXT_FIRST_EXTENT(neh);
++
++	/* move remain of path[depth] to the new leaf */
++	EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max);
++	/* start copy from next extent */
++	/* TODO: we could do it by single memmove */
++	m = 0;
++	path[depth].p_ext++;
++	while (path[depth].p_ext <=
++			EXT_MAX_EXTENT(path[depth].p_hdr)) {
++		ext_debug(tree, "move %d:%d:%d in new leaf %lu\n",
++			  path[depth].p_ext->ee_block,
++			  path[depth].p_ext->ee_start,
++			  path[depth].p_ext->ee_len,
++			  newblock);
++		memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent));
++		neh->eh_entries++;
++		m++;
++	}
++	set_buffer_uptodate(bh);
++	unlock_buffer(bh);
++
++	if ((err = ext3_journal_dirty_metadata(handle, bh)))
++		goto cleanup;
++	brelse(bh);
++	bh = NULL;
++
++	/* correct old leaf: 'm' extents were moved out of it */
++	if (m) {
++		if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++			goto cleanup;
++		path[depth].p_hdr->eh_entries -= m;
++		if ((err = ext3_ext_dirty(handle, tree, path + depth)))
++			goto cleanup;
++
++	}
++
++	/* create intermediate indexes */
++	k = depth - at - 1;
++	EXT_ASSERT(k >= 0);
++	if (k)
++		ext_debug(tree, "create %d intermediate indices\n", k);
++	/* insert new index into current index block */
++	/* current depth stored in i var */
++	i = depth - 1;
++	while (k--) {
++		oldblock = newblock;
++		newblock = ablocks[--a];
++		bh = sb_getblk(tree->inode->i_sb, newblock);
++		if (!bh) {
++			err = -EIO;
++			goto cleanup;
++		}
++		lock_buffer(bh);
++
++		if ((err = ext3_journal_get_create_access(handle, bh)))
++			goto cleanup;
++
++		/* new index block starts with one entry pointing to the
++		 * block created on the previous step */
++		neh = EXT_BLOCK_HDR(bh);
++		neh->eh_entries = 1;
++		neh->eh_magic = EXT3_EXT_MAGIC;
++		neh->eh_max = ext3_ext_space_block_idx(tree);
++		neh->eh_depth = depth - i;
++		fidx = EXT_FIRST_INDEX(neh);
++		fidx->ei_block = border;
++		fidx->ei_leaf = oldblock;
++
++		ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n",
++			  i, newblock, border, oldblock);
++		/* copy indexes */
++		m = 0;
++		path[i].p_idx++;
++
++		ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx,
++			  EXT_MAX_INDEX(path[i].p_hdr));
++		EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) ==
++			   EXT_LAST_INDEX(path[i].p_hdr));
++		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
++			ext_debug(tree, "%d: move %d:%d in new index %lu\n",
++				  i, path[i].p_idx->ei_block,
++				  path[i].p_idx->ei_leaf, newblock);
++			memmove(++fidx, path[i].p_idx++,
++				sizeof(struct ext3_extent_idx));
++			neh->eh_entries++;
++			EXT_ASSERT(neh->eh_entries <= neh->eh_max);
++			m++;
++		}
++		set_buffer_uptodate(bh);
++		unlock_buffer(bh);
++
++		if ((err = ext3_journal_dirty_metadata(handle, bh)))
++			goto cleanup;
++		brelse(bh);
++		bh = NULL;
++
++		/* correct old index */
++		if (m) {
++			err = ext3_ext_get_access(handle, tree, path + i);
++			if (err)
++				goto cleanup;
++			path[i].p_hdr->eh_entries -= m;
++			err = ext3_ext_dirty(handle, tree, path + i);
++			if (err)
++				goto cleanup;
++		}
++
++		i--;
++	}
++
++	/* insert new index */
++	if (!err)
++		err = ext3_ext_insert_index(handle, tree, path + at,
++					    border, newblock);
++
++cleanup:
++	if (bh) {
++		/* an error may be hit while the new buffer is still locked */
++		if (buffer_locked(bh))
++			unlock_buffer(bh);
++		brelse(bh);
++	}
++
++	if (err) {
++		/* free all allocated blocks in error case */
++		for (i = 0; i < depth; i++) {
++			if (!ablocks[i])
++				continue;
++			ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
++		}
++	}
++	kfree(ablocks);
++
++	return err;
++}
++
++/*
++ * routine implements tree growing procedure:
++ *  - allocates new block
++ *  - moves top-level data (index block or leaf) into the new block
++ *  - initialize new top-level, creating index that points to the
++ *    just created block
++ * returns 0 or a negative error
++ */
++static int ext3_ext_grow_indepth(handle_t *handle,
++				 struct ext3_extents_tree *tree,
++				 struct ext3_ext_path *path,
++				 struct ext3_extent *newext)
++{
++	struct ext3_ext_path *curp = path;
++	struct ext3_extent_header *neh;
++	struct ext3_extent_idx *fidx;
++	struct buffer_head *bh;
++	unsigned long newblock;
++	int err = 0;
++
++	newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++	if (newblock == 0)
++		return err;
++
++	bh = sb_getblk(tree->inode->i_sb, newblock);
++	if (!bh) {
++		err = -EIO;
++		ext3_std_error(tree->inode->i_sb, err);
++		return err;
++	}
++	lock_buffer(bh);
++
++	if ((err = ext3_journal_get_create_access(handle, bh))) {
++		unlock_buffer(bh);
++		goto out;
++	}
++
++	/* move top-level index/leaf into new block */
++	memmove(bh->b_data, curp->p_hdr, tree->buffer_len);
++
++	/* set size of new block */
++	neh = EXT_BLOCK_HDR(bh);
++	/* old root could have indexes or leaves
++	 * so calculate eh_max right way */
++	if (EXT_DEPTH(tree))
++		neh->eh_max = ext3_ext_space_block_idx(tree);
++	else
++		neh->eh_max = ext3_ext_space_block(tree);
++	neh->eh_magic = EXT3_EXT_MAGIC;
++	set_buffer_uptodate(bh);
++	unlock_buffer(bh);
++
++	if ((err = ext3_journal_dirty_metadata(handle, bh)))
++		goto out;
++
++	/* create index in new top-level index: num,max,pointer */
++	if ((err = ext3_ext_get_access(handle, tree, curp)))
++		goto out;
++
++	curp->p_hdr->eh_magic = EXT3_EXT_MAGIC;
++	curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree);
++	curp->p_hdr->eh_entries = 1;
++	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
++	/* FIXME: it works, but actually path[0] can be index */
++	curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
++	curp->p_idx->ei_leaf = newblock;
++
++	neh = EXT_ROOT_HDR(tree);
++	fidx = EXT_FIRST_INDEX(neh);
++	ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n",
++		  neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf);
++
++	neh->eh_depth = path->p_depth + 1;
++	err = ext3_ext_dirty(handle, tree, curp);
++out:
++	brelse(bh);
++
++	return err;
++}
++
++/*
++ * routine finds empty index and adds new leaf.
if no free index found
++ * then it requests in-depth growing
++ *
++ * returns 0 or a negative error. a failure of the split/grow step is
++ * returned at once and leaves 'path' as it was; if re-reading the path
++ * fails, ext3_ext_find_extent() has already freed it and only the
++ * error code is returned
++ */
++static int ext3_ext_create_new_leaf(handle_t *handle,
++				    struct ext3_extents_tree *tree,
++				    struct ext3_ext_path *path,
++				    struct ext3_extent *newext)
++{
++	struct ext3_ext_path *curp;
++	int depth, i, err = 0;
++
++repeat:
++	i = depth = EXT_DEPTH(tree);
++
++	/* walk up to the tree and look for free index entry */
++	curp = path + depth;
++	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
++		i--;
++		curp--;
++	}
++
++	/* we use already allocated block for index block
++	 * so, subsequent data blocks should be contiguous */
++	if (EXT_HAS_FREE_INDEX(curp)) {
++		/* if we found index with free entry, then use that
++		 * entry: create all needed subtree and add new leaf */
++		err = ext3_ext_split(handle, tree, path, newext, i);
++		if (err)
++			return err;
++
++		/* refill path */
++		ext3_ext_drop_refs(path);
++		path = ext3_ext_find_extent(tree, newext->ee_block, path);
++		if (IS_ERR(path))
++			err = PTR_ERR(path);
++	} else {
++		/* tree is full, time to grow in depth */
++		err = ext3_ext_grow_indepth(handle, tree, path, newext);
++		if (err)
++			return err;
++
++		/* refill path */
++		ext3_ext_drop_refs(path);
++		path = ext3_ext_find_extent(tree, newext->ee_block, path);
++		/* an error pointer must not be dereferenced below */
++		if (IS_ERR(path))
++			return PTR_ERR(path);
++
++		/*
++		 * only first (depth 0 -> 1) produces free space
++		 * in all other cases we have to split grown tree
++		 */
++		depth = EXT_DEPTH(tree);
++		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
++			/* now we need split */
++			goto repeat;
++		}
++	}
++
++	return err;
++}
++
++/*
++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK
++ * NOTE: it consider block number from index entry as
++ * allocated block.
thus, index entries have to be consistent
++ * with leafs
++ */
++static unsigned long
++ext3_ext_next_allocated_block(struct ext3_ext_path *path)
++{
++	int leaf, lvl;
++
++	EXT_ASSERT(path != NULL);
++	leaf = path->p_depth;
++
++	/* root is the leaf and holds no extent: nothing is allocated */
++	if (leaf == 0 && path->p_ext == NULL)
++		return EXT_MAX_BLOCK;
++
++	/* FIXME: what if index isn't full ?! */
++	/* go from the leaf towards the root and stop at the first level
++	 * where the current entry has a right neighbour */
++	for (lvl = leaf; lvl >= 0; lvl--) {
++		if (lvl == path->p_depth) {
++			/* leaf */
++			if (path[lvl].p_ext !=
++					EXT_LAST_EXTENT(path[lvl].p_hdr))
++				return path[lvl].p_ext[1].ee_block;
++		} else {
++			/* index */
++			if (path[lvl].p_idx !=
++					EXT_LAST_INDEX(path[lvl].p_hdr))
++				return path[lvl].p_idx[1].ei_block;
++		}
++	}
++
++	return EXT_MAX_BLOCK;
++}
++
++/*
++ * returns first allocated block from next leaf or EXT_MAX_BLOCK
++ */
++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree,
++					 struct ext3_ext_path *path)
++{
++	int lvl;
++
++	EXT_ASSERT(path != NULL);
++	lvl = path->p_depth;
++
++	/* zero-tree has no leaf blocks at all */
++	if (lvl == 0)
++		return EXT_MAX_BLOCK;
++
++	/* start at the index block right above the leaf and go up until
++	 * a level is found whose current index has a right neighbour */
++	for (lvl--; lvl >= 0; lvl--) {
++		if (path[lvl].p_idx !=
++				EXT_LAST_INDEX(path[lvl].p_hdr))
++			return path[lvl].p_idx[1].ei_block;
++	}
++
++	return EXT_MAX_BLOCK;
++}
++
++/*
++ * if leaf gets modified and modified extent is first in the leaf
++ * then we have to correct all indexes above
++ * TODO: do we need to correct tree in all cases?
++ */
++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree,
++			     struct ext3_ext_path *path)
++{
++	struct ext3_extent_header *eh;
++	int depth = EXT_DEPTH(tree);
++	struct ext3_extent *ex;
++	unsigned long border;
++	int k, err = 0;
++
++	eh = path[depth].p_hdr;
++	ex = path[depth].p_ext;
++	EXT_ASSERT(ex);
++	EXT_ASSERT(eh);
++
++	if (depth == 0) {
++		/* there is no tree at all */
++		return 0;
++	}
++
++	if (ex != EXT_FIRST_EXTENT(eh)) {
++		/* we correct tree if first leaf got modified only */
++		return 0;
++	}
++
++	/*
++	 * TODO: we need correction if border is smaller then current one
++	 */
++	k = depth - 1;
++	border = path[depth].p_ext->ee_block;
++	if ((err = ext3_ext_get_access(handle, tree, path + k)))
++		return err;
++	path[k].p_idx->ei_block = border;
++	if ((err = ext3_ext_dirty(handle, tree, path + k)))
++		return err;
++
++	while (k--) {
++		/* change all left-side indexes */
++		/* stop as soon as the level below isn't at its first index */
++		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
++			break;
++		if ((err = ext3_ext_get_access(handle, tree, path + k)))
++			break;
++		path[k].p_idx->ei_block = border;
++		if ((err = ext3_ext_dirty(handle, tree, path + k)))
++			break;
++	}
++
++	return err;
++}
++
++/*
++ * returns non-zero if ex2 starts logically right after ex1 and the
++ * tree's optional ->mergable() callback doesn't object
++ */
++static int inline
++ext3_can_extents_be_merged(struct ext3_extents_tree *tree,
++			   struct ext3_extent *ex1,
++			   struct ext3_extent *ex2)
++{
++	if (ex1->ee_block + ex1->ee_len != ex2->ee_block)
++		return 0;
++
++#ifdef AGRESSIVE_TEST
++	if (ex1->ee_len >= 4)
++		return 0;
++#endif
++
++	if (!tree->ops->mergable)
++		return 1;
++
++	return tree->ops->mergable(ex1, ex2);
++}
++
++/*
++ * this routine tries to merge requested extent into the existing
++ * extent or inserts requested extent as new one into the tree,
++ * creating new leaf in no-space case
++ * returns 0 or a negative error
++ */
++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree,
++			   struct ext3_ext_path *path,
++			   struct ext3_extent *newext)
++{
++	struct ext3_extent_header * eh;
++	struct ext3_extent *ex, *fex;
++	struct ext3_extent *nearex; /* nearest extent */
++	struct ext3_ext_path *npath = NULL;
++	int depth, len, err, next;
++
++	EXT_ASSERT(newext->ee_len > 0);
++	depth = EXT_DEPTH(tree);
++	ex = path[depth].p_ext;
++	EXT_ASSERT(path[depth].p_hdr);
++
++	/* try to insert block into found extent and return */
++	if (ex && ext3_can_extents_be_merged(tree, ex, newext)) {
++		ext_debug(tree, "append %d block to %d:%d (from %d)\n",
++			  newext->ee_len, ex->ee_block, ex->ee_len,
++			  ex->ee_start);
++		if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++			return err;
++		ex->ee_len += newext->ee_len;
++		eh = path[depth].p_hdr;
++		nearex = ex;
++		goto merge;
++	}
++
++repeat:
++	depth = EXT_DEPTH(tree);
++	eh = path[depth].p_hdr;
++	if (eh->eh_entries < eh->eh_max)
++		goto has_space;
++
++	/* probably next leaf has space for us? */
++	fex = EXT_LAST_EXTENT(eh);
++	next = ext3_ext_next_leaf_block(tree, path);
++	if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) {
++		ext_debug(tree, "next leaf block - %d\n", next);
++		EXT_ASSERT(!npath);
++		npath = ext3_ext_find_extent(tree, next, NULL);
++		if (IS_ERR(npath))
++			return PTR_ERR(npath);
++		EXT_ASSERT(npath->p_depth == path->p_depth);
++		eh = npath[depth].p_hdr;
++		if (eh->eh_entries < eh->eh_max) {
++			ext_debug(tree, "next leaf isnt full(%d)\n",
++				  eh->eh_entries);
++			/* npath stays owned here and is freed at cleanup */
++			path = npath;
++			goto repeat;
++		}
++		ext_debug(tree, "next leaf hasno free space(%d,%d)\n",
++			  eh->eh_entries, eh->eh_max);
++	}
++
++	/*
++	 * there is no free space in found leaf
++	 * we're gonna add new leaf in the tree
++	 */
++	err = ext3_ext_create_new_leaf(handle, tree, path, newext);
++	if (err)
++		goto cleanup;
++	depth = EXT_DEPTH(tree);
++	eh = path[depth].p_hdr;
++
++has_space:
++	nearex = path[depth].p_ext;
++
++	if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++		goto cleanup;
++
++	if (!nearex) {
++		/* there is no extent in this leaf, create first one */
++		ext_debug(tree, "first extent in the leaf: %d:%d:%d\n",
++			  newext->ee_block, newext->ee_start,
++			  newext->ee_len);
++		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
++	} else if (newext->ee_block > nearex->ee_block) {
++		EXT_ASSERT(newext->ee_block != nearex->ee_block);
++		if (nearex != EXT_LAST_EXTENT(eh)) {
++			len = EXT_MAX_EXTENT(eh) - nearex;
++			len = (len - 1) * sizeof(struct ext3_extent);
++			len = len < 0 ? 0 : len;
++			ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, "
++				  "move %d from 0x%p to 0x%p\n",
++				  newext->ee_block, newext->ee_start,
++				  newext->ee_len,
++				  nearex, len, nearex + 1, nearex + 2);
++			memmove(nearex + 2, nearex + 1, len);
++		}
++		path[depth].p_ext = nearex + 1;
++	} else {
++		EXT_ASSERT(newext->ee_block != nearex->ee_block);
++		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
++		len = len < 0 ? 0 : len;
++		/* report the addresses the memmove() below really uses */
++		ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, "
++			  "move %d from 0x%p to 0x%p\n",
++			  newext->ee_block, newext->ee_start, newext->ee_len,
++			  nearex, len, nearex, nearex + 1);
++		memmove(nearex + 1, nearex, len);
++		path[depth].p_ext = nearex;
++	}
++
++	eh->eh_entries++;
++	nearex = path[depth].p_ext;
++	nearex->ee_block = newext->ee_block;
++	nearex->ee_start = newext->ee_start;
++	nearex->ee_len = newext->ee_len;
++	/* FIXME: support for large fs */
++	nearex->ee_start_hi = 0;
++
++merge:
++	/* try to merge extents to the right */
++	while (nearex < EXT_LAST_EXTENT(eh)) {
++		if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1))
++			break;
++		/* merge with next extent! */
++		nearex->ee_len += nearex[1].ee_len;
++		if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
++			/* close the hole left by the absorbed extent */
++			len = (EXT_LAST_EXTENT(eh) - nearex - 1) *
++				sizeof(struct ext3_extent);
++			memmove(nearex + 1, nearex + 2, len);
++		}
++		eh->eh_entries--;
++		EXT_ASSERT(eh->eh_entries > 0);
++	}
++
++	/* try to merge extents to the left */
++
++	/* time to correct all indexes above */
++	err = ext3_ext_correct_indexes(handle, tree, path);
++	if (err)
++		goto cleanup;
++
++	err = ext3_ext_dirty(handle, tree, path + depth);
++
++cleanup:
++	if (npath) {
++		ext3_ext_drop_refs(npath);
++		kfree(npath);
++	}
++	ext3_ext_tree_changed(tree);
++	ext3_ext_invalidate_cache(tree);
++	return err;
++}
++
++/*
++ * walks over logical blocks [block; block + num) and calls 'func'
++ * for every piece of it: either an existing extent (reported in full,
++ * type EXT3_EXT_CACHE_EXTENT) or a gap (EXT3_EXT_CACHE_GAP).
++ * callback result: negative - stop and return it, EXT_REPEAT - look
++ * the same block up again, EXT_BREAK - stop and return 0, anything
++ * else - continue right after the reported piece
++ */
++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block,
++			unsigned long num, ext_prepare_callback func)
++{
++	struct ext3_ext_path *path = NULL;
++	struct ext3_ext_cache cbex;
++	struct ext3_extent *ex;
++	unsigned long next, start = 0, end = 0;
++	unsigned long last = block + num;
++	int depth, exists, err = 0;
++
++	EXT_ASSERT(tree);
++	EXT_ASSERT(func);
++	EXT_ASSERT(tree->inode);
++	EXT_ASSERT(tree->root);
++
++	while (block < last && block != EXT_MAX_BLOCK) {
++		num = last - block;
++		/* find extent for this block */
++		path = ext3_ext_find_extent(tree, block, path);
++		if (IS_ERR(path)) {
++			/* find_extent has already freed the path */
++			err = PTR_ERR(path);
++			path = NULL;
++			break;
++		}
++
++		depth = EXT_DEPTH(tree);
++		EXT_ASSERT(path[depth].p_hdr);
++		ex = path[depth].p_ext;
++		next = ext3_ext_next_allocated_block(path);
++
++		exists = 0;
++		if (!ex) {
++			/* there is no extent yet, so try to allocate
++			 * all requested space */
++			start = block;
++			end = block + num;
++		} else if (ex->ee_block > block) {
++			/* need to allocate space before found extent */
++			start = block;
++			end = ex->ee_block;
++			if (block + num < end)
++				end = block + num;
++		} else if (block >= ex->ee_block + ex->ee_len) {
++			/* need to allocate space after found extent */
++			start = block;
++			end = block + num;
++			if (end >= next)
++				end = next;
++		} else if (block >= ex->ee_block) {
++			/*
++			 * some part of requested space is covered
++			 * by found extent
++			 */
++			start = block;
++			end = ex->ee_block + ex->ee_len;
++			if (block + num < end)
++				end = block + num;
++			exists = 1;
++		} else {
++			BUG();
++		}
++		EXT_ASSERT(end > start);
++
++		if (!exists) {
++			cbex.ec_block = start;
++			cbex.ec_len = end - start;
++			cbex.ec_start = 0;
++			cbex.ec_type = EXT3_EXT_CACHE_GAP;
++		} else {
++			cbex.ec_block = ex->ee_block;
++			cbex.ec_len = ex->ee_len;
++			cbex.ec_start = ex->ee_start;
++			cbex.ec_type = EXT3_EXT_CACHE_EXTENT;
++		}
++
++		EXT_ASSERT(cbex.ec_len > 0);
++		EXT_ASSERT(path[depth].p_hdr);
++		err = func(tree, path, &cbex);
++		ext3_ext_drop_refs(path);
++
++		if (err < 0)
++			break;
++		if (err == EXT_REPEAT)
++			continue;
++		else if (err == EXT_BREAK) {
++			err = 0;
++			break;
++		}
++
++		if (EXT_DEPTH(tree) != depth) {
++			/* depth was changed. we have to realloc path */
++			kfree(path);
++			path = NULL;
++		}
++
++		block = cbex.ec_block + cbex.ec_len;
++	}
++
++	if (path) {
++		ext3_ext_drop_refs(path);
++		kfree(path);
++	}
++
++	return err;
++}
++
++/*
++ * stores [block; block + len) -> start with the given type in the
++ * tree's cache slot; does nothing if the tree has no cache storage
++ */
++static inline void
++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block,
++		      __u32 len, __u32 start, int type)
++{
++	EXT_ASSERT(len > 0);
++	if (tree->cex) {
++		tree->cex->ec_type = type;
++		tree->cex->ec_block = block;
++		tree->cex->ec_len = len;
++		tree->cex->ec_start = start;
++	}
++}
++
++/*
++ * this routine calculates boundaries of the gap requested block fits into
++ * and caches this gap
++ */
++static inline void
++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree,
++			  struct ext3_ext_path *path,
++			  unsigned long block)
++{
++	int depth = EXT_DEPTH(tree);
++	unsigned long lblock, len;
++	struct ext3_extent *ex;
++
++	if (!tree->cex)
++		return;
++
++	ex = path[depth].p_ext;
++	if (ex == NULL) {
++		/* there is no extent yet, so gap is [0;-] */
++		lblock = 0;
++		len = EXT_MAX_BLOCK;
++		ext_debug(tree, "cache gap(whole file):");
++	} else if (block < ex->ee_block) {
++		lblock = block;
++		len = ex->ee_block - block;
++		ext_debug(tree, "cache gap(before): %lu [%lu:%lu]",
++			  (unsigned long) block,
++			  (unsigned long) ex->ee_block,
++			  (unsigned long) ex->ee_len);
++	} else if (block >= ex->ee_block + ex->ee_len) {
++		lblock = ex->ee_block + ex->ee_len;
++		/* 'len' temporarily holds the next allocated block */
++		len = ext3_ext_next_allocated_block(path);
++		ext_debug(tree, "cache gap(after): [%lu:%lu] %lu",
++			  (unsigned long) ex->ee_block,
++			  (unsigned long) ex->ee_len,
++			  (unsigned long) block);
++		EXT_ASSERT(len > lblock);
++		len = len - lblock;
++	} else {
++		lblock = len = 0;
++		BUG();
++	}
++
++	ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len);
++	ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP);
++}
++
++/*
++ * checks whether 'block' is covered by the cached extent/gap.
++ * on a hit 'ex' is filled from the cache and the cached type is
++ * returned, otherwise EXT3_EXT_CACHE_NO
++ */
++static inline int
++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block,
++		  struct ext3_extent *ex)
++{
++	struct ext3_ext_cache *cex = tree->cex;
++
++	/* is there cache storage at all? */
++	if (!cex)
++		return EXT3_EXT_CACHE_NO;
++
++	/* has cache valid data? */
++	if (cex->ec_type == EXT3_EXT_CACHE_NO)
++		return EXT3_EXT_CACHE_NO;
++
++	EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP ||
++		   cex->ec_type == EXT3_EXT_CACHE_EXTENT);
++	if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
++		ex->ee_block = cex->ec_block;
++		ex->ee_start = cex->ec_start;
++		ex->ee_len = cex->ec_len;
++		ext_debug(tree, "%lu cached by %lu:%lu:%lu\n",
++			  (unsigned long) block,
++			  (unsigned long) ex->ee_block,
++			  (unsigned long) ex->ee_len,
++			  (unsigned long) ex->ee_start);
++		return cex->ec_type;
++	}
++
++	/* not in cache */
++	return EXT3_EXT_CACHE_NO;
++}
++
++/*
++ * routine removes index from the index block
++ * it's used in truncate case only.
thus all requests are for
++ * last index in the block only
++ * 'path' points to the level being removed; the entry is dropped
++ * from the index block one level above (path - 1)
++ */
++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree,
++		    struct ext3_ext_path *path)
++{
++	struct buffer_head *bh;
++	int err;
++
++	/* free index block */
++	path--;
++	EXT_ASSERT(path->p_hdr->eh_entries);
++	if ((err = ext3_ext_get_access(handle, tree, path)))
++		return err;
++	path->p_hdr->eh_entries--;
++	if ((err = ext3_ext_dirty(handle, tree, path)))
++		return err;
++	ext_debug(tree, "index is empty, remove it, free block %d\n",
++		  path->p_idx->ei_leaf);
++	bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
++	ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
++	ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
++	return err;
++}
++
++/*
++ * estimates number of journal credits needed to insert one extent.
++ * returns 1 if 'path' is given and its leaf still has a free slot,
++ * otherwise the worst case (growing in depth plus index split)
++ */
++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree,
++				     struct ext3_ext_path *path)
++{
++	int depth = EXT_DEPTH(tree);
++	int needed;
++
++	if (path) {
++		/* probably there is space in leaf? */
++		if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max)
++			return 1;
++	}
++
++	/*
++	 * the worst case we're expecting is creation of the
++	 * new root (growing in depth) with index splitting
++	 * for splitting we have to consider depth + 1 because
++	 * previous growing could increase it
++	 */
++	depth = depth + 1;
++
++	/*
++	 * growing in depth:
++	 * block allocation + new root + old root
++	 */
++	needed = EXT3_ALLOC_NEEDED + 2;
++
++	/* index split. we may need:
++	 *   allocate intermediate indexes and new leaf
++	 *   change two blocks at each level, but root
++	 *   modify root block (inode)
++	 */
++	needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1;
++
++	return needed;
++}
++
++/*
++ * removal of [start; end] from the middle of the extent at
++ * path[depth].p_ext: the extent is shortened to the head part and
++ * the tail part is inserted as a new extent
++ * returns 0 or a negative error
++ */
++static int
++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree,
++		      struct ext3_ext_path *path, unsigned long start,
++		      unsigned long end)
++{
++	struct ext3_extent *ex, tex;
++	struct ext3_ext_path *npath;
++	int depth, creds, err;
++
++	depth = EXT_DEPTH(tree);
++	ex = path[depth].p_ext;
++	EXT_ASSERT(ex);
++	EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1);
++	EXT_ASSERT(ex->ee_block < start);
++
++	/* calculate tail extent */
++	tex.ee_block = end + 1;
++	EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len);
++	tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block;
++	/* ee_start used to be left uninitialized although
++	 * ext3_ext_insert_extent() reads and stores it.
++	 * NOTE(review): assumes ee_start is the first physical block of
++	 * a linear mapping, so the tail starts at the same offset as in
++	 * the original extent - confirm for trees with other semantics */
++	tex.ee_start = ex->ee_start + (tex.ee_block - ex->ee_block);
++
++	creds = ext3_ext_calc_credits_for_insert(tree, path);
++	handle = ext3_ext_journal_restart(handle, creds);
++	if (IS_ERR(handle))
++		return PTR_ERR(handle);
++
++	/* calculate head extent. use primary extent */
++	err = ext3_ext_get_access(handle, tree, path + depth);
++	if (err)
++		return err;
++	ex->ee_len = start - ex->ee_block;
++	err = ext3_ext_dirty(handle, tree, path + depth);
++	if (err)
++		return err;
++
++	/* FIXME: some callback to free underlying resource
++	 * and correct ee_start? */
++	ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n",
++		  ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len);
++
++	/* look the head up again to get a private path for the insert */
++	npath = ext3_ext_find_extent(tree, ex->ee_block, NULL);
++	if (IS_ERR(npath))
++		return PTR_ERR(npath);
++	depth = EXT_DEPTH(tree);
++	EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block);
++	EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len);
++
++	err = ext3_ext_insert_extent(handle, tree, npath, &tex);
++	ext3_ext_drop_refs(npath);
++	kfree(npath);
++
++	return err;
++}
++
++/*
++ * removes blocks [start; end] from the leaf at path[depth], walking
++ * its extents from the last one backwards. if the leaf becomes empty
++ * and isn't the root, its index entry is removed as well
++ * returns 0 or a negative error
++ */
++static int
++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree,
++		 struct ext3_ext_path *path, unsigned long start,
++		 unsigned long end)
++{
++	struct ext3_extent *ex, *fu = NULL, *lu, *le;
++	int err = 0, correct_index = 0;
++	int depth = EXT_DEPTH(tree), credits;
++	struct ext3_extent_header *eh;
++	unsigned a, b, block, num;
++
++	ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end);
++	if (!path[depth].p_hdr)
++		path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh);
++	eh = path[depth].p_hdr;
++	EXT_ASSERT(eh);
++	EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++	EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++
++	/* find where to start removing */
++	le = ex = EXT_LAST_EXTENT(eh);
++	while (ex != EXT_FIRST_EXTENT(eh)) {
++		if (ex->ee_block <= end)
++			break;
++		ex--;
++	}
++
++	if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) {
++		/* removal of internal part of the extent requested
++		 * tail and head must be placed in different extent
++		 * so, we have to insert one more extent */
++		path[depth].p_ext = ex;
++		return ext3_ext_split_for_rm(handle, tree, path, start, end);
++	}
++
++	lu = ex;
++	while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) {
++		ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len);
++		path[depth].p_ext = ex;
++
++		/* [a; b] is the part of this extent that gets removed */
++		a = ex->ee_block > start ? ex->ee_block : start;
++		b = ex->ee_block + ex->ee_len - 1 < end ?
++			ex->ee_block + ex->ee_len - 1 : end;
++
++		ext_debug(tree, "  border %u:%u\n", a, b);
++
++		if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) {
++			block = 0;
++			num = 0;
++			BUG();
++		} else if (a != ex->ee_block) {
++			/* remove tail of the extent */
++			block = ex->ee_block;
++			num = a - block;
++		} else if (b != ex->ee_block + ex->ee_len - 1) {
++			/* remove head of the extent */
++			/* NOTE(review): block/num describe [a; b) here,
++			 * not the part left after b - looks suspicious,
++			 * confirm whether this branch is ever reached */
++			block = a;
++			num = b - a;
++		} else {
++			/* remove whole extent: excellent! */
++			block = ex->ee_block;
++			num = 0;
++			EXT_ASSERT(a == ex->ee_block &&
++				   b == ex->ee_block + ex->ee_len - 1);
++		}
++
++		if (ex == EXT_FIRST_EXTENT(eh))
++			correct_index = 1;
++
++		credits = 1;
++		if (correct_index)
++			credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1;
++		if (tree->ops->remove_extent_credits)
++			credits+=tree->ops->remove_extent_credits(tree,ex,a,b);
++
++		handle = ext3_ext_journal_restart(handle, credits);
++		if (IS_ERR(handle)) {
++			err = PTR_ERR(handle);
++			goto out;
++		}
++
++		err = ext3_ext_get_access(handle, tree, path + depth);
++		if (err)
++			goto out;
++
++		if (tree->ops->remove_extent)
++			err = tree->ops->remove_extent(tree, ex, a, b);
++		if (err)
++			goto out;
++
++		if (num == 0) {
++			/* this extent is removed entirely mark slot unused */
++			ex->ee_start = 0;
++			eh->eh_entries--;
++			fu = ex;
++		}
++
++		ex->ee_block = block;
++		ex->ee_len = num;
++
++		err = ext3_ext_dirty(handle, tree, path + depth);
++		if (err)
++			goto out;
++
++		ext_debug(tree, "new extent: %u:%u:%u\n",
++			  ex->ee_block, ex->ee_len, ex->ee_start);
++		ex--;
++	}
++
++	if (fu) {
++		/* reuse unused slots */
++		while (lu < le) {
++			if (lu->ee_start) {
++				*fu = *lu;
++				lu->ee_start = 0;
++				fu++;
++			}
++			lu++;
++		}
++	}
++
++	if (correct_index && eh->eh_entries)
++		err = ext3_ext_correct_indexes(handle, tree, path);
++
++	/* if this leaf is free, then we should
++	 * remove it from index block above */
++	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
++		err = ext3_ext_rm_idx(handle, tree, path + depth);
++
++out:
++	return err;
++}
++
++
++/*
++ * returns the last index in 'hdr' whose ei_block is <= block,
++ * or the first index if there is no such one
++ */
++static struct ext3_extent_idx *
++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block)
++{
++	struct ext3_extent_idx *ix;
++
++	ix = EXT_LAST_INDEX(hdr);
++	while (ix != EXT_FIRST_INDEX(hdr)) {
++		if (ix->ei_block <= block)
++			break;
++		ix--;
++	}
++	return ix;
++}
++
++/*
++ * returns 1 if current index have to be freed (even partial)
++ */
++static int inline
++ext3_ext_more_to_rm(struct ext3_ext_path *path)
++{
++	EXT_ASSERT(path->p_idx);
++
++	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
++		return 0;
++
++	/*
++	 * if truncate on deeper level happened it it wasn't partial
++	 * so we have to consider current index for truncation
++	 */
++	if (path->p_hdr->eh_entries == path->p_block)
++		return 0;
++	return 1;
++}
++
++/*
++ * removes blocks [start; end] from the tree, scanning it from the
++ * right side. starts its own journal handle and stops it before
++ * returning. returns 0 or a negative error
++ */
++int ext3_ext_remove_space(struct ext3_extents_tree *tree,
++			  unsigned long start, unsigned long end)
++{
++	struct inode *inode = tree->inode;
++	struct super_block *sb = inode->i_sb;
++	int depth = EXT_DEPTH(tree);
++	struct ext3_ext_path *path;
++	handle_t *handle;
++	int i = 0, err = 0;
++
++	ext_debug(tree, "space to be removed: %lu:%lu\n", start, end);
++
++	/* probably first extent we're gonna free will be last in block */
++	handle = ext3_journal_start(inode, depth + 1);
++	if (IS_ERR(handle))
++		return PTR_ERR(handle);
++
++	ext3_ext_invalidate_cache(tree);
++
++	/*
++	 * we start scanning from right side freeing all the blocks
++	 * after i_size and walking into the deep
++	 */
++	/* GFP_NOFS as in the other allocations of this file: a journal
++	 * handle is already open here */
++	path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_NOFS);
++	/* kmalloc() reports failure with NULL, not with an ERR_PTR */
++	if (!path) {
++		ext3_error(sb, __FUNCTION__, "Can't allocate path array");
++		ext3_journal_stop(handle);
++		return -ENOMEM;
++	}
++	memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++	path[i].p_hdr = EXT_ROOT_HDR(tree);
++
++	while (i >= 0 && err == 0) {
++		if (i == depth) {
++			/* this is leaf block */
++			err = ext3_ext_rm_leaf(handle, tree, path, start, end);
++			/* root level have p_bh == NULL, brelse() eats this */
++			brelse(path[i].p_bh);
++			i--;
++			continue;
++		}
++
++		/* this is index block */
++		if (!path[i].p_hdr) {
++			ext_debug(tree, "initialize header\n");
++			path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh);
++		}
++
++		EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max);
++		EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC);
++
++		if (!path[i].p_idx) {
++			/* this level hasn't touched yet */
++			path[i].p_idx =
++				ext3_ext_last_covered(path[i].p_hdr, end);
++			path[i].p_block = path[i].p_hdr->eh_entries + 1;
++			ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n",
++				  path[i].p_hdr, path[i].p_hdr->eh_entries);
++		} else {
++			/* we've already was here, see at next index */
++			path[i].p_idx--;
++		}
++
++		ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n",
++			  i, EXT_FIRST_INDEX(path[i].p_hdr),
++			  path[i].p_idx);
++		if (ext3_ext_more_to_rm(path + i)) {
++			/* go to the next level */
++			ext_debug(tree, "move to level %d (block %d)\n",
++				  i + 1, path[i].p_idx->ei_leaf);
++			memset(path + i + 1, 0, sizeof(*path));
++			path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf);
++			if (!path[i+1].p_bh) {
++				/* should we reset i_size? */
++				err = -EIO;
++				break;
++			}
++			/* put actual number of indexes to know is this
++			 * number got changed at the next iteration */
++			path[i].p_block = path[i].p_hdr->eh_entries;
++			i++;
++		} else {
++			/* we finish processing this index, go up */
++			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
++				/* index is empty, remove it
++				 * handle must be already prepared by the
++				 * truncatei_leaf() */
++				err = ext3_ext_rm_idx(handle, tree, path + i);
++			}
++			/* root level have p_bh == NULL, brelse() eats this */
++			brelse(path[i].p_bh);
++			i--;
++			ext_debug(tree, "return to level %d\n", i);
++		}
++	}
++
++	/* if the walk was stopped by an error, the buffers of levels
++	 * 1..i are still held; after a complete walk i is -1 */
++	for (; i > 0; i--)
++		brelse(path[i].p_bh);
++
++	/* TODO: flexible tree reduction should be here */
++	if (path->p_hdr->eh_entries == 0) {
++		/*
++		 * truncate to zero freed all the tree
++		 * so, we need to correct eh_depth
++		 */
++		err = ext3_ext_get_access(handle, tree, path);
++		if (err == 0) {
++			EXT_ROOT_HDR(tree)->eh_depth = 0;
++			EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree);
++			err = ext3_ext_dirty(handle, tree, path);
++		}
++	}
++	ext3_ext_tree_changed(tree);
++
++	kfree(path);
++	ext3_journal_stop(handle);
++
++	return err;
++}
++
++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks)
++{
++	int lcap, icap, rcap, leafs, idxs, num;
++
++	rcap = ext3_ext_space_root(tree);
++	if (blocks <= rcap) {
++		/* all extents fit to the root */
++		return 0;
++	}
++
++	rcap = ext3_ext_space_root_idx(tree);
++	lcap = ext3_ext_space_block(tree);
++	icap = ext3_ext_space_block_idx(tree);
++
++	num = leafs = (blocks + lcap - 1) / lcap;
++	if (leafs <= rcap) {
++		/* all pointers to leafs fit to the root */
++		return leafs;
++	}
++
++	/* ok.
we need separate index block(s) to link all leaf blocks */ ++ idxs = (leafs + icap - 1) / icap; ++ do { ++ num += idxs; ++ idxs = (idxs + icap - 1) / icap; ++ } while (idxs > rcap); ++ ++ return num; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) { ++ printk("EXT3-fs: file extents enabled"); ++#ifdef AGRESSIVE_TEST ++ printk(", agressive tests"); ++#endif ++#ifdef CHECK_BINSEARCH ++ printk(", check binsearch"); ++#endif ++ printk("\n"); ++ } ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++} ++ ++/************************************************************************ ++ * VFS related routines ++ ************************************************************************/ ++ ++static int ext3_get_inode_write_access(handle_t *handle, void *buffer) ++{ ++ /* we use in-core data, not bh */ ++ return 0; ++} ++ ++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct inode *inode = buffer; ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static int ext3_ext_mergable(struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ /* FIXME: support for large fs */ ++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) ++ return 1; ++ return 0; ++} ++ ++static int ++ext3_remove_blocks_credits(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed; ++ ++ /* at present, extent can't cross block group */; ++ needed = 4; /* bitmap + group desc + sb + inode */ ++ ++#ifdef CONFIG_QUOTA ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ return needed; ++} ++ ++static int ++ext3_remove_blocks(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed = ext3_remove_blocks_credits(tree, ex, from, to); ++ handle_t *handle = 
ext3_journal_start(tree->inode, needed); ++ struct buffer_head *bh; ++ int i; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { ++ /* tail removal */ ++ unsigned long num, start; ++ num = ex->ee_block + ex->ee_len - from; ++ start = ex->ee_start + ex->ee_len - num; ++ ext_debug(tree, "free last %lu blocks starting %lu\n", ++ num, start); ++ for (i = 0; i < num; i++) { ++ bh = sb_find_get_block(tree->inode->i_sb, start + i); ++ ext3_forget(handle, 0, tree->inode, bh, start + i); ++ } ++ ext3_free_blocks(handle, tree->inode, start, num); ++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { ++ printk("strange request: removal %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } else { ++ printk("strange request: removal(2) %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } ++ ext3_journal_stop(handle); ++ return 0; ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, ++ struct ext3_ext_path *path, unsigned long block) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ struct ext3_extent *ex; ++ depth = path->p_depth; ++ ++ /* try to predict block placement */ ++ if ((ex = path[depth].p_ext)) ++ return ex->ee_start + (block - ex->ee_block); ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. 
use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour + block; ++} ++ ++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int *err) ++{ ++ struct inode *inode = tree->inode; ++ int newblock, goal; ++ ++ EXT_ASSERT(path); ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->ee_start); ++ EXT_ASSERT(ex->ee_len); ++ ++ /* reuse block from the extent to order data/metadata */ ++ newblock = ex->ee_start++; ++ ex->ee_len--; ++ if (ex->ee_len == 0) { ++ ex->ee_len = 1; ++ /* allocate new block for the extent */ ++ goal = ext3_ext_find_goal(inode, path, ex->ee_block); ++ ex->ee_start = ext3_new_block(handle, inode, goal, err); ++ if (ex->ee_start == 0) { ++ /* error occured: restore old extent */ ++ ex->ee_start = newblock; ++ return 0; ++ } ++ } ++ return newblock; ++} ++ ++static struct ext3_extents_helpers ext3_blockmap_helpers = { ++ .get_write_access = ext3_get_inode_write_access, ++ .mark_buffer_dirty = ext3_mark_buffer_dirty, ++ .mergable = ext3_ext_mergable, ++ .new_block = ext3_new_block_cb, ++ .remove_extent = ext3_remove_blocks, ++ .remove_extent_credits = ext3_remove_blocks_credits, ++}; ++ ++void ext3_init_tree_desc(struct ext3_extents_tree *tree, ++ struct inode *inode) ++{ ++ tree->inode = inode; ++ tree->root = (void *) EXT3_I(inode)->i_data; ++ tree->buffer = (void *) inode; ++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); ++ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; ++ tree->ops = &ext3_blockmap_helpers; ++} ++ ++int ext3_ext_get_block(handle_t *handle, struct inode *inode, ++ long iblock, struct buffer_head *bh_result, ++ int create, int extend_disksize) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent newex; ++ struct 
ext3_extent *ex; ++ int goal, newblock, err = 0, depth; ++ struct ext3_extents_tree tree; ++ ++ clear_buffer_new(bh_result); ++ ext3_init_tree_desc(&tree, inode); ++ ext_debug(&tree, "block %d requested for inode %u\n", ++ (int) iblock, (unsigned) inode->i_ino); ++ down(&EXT3_I(inode)->truncate_sem); ++ ++ /* check in cache */ ++ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { ++ if (goal == EXT3_EXT_CACHE_GAP) { ++ if (!create) { ++ /* block isn't allocated yet and ++ * user don't want to allocate it */ ++ goto out2; ++ } ++ /* we should allocate requested block */ ++ } else if (goal == EXT3_EXT_CACHE_EXTENT) { ++ /* block is already allocated */ ++ newblock = iblock - newex.ee_block + newex.ee_start; ++ goto out; ++ } else { ++ EXT_ASSERT(0); ++ } ++ } ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(&tree, iblock, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ goto out2; ++ } ++ ++ depth = EXT_DEPTH(&tree); ++ ++ /* ++ * consistent leaf must not be empty ++ * this situations is possible, though, _during_ tree modification ++ * this is why assert can't be put in ext3_ext_find_extent() ++ */ ++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); ++ ++ if ((ex = path[depth].p_ext)) { ++ /* if found exent covers block, simple return it */ ++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { ++ newblock = iblock - ex->ee_block + ex->ee_start; ++ ext_debug(&tree, "%d fit into %d:%d -> %d\n", ++ (int) iblock, ex->ee_block, ex->ee_len, ++ newblock); ++ ext3_ext_put_in_cache(&tree, ex->ee_block, ++ ex->ee_len, ex->ee_start, ++ EXT3_EXT_CACHE_EXTENT); ++ goto out; ++ } ++ } ++ ++ /* ++ * requested block isn't allocated yet ++ * we couldn't try to create block if create flag is zero ++ */ ++ if (!create) { ++ /* put just found gap into cache to speedup subsequest reqs */ ++ ext3_ext_put_gap_in_cache(&tree, path, iblock); ++ goto out2; ++ } ++ ++ /* allocate new block */ ++ goal = 
ext3_ext_find_goal(inode, path, iblock); ++ newblock = ext3_new_block(handle, inode, goal, &err); ++ if (!newblock) ++ goto out2; ++ ext_debug(&tree, "allocate new block: goal %d, found %d\n", ++ goal, newblock); ++ ++ /* try to insert new extent into found leaf and return */ ++ newex.ee_block = iblock; ++ newex.ee_start = newblock; ++ newex.ee_len = 1; ++ err = ext3_ext_insert_extent(handle, &tree, path, &newex); ++ if (err) ++ goto out2; ++ ++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ++ /* previous routine could use block we allocated */ ++ newblock = newex.ee_start; ++ set_buffer_new(bh_result); ++ ++ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, ++ newex.ee_start, EXT3_EXT_CACHE_EXTENT); ++out: ++ ext3_ext_show_leaf(&tree, path); ++ map_bh(bh_result, inode->i_sb, newblock); ++out2: ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ up(&EXT3_I(inode)->truncate_sem); ++ ++ return err; ++} ++ ++void ext3_ext_truncate(struct inode * inode, struct page *page) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct super_block *sb = inode->i_sb; ++ struct ext3_extents_tree tree; ++ unsigned long last_block; ++ handle_t *handle; ++ int err = 0; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ err = ext3_writepage_trans_blocks(inode) + 3; ++ handle = ext3_journal_start(inode, err); ++ if (IS_ERR(handle)) { ++ if (page) { ++ clear_highpage(page); ++ flush_dcache_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ return; ++ } ++ ++ if (page) ++ ext3_block_truncate_page(handle, page, mapping, inode->i_size); ++ ++ down(&EXT3_I(inode)->truncate_sem); ++ ext3_ext_invalidate_cache(&tree); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need not scaning at all, ++ * because page truncation is enough ++ */ ++ if (ext3_orphan_add(handle, inode)) ++ 
goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ last_block = (inode->i_size + sb->s_blocksize - 1) >> ++ EXT3_BLOCK_SIZE_BITS(sb); ++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. ++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ up(&EXT3_I(inode)->truncate_sem); ++ ext3_journal_stop(handle); ++} ++ ++/* ++ * this routine calculate max number of blocks we could modify ++ * in order to allocate new block for an inode ++ */ ++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) ++{ ++ struct ext3_extents_tree tree; ++ int needed; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); ++ ++ /* caller want to allocate num blocks */ ++ needed *= num; ++ ++#ifdef CONFIG_QUOTA ++ /* ++ * FIXME: real calculation should be here ++ * it depends on blockmap format of qouta file ++ */ ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return needed; ++} ++ ++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ext3_extent_tree_init(handle, &tree); ++} ++ ++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ return ext3_ext_calc_metadata_amount(&tree, blocks); ++} ++ ++static int 
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *newex) ++{ ++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; ++ ++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ if (buf->err < 0) ++ return EXT_BREAK; ++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) ++ return EXT_BREAK; ++ ++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { ++ buf->err++; ++ buf->cur += sizeof(*newex); ++ } else { ++ buf->err = -EFAULT; ++ return EXT_BREAK; ++ } ++ return EXT_CONTINUE; ++} ++ ++static int ++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_ext_cache *ex) ++{ ++ struct ext3_extent_tree_stats *buf = ++ (struct ext3_extent_tree_stats *) tree->private; ++ int depth; ++ ++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT) ++ return EXT_CONTINUE; ++ ++ depth = EXT_DEPTH(tree); ++ buf->extents_num++; ++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) ++ buf->leaf_num++; ++ return EXT_CONTINUE; ++} ++ ++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err = 0; ++ ++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) ++ return -EINVAL; ++ ++ if (cmd == EXT3_IOC_GET_EXTENTS) { ++ struct ext3_extent_buf buf; ++ struct ext3_extents_tree tree; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ ++ ext3_init_tree_desc(&tree, inode); ++ buf.cur = buf.buffer; ++ buf.err = 0; ++ tree.private = &buf; ++ down(&EXT3_I(inode)->truncate_sem); ++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, ++ ext3_ext_store_extent_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (err == 0) ++ err = buf.err; ++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { ++ struct ext3_extent_tree_stats buf; ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ down(&EXT3_I(inode)->truncate_sem); ++ buf.depth = 
EXT_DEPTH(&tree); ++ buf.extents_num = 0; ++ buf.leaf_num = 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, ++ ext3_ext_collect_stats_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (!err && copy_to_user((void *) arg, &buf, sizeof(buf))) ++ err = -EFAULT; /* copy_to_user() returns bytes left, not an errno */ ++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { ++ struct ext3_extents_tree tree; ++ ext3_init_tree_desc(&tree, inode); ++ down(&EXT3_I(inode)->truncate_sem); ++ err = EXT_DEPTH(&tree); ++ up(&EXT3_I(inode)->truncate_sem); ++ } ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(ext3_init_tree_desc); ++EXPORT_SYMBOL(ext3_mark_inode_dirty); ++EXPORT_SYMBOL(ext3_ext_invalidate_cache); ++EXPORT_SYMBOL(ext3_ext_insert_extent); ++EXPORT_SYMBOL(ext3_ext_walk_space); ++EXPORT_SYMBOL(ext3_ext_find_goal); ++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); +Index: linux-2.6.12-rc6/fs/ext3/ialloc.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/ialloc.c 2005-06-14 16:31:08.634433030 +0200 ++++ linux-2.6.12-rc6/fs/ext3/ialloc.c 2005-06-14 16:31:25.846346882 +0200 +@@ -598,7 +598,7 @@ + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + +- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; ++ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL); + if (S_ISLNK(mode)) + ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); + /* dirsync only applies to directories */ +@@ -639,6 +639,18 @@ + DQUOT_FREE_INODE(inode); + goto fail2; + } ++ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { ++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++ if (err) goto fail; ++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); ++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 
++ } ++ } ++ + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); +Index: linux-2.6.12-rc6/fs/ext3/inode.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/inode.c 2005-06-14 16:31:09.701815830 +0200 ++++ linux-2.6.12-rc6/fs/ext3/inode.c 2005-06-14 16:31:25.861971882 +0200 +@@ -40,7 +40,7 @@ + #include "iopen.h" + #include "acl.h" + +-static int ext3_writepage_trans_blocks(struct inode *inode); ++int ext3_writepage_trans_blocks(struct inode *inode); + + /* + * Test whether an inode is a fast symlink. +@@ -784,6 +784,17 @@ + return err; + } + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create, int extend_disksize) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create, ++ extend_disksize); ++ return ext3_get_block_handle(handle, inode, block, bh, create, ++ extend_disksize); ++} ++ + static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) + { +@@ -794,8 +805,8 @@ + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, +- bh_result, create, 1); ++ ret = ext3_get_block_wrap(handle, inode, iblock, ++ bh_result, create, 1); + return ret; + } + +@@ -839,7 +850,7 @@ + + get_block: + if (ret == 0) +- ret = ext3_get_block_handle(handle, inode, iblock, ++ ret = ext3_get_block_wrap(handle, inode, iblock, + bh_result, create, 0); + bh_result->b_size = (1 << inode->i_blkbits); + return ret; +@@ -859,7 +870,7 @@ + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); ++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = 
sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1593,7 +1604,7 @@ + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, struct page *page, ++int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) + { + unsigned long index = from >> PAGE_CACHE_SHIFT; +@@ -2104,6 +2115,9 @@ + return; + } + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode, page); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { +@@ -2850,12 +2864,15 @@ + * block and work out the exact number of indirects which are touched. Pah. + */ + +-static int ext3_writepage_trans_blocks(struct inode *inode) ++int ext3_writepage_trans_blocks(struct inode *inode) + { + int bpp = ext3_journal_blocks_per_page(inode); + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +Index: linux-2.6.12-rc6/fs/ext3/Makefile +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/Makefile 2005-06-14 16:31:09.179354899 +0200 ++++ linux-2.6.12-rc6/fs/ext3/Makefile 2005-06-14 16:31:25.872714069 +0200 +@@ -5,7 +5,7 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\ +- ioctl.o namei.o super.o symlink.o hash.o resize.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-2.6.12-rc6/fs/ext3/super.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/super.c 2005-06-14 
16:31:09.950839264 +0200 ++++ linux-2.6.12-rc6/fs/ext3/super.c 2005-06-14 16:31:25.886385944 +0200 +@@ -387,6 +387,7 @@ + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -451,6 +452,8 @@ + #endif + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; ++ ++ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); + return &ei->vfs_inode; + } + +@@ -593,7 +596,7 @@ + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, +- Opt_ignore, Opt_barrier, Opt_err, Opt_resize, ++ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug, + }; + + static match_table_t tokens = { +@@ -644,6 +647,8 @@ + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, ++ {Opt_extents, "extents"}, ++ {Opt_extdebug, "extdebug"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -953,6 +958,12 @@ + case Opt_nobh: + set_opt(sbi->s_mount_opt, NOBH); + break; ++ case Opt_extents: ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_extdebug: ++ set_opt (sbi->s_mount_opt, EXTDEBUG); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1668,6 +1681,7 @@ + percpu_counter_mod(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + ++ ext3_ext_init(sb); + lock_kernel(); + return 0; + +Index: linux-2.6.12-rc6/fs/ext3/ioctl.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/ioctl.c 2005-06-14 16:31:08.646151780 +0200 ++++ linux-2.6.12-rc6/fs/ext3/ioctl.c 2005-06-14 16:31:25.897128131 +0200 +@@ -124,6 +124,10 @@ + err = ext3_change_inode_journal_flag(inode, jflag); + return err; + } ++ case EXT3_IOC_GET_EXTENTS: ++ case EXT3_IOC_GET_TREE_STATS: ++ case EXT3_IOC_GET_TREE_DEPTH: ++ return 
ext3_ext_ioctl(inode, filp, cmd, arg); + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); +Index: linux-2.6.12-rc6/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_fs.h 2005-06-14 16:31:10.185214261 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_fs.h 2005-06-14 16:31:52.859041864 +0200 +@@ -186,8 +186,9 @@ + #define EXT3_NOTAIL_FL 0x00008000 /* don't merge file tail */ + #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ + #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +-#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ ++#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + +@@ -237,6 +238,9 @@ + #endif + #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) + #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) ++#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) ++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) ++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) + + /* + * Structure of an inode on the disk +@@ -360,6 +364,8 @@ + #define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */ + #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000 /* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -548,11 +554,13 @@ + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 ++#define 
EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ + + #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ +- EXT3_FEATURE_INCOMPAT_META_BG) ++ EXT3_FEATURE_INCOMPAT_META_BG| \ ++ EXT3_FEATURE_INCOMPAT_EXTENTS) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) +@@ -759,6 +767,7 @@ + + + /* inode.c */ ++extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); + extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +@@ -828,6 +837,16 @@ + extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int, int); ++extern void ext3_ext_truncate(struct inode *, struct page *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); ++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); ++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); + + #endif /* __KERNEL__ */ + +Index: linux-2.6.12-rc6/include/linux/ext3_extents.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_extents.h 2005-06-14 16:31:25.780917195 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_extents.h 2005-06-14 16:31:25.932284381 +0200 +@@ -0,0 +1,264 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, 
info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#ifndef _LINUX_EXT3_EXTENTS ++#define _LINUX_EXT3_EXTENTS ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if CHECK_BINSEARCH defined, then results of binary search ++ * will be checked by linear search ++ */ ++#define CHECK_BINSEARCH_ ++ ++/* ++ * if EXT_DEBUG is defined you can use 'extdebug' mount option ++ * to get lots of info what's going on ++ */ ++#define EXT_DEBUG_ ++#ifdef EXT_DEBUG ++#define ext_debug(tree,fmt,a...) \ ++do { \ ++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0) ++#else ++#define ext_debug(tree,fmt,a...) ++#endif ++ ++/* ++ * if EXT_STATS is defined then stats numbers are collected ++ * these number will be displayed at umount time ++ */ ++#define EXT_STATS_ ++ ++ ++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 mean there is no tree yet. 
all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 ee_block; /* first logical block extent covers */ ++ __u16 ee_len; /* number of blocks covered by extent */ ++ __u16 ee_start_hi; /* high 16 bits of physical block */ ++ __u32 ee_start; /* low 32 bigs of physical block */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 ei_block; /* index covers logical blocks from 'block' */ ++ __u32 ei_leaf; /* pointer to the physical block of the next * ++ * level. leaf or next index could bet here */ ++ __u16 ei_leaf_hi; /* high 16 bits of physical block */ ++ __u16 ei_unused; ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 eh_magic; /* probably will support different formats */ ++ __u16 eh_entries; /* number of valid entries */ ++ __u16 eh_max; /* capacity of store in entries */ ++ __u16 eh_depth; /* has tree real underlaying blocks? 
*/ ++ __u32 eh_generation; /* generation of the tree */ ++}; ++ ++#define EXT3_EXT_MAGIC 0xf30a ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++/* ++ * structure for external API ++ */ ++ ++/* ++ * storage for cached extent ++ */ ++struct ext3_ext_cache { ++ __u32 ec_start; ++ __u32 ec_block; ++ __u32 ec_len; ++ __u32 ec_type; ++}; ++ ++#define EXT3_EXT_CACHE_NO 0 ++#define EXT3_EXT_CACHE_GAP 1 ++#define EXT3_EXT_CACHE_EXTENT 2 ++ ++/* ++ * ext3_extents_tree is used to pass initial information ++ * to top-level extents API ++ */ ++struct ext3_extents_helpers; ++struct ext3_extents_tree { ++ struct inode *inode; /* inode which tree belongs to */ ++ void *root; /* ptr to data top of tree resides at */ ++ void *buffer; /* will be passed as arg to ^^ routines */ ++ int buffer_len; ++ void *private; ++ struct ext3_ext_cache *cex;/* last found extent */ ++ struct ext3_extents_helpers *ops; ++}; ++ ++struct ext3_extents_helpers { ++ int (*get_write_access)(handle_t *h, void *buffer); ++ int (*mark_buffer_dirty)(handle_t *h, void *buffer); ++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); ++ int (*remove_extent_credits)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*remove_extent)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*new_block)(handle_t *, struct ext3_extents_tree *, ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); ++}; ++ ++/* ++ * to be called by ext3_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext3_ext_walk_space(), see below ++ * callback must return valid 
extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, ++ struct ext3_ext_path *, ++ struct ext3_ext_cache *); ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++ ++#define EXT_MAX_BLOCK 0xffffffff ++ ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++ ++#define EXT_ROOT_HDR(tree) \ ++ ((struct ext3_extent_header *) (tree)->root) ++#define EXT_BLOCK_HDR(bh) \ ++ ((struct ext3_extent_header *) (bh)->b_data) ++#define EXT_DEPTH(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) ++#define EXT_GENERATION(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++ ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++#define EXT_CHECK_PATH(tree,path) \ ++{ \ ++ int depth = EXT_DEPTH(tree); \ ++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_idx < \ ++ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_ext < \ ++ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \ ++ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \ ++ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \ ++ && depth != 0); \ ++ BUG_ON((path)[0].p_depth != depth); \ ++} ++ ++ ++/* ++ * this structure is used to 
gather extents from the tree via ioctl ++ */ ++struct ext3_extent_buf { ++ unsigned long start; ++ int buflen; ++ void *buffer; ++ void *cur; ++ int err; ++}; ++ ++/* ++ * this structure is used to collect stats info about the tree ++ */ ++struct ext3_extent_tree_stats { ++ int depth; ++ int extents_num; ++ int leaf_num; ++}; ++ ++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); ++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); ++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); ++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); ++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); ++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); ++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); ++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); ++ ++static inline void ++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) ++{ ++ if (tree->cex) ++ tree->cex->ec_type = EXT3_EXT_CACHE_NO; ++} ++ ++ ++#endif /* _LINUX_EXT3_EXTENTS */ +Index: linux-2.6.12-rc6/include/linux/ext3_fs_i.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_fs_i.h 2005-06-06 17:22:29.000000000 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_fs_i.h 2005-06-14 16:31:25.941073443 +0200 +@@ -133,6 +133,8 @@ + */ + struct semaphore truncate_sem; + struct inode vfs_inode; ++ ++ __u32 i_cached_extent[4]; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff --git a/lustre/kernel_patches/patches/ext3-external-journal-2.6.12.patch b/lustre/kernel_patches/patches/ext3-external-journal-2.6.12.patch new file mode 100644 index 0000000..bcfdae2 --- /dev/null +++ 
b/lustre/kernel_patches/patches/ext3-external-journal-2.6.12.patch @@ -0,0 +1,148 @@ +Signed-off-by: Johann Lombardi + +--- linux-2.6.12.orig/fs/ext3/super.c 2005-06-17 21:48:29.000000000 +0200 ++++ linux-2.6.12/fs/ext3/super.c 2005-11-07 13:37:30.000000000 +0100 +@@ -39,7 +39,8 @@ + #include "xattr.h" + #include "acl.h" + +-static int ext3_load_journal(struct super_block *, struct ext3_super_block *); ++static int ext3_load_journal(struct super_block *, struct ext3_super_block *, ++ unsigned long journal_devnum); + static int ext3_create_journal(struct super_block *, struct ext3_super_block *, + int); + static void ext3_commit_super (struct super_block * sb, +@@ -586,7 +587,7 @@ enum { + Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, +- Opt_commit, Opt_journal_update, Opt_journal_inum, ++ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, +@@ -624,6 +625,7 @@ static match_table_t tokens = { + {Opt_commit, "commit=%u"}, + {Opt_journal_update, "journal=update"}, + {Opt_journal_inum, "journal=%u"}, ++ {Opt_journal_dev, "journal_dev=%u"}, + {Opt_abort, "abort"}, + {Opt_data_journal, "data=journal"}, + {Opt_data_ordered, "data=ordered"}, +@@ -663,8 +665,9 @@ static unsigned long get_sb_block(void * + return sb_block; + } + +-static int parse_options (char * options, struct super_block *sb, +- unsigned long * inum, unsigned long *n_blocks_count, int is_remount) ++static int parse_options (char *options, struct super_block *sb, ++ unsigned long *inum, unsigned long *journal_devnum, ++ unsigned long *n_blocks_count, int is_remount) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); + char * p; +@@ -805,6 +808,16 @@ static int parse_options (char * 
options + return 0; + *inum = option; + break; ++ case Opt_journal_dev: ++ if (is_remount) { ++ printk(KERN_ERR "EXT3-fs: cannot specify " ++ "journal on remount\n"); ++ return 0; ++ } ++ if (match_int(&args[0], &option)) ++ return 0; ++ *journal_devnum = option; ++ break; + case Opt_noload: + set_opt (sbi->s_mount_opt, NOLOAD); + break; +@@ -1250,6 +1263,7 @@ static int ext3_fill_super (struct super + unsigned long logic_sb_block; + unsigned long offset = 0; + unsigned long journal_inum = 0; ++ unsigned long journal_devnum = 0; + unsigned long def_mount_opts; + struct inode *root; + int blocksize; +@@ -1330,7 +1344,8 @@ static int ext3_fill_super (struct super + + set_opt(sbi->s_mount_opt, RESERVATION); + +- if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0)) ++ if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, ++ NULL, 0)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | +@@ -1541,7 +1556,7 @@ static int ext3_fill_super (struct super + */ + if (!test_opt(sb, NOLOAD) && + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { +- if (ext3_load_journal(sb, es)) ++ if (ext3_load_journal(sb, es, journal_devnum)) + goto failed_mount2; + } else if (journal_inum) { + if (ext3_create_journal(sb, es, journal_inum)) +@@ -1821,15 +1836,24 @@ out_bdev: + return NULL; + } + +-static int ext3_load_journal(struct super_block * sb, +- struct ext3_super_block * es) ++static int ext3_load_journal(struct super_block *sb, ++ struct ext3_super_block *es, ++ unsigned long journal_devnum) + { + journal_t *journal; + int journal_inum = le32_to_cpu(es->s_journal_inum); +- dev_t journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); ++ dev_t journal_dev; + int err = 0; + int really_read_only; + ++ if (journal_devnum && ++ journal_devnum != le32_to_cpu(es->s_journal_dev)) { ++ printk(KERN_INFO "EXT3-fs: external journal device major/minor " ++ "numbers have changed\n"); ++ journal_dev = new_decode_dev(journal_devnum); ++ 
} else ++ journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); ++ + really_read_only = bdev_read_only(sb->s_bdev); + + /* +@@ -1888,6 +1912,16 @@ static int ext3_load_journal(struct supe + + EXT3_SB(sb)->s_journal = journal; + ext3_clear_journal_err(sb, es); ++ ++ if (journal_devnum && ++ journal_devnum != le32_to_cpu(es->s_journal_dev)) { ++ es->s_journal_dev = cpu_to_le32(journal_devnum); ++ sb->s_dirt = 1; ++ ++ /* Make sure we flush the recovery flag to disk. */ ++ ext3_commit_super(sb, es, 1); ++ } ++ + return 0; + } + +@@ -2093,13 +2127,13 @@ static int ext3_remount (struct super_bl + { + struct ext3_super_block * es; + struct ext3_sb_info *sbi = EXT3_SB(sb); +- unsigned long tmp; ++ unsigned long tmp1, tmp2; + unsigned long n_blocks_count = 0; + + /* + * Allow the "check" option to be passed as a remount option. + */ +- if (!parse_options(data, sb, &tmp, &n_blocks_count, 1)) ++ if (!parse_options(data, sb, &tmp1, &tmp2, &n_blocks_count, 1)) + return -EINVAL; + + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch new file mode 100644 index 0000000..c4c9d0b --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch @@ -0,0 +1,2463 @@ +Index: linux-2.6.12/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.6.12.orig/include/linux/ext3_fs_sb.h 2005-06-17 21:48:29.000000000 +0200 ++++ linux-2.6.12/include/linux/ext3_fs_sb.h 2005-06-21 13:59:09.186627289 +0200 +@@ -21,9 +21,29 @@ + #include + #include + #include ++#include + #endif + #include + ++#define EXT3_BB_MAX_BLOCKS 30 ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++struct ext3_buddy_group_blocks { ++ __u32 bb_bitmap; ++ __u32 bb_buddy; ++ spinlock_t bb_lock; ++ unsigned long bb_tid; ++ struct 
ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned bb_counters[]; ++}; ++ + /* + * third extended-fs super-block data in memory + */ +@@ -78,6 +98,27 @@ + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_buddy_group_blocks **s_buddy_blocks; ++ struct inode *s_buddy; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ ++ /* stats for buddy allocator */ ++ spinlock_t s_bal_lock; ++ unsigned long s_bal_reqs; /* number of reqs with len > 1 */ ++ unsigned long s_bal_success; /* we found long enough chunks */ ++ unsigned long s_bal_allocated; /* in blocks */ ++ unsigned long s_bal_ex_scanned; /* total extents scanned */ ++ unsigned long s_bal_goals; /* goal hits */ ++ unsigned long s_bal_breaks; /* too long searches */ + }; + + #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-2.6.12/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.12.orig/include/linux/ext3_fs.h 2005-06-21 13:57:16.542097419 +0200 ++++ linux-2.6.12/include/linux/ext3_fs.h 2005-06-21 13:57:25.862409805 +0200 +@@ -57,6 +57,14 @@ + #define ext3_debug(f, a...) 
do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -366,6 +374,7 @@ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000 /* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -727,7 +736,7 @@ + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, +- unsigned long); ++ unsigned long, int); + extern void ext3_free_blocks_sb (handle_t *, struct super_block *, + unsigned long, unsigned long, int *); + extern unsigned long ext3_count_free_blocks (struct super_block *); +@@ -848,6 +857,44 @@ + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern long ext3_mb_aggressive; ++extern long ext3_mb_stats; ++extern long ext3_mb_max_to_scan; ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct super_block *, int); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++ ++/* writeback.c */ ++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); ++extern int ext3_wb_prepare_write(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, 
unsigned); ++extern int ext3_wb_writepage(struct page *, struct writeback_control *); ++extern int ext3_wb_invalidatepage(struct page *, unsigned long); ++extern int ext3_wb_releasepage(struct page *, int); ++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); ++extern void ext3_wb_init(struct super_block *); ++extern void ext3_wb_release(struct super_block *); ++ ++/* writeback.c */ ++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); ++extern int ext3_wb_prepare_write(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); ++extern int ext3_wb_writepage(struct page *, struct writeback_control *); ++extern int ext3_wb_invalidatepage(struct page *, unsigned long); ++extern int ext3_wb_releasepage(struct page *, int); ++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); ++extern void ext3_wb_init(struct super_block *); ++extern void ext3_wb_release(struct super_block *); ++ ++/* proc.c */ ++extern int init_ext3_proc(void); ++extern void exit_ext3_proc(void); ++ + #endif /* __KERNEL__ */ + + /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ +Index: linux-2.6.12/fs/ext3/balloc.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/balloc.c 2005-06-17 21:48:29.000000000 +0200 ++++ linux-2.6.12/fs/ext3/balloc.c 2005-06-21 13:57:25.820417618 +0200 +@@ -79,7 +79,7 @@ + * + * Return buffer_head on success or NULL in case of failure. 
+ */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -490,24 +490,6 @@ + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- unsigned long block, unsigned long count) +-{ +- struct super_block * sb; +- int dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1162,7 +1144,7 @@ + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. + */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-2.6.12/fs/ext3/extents.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/extents.c 2005-06-21 13:57:16.493269295 +0200 ++++ linux-2.6.12/fs/ext3/extents.c 2005-06-21 13:57:25.847761367 +0200 +@@ -771,7 +771,7 @@ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1428,7 +1428,7 @@ + path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + 
+@@ -1913,10 +1913,12 @@ + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1928,7 +1930,7 @@ + bh = sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-2.6.12/fs/ext3/namei.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/namei.c 2005-06-21 13:57:11.984480287 +0200 ++++ linux-2.6.12/fs/ext3/namei.c 2005-06-21 13:57:25.828230118 +0200 +@@ -1644,7 +1644,7 @@ + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
+ */ +-static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, ++int ext3_create (struct inode * dir, struct dentry * dentry, int mode, + struct nameidata *nd) + { + handle_t *handle; +Index: linux-2.6.12/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/xattr.c 2005-06-17 21:48:29.000000000 +0200 ++++ linux-2.6.12/fs/ext3/xattr.c 2005-06-21 13:57:25.854597305 +0200 +@@ -484,7 +484,7 @@ + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); +- ext3_free_blocks(handle, inode, bh->b_blocknr, 1); ++ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { +@@ -804,7 +804,7 @@ + new_bh = sb_getblk(sb, block); + if (!new_bh) { + getblk_failed: +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + error = -EIO; + goto cleanup; + } +Index: linux-2.6.12/fs/ext3/Makefile +=================================================================== +--- linux-2.6.12.orig/fs/ext3/Makefile 2005-06-21 13:57:16.514753669 +0200 ++++ linux-2.6.12/fs/ext3/Makefile 2005-06-21 13:57:25.812605118 +0200 +@@ -5,7 +5,8 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\ +- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ++ mballoc.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-2.6.12/fs/ext3/mballoc.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/mballoc.c 2005-06-21 13:57:25.736433244 +0200 ++++ linux-2.6.12/fs/ext3/mballoc.c 2005-06-21 13:57:25.795026993 +0200 +@@ -0,0 +1,1865 @@ ++/* ++ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas 
++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA ++ */ ++ ++ ++/* ++ * mballoc.c contains the multiblocks allocation routines ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * TODO: ++ * - bitmap/buddy read-ahead (proposed by Oleg Drokin aka green) ++ * - track min/max extents in each group for better group selection ++ * - is it worthwhile to use buddies directly if req is 2^N blocks? ++ * - mb_mark_used() may allocate chunk right after splitting buddy ++ * - special flag to advise allocator to look for requested + N blocks ++ * this may improve interaction between extents and mballoc ++ * - tree of groups sorted by number of free blocks ++ * - percpu reservation code (hotpath) ++ * - error handling ++ */ ++ ++/* ++ * with 'ext3_mb_aggressive' set the allocator runs consistency checks over ++ * structures. these checks slow things down a lot ++ */ ++long ext3_mb_aggressive = 0; ++ ++ ++/* ++ * with 'ext3_mb_stats' allocator will collect stats that will be ++ * shown at umount. The collecting costs though! ++ */ ++long ext3_mb_stats = 1; ++ ++/* ++ */ ++#define MB_DEBUG__ ++#ifdef MB_DEBUG ++#define mb_debug(fmt,a...) printk(fmt, ##a) ++#else ++#define mb_debug(fmt,a...) 
++#endif ++ ++/* ++ * where to save buddies structures between umount/mount (clean case only) ++ */ ++#define EXT3_BUDDY_FILE ".buddy" ++ ++/* ++ * How long mballoc can look for a best extent (in found extents) ++ */ ++long ext3_mb_max_to_scan = 100; ++ ++/* ++ * This structure is on-disk description of a group for mballoc ++ */ ++struct ext3_mb_group_descr { ++ __u16 mgd_first_free; /* first free block in the group */ ++ __u16 mgd_free; /* number of free blocks in the group */ ++ __u16 mgd_counters[16]; /* number of free blocks by order */ ++}; ++ ++/* ++ * This structure is header of mballoc's file ++ */ ++struct ext3_mb_grp_header { ++ __u32 mh_magic; ++}; ++ ++#define EXT3_MB_MAGIC_V1 0xbabd16fd ++ ++ ++struct ext3_free_extent { ++ __u16 fe_start; ++ __u16 fe_len; ++ __u16 fe_group; ++}; ++ ++struct ext3_allocation_context { ++ struct super_block *ac_sb; ++ ++ /* search goals */ ++ struct ext3_free_extent ac_g_ex; ++ ++ /* the best found extent */ ++ struct ext3_free_extent ac_b_ex; ++ ++ /* number of iterations done. 
we have to track to limit searching */ ++ unsigned long ac_ex_scanned; ++ __u16 ac_groups_scanned; ++ __u16 ac_found; ++ __u8 ac_status; ++ __u8 ac_flags; /* allocation hints */ ++ __u8 ac_repeats; ++}; ++ ++#define AC_STATUS_CONTINUE 1 ++#define AC_STATUS_FOUND 2 ++#define AC_STATUS_BREAK 3 ++ ++struct ext3_buddy { ++ struct buffer_head *bd_bh; ++ struct buffer_head *bd_bh2; ++ struct ext3_buddy_group_blocks *bd_bd; ++ struct super_block *bd_sb; ++ __u16 bd_blkbits; ++ __u16 bd_group; ++}; ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data) ++ ++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) ++ ++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); ++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); ++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); ++int ext3_mb_reserve_blocks(struct super_block *, int); ++void ext3_mb_release_blocks(struct super_block *, int); ++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); ++void ext3_mb_free_committed_blocks(struct super_block *); ++ ++#if BITS_PER_LONG == 64 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 7UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~7UL); \ ++} ++#elif BITS_PER_LONG == 32 ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ bit += ((unsigned long) addr & 3UL) << 3; \ ++ addr = (void *) ((unsigned long) addr & ~3UL); \ ++} ++#else ++#error "how many bits you are?!" 
++#endif ++ ++static inline int mb_test_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ return ext3_test_bit(bit, addr); ++} ++ ++static inline void mb_set_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext3_set_bit(bit, addr); ++} ++ ++static inline void mb_set_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext3_set_bit_atomic(NULL, bit, addr); ++} ++ ++static inline void mb_clear_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext3_clear_bit(bit, addr); ++} ++ ++static inline void mb_clear_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ ext3_clear_bit_atomic(NULL, bit, addr); ++} ++ ++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) ++{ ++ int i = 1; ++ char *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(max != NULL); ++ ++ if (order > e3b->bd_blkbits + 1) { ++ *max = 0; ++ return NULL; ++ } ++ ++ /* at order 0 we see each particular block: the bitmap itself, 8 << bd_blkbits bits */ ++ *max = 1 << (e3b->bd_blkbits + 3); ++ if (order == 0) ++ return EXT3_MB_BITMAP(e3b); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ *max = *max >> 1; ++ while (i < order) { ++ bb += 1 << (e3b->bd_blkbits - i); ++ i++; ++ *max = *max >> 1; ++ } ++ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) < ++ e3b->bd_sb->s_blocksize); ++ return bb; ++} ++ ++static int ext3_mb_load_buddy(struct super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap); ++ J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy); ++ ++ /* load bitmap. NOTE(review): if sb_getblk() fails here, the 'out' path calls brelse() on e3b->bd_bh2 before this function has assigned it -- confirm callers pass a zeroed *e3b */ ++ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap); ++ if (e3b->bd_bh == NULL) { ++ ext3_error(sb, "ext3_mb_load_buddy", ++ "can't get block for buddy bitmap\n"); ++ goto out; ++ } ++ /* load buddy. NOTE(review): the reads below are only checked with J_ASSERT(buffer_uptodate()), so a read failure is not returned to the caller -- TODO confirm intended */ ++ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy); ++ if (e3b->bd_bh2 == NULL) { ++ ext3_error(sb, 
"ext3_mb_load_buddy", ++ "can't get block for buddy bitmap\n"); ++ goto out; ++ } ++ ++ if (!buffer_uptodate(e3b->bd_bh)) ++ ll_rw_block(READ, 1, &e3b->bd_bh); ++ if (!buffer_uptodate(e3b->bd_bh2)) ++ ll_rw_block(READ, 1, &e3b->bd_bh2); ++ ++ wait_on_buffer(e3b->bd_bh); ++ J_ASSERT(buffer_uptodate(e3b->bd_bh)); ++ wait_on_buffer(e3b->bd_bh2); ++ J_ASSERT(buffer_uptodate(e3b->bd_bh2)); ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_bd = sbi->s_buddy_blocks[group]; ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ ++ return 0; ++out: ++ brelse(e3b->bd_bh); ++ brelse(e3b->bd_bh2); ++ e3b->bd_bh = NULL; ++ e3b->bd_bh2 = NULL; ++ return -EIO; ++} ++ ++static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b) ++{ ++ mark_buffer_dirty(e3b->bd_bh); ++ mark_buffer_dirty(e3b->bd_bh2); ++} ++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ brelse(e3b->bd_bh); ++ brelse(e3b->bd_bh2); ++} ++ ++static void mb_check_buddy(struct ext3_buddy *e3b) ++{ ++ int order = e3b->bd_blkbits + 1; ++ int max, max2, i, j, k, count; ++ void *buddy, *buddy2; ++ ++ if (likely(!ext3_mb_aggressive)) ++ return; ++ ++ if (!test_opt(e3b->bd_sb, MBALLOC)) ++ return; ++ ++ while (order > 1) { ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ buddy2 = mb_find_buddy(e3b, order - 1, &max2); ++ J_ASSERT(buddy2); ++ J_ASSERT(buddy != buddy2); ++ J_ASSERT(max * 2 == max2); ++ ++ count = 0; ++ for (i = 0; i < max; i++) { ++ ++ if (mb_test_bit(i, buddy)) { ++ /* bit i is set at this order: at most one of its two bits in buddy2 may be clear */ ++ if (!mb_test_bit(i << 1, buddy2)) ++ J_ASSERT(mb_test_bit((i<<1)+1, buddy2)); ++ else if (!mb_test_bit((i << 1) + 1, buddy2)) ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ continue; ++ } ++ ++ /* bit i is clear at this order: both of its bits in buddy2 must be set */ ++ J_ASSERT(mb_test_bit(i << 1, buddy2)); ++ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); ++ ++ for (j = 0; j < (1 << order); j++) { ++ k = (i * (1 << order)) + j; ++ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b))); ++ } ++ count++; ++ } ++ 
J_ASSERT(e3b->bd_bd->bb_counters[order] == count); ++ order--; ++ } ++ ++ buddy = mb_find_buddy(e3b, 0, &max); ++ for (i = 0; i < max; i++) { ++ if (!mb_test_bit(i, buddy)) ++ continue; ++ /* check set bits only: the covering bit must be set at every order j */ ++ for (j = 0; j < e3b->bd_blkbits + 1; j++) { ++ buddy2 = mb_find_buddy(e3b, j, &max2); ++ k = i >> j; ++ J_ASSERT(k < max2); ++ J_ASSERT(mb_test_bit(k, buddy2)); ++ } ++ } ++} ++ ++static inline void ++ext3_lock_group(struct super_block *sb, int group) ++{ ++ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++} ++ ++static inline void ++ext3_unlock_group(struct super_block *sb, int group) ++{ ++ spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++} ++ ++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) ++{ ++ int order = 1; ++ void *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ while (order <= e3b->bd_blkbits + 1) { ++ block = block >> 1; ++ if (!mb_test_bit(block, bb)) { ++ /* this block is part of buddy of order 'order' */ ++ return order; ++ } ++ bb += 1 << (e3b->bd_blkbits - order); ++ order++; ++ } ++ return 0; ++} ++ ++static inline void mb_clear_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear a whole 32-bit word with one plain store */ ++ addr = bm + (cur >> 3); ++ *addr = 0; ++ cur += 32; ++ continue; ++ } ++ mb_clear_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static inline void mb_set_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: set a whole 32-bit word with one plain store */ ++ addr = bm + (cur >> 3); ++ *addr = 0xffffffff; ++ cur += 32; ++ continue; ++ } ++ mb_set_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) ++{ ++ int block, max, order; 
++ void *buddy, *buddy2; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_bd->bb_free += count; ++ if (first < e3b->bd_bd->bb_first_free) ++ e3b->bd_bd->bb_first_free = first; ++ ++ while (count-- > 0) { ++ block = first++; ++ order = 0; ++ ++ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b))); ++ mb_clear_bit(block, EXT3_MB_BITMAP(e3b)); ++ e3b->bd_bd->bb_counters[order]++; ++ ++ /* start of the buddy */ ++ buddy = mb_find_buddy(e3b, order, &max); ++ ++ do { ++ block &= ~1UL; ++ if (mb_test_bit(block, buddy) || ++ mb_test_bit(block + 1, buddy)) ++ break; ++ ++ /* both the buddies are free, try to coalesce them */ ++ buddy2 = mb_find_buddy(e3b, order + 1, &max); ++ ++ if (!buddy2) ++ break; ++ ++ if (order > 0) { ++ /* for special purposes, we don't set ++ * free bits in bitmap */ ++ mb_set_bit(block, buddy); ++ mb_set_bit(block + 1, buddy); ++ } ++ e3b->bd_bd->bb_counters[order]--; ++ e3b->bd_bd->bb_counters[order]--; ++ ++ block = block >> 1; ++ order++; ++ e3b->bd_bd->bb_counters[order]++; ++ ++ mb_clear_bit(block, buddy2); ++ buddy = buddy2; ++ } while (1); ++ } ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, ++ int needed, struct ext3_free_extent *ex) ++{ ++ int next, max, ord; ++ void *buddy; ++ ++ J_ASSERT(ex != NULL); ++ ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ J_ASSERT(block < max); ++ if (mb_test_bit(block, buddy)) { ++ ex->fe_len = 0; ++ ex->fe_start = 0; ++ ex->fe_group = 0; ++ return 0; ++ } ++ ++ if (order == 0) { ++ /* find actual order */ ++ order = mb_find_order_for_block(e3b, block); ++ block = block >> order; ++ } ++ ++ ex->fe_len = 1 << order; ++ ex->fe_start = block << order; ++ ex->fe_group = e3b->bd_group; ++ ++ while ((buddy = mb_find_buddy(e3b, order, &max))) { ++ ++ if (block + 1 >= max) ++ break; ++ ++ next = (block + 1) * (1 << order); ++ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b))) ++ break; ++ ++ ord = mb_find_order_for_block(e3b, next); ++ ++ 
order = ord; ++ block = next >> order; ++ ex->fe_len += 1 << order; ++ } ++ ++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); ++ return ex->fe_len; ++} ++ ++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) ++{ ++ int start = ex->fe_start; ++ int len = ex->fe_len; ++ int ord, mlen, max, cur; ++ int len0 = len; ++ void *buddy; ++ ++ e3b->bd_bd->bb_free -= len; ++ if (e3b->bd_bd->bb_first_free == start) ++ e3b->bd_bd->bb_first_free += len; ++ ++ while (len) { ++ ord = mb_find_order_for_block(e3b, start); ++ ++ if (((start >> ord) << ord) == start && len >= (1 << ord)) { ++ /* the whole chunk may be allocated at once! */ ++ mlen = 1 << ord; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ J_ASSERT((start >> ord) < max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_bd->bb_counters[ord]--; ++ start += mlen; ++ len -= mlen; ++ J_ASSERT(len >= 0); ++ continue; ++ } ++ ++ /* we have to split large buddy */ ++ J_ASSERT(ord > 0); ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_set_bit(start >> ord, buddy); ++ e3b->bd_bd->bb_counters[ord]--; ++ ++ ord--; ++ cur = (start >> ord) & ~1U; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_clear_bit(cur, buddy); ++ mb_clear_bit(cur + 1, buddy); ++ e3b->bd_bd->bb_counters[ord]++; ++ e3b->bd_bd->bb_counters[ord]++; ++ } ++ ++ /* now drop all the bits in bitmap */ ++ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); ++ ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++/* ++ * Must be called under group lock! ++ */ ++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ++ mb_mark_used(e3b, &ac->ac_b_ex); ++ ac->ac_status = AC_STATUS_FOUND; ++} ++ ++/* ++ * The routine checks whether found extent is good enough. If it is, ++ * then the extent gets marked used and flag is set to the context ++ * to stop scanning. 
Otherwise, the extent is compared with the ++ * previous found extent and if new one is better, then it's stored ++ * in the context. Later, the best found extent will be used, if ++ * mballoc can't find good enough extent. ++ * ++ * FIXME: real allocation policy is to be designed yet! ++ */ ++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, ++ struct ext3_free_extent *ex, ++ struct ext3_buddy *e3b) ++{ ++ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor; ++ struct ext3_free_extent *bex = &ac->ac_b_ex; ++ int diff = ac->ac_g_ex.fe_len - ex->fe_len; ++ ++ J_ASSERT(ex->fe_len > 0); ++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ ++ ac->ac_found++; ++ ++ /* ++ * The special case - take what you catch first ++ */ ++ if (ac->ac_flags & EXT3_MB_HINT_FIRST) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Let's check whether the chuck is good enough ++ */ ++ if (ex->fe_len >= ac->ac_g_ex.fe_len) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If the request is vey large, then it makes sense to use large ++ * chunks for it. Even if they don't satisfy whole request. ++ */ ++ if (ex->fe_len > 1000) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Sometimes it's worty to take close chunk ++ */ ++ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If this is first found extent, just store it in the context ++ */ ++ if (bex->fe_len == 0) { ++ *bex = *ex; ++ return; ++ } ++ ++ /* ++ * If new found extent is better, store it in the context ++ * FIXME: possible the policy should be more complex? 
++ */ ++ if (ex->fe_len > bex->fe_len) { ++ *bex = *ex; ++ } ++ ++ /* ++ * We don't want to scan for a whole year ++ */ ++ if (ac->ac_found > ext3_mb_max_to_scan) ++ ac->ac_status = AC_STATUS_BREAK; ++} ++ ++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent ex = ac->ac_b_ex; ++ int group = ex.fe_group, max, err; ++ ++ J_ASSERT(ex.fe_len > 0); ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); ++ ++ if (max > 0) ++ ext3_mb_use_best_found(ac, e3b); ++ ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ if (ac->ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(e3b); ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_free_extent ex; ++ ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max > 0) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ if (ac->ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(e3b); ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++/* ++ * The routine scans the group and measures all found extents. ++ * In order to optimize scanning, caller must pass number of ++ * free blocks in the group, so the routine can upper limit. 
++ */ ++static void ext3_mb_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ int i, free; ++ ++ free = e3b->bd_bd->bb_free; ++ J_ASSERT(free > 0); ++ ++ i = e3b->bd_bd->bb_first_free; ++ ++ while (free && ac->ac_status != AC_STATUS_FOUND) { ++ i = ext3_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i); ++ if (i >= sb->s_blocksize * 8) { ++ J_ASSERT(free == 0); ++ break; ++ } ++ ++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(free >= ex.fe_len); ++ ++ ext3_mb_measure_extent(ac, &ex, e3b); ++ ++ i += ex.fe_len; ++ free -= ex.fe_len; ++ } ++} ++ ++static int ext3_mb_good_group(struct ext3_allocation_context *ac, ++ int group, int cr) ++{ ++ int free; ++ ++ J_ASSERT(cr >= 0 && cr < 3); ++ ++ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free; ++ if (free == 0) ++ return 0; ++ ++ if (cr == 0) { ++ if (free >= ac->ac_g_ex.fe_len >> 1) ++ return 1; ++ } else if (cr == 1) { ++ if (free >= ac->ac_g_ex.fe_len >> 2) ++ return 1; ++ } else if (cr == 2) { ++ return 1; ++ } ++ return 0; ++} ++ ++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *len, int flags, int *errp) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_allocation_context ac; ++ int i, group, block, cr, err = 0; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ struct buffer_head *gdp_bh; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ ++ J_ASSERT(len != NULL); ++ J_ASSERT(*len > 0); ++ ++ sb = inode->i_sb; ++ if (!sb) { ++ printk("ext3_mb_new_nblocks: nonexistent device"); ++ return 0; ++ } ++ ++ if (!test_opt(sb, MBALLOC)) { ++ static int ext3_mballoc_warning = 0; ++ if (ext3_mballoc_warning == 0) { ++ printk(KERN_ERR "EXT3-fs: multiblock request with " ++ "mballoc disabled!\n"); ++ ext3_mballoc_warning++; ++ } ++ 
*len = 1; ++ err = ext3_new_block_old(handle, inode, goal, errp); ++ return err; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ ++ /* ++ * We can't allocate > group size ++ */ ++ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) ++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* someone asks for non-reserved blocks */ ++ BUG_ON(*len > 1); ++ err = ext3_mb_reserve_blocks(sb, 1); ++ if (err) { ++ *errp = err; ++ return 0; ++ } ++ } ++ ++ /* ++ * Check quota for allocation of this blocks. ++ */ ++ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) ++ *len -= 1; ++ if (*len == 0) { ++ *errp = -EDQUOT; ++ block = 0; ++ goto out; ++ } ++ ++ /* start searching from the goal */ ++ if (goal < le32_to_cpu(es->s_first_data_block) || ++ goal >= le32_to_cpu(es->s_blocks_count)) ++ goal = le32_to_cpu(es->s_first_data_block); ++ group = (goal - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ block = ((goal - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ /* set up allocation goals */ ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_groups_scanned = 0; ++ ac.ac_ex_scanned = 0; ++ ac.ac_found = 0; ++ ac.ac_sb = inode->i_sb; ++ ac.ac_g_ex.fe_group = group; ++ ac.ac_g_ex.fe_start = block; ++ ac.ac_g_ex.fe_len = *len; ++ ac.ac_flags = flags; ++ ++ /* ++ * Sometimes, caller may want to merge even small number ++ * of blocks to an existing extent ++ */ ++ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; ++ } ++ ++ /* ++ * FIXME ++ * If requested chunk is power of 2 length, we can try ++ * to exploit buddy nature to speed allocation up ++ */ ++ ++ ++ /* ++ * Let's just scan groups to find more-less suitable blocks ++ */ ++ cr = 0; ++repeat: ++ for (; 
cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { ++ if (group == EXT3_SB(sb)->s_groups_count) ++ group = 0; ++ ++ /* check is group good for our criteries */ ++ if (!ext3_mb_good_group(&ac, group, cr)) ++ continue; ++ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ++ ext3_lock_group(sb, group); ++ if (!ext3_mb_good_group(&ac, group, cr)) { ++ /* someone did allocation from this group */ ++ ext3_unlock_group(sb, group); ++ ext3_mb_release_desc(&e3b); ++ continue; ++ } ++ ++ ext3_mb_scan_group(&ac, &e3b); ++ ext3_unlock_group(sb, group); ++ ++ if (ac.ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ if (err) ++ goto out_err; ++ if (ac.ac_status != AC_STATUS_CONTINUE) ++ break; ++ } ++ } ++ ++ if (ac.ac_status == AC_STATUS_BREAK && ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ /* We've been searching too long. Let's try to allocate ++ * the best chunk we've found so far. */ ++ if (ac.ac_g_ex.fe_len >= 128 && ++ ac.ac_b_ex.fe_len < ac.ac_g_ex.fe_len / 4) ++ ext3_warning(inode->i_sb, __FUNCTION__, ++ "too long searching: got %d want %d\n", ++ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); ++ ext3_mb_try_best_found(&ac, &e3b); ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * Someone more lucky has already allocated it. 
++ * The only thing we can do is just take first ++ * found block(s) ++ */ ++ mb_debug(KERN_ERR "EXT3-fs: and someone won our chunk\n"); ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_flags |= EXT3_MB_HINT_FIRST; ++ cr = 2; ++ goto repeat; ++ } ++ } ++ ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * We aren't lucky definitely ++ */ ++ DQUOT_FREE_BLOCK(inode, *len); ++ *errp = -ENOSPC; ++ block = 0; ++#if 1 ++ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n", ++ ac.ac_status, ac.ac_flags); ++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n", ++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, ++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); ++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", ++ sbi->s_blocks_reserved, ac.ac_found); ++ printk("EXT3-fs: groups: "); ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) ++ printk("%d: %d ", i, ++ sbi->s_buddy_blocks[i]->bb_free); ++ printk("\n"); ++#endif ++ goto out; ++ } ++ ++found: ++ J_ASSERT(ac.ac_b_ex.fe_len > 0); ++ ++ /* good news - free block(s) have been found. 
now it's time ++ * to mark block(s) in good old journaled bitmap */ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ /* we made a desicion, now mark found blocks in good old ++ * bitmap to be journaled */ ++ ++ ext3_debug("using block group %d(%d)\n", ++ ac.ac_b_group.group, gdp->bg_free_blocks_count); ++ ++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); ++ if (!bitmap_bh) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) { ++ *errp = err; ++ goto out_err; ++ } ++ ++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); ++ if (!gdp) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ if (block == le32_to_cpu(gdp->bg_block_bitmap) || ++ block == le32_to_cpu(gdp->bg_inode_bitmap) || ++ in_range(block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error(sb, "ext3_new_block", ++ "Allocating block in system zone - " ++ "block = %u", block); ++ if (unlikely(ext3_mb_aggressive)) { ++ for (i = 0; i < ac.ac_b_ex.fe_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, ++ bitmap_bh->b_data)); ++ } ++ ++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); ++ ++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) ++ - ac.ac_b_ex.fe_len); ++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); ++ ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ if (err) ++ goto out_err; ++ err = ext3_journal_dirty_metadata(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ sb->s_dirt = 1; ++ *errp = 0; ++ 
brelse(bitmap_bh); ++ ++ /* drop non-allocated, but dquote'd blocks */ ++ J_ASSERT(*len >= ac.ac_b_ex.fe_len); ++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); ++ ++ *len = ac.ac_b_ex.fe_len; ++ J_ASSERT(*len > 0); ++ J_ASSERT(block != 0); ++ goto out; ++ ++out_err: ++ /* if we've already allocated something, roll it back */ ++ if (ac.ac_status == AC_STATUS_FOUND) { ++ /* FIXME: free blocks here */ ++ } ++ ++ DQUOT_FREE_BLOCK(inode, *len); ++ brelse(bitmap_bh); ++ *errp = err; ++ block = 0; ++out: ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* block wasn't reserved before and we reserved it ++ * at the beginning of allocation. it doesn't matter ++ * whether we allocated anything or we failed: time ++ * to release reservation. NOTE: because I expect ++ * any multiblock request from delayed allocation ++ * path only, here is single block always */ ++ ext3_mb_release_blocks(sb, 1); ++ } ++ ++ if ((ext3_mb_stats) && (ac.ac_g_ex.fe_len > 1)) { ++ spin_lock(&sbi->s_bal_lock); ++ sbi->s_bal_reqs++; ++ sbi->s_bal_allocated += *len; ++ if (*len >= ac.ac_g_ex.fe_len) ++ sbi->s_bal_success++; ++ sbi->s_bal_ex_scanned += ac.ac_found; ++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && ++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) ++ sbi->s_bal_goals++; ++ if (ac.ac_found > ext3_mb_max_to_scan) ++ sbi->s_bal_breaks++; ++ spin_unlock(&sbi->s_bal_lock); ++ } ++ ++ return block; ++} ++ ++int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh, ++ struct ext3_mb_group_descr **grp) ++{ ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int descr_per_block, err, offset; ++ struct ext3_mb_grp_header *hdr; ++ unsigned long block; ++ ++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) ++ / sizeof(struct ext3_mb_group_descr); ++ block = e3b->bd_group / descr_per_block; ++ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err); ++ if (*bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk descr for group 
%d: %d\n", ++ e3b->bd_group, err); ++ return err; ++ } ++ ++ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data; ++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { ++ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n", ++ e3b->bd_group); ++ brelse(*bh); ++ *bh = NULL; ++ return -EIO; ++ } ++ ++ offset = e3b->bd_group % descr_per_block ++ * sizeof(struct ext3_mb_group_descr) ++ + sizeof(struct ext3_mb_grp_header); ++ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset); ++ ++ return 0; ++} ++ ++int ext3_mb_load_descr(struct ext3_buddy *e3b) ++{ ++ struct ext3_mb_group_descr *grp; ++ struct ext3_group_desc *gdp; ++ struct buffer_head *bh; ++ int err, i; ++ ++ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); ++ if (err) ++ return err; ++ ++ e3b->bd_bd->bb_first_free = grp->mgd_first_free; ++ e3b->bd_bd->bb_free = grp->mgd_free; ++ for (i = 0; i <= e3b->bd_blkbits + 1; i++) { ++ J_ASSERT(i < 16); ++ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i]; ++ } ++ brelse(bh); ++ ++ /* additional checks against old group descriptor */ ++ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); ++ if (!gdp) ++ return -EIO; ++ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { ++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", ++ e3b->bd_group, e3b->bd_bd->bb_free, ++ le16_to_cpu(gdp->bg_free_blocks_count)); ++ return -ENODATA; ++ } ++ ++ return 0; ++} ++ ++ ++int ext3_mb_update_descr(struct ext3_buddy *e3b) ++{ ++ struct ext3_mb_group_descr *grp; ++ struct ext3_group_desc *gdp; ++ struct buffer_head *bh; ++ handle_t *handle; ++ int err, i; ++ ++ /* additional checks against old group descriptor */ ++ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); ++ if (!gdp) ++ return -EIO; ++ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { ++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", ++ e3b->bd_group, e3b->bd_bd->bb_free, ++ le16_to_cpu(gdp->bg_free_blocks_count)); ++ return -ENODATA; ++ } ++ ++ err = 
ext3_mb_get_descr_loc(e3b, &bh, &grp); ++ if (err) ++ return err; ++ ++ handle = ext3_journal_start_sb(e3b->bd_sb, 1); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ handle = NULL; ++ goto out; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto out; ++ grp->mgd_first_free = e3b->bd_bd->bb_first_free; ++ grp->mgd_free = e3b->bd_bd->bb_free; ++ for (i = 0; i <= e3b->bd_blkbits + 1; i++) { ++ J_ASSERT(i < 16); ++ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i]; ++ } ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto out; ++ err = 0; ++out: ++ brelse(bh); ++ if (handle) ++ ext3_journal_stop(handle); ++ return err; ++} ++ ++int ext3_mb_generate_buddy(struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = e3b->bd_sb; ++ struct buffer_head *bh; ++ int i, count = 0; ++ ++ mb_debug("generate buddy for group %d\n", e3b->bd_group); ++ memset(e3b->bd_bh->b_data, 0xff, sb->s_blocksize); ++ memset(e3b->bd_bh2->b_data, 0xff, sb->s_blocksize); ++ ++ bh = read_block_bitmap(sb, e3b->bd_group); ++ if (bh == NULL) ++ return -EIO; ++ ++ /* mb_free_blocks will set real free */ ++ e3b->bd_bd->bb_free = 0; ++ e3b->bd_bd->bb_first_free = 1 << 15; ++ /* ++ * if change bb_counters size, don't forget about ++ * ext3_mb_init_backend() -bzzz ++ */ ++ memset(e3b->bd_bd->bb_counters, 0, ++ sizeof(unsigned) * (sb->s_blocksize_bits + 2)); ++ ++ /* loop over the blocks, and create buddies for free ones */ ++ for (i = 0; i < sb->s_blocksize * 8; i++) { ++ if (!mb_test_bit(i, (void *) bh->b_data)) { ++ mb_free_blocks(e3b, i, 1); ++ count++; ++ } ++ } ++ brelse(bh); ++ mb_check_buddy(e3b); ++ ext3_mb_dirty_buddy(e3b); ++ ++ return 0; ++} ++ ++EXPORT_SYMBOL(ext3_mb_new_blocks); ++ ++#define MB_CREDITS \ ++ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \ ++ 2 * EXT3_SINGLEDATA_TRANS_BLOCKS) ++ ++int ext3_mb_init_backend(struct super_block *sb, int *created) ++{ ++ int err, i, len, descr_per_block, buddy_offset, size; ++ struct 
inode *root = sb->s_root->d_inode; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_mb_grp_header *hdr; ++ struct buffer_head *bh = NULL; ++ unsigned long block; ++ struct dentry *db; ++ handle_t *handle; ++ tid_t target; ++ ++ *created = 0; ++ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; ++ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_buddy_blocks == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ return -ENOMEM; ++ } ++ memset(sbi->s_buddy_blocks, 0, len); ++ sbi->s_buddy = NULL; ++ ++ down(&root->i_sem); ++ len = strlen(EXT3_BUDDY_FILE); ++ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len); ++ if (IS_ERR(db)) { ++ err = PTR_ERR(db); ++ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err); ++ up(&root->i_sem); ++ goto out; ++ } ++ ++ if (db->d_inode == NULL) { ++ err = ext3_create(root, db, S_IFREG, NULL); ++ if (err) { ++ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err); ++ up(&root->i_sem); ++ goto out; ++ } ++ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME; ++ *created = 1; ++ mb_debug("no buddy file, regenerate\n"); ++ } ++ up(&root->i_sem); ++ sbi->s_buddy = igrab(db->d_inode); ++ ++ /* calculate needed size */ ++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) ++ / sizeof(struct ext3_mb_group_descr); ++ buddy_offset = (sbi->s_groups_count + descr_per_block - 1) ++ / descr_per_block; ++ len = sbi->s_groups_count * sb->s_blocksize * 2 + ++ buddy_offset * sb->s_blocksize; ++ if (len != i_size_read(sbi->s_buddy)) { ++ if (*created == 0) ++ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n", ++ (unsigned) len, ++ (unsigned) i_size_read(sbi->s_buddy)); ++ *created = 1; ++ } ++ ++ /* read/create mb group descriptors */ ++ for (i = 0; i < buddy_offset; i++) { ++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); ++ if (IS_ERR(handle)) { ++ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); ++ err = PTR_ERR(handle); ++ goto err_out; ++ } 
++ ++ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err); ++ if (bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err); ++ goto err_out; ++ } ++ hdr = (struct ext3_mb_grp_header *) bh->b_data; ++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto err_out; ++ if (*created == 0) ++ printk(KERN_ERR ++ "EXT3-fs: invalid header 0x%x in %d," ++ "regenerate\n", hdr->mh_magic, i); ++ *created = 1; ++ hdr->mh_magic = EXT3_MB_MAGIC_V1; ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto err_out; ++ } ++ brelse(bh); ++ ext3_journal_stop(handle); ++ } ++ ++ /* ++ * if change bb_counters size, don't forget about ext3_mb_generate_buddy() ++ */ ++ len = sizeof(struct ext3_buddy_group_blocks); ++ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ ++ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_buddy_blocks[i] == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ err = -ENOMEM; ++ goto out2; ++ } ++ memset(sbi->s_buddy_blocks[i], 0, len); ++ ++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); ++ if (IS_ERR(handle)) { ++ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); ++ err = PTR_ERR(handle); ++ goto out2; ++ } ++ ++ /* allocate block for bitmap */ ++ block = buddy_offset + i * 2; ++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); ++ if (bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err); ++ goto out2; ++ } ++ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; ++ brelse(bh); ++ ++ /* allocate block for buddy */ ++ block = buddy_offset + i * 2 + 1; ++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); ++ if (bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err); ++ goto out2; ++ } ++ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; ++ brelse(bh); ++ ++ size = (block + 1) << sbi->s_buddy->i_blkbits; ++ if (size > 
sbi->s_buddy->i_size) { ++ *created = 1; ++ EXT3_I(sbi->s_buddy)->i_disksize = size; ++ i_size_write(sbi->s_buddy, size); ++ mark_inode_dirty(sbi->s_buddy); ++ } ++ ext3_journal_stop(handle); ++ ++ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock); ++ sbi->s_buddy_blocks[i]->bb_md_cur = NULL; ++ sbi->s_buddy_blocks[i]->bb_tid = 0; ++ } ++ ++ if (journal_start_commit(sbi->s_journal, &target)) ++ log_wait_commit(sbi->s_journal, target); ++ ++out2: ++ dput(db); ++out: ++ return err; ++ ++err_out: ++ return err; ++} ++ ++int ext3_mb_write_descriptors(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_buddy e3b; ++ int ret = 0, i, err; ++ ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_buddy_blocks[i] == NULL) ++ continue; ++ ++ err = ext3_mb_load_buddy(sb, i, &e3b); ++ if (err == 0) { ++ ext3_mb_update_descr(&e3b); ++ ext3_mb_release_desc(&e3b); ++ } else ++ ret = err; ++ } ++ return ret; ++} ++ ++int ext3_mb_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* release freed, non-committed blocks */ ++ spin_lock(&sbi->s_md_lock); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_committed_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ ext3_mb_free_committed_blocks(sb); ++ ++ if (sbi->s_buddy_blocks) { ++ ext3_mb_write_descriptors(sb); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_buddy_blocks[i] == NULL) ++ continue; ++ kfree(sbi->s_buddy_blocks[i]); ++ } ++ kfree(sbi->s_buddy_blocks); ++ } ++ if (sbi->s_buddy) ++ iput(sbi->s_buddy); ++ if (sbi->s_blocks_reserved) ++ printk("ext3-fs: %ld blocks being reserved at umount!\n", ++ sbi->s_blocks_reserved); ++ if (ext3_mb_stats) { ++ printk("EXT3-fs: mballoc: %lu blocks %lu reqs " ++ "(%lu success)\n", sbi->s_bal_allocated, ++ sbi->s_bal_reqs, sbi->s_bal_success); ++ 
printk("EXT3-fs: mballoc: %lu extents scanned, " ++ "%lu goal hits, %lu breaks\n", sbi->s_bal_ex_scanned, ++ sbi->s_bal_goals, sbi->s_bal_breaks); ++ } ++ ++ return 0; ++} ++ ++int ext3_mb_init(struct super_block *sb, int needs_recovery) ++{ ++ struct ext3_buddy e3b; ++ int i, err, created; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* init file for buddy data */ ++ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); ++ if ((err = ext3_mb_init_backend(sb, &created))) ++ return err; ++ ++repeat: ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { ++ err = ext3_mb_load_buddy(sb, i, &e3b); ++ if (err) { ++ /* FIXME: release backend */ ++ return err; ++ } ++ if (created || needs_recovery) ++ ext3_mb_generate_buddy(&e3b); ++ else ++ err = ext3_mb_load_descr(&e3b); ++ ext3_mb_release_desc(&e3b); ++ if (err == -ENODATA) { ++ created = 1; ++ goto repeat; ++ } ++ } ++ if (created || needs_recovery) ++ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n", ++ EXT3_SB(sb)->s_groups_count); ++ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); ++ spin_lock_init(&EXT3_SB(sb)->s_md_lock); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); ++ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); ++ ++ spin_lock_init(&EXT3_SB(sb)->s_bal_lock); ++ if (ext3_mb_stats) { ++ printk("EXT3-fs: mballoc enabled (stats)\n"); ++ } else { ++ printk("EXT3-fs: mballoc enabled\n"); ++ } ++ ++ return 0; ++} ++ ++void ext3_mb_free_committed_blocks(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int err, i, count = 0, count2 = 0; ++ struct ext3_free_metadata *md; ++ struct ext3_buddy e3b; ++ ++ if (list_empty(&sbi->s_committed_transaction)) ++ return; ++ ++ /* there is committed blocks to be freed yet */ ++ do { ++ /* get next array of blocks */ ++ md = NULL; ++ spin_lock(&sbi->s_md_lock); ++ if (!list_empty(&sbi->s_committed_transaction)) { ++ md = 
list_entry(sbi->s_committed_transaction.next, ++ struct ext3_free_metadata, list); ++ list_del(&md->list); ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ if (md == NULL) ++ break; ++ ++ mb_debug("gonna free %u blocks in group %u (0x%p):", ++ md->num, md->group, md); ++ ++ err = ext3_mb_load_buddy(sb, md->group, &e3b); ++ BUG_ON(err != 0); ++ ++ /* there are blocks to put in buddy to make them really free */ ++ count += md->num; ++ count2++; ++ ext3_lock_group(sb, md->group); ++ for (i = 0; i < md->num; i++) { ++ mb_debug(" %u", md->blocks[i]); ++ mb_free_blocks(&e3b, md->blocks[i], 1); ++ } ++ mb_debug("\n"); ++ ext3_unlock_group(sb, md->group); ++ ++ kfree(md); ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ } while (md); ++ mb_debug("freed %u blocks in %u structures\n", count, count2); ++} ++ ++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ if (sbi->s_last_transaction == handle->h_transaction->t_tid) ++ return; ++ ++ /* new transaction! time to close last one and free blocks for ++ * committed transaction. we know that only transaction can be ++ * active, so previos transaction can be being logged and we ++ * know that transaction before previous is known to be alreade ++ * logged. this means that now we may free blocks freed in all ++ * transactions before previous one. hope I'm clear enough ... 
*/ ++ ++ spin_lock(&sbi->s_md_lock); ++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { ++ mb_debug("new transaction %lu, old %lu\n", ++ (unsigned long) handle->h_transaction->t_tid, ++ (unsigned long) sbi->s_last_transaction); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_closed_transaction); ++ sbi->s_last_transaction = handle->h_transaction->t_tid; ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ ext3_mb_free_committed_blocks(sb); ++} ++ ++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, ++ int group, int block, int count) ++{ ++ struct ext3_buddy_group_blocks *db = e3b->bd_bd; ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_free_metadata *md; ++ int i; ++ ++ ext3_lock_group(sb, group); ++ for (i = 0; i < count; i++) { ++ md = db->bb_md_cur; ++ if (md && db->bb_tid != handle->h_transaction->t_tid) { ++ db->bb_md_cur = NULL; ++ md = NULL; ++ } ++ ++ if (md == NULL) { ++ ext3_unlock_group(sb, group); ++ md = kmalloc(sizeof(*md), GFP_KERNEL); ++ if (md == NULL) ++ return -ENOMEM; ++ md->num = 0; ++ md->group = group; ++ ++ ext3_lock_group(sb, group); ++ if (db->bb_md_cur == NULL) { ++ spin_lock(&sbi->s_md_lock); ++ list_add(&md->list, &sbi->s_active_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ db->bb_md_cur = md; ++ db->bb_tid = handle->h_transaction->t_tid; ++ mb_debug("new md 0x%p for group %u\n", ++ md, md->group); ++ } else { ++ kfree(md); ++ md = db->bb_md_cur; ++ } ++ } ++ ++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); ++ md->blocks[md->num] = block + i; ++ md->num++; ++ if (md->num == EXT3_BB_MAX_BLOCKS) { ++ /* no more space, put full container on a sb's list */ ++ db->bb_md_cur = NULL; ++ } ++ } ++ ext3_unlock_group(sb, group); ++ return 0; ++} ++ ++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, ++ unsigned long block, unsigned long count, ++ int metadata, int *freed) 
++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ unsigned long bit, overflow; ++ struct buffer_head *gd_bh; ++ unsigned long block_group; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ int err = 0, ret; ++ ++ *freed = 0; ++ sb = inode->i_sb; ++ if (!sb) { ++ printk ("ext3_free_blocks: nonexistent device"); ++ return; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ if (block < le32_to_cpu(es->s_first_data_block) || ++ block + count < block || ++ block + count > le32_to_cpu(es->s_blocks_count)) { ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks not in datazone - " ++ "block = %lu, count = %lu", block, count); ++ goto error_return; ++ } ++ ++ ext3_debug("freeing block %lu\n", block); ++ ++do_more: ++ overflow = 0; ++ block_group = (block - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ bit = (block - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb); ++ /* ++ * Check to see if we are freeing blocks across a group ++ * boundary. 
++ */ ++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { ++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); ++ count -= overflow; ++ } ++ brelse(bitmap_bh); ++ bitmap_bh = read_block_bitmap(sb, block_group); ++ if (!bitmap_bh) ++ goto error_return; ++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); ++ if (!gdp) ++ goto error_return; ++ ++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || ++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || ++ in_range (block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group) || ++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks in system zones - " ++ "Block = %lu, count = %lu", ++ block, count); ++ ++ BUFFER_TRACE(bitmap_bh, "getting write access"); ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) ++ goto error_return; ++ ++ /* ++ * We are about to modify some metadata. Call the journal APIs ++ * to unshare ->b_data if a currently-committing transaction is ++ * using it ++ */ ++ BUFFER_TRACE(gd_bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, gd_bh); ++ if (err) ++ goto error_return; ++ ++ err = ext3_mb_load_buddy(sb, block_group, &e3b); ++ if (err) ++ goto error_return; ++ ++ if (unlikely(ext3_mb_aggressive)) { ++ int i; ++ for (i = 0; i < count; i++) ++ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data)); ++ } ++ ++ mb_clear_bits(bitmap_bh->b_data, bit, count); ++ ++ /* We dirtied the bitmap block */ ++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ ++ if (metadata) { ++ /* blocks being freed are metadata. 
these blocks shouldn't ++ * be used until this transaction is committed */ ++ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); ++ } else { ++ ext3_lock_group(sb, block_group); ++ mb_free_blocks(&e3b, bit, count); ++ ext3_unlock_group(sb, block_group); ++ } ++ ++ spin_lock(sb_bgl_lock(sbi, block_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); ++ spin_unlock(sb_bgl_lock(sbi, block_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, count); ++ ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ *freed = count; ++ ++ /* And the group descriptor block */ ++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ++ ret = ext3_journal_dirty_metadata(handle, gd_bh); ++ if (!err) err = ret; ++ ++ if (overflow && !err) { ++ block += count; ++ count = overflow; ++ goto do_more; ++ } ++ sb->s_dirt = 1; ++error_return: ++ brelse(bitmap_bh); ++ ext3_std_error(sb, err); ++ return; ++} ++ ++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int free, ret = -ENOSPC; ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); ++ if (blocks <= free - sbi->s_blocks_reserved) { ++ sbi->s_blocks_reserved += blocks; ++ ret = 0; ++ } ++ spin_unlock(&sbi->s_reserve_lock); ++ return ret; ++} ++ ++void ext3_mb_release_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ sbi->s_blocks_reserved -= blocks; ++ WARN_ON(sbi->s_blocks_reserved < 0); ++ if (sbi->s_blocks_reserved < 0) ++ sbi->s_blocks_reserved = 0; ++ spin_unlock(&sbi->s_reserve_lock); ++} ++ ++int ext3_new_block(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *errp) ++{ ++ int ret, len; ++ ++ if (!test_opt(inode->i_sb, MBALLOC)) { ++ ret = ext3_new_block_old(handle, inode, goal, errp); ++ goto 
out; ++ } ++ len = 1; ++ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); ++out: ++ return ret; ++} ++ ++void ext3_free_blocks(handle_t *handle, struct inode *inode, ++ unsigned long block, unsigned long count, int metadata) ++{ ++ int freed; ++ ++ if (!test_opt(inode->i_sb, MBALLOC) || ++ EXT3_SB(inode->i_sb)->s_buddy_blocks == NULL) ++ ext3_free_blocks_sb(handle, inode->i_sb, block, count, &freed); ++ else ++ ext3_mb_free_blocks(handle, inode, block,count,metadata,&freed); ++ ++ if (freed) ++ DQUOT_FREE_BLOCK(inode, freed); ++ return; ++} +Index: linux-2.6.12/fs/ext3/proc.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/proc.c 2005-10-13 19:40:57.851699336 +0400 ++++ linux-2.6.12/fs/ext3/proc.c 2005-10-14 09:10:31.000000000 +0400 +@@ -0,0 +1,195 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++#define EXT3_ROOT "ext3" ++#define EXT3_MB_AGGRESSIVE_NAME "mb_aggressive" ++#define EXT3_MB_STATS_NAME "mb_stats" ++#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" ++ ++ ++static struct proc_dir_entry *proc_root_ext3; ++ ++ ++static int ext3_mb_aggressive_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_aggressive); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_aggressive_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32] = { 0 }; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3: %s string to long, max %zu bytes\n", ++ EXT3_MB_AGGRESSIVE_NAME, sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ ext3_mb_aggressive = (simple_strtol(str, NULL, 0) != 0); ++ return count; ++} ++ ++static int 
ext3_mb_stats_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_stats); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_stats_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32] = { 0 }; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3: %s string to long, max %zu bytes\n", ++ EXT3_MB_STATS_NAME, sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0); ++ return count; ++} ++ ++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan); ++ *start = page; ++ return len; ++} ++ ++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32] = { 0 }; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3: %s string to long, max %zu bytes\n", ++ EXT3_MB_MAX_TO_SCAN_NAME, sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Accept only strictly positive values; zero and negatives get -ERANGE */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_max_to_scan = value; ++ ++ return count; ++} ++ ++int __init init_ext3_proc(void) ++{ ++ struct proc_dir_entry *proc_ext3_mb_aggressive; ++ struct proc_dir_entry *proc_ext3_mb_stats; ++ struct proc_dir_entry *proc_ext3_mb_max_to_scan; ++ ++ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); ++ if (proc_root_ext3 == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT); ++ return -EIO; ++ } ++ ++ /* Initialize 
EXT3_MB_AGGRESSIVE_NAME */ ++ proc_ext3_mb_aggressive = create_proc_entry(EXT3_MB_AGGRESSIVE_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_aggressive == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_AGGRESSIVE_NAME); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_aggressive->data = NULL; ++ proc_ext3_mb_aggressive->read_proc = ext3_mb_aggressive_read; ++ proc_ext3_mb_aggressive->write_proc = ext3_mb_aggressive_write; ++ ++ /* Initialize EXT3_MB_STATS_NAME */ ++ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_stats == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_STATS_NAME); ++ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_stats->data = NULL; ++ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read; ++ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write; ++ ++ /* Initialize EXT3_MB_MAX_TO_SCAN_NAME */ ++ proc_ext3_mb_max_to_scan = create_proc_entry( ++ EXT3_MB_MAX_TO_SCAN_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_max_to_scan == NULL) { ++ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ EXT3_MB_MAX_TO_SCAN_NAME); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_max_to_scan->data = NULL; ++ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read; ++ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write; ++ ++ return 0; ++} ++ ++void exit_ext3_proc(void) ++{ ++ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, 
proc_root_fs); ++} +Index: linux-2.6.12/fs/ext3/inode.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/inode.c 2005-06-21 13:57:16.507917732 +0200 ++++ linux-2.6.12/fs/ext3/inode.c 2005-06-21 13:57:25.837019180 +0200 +@@ -564,7 +564,7 @@ + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -1850,7 +1850,7 @@ + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2023,7 +2023,7 @@ + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-2.6.12/fs/ext3/super.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/super.c 2005-06-21 13:57:16.526472419 +0200 ++++ linux-2.6.12/fs/ext3/super.c 2005-06-21 13:57:25.802839493 +0200 +@@ -387,6 +387,7 @@ + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -594,7 +595,7 @@ + Opt_commit, Opt_journal_update, Opt_journal_inum, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, +- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, ++ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_mballoc, Opt_mbfactor, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug, + }; +@@ -649,6 +650,8 @@ + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, ++ {Opt_mbfactor, "mbfactor=%u"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, 
"resize"}, +@@ -964,6 +967,16 @@ + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt (sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_mbfactor: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_mb_factor = option; ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1668,6 +1681,7 @@ + ext3_count_dirs(sb)); + + ext3_ext_init(sb); ++ ext3_mb_init(sb, needs_recovery); + lock_kernel(); + return 0; + +@@ -2450,7 +2464,13 @@ + + static int __init init_ext3_fs(void) + { +- int err = init_ext3_xattr(); ++ int err; ++ ++ err = init_ext3_proc(); ++ if (err) ++ return err; ++ ++ err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); +@@ -2441,6 +2461,7 @@ + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); ++ exit_ext3_proc(); + } + + int ext3_prep_san_write(struct inode *inode, long *blocks, diff --git a/lustre/kernel_patches/patches/ext3-nlinks-2.6.12.patch b/lustre/kernel_patches/patches/ext3-nlinks-2.6.12.patch new file mode 100644 index 0000000..6c3ebe1 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-nlinks-2.6.12.patch @@ -0,0 +1,161 @@ +Index: linux-2.6.7/fs/ext3/namei.c +=================================================================== +--- linux-2.6.7.orig/fs/ext3/namei.c 2004-06-15 23:19:36.000000000 -0600 ++++ linux-2.6.7/fs/ext3/namei.c 2004-08-20 17:48:54.000000000 -0600 +@@ -1596,11 +1596,17 @@ static int ext3_delete_entry (handle_t * + static inline void ext3_inc_count(handle_t *handle, struct inode *inode) + { + inode->i_nlink++; ++ if (is_dx(inode) && inode->i_nlink > 1) { ++ /* limit is 16-bit i_links_count */ ++ if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2) ++ inode->i_nlink = 1; ++ } + } + + static inline void ext3_dec_count(handle_t *handle, struct inode *inode) + { +- inode->i_nlink--; ++ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) ++ 
inode->i_nlink--; + } + + static int ext3_add_nondir(handle_t *handle, +@@ -1693,7 +1698,7 @@ static int ext3_mkdir(struct inode * dir + struct ext3_dir_entry_2 * de; + int err; + +- if (dir->i_nlink >= EXT3_LINK_MAX) ++ if (EXT3_DIR_LINK_MAXED(dir)) + return -EMLINK; + + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + +@@ -1715,7 +1720,7 @@ static int ext3_mkdir(struct inode * dir + inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { +- inode->i_nlink--; /* is this nlink == 0? */ ++ ext3_dec_count(handle, inode); /* is this nlink == 0? */ + ext3_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; +@@ -1747,7 +1752,7 @@ static int ext3_mkdir(struct inode * dir + iput (inode); + goto out_stop; + } +- dir->i_nlink++; ++ ext3_inc_count(handle, dir); + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + d_instantiate(dentry, inode); +@@ -2010,10 +2015,10 @@ static int ext3_rmdir (struct inode * di + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_rmdir; +- if (inode->i_nlink != 2) +- ext3_warning (inode->i_sb, "ext3_rmdir", +- "empty directory has nlink!=2 (%d)", +- inode->i_nlink); ++ if (!EXT3_DIR_LINK_EMPTY(inode)) ++ ext3_warning(inode->i_sb, "ext3_rmdir", ++ "empty directory has too many links (%d)", ++ inode->i_nlink); + inode->i_version++; + inode->i_nlink = 0; + /* There's no need to set i_disksize: the fact that i_nlink is +@@ -2023,7 +2028,7 @@ static int ext3_rmdir (struct inode * di + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); +- dir->i_nlink--; ++ ext3_dec_count(handle, dir); + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + +@@ -2074,7 +2079,7 @@ static int ext3_unlink(struct inode * di + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, 
dir); +- inode->i_nlink--; ++ ext3_dec_count(handle, inode); + if (!inode->i_nlink) + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime; +@@ -2146,7 +2151,7 @@ static int ext3_link (struct dentry * ol + struct inode *inode = old_dentry->d_inode; + int err; + +- if (inode->i_nlink >= EXT3_LINK_MAX) ++ if (EXT3_DIR_LINK_MAXED(inode)) + return -EMLINK; + + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + +@@ -2230,8 +2235,8 @@ static int ext3_rename (struct inode * o + if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) + goto end_rename; + retval = -EMLINK; +- if (!new_inode && new_dir!=old_dir && +- new_dir->i_nlink >= EXT3_LINK_MAX) ++ if (!new_inode && new_dir != old_dir && ++ EXT3_DIR_LINK_MAXED(new_dir)) + goto end_rename; + } + if (!new_bh) { +@@ -2288,7 +2293,7 @@ static int ext3_rename (struct inode * o + } + + if (new_inode) { +- new_inode->i_nlink--; ++ ext3_dec_count(handle, new_inode); + new_inode->i_ctime = CURRENT_TIME_SEC; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; +@@ -2299,11 +2304,13 @@ static int ext3_rename (struct inode * o + PARENT_INO(dir_bh->b_data) = le32_to_cpu(new_dir->i_ino); + BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, dir_bh); +- old_dir->i_nlink--; ++ ext3_dec_count(handle, old_dir); + if (new_inode) { +- new_inode->i_nlink--; ++ /* checked empty_dir above, can't have another parent, ++ * ext3_dec_count() won't work for many-linked dirs */ ++ new_inode->i_nlink = 0; + } else { +- new_dir->i_nlink++; ++ ext3_inc_count(handle, new_dir); + ext3_update_dx_flag(new_dir); + ext3_mark_inode_dirty(handle, new_dir); + } +--- linux-2.6.7.orig/include/linux/ext3_fs.h 2004-06-15 23:19:36.000000000 -0600 ++++ linux-2.6.7/include/linux/ext3_fs.h 2004-08-20 17:41:27.000000000 -0600 +@@ -79,7 +81,7 @@ + /* + * Maximal count of links to a file + */ +-#define EXT3_LINK_MAX 32000 ++#define EXT3_LINK_MAX 65000 + + /* + * Macro-instructions used to 
manage several block sizes +@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 { + */ + + #ifdef CONFIG_EXT3_INDEX +- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ +- EXT3_FEATURE_COMPAT_DIR_INDEX) && \ ++#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ ++ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ + (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) +-#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) +-#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) ++#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \ ++ (is_dx(dir) && (dir)->i_nlink == 1)) + #else + #define is_dx(dir) 0 +-#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) + #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) + #endif + diff --git a/lustre/kernel_patches/patches/ext3-nlinks-2.6.9.patch b/lustre/kernel_patches/patches/ext3-nlinks-2.6.9.patch index e75373a..62bf156 100644 --- a/lustre/kernel_patches/patches/ext3-nlinks-2.6.9.patch +++ b/lustre/kernel_patches/patches/ext3-nlinks-2.6.9.patch @@ -9,7 +9,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c + /* limit is 16-bit i_links_count */ + if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2) + inode->i_nlink = 1; -+ } ++ } } static inline void ext3_dec_count(handle_t *handle, struct inode *inode) @@ -105,7 +105,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c if (new_inode) { - new_inode->i_nlink--; -+ ext3_dec_count(handle, new_inode); ++ ext3_dec_count(handle, new_inode); new_inode->i_ctime = CURRENT_TIME_SEC; } old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; diff --git a/lustre/kernel_patches/patches/ext3-remove-cond-resched-calls-2.6.12.patch b/lustre/kernel_patches/patches/ext3-remove-cond-resched-calls-2.6.12.patch new file mode 100644 index 0000000..57898d5 --- /dev/null +++ 
b/lustre/kernel_patches/patches/ext3-remove-cond-resched-calls-2.6.12.patch @@ -0,0 +1,29 @@ +Index: linux-stage/fs/ext3/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/ialloc.c 2005-06-26 10:59:43.048185981 +0200 ++++ linux-stage/fs/ext3/ialloc.c 2005-06-26 11:01:21.317716027 +0200 +@@ -775,7 +775,6 @@ + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); +- cond_resched(); + } + return desc_count; + #endif +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2005-06-26 10:59:43.205412542 +0200 ++++ linux-stage/fs/ext3/super.c 2005-06-26 11:02:29.599941754 +0200 +@@ -2236,11 +2232,9 @@ + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ +- for (i = 0; i < ngroups; i++) { ++ for (i = 0; i < ngroups; i++) + overhead += ext3_bg_has_super(sb, i) + + ext3_bg_num_gdb(sb, i); +- cond_resched(); +- } + + /* + * Every block group has an inode bitmap, a block diff --git a/lustre/kernel_patches/patches/ext3-statfs-2.6.12.patch b/lustre/kernel_patches/patches/ext3-statfs-2.6.12.patch new file mode 100644 index 0000000..62755f4 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-statfs-2.6.12.patch @@ -0,0 +1,171 @@ +Index: linux-2.6.12/fs/ext3/super.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/super.c 2005-06-17 13:48:29.000000000 -0600 ++++ linux-2.6.12/fs/ext3/super.c 2005-11-25 05:59:47.000000000 -0700 +@@ -2165,13 +2165,12 @@ + { + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + unsigned long overhead; +- int i; + + if (test_opt (sb, MINIX_DF)) + overhead = 0; + else { +- unsigned long ngroups; +- ngroups = EXT3_SB(sb)->s_groups_count; ++ unsigned long ngroups = EXT3_SB(sb)->s_groups_count, group; ++ unsigned long three = 1, five = 5, seven = 7; + smp_rmb(); + + /* +@@ 
-2189,11 +2188,13 @@ + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ +- for (i = 0; i < ngroups; i++) { +- overhead += ext3_bg_has_super(sb, i) + +- ext3_bg_num_gdb(sb, i); +- cond_resched(); +- } ++ overhead += 1 + EXT3_SB(sb)->s_gdb_count; /* group 0 */ ++ ++ while ((group = ext3_list_backups(sb,&three,&five,&seven)) < ++ ngroups) ++ overhead += 1 + (group > ++ le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg) ? 1 : ++ EXT3_SB(sb)->s_gdb_count); + + /* + * Every block group has an inode bitmap, a block +@@ -2205,12 +2204,16 @@ + buf->f_type = EXT3_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; +- buf->f_bfree = ext3_count_free_blocks (sb); ++ buf->f_bfree = percpu_counter_read(&EXT3_SB(sb)->s_freeblocks_counter); ++ if (buf->f_bfree < 0) ++ buf->f_bfree = 0; + buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); + if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); +- buf->f_ffree = ext3_count_free_inodes (sb); ++ buf->f_ffree = percpu_counter_read(&EXT3_SB(sb)->s_freeinodes_counter); ++ if (buf->f_ffree < 0) ++ buf->f_ffree = 0; + buf->f_namelen = EXT3_NAME_LEN; + return 0; + } +Index: linux-2.6.12/fs/ext3/resize.c +=================================================================== +--- linux-2.6.12.orig/fs/ext3/resize.c 2005-11-24 15:17:06.000000000 -0700 ++++ linux-2.6.12/fs/ext3/resize.c 2005-11-25 06:01:01.000000000 -0700 +@@ -285,17 +285,17 @@ + * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... + * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... 
+ */ +-static unsigned ext3_list_backups(struct super_block *sb, unsigned *three, +- unsigned *five, unsigned *seven) ++unsigned long ext3_list_backups(struct super_block *sb, unsigned long *three, ++ unsigned long *five, unsigned long *seven) + { +- unsigned *min = three; ++ int first_metabg = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg); ++ unsigned long *min = three, ret; + int mult = 3; +- unsigned ret; + + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { +- ret = *min; +- *min += 1; ++ ret = *three; ++ *three += 1; + return ret; + } + +@@ -308,8 +307,22 @@ + mult = 7; + } + +- ret = *min; +- *min *= mult; ++ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) && ++ *min > first_metabg) { ++ if (*min & (EXT3_DESC_PER_BLOCK(sb) - 1)) { ++ ret = first_metabg; ++ *three = (first_metabg | ++ (EXT3_DESC_PER_BLOCK(sb) - 1)) + 1; ++ *five = -1UL; ++ *seven = -1UL; ++ } else { ++ ret = *three; ++ *three += EXT3_DESC_PER_BLOCK(sb); ++ } ++ } else { ++ ret = *min; ++ *min *= mult; ++ } + + return ret; + } +@@ -324,17 +337,17 @@ + { + const unsigned long blk = primary->b_blocknr; + const unsigned long end = EXT3_SB(sb)->s_groups_count; +- unsigned three = 1; +- unsigned five = 5; +- unsigned seven = 7; +- unsigned grp; ++ unsigned long three = 1; ++ unsigned long five = 5; ++ unsigned long seven = 7; ++ unsigned long grp; + __u32 *p = (__u32 *)primary->b_data; + int gdbackups = 0; + + while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { + if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){ + ext3_warning(sb, __FUNCTION__, +- "reserved GDT %ld missing grp %d (%ld)\n", ++ "reserved GDT %ld missing grp %ld (%ld)\n", + blk, grp, + grp * EXT3_BLOCKS_PER_GROUP(sb) + blk); + return -EINVAL; +@@ -618,10 +631,8 @@ + struct ext3_sb_info *sbi = EXT3_SB(sb); + const unsigned long last = sbi->s_groups_count; + const int bpg = EXT3_BLOCKS_PER_GROUP(sb); +- unsigned three = 1; +- unsigned five = 5; +- 
unsigned seven = 7; +- unsigned group; ++ unsigned long three = 1, five = 5, seven = 7; ++ unsigned long group; + int rest = sb->s_blocksize - size; + handle_t *handle; + int err = 0, err2; +@@ -672,7 +683,7 @@ + exit_err: + if (err) { + ext3_warning(sb, __FUNCTION__, +- "can't update backup for group %d (err %d), " ++ "can't update backup for group %ld (err %d), " + "forcing fsck on next reboot\n", group, err); + sbi->s_mount_state &= ~EXT3_VALID_FS; + sbi->s_es->s_state &= ~cpu_to_le16(EXT3_VALID_FS); +Index: linux-2.6.12/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.12.orig/include/linux/ext3_fs.h 2005-06-17 13:48:29.000000000 -0600 ++++ linux-2.6.12/include/linux/ext3_fs.h 2005-11-25 05:59:47.000000000 -0700 +@@ -788,6 +788,10 @@ + extern int ext3_group_extend(struct super_block *sb, + struct ext3_super_block *es, + unsigned long n_blocks_count); ++extern unsigned long ext3_list_backups(struct super_block *sb, ++ unsigned long *three, ++ unsigned long *five, ++ unsigned long *seven); + + /* super.c */ + extern void ext3_error (struct super_block *, const char *, const char *, ...) 
diff --git a/lustre/kernel_patches/patches/iopen-2.6.12.patch b/lustre/kernel_patches/patches/iopen-2.6.12.patch new file mode 100644 index 0000000..41e4c05 --- /dev/null +++ b/lustre/kernel_patches/patches/iopen-2.6.12.patch @@ -0,0 +1,470 @@ +Index: linux-2.6.12-rc6/fs/ext3/Makefile +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/Makefile 2005-06-14 16:00:45.206720992 +0200 ++++ linux-2.6.12-rc6/fs/ext3/Makefile 2005-06-14 16:14:33.595382720 +0200 +@@ -4,7 +4,7 @@ + + obj-$(CONFIG_EXT3_FS) += ext3.o + +-ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\ + ioctl.o namei.o super.o symlink.o hash.o resize.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +Index: linux-2.6.12-rc6/fs/ext3/inode.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/inode.c 2005-06-14 16:01:16.272150299 +0200 ++++ linux-2.6.12-rc6/fs/ext3/inode.c 2005-06-14 16:24:55.686195412 +0200 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + static int ext3_writepage_trans_blocks(struct inode *inode); +@@ -2437,6 +2438,8 @@ + ei->i_default_acl = EXT3_ACL_NOT_CACHED; + #endif + ei->i_block_alloc_info = NULL; ++ if (ext3_iopen_get_inode(inode)) ++ return; + + if (__ext3_get_inode_loc(inode, &iloc, 0)) + goto bad_inode; +Index: linux-2.6.12-rc6/fs/ext3/iopen.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/iopen.c 2005-06-14 16:14:33.530929595 +0200 ++++ linux-2.6.12-rc6/fs/ext3/iopen.c 2005-06-14 16:14:33.626632719 +0200 +@@ -0,0 +1,277 @@ ++/* ++ * linux/fs/ext3/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). 
++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. ++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. ++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. ++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "iopen.h" ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#define IOPEN_NAME_LEN 32 ++ ++/* ++ * This implements looking up an inode by number. 
++ */ ++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct inode *inode; ++ unsigned long ino; ++ struct list_head *lp; ++ struct dentry *alternate; ++ char buf[IOPEN_NAME_LEN]; ++ ++ if (dentry->d_name.len >= IOPEN_NAME_LEN) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ memcpy(buf, dentry->d_name.name, dentry->d_name.len); ++ buf[dentry->d_name.len] = 0; ++ ++ if (strcmp(buf, ".") == 0) ++ ino = dir->i_ino; ++ else if (strcmp(buf, "..") == 0) ++ ino = EXT3_ROOT_INO; ++ else ++ ino = simple_strtoul(buf, 0, 0); ++ ++ if ((ino != EXT3_ROOT_INO && ++ //ino != EXT3_ACL_IDX_INO && ++ //ino != EXT3_ACL_DATA_INO && ++ ino < EXT3_FIRST_INO(dir->i_sb)) || ++ ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) ++ return ERR_PTR(-ENOENT); ++ ++ inode = iget(dir->i_sb, ino); ++ if (!inode) ++ return ERR_PTR(-EACCES); ++ if (is_bad_inode(inode)) { ++ iput(inode); ++ return ERR_PTR(-ENOENT); ++ } ++ ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ ++ /* preferrably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, struct dentry, d_alias); ++ assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); ++ } ++ ++ if (!list_empty(&inode->i_dentry)) { ++ alternate = list_entry(inode->i_dentry.next, ++ struct dentry, d_alias); ++ dget_locked(alternate); ++ spin_lock(&alternate->d_lock); ++ alternate->d_flags |= DCACHE_REFERENCED; ++ spin_unlock(&alternate->d_lock); ++ iput(inode); ++ spin_unlock(&dcache_lock); ++ return alternate; ++ } ++ dentry->d_flags |= DCACHE_DISCONNECTED; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++ ++ d_rehash_cond(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; ++} ++ ++#define do_switch(x,y) do { \ ++ 
__typeof__ (x) __tmp = x; \ ++ x = y; y = __tmp; } while (0) ++ ++static inline void switch_names(struct dentry *dentry, struct dentry *target) ++{ ++ const unsigned char *old_name, *new_name; ++ ++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN_MIN); ++ old_name = target->d_name.name; ++ new_name = dentry->d_name.name; ++ if (old_name == target->d_iname) ++ old_name = dentry->d_iname; ++ if (new_name == dentry->d_iname) ++ new_name = target->d_iname; ++ target->d_name.name = new_name; ++ dentry->d_name.name = old_name; ++} ++ ++/* This function is spliced into ext3_lookup and does the move of a ++ * disconnected dentry (if it exists) to a connected dentry. ++ */ ++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, ++ int rehash) ++{ ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* verify this dentry is really new */ ++ assert(dentry->d_inode == NULL); ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ if (rehash) ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ assert(list_empty(&dentry->d_subdirs)); ++ ++ spin_lock(&dcache_lock); ++ if (!inode) ++ goto do_rehash; ++ ++ if (!test_opt(inode->i_sb, IOPEN)) ++ goto do_instantiate; ++ ++ /* preferrably return a connected dentry */ ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ if (tmp->d_flags & DCACHE_DISCONNECTED) { ++ assert(tmp->d_alias.next == &inode->i_dentry); ++ assert(tmp->d_alias.prev == &inode->i_dentry); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ } ++ ++ if (!goal) ++ goto do_instantiate; ++ ++ /* Move the goal to the de hash queue */ ++ goal->d_flags &= ~ DCACHE_DISCONNECTED; ++ security_d_instantiate(goal, inode); ++ d_rehash_cond(dentry, 0); ++ __d_move(goal, dentry); ++ spin_unlock(&dcache_lock); ++ iput(inode); ++ ++ return goal; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++do_instantiate: ++ list_add(&dentry->d_alias, 
&inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++do_rehash: ++ if (rehash) ++ d_rehash_cond(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; ++} ++ ++/* ++ * These are the special structures for the iopen pseudo directory. ++ */ ++ ++static struct inode_operations iopen_inode_operations = { ++ lookup: iopen_lookup, /* BKL held */ ++}; ++ ++static struct file_operations iopen_file_operations = { ++ read: generic_read_dir, ++}; ++ ++static int match_dentry(struct dentry *dentry, const char *name) ++{ ++ int len; ++ ++ len = strlen(name); ++ if (dentry->d_name.len != len) ++ return 0; ++ if (strncmp(dentry->d_name.name, name, len)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * This function is spliced into ext3_lookup and returns 1 if the file ++ * name is __iopen__ and dentry has been filled in appropriately. ++ */ ++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ ++ if (dir->i_ino != EXT3_ROOT_INO || ++ !test_opt(dir->i_sb, IOPEN) || ++ !match_dentry(dentry, "__iopen__")) ++ return 0; ++ ++ inode = iget(dir->i_sb, EXT3_BAD_INO); ++ ++ if (!inode) ++ return 0; ++ d_add(dentry, inode); ++ return 1; ++} ++ ++/* ++ * This function is spliced into read_inode; it returns 1 if inode ++ * number is the one for /__iopen__, in which case the inode is filled ++ * in appropriately. Otherwise, this function returns 0. 
++ */ ++int ext3_iopen_get_inode(struct inode *inode) ++{ ++ if (inode->i_ino != EXT3_BAD_INO) ++ return 0; ++ ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) ++ inode->i_mode |= 0777; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_size = 4096; ++ inode->i_atime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = CURRENT_TIME; ++ EXT3_I(inode)->i_dtime = 0; ++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size ++ * (for stat), not the fs block ++ * size */ ++ inode->i_blocks = 0; ++ inode->i_version = 1; ++ inode->i_generation = 0; ++ ++ inode->i_op = &iopen_inode_operations; ++ inode->i_fop = &iopen_file_operations; ++ inode->i_mapping->a_ops = 0; ++ ++ return 1; ++} +Index: linux-2.6.12-rc6/fs/ext3/iopen.h +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/iopen.h 2005-06-14 16:14:33.534835845 +0200 ++++ linux-2.6.12-rc6/fs/ext3/iopen.h 2005-06-14 16:14:33.633468657 +0200 +@@ -0,0 +1,15 @@ ++/* ++ * iopen.h ++ * ++ * Special support for opening files by inode number. ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. 
++ */ ++ ++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext3_iopen_get_inode(struct inode *inode); ++extern struct dentry *iopen_connect_dentry(struct dentry *dentry, ++ struct inode *inode, int rehash); +Index: linux-2.6.12-rc6/fs/ext3/namei.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/namei.c 2005-06-14 16:01:14.701837819 +0200 ++++ linux-2.6.12-rc6/fs/ext3/namei.c 2005-06-14 16:14:33.644210844 +0200 +@@ -37,6 +37,7 @@ + #include + #include + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + /* +@@ -985,6 +986,9 @@ + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + ++ if (ext3_check_for_iopen(dir, dentry)) ++ return NULL; ++ + bh = ext3_find_entry(dentry, &de); + inode = NULL; + if (bh) { +@@ -995,10 +999,8 @@ + if (!inode) + return ERR_PTR(-EACCES); + } +- if (inode) +- return d_splice_alias(inode, dentry); +- d_add(dentry, inode); +- return NULL; ++ ++ return iopen_connect_dentry(dentry, inode, 1); + } + + +@@ -2042,10 +2044,6 @@ + inode->i_nlink); + inode->i_version++; + inode->i_nlink = 0; +- /* There's no need to set i_disksize: the fact that i_nlink is +- * zero will ensure that the right thing happens during any +- * recovery. 
*/ +- inode->i_size = 0; + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); +@@ -2168,6 +2166,23 @@ + return err; + } + ++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ ++static int ext3_add_link(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ int err = ext3_add_entry(handle, dentry, inode); ++ if (!err) { ++ err = ext3_mark_inode_dirty(handle, inode); ++ if (err == 0) { ++ dput(iopen_connect_dentry(dentry, inode, 0)); ++ return 0; ++ } ++ } ++ ext3_dec_count(handle, inode); ++ iput(inode); ++ return err; ++} ++ + static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) + { +@@ -2191,7 +2206,8 @@ + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); + +- err = ext3_add_nondir(handle, dentry, inode); ++ err = ext3_add_link(handle, dentry, inode); ++ ext3_orphan_del(handle,inode); + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; +Index: linux-2.6.12-rc6/fs/ext3/super.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/ext3/super.c 2005-06-14 16:01:16.287775299 +0200 ++++ linux-2.6.12-rc6/fs/ext3/super.c 2005-06-14 16:14:33.656906156 +0200 +@@ -590,6 +590,7 @@ + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + }; + +@@ -638,6 +639,9 @@ + {Opt_ignore, "noquota"}, + {Opt_ignore, "quota"}, + {Opt_ignore, "usrquota"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -921,6 +925,18 @@ + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; ++ 
case Opt_iopen: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_noiopen: ++ clear_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_iopen_nopriv: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; + case Opt_ignore: + break; + case Opt_resize: +Index: linux-2.6.12-rc6/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.12-rc6.orig/include/linux/ext3_fs.h 2005-06-14 16:01:14.709650318 +0200 ++++ linux-2.6.12-rc6/include/linux/ext3_fs.h 2005-06-14 16:28:38.452794245 +0200 +@@ -358,6 +358,8 @@ + #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ + #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ + #define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */ ++#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000 /* Make iopen world-readable */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H diff --git a/lustre/kernel_patches/patches/iopen-misc-2.6.12.patch b/lustre/kernel_patches/patches/iopen-misc-2.6.12.patch new file mode 100644 index 0000000..48d8ab9 --- /dev/null +++ b/lustre/kernel_patches/patches/iopen-misc-2.6.12.patch @@ -0,0 +1,82 @@ +Index: linux-2.6.4-51.0/Documentation/filesystems/ext2.txt +=================================================================== +--- linux-2.6.4-51.0.orig/Documentation/filesystems/ext2.txt 2004-05-06 22:21:26.000000000 -0400 ++++ linux-2.6.4-51.0/Documentation/filesystems/ext2.txt 2004-05-06 22:24:42.000000000 -0400 +@@ -35,6 +35,22 @@ + + sb=n Use alternate superblock at this location. + ++iopen Makes an invisible pseudo-directory called ++ __iopen__ available in the root directory ++ of the filesystem. Allows open-by-inode- ++ number. 
i.e., inode 3145 can be accessed ++ via /mntpt/__iopen__/3145 ++ ++iopen_nopriv This option makes the iopen directory be ++ world-readable. This may be safer since it ++ allows daemons to run as an unprivileged user, ++ however it significantly changes the security ++ model of a Unix filesystem, since previously ++ all files under a mode 700 directory were not ++ generally available even if the ++ permissions on the file itself are ++ world-readable. ++ + grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. + + +Index: linux-2.6.4-51.0/fs/dcache.c +=================================================================== +--- linux-2.6.4-51.0.orig/fs/dcache.c 2004-05-06 22:24:42.000000000 -0400 ++++ linux-2.6.4-51.0/fs/dcache.c 2004-05-06 22:58:37.000000000 -0400 +@@ -1195,14 +1195,13 @@ + * dcache entries should not be moved in this way. + */ + +-void d_move(struct dentry * dentry, struct dentry * target) ++void __d_move(struct dentry * dentry, struct dentry * target) + { + struct hlist_head *list; + + if (!dentry->d_inode) + printk(KERN_WARNING "VFS: moving negative dcache entry\n"); + +- spin_lock(&dcache_lock); + write_seqlock(&rename_lock); + /* + * XXXX: do we really need to take target->d_lock? +@@ -1253,6 +1252,14 @@ + spin_unlock(&target->d_lock); + spin_unlock(&dentry->d_lock); + write_sequnlock(&rename_lock); ++} ++ ++EXPORT_SYMBOL(__d_move); ++ ++void d_move(struct dentry *dentry, struct dentry *target) ++{ ++ spin_lock(&dcache_lock); ++ __d_move(dentry, target); + spin_unlock(&dcache_lock); + } + +Index: linux-2.6.4-51.0/include/linux/dcache.h +=================================================================== +--- linux-2.6.4-51.0.orig/include/linux/dcache.h 2004-05-06 22:24:42.000000000 -0400 ++++ linux-2.6.4-51.0/include/linux/dcache.h 2004-05-06 23:03:43.000000000 -0400 +@@ -234,6 +234,7 @@ + * This adds the entry to the hash queues. 
+ */ + extern void d_rehash(struct dentry *); ++extern void d_rehash_cond(struct dentry *, int lock); + + /** + * d_add - add dentry to hash queues +@@ -252,6 +253,7 @@ + + /* used for rename() and baskets */ + extern void d_move(struct dentry *, struct dentry *); ++extern void __d_move(struct dentry *, struct dentry *); + + /* appendix may either be NULL or be used for transname suffixes */ + extern struct dentry * d_lookup(struct dentry *, struct qstr *); diff --git a/lustre/kernel_patches/patches/nfs-cifs-intent-2.6.12.patch b/lustre/kernel_patches/patches/nfs-cifs-intent-2.6.12.patch new file mode 100644 index 0000000..41e5ecb --- /dev/null +++ b/lustre/kernel_patches/patches/nfs-cifs-intent-2.6.12.patch @@ -0,0 +1,110 @@ +Index: linux-2.6.12-rc6/fs/nfs/dir.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/nfs/dir.c 2005-06-14 14:22:14.585699648 +0200 ++++ linux-2.6.12-rc6/fs/nfs/dir.c 2005-06-14 14:26:39.884524523 +0200 +@@ -783,7 +783,7 @@ + if (nd->flags & LOOKUP_DIRECTORY) + return 0; + /* Are we trying to write to a read only partition? 
*/ +- if (IS_RDONLY(dir) && (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) ++ if (IS_RDONLY(dir) && (nd->intent.it_flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) + return 0; + return 1; + } +@@ -805,7 +805,7 @@ + dentry->d_op = NFS_PROTO(dir)->dentry_ops; + + /* Let vfs_create() deal with O_EXCL */ +- if (nd->intent.open.flags & O_EXCL) ++ if (nd->intent.it_flags & O_EXCL) + goto no_entry; + + /* Open the file on the server */ +@@ -817,7 +817,7 @@ + goto out; + } + +- if (nd->intent.open.flags & O_CREAT) { ++ if (nd->intent.it_flags & O_CREAT) { + nfs_begin_data_update(dir); + inode = nfs4_atomic_open(dir, dentry, nd); + nfs_end_data_update(dir); +@@ -833,7 +833,7 @@ + break; + /* This turned out not to be a regular file */ + case -ELOOP: +- if (!(nd->intent.open.flags & O_NOFOLLOW)) ++ if (!(nd->intent.it_flags & O_NOFOLLOW)) + goto no_open; + /* case -EISDIR: */ + /* case -EINVAL: */ +@@ -874,7 +874,7 @@ + /* NFS only supports OPEN on regular files */ + if (!S_ISREG(inode->i_mode)) + goto no_open; +- openflags = nd->intent.open.flags; ++ openflags = nd->intent.it_flags; + /* We cannot do exclusive creation on a positive dentry */ + if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) + goto no_open; +Index: linux-2.6.12-rc6/fs/nfs/nfs4proc.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/nfs/nfs4proc.c 2005-06-06 17:22:29.000000000 +0200 ++++ linux-2.6.12-rc6/fs/nfs/nfs4proc.c 2005-06-14 14:30:18.499756220 +0200 +@@ -877,19 +877,19 @@ + struct nfs4_state *state; + + if (nd->flags & LOOKUP_CREATE) { +- attr.ia_mode = nd->intent.open.create_mode; ++ attr.ia_mode = nd->intent.it_create_mode; + attr.ia_valid = ATTR_MODE; + if (!IS_POSIXACL(dir)) + attr.ia_mode &= ~current->fs->umask; + } else { + attr.ia_valid = 0; +- BUG_ON(nd->intent.open.flags & O_CREAT); ++ BUG_ON(nd->intent.it_flags & O_CREAT); + } + + cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); + if (IS_ERR(cred)) + return (struct 
inode *)cred; +- state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred); ++ state = nfs4_do_open(dir, dentry, nd->intent.it_flags, &attr, cred); + put_rpccred(cred); + if (IS_ERR(state)) + return (struct inode *)state; +Index: linux-2.6.12-rc6/fs/cifs/dir.c +=================================================================== +--- linux-2.6.12-rc6.orig/fs/cifs/dir.c 2005-06-06 17:22:29.000000000 +0200 ++++ linux-2.6.12-rc6/fs/cifs/dir.c 2005-06-14 14:26:39.915774522 +0200 +@@ -146,23 +146,23 @@ + } + + if(nd) { +- if ((nd->intent.open.flags & O_ACCMODE) == O_RDONLY) ++ if ((nd->intent.it_flags & O_ACCMODE) == O_RDONLY) + desiredAccess = GENERIC_READ; +- else if ((nd->intent.open.flags & O_ACCMODE) == O_WRONLY) { ++ else if ((nd->intent.it_flags & O_ACCMODE) == O_WRONLY) { + desiredAccess = GENERIC_WRITE; + write_only = TRUE; +- } else if ((nd->intent.open.flags & O_ACCMODE) == O_RDWR) { ++ } else if ((nd->intent.it_flags & O_ACCMODE) == O_RDWR) { + /* GENERIC_ALL is too much permission to request */ + /* can cause unnecessary access denied on create */ + /* desiredAccess = GENERIC_ALL; */ + desiredAccess = GENERIC_READ | GENERIC_WRITE; + } + +- if((nd->intent.open.flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) ++ if((nd->intent.it_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + disposition = FILE_CREATE; +- else if((nd->intent.open.flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC)) ++ else if((nd->intent.it_flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC)) + disposition = FILE_OVERWRITE_IF; +- else if((nd->intent.open.flags & O_CREAT) == O_CREAT) ++ else if((nd->intent.it_flags & O_CREAT) == O_CREAT) + disposition = FILE_OPEN_IF; + else { + cFYI(1,("Create flag not set in create function")); diff --git a/lustre/kernel_patches/patches/uml-exprt-clearuser-2.6.12.patch b/lustre/kernel_patches/patches/uml-exprt-clearuser-2.6.12.patch new file mode 100644 index 0000000..381b03f --- /dev/null +++ 
b/lustre/kernel_patches/patches/uml-exprt-clearuser-2.6.12.patch @@ -0,0 +1,11 @@ +--- uml-2.4.24/arch/um/kernel/tt/ksyms.c.orig 2005-05-04 13:59:58.806659456 +0300 ++++ uml-2.4.24/arch/um/kernel/tt/ksyms.c 2005-05-04 14:00:18.358687096 +0300 +@@ -12,6 +12,8 @@ + EXPORT_SYMBOL(__do_strncpy_from_user); + EXPORT_SYMBOL(__do_strnlen_user); + EXPORT_SYMBOL(__do_clear_user); ++EXPORT_SYMBOL(clear_user_tt); ++EXPORT_SYMBOL(clear_user_skas); + + EXPORT_SYMBOL(tracing_pid); + EXPORT_SYMBOL(honeypot); diff --git a/lustre/kernel_patches/patches/vfs_intent-2.6-rhel4.patch b/lustre/kernel_patches/patches/vfs_intent-2.6-rhel4.patch index f8ba127..f52f0c0 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.6-rhel4.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.6-rhel4.patch @@ -306,9 +306,9 @@ Index: linux-2.6.9-5.0.3.EL/fs/namespace.c { struct super_block *sb = mnt->mnt_sb; dput(mnt->mnt_root); -+ spin_lock(&dcache_lock); -+ list_del(&mnt->mnt_lustre_list); -+ spin_unlock(&dcache_lock); ++ spin_lock(&dcache_lock); ++ list_del(&mnt->mnt_lustre_list); ++ spin_unlock(&dcache_lock); free_vfsmnt(mnt); deactivate_super(sb); } @@ -317,7 +317,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namespace.c lock_kernel(); + if (sb->s_op->umount_lustre) -+ sb->s_op->umount_lustre(sb); ++ sb->s_op->umount_lustre(sb); if( (flags&MNT_FORCE) && sb->s_op->umount_begin) sb->s_op->umount_begin(sb); unlock_kernel(); @@ -772,8 +772,8 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/namei.h #define LOOKUP_NOALT 32 #define LOOKUP_ATOMIC 64 #define LOOKUP_REVAL 128 -+#define LOOKUP_LAST (0x1000) -+#define LOOKUP_LINK_NOTLAST (0x2000) ++#define LOOKUP_LAST (0x1000) ++#define LOOKUP_LINK_NOTLAST (0x2000) /* * Intent data @@ -811,25 +811,3 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/mount.h }; static inline struct vfsmount *mntget(struct vfsmount *mnt) -Index: linux-2.6.9-5.0.3.EL/kernel/exit.c -=================================================================== ---- linux-2.6.9-5.0.3.EL.orig/kernel/exit.c 
2005-02-26 14:28:01.000000000 +0200 -+++ linux-2.6.9-5.0.3.EL/kernel/exit.c 2005-02-26 23:29:02.000000000 +0200 -@@ -244,6 +244,8 @@ - write_unlock_irq(&tasklist_lock); - } - -+EXPORT_SYMBOL(reparent_to_init); -+ - void __set_special_pids(pid_t session, pid_t pgrp) - { - struct task_struct *curr = current; -@@ -428,6 +430,8 @@ - __exit_files(tsk); - } - -+EXPORT_SYMBOL(exit_files); -+ - static inline void __put_fs_struct(struct fs_struct *fs) - { - /* No need to hold fs->lock if we are killing it */ diff --git a/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch b/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch index e6e5392..0116393 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch @@ -853,25 +853,3 @@ Index: linux-2.6.5-12.1/include/linux/fshooks.h #define FSHOOK_END_USER_WALK(type, err, field) ((void)0);} -Index: linux-2.6.5-12.1/kernel/exit.c -=================================================================== ---- linux-2.6.5-12.1.orig/kernel/exit.c 2004-05-10 12:21:56.000000000 -0400 -+++ linux-2.6.5-12.1/kernel/exit.c 2004-06-03 18:31:28.000000000 -0400 -@@ -260,6 +260,8 @@ - write_unlock_irq(&tasklist_lock); - } - -+EXPORT_SYMBOL(reparent_to_init); -+ - void __set_special_pids(pid_t session, pid_t pgrp) - { - struct task_struct *curr = current; -@@ -429,6 +431,8 @@ - __exit_files(tsk); - } - -+EXPORT_SYMBOL(exit_files); -+ - static inline void __put_fs_struct(struct fs_struct *fs) - { - /* No need to hold fs->lock if we are killing it */ diff --git a/lustre/kernel_patches/patches/vfs_intent-2.6.12.patch b/lustre/kernel_patches/patches/vfs_intent-2.6.12.patch new file mode 100644 index 0000000..f46227f --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_intent-2.6.12.patch @@ -0,0 +1,819 @@ +Index: linux-2.6.12.5/fs/exec.c +=================================================================== +--- linux-2.6.12.5.orig/fs/exec.c 2005-08-17 17:51:28.000000000 +0200 
++++ linux-2.6.12.5/fs/exec.c 2005-08-17 17:51:44.000000000 +0200 +@@ -122,9 +122,10 @@ + struct file * file; + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_OPEN); + +- nd.intent.open.flags = FMODE_READ; +- error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); ++ nd.intent.it_flags = FMODE_READ|FMODE_EXEC; ++ error = __user_walk_it(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); + if (error) + goto out; + +@@ -136,7 +137,7 @@ + if (error) + goto exit; + +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; +@@ -492,8 +493,9 @@ + int err; + struct file *file; + +- nd.intent.open.flags = FMODE_READ; +- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); ++ intent_init(&nd.intent, IT_OPEN); ++ nd.intent.it_flags = FMODE_READ|FMODE_EXEC; ++ err = path_lookup(name, LOOKUP_FOLLOW, &nd); + file = ERR_PTR(err); + + if (!err) { +@@ -506,7 +508,7 @@ + err = -EACCES; + file = ERR_PTR(err); + if (!err) { +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent); + if (!IS_ERR(file)) { + err = deny_write_access(file); + if (err) { +Index: linux-2.6.12.5/fs/namei.c +=================================================================== +--- linux-2.6.12.5.orig/fs/namei.c 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/fs/namei.c 2005-08-17 17:52:57.000000000 +0200 +@@ -301,8 +301,19 @@ + return 0; + } + ++void intent_release(struct lookup_intent *it) ++{ ++ if (!it) ++ return; ++ if (it->it_magic != INTENT_MAGIC) ++ return; ++ if (it->it_op_release) ++ it->it_op_release(it); ++} ++ + void path_release(struct nameidata *nd) + { ++ intent_release(&nd->intent); + dput(nd->dentry); + mntput(nd->mnt); + } +@@ -392,8 +403,11 @@ + { + struct dentry * result; + struct inode *dir = parent->d_inode; ++ int counter = 0; + + down(&dir->i_sem); ++again: ++ counter++; + /* + * First 
re-do the cached lookup just in case it was created + * while we waited for the directory semaphore.. +@@ -427,13 +441,16 @@ + * Uhhuh! Nasty case: the cache was re-populated while + * we waited on the semaphore. Need to revalidate. + */ +- up(&dir->i_sem); + if (result->d_op && result->d_op->d_revalidate) { + if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) { + dput(result); +- result = ERR_PTR(-ENOENT); ++ if (counter > 10) ++ result = ERR_PTR(-ESTALE); ++ if (!IS_ERR(result)) ++ goto again; + } + } ++ up(&dir->i_sem); + return result; + } + +@@ -461,7 +478,9 @@ + static inline int __vfs_follow_link(struct nameidata *nd, const char *link) + { + int res = 0; ++ struct lookup_intent it = nd->intent; + char *name; ++ + if (IS_ERR(link)) + goto fail; + +@@ -471,6 +490,9 @@ + /* weird __emul_prefix() stuff did it */ + goto out; + } ++ intent_init(&nd->intent, it.it_op); ++ nd->intent.it_flags = it.it_flags; ++ nd->intent.it_create_mode = it.it_create_mode; + res = link_path_walk(link, nd); + out: + if (nd->depth || res || nd->last_type!=LAST_NORM) +@@ -703,6 +725,33 @@ + return PTR_ERR(dentry); + } + ++static int revalidate_special(struct nameidata *nd) ++{ ++ struct dentry *dentry = nd->dentry; ++ int err, counter = 0; ++ ++ revalidate_again: ++ if (!dentry->d_op || !dentry->d_op->d_revalidate) ++ return 0; ++ if (!dentry->d_op->d_revalidate(dentry, nd)) { ++ struct dentry *new; ++ if ((err = permission(dentry->d_parent->d_inode, MAY_EXEC, nd))) ++ return err; ++ new = real_lookup(dentry->d_parent, &dentry->d_name, nd); ++ if (IS_ERR(new)) ++ return PTR_ERR(new); ++ d_invalidate(dentry); ++ dput(dentry); ++ nd->dentry = dentry = new; ++ counter++; ++ if (counter < 10) ++ goto revalidate_again; ++ printk("excessive revalidate_it loops\n"); ++ return -ESTALE; ++ } ++ return 0; ++} ++ + /* + * Name resolution. 
+ * This is the basic name resolution function, turning a pathname into +@@ -800,7 +849,11 @@ + goto out_dput; + + if (inode->i_op->follow_link) { ++ int save_flags = nd->flags; ++ nd->flags |= LOOKUP_LINK_NOTLAST; + err = do_follow_link(&next, nd); ++ if (!(save_flags & LOOKUP_LINK_NOTLAST)) ++ nd->flags &= ~LOOKUP_LINK_NOTLAST; + if (err) + goto return_err; + err = -ENOENT; +@@ -839,6 +892,23 @@ + inode = nd->dentry->d_inode; + /* fallthrough */ + case 1: ++ nd->flags |= LOOKUP_LAST; ++ err = revalidate_special(nd); ++ nd->flags &= ~LOOKUP_LAST; ++ if (!nd->dentry->d_inode) ++ err = -ENOENT; ++ if (err) { ++ path_release(nd); ++ goto return_err; ++ } ++ if (lookup_flags & LOOKUP_DIRECTORY) { ++ err = -ENOTDIR; ++ if(!nd->dentry->d_inode->i_op || ++ !nd->dentry->d_inode->i_op->lookup) { ++ path_release(nd); ++ goto return_err; ++ } ++ } + goto return_reval; + } + if (nd->dentry->d_op && nd->dentry->d_op->d_hash) { +@@ -846,7 +916,9 @@ + if (err < 0) + break; + } ++ nd->flags |= LOOKUP_LAST; + err = do_lookup(nd, &this, &next); ++ nd->flags &= ~LOOKUP_LAST; + if (err) + break; + inode = next.dentry->d_inode; +@@ -1097,7 +1169,7 @@ + } + + /* SMP-safe */ +-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct nameidata *nd) + { + unsigned long hash; + struct qstr this; +@@ -1117,11 +1189,16 @@ + } + this.hash = end_name_hash(hash); + +- return lookup_hash(&this, base); ++ return __lookup_hash(&this, base, nd); + access: + return ERR_PTR(-EACCES); + } + ++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++{ ++ return lookup_one_len_it(name, base, len, NULL); ++} ++ + /* + * namei() + * +@@ -1133,7 +1210,7 @@ + * that namei follows links, while lnamei does not. 
+ * SMP-safe + */ +-int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) ++int fastcall __user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd) + { + char *tmp = getname(name); + int err = PTR_ERR(tmp); +@@ -1145,6 +1222,12 @@ + return err; + } + ++int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) ++{ ++ intent_init(&nd->intent, IT_LOOKUP); ++ return __user_walk_it(name, flags, nd); ++} ++ + /* + * It's inline, so penalty for filesystems that don't use sticky bit is + * minimal. +@@ -1426,8 +1509,8 @@ + acc_mode |= MAY_APPEND; + + /* Fill in the open() intent data */ +- nd->intent.open.flags = flag; +- nd->intent.open.create_mode = mode; ++ nd->intent.it_flags = flag; ++ nd->intent.it_create_mode = mode; + + /* + * The simplest case - just a plain lookup. +@@ -1442,6 +1525,7 @@ + /* + * Create - we need to know the parent. + */ ++ nd->intent.it_op |= IT_CREAT; + error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd); + if (error) + return error; +@@ -1458,7 +1542,9 @@ + dir = nd->dentry; + nd->flags &= ~LOOKUP_PARENT; + down(&dir->d_inode->i_sem); ++ nd->flags |= LOOKUP_LAST; + path.dentry = __lookup_hash(&nd->last, nd->dentry, nd); ++ nd->flags &= ~LOOKUP_LAST; + path.mnt = nd->mnt; + + do_last: +@@ -1564,7 +1650,9 @@ + } + dir = nd->dentry; + down(&dir->d_inode->i_sem); ++ nd->flags |= LOOKUP_LAST; + path.dentry = __lookup_hash(&nd->last, nd->dentry, nd); ++ nd->flags &= ~LOOKUP_LAST; + path.mnt = nd->mnt; + putname(nd->last.name); + goto do_last; +Index: linux-2.6.12.5/fs/namespace.c +=================================================================== +--- linux-2.6.12.5.orig/fs/namespace.c 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/fs/namespace.c 2005-08-17 17:51:44.000000000 +0200 +@@ -62,6 +62,7 @@ + INIT_LIST_HEAD(&mnt->mnt_mounts); + INIT_LIST_HEAD(&mnt->mnt_list); + INIT_LIST_HEAD(&mnt->mnt_fslink); ++ 
INIT_LIST_HEAD(&mnt->mnt_lustre_list); + if (name) { + int size = strlen(name)+1; + char *newname = kmalloc(size, GFP_KERNEL); +@@ -113,6 +114,7 @@ + + static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd) + { ++ memset(old_nd, 0, sizeof(*old_nd)); + old_nd->dentry = mnt->mnt_mountpoint; + old_nd->mnt = mnt->mnt_parent; + mnt->mnt_parent = mnt; +@@ -176,6 +178,9 @@ + { + struct super_block *sb = mnt->mnt_sb; + dput(mnt->mnt_root); ++ spin_lock(&dcache_lock); ++ list_del(&mnt->mnt_lustre_list); ++ spin_unlock(&dcache_lock); + free_vfsmnt(mnt); + deactivate_super(sb); + } +@@ -402,6 +407,8 @@ + */ + + lock_kernel(); ++ if (sb->s_op->umount_lustre) ++ sb->s_op->umount_lustre(sb); + if( (flags&MNT_FORCE) && sb->s_op->umount_begin) + sb->s_op->umount_begin(sb); + unlock_kernel(); +@@ -627,6 +634,7 @@ + return err; + if (!old_name || !*old_name) + return -EINVAL; ++ intent_init(&old_nd.intent, IT_LOOKUP); + err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); + if (err) + return err; +@@ -701,6 +709,7 @@ + return -EPERM; + if (!old_name || !*old_name) + return -EINVAL; ++ intent_init(&old_nd.intent, IT_LOOKUP); + err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); + if (err) + return err; +@@ -1012,6 +1021,7 @@ + int retval = 0; + int mnt_flags = 0; + ++ intent_init(&nd.intent, IT_LOOKUP); + /* Discard magic */ + if ((flags & MS_MGC_MSK) == MS_MGC_VAL) + flags &= ~MS_MGC_MSK; +Index: linux-2.6.12.5/fs/open.c +=================================================================== +--- linux-2.6.12.5.orig/fs/open.c 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/fs/open.c 2005-08-17 17:51:44.000000000 +0200 +@@ -215,12 +215,12 @@ + struct nameidata nd; + struct inode * inode; + int error; +- ++ intent_init(&nd.intent, IT_GETATTR); + error = -EINVAL; + if (length < 0) /* sorry, but loff_t says... 
*/ + goto out; + +- error = user_path_walk(path, &nd); ++ error = user_path_walk_it(path, &nd); + if (error) + goto out; + inode = nd.dentry->d_inode; +@@ -474,6 +474,7 @@ + int old_fsuid, old_fsgid; + kernel_cap_t old_cap; + int res; ++ intent_init(&nd.intent, IT_GETATTR); + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ + return -EINVAL; +@@ -498,13 +499,14 @@ + else + current->cap_effective = current->cap_permitted; + +- res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); ++ res = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); + if (!res) { + res = permission(nd.dentry->d_inode, mode, &nd); + /* SuS v2 requires we report a read only fs too */ + if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) + && !special_file(nd.dentry->d_inode->i_mode)) + res = -EROFS; ++ + path_release(&nd); + } + +@@ -519,8 +521,9 @@ + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_GETATTR); + +- error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); ++ error = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); + if (error) + goto out; + +@@ -570,8 +573,9 @@ + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_GETATTR); + +- error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); ++ error = __user_walk_it(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + if (error) + goto out; + +@@ -750,27 +754,8 @@ + * for the internal routines (ie open_namei()/follow_link() etc). 00 is + * used by symlinks. 
+ */ +-struct file *filp_open(const char * filename, int flags, int mode) +-{ +- int namei_flags, error; +- struct nameidata nd; +- +- namei_flags = flags; +- if ((namei_flags+1) & O_ACCMODE) +- namei_flags++; +- if (namei_flags & O_TRUNC) +- namei_flags |= 2; +- +- error = open_namei(filename, namei_flags, mode, &nd); +- if (!error) +- return dentry_open(nd.dentry, nd.mnt, flags); +- +- return ERR_PTR(error); +-} +- +-EXPORT_SYMBOL(filp_open); +- +-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, int flags, ++ struct lookup_intent *it) + { + struct file * f; + struct inode *inode; +@@ -782,6 +767,7 @@ + goto cleanup_dentry; + f->f_flags = flags; + f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; ++ f->f_it = it; + inode = dentry->d_inode; + if (f->f_mode & FMODE_WRITE) { + error = get_write_access(inode); +@@ -800,6 +786,7 @@ + error = f->f_op->open(inode,f); + if (error) + goto cleanup_all; ++ intent_release(it); + } + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + +@@ -825,6 +812,7 @@ + cleanup_file: + put_filp(f); + cleanup_dentry: ++ intent_release(it); + dput(dentry); + mntput(mnt); + return ERR_PTR(error); +@@ -832,6 +820,36 @@ + + EXPORT_SYMBOL(dentry_open); + ++struct file *filp_open(const char * filename, int flags, int mode) ++{ ++ int namei_flags, error; ++ struct file * temp_filp; ++ struct nameidata nd; ++ intent_init(&nd.intent, IT_OPEN); ++ ++ namei_flags = flags; ++ if ((namei_flags+1) & O_ACCMODE) ++ namei_flags++; ++ if (namei_flags & O_TRUNC) ++ namei_flags |= 2; ++ ++ error = open_namei(filename, namei_flags, mode, &nd); ++ if (!error) { ++ temp_filp = dentry_open_it(nd.dentry, nd.mnt, flags, &nd.intent); ++ return temp_filp; ++ } ++ return ERR_PTR(error); ++} ++ ++ ++struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++{ ++ struct lookup_intent it; ++ intent_init(&it, 
IT_LOOKUP); ++ ++ return dentry_open_it(dentry, mnt, flags, &it); ++} ++ + /* + * Find an empty file descriptor entry, and mark it busy. + */ +Index: linux-2.6.12.5/fs/stat.c +=================================================================== +--- linux-2.6.12.5.orig/fs/stat.c 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/fs/stat.c 2005-08-17 17:51:44.000000000 +0200 +@@ -38,7 +38,7 @@ + + EXPORT_SYMBOL(generic_fillattr); + +-int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) ++int vfs_getattr_it(struct vfsmount *mnt, struct dentry *dentry, struct lookup_intent *it, struct kstat *stat) + { + struct inode *inode = dentry->d_inode; + int retval; +@@ -47,6 +47,8 @@ + if (retval) + return retval; + ++ if (inode->i_op->getattr_it) ++ return inode->i_op->getattr_it(mnt, dentry, it, stat); + if (inode->i_op->getattr) + return inode->i_op->getattr(mnt, dentry, stat); + +@@ -63,14 +65,20 @@ + + EXPORT_SYMBOL(vfs_getattr); + ++int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) ++{ ++ return vfs_getattr_it(mnt, dentry, NULL, stat); ++} ++ + int vfs_stat(char __user *name, struct kstat *stat) + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_GETATTR); + +- error = user_path_walk(name, &nd); ++ error = user_path_walk_it(name, &nd); + if (!error) { +- error = vfs_getattr(nd.mnt, nd.dentry, stat); ++ error = vfs_getattr_it(nd.mnt, nd.dentry, &nd.intent, stat); + path_release(&nd); + } + return error; +@@ -82,10 +90,11 @@ + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent, IT_GETATTR); + +- error = user_path_walk_link(name, &nd); ++ error = user_path_walk_link_it(name, &nd); + if (!error) { +- error = vfs_getattr(nd.mnt, nd.dentry, stat); ++ error = vfs_getattr_it(nd.mnt, nd.dentry, &nd.intent, stat); + path_release(&nd); + } + return error; +@@ -97,9 +106,12 @@ + { + struct file *f = fget(fd); + int error = -EBADF; ++ struct nameidata nd; ++ intent_init(&nd.intent, 
IT_GETATTR); + + if (f) { +- error = vfs_getattr(f->f_vfsmnt, f->f_dentry, stat); ++ error = vfs_getattr_it(f->f_vfsmnt, f->f_dentry, &nd.intent, stat); ++ intent_release(&nd.intent); + fput(f); + } + return error; +Index: linux-2.6.12.5/fs/nfs/dir.c +=================================================================== +--- linux-2.6.12.5.orig/fs/nfs/dir.c 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/fs/nfs/dir.c 2005-08-17 17:51:44.000000000 +0200 +@@ -727,7 +727,7 @@ + return 0; + if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0) + return 0; +- return (nd->intent.open.flags & O_EXCL) != 0; ++ return (nd->intent.it_flags & O_EXCL) != 0; + } + + static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +@@ -1028,7 +1028,7 @@ + attr.ia_valid = ATTR_MODE; + + if (nd && (nd->flags & LOOKUP_CREATE)) +- open_flags = nd->intent.open.flags; ++ open_flags = nd->intent.it_flags; + + lock_kernel(); + nfs_begin_data_update(dir); +Index: linux-2.6.12.5/fs/inode.c +=================================================================== +--- linux-2.6.12.5.orig/fs/inode.c 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/fs/inode.c 2005-08-17 17:51:44.000000000 +0200 +@@ -230,6 +230,7 @@ + inodes_stat.nr_unused--; + } + ++EXPORT_SYMBOL(__iget); + /** + * clear_inode - clear an inode + * @inode: inode to clear +Index: linux-2.6.12.5/include/linux/dcache.h +=================================================================== +--- linux-2.6.12.5.orig/include/linux/dcache.h 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/include/linux/dcache.h 2005-08-17 17:51:44.000000000 +0200 +@@ -4,6 +4,7 @@ + #ifdef __KERNEL__ + + #include ++#include + #include + #include + #include +@@ -37,6 +38,8 @@ + const unsigned char *name; + }; + ++#include ++ + struct dentry_stat_t { + int nr_dentry; + int nr_unused; +Index: linux-2.6.12.5/include/linux/fs.h +=================================================================== 
+--- linux-2.6.12.5.orig/include/linux/fs.h 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/include/linux/fs.h 2005-08-17 17:51:44.000000000 +0200 +@@ -58,6 +58,7 @@ + + #define FMODE_READ 1 + #define FMODE_WRITE 2 ++#define FMODE_EXEC 4 + + /* Internal kernel extensions */ + #define FMODE_LSEEK 4 +@@ -260,6 +261,8 @@ + #define ATTR_ATTR_FLAG 1024 + #define ATTR_KILL_SUID 2048 + #define ATTR_KILL_SGID 4096 ++#define ATTR_RAW 8192 /* file system, not vfs will massage attrs */ ++#define ATTR_FROM_OPEN 16384 /* called from open path, ie O_TRUNC */ + + /* + * This is the Inode Attributes structure, used for notify_change(). It +@@ -463,6 +466,7 @@ + struct block_device *i_bdev; + struct cdev *i_cdev; + int i_cindex; ++ void *i_filterdata; + + __u32 i_generation; + +@@ -600,6 +604,7 @@ + spinlock_t f_ep_lock; + #endif /* #ifdef CONFIG_EPOLL */ + struct address_space *f_mapping; ++ struct lookup_intent *f_it; + }; + extern spinlock_t files_lock; + #define file_list_lock() spin_lock(&files_lock); +@@ -968,7 +973,9 @@ + void (*truncate) (struct inode *); + int (*permission) (struct inode *, int, struct nameidata *); + int (*setattr) (struct dentry *, struct iattr *); ++ int (*setattr_raw) (struct inode *, struct iattr *); + int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); ++ int (*getattr_it) (struct vfsmount *, struct dentry *, struct lookup_intent *, struct kstat *); + int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); + ssize_t (*listxattr) (struct dentry *, char *, size_t); +@@ -1008,6 +1015,7 @@ + int (*remount_fs) (struct super_block *, int *, char *); + void (*clear_inode) (struct inode *); + void (*umount_begin) (struct super_block *); ++ void (*umount_lustre) (struct super_block *); + + int (*show_options)(struct seq_file *, struct vfsmount *); + +@@ -1210,6 +1218,7 @@ + extern struct vfsmount *kern_mount(struct file_system_type *); + extern 
int may_umount_tree(struct vfsmount *); + extern int may_umount(struct vfsmount *); ++struct vfsmount *do_kern_mount(const char *type, int flags, const char *name, void *data); + extern long do_mount(char *, char *, char *, unsigned long, void *); + + extern int vfs_statfs(struct super_block *, struct kstatfs *); +@@ -1262,6 +1271,7 @@ + extern int do_truncate(struct dentry *, loff_t start); + extern struct file *filp_open(const char *, int, int); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); ++extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *); + extern int filp_close(struct file *, fl_owner_t id); + extern char * getname(const char __user *); + +Index: linux-2.6.12.5/include/linux/namei.h +=================================================================== +--- linux-2.6.12.5.orig/include/linux/namei.h 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/include/linux/namei.h 2005-08-17 17:51:44.000000000 +0200 +@@ -2,14 +2,48 @@ + #define _LINUX_NAMEI_H + + #include ++#include + + struct vfsmount; ++struct nameidata; + +-struct open_intent { +- int flags; +- int create_mode; ++/* intent opcodes */ ++#define IT_OPEN (1) ++#define IT_CREAT (1<<1) ++#define IT_READDIR (1<<2) ++#define IT_GETATTR (1<<3) ++#define IT_LOOKUP (1<<4) ++#define IT_UNLINK (1<<5) ++#define IT_TRUNC (1<<6) ++#define IT_GETXATTR (1<<7) ++ ++struct lustre_intent_data { ++ int it_disposition; ++ int it_status; ++ __u64 it_lock_handle; ++ void *it_data; ++ int it_lock_mode; + }; + ++#define INTENT_MAGIC 0x19620323 ++struct lookup_intent { ++ int it_magic; ++ void (*it_op_release)(struct lookup_intent *); ++ int it_op; ++ int it_flags; ++ int it_create_mode; ++ union { ++ struct lustre_intent_data lustre; ++ } d; ++}; ++ ++static inline void intent_init(struct lookup_intent *it, int op) ++{ ++ memset(it, 0, sizeof(*it)); ++ it->it_magic = INTENT_MAGIC; ++ it->it_op = op; ++} ++ + enum { MAX_NESTED_LINKS = 5 }; + 
+ struct nameidata { +@@ -21,10 +55,7 @@ + unsigned depth; + char *saved_names[MAX_NESTED_LINKS + 1]; + +- /* Intent data */ +- union { +- struct open_intent open; +- } intent; ++ struct lookup_intent intent; + }; + + /* +@@ -47,6 +78,8 @@ + #define LOOKUP_PARENT 16 + #define LOOKUP_NOALT 32 + #define LOOKUP_REVAL 64 ++#define LOOKUP_LAST (0x1000) ++#define LOOKUP_LINK_NOTLAST (0x2000) + /* + * Intent data + */ +@@ -55,6 +88,12 @@ + #define LOOKUP_ACCESS (0x0400) + + extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *)); ++extern int FASTCALL(__user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd)); ++#define user_path_walk_it(name,nd) \ ++ __user_walk_it(name, LOOKUP_FOLLOW, nd) ++#define user_path_walk_link_it(name,nd) \ ++ __user_walk_it(name, 0, nd) ++extern void intent_release(struct lookup_intent *); + #define user_path_walk(name,nd) \ + __user_walk(name, LOOKUP_FOLLOW, nd) + #define user_path_walk_link(name,nd) \ +@@ -67,7 +106,6 @@ + + extern struct dentry * lookup_one_len(const char *, struct dentry *, int); + extern struct dentry * lookup_hash(struct qstr *, struct dentry *); +- + extern int follow_down(struct vfsmount **, struct dentry **); + extern int follow_up(struct vfsmount **, struct dentry **); + +Index: linux-2.6.12.5/include/linux/mount.h +=================================================================== +--- linux-2.6.12.5.orig/include/linux/mount.h 2005-08-17 17:51:28.000000000 +0200 ++++ linux-2.6.12.5/include/linux/mount.h 2005-08-17 17:51:44.000000000 +0200 +@@ -36,6 +36,8 @@ + struct list_head mnt_list; + struct list_head mnt_fslink; /* link in fs-specific expiry list */ + struct namespace *mnt_namespace; /* containing namespace */ ++ struct list_head mnt_lustre_list; /* GNS mount list */ ++ unsigned long mnt_last_used; /* for GNS auto-umount (jiffies) */ + }; + + static inline struct vfsmount *mntget(struct vfsmount *mnt) diff --git 
a/lustre/kernel_patches/patches/vfs_nointent-2.6-rhel4.patch b/lustre/kernel_patches/patches/vfs_nointent-2.6-rhel4.patch index 7f95eb3..166a512 100644 --- a/lustre/kernel_patches/patches/vfs_nointent-2.6-rhel4.patch +++ b/lustre/kernel_patches/patches/vfs_nointent-2.6-rhel4.patch @@ -47,7 +47,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c if (!IS_ERR(tmp)) { struct dentry *dentry; struct nameidata nd; -+ intent_init(&nd.intent, IT_LOOKUP); ++ intent_init(&nd.intent, IT_LOOKUP); error = path_lookup(tmp, LOOKUP_PARENT, &nd); if (error) @@ -74,7 +74,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c char * name; struct dentry *dentry; struct nameidata nd; -+ intent_init(&nd.intent, IT_LOOKUP); ++ intent_init(&nd.intent, IT_LOOKUP); name = getname(pathname); if(IS_ERR(name)) @@ -99,7 +99,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c struct dentry *dentry; struct nameidata nd; struct inode *inode = NULL; -+ intent_init(&nd.intent, IT_LOOKUP); ++ intent_init(&nd.intent, IT_LOOKUP); name = getname(pathname); if(IS_ERR(name)) @@ -121,7 +121,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c if (!IS_ERR(to)) { struct dentry *dentry; struct nameidata nd; -+ intent_init(&nd.intent, IT_LOOKUP); ++ intent_init(&nd.intent, IT_LOOKUP); error = path_lookup(to, LOOKUP_PARENT, &nd); if (error) @@ -148,8 +148,8 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c struct nameidata nd, old_nd; int error; char * to; -+ intent_init(&nd.intent, IT_LOOKUP); -+ intent_init(&old_nd.intent, IT_LOOKUP); ++ intent_init(&nd.intent, IT_LOOKUP); ++ intent_init(&old_nd.intent, IT_LOOKUP); to = getname(newname); if (IS_ERR(to)) @@ -157,40 +157,22 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c error = -EXDEV; if (old_nd.mnt != nd.mnt) goto out_release; -+ if (nd.dentry->d_inode->i_op->link_raw) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->link_raw(&old_nd, &nd); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out_release; -+ } ++ if 
(nd.dentry->d_inode->i_op->link_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->link_raw(&old_nd, &nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out_release; ++ } new_dentry = lookup_create(&nd, 0); error = PTR_ERR(new_dentry); if (!IS_ERR(new_dentry)) { -@@ -2101,7 +2158,7 @@ - * locking]. - */ - int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry) - { - int error = 0; - struct inode *target; -@@ -2146,7 +2203,7 @@ - } - - int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry) - { - struct inode *target; - int error; @@ -2223,6 +2280,8 @@ struct dentry * old_dentry, *new_dentry; struct dentry * trap; struct nameidata oldnd, newnd; -+ intent_init(&oldnd.intent, IT_LOOKUP); -+ intent_init(&newnd.intent, IT_LOOKUP); ++ intent_init(&oldnd.intent, IT_LOOKUP); ++ intent_init(&newnd.intent, IT_LOOKUP); error = path_lookup(oldname, LOOKUP_PARENT, &oldnd); if (error) @@ -284,10 +266,10 @@ Index: linux-2.6.9-5.0.3.EL/fs/open.c + if (error != -EOPNOTSUPP) + goto dput_and_out; + } else { -+ down(&inode->i_sem); -+ error = notify_change(nd.dentry, &newattrs); -+ up(&inode->i_sem); -+ } ++ down(&inode->i_sem); ++ error = notify_change(nd.dentry, &newattrs); ++ up(&inode->i_sem); ++ } dput_and_out: path_release(&nd); out: @@ -307,10 +289,10 @@ Index: linux-2.6.9-5.0.3.EL/fs/open.c + if (error != -EOPNOTSUPP) + goto dput_and_out; + } else { -+ down(&inode->i_sem); -+ error = notify_change(nd.dentry, &newattrs); -+ up(&inode->i_sem); -+ } ++ down(&inode->i_sem); ++ error = notify_change(nd.dentry, &newattrs); ++ up(&inode->i_sem); ++ } dput_and_out: path_release(&nd); out: diff --git a/lustre/kernel_patches/patches/vfs_nointent-2.6.12.patch 
b/lustre/kernel_patches/patches/vfs_nointent-2.6.12.patch new file mode 100644 index 0000000..66ead6a --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_nointent-2.6.12.patch @@ -0,0 +1,490 @@ +Index: linux-2.6.12.2/fs/namei.c +=================================================================== +--- linux-2.6.12.2.orig/fs/namei.c 2005-07-23 12:25:12.241868120 +0200 ++++ linux-2.6.12.2/fs/namei.c 2005-07-23 12:25:14.440533872 +0200 +@@ -1466,7 +1466,7 @@ + if (!error) { + DQUOT_INIT(inode); + +- error = do_truncate(dentry, 0); ++ error = do_truncate(dentry, 0, 1); + } + put_write_access(inode); + if (error) +@@ -1719,6 +1719,7 @@ + char * tmp; + struct dentry * dentry; + struct nameidata nd; ++ intent_init(&nd.intent, IT_LOOKUP); + + if (S_ISDIR(mode)) + return -EPERM; +@@ -1729,6 +1730,15 @@ + error = path_lookup(tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ ++ if (nd.dentry->d_inode->i_op->mknod_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mknod_raw(&nd, mode, dev); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ + dentry = lookup_create(&nd, 0); + error = PTR_ERR(dentry); + +@@ -1755,6 +1765,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1796,10 +1807,18 @@ + if (!IS_ERR(tmp)) { + struct dentry *dentry; + struct nameidata nd; ++ intent_init(&nd.intent, IT_LOOKUP); + + error = path_lookup(tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.dentry->d_inode->i_op->mkdir_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mkdir_raw(&nd, mode); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } + dentry = lookup_create(&nd, 1); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -1809,6 +1828,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + 
putname(tmp); +@@ -1885,6 +1905,7 @@ + char * name; + struct dentry *dentry; + struct nameidata nd; ++ intent_init(&nd.intent, IT_LOOKUP); + + name = getname(pathname); + if(IS_ERR(name)) +@@ -1905,6 +1926,16 @@ + error = -EBUSY; + goto exit1; + } ++ ++ if (nd.dentry->d_inode->i_op->rmdir_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ error = op->rmdir_raw(&nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } ++ + down(&nd.dentry->d_inode->i_sem); + dentry = lookup_hash(&nd.last, nd.dentry); + error = PTR_ERR(dentry); +@@ -1963,6 +1994,7 @@ + struct dentry *dentry; + struct nameidata nd; + struct inode *inode = NULL; ++ intent_init(&nd.intent, IT_LOOKUP); + + name = getname(pathname); + if(IS_ERR(name)) +@@ -1974,6 +2006,13 @@ + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; ++ if (nd.dentry->d_inode->i_op->unlink_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->unlink_raw(&nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } + down(&nd.dentry->d_inode->i_sem); + dentry = lookup_hash(&nd.last, nd.dentry); + error = PTR_ERR(dentry); +@@ -2040,10 +2079,18 @@ + if (!IS_ERR(to)) { + struct dentry *dentry; + struct nameidata nd; ++ intent_init(&nd.intent, IT_LOOKUP); + + error = path_lookup(to, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.dentry->d_inode->i_op->symlink_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->symlink_raw(&nd, from); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } + dentry = lookup_create(&nd, 0); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -2051,6 +2098,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(to); +@@ -2114,6 +2162,8 @@ + struct nameidata nd, old_nd; + int error; + char * to; ++ 
intent_init(&nd.intent, IT_LOOKUP); ++ intent_init(&old_nd.intent, IT_LOOKUP); + + to = getname(newname); + if (IS_ERR(to)) +@@ -2128,6 +2178,13 @@ + error = -EXDEV; + if (old_nd.mnt != nd.mnt) + goto out_release; ++ if (nd.dentry->d_inode->i_op->link_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->link_raw(&old_nd, &nd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out_release; ++ } + new_dentry = lookup_create(&nd, 0); + error = PTR_ERR(new_dentry); + if (!IS_ERR(new_dentry)) { +@@ -2300,6 +2357,8 @@ + struct dentry * old_dentry, *new_dentry; + struct dentry * trap; + struct nameidata oldnd, newnd; ++ intent_init(&oldnd.intent, IT_LOOKUP); ++ intent_init(&newnd.intent, IT_LOOKUP); + + error = path_lookup(oldname, LOOKUP_PARENT, &oldnd); + if (error) +@@ -2322,6 +2381,13 @@ + if (newnd.last_type != LAST_NORM) + goto exit2; + ++ if (old_dir->d_inode->i_op->rename_raw) { ++ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit2; ++ } ++ + trap = lock_rename(new_dir, old_dir); + + old_dentry = lookup_hash(&oldnd.last, old_dir); +@@ -2353,8 +2419,7 @@ + if (new_dentry == trap) + goto exit5; + +- error = vfs_rename(old_dir->d_inode, old_dentry, +- new_dir->d_inode, new_dentry); ++ error = vfs_rename(old_dir->d_inode, old_dentry, new_dir->d_inode, new_dentry); + exit5: + dput(new_dentry); + exit4: +Index: linux-2.6.12.2/fs/open.c +=================================================================== +--- linux-2.6.12.2.orig/fs/open.c 2005-07-23 12:25:12.248867056 +0200 ++++ linux-2.6.12.2/fs/open.c 2005-07-23 12:28:13.221355056 +0200 +@@ -192,9 +192,10 @@ + return error; + } + +-int do_truncate(struct dentry *dentry, loff_t length) ++int do_truncate(struct dentry *dentry, loff_t length, int called_from_open) + { + int err; ++ struct inode_operations *op = dentry->d_inode->i_op; + struct 
iattr newattrs; + + /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ +@@ -205,7 +206,16 @@ + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + + down(&dentry->d_inode->i_sem); +- err = notify_change(dentry, &newattrs); ++ if (called_from_open) ++ newattrs.ia_valid |= ATTR_FROM_OPEN; ++ if (op->setattr_raw) { ++ newattrs.ia_valid |= ATTR_RAW; ++ newattrs.ia_ctime = CURRENT_TIME; ++ down_write(&dentry->d_inode->i_alloc_sem); ++ err = op->setattr_raw(dentry->d_inode, &newattrs); ++ up_write(&dentry->d_inode->i_alloc_sem); ++ } else ++ err = notify_change(dentry, &newattrs); + up(&dentry->d_inode->i_sem); + return err; + } +@@ -260,7 +270,7 @@ + error = locks_verify_truncate(inode, NULL, length); + if (!error) { + DQUOT_INIT(inode); +- error = do_truncate(nd.dentry, length); ++ error = do_truncate(nd.dentry, length, 0); + } + put_write_access(inode); + +@@ -312,7 +322,7 @@ + + error = locks_verify_truncate(inode, file, length); + if (!error) +- error = do_truncate(dentry, length); ++ error = do_truncate(dentry, length, 0); + out_putf: + fput(file); + out: +@@ -391,9 +401,19 @@ + (error = permission(inode,MAY_WRITE,&nd)) != 0) + goto dput_and_out; + } +- down(&inode->i_sem); +- error = notify_change(nd.dentry, &newattrs); +- up(&inode->i_sem); ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } else { ++ down(&inode->i_sem); ++ error = notify_change(nd.dentry, &newattrs); ++ up(&inode->i_sem); ++ } + dput_and_out: + path_release(&nd); + out: +@@ -444,9 +464,19 @@ + (error = permission(inode,MAY_WRITE,&nd)) != 0) + goto dput_and_out; + } +- down(&inode->i_sem); +- error = notify_change(nd.dentry, &newattrs); +- up(&inode->i_sem); ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = 
nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } else { ++ down(&inode->i_sem); ++ error = notify_change(nd.dentry, &newattrs); ++ up(&inode->i_sem); ++ } + dput_and_out: + path_release(&nd); + out: +@@ -596,36 +626,52 @@ + return error; + } + +-asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) ++int chmod_common(struct dentry *dentry, mode_t mode) + { +- struct inode * inode; +- struct dentry * dentry; +- struct file * file; +- int err = -EBADF; ++ struct inode * inode = dentry->d_inode; + struct iattr newattrs; ++ int error = -EROFS; + +- file = fget(fd); +- if (!file) ++ if (IS_RDONLY(inode)) + goto out; ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = dentry->d_inode->i_op; ++ ++ newattrs.ia_mode = mode; ++ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out; ++ } + +- dentry = file->f_dentry; +- inode = dentry->d_inode; +- +- err = -EROFS; +- if (IS_RDONLY(inode)) +- goto out_putf; +- err = -EPERM; ++ error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) +- goto out_putf; ++ goto out; ++ + down(&inode->i_sem); + if (mode == (mode_t) -1) + mode = inode->i_mode; + newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; +- err = notify_change(dentry, &newattrs); ++ error = notify_change(dentry, &newattrs); + up(&inode->i_sem); ++out: ++ return error; ++} + +-out_putf: ++asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) ++{ ++ struct file * file; ++ int err = -EBADF; ++ ++ file = fget(fd); ++ if (!file) ++ goto out; ++ ++ err = chmod_common(file->f_dentry, mode); + fput(file); + out: + return err; +@@ -634,32 +680,13 @@ + 
asmlinkage long sys_chmod(const char __user * filename, mode_t mode) + { + struct nameidata nd; +- struct inode * inode; + int error; +- struct iattr newattrs; + + error = user_path_walk(filename, &nd); + if (error) + goto out; +- inode = nd.dentry->d_inode; +- +- error = -EROFS; +- if (IS_RDONLY(inode)) +- goto dput_and_out; +- +- error = -EPERM; +- if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) +- goto dput_and_out; +- +- down(&inode->i_sem); +- if (mode == (mode_t) -1) +- mode = inode->i_mode; +- newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); +- newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; +- error = notify_change(nd.dentry, &newattrs); +- up(&inode->i_sem); + +-dput_and_out: ++ error = chmod_common(nd.dentry, mode); + path_release(&nd); + out: + return error; +@@ -680,6 +707,18 @@ + if (IS_RDONLY(inode)) + goto out; + error = -EPERM; ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = dentry->d_inode->i_op; ++ ++ newattrs.ia_uid = user; ++ newattrs.ia_gid = group; ++ newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_CTIME; ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ return error; ++ } + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out; + newattrs.ia_valid = ATTR_CTIME; +@@ -693,6 +732,7 @@ + } + if (!S_ISDIR(inode->i_mode)) + newattrs.ia_valid |= ATTR_KILL_SUID|ATTR_KILL_SGID; ++ + down(&inode->i_sem); + error = notify_change(dentry, &newattrs); + up(&inode->i_sem); +Index: linux-2.6.12.2/fs/exec.c +=================================================================== +--- linux-2.6.12.2.orig/fs/exec.c 2005-07-23 12:25:12.229869944 +0200 ++++ linux-2.6.12.2/fs/exec.c 2005-07-23 12:25:14.442533568 +0200 +@@ -1488,7 +1488,7 @@ + goto close_fail; + if (!file->f_op->write) + goto close_fail; +- if (do_truncate(file->f_dentry, 0) != 0) ++ if (do_truncate(file->f_dentry, 0, 0) != 0) + goto close_fail; + + 
retval = binfmt->core_dump(signr, regs, file); +Index: linux-2.6.12.2/include/linux/fs.h +=================================================================== +--- linux-2.6.12.2.orig/include/linux/fs.h 2005-07-23 12:25:12.279862344 +0200 ++++ linux-2.6.12.2/include/linux/fs.h 2005-07-23 12:25:14.443533416 +0200 +@@ -960,13 +960,20 @@ + int (*create) (struct inode *,struct dentry *,int, struct nameidata *); + struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); + int (*link) (struct dentry *,struct inode *,struct dentry *); ++ int (*link_raw) (struct nameidata *,struct nameidata *); + int (*unlink) (struct inode *,struct dentry *); ++ int (*unlink_raw) (struct nameidata *); + int (*symlink) (struct inode *,struct dentry *,const char *); ++ int (*symlink_raw) (struct nameidata *,const char *); + int (*mkdir) (struct inode *,struct dentry *,int); ++ int (*mkdir_raw) (struct nameidata *,int); + int (*rmdir) (struct inode *,struct dentry *); ++ int (*rmdir_raw) (struct nameidata *); + int (*mknod) (struct inode *,struct dentry *,int,dev_t); ++ int (*mknod_raw) (struct nameidata *,int,dev_t); + int (*rename) (struct inode *, struct dentry *, + struct inode *, struct dentry *); ++ int (*rename_raw) (struct nameidata *, struct nameidata *); + int (*readlink) (struct dentry *, char __user *,int); + int (*follow_link) (struct dentry *, struct nameidata *); + void (*put_link) (struct dentry *, struct nameidata *); +@@ -1268,7 +1275,7 @@ + + /* fs/open.c */ + +-extern int do_truncate(struct dentry *, loff_t start); ++extern int do_truncate(struct dentry *, loff_t start, int called_from_open); + extern struct file *filp_open(const char *, int, int); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); + extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *); +Index: linux-2.6.12.2/net/unix/af_unix.c +=================================================================== +--- 
linux-2.6.12.2.orig/net/unix/af_unix.c 2005-06-30 01:00:53.000000000 +0200 ++++ linux-2.6.12.2/net/unix/af_unix.c 2005-07-23 12:25:14.445533112 +0200 +@@ -673,6 +673,7 @@ + int err = 0; + + if (sunname->sun_path[0]) { ++ intent_init(&nd.intent, IT_LOOKUP); + err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd); + if (err) + goto fail; diff --git a/lustre/kernel_patches/patches/vfs_races-2.6.12.patch b/lustre/kernel_patches/patches/vfs_races-2.6.12.patch new file mode 100644 index 0000000..4b5419d --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_races-2.6.12.patch @@ -0,0 +1,63 @@ +Index: linux-2.6.7-vanilla/fs/dcache.c +=================================================================== +--- linux-2.6.7-vanilla.orig/fs/dcache.c 2004-07-01 12:09:19.000000000 +0300 ++++ linux-2.6.7-vanilla/fs/dcache.c 2004-07-01 12:29:12.510193264 +0300 +@@ -219,7 +219,14 @@ + spin_unlock(&dcache_lock); + return 0; + } +- /* ++ ++ /* network invalidation by Lustre */ ++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { ++ spin_unlock(&dcache_lock); ++ return 0; ++ } ++ ++ /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. + */ +@@ -1199,16 +1199,25 @@ + * Adds a dentry to the hash according to its name. 
+ */ + +-void d_rehash(struct dentry * entry) ++void d_rehash_cond(struct dentry * entry, int lock) + { + struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash); + +- spin_lock(&dcache_lock); ++ if (lock) ++ spin_lock(&dcache_lock); + spin_lock(&entry->d_lock); + __d_rehash(entry, list); + spin_unlock(&entry->d_lock); +- spin_unlock(&dcache_lock); ++ if (lock) ++ spin_unlock(&dcache_lock); + } + ++EXPORT_SYMBOL(d_rehash_cond); ++ ++void d_rehash(struct dentry * entry) ++{ ++ d_rehash_cond(entry, 1); ++ } ++ + #define do_switch(x,y) do { \ + __typeof__ (x) __tmp = x; \ + x = y; y = __tmp; } while (0) +Index: linux-2.6.7-vanilla/include/linux/dcache.h +=================================================================== +--- linux-2.6.7-vanilla.orig/include/linux/dcache.h 2004-07-01 12:24:53.602553208 +0300 ++++ linux-2.6.7-vanilla/include/linux/dcache.h 2004-07-01 12:27:29.757814000 +0300 +@@ -159,6 +159,8 @@ + + #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. 
*/ + #define DCACHE_UNHASHED 0x0010 ++#define DCACHE_LUSTRE_INVALID 0x0020 /* Lustre invalidated */ ++ + + extern spinlock_t dcache_lock; + diff --git a/lustre/kernel_patches/series/2.6.12-vanilla.series b/lustre/kernel_patches/series/2.6.12-vanilla.series new file mode 100644 index 0000000..9ecb127 --- /dev/null +++ b/lustre/kernel_patches/series/2.6.12-vanilla.series @@ -0,0 +1,19 @@ +lustre_version.patch +vfs_intent-2.6.12.patch +vfs_nointent-2.6.12.patch +vfs_races-2.6.12.patch +ext3-wantedi-misc-2.6-suse.patch +jbd-2.6.10-jcberr.patch +nfs-cifs-intent-2.6.12.patch +iopen-misc-2.6.12.patch +export-truncate-2.6-suse.patch +export_symbols-2.6.12.patch +dev_read_only-2.6-suse.patch +export-2.6-suse.patch +lookup_bdev_init_intent.patch +8kstack-2.6.12.patch +remove-suid-2.6-suse.patch +export-show_task-2.6-vanilla.patch +sd_iostats-2.6-rhel4.patch +fsprivate-2.6.patch +export_symbol_numa.patch diff --git a/lustre/kernel_patches/series/ldiskfs-2.6.12-vanilla.series b/lustre/kernel_patches/series/ldiskfs-2.6.12-vanilla.series new file mode 100644 index 0000000..7d0a383 --- /dev/null +++ b/lustre/kernel_patches/series/ldiskfs-2.6.12-vanilla.series @@ -0,0 +1,13 @@ +ext3-wantedi-2.6-rhel4.patch +ext3-san-jdike-2.6-suse.patch +iopen-2.6.12.patch +ext3-map_inode_page-2.6-suse.patch +export-ext3-2.6-rhel4.patch +ext3-include-fixes-2.6-rhel4.patch +ext3-extents-2.6.12.patch +ext3-mballoc2-2.6.12.patch +ext3-nlinks-2.6.9.patch +ext3-ialloc-2.6.patch +ext3-remove-cond-resched-calls-2.6.12.patch +ext3-htree-dot-2.6.patch +ext3-external-journal-2.6.12.patch -- 1.8.3.1