From f74ab269d0013c61c5bf9df76bf3202b3a1e1c99 Mon Sep 17 00:00:00 2001 From: green Date: Sat, 26 Feb 2005 14:18:41 +0000 Subject: [PATCH] b=5773 added to rhel4 serie: export-show_task-2.6-vanilla.patch and remove-suid-2.6-suse vfs_intent-2.6-rhel4.patch: got rid of IT_CHDIR, moved LOOKUP_LAST and LOOKUP_NOT_LAST to not clash with LOOKUP_OPEN, incorporated fix for bug 4134 export_symbols-2.6-rhel4: provided declaration of filemap_populate. --- .../patches/ext3-ea-in-inode-2.6-rhel4.patch | 822 ++++++ .../patches/ext3-extents-2.6.9-rhel4.patch | 2831 ++++++++++++++++++++ .../patches/ext3-include-fixes-2.6-rhel4.patch | 0 .../patches/ext3-mballoc2-2.6.9-rhel4.patch | 2236 ++++++++++++++++ .../kernel_patches/series/ldiskfs-2.6-rhel4.series | 11 + .../kernel_patches/patches/8kstack-2.6-rhel4.patch | 13 + .../patches/export_symbols-2.6-rhel4.patch | 148 + .../patches/ext3-ea-in-inode-2.6-rhel4.patch | 822 ++++++ .../patches/ext3-extents-2.6.9-rhel4.patch | 2831 ++++++++++++++++++++ .../patches/ext3-include-fixes-2.6-rhel4.patch | 20 + .../patches/ext3-mballoc2-2.6.9-rhel4.patch | 2236 ++++++++++++++++ .../patches/vfs_intent-2.6-rhel4.patch | 92 +- ...-vanilla.patch => vfs_nointent-2.6-rhel4.patch} | 104 +- ...2.6-vanilla.patch => vfs_races-2.6-rhel4.patch} | 0 lustre/kernel_patches/series/2.6-rhel4.series | 9 +- .../kernel_patches/series/ldiskfs-2.6-rhel4.series | 11 + 16 files changed, 12086 insertions(+), 100 deletions(-) create mode 100644 ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch create mode 100644 ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch rename lustre/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.diff => ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch (100%) create mode 100644 ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series create mode 100644 lustre/kernel_patches/patches/8kstack-2.6-rhel4.patch create mode 
100644 lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch create mode 100644 lustre/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch create mode 100644 lustre/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch create mode 100644 lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch rename lustre/kernel_patches/patches/{vfs_nointent-2.6-vanilla.patch => vfs_nointent-2.6-rhel4.patch} (88%) rename lustre/kernel_patches/patches/{vfs_races-2.6-vanilla.patch => vfs_races-2.6-rhel4.patch} (100%) create mode 100644 lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series diff --git a/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch new file mode 100644 index 0000000..3d554e4 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch @@ -0,0 +1,822 @@ +Index: linux-stage/fs/ext3/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/ialloc.c 2005-02-25 16:47:04.411977280 +0200 ++++ linux-stage/fs/ext3/ialloc.c 2005-02-25 16:50:40.752088584 +0200 +@@ -629,6 +629,11 @@ + spin_unlock(&sbi->s_next_gen_lock); + + ei->i_state = EXT3_STATE_NEW; ++ if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { ++ ei->i_extra_isize = sizeof(__u16) /* i_extra_isize */ ++ + sizeof(__u16); /* i_pad1 */ ++ } else ++ ei->i_extra_isize = 0; + + ret = inode; + if(DQUOT_ALLOC_INODE(inode)) { +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2005-02-25 16:47:04.415976672 +0200 ++++ linux-stage/fs/ext3/inode.c 2005-02-25 16:50:40.756087976 +0200 +@@ -2274,7 +2274,7 @@ + * trying to determine the inode's location on-disk and no read need be + * performed. 
+ */ +-static int ext3_get_inode_loc(struct inode *inode, ++int ext3_get_inode_loc(struct inode *inode, + struct ext3_iloc *iloc, int in_mem) + { + unsigned long block; +@@ -2484,6 +2484,11 @@ + ei->i_data[block] = raw_inode->i_block[block]; + INIT_LIST_HEAD(&ei->i_orphan); + ++ if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ++ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); ++ else ++ ei->i_extra_isize = 0; ++ + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; +@@ -2619,6 +2624,9 @@ + } else for (block = 0; block < EXT3_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + ++ if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ++ raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); ++ + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + rc = ext3_journal_dirty_metadata(handle, bh); + if (!err) +Index: linux-stage/fs/ext3/xattr.c +=================================================================== +--- linux-stage.orig/fs/ext3/xattr.c 2005-02-25 16:47:04.422975608 +0200 ++++ linux-stage/fs/ext3/xattr.c 2005-02-25 17:19:04.958009904 +0200 +@@ -149,17 +149,12 @@ + } + + /* +- * ext3_xattr_get() ++ * ext3_xattr_block_get() + * +- * Copy an extended attribute into the buffer +- * provided, or compute the buffer size required. +- * Buffer is NULL to compute the size of the buffer required. +- * +- * Returns a negative error number on failure, or the number of bytes +- * used / required on success. 
++ * routine looks for attribute in EA block and returns it's value and size + */ + int +-ext3_xattr_get(struct inode *inode, int name_index, const char *name, ++ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) + { + struct buffer_head *bh = NULL; +@@ -173,7 +168,6 @@ + + if (name == NULL) + return -EINVAL; +- down_read(&EXT3_I(inode)->xattr_sem); + error = -ENODATA; + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; +@@ -246,15 +240,87 @@ + + cleanup: + brelse(bh); +- up_read(&EXT3_I(inode)->xattr_sem); + + return error; + } + + /* +- * ext3_xattr_list() ++ * ext3_xattr_ibody_get() + * +- * Copy a list of attribute names into the buffer ++ * routine looks for attribute in inode body and returns it's value and size ++ */ ++int ++ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t buffer_size) ++{ ++ int size, name_len = strlen(name), storage_size; ++ struct ext3_xattr_entry *last; ++ struct ext3_inode *raw_inode; ++ struct ext3_iloc iloc; ++ char *start, *end; ++ int ret = -ENOENT; ++ ++ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) ++ return -ENOENT; ++ ++ ret = ext3_get_inode_loc(inode, &iloc, 1); ++ if (ret) ++ return ret; ++ raw_inode = ext3_raw_inode(&iloc); ++ ++ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - ++ EXT3_GOOD_OLD_INODE_SIZE - ++ EXT3_I(inode)->i_extra_isize - ++ sizeof(__u32); ++ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + ++ EXT3_I(inode)->i_extra_isize; ++ if (le32_to_cpu((*(__u32*) start)) != EXT3_XATTR_MAGIC) { ++ brelse(iloc.bh); ++ return -ENOENT; ++ } ++ start += sizeof(__u32); ++ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; ++ ++ last = (struct ext3_xattr_entry *) start; ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ if (le32_to_cpu(last->e_value_size) > storage_size || ++ (char *) next >= end) { ++ ext3_error(inode->i_sb, 
"ext3_xattr_ibody_get", ++ "inode %ld", inode->i_ino); ++ brelse(iloc.bh); ++ return -EIO; ++ } ++ if (name_index == last->e_name_index && ++ name_len == last->e_name_len && ++ !memcmp(name, last->e_name, name_len)) ++ goto found; ++ last = next; ++ } ++ ++ /* can't find EA */ ++ brelse(iloc.bh); ++ return -ENOENT; ++ ++found: ++ size = le32_to_cpu(last->e_value_size); ++ if (buffer) { ++ ret = -ERANGE; ++ if (buffer_size >= size) { ++ memcpy(buffer, start + le16_to_cpu(last->e_value_offs), ++ size); ++ ret = size; ++ } ++ } else ++ ret = size; ++ brelse(iloc.bh); ++ return ret; ++} ++ ++/* ++ * ext3_xattr_get() ++ * ++ * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * +@@ -262,7 +328,31 @@ + * used / required on success. + */ + int +-ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ++ext3_xattr_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t buffer_size) ++{ ++ int err; ++ ++ down_read(&EXT3_I(inode)->xattr_sem); ++ ++ /* try to find attribute in inode body */ ++ err = ext3_xattr_ibody_get(inode, name_index, name, ++ buffer, buffer_size); ++ if (err < 0) ++ /* search was unsuccessful, try to find EA in dedicated block */ ++ err = ext3_xattr_block_get(inode, name_index, name, ++ buffer, buffer_size); ++ up_read(&EXT3_I(inode)->xattr_sem); ++ ++ return err; ++} ++ ++/* ext3_xattr_ibody_list() ++ * ++ * generate list of attributes stored in EA block ++ */ ++int ++ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) + { + struct buffer_head *bh = NULL; + struct ext3_xattr_entry *entry; +@@ -273,7 +363,6 @@ + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + +- down_read(&EXT3_I(inode)->xattr_sem); + error = 0; + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; +@@ -330,11 +419,139 @@ + + cleanup: + brelse(bh); +- up_read(&EXT3_I(inode)->xattr_sem); + 
+ return error; + } + ++/* ext3_xattr_ibody_list() ++ * ++ * generate list of attributes stored in inode body ++ */ ++int ++ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) ++{ ++ struct ext3_xattr_entry *last; ++ struct ext3_inode *raw_inode; ++ char *start, *end, *buf; ++ struct ext3_iloc iloc; ++ int storage_size; ++ int ret; ++ int size = 0; ++ ++ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) ++ return 0; ++ ++ ret = ext3_get_inode_loc(inode, &iloc, 1); ++ if (ret) ++ return ret; ++ raw_inode = ext3_raw_inode(&iloc); ++ ++ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - ++ EXT3_GOOD_OLD_INODE_SIZE - ++ EXT3_I(inode)->i_extra_isize - ++ sizeof(__u32); ++ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + ++ EXT3_I(inode)->i_extra_isize; ++ if (le32_to_cpu((*(__u32*) start)) != EXT3_XATTR_MAGIC) { ++ brelse(iloc.bh); ++ return 0; ++ } ++ start += sizeof(__u32); ++ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; ++ ++ last = (struct ext3_xattr_entry *) start; ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ struct xattr_handler *handler; ++ if (le32_to_cpu(last->e_value_size) > storage_size || ++ (char *) next >= end) { ++ ext3_error(inode->i_sb, "ext3_xattr_ibody_list", ++ "inode %ld", inode->i_ino); ++ brelse(iloc.bh); ++ return -EIO; ++ } ++ handler = ext3_xattr_handler(last->e_name_index); ++ if (handler) ++ size = handler->list(inode, NULL, 0, last->e_name, ++ last->e_name_len); ++ ++ last = next; ++ } ++ ++ if (!buffer) { ++ ret = size; ++ goto cleanup; ++ } else { ++ ret = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ } ++ ++ last = (struct ext3_xattr_entry *) start; ++ buf = buffer; ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ struct xattr_handler *handler; ++ handler = ext3_xattr_handler(last->e_name_index); ++ if (handler) ++ buf += handler->list(inode, buf, 
(buffer+buffer_size)-buf, last->e_name, ++ last->e_name_len); ++ last = next; ++ } ++ ret = size; ++cleanup: ++ brelse(iloc.bh); ++ return ret; ++} ++ ++/* ++ * ext3_xattr_list() ++ * ++ * Copy a list of attribute names into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. ++ */ ++int ++ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ++{ ++ int error; ++ int size = buffer_size; ++ ++ down_read(&EXT3_I(inode)->xattr_sem); ++ ++ /* get list of attributes stored in inode body */ ++ error = ext3_xattr_ibody_list(inode, buffer, buffer_size); ++ if (error < 0) { ++ /* some error occured while collecting ++ * attributes in inode body */ ++ size = 0; ++ goto cleanup; ++ } ++ size = error; ++ ++ /* get list of attributes stored in dedicated block */ ++ if (buffer) { ++ buffer_size -= error; ++ if (buffer_size <= 0) { ++ buffer = NULL; ++ buffer_size = 0; ++ } else ++ buffer += error; ++ } ++ ++ error = ext3_xattr_block_list(inode, buffer, buffer_size); ++ if (error < 0) ++ /* listing was successful, so we return len */ ++ size = 0; ++ ++cleanup: ++ up_read(&EXT3_I(inode)->xattr_sem); ++ return error + size; ++} ++ + /* + * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. 
+@@ -356,6 +573,279 @@ + } + + /* ++ * ext3_xattr_ibody_find() ++ * ++ * search attribute and calculate free space in inode body ++ * NOTE: free space includes space our attribute hold ++ */ ++int ++ext3_xattr_ibody_find(struct inode *inode, int name_index, ++ const char *name, struct ext3_xattr_entry *rentry, int *free) ++{ ++ struct ext3_xattr_entry *last; ++ struct ext3_inode *raw_inode; ++ int name_len = strlen(name); ++ int err, storage_size; ++ struct ext3_iloc iloc; ++ char *start, *end; ++ int ret = -ENOENT; ++ ++ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) ++ return ret; ++ ++ err = ext3_get_inode_loc(inode, &iloc, 1); ++ if (err) ++ return -EIO; ++ raw_inode = ext3_raw_inode(&iloc); ++ ++ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - ++ EXT3_GOOD_OLD_INODE_SIZE - ++ EXT3_I(inode)->i_extra_isize - ++ sizeof(__u32); ++ *free = storage_size - sizeof(__u32); ++ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + ++ EXT3_I(inode)->i_extra_isize; ++ if (le32_to_cpu((*(__u32*) start)) != EXT3_XATTR_MAGIC) { ++ brelse(iloc.bh); ++ return -ENOENT; ++ } ++ start += sizeof(__u32); ++ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; ++ ++ last = (struct ext3_xattr_entry *) start; ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ if (le32_to_cpu(last->e_value_size) > storage_size || ++ (char *) next >= end) { ++ ext3_error(inode->i_sb, "ext3_xattr_ibody_find", ++ "inode %ld", inode->i_ino); ++ brelse(iloc.bh); ++ return -EIO; ++ } ++ ++ if (name_index == last->e_name_index && ++ name_len == last->e_name_len && ++ !memcmp(name, last->e_name, name_len)) { ++ memcpy(rentry, last, sizeof(struct ext3_xattr_entry)); ++ ret = 0; ++ } else { ++ *free -= EXT3_XATTR_LEN(last->e_name_len); ++ *free -= le32_to_cpu(last->e_value_size); ++ } ++ last = next; ++ } ++ ++ brelse(iloc.bh); ++ return ret; ++} ++ ++/* ++ * ext3_xattr_block_find() ++ * ++ * search attribute and calculate free space 
in EA block (if it allocated) ++ * NOTE: free space includes space our attribute hold ++ */ ++int ++ext3_xattr_block_find(struct inode *inode, int name_index, const char *name, ++ struct ext3_xattr_entry *rentry, int *free) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_entry *entry; ++ char *end; ++ int name_len, error = -ENOENT; ++ ++ if (!EXT3_I(inode)->i_file_acl) { ++ *free = inode->i_sb->s_blocksize - ++ sizeof(struct ext3_xattr_header) - ++ sizeof(__u32); ++ return -ENOENT; ++ } ++ ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl); ++ bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(inode->i_sb, "ext3_xattr_get", ++ "inode %ld: bad block %d", inode->i_ino, ++ EXT3_I(inode)->i_file_acl); ++ brelse(bh); ++ return -EIO; ++ } ++ /* find named attribute */ ++ name_len = strlen(name); ++ *free = bh->b_size - sizeof(__u32); ++ ++ entry = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (name_index == entry->e_name_index && ++ name_len == entry->e_name_len && ++ memcmp(name, entry->e_name, name_len) == 0) { ++ memcpy(rentry, entry, sizeof(struct ext3_xattr_entry)); ++ error = 0; ++ } else { ++ *free -= EXT3_XATTR_LEN(entry->e_name_len); ++ *free -= le32_to_cpu(entry->e_value_size); ++ } ++ entry = next; ++ } ++ brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * ext3_xattr_inode_set() ++ * ++ * this routine add/remove/replace attribute in inode body ++ */ ++int ++ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, int name_index, ++ const char *name, const void *value, size_t value_len, ++ int flags) ++{ ++ struct 
ext3_xattr_entry *last, *next, *here = NULL; ++ struct ext3_inode *raw_inode; ++ int name_len = strlen(name); ++ int esize = EXT3_XATTR_LEN(name_len); ++ struct buffer_head *bh; ++ int err, storage_size; ++ struct ext3_iloc iloc; ++ int free, min_offs; ++ char *start, *end; ++ ++ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) ++ return -ENOSPC; ++ ++ err = ext3_get_inode_loc(inode, &iloc, 1); ++ if (err) ++ return err; ++ raw_inode = ext3_raw_inode(&iloc); ++ bh = iloc.bh; ++ ++ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - ++ EXT3_GOOD_OLD_INODE_SIZE - ++ EXT3_I(inode)->i_extra_isize - ++ sizeof(__u32); ++ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + ++ EXT3_I(inode)->i_extra_isize; ++ if ((*(__u32*) start) != EXT3_XATTR_MAGIC) { ++ /* inode had no attributes before */ ++ *((__u32*) start) = cpu_to_le32(EXT3_XATTR_MAGIC); ++ } ++ start += sizeof(__u32); ++ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; ++ min_offs = storage_size; ++ free = storage_size - sizeof(__u32); ++ ++ last = (struct ext3_xattr_entry *) start; ++ while (!IS_LAST_ENTRY(last)) { ++ next = EXT3_XATTR_NEXT(last); ++ if (le32_to_cpu(last->e_value_size) > storage_size || ++ (char *) next >= end) { ++ ext3_error(inode->i_sb, "ext3_xattr_ibody_set", ++ "inode %ld", inode->i_ino); ++ brelse(bh); ++ return -EIO; ++ } ++ ++ if (last->e_value_size) { ++ int offs = le16_to_cpu(last->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ if (name_index == last->e_name_index && ++ name_len == last->e_name_len && ++ !memcmp(name, last->e_name, name_len)) ++ here = last; ++ else { ++ /* we calculate all but our attribute ++ * because it will be removed before changing */ ++ free -= EXT3_XATTR_LEN(last->e_name_len); ++ free -= le32_to_cpu(last->e_value_size); ++ } ++ last = next; ++ } ++ ++ if (value && (esize + value_len > free)) { ++ brelse(bh); ++ return -ENOSPC; ++ } ++ ++ err = ext3_reserve_inode_write(handle, inode, &iloc); ++ if (err) { 
++ brelse(bh); ++ return err; ++ } ++ ++ if (here) { ++ /* time to remove old value */ ++ struct ext3_xattr_entry *e; ++ int size = le32_to_cpu(here->e_value_size); ++ int border = le16_to_cpu(here->e_value_offs); ++ char *src; ++ ++ /* move tail */ ++ memmove(start + min_offs + size, start + min_offs, ++ border - min_offs); ++ ++ /* recalculate offsets */ ++ e = (struct ext3_xattr_entry *) start; ++ while (!IS_LAST_ENTRY(e)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(e); ++ int offs = le16_to_cpu(e->e_value_offs); ++ if (offs < border) ++ e->e_value_offs = ++ cpu_to_le16(offs + size); ++ e = next; ++ } ++ min_offs += size; ++ ++ /* remove entry */ ++ border = EXT3_XATTR_LEN(here->e_name_len); ++ src = (char *) here + EXT3_XATTR_LEN(here->e_name_len); ++ size = (char *) last - src; ++ if ((char *) here + size > end) ++ printk("ALERT at %s:%d: 0x%p + %d > 0x%p\n", ++ __FILE__, __LINE__, here, size, end); ++ memmove(here, src, size); ++ last = (struct ext3_xattr_entry *) ((char *) last - border); ++ *((__u32 *) last) = 0; ++ } ++ ++ if (value) { ++ int offs = min_offs - value_len; ++ /* use last to create new entry */ ++ last->e_name_len = strlen(name); ++ last->e_name_index = name_index; ++ last->e_value_offs = cpu_to_le16(offs); ++ last->e_value_size = cpu_to_le32(value_len); ++ last->e_hash = last->e_value_block = 0; ++ memset(last->e_name, 0, esize); ++ memcpy(last->e_name, name, last->e_name_len); ++ if (start + offs + value_len > end) ++ printk("ALERT at %s:%d: 0x%p + %d + %zd > 0x%p\n", ++ __FILE__, __LINE__, start, offs, ++ value_len, end); ++ memcpy(start + offs, value, value_len); ++ last = EXT3_XATTR_NEXT(last); ++ *((__u32 *) last) = 0; ++ } ++ ++ ext3_mark_iloc_dirty(handle, inode, &iloc); ++ brelse(bh); ++ ++ return 0; ++} ++ ++/* + * ext3_xattr_set_handle() + * + * Create, replace or remove an extended attribute for this inode. 
Buffer +@@ -369,6 +859,104 @@ + */ + int + ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, ++ const char *name, const void *value, size_t value_len, ++ int flags) ++{ ++ struct ext3_xattr_entry entry; ++ int err, where = 0, found = 0, total; ++ int free1 = -1, free2 = -1; ++ int name_len; ++ ++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", ++ name_index, name, value, (long)value_len); ++ ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) ++ return -EPERM; ++ if (value == NULL) ++ value_len = 0; ++ if (name == NULL) ++ return -EINVAL; ++ name_len = strlen(name); ++ if (name_len > 255 || value_len > inode->i_sb->s_blocksize) ++ return -ERANGE; ++ down_write(&EXT3_I(inode)->xattr_sem); ++ ++ /* try to find attribute in inode body */ ++ err = ext3_xattr_ibody_find(inode, name_index, name, &entry, &free1); ++ if (err == 0) { ++ /* found EA in inode */ ++ found = 1; ++ where = 0; ++ } else if (err == -ENOENT) { ++ /* there is no such attribute in inode body */ ++ /* try to find attribute in dedicated block */ ++ err = ext3_xattr_block_find(inode, name_index, name, ++ &entry, &free2); ++ if (err != 0 && err != -ENOENT) { ++ /* not found EA in block */ ++ goto finish; ++ } else if (err == 0) { ++ /* found EA in block */ ++ where = 1; ++ found = 1; ++ } ++ } else ++ goto finish; ++ ++ /* check flags: may replace? may create ? 
*/ ++ if (found && (flags & XATTR_CREATE)) { ++ err = -EEXIST; ++ goto finish; ++ } else if (!found && (flags & XATTR_REPLACE)) { ++ err = -ENODATA; ++ goto finish; ++ } ++ ++ /* check if we have enough space to store attribute */ ++ total = EXT3_XATTR_LEN(strlen(name)) + value_len; ++ if (free1 >= 0 && total > free1 && free2 >= 0 && total > free2) { ++ /* have no enough space */ ++ err = -ENOSPC; ++ goto finish; ++ } ++ ++ /* time to remove attribute */ ++ if (found) { ++ if (where == 0) { ++ /* EA is stored in inode body */ ++ ext3_xattr_ibody_set(handle, inode, name_index, name, ++ NULL, 0, flags); ++ } else { ++ /* EA is stored in separated block */ ++ ext3_xattr_block_set(handle, inode, name_index, name, ++ NULL, 0, flags); ++ } ++ } ++ ++ /* try to store EA in inode body */ ++ err = ext3_xattr_ibody_set(handle, inode, name_index, name, ++ value, value_len, flags); ++ if (err) { ++ /* can't store EA in inode body */ ++ /* try to store in block */ ++ err = ext3_xattr_block_set(handle, inode, name_index, ++ name, value, value_len, flags); ++ } ++ ++finish: ++ up_write(&EXT3_I(inode)->xattr_sem); ++ return err; ++} ++ ++/* ++ * ext3_xattr_block_set() ++ * ++ * this routine add/remove/replace attribute in EA block ++ */ ++int ++ext3_xattr_block_set(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len, + int flags) + { +@@ -391,22 +979,7 @@ + * towards the end of the block). + * end -- Points right after the block pointed to by header. 
+ */ +- +- ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", +- name_index, name, value, (long)value_len); +- +- if (IS_RDONLY(inode)) +- return -EROFS; +- if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) +- return -EPERM; +- if (value == NULL) +- value_len = 0; +- if (name == NULL) +- return -EINVAL; + name_len = strlen(name); +- if (name_len > 255 || value_len > sb->s_blocksize) +- return -ERANGE; +- down_write(&EXT3_I(inode)->xattr_sem); + if (EXT3_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bh = sb_bread(sb, EXT3_I(inode)->i_file_acl); +@@ -638,7 +1211,6 @@ + brelse(bh); + if (!(bh && header == HDR(bh))) + kfree(header); +- up_write(&EXT3_I(inode)->xattr_sem); + + return error; + } +Index: linux-stage/fs/ext3/xattr.h +=================================================================== +--- linux-stage.orig/fs/ext3/xattr.h 2005-02-25 16:47:04.423975456 +0200 ++++ linux-stage/fs/ext3/xattr.h 2005-02-25 16:50:40.763086912 +0200 +@@ -67,7 +67,8 @@ + extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); + extern int ext3_xattr_list(struct inode *, char *, size_t); + extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int); +-extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); ++extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *,const void *,size_t,int); ++extern int ext3_xattr_block_set(handle_t *, struct inode *, int, const char *,const void *,size_t,int); + + extern void ext3_xattr_delete_inode(handle_t *, struct inode *); + extern void ext3_xattr_put_super(struct super_block *); +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 16:47:04.425975152 +0200 ++++ linux-stage/include/linux/ext3_fs.h 2005-02-25 16:50:40.765086608 +0200 +@@ -293,6 +293,8 @@ + __u32 
m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ ++ __u16 i_extra_isize; ++ __u16 i_pad1; + }; + + #define i_size_high i_dir_acl +@@ -757,6 +759,7 @@ + extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); ++int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc, int in_mem); + + extern void ext3_read_inode (struct inode *); + extern int ext3_write_inode (struct inode *, int); +Index: linux-stage/include/linux/ext3_fs_i.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_i.h 2005-02-25 16:47:04.426975000 +0200 ++++ linux-stage/include/linux/ext3_fs_i.h 2005-02-25 16:50:40.766086456 +0200 +@@ -113,6 +113,9 @@ + */ + loff_t i_disksize; + ++ /* on-disk additional length */ ++ __u16 i_extra_isize; ++ + /* + * truncate_sem is for serialising ext3_truncate() against + * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch new file mode 100644 index 0000000..78c5d81 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch @@ -0,0 +1,2831 @@ +Index: linux-stage/fs/ext3/extents.c +=================================================================== +--- linux-stage.orig/fs/ext3/extents.c 2005-02-25 15:33:48.890198160 +0200 ++++ linux-stage/fs/ext3/extents.c 2005-02-25 15:33:48.917194056 +0200 +@@ -0,0 +1,2313 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++/* ++ * Extents support for EXT3 ++ * ++ * TODO: ++ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() ++ * - ext3_ext_calc_credits() could take 'mergable' into account ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - smart tree reduction ++ * - arch-independence ++ * common on-disk format for big/little-endian arch ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} ++ ++static int inline ++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->get_write_access) ++ return tree->ops->get_write_access(h,tree->buffer); ++ else ++ return 0; ++} ++ ++static int inline ++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->mark_buffer_dirty) ++ return tree->ops->mark_buffer_dirty(h,tree->buffer); ++ else ++ return 0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ ++ if (path->p_bh) { ++ /* path points to block */ ++ 
err = ext3_journal_get_write_access(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_get_access_for_root(handle, tree); ++ } ++ return err; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ if (path->p_bh) { ++ /* path points to block */ ++ err =ext3_journal_dirty_metadata(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_mark_root_dirty(handle, tree); ++ } ++ return err; ++} ++ ++static int inline ++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, struct ext3_extent *ex, ++ int *err) ++{ ++ int goal, depth, newblock; ++ struct inode *inode; ++ ++ EXT_ASSERT(tree); ++ if (tree->ops->new_block) ++ return tree->ops->new_block(handle, tree, path, ex, err); ++ ++ inode = tree->inode; ++ depth = EXT_DEPTH(tree); ++ if (path && depth > 0) { ++ goal = path[depth-1].p_block; ++ } else { ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ ++ bg_start = (ei->i_block_group * ++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ goal = bg_start + colour; ++ } ++ ++ newblock = ext3_new_block(handle, inode, goal, err); ++ return newblock; ++} ++ ++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *neh; ++ neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation++; ++} ++ ++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; ++#endif ++ return size; ++} ++ ++static 
inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 5; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int k, l = path->p_depth; ++ ++ ext_debug(tree, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(tree, " %d->%d", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ } else if (path->p_ext) { ++ ext_debug(tree, " %d:%d:%d", ++ path->p_ext->ee_block, ++ path->p_ext->ee_len, ++ path->p_ext->ee_start); ++ } else ++ ext_debug(tree, " []"); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; ++ int i; ++ ++ if (!path) ++ return; ++ ++ eh = path[depth].p_hdr; ++ ex = EXT_FIRST_EXTENT(eh); ++ ++ for (i = 0; i < eh->eh_entries; i++, ex++) { ++ ext_debug(tree, "%d:%d:%d ", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_drop_refs(struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) ++ if (path->p_bh) { 
++ brelse(path->p_bh); ++ path->p_bh = NULL; ++ } ++} ++ ++/* ++ * binary search for closest index by given block ++ */ ++static inline void ++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent_idx *ix; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_entries > 0); ++ ++ ext_debug(tree, "binsearch for %d(idx): ", block); ++ ++ path->p_idx = ix = EXT_FIRST_INDEX(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ix[l + k].ei_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ix += l; ++ path->p_idx = ix; ++ ext_debug(tree, " -> %d->%d ", path->p_idx->ei_block, path->p_idx->ei_leaf); ++ ++ while (l++ < r) { ++ if (block < ix->ei_block) ++ break; ++ path->p_idx = ix++; ++ } ++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent_idx *chix; ++ ++ chix = ix = EXT_FIRST_INDEX(eh); ++ for (k = 0; k < eh->eh_entries; k++, ix++) { ++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { ++ printk("k=%d, ix=0x%p, first=0x%p\n", k, ++ ix, EXT_FIRST_INDEX(eh)); ++ printk("%u <= %u\n", ++ ix->ei_block,ix[-1].ei_block); ++ } ++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); ++ if (block < ix->ei_block) ++ break; ++ chix = ix; ++ } ++ EXT_ASSERT(chix == path->p_idx); ++ } ++#endif ++ ++} ++ ++/* ++ * binary search for closest extent by given block ++ */ ++static inline void ++ext3_ext_binsearch(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent *ex; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ ++ if (eh->eh_entries == 0) { ++ /* ++ * this leaf is 
empty yet: ++ * we get such a leaf in split/add case ++ */ ++ return; ++ } ++ ++ ext_debug(tree, "binsearch for %d: ", block); ++ ++ path->p_ext = ex = EXT_FIRST_EXTENT(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ex[l + k].ee_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ex += l; ++ path->p_ext = ex; ++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++ while (l++ < r) { ++ if (block < ex->ee_block) ++ break; ++ path->p_ext = ex++; ++ } ++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent *chex; ++ ++ chex = ex = EXT_FIRST_EXTENT(eh); ++ for (k = 0; k < eh->eh_entries; k++, ex++) { ++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); ++ if (block < ex->ee_block) ++ break; ++ chex = ex; ++ } ++ EXT_ASSERT(chex == path->p_ext); ++ } ++#endif ++ ++} ++ ++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *eh; ++ ++ BUG_ON(tree->buffer_len == 0); ++ ext3_ext_get_access_for_root(handle, tree); ++ eh = EXT_ROOT_HDR(tree); ++ eh->eh_depth = 0; ++ eh->eh_entries = 0; ++ eh->eh_magic = EXT3_EXT_MAGIC; ++ eh->eh_max = ext3_ext_space_root(tree); ++ ext3_ext_mark_root_dirty(handle, tree); ++ ext3_ext_invalidate_cache(tree); ++ return 0; ++} ++ ++struct ext3_ext_path * ++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ struct buffer_head *bh; ++ int depth, i, ppos = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ eh = EXT_ROOT_HDR(tree); ++ EXT_ASSERT(eh); ++ i = depth = EXT_DEPTH(tree); ++ EXT_ASSERT(eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(i == 0 || eh->eh_entries > 0); ++ ++ /* account possible depth 
increase */ ++ if (!path) { ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), ++ GFP_NOFS); ++ if (!path) ++ return ERR_PTR(-ENOMEM); ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[0].p_hdr = eh; ++ ++ /* walk through the tree */ ++ while (i) { ++ ext_debug(tree, "depth %d: num %d, max %d\n", ++ ppos, eh->eh_entries, eh->eh_max); ++ ext3_ext_binsearch_idx(tree, path + ppos, block); ++ path[ppos].p_block = path[ppos].p_idx->ei_leaf; ++ path[ppos].p_depth = i; ++ path[ppos].p_ext = NULL; ++ ++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); ++ if (!bh) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ return ERR_PTR(-EIO); ++ } ++ eh = EXT_BLOCK_HDR(bh); ++ ppos++; ++ EXT_ASSERT(ppos <= depth); ++ path[ppos].p_bh = bh; ++ path[ppos].p_hdr = eh; ++ i--; ++ } ++ ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ ++ /* find extent */ ++ ext3_ext_binsearch(tree, path + ppos, block); ++ ++ ext3_ext_show_path(tree, path); ++ ++ return path; ++} ++ ++/* ++ * insert new index [logical;ptr] into the block at cupr ++ * it check where to insert: before curp or after curp ++ */ ++static int ext3_ext_insert_index(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *curp, ++ int logical, int ptr) ++{ ++ struct ext3_extent_idx *ix; ++ int len, err; ++ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ return err; ++ ++ EXT_ASSERT(logical != curp->p_idx->ei_block); ++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; ++ if (logical > curp->p_idx->ei_block) { ++ /* insert after */ ++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d after: %d. 
" ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ } ++ ix = curp->p_idx + 1; ++ } else { ++ /* insert before */ ++ len = len * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d before: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->ei_block = logical; ++ ix->ei_leaf = ptr; ++ curp->p_hdr->eh_entries++; ++ ++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); ++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); ++ ++ err = ext3_ext_dirty(handle, tree, curp); ++ ext3_std_error(tree->inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at: ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extens and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ int i = at, k, m, a; ++ unsigned long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? 
*/ ++ /* FIXME: now desicion is simplest: at current extent */ ++ ++ /* if current leaf will be splitted, then we should use ++ * border from split point */ ++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].ee_block; ++ ext_debug(tree, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->ee_block; ++ ext_debug(tree, "leaf will be added." ++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. so, index won't ++ * be inserted and tree will be in consistent ++ * state. next mount will repair buffers too ++ */ ++ ++ /* ++ * get array to track all allocated blocks ++ * we need this to handle errors and free blocks ++ * upon them ++ */ ++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); ++ if (!ablocks) ++ return -ENOMEM; ++ memset(ablocks, 0, sizeof(unsigned long) * depth); ++ ++ /* allocate all needed blocks */ ++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); ++ for (a = 0; a < depth - at; a++) { ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ goto cleanup; ++ ablocks[a] = newblock; ++ } ++ ++ /* initialize new leaf */ ++ newblock = ablocks[--a]; ++ EXT_ASSERT(newblock); ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 0; ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_depth = 0; ++ ex = EXT_FIRST_EXTENT(neh); ++ ++ /* move remain of path[depth] to the new leaf */ ++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); ++ /* start copy from next extent */ ++ /* TODO: we 
could do it by single memmove */ ++ m = 0; ++ path[depth].p_ext++; ++ while (path[depth].p_ext <= ++ EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", ++ path[depth].p_ext->ee_block, ++ path[depth].p_ext->ee_start, ++ path[depth].p_ext->ee_len, ++ newblock); ++ memmove(ex++, path[depth].p_ext++, ++ sizeof(struct ext3_extent)); ++ neh->eh_entries++; ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old leaf */ ++ if (m) { ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ path[depth].p_hdr->eh_entries -= m; ++ if ((err = ext3_ext_dirty(handle, tree, path + depth))) ++ goto cleanup; ++ ++ } ++ ++ /* create intermediate indexes */ ++ k = depth - at - 1; ++ EXT_ASSERT(k >= 0); ++ if (k) ++ ext_debug(tree, "create %d intermediate indices\n", k); ++ /* insert new index into current index block */ ++ /* current depth stored in i var */ ++ i = depth - 1; ++ while (k--) { ++ oldblock = newblock; ++ newblock = ablocks[--a]; ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 1; ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ neh->eh_depth = depth - i; ++ fidx = EXT_FIRST_INDEX(neh); ++ fidx->ei_block = border; ++ fidx->ei_leaf = oldblock; ++ ++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", ++ i, newblock, border, oldblock); ++ /* copy indexes */ ++ m = 0; ++ path[i].p_idx++; ++ ++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); ++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == ++ EXT_LAST_INDEX(path[i].p_hdr)); ++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ++ 
ext_debug(tree, "%d: move %d:%d in new index %lu\n", ++ i, path[i].p_idx->ei_block, ++ path[i].p_idx->ei_leaf, newblock); ++ memmove(++fidx, path[i].p_idx++, ++ sizeof(struct ext3_extent_idx)); ++ neh->eh_entries++; ++ EXT_ASSERT(neh->eh_entries <= neh->eh_max); ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old index */ ++ if (m) { ++ err = ext3_ext_get_access(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ path[i].p_hdr->eh_entries -= m; ++ err = ext3_ext_dirty(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ } ++ ++ i--; ++ } ++ ++ /* insert new index */ ++ if (!err) ++ err = ext3_ext_insert_index(handle, tree, path + at, ++ border, newblock); ++ ++cleanup: ++ if (bh) { ++ if (buffer_locked(bh)) ++ unlock_buffer(bh); ++ brelse(bh); ++ } ++ ++ if (err) { ++ /* free all allocated blocks in error case */ ++ for (i = 0; i < depth; i++) { ++ if (!ablocks[i]) ++ continue; ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ } ++ } ++ kfree(ablocks); ++ ++ return err; ++} ++ ++/* ++ * routine implements tree growing procedure: ++ * - allocates new block ++ * - moves top-level data (index block or leaf) into the new block ++ * - initialize new top-level, creating index that points to the ++ * just created block ++ */ ++static int ext3_ext_grow_indepth(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp = path; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct buffer_head *bh; ++ unsigned long newblock; ++ int err = 0; ++ ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ return err; ++ ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ ext3_std_error(tree->inode->i_sb, err); ++ return err; ++ } ++ lock_buffer(bh); ++ 
++ if ((err = ext3_journal_get_create_access(handle, bh))) { ++ unlock_buffer(bh); ++ goto out; ++ } ++ ++ /* move top-level index/leaf into new block */ ++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); ++ ++ /* set size of new block */ ++ neh = EXT_BLOCK_HDR(bh); ++ /* old root could have indexes or leaves ++ * so calculate e_max right way */ ++ if (EXT_DEPTH(tree)) ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ else ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto out; ++ ++ /* create index in new top-level index: num,max,pointer */ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ goto out; ++ ++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; ++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); ++ curp->p_hdr->eh_entries = 1; ++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); ++ /* FIXME: it works, but actually path[0] can be index */ ++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; ++ curp->p_idx->ei_leaf = newblock; ++ ++ neh = EXT_ROOT_HDR(tree); ++ fidx = EXT_FIRST_INDEX(neh); ++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", ++ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); ++ ++ neh->eh_depth = path->p_depth + 1; ++ err = ext3_ext_dirty(handle, tree, curp); ++out: ++ brelse(bh); ++ ++ return err; ++} ++ ++/* ++ * routine finds empty index and adds new leaf. 
if no free index found ++ * then it requests in-depth growing ++ */ ++static int ext3_ext_create_new_leaf(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp; ++ int depth, i, err = 0; ++ ++repeat: ++ i = depth = EXT_DEPTH(tree); ++ ++ /* walk up to the tree and look for free index entry */ ++ curp = path + depth; ++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { ++ i--; ++ curp--; ++ } ++ ++ /* we use already allocated block for index block ++ * so, subsequent data blocks should be contigoues */ ++ if (EXT_HAS_FREE_INDEX(curp)) { ++ /* if we found index with free entry, then use that ++ * entry: create all needed subtree and add new leaf */ ++ err = ext3_ext_split(handle, tree, path, newext, i); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ } else { ++ /* tree is full, time to grow in depth */ ++ err = ext3_ext_grow_indepth(handle, tree, path, newext); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ ++ /* ++ * only first (depth 0 -> 1) produces free space ++ * in all other cases we have to split growed tree ++ */ ++ depth = EXT_DEPTH(tree); ++ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { ++ /* now we need split */ ++ goto repeat; ++ } ++ } ++ ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++/* ++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK ++ * NOTE: it consider block number from index entry as ++ * allocated block. 
thus, index entries have to be consistent ++ * with leafs ++ */ ++static unsigned long ++ext3_ext_next_allocated_block(struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ if (depth == 0 && path->p_ext == NULL) ++ return EXT_MAX_BLOCK; ++ ++ /* FIXME: what if index isn't full ?! */ ++ while (depth >= 0) { ++ if (depth == path->p_depth) { ++ /* leaf */ ++ if (path[depth].p_ext != ++ EXT_LAST_EXTENT(path[depth].p_hdr)) ++ return path[depth].p_ext[1].ee_block; ++ } else { ++ /* index */ ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ } ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * returns first allocated block from next leaf or EXT_MAX_BLOCK ++ */ ++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ /* zero-tree has no leaf blocks at all */ ++ if (depth == 0) ++ return EXT_MAX_BLOCK; ++ ++ /* go to index block */ ++ depth--; ++ ++ while (depth >= 0) { ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * if leaf gets modified and modified extent is first in the leaf ++ * then we have to correct all indexes above ++ * TODO: do we need to correct tree in all cases? 
++ */ ++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent *ex; ++ unsigned long border; ++ int k, err = 0; ++ ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(eh); ++ ++ if (depth == 0) { ++ /* there is no tree at all */ ++ return 0; ++ } ++ ++ if (ex != EXT_FIRST_EXTENT(eh)) { ++ /* we correct tree if first leaf got modified only */ ++ return 0; ++ } ++ ++ /* ++ * TODO: we need correction if border is smaller then current one ++ */ ++ k = depth - 1; ++ border = path[depth].p_ext->ee_block; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ return err; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ return err; ++ ++ while (k--) { ++ /* change all left-side indexes */ ++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) ++ break; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ break; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ break; ++ } ++ ++ return err; ++} ++ ++static int inline ++ext3_can_extents_be_merged(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block) ++ return 0; ++ ++#ifdef AGRESSIVE_TEST ++ if (ex1->ee_len >= 4) ++ return 0; ++#endif ++ ++ if (!tree->ops->mergable) ++ return 1; ++ ++ return tree->ops->mergable(ex1, ex2); ++} ++ ++/* ++ * this routine tries to merge requsted extent into the existing ++ * extent or inserts requested extent as new one into the tree, ++ * creating new leaf in no-space case ++ */ ++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_extent_header * eh; ++ struct ext3_extent *ex, *fex; ++ struct ext3_extent 
*nearex; /* nearest extent */ ++ struct ext3_ext_path *npath = NULL; ++ int depth, len, err, next; ++ ++ EXT_ASSERT(newext->ee_len > 0); ++ EXT_ASSERT(newext->ee_len < EXT_CACHE_MARK); ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(path[depth].p_hdr); ++ ++ /* try to insert block into found extent and return */ ++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { ++ ext_debug(tree, "append %d block to %d:%d (from %d)\n", ++ newext->ee_len, ex->ee_block, ex->ee_len, ++ ex->ee_start); ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ return err; ++ ex->ee_len += newext->ee_len; ++ eh = path[depth].p_hdr; ++ nearex = ex; ++ goto merge; ++ } ++ ++repeat: ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) ++ goto has_space; ++ ++ /* probably next leaf has space for us? */ ++ fex = EXT_LAST_EXTENT(eh); ++ next = ext3_ext_next_leaf_block(tree, path); ++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { ++ ext_debug(tree, "next leaf block - %d\n", next); ++ EXT_ASSERT(!npath); ++ npath = ext3_ext_find_extent(tree, next, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ EXT_ASSERT(npath->p_depth == path->p_depth); ++ eh = npath[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) { ++ ext_debug(tree, "next leaf isnt full(%d)\n", ++ eh->eh_entries); ++ path = npath; ++ goto repeat; ++ } ++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", ++ eh->eh_entries, eh->eh_max); ++ } ++ ++ /* ++ * there is no free space in found leaf ++ * we're gonna add new leaf in the tree ++ */ ++ err = ext3_ext_create_new_leaf(handle, tree, path, newext); ++ if (err) ++ goto cleanup; ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ ++has_space: ++ nearex = path[depth].p_ext; ++ ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ ++ if (!nearex) { ++ /* there is no extent in this leaf, create first one */ ++ ext_debug(tree, "first extent in the 
leaf: %d:%d:%d\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len); ++ path[depth].p_ext = EXT_FIRST_EXTENT(eh); ++ } else if (newext->ee_block > nearex->ee_block) { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ if (nearex != EXT_LAST_EXTENT(eh)) { ++ len = EXT_MAX_EXTENT(eh) - nearex; ++ len = (len - 1) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 2, nearex + 1, len); ++ } ++ path[depth].p_ext = nearex + 1; ++ } else { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 1, nearex, len); ++ path[depth].p_ext = nearex; ++ } ++ ++ eh->eh_entries++; ++ nearex = path[depth].p_ext; ++ nearex->ee_block = newext->ee_block; ++ nearex->ee_start = newext->ee_start; ++ nearex->ee_len = newext->ee_len; ++ /* FIXME: support for large fs */ ++ nearex->ee_start_hi = 0; ++ ++merge: ++ /* try to merge extents to the right */ ++ while (nearex < EXT_LAST_EXTENT(eh)) { ++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) ++ break; ++ /* merge with next extent! 
*/ ++ nearex->ee_len += nearex[1].ee_len; ++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { ++ len = (EXT_LAST_EXTENT(eh) - nearex - 1) ++ * sizeof(struct ext3_extent); ++ memmove(nearex + 1, nearex + 2, len); ++ } ++ eh->eh_entries--; ++ EXT_ASSERT(eh->eh_entries > 0); ++ } ++ ++ /* try to merge extents to the left */ ++ ++ /* time to correct all indexes above */ ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ if (err) ++ goto cleanup; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ ++cleanup: ++ if (npath) { ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ } ++ ext3_ext_tree_changed(tree); ++ ext3_ext_invalidate_cache(tree); ++ return err; ++} ++ ++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, ++ unsigned long num, ext_prepare_callback func) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent *ex, cbex; ++ unsigned long next, start = 0, end = 0; ++ unsigned long last = block + num; ++ int depth, exists, err = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(func); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ while (block < last && block != EXT_MAX_BLOCK) { ++ num = last - block; ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(tree, block, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ break; ++ } ++ ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(path[depth].p_hdr); ++ ex = path[depth].p_ext; ++ next = ext3_ext_next_allocated_block(path); ++ ++ exists = 0; ++ if (!ex) { ++ /* there is no extent yet, so try to allocate ++ * all requested space */ ++ start = block; ++ end = block + num; ++ } else if (ex->ee_block > block) { ++ /* need to allocate space before found extent */ ++ start = block; ++ end = ex->ee_block; ++ if (block + num < end) ++ end = block + num; ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ /* need to allocate space after found extent */ ++ start = block; ++ end = block + num; ++ if (end >= next) ++ end = next; ++ } else if (block >= 
ex->ee_block) { ++ /* ++ * some part of requested space is covered ++ * by found extent ++ */ ++ start = block; ++ end = ex->ee_block + ex->ee_len; ++ if (block + num < end) ++ end = block + num; ++ exists = 1; ++ } else { ++ BUG(); ++ } ++ EXT_ASSERT(end > start); ++ ++ if (!exists) { ++ cbex.ee_block = start; ++ cbex.ee_len = end - start; ++ cbex.ee_start = 0; ++ } else ++ cbex = *ex; ++ ++ EXT_ASSERT(path[depth].p_hdr); ++ err = func(tree, path, &cbex, exists); ++ ext3_ext_drop_refs(path); ++ ++ if (err < 0) ++ break; ++ if (err == EXT_REPEAT) ++ continue; ++ else if (err == EXT_BREAK) { ++ err = 0; ++ break; ++ } ++ ++ if (EXT_DEPTH(tree) != depth) { ++ /* depth was changed. we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ block = cbex.ee_block + cbex.ee_len; ++ } ++ ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ return err; ++} ++ ++static inline void ++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, ++ __u32 len, __u32 start, int type) ++{ ++ EXT_ASSERT(len > 0); ++ if (tree->cex) { ++ tree->cex->ec_type = type; ++ tree->cex->ec_block = block; ++ tree->cex->ec_len = len; ++ tree->cex->ec_start = start; ++ } ++} ++ ++/* ++ * this routine calculate boundaries of the gap requested block fits into ++ * and cache this gap ++ */ ++static inline void ++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ unsigned long block) ++{ ++ int depth = EXT_DEPTH(tree); ++ unsigned long lblock, len; ++ struct ext3_extent *ex; ++ ++ if (!tree->cex) ++ return; ++ ++ ex = path[depth].p_ext; ++ if (ex == NULL) { ++ /* there is no extent yet, so gap is [0;-] */ ++ lblock = 0; ++ len = EXT_MAX_BLOCK; ++ ext_debug(tree, "cache gap(whole file):"); ++ } else if (block < ex->ee_block) { ++ lblock = block; ++ len = ex->ee_block - block; ++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len); ++ } 
else if (block >= ex->ee_block + ex->ee_len) { ++ lblock = ex->ee_block + ex->ee_len; ++ len = ext3_ext_next_allocated_block(path); ++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) block); ++ EXT_ASSERT(len > lblock); ++ len = len - lblock; ++ } else { ++ lblock = len = 0; ++ BUG(); ++ } ++ ++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len); ++ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP); ++} ++ ++static inline int ++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, ++ struct ext3_extent *ex) ++{ ++ struct ext3_ext_cache *cex = tree->cex; ++ ++ /* is there cache storage at all? */ ++ if (!cex) ++ return EXT3_EXT_CACHE_NO; ++ ++ /* has cache valid data? */ ++ if (cex->ec_type == EXT3_EXT_CACHE_NO) ++ return EXT3_EXT_CACHE_NO; ++ ++ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || ++ cex->ec_type == EXT3_EXT_CACHE_EXTENT); ++ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { ++ ex->ee_block = cex->ec_block; ++ ex->ee_start = cex->ec_start; ++ ex->ee_len = cex->ec_len; ++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) ex->ee_start); ++ return cex->ec_type; ++ } ++ ++ /* not in cache */ ++ return EXT3_EXT_CACHE_NO; ++} ++ ++/* ++ * routine removes index from the index block ++ * it's used in truncate case only. 
thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->eh_entries); ++ if ((err = ext3_ext_get_access(handle, tree, path))) ++ return err; ++ path->p_hdr->eh_entries--; ++ if ((err = ext3_ext_dirty(handle, tree, path))) ++ return err; ++ ext_debug(tree, "index is empty, remove it, free block %d\n", ++ path->p_idx->ei_leaf); ++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ return err; ++} ++ ++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth = EXT_DEPTH(tree); ++ int needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) ++ return 1; ++ } ++ ++ /* ++ * the worste case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ depth = depth + 1; ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. 
we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ return needed; ++} ++ ++static int ++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, tex; ++ struct ext3_ext_path *npath; ++ int depth, creds, err; ++ ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); ++ EXT_ASSERT(ex->ee_block < start); ++ ++ /* calculate tail extent */ ++ tex.ee_block = end + 1; ++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); ++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; ++ ++ creds = ext3_ext_calc_credits_for_insert(tree, path); ++ handle = ext3_ext_journal_restart(handle, creds); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ /* calculate head extent. use primary extent */ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ return err; ++ ex->ee_len = start - ex->ee_block; ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ return err; ++ ++ /* FIXME: some callback to free underlying resource ++ * and correct ee_start? 
*/ ++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", ++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); ++ ++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); ++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); ++ ++ err = ext3_ext_insert_extent(handle, tree, npath, &tex); ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ ++ return err; ++ ++} ++ ++static int ++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, *fu = NULL, *lu, *le; ++ int err = 0, correct_index = 0; ++ int depth = EXT_DEPTH(tree), credits; ++ struct ext3_extent_header *eh; ++ unsigned a, b, block, num; ++ ++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); ++ if (!path[depth].p_hdr) ++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); ++ eh = path[depth].p_hdr; ++ EXT_ASSERT(eh); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* find where to start removing */ ++ le = ex = EXT_LAST_EXTENT(eh); ++ while (ex != EXT_FIRST_EXTENT(eh)) { ++ if (ex->ee_block <= end) ++ break; ++ ex--; ++ } ++ ++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { ++ /* removal of internal part of the extent requested ++ * tail and head must be placed in different extent ++ * so, we have to insert one more extent */ ++ path[depth].p_ext = ex; ++ return ext3_ext_split_for_rm(handle, tree, path, start, end); ++ } ++ ++ lu = ex; ++ while (ex >= EXT_FIRST_EXTENT(eh) && ++ ex->ee_block + ex->ee_len > start) { ++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); ++ path[depth].p_ext = ex; ++ ++ a = ex->ee_block > start ? ex->ee_block : start; ++ b = ex->ee_block + ex->ee_len - 1 < end ? 
++ ex->ee_block + ex->ee_len - 1 : end; ++ ++ ext_debug(tree, " border %u:%u\n", a, b); ++ ++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { ++ block = 0; ++ num = 0; ++ BUG(); ++ } else if (a != ex->ee_block) { ++ /* remove tail of the extent */ ++ block = ex->ee_block; ++ num = a - block; ++ } else if (b != ex->ee_block + ex->ee_len - 1) { ++ /* remove head of the extent */ ++ block = a; ++ num = b - a; ++ } else { ++ /* remove whole extent: excelent! */ ++ block = ex->ee_block; ++ num = 0; ++ EXT_ASSERT(a == ex->ee_block && ++ b == ex->ee_block + ex->ee_len - 1); ++ } ++ ++ if (ex == EXT_FIRST_EXTENT(eh)) ++ correct_index = 1; ++ ++ credits = 1; ++ if (correct_index) ++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; ++ if (tree->ops->remove_extent_credits) ++ credits+=tree->ops->remove_extent_credits(tree,ex,a,b); ++ ++ handle = ext3_ext_journal_restart(handle, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ goto out; ++ } ++ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ if (tree->ops->remove_extent) ++ err = tree->ops->remove_extent(tree, ex, a, b); ++ if (err) ++ goto out; ++ ++ if (num == 0) { ++ /* this extent is removed entirely mark slot unused */ ++ ex->ee_start = 0; ++ eh->eh_entries--; ++ fu = ex; ++ } ++ ++ ex->ee_block = block; ++ ex->ee_len = num; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ ext_debug(tree, "new extent: %u:%u:%u\n", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ ex--; ++ } ++ ++ if (fu) { ++ /* reuse unused slots */ ++ while (lu < le) { ++ if (lu->ee_start) { ++ *fu = *lu; ++ lu->ee_start = 0; ++ fu++; ++ } ++ lu++; ++ } ++ } ++ ++ if (correct_index && eh->eh_entries) ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ ++ /* if this leaf is free, then we should ++ * remove it from index block above */ ++ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) ++ err = ext3_ext_rm_idx(handle, tree, 
path + depth); ++ ++out: ++ return err; ++} ++ ++ ++static struct ext3_extent_idx * ++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) ++{ ++ struct ext3_extent_idx *ix; ++ ++ ix = EXT_LAST_INDEX(hdr); ++ while (ix != EXT_FIRST_INDEX(hdr)) { ++ if (ix->ei_block <= block) ++ break; ++ ix--; ++ } ++ return ix; ++} ++ ++/* ++ * returns 1 if current index have to be freed (even partial) ++ */ ++static int inline ++ext3_ext_more_to_rm(struct ext3_ext_path *path) ++{ ++ EXT_ASSERT(path->p_idx); ++ ++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) ++ return 0; ++ ++ /* ++ * if truncate on deeper level happened it it wasn't partial ++ * so we have to consider current index for truncation ++ */ ++ if (path->p_hdr->eh_entries == path->p_block) ++ return 0; ++ return 1; ++} ++ ++int ext3_ext_remove_space(struct ext3_extents_tree *tree, ++ unsigned long start, unsigned long end) ++{ ++ struct inode *inode = tree->inode; ++ struct super_block *sb = inode->i_sb; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_ext_path *path; ++ handle_t *handle; ++ int i = 0, err = 0; ++ ++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); ++ ++ /* probably first extent we're gonna free will be last in block */ ++ handle = ext3_journal_start(inode, depth + 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ ext3_ext_invalidate_cache(tree); ++ ++ /* ++ * we start scanning from right side freeing all the blocks ++ * after i_size and walking into the deep ++ */ ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); ++ if (!path) { /* kmalloc() reports failure as NULL, never as an ERR_PTR() value */ ++ ext3_error(sb, "ext3_ext_remove_space", ++ "Can't allocate path array"); ++ ext3_journal_stop(handle); ++ return -ENOMEM; ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[i].p_hdr = EXT_ROOT_HDR(tree); ++ ++ while (i >= 0 && err == 0) { ++ if (i == depth) { ++ /* this is leaf block */ ++ err = ext3_ext_rm_leaf(handle, tree, path, start, end); ++ /* root level have p_bh 
== NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ continue; ++ } ++ ++ /* this is index block */ ++ if (!path[i].p_hdr) { ++ ext_debug(tree, "initialize header\n"); ++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); ++ } ++ ++ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); ++ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); ++ ++ if (!path[i].p_idx) { ++ /* this level hasn't touched yet */ ++ path[i].p_idx = ++ ext3_ext_last_covered(path[i].p_hdr, end); ++ path[i].p_block = path[i].p_hdr->eh_entries + 1; ++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", ++ path[i].p_hdr, path[i].p_hdr->eh_entries); ++ } else { ++ /* we've already was here, see at next index */ ++ path[i].p_idx--; ++ } ++ ++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", ++ i, EXT_FIRST_INDEX(path[i].p_hdr), ++ path[i].p_idx); ++ if (ext3_ext_more_to_rm(path + i)) { ++ /* go to the next level */ ++ ext_debug(tree, "move to level %d (block %d)\n", ++ i + 1, path[i].p_idx->ei_leaf); ++ memset(path + i + 1, 0, sizeof(*path)); ++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); ++ if (!path[i+1].p_bh) { ++ /* should we reset i_size? 
*/ ++ err = -EIO; ++ break; ++ } ++ /* put actual number of indexes to know is this ++ * number got changed at the next iteration */ ++ path[i].p_block = path[i].p_hdr->eh_entries; ++ i++; ++ } else { ++ /* we finish processing this index, go up */ ++ if (path[i].p_hdr->eh_entries == 0 && i > 0) { ++ /* index is empty, remove it ++ * handle must be already prepared by the ++ * truncatei_leaf() */ ++ err = ext3_ext_rm_idx(handle, tree, path + i); ++ } ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ ext_debug(tree, "return to level %d\n", i); ++ } ++ } ++ ++ /* TODO: flexible tree reduction should be here */ ++ if (path->p_hdr->eh_entries == 0) { ++ /* ++ * truncate to zero freed all the tree ++ * so, we need to correct eh_depth ++ */ ++ err = ext3_ext_get_access(handle, tree, path); ++ if (err == 0) { ++ EXT_ROOT_HDR(tree)->eh_depth = 0; ++ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree); ++ err = ext3_ext_dirty(handle, tree, path); ++ } ++ } ++ ext3_ext_tree_changed(tree); ++ ++ kfree(path); ++ ext3_journal_stop(handle); ++ ++ return err; ++} ++ ++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks) ++{ ++ int lcap, icap, rcap, leafs, idxs, num; ++ ++ rcap = ext3_ext_space_root(tree); ++ if (blocks <= rcap) { ++ /* all extents fit to the root */ ++ return 0; ++ } ++ ++ rcap = ext3_ext_space_root_idx(tree); ++ lcap = ext3_ext_space_block(tree); ++ icap = ext3_ext_space_block_idx(tree); ++ ++ num = leafs = (blocks + lcap - 1) / lcap; ++ if (leafs <= rcap) { ++ /* all pointers to leafs fit to the root */ ++ return leafs; ++ } ++ ++ /* ok. 
we need separate index block(s) to link all leaf blocks */ ++ idxs = (leafs + icap - 1) / icap; ++ do { ++ num += idxs; ++ idxs = (idxs + icap - 1) / icap; ++ } while (idxs > rcap); ++ ++ return num; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) { ++ printk("EXT3-fs: file extents enabled"); ++#ifdef AGRESSIVE_TEST ++ printk(", agressive tests"); ++#endif ++#ifdef CHECK_BINSEARCH ++ printk(", check binsearch"); ++#endif ++ printk("\n"); ++ } ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++} ++ ++/************************************************************************ ++ * VFS related routines ++ ************************************************************************/ ++ ++static int ext3_get_inode_write_access(handle_t *handle, void *buffer) ++{ ++ /* we use in-core data, not bh */ ++ return 0; ++} ++ ++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct inode *inode = buffer; ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static int ext3_ext_mergable(struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ /* FIXME: support for large fs */ ++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) ++ return 1; ++ return 0; ++} ++ ++static int ++ext3_remove_blocks_credits(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed; ++ ++ /* at present, extent can't cross block group */; ++ needed = 4; /* bitmap + group desc + sb + inode */ ++ ++#ifdef CONFIG_QUOTA ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ return needed; ++} ++ ++static int ++ext3_remove_blocks(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed = ext3_remove_blocks_credits(tree, ex, from, to); ++ handle_t *handle = 
ext3_journal_start(tree->inode, needed); ++ struct buffer_head *bh; ++ int i; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { ++ /* tail removal */ ++ unsigned long num, start; ++ num = ex->ee_block + ex->ee_len - from; ++ start = ex->ee_start + ex->ee_len - num; ++ ext_debug(tree, "free last %lu blocks starting %lu\n", ++ num, start); ++ for (i = 0; i < num; i++) { ++ bh = sb_find_get_block(tree->inode->i_sb, start + i); ++ ext3_forget(handle, 0, tree->inode, bh, start + i); ++ } ++ ext3_free_blocks(handle, tree->inode, start, num); ++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { ++ printk("strange request: removal %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } else { ++ printk("strange request: removal(2) %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } ++ ext3_journal_stop(handle); ++ return 0; ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, ++ struct ext3_ext_path *path, unsigned long block) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ struct ext3_extent *ex; ++ depth = path->p_depth; ++ ++ /* try to predict block placement */ ++ if ((ex = path[depth].p_ext)) ++ return ex->ee_start + (block - ex->ee_block); ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. 
use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour + block; ++} ++ ++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int *err) ++{ ++ struct inode *inode = tree->inode; ++ int newblock, goal; ++ ++ EXT_ASSERT(path); ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->ee_start); ++ EXT_ASSERT(ex->ee_len); ++ ++ /* reuse block from the extent to order data/metadata */ ++ newblock = ex->ee_start++; ++ ex->ee_len--; ++ if (ex->ee_len == 0) { ++ ex->ee_len = 1; ++ /* allocate new block for the extent */ ++ goal = ext3_ext_find_goal(inode, path, ex->ee_block); ++ ex->ee_start = ext3_new_block(handle, inode, goal, err); ++ if (ex->ee_start == 0) { ++ /* error occured: restore old extent */ ++ ex->ee_start = newblock; ++ return 0; ++ } ++ } ++ return newblock; ++} ++ ++static struct ext3_extents_helpers ext3_blockmap_helpers = { ++ .get_write_access = ext3_get_inode_write_access, ++ .mark_buffer_dirty = ext3_mark_buffer_dirty, ++ .mergable = ext3_ext_mergable, ++ .new_block = ext3_new_block_cb, ++ .remove_extent = ext3_remove_blocks, ++ .remove_extent_credits = ext3_remove_blocks_credits, ++}; ++ ++void ext3_init_tree_desc(struct ext3_extents_tree *tree, ++ struct inode *inode) ++{ ++ tree->inode = inode; ++ tree->root = (void *) EXT3_I(inode)->i_data; ++ tree->buffer = (void *) inode; ++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); ++ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; ++ tree->ops = &ext3_blockmap_helpers; ++} ++ ++int ext3_ext_get_block(handle_t *handle, struct inode *inode, ++ long iblock, struct buffer_head *bh_result, ++ int create, int extend_disksize) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent newex; ++ struct 
ext3_extent *ex; ++ int goal, newblock, err = 0, depth; ++ struct ext3_extents_tree tree; ++ ++ clear_buffer_new(bh_result); ++ ext3_init_tree_desc(&tree, inode); ++ ext_debug(&tree, "block %d requested for inode %u\n", ++ (int) iblock, (unsigned) inode->i_ino); ++ down(&EXT3_I(inode)->truncate_sem); ++ ++ /* check in cache */ ++ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { ++ if (goal == EXT3_EXT_CACHE_GAP) { ++ if (!create) { ++ /* block isn't allocated yet and ++ * user don't want to allocate it */ ++ goto out2; ++ } ++ /* we should allocate requested block */ ++ } else if (goal == EXT3_EXT_CACHE_EXTENT) { ++ /* block is already allocated */ ++ newblock = iblock - newex.ee_block + newex.ee_start; ++ goto out; ++ } else { ++ EXT_ASSERT(0); ++ } ++ } ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(&tree, iblock, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ goto out2; ++ } ++ ++ depth = EXT_DEPTH(&tree); ++ ++ /* ++ * consistent leaf must not be empty ++ * this situations is possible, though, _during_ tree modification ++ * this is why assert can't be put in ext3_ext_find_extent() ++ */ ++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); ++ ++ if ((ex = path[depth].p_ext)) { ++ /* if found exent covers block, simple return it */ ++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { ++ newblock = iblock - ex->ee_block + ex->ee_start; ++ ext_debug(&tree, "%d fit into %d:%d -> %d\n", ++ (int) iblock, ex->ee_block, ex->ee_len, ++ newblock); ++ ext3_ext_put_in_cache(&tree, ex->ee_block, ++ ex->ee_len, ex->ee_start, ++ EXT3_EXT_CACHE_EXTENT); ++ goto out; ++ } ++ } ++ ++ /* ++ * requested block isn't allocated yet ++ * we couldn't try to create block if create flag is zero ++ */ ++ if (!create) { ++ /* put just found gap into cache to speedup subsequest reqs */ ++ ext3_ext_put_gap_in_cache(&tree, path, iblock); ++ goto out2; ++ } ++ ++ /* allocate new block */ ++ goal = 
ext3_ext_find_goal(inode, path, iblock); ++ newblock = ext3_new_block(handle, inode, goal, &err); ++ if (!newblock) ++ goto out2; ++ ext_debug(&tree, "allocate new block: goal %d, found %d\n", ++ goal, newblock); ++ ++ /* try to insert new extent into found leaf and return */ ++ newex.ee_block = iblock; ++ newex.ee_start = newblock; ++ newex.ee_len = 1; ++ err = ext3_ext_insert_extent(handle, &tree, path, &newex); ++ if (err) ++ goto out2; ++ ++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ++ /* previous routine could use block we allocated */ ++ newblock = newex.ee_start; ++ set_buffer_new(bh_result); ++ ++ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, ++ newex.ee_start, EXT3_EXT_CACHE_EXTENT); ++out: ++ ext3_ext_show_leaf(&tree, path); ++ map_bh(bh_result, inode->i_sb, newblock); ++out2: ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ up(&EXT3_I(inode)->truncate_sem); ++ ++ return err; ++} ++ ++void ext3_ext_truncate(struct inode * inode, struct page *page) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct super_block *sb = inode->i_sb; ++ struct ext3_extents_tree tree; ++ unsigned long last_block; ++ handle_t *handle; ++ int err = 0; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ err = ext3_writepage_trans_blocks(inode) + 3; ++ handle = ext3_journal_start(inode, err); ++ if (IS_ERR(handle)) { ++ if (page) { ++ clear_highpage(page); ++ flush_dcache_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ return; ++ } ++ ++ if (page) ++ ext3_block_truncate_page(handle, page, mapping, inode->i_size); ++ ++ down(&EXT3_I(inode)->truncate_sem); ++ ext3_ext_invalidate_cache(&tree); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need not scaning at all, ++ * because page truncation is enough ++ */ ++ if (ext3_orphan_add(handle, inode)) ++ 
goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ last_block = (inode->i_size + sb->s_blocksize - 1) ++ >> EXT3_BLOCK_SIZE_BITS(sb); ++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. ++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ up(&EXT3_I(inode)->truncate_sem); ++ ext3_journal_stop(handle); ++} ++ ++/* ++ * this routine calculate max number of blocks we could modify ++ * in order to allocate new block for an inode ++ */ ++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) ++{ ++ struct ext3_extents_tree tree; ++ int needed; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); ++ ++ /* caller want to allocate num blocks */ ++ needed *= num; ++ ++#ifdef CONFIG_QUOTA ++ /* ++ * FIXME: real calculation should be here ++ * it depends on blockmap format of qouta file ++ */ ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return needed; ++} ++ ++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ext3_extent_tree_init(handle, &tree); ++} ++ ++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ return ext3_ext_calc_metadata_amount(&tree, blocks); ++} ++ ++static int 
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newex, int exist) ++{ ++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; ++ ++ if (!exist) ++ return EXT_CONTINUE; ++ if (buf->err < 0) ++ return EXT_BREAK; ++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) ++ return EXT_BREAK; ++ ++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { ++ buf->err++; ++ buf->cur += sizeof(*newex); ++ } else { ++ buf->err = -EFAULT; ++ return EXT_BREAK; ++ } ++ return EXT_CONTINUE; ++} ++ ++static int ++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int exist) ++{ ++ struct ext3_extent_tree_stats *buf = ++ (struct ext3_extent_tree_stats *) tree->private; ++ int depth; ++ ++ if (!exist) ++ return EXT_CONTINUE; ++ ++ depth = EXT_DEPTH(tree); ++ buf->extents_num++; ++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) ++ buf->leaf_num++; ++ return EXT_CONTINUE; ++} ++ ++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err = 0; ++ ++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) ++ return -EINVAL; ++ ++ if (cmd == EXT3_IOC_GET_EXTENTS) { ++ struct ext3_extent_buf buf; ++ struct ext3_extents_tree tree; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ ++ ext3_init_tree_desc(&tree, inode); ++ buf.cur = buf.buffer; ++ buf.err = 0; ++ tree.private = &buf; ++ down(&EXT3_I(inode)->truncate_sem); ++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, ++ ext3_ext_store_extent_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (err == 0) ++ err = buf.err; ++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { ++ struct ext3_extent_tree_stats buf; ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ down(&EXT3_I(inode)->truncate_sem); ++ buf.depth = EXT_DEPTH(&tree); ++ buf.extents_num = 0; ++ buf.leaf_num 
= 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, ++ ext3_ext_collect_stats_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (!err && copy_to_user((void *) arg, &buf, sizeof(buf))) ++ err = -EFAULT; /* copy_to_user() returns the uncopied byte count, not an errno */ ++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { ++ struct ext3_extents_tree tree; ++ ext3_init_tree_desc(&tree, inode); ++ down(&EXT3_I(inode)->truncate_sem); ++ err = EXT_DEPTH(&tree); ++ up(&EXT3_I(inode)->truncate_sem); ++ } ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(ext3_init_tree_desc); ++EXPORT_SYMBOL(ext3_mark_inode_dirty); ++EXPORT_SYMBOL(ext3_ext_invalidate_cache); ++EXPORT_SYMBOL(ext3_ext_insert_extent); ++EXPORT_SYMBOL(ext3_ext_walk_space); ++EXPORT_SYMBOL(ext3_ext_find_goal); ++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); ++ +Index: linux-stage/fs/ext3/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/ialloc.c 2005-02-25 14:50:50.304202816 +0200 ++++ linux-stage/fs/ext3/ialloc.c 2005-02-25 15:33:48.920193600 +0200 +@@ -646,6 +646,10 @@ + DQUOT_FREE_INODE(inode); + goto fail2; + } ++ if (test_opt(sb, EXTENTS)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ } + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2005-02-25 14:50:50.309202056 +0200 ++++ linux-stage/fs/ext3/inode.c 2005-02-25 15:36:51.846384592 +0200 +@@ -796,6 +796,17 @@ + goto reread; + } + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create, int extend_disksize) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create, ++ extend_disksize); ++ return ext3_get_block_handle(handle, inode, block, bh, create, ++ extend_disksize); ++} ++ + static int 
ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) + { +@@ -806,8 +817,8 @@ + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, +- bh_result, create, 1); ++ ret = ext3_get_block_wrap(handle, inode, iblock, ++ bh_result, create, 1); + return ret; + } + +@@ -851,7 +862,7 @@ + + get_block: + if (ret == 0) +- ret = ext3_get_block_handle(handle, inode, iblock, ++ ret = ext3_get_block_wrap(handle, inode, iblock, + bh_result, create, 0); + bh_result->b_size = (1 << inode->i_blkbits); + return ret; +@@ -871,7 +882,7 @@ + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); ++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1589,7 +1600,7 @@ + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, struct page *page, ++int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) + { + unsigned long index = from >> PAGE_CACHE_SHIFT; +@@ -2087,6 +2098,9 @@ + return; + } + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode, page); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { +@@ -2814,6 +2828,9 @@ + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 
5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +Index: linux-stage/fs/ext3/Makefile +=================================================================== +--- linux-stage.orig/fs/ext3/Makefile 2005-02-25 14:49:42.168561008 +0200 ++++ linux-stage/fs/ext3/Makefile 2005-02-25 15:39:28.384587168 +0200 +@@ -5,7 +5,7 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\ +- ioctl.o namei.o super.o symlink.o hash.o resize.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2005-02-25 14:52:33.550506992 +0200 ++++ linux-stage/fs/ext3/super.c 2005-02-25 15:38:10.474431312 +0200 +@@ -394,6 +394,7 @@ + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -457,6 +458,10 @@ + #endif + ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + ei->vfs_inode.i_version = 1; ++ ei->i_cached_extent[0] = 0; ++ ei->i_cached_extent[1] = 0; ++ ei->i_cached_extent[2] = 0; ++ ei->i_cached_extent[3] = 0; + return &ei->vfs_inode; + } + +@@ -589,7 +594,7 @@ + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, +- Opt_ignore, Opt_barrier, Opt_err, Opt_resize, ++ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug, + }; + + static match_table_t tokens = { +@@ -639,6 +644,8 @@ + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, ++ 
{Opt_extents, "extents"}, ++ {Opt_extdebug, "extdebug"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -943,6 +950,12 @@ + match_int(&args[0], &option); + *n_blocks_count = option; + break; ++ case Opt_extents: ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_extdebug: ++ set_opt (sbi->s_mount_opt, EXTDEBUG); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1625,6 +1638,8 @@ + percpu_counter_mod(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + ++ ext3_ext_init(sb); ++ + return 0; + + failed_mount3: +Index: linux-stage/fs/ext3/ioctl.c +=================================================================== +--- linux-stage.orig/fs/ext3/ioctl.c 2005-02-25 14:37:28.971023976 +0200 ++++ linux-stage/fs/ext3/ioctl.c 2005-02-25 15:33:48.938190864 +0200 +@@ -124,6 +124,10 @@ + err = ext3_change_inode_journal_flag(inode, jflag); + return err; + } ++ case EXT3_IOC_GET_EXTENTS: ++ case EXT3_IOC_GET_TREE_STATS: ++ case EXT3_IOC_GET_TREE_DEPTH: ++ return ext3_ext_ioctl(inode, filp, cmd, arg); + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 14:53:56.424908168 +0200 ++++ linux-stage/include/linux/ext3_fs.h 2005-02-25 15:39:12.841950008 +0200 +@@ -186,6 +186,7 @@ + #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ + #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + + #define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ +@@ -237,6 +238,9 @@ + #endif + #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) + #define 
EXT3_IOC_SETRSVSZ _IOW('f', 6, long) ++#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) ++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) ++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) + + /* + * Structure of an inode on the disk +@@ -359,6 +363,9 @@ + #define EXT3_MOUNT_RESERVATION 0x20000 /* Preallocation */ + #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ ++ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -756,6 +763,7 @@ + + + /* inode.c */ ++extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); + extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +@@ -836,6 +844,16 @@ + extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int, int); ++extern void ext3_ext_truncate(struct inode *, struct page *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); ++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); ++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); + + #endif /* __KERNEL__ */ + +Index: linux-stage/include/linux/ext3_extents.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_extents.h 2005-02-25 
15:33:48.891198008 +0200 ++++ linux-stage/include/linux/ext3_extents.h 2005-02-25 15:33:48.944189952 +0200 +@@ -0,0 +1,252 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#ifndef _LINUX_EXT3_EXTENTS ++#define _LINUX_EXT3_EXTENTS ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if CHECK_BINSEARCH defined, then results of binary search ++ * will be checked by linear search ++ */ ++#define CHECK_BINSEARCH_ ++ ++/* ++ * if EXT_DEBUG is defined you can use 'extdebug' mount option ++ * to get lots of info what's going on (the macro has no trailing ';' so it stays one statement in an unbraced if/else) ++ */ ++#define EXT_DEBUG_ ++#ifdef EXT_DEBUG ++#define ext_debug(tree,fmt,a...) \ ++do { \ ++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0) ++#else ++#define ext_debug(tree,fmt,a...) ++#endif ++ ++/* ++ * if EXT_STATS is defined then stats numbers are collected ++ * these number will be displayed at umount time ++ */ ++#define EXT_STATS_ ++ ++ ++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. 
+ sb */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 mean there is no tree yet. all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 ee_block; /* first logical block extent covers */ ++ __u16 ee_len; /* number of blocks covered by extent */ ++ __u16 ee_start_hi; /* high 16 bits of physical block */ ++ __u32 ee_start; /* low 32 bigs of physical block */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 ei_block; /* index covers logical blocks from 'block' */ ++ __u32 ei_leaf; /* pointer to the physical block of the next * ++ * level. leaf or next index could bet here */ ++ __u16 ei_leaf_hi; /* high 16 bits of physical block */ ++ __u16 ei_unused; ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 eh_magic; /* probably will support different formats */ ++ __u16 eh_entries; /* number of valid entries */ ++ __u16 eh_max; /* capacity of store in entries */ ++ __u16 eh_depth; /* has tree real underlaying blocks? 
*/ ++ __u32 eh_generation; /* generation of the tree */ ++}; ++ ++#define EXT3_EXT_MAGIC 0xf30a ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++/* ++ * structure for external API ++ */ ++ ++/* ++ * storage for cached extent ++ */ ++struct ext3_ext_cache { ++ __u32 ec_start; ++ __u32 ec_block; ++ __u32 ec_len; ++ __u32 ec_type; ++}; ++ ++#define EXT3_EXT_CACHE_NO 0 ++#define EXT3_EXT_CACHE_GAP 1 ++#define EXT3_EXT_CACHE_EXTENT 2 ++ ++/* ++ * ext3_extents_tree is used to pass initial information ++ * to top-level extents API ++ */ ++struct ext3_extents_helpers; ++struct ext3_extents_tree { ++ struct inode *inode; /* inode which tree belongs to */ ++ void *root; /* ptr to data top of tree resides at */ ++ void *buffer; /* will be passed as arg to ^^ routines */ ++ int buffer_len; ++ void *private; ++ struct ext3_ext_cache *cex;/* last found extent */ ++ struct ext3_extents_helpers *ops; ++}; ++ ++struct ext3_extents_helpers { ++ int (*get_write_access)(handle_t *h, void *buffer); ++ int (*mark_buffer_dirty)(handle_t *h, void *buffer); ++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); ++ int (*remove_extent_credits)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*remove_extent)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*new_block)(handle_t *, struct ext3_extents_tree *, ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); ++}; ++ ++/* ++ * to be called by ext3_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext3_ext_walk_space(), see below ++ * callback must return valid 
extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, ++ struct ext3_ext_path *, ++ struct ext3_extent *, int); ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++ ++#define EXT_MAX_BLOCK 0xffffffff ++#define EXT_CACHE_MARK 0xffff ++ ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++ ++#define EXT_ROOT_HDR(tree) \ ++ ((struct ext3_extent_header *) (tree)->root) ++#define EXT_BLOCK_HDR(bh) \ ++ ((struct ext3_extent_header *) (bh)->b_data) ++#define EXT_DEPTH(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) ++#define EXT_GENERATION(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++ ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++ ++/* ++ * this structure is used to gather extents from the tree via ioctl ++ */ ++struct ext3_extent_buf { ++ unsigned long start; ++ int buflen; ++ void *buffer; ++ void *cur; ++ int err; ++}; ++ ++/* ++ * this structure is used to collect stats info about the tree ++ */ ++struct ext3_extent_tree_stats { ++ int depth; ++ int extents_num; ++ int leaf_num; ++}; ++ ++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); ++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct 
ext3_ext_path *); ++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); ++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); ++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); ++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); ++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); ++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); ++ ++static inline void ++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) ++{ ++ if (tree->cex) ++ tree->cex->ec_type = EXT3_EXT_CACHE_NO; ++} ++ ++ ++#endif /* _LINUX_EXT3_EXTENTS */ ++ +Index: linux-stage/include/linux/ext3_fs_i.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_i.h 2005-02-25 14:50:50.320200384 +0200 ++++ linux-stage/include/linux/ext3_fs_i.h 2005-02-25 15:33:48.945189800 +0200 +@@ -128,6 +128,8 @@ + */ + struct semaphore truncate_sem; + struct inode vfs_inode; ++ ++ __u32 i_cached_extent[4]; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff --git a/lustre/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.diff b/ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch similarity index 100% rename from lustre/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.diff rename to ldiskfs/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch new file mode 100644 index 0000000..fcceb30 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch @@ -0,0 +1,2236 @@ +Index: linux-stage/fs/ext3/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext3/mballoc.c 
2005-02-25 17:28:41.836311072 +0200 ++++ linux-stage/fs/ext3/mballoc.c 2005-02-25 17:28:41.859307576 +0200 +@@ -0,0 +1,1847 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++ ++/* ++ * mballoc.c contains the multiblocks allocation routines ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * TODO: ++ * - track min/max extents in each group for better group selection ++ * - is it worthwhile to use buddies directly if req is 2^N blocks? ++ * - mb_mark_used() may allocate chunk right after splitting buddy ++ * - special flag to advice allocator to look for requested + N blocks ++ * this may improve interaction between extents and mballoc ++ * - tree of groups sorted by number of free blocks ++ * - percpu reservation code (hotpath) ++ * - error handling ++ */ ++ ++/* ++ * with AGRESSIVE_CHECK allocator runs consistency checks over ++ * structures. these checks slow things down a lot ++ */ ++#define AGGRESSIVE_CHECK__ ++ ++/* ++ * with MBALLOC_STATS allocator will collect stats that will be ++ * shown at umount. The collecting costs though! ++ */ ++#define MBALLOC_STATS ++ ++/* ++ */ ++#define MB_DEBUG__ ++#ifdef MB_DEBUG ++#define mb_debug(fmt,a...) 
printk(fmt, ##a) ++#else ++#define mb_debug(fmt,a...) ++#endif ++ ++/* ++ * where to save buddies structures between umount/mount (clean case only) ++ */ ++#define EXT3_BUDDY_FILE ".buddy" ++ ++/* ++ * How long mballoc can look for a best extent (in found extents) ++ */ ++#define EXT3_MB_MAX_TO_SCAN 100 ++ ++/* ++ * This structure is on-disk description of a group for mballoc ++ */ ++struct ext3_mb_group_descr { ++ __u16 mgd_first_free; /* first free block in the group */ ++ __u16 mgd_free; /* number of free blocks in the group */ ++ __u16 mgd_counters[16]; /* number of free blocks by order */ ++}; ++ ++/* ++ * This structure is header of mballoc's file ++ */ ++struct ext3_mb_grp_header { ++ __u32 mh_magic; ++}; ++ ++#define EXT3_MB_MAGIC_V1 0xbaad16fc ++ ++ ++struct ext3_free_extent { ++ __u16 fe_start; ++ __u16 fe_len; ++ __u16 fe_group; ++}; ++ ++struct ext3_allocation_context { ++ struct super_block *ac_sb; ++ ++ /* search goals */ ++struct ext3_free_extent ac_g_ex; ++ ++ /* the best found extent */ ++ struct ext3_free_extent ac_b_ex; ++ ++ /* number of iterations done. 
we have to track to limit searching */ ++ unsigned long ac_ex_scanned; ++ __u16 ac_groups_scanned; ++ __u16 ac_found; ++ __u8 ac_status; ++ __u8 ac_flags; /* allocation hints */ ++ __u8 ac_repeats; ++}; ++ ++#define AC_STATUS_CONTINUE 1 ++#define AC_STATUS_FOUND 2 ++#define AC_STATUS_BREAK 3 ++ ++struct ext3_buddy { ++ struct buffer_head *bd_bh; ++ struct buffer_head *bd_bh2; ++ struct ext3_buddy_group_blocks *bd_bd; ++ struct super_block *bd_sb; ++ __u16 bd_blkbits; ++ __u16 bd_group; ++}; ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data) ++ ++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) ++ ++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); ++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); ++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); ++int ext3_mb_reserve_blocks(struct super_block *, int); ++void ext3_mb_release_blocks(struct super_block *, int); ++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); ++void ext3_mb_free_committed_blocks(struct super_block *); ++ ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ if ((unsigned long)addr & 1) { \ ++ bit += 8; \ ++ addr--; \ ++ } \ ++ if ((unsigned long)addr & 2) { \ ++ bit += 16; \ ++ addr--; \ ++ addr--; \ ++ } \ ++} ++ ++static inline int mb_test_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ return test_bit(bit, addr); ++} ++ ++static inline void mb_set_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ __set_bit(bit, addr); ++} ++ ++static inline void mb_set_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ set_bit(bit, addr); ++} ++ ++static inline void mb_clear_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ __clear_bit(bit, addr); ++} ++ ++static inline void mb_clear_bit_atomic(int bit, void *addr) ++{ ++ 
mb_correct_addr_and_bit(bit,addr); ++ clear_bit(bit, addr); ++} ++ ++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) ++{ ++ int i = 1; ++ char *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(max != NULL); ++ ++ if (order > e3b->bd_blkbits + 1) ++ return NULL; ++ ++ /* at order 0 we see each particular block */ ++ *max = 1 << (e3b->bd_blkbits + 3); ++ if (order == 0) ++ return EXT3_MB_BITMAP(e3b); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ *max = *max >> 1; ++ while (i < order) { ++ bb += 1 << (e3b->bd_blkbits - i); ++ i++; ++ *max = *max >> 1; ++ } ++ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) < ++ e3b->bd_sb->s_blocksize); ++ return bb; ++} ++ ++static int ext3_mb_load_buddy(struct super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap); ++ J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy); ++ ++ /* load bitmap */ ++ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap); ++ if (e3b->bd_bh == NULL) { ++ ext3_error(sb, "ext3_mb_load_buddy", ++ "can't get block for buddy bitmap\n"); ++ goto out; ++ } ++ if (!buffer_uptodate(e3b->bd_bh)) { ++ ll_rw_block(READ, 1, &e3b->bd_bh); ++ wait_on_buffer(e3b->bd_bh); ++ } ++ J_ASSERT(buffer_uptodate(e3b->bd_bh)); ++ ++ /* load buddy */ ++ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy); ++ if (e3b->bd_bh2 == NULL) { ++ ext3_error(sb, "ext3_mb_load_buddy", ++ "can't get block for buddy bitmap\n"); ++ goto out; ++ } ++ if (!buffer_uptodate(e3b->bd_bh2)) { ++ ll_rw_block(READ, 1, &e3b->bd_bh2); ++ wait_on_buffer(e3b->bd_bh2); ++ } ++ J_ASSERT(buffer_uptodate(e3b->bd_bh2)); ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_bd = sbi->s_buddy_blocks[group]; ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ ++ return 0; ++out: ++ brelse(e3b->bd_bh); ++ brelse(e3b->bd_bh2); ++ e3b->bd_bh = NULL; ++ e3b->bd_bh2 = NULL; ++ return -EIO; ++} ++ 
++static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b) ++{ ++ mark_buffer_dirty(e3b->bd_bh); ++ mark_buffer_dirty(e3b->bd_bh2); ++} ++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ brelse(e3b->bd_bh); ++ brelse(e3b->bd_bh2); ++} ++ ++#ifdef AGGRESSIVE_CHECK ++static void mb_check_buddy(struct ext3_buddy *e3b) ++{ ++ int order = e3b->bd_blkbits + 1; ++ int max, max2, i, j, k, count; ++ void *buddy, *buddy2; ++ ++ if (!test_opt(e3b->bd_sb, MBALLOC)) ++ return; ++ ++ while (order > 1) { ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ buddy2 = mb_find_buddy(e3b, order - 1, &max2); ++ J_ASSERT(buddy2); ++ J_ASSERT(buddy != buddy2); ++ J_ASSERT(max * 2 == max2); ++ ++ count = 0; ++ for (i = 0; i < max; i++) { ++ ++ if (!mb_test_bit(i, buddy)) { ++ /* only single bit in buddy2 may be 1 */ ++ if (mb_test_bit(i << 1, buddy2)) ++ J_ASSERT(!mb_test_bit((i<<1)+1, buddy2)); ++ else if (mb_test_bit((i << 1) + 1, buddy2)) ++ J_ASSERT(!mb_test_bit(i << 1, buddy2)); ++ continue; ++ } ++ ++ /* both bits in buddy2 must be 0 */ ++ J_ASSERT(!mb_test_bit(i << 1, buddy2)); ++ J_ASSERT(!mb_test_bit((i << 1) + 1, buddy2)); ++ ++ for (j = 0; j < (1 << order); j++) { ++ k = (i * (1 << order)) + j; ++ J_ASSERT(mb_test_bit(k, EXT3_MB_BITMAP(e3b))); ++ } ++ count++; ++ } ++ J_ASSERT(e3b->bd_bd->bb_counters[order] == count); ++ order--; ++ } ++ ++ buddy = mb_find_buddy(e3b, 0, &max); ++ for (i = 0; i < max; i++) { ++ if (mb_test_bit(i, buddy)) ++ continue; ++ /* check used bits only */ ++ for (j = 0; j < e3b->bd_blkbits + 1; j++) { ++ buddy2 = mb_find_buddy(e3b, j, &max2); ++ k = i >> j; ++ J_ASSERT(k < max2); ++ J_ASSERT(!mb_test_bit(k, buddy2)); ++ } ++ } ++} ++#else ++#define mb_check_buddy(e3b) ++#endif ++ ++static inline void ++ext3_lock_group(struct super_block *sb, int group) ++{ ++ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++} ++ ++static inline void ++ext3_unlock_group(struct super_block *sb, int group) ++{ ++ 
spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++} ++ ++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) ++{ ++ int order = 1; ++ void *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ while (order <= e3b->bd_blkbits + 1) { ++ block = block >> 1; ++ if (mb_test_bit(block, bb)) { ++ /* this block is part of buddy of order 'order' */ ++ return order; ++ } ++ bb += 1 << (e3b->bd_blkbits - order); ++ order++; ++ } ++ return 0; ++} ++ ++static inline void mb_clear_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0; ++ cur += 32; ++ continue; ++ } ++ mb_clear_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static inline void mb_set_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0xffffffff; ++ cur += 32; ++ continue; ++ } ++ mb_set_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) ++{ ++ int block, max, order; ++ void *buddy, *buddy2; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_bd->bb_free += count; ++ if (first < e3b->bd_bd->bb_first_free) ++ e3b->bd_bd->bb_first_free = first; ++ ++ while (count-- > 0) { ++ block = first++; ++ order = 0; ++ ++ J_ASSERT(!mb_test_bit(block, EXT3_MB_BITMAP(e3b))); ++ mb_set_bit(block, EXT3_MB_BITMAP(e3b)); ++ e3b->bd_bd->bb_counters[order]++; ++ ++ /* start of the buddy */ ++ buddy = mb_find_buddy(e3b, order, &max); ++ ++ do { ++ block &= ~1UL; ++ if (!mb_test_bit(block, buddy) || ++ !mb_test_bit(block + 1, buddy)) ++ break; ++ ++ /* both the buddies are free, try to coalesce them */ ++ buddy2 = 
mb_find_buddy(e3b, order + 1, &max); ++ ++ if (!buddy2) ++ break; ++ ++ if (order > 0) { ++ /* for special purposes, we don't clear ++ * free bits in bitmap */ ++ mb_clear_bit(block, buddy); ++ mb_clear_bit(block + 1, buddy); ++ } ++ e3b->bd_bd->bb_counters[order]--; ++ e3b->bd_bd->bb_counters[order]--; ++ ++ block = block >> 1; ++ order++; ++ e3b->bd_bd->bb_counters[order]++; ++ ++ mb_set_bit(block, buddy2); ++ buddy = buddy2; ++ } while (1); ++ } ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, ++ int needed, struct ext3_free_extent *ex) ++{ ++ int next, max, ord; ++ void *buddy; ++ ++ J_ASSERT(ex != NULL); ++ ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ J_ASSERT(block < max); ++ if (!mb_test_bit(block, buddy)) { ++ ex->fe_len = 0; ++ ex->fe_start = 0; ++ ex->fe_group = 0; ++ return 0; ++ } ++ ++ if (order == 0) { ++ /* find actual order */ ++ order = mb_find_order_for_block(e3b, block); ++ block = block >> order; ++ } ++ ++ ex->fe_len = 1 << order; ++ ex->fe_start = block << order; ++ ex->fe_group = e3b->bd_group; ++ ++ while ((buddy = mb_find_buddy(e3b, order, &max))) { ++ ++ if (block + 1 >= max) ++ break; ++ ++ next = (block + 1) * (1 << order); ++ if (!mb_test_bit(next, EXT3_MB_BITMAP(e3b))) ++ break; ++ ++ ord = mb_find_order_for_block(e3b, next); ++ ++ order = ord; ++ block = next >> order; ++ ex->fe_len += 1 << order; ++ } ++ ++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); ++ return ex->fe_len; ++} ++ ++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) ++{ ++ int start = ex->fe_start; ++ int len = ex->fe_len; ++ int ord, mlen, max, cur; ++ int len0 = len; ++ void *buddy; ++ ++ e3b->bd_bd->bb_free -= len; ++ if (e3b->bd_bd->bb_first_free == start) ++ e3b->bd_bd->bb_first_free += len; ++ ++ while (len) { ++ ord = mb_find_order_for_block(e3b, start); ++ ++ if (((start >> ord) << ord) == start && len >= (1 << ord)) { 
++ /* the whole chunk may be allocated at once! */ ++ mlen = 1 << ord; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ J_ASSERT((start >> ord) < max); ++ mb_clear_bit(start >> ord, buddy); ++ e3b->bd_bd->bb_counters[ord]--; ++ start += mlen; ++ len -= mlen; ++ J_ASSERT(len >= 0); ++ continue; ++ } ++ ++ /* we have to split large buddy */ ++ J_ASSERT(ord > 0); ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_clear_bit(start >> ord, buddy); ++ e3b->bd_bd->bb_counters[ord]--; ++ ++ ord--; ++ cur = (start >> ord) & ~1U; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_set_bit(cur, buddy); ++ mb_set_bit(cur + 1, buddy); ++ e3b->bd_bd->bb_counters[ord]++; ++ e3b->bd_bd->bb_counters[ord]++; ++ } ++ ++ /* now drop all the bits in bitmap */ ++ mb_clear_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); ++ ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++/* ++ * Must be called under group lock! ++ */ ++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ++ mb_mark_used(e3b, &ac->ac_b_ex); ++ ac->ac_status = AC_STATUS_FOUND; ++} ++ ++/* ++ * The routine checks whether found extent is good enough. If it is, ++ * then the extent gets marked used and flag is set to the context ++ * to stop scanning. Otherwise, the extent is compared with the ++ * previous found extent and if new one is better, then it's stored ++ * in the context. Later, the best found extent will be used, if ++ * mballoc can't find good enough extent. ++ * ++ * FIXME: real allocation policy is to be designed yet! 
++ */ ++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, ++ struct ext3_free_extent *ex, ++ struct ext3_buddy *e3b) ++{ ++ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor; ++ struct ext3_free_extent *bex = &ac->ac_b_ex; ++ int diff = ac->ac_g_ex.fe_len - ex->fe_len; ++ ++ J_ASSERT(ex->fe_len > 0); ++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ ++ ac->ac_found++; ++ ++ /* ++ * The special case - take what you catch first ++ */ ++ if (ac->ac_flags & EXT3_MB_HINT_FIRST) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Let's check whether the chunk is good enough ++ */ ++ if (ex->fe_len >= ac->ac_g_ex.fe_len) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If the request is very large, then it makes sense to use large ++ * chunks for it. Even if they don't satisfy whole request. ++ */ ++ if (ex->fe_len > 1000) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Sometimes it's worth taking a close chunk ++ */ ++ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If this is first found extent, just store it in the context ++ */ ++ if (bex->fe_len == 0) { ++ *bex = *ex; ++ return; ++ } ++ ++ /* ++ * If new found extent is better, store it in the context ++ * FIXME: possible the policy should be more complex? 
++ */ ++ if (ex->fe_len > bex->fe_len) { ++ *bex = *ex; ++ } ++ ++ /* ++ * We don't want to scan for a whole year ++ */ ++ if (ac->ac_found > EXT3_MB_MAX_TO_SCAN) ++ ac->ac_status = AC_STATUS_BREAK; ++} ++ ++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent ex = ac->ac_b_ex; ++ int group = ex.fe_group, max, err; ++ ++ J_ASSERT(ex.fe_len > 0); ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); ++ ++ if (max > 0) ++ ext3_mb_use_best_found(ac, e3b); ++ ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ if (ac->ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(e3b); ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_free_extent ex; ++ ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max > 0) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ if (ac->ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(e3b); ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++/* ++ * The routine scans the group and measures all found extents. ++ * In order to optimize scanning, caller must pass number of ++ * free blocks in the group, so the routine can upper limit. 
++ */ ++static void ext3_mb_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ int i, free; ++ ++ free = e3b->bd_bd->bb_free; ++ J_ASSERT(free > 0); ++ ++ i = e3b->bd_bd->bb_first_free; ++ ++ while (free && ac->ac_status != AC_STATUS_FOUND) { ++ i = find_next_bit(bitmap, sb->s_blocksize * 8, i); ++ if (i >= sb->s_blocksize * 8) { ++ J_ASSERT(free == 0); ++ break; ++ } ++ ++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(free >= ex.fe_len); ++ ++ ext3_mb_measure_extent(ac, &ex, e3b); ++ ++ i += ex.fe_len; ++ free -= ex.fe_len; ++ } ++} ++ ++static int ext3_mb_good_group(struct ext3_allocation_context *ac, ++ int group, int cr) ++{ ++ int free; ++ ++ J_ASSERT(cr >= 0 && cr < 3); ++ ++ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free; ++ if (free == 0) ++ return 0; ++ ++ if (cr == 0) { ++ if (free >= ac->ac_g_ex.fe_len >> 1) ++ return 1; ++ } else if (cr == 1) { ++ if (free >= ac->ac_g_ex.fe_len >> 2) ++ return 1; ++ } else if (cr == 2) { ++ return 1; ++ } ++ return 0; ++} ++ ++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *len, int flags, int *errp) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_allocation_context ac; ++ int i, group, block, cr, err = 0; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ struct buffer_head *gdp_bh; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ ++ J_ASSERT(len != NULL); ++ J_ASSERT(*len > 0); ++ ++ sb = inode->i_sb; ++ if (!sb) { ++ printk("ext3_mb_new_nblocks: nonexistent device"); ++ return 0; ++ } ++ ++ if (!test_opt(sb, MBALLOC)) { ++ static int ext3_mballoc_warning = 0; ++ if (ext3_mballoc_warning == 0) { ++ printk(KERN_ERR "EXT3-fs: multiblock request with " ++ "mballoc disabled!\n"); ++ ext3_mballoc_warning++; ++ } ++ *len = 1; 
++ err = ext3_new_block_old(handle, inode, goal, errp); ++ return err; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ ++ /* ++ * We can't allocate > group size ++ */ ++ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) ++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* someone asks for non-reserved blocks */ ++ BUG_ON(*len > 1); ++ err = ext3_mb_reserve_blocks(sb, 1); ++ if (err) { ++ *errp = err; ++ return 0; ++ } ++ } ++ ++ /* ++ * Check quota for allocation of this blocks. ++ */ ++ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) ++ *len -= 1; ++ if (*len == 0) { ++ *errp = -EDQUOT; ++ block = 0; ++ goto out; ++ } ++ ++ /* start searching from the goal */ ++ if (goal < le32_to_cpu(es->s_first_data_block) || ++ goal >= le32_to_cpu(es->s_blocks_count)) ++ goal = le32_to_cpu(es->s_first_data_block); ++ group = (goal - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ block = ((goal - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ /* set up allocation goals */ ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_groups_scanned = 0; ++ ac.ac_ex_scanned = 0; ++ ac.ac_found = 0; ++ ac.ac_sb = inode->i_sb; ++ ac.ac_g_ex.fe_group = group; ++ ac.ac_g_ex.fe_start = block; ++ ac.ac_g_ex.fe_len = *len; ++ ac.ac_flags = flags; ++ ++ /* ++ * Sometimes, caller may want to merge even small number ++ * of blocks to an existing extent ++ */ ++ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; ++ } ++ ++ /* ++ * FIXME ++ * If requested chunk is power of 2 length, we can try ++ * to exploit buddy nature to speed allocation up ++ */ ++ ++ ++ /* ++ * Let's just scan groups to find more-less suitable blocks ++ */ ++ cr = 0; ++repeat: ++ for (; cr < 3 && 
ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { ++ if (group == EXT3_SB(sb)->s_groups_count) ++ group = 0; ++ ++ /* check whether the group meets our criteria */ ++ if (!ext3_mb_good_group(&ac, group, cr)) ++ continue; ++ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ++ ext3_lock_group(sb, group); ++ if (!ext3_mb_good_group(&ac, group, cr)) { ++ /* someone did allocation from this group */ ++ ext3_unlock_group(sb, group); ++ ext3_mb_release_desc(&e3b); ++ continue; ++ } ++ ++ ext3_mb_scan_group(&ac, &e3b); ++ ext3_unlock_group(sb, group); ++ ++ if (ac.ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ if (err) ++ goto out_err; ++ if (ac.ac_status != AC_STATUS_CONTINUE) ++ break; ++ } ++ } ++ ++ if (ac.ac_status == AC_STATUS_BREAK && ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ /* ++ * We've been searching too long. Let's try to allocate ++ * the best chunk we've found so far ++ */ ++ printk(KERN_ERR "EXT3-fs: too long searching (%d/%d)\n", ++ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); ++ ext3_mb_try_best_found(&ac, &e3b); ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * Someone more lucky has already allocated it. 
++ * The only thing we can do is just take first ++ * found block(s) ++ */ ++ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n"); ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_flags |= EXT3_MB_HINT_FIRST; ++ cr = 2; ++ goto repeat; ++ } ++ } ++ ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * We aren't lucky definitely ++ */ ++ J_ASSERT(ac.ac_b_ex.fe_len == 0); ++ DQUOT_FREE_BLOCK(inode, *len); ++ *errp = -ENOSPC; ++ block = 0; ++#if 1 ++ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n", ++ ac.ac_status, ac.ac_flags); ++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n", ++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, ++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); ++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", ++ sbi->s_blocks_reserved, ac.ac_found); ++ printk("EXT3-fs: groups: "); ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) ++ printk("%d: %d ", i, ++ sbi->s_buddy_blocks[i]->bb_free); ++ printk("\n"); ++#endif ++ goto out; ++ } ++ ++found: ++ J_ASSERT(ac.ac_b_ex.fe_len > 0); ++ ++ /* good news - free block(s) have been found. 
now it's time ++ * to mark block(s) in good old journaled bitmap */ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ /* we made a desicion, now mark found blocks in good old ++ * bitmap to be journaled */ ++ ++ ext3_debug("using block group %d(%d)\n", ++ ac.ac_b_group.group, gdp->bg_free_blocks_count); ++ ++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); ++ if (!bitmap_bh) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) { ++ *errp = err; ++ goto out_err; ++ } ++ ++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); ++ if (!gdp) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ if (block == le32_to_cpu(gdp->bg_block_bitmap) || ++ block == le32_to_cpu(gdp->bg_inode_bitmap) || ++ in_range(block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error(sb, "ext3_new_block", ++ "Allocating block in system zone - " ++ "block = %u", block); ++#if AGGRESSIVE_CHECK ++ for (i = 0; i < ac.ac_b_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); ++#endif ++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); ++ ++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) ++ - ac.ac_b_ex.fe_len); ++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); ++ ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ if (err) ++ goto out_err; ++ err = ext3_journal_dirty_metadata(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ sb->s_dirt = 1; ++ *errp = 0; ++ brelse(bitmap_bh); ++ ++ 
/* drop non-allocated, but dquote'd blocks */ ++ J_ASSERT(*len >= ac.ac_b_ex.fe_len); ++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); ++ ++ *len = ac.ac_b_ex.fe_len; ++ J_ASSERT(*len > 0); ++ J_ASSERT(block != 0); ++ goto out; ++ ++out_err: ++ /* if we've already allocated something, roll it back */ ++ if (ac.ac_status == AC_STATUS_FOUND) { ++ /* FIXME: free blocks here */ ++ } ++ ++ DQUOT_FREE_BLOCK(inode, *len); ++ brelse(bitmap_bh); ++ *errp = err; ++ block = 0; ++out: ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* block wasn't reserved before and we reserved it ++ * at the beginning of allocation. it doesn't matter ++ * whether we allocated anything or we failed: time ++ * to release reservation. NOTE: because I expect ++ * any multiblock request from delayed allocation ++ * path only, here is single block always */ ++ ext3_mb_release_blocks(sb, 1); ++ } ++#ifdef MBALLOC_STATS ++ if (ac.ac_g_ex.fe_len > 1) { ++ spin_lock(&sbi->s_bal_lock); ++ sbi->s_bal_reqs++; ++ sbi->s_bal_allocated += *len; ++ if (*len >= ac.ac_g_ex.fe_len) ++ sbi->s_bal_success++; ++ sbi->s_bal_ex_scanned += ac.ac_found; ++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && ++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) ++ sbi->s_bal_goals++; ++ if (ac.ac_found > EXT3_MB_MAX_TO_SCAN) ++ sbi->s_bal_breaks++; ++ spin_unlock(&sbi->s_bal_lock); ++ } ++#endif ++ return block; ++} ++ ++int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh, ++ struct ext3_mb_group_descr **grp) ++{ ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int descr_per_block, err, offset; ++ struct ext3_mb_grp_header *hdr; ++ unsigned long block; ++ ++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) ++ / sizeof(struct ext3_mb_group_descr); ++ block = e3b->bd_group / descr_per_block; ++ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err); ++ if (*bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n", ++ 
e3b->bd_group, err); ++ return err; ++ } ++ ++ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data; ++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { ++ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n", ++ e3b->bd_group); ++ brelse(*bh); ++ *bh = NULL; ++ return -EIO; ++ } ++ ++ offset = e3b->bd_group % descr_per_block ++ * sizeof(struct ext3_mb_group_descr) ++ + sizeof(struct ext3_mb_grp_header); ++ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset); ++ ++ return 0; ++} ++ ++int ext3_mb_load_descr(struct ext3_buddy *e3b) ++{ ++ struct ext3_mb_group_descr *grp; ++ struct ext3_group_desc *gdp; ++ struct buffer_head *bh; ++ int err, i; ++ ++ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); ++ if (err) ++ return err; ++ ++ e3b->bd_bd->bb_first_free = grp->mgd_first_free; ++ e3b->bd_bd->bb_free = grp->mgd_free; ++ for (i = 0; i < e3b->bd_blkbits; i++) { ++ J_ASSERT(i < 16); ++ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i]; ++ } ++ brelse(bh); ++ ++ /* additional checks against old group descriptor */ ++ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); ++ if (!gdp) ++ return -EIO; ++ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { ++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", ++ e3b->bd_group, e3b->bd_bd->bb_free, ++ le16_to_cpu(gdp->bg_free_blocks_count)); ++ return -ENODATA; ++ } ++ ++ return 0; ++} ++ ++ ++int ext3_mb_update_descr(struct ext3_buddy *e3b) ++{ ++ struct ext3_mb_group_descr *grp; ++ struct ext3_group_desc *gdp; ++ struct buffer_head *bh; ++ handle_t *handle; ++ int err, i; ++ ++ /* additional checks against old group descriptor */ ++ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); ++ if (!gdp) ++ return -EIO; ++ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { ++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", ++ e3b->bd_group, e3b->bd_bd->bb_free, ++ le16_to_cpu(gdp->bg_free_blocks_count)); ++ return -ENODATA; ++ } ++ ++ err = 
ext3_mb_get_descr_loc(e3b, &bh, &grp); ++ if (err) ++ return err; ++ ++ handle = ext3_journal_start_sb(e3b->bd_sb, 1); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ handle = NULL; ++ goto out; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto out; ++ grp->mgd_first_free = e3b->bd_bd->bb_first_free; ++ grp->mgd_free = e3b->bd_bd->bb_free; ++ for (i = 0; i < e3b->bd_blkbits; i++) { ++ J_ASSERT(i < 16); ++ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i]; ++ } ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto out; ++ err = 0; ++out: ++ brelse(bh); ++ if (handle) ++ ext3_journal_stop(handle); ++ return err; ++} ++ ++int ext3_mb_generate_buddy(struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = e3b->bd_sb; ++ struct buffer_head *bh; ++ int i, count = 0; ++ ++ memset(e3b->bd_bh->b_data, 0, sb->s_blocksize); ++ memset(e3b->bd_bh2->b_data, 0, sb->s_blocksize); ++ ++ bh = read_block_bitmap(sb, e3b->bd_group); ++ if (bh == NULL) ++ return -EIO; ++ ++ /* mb_free_blocks will set real free */ ++ e3b->bd_bd->bb_free = 0; ++ e3b->bd_bd->bb_first_free = 1 << 15; ++ /* ++ * if change bb_counters size, don't forget about ++ * ext3_mb_init_backend() -bzzz ++ */ ++ memset(e3b->bd_bd->bb_counters, 0, ++ sizeof(unsigned) * (sb->s_blocksize_bits + 2)); ++ ++ /* loop over the blocks, and create buddies for free ones */ ++ for (i = 0; i < sb->s_blocksize * 8; i++) { ++ if (!mb_test_bit(i, (void *) bh->b_data)) { ++ mb_free_blocks(e3b, i, 1); ++ count++; ++ } ++ } ++ brelse(bh); ++ mb_check_buddy(e3b); ++ ext3_mb_dirty_buddy(e3b); ++ ++ return 0; ++} ++ ++EXPORT_SYMBOL(ext3_mb_new_blocks); ++ ++#define MB_CREDITS \ ++ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \ ++ 2 * EXT3_SINGLEDATA_TRANS_BLOCKS) ++ ++int ext3_mb_init_backend(struct super_block *sb, int *created) ++{ ++ int err, i, len, descr_per_block, buddy_offset, size; ++ struct inode *root = sb->s_root->d_inode; ++ struct ext3_sb_info *sbi = 
EXT3_SB(sb); ++ struct ext3_mb_grp_header *hdr; ++ struct buffer_head *bh = NULL; ++ unsigned long block; ++ struct dentry *db; ++ handle_t *handle; ++ tid_t target; ++ ++ *created = 0; ++ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; ++ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_buddy_blocks == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ return -ENOMEM; ++ } ++ memset(sbi->s_buddy_blocks, 0, len); ++ sbi->s_buddy = NULL; ++ ++ down(&root->i_sem); ++ len = strlen(EXT3_BUDDY_FILE); ++ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len); ++ if (IS_ERR(db)) { ++ err = PTR_ERR(db); ++ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err); ++ up(&root->i_sem); ++ goto out; ++ } ++ ++ if (db->d_inode == NULL) { ++ err = ext3_create(root, db, S_IFREG, NULL); ++ if (err) { ++ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err); ++ up(&root->i_sem); ++ goto out; ++ } ++ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME; ++ *created = 1; ++ mb_debug("no buddy file, regenerate\n"); ++ } ++ up(&root->i_sem); ++ sbi->s_buddy = igrab(db->d_inode); ++ ++ /* calculate needed size */ ++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) ++ / sizeof(struct ext3_mb_group_descr); ++ buddy_offset = (sbi->s_groups_count + descr_per_block - 1) ++ / descr_per_block; ++ len = sbi->s_groups_count * sb->s_blocksize * 2 + ++ buddy_offset * sb->s_blocksize; ++ if (len != i_size_read(sbi->s_buddy)) { ++ if (*created == 0) ++ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n", ++ (unsigned) len, ++ (unsigned) i_size_read(sbi->s_buddy)); ++ *created = 1; ++ } ++ ++ /* read/create mb group descriptors */ ++ for (i = 0; i < buddy_offset; i++) { ++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); ++ if (IS_ERR(handle)) { ++ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); ++ err = PTR_ERR(handle); ++ goto err_out; ++ } ++ ++ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err); ++ if (bh 
== NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err); ++ goto err_out; ++ } ++ hdr = (struct ext3_mb_grp_header *) bh->b_data; ++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto err_out; ++ if (*created == 0) ++ printk(KERN_ERR ++ "EXT3-fs: invalid header 0x%x in %d," ++ "regenerate\n", hdr->mh_magic, i); ++ *created = 1; ++ hdr->mh_magic = EXT3_MB_MAGIC_V1; ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto err_out; ++ } ++ brelse(bh); ++ ext3_journal_stop(handle); ++ } ++ ++ /* ++ * if change bb_counters size, don't forget about ext3_mb_generate_buddy() ++ */ ++ len = sizeof(struct ext3_buddy_group_blocks); ++ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ ++ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_buddy_blocks[i] == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ err = -ENOMEM; ++ goto out2; ++ } ++ memset(sbi->s_buddy_blocks[i], 0, len); ++ ++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); ++ if (IS_ERR(handle)) { ++ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); ++ err = PTR_ERR(handle); ++ goto out2; ++ } ++ ++ /* allocate block for bitmap */ ++ block = buddy_offset + i * 2; ++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); ++ if (bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err); ++ goto out2; ++ } ++ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; ++ brelse(bh); ++ ++ /* allocate block for buddy */ ++ block = buddy_offset + i * 2 + 1; ++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); ++ if (bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err); ++ goto out2; ++ } ++ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; ++ brelse(bh); ++ ++ size = (block + 1) << sbi->s_buddy->i_blkbits; ++ if (size > sbi->s_buddy->i_size) { ++ *created = 1; ++ EXT3_I(sbi->s_buddy)->i_disksize 
= size; ++ i_size_write(sbi->s_buddy, size); ++ mark_inode_dirty(sbi->s_buddy); ++ } ++ ext3_journal_stop(handle); ++ ++ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock); ++ sbi->s_buddy_blocks[i]->bb_md_cur = NULL; ++ sbi->s_buddy_blocks[i]->bb_tid = 0; ++ } ++ ++ if (journal_start_commit(sbi->s_journal, &target)) ++ log_wait_commit(sbi->s_journal, target); ++ ++out2: ++ dput(db); ++out: ++ return err; ++ ++err_out: ++ return err; ++} ++ ++int ext3_mb_write_descriptors(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_buddy e3b; ++ int ret = 0, i, err; ++ ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_buddy_blocks[i] == NULL) ++ continue; ++ ++ err = ext3_mb_load_buddy(sb, i, &e3b); ++ if (err == 0) { ++ ext3_mb_update_descr(&e3b); ++ ext3_mb_release_desc(&e3b); ++ } else ++ ret = err; ++ } ++ return ret; ++} ++ ++int ext3_mb_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* release freed, non-committed blocks */ ++ spin_lock(&sbi->s_md_lock); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_committed_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ ext3_mb_free_committed_blocks(sb); ++ ++ if (sbi->s_buddy_blocks) { ++ ext3_mb_write_descriptors(sb); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_buddy_blocks[i] == NULL) ++ continue; ++ kfree(sbi->s_buddy_blocks[i]); ++ } ++ kfree(sbi->s_buddy_blocks); ++ } ++ if (sbi->s_buddy) ++ iput(sbi->s_buddy); ++ if (sbi->s_blocks_reserved) ++ printk("ext3-fs: %ld blocks being reserved at umount!\n", ++ sbi->s_blocks_reserved); ++#ifdef MBALLOC_STATS ++ printk("EXT3-fs: mballoc: %lu blocks %lu reqs (%lu success)\n", ++ sbi->s_bal_allocated, sbi->s_bal_reqs, sbi->s_bal_success); ++ printk("EXT3-fs: mballoc: %lu extents scanned, %lu goal hits, %lu breaks\n", ++ 
sbi->s_bal_ex_scanned, sbi->s_bal_goals, sbi->s_bal_breaks); ++#endif ++ return 0; ++} ++ ++int ext3_mb_init(struct super_block *sb, int needs_recovery) ++{ ++ struct ext3_buddy e3b; ++ int i, err, created; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* init file for buddy data */ ++ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); ++ if ((err = ext3_mb_init_backend(sb, &created))) ++ return err; ++ ++repeat: ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { ++ err = ext3_mb_load_buddy(sb, i, &e3b); ++ if (err) { ++ /* FIXME: release backend */ ++ return err; ++ } ++ if (created || needs_recovery) ++ ext3_mb_generate_buddy(&e3b); ++ else ++ err = ext3_mb_load_descr(&e3b); ++ ext3_mb_release_desc(&e3b); ++ if (err == -ENODATA) { ++ created = 1; ++ goto repeat; ++ } ++ } ++ if (created || needs_recovery) ++ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n", ++ EXT3_SB(sb)->s_groups_count); ++ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); ++ spin_lock_init(&EXT3_SB(sb)->s_md_lock); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); ++ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); ++ ++#ifdef MBALLOC_STATS ++ spin_lock_init(&EXT3_SB(sb)->s_bal_lock); ++#define MBALLOC_INFO " (stats)" ++#else ++#define MBALLOC_INFO "" ++#endif ++ printk("EXT3-fs: mballoc enabled%s\n", MBALLOC_INFO); ++ return 0; ++} ++ ++void ext3_mb_free_committed_blocks(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int err, i, count = 0, count2 = 0; ++ struct ext3_free_metadata *md; ++ struct ext3_buddy e3b; ++ ++ if (list_empty(&sbi->s_committed_transaction)) ++ return; ++ ++ /* there is committed blocks to be freed yet */ ++ do { ++ /* get next array of blocks */ ++ md = NULL; ++ spin_lock(&sbi->s_md_lock); ++ if (!list_empty(&sbi->s_committed_transaction)) { ++ md = list_entry(sbi->s_committed_transaction.next, ++ struct 
ext3_free_metadata, list); ++ list_del(&md->list); ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ if (md == NULL) ++ break; ++ ++ mb_debug("gonna free %u blocks in group %u (0x%p):", ++ md->num, md->group, md); ++ ++ err = ext3_mb_load_buddy(sb, md->group, &e3b); ++ BUG_ON(err != 0); ++ ++ /* there are blocks to put in buddy to make them really free */ ++ count += md->num; ++ count2++; ++ ext3_lock_group(sb, md->group); ++ for (i = 0; i < md->num; i++) { ++ mb_debug(" %u", md->blocks[i]); ++ mb_free_blocks(&e3b, md->blocks[i], 1); ++ } ++ mb_debug("\n"); ++ ext3_unlock_group(sb, md->group); ++ ++ kfree(md); ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ } while (md); ++ mb_debug("freed %u blocks in %u structures\n", count, count2); ++} ++ ++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ if (sbi->s_last_transaction == handle->h_transaction->t_tid) ++ return; ++ ++ /* new transaction! time to close the last one and free blocks for ++ * committed transactions. we know that only one transaction can be ++ * active, so the previous transaction may still be being logged, and ++ * the transaction before the previous one is known to be already ++ * logged. this means that now we may free blocks freed in all ++ * transactions before the previous one. hope I'm clear enough ... 
*/ ++ ++ spin_lock(&sbi->s_md_lock); ++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { ++ mb_debug("new transaction %lu, old %lu\n", ++ (unsigned long) handle->h_transaction->t_tid, ++ (unsigned long) sbi->s_last_transaction); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_closed_transaction); ++ sbi->s_last_transaction = handle->h_transaction->t_tid; ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ ext3_mb_free_committed_blocks(sb); ++} ++ ++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, ++ int group, int block, int count) ++{ ++ struct ext3_buddy_group_blocks *db = e3b->bd_bd; ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_free_metadata *md; ++ int i; ++ ++ ext3_lock_group(sb, group); ++ for (i = 0; i < count; i++) { ++ md = db->bb_md_cur; ++ if (md && db->bb_tid != handle->h_transaction->t_tid) { ++ db->bb_md_cur = NULL; ++ md = NULL; ++ } ++ ++ if (md == NULL) { ++ ext3_unlock_group(sb, group); ++ md = kmalloc(sizeof(*md), GFP_KERNEL); ++ if (md == NULL) ++ return -ENOMEM; ++ md->num = 0; ++ md->group = group; ++ ++ ext3_lock_group(sb, group); ++ if (db->bb_md_cur == NULL) { ++ spin_lock(&sbi->s_md_lock); ++ list_add(&md->list, &sbi->s_active_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ db->bb_md_cur = md; ++ db->bb_tid = handle->h_transaction->t_tid; ++ mb_debug("new md 0x%p for group %u\n", ++ md, md->group); ++ } else { ++ kfree(md); ++ md = db->bb_md_cur; ++ } ++ } ++ ++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); ++ md->blocks[md->num] = block + i; ++ md->num++; ++ if (md->num == EXT3_BB_MAX_BLOCKS) { ++ /* no more space, put full container on a sb's list */ ++ db->bb_md_cur = NULL; ++ } ++ } ++ ext3_unlock_group(sb, group); ++ return 0; ++} ++ ++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, ++ unsigned long block, unsigned long count, ++ int metadata, int *freed) 
++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ unsigned long bit, overflow; ++ struct buffer_head *gd_bh; ++ unsigned long block_group; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ int err = 0, ret; ++ ++ *freed = 0; ++ sb = inode->i_sb; ++ if (!sb) { ++ printk ("ext3_free_blocks: nonexistent device"); ++ return; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ if (block < le32_to_cpu(es->s_first_data_block) || ++ block + count < block || ++ block + count > le32_to_cpu(es->s_blocks_count)) { ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks not in datazone - " ++ "block = %lu, count = %lu", block, count); ++ goto error_return; ++ } ++ ++ ext3_debug("freeing block %lu\n", block); ++ ++do_more: ++ overflow = 0; ++ block_group = (block - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ bit = (block - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb); ++ /* ++ * Check to see if we are freeing blocks across a group ++ * boundary. 
++ */ ++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { ++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); ++ count -= overflow; ++ } ++ brelse(bitmap_bh); ++ bitmap_bh = read_block_bitmap(sb, block_group); ++ if (!bitmap_bh) ++ goto error_return; ++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); ++ if (!gdp) ++ goto error_return; ++ ++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || ++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || ++ in_range (block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group) || ++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks in system zones - " ++ "Block = %lu, count = %lu", ++ block, count); ++ ++ BUFFER_TRACE(bitmap_bh, "getting write access"); ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) ++ goto error_return; ++ ++ /* ++ * We are about to modify some metadata. Call the journal APIs ++ * to unshare ->b_data if a currently-committing transaction is ++ * using it ++ */ ++ BUFFER_TRACE(gd_bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, gd_bh); ++ if (err) ++ goto error_return; ++ ++ err = ext3_mb_load_buddy(sb, block_group, &e3b); ++ if (err) ++ goto error_return; ++ ++ if (metadata) { ++ /* blocks being freed are metadata. 
these blocks shouldn't ++ * be used until this transaction is committed */ ++ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); ++ } else { ++ ext3_lock_group(sb, block_group); ++ mb_free_blocks(&e3b, bit, count); ++ ext3_unlock_group(sb, block_group); ++ } ++ spin_lock(sb_bgl_lock(sbi, block_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); ++ spin_unlock(sb_bgl_lock(sbi, block_group)); ++ ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ mb_clear_bits(bitmap_bh->b_data, bit, count); ++ *freed = count; ++ ++ /* We dirtied the bitmap block */ ++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ ++ /* And the group descriptor block */ ++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ++ ret = ext3_journal_dirty_metadata(handle, gd_bh); ++ if (!err) err = ret; ++ ++ if (overflow && !err) { ++ block += count; ++ count = overflow; ++ goto do_more; ++ } ++ sb->s_dirt = 1; ++error_return: ++ brelse(bitmap_bh); ++ ext3_std_error(sb, err); ++ return; ++} ++ ++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int free, ret = -ENOSPC; ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); ++ if (blocks <= free - sbi->s_blocks_reserved) { ++ sbi->s_blocks_reserved += blocks; ++ ret = 0; ++ } ++ spin_unlock(&sbi->s_reserve_lock); ++ return ret; ++} ++ ++void ext3_mb_release_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ sbi->s_blocks_reserved -= blocks; ++ WARN_ON(sbi->s_blocks_reserved < 0); ++ if (sbi->s_blocks_reserved < 0) ++ sbi->s_blocks_reserved = 0; ++ spin_unlock(&sbi->s_reserve_lock); ++} ++ ++int ext3_new_block(handle_t *handle, struct inode *inode, ++ unsigned long goal, int 
*errp) ++{ ++ int ret, len; ++ ++ if (!test_opt(inode->i_sb, MBALLOC)) { ++ ret = ext3_new_block_old(handle, inode, goal, errp); ++ goto out; ++ } ++ len = 1; ++ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); ++out: ++ return ret; ++} ++ ++ ++void ext3_free_blocks(handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count, int metadata) ++{ ++ struct super_block *sb; ++ int freed; ++ ++ sb = inode->i_sb; ++ if (!test_opt(sb, MBALLOC)) ++ ext3_free_blocks_sb(handle, sb, block, count, &freed); ++ else ++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); ++ if (freed) ++ DQUOT_FREE_BLOCK(inode, freed); ++ return; ++} ++ +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2005-02-25 17:27:00.231757312 +0200 ++++ linux-stage/fs/ext3/super.c 2005-02-25 17:28:41.862307120 +0200 +@@ -394,6 +394,7 @@ + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -592,7 +593,7 @@ + Opt_commit, Opt_journal_update, Opt_journal_inum, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, +- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, ++ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_mballoc, Opt_mbfactor, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug, + }; +@@ -646,6 +647,8 @@ + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, ++ {Opt_mbfactor, "mbfactor=%u"}, /* integer argument, stored in s_mb_factor */ + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -956,6 +959,16 @@ + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt (sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_mbfactor: ++ if 
(match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_mb_factor = option; ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1639,7 +1652,8 @@ + ext3_count_dirs(sb)); + + ext3_ext_init(sb); +- ++ ext3_mb_init(sb, needs_recovery); ++ + return 0; + + failed_mount3: +Index: linux-stage/fs/ext3/Makefile +=================================================================== +--- linux-stage.orig/fs/ext3/Makefile 2005-02-25 17:27:00.228757768 +0200 ++++ linux-stage/fs/ext3/Makefile 2005-02-25 17:28:41.863306968 +0200 +@@ -5,7 +5,8 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\ +- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ++ mballoc.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-stage/fs/ext3/balloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/balloc.c 2005-02-25 17:26:58.965949744 +0200 ++++ linux-stage/fs/ext3/balloc.c 2005-02-25 17:28:41.865306664 +0200 +@@ -79,7 +79,7 @@ + * + * Return buffer_head on success or NULL in case of failure. 
+ */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -450,24 +450,6 @@ + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- unsigned long block, unsigned long count) +-{ +- struct super_block * sb; +- int dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1140,7 +1122,7 @@ + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. + */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-stage/fs/ext3/namei.c +=================================================================== +--- linux-stage.orig/fs/ext3/namei.c 2005-02-25 17:26:59.527864320 +0200 ++++ linux-stage/fs/ext3/namei.c 2005-02-25 17:28:41.867306360 +0200 +@@ -1639,7 +1639,7 @@ + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
+ */ +-static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, ++int ext3_create (struct inode * dir, struct dentry * dentry, int mode, + struct nameidata *nd) + { + handle_t *handle; +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2005-02-25 17:27:00.227757920 +0200 ++++ linux-stage/fs/ext3/inode.c 2005-02-25 17:28:41.872305600 +0200 +@@ -572,7 +572,7 @@ + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -673,7 +673,7 @@ + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, +- le32_to_cpu(where[i].key), 1); ++ le32_to_cpu(where[i].key), 1, 1); + return err; + } + +@@ -1831,7 +1831,7 @@ + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2004,7 +2004,7 @@ + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-stage/fs/ext3/extents.c +=================================================================== +--- linux-stage.orig/fs/ext3/extents.c 2005-02-25 17:27:00.222758680 +0200 ++++ linux-stage/fs/ext3/extents.c 2005-02-25 17:29:29.364085752 +0200 +@@ -740,7 +740,7 @@ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1391,7 +1391,7 @@ + path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ 
ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + +@@ -1879,10 +1879,12 @@ + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1894,7 +1896,7 @@ + bh = sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-stage/fs/ext3/xattr.c +=================================================================== +--- linux-stage.orig/fs/ext3/xattr.c 2005-02-25 17:26:59.876811272 +0200 ++++ linux-stage/fs/ext3/xattr.c 2005-02-25 17:28:41.878304688 +0200 +@@ -1271,7 +1271,7 @@ + new_bh = sb_getblk(sb, block); + if (!new_bh) { + getblk_failed: +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + error = -EIO; + goto cleanup; + } +@@ -1318,7 +1318,7 @@ + if (ce) + mb_cache_entry_free(ce); + ea_bdebug(old_bh, "freeing"); +- ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1); ++ ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1, 1); + + /* ext3_forget() calls bforget() for us, but we + let our caller release old_bh, so we need to +@@ -1417,7 +1417,7 @@ + if (HDR(bh)->h_refcount == cpu_to_le32(1)) { + if (ce) + mb_cache_entry_free(ce); +- ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1); ++ ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1, 1); + get_bh(bh); + 
ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); + } else { +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 17:27:00.234756856 +0200 ++++ linux-stage/include/linux/ext3_fs.h 2005-02-25 17:28:41.881304232 +0200 +@@ -57,6 +57,14 @@ + #define ext3_debug(f, a...) do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -365,6 +373,7 @@ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */ + + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ +@@ -725,7 +734,7 @@ + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, +- unsigned long); ++ unsigned long, int); + extern void ext3_free_blocks_sb (handle_t *, struct super_block *, + unsigned long, unsigned long, int *); + extern unsigned long ext3_count_free_blocks (struct super_block *); +@@ -856,6 +865,37 @@ + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct super_block *, int); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++ ++/* 
writeback.c */ ++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); ++extern int ext3_wb_prepare_write(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); ++extern int ext3_wb_writepage(struct page *, struct writeback_control *); ++extern int ext3_wb_invalidatepage(struct page *, unsigned long); ++extern int ext3_wb_releasepage(struct page *, int); ++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); ++extern void ext3_wb_init(struct super_block *); ++extern void ext3_wb_release(struct super_block *); ++ ++/* writeback.c */ ++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); ++extern int ext3_wb_prepare_write(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); ++extern int ext3_wb_writepage(struct page *, struct writeback_control *); ++extern int ext3_wb_invalidatepage(struct page *, unsigned long); ++extern int ext3_wb_releasepage(struct page *, int); ++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); ++extern void ext3_wb_init(struct super_block *); ++extern void ext3_wb_release(struct super_block *); ++ + #endif /* __KERNEL__ */ + + /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ +Index: linux-stage/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_sb.h 2005-02-25 17:26:59.641846992 +0200 ++++ linux-stage/include/linux/ext3_fs_sb.h 2005-02-25 17:28:41.882304080 +0200 +@@ -23,10 +23,30 @@ + #define EXT_INCLUDE + #include + #include ++#include + #endif + #endif + #include + ++#define EXT3_BB_MAX_BLOCKS 30 ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++struct ext3_buddy_group_blocks { ++ __u32 bb_bitmap; ++ __u32 bb_buddy; ++ spinlock_t bb_lock; ++ unsigned long bb_tid; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned bb_counters[]; ++}; ++ + /* + * third extended-fs super-block data in memory + */ +@@ -81,6 +101,27 @@ + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_buddy_group_blocks **s_buddy_blocks; ++ struct inode *s_buddy; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ ++ /* stats for buddy allocator */ ++ spinlock_t s_bal_lock; ++ unsigned long s_bal_reqs; /* number of reqs with len > 1 */ ++ unsigned long s_bal_success; /* we found long enough chunks */ ++ unsigned long s_bal_allocated; /* in blocks */ ++ unsigned long s_bal_ex_scanned; /* total extents scanned */ ++ unsigned long s_bal_goals; /* goal hits */ ++ unsigned long s_bal_breaks; /* too long searches */ + }; + + #endif /* _LINUX_EXT3_FS_SB */ diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series 
new file mode 100644 index 0000000..70e7b12 --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series @@ -0,0 +1,11 @@ +ext3-wantedi-2.6-rhel4.patch +ext3-san-jdike-2.6-suse.patch +iopen-2.6-rhel4.patch +export_symbols-ext3-2.6-suse.patch +ext3-map_inode_page-2.6-suse.patch +ext3-ea-in-inode-2.6-rhel4.patch +export-ext3-2.6-rhel4.patch +ext3-include-fixes-2.6-rhel4.patch +ext3-extents-2.6.9-rhel4.patch +ext3-mballoc2-2.6.9-rhel4.patch +ext3-nlinks-2.6.7.patch diff --git a/lustre/kernel_patches/patches/8kstack-2.6-rhel4.patch b/lustre/kernel_patches/patches/8kstack-2.6-rhel4.patch new file mode 100644 index 0000000..36fea12 --- /dev/null +++ b/lustre/kernel_patches/patches/8kstack-2.6-rhel4.patch @@ -0,0 +1,13 @@ +Index: linux-2.6.9-5.0.3.EL/include/asm-i386/thread_info.h +=================================================================== +--- linux-2.6.9-5.0.3.EL.orig/include/asm-i386/thread_info.h 2005-02-25 10:25:33.000000000 +0200 ++++ linux-2.6.9-5.0.3.EL/include/asm-i386/thread_info.h 2005-02-25 20:19:11.676139032 +0200 +@@ -54,7 +54,7 @@ + #endif + + #define PREEMPT_ACTIVE 0x4000000 +-#define THREAD_SIZE (4096) ++#define THREAD_SIZE (8192) + + #define STACK_WARN (THREAD_SIZE/8) + /* diff --git a/lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch b/lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch new file mode 100644 index 0000000..16f26b0 --- /dev/null +++ b/lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch @@ -0,0 +1,148 @@ +Index: linux-2.6.9-5.0.3.EL/fs/filesystems.c +=================================================================== +--- linux-2.6.9-5.0.3.EL.orig/fs/filesystems.c 2005-02-26 13:24:35.467813664 +0200 ++++ linux-2.6.9-5.0.3.EL/fs/filesystems.c 2005-02-26 13:53:13.794588288 +0200 +@@ -27,7 +27,9 @@ + */ + + static struct file_system_type *file_systems; +-static rwlock_t file_systems_lock = RW_LOCK_UNLOCKED; ++rwlock_t file_systems_lock = RW_LOCK_UNLOCKED; ++ 
++EXPORT_SYMBOL(file_systems_lock); + + /* WARNING: This can be used only if we _already_ own a reference */ + void get_filesystem(struct file_system_type *fs) +Index: linux-2.6.9-5.0.3.EL/include/linux/fs.h +=================================================================== +--- linux-2.6.9-5.0.3.EL.orig/include/linux/fs.h 2005-02-26 13:47:37.330738568 +0200 ++++ linux-2.6.9-5.0.3.EL/include/linux/fs.h 2005-02-26 13:53:13.796587984 +0200 +@@ -1529,6 +1529,7 @@ + + extern struct file_operations generic_ro_fops; + ++extern rwlock_t file_systems_lock; + #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) + + extern int vfs_readlink(struct dentry *, char __user *, int, const char *); +Index: linux-2.6.9-5.0.3.EL/include/linux/ext2_fs_sb.h +=================================================================== +--- linux-2.6.9-5.0.3.EL.orig/include/linux/ext2_fs_sb.h 2005-02-26 13:24:35.470813208 +0200 ++++ linux-2.6.9-5.0.3.EL/include/linux/ext2_fs_sb.h 2005-02-26 13:53:13.797587832 +0200 +@@ -16,9 +16,11 @@ + #ifndef _LINUX_EXT2_FS_SB + #define _LINUX_EXT2_FS_SB + ++#ifndef EXT_INCLUDE ++#define EXT_INCLUDE + #include + #include +- ++#endif + /* + * second extended-fs super-block data in memory + */ +Index: linux-2.6.9-5.0.3.EL/kernel/kallsyms.c +=================================================================== +--- linux-2.6.9-5.0.3.EL.orig/kernel/kallsyms.c 2005-02-26 13:24:35.479811840 +0200 ++++ linux-2.6.9-5.0.3.EL/kernel/kallsyms.c 2005-02-26 13:53:13.799587528 +0200 +@@ -310,3 +310,4 @@ + __initcall(kallsyms_init); + + EXPORT_SYMBOL(__print_symbol); ++EXPORT_SYMBOL(kernel_text_address); +Index: linux-2.6.9-5.0.3.EL/net/core/sock.c +=================================================================== +--- linux-2.6.9-5.0.3.EL.orig/net/core/sock.c 2005-02-26 13:24:35.490810168 +0200 ++++ linux-2.6.9-5.0.3.EL/net/core/sock.c 2005-02-26 13:53:13.801587224 +0200 +@@ -602,6 +602,7 @@ + return -EFAULT; + return 0; + } 
++EXPORT_SYMBOL(sock_getsockopt); + + static kmem_cache_t *sk_cachep; + +Index: linux-2.6.9-5.0.3.EL/fs/namespace.c +=================================================================== +--- linux-2.6.9-5.0.3.EL.orig/fs/namespace.c 2005-02-26 13:47:31.282658016 +0200 ++++ linux-2.6.9-5.0.3.EL/fs/namespace.c 2005-02-26 13:53:13.803586920 +0200 +@@ -1241,6 +1241,7 @@ + mntput(old_pwdmnt); + } + } ++EXPORT_SYMBOL(set_fs_pwd); + + static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd) + { +Index: linux-2.6.9-5.0.3.EL/kernel/exit.c +=================================================================== +--- linux-2.6.9-5.0.3.EL.orig/kernel/exit.c 2005-02-26 13:47:31.300655280 +0200 ++++ linux-2.6.9-5.0.3.EL/kernel/exit.c 2005-02-26 13:53:13.805586616 +0200 +@@ -516,6 +516,7 @@ + { + __exit_mm(tsk); + } ++EXPORT_SYMBOL(exit_mm); + + static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) + { +Index: linux-2.6.9-5.0.3.EL/fs/dcache.c +=================================================================== +--- linux-2.6.9-5.0.3.EL.orig/fs/dcache.c 2005-02-26 13:49:04.365507272 +0200 ++++ linux-2.6.9-5.0.3.EL/fs/dcache.c 2005-02-26 13:53:13.807586312 +0200 +@@ -1526,6 +1526,7 @@ + + return result; + } ++EXPORT_SYMBOL(is_subdir); + + void d_genocide(struct dentry *root) + { +Index: linux-2.6.9-5.0.3.EL/mm/filemap.c +=================================================================== +--- linux-2.6.9-5.0.3.EL.orig/mm/filemap.c 2005-02-26 13:24:35.502808344 +0200 ++++ linux-2.6.9-5.0.3.EL/mm/filemap.c 2005-02-26 13:53:59.787596288 +0200 +@@ -1473,7 +1473,7 @@ + return NULL; + } + +-static int filemap_populate(struct vm_area_struct *vma, ++int filemap_populate(struct vm_area_struct *vma, + unsigned long addr, + unsigned long len, + pgprot_t prot, +@@ -1520,6 +1520,7 @@ + + return 0; + } ++EXPORT_SYMBOL_GPL(filemap_populate); + + struct vm_operations_struct generic_file_vm_ops = { + .nopage = filemap_nopage, +Index: 
linux-2.6.9-5.0.3.EL/fs/file_table.c +=================================================================== +--- linux-2.6.9-5.0.3.EL.orig/fs/file_table.c 2005-02-26 13:24:35.512806824 +0200 ++++ linux-2.6.9-5.0.3.EL/fs/file_table.c 2005-02-26 13:53:13.811585704 +0200 +@@ -196,6 +196,7 @@ + file_free(file); + } + } ++EXPORT_SYMBOL(put_filp); + + void file_move(struct file *file, struct list_head *list) + { +Index: linux-2.6.9-5.0.3.EL/include/linux/mm.h +=================================================================== +--- linux-2.6.9-5.0.3.EL.orig/include/linux/mm.h 2005-02-26 13:49:05.823285656 +0200 ++++ linux-2.6.9-5.0.3.EL/include/linux/mm.h 2005-02-26 13:53:54.181448552 +0200 +@@ -721,6 +721,9 @@ + + /* generic vm_area_ops exported for stackable file systems */ + struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *); ++int filemap_populate(struct vm_area_struct *vma, unsigned long addr, ++ unsigned long len, pgprot_t prot, unsigned long pgoff, ++ int nonblock); + + /* mm/page-writeback.c */ + int write_one_page(struct page *page, int wait); diff --git a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch new file mode 100644 index 0000000..3d554e4 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch @@ -0,0 +1,822 @@ +Index: linux-stage/fs/ext3/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/ialloc.c 2005-02-25 16:47:04.411977280 +0200 ++++ linux-stage/fs/ext3/ialloc.c 2005-02-25 16:50:40.752088584 +0200 +@@ -629,6 +629,11 @@ + spin_unlock(&sbi->s_next_gen_lock); + + ei->i_state = EXT3_STATE_NEW; ++ if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { ++ ei->i_extra_isize = sizeof(__u16) /* i_extra_isize */ ++ + sizeof(__u16); /* i_pad1 */ ++ } else ++ ei->i_extra_isize = 0; + + ret = inode; + if(DQUOT_ALLOC_INODE(inode)) { +Index: linux-stage/fs/ext3/inode.c 
+=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2005-02-25 16:47:04.415976672 +0200 ++++ linux-stage/fs/ext3/inode.c 2005-02-25 16:50:40.756087976 +0200 +@@ -2274,7 +2274,7 @@ + * trying to determine the inode's location on-disk and no read need be + * performed. + */ +-static int ext3_get_inode_loc(struct inode *inode, ++int ext3_get_inode_loc(struct inode *inode, + struct ext3_iloc *iloc, int in_mem) + { + unsigned long block; +@@ -2484,6 +2484,11 @@ + ei->i_data[block] = raw_inode->i_block[block]; + INIT_LIST_HEAD(&ei->i_orphan); + ++ if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ++ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); ++ else ++ ei->i_extra_isize = 0; ++ + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; +@@ -2619,6 +2624,9 @@ + } else for (block = 0; block < EXT3_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + ++ if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ++ raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); ++ + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + rc = ext3_journal_dirty_metadata(handle, bh); + if (!err) +Index: linux-stage/fs/ext3/xattr.c +=================================================================== +--- linux-stage.orig/fs/ext3/xattr.c 2005-02-25 16:47:04.422975608 +0200 ++++ linux-stage/fs/ext3/xattr.c 2005-02-25 17:19:04.958009904 +0200 +@@ -149,17 +149,12 @@ + } + + /* +- * ext3_xattr_get() ++ * ext3_xattr_block_get() + * +- * Copy an extended attribute into the buffer +- * provided, or compute the buffer size required. +- * Buffer is NULL to compute the size of the buffer required. +- * +- * Returns a negative error number on failure, or the number of bytes +- * used / required on success. 
++ * routine looks for attribute in EA block and returns it's value and size + */ + int +-ext3_xattr_get(struct inode *inode, int name_index, const char *name, ++ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) + { + struct buffer_head *bh = NULL; +@@ -173,7 +168,6 @@ + + if (name == NULL) + return -EINVAL; +- down_read(&EXT3_I(inode)->xattr_sem); + error = -ENODATA; + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; +@@ -246,15 +240,87 @@ + + cleanup: + brelse(bh); +- up_read(&EXT3_I(inode)->xattr_sem); + + return error; + } + + /* +- * ext3_xattr_list() ++ * ext3_xattr_ibody_get() + * +- * Copy a list of attribute names into the buffer ++ * routine looks for attribute in inode body and returns it's value and size ++ */ ++int ++ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t buffer_size) ++{ ++ int size, name_len = strlen(name), storage_size; ++ struct ext3_xattr_entry *last; ++ struct ext3_inode *raw_inode; ++ struct ext3_iloc iloc; ++ char *start, *end; ++ int ret = -ENOENT; ++ ++ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) ++ return -ENOENT; ++ ++ ret = ext3_get_inode_loc(inode, &iloc, 1); ++ if (ret) ++ return ret; ++ raw_inode = ext3_raw_inode(&iloc); ++ ++ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - ++ EXT3_GOOD_OLD_INODE_SIZE - ++ EXT3_I(inode)->i_extra_isize - ++ sizeof(__u32); ++ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + ++ EXT3_I(inode)->i_extra_isize; ++ if (le32_to_cpu((*(__u32*) start)) != EXT3_XATTR_MAGIC) { ++ brelse(iloc.bh); ++ return -ENOENT; ++ } ++ start += sizeof(__u32); ++ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; ++ ++ last = (struct ext3_xattr_entry *) start; ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ if (le32_to_cpu(last->e_value_size) > storage_size || ++ (char *) next >= end) { ++ ext3_error(inode->i_sb, 
"ext3_xattr_ibody_get", ++ "inode %ld", inode->i_ino); ++ brelse(iloc.bh); ++ return -EIO; ++ } ++ if (name_index == last->e_name_index && ++ name_len == last->e_name_len && ++ !memcmp(name, last->e_name, name_len)) ++ goto found; ++ last = next; ++ } ++ ++ /* can't find EA */ ++ brelse(iloc.bh); ++ return -ENOENT; ++ ++found: ++ size = le32_to_cpu(last->e_value_size); ++ if (buffer) { ++ ret = -ERANGE; ++ if (buffer_size >= size) { ++ memcpy(buffer, start + le16_to_cpu(last->e_value_offs), ++ size); ++ ret = size; ++ } ++ } else ++ ret = size; ++ brelse(iloc.bh); ++ return ret; ++} ++ ++/* ++ * ext3_xattr_get() ++ * ++ * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * +@@ -262,7 +328,31 @@ + * used / required on success. + */ + int +-ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ++ext3_xattr_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t buffer_size) ++{ ++ int err; ++ ++ down_read(&EXT3_I(inode)->xattr_sem); ++ ++ /* try to find attribute in inode body */ ++ err = ext3_xattr_ibody_get(inode, name_index, name, ++ buffer, buffer_size); ++ if (err < 0) ++ /* search was unsuccessful, try to find EA in dedicated block */ ++ err = ext3_xattr_block_get(inode, name_index, name, ++ buffer, buffer_size); ++ up_read(&EXT3_I(inode)->xattr_sem); ++ ++ return err; ++} ++ ++/* ext3_xattr_ibody_list() ++ * ++ * generate list of attributes stored in EA block ++ */ ++int ++ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) + { + struct buffer_head *bh = NULL; + struct ext3_xattr_entry *entry; +@@ -273,7 +363,6 @@ + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + +- down_read(&EXT3_I(inode)->xattr_sem); + error = 0; + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; +@@ -330,11 +419,139 @@ + + cleanup: + brelse(bh); +- up_read(&EXT3_I(inode)->xattr_sem); + 
+ return error; + } + ++/* ext3_xattr_ibody_list() ++ * ++ * generate list of attributes stored in inode body ++ */ ++int ++ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) ++{ ++ struct ext3_xattr_entry *last; ++ struct ext3_inode *raw_inode; ++ char *start, *end, *buf; ++ struct ext3_iloc iloc; ++ int storage_size; ++ int ret; ++ int size = 0; ++ ++ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) ++ return 0; ++ ++ ret = ext3_get_inode_loc(inode, &iloc, 1); ++ if (ret) ++ return ret; ++ raw_inode = ext3_raw_inode(&iloc); ++ ++ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - ++ EXT3_GOOD_OLD_INODE_SIZE - ++ EXT3_I(inode)->i_extra_isize - ++ sizeof(__u32); ++ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + ++ EXT3_I(inode)->i_extra_isize; ++ if (le32_to_cpu((*(__u32*) start)) != EXT3_XATTR_MAGIC) { ++ brelse(iloc.bh); ++ return 0; ++ } ++ start += sizeof(__u32); ++ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; ++ ++ last = (struct ext3_xattr_entry *) start; ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ struct xattr_handler *handler; ++ if (le32_to_cpu(last->e_value_size) > storage_size || ++ (char *) next >= end) { ++ ext3_error(inode->i_sb, "ext3_xattr_ibody_list", ++ "inode %ld", inode->i_ino); ++ brelse(iloc.bh); ++ return -EIO; ++ } ++ handler = ext3_xattr_handler(last->e_name_index); ++ if (handler) ++ size = handler->list(inode, NULL, 0, last->e_name, ++ last->e_name_len); ++ ++ last = next; ++ } ++ ++ if (!buffer) { ++ ret = size; ++ goto cleanup; ++ } else { ++ ret = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ } ++ ++ last = (struct ext3_xattr_entry *) start; ++ buf = buffer; ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ struct xattr_handler *handler; ++ handler = ext3_xattr_handler(last->e_name_index); ++ if (handler) ++ buf += handler->list(inode, buf, 
(buffer+buffer_size)-buf, last->e_name, ++ last->e_name_len); ++ last = next; ++ } ++ ret = size; ++cleanup: ++ brelse(iloc.bh); ++ return ret; ++} ++ ++/* ++ * ext3_xattr_list() ++ * ++ * Copy a list of attribute names into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. ++ */ ++int ++ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ++{ ++ int error; ++ int size = buffer_size; ++ ++ down_read(&EXT3_I(inode)->xattr_sem); ++ ++ /* get list of attributes stored in inode body */ ++ error = ext3_xattr_ibody_list(inode, buffer, buffer_size); ++ if (error < 0) { ++ /* some error occured while collecting ++ * attributes in inode body */ ++ size = 0; ++ goto cleanup; ++ } ++ size = error; ++ ++ /* get list of attributes stored in dedicated block */ ++ if (buffer) { ++ buffer_size -= error; ++ if (buffer_size <= 0) { ++ buffer = NULL; ++ buffer_size = 0; ++ } else ++ buffer += error; ++ } ++ ++ error = ext3_xattr_block_list(inode, buffer, buffer_size); ++ if (error < 0) ++ /* listing was successful, so we return len */ ++ size = 0; ++ ++cleanup: ++ up_read(&EXT3_I(inode)->xattr_sem); ++ return error + size; ++} ++ + /* + * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. 
+@@ -356,6 +573,279 @@ + } + + /* ++ * ext3_xattr_ibody_find() ++ * ++ * search attribute and calculate free space in inode body ++ * NOTE: free space includes space our attribute hold ++ */ ++int ++ext3_xattr_ibody_find(struct inode *inode, int name_index, ++ const char *name, struct ext3_xattr_entry *rentry, int *free) ++{ ++ struct ext3_xattr_entry *last; ++ struct ext3_inode *raw_inode; ++ int name_len = strlen(name); ++ int err, storage_size; ++ struct ext3_iloc iloc; ++ char *start, *end; ++ int ret = -ENOENT; ++ ++ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) ++ return ret; ++ ++ err = ext3_get_inode_loc(inode, &iloc, 1); ++ if (err) ++ return -EIO; ++ raw_inode = ext3_raw_inode(&iloc); ++ ++ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - ++ EXT3_GOOD_OLD_INODE_SIZE - ++ EXT3_I(inode)->i_extra_isize - ++ sizeof(__u32); ++ *free = storage_size - sizeof(__u32); ++ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + ++ EXT3_I(inode)->i_extra_isize; ++ if (le32_to_cpu((*(__u32*) start)) != EXT3_XATTR_MAGIC) { ++ brelse(iloc.bh); ++ return -ENOENT; ++ } ++ start += sizeof(__u32); ++ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; ++ ++ last = (struct ext3_xattr_entry *) start; ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ if (le32_to_cpu(last->e_value_size) > storage_size || ++ (char *) next >= end) { ++ ext3_error(inode->i_sb, "ext3_xattr_ibody_find", ++ "inode %ld", inode->i_ino); ++ brelse(iloc.bh); ++ return -EIO; ++ } ++ ++ if (name_index == last->e_name_index && ++ name_len == last->e_name_len && ++ !memcmp(name, last->e_name, name_len)) { ++ memcpy(rentry, last, sizeof(struct ext3_xattr_entry)); ++ ret = 0; ++ } else { ++ *free -= EXT3_XATTR_LEN(last->e_name_len); ++ *free -= le32_to_cpu(last->e_value_size); ++ } ++ last = next; ++ } ++ ++ brelse(iloc.bh); ++ return ret; ++} ++ ++/* ++ * ext3_xattr_block_find() ++ * ++ * search attribute and calculate free space 
in EA block (if it allocated) ++ * NOTE: free space includes space our attribute hold ++ */ ++int ++ext3_xattr_block_find(struct inode *inode, int name_index, const char *name, ++ struct ext3_xattr_entry *rentry, int *free) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_entry *entry; ++ char *end; ++ int name_len, error = -ENOENT; ++ ++ if (!EXT3_I(inode)->i_file_acl) { ++ *free = inode->i_sb->s_blocksize - ++ sizeof(struct ext3_xattr_header) - ++ sizeof(__u32); ++ return -ENOENT; ++ } ++ ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl); ++ bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(inode->i_sb, "ext3_xattr_get", ++ "inode %ld: bad block %d", inode->i_ino, ++ EXT3_I(inode)->i_file_acl); ++ brelse(bh); ++ return -EIO; ++ } ++ /* find named attribute */ ++ name_len = strlen(name); ++ *free = bh->b_size - sizeof(__u32); ++ ++ entry = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (name_index == entry->e_name_index && ++ name_len == entry->e_name_len && ++ memcmp(name, entry->e_name, name_len) == 0) { ++ memcpy(rentry, entry, sizeof(struct ext3_xattr_entry)); ++ error = 0; ++ } else { ++ *free -= EXT3_XATTR_LEN(entry->e_name_len); ++ *free -= le32_to_cpu(entry->e_value_size); ++ } ++ entry = next; ++ } ++ brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * ext3_xattr_inode_set() ++ * ++ * this routine add/remove/replace attribute in inode body ++ */ ++int ++ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, int name_index, ++ const char *name, const void *value, size_t value_len, ++ int flags) ++{ ++ struct 
ext3_xattr_entry *last, *next, *here = NULL; ++ struct ext3_inode *raw_inode; ++ int name_len = strlen(name); ++ int esize = EXT3_XATTR_LEN(name_len); ++ struct buffer_head *bh; ++ int err, storage_size; ++ struct ext3_iloc iloc; ++ int free, min_offs; ++ char *start, *end; ++ ++ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) ++ return -ENOSPC; ++ ++ err = ext3_get_inode_loc(inode, &iloc, 1); ++ if (err) ++ return err; ++ raw_inode = ext3_raw_inode(&iloc); ++ bh = iloc.bh; ++ ++ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - ++ EXT3_GOOD_OLD_INODE_SIZE - ++ EXT3_I(inode)->i_extra_isize - ++ sizeof(__u32); ++ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + ++ EXT3_I(inode)->i_extra_isize; ++ if ((*(__u32*) start) != EXT3_XATTR_MAGIC) { ++ /* inode had no attributes before */ ++ *((__u32*) start) = cpu_to_le32(EXT3_XATTR_MAGIC); ++ } ++ start += sizeof(__u32); ++ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; ++ min_offs = storage_size; ++ free = storage_size - sizeof(__u32); ++ ++ last = (struct ext3_xattr_entry *) start; ++ while (!IS_LAST_ENTRY(last)) { ++ next = EXT3_XATTR_NEXT(last); ++ if (le32_to_cpu(last->e_value_size) > storage_size || ++ (char *) next >= end) { ++ ext3_error(inode->i_sb, "ext3_xattr_ibody_set", ++ "inode %ld", inode->i_ino); ++ brelse(bh); ++ return -EIO; ++ } ++ ++ if (last->e_value_size) { ++ int offs = le16_to_cpu(last->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ if (name_index == last->e_name_index && ++ name_len == last->e_name_len && ++ !memcmp(name, last->e_name, name_len)) ++ here = last; ++ else { ++ /* we calculate all but our attribute ++ * because it will be removed before changing */ ++ free -= EXT3_XATTR_LEN(last->e_name_len); ++ free -= le32_to_cpu(last->e_value_size); ++ } ++ last = next; ++ } ++ ++ if (value && (esize + value_len > free)) { ++ brelse(bh); ++ return -ENOSPC; ++ } ++ ++ err = ext3_reserve_inode_write(handle, inode, &iloc); ++ if (err) { 
++ brelse(bh); ++ return err; ++ } ++ ++ if (here) { ++ /* time to remove old value */ ++ struct ext3_xattr_entry *e; ++ int size = le32_to_cpu(here->e_value_size); ++ int border = le16_to_cpu(here->e_value_offs); ++ char *src; ++ ++ /* move tail */ ++ memmove(start + min_offs + size, start + min_offs, ++ border - min_offs); ++ ++ /* recalculate offsets */ ++ e = (struct ext3_xattr_entry *) start; ++ while (!IS_LAST_ENTRY(e)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(e); ++ int offs = le16_to_cpu(e->e_value_offs); ++ if (offs < border) ++ e->e_value_offs = ++ cpu_to_le16(offs + size); ++ e = next; ++ } ++ min_offs += size; ++ ++ /* remove entry */ ++ border = EXT3_XATTR_LEN(here->e_name_len); ++ src = (char *) here + EXT3_XATTR_LEN(here->e_name_len); ++ size = (char *) last - src; ++ if ((char *) here + size > end) ++ printk("ALERT at %s:%d: 0x%p + %d > 0x%p\n", ++ __FILE__, __LINE__, here, size, end); ++ memmove(here, src, size); ++ last = (struct ext3_xattr_entry *) ((char *) last - border); ++ *((__u32 *) last) = 0; ++ } ++ ++ if (value) { ++ int offs = min_offs - value_len; ++ /* use last to create new entry */ ++ last->e_name_len = strlen(name); ++ last->e_name_index = name_index; ++ last->e_value_offs = cpu_to_le16(offs); ++ last->e_value_size = cpu_to_le32(value_len); ++ last->e_hash = last->e_value_block = 0; ++ memset(last->e_name, 0, esize); ++ memcpy(last->e_name, name, last->e_name_len); ++ if (start + offs + value_len > end) ++ printk("ALERT at %s:%d: 0x%p + %d + %zd > 0x%p\n", ++ __FILE__, __LINE__, start, offs, ++ value_len, end); ++ memcpy(start + offs, value, value_len); ++ last = EXT3_XATTR_NEXT(last); ++ *((__u32 *) last) = 0; ++ } ++ ++ ext3_mark_iloc_dirty(handle, inode, &iloc); ++ brelse(bh); ++ ++ return 0; ++} ++ ++/* + * ext3_xattr_set_handle() + * + * Create, replace or remove an extended attribute for this inode. 
Buffer +@@ -369,6 +859,104 @@ + */ + int + ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, ++ const char *name, const void *value, size_t value_len, ++ int flags) ++{ ++ struct ext3_xattr_entry entry; ++ int err, where = 0, found = 0, total; ++ int free1 = -1, free2 = -1; ++ int name_len; ++ ++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", ++ name_index, name, value, (long)value_len); ++ ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) ++ return -EPERM; ++ if (value == NULL) ++ value_len = 0; ++ if (name == NULL) ++ return -EINVAL; ++ name_len = strlen(name); ++ if (name_len > 255 || value_len > inode->i_sb->s_blocksize) ++ return -ERANGE; ++ down_write(&EXT3_I(inode)->xattr_sem); ++ ++ /* try to find attribute in inode body */ ++ err = ext3_xattr_ibody_find(inode, name_index, name, &entry, &free1); ++ if (err == 0) { ++ /* found EA in inode */ ++ found = 1; ++ where = 0; ++ } else if (err == -ENOENT) { ++ /* there is no such attribute in inode body */ ++ /* try to find attribute in dedicated block */ ++ err = ext3_xattr_block_find(inode, name_index, name, ++ &entry, &free2); ++ if (err != 0 && err != -ENOENT) { ++ /* not found EA in block */ ++ goto finish; ++ } else if (err == 0) { ++ /* found EA in block */ ++ where = 1; ++ found = 1; ++ } ++ } else ++ goto finish; ++ ++ /* check flags: may replace? may create ? 
*/ ++ if (found && (flags & XATTR_CREATE)) { ++ err = -EEXIST; ++ goto finish; ++ } else if (!found && (flags & XATTR_REPLACE)) { ++ err = -ENODATA; ++ goto finish; ++ } ++ ++ /* check if we have enough space to store attribute */ ++ total = EXT3_XATTR_LEN(strlen(name)) + value_len; ++ if (free1 >= 0 && total > free1 && free2 >= 0 && total > free2) { ++ /* have no enough space */ ++ err = -ENOSPC; ++ goto finish; ++ } ++ ++ /* time to remove attribute */ ++ if (found) { ++ if (where == 0) { ++ /* EA is stored in inode body */ ++ ext3_xattr_ibody_set(handle, inode, name_index, name, ++ NULL, 0, flags); ++ } else { ++ /* EA is stored in separated block */ ++ ext3_xattr_block_set(handle, inode, name_index, name, ++ NULL, 0, flags); ++ } ++ } ++ ++ /* try to store EA in inode body */ ++ err = ext3_xattr_ibody_set(handle, inode, name_index, name, ++ value, value_len, flags); ++ if (err) { ++ /* can't store EA in inode body */ ++ /* try to store in block */ ++ err = ext3_xattr_block_set(handle, inode, name_index, ++ name, value, value_len, flags); ++ } ++ ++finish: ++ up_write(&EXT3_I(inode)->xattr_sem); ++ return err; ++} ++ ++/* ++ * ext3_xattr_block_set() ++ * ++ * this routine add/remove/replace attribute in EA block ++ */ ++int ++ext3_xattr_block_set(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len, + int flags) + { +@@ -391,22 +979,7 @@ + * towards the end of the block). + * end -- Points right after the block pointed to by header. 
+ */ +- +- ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", +- name_index, name, value, (long)value_len); +- +- if (IS_RDONLY(inode)) +- return -EROFS; +- if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) +- return -EPERM; +- if (value == NULL) +- value_len = 0; +- if (name == NULL) +- return -EINVAL; + name_len = strlen(name); +- if (name_len > 255 || value_len > sb->s_blocksize) +- return -ERANGE; +- down_write(&EXT3_I(inode)->xattr_sem); + if (EXT3_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bh = sb_bread(sb, EXT3_I(inode)->i_file_acl); +@@ -638,7 +1211,6 @@ + brelse(bh); + if (!(bh && header == HDR(bh))) + kfree(header); +- up_write(&EXT3_I(inode)->xattr_sem); + + return error; + } +Index: linux-stage/fs/ext3/xattr.h +=================================================================== +--- linux-stage.orig/fs/ext3/xattr.h 2005-02-25 16:47:04.423975456 +0200 ++++ linux-stage/fs/ext3/xattr.h 2005-02-25 16:50:40.763086912 +0200 +@@ -67,7 +67,8 @@ + extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); + extern int ext3_xattr_list(struct inode *, char *, size_t); + extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int); +-extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); ++extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *,const void *,size_t,int); ++extern int ext3_xattr_block_set(handle_t *, struct inode *, int, const char *,const void *,size_t,int); + + extern void ext3_xattr_delete_inode(handle_t *, struct inode *); + extern void ext3_xattr_put_super(struct super_block *); +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 16:47:04.425975152 +0200 ++++ linux-stage/include/linux/ext3_fs.h 2005-02-25 16:50:40.765086608 +0200 +@@ -293,6 +293,8 @@ + __u32 
m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ ++ __u16 i_extra_isize; ++ __u16 i_pad1; + }; + + #define i_size_high i_dir_acl +@@ -757,6 +759,7 @@ + extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); ++int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc, int in_mem); + + extern void ext3_read_inode (struct inode *); + extern int ext3_write_inode (struct inode *, int); +Index: linux-stage/include/linux/ext3_fs_i.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_i.h 2005-02-25 16:47:04.426975000 +0200 ++++ linux-stage/include/linux/ext3_fs_i.h 2005-02-25 16:50:40.766086456 +0200 +@@ -113,6 +113,9 @@ + */ + loff_t i_disksize; + ++ /* on-disk additional length */ ++ __u16 i_extra_isize; ++ + /* + * truncate_sem is for serialising ext3_truncate() against + * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's diff --git a/lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch b/lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch new file mode 100644 index 0000000..78c5d81 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch @@ -0,0 +1,2831 @@ +Index: linux-stage/fs/ext3/extents.c +=================================================================== +--- linux-stage.orig/fs/ext3/extents.c 2005-02-25 15:33:48.890198160 +0200 ++++ linux-stage/fs/ext3/extents.c 2005-02-25 15:33:48.917194056 +0200 +@@ -0,0 +1,2313 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++/* ++ * Extents support for EXT3 ++ * ++ * TODO: ++ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() ++ * - ext3_ext_calc_credits() could take 'mergable' into account ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - smart tree reduction ++ * - arch-independence ++ * common on-disk format for big/little-endian arch ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} ++ ++static int inline ++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->get_write_access) ++ return tree->ops->get_write_access(h,tree->buffer); ++ else ++ return 0; ++} ++ ++static int inline ++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->ops->mark_buffer_dirty) ++ return tree->ops->mark_buffer_dirty(h,tree->buffer); ++ else ++ return 0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ ++ if (path->p_bh) { ++ /* path points to block */ ++ 
err = ext3_journal_get_write_access(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_get_access_for_root(handle, tree); ++ } ++ return err; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ if (path->p_bh) { ++ /* path points to block */ ++ err =ext3_journal_dirty_metadata(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_mark_root_dirty(handle, tree); ++ } ++ return err; ++} ++ ++static int inline ++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, struct ext3_extent *ex, ++ int *err) ++{ ++ int goal, depth, newblock; ++ struct inode *inode; ++ ++ EXT_ASSERT(tree); ++ if (tree->ops->new_block) ++ return tree->ops->new_block(handle, tree, path, ex, err); ++ ++ inode = tree->inode; ++ depth = EXT_DEPTH(tree); ++ if (path && depth > 0) { ++ goal = path[depth-1].p_block; ++ } else { ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ ++ bg_start = (ei->i_block_group * ++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ goal = bg_start + colour; ++ } ++ ++ newblock = ext3_new_block(handle, inode, goal, err); ++ return newblock; ++} ++ ++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *neh; ++ neh = EXT_ROOT_HDR(tree); ++ neh->eh_generation++; ++} ++ ++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; ++#endif ++ return size; ++} ++ ++static 
inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 5; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int k, l = path->p_depth; ++ ++ ext_debug(tree, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(tree, " %d->%d", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ } else if (path->p_ext) { ++ ext_debug(tree, " %d:%d:%d", ++ path->p_ext->ee_block, ++ path->p_ext->ee_len, ++ path->p_ext->ee_start); ++ } else ++ ext_debug(tree, " []"); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; ++ int i; ++ ++ if (!path) ++ return; ++ ++ eh = path[depth].p_hdr; ++ ex = EXT_FIRST_EXTENT(eh); ++ ++ for (i = 0; i < eh->eh_entries; i++, ex++) { ++ ext_debug(tree, "%d:%d:%d ", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_drop_refs(struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) ++ if (path->p_bh) { 
++ brelse(path->p_bh); ++ path->p_bh = NULL; ++ } ++} ++ ++/* ++ * binary search for closest index by given block ++ */ ++static inline void ++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent_idx *ix; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_entries > 0); ++ ++ ext_debug(tree, "binsearch for %d(idx): ", block); ++ ++ path->p_idx = ix = EXT_FIRST_INDEX(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ix[l + k].ei_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ix += l; ++ path->p_idx = ix; ++ ext_debug(tree, " -> %d->%d ", path->p_idx->ei_block, path->p_idx->ei_leaf); ++ ++ while (l++ < r) { ++ if (block < ix->ei_block) ++ break; ++ path->p_idx = ix++; ++ } ++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block, ++ path->p_idx->ei_leaf); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent_idx *chix; ++ ++ chix = ix = EXT_FIRST_INDEX(eh); ++ for (k = 0; k < eh->eh_entries; k++, ix++) { ++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) { ++ printk("k=%d, ix=0x%p, first=0x%p\n", k, ++ ix, EXT_FIRST_INDEX(eh)); ++ printk("%u <= %u\n", ++ ix->ei_block,ix[-1].ei_block); ++ } ++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block); ++ if (block < ix->ei_block) ++ break; ++ chix = ix; ++ } ++ EXT_ASSERT(chix == path->p_idx); ++ } ++#endif ++ ++} ++ ++/* ++ * binary search for closest extent by given block ++ */ ++static inline void ++ext3_ext_binsearch(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent *ex; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ ++ if (eh->eh_entries == 0) { ++ /* ++ * this leaf is 
empty yet: ++ * we get such a leaf in split/add case ++ */ ++ return; ++ } ++ ++ ext_debug(tree, "binsearch for %d: ", block); ++ ++ path->p_ext = ex = EXT_FIRST_EXTENT(eh); ++ ++ r = k = eh->eh_entries; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ex[l + k].ee_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ex += l; ++ path->p_ext = ex; ++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++ while (l++ < r) { ++ if (block < ex->ee_block) ++ break; ++ path->p_ext = ex++; ++ } ++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block, ++ path->p_ext->ee_start, path->p_ext->ee_len); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent *chex; ++ ++ chex = ex = EXT_FIRST_EXTENT(eh); ++ for (k = 0; k < eh->eh_entries; k++, ex++) { ++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block); ++ if (block < ex->ee_block) ++ break; ++ chex = ex; ++ } ++ EXT_ASSERT(chex == path->p_ext); ++ } ++#endif ++ ++} ++ ++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *eh; ++ ++ BUG_ON(tree->buffer_len == 0); ++ ext3_ext_get_access_for_root(handle, tree); ++ eh = EXT_ROOT_HDR(tree); ++ eh->eh_depth = 0; ++ eh->eh_entries = 0; ++ eh->eh_magic = EXT3_EXT_MAGIC; ++ eh->eh_max = ext3_ext_space_root(tree); ++ ext3_ext_mark_root_dirty(handle, tree); ++ ext3_ext_invalidate_cache(tree); ++ return 0; ++} ++ ++struct ext3_ext_path * ++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ struct buffer_head *bh; ++ int depth, i, ppos = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ eh = EXT_ROOT_HDR(tree); ++ EXT_ASSERT(eh); ++ i = depth = EXT_DEPTH(tree); ++ EXT_ASSERT(eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(i == 0 || eh->eh_entries > 0); ++ ++ /* account possible depth 
increase */ ++ if (!path) { ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), ++ GFP_NOFS); ++ if (!path) ++ return ERR_PTR(-ENOMEM); ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[0].p_hdr = eh; ++ ++ /* walk through the tree */ ++ while (i) { ++ ext_debug(tree, "depth %d: num %d, max %d\n", ++ ppos, eh->eh_entries, eh->eh_max); ++ ext3_ext_binsearch_idx(tree, path + ppos, block); ++ path[ppos].p_block = path[ppos].p_idx->ei_leaf; ++ path[ppos].p_depth = i; ++ path[ppos].p_ext = NULL; ++ ++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); ++ if (!bh) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ return ERR_PTR(-EIO); ++ } ++ eh = EXT_BLOCK_HDR(bh); ++ ppos++; ++ EXT_ASSERT(ppos <= depth); ++ path[ppos].p_bh = bh; ++ path[ppos].p_hdr = eh; ++ i--; ++ } ++ ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ ++ /* find extent */ ++ ext3_ext_binsearch(tree, path + ppos, block); ++ ++ ext3_ext_show_path(tree, path); ++ ++ return path; ++} ++ ++/* ++ * insert new index [logical;ptr] into the block at cupr ++ * it check where to insert: before curp or after curp ++ */ ++static int ext3_ext_insert_index(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *curp, ++ int logical, int ptr) ++{ ++ struct ext3_extent_idx *ix; ++ int len, err; ++ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ return err; ++ ++ EXT_ASSERT(logical != curp->p_idx->ei_block); ++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; ++ if (logical > curp->p_idx->ei_block) { ++ /* insert after */ ++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d after: %d. 
" ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ } ++ ix = curp->p_idx + 1; ++ } else { ++ /* insert before */ ++ len = len * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d before: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->ei_block = logical; ++ ix->ei_leaf = ptr; ++ curp->p_hdr->eh_entries++; ++ ++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max); ++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); ++ ++ err = ext3_ext_dirty(handle, tree, curp); ++ ext3_std_error(tree->inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at: ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extens and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ int i = at, k, m, a; ++ unsigned long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? 
*/ ++ /* FIXME: now desicion is simplest: at current extent */ ++ ++ /* if current leaf will be splitted, then we should use ++ * border from split point */ ++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].ee_block; ++ ext_debug(tree, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->ee_block; ++ ext_debug(tree, "leaf will be added." ++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. so, index won't ++ * be inserted and tree will be in consistent ++ * state. next mount will repair buffers too ++ */ ++ ++ /* ++ * get array to track all allocated blocks ++ * we need this to handle errors and free blocks ++ * upon them ++ */ ++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); ++ if (!ablocks) ++ return -ENOMEM; ++ memset(ablocks, 0, sizeof(unsigned long) * depth); ++ ++ /* allocate all needed blocks */ ++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); ++ for (a = 0; a < depth - at; a++) { ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ goto cleanup; ++ ablocks[a] = newblock; ++ } ++ ++ /* initialize new leaf */ ++ newblock = ablocks[--a]; ++ EXT_ASSERT(newblock); ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 0; ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_depth = 0; ++ ex = EXT_FIRST_EXTENT(neh); ++ ++ /* move remain of path[depth] to the new leaf */ ++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max); ++ /* start copy from next extent */ ++ /* TODO: we 
could do it by single memmove */ ++ m = 0; ++ path[depth].p_ext++; ++ while (path[depth].p_ext <= ++ EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", ++ path[depth].p_ext->ee_block, ++ path[depth].p_ext->ee_start, ++ path[depth].p_ext->ee_len, ++ newblock); ++ memmove(ex++, path[depth].p_ext++, ++ sizeof(struct ext3_extent)); ++ neh->eh_entries++; ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old leaf */ ++ if (m) { ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ path[depth].p_hdr->eh_entries -= m; ++ if ((err = ext3_ext_dirty(handle, tree, path + depth))) ++ goto cleanup; ++ ++ } ++ ++ /* create intermediate indexes */ ++ k = depth - at - 1; ++ EXT_ASSERT(k >= 0); ++ if (k) ++ ext_debug(tree, "create %d intermediate indices\n", k); ++ /* insert new index into current index block */ ++ /* current depth stored in i var */ ++ i = depth - 1; ++ while (k--) { ++ oldblock = newblock; ++ newblock = ablocks[--a]; ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->eh_entries = 1; ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ neh->eh_depth = depth - i; ++ fidx = EXT_FIRST_INDEX(neh); ++ fidx->ei_block = border; ++ fidx->ei_leaf = oldblock; ++ ++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", ++ i, newblock, border, oldblock); ++ /* copy indexes */ ++ m = 0; ++ path[i].p_idx++; ++ ++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); ++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == ++ EXT_LAST_INDEX(path[i].p_hdr)); ++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ++ 
ext_debug(tree, "%d: move %d:%d in new index %lu\n", ++ i, path[i].p_idx->ei_block, ++ path[i].p_idx->ei_leaf, newblock); ++ memmove(++fidx, path[i].p_idx++, ++ sizeof(struct ext3_extent_idx)); ++ neh->eh_entries++; ++ EXT_ASSERT(neh->eh_entries <= neh->eh_max); ++ m++; ++ } ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old index */ ++ if (m) { ++ err = ext3_ext_get_access(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ path[i].p_hdr->eh_entries -= m; ++ err = ext3_ext_dirty(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ } ++ ++ i--; ++ } ++ ++ /* insert new index */ ++ if (!err) ++ err = ext3_ext_insert_index(handle, tree, path + at, ++ border, newblock); ++ ++cleanup: ++ if (bh) { ++ if (buffer_locked(bh)) ++ unlock_buffer(bh); ++ brelse(bh); ++ } ++ ++ if (err) { ++ /* free all allocated blocks in error case */ ++ for (i = 0; i < depth; i++) { ++ if (!ablocks[i]) ++ continue; ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ } ++ } ++ kfree(ablocks); ++ ++ return err; ++} ++ ++/* ++ * routine implements tree growing procedure: ++ * - allocates new block ++ * - moves top-level data (index block or leaf) into the new block ++ * - initialize new top-level, creating index that points to the ++ * just created block ++ */ ++static int ext3_ext_grow_indepth(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp = path; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct buffer_head *bh; ++ unsigned long newblock; ++ int err = 0; ++ ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ return err; ++ ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ ext3_std_error(tree->inode->i_sb, err); ++ return err; ++ } ++ lock_buffer(bh); ++ 
++ if ((err = ext3_journal_get_create_access(handle, bh))) { ++ unlock_buffer(bh); ++ goto out; ++ } ++ ++ /* move top-level index/leaf into new block */ ++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); ++ ++ /* set size of new block */ ++ neh = EXT_BLOCK_HDR(bh); ++ /* old root could have indexes or leaves ++ * so calculate e_max right way */ ++ if (EXT_DEPTH(tree)) ++ neh->eh_max = ext3_ext_space_block_idx(tree); ++ else ++ neh->eh_max = ext3_ext_space_block(tree); ++ neh->eh_magic = EXT3_EXT_MAGIC; ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto out; ++ ++ /* create index in new top-level index: num,max,pointer */ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ goto out; ++ ++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC; ++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree); ++ curp->p_hdr->eh_entries = 1; ++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); ++ /* FIXME: it works, but actually path[0] can be index */ ++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; ++ curp->p_idx->ei_leaf = newblock; ++ ++ neh = EXT_ROOT_HDR(tree); ++ fidx = EXT_FIRST_INDEX(neh); ++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", ++ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); ++ ++ neh->eh_depth = path->p_depth + 1; ++ err = ext3_ext_dirty(handle, tree, curp); ++out: ++ brelse(bh); ++ ++ return err; ++} ++ ++/* ++ * routine finds empty index and adds new leaf. 
if no free index found ++ * then it requests in-depth growing ++ */ ++static int ext3_ext_create_new_leaf(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp; ++ int depth, i, err = 0; ++ ++repeat: ++ i = depth = EXT_DEPTH(tree); ++ ++ /* walk up to the tree and look for free index entry */ ++ curp = path + depth; ++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { ++ i--; ++ curp--; ++ } ++ ++ /* we use already allocated block for index block ++ * so, subsequent data blocks should be contigoues */ ++ if (EXT_HAS_FREE_INDEX(curp)) { ++ /* if we found index with free entry, then use that ++ * entry: create all needed subtree and add new leaf */ ++ err = ext3_ext_split(handle, tree, path, newext, i); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ } else { ++ /* tree is full, time to grow in depth */ ++ err = ext3_ext_grow_indepth(handle, tree, path, newext); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->ee_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ ++ /* ++ * only first (depth 0 -> 1) produces free space ++ * in all other cases we have to split growed tree ++ */ ++ depth = EXT_DEPTH(tree); ++ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { ++ /* now we need split */ ++ goto repeat; ++ } ++ } ++ ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++/* ++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK ++ * NOTE: it consider block number from index entry as ++ * allocated block. 
thus, index entries have to be consistent ++ * with leafs ++ */ ++static unsigned long ++ext3_ext_next_allocated_block(struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ if (depth == 0 && path->p_ext == NULL) ++ return EXT_MAX_BLOCK; ++ ++ /* FIXME: what if index isn't full ?! */ ++ while (depth >= 0) { ++ if (depth == path->p_depth) { ++ /* leaf */ ++ if (path[depth].p_ext != ++ EXT_LAST_EXTENT(path[depth].p_hdr)) ++ return path[depth].p_ext[1].ee_block; ++ } else { ++ /* index */ ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ } ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * returns first allocated block from next leaf or EXT_MAX_BLOCK ++ */ ++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ /* zero-tree has no leaf blocks at all */ ++ if (depth == 0) ++ return EXT_MAX_BLOCK; ++ ++ /* go to index block */ ++ depth--; ++ ++ while (depth >= 0) { ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].ei_block; ++ depth--; ++ } ++ ++ return EXT_MAX_BLOCK; ++} ++ ++/* ++ * if leaf gets modified and modified extent is first in the leaf ++ * then we have to correct all indexes above ++ * TODO: do we need to correct tree in all cases? 
++ */ ++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent *ex; ++ unsigned long border; ++ int k, err = 0; ++ ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(eh); ++ ++ if (depth == 0) { ++ /* there is no tree at all */ ++ return 0; ++ } ++ ++ if (ex != EXT_FIRST_EXTENT(eh)) { ++ /* we correct tree if first leaf got modified only */ ++ return 0; ++ } ++ ++ /* ++ * TODO: we need correction if border is smaller then current one ++ */ ++ k = depth - 1; ++ border = path[depth].p_ext->ee_block; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ return err; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ return err; ++ ++ while (k--) { ++ /* change all left-side indexes */ ++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) ++ break; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ break; ++ path[k].p_idx->ei_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ break; ++ } ++ ++ return err; ++} ++ ++static int inline ++ext3_can_extents_be_merged(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block) ++ return 0; ++ ++#ifdef AGRESSIVE_TEST ++ if (ex1->ee_len >= 4) ++ return 0; ++#endif ++ ++ if (!tree->ops->mergable) ++ return 1; ++ ++ return tree->ops->mergable(ex1, ex2); ++} ++ ++/* ++ * this routine tries to merge requsted extent into the existing ++ * extent or inserts requested extent as new one into the tree, ++ * creating new leaf in no-space case ++ */ ++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_extent_header * eh; ++ struct ext3_extent *ex, *fex; ++ struct ext3_extent 
*nearex; /* nearest extent */ ++ struct ext3_ext_path *npath = NULL; ++ int depth, len, err, next; ++ ++ EXT_ASSERT(newext->ee_len > 0); ++ EXT_ASSERT(newext->ee_len < EXT_CACHE_MARK); ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(path[depth].p_hdr); ++ ++ /* try to insert block into found extent and return */ ++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { ++ ext_debug(tree, "append %d block to %d:%d (from %d)\n", ++ newext->ee_len, ex->ee_block, ex->ee_len, ++ ex->ee_start); ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ return err; ++ ex->ee_len += newext->ee_len; ++ eh = path[depth].p_hdr; ++ nearex = ex; ++ goto merge; ++ } ++ ++repeat: ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) ++ goto has_space; ++ ++ /* probably next leaf has space for us? */ ++ fex = EXT_LAST_EXTENT(eh); ++ next = ext3_ext_next_leaf_block(tree, path); ++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) { ++ ext_debug(tree, "next leaf block - %d\n", next); ++ EXT_ASSERT(!npath); ++ npath = ext3_ext_find_extent(tree, next, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ EXT_ASSERT(npath->p_depth == path->p_depth); ++ eh = npath[depth].p_hdr; ++ if (eh->eh_entries < eh->eh_max) { ++ ext_debug(tree, "next leaf isnt full(%d)\n", ++ eh->eh_entries); ++ path = npath; ++ goto repeat; ++ } ++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", ++ eh->eh_entries, eh->eh_max); ++ } ++ ++ /* ++ * there is no free space in found leaf ++ * we're gonna add new leaf in the tree ++ */ ++ err = ext3_ext_create_new_leaf(handle, tree, path, newext); ++ if (err) ++ goto cleanup; ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ ++has_space: ++ nearex = path[depth].p_ext; ++ ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ ++ if (!nearex) { ++ /* there is no extent in this leaf, create first one */ ++ ext_debug(tree, "first extent in the 
leaf: %d:%d:%d\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len); ++ path[depth].p_ext = EXT_FIRST_EXTENT(eh); ++ } else if (newext->ee_block > nearex->ee_block) { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ if (nearex != EXT_LAST_EXTENT(eh)) { ++ len = EXT_MAX_EXTENT(eh) - nearex; ++ len = (len - 1) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, ++ newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 2, nearex + 1, len); ++ } ++ path[depth].p_ext = nearex + 1; ++ } else { ++ EXT_ASSERT(newext->ee_block != nearex->ee_block); ++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->ee_block, newext->ee_start, newext->ee_len, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 1, nearex, len); ++ path[depth].p_ext = nearex; ++ } ++ ++ eh->eh_entries++; ++ nearex = path[depth].p_ext; ++ nearex->ee_block = newext->ee_block; ++ nearex->ee_start = newext->ee_start; ++ nearex->ee_len = newext->ee_len; ++ /* FIXME: support for large fs */ ++ nearex->ee_start_hi = 0; ++ ++merge: ++ /* try to merge extents to the right */ ++ while (nearex < EXT_LAST_EXTENT(eh)) { ++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) ++ break; ++ /* merge with next extent! 
*/ ++ nearex->ee_len += nearex[1].ee_len; ++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { ++ len = (EXT_LAST_EXTENT(eh) - nearex - 1) ++ * sizeof(struct ext3_extent); ++ memmove(nearex + 1, nearex + 2, len); ++ } ++ eh->eh_entries--; ++ EXT_ASSERT(eh->eh_entries > 0); ++ } ++ ++ /* try to merge extents to the left */ ++ ++ /* time to correct all indexes above */ ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ if (err) ++ goto cleanup; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ ++cleanup: ++ if (npath) { ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ } ++ ext3_ext_tree_changed(tree); ++ ext3_ext_invalidate_cache(tree); ++ return err; ++} ++ ++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, ++ unsigned long num, ext_prepare_callback func) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent *ex, cbex; ++ unsigned long next, start = 0, end = 0; ++ unsigned long last = block + num; ++ int depth, exists, err = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(func); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ while (block < last && block != EXT_MAX_BLOCK) { ++ num = last - block; ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(tree, block, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ break; ++ } ++ ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(path[depth].p_hdr); ++ ex = path[depth].p_ext; ++ next = ext3_ext_next_allocated_block(path); ++ ++ exists = 0; ++ if (!ex) { ++ /* there is no extent yet, so try to allocate ++ * all requested space */ ++ start = block; ++ end = block + num; ++ } else if (ex->ee_block > block) { ++ /* need to allocate space before found extent */ ++ start = block; ++ end = ex->ee_block; ++ if (block + num < end) ++ end = block + num; ++ } else if (block >= ex->ee_block + ex->ee_len) { ++ /* need to allocate space after found extent */ ++ start = block; ++ end = block + num; ++ if (end >= next) ++ end = next; ++ } else if (block >= 
ex->ee_block) { ++ /* ++ * some part of requested space is covered ++ * by found extent ++ */ ++ start = block; ++ end = ex->ee_block + ex->ee_len; ++ if (block + num < end) ++ end = block + num; ++ exists = 1; ++ } else { ++ BUG(); ++ } ++ EXT_ASSERT(end > start); ++ ++ if (!exists) { ++ cbex.ee_block = start; ++ cbex.ee_len = end - start; ++ cbex.ee_start = 0; ++ } else ++ cbex = *ex; ++ ++ EXT_ASSERT(path[depth].p_hdr); ++ err = func(tree, path, &cbex, exists); ++ ext3_ext_drop_refs(path); ++ ++ if (err < 0) ++ break; ++ if (err == EXT_REPEAT) ++ continue; ++ else if (err == EXT_BREAK) { ++ err = 0; ++ break; ++ } ++ ++ if (EXT_DEPTH(tree) != depth) { ++ /* depth was changed. we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ block = cbex.ee_block + cbex.ee_len; ++ } ++ ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ return err; ++} ++ ++static inline void ++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block, ++ __u32 len, __u32 start, int type) ++{ ++ EXT_ASSERT(len > 0); ++ if (tree->cex) { ++ tree->cex->ec_type = type; ++ tree->cex->ec_block = block; ++ tree->cex->ec_len = len; ++ tree->cex->ec_start = start; ++ } ++} ++ ++/* ++ * this routine calculate boundaries of the gap requested block fits into ++ * and cache this gap ++ */ ++static inline void ++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ unsigned long block) ++{ ++ int depth = EXT_DEPTH(tree); ++ unsigned long lblock, len; ++ struct ext3_extent *ex; ++ ++ if (!tree->cex) ++ return; ++ ++ ex = path[depth].p_ext; ++ if (ex == NULL) { ++ /* there is no extent yet, so gap is [0;-] */ ++ lblock = 0; ++ len = EXT_MAX_BLOCK; ++ ext_debug(tree, "cache gap(whole file):"); ++ } else if (block < ex->ee_block) { ++ lblock = block; ++ len = ex->ee_block - block; ++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len); ++ } 
else if (block >= ex->ee_block + ex->ee_len) { ++ lblock = ex->ee_block + ex->ee_len; ++ len = ext3_ext_next_allocated_block(path); ++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) block); ++ EXT_ASSERT(len > lblock); ++ len = len - lblock; ++ } else { ++ lblock = len = 0; ++ BUG(); ++ } ++ ++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len); ++ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP); ++} ++ ++static inline int ++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, ++ struct ext3_extent *ex) ++{ ++ struct ext3_ext_cache *cex = tree->cex; ++ ++ /* is there cache storage at all? */ ++ if (!cex) ++ return EXT3_EXT_CACHE_NO; ++ ++ /* has cache valid data? */ ++ if (cex->ec_type == EXT3_EXT_CACHE_NO) ++ return EXT3_EXT_CACHE_NO; ++ ++ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP || ++ cex->ec_type == EXT3_EXT_CACHE_EXTENT); ++ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { ++ ex->ee_block = cex->ec_block; ++ ex->ee_start = cex->ec_start; ++ ex->ee_len = cex->ec_len; ++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", ++ (unsigned long) block, ++ (unsigned long) ex->ee_block, ++ (unsigned long) ex->ee_len, ++ (unsigned long) ex->ee_start); ++ return cex->ec_type; ++ } ++ ++ /* not in cache */ ++ return EXT3_EXT_CACHE_NO; ++} ++ ++/* ++ * routine removes index from the index block ++ * it's used in truncate case only. 
thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->eh_entries); ++ if ((err = ext3_ext_get_access(handle, tree, path))) ++ return err; ++ path->p_hdr->eh_entries--; ++ if ((err = ext3_ext_dirty(handle, tree, path))) ++ return err; ++ ext_debug(tree, "index is empty, remove it, free block %d\n", ++ path->p_idx->ei_leaf); ++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ return err; ++} ++ ++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth = EXT_DEPTH(tree); ++ int needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max) ++ return 1; ++ } ++ ++ /* ++ * the worst case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ depth = depth + 1; ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. 
we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ return needed; ++} ++ ++static int ++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, tex; ++ struct ext3_ext_path *npath; ++ int depth, creds, err; ++ ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1); ++ EXT_ASSERT(ex->ee_block < start); ++ ++ /* calculate tail extent */ ++ tex.ee_block = end + 1; ++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len); ++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block; ++ ++ creds = ext3_ext_calc_credits_for_insert(tree, path); ++ handle = ext3_ext_journal_restart(handle, creds); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ /* calculate head extent. use primary extent */ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ return err; ++ ex->ee_len = start - ex->ee_block; ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ return err; ++ ++ /* FIXME: some callback to free underlying resource ++ * and correct ee_start? 
*/ ++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", ++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len); ++ ++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block); ++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len); ++ ++ err = ext3_ext_insert_extent(handle, tree, npath, &tex); ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ ++ return err; ++ ++} ++ ++static int ++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, *fu = NULL, *lu, *le; ++ int err = 0, correct_index = 0; ++ int depth = EXT_DEPTH(tree), credits; ++ struct ext3_extent_header *eh; ++ unsigned a, b, block, num; ++ ++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); ++ if (!path[depth].p_hdr) ++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); ++ eh = path[depth].p_hdr; ++ EXT_ASSERT(eh); ++ EXT_ASSERT(eh->eh_entries <= eh->eh_max); ++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC); ++ ++ /* find where to start removing */ ++ le = ex = EXT_LAST_EXTENT(eh); ++ while (ex != EXT_FIRST_EXTENT(eh)) { ++ if (ex->ee_block <= end) ++ break; ++ ex--; ++ } ++ ++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) { ++ /* removal of internal part of the extent requested ++ * tail and head must be placed in different extent ++ * so, we have to insert one more extent */ ++ path[depth].p_ext = ex; ++ return ext3_ext_split_for_rm(handle, tree, path, start, end); ++ } ++ ++ lu = ex; ++ while (ex >= EXT_FIRST_EXTENT(eh) && ++ ex->ee_block + ex->ee_len > start) { ++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len); ++ path[depth].p_ext = ex; ++ ++ a = ex->ee_block > start ? ex->ee_block : start; ++ b = ex->ee_block + ex->ee_len - 1 < end ? 
++ ex->ee_block + ex->ee_len - 1 : end; ++ ++ ext_debug(tree, " border %u:%u\n", a, b); ++ ++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) { ++ block = 0; ++ num = 0; ++ BUG(); ++ } else if (a != ex->ee_block) { ++ /* remove tail of the extent */ ++ block = ex->ee_block; ++ num = a - block; ++ } else if (b != ex->ee_block + ex->ee_len - 1) { ++ /* remove head of the extent */ ++ block = a; ++ num = b - a; ++ } else { ++ /* remove whole extent: excellent! */ ++ block = ex->ee_block; ++ num = 0; ++ EXT_ASSERT(a == ex->ee_block && ++ b == ex->ee_block + ex->ee_len - 1); ++ } ++ ++ if (ex == EXT_FIRST_EXTENT(eh)) ++ correct_index = 1; ++ ++ credits = 1; ++ if (correct_index) ++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; ++ if (tree->ops->remove_extent_credits) ++ credits+=tree->ops->remove_extent_credits(tree,ex,a,b); ++ ++ handle = ext3_ext_journal_restart(handle, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ goto out; ++ } ++ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ if (tree->ops->remove_extent) ++ err = tree->ops->remove_extent(tree, ex, a, b); ++ if (err) ++ goto out; ++ ++ if (num == 0) { ++ /* this extent is removed entirely: mark slot unused */ ++ ex->ee_start = 0; ++ eh->eh_entries--; ++ fu = ex; ++ } ++ ++ ex->ee_block = block; ++ ex->ee_len = num; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ ext_debug(tree, "new extent: %u:%u:%u\n", ++ ex->ee_block, ex->ee_len, ex->ee_start); ++ ex--; ++ } ++ ++ if (fu) { ++ /* reuse unused slots */ ++ while (lu < le) { ++ if (lu->ee_start) { ++ *fu = *lu; ++ lu->ee_start = 0; ++ fu++; ++ } ++ lu++; ++ } ++ } ++ ++ if (correct_index && eh->eh_entries) ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ ++ /* if this leaf is free, then we should ++ * remove it from index block above */ ++ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) ++ err = ext3_ext_rm_idx(handle, tree, 
path + depth); ++ ++out: ++ return err; ++} ++ ++ ++static struct ext3_extent_idx * ++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) ++{ ++ struct ext3_extent_idx *ix; ++ ++ ix = EXT_LAST_INDEX(hdr); ++ while (ix != EXT_FIRST_INDEX(hdr)) { ++ if (ix->ei_block <= block) ++ break; ++ ix--; ++ } ++ return ix; ++} ++ ++/* ++ * returns 1 if current index has to be freed (even partially) ++ */ ++static int inline ++ext3_ext_more_to_rm(struct ext3_ext_path *path) ++{ ++ EXT_ASSERT(path->p_idx); ++ ++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) ++ return 0; ++ ++ /* ++ * if truncate on deeper level happened, it wasn't partial ++ * so we have to consider current index for truncation ++ */ ++ if (path->p_hdr->eh_entries == path->p_block) ++ return 0; ++ return 1; ++} ++ ++int ext3_ext_remove_space(struct ext3_extents_tree *tree, ++ unsigned long start, unsigned long end) ++{ ++ struct inode *inode = tree->inode; ++ struct super_block *sb = inode->i_sb; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_ext_path *path; ++ handle_t *handle; ++ int i = 0, err = 0; ++ ++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); ++ ++ /* probably first extent we're gonna free will be last in block */ ++ handle = ext3_journal_start(inode, depth + 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ ext3_ext_invalidate_cache(tree); ++ ++ /* ++ * we start scanning from right side freeing all the blocks ++ * after i_size and walking into the deep ++ */ ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); ++ if (!path) { /* kmalloc() returns NULL on failure, never an ERR_PTR */ ++ ext3_error(sb, "ext3_ext_remove_space", ++ "Can't allocate path array"); ++ ext3_journal_stop(handle); ++ return -ENOMEM; ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[i].p_hdr = EXT_ROOT_HDR(tree); ++ ++ while (i >= 0 && err == 0) { ++ if (i == depth) { ++ /* this is leaf block */ ++ err = ext3_ext_rm_leaf(handle, tree, path, start, end); ++ /* root level have p_bh 
== NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ continue; ++ } ++ ++ /* this is index block */ ++ if (!path[i].p_hdr) { ++ ext_debug(tree, "initialize header\n"); ++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); ++ } ++ ++ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max); ++ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC); ++ ++ if (!path[i].p_idx) { ++ /* this level hasn't been touched yet */ ++ path[i].p_idx = ++ ext3_ext_last_covered(path[i].p_hdr, end); ++ path[i].p_block = path[i].p_hdr->eh_entries + 1; ++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", ++ path[i].p_hdr, path[i].p_hdr->eh_entries); ++ } else { ++ /* we've already been here, look at the next index */ ++ path[i].p_idx--; ++ } ++ ++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", ++ i, EXT_FIRST_INDEX(path[i].p_hdr), ++ path[i].p_idx); ++ if (ext3_ext_more_to_rm(path + i)) { ++ /* go to the next level */ ++ ext_debug(tree, "move to level %d (block %d)\n", ++ i + 1, path[i].p_idx->ei_leaf); ++ memset(path + i + 1, 0, sizeof(*path)); ++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf); ++ if (!path[i+1].p_bh) { ++ /* should we reset i_size? 
*/ ++ err = -EIO; ++ break; ++ } ++ /* put actual number of indexes to know is this ++ * number got changed at the next iteration */ ++ path[i].p_block = path[i].p_hdr->eh_entries; ++ i++; ++ } else { ++ /* we finish processing this index, go up */ ++ if (path[i].p_hdr->eh_entries == 0 && i > 0) { ++ /* index is empty, remove it ++ * handle must be already prepared by the ++ * truncatei_leaf() */ ++ err = ext3_ext_rm_idx(handle, tree, path + i); ++ } ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ ext_debug(tree, "return to level %d\n", i); ++ } ++ } ++ ++ /* TODO: flexible tree reduction should be here */ ++ if (path->p_hdr->eh_entries == 0) { ++ /* ++ * truncate to zero freed all the tree ++ * so, we need to correct eh_depth ++ */ ++ err = ext3_ext_get_access(handle, tree, path); ++ if (err == 0) { ++ EXT_ROOT_HDR(tree)->eh_depth = 0; ++ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree); ++ err = ext3_ext_dirty(handle, tree, path); ++ } ++ } ++ ext3_ext_tree_changed(tree); ++ ++ kfree(path); ++ ext3_journal_stop(handle); ++ ++ return err; ++} ++ ++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks) ++{ ++ int lcap, icap, rcap, leafs, idxs, num; ++ ++ rcap = ext3_ext_space_root(tree); ++ if (blocks <= rcap) { ++ /* all extents fit to the root */ ++ return 0; ++ } ++ ++ rcap = ext3_ext_space_root_idx(tree); ++ lcap = ext3_ext_space_block(tree); ++ icap = ext3_ext_space_block_idx(tree); ++ ++ num = leafs = (blocks + lcap - 1) / lcap; ++ if (leafs <= rcap) { ++ /* all pointers to leafs fit to the root */ ++ return leafs; ++ } ++ ++ /* ok. 
we need separate index block(s) to link all leaf blocks */ ++ idxs = (leafs + icap - 1) / icap; ++ do { ++ num += idxs; ++ idxs = (idxs + icap - 1) / icap; ++ } while (idxs > rcap); ++ ++ return num; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) { ++ printk("EXT3-fs: file extents enabled"); ++#ifdef AGRESSIVE_TEST ++ printk(", agressive tests"); ++#endif ++#ifdef CHECK_BINSEARCH ++ printk(", check binsearch"); ++#endif ++ printk("\n"); ++ } ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++} ++ ++/************************************************************************ ++ * VFS related routines ++ ************************************************************************/ ++ ++static int ext3_get_inode_write_access(handle_t *handle, void *buffer) ++{ ++ /* we use in-core data, not bh */ ++ return 0; ++} ++ ++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct inode *inode = buffer; ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static int ext3_ext_mergable(struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ /* FIXME: support for large fs */ ++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start) ++ return 1; ++ return 0; ++} ++ ++static int ++ext3_remove_blocks_credits(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed; ++ ++ /* at present, extent can't cross block group */; ++ needed = 4; /* bitmap + group desc + sb + inode */ ++ ++#ifdef CONFIG_QUOTA ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ return needed; ++} ++ ++static int ++ext3_remove_blocks(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed = ext3_remove_blocks_credits(tree, ex, from, to); ++ handle_t *handle = 
ext3_journal_start(tree->inode, needed); ++ struct buffer_head *bh; ++ int i; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { ++ /* tail removal */ ++ unsigned long num, start; ++ num = ex->ee_block + ex->ee_len - from; ++ start = ex->ee_start + ex->ee_len - num; ++ ext_debug(tree, "free last %lu blocks starting %lu\n", ++ num, start); ++ for (i = 0; i < num; i++) { ++ bh = sb_find_get_block(tree->inode->i_sb, start + i); ++ ext3_forget(handle, 0, tree->inode, bh, start + i); ++ } ++ ext3_free_blocks(handle, tree->inode, start, num); ++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { ++ printk("strange request: removal %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } else { ++ printk("strange request: removal(2) %lu-%lu from %u:%u\n", ++ from, to, ex->ee_block, ex->ee_len); ++ } ++ ext3_journal_stop(handle); ++ return 0; ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, ++ struct ext3_ext_path *path, unsigned long block) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ struct ext3_extent *ex; ++ depth = path->p_depth; ++ ++ /* try to predict block placement */ ++ if ((ex = path[depth].p_ext)) ++ return ex->ee_start + (block - ex->ee_block); ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. 
use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour + block; ++} ++ ++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int *err) ++{ ++ struct inode *inode = tree->inode; ++ int newblock, goal; ++ ++ EXT_ASSERT(path); ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->ee_start); ++ EXT_ASSERT(ex->ee_len); ++ ++ /* reuse block from the extent to order data/metadata */ ++ newblock = ex->ee_start++; ++ ex->ee_len--; ++ if (ex->ee_len == 0) { ++ ex->ee_len = 1; ++ /* allocate new block for the extent */ ++ goal = ext3_ext_find_goal(inode, path, ex->ee_block); ++ ex->ee_start = ext3_new_block(handle, inode, goal, err); ++ if (ex->ee_start == 0) { ++ /* error occured: restore old extent */ ++ ex->ee_start = newblock; ++ return 0; ++ } ++ } ++ return newblock; ++} ++ ++static struct ext3_extents_helpers ext3_blockmap_helpers = { ++ .get_write_access = ext3_get_inode_write_access, ++ .mark_buffer_dirty = ext3_mark_buffer_dirty, ++ .mergable = ext3_ext_mergable, ++ .new_block = ext3_new_block_cb, ++ .remove_extent = ext3_remove_blocks, ++ .remove_extent_credits = ext3_remove_blocks_credits, ++}; ++ ++void ext3_init_tree_desc(struct ext3_extents_tree *tree, ++ struct inode *inode) ++{ ++ tree->inode = inode; ++ tree->root = (void *) EXT3_I(inode)->i_data; ++ tree->buffer = (void *) inode; ++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); ++ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent; ++ tree->ops = &ext3_blockmap_helpers; ++} ++ ++int ext3_ext_get_block(handle_t *handle, struct inode *inode, ++ long iblock, struct buffer_head *bh_result, ++ int create, int extend_disksize) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent newex; ++ struct 
ext3_extent *ex; ++ int goal, newblock, err = 0, depth; ++ struct ext3_extents_tree tree; ++ ++ clear_buffer_new(bh_result); ++ ext3_init_tree_desc(&tree, inode); ++ ext_debug(&tree, "block %d requested for inode %u\n", ++ (int) iblock, (unsigned) inode->i_ino); ++ down(&EXT3_I(inode)->truncate_sem); ++ ++ /* check in cache */ ++ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) { ++ if (goal == EXT3_EXT_CACHE_GAP) { ++ if (!create) { ++ /* block isn't allocated yet and ++ * user doesn't want to allocate it */ ++ goto out2; ++ } ++ /* we should allocate requested block */ ++ } else if (goal == EXT3_EXT_CACHE_EXTENT) { ++ /* block is already allocated */ ++ newblock = iblock - newex.ee_block + newex.ee_start; ++ goto out; ++ } else { ++ EXT_ASSERT(0); ++ } ++ } ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(&tree, iblock, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ goto out2; ++ } ++ ++ depth = EXT_DEPTH(&tree); ++ ++ /* ++ * consistent leaf must not be empty ++ * this situation is possible, though, _during_ tree modification ++ * this is why assert can't be put in ext3_ext_find_extent() ++ */ ++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0); ++ ++ if ((ex = path[depth].p_ext)) { ++ /* if found extent covers block, simply return it */ ++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) { ++ newblock = iblock - ex->ee_block + ex->ee_start; ++ ext_debug(&tree, "%d fit into %d:%d -> %d\n", ++ (int) iblock, ex->ee_block, ex->ee_len, ++ newblock); ++ ext3_ext_put_in_cache(&tree, ex->ee_block, ++ ex->ee_len, ex->ee_start, ++ EXT3_EXT_CACHE_EXTENT); ++ goto out; ++ } ++ } ++ ++ /* ++ * requested block isn't allocated yet ++ * we must not try to create block if create flag is zero ++ */ ++ if (!create) { ++ /* put just found gap into cache to speed up subsequent reqs */ ++ ext3_ext_put_gap_in_cache(&tree, path, iblock); ++ goto out2; ++ } ++ ++ /* allocate new block */ ++ goal = 
ext3_ext_find_goal(inode, path, iblock); ++ newblock = ext3_new_block(handle, inode, goal, &err); ++ if (!newblock) ++ goto out2; ++ ext_debug(&tree, "allocate new block: goal %d, found %d\n", ++ goal, newblock); ++ ++ /* try to insert new extent into found leaf and return; NOTE(review): on failure the block allocated above is not freed here - confirm whether that is intended */ ++ newex.ee_block = iblock; ++ newex.ee_start = newblock; ++ newex.ee_len = 1; ++ err = ext3_ext_insert_extent(handle, &tree, path, &newex); ++ if (err) ++ goto out2; ++ ++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ++ /* previous routine could use block we allocated */ ++ newblock = newex.ee_start; ++ set_buffer_new(bh_result); ++ ++ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, ++ newex.ee_start, EXT3_EXT_CACHE_EXTENT); ++out: ++ ext3_ext_show_leaf(&tree, path); ++ map_bh(bh_result, inode->i_sb, newblock); ++out2: ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ up(&EXT3_I(inode)->truncate_sem); ++ ++ return err; ++} ++ ++void ext3_ext_truncate(struct inode * inode, struct page *page) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct super_block *sb = inode->i_sb; ++ struct ext3_extents_tree tree; ++ unsigned long last_block; ++ handle_t *handle; ++ int err = 0; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ err = ext3_writepage_trans_blocks(inode) + 3; ++ handle = ext3_journal_start(inode, err); ++ if (IS_ERR(handle)) { ++ if (page) { ++ clear_highpage(page); ++ flush_dcache_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ return; ++ } ++ ++ if (page) ++ ext3_block_truncate_page(handle, page, mapping, inode->i_size); ++ ++ down(&EXT3_I(inode)->truncate_sem); ++ ext3_ext_invalidate_cache(&tree); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need no scanning at all, ++ * because page truncation is enough ++ */ ++ if (ext3_orphan_add(handle, inode)) ++ 
goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ last_block = (inode->i_size + sb->s_blocksize - 1) ++ >> EXT3_BLOCK_SIZE_BITS(sb); ++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. ++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ up(&EXT3_I(inode)->truncate_sem); ++ ext3_journal_stop(handle); ++} ++ ++/* ++ * this routine calculates the max number of blocks we could modify ++ * in order to allocate new block for an inode ++ */ ++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) ++{ ++ struct ext3_extents_tree tree; ++ int needed; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL); ++ ++ /* caller wants to allocate num blocks */ ++ needed *= num; ++ ++#ifdef CONFIG_QUOTA ++ /* ++ * FIXME: real calculation should be here ++ * it depends on blockmap format of quota file ++ */ ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return needed; ++} ++ ++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ ext3_extent_tree_init(handle, &tree); ++} ++ ++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks) ++{ ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ return ext3_ext_calc_metadata_amount(&tree, blocks); ++} ++ ++static int 
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newex, int exist) ++{ ++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private; ++ ++ if (!exist) ++ return EXT_CONTINUE; ++ if (buf->err < 0) ++ return EXT_BREAK; ++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen) ++ return EXT_BREAK; ++ ++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) { ++ buf->err++; ++ buf->cur += sizeof(*newex); ++ } else { ++ buf->err = -EFAULT; ++ return EXT_BREAK; ++ } ++ return EXT_CONTINUE; ++} ++ ++static int ++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int exist) ++{ ++ struct ext3_extent_tree_stats *buf = ++ (struct ext3_extent_tree_stats *) tree->private; ++ int depth; ++ ++ if (!exist) ++ return EXT_CONTINUE; ++ ++ depth = EXT_DEPTH(tree); ++ buf->extents_num++; ++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr)) ++ buf->leaf_num++; ++ return EXT_CONTINUE; ++} ++ ++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err = 0; ++ ++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) ++ return -EINVAL; ++ ++ if (cmd == EXT3_IOC_GET_EXTENTS) { ++ struct ext3_extent_buf buf; ++ struct ext3_extents_tree tree; ++ ++ if (copy_from_user(&buf, (void *) arg, sizeof(buf))) ++ return -EFAULT; ++ ++ ext3_init_tree_desc(&tree, inode); ++ buf.cur = buf.buffer; ++ buf.err = 0; ++ tree.private = &buf; ++ down(&EXT3_I(inode)->truncate_sem); ++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK, ++ ext3_ext_store_extent_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (err == 0) ++ err = buf.err; ++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) { ++ struct ext3_extent_tree_stats buf; ++ struct ext3_extents_tree tree; ++ ++ ext3_init_tree_desc(&tree, inode); ++ down(&EXT3_I(inode)->truncate_sem); ++ buf.depth = EXT_DEPTH(&tree); ++ buf.extents_num = 0; ++ buf.leaf_num 
= 0; ++ tree.private = &buf; ++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK, ++ ext3_ext_collect_stats_cb); ++ up(&EXT3_I(inode)->truncate_sem); ++ if (!err && copy_to_user((void *) arg, &buf, sizeof(buf))) ++ err = -EFAULT; /* copy_to_user() returns bytes not copied, not an errno */ ++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) { ++ struct ext3_extents_tree tree; ++ ext3_init_tree_desc(&tree, inode); ++ down(&EXT3_I(inode)->truncate_sem); ++ err = EXT_DEPTH(&tree); ++ up(&EXT3_I(inode)->truncate_sem); ++ } ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(ext3_init_tree_desc); ++EXPORT_SYMBOL(ext3_mark_inode_dirty); ++EXPORT_SYMBOL(ext3_ext_invalidate_cache); ++EXPORT_SYMBOL(ext3_ext_insert_extent); ++EXPORT_SYMBOL(ext3_ext_walk_space); ++EXPORT_SYMBOL(ext3_ext_find_goal); ++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); ++ +Index: linux-stage/fs/ext3/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/ialloc.c 2005-02-25 14:50:50.304202816 +0200 ++++ linux-stage/fs/ext3/ialloc.c 2005-02-25 15:33:48.920193600 +0200 +@@ -646,6 +646,10 @@ + DQUOT_FREE_INODE(inode); + goto fail2; + } ++ if (test_opt(sb, EXTENTS)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ } + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2005-02-25 14:50:50.309202056 +0200 ++++ linux-stage/fs/ext3/inode.c 2005-02-25 15:36:51.846384592 +0200 +@@ -796,6 +796,17 @@ + goto reread; + } + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create, int extend_disksize) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create, ++ extend_disksize); ++ return ext3_get_block_handle(handle, inode, block, bh, create, ++ extend_disksize); ++} ++ + static int 
ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) + { +@@ -806,8 +817,8 @@ + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, +- bh_result, create, 1); ++ ret = ext3_get_block_wrap(handle, inode, iblock, ++ bh_result, create, 1); + return ret; + } + +@@ -851,7 +862,7 @@ + + get_block: + if (ret == 0) +- ret = ext3_get_block_handle(handle, inode, iblock, ++ ret = ext3_get_block_wrap(handle, inode, iblock, + bh_result, create, 0); + bh_result->b_size = (1 << inode->i_blkbits); + return ret; +@@ -871,7 +882,7 @@ + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); ++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1589,7 +1600,7 @@ + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, struct page *page, ++int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) + { + unsigned long index = from >> PAGE_CACHE_SHIFT; +@@ -2087,6 +2098,9 @@ + return; + } + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode, page); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { +@@ -2814,6 +2828,9 @@ + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 
5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +Index: linux-stage/fs/ext3/Makefile +=================================================================== +--- linux-stage.orig/fs/ext3/Makefile 2005-02-25 14:49:42.168561008 +0200 ++++ linux-stage/fs/ext3/Makefile 2005-02-25 15:39:28.384587168 +0200 +@@ -5,7 +5,7 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\ +- ioctl.o namei.o super.o symlink.o hash.o resize.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2005-02-25 14:52:33.550506992 +0200 ++++ linux-stage/fs/ext3/super.c 2005-02-25 15:38:10.474431312 +0200 +@@ -394,6 +394,7 @@ + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -457,6 +458,10 @@ + #endif + ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + ei->vfs_inode.i_version = 1; ++ ei->i_cached_extent[0] = 0; ++ ei->i_cached_extent[1] = 0; ++ ei->i_cached_extent[2] = 0; ++ ei->i_cached_extent[3] = 0; + return &ei->vfs_inode; + } + +@@ -589,7 +594,7 @@ + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, +- Opt_ignore, Opt_barrier, Opt_err, Opt_resize, ++ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug, + }; + + static match_table_t tokens = { +@@ -639,6 +644,8 @@ + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, ++ 
{Opt_extents, "extents"}, ++ {Opt_extdebug, "extdebug"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -943,6 +950,12 @@ + match_int(&args[0], &option); + *n_blocks_count = option; + break; ++ case Opt_extents: ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_extdebug: ++ set_opt (sbi->s_mount_opt, EXTDEBUG); ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1625,6 +1638,8 @@ + percpu_counter_mod(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + ++ ext3_ext_init(sb); ++ + return 0; + + failed_mount3: +Index: linux-stage/fs/ext3/ioctl.c +=================================================================== +--- linux-stage.orig/fs/ext3/ioctl.c 2005-02-25 14:37:28.971023976 +0200 ++++ linux-stage/fs/ext3/ioctl.c 2005-02-25 15:33:48.938190864 +0200 +@@ -124,6 +124,10 @@ + err = ext3_change_inode_journal_flag(inode, jflag); + return err; + } ++ case EXT3_IOC_GET_EXTENTS: ++ case EXT3_IOC_GET_TREE_STATS: ++ case EXT3_IOC_GET_TREE_DEPTH: ++ return ext3_ext_ioctl(inode, filp, cmd, arg); + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 14:53:56.424908168 +0200 ++++ linux-stage/include/linux/ext3_fs.h 2005-02-25 15:39:12.841950008 +0200 +@@ -186,6 +186,7 @@ + #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ + #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + + #define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ +@@ -237,6 +238,9 @@ + #endif + #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) + #define 
EXT3_IOC_SETRSVSZ _IOW('f', 6, long) ++#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long) ++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long) ++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long) + + /* + * Structure of an inode on the disk +@@ -359,6 +363,9 @@ + #define EXT3_MOUNT_RESERVATION 0x20000 /* Preallocation */ + #define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ ++#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ ++ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -756,6 +763,7 @@ + + + /* inode.c */ ++extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); + extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +@@ -836,6 +844,16 @@ + extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int, int); ++extern void ext3_ext_truncate(struct inode *, struct page *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); ++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); ++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); + + #endif /* __KERNEL__ */ + +Index: linux-stage/include/linux/ext3_extents.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_extents.h 2005-02-25 
15:33:48.891198008 +0200 ++++ linux-stage/include/linux/ext3_extents.h 2005-02-25 15:33:48.944189952 +0200 +@@ -0,0 +1,252 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA ++ */ ++ ++#ifndef _LINUX_EXT3_EXTENTS ++#define _LINUX_EXT3_EXTENTS ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if CHECK_BINSEARCH defined, then results of binary search ++ * will be checked by linear search ++ */ ++#define CHECK_BINSEARCH_ ++ ++/* ++ * if EXT_DEBUG is defined you can use 'extdebug' mount option ++ * to get lots of info what's going on; callers write the trailing ';' ++ */ ++#define EXT_DEBUG_ ++#ifdef EXT_DEBUG ++#define ext_debug(tree,fmt,a...) \ ++do { \ ++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0) ++#else ++#define ext_debug(tree,fmt,a...) ++#endif ++ ++/* ++ * if EXT_STATS is defined then stats numbers are collected ++ * these number will be displayed at umount time ++ */ ++#define EXT_STATS_ ++ ++ ++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc.
+ sb */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 means there is no tree yet. all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 ee_block; /* first logical block extent covers */ ++ __u16 ee_len; /* number of blocks covered by extent */ ++ __u16 ee_start_hi; /* high 16 bits of physical block */ ++ __u32 ee_start; /* low 32 bits of physical block */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 ei_block; /* index covers logical blocks from 'block' */ ++ __u32 ei_leaf; /* pointer to the physical block of the next * ++ * level. leaf or next index could be here */ ++ __u16 ei_leaf_hi; /* high 16 bits of physical block */ ++ __u16 ei_unused; ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 eh_magic; /* probably will support different formats */ ++ __u16 eh_entries; /* number of valid entries */ ++ __u16 eh_max; /* capacity of store in entries */ ++ __u16 eh_depth; /* has tree real underlying blocks?
*/ ++ __u32 eh_generation; /* generation of the tree */ ++}; ++ ++#define EXT3_EXT_MAGIC 0xf30a ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++/* ++ * structure for external API ++ */ ++ ++/* ++ * storage for cached extent ++ */ ++struct ext3_ext_cache { ++ __u32 ec_start; ++ __u32 ec_block; ++ __u32 ec_len; ++ __u32 ec_type; ++}; ++ ++#define EXT3_EXT_CACHE_NO 0 ++#define EXT3_EXT_CACHE_GAP 1 ++#define EXT3_EXT_CACHE_EXTENT 2 ++ ++/* ++ * ext3_extents_tree is used to pass initial information ++ * to top-level extents API ++ */ ++struct ext3_extents_helpers; ++struct ext3_extents_tree { ++ struct inode *inode; /* inode which tree belongs to */ ++ void *root; /* ptr to data top of tree resides at */ ++ void *buffer; /* will be passed as arg to ^^ routines */ ++ int buffer_len; ++ void *private; ++ struct ext3_ext_cache *cex;/* last found extent */ ++ struct ext3_extents_helpers *ops; ++}; ++ ++struct ext3_extents_helpers { ++ int (*get_write_access)(handle_t *h, void *buffer); ++ int (*mark_buffer_dirty)(handle_t *h, void *buffer); ++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); ++ int (*remove_extent_credits)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*remove_extent)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*new_block)(handle_t *, struct ext3_extents_tree *, ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); ++}; ++ ++/* ++ * to be called by ext3_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext3_ext_walk_space(), see below ++ * callback must return valid 
extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, ++ struct ext3_ext_path *, ++ struct ext3_extent *, int); ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++ ++#define EXT_MAX_BLOCK 0xffffffff ++#define EXT_CACHE_MARK 0xffff ++ ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1) ++ ++#define EXT_ROOT_HDR(tree) \ ++ ((struct ext3_extent_header *) (tree)->root) ++#define EXT_BLOCK_HDR(bh) \ ++ ((struct ext3_extent_header *) (bh)->b_data) ++#define EXT_DEPTH(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_depth) ++#define EXT_GENERATION(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->eh_generation) ++ ++/* BUG() when __x__ is false; one statement, so safe in unbraced if/else */ ++#define EXT_ASSERT(__x__) do { if (!(__x__)) BUG(); } while (0) ++ ++ ++/* ++ * this structure is used to gather extents from the tree via ioctl ++ */ ++struct ext3_extent_buf { ++ unsigned long start; ++ int buflen; ++ void *buffer; ++ void *cur; ++ int err; ++}; ++ ++/* ++ * this structure is used to collect stats info about the tree ++ */ ++struct ext3_extent_tree_stats { ++ int depth; ++ int extents_num; ++ int leaf_num; ++}; ++ ++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); ++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct
ext3_ext_path *); ++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); ++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); ++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); ++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *); ++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *); ++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int); ++ ++static inline void ++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) ++{ ++ if (tree->cex) ++ tree->cex->ec_type = EXT3_EXT_CACHE_NO; ++} ++ ++ ++#endif /* _LINUX_EXT3_EXTENTS */ ++ +Index: linux-stage/include/linux/ext3_fs_i.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_i.h 2005-02-25 14:50:50.320200384 +0200 ++++ linux-stage/include/linux/ext3_fs_i.h 2005-02-25 15:33:48.945189800 +0200 +@@ -128,6 +128,8 @@ + */ + struct semaphore truncate_sem; + struct inode vfs_inode; ++ ++ __u32 i_cached_extent[4]; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff --git a/lustre/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch b/lustre/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch new file mode 100644 index 0000000..49528cf --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-include-fixes-2.6-rhel4.patch @@ -0,0 +1,20 @@ +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 14:53:56.424908168 +0200 ++++ linux-stage/include/linux/ext3_fs.h 2005-02-25 14:53:59.376459464 +0200 +@@ -361,12 +361,13 @@ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ +-#ifndef 
_LINUX_EXT2_FS_H ++#ifndef clear_opt + #define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt + #define set_opt(o, opt) o |= EXT3_MOUNT_##opt + #define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \ + EXT3_MOUNT_##opt) +-#else ++#endif ++#ifndef EXT2_MOUNT_NOLOAD + #define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD + #define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT + #define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch new file mode 100644 index 0000000..fcceb30 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch @@ -0,0 +1,2236 @@ +Index: linux-stage/fs/ext3/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext3/mballoc.c 2005-02-25 17:28:41.836311072 +0200 ++++ linux-stage/fs/ext3/mballoc.c 2005-02-25 17:28:41.859307576 +0200 +@@ -0,0 +1,1847 @@ ++/* ++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA ++ */ ++ ++ ++/* ++ * mballoc.c contains the multiblocks allocation routines ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * TODO: ++ * - track min/max extents in each group for better group selection ++ * - is it worthwhile to use buddies directly if req is 2^N blocks? ++ * - mb_mark_used() may allocate chunk right after splitting buddy ++ * - special flag to advise allocator to look for requested + N blocks ++ * this may improve interaction between extents and mballoc ++ * - tree of groups sorted by number of free blocks ++ * - percpu reservation code (hotpath) ++ * - error handling ++ */ ++ ++/* ++ * with AGGRESSIVE_CHECK allocator runs consistency checks over ++ * structures. these checks slow things down a lot ++ */ ++#define AGGRESSIVE_CHECK__ ++ ++/* ++ * with MBALLOC_STATS allocator will collect stats that will be ++ * shown at umount. The collecting costs though! ++ */ ++#define MBALLOC_STATS ++ ++/* with MB_DEBUG defined mb_debug() goes to printk(), otherwise it is empty ++ */ ++#define MB_DEBUG__ ++#ifdef MB_DEBUG ++#define mb_debug(fmt,a...) printk(fmt, ##a) ++#else ++#define mb_debug(fmt,a...)
++#endif ++ ++/* ++ * where to save buddies structures between umount/mount (clean case only) ++ */ ++#define EXT3_BUDDY_FILE ".buddy" ++ ++/* ++ * How long mballoc can look for a best extent (in found extents) ++ */ ++#define EXT3_MB_MAX_TO_SCAN 100 ++ ++/* ++ * This structure is on-disk description of a group for mballoc ++ */ ++struct ext3_mb_group_descr { ++ __u16 mgd_first_free; /* first free block in the group */ ++ __u16 mgd_free; /* number of free blocks in the group */ ++ __u16 mgd_counters[16]; /* number of free blocks by order */ ++}; ++ ++/* ++ * This structure is header of mballoc's file ++ */ ++struct ext3_mb_grp_header { ++ __u32 mh_magic; ++}; ++ ++#define EXT3_MB_MAGIC_V1 0xbaad16fc ++ ++ ++struct ext3_free_extent { ++ __u16 fe_start; ++ __u16 fe_len; ++ __u16 fe_group; ++}; ++ ++struct ext3_allocation_context { ++ struct super_block *ac_sb; ++ ++ /* search goals */ ++struct ext3_free_extent ac_g_ex; ++ ++ /* the best found extent */ ++ struct ext3_free_extent ac_b_ex; ++ ++ /* number of iterations done.
we have to track to limit searching */ ++ unsigned long ac_ex_scanned; ++ __u16 ac_groups_scanned; ++ __u16 ac_found; ++ __u8 ac_status; ++ __u8 ac_flags; /* allocation hints */ ++ __u8 ac_repeats; ++}; ++ ++#define AC_STATUS_CONTINUE 1 ++#define AC_STATUS_FOUND 2 ++#define AC_STATUS_BREAK 3 ++ ++struct ext3_buddy { ++ struct buffer_head *bd_bh; ++ struct buffer_head *bd_bh2; ++ struct ext3_buddy_group_blocks *bd_bd; ++ struct super_block *bd_sb; ++ __u16 bd_blkbits; ++ __u16 bd_group; ++}; ++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data) ++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data) ++ ++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) ++ ++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); ++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); ++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *); ++int ext3_mb_reserve_blocks(struct super_block *, int); ++void ext3_mb_release_blocks(struct super_block *, int); ++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); ++void ext3_mb_free_committed_blocks(struct super_block *); ++ ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ if ((unsigned long)addr & 1) { \ ++ bit += 8; \ ++ addr--; \ ++ } \ ++ if ((unsigned long)addr & 2) { \ ++ bit += 16; \ ++ addr--; \ ++ addr--; \ ++ } \ ++} ++ ++static inline int mb_test_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ return test_bit(bit, addr); ++} ++ ++static inline void mb_set_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ __set_bit(bit, addr); ++} ++ ++static inline void mb_set_bit_atomic(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ set_bit(bit, addr); ++} ++ ++static inline void mb_clear_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ __clear_bit(bit, addr); ++} ++ ++static inline void mb_clear_bit_atomic(int bit, void *addr) ++{ ++ 
mb_correct_addr_and_bit(bit,addr); ++ clear_bit(bit, addr); ++} ++ ++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) ++{ ++ int i = 1; ++ char *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(max != NULL); ++ ++ if (order > e3b->bd_blkbits + 1) ++ return NULL; ++ ++ /* at order 0 we see each particular block */ ++ *max = 1 << (e3b->bd_blkbits + 3); ++ if (order == 0) ++ return EXT3_MB_BITMAP(e3b); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ *max = *max >> 1; ++ while (i < order) { ++ bb += 1 << (e3b->bd_blkbits - i); ++ i++; ++ *max = *max >> 1; ++ } ++ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) < ++ e3b->bd_sb->s_blocksize); ++ return bb; ++} ++/* read bitmap and buddy blocks of 'group' into e3b; returns 0 or -EIO */ ++static int ext3_mb_load_buddy(struct super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap); ++ J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy); ++ e3b->bd_bh2 = NULL; /* 'out' brelse()s it even when only bd_bh failed */ ++ /* load bitmap */ ++ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap); ++ if (e3b->bd_bh == NULL) { ++ ext3_error(sb, "ext3_mb_load_buddy", ++ "can't get block for buddy bitmap\n"); ++ goto out; ++ } ++ if (!buffer_uptodate(e3b->bd_bh)) { ++ ll_rw_block(READ, 1, &e3b->bd_bh); ++ wait_on_buffer(e3b->bd_bh); ++ } ++ J_ASSERT(buffer_uptodate(e3b->bd_bh)); ++ ++ /* load buddy */ ++ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy); ++ if (e3b->bd_bh2 == NULL) { ++ ext3_error(sb, "ext3_mb_load_buddy", ++ "can't get block for buddy bitmap\n"); ++ goto out; ++ } ++ if (!buffer_uptodate(e3b->bd_bh2)) { ++ ll_rw_block(READ, 1, &e3b->bd_bh2); ++ wait_on_buffer(e3b->bd_bh2); ++ } ++ J_ASSERT(buffer_uptodate(e3b->bd_bh2)); ++ ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_bd = sbi->s_buddy_blocks[group]; ++ e3b->bd_sb = sb; ++ e3b->bd_group = group; ++ ++ return 0; ++out: ++ brelse(e3b->bd_bh); ++ brelse(e3b->bd_bh2); ++ e3b->bd_bh = NULL; ++ e3b->bd_bh2 = NULL; ++ return -EIO; ++} ++
++static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b) ++{ ++ mark_buffer_dirty(e3b->bd_bh); ++ mark_buffer_dirty(e3b->bd_bh2); ++} ++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ brelse(e3b->bd_bh); ++ brelse(e3b->bd_bh2); ++} ++ ++#ifdef AGGRESSIVE_CHECK ++static void mb_check_buddy(struct ext3_buddy *e3b) ++{ ++ int order = e3b->bd_blkbits + 1; ++ int max, max2, i, j, k, count; ++ void *buddy, *buddy2; ++ ++ if (!test_opt(e3b->bd_sb, MBALLOC)) ++ return; ++ ++ while (order > 1) { ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ buddy2 = mb_find_buddy(e3b, order - 1, &max2); ++ J_ASSERT(buddy2); ++ J_ASSERT(buddy != buddy2); ++ J_ASSERT(max * 2 == max2); ++ ++ count = 0; ++ for (i = 0; i < max; i++) { ++ ++ if (!mb_test_bit(i, buddy)) { ++ /* only single bit in buddy2 may be 1 */ ++ if (mb_test_bit(i << 1, buddy2)) ++ J_ASSERT(!mb_test_bit((i<<1)+1, buddy2)); ++ else if (mb_test_bit((i << 1) + 1, buddy2)) ++ J_ASSERT(!mb_test_bit(i << 1, buddy2)); ++ continue; ++ } ++ ++ /* both bits in buddy2 must be 0 */ ++ J_ASSERT(!mb_test_bit(i << 1, buddy2)); ++ J_ASSERT(!mb_test_bit((i << 1) + 1, buddy2)); ++ ++ for (j = 0; j < (1 << order); j++) { ++ k = (i * (1 << order)) + j; ++ J_ASSERT(mb_test_bit(k, EXT3_MB_BITMAP(e3b))); ++ } ++ count++; ++ } ++ J_ASSERT(e3b->bd_bd->bb_counters[order] == count); ++ order--; ++ } ++ ++ buddy = mb_find_buddy(e3b, 0, &max); ++ for (i = 0; i < max; i++) { ++ if (mb_test_bit(i, buddy)) ++ continue; ++ /* check used bits only */ ++ for (j = 0; j < e3b->bd_blkbits + 1; j++) { ++ buddy2 = mb_find_buddy(e3b, j, &max2); ++ k = i >> j; ++ J_ASSERT(k < max2); ++ J_ASSERT(!mb_test_bit(k, buddy2)); ++ } ++ } ++} ++#else ++#define mb_check_buddy(e3b) ++#endif ++ ++static inline void ++ext3_lock_group(struct super_block *sb, int group) ++{ ++ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++} ++ ++static inline void ++ext3_unlock_group(struct super_block *sb, int group) ++{ ++ 
spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++} ++ ++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) ++{ ++ int order = 1; ++ void *bb; ++ ++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b)); ++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); ++ ++ bb = EXT3_MB_BUDDY(e3b); ++ while (order <= e3b->bd_blkbits + 1) { ++ block = block >> 1; ++ if (mb_test_bit(block, bb)) { ++ /* this block is part of buddy of order 'order' */ ++ return order; ++ } ++ bb += 1 << (e3b->bd_blkbits - order); ++ order++; ++ } ++ return 0; ++} ++ ++static inline void mb_clear_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0; ++ cur += 32; ++ continue; ++ } ++ mb_clear_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static inline void mb_set_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: set whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0xffffffff; ++ cur += 32; ++ continue; ++ } ++ mb_set_bit_atomic(cur, bm); ++ cur++; ++ } ++} ++ ++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) ++{ ++ int block, max, order; ++ void *buddy, *buddy2; ++ ++ mb_check_buddy(e3b); ++ ++ e3b->bd_bd->bb_free += count; ++ if (first < e3b->bd_bd->bb_first_free) ++ e3b->bd_bd->bb_first_free = first; ++ ++ while (count-- > 0) { ++ block = first++; ++ order = 0; ++ ++ J_ASSERT(!mb_test_bit(block, EXT3_MB_BITMAP(e3b))); ++ mb_set_bit(block, EXT3_MB_BITMAP(e3b)); ++ e3b->bd_bd->bb_counters[order]++; ++ ++ /* start of the buddy */ ++ buddy = mb_find_buddy(e3b, order, &max); ++ ++ do { ++ block &= ~1UL; ++ if (!mb_test_bit(block, buddy) || ++ !mb_test_bit(block + 1, buddy)) ++ break; ++ ++ /* both the buddies are free, try to coalesce them */ ++ buddy2 =
mb_find_buddy(e3b, order + 1, &max); ++ ++ if (!buddy2) ++ break; ++ ++ if (order > 0) { ++ /* for special purposes, we don't clear ++ * free bits in bitmap */ ++ mb_clear_bit(block, buddy); ++ mb_clear_bit(block + 1, buddy); ++ } ++ e3b->bd_bd->bb_counters[order]--; ++ e3b->bd_bd->bb_counters[order]--; ++ ++ block = block >> 1; ++ order++; ++ e3b->bd_bd->bb_counters[order]++; ++ ++ mb_set_bit(block, buddy2); ++ buddy = buddy2; ++ } while (1); ++ } ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, ++ int needed, struct ext3_free_extent *ex) ++{ ++ int next, max, ord; ++ void *buddy; ++ ++ J_ASSERT(ex != NULL); ++ ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ J_ASSERT(block < max); ++ if (!mb_test_bit(block, buddy)) { ++ ex->fe_len = 0; ++ ex->fe_start = 0; ++ ex->fe_group = 0; ++ return 0; ++ } ++ ++ if (order == 0) { ++ /* find actual order */ ++ order = mb_find_order_for_block(e3b, block); ++ block = block >> order; ++ } ++ ++ ex->fe_len = 1 << order; ++ ex->fe_start = block << order; ++ ex->fe_group = e3b->bd_group; ++ ++ while ((buddy = mb_find_buddy(e3b, order, &max))) { ++ ++ if (block + 1 >= max) ++ break; ++ ++ next = (block + 1) * (1 << order); ++ if (!mb_test_bit(next, EXT3_MB_BITMAP(e3b))) ++ break; ++ ++ ord = mb_find_order_for_block(e3b, next); ++ ++ order = ord; ++ block = next >> order; ++ ex->fe_len += 1 << order; ++ } ++ ++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); ++ return ex->fe_len; ++} ++ ++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex) ++{ ++ int start = ex->fe_start; ++ int len = ex->fe_len; ++ int ord, mlen, max, cur; ++ int len0 = len; ++ void *buddy; ++ ++ e3b->bd_bd->bb_free -= len; ++ if (e3b->bd_bd->bb_first_free == start) ++ e3b->bd_bd->bb_first_free += len; ++ ++ while (len) { ++ ord = mb_find_order_for_block(e3b, start); ++ ++ if (((start >> ord) << ord) == start && len >= (1 << ord)) { 
++ /* the whole chunk may be allocated at once! */ ++ mlen = 1 << ord; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ J_ASSERT((start >> ord) < max); ++ mb_clear_bit(start >> ord, buddy); ++ e3b->bd_bd->bb_counters[ord]--; ++ start += mlen; ++ len -= mlen; ++ J_ASSERT(len >= 0); ++ continue; ++ } ++ ++ /* we have to split large buddy */ ++ J_ASSERT(ord > 0); ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_clear_bit(start >> ord, buddy); ++ e3b->bd_bd->bb_counters[ord]--; ++ ++ ord--; ++ cur = (start >> ord) & ~1U; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_set_bit(cur, buddy); ++ mb_set_bit(cur + 1, buddy); ++ e3b->bd_bd->bb_counters[ord]++; ++ e3b->bd_bd->bb_counters[ord]++; ++ } ++ ++ /* now drop all the bits in bitmap */ ++ mb_clear_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0); ++ ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++/* ++ * Must be called under group lock! ++ */ ++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ++ mb_mark_used(e3b, &ac->ac_b_ex); ++ ac->ac_status = AC_STATUS_FOUND; ++} ++ ++/* ++ * The routine checks whether found extent is good enough. If it is, ++ * then the extent gets marked used and flag is set to the context ++ * to stop scanning. Otherwise, the extent is compared with the ++ * previous found extent and if new one is better, then it's stored ++ * in the context. Later, the best found extent will be used, if ++ * mballoc can't find good enough extent. ++ * ++ * FIXME: real allocation policy is to be designed yet! 
++ */ ++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac, ++ struct ext3_free_extent *ex, ++ struct ext3_buddy *e3b) ++{ ++ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor; ++ struct ext3_free_extent *bex = &ac->ac_b_ex; ++ int diff = ac->ac_g_ex.fe_len - ex->fe_len; ++ ++ J_ASSERT(ex->fe_len > 0); ++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8); ++ ++ ac->ac_found++; ++ ++ /* ++ * The special case - take what you catch first ++ */ ++ if (ac->ac_flags & EXT3_MB_HINT_FIRST) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Let's check whether the chunk is good enough ++ */ ++ if (ex->fe_len >= ac->ac_g_ex.fe_len) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If the request is very large, then it makes sense to use large ++ * chunks for it. Even if they don't satisfy whole request. ++ */ ++ if (ex->fe_len > 1000) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * Sometimes it's worth taking a close chunk ++ */ ++ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) { ++ *bex = *ex; ++ ext3_mb_use_best_found(ac, e3b); ++ return; ++ } ++ ++ /* ++ * If this is first found extent, just store it in the context ++ */ ++ if (bex->fe_len == 0) { ++ *bex = *ex; ++ return; ++ } ++ ++ /* ++ * If new found extent is better, store it in the context ++ * FIXME: possible the policy should be more complex?
++ */ ++ if (ex->fe_len > bex->fe_len) { ++ *bex = *ex; ++ } ++ ++ /* ++ * We don't want to scan for a whole year ++ */ ++ if (ac->ac_found > EXT3_MB_MAX_TO_SCAN) ++ ac->ac_status = AC_STATUS_BREAK; ++} ++ ++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_free_extent ex = ac->ac_b_ex; ++ int group = ex.fe_group, max, err; ++ ++ J_ASSERT(ex.fe_len > 0); ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex); ++ ++ if (max > 0) ++ ext3_mb_use_best_found(ac, e3b); ++ ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ if (ac->ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(e3b); ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++ ++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_free_extent ex; ++ ++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); ++ if (err) ++ return err; ++ ++ ext3_lock_group(ac->ac_sb, group); ++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, ++ ac->ac_g_ex.fe_len, &ex); ++ ++ if (max > 0) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } ++ ext3_unlock_group(ac->ac_sb, group); ++ ++ if (ac->ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(e3b); ++ ext3_mb_release_desc(e3b); ++ ++ return 0; ++} ++/* ++ * The routine scans the group and measures all found extents. ++ * In order to optimize scanning, caller must pass number of ++ * free blocks in the group, so the routine can upper limit. 
++ */ ++static void ext3_mb_scan_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ int i, free; ++ ++ free = e3b->bd_bd->bb_free; ++ J_ASSERT(free > 0); ++ ++ i = e3b->bd_bd->bb_first_free; ++ ++ while (free && ac->ac_status != AC_STATUS_FOUND) { ++ i = find_next_bit(bitmap, sb->s_blocksize * 8, i); ++ if (i >= sb->s_blocksize * 8) { ++ J_ASSERT(free == 0); ++ break; ++ } ++ ++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex); ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(free >= ex.fe_len); ++ ++ ext3_mb_measure_extent(ac, &ex, e3b); ++ ++ i += ex.fe_len; ++ free -= ex.fe_len; ++ } ++} ++ ++static int ext3_mb_good_group(struct ext3_allocation_context *ac, ++ int group, int cr) ++{ ++ int free; ++ ++ J_ASSERT(cr >= 0 && cr < 3); ++ ++ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free; ++ if (free == 0) ++ return 0; ++ ++ if (cr == 0) { ++ if (free >= ac->ac_g_ex.fe_len >> 1) ++ return 1; ++ } else if (cr == 1) { ++ if (free >= ac->ac_g_ex.fe_len >> 2) ++ return 1; ++ } else if (cr == 2) { ++ return 1; ++ } ++ return 0; ++} ++ ++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *len, int flags, int *errp) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_allocation_context ac; ++ int i, group, block, cr, err = 0; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ struct buffer_head *gdp_bh; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ ++ J_ASSERT(len != NULL); ++ J_ASSERT(*len > 0); ++ ++ sb = inode->i_sb; ++ if (!sb) { ++ printk("ext3_mb_new_nblocks: nonexistent device"); ++ return 0; ++ } ++ ++ if (!test_opt(sb, MBALLOC)) { ++ static int ext3_mballoc_warning = 0; ++ if (ext3_mballoc_warning == 0) { ++ printk(KERN_ERR "EXT3-fs: multiblock request with " ++ "mballoc disabled!\n"); ++ ext3_mballoc_warning++; ++ } ++ *len = 1; 
++ err = ext3_new_block_old(handle, inode, goal, errp); ++ return err; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ ++ /* ++ * We can't allocate > group size ++ */ ++ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10) ++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10; ++ ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* someone asks for non-reserved blocks */ ++ BUG_ON(*len > 1); ++ err = ext3_mb_reserve_blocks(sb, 1); ++ if (err) { ++ *errp = err; ++ return 0; ++ } ++ } ++ ++ /* ++ * Check quota for allocation of this blocks. ++ */ ++ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) ++ *len -= 1; ++ if (*len == 0) { ++ *errp = -EDQUOT; ++ block = 0; ++ goto out; ++ } ++ ++ /* start searching from the goal */ ++ if (goal < le32_to_cpu(es->s_first_data_block) || ++ goal >= le32_to_cpu(es->s_blocks_count)) ++ goal = le32_to_cpu(es->s_first_data_block); ++ group = (goal - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ block = ((goal - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ /* set up allocation goals */ ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_groups_scanned = 0; ++ ac.ac_ex_scanned = 0; ++ ac.ac_found = 0; ++ ac.ac_sb = inode->i_sb; ++ ac.ac_g_ex.fe_group = group; ++ ac.ac_g_ex.fe_start = block; ++ ac.ac_g_ex.fe_len = *len; ++ ac.ac_flags = flags; ++ ++ /* ++ * Sometimes, caller may want to merge even small number ++ * of blocks to an existing extent ++ */ ++ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; ++ } ++ ++ /* ++ * FIXME ++ * If requested chunk is power of 2 length, we can try ++ * to exploit buddy nature to speed allocation up ++ */ ++ ++ ++ /* ++ * Let's just scan groups to find more-less suitable blocks ++ */ ++ cr = 0; ++repeat: ++ for (; cr < 3 && 
ac.ac_status == AC_STATUS_CONTINUE; cr++) { ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { ++ if (group == EXT3_SB(sb)->s_groups_count) ++ group = 0; ++ ++ /* check is group good for our criteries */ ++ if (!ext3_mb_good_group(&ac, group, cr)) ++ continue; ++ ++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ++ ext3_lock_group(sb, group); ++ if (!ext3_mb_good_group(&ac, group, cr)) { ++ /* someone did allocation from this group */ ++ ext3_unlock_group(sb, group); ++ ext3_mb_release_desc(&e3b); ++ continue; ++ } ++ ++ ext3_mb_scan_group(&ac, &e3b); ++ ext3_unlock_group(sb, group); ++ ++ if (ac.ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ if (err) ++ goto out_err; ++ if (ac.ac_status != AC_STATUS_CONTINUE) ++ break; ++ } ++ } ++ ++ if (ac.ac_status == AC_STATUS_BREAK && ++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) { ++ /* ++ * We've been searching too long. Let's try to allocate ++ * the best chunk we've found so far ++ */ ++ printk(KERN_ERR "EXT3-fs: too long searching (%d/%d)\n", ++ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len); ++ ext3_mb_try_best_found(&ac, &e3b); ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * Someone more lucky has already allocated it. 
++ * The only thing we can do is just take first ++ * found block(s) ++ */ ++ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n"); ++ ac.ac_b_ex.fe_group = 0; ++ ac.ac_b_ex.fe_start = 0; ++ ac.ac_b_ex.fe_len = 0; ++ ac.ac_status = AC_STATUS_CONTINUE; ++ ac.ac_flags |= EXT3_MB_HINT_FIRST; ++ cr = 2; ++ goto repeat; ++ } ++ } ++ ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* ++ * We aren't lucky definitely ++ */ ++ J_ASSERT(ac.ac_b_ex.fe_len == 0); ++ DQUOT_FREE_BLOCK(inode, *len); ++ *errp = -ENOSPC; ++ block = 0; ++#if 1 ++ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n", ++ ac.ac_status, ac.ac_flags); ++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n", ++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group, ++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr); ++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n", ++ sbi->s_blocks_reserved, ac.ac_found); ++ printk("EXT3-fs: groups: "); ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) ++ printk("%d: %d ", i, ++ sbi->s_buddy_blocks[i]->bb_free); ++ printk("\n"); ++#endif ++ goto out; ++ } ++ ++found: ++ J_ASSERT(ac.ac_b_ex.fe_len > 0); ++ ++ /* good news - free block(s) have been found. 
now it's time ++ * to mark block(s) in good old journaled bitmap */ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ /* we made a decision, now mark found blocks in good old ++ * bitmap to be journaled */ ++ ++ ext3_debug("using block group %d\n", ++ ac.ac_b_ex.fe_group); ++ ++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group); ++ if (!bitmap_bh) { ++ err = -EIO; /* out_err copies err into *errp */ ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) { ++ *errp = err; ++ goto out_err; ++ } ++ ++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh); ++ if (!gdp) { ++ err = -EIO; /* out_err copies err into *errp */ ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_ex.fe_start ++ + le32_to_cpu(es->s_first_data_block); ++ ++ if (block == le32_to_cpu(gdp->bg_block_bitmap) || ++ block == le32_to_cpu(gdp->bg_inode_bitmap) || ++ in_range(block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error(sb, "ext3_new_block", ++ "Allocating block in system zone - " ++ "block = %u", block); ++#if AGGRESSIVE_CHECK ++ for (i = 0; i < ac.ac_b_ex.fe_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data)); ++#endif ++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len); ++ ++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) ++ - ac.ac_b_ex.fe_len); ++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group)); ++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len); ++ ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ if (err) ++ goto out_err; ++ err = ext3_journal_dirty_metadata(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ sb->s_dirt = 1; ++ *errp = 0; ++ brelse(bitmap_bh); ++ ++
/* drop non-allocated, but dquote'd blocks */ ++ J_ASSERT(*len >= ac.ac_b_ex.fe_len); ++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len); ++ ++ *len = ac.ac_b_ex.fe_len; ++ J_ASSERT(*len > 0); ++ J_ASSERT(block != 0); ++ goto out; ++ ++out_err: ++ /* if we've already allocated something, roll it back */ ++ if (ac.ac_status == AC_STATUS_FOUND) { ++ /* FIXME: free blocks here */ ++ } ++ ++ DQUOT_FREE_BLOCK(inode, *len); ++ brelse(bitmap_bh); ++ *errp = err; ++ block = 0; ++out: ++ if (!(flags & EXT3_MB_HINT_RESERVED)) { ++ /* block wasn't reserved before and we reserved it ++ * at the beginning of allocation. it doesn't matter ++ * whether we allocated anything or we failed: time ++ * to release reservation. NOTE: because I expect ++ * any multiblock request from delayed allocation ++ * path only, here is single block always */ ++ ext3_mb_release_blocks(sb, 1); ++ } ++#ifdef MBALLOC_STATS ++ if (ac.ac_g_ex.fe_len > 1) { ++ spin_lock(&sbi->s_bal_lock); ++ sbi->s_bal_reqs++; ++ sbi->s_bal_allocated += *len; ++ if (*len >= ac.ac_g_ex.fe_len) ++ sbi->s_bal_success++; ++ sbi->s_bal_ex_scanned += ac.ac_found; ++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start && ++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group) ++ sbi->s_bal_goals++; ++ if (ac.ac_found > EXT3_MB_MAX_TO_SCAN) ++ sbi->s_bal_breaks++; ++ spin_unlock(&sbi->s_bal_lock); ++ } ++#endif ++ return block; ++} ++ ++int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh, ++ struct ext3_mb_group_descr **grp) ++{ ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int descr_per_block, err, offset; ++ struct ext3_mb_grp_header *hdr; ++ unsigned long block; ++ ++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) ++ / sizeof(struct ext3_mb_group_descr); ++ block = e3b->bd_group / descr_per_block; ++ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err); ++ if (*bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n", ++ 
e3b->bd_group, err); ++ return err; ++ } ++ ++ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data; ++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { ++ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n", ++ e3b->bd_group); ++ brelse(*bh); ++ *bh = NULL; ++ return -EIO; ++ } ++ ++ offset = e3b->bd_group % descr_per_block ++ * sizeof(struct ext3_mb_group_descr) ++ + sizeof(struct ext3_mb_grp_header); ++ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset); ++ ++ return 0; ++} ++ ++int ext3_mb_load_descr(struct ext3_buddy *e3b) ++{ ++ struct ext3_mb_group_descr *grp; ++ struct ext3_group_desc *gdp; ++ struct buffer_head *bh; ++ int err, i; ++ ++ err = ext3_mb_get_descr_loc(e3b, &bh, &grp); ++ if (err) ++ return err; ++ ++ e3b->bd_bd->bb_first_free = grp->mgd_first_free; ++ e3b->bd_bd->bb_free = grp->mgd_free; ++ for (i = 0; i < e3b->bd_blkbits; i++) { ++ J_ASSERT(i < 16); ++ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i]; ++ } ++ brelse(bh); ++ ++ /* additional checks against old group descriptor */ ++ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); ++ if (!gdp) ++ return -EIO; ++ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { ++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", ++ e3b->bd_group, e3b->bd_bd->bb_free, ++ le16_to_cpu(gdp->bg_free_blocks_count)); ++ return -ENODATA; ++ } ++ ++ return 0; ++} ++ ++ ++int ext3_mb_update_descr(struct ext3_buddy *e3b) ++{ ++ struct ext3_mb_group_descr *grp; ++ struct ext3_group_desc *gdp; ++ struct buffer_head *bh; ++ handle_t *handle; ++ int err, i; ++ ++ /* additional checks against old group descriptor */ ++ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL); ++ if (!gdp) ++ return -EIO; ++ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) { ++ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n", ++ e3b->bd_group, e3b->bd_bd->bb_free, ++ le16_to_cpu(gdp->bg_free_blocks_count)); ++ return -ENODATA; ++ } ++ ++ err = 
ext3_mb_get_descr_loc(e3b, &bh, &grp); ++ if (err) ++ return err; ++ ++ handle = ext3_journal_start_sb(e3b->bd_sb, 1); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ handle = NULL; ++ goto out; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto out; ++ grp->mgd_first_free = e3b->bd_bd->bb_first_free; ++ grp->mgd_free = e3b->bd_bd->bb_free; ++ for (i = 0; i < e3b->bd_blkbits; i++) { ++ J_ASSERT(i < 16); ++ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i]; ++ } ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto out; ++ err = 0; ++out: ++ brelse(bh); ++ if (handle) ++ ext3_journal_stop(handle); ++ return err; ++} ++ ++int ext3_mb_generate_buddy(struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = e3b->bd_sb; ++ struct buffer_head *bh; ++ int i, count = 0; ++ ++ memset(e3b->bd_bh->b_data, 0, sb->s_blocksize); ++ memset(e3b->bd_bh2->b_data, 0, sb->s_blocksize); ++ ++ bh = read_block_bitmap(sb, e3b->bd_group); ++ if (bh == NULL) ++ return -EIO; ++ ++ /* mb_free_blocks will set real free */ ++ e3b->bd_bd->bb_free = 0; ++ e3b->bd_bd->bb_first_free = 1 << 15; ++ /* ++ * if change bb_counters size, don't forget about ++ * ext3_mb_init_backend() -bzzz ++ */ ++ memset(e3b->bd_bd->bb_counters, 0, ++ sizeof(unsigned) * (sb->s_blocksize_bits + 2)); ++ ++ /* loop over the blocks, and create buddies for free ones */ ++ for (i = 0; i < sb->s_blocksize * 8; i++) { ++ if (!mb_test_bit(i, (void *) bh->b_data)) { ++ mb_free_blocks(e3b, i, 1); ++ count++; ++ } ++ } ++ brelse(bh); ++ mb_check_buddy(e3b); ++ ext3_mb_dirty_buddy(e3b); ++ ++ return 0; ++} ++ ++EXPORT_SYMBOL(ext3_mb_new_blocks); ++ ++#define MB_CREDITS \ ++ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \ ++ 2 * EXT3_SINGLEDATA_TRANS_BLOCKS) ++ ++int ext3_mb_init_backend(struct super_block *sb, int *created) ++{ ++ int err, i, len, descr_per_block, buddy_offset, size; ++ struct inode *root = sb->s_root->d_inode; ++ struct ext3_sb_info *sbi = 
EXT3_SB(sb); ++ struct ext3_mb_grp_header *hdr; ++ struct buffer_head *bh = NULL; ++ unsigned long block; ++ struct dentry *db; ++ handle_t *handle; ++ tid_t target; ++ ++ *created = 0; ++ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count; ++ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_buddy_blocks == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ return -ENOMEM; ++ } ++ memset(sbi->s_buddy_blocks, 0, len); ++ sbi->s_buddy = NULL; ++ ++ down(&root->i_sem); ++ len = strlen(EXT3_BUDDY_FILE); ++ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len); ++ if (IS_ERR(db)) { ++ err = PTR_ERR(db); ++ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err); ++ up(&root->i_sem); ++ goto out; ++ } ++ ++ if (db->d_inode == NULL) { ++ err = ext3_create(root, db, S_IFREG, NULL); ++ if (err) { ++ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err); ++ up(&root->i_sem); ++ goto out; ++ } ++ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME; ++ *created = 1; ++ mb_debug("no buddy file, regenerate\n"); ++ } ++ up(&root->i_sem); ++ sbi->s_buddy = igrab(db->d_inode); ++ ++ /* calculate needed size */ ++ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header)) ++ / sizeof(struct ext3_mb_group_descr); ++ buddy_offset = (sbi->s_groups_count + descr_per_block - 1) ++ / descr_per_block; ++ len = sbi->s_groups_count * sb->s_blocksize * 2 + ++ buddy_offset * sb->s_blocksize; ++ if (len != i_size_read(sbi->s_buddy)) { ++ if (*created == 0) ++ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n", ++ (unsigned) len, ++ (unsigned) i_size_read(sbi->s_buddy)); ++ *created = 1; ++ } ++ ++ /* read/create mb group descriptors */ ++ for (i = 0; i < buddy_offset; i++) { ++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); ++ if (IS_ERR(handle)) { ++ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); ++ err = PTR_ERR(handle); ++ goto err_out; ++ } ++ ++ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err); ++ if (bh 
== NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err); ++ goto err_out; ++ } ++ hdr = (struct ext3_mb_grp_header *) bh->b_data; ++ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto err_out; ++ if (*created == 0) ++ printk(KERN_ERR ++ "EXT3-fs: invalid header 0x%x in %d," ++ "regenerate\n", hdr->mh_magic, i); ++ *created = 1; ++ hdr->mh_magic = EXT3_MB_MAGIC_V1; ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ goto err_out; ++ } ++ brelse(bh); ++ ext3_journal_stop(handle); ++ } ++ ++ /* ++ * if change bb_counters size, don't forget about ext3_mb_generate_buddy() ++ */ ++ len = sizeof(struct ext3_buddy_group_blocks); ++ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ ++ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL); ++ if (sbi->s_buddy_blocks[i] == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ err = -ENOMEM; ++ goto out2; ++ } ++ memset(sbi->s_buddy_blocks[i], 0, len); ++ ++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); ++ if (IS_ERR(handle)) { ++ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); ++ err = PTR_ERR(handle); ++ goto out2; ++ } ++ ++ /* allocate block for bitmap */ ++ block = buddy_offset + i * 2; ++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); ++ if (bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err); ++ goto out2; ++ } ++ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; ++ brelse(bh); ++ ++ /* allocate block for buddy */ ++ block = buddy_offset + i * 2 + 1; ++ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); ++ if (bh == NULL) { ++ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err); ++ goto out2; ++ } ++ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; ++ brelse(bh); ++ ++ size = (block + 1) << sbi->s_buddy->i_blkbits; ++ if (size > sbi->s_buddy->i_size) { ++ *created = 1; ++ EXT3_I(sbi->s_buddy)->i_disksize 
= size; ++ i_size_write(sbi->s_buddy, size); ++ mark_inode_dirty(sbi->s_buddy); ++ } ++ ext3_journal_stop(handle); ++ ++ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock); ++ sbi->s_buddy_blocks[i]->bb_md_cur = NULL; ++ sbi->s_buddy_blocks[i]->bb_tid = 0; ++ } ++ ++ if (journal_start_commit(sbi->s_journal, &target)) ++ log_wait_commit(sbi->s_journal, target); ++ ++out2: ++ dput(db); ++out: ++ return err; ++ ++err_out: ++ return err; ++} ++ ++int ext3_mb_write_descriptors(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_buddy e3b; ++ int ret = 0, i, err; ++ ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_buddy_blocks[i] == NULL) ++ continue; ++ ++ err = ext3_mb_load_buddy(sb, i, &e3b); ++ if (err == 0) { ++ ext3_mb_update_descr(&e3b); ++ ext3_mb_release_desc(&e3b); ++ } else ++ ret = err; ++ } ++ return ret; ++} ++ ++int ext3_mb_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* release freed, non-committed blocks */ ++ spin_lock(&sbi->s_md_lock); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_committed_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ ext3_mb_free_committed_blocks(sb); ++ ++ if (sbi->s_buddy_blocks) { ++ ext3_mb_write_descriptors(sb); ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ if (sbi->s_buddy_blocks[i] == NULL) ++ continue; ++ kfree(sbi->s_buddy_blocks[i]); ++ } ++ kfree(sbi->s_buddy_blocks); ++ } ++ if (sbi->s_buddy) ++ iput(sbi->s_buddy); ++ if (sbi->s_blocks_reserved) ++ printk("ext3-fs: %ld blocks being reserved at umount!\n", ++ sbi->s_blocks_reserved); ++#ifdef MBALLOC_STATS ++ printk("EXT3-fs: mballoc: %lu blocks %lu reqs (%lu success)\n", ++ sbi->s_bal_allocated, sbi->s_bal_reqs, sbi->s_bal_success); ++ printk("EXT3-fs: mballoc: %lu extents scanned, %lu goal hits, %lu breaks\n", ++ 
sbi->s_bal_ex_scanned, sbi->s_bal_goals, sbi->s_bal_breaks); ++#endif ++ return 0; ++} ++ ++int ext3_mb_init(struct super_block *sb, int needs_recovery) ++{ ++ struct ext3_buddy e3b; ++ int i, err, created; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* init file for buddy data */ ++ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); ++ if ((err = ext3_mb_init_backend(sb, &created))) ++ return err; ++ ++repeat: ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { ++ err = ext3_mb_load_buddy(sb, i, &e3b); ++ if (err) { ++ /* FIXME: release backend */ ++ return err; ++ } ++ if (created || needs_recovery) ++ ext3_mb_generate_buddy(&e3b); ++ else ++ err = ext3_mb_load_descr(&e3b); ++ ext3_mb_release_desc(&e3b); ++ if (err == -ENODATA) { ++ created = 1; ++ goto repeat; ++ } ++ } ++ if (created || needs_recovery) ++ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n", ++ EXT3_SB(sb)->s_groups_count); ++ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); ++ spin_lock_init(&EXT3_SB(sb)->s_md_lock); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); ++ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); ++ ++#ifdef MBALLOC_STATS ++ spin_lock_init(&EXT3_SB(sb)->s_bal_lock); ++#define MBALLOC_INFO " (stats)" ++#else ++#define MBALLOC_INFO "" ++#endif ++ printk("EXT3-fs: mballoc enabled%s\n", MBALLOC_INFO); ++ return 0; ++} ++ ++void ext3_mb_free_committed_blocks(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int err, i, count = 0, count2 = 0; ++ struct ext3_free_metadata *md; ++ struct ext3_buddy e3b; ++ ++ if (list_empty(&sbi->s_committed_transaction)) ++ return; ++ ++ /* there is committed blocks to be freed yet */ ++ do { ++ /* get next array of blocks */ ++ md = NULL; ++ spin_lock(&sbi->s_md_lock); ++ if (!list_empty(&sbi->s_committed_transaction)) { ++ md = list_entry(sbi->s_committed_transaction.next, ++ struct 
ext3_free_metadata, list); ++ list_del(&md->list); ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ if (md == NULL) ++ break; ++ ++ mb_debug("gonna free %u blocks in group %u (0x%p):", ++ md->num, md->group, md); ++ ++ err = ext3_mb_load_buddy(sb, md->group, &e3b); ++ BUG_ON(err != 0); ++ ++ /* there are blocks to put in buddy to make them really free */ ++ count += md->num; ++ count2++; ++ ext3_lock_group(sb, md->group); ++ for (i = 0; i < md->num; i++) { ++ mb_debug(" %u", md->blocks[i]); ++ mb_free_blocks(&e3b, md->blocks[i], 1); ++ } ++ mb_debug("\n"); ++ ext3_unlock_group(sb, md->group); ++ ++ kfree(md); ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ } while (md); ++ mb_debug("freed %u blocks in %u structures\n", count, count2); ++} ++ ++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ if (sbi->s_last_transaction == handle->h_transaction->t_tid) ++ return; ++ ++ /* new transaction! time to close last one and free blocks for ++ * committed transaction. we know that only one transaction can be ++ * active, so previous transaction can be being logged and we ++ * know that transaction before previous is known to be already ++ * logged. this means that now we may free blocks freed in all ++ * transactions before previous one. hope I'm clear enough ...
*/ ++ ++ spin_lock(&sbi->s_md_lock); ++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { ++ mb_debug("new transaction %lu, old %lu\n", ++ (unsigned long) handle->h_transaction->t_tid, ++ (unsigned long) sbi->s_last_transaction); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_closed_transaction); ++ sbi->s_last_transaction = handle->h_transaction->t_tid; ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ ext3_mb_free_committed_blocks(sb); ++} ++ ++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, ++ int group, int block, int count) ++{ ++ struct ext3_buddy_group_blocks *db = e3b->bd_bd; ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_free_metadata *md; ++ int i; ++ ++ ext3_lock_group(sb, group); ++ for (i = 0; i < count; i++) { ++ md = db->bb_md_cur; ++ if (md && db->bb_tid != handle->h_transaction->t_tid) { ++ db->bb_md_cur = NULL; ++ md = NULL; ++ } ++ ++ if (md == NULL) { ++ ext3_unlock_group(sb, group); ++ md = kmalloc(sizeof(*md), GFP_KERNEL); ++ if (md == NULL) ++ return -ENOMEM; ++ md->num = 0; ++ md->group = group; ++ ++ ext3_lock_group(sb, group); ++ if (db->bb_md_cur == NULL) { ++ spin_lock(&sbi->s_md_lock); ++ list_add(&md->list, &sbi->s_active_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ db->bb_md_cur = md; ++ db->bb_tid = handle->h_transaction->t_tid; ++ mb_debug("new md 0x%p for group %u\n", ++ md, md->group); ++ } else { ++ kfree(md); ++ md = db->bb_md_cur; ++ } ++ } ++ ++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); ++ md->blocks[md->num] = block + i; ++ md->num++; ++ if (md->num == EXT3_BB_MAX_BLOCKS) { ++ /* no more space, put full container on a sb's list */ ++ db->bb_md_cur = NULL; ++ } ++ } ++ ext3_unlock_group(sb, group); ++ return 0; ++} ++ ++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, ++ unsigned long block, unsigned long count, ++ int metadata, int *freed) 
++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ unsigned long bit, overflow; ++ struct buffer_head *gd_bh; ++ unsigned long block_group; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ int err = 0, ret; ++ ++ *freed = 0; ++ sb = inode->i_sb; ++ if (!sb) { ++ printk ("ext3_free_blocks: nonexistent device"); ++ return; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ if (block < le32_to_cpu(es->s_first_data_block) || ++ block + count < block || ++ block + count > le32_to_cpu(es->s_blocks_count)) { ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks not in datazone - " ++ "block = %lu, count = %lu", block, count); ++ goto error_return; ++ } ++ ++ ext3_debug("freeing block %lu\n", block); ++ ++do_more: ++ overflow = 0; ++ block_group = (block - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ bit = (block - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb); ++ /* ++ * Check to see if we are freeing blocks across a group ++ * boundary. 
++ */ ++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { ++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); ++ count -= overflow; ++ } ++ brelse(bitmap_bh); ++ bitmap_bh = read_block_bitmap(sb, block_group); ++ if (!bitmap_bh) ++ goto error_return; ++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); ++ if (!gdp) ++ goto error_return; ++ ++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || ++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || ++ in_range (block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group) || ++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks in system zones - " ++ "Block = %lu, count = %lu", ++ block, count); ++ ++ BUFFER_TRACE(bitmap_bh, "getting write access"); ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) ++ goto error_return; ++ ++ /* ++ * We are about to modify some metadata. Call the journal APIs ++ * to unshare ->b_data if a currently-committing transaction is ++ * using it ++ */ ++ BUFFER_TRACE(gd_bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, gd_bh); ++ if (err) ++ goto error_return; ++ ++ err = ext3_mb_load_buddy(sb, block_group, &e3b); ++ if (err) ++ goto error_return; ++ ++ if (metadata) { ++ /* blocks being freed are metadata. 
these blocks shouldn't ++ * be used until this transaction is committed */ ++ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); ++ } else { ++ ext3_lock_group(sb, block_group); ++ mb_free_blocks(&e3b, bit, count); ++ ext3_unlock_group(sb, block_group); ++ } ++ spin_lock(sb_bgl_lock(sbi, block_group)); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); ++ spin_unlock(sb_bgl_lock(sbi, block_group)); ++ ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ mb_clear_bits(bitmap_bh->b_data, bit, count); ++ *freed = count; ++ ++ /* We dirtied the bitmap block */ ++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ ++ /* And the group descriptor block */ ++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ++ ret = ext3_journal_dirty_metadata(handle, gd_bh); ++ if (!err) err = ret; ++ ++ if (overflow && !err) { ++ block += count; ++ count = overflow; ++ goto do_more; ++ } ++ sb->s_dirt = 1; ++error_return: ++ brelse(bitmap_bh); ++ ext3_std_error(sb, err); ++ return; ++} ++ ++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int free, ret = -ENOSPC; ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter); ++ if (blocks <= free - sbi->s_blocks_reserved) { ++ sbi->s_blocks_reserved += blocks; ++ ret = 0; ++ } ++ spin_unlock(&sbi->s_reserve_lock); ++ return ret; ++} ++ ++void ext3_mb_release_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ sbi->s_blocks_reserved -= blocks; ++ WARN_ON(sbi->s_blocks_reserved < 0); ++ if (sbi->s_blocks_reserved < 0) ++ sbi->s_blocks_reserved = 0; ++ spin_unlock(&sbi->s_reserve_lock); ++} ++ ++int ext3_new_block(handle_t *handle, struct inode *inode, ++ unsigned long goal, int 
*errp) ++{ ++ int ret, len; ++ ++ if (!test_opt(inode->i_sb, MBALLOC)) { ++ ret = ext3_new_block_old(handle, inode, goal, errp); ++ goto out; ++ } ++ len = 1; ++ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); ++out: ++ return ret; ++} ++ ++ ++void ext3_free_blocks(handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count, int metadata) ++{ ++ struct super_block *sb; ++ int freed; ++ ++ sb = inode->i_sb; ++ if (!test_opt(sb, MBALLOC)) ++ ext3_free_blocks_sb(handle, sb, block, count, &freed); ++ else ++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed); ++ if (freed) ++ DQUOT_FREE_BLOCK(inode, freed); ++ return; ++} ++ +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c 2005-02-25 17:27:00.231757312 +0200 ++++ linux-stage/fs/ext3/super.c 2005-02-25 17:28:41.862307120 +0200 +@@ -394,6 +394,7 @@ + struct ext3_super_block *es = sbi->s_es; + int i; + ++ ext3_mb_release(sb); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -592,7 +593,7 @@ + Opt_commit, Opt_journal_update, Opt_journal_inum, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, +- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, ++ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_mballoc, Opt_mbfactor, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug, + }; +@@ -646,6 +647,8 @@ + {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_extents, "extents"}, + {Opt_extdebug, "extdebug"}, ++ {Opt_mballoc, "mballoc"}, ++ {Opt_mbfactor, "mbfactor=%u"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +@@ -956,6 +959,16 @@ + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; ++ case Opt_mballoc: ++ set_opt (sbi->s_mount_opt, MBALLOC); ++ break; ++ case Opt_mbfactor: ++ if
(match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_mb_factor = option; ++ break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " +@@ -1639,7 +1652,8 @@ + ext3_count_dirs(sb)); + + ext3_ext_init(sb); +- ++ ext3_mb_init(sb, needs_recovery); ++ + return 0; + + failed_mount3: +Index: linux-stage/fs/ext3/Makefile +=================================================================== +--- linux-stage.orig/fs/ext3/Makefile 2005-02-25 17:27:00.228757768 +0200 ++++ linux-stage/fs/ext3/Makefile 2005-02-25 17:28:41.863306968 +0200 +@@ -5,7 +5,8 @@ + obj-$(CONFIG_EXT3_FS) += ext3.o + + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\ +- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o ++ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ++ mballoc.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +Index: linux-stage/fs/ext3/balloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/balloc.c 2005-02-25 17:26:58.965949744 +0200 ++++ linux-stage/fs/ext3/balloc.c 2005-02-25 17:28:41.865306664 +0200 +@@ -79,7 +79,7 @@ + * + * Return buffer_head on success or NULL in case of failure. 
+ */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -450,24 +450,6 @@ + return; + } + +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- unsigned long block, unsigned long count) +-{ +- struct super_block * sb; +- int dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1140,7 +1122,7 @@ + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. + */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-stage/fs/ext3/namei.c +=================================================================== +--- linux-stage.orig/fs/ext3/namei.c 2005-02-25 17:26:59.527864320 +0200 ++++ linux-stage/fs/ext3/namei.c 2005-02-25 17:28:41.867306360 +0200 +@@ -1639,7 +1639,7 @@ + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
+ */ +-static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, ++int ext3_create (struct inode * dir, struct dentry * dentry, int mode, + struct nameidata *nd) + { + handle_t *handle; +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2005-02-25 17:27:00.227757920 +0200 ++++ linux-stage/fs/ext3/inode.c 2005-02-25 17:28:41.872305600 +0200 +@@ -572,7 +572,7 @@ + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -673,7 +673,7 @@ + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, +- le32_to_cpu(where[i].key), 1); ++ le32_to_cpu(where[i].key), 1, 1); + return err; + } + +@@ -1831,7 +1831,7 @@ + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2004,7 +2004,7 @@ + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-stage/fs/ext3/extents.c +=================================================================== +--- linux-stage.orig/fs/ext3/extents.c 2005-02-25 17:27:00.222758680 +0200 ++++ linux-stage/fs/ext3/extents.c 2005-02-25 17:29:29.364085752 +0200 +@@ -740,7 +740,7 @@ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1391,7 +1391,7 @@ + path->p_idx->ei_leaf); + bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ 
ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + +@@ -1879,10 +1879,12 @@ + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1894,7 +1896,7 @@ + bh = sb_find_get_block(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-stage/fs/ext3/xattr.c +=================================================================== +--- linux-stage.orig/fs/ext3/xattr.c 2005-02-25 17:26:59.876811272 +0200 ++++ linux-stage/fs/ext3/xattr.c 2005-02-25 17:28:41.878304688 +0200 +@@ -1271,7 +1271,7 @@ + new_bh = sb_getblk(sb, block); + if (!new_bh) { + getblk_failed: +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + error = -EIO; + goto cleanup; + } +@@ -1318,7 +1318,7 @@ + if (ce) + mb_cache_entry_free(ce); + ea_bdebug(old_bh, "freeing"); +- ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1); ++ ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1, 1); + + /* ext3_forget() calls bforget() for us, but we + let our caller release old_bh, so we need to +@@ -1417,7 +1417,7 @@ + if (HDR(bh)->h_refcount == cpu_to_le32(1)) { + if (ce) + mb_cache_entry_free(ce); +- ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1); ++ ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1, 1); + get_bh(bh); + 
ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); + } else { +Index: linux-stage/include/linux/ext3_fs.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs.h 2005-02-25 17:27:00.234756856 +0200 ++++ linux-stage/include/linux/ext3_fs.h 2005-02-25 17:28:41.881304232 +0200 +@@ -57,6 +57,14 @@ + #define ext3_debug(f, a...) do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -365,6 +373,7 @@ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */ + + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ +@@ -725,7 +734,7 @@ + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, +- unsigned long); ++ unsigned long, int); + extern void ext3_free_blocks_sb (handle_t *, struct super_block *, + unsigned long, unsigned long, int *); + extern unsigned long ext3_count_free_blocks (struct super_block *); +@@ -856,6 +865,37 @@ + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct super_block *, int); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++ ++/* 
writeback.c */ ++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); ++extern int ext3_wb_prepare_write(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); ++extern int ext3_wb_writepage(struct page *, struct writeback_control *); ++extern int ext3_wb_invalidatepage(struct page *, unsigned long); ++extern int ext3_wb_releasepage(struct page *, int); ++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); ++extern void ext3_wb_init(struct super_block *); ++extern void ext3_wb_release(struct super_block *); ++ ++/* writeback.c */ ++extern int ext3_wb_writepages(struct address_space *, struct writeback_control *); ++extern int ext3_wb_prepare_write(struct file *file, struct page *page, ++ unsigned from, unsigned to); ++extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned); ++extern int ext3_wb_writepage(struct page *, struct writeback_control *); ++extern int ext3_wb_invalidatepage(struct page *, unsigned long); ++extern int ext3_wb_releasepage(struct page *, int); ++extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t); ++extern void ext3_wb_init(struct super_block *); ++extern void ext3_wb_release(struct super_block *); ++ + #endif /* __KERNEL__ */ + + /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ +Index: linux-stage/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_sb.h 2005-02-25 17:26:59.641846992 +0200 ++++ linux-stage/include/linux/ext3_fs_sb.h 2005-02-25 17:28:41.882304080 +0200 +@@ -23,10 +23,30 @@ + #define EXT_INCLUDE + #include + #include ++#include + #endif + #endif + #include + ++#define EXT3_BB_MAX_BLOCKS 30 ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++struct ext3_buddy_group_blocks { ++ __u32 bb_bitmap; ++ __u32 bb_buddy; ++ spinlock_t bb_lock; ++ unsigned long bb_tid; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned short bb_first_free; ++ unsigned short bb_free; ++ unsigned bb_counters[]; ++}; ++ + /* + * third extended-fs super-block data in memory + */ +@@ -81,6 +101,27 @@ + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_buddy_group_blocks **s_buddy_blocks; ++ struct inode *s_buddy; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ tid_t s_last_transaction; ++ int s_mb_factor; ++ ++ /* stats for buddy allocator */ ++ spinlock_t s_bal_lock; ++ unsigned long s_bal_reqs; /* number of reqs with len > 1 */ ++ unsigned long s_bal_success; /* we found long enough chunks */ ++ unsigned long s_bal_allocated; /* in blocks */ ++ unsigned long s_bal_ex_scanned; /* total extents scanned */ ++ unsigned long s_bal_goals; /* goal hits */ ++ unsigned long s_bal_breaks; /* too long searches */ + }; + + #endif /* _LINUX_EXT3_FS_SB */ diff --git a/lustre/kernel_patches/patches/vfs_intent-2.6-rhel4.patch 
b/lustre/kernel_patches/patches/vfs_intent-2.6-rhel4.patch index d0aaa51..e00f2f3 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.6-rhel4.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.6-rhel4.patch @@ -1,7 +1,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/exec.c =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/fs/exec.c 2005-02-25 13:43:02.688574384 +0200 -+++ linux-2.6.9-5.0.3.EL/fs/exec.c 2005-02-25 13:43:42.442530864 +0200 +--- linux-2.6.9-5.0.3.EL.orig/fs/exec.c 2005-02-26 14:28:01.373228096 +0200 ++++ linux-2.6.9-5.0.3.EL/fs/exec.c 2005-02-26 14:32:04.728232512 +0200 @@ -124,9 +124,10 @@ struct file * file; struct nameidata nd; @@ -47,8 +47,8 @@ Index: linux-2.6.9-5.0.3.EL/fs/exec.c if (err) { Index: linux-2.6.9-5.0.3.EL/fs/namei.c =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/fs/namei.c 2005-02-25 13:43:02.692573776 +0200 -+++ linux-2.6.9-5.0.3.EL/fs/namei.c 2005-02-25 13:43:42.446530256 +0200 +--- linux-2.6.9-5.0.3.EL.orig/fs/namei.c 2005-02-26 14:28:01.378227336 +0200 ++++ linux-2.6.9-5.0.3.EL/fs/namei.c 2005-02-26 14:32:04.732231904 +0200 @@ -272,8 +272,19 @@ return 0; } @@ -156,7 +156,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c dput(next.dentry); mntput(next.mnt); if (err) -@@ -791,14 +842,26 @@ +@@ -791,14 +842,34 @@ inode = nd->dentry->d_inode; /* fallthrough */ case 1: @@ -169,6 +169,14 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c + path_release(nd); + goto return_err; + } ++ if (lookup_flags & LOOKUP_DIRECTORY) { ++ err = -ENOTDIR; ++ if(!nd->dentry->d_inode->i_op || ++ !nd->dentry->d_inode->i_op->lookup) { ++ path_release(nd); ++ goto return_err; ++ } ++ } goto return_reval; } + @@ -183,7 +191,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c if (err) break; follow_mount(&next.mnt, &next.dentry); -@@ -1016,7 +1079,7 @@ +@@ -1016,7 +1087,7 @@ } /* SMP-safe */ @@ -192,7 +200,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c { unsigned long hash; struct qstr 
this; -@@ -1036,11 +1099,16 @@ +@@ -1036,11 +1107,16 @@ } this.hash = end_name_hash(hash); @@ -210,7 +218,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c /* * namei() * -@@ -1052,7 +1120,7 @@ +@@ -1052,7 +1128,7 @@ * that namei follows links, while lnamei does not. * SMP-safe */ @@ -219,7 +227,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c { char *tmp = getname(name); int err = PTR_ERR(tmp); -@@ -1064,6 +1132,12 @@ +@@ -1064,6 +1140,12 @@ return err; } @@ -232,7 +240,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c /* * It's inline, so penalty for filesystems that don't use sticky bit is * minimal. -@@ -1347,8 +1421,8 @@ +@@ -1347,8 +1429,8 @@ acc_mode |= MAY_APPEND; /* Fill in the open() intent data */ @@ -243,7 +251,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c /* * The simplest case - just a plain lookup. -@@ -1363,6 +1437,7 @@ +@@ -1363,6 +1445,7 @@ /* * Create - we need to know the parent. */ @@ -251,7 +259,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd); if (error) return error; -@@ -1379,7 +1454,9 @@ +@@ -1379,7 +1462,9 @@ dir = nd->dentry; nd->flags &= ~LOOKUP_PARENT; down(&dir->d_inode->i_sem); @@ -261,7 +269,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c do_last: error = PTR_ERR(dentry); -@@ -1492,7 +1569,9 @@ +@@ -1492,7 +1577,9 @@ } dir = nd->dentry; down(&dir->d_inode->i_sem); @@ -273,8 +281,8 @@ Index: linux-2.6.9-5.0.3.EL/fs/namei.c } Index: linux-2.6.9-5.0.3.EL/fs/namespace.c =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/fs/namespace.c 2005-02-25 13:43:02.695573320 +0200 -+++ linux-2.6.9-5.0.3.EL/fs/namespace.c 2005-02-25 13:43:42.448529952 +0200 +--- linux-2.6.9-5.0.3.EL.orig/fs/namespace.c 2005-02-26 14:28:01.381226880 +0200 ++++ linux-2.6.9-5.0.3.EL/fs/namespace.c 2005-02-26 14:32:04.734231600 +0200 @@ -61,6 +61,7 @@ INIT_LIST_HEAD(&mnt->mnt_mounts); INIT_LIST_HEAD(&mnt->mnt_list); @@ -336,8 +344,8 @@ Index: 
linux-2.6.9-5.0.3.EL/fs/namespace.c flags &= ~MS_MGC_MSK; Index: linux-2.6.9-5.0.3.EL/fs/open.c =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/fs/open.c 2005-02-25 13:43:02.725568760 +0200 -+++ linux-2.6.9-5.0.3.EL/fs/open.c 2005-02-25 13:43:42.451529496 +0200 +--- linux-2.6.9-5.0.3.EL.orig/fs/open.c 2005-02-26 14:28:01.383226576 +0200 ++++ linux-2.6.9-5.0.3.EL/fs/open.c 2005-02-26 14:32:04.736231296 +0200 @@ -215,12 +215,12 @@ struct nameidata nd; struct inode * inode; @@ -381,7 +389,7 @@ Index: linux-2.6.9-5.0.3.EL/fs/open.c { struct nameidata nd; int error; -+ intent_init(&nd.intent, IT_CHDIR); ++ intent_init(&nd.intent, IT_GETATTR); - error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); + error = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); @@ -492,8 +500,8 @@ Index: linux-2.6.9-5.0.3.EL/fs/open.c */ Index: linux-2.6.9-5.0.3.EL/fs/stat.c =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/fs/stat.c 2005-02-25 13:43:02.726568608 +0200 -+++ linux-2.6.9-5.0.3.EL/fs/stat.c 2005-02-25 13:43:42.452529344 +0200 +--- linux-2.6.9-5.0.3.EL.orig/fs/stat.c 2005-02-26 14:28:01.384226424 +0200 ++++ linux-2.6.9-5.0.3.EL/fs/stat.c 2005-02-26 14:32:04.738230992 +0200 @@ -37,7 +37,7 @@ EXPORT_SYMBOL(generic_fillattr); @@ -565,8 +573,8 @@ Index: linux-2.6.9-5.0.3.EL/fs/stat.c return error; Index: linux-2.6.9-5.0.3.EL/fs/nfs/dir.c =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/fs/nfs/dir.c 2005-02-25 13:43:02.729568152 +0200 -+++ linux-2.6.9-5.0.3.EL/fs/nfs/dir.c 2005-02-25 13:52:18.971006600 +0200 +--- linux-2.6.9-5.0.3.EL.orig/fs/nfs/dir.c 2005-02-26 14:28:01.387225968 +0200 ++++ linux-2.6.9-5.0.3.EL/fs/nfs/dir.c 2005-02-26 14:32:04.740230688 +0200 @@ -718,7 +718,7 @@ return 0; if (!nd || (nd->flags & LOOKUP_CONTINUE) || !(nd->flags & LOOKUP_CREATE)) @@ -587,8 +595,8 @@ Index: 
linux-2.6.9-5.0.3.EL/fs/nfs/dir.c * The 0 argument passed into the create function should one day Index: linux-2.6.9-5.0.3.EL/fs/inode.c =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/fs/inode.c 2005-02-25 13:43:02.731567848 +0200 -+++ linux-2.6.9-5.0.3.EL/fs/inode.c 2005-02-25 13:43:42.457528584 +0200 +--- linux-2.6.9-5.0.3.EL.orig/fs/inode.c 2005-02-26 14:28:01.389225664 +0200 ++++ linux-2.6.9-5.0.3.EL/fs/inode.c 2005-02-26 14:32:04.742230384 +0200 @@ -233,6 +233,7 @@ inodes_stat.nr_unused--; } @@ -599,8 +607,8 @@ Index: linux-2.6.9-5.0.3.EL/fs/inode.c * @inode: inode to clear Index: linux-2.6.9-5.0.3.EL/include/linux/dcache.h =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/include/linux/dcache.h 2005-02-25 13:43:02.733567544 +0200 -+++ linux-2.6.9-5.0.3.EL/include/linux/dcache.h 2005-02-25 13:43:42.459528280 +0200 +--- linux-2.6.9-5.0.3.EL.orig/include/linux/dcache.h 2005-02-26 14:28:01.390225512 +0200 ++++ linux-2.6.9-5.0.3.EL/include/linux/dcache.h 2005-02-26 14:32:04.743230232 +0200 @@ -4,6 +4,7 @@ #ifdef __KERNEL__ @@ -620,8 +628,8 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/dcache.h int nr_unused; Index: linux-2.6.9-5.0.3.EL/include/linux/fs.h =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/include/linux/fs.h 2005-02-25 13:43:02.736567088 +0200 -+++ linux-2.6.9-5.0.3.EL/include/linux/fs.h 2005-02-25 13:43:42.462527824 +0200 +--- linux-2.6.9-5.0.3.EL.orig/include/linux/fs.h 2005-02-26 14:28:01.393225056 +0200 ++++ linux-2.6.9-5.0.3.EL/include/linux/fs.h 2005-02-26 14:32:04.745229928 +0200 @@ -74,6 +74,7 @@ #define FMODE_READ 1 @@ -691,9 +699,9 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/fs.h Index: linux-2.6.9-5.0.3.EL/include/linux/namei.h =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/include/linux/namei.h 2005-02-25 13:43:02.737566936 +0200 -+++ 
linux-2.6.9-5.0.3.EL/include/linux/namei.h 2005-02-25 13:53:33.690647488 +0200 -@@ -2,14 +2,49 @@ +--- linux-2.6.9-5.0.3.EL.orig/include/linux/namei.h 2005-02-26 14:28:01.396224600 +0200 ++++ linux-2.6.9-5.0.3.EL/include/linux/namei.h 2005-02-26 14:32:04.747229624 +0200 +@@ -2,14 +2,48 @@ #define _LINUX_NAMEI_H #include @@ -714,7 +722,6 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/namei.h +#define IT_UNLINK (1<<5) +#define IT_TRUNC (1<<6) +#define IT_GETXATTR (1<<7) -+#define IT_CHDIR (1<<8) + +struct lustre_intent_data { + int it_disposition; @@ -746,8 +753,7 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/namei.h enum { MAX_NESTED_LINKS = 8 }; struct nameidata { -@@ -20,11 +55,8 @@ - int last_type; +@@ -21,10 +56,7 @@ unsigned depth; char *saved_names[MAX_NESTED_LINKS + 1]; @@ -759,16 +765,16 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/namei.h }; /* -@@ -46,6 +79,8 @@ +@@ -46,6 +78,8 @@ #define LOOKUP_PARENT 16 #define LOOKUP_NOALT 32 #define LOOKUP_ATOMIC 64 -+#define LOOKUP_LAST (1<<7) -+#define LOOKUP_LINK_NOTLAST (1<<8) ++#define LOOKUP_LAST (0x1000) ++#define LOOKUP_LINK_NOTLAST (0x2000) /* * Intent data -@@ -55,6 +90,12 @@ +@@ -55,6 +89,12 @@ #define LOOKUP_ACCESS (0x0400) extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *)); @@ -781,7 +787,7 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/namei.h #define user_path_walk(name,nd) \ __user_walk(name, LOOKUP_FOLLOW, nd) #define user_path_walk_link(name,nd) \ -@@ -67,7 +108,6 @@ +@@ -67,7 +107,6 @@ extern struct dentry * lookup_one_len(const char *, struct dentry *, int); extern struct dentry * lookup_hash(struct qstr *, struct dentry *); @@ -791,8 +797,8 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/namei.h Index: linux-2.6.9-5.0.3.EL/include/linux/mount.h =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/include/linux/mount.h 2005-02-25 13:43:02.738566784 +0200 -+++ linux-2.6.9-5.0.3.EL/include/linux/mount.h 2005-02-25 
13:43:42.464527520 +0200 +--- linux-2.6.9-5.0.3.EL.orig/include/linux/mount.h 2005-02-26 14:28:01.397224448 +0200 ++++ linux-2.6.9-5.0.3.EL/include/linux/mount.h 2005-02-26 14:32:04.748229472 +0200 @@ -34,6 +34,8 @@ struct list_head mnt_list; struct list_head mnt_fslink; /* link in fs-specific expiry list */ @@ -804,8 +810,8 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/mount.h static inline struct vfsmount *mntget(struct vfsmount *mnt) Index: linux-2.6.9-5.0.3.EL/kernel/exit.c =================================================================== ---- linux-2.6.9-5.0.3.EL.orig/kernel/exit.c 2005-02-25 13:43:02.740566480 +0200 -+++ linux-2.6.9-5.0.3.EL/kernel/exit.c 2005-02-25 13:43:42.466527216 +0200 +--- linux-2.6.9-5.0.3.EL.orig/kernel/exit.c 2005-02-26 14:28:01.399224144 +0200 ++++ linux-2.6.9-5.0.3.EL/kernel/exit.c 2005-02-26 14:32:04.750229168 +0200 @@ -244,6 +244,8 @@ write_unlock_irq(&tasklist_lock); } diff --git a/lustre/kernel_patches/patches/vfs_nointent-2.6-vanilla.patch b/lustre/kernel_patches/patches/vfs_nointent-2.6-rhel4.patch similarity index 88% rename from lustre/kernel_patches/patches/vfs_nointent-2.6-vanilla.patch rename to lustre/kernel_patches/patches/vfs_nointent-2.6-rhel4.patch index a2dab51..1aae992 100644 --- a/lustre/kernel_patches/patches/vfs_nointent-2.6-vanilla.patch +++ b/lustre/kernel_patches/patches/vfs_nointent-2.6-rhel4.patch @@ -1,12 +1,8 @@ - 0 files changed - -.old..........pc/vfs_nointent_2.6.0-suse/fs/namei.c -.new.........fs/namei.c -Index: linux-2.6.4-51.0/fs/namei.c +Index: linux-2.6.9-5.0.3.EL/fs/namei.c =================================================================== ---- linux-2.6.4-51.0.orig/fs/namei.c 2004-04-05 17:36:42.000000000 -0400 -+++ linux-2.6.4-51.0/fs/namei.c 2004-04-05 17:36:43.000000000 -0400 -@@ -1276,7 +1276,7 @@ +--- linux-2.6.9-5.0.3.EL.orig/fs/namei.c 2005-02-26 13:29:11.948782168 +0200 ++++ linux-2.6.9-5.0.3.EL/fs/namei.c 2005-02-26 13:29:13.355568304 +0200 +@@ -1380,7 +1380,7 @@ if (!error) { 
DQUOT_INIT(inode); @@ -15,7 +11,7 @@ Index: linux-2.6.4-51.0/fs/namei.c } put_write_access(inode); if (error) -@@ -1526,6 +1526,7 @@ +@@ -1638,6 +1638,7 @@ char * tmp; struct dentry * dentry; struct nameidata nd; @@ -23,7 +19,7 @@ Index: linux-2.6.4-51.0/fs/namei.c if (S_ISDIR(mode)) return -EPERM; -@@ -1536,6 +1537,15 @@ +@@ -1648,6 +1649,15 @@ error = path_lookup(tmp, LOOKUP_PARENT, &nd); if (error) goto out; @@ -39,7 +35,7 @@ Index: linux-2.6.4-51.0/fs/namei.c dentry = lookup_create(&nd, 0); error = PTR_ERR(dentry); -@@ -1562,6 +1572,7 @@ +@@ -1674,6 +1684,7 @@ dput(dentry); } up(&nd.dentry->d_inode->i_sem); @@ -47,7 +43,7 @@ Index: linux-2.6.4-51.0/fs/namei.c path_release(&nd); out: putname(tmp); -@@ -1603,10 +1614,18 @@ +@@ -1715,10 +1726,18 @@ if (!IS_ERR(tmp)) { struct dentry *dentry; struct nameidata nd; @@ -66,7 +62,7 @@ Index: linux-2.6.4-51.0/fs/namei.c dentry = lookup_create(&nd, 1); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { -@@ -1616,6 +1635,7 @@ +@@ -1728,6 +1747,7 @@ dput(dentry); } up(&nd.dentry->d_inode->i_sem); @@ -74,7 +70,7 @@ Index: linux-2.6.4-51.0/fs/namei.c path_release(&nd); out: putname(tmp); -@@ -1696,6 +1716,7 @@ +@@ -1808,6 +1828,7 @@ char * name; struct dentry *dentry; struct nameidata nd; @@ -82,7 +78,7 @@ Index: linux-2.6.4-51.0/fs/namei.c name = getname(pathname); if(IS_ERR(name)) -@@ -1716,6 +1737,16 @@ +@@ -1828,6 +1849,16 @@ error = -EBUSY; goto exit1; } @@ -99,7 +95,7 @@ Index: linux-2.6.4-51.0/fs/namei.c down(&nd.dentry->d_inode->i_sem); dentry = lookup_hash(&nd.last, nd.dentry); error = PTR_ERR(dentry); -@@ -1774,6 +1805,7 @@ +@@ -1886,6 +1917,7 @@ struct dentry *dentry; struct nameidata nd; struct inode *inode = NULL; @@ -107,7 +103,7 @@ Index: linux-2.6.4-51.0/fs/namei.c name = getname(pathname); if(IS_ERR(name)) -@@ -1785,6 +1817,13 @@ +@@ -1897,6 +1929,13 @@ error = -EISDIR; if (nd.last_type != LAST_NORM) goto exit1; @@ -121,7 +117,7 @@ Index: linux-2.6.4-51.0/fs/namei.c down(&nd.dentry->d_inode->i_sem); dentry = 
lookup_hash(&nd.last, nd.dentry); error = PTR_ERR(dentry); -@@ -1852,10 +1891,18 @@ +@@ -1963,10 +2002,18 @@ if (!IS_ERR(to)) { struct dentry *dentry; struct nameidata nd; @@ -140,7 +136,7 @@ Index: linux-2.6.4-51.0/fs/namei.c dentry = lookup_create(&nd, 0); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { -@@ -1863,6 +1910,7 @@ +@@ -1974,6 +2021,7 @@ dput(dentry); } up(&nd.dentry->d_inode->i_sem); @@ -148,7 +144,7 @@ Index: linux-2.6.4-51.0/fs/namei.c path_release(&nd); out: putname(to); -@@ -1926,6 +1974,8 @@ +@@ -2037,6 +2085,8 @@ struct nameidata nd, old_nd; int error; char * to; @@ -157,7 +153,7 @@ Index: linux-2.6.4-51.0/fs/namei.c to = getname(newname); if (IS_ERR(to)) -@@ -1940,6 +1990,13 @@ +@@ -2051,6 +2101,13 @@ error = -EXDEV; if (old_nd.mnt != nd.mnt) goto out_release; @@ -171,7 +167,7 @@ Index: linux-2.6.4-51.0/fs/namei.c new_dentry = lookup_create(&nd, 0); error = PTR_ERR(new_dentry); if (!IS_ERR(new_dentry)) { -@@ -1990,7 +2047,7 @@ +@@ -2101,7 +2158,7 @@ * locking]. */ int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, @@ -180,7 +176,7 @@ Index: linux-2.6.4-51.0/fs/namei.c { int error = 0; struct inode *target; -@@ -2035,7 +2092,7 @@ +@@ -2146,7 +2203,7 @@ } int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, @@ -189,7 +185,7 @@ Index: linux-2.6.4-51.0/fs/namei.c { struct inode *target; int error; -@@ -2112,6 +2169,8 @@ +@@ -2223,6 +2280,8 @@ struct dentry * old_dentry, *new_dentry; struct dentry * trap; struct nameidata oldnd, newnd; @@ -198,7 +194,7 @@ Index: linux-2.6.4-51.0/fs/namei.c error = path_lookup(oldname, LOOKUP_PARENT, &oldnd); if (error) -@@ -2134,6 +2193,13 @@ +@@ -2245,6 +2304,13 @@ if (newnd.last_type != LAST_NORM) goto exit2; @@ -212,7 +208,7 @@ Index: linux-2.6.4-51.0/fs/namei.c trap = lock_rename(new_dir, old_dir); old_dentry = lookup_hash(&oldnd.last, old_dir); -@@ -2165,8 +2231,7 @@ +@@ -2276,8 +2342,7 @@ if (new_dentry == trap) goto exit5; @@ -222,11 +218,11 @@ Index: 
linux-2.6.4-51.0/fs/namei.c exit5: dput(new_dentry); exit4: -Index: linux-2.6.4-51.0/fs/open.c +Index: linux-2.6.9-5.0.3.EL/fs/open.c =================================================================== ---- linux-2.6.4-51.0.orig/fs/open.c 2004-04-05 17:36:42.000000000 -0400 -+++ linux-2.6.4-51.0/fs/open.c 2004-04-06 01:37:39.000000000 -0400 -@@ -187,9 +187,10 @@ +--- linux-2.6.9-5.0.3.EL.orig/fs/open.c 2005-02-26 13:29:11.962780040 +0200 ++++ linux-2.6.9-5.0.3.EL/fs/open.c 2005-02-26 13:29:13.359567696 +0200 +@@ -191,9 +191,10 @@ return error; } @@ -238,7 +234,7 @@ Index: linux-2.6.4-51.0/fs/open.c struct iattr newattrs; /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ -@@ -200,7 +201,14 @@ +@@ -204,7 +205,14 @@ newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; down(&dentry->d_inode->i_sem); down_write(&dentry->d_inode->i_alloc_sem); @@ -254,7 +250,7 @@ Index: linux-2.6.4-51.0/fs/open.c up_write(&dentry->d_inode->i_alloc_sem); up(&dentry->d_inode->i_sem); return err; -@@ -256,7 +264,7 @@ +@@ -260,7 +268,7 @@ error = locks_verify_truncate(inode, NULL, length); if (!error) { DQUOT_INIT(inode); @@ -263,7 +259,7 @@ Index: linux-2.6.4-51.0/fs/open.c } put_write_access(inode); -@@ -308,7 +316,7 @@ +@@ -312,7 +320,7 @@ error = locks_verify_truncate(inode, file, length); if (!error) @@ -272,7 +268,7 @@ Index: linux-2.6.4-51.0/fs/open.c out_putf: fput(file); out: -@@ -387,9 +395,19 @@ +@@ -391,9 +399,19 @@ (error = permission(inode,MAY_WRITE,&nd)) != 0) goto dput_and_out; } @@ -295,7 +291,7 @@ Index: linux-2.6.4-51.0/fs/open.c dput_and_out: path_release(&nd); out: -@@ -440,9 +458,19 @@ +@@ -444,9 +462,19 @@ (error = permission(inode,MAY_WRITE,&nd)) != 0) goto dput_and_out; } @@ -318,9 +314,9 @@ Index: linux-2.6.4-51.0/fs/open.c dput_and_out: path_release(&nd); out: -@@ -592,36 +620,52 @@ - return error; - } +@@ -600,36 +628,52 @@ + + EXPORT_SYMBOL_GPL(sys_chroot); -asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) +int chmod_common(struct dentry 
*dentry, mode_t mode) @@ -387,7 +383,7 @@ Index: linux-2.6.4-51.0/fs/open.c fput(file); out: return err; -@@ -630,32 +674,13 @@ +@@ -638,32 +682,13 @@ asmlinkage long sys_chmod(const char __user * filename, mode_t mode) { struct nameidata nd; @@ -421,7 +417,7 @@ Index: linux-2.6.4-51.0/fs/open.c path_release(&nd); out: return error; -@@ -676,6 +701,18 @@ +@@ -684,6 +709,18 @@ if (IS_RDONLY(inode)) goto out; error = -EPERM; @@ -440,7 +436,7 @@ Index: linux-2.6.4-51.0/fs/open.c if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) goto out; newattrs.ia_valid = ATTR_CTIME; -@@ -689,6 +726,7 @@ +@@ -697,6 +734,7 @@ } if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID|ATTR_KILL_SGID; @@ -448,11 +444,11 @@ Index: linux-2.6.4-51.0/fs/open.c down(&inode->i_sem); error = notify_change(dentry, &newattrs); up(&inode->i_sem); -Index: linux-2.6.4-51.0/fs/exec.c +Index: linux-2.6.9-5.0.3.EL/fs/exec.c =================================================================== ---- linux-2.6.4-51.0.orig/fs/exec.c 2004-04-05 17:36:42.000000000 -0400 -+++ linux-2.6.4-51.0/fs/exec.c 2004-04-05 17:36:43.000000000 -0400 -@@ -1418,7 +1418,7 @@ +--- linux-2.6.9-5.0.3.EL.orig/fs/exec.c 2005-02-26 13:29:11.936783992 +0200 ++++ linux-2.6.9-5.0.3.EL/fs/exec.c 2005-02-26 13:29:13.362567240 +0200 +@@ -1451,7 +1451,7 @@ goto close_fail; if (!file->f_op->write) goto close_fail; @@ -461,11 +457,11 @@ Index: linux-2.6.4-51.0/fs/exec.c goto close_fail; retval = binfmt->core_dump(signr, regs, file); -Index: linux-2.6.4-51.0/include/linux/fs.h +Index: linux-2.6.9-5.0.3.EL/include/linux/fs.h =================================================================== ---- linux-2.6.4-51.0.orig/include/linux/fs.h 2004-04-05 17:36:43.000000000 -0400 -+++ linux-2.6.4-51.0/include/linux/fs.h 2004-04-05 17:36:43.000000000 -0400 -@@ -866,13 +866,20 @@ +--- linux-2.6.9-5.0.3.EL.orig/include/linux/fs.h 2005-02-26 13:29:11.987776240 +0200 ++++ linux-2.6.9-5.0.3.EL/include/linux/fs.h 2005-02-26 13:29:13.365566784 +0200 +@@ 
-926,13 +926,20 @@ int (*create) (struct inode *,struct dentry *,int, struct nameidata *); struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); int (*link) (struct dentry *,struct inode *,struct dentry *); @@ -485,8 +481,8 @@ Index: linux-2.6.4-51.0/include/linux/fs.h + int (*rename_raw) (struct nameidata *, struct nameidata *); int (*readlink) (struct dentry *, char __user *,int); int (*follow_link) (struct dentry *, struct nameidata *); - void (*truncate) (struct inode *); -@@ -1169,7 +1176,7 @@ + void (*put_link) (struct dentry *, struct nameidata *); +@@ -1234,7 +1241,7 @@ /* fs/open.c */ @@ -495,10 +491,10 @@ Index: linux-2.6.4-51.0/include/linux/fs.h extern struct file *filp_open(const char *, int, int); extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *); -Index: linux-2.6.4-51.0/net/unix/af_unix.c +Index: linux-2.6.9-5.0.3.EL/net/unix/af_unix.c =================================================================== ---- linux-2.6.4-51.0.orig/net/unix/af_unix.c 2004-04-05 12:42:07.000000000 -0400 -+++ linux-2.6.4-51.0/net/unix/af_unix.c 2004-04-05 17:36:43.000000000 -0400 +--- linux-2.6.9-5.0.3.EL.orig/net/unix/af_unix.c 2005-02-25 10:25:31.000000000 +0200 ++++ linux-2.6.9-5.0.3.EL/net/unix/af_unix.c 2005-02-26 13:29:13.387563440 +0200 @@ -676,6 +676,7 @@ int err = 0; diff --git a/lustre/kernel_patches/patches/vfs_races-2.6-vanilla.patch b/lustre/kernel_patches/patches/vfs_races-2.6-rhel4.patch similarity index 100% rename from lustre/kernel_patches/patches/vfs_races-2.6-vanilla.patch rename to lustre/kernel_patches/patches/vfs_races-2.6-rhel4.patch diff --git a/lustre/kernel_patches/series/2.6-rhel4.series b/lustre/kernel_patches/series/2.6-rhel4.series index ffa4a51..7c0ae18 100644 --- a/lustre/kernel_patches/series/2.6-rhel4.series +++ b/lustre/kernel_patches/series/2.6-rhel4.series @@ -1,12 +1,15 @@ 
lustre_version.patch vfs_intent-2.6-rhel4.patch -vfs_nointent-2.6-vanilla.patch -vfs_races-2.6-vanilla.patch +vfs_nointent-2.6-rhel4.patch +vfs_races-2.6-rhel4.patch ext3-wantedi-misc-2.6-suse.patch nfs-cifs-intent-2.6-rhel4.patch iopen-misc-2.6-suse.patch export-truncate-2.6-suse.patch -export_symbols-2.6-suse.patch +export_symbols-2.6-rhel4.patch dev_read_only-2.6-suse.patch export-2.6-suse.patch lookup_bdev_init_intent.patch +8kstack-2.6-rhel4.patch +remove-suid-2.6-suse.patch +export-show_task-2.6-vanilla.patch diff --git a/lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series b/lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series new file mode 100644 index 0000000..70e7b12 --- /dev/null +++ b/lustre/kernel_patches/series/ldiskfs-2.6-rhel4.series @@ -0,0 +1,11 @@ +ext3-wantedi-2.6-rhel4.patch +ext3-san-jdike-2.6-suse.patch +iopen-2.6-rhel4.patch +export_symbols-ext3-2.6-suse.patch +ext3-map_inode_page-2.6-suse.patch +ext3-ea-in-inode-2.6-rhel4.patch +export-ext3-2.6-rhel4.patch +ext3-include-fixes-2.6-rhel4.patch +ext3-extents-2.6.9-rhel4.patch +ext3-mballoc2-2.6.9-rhel4.patch +ext3-nlinks-2.6.7.patch -- 1.8.3.1