From b175e2441b0cd9fae60341ba92b0f7f192e71446 Mon Sep 17 00:00:00 2001 From: girish Date: Fri, 29 May 2009 12:24:06 +0000 Subject: [PATCH] b=16893 i=adilger i=johann ext4 ldiskfs patches for rhel5 --- .../patches/export-ext4-2.6-rhel5.patch | 35 ++ .../patches/ext4-alloc-policy-2.6-rhel5.patch | 101 ++++ .../patches/ext4-big-endian-check-2.6-rhel5.patch | 56 ++ .../patches/ext4-fiemap-2.6-rhel5.patch | 566 +++++++++++++++++++++ .../patches/ext4-filterdata-rhel5.patch | 25 + .../patches/ext4-ialloc-2.6-rhel5.patch | 129 +++++ .../patches/ext4-include-fixes-2.6-rhel5.patch | 20 + .../patches/ext4-inode-version-rhel5.patch | 105 ++++ .../patches/ext4-lookup-dotdot-rhel5.patch | 63 +++ .../patches/ext4-map_inode_page-2.6.18-rhel5.patch | 86 ++++ .../patches/ext4-max-dir-size-rhel5.patch | 203 ++++++++ .../patches/ext4-mballoc-extra-checks-rhel5.patch | 330 ++++++++++++ .../ext4-mballoc-handle-dev-paths-rhel5.patch | 59 +++ .../kernel_patches/patches/ext4-misc-rhel5.patch | 271 ++++++++++ .../kernel_patches/patches/ext4-mmp-rhel5.patch | 479 +++++++++++++++++ .../patches/ext4-prealloc-rhel5.patch | 405 +++++++++++++++ .../ext4-print-inum-in-htree-warning-rhel5.patch | 15 + .../ext4-remove-cond_resched-calls-rhel5.patch | 29 ++ .../patches/ext4-unlink-race-rhel5.patch | 15 + .../patches/ext4-wantedi-2.6-rhel5.patch | 169 ++++++ .../patches/ext4-xattr-no-update-ctime-rhel5.patch | 32 ++ .../patches/iopen-2.6.18-rhel5-ext4.patch | 512 +++++++++++++++++++ .../series/ldiskfs-2.6-rhel5-ext4.series | 23 + 23 files changed, 3728 insertions(+) create mode 100644 ldiskfs/kernel_patches/patches/export-ext4-2.6-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-alloc-policy-2.6-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-filterdata-rhel5.patch create mode 100644 
ldiskfs/kernel_patches/patches/ext4-ialloc-2.6-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-include-fixes-2.6-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-inode-version-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-lookup-dotdot-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-map_inode_page-2.6.18-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-max-dir-size-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-mballoc-handle-dev-paths-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-mmp-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-prealloc-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-remove-cond_resched-calls-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-unlink-race-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/ext4-xattr-no-update-ctime-rhel5.patch create mode 100644 ldiskfs/kernel_patches/patches/iopen-2.6.18-rhel5-ext4.patch create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series diff --git a/ldiskfs/kernel_patches/patches/export-ext4-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/export-ext4-2.6-rhel5.patch new file mode 100644 index 0000000..3930843 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/export-ext4-2.6-rhel5.patch @@ -0,0 +1,35 @@ +Index: linux-2.6.18.i386/fs/ext4/super.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/super.c ++++ linux-2.6.18.i386/fs/ext4/super.c +@@ -185,6 +185,8 @@ void ext4_journal_abort_handle(const cha + 
jbd2_journal_abort_handle(handle);
+ }
+ 
++EXPORT_SYMBOL(ext4_journal_abort_handle);
++
+ /* Deal with the reporting of failure conditions on a filesystem such as
+  * inconsistencies detected or read IO failures.
+  *
+@@ -2459,6 +2461,8 @@ out_fail:
+ 	return ret;
+ }
+ 
++EXPORT_SYMBOL(ext4_force_commit);
++
+ /*
+  * Setup any per-fs journal parameters now. We'll do this both on
+  * initial mount, once the journal has been initialised but before we've
+@@ -3502,6 +3506,12 @@ int ext4_map_inode_page(struct inode *in
+ 			unsigned long *blocks, int *created, int create);
+ EXPORT_SYMBOL(ext4_map_inode_page);
+ 
++EXPORT_SYMBOL(ext4_xattr_get);
++EXPORT_SYMBOL(ext4_xattr_set_handle);
++EXPORT_SYMBOL(ext4_bread);
++EXPORT_SYMBOL(ext4_journal_start_sb);
++EXPORT_SYMBOL(__ext4_journal_stop);
++
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
+ MODULE_LICENSE("GPL");
diff --git a/ldiskfs/kernel_patches/patches/ext4-alloc-policy-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-alloc-policy-2.6-rhel5.patch
new file mode 100644
index 0000000..a1b8375
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/ext4-alloc-policy-2.6-rhel5.patch
@@ -0,0 +1,101 @@
+Index: linux-2.6.18-128.1.6/fs/ext4/ialloc.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/ialloc.c
++++ linux-2.6.18-128.1.6/fs/ext4/ialloc.c
+@@ -946,6 +946,36 @@ fail_drop:
+ 	return ERR_PTR(err);
+ }
+ 
++/* Pick a free inode number as an allocation goal, searching groups from the
++ * last one backwards; returns 0 when no group has a usable free inode. */
++unsigned long ext4_find_reverse(struct super_block *sb)
++{
++	unsigned long ino, offset, ipg = EXT4_INODES_PER_GROUP(sb);
++	struct ext4_group_desc *desc;
++	struct buffer_head *bitmap_bh;
++	int group;
++
++	for (offset = ipg >> 1; ; offset >>= 1) {
++		for (group = EXT4_SB(sb)->s_groups_count - 1; group >= 0;
++								--group) {
++			desc = ext4_get_group_desc(sb, group, NULL);
++			if (!desc || desc->bg_free_inodes_count == 0)
++				continue;
++			bitmap_bh = ext4_read_inode_bitmap(sb, group);
++			if (!bitmap_bh)
++				continue;
++			ino = ext4_find_next_zero_bit((unsigned long *)
++					bitmap_bh->b_data, ipg, offset);
++			brelse(bitmap_bh);	/* done with the bitmap buffer */
++			if (ino < ipg)
++				return group * ipg + ino + 1;
++		}
++		if (offset == 0)	/* unsigned: "offset >= 0" never ends */
++			break;
++	}
++	return 0;
++}
++
+ /* Verify that we are loading a valid orphan from disk */
+ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
+ {
+Index: linux-2.6.18-128.1.6/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/namei.c
++++ linux-2.6.18-128.1.6/fs/ext4/namei.c
+@@ -151,14 +151,24 @@ struct dx_map_entry
+ 	u16 size;
+ };
+ 
++/*
++ * dentry_param used by ext4_new_inode_wantedi()
++ */
+ #define LVFS_DENTRY_PARAM_MAGIC		20070216UL
+ struct lvfs_dentry_params
+ {
+-	unsigned long   p_inum;
+-	void            *p_ptr;
+-	u32		magic;
++	unsigned long   ldp_inum;
++	long		ldp_flags;
++	u32		ldp_magic;
+ };
+ 
++/* Only use the least 3 bits of ldp_flags for goal policy */
++typedef enum {
++	DP_GOAL_POLICY       = 0,
++	DP_LASTGROUP_REVERSE = 1,
++} dp_policy_t;
++
++
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
+ static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
+ static inline unsigned dx_get_hash (struct dx_entry *entry);
+@@ -1762,8 +1772,13 @@ static struct inode * ext4_new_inode_wan
+ 	if (dentry->d_fsdata != NULL) {
+ 		struct lvfs_dentry_params *param = dentry->d_fsdata;
+ 
+-		if (param->magic == LVFS_DENTRY_PARAM_MAGIC)
+-			inum = param->p_inum;
++		if (param->ldp_magic == LVFS_DENTRY_PARAM_MAGIC) {
++			if ((dp_policy_t)(param->ldp_flags & 0x7) ==
++						DP_LASTGROUP_REVERSE)
++				inum = ext4_find_reverse(dir->i_sb);
++			else /* DP_GOAL_POLICY */
++				inum = param->ldp_inum;
++		}
+ 	}
+ 	return ext4_new_inode(handle, dir, mode, inum);
+ }
+Index: linux-2.6.18-128.1.6/fs/ext4/ext4.h
+===================================================================
+--- 
linux-2.6.18-128.1.6.orig/fs/ext4/ext4.h ++++ linux-2.6.18-128.1.6/fs/ext4/ext4.h +@@ -1071,6 +1071,7 @@ extern int ext4fs_dirhash(const char *na + /* ialloc.c */ + extern struct inode * ext4_new_inode (handle_t *, struct inode *, int, + unsigned long); ++extern unsigned long ext4_find_reverse(struct super_block *); + extern void ext4_free_inode (handle_t *, struct inode *); + extern struct inode * ext4_orphan_get (struct super_block *, unsigned long); + extern unsigned long ext4_count_free_inodes (struct super_block *); diff --git a/ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-rhel5.patch new file mode 100644 index 0000000..0ec5670 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-rhel5.patch @@ -0,0 +1,56 @@ +Index: linux-2.6.18-128.1.6/fs/ext4/super.c +=================================================================== +--- linux-2.6.18-128.1.6.orig/fs/ext4/super.c ++++ linux-2.6.18-128.1.6/fs/ext4/super.c +@@ -70,6 +70,8 @@ struct page *ext4_zero_page; + + struct proc_dir_entry *proc_root_ext4; + ++static int bigendian_extents; ++ + ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) + { +@@ -1222,7 +1224,7 @@ enum { + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, + Opt_mballoc, Opt_nomballoc, Opt_stripe, +- Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_bigendian_extents, + }; + + static match_table_t tokens = { +@@ -1284,6 +1286,7 @@ static match_table_t tokens = { + {Opt_nomballoc, "nomballoc"}, + {Opt_stripe, "stripe=%u"}, + {Opt_resize, "resize"}, ++ {Opt_bigendian_extents, "bigendian_extents"}, + {Opt_err, NULL}, + }; + +@@ -1682,6 +1685,9 @@ clear_qf_name: + return 0; + sbi->s_stripe = option; + break; ++ case Opt_bigendian_extents: ++ bigendian_extents = 1; ++ break; + default: + printk(KERN_ERR + 
"EXT4-fs: Unrecognized mount option \"%s\" " +@@ -2561,6 +2567,15 @@ static int ext4_fill_super(struct super_ + goto failed_mount; + } + ++#ifdef __BIG_ENDIAN ++ if (bigendian_extents == 0) { ++ printk(KERN_ERR "EXT4-fs: extents feature is not guaranteed to " ++ "work on big-endian systems. Use \"bigendian_extents\" " ++ "mount option to override.\n"); ++ goto failed_mount; ++ } ++#endif ++ + bgl_lock_init(&sbi->s_blockgroup_lock); + + sbi->s_last_alloc_group = -1; diff --git a/ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-rhel5.patch new file mode 100644 index 0000000..b2f80d5 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-rhel5.patch @@ -0,0 +1,566 @@ +A large part of this code is from the generic VFS code in fs/ioctl.c in the +upstream kernel. + +Index: linux-2.6.18.i386/fs/ext4/ioctl.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ioctl.c ++++ linux-2.6.18.i386/fs/ext4/ioctl.c +@@ -17,6 +17,162 @@ + #include "ext4_jbd2.h" + #include "ext4.h" + ++#include "fiemap.h" ++ ++/* So that the fiemap access checks can't overflow on 32 bit machines. */ ++#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) ++ ++/** ++ * fiemap_fill_next_extent - Fiemap helper function ++ * @fieinfo: Fiemap context passed into ->fiemap ++ * @logical: Extent logical start offset, in bytes ++ * @phys: Extent physical start offset, in bytes ++ * @len: Extent length, in bytes ++ * @flags: FIEMAP_EXTENT flags that describe this extent ++ * @lun: LUN on which this extent resides ++ * ++ * Called from file system ->fiemap callback. Will populate extent ++ * info as passed in via arguments and copy to user memory. On ++ * success, extent count on fieinfo is incremented. ++ * ++ * Returns 0 on success, -errno on error, 1 if this was the last ++ * extent that will fit in user array. 
++ */ ++#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) ++#define SET_NO_DIRECT_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED \ ++ |FIEMAP_EXTENT_NET) ++#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) ++#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) ++int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, ++ u64 phys, u64 len, u32 flags, dev_t dev) ++{ ++ struct fiemap_extent extent = { 0 }; ++ struct fiemap_extent *dest = fieinfo->fi_extents_start; ++ ++ /* only count the extents */ ++ if (fieinfo->fi_extents_max == 0) { ++ fieinfo->fi_extents_mapped++; ++ return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; ++ } ++ ++ if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) ++ return 1; ++ ++ if (flags & SET_UNKNOWN_FLAGS) ++ flags |= FIEMAP_EXTENT_UNKNOWN; ++ if (flags & SET_NO_DIRECT_FLAGS) ++ flags |= FIEMAP_EXTENT_NO_DIRECT; ++ if (flags & SET_NOT_ALIGNED_FLAGS) ++ flags |= FIEMAP_EXTENT_NOT_ALIGNED; ++ if (flags & SET_NO_UNMOUNTED_IO_FLAGS) ++ flags |= FIEMAP_EXTENT_ENCODED; ++ ++ extent.fe_logical = logical; ++ extent.fe_physical = phys; ++ extent.fe_length = len; ++ extent.fe_flags = flags; ++ extent.fe_device = new_encode_dev(dev); ++ ++ dest += fieinfo->fi_extents_mapped; ++ if (copy_to_user(dest, &extent, sizeof(extent))) ++ return -EFAULT; ++ ++ fieinfo->fi_extents_mapped++; ++ if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) ++ return 1; ++ ++ return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; ++} ++ ++static int fiemap_check_ranges(struct super_block *sb, ++ u64 start, u64 len, u64 *new_len) ++{ ++ *new_len = len; ++ ++ if (len == 0) ++ return -EINVAL; ++ ++ if (start > sb->s_maxbytes) ++ return -EFBIG; ++ ++ /* ++ * Shrink request scope to what the fs can actually handle. 
++ */ ++ if ((len > sb->s_maxbytes) || ++ (sb->s_maxbytes - len) < start) ++ *new_len = sb->s_maxbytes - start; ++ ++ return 0; ++} ++ ++/* ++ * fiemap_check_flags - check validity of requested flags for fiemap ++ * @fieinfo: Fiemap context passed into ->fiemap ++ * @fs_flags: Set of fiemap flags that the file system understands ++ * ++ * Called from file system ->fiemap callback. This will compute the ++ * intersection of valid fiemap flags and those that the fs supports. That ++ * value is then compared against the user supplied flags. In case of bad user ++ * flags, the invalid values will be written into the fieinfo structure, and ++ * -EBADR is returned, which tells ioctl_fiemap() to return those values to ++ * userspace. For this reason, a return code of -EBADR should be preserved. ++ * ++ * Returns 0 on success, -EBADR on bad flags. ++ */ ++int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags) ++{ ++ u32 incompat_flags; ++ ++ incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags); ++ if (incompat_flags) { ++ fieinfo->fi_flags = incompat_flags; ++ return -EBADR; ++ } ++ ++ return 0; ++} ++ ++int ioctl_fiemap(struct inode *inode, struct file *filp, unsigned long arg) ++{ ++ struct fiemap fiemap; ++ u64 len; ++ struct fiemap_extent_info fieinfo = {0, }; ++ struct super_block *sb = inode->i_sb; ++ int error = 0; ++ ++ if (copy_from_user(&fiemap, (struct fiemap __user *) arg, ++ sizeof(struct fiemap))) ++ return -EFAULT; ++ ++ if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) ++ return -EINVAL; ++ ++ error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, ++ &len); ++ if (error) ++ return error; ++ ++ fieinfo.fi_flags = fiemap.fm_flags; ++ fieinfo.fi_extents_max = fiemap.fm_extent_count; ++ fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); ++ ++ if (fiemap.fm_extent_count != 0 && ++ !access_ok(VERIFY_WRITE, (void *)arg, ++ offsetof(typeof(fiemap), fm_extents[fiemap.fm_extent_count]))) ++ return 
-EFAULT; ++ ++ if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) ++ filemap_write_and_wait(inode->i_mapping); ++ ++ error = ext4_fiemap(inode, &fieinfo, fiemap.fm_start, len); ++ fiemap.fm_flags = fieinfo.fi_flags; ++ fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; ++ if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) ++ error = -EFAULT; ++ ++ return error; ++} ++ + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_dentry->d_inode; +@@ -257,6 +413,10 @@ flags_err: + case EXT4_IOC_MIGRATE: + return ext4_ext_migrate(inode, filp, cmd, arg); + ++ case EXT4_IOC_FIEMAP: { ++ return ioctl_fiemap(inode, filp, arg); ++ } ++ + default: + return -ENOTTY; + } +Index: linux-2.6.18.i386/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ext4.h ++++ linux-2.6.18.i386/fs/ext4/ext4.h +@@ -300,6 +300,7 @@ struct ext4_new_group_data { + #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) + #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) + #define EXT4_IOC_MIGRATE _IO('f', 7) ++#define EXT4_IOC_FIEMAP _IOWR('f', 11, struct fiemap) + + /* + * ioctl commands in 32 bit emulation +@@ -317,6 +318,8 @@ struct ext4_new_group_data { + #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION + #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION + ++/* FIEMAP flags supported by ext4 */ ++#define EXT4_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC) + + /* + * Mount options +@@ -1115,6 +1118,9 @@ extern int ext4_page_mkwrite(struct vm_a + /* ioctl.c */ + extern long ext4_ioctl(struct file *, unsigned int, unsigned long); + extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long); ++struct fiemap_extent_info; ++extern int ext4_fiemap(struct inode *, struct fiemap_extent_info *, __u64, ++ __u64); + + /* migrate.c */ + extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int, +Index: linux-2.6.18.i386/fs/ext4/ext4_extents.h 
+=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ext4_extents.h ++++ linux-2.6.18.i386/fs/ext4/ext4_extents.h +@@ -128,6 +128,22 @@ struct ext4_ext_path { + #define EXT_MAX_BLOCK 0xffffffff + + /* ++ * to be called by ext4_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext4_ext_walk_space(), see below ++ * callback must return valid extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, ++ struct ext4_ext_cache *, ++ struct ext4_extent *, void *); ++ ++#define HAVE_EXT_PREPARE_CB_EXTENT ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++/* + * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an + * initialized extent. This is 2^15 and not (2^16 - 1), since we use the + * MSB of ee_len field in the extent datastructure to signify if this +@@ -223,6 +239,8 @@ extern int ext4_ext_try_to_merge(struct + struct ext4_extent *); + extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); + extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); ++extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, ++ ext_prepare_callback, void *); + extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *); + extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, +Index: linux-2.6.18.i386/fs/ext4/extents.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/extents.c ++++ linux-2.6.18.i386/fs/ext4/extents.c +@@ -44,7 +44,7 @@ + #include + #include "ext4_jbd2.h" + #include "ext4_extents.h" +- ++#include "fiemap.h" + + /* + * ext_pblock: +@@ -1597,6 +1597,113 @@ cleanup: + return err; + } + ++int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, ++ 
ext4_lblk_t num, ext_prepare_callback func, ++ void *cbdata) ++{ ++ struct ext4_ext_path *path = NULL; ++ struct ext4_ext_cache cbex; ++ struct ext4_extent *ex; ++ ext4_lblk_t next, start = 0, end = 0; ++ ext4_lblk_t last = block + num; ++ int depth, exists, err = 0; ++ ++ BUG_ON(func == NULL); ++ BUG_ON(inode == NULL); ++ ++ while (block < last && block != EXT_MAX_BLOCK) { ++ num = last - block; ++ /* find extent for this block */ ++ path = ext4_ext_find_extent(inode, block, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ break; ++ } ++ ++ depth = ext_depth(inode); ++ BUG_ON(path[depth].p_hdr == NULL); ++ ex = path[depth].p_ext; ++ next = ext4_ext_next_allocated_block(path); ++ ++ exists = 0; ++ if (!ex) { ++ /* there is no extent yet, so try to allocate ++ * all requested space */ ++ start = block; ++ end = block + num; ++ } else if (le32_to_cpu(ex->ee_block) > block) { ++ /* need to allocate space before found extent */ ++ start = block; ++ end = le32_to_cpu(ex->ee_block); ++ if (block + num < end) ++ end = block + num; ++ } else if (block >= le32_to_cpu(ex->ee_block) ++ + ext4_ext_get_actual_len(ex)) { ++ /* need to allocate space after found extent */ ++ start = block; ++ end = block + num; ++ if (end >= next) ++ end = next; ++ } else if (block >= le32_to_cpu(ex->ee_block)) { ++ /* ++ * some part of requested space is covered ++ * by found extent ++ */ ++ start = block; ++ end = le32_to_cpu(ex->ee_block) ++ + ext4_ext_get_actual_len(ex); ++ if (block + num < end) ++ end = block + num; ++ exists = 1; ++ } else { ++ BUG(); ++ } ++ BUG_ON(end <= start); ++ ++ if (!exists) { ++ cbex.ec_block = start; ++ cbex.ec_len = end - start; ++ cbex.ec_start = 0; ++ cbex.ec_type = EXT4_EXT_CACHE_GAP; ++ } else { ++ cbex.ec_block = le32_to_cpu(ex->ee_block); ++ cbex.ec_len = ext4_ext_get_actual_len(ex); ++ cbex.ec_start = ext_pblock(ex); ++ cbex.ec_type = EXT4_EXT_CACHE_EXTENT; ++ } ++ ++ BUG_ON(cbex.ec_len == 0); ++ err = func(inode, path, &cbex, ex, 
cbdata);
++		ext4_ext_drop_refs(path);
++
++		if (err < 0)
++			break;
++
++		if (err == EXT_REPEAT)
++			continue;
++		else if (err == EXT_BREAK) {
++			err = 0;
++			break;
++		}
++
++		if (ext_depth(inode) != depth) {
++			/* depth was changed. we have to realloc path */
++			kfree(path);
++			path = NULL;
++		}
++
++		block = cbex.ec_block + cbex.ec_len;
++	}
++
++	if (path) {
++		ext4_ext_drop_refs(path);
++		kfree(path);
++	}
++
++	return err;
++}
++
+ static void
+ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
+ 			__u32 len, ext4_fsblk_t start, int type)
+@@ -2953,3 +3060,100 @@ retry:
+ 	return ret > 0 ? ret2 : ret;
+ }
+ #endif
++
++/*
++ * ext4_ext_walk_space() callback: report one extent or delalloc gap to FIEMAP.
++ */
++int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
++		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
++		       void *data)
++{
++	struct fiemap_extent_info *fieinfo = data;
++	unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
++	__u64 logical;
++	__u64 physical;
++	__u64 length;
++	__u32 flags = 0;
++	int error;
++
++	logical = (__u64)newex->ec_block << blksize_bits;
++
++	if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
++		pgoff_t offset;
++		struct page *page;
++		struct buffer_head *bh = NULL;
++		int delalloc;
++
++		offset = logical >> PAGE_SHIFT;
++		page = find_get_page(inode->i_mapping, offset);
++		if (!page)
++			return EXT_CONTINUE;
++
++		/*
++		 * find_get_page() took a page reference; sample the delalloc
++		 * state, then drop the reference on every path out of here.
++		 */
++		if (page_has_buffers(page))
++			bh = page_buffers(page);
++		delalloc = bh && buffer_delay(bh);
++		page_cache_release(page);
++		if (!delalloc)
++			return EXT_CONTINUE;
++		flags |= FIEMAP_EXTENT_DELALLOC;
++	}
++
++	physical = (__u64)newex->ec_start << blksize_bits;
++	length = (__u64)newex->ec_len << blksize_bits;
++
++	if (ex && ext4_ext_is_uninitialized(ex))
++		flags |= FIEMAP_EXTENT_UNWRITTEN;
++
++	/*
++	 * If this extent reaches EXT_MAX_BLOCK, it must be last.
++	 * (Compared in blocks: EXT_MAX_BLOCK is a block number, not bytes.)
++	 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
++	 * this indicates no more allocated blocks.
++	 *
++	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
++	 */
++	if (newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK ||
++	    ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
++		flags |= FIEMAP_EXTENT_LAST;
++
++	error = fiemap_fill_next_extent(fieinfo, logical, physical,
++					length, flags, inode->i_sb->s_dev);
++	if (error < 0)
++		return error;
++	if (error == 1)
++		return EXT_BREAK;
++
++	return EXT_CONTINUE;
++}
++
++int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
++		__u64 start, __u64 len)
++{
++	ext4_fsblk_t start_blk;
++	ext4_fsblk_t len_blks;
++	int error = 0;
++
++	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++		return -EOPNOTSUPP;
++
++	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS_COMPAT))
++		return -EBADR;
++
++	start_blk = start >> inode->i_sb->s_blocksize_bits;
++	len_blks = (len + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits;
++
++	/*
++	 * Walk the extent tree gathering extent information.
++	 * ext4_ext_fiemap_cb will push extents back to user.
++	 */
++	down_write(&EXT4_I(inode)->i_data_sem);
++	error = ext4_ext_walk_space(inode, start_blk, len_blks,
++				  ext4_ext_fiemap_cb, fieinfo);
++	up_write(&EXT4_I(inode)->i_data_sem);
++
++	return error;
++}
+Index: linux-2.6.18.i386/fs/ext4/fiemap.h
+===================================================================
+--- /dev/null
++++ linux-2.6.18.i386/fs/ext4/fiemap.h
+@@ -0,0 +1,85 @@
++/*
++ * FIEMAP ioctl infrastructure.
++ * ++ * Copyright 2008 Sun Microsystems, Inc ++ * ++ * Author: Kalpak Shah ++ * Andreas Dilger ++ */ ++ ++#ifndef _LINUX_EXT4_FIEMAP_H ++#define _LINUX_EXT4_FIEMAP_H ++ ++struct fiemap_extent { ++ __u64 fe_logical; /* logical offset in bytes for the start of ++ * the extent from the beginning of the file */ ++ __u64 fe_physical; /* physical offset in bytes for the start ++ * of the extent from the beginning of the disk */ ++ __u64 fe_length; /* length in bytes for this extent */ ++ __u64 fe_reserved64[2]; ++ __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ ++ __u32 fe_device; /* device number for this extent */ ++ __u32 fe_reserved[2]; ++}; ++ ++struct fiemap { ++ __u64 fm_start; /* logical offset (inclusive) at ++ * which to start mapping (in) */ ++ __u64 fm_length; /* logical length of mapping which ++ * userspace wants (in) */ ++ __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ ++ __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ ++ __u32 fm_extent_count; /* size of fm_extents array (in) */ ++ __u32 fm_reserved; ++ struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ ++}; ++ ++/* ++ * FIEMAP helper definition. 
++ */ ++struct fiemap_extent_info { ++ unsigned int fi_flags; /* Flags as passed from user */ ++ unsigned int fi_extents_mapped; /* Number of mapped extents */ ++ unsigned int fi_extents_max; /* Size of fiemap_extent array*/ ++ struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */ ++}; ++ ++int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); ++int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, ++ u64 phys, u64 len, u32 flags, u32 lun); ++ ++#define FIEMAP_MAX_OFFSET (~0ULL) ++ ++#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ ++#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ ++ ++/* ldiskfs only supports FLAG_SYNC flag currently */ ++#define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) ++ ++#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */ ++#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ ++#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. ++ * Sets EXTENT_UNKNOWN. */ ++#define FIEMAP_EXTENT_ENCODED 0x00000008 /* Data can not be read ++ * while fs is unmounted */ ++#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs. ++ * Sets EXTENT_NO_DIRECT. */ ++#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be ++ * block aligned. */ ++#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata. ++ * Sets EXTENT_NOT_ALIGNED.*/ ++#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block. ++ * Sets EXTENT_NOT_ALIGNED.*/ ++#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but ++ * no data (i.e. zero). */ ++#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively ++ * support extents. Result ++ * merged for efficiency. 
*/ ++ ++/* Lustre specific flags - use a high bit, don't conflict with upstream flag */ ++#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */ ++#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. ++ * Sets NO_DIRECT flag */ ++ ++#endif /* _LINUX_EXT4_FIEMAP_H */ ++ diff --git a/ldiskfs/kernel_patches/patches/ext4-filterdata-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-filterdata-rhel5.patch new file mode 100644 index 0000000..25ea28a --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-filterdata-rhel5.patch @@ -0,0 +1,25 @@ +Index: linux-2.6.18.i386/fs/ext4/ext4_i.h +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ext4_i.h ++++ linux-2.6.18.i386/fs/ext4/ext4_i.h +@@ -162,6 +162,8 @@ struct ext4_inode_info { + /* mballoc */ + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; ++ ++ void *i_filterdata; + }; + + #endif /* _EXT4_I */ +Index: linux-2.6.18.i386/fs/ext4/super.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/super.c ++++ linux-2.6.18.i386/fs/ext4/super.c +@@ -574,6 +574,7 @@ static struct inode *ext4_alloc_inode(st + memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); + INIT_LIST_HEAD(&ei->i_prealloc_list); + spin_lock_init(&ei->i_prealloc_lock); ++ ei->i_filterdata = NULL; + return &ei->vfs_inode; + } + diff --git a/ldiskfs/kernel_patches/patches/ext4-ialloc-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-ialloc-2.6-rhel5.patch new file mode 100644 index 0000000..7361a24 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-ialloc-2.6-rhel5.patch @@ -0,0 +1,129 @@ +Index: linux-2.6.18.i386/fs/ext4/ialloc.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ialloc.c ++++ linux-2.6.18.i386/fs/ext4/ialloc.c +@@ -509,12 +509,16 @@ fallback: + } + + static int find_group_other(struct super_block *sb, struct inode *parent, +- 
ext4_group_t *group) ++ ext4_group_t *group, int mode) + { ++ struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; +- ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; ++ ext4_group_t ngroups = sbi->s_groups_count; + struct ext4_group_desc *desc; + ext4_group_t i; ++ int best_group = -1; ++ ext4_fsblk_t avefreeb, freeb; ++ int best_group_freeb = 0; + + /* + * Try to place the inode in its parent directory +@@ -522,8 +526,10 @@ static int find_group_other(struct super + *group = parent_group; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && +- le16_to_cpu(desc->bg_free_blocks_count)) ++ (!S_ISREG(mode) || le16_to_cpu(desc->bg_free_blocks_count))) + return 0; ++ avefreeb = ext4_free_blocks_count(sbi->s_es); ++ do_div(avefreeb, ngroups); + + /* + * We're going to place this inode in a different blockgroup from its +@@ -537,33 +543,49 @@ static int find_group_other(struct super + *group = (*group + parent->i_ino) % ngroups; + + /* +- * Use a quadratic hash to find a group with a free inode and some free +- * blocks. ++ * Use a quadratic hash to find a group with a free inode and ++ * average number of free blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + *group += i; + if (*group >= ngroups) + *group -= ngroups; + desc = ext4_get_group_desc(sb, *group, NULL); +- if (desc && le16_to_cpu(desc->bg_free_inodes_count) && +- le16_to_cpu(desc->bg_free_blocks_count)) ++ if (!desc || !desc->bg_free_inodes_count) ++ continue; ++ if (!S_ISREG(mode)) ++ return 0; ++ if (le16_to_cpu(desc->bg_free_blocks_count) >= avefreeb) + return 0; + } + + /* +- * That failed: try linear search for a free inode, even if that group +- * has no free blocks. ++ * That failed: start from last group used to allocate inode ++ * try linear search for a free inode and prefereably ++ * free blocks. 
+ */ +- *group = parent_group; ++ *group = sbi->s_last_alloc_group; ++ if (*group == -1) ++ *group = parent_group; ++ + for (i = 0; i < ngroups; i++) { + if (++*group >= ngroups) + *group = 0; + desc = ext4_get_group_desc(sb, *group, NULL); +- if (desc && le16_to_cpu(desc->bg_free_inodes_count)) +- return 0; ++ if (!desc || !desc->bg_free_inodes_count) ++ continue; ++ freeb = le16_to_cpu(desc->bg_free_blocks_count); ++ if (best_group == -1 || freeb > best_group_freeb) { /* first group with a free inode is a candidate even with 0 free blocks */ ++ best_group_freeb = freeb; ++ best_group = *group; ++ if (freeb >= avefreeb || !S_ISREG(mode)) ++ break; ++ } + } + +- return -1; ++ sbi->s_last_alloc_group = best_group; ++ *group = best_group; ++ return best_group == -1 ? -1 : 0; /* -1: no group has a free inode, caller reports ENOSPC */ + } + + /* +@@ -656,7 +678,7 @@ continue_allocation: + else + ret2 = find_group_orlov(sb, dir, &group); + } else +- ret2 = find_group_other(sb, dir, &group); ++ ret2 = find_group_other(sb, dir, &group, mode); + + got_group: + err = -ENOSPC; +Index: linux-2.6.18.i386/fs/ext4/super.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/super.c ++++ linux-2.6.18.i386/fs/ext4/super.c +@@ -2190,6 +2190,7 @@ static int ext4_fill_super(struct super_ + + bgl_lock_init(&sbi->s_blockgroup_lock); + ++ sbi->s_last_alloc_group = -1; + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logical_sb_block, i); + sbi->s_group_desc[i] = sb_bread(sb, block); +Index: linux-2.6.18.i386/fs/ext4/ext4_sb.h +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ext4_sb.h ++++ linux-2.6.18.i386/fs/ext4/ext4_sb.h +@@ -60,6 +60,8 @@ struct ext4_sb_info { + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct blockgroup_lock s_blockgroup_lock; ++ /* Last group used to allocate inode */ ++ int s_last_alloc_group; + + /* root of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; diff --git a/ldiskfs/kernel_patches/patches/ext4-include-fixes-2.6-rhel5.patch
b/ldiskfs/kernel_patches/patches/ext4-include-fixes-2.6-rhel5.patch new file mode 100644 index 0000000..0009eaa --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-include-fixes-2.6-rhel5.patch @@ -0,0 +1,20 @@ +Index: linux-2.6.18.i386/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ext4.h ++++ linux-2.6.18.i386/fs/ext4/ext4.h +@@ -541,12 +541,13 @@ do { \ + #define EXT4_MOUNT_IOPEN 0x8000000 /* Allow access via iopen */ + #define EXT4_MOUNT_IOPEN_NOPRIV 0x10000000 /* Make iopen world-readable */ + /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ +-#ifndef _LINUX_EXT2_FS_H ++#ifndef clear_opt + #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt + #define set_opt(o, opt) o |= EXT4_MOUNT_##opt + #define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) +-#else ++#endif ++#ifndef EXT2_MOUNT_NOLOAD + #define EXT2_MOUNT_NOLOAD EXT4_MOUNT_NOLOAD + #define EXT2_MOUNT_ABORT EXT4_MOUNT_ABORT + #define EXT2_MOUNT_DATA_FLAGS EXT4_MOUNT_DATA_FLAGS diff --git a/ldiskfs/kernel_patches/patches/ext4-inode-version-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-inode-version-rhel5.patch new file mode 100644 index 0000000..d8a31ad --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-inode-version-rhel5.patch @@ -0,0 +1,105 @@ +Index: linux-2.6.18-128.1.6/fs/ext4/inode.c +=================================================================== +--- linux-2.6.18-128.1.6.orig/fs/ext4/inode.c ++++ linux-2.6.18-128.1.6/fs/ext4/inode.c +@@ -2850,11 +2850,11 @@ struct inode *ext4_iget(struct super_blo + EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); + EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); + +- inode->i_version = le32_to_cpu(raw_inode->i_disk_version); ++ ei->i_fs_version = le32_to_cpu(raw_inode->i_disk_version); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) +- inode->i_version |= +- 
(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; ++ ei->i_fs_version |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) ++ << 32; + } + + if (S_ISREG(inode->i_mode)) { +@@ -3043,16 +3043,11 @@ static int ext4_do_update_inode(handle_t + } else for (block = 0; block < EXT4_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + +- raw_inode->i_disk_version = cpu_to_le32(inode->i_version); ++ raw_inode->i_disk_version = cpu_to_le32(ei->i_fs_version); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) +- /* in RHEL5 i_version is an unsigned long */ +-#if BITS_PER_LONG == 64 +- raw_inode->i_version_hi = +- cpu_to_le32(inode->i_version >> 32); +-#else +- raw_inode->i_version_hi = 0; +-#endif ++ raw_inode->i_version_hi = cpu_to_le32(ei->i_fs_version ++ >> 32); + raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + } + +Index: linux-2.6.18-128.1.6/fs/ext4/ext4_i.h +=================================================================== +--- linux-2.6.18-128.1.6.orig/fs/ext4/ext4_i.h ++++ linux-2.6.18-128.1.6/fs/ext4/ext4_i.h +@@ -21,6 +21,8 @@ + #include + #include + ++#define HAVE_DISK_INODE_VERSION ++ + /* data type for block offset of block group */ + typedef int ext4_grpblk_t; + +@@ -164,6 +166,8 @@ struct ext4_inode_info { + spinlock_t i_prealloc_lock; + + void *i_filterdata; ++ ++ __u64 i_fs_version; + }; + + #endif /* _EXT4_I */ +Index: linux-2.6.18-128.1.6/fs/ext4/xattr.c +=================================================================== +--- linux-2.6.18-128.1.6.orig/fs/ext4/xattr.c ++++ linux-2.6.18-128.1.6/fs/ext4/xattr.c +@@ -959,13 +959,18 @@ ext4_xattr_set_handle(handle_t *handle, + struct ext4_xattr_block_find bs = { + .s = { .not_found = -ENODATA, }, + }; ++ unsigned long no_expand; + int error; + + if (!name) + return -EINVAL; + if (strlen(name) > 255) + return -ERANGE; ++ + down_write(&EXT4_I(inode)->xattr_sem); ++ no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND; ++ EXT4_I(inode)->i_state |= 
EXT4_STATE_NO_EXPAND; ++ + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + goto cleanup; +@@ -1042,6 +1047,8 @@ ext4_xattr_set_handle(handle_t *handle, + cleanup: + brelse(is.iloc.bh); + brelse(bs.bh); ++ if (no_expand == 0) ++ EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; + up_write(&EXT4_I(inode)->xattr_sem); + return error; + } +Index: linux-2.6.18-128.1.6/fs/ext4/ialloc.c +=================================================================== +--- linux-2.6.18-128.1.6.orig/fs/ext4/ialloc.c ++++ linux-2.6.18-128.1.6/fs/ext4/ialloc.c +@@ -878,6 +878,7 @@ got: + ei->i_dtime = 0; + ei->i_block_alloc_info = NULL; + ei->i_block_group = group; ++ ei->i_fs_version = 0; + + ext4_set_inode_flags(inode); + if (IS_DIRSYNC(inode)) diff --git a/ldiskfs/kernel_patches/patches/ext4-lookup-dotdot-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-lookup-dotdot-rhel5.patch new file mode 100644 index 0000000..af019fa --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-lookup-dotdot-rhel5.patch @@ -0,0 +1,63 @@ +Index: linux-2.6.18.i386/fs/ext4/iopen.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/iopen.c ++++ linux-2.6.18.i386/fs/ext4/iopen.c +@@ -91,9 +91,12 @@ static struct dentry *iopen_lookup(struc + assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); + } + +- if (!list_empty(&inode->i_dentry)) { +- alternate = list_entry(inode->i_dentry.next, +- struct dentry, d_alias); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, struct dentry, d_alias); ++ /* ignore dentries created for ".." 
to preserve ++ * proper dcache hierarchy -- bug 10458 */ ++ if (alternate->d_flags & DCACHE_NFSFS_RENAMED) ++ continue; + dget_locked(alternate); + spin_lock(&alternate->d_lock); + alternate->d_flags |= DCACHE_REFERENCED; +Index: linux-2.6.18.i386/fs/ext4/namei.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/namei.c ++++ linux-2.6.18.i386/fs/ext4/namei.c +@@ -1067,6 +1067,38 @@ static struct dentry *ext4_lookup(struct + return ERR_CAST(inode); + } + ++ /* ".." shouldn't go into dcache to preserve dcache hierarchy ++ * otherwise we'll get parent being a child of actual child. ++ * see bug 10458 for details -bzzz */ ++ if (inode && (dentry->d_name.name[0] == '.' && (dentry->d_name.len == 1 || ++ (dentry->d_name.len == 2 && dentry->d_name.name[1] == '.')))) { ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* first, look for an existing dentry - any one is good */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ if (goal == NULL) { ++ /* there is no alias, we need to make current dentry: ++ * a) inaccessible for __d_lookup() ++ * b) inaccessible for iopen */ ++ J_ASSERT(list_empty(&dentry->d_alias)); ++ dentry->d_flags |= DCACHE_NFSFS_RENAMED; ++ /* this is d_instantiate() ... 
*/ ++ list_add(&dentry->d_alias, &inode->i_dentry); ++ dentry->d_inode = inode; ++ } ++ spin_unlock(&dcache_lock); ++ if (goal) ++ iput(inode); ++ return goal; ++ } ++ + return iopen_connect_dentry(dentry, inode, 1); + } + diff --git a/ldiskfs/kernel_patches/patches/ext4-map_inode_page-2.6.18-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-map_inode_page-2.6.18-rhel5.patch new file mode 100644 index 0000000..4ed87f0 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-map_inode_page-2.6.18-rhel5.patch @@ -0,0 +1,86 @@ +Index: linux-2.6.18.i386/fs/ext4/inode.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/inode.c ++++ linux-2.6.18.i386/fs/ext4/inode.c +@@ -3666,3 +3666,66 @@ out_unlock: + unlock_page(page); + return ret; + } ++ ++int ext4_map_inode_page(struct inode *inode, struct page *page, ++ unsigned long *blocks, int *created, int create) ++{ ++ unsigned int blocksize, blocks_per_page; ++ unsigned long iblock; ++ struct buffer_head dummy; ++ void *handle; ++ int i, rc = 0, failed = 0, needed_blocks; ++ ++ blocksize = inode->i_sb->s_blocksize; ++ blocks_per_page = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; ++ iblock = page->index * blocks_per_page; ++ ++ for (i = 0; i < blocks_per_page; i++, iblock++) { ++ blocks[i] = ext4_bmap(inode->i_mapping, iblock); ++ if (blocks[i] == 0) { ++ failed++; ++ if (created) ++ created[i] = -1; ++ } else if (created) { ++ created[i] = 0; ++ } ++ } ++ ++ if (failed == 0 || create == 0) ++ return 0; ++ ++ needed_blocks = ext4_writepage_trans_blocks(inode); ++ handle = ext4_journal_start(inode, needed_blocks); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ iblock = page->index * blocks_per_page; ++ for (i = 0; i < blocks_per_page; i++, iblock++) { ++ if (blocks[i] != 0) ++ continue; ++ ++ rc = ext4_get_blocks_handle(handle, inode, iblock, 1, &dummy, 1, 1); ++ if (rc < 0) { ++ printk(KERN_INFO "ext4_map_inode_page: error reading " ++ "block %ld\n", iblock); ++ 
goto out; ++ } else { ++ if (rc > 1) ++ WARN_ON(1); ++ rc = 0; ++ } ++ /* Unmap any metadata buffers from the block mapping, to avoid ++ * data corruption due to direct-write from Lustre being ++ * clobbered by a later flush of the blockdev metadata buffer.*/ ++ if (buffer_new(&dummy)) ++ unmap_underlying_metadata(dummy.b_bdev, ++ dummy.b_blocknr); ++ blocks[i] = dummy.b_blocknr; ++ if (created) ++ created[i] = 1; ++ } ++ ++out: ++ ext4_journal_stop(handle); ++ return rc; ++} +Index: linux-2.6.18.i386/fs/ext4/super.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/super.c ++++ linux-2.6.18.i386/fs/ext4/super.c +@@ -3498,6 +3498,10 @@ static void __exit exit_ext4_fs(void) + __free_page(ext4_zero_page); + } + ++int ext4_map_inode_page(struct inode *inode, struct page *page, ++ unsigned long *blocks, int *created, int create); ++EXPORT_SYMBOL(ext4_map_inode_page); ++ + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Fourth Extended Filesystem with extents"); + MODULE_LICENSE("GPL"); diff --git a/ldiskfs/kernel_patches/patches/ext4-max-dir-size-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-max-dir-size-rhel5.patch new file mode 100644 index 0000000..295d0d1 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-max-dir-size-rhel5.patch @@ -0,0 +1,203 @@ +Index: linux-2.6.18.i386/fs/ext4/ialloc.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ialloc.c ++++ linux-2.6.18.i386/fs/ext4/ialloc.c +@@ -622,12 +622,15 @@ struct inode *ext4_new_inode(handle_t *h + return ERR_PTR(-EPERM); + + sb = dir->i_sb; ++ sbi = EXT4_SB(sb); ++ if (sbi->s_max_dir_size > 0 && i_size_read(dir) >= sbi->s_max_dir_size) ++ return ERR_PTR(-EFBIG); ++ + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT4_I(inode); + +- sbi = EXT4_SB(sb); + es = sbi->s_es; + + if (goal) { +Index: 
linux-2.6.18.i386/fs/ext4/super.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/super.c ++++ linux-2.6.18.i386/fs/ext4/super.c +@@ -38,6 +38,7 @@ + #include + #include + #include ++#include + + #include "ext4.h" + #include "ext4_jbd2.h" +@@ -67,6 +68,8 @@ static void ext4_write_super_lockfs(stru + + struct page *ext4_zero_page; + ++struct proc_dir_entry *proc_root_ext4; ++ + ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) + { +@@ -551,6 +554,9 @@ static void ext4_put_super(struct super_ + } + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); ++ ++ remove_proc_entry(EXT4_MAX_DIR_SIZE_NAME, sbi->s_mb_proc); ++ + sb->s_fs_info = NULL; + kfree(sbi); + return; +@@ -2185,6 +2191,46 @@ static unsigned long ext4_get_stripe_siz + return 0; + } + ++static int ext4_max_dir_size_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ struct ext4_sb_info *sbi = data; ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%lu\n", sbi->s_max_dir_size); ++ *start = page; ++ return len; ++} ++ ++static int ext4_max_dir_size_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ struct ext4_sb_info *sbi = data; ++ char str[32] = { 0 }; /* zero-filled: count < sizeof(str), so the copied text stays NUL-terminated */ ++ long value; /* signed, so the negative-input check below can actually fire */ ++ char *end; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n", ++ EXT4_MAX_DIR_SIZE_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ value = simple_strtol(str, &end, 0); ++ if (value < 0) ++ return -ERANGE; ++ ++ sbi->s_max_dir_size = value; ++ return count; ++} ++ + static int ext4_fill_super(struct super_block *sb, void *data, int silent) + __releases(kernel_lock) + __acquires(kernel_lock) +@@ -2208,6 +2254,7 @@ static int ext4_fill_super(struct super_ + int needs_recovery; + __le32 features; + __u64 blocks_count; ++ struct
proc_dir_entry *proc; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) +@@ -2743,6 +2790,22 @@ static int ext4_fill_super(struct super_ + ext4_ext_init(sb); + ext4_mb_init(sb, needs_recovery); + ++ sbi->s_max_dir_size = EXT4_DEFAULT_MAX_DIR_SIZE; ++ proc = create_proc_entry(EXT4_MAX_DIR_SIZE_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, sbi->s_mb_proc); ++ if (proc == NULL) { ++ printk(KERN_ERR "EXT4-fs: unable to create %s\n", ++ EXT4_MAX_DIR_SIZE_NAME); ++ remove_proc_entry(EXT4_MAX_DIR_SIZE_NAME, sbi->s_mb_proc); ++ remove_proc_entry(sbi->s_mb_proc->name, proc_root_ext4); ++ sbi->s_mb_proc = NULL; ++ ret = -ENOMEM; ++ goto failed_mount4; ++ } ++ proc->data = sbi; ++ proc->read_proc = ext4_max_dir_size_read; ++ proc->write_proc = ext4_max_dir_size_write; ++ + lock_kernel(); + return 0; + +@@ -3082,7 +3145,6 @@ static void ext4_commit_super(struct sup + sync_dirty_buffer(sbh); + } + +- + /* + * Have we just finished recovery? If so, and if we are mounting (or + * remounting) the filesystem readonly, then we will end up with a +Index: linux-2.6.18.i386/fs/ext4/ext4_sb.h +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ext4_sb.h ++++ linux-2.6.18.i386/fs/ext4/ext4_sb.h +@@ -117,6 +117,7 @@ struct ext4_sb_info { + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; ++ unsigned long s_max_dir_size; + + /* history to debug policy */ + struct ext4_mb_history *s_mb_history; +Index: linux-2.6.18.i386/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ext4.h ++++ linux-2.6.18.i386/fs/ext4/ext4.h +@@ -992,6 +992,14 @@ struct mmp_struct { + */ + #define EXT4_MMP_MIN_CHECK_INTERVAL 5 + ++extern struct proc_dir_entry *proc_root_ext4; ++ ++/* ++ * max directory size tunable ++ */ ++#define EXT4_DEFAULT_MAX_DIR_SIZE 0 ++#define EXT4_MAX_DIR_SIZE_NAME "max_dir_size" ++ + /* + * 
Function prototypes + */ +Index: linux-2.6.18.i386/fs/ext4/mballoc.h +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/mballoc.h ++++ linux-2.6.18.i386/fs/ext4/mballoc.h +@@ -257,7 +257,6 @@ static void ext4_mb_store_history(struct + + #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +-static struct proc_dir_entry *proc_root_ext4; + struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); + + static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, +Index: linux-2.6.18.i386/fs/ext4/mballoc.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/mballoc.c ++++ linux-2.6.18.i386/fs/ext4/mballoc.c +@@ -2821,6 +2821,7 @@ err_out: + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); + remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); + remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); ++ remove_proc_entry(EXT4_MAX_DIR_SIZE_NAME, sbi->s_mb_proc); + remove_proc_entry(devname, proc_root_ext4); + sbi->s_mb_proc = NULL; + +@@ -2842,7 +2843,9 @@ static int ext4_mb_destroy_per_dev_proc( + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); + remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); + remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); ++ remove_proc_entry(EXT4_MAX_DIR_SIZE_NAME, sbi->s_mb_proc); + remove_proc_entry(devname, proc_root_ext4); ++ sbi->s_mb_proc = NULL; + + return 0; + } diff --git a/ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-rhel5.patch new file mode 100644 index 0000000..b8cdada --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-rhel5.patch @@ -0,0 +1,330 @@ +Index: linux-2.6.18.i686/fs/ext4/mballoc.c +=================================================================== +--- linux-2.6.18.i686.orig/fs/ext4/mballoc.c ++++ 
linux-2.6.18.i686/fs/ext4/mballoc.c +@@ -660,7 +660,7 @@ static void ext4_mb_mark_free_simple(str + } + } + +-static void ext4_mb_generate_buddy(struct super_block *sb, ++static int ext4_mb_generate_buddy(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); +@@ -692,14 +692,14 @@ static void ext4_mb_generate_buddy(struc + grp->bb_fragments = fragments; + + if (free != grp->bb_free) { +- ext4_error(sb, __func__, +- "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", +- group, free, grp->bb_free); +- /* +- * If we intent to continue, we consider group descritor +- * corrupt and update bb_free using bitmap value +- */ +- grp->bb_free = free; ++ struct ext4_group_desc *gdp; ++ gdp = ext4_get_group_desc (sb, group, NULL); ++ ext4_error(sb, __FUNCTION__, ++ "group %lu: %u blocks in bitmap, %u in bb, " ++ "%u in gd, %lu pa's\n", group, free, grp->bb_free, ++ le16_to_cpu(gdp->bg_free_blocks_count), ++ grp->bb_prealloc_nr); ++ return -EIO; + } + + clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); +@@ -709,6 +709,8 @@ static void ext4_mb_generate_buddy(struc + EXT4_SB(sb)->s_mb_buddies_generated++; + EXT4_SB(sb)->s_mb_generation_time += period; + spin_unlock(&EXT4_SB(sb)->s_bal_lock); ++ ++ return 0; + } + + /* The buddy information is attached the buddy cache inode +@@ -814,7 +816,7 @@ static int ext4_mb_init_cache(struct pag + + err = 0; + first_block = page->index * blocks_per_page; +- for (i = 0; i < blocks_per_page; i++) { ++ for (i = 0; i < blocks_per_page && err == 0; i++) { + int group; + struct ext4_group_info *grinfo; + +@@ -848,7 +850,7 @@ static int ext4_mb_init_cache(struct pag + /* + * incore got set to the group block bitmap below + */ +- ext4_mb_generate_buddy(sb, data, incore, group); ++ err = ext4_mb_generate_buddy(sb, data, incore, group); + incore = NULL; + } else { + /* this is block of bitmap */ +@@ -861,7 +863,7 @@ static int ext4_mb_init_cache(struct 
pag + memcpy(data, bitmap, blocksize); + + /* mark all preallocated blks used in in-core bitmap */ +- ext4_mb_generate_from_pa(sb, data, group); ++ err = ext4_mb_generate_from_pa(sb, data, group); + ext4_unlock_group(sb, group); + + /* set incore so that the buddy information can be +@@ -870,6 +872,7 @@ static int ext4_mb_init_cache(struct pag + incore = data; + } + } ++ if (likely(err == 0)) + SetPageUptodate(page); + + out: +@@ -1964,7 +1967,10 @@ static int ext4_mb_seq_history_show(stru + hs->result.fe_start, hs->result.fe_len); + seq_printf(seq, "%-5u %-8u %-23s free\n", + hs->pid, hs->ino, buf2); ++ } else { ++ seq_printf(seq, "unknown op %d\n", hs->op); + } ++ + return 0; + } + +@@ -2092,9 +2098,11 @@ static void *ext4_mb_seq_groups_next(str + static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + { + struct super_block *sb = seq->private; ++ struct ext4_group_desc *gdp; + long group = (long) v; + int i; + int err; ++ int free = 0; + struct ext4_buddy e4b; + struct sg { + struct ext4_group_info info; +@@ -2103,10 +2111,10 @@ static int ext4_mb_seq_groups_show(struc + + group--; + if (group == 0) +- seq_printf(seq, "#%-5s: %-5s %-5s %-5s " ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s %-5s %-5s" + "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " + "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", +- "group", "free", "frags", "first", ++ "group", "free", "frags", "first", "first", "pa", + "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", + "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); + +@@ -2117,13 +2125,20 @@ static int ext4_mb_seq_groups_show(struc + seq_printf(seq, "#%-5lu: I/O error\n", group); + return 0; + } ++ ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ if (gdp != NULL) ++ free = le16_to_cpu(gdp->bg_free_blocks_count); ++ + ext4_lock_group(sb, group); + memcpy(&sg, ext4_get_group_info(sb, group), i); + ext4_unlock_group(sb, group); + ext4_mb_release_desc(&e4b); + +- seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, +- 
sg.info.bb_fragments, sg.info.bb_first_free); ++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u %-5u %-5lu [", group, ++ sg.info.bb_free, free, ++ sg.info.bb_fragments, sg.info.bb_first_free, ++ sg.info.bb_prealloc_nr); + for (i = 0; i <= 13; i++) + seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? + sg.info.bb_counters[i] : 0); +@@ -2226,6 +2241,7 @@ ext4_mb_store_history(struct ext4_alloca + h.tail = ac->ac_tail; + h.buddy = ac->ac_buddy; + h.merged = 0; ++ h.cr = ac->ac_criteria; + if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) { + if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && + ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) +@@ -3531,22 +3547,66 @@ ext4_mb_use_preallocated(struct ext4_all + } + + /* ++ * check free blocks in bitmap match free block in group descriptor ++ * do this before taking preallocated blocks into account to be able ++ * to detect on-disk corruptions ++ */ ++int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap, ++ struct ext4_group_desc *gdp, int group) ++{ ++ unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); ++ unsigned short i, first, free = 0; ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ ++ while (i < max) { ++ first = i; ++ i = find_next_bit(bitmap, max, i); ++ if (i > max) ++ i = max; ++ free += i - first; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ ++ if (free != le16_to_cpu(gdp->bg_free_blocks_count)) { ++ ext4_error(sb, __FUNCTION__, "on-disk bitmap for group %d " ++ "corrupted: %u blocks free in bitmap, %u - in gd\n", ++ group, free, le16_to_cpu(gdp->bg_free_blocks_count)); ++ return -EIO; ++ } ++ return 0; ++} ++ ++/* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap.
buddy must be generated from this bitmap + * Need to be called with ext4 group lock (ext4_lock_group) + */ +-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_prealloc_space *pa; ++ struct ext4_group_desc *gdp; + struct list_head *cur; + ext4_group_t groupnr; + ext4_grpblk_t start; + int preallocated = 0; + int count = 0; ++ int skip = 0; ++ int err; + int len; + ++ gdp = ext4_get_group_desc (sb, group, NULL); ++ if (gdp == NULL) ++ return -EIO; ++ ++ /* before applying preallocations, check bitmap consistency */ ++ err = ext4_mb_check_ondisk_bitmap(sb, bitmap, gdp, group); ++ if (err) ++ return err; ++ + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. + * we don't need any locking here +@@ -3562,15 +3622,24 @@ static void ext4_mb_generate_from_pa(str + &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); +- if (unlikely(len == 0)) ++ if (unlikely(len == 0)) { ++ skip++; + continue; ++ } + BUG_ON(groupnr != group); + mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), + bitmap, start, len); + preallocated += len; + count++; + } ++ if (count + skip != grp->bb_prealloc_nr) { ++ ext4_error(sb, __FUNCTION__, "lost preallocations: " ++ "count %d, bb_prealloc_nr %lu, skip %d\n", ++ count, grp->bb_prealloc_nr, skip); ++ return -EIO; ++ } + mb_debug("prellocated %u for group %lu\n", preallocated, group); ++ return 0; + } + + static void ext4_mb_pa_callback(struct rcu_head *head) +@@ -3621,6 +3690,7 @@ static void ext4_mb_put_pa(struct ext4_a + */ + ext4_lock_group(sb, grp); + list_del(&pa->pa_group_list); ++ ext4_get_group_info(sb, grp)->bb_prealloc_nr--; + ext4_unlock_group(sb, grp); + + spin_lock(pa->pa_obj_lock); +@@ -3709,6 +3779,7 @@ ext4_mb_new_inode_pa(struct ext4_allocat + + ext4_lock_group(sb, 
ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + spin_lock(pa->pa_obj_lock); +@@ -3768,6 +3839,7 @@ ext4_mb_new_group_pa(struct ext4_allocat + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + /* +@@ -3820,6 +3892,7 @@ ext4_mb_release_inode_pa(struct ext4_bud + ac->ac_sb = sb; + ac->ac_inode = pa->pa_inode; + ac->ac_op = EXT4_MB_HISTORY_DISCARD; ++ ac->ac_o_ex.fe_len = 1; + } + + while (bit < end) { +@@ -3964,6 +4037,8 @@ repeat: + + spin_unlock(&pa->pa_lock); + ++ BUG_ON(grp->bb_prealloc_nr == 0); ++ grp->bb_prealloc_nr--; + list_del(&pa->pa_group_list); + list_add(&pa->u.pa_tmp_list, &list); + } +@@ -4099,7 +4174,7 @@ repeat: + if (err) { + ext4_error(sb, __func__, "Error in loading buddy " + "information for %lu\n", group); +- continue; ++ return; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group); +@@ -4111,6 +4186,8 @@ repeat: + } + + ext4_lock_group(sb, group); ++ BUG_ON(e4b.bd_info->bb_prealloc_nr == 0); ++ e4b.bd_info->bb_prealloc_nr--; + list_del(&pa->pa_group_list); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); + ext4_unlock_group(sb, group); +Index: linux-2.6.18.i686/fs/ext4/mballoc.h +=================================================================== +--- linux-2.6.18.i686.orig/fs/ext4/mballoc.h ++++ linux-2.6.18.i686/fs/ext4/mballoc.h +@@ -119,6 +119,7 @@ struct ext4_group_info { + unsigned short bb_free; + unsigned short bb_fragments; + struct list_head bb_prealloc_list; ++ unsigned long bb_prealloc_nr; + #ifdef DOUBLE_CHECK + void *bb_bitmap; + #endif +@@ -228,7 +229,7 @@ struct ext4_mb_history { + __u16 tail; /* what tail broke some buddy */ + __u16 buddy; /* buddy the tail ^^^ broke */ + __u16 flags; +- __u8 cr:3; /* which phase the result extent was found at */ ++ __u8 cr:8; /* which phase the result 
extent was found at */ + __u8 op:4; + __u8 merged:1; + }; +@@ -259,7 +260,7 @@ static void ext4_mb_store_history(struct + + struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); + +-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); + static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); + static void ext4_mb_free_committed_blocks(struct super_block *); diff --git a/ldiskfs/kernel_patches/patches/ext4-mballoc-handle-dev-paths-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-mballoc-handle-dev-paths-rhel5.patch new file mode 100644 index 0000000..17a5fbd --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-mballoc-handle-dev-paths-rhel5.patch @@ -0,0 +1,59 @@ +Index: linux-2.6.18-128.1.6/fs/ext4/mballoc.c +=================================================================== +--- linux-2.6.18-128.1.6.orig/fs/ext4/mballoc.c 2009-05-29 16:32:19.000000000 +0530 ++++ linux-2.6.18-128.1.6/fs/ext4/mballoc.c 2009-05-29 16:34:16.000000000 +0530 +@@ -2949,14 +2949,20 @@ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct proc_dir_entry *proc; + struct proc_dir_entry *proc_entry; +- char devname[64]; ++ char devname[BDEVNAME_SIZE], *p; + + if (proc_root_ext4 == NULL) { + sbi->s_mb_proc = NULL; + return -EINVAL; + } + bdevname(sb->s_bdev, devname); ++ p = devname; ++ while ((p = strchr(p, '/'))) ++ *p = '!'; ++ + sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); ++ if (!sbi->s_mb_proc) ++ goto err_create_dir; + + MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats); + MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan); +@@ -2980,7 +2986,6 @@ + return 0; + + err_out: +- printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname); + remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); + remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_mb_proc); + remove_proc_entry(EXT4_MB_LARGE_REQ, sbi->s_mb_proc); +@@ -2993,18 
+2998,23 @@ + remove_proc_entry(devname, proc_root_ext4); + sbi->s_mb_proc = NULL; + ++err_create_dir: ++ printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname); + return -ENOMEM; + } + + static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); +- char devname[64]; ++ char devname[BDEVNAME_SIZE], *p; + + if (sbi->s_mb_proc == NULL) + return -EINVAL; + + bdevname(sb->s_bdev, devname); ++ p = devname; ++ while ((p = strchr(p, '/'))) ++ *p = '!'; + remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); + remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_mb_proc); + remove_proc_entry(EXT4_MB_LARGE_REQ, sbi->s_mb_proc); diff --git a/ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch new file mode 100644 index 0000000..f8e77c7 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch @@ -0,0 +1,271 @@ +Index: linux-2.6.18.i386/fs/ext4/ext4_jbd2.h +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ext4_jbd2.h ++++ linux-2.6.18.i386/fs/ext4/ext4_jbd2.h +@@ -35,6 +35,9 @@ + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + || test_opt(sb, EXTENTS) ? 27U : 8U) + ++/* Indicate that EXT4_SINGLEDATA_TRANS_BLOCKS takes the sb as argument */ ++#define EXT4_SINGLEDATA_TRANS_BLOCKS_HAS_SB ++ + /* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. 
*/ +Index: linux-2.6.18.i386/fs/ext4/extents.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/extents.c ++++ linux-2.6.18.i386/fs/ext4/extents.c +@@ -50,7 +50,7 @@ + * ext_pblock: + * combine low and high parts of physical block number into ext4_fsblk_t + */ +-static ext4_fsblk_t ext_pblock(struct ext4_extent *ex) ++ext4_fsblk_t ext_pblock(struct ext4_extent *ex) + { + ext4_fsblk_t block; + +@@ -60,6 +60,17 @@ static ext4_fsblk_t ext_pblock(struct ex + } + + /* ++ * ext4_ext_store_pblock: ++ * stores a large physical block number into an extent struct, ++ * breaking it into parts ++ */ ++void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) ++{ ++ ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); ++ ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); ++} ++ ++/* + * idx_pblock: + * combine low and high parts of a leaf physical block number into ext4_fsblk_t + */ +@@ -73,17 +84,6 @@ ext4_fsblk_t idx_pblock(struct ext4_exte + } + + /* +- * ext4_ext_store_pblock: +- * stores a large physical block number into an extent struct, +- * breaking it into parts +- */ +-void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) +-{ +- ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); +- ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); +-} +- +-/* + * ext4_idx_store_pblock: + * stores a large physical block number into an index struct, + * breaking it into parts +@@ -1826,6 +1826,56 @@ static int ext4_ext_rm_idx(handle_t *han + } + + /* ++ * This routine returns max. credits extent tree can consume. ++ * It should be OK for low-performance paths like ->writepage() ++ * To allow many writing process to fit a single transaction, ++ * caller should calculate credits under truncate_mutex and ++ * pass actual path. 
++ */ ++int ext4_ext_calc_credits_for_insert(struct inode *inode, ++ struct ext4_ext_path *path) ++{ ++ int depth, needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ depth = ext_depth(inode); ++ if (le16_to_cpu(path[depth].p_hdr->eh_entries) ++ < le16_to_cpu(path[depth].p_hdr->eh_max)) ++ return 1; ++ } ++ ++ /* ++ * given 32bit logical block (4294967296 blocks), max. tree ++ * can be 4 levels in depth -- 4 * 340^4 == 53453440000. ++ * let's also add one more level for imbalance. ++ */ ++ depth = 5; ++ ++ /* allocation of new data block(s) */ ++ needed = 2; ++ ++ /* ++ * tree can be full, so it'd need to grow in depth: ++ * we need one credit to modify old root, credits for ++ * new root will be added in split accounting ++ */ ++ needed += 1; ++ ++ /* ++ * Index split can happen, we'd need: ++ * allocate intermediate indexes (bitmap + group) ++ * + change two blocks at each level, but root (already included) ++ */ ++ needed += (depth * 2) + (depth * 2); ++ ++ /* any allocation modifies superblock */ ++ needed += 1; ++ ++ return needed; ++} ++ ++/* + * ext4_ext_calc_credits_for_single_extent: + * This routine returns max. credits that needed to insert an extent + * to the extent tree. 
+@@ -3157,3 +3207,14 @@ int ext4_fiemap(struct inode *inode, str + + return error; + } ++ ++EXPORT_SYMBOL(ext4_ext_store_pblock); ++EXPORT_SYMBOL(ext4_ext_search_right); ++EXPORT_SYMBOL(ext4_ext_search_left); ++EXPORT_SYMBOL(ext_pblock); ++EXPORT_SYMBOL(ext4_ext_insert_extent); ++EXPORT_SYMBOL(ext4_mb_new_blocks); ++EXPORT_SYMBOL(ext4_ext_walk_space); ++EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert); ++EXPORT_SYMBOL(ext4_mark_inode_dirty); ++ +Index: linux-2.6.18.i386/fs/ext4/ext4_extents.h +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ext4_extents.h ++++ linux-2.6.18.i386/fs/ext4/ext4_extents.h +@@ -59,6 +59,11 @@ + */ + #define EXT_STATS_ + ++/* ++ * define EXT4_ALLOC_NEEDED to 0 since block bitmap, group desc. and sb ++ * are now accounted in ext4_ext_calc_credits_for_insert() ++ */ ++#define EXT4_ALLOC_NEEDED 0 + + /* + * ext4_inode has i_block array (60 bytes total). +@@ -124,6 +129,7 @@ struct ext4_ext_path { + #define EXT4_EXT_CACHE_GAP 1 + #define EXT4_EXT_CACHE_EXTENT 2 + ++#define EXT4_EXT_HAS_NO_TREE /* ext4_extents_tree struct is not used*/ + + #define EXT_MAX_BLOCK 0xffffffff + +@@ -228,9 +234,13 @@ static inline int ext4_ext_get_actual_le + (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); + } + ++extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); ++extern void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb); + extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); + extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); + extern int ext4_extent_tree_init(handle_t *, struct inode *); ++extern int ext4_ext_calc_credits_for_insert(struct inode *, ++ struct ext4_ext_path *); + extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +Index: linux-2.6.18.i386/fs/ext4/mballoc.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/mballoc.c ++++ 
linux-2.6.18.i386/fs/ext4/mballoc.c +@@ -4965,3 +4965,7 @@ error_return: + kmem_cache_free(ext4_ac_cachep, ac); + return; + } ++ ++EXPORT_SYMBOL(ext4_free_blocks); ++EXPORT_SYMBOL(ext4_mb_discard_inode_preallocations); ++ +Index: linux-2.6.18.i386/fs/ext4/super.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/super.c ++++ linux-2.6.18.i386/fs/ext4/super.c +@@ -91,6 +91,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct su + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); + } ++EXPORT_SYMBOL(ext4_inode_bitmap); + + ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg) +@@ -513,7 +514,8 @@ static void ext4_put_super(struct super_ + struct ext4_super_block *es = sbi->s_es; + int i; + +- ext4_mb_release(sb); ++ if (test_opt(sb, MBALLOC)) ++ ext4_mb_release(sb); + ext4_ext_release(sb); + ext4_xattr_put_super(sb); + jbd2_journal_destroy(sbi->s_journal); +@@ -2373,16 +2375,6 @@ static int ext4_fill_super(struct super_ + "running e2fsck is recommended\n"); + + /* +- * Since ext4 is still considered development code, we require +- * that the TEST_FILESYS flag in s->flags be set. +- */ +- if (!(le32_to_cpu(es->s_flags) & EXT2_FLAGS_TEST_FILESYS)) { +- printk(KERN_WARNING "EXT4-fs: %s: not marked " +- "OK to use with test code.\n", sb->s_id); +- goto failed_mount; +- } +- +- /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. 
+@@ -3835,9 +3827,9 @@ static int ext4_get_sb(struct file_syste + return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); + } + +-static struct file_system_type ext4dev_fs_type = { ++static struct file_system_type ext4_fs_type = { + .owner = THIS_MODULE, +- .name = "ext4dev", ++ .name = "ext4", + .get_sb = ext4_get_sb, + .kill_sb = kill_block_super, + #ifdef HAVE_FALLOCATE +@@ -3867,7 +3859,7 @@ static int __init init_ext4_fs(void) + err = init_inodecache(); + if (err) + goto out1; +- err = register_filesystem(&ext4dev_fs_type); ++ err = register_filesystem(&ext4_fs_type); + if (err) + goto out; + return 0; +@@ -3884,7 +3876,7 @@ out3: + + static void __exit exit_ext4_fs(void) + { +- unregister_filesystem(&ext4dev_fs_type); ++ unregister_filesystem(&ext4_fs_type); + destroy_inodecache(); + exit_ext4_xattr(); + exit_ext4_mballoc(); +Index: linux-2.6.18.i386/fs/ext4/ext4_jbd2.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ext4_jbd2.c ++++ linux-2.6.18.i386/fs/ext4/ext4_jbd2.c +@@ -21,6 +21,7 @@ int __ext4_journal_get_write_access(cons + ext4_journal_abort_handle(where, __func__, bh, handle, err); + return err; + } ++EXPORT_SYMBOL(__ext4_journal_get_write_access); + + int __ext4_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh) +@@ -57,3 +58,4 @@ int __ext4_journal_dirty_metadata(const + ext4_journal_abort_handle(where, __func__, bh, handle, err); + return err; + } ++EXPORT_SYMBOL(__ext4_journal_dirty_metadata); diff --git a/ldiskfs/kernel_patches/patches/ext4-mmp-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-mmp-rhel5.patch new file mode 100644 index 0000000..d01d046 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-mmp-rhel5.patch @@ -0,0 +1,479 @@ +Index: linux-2.6.18-128.1.6/fs/ext4/super.c +=================================================================== +--- linux-2.6.18-128.1.6.orig/fs/ext4/super.c ++++ linux-2.6.18-128.1.6/fs/ext4/super.c +@@ 
-36,6 +36,8 @@ + #include + #include + #include ++#include ++#include + + #include "ext4.h" + #include "ext4_jbd2.h" +@@ -547,6 +549,8 @@ static void ext4_put_super(struct super_ + invalidate_bdev(sbi->journal_bdev, 0); + ext4_blkdev_remove(sbi); + } ++ if (sbi->s_mmp_tsk) ++ kthread_stop(sbi->s_mmp_tsk); + sb->s_fs_info = NULL; + kfree(sbi); + return; +@@ -766,6 +770,328 @@ static int ext4_show_options(struct seq_ + return 0; + } + ++/* ++ * Write the MMP block using WRITE_SYNC to try to get the block on-disk ++ * faster. ++ */ ++static int write_mmp_block(struct buffer_head *bh) ++{ ++ mark_buffer_dirty(bh); ++ lock_buffer(bh); ++ bh->b_end_io = end_buffer_write_sync; ++ get_bh(bh); ++ submit_bh(WRITE_SYNC, bh); ++ wait_on_buffer(bh); ++ if (unlikely(!buffer_uptodate(bh))) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * Read the MMP block. It _must_ be read from disk and hence we clear the ++ * uptodate flag on the buffer. ++ */ ++static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, ++ unsigned long mmp_block) ++{ ++ struct mmp_struct *mmp; ++ ++ if (*bh) ++ clear_buffer_uptodate(*bh); ++ ++#if 0 ++ brelse(*bh); ++ ++ *bh = sb_bread(sb, mmp_block); ++#else ++ if (!*bh) ++ *bh = sb_getblk(sb, mmp_block); ++ if (*bh) { ++ get_bh(*bh); ++ lock_buffer(*bh); ++ (*bh)->b_end_io = end_buffer_read_sync; ++ submit_bh(READ_SYNC, *bh); ++ wait_on_buffer(*bh); ++ if (!buffer_uptodate(*bh)) { ++ brelse(*bh); ++ *bh = NULL; ++ } ++ } ++#endif ++ if (!*bh) { ++ ext4_warning(sb, __FUNCTION__, ++ "Error while reading MMP block %lu", mmp_block); ++ return -EIO; ++ } ++ ++ mmp = (struct mmp_struct *)((*bh)->b_data); ++ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++/* ++ * Dump as much information as possible to help the admin. 
++ */
++static void dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
++			 const char *function, const char *msg)
++{
++	ext4_warning(sb, function, "%s", msg);
++	ext4_warning(sb, function, "MMP failure info: last update time: %llu, "
++		     "last update node: %s, last update device: %s\n",
++		     (unsigned long long)le64_to_cpu(mmp->mmp_time),
++		     mmp->mmp_nodename, mmp->mmp_bdevname);
++}
++
++/*
++ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
++ */
++static int kmmpd(void *data)
++{
++	struct super_block *sb = (struct super_block *) data;
++	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
++	struct buffer_head *bh = NULL;
++	struct mmp_struct *mmp;
++	unsigned long mmp_block;
++	u32 seq = 0;
++	unsigned long failed_writes = 0;
++	int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
++	unsigned mmp_check_interval;
++	unsigned long last_update_time;
++	unsigned long diff;
++	int retval;
++
++	mmp_block = le64_to_cpu(es->s_mmp_block);
++	retval = read_mmp_block(sb, &bh, mmp_block);
++	if (retval)
++		goto failed;
++
++	mmp = (struct mmp_struct *)(bh->b_data);
++	mmp->mmp_time = cpu_to_le64(get_seconds());
++	/*
++	 * Start with the higher mmp_check_interval and reduce it if
++	 * the MMP block is being updated on time.
++	 */
++	mmp_check_interval = max(5 * mmp_update_interval,
++				 EXT4_MMP_MIN_CHECK_INTERVAL);
++	mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
++	bdevname(bh->b_bdev, mmp->mmp_bdevname);
++
++	down_read(&uts_sem);
++	memcpy(mmp->mmp_nodename, system_utsname.nodename,
++	       sizeof(mmp->mmp_nodename));
++	up_read(&uts_sem);
++
++	while (!kthread_should_stop()) {
++		if (++seq > EXT4_MMP_SEQ_MAX)
++			seq = 1;
++
++		mmp->mmp_seq = cpu_to_le32(seq);
++		mmp->mmp_time = cpu_to_le64(get_seconds());
++		last_update_time = jiffies;
++
++		retval = write_mmp_block(bh);
++		/*
++		 * Don't spew too many error messages. Print one every
++		 * (s_mmp_update_interval * 60) seconds.
++		 */
++		if (retval && (failed_writes++ % 60) == 0) {
++			/* every failed write is counted; one in 60 is logged */
++			ext4_error(sb, __FUNCTION__,
++				   "Error writing to MMP block");
++		}
++
++		if (!(le32_to_cpu(es->s_feature_incompat) &
++		    EXT4_FEATURE_INCOMPAT_MMP)) {
++			ext4_warning(sb, __FUNCTION__, "kmmpd being stopped "
++				     "since MMP feature has been disabled.");
++			EXT4_SB(sb)->s_mmp_tsk = NULL;
++			goto failed;
++		}
++
++		if (sb->s_flags & MS_RDONLY) {
++			ext4_warning(sb, __FUNCTION__, "kmmpd being stopped "
++				     "since filesystem has been remounted as "
++				     "readonly.");
++			EXT4_SB(sb)->s_mmp_tsk = NULL;
++			goto failed;
++		}
++
++		diff = jiffies - last_update_time;
++		if (diff < mmp_update_interval * HZ)
++			schedule_timeout_interruptible(mmp_update_interval *
++						       HZ - diff);
++
++		/*
++		 * We need to make sure that more than mmp_check_interval
++		 * seconds have not passed since writing. If that has happened
++		 * we need to check if the MMP block is as we left it.
++		 */
++		diff = jiffies - last_update_time;
++		if (diff > mmp_check_interval * HZ) {
++			struct buffer_head *bh_check = NULL;
++			struct mmp_struct *mmp_check;
++
++			retval = read_mmp_block(sb, &bh_check, mmp_block);
++			if (retval) {
++				EXT4_SB(sb)->s_mmp_tsk = NULL;
++				goto failed;
++			}
++
++			mmp_check = (struct mmp_struct *)(bh_check->b_data);
++			if (mmp->mmp_time != mmp_check->mmp_time ||
++			    memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
++				   sizeof(mmp->mmp_nodename)))
++				dump_mmp_msg(sb, mmp_check, __FUNCTION__,
++					     "Error while updating MMP info. "
++					     "The filesystem seems to have "
++					     "been multiply mounted.");
++
++			put_bh(bh_check);
++		}
++
++		/*
++		 * Adjust the mmp_check_interval depending on how much time
++		 * it took for the MMP block to be written.
++		 */
++		mmp_check_interval = max(5 * diff / HZ,
++				(unsigned long) EXT4_MMP_MIN_CHECK_INTERVAL);
++		mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
++	}
++
++	/*
++	 * Unmount seems to be clean.
++	 */
++	mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
++	mmp->mmp_time = cpu_to_le64(get_seconds());
++
++	retval = write_mmp_block(bh);
++
++failed:
++	brelse(bh);
++	return retval;
++}
++
++/*
++ * Get a random new sequence number but make sure it is not greater than
++ * EXT4_MMP_SEQ_MAX.
++ */
++static unsigned int mmp_new_seq(void)
++{
++	u32 new_seq;
++
++	do {
++		get_random_bytes(&new_seq, sizeof(u32));
++	} while (new_seq > EXT4_MMP_SEQ_MAX);
++
++	return new_seq;
++}
++
++/*
++ * Multi-mount protection check at mount time: returns 0 if safe, 1 if not.
++ */
++static int ext4_multi_mount_protect(struct super_block *sb,
++				    unsigned long mmp_block)
++{
++	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
++	struct buffer_head *bh = NULL;
++	struct mmp_struct *mmp = NULL;
++	u32 seq;
++	unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
++	int retval;
++
++	if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
++	    mmp_block >= ext4_blocks_count(es)) {
++		ext4_warning(sb, __FUNCTION__,
++			     "Invalid MMP block in superblock");
++		goto failed;
++	}
++
++	retval = read_mmp_block(sb, &bh, mmp_block);
++	if (retval)
++		goto failed;
++
++	mmp = (struct mmp_struct *)(bh->b_data);
++
++	if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
++		mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
++
++	/*
++	 * If check_interval in MMP block is larger, use that instead of
++	 * update_interval from the superblock.
++	 */
++	if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
++		mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
++
++	seq = le32_to_cpu(mmp->mmp_seq);
++	if (seq == EXT4_MMP_SEQ_CLEAN)
++		goto skip;
++
++	if (seq == EXT4_MMP_SEQ_FSCK) {
++		dump_mmp_msg(sb, mmp, __FUNCTION__,
++			     "fsck is running on the filesystem");
++		goto failed;
++	}
++
++	schedule_timeout_uninterruptible(HZ * (2 * mmp_check_interval + 1));
++
++	retval = read_mmp_block(sb, &bh, mmp_block);
++	if (retval)
++		goto failed;
++	mmp = (struct mmp_struct *)(bh->b_data);
++	if (seq != le32_to_cpu(mmp->mmp_seq)) {
++		dump_mmp_msg(sb, mmp, __FUNCTION__,
++			     "Device is already active on another node.");
++		goto failed;
++	}
++
++skip:
++	/* Write a new random sequence number; seq stays in CPU byte order
++	 * so the comparison after the re-read below is endian-safe. */
++	seq = mmp_new_seq();
++	mmp->mmp_seq = cpu_to_le32(seq);
++
++	retval = write_mmp_block(bh);
++	if (retval)
++		goto failed;
++
++	/*
++	 * wait for MMP interval and check mmp_seq.
++	 */
++	schedule_timeout_uninterruptible(HZ * (2 * mmp_check_interval + 1));
++
++	retval = read_mmp_block(sb, &bh, mmp_block);
++	if (retval)
++		goto failed;
++	mmp = (struct mmp_struct *)(bh->b_data);
++	if (seq != le32_to_cpu(mmp->mmp_seq)) {
++		dump_mmp_msg(sb, mmp, __FUNCTION__,
++			     "Device is already active on another node.");
++		goto failed;
++	}
++
++	/*
++	 * Start a kernel thread to update the MMP block periodically.
++	 */
++	EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%02x:%02x",
++					     MAJOR(sb->s_dev),
++					     MINOR(sb->s_dev));
++	if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
++		EXT4_SB(sb)->s_mmp_tsk = NULL;
++		ext4_warning(sb, __FUNCTION__, "Unable to create kmmpd thread "
++			     "for %s.", sb->s_id);
++		goto failed;
++	}
++
++	brelse(bh);
++	return 0;
++
++failed:
++	brelse(bh);
++	return 1;
++}
+ + static struct dentry *ext4_get_dentry(struct super_block *sb, void *vobjp) + { +@@ -775,7 +1101,6 @@ static struct dentry *ext4_get_dentry(st + struct inode *inode; + struct dentry *result; + +- + if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) + return ERR_PTR(-ESTALE); + if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) +@@ -2258,6 +2583,11 @@ static int ext4_fill_super(struct super_ + EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_RECOVER)); + ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && ++ !(sb->s_flags & MS_RDONLY)) ++ if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) ++ goto failed_mount3; ++ + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal!
+@@ -2445,6 +2775,8 @@ failed_mount3: + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); ++ if (sbi->s_mmp_tsk) ++ kthread_stop(sbi->s_mmp_tsk); + failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); +@@ -2918,7 +3250,7 @@ static int ext4_remount(struct super_blo + unsigned long old_sb_flags; + struct ext4_mount_options old_opts; + ext4_group_t g; +- int err; ++ int err = 0; + #ifdef CONFIG_QUOTA + int i; + #endif +@@ -3042,6 +3374,13 @@ static int ext4_remount(struct super_blo + goto restore_opts; + if (!ext4_setup_super(sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ++ EXT4_FEATURE_INCOMPAT_MMP)) ++ if (ext4_multi_mount_protect(sb, ++ le64_to_cpu(es->s_mmp_block))) { ++ err = -EROFS; ++ goto restore_opts; ++ } + } + } + #ifdef CONFIG_QUOTA +Index: linux-2.6.18-128.1.6/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.18-128.1.6.orig/fs/ext4/ext4.h ++++ linux-2.6.18-128.1.6/fs/ext4/ext4.h +@@ -658,7 +658,7 @@ struct ext4_super_block { + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ +- __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ ++ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ +@@ -775,7 +775,8 @@ static inline int ext4_valid_inum(struct + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ +- EXT4_FEATURE_INCOMPAT_FLEX_BG) ++ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ ++ EXT4_FEATURE_INCOMPAT_MMP) + #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + 
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ +@@ -956,6 +957,39 @@ void ext4_get_group_no_and_offset(struct + unsigned long *blockgrpp, ext4_grpblk_t *offsetp); + + /* ++ * This structure will be used for multiple mount protection. It will be ++ * written into the block number saved in the s_mmp_block field in the ++ * superblock. Programs that check MMP should assume that if ++ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe ++ * to use the filesystem, regardless of how old the timestamp is. ++ */ ++#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ ++#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ ++#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ ++#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ ++ ++struct mmp_struct { ++ __le32 mmp_magic; ++ __le32 mmp_seq; ++ __le64 mmp_time; ++ char mmp_nodename[64]; ++ char mmp_bdevname[32]; ++ __le16 mmp_check_interval; ++ __le16 mmp_pad1; ++ __le32 mmp_pad2[227]; ++}; ++ ++/* ++ * Default interval in seconds to update the MMP sequence number. ++ */ ++#define EXT4_MMP_UPDATE_INTERVAL 1 ++ ++/* ++ * Minimum interval for MMP checking in seconds. 
++ */ ++#define EXT4_MMP_MIN_CHECK_INTERVAL 5 ++ ++/* + * Function prototypes + */ + +Index: linux-2.6.18-128.1.6/fs/ext4/ext4_sb.h +=================================================================== +--- linux-2.6.18-128.1.6.orig/fs/ext4/ext4_sb.h ++++ linux-2.6.18-128.1.6/fs/ext4/ext4_sb.h +@@ -148,6 +148,8 @@ struct ext4_sb_info { + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; ++ ++ struct task_struct *s_mmp_tsk; /* Kernel thread for multiple mount protection */ + }; + + #endif /* _EXT4_SB */ diff --git a/ldiskfs/kernel_patches/patches/ext4-prealloc-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-prealloc-rhel5.patch new file mode 100644 index 0000000..34d0472 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-prealloc-rhel5.patch @@ -0,0 +1,405 @@ +Index: linux-2.6.18-128.1.6/fs/ext4/ext4_sb.h +=================================================================== +--- linux-2.6.18-128.1.6.orig/fs/ext4/ext4_sb.h 2009-05-28 17:16:51.000000000 +0530 ++++ linux-2.6.18-128.1.6/fs/ext4/ext4_sb.h 2009-05-28 17:16:52.000000000 +0530 +@@ -108,11 +108,14 @@ + + /* tunables */ + unsigned long s_stripe; +- unsigned long s_mb_stream_request; ++ unsigned long s_mb_small_req; ++ unsigned long s_mb_large_req; + unsigned long s_mb_max_to_scan; + unsigned long s_mb_min_to_scan; + unsigned long s_mb_stats; + unsigned long s_mb_order2_reqs; ++ unsigned long *s_mb_prealloc_table; ++ unsigned long s_mb_prealloc_table_size; + unsigned long s_mb_group_prealloc; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; +Index: linux-2.6.18-128.1.6/fs/ext4/mballoc.c +=================================================================== +--- linux-2.6.18-128.1.6.orig/fs/ext4/mballoc.c 2009-05-28 17:16:51.000000000 +0530 ++++ linux-2.6.18-128.1.6/fs/ext4/mballoc.c 2009-05-28 17:19:57.000000000 +0530 +@@ -1744,7 +1744,7 @@ + if (size < isize) + size = isize; + +- if (size < sbi->s_mb_stream_request && ++ if 
((ac->ac_g_ex.fe_len < sbi->s_mb_large_req) && + (ac->ac_flags & EXT4_MB_HINT_DATA)) { + /* TBD: may be hot point */ + spin_lock(&sbi->s_md_lock); +@@ -2484,6 +2484,26 @@ + return -ENOMEM; + } + ++static void ext4_mb_prealloc_table_add(struct ext4_sb_info *sbi, int value) ++{ ++ int i; ++ ++ if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group)) ++ return; ++ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) { ++ if (sbi->s_mb_prealloc_table[i] == 0) { ++ sbi->s_mb_prealloc_table[i] = value; ++ return; ++ } ++ ++ /* they should add values in order */ ++ if (value <= sbi->s_mb_prealloc_table[i]) ++ return; ++ } ++} ++ ++ + int ext4_mb_init(struct super_block *sb, int needs_recovery) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); +@@ -2542,15 +2562,59 @@ + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; +- sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; +- sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; ++ ++ if (sbi->s_stripe == 0) { ++ sbi->s_mb_prealloc_table_size = 8; ++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long); ++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ memset(sbi->s_mb_prealloc_table, 0, i); ++ ++ ext4_mb_prealloc_table_add(sbi, 4); ++ ext4_mb_prealloc_table_add(sbi, 8); ++ ext4_mb_prealloc_table_add(sbi, 16); ++ ext4_mb_prealloc_table_add(sbi, 32); ++ ext4_mb_prealloc_table_add(sbi, 64); ++ ext4_mb_prealloc_table_add(sbi, 128); ++ ext4_mb_prealloc_table_add(sbi, 256); ++ ext4_mb_prealloc_table_add(sbi, 512); ++ ++ sbi->s_mb_small_req = 256; ++ sbi->s_mb_large_req = 1024; ++ sbi->s_mb_group_prealloc = 512; ++ } else { ++ sbi->s_mb_prealloc_table_size = 
3; ++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long); ++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ memset(sbi->s_mb_prealloc_table, 0, i); ++ ++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe); ++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 2); ++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 4); ++ ++ sbi->s_mb_small_req = sbi->s_stripe; ++ sbi->s_mb_large_req = sbi->s_stripe * 8; ++ sbi->s_mb_group_prealloc = sbi->s_stripe * 4; ++ } + + i = sizeof(struct ext4_locality_group) * num_possible_cpus(); + sbi->s_locality_groups = kmalloc(i, GFP_KERNEL); + if (sbi->s_locality_groups == NULL) { + clear_opt(sbi->s_mount_opt, MBALLOC); ++ kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + return -ENOMEM; +@@ -2725,10 +2789,82 @@ + #define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan" + #define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan" + #define EXT4_MB_ORDER2_REQ "order2_req" +-#define EXT4_MB_STREAM_REQ "stream_req" ++#define EXT4_MB_SMALL_REQ "small_req" ++#define EXT4_MB_LARGE_REQ "large_req" ++#define EXT4_MB_PREALLOC_TABLE "prealloc_table" + #define EXT4_MB_GROUP_PREALLOC "group_prealloc"
+ 
++static int ext4_mb_prealloc_table_proc_read(char *page, char **start, off_t off,
++					    int count, int *eof, void *data)
++{
++	struct ext4_sb_info *sbi = data;
++	int len = 0;
++	int i;
++
++	*eof = 1;
++	if (off != 0)
++		return 0;
++
++	for (i = 0; i < sbi->s_mb_prealloc_table_size; i++)
++		len += sprintf(page + len, "%ld ",
++			       sbi->s_mb_prealloc_table[i]);
++	len += sprintf(page + len, "\n");
++
++	*start = page;
++	return len;
++}
++
++/*
++ * Replace the prealloc table with a space-separated, increasing value list.
++ */
++static int ext4_mb_prealloc_table_proc_write(struct file *file,
++					     const char __user *buf,
++					     unsigned long cnt, void *data)
++{
++	struct ext4_sb_info *sbi = data;
++	unsigned long value;
++	unsigned long prev = 0;
++	char str[128];
++	char *cur, *end;
++	unsigned long *new_table;
++	int num = 0, i = 0;
++
++	if (cnt >= sizeof(str))
++		return -EINVAL;
++	if (copy_from_user(str, buf, cnt))
++		return -EFAULT;
++
++	str[cnt] = '\0';	/* the parser below needs a terminated string */
++	cur = str;
++	end = str + cnt;
++	while (cur < end) {
++		while ((cur < end) && (*cur == ' ')) cur++;
++		value = simple_strtoul(cur, &cur, 0);
++		if (value == 0)
++			break;
++		if (value <= prev)
++			return -EINVAL;
++		prev = value;
++		num++;
++	}
+ 
++	new_table = kmalloc(num * sizeof(*new_table), GFP_KERNEL);
++	if (new_table == NULL)
++		return -ENOMEM;
++	kfree(sbi->s_mb_prealloc_table);
++	memset(new_table, 0, num * sizeof(*new_table));
++	sbi->s_mb_prealloc_table = new_table;
++	sbi->s_mb_prealloc_table_size = num;
++	cur = str;	/* second pass: store the num values counted above */
++	while (cur < end && i < num) {
++		while ((cur < end) && (*cur == ' ')) cur++;
++		value = simple_strtoul(cur, &cur, 0);
++		ext4_mb_prealloc_table_add(sbi, value);
++		i++;
++	}
++
++	return cnt;
++}
+ + #define MB_PROC_FOPS(name) \ + static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \ +@@ -2774,7 +2910,8 @@ + MB_PROC_FOPS(max_to_scan); + MB_PROC_FOPS(min_to_scan); + MB_PROC_FOPS(order2_reqs); +-MB_PROC_FOPS(stream_request); ++MB_PROC_FOPS(small_req); ++MB_PROC_FOPS(large_req); + MB_PROC_FOPS(group_prealloc); + + #define MB_PROC_HANDLER(name, var) \ +@@ -2795,6 +2932,7 @@ + mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct proc_dir_entry *proc; ++ struct proc_dir_entry *proc_entry; + char devname[64]; + + if (proc_root_ext4 == NULL) { +@@ -2808,15 +2946,29 @@ + MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan); + MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan); + MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs); +- MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request); ++ MB_PROC_HANDLER(EXT4_MB_SMALL_REQ, small_req); ++ MB_PROC_HANDLER(EXT4_MB_LARGE_REQ, large_req); + MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc); + ++ proc_entry =
create_proc_entry(EXT4_MB_PREALLOC_TABLE, S_IFREG | ++ S_IRUGO | S_IWUSR, sbi->s_mb_proc); ++ if (proc_entry == NULL) { ++ printk(KERN_ERR "EXT4-fs: unable to create %s\n", ++ EXT4_MB_PREALLOC_TABLE); ++ goto err_out; ++ } ++ proc_entry->data = sbi; ++ proc_entry->read_proc = ext4_mb_prealloc_table_proc_read; ++ proc_entry->write_proc = ext4_mb_prealloc_table_proc_write; ++ + return 0; + + err_out: + printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname); + remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); +- remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); ++ remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_mb_proc); ++ remove_proc_entry(EXT4_MB_LARGE_REQ, sbi->s_mb_proc); ++ remove_proc_entry(EXT4_MB_SMALL_REQ, sbi->s_mb_proc); + remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); + remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); +@@ -2838,7 +2990,9 @@ + + bdevname(sb->s_bdev, devname); + remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); +- remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); ++ remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_mb_proc); ++ remove_proc_entry(EXT4_MB_LARGE_REQ, sbi->s_mb_proc); ++ remove_proc_entry(EXT4_MB_SMALL_REQ, sbi->s_mb_proc); + remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); + remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); +@@ -3032,11 +3186,12 @@ + ext4_mb_normalize_request(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) + { +- int bsbits, max; ++ int bsbits, i, wind; + ext4_lblk_t end; +- loff_t size, orig_size, start_off; ++ loff_t size, orig_size; + ext4_lblk_t start, orig_start; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); ++ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_prealloc_space *pa; + + /* do normalize only data requests, metadata requests +@@ -3066,49 +3221,35 @@ + size = 
size << bsbits; + if (size < i_size_read(ac->ac_inode)) + size = i_size_read(ac->ac_inode); ++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits; + +- /* max size of free chunks */ +- max = 2 << bsbits; ++ start = wind = 0; + +-#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ +- (req <= (size) || max <= (chunk_size)) ++ /* let's choose preallocation window depending on file size */ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) { ++ if (size <= sbi->s_mb_prealloc_table[i]) { ++ wind = sbi->s_mb_prealloc_table[i]; ++ break; ++ } ++ } ++ size = wind; + +- /* first, try to predict filesize */ +- /* XXX: should this table be tunable? */ +- start_off = 0; +- if (size <= 16 * 1024) { +- size = 16 * 1024; +- } else if (size <= 32 * 1024) { +- size = 32 * 1024; +- } else if (size <= 64 * 1024) { +- size = 64 * 1024; +- } else if (size <= 128 * 1024) { +- size = 128 * 1024; +- } else if (size <= 256 * 1024) { +- size = 256 * 1024; +- } else if (size <= 512 * 1024) { +- size = 512 * 1024; +- } else if (size <= 1024 * 1024) { +- size = 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (21 - bsbits)) << 21; +- size = 2 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (22 - bsbits)) << 22; +- size = 4 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, +- (8<<20)>>bsbits, max, 8 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (23 - bsbits)) << 23; +- size = 8 * 1024 * 1024; +- } else { +- start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; +- size = ac->ac_o_ex.fe_len << bsbits; ++ if (wind == 0) { ++ __u64 tstart, tend; ++ /* file is quite large, we now preallocate with ++ * the biggest configured window with regard to ++ * logical offset */ ++ wind = sbi->s_mb_prealloc_table[i - 1]; ++ tstart = ac->ac_o_ex.fe_logical; ++ do_div(tstart, wind); ++ start = tstart
* wind; ++ tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1; ++ do_div(tend, wind); ++ tend = tend * wind + wind; ++ size = tend - start; + } +- orig_size = size = size >> bsbits; +- orig_start = start = start_off >> bsbits; ++ orig_size = size; ++ orig_start = start; + + /* don't cover already allocated blocks in selected range */ + if (ar->pleft && start <= ar->lleft) { +@@ -3185,7 +3326,6 @@ + } + BUG_ON(start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical); +- BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + + /* now prepare goal request */ + +@@ -4077,22 +4217,32 @@ + { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int bsbits = ac->ac_sb->s_blocksize_bits; +- loff_t size, isize; ++ loff_t size; + + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; + +- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; +- isize = i_size_read(ac->ac_inode) >> bsbits; +- size = max(size, isize); +- +- /* don't use group allocation for large files */ +- if (size >= sbi->s_mb_stream_request) ++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) + return; + + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + ++ /* request is so large that we don't care about ++ * streaming - it overweights any possible seek */ ++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req) ++ return; ++ ++ size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; ++ size = size << bsbits; ++ if (size < i_size_read(ac->ac_inode)) ++ size = i_size_read(ac->ac_inode); ++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits; ++ ++ /* don't use group allocation for large files */ ++ if (size >= sbi->s_mb_large_req) ++ return; ++ + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. 
The reason for having diff --git a/ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-rhel5.patch new file mode 100644 index 0000000..1cc10a8 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-rhel5.patch @@ -0,0 +1,15 @@ +Index: linux-2.6.18.i386/fs/ext4/namei.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/namei.c ++++ linux-2.6.18.i386/fs/ext4/namei.c +@@ -374,8 +374,8 @@ dx_probe(struct dentry *dentry, struct i + root->info.hash_version != DX_HASH_HALF_MD4 && + root->info.hash_version != DX_HASH_LEGACY) { + ext4_warning(dir->i_sb, __func__, +- "Unrecognised inode hash code %d", +- root->info.hash_version); ++ "Unrecognised inode hash code %d for directory " ++ "#%lu", root->info.hash_version, dir->i_ino); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; diff --git a/ldiskfs/kernel_patches/patches/ext4-remove-cond_resched-calls-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-remove-cond_resched-calls-rhel5.patch new file mode 100644 index 0000000..bab03d1 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-remove-cond_resched-calls-rhel5.patch @@ -0,0 +1,29 @@ +Index: linux-2.6.18.i386/fs/ext4/ialloc.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ialloc.c ++++ linux-2.6.18.i386/fs/ext4/ialloc.c +@@ -1057,7 +1057,6 @@ unsigned long ext4_count_free_inodes (st + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); +- cond_resched(); + } + return desc_count; + #endif +Index: linux-2.6.18.i386/fs/ext4/super.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/super.c ++++ linux-2.6.18.i386/fs/ext4/super.c +@@ -3100,11 +3100,9 @@ static int ext4_statfs(struct dentry *de + * block group descriptors. 
If the sparse superblocks + * feature is turned on, then not all groups have this. + */ +- for (i = 0; i < ngroups; i++) { ++ for (i = 0; i < ngroups; i++) + overhead += ext4_bg_has_super(sb, i) + + ext4_bg_num_gdb(sb, i); +- cond_resched(); +- } + + /* + * Every block group has an inode bitmap, a block diff --git a/ldiskfs/kernel_patches/patches/ext4-unlink-race-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-unlink-race-rhel5.patch new file mode 100644 index 0000000..f75ae84 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-unlink-race-rhel5.patch @@ -0,0 +1,15 @@ +Index: linux-2.6.18.i386/fs/ext4/namei.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/namei.c ++++ linux-2.6.18.i386/fs/ext4/namei.c +@@ -2299,8 +2299,8 @@ static int ext4_link (struct dentry * ol + * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing + * otherwise has the potential to corrupt the orphan inode list. + */ +- if (inode->i_nlink == 0) +- return -ENOENT; ++ //if (inode->i_nlink == 0) ++ // return -ENOENT; + + retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + diff --git a/ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-rhel5.patch new file mode 100644 index 0000000..e0c6f8d --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-rhel5.patch @@ -0,0 +1,169 @@ +Index: linux-2.6.18.i386/fs/ext4/ialloc.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ialloc.c ++++ linux-2.6.18.i386/fs/ext4/ialloc.c +@@ -576,7 +576,8 @@ static int find_group_other(struct super + * For other inodes, search forward from the parent directory's block + * group to find a free inode. 
+ */ +-struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) ++struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode, ++ unsigned long goal) + { + struct super_block *sb; + struct buffer_head *bitmap_bh = NULL; +@@ -607,6 +608,43 @@ struct inode *ext4_new_inode(handle_t *h + sbi = EXT4_SB(sb); + es = sbi->s_es; + ++ if (goal) { ++ group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); ++ ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); ++ err = -EIO; ++ ++ gdp = ext4_get_group_desc(sb, group, &bh2); ++ if (!gdp) ++ goto fail; ++ ++ bitmap_bh = ext4_read_inode_bitmap(sb, group); ++ if (!bitmap_bh) ++ goto fail; ++ ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext4_journal_get_write_access(handle, bitmap_bh); ++ if (err) ++ goto fail; ++ ++ if (ext4_set_bit_atomic(sb_bgl_lock(sbi, group), ++ ino, bitmap_bh->b_data)) { ++ printk(KERN_ERR "goal inode %lu unavailable\n", goal); ++ /* Oh well, we tried. */ ++ goto continue_allocation; ++ } ++ ++ BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); ++ err = ext4_journal_dirty_metadata(handle, bitmap_bh); ++ if (err) ++ goto fail; ++ ++ /* We've shortcircuited the allocation system successfully, ++ * now finish filling in the inode. ++ */ ++ goto got; ++ } ++ ++continue_allocation: + if (sbi->s_log_groups_per_flex) { + ret2 = find_group_flex(sb, dir, &group); + goto got_group; +Index: linux-2.6.18.i386/fs/ext4/namei.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/namei.c ++++ linux-2.6.18.i386/fs/ext4/namei.c +@@ -104,6 +104,7 @@ struct dx_entry + __le32 block; + }; + ++ + /* + * dx_root_info is laid out so that if it should somehow get overlaid by a + * dirent the two low bits of the hash version will be zero. 
Therefore, the +@@ -149,6 +150,14 @@ struct dx_map_entry + u16 size; + }; + ++#define LVFS_DENTRY_PARAM_MAGIC 20070216UL ++struct lvfs_dentry_params ++{ ++ unsigned long p_inum; ++ void *p_ptr; ++ u32 magic; ++}; ++ + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); + static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); + static inline unsigned dx_get_hash (struct dx_entry *entry); +@@ -1708,6 +1717,20 @@ static int ext4_add_nondir(handle_t *han + return err; + } + ++static struct inode * ext4_new_inode_wantedi(handle_t *handle, struct inode *dir, ++ int mode, struct dentry *dentry) ++{ ++ unsigned long inum = 0; ++ ++ if (dentry->d_fsdata != NULL) { ++ struct lvfs_dentry_params *param = dentry->d_fsdata; ++ ++ if (param->magic == LVFS_DENTRY_PARAM_MAGIC) ++ inum = param->p_inum; ++ } ++ return ext4_new_inode(handle, dir, mode, inum); ++} ++ + /* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it +@@ -1733,7 +1756,7 @@ retry: + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext4_new_inode (handle, dir, mode); ++ inode = ext4_new_inode_wantedi (handle, dir, mode, dentry); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext4_file_inode_operations; +@@ -1767,7 +1790,7 @@ retry: + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext4_new_inode (handle, dir, mode); ++ inode = ext4_new_inode_wantedi (handle, dir, mode, dentry); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +@@ -1803,7 +1826,7 @@ retry: + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext4_new_inode (handle, dir, S_IFDIR | mode); ++ inode = ext4_new_inode_wantedi (handle, dir, S_IFDIR | mode, dentry); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +@@ -2203,7 +2226,7 @@ retry: + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); ++ inode = 
ext4_new_inode_wantedi (handle, dir, S_IFLNK|S_IRWXUGO, dentry); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +Index: linux-2.6.18.i386/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ext4.h ++++ linux-2.6.18.i386/fs/ext4/ext4.h +@@ -1013,7 +1013,8 @@ extern int ext4fs_dirhash(const char *na + dx_hash_info *hinfo); + + /* ialloc.c */ +-extern struct inode * ext4_new_inode (handle_t *, struct inode *, int); ++extern struct inode * ext4_new_inode (handle_t *, struct inode *, int, ++ unsigned long); + extern void ext4_free_inode (handle_t *, struct inode *); + extern struct inode * ext4_orphan_get (struct super_block *, unsigned long); + extern unsigned long ext4_count_free_inodes (struct super_block *); +Index: linux-2.6.18.i386/fs/ext4/migrate.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/migrate.c ++++ linux-2.6.18.i386/fs/ext4/migrate.c +@@ -485,7 +485,7 @@ int ext4_ext_migrate(struct inode *inode + } + tmp_inode = ext4_new_inode(handle, + inode->i_sb->s_root->d_inode, +- S_IFREG); ++ S_IFREG, 0); + if (IS_ERR(tmp_inode)) { + retval = -ENOMEM; + ext4_journal_stop(handle); diff --git a/ldiskfs/kernel_patches/patches/ext4-xattr-no-update-ctime-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-xattr-no-update-ctime-rhel5.patch new file mode 100644 index 0000000..66de9df --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-xattr-no-update-ctime-rhel5.patch @@ -0,0 +1,32 @@ +Index: linux-2.6.18.i386/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ext4.h ++++ linux-2.6.18.i386/fs/ext4/ext4.h +@@ -995,6 +995,13 @@ struct mmp_struct { + extern struct proc_dir_entry *proc_root_ext4; + + /* ++ * Indicates that ctime should not be updated in ext4_xattr_set_handle() ++ */ ++#ifndef XATTR_NO_CTIME ++#define XATTR_NO_CTIME 0x80 ++#endif ++ ++/* + * Function 
prototypes + */ + +Index: linux-2.6.18.i386/fs/ext4/xattr.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/xattr.c ++++ linux-2.6.18.i386/fs/ext4/xattr.c +@@ -1026,7 +1026,8 @@ ext4_xattr_set_handle(handle_t *handle, + } + if (!error) { + ext4_xattr_update_super_block(handle, inode->i_sb); +- inode->i_ctime = ext4_current_time(inode); ++ if (!(flags & XATTR_NO_CTIME)) ++ inode->i_ctime = ext4_current_time(inode); + if (!value) + EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); diff --git a/ldiskfs/kernel_patches/patches/iopen-2.6.18-rhel5-ext4.patch b/ldiskfs/kernel_patches/patches/iopen-2.6.18-rhel5-ext4.patch new file mode 100644 index 0000000..d7b94fa --- /dev/null +++ b/ldiskfs/kernel_patches/patches/iopen-2.6.18-rhel5-ext4.patch @@ -0,0 +1,512 @@ +Index: linux-2.6.18-128.1.6/fs/ext4/iopen.c +=================================================================== +--- /dev/null ++++ linux-2.6.18-128.1.6/fs/ext4/iopen.c +@@ -0,0 +1,295 @@ ++/* ++ * linux/fs/ext4/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. ++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. ++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. 
++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include "iopen.h" ++#include "ext4.h" ++#include "ext4_jbd2.h" ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#define IOPEN_NAME_LEN 32 ++ ++/* ++ * This implements looking up an inode by number. ++ */ ++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct inode *inode; ++ unsigned long ino; ++ struct list_head *lp; ++ struct dentry *alternate; ++ char buf[IOPEN_NAME_LEN]; ++ ++ if (dentry->d_name.len >= IOPEN_NAME_LEN) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ memcpy(buf, dentry->d_name.name, dentry->d_name.len); ++ buf[dentry->d_name.len] = 0; ++ ++ if (strcmp(buf, ".") == 0) ++ ino = dir->i_ino; ++ else if (strcmp(buf, "..") == 0) ++ ino = EXT4_ROOT_INO; ++ else ++ ino = simple_strtoul(buf, 0, 0); ++ ++ if ((ino != EXT4_ROOT_INO && ++ ino < EXT4_FIRST_INO(dir->i_sb)) || ++ ino > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)) ++ return ERR_PTR(-ENOENT); ++ ++ inode = ext4_iget(dir->i_sb, ino); ++ if (IS_ERR(inode)) { ++ /* Newer kernels return -ESTALE for inodes that are not in use, ++ * but older kernels return a negative dentry. This can only ++ * happen when doing a lookup in the __iopen__ dir, because the ++ * "entry" will always be found even if inode is unallocated. ++ * Handle this here instead of fixing the callers. 
b=19114 */ ++ if (PTR_ERR(inode) == -ESTALE) ++ return (ERR_PTR(-ENOENT)); ++ return ERR_CAST(inode); ++ } ++ ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ ++ /* preferably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, struct dentry, d_alias); ++ assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); ++ } ++ ++ if (!list_empty(&inode->i_dentry)) { ++ alternate = list_entry(inode->i_dentry.next, ++ struct dentry, d_alias); ++ dget_locked(alternate); ++ spin_lock(&alternate->d_lock); ++ alternate->d_flags |= DCACHE_REFERENCED; ++ spin_unlock(&alternate->d_lock); ++ iput(inode); ++ spin_unlock(&dcache_lock); ++ return alternate; ++ } ++ dentry->d_flags |= DCACHE_DISCONNECTED; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++ ++ d_rehash_cond(dentry, 0); ++ spin_unlock(&dcache_lock); ++ ++ return NULL; ++} ++ ++/* This function is spliced into ext4_lookup and does the move of a ++ * disconnected dentry (if it exists) to a connected dentry.
++ */ ++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, ++ int rehash) ++{ ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* verify this dentry is really new */ ++ assert(dentry->d_inode == NULL); ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ if (rehash) ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ assert(list_empty(&dentry->d_subdirs)); ++ ++ spin_lock(&dcache_lock); ++ if (!inode) ++ goto do_rehash; ++ ++ if (!test_opt(inode->i_sb, IOPEN)) ++ goto do_instantiate; ++ ++ /* preferably return a connected dentry */ ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ if (tmp->d_flags & DCACHE_DISCONNECTED) { ++ assert(tmp->d_alias.next == &inode->i_dentry); ++ assert(tmp->d_alias.prev == &inode->i_dentry); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ } ++ ++ if (!goal) ++ goto do_instantiate; ++ ++ /* Move the goal to the de hash queue */ ++ goal->d_flags &= ~DCACHE_DISCONNECTED; ++ security_d_instantiate(goal, inode); ++ __d_drop(dentry); ++ d_rehash_cond(dentry, 0); ++ d_move_locked(goal, dentry); ++ spin_unlock(&dcache_lock); ++ iput(inode); ++ ++ return goal; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++do_instantiate: ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++do_rehash: ++ if (rehash) ++ d_rehash_cond(dentry, 0); ++ spin_unlock(&dcache_lock); ++ ++ return NULL; ++} ++ ++/* ++ * Similar to d_instantiate() except that it drops the disconnected ++ * dentry if any.
++ */ ++void iopen_d_instantiate(struct dentry *dentry, struct inode * inode) ++{ ++ struct dentry *dis_dentry; ++ ++ /* verify this dentry is really new */ ++ assert(dentry->d_inode == NULL); ++ assert(list_empty(&dentry->d_alias)); ++ ++ spin_lock(&dcache_lock); ++ if (!inode || !test_opt(inode->i_sb, IOPEN) || ++ list_empty(&inode->i_dentry)) ++ goto do_instantiate; ++ ++ /* a disconnected dentry has been added behind our back, ++ * we have to drop this dentry, see bug 16362/15713*/ ++ dis_dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias); ++ spin_lock(&dis_dentry->d_lock); ++ assert(dis_dentry->d_alias.next == &inode->i_dentry); ++ assert(dis_dentry->d_alias.prev == &inode->i_dentry); ++ assert(dis_dentry->d_flags & DCACHE_DISCONNECTED); ++ __d_drop(dis_dentry); ++ list_del_init(&dis_dentry->d_alias); ++ spin_unlock(&dis_dentry->d_lock); ++ ++do_instantiate: ++ if (inode) ++ list_add(&dentry->d_alias, &inode->i_dentry); ++ dentry->d_inode = inode; ++ spin_unlock(&dcache_lock); ++ security_d_instantiate(dentry, inode); ++} ++ ++/* ++ * These are the special structures for the iopen pseudo directory. ++ */ ++ ++static struct inode_operations iopen_inode_operations = { ++ lookup: iopen_lookup, /* BKL held */ ++}; ++ ++static struct file_operations iopen_file_operations = { ++ read: generic_read_dir, ++}; ++ ++static int match_dentry(struct dentry *dentry, const char *name) ++{ ++ int len; ++ ++ len = strlen(name); ++ if (dentry->d_name.len != len) ++ return 0; ++ if (strncmp(dentry->d_name.name, name, len)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * This function is spliced into ext4_lookup and returns 1 if the file ++ * name is __iopen__ and dentry has been filled in appropriately.
++ */ ++int ext4_check_for_iopen(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ ++ if (dir->i_ino != EXT4_ROOT_INO || ++ !test_opt(dir->i_sb, IOPEN) || ++ !match_dentry(dentry, "__iopen__")) ++ return 0; ++ ++ inode = ext4_iget(dir->i_sb, EXT4_BAD_INO); ++ if (IS_ERR(inode)) ++ return 0; ++ ++ d_add(dentry, inode); ++ return 1; ++} ++ ++/* ++ * This function is spliced into read_inode; it returns 1 if inode ++ * number is the one for /__iopen__, in which case the inode is filled ++ * in appropriately. Otherwise, this function returns 0. ++ */ ++int ext4_iopen_get_inode(struct inode *inode) ++{ ++ if (inode->i_ino != EXT4_BAD_INO) ++ return 0; ++ ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) ++ inode->i_mode |= 0777; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_size = 4096; ++ inode->i_atime = inode->i_ctime = inode->i_mtime = ext4_current_time(inode); ++ EXT4_I(inode)->i_dtime = 0; ++ EXT4_I(inode)->i_file_acl = 0; ++ inode->i_blocks = 0; ++ inode->i_version = 1; ++ inode->i_generation = 0; ++ ++ inode->i_op = &iopen_inode_operations; ++ inode->i_fop = &iopen_file_operations; ++ inode->i_mapping->a_ops = 0; ++ ++ if (inode->i_state & I_NEW) ++ unlock_new_inode(inode); ++ ++ return 1; ++} +Index: linux-2.6.18-128.1.6/fs/ext4/iopen.h +=================================================================== +--- /dev/null ++++ linux-2.6.18-128.1.6/fs/ext4/iopen.h +@@ -0,0 +1,16 @@ ++/* ++ * iopen.h ++ * ++ * Special support for opening files by inode number. ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License.
++ */ ++ ++extern int ext4_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext4_iopen_get_inode(struct inode *inode); ++extern struct dentry *iopen_connect_dentry(struct dentry *dentry, ++ struct inode *inode, int rehash); ++extern void iopen_d_instantiate(struct dentry *dentry, struct inode * inode); +Index: linux-2.6.18-128.1.6/fs/ext4/inode.c +=================================================================== +--- linux-2.6.18-128.1.6.orig/fs/ext4/inode.c ++++ linux-2.6.18-128.1.6/fs/ext4/inode.c +@@ -37,6 +37,7 @@ + #include + #include "ext4_jbd2.h" + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + /* +@@ -2764,6 +2765,8 @@ struct inode *ext4_iget(struct super_blo + ei->i_default_acl = EXT4_ACL_NOT_CACHED; + #endif + ei->i_block_alloc_info = NULL; ++ if (ext4_iopen_get_inode(inode)) ++ return inode; + + ret = __ext4_get_inode_loc(inode, &iloc, 0); + if (ret < 0) +Index: linux-2.6.18-128.1.6/fs/ext4/super.c +=================================================================== +--- linux-2.6.18-128.1.6.orig/fs/ext4/super.c ++++ linux-2.6.18-128.1.6/fs/ext4/super.c +@@ -888,6 +888,7 @@ enum { + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, + Opt_mballoc, Opt_nomballoc, Opt_stripe, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + }; + + static match_table_t tokens = { +@@ -938,6 +939,9 @@ static match_table_t tokens = { + {Opt_noquota, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_barrier, "barrier=%u"}, + {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, +@@ -1270,6 +1274,18 @@ clear_qf_name: + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; ++ case Opt_iopen: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_noiopen: ++ clear_opt (sbi->s_mount_opt, IOPEN); ++ 
clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_iopen_nopriv: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; + case Opt_ignore: + break; + case Opt_resize: +Index: linux-2.6.18-128.1.6/fs/ext4/namei.c +=================================================================== +--- linux-2.6.18-128.1.6.orig/fs/ext4/namei.c ++++ linux-2.6.18-128.1.6/fs/ext4/namei.c +@@ -39,6 +39,7 @@ + + #include "namei.h" + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + /* +@@ -1048,6 +1049,9 @@ static struct dentry *ext4_lookup(struct + if (dentry->d_name.len > EXT4_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + ++ if (ext4_check_for_iopen(dir, dentry)) ++ return NULL; ++ + bh = ext4_find_entry(dentry, &de); + inode = NULL; + if (bh) { +@@ -1062,7 +1066,8 @@ static struct dentry *ext4_lookup(struct + if (IS_ERR(inode)) + return ERR_CAST(inode); + } +- return d_splice_alias(inode, dentry); ++ ++ return iopen_connect_dentry(dentry, inode, 1); + } + + +@@ -1709,7 +1714,7 @@ static int ext4_add_nondir(handle_t *han + int err = ext4_add_entry(handle, dentry, inode); + if (!err) { + ext4_mark_inode_dirty(handle, inode); +- d_instantiate(dentry, inode); ++ iopen_d_instantiate(dentry, inode); + return 0; + } + drop_nlink(inode); +@@ -1868,7 +1873,7 @@ out_clear_inode: + ext4_inc_count(handle, dir); + ext4_update_dx_flag(dir); + ext4_mark_inode_dirty(handle, dir); +- d_instantiate(dentry, inode); ++ iopen_d_instantiate(dentry, inode); + out_stop: + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) +@@ -2134,10 +2139,6 @@ static int ext4_rmdir (struct inode * di + inode->i_nlink); + inode->i_version++; + clear_nlink(inode); +- /* There's no need to set i_disksize: the fact that i_nlink is +- * zero will ensure that the right thing happens during any +- * recovery. 
*/ +- inode->i_size = 0; + ext4_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); +@@ -2263,6 +2264,23 @@ out_stop: + return err; + } + ++/* Like ext4_add_nondir() except for call to iopen_connect_dentry */ ++static int ext4_add_link(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ int err = ext4_add_entry(handle, dentry, inode); ++ if (!err) { ++ err = ext4_mark_inode_dirty(handle, inode); ++ if (err == 0) { ++ dput(iopen_connect_dentry(dentry, inode, 0)); ++ return 0; ++ } ++ } ++ ext4_dec_count(handle, inode); ++ iput(inode); ++ return err; ++} ++ + static int ext4_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) + { +@@ -2293,7 +2311,8 @@ retry: + ext4_inc_count(handle, inode); + atomic_inc(&inode->i_count); + +- err = ext4_add_nondir(handle, dentry, inode); ++ err = ext4_add_link(handle, dentry, inode); ++ ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; +Index: linux-2.6.18-128.1.6/fs/ext4/Makefile +=================================================================== +--- linux-2.6.18-128.1.6.orig/fs/ext4/Makefile ++++ linux-2.6.18-128.1.6/fs/ext4/Makefile +@@ -4,7 +4,7 @@ + + obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o + +-ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o mballoc.o + +Index: linux-2.6.18-128.1.6/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.18-128.1.6.orig/fs/ext4/ext4.h ++++ linux-2.6.18-128.1.6/fs/ext4/ext4.h +@@ -18,6 +18,7 @@ + + #include + #include ++#include + #include "ext4_i.h" + + #define EXT4_SUPER_MAGIC 0xEF53 +@@ -537,6 +538,8 @@ do { \ + #define 
EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ + #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ + #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ ++#define EXT4_MOUNT_IOPEN 0x8000000 /* Allow access via iopen */ ++#define EXT4_MOUNT_IOPEN_NOPRIV 0x10000000 /* Make iopen world-readable */ + /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H + #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series new file mode 100644 index 0000000..5e90b31 --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series @@ -0,0 +1,23 @@ +ext4-wantedi-2.6-rhel5.patch +iopen-2.6.18-rhel5-ext4.patch +ext4-map_inode_page-2.6.18-rhel5.patch +export-ext4-2.6-rhel5.patch +ext4-include-fixes-2.6-rhel5.patch +ext4-ialloc-2.6-rhel5.patch +ext4-remove-cond_resched-calls-rhel5.patch +ext4-filterdata-rhel5.patch +ext4-inode-version-rhel5.patch +ext4-mmp-rhel5.patch +ext4-unlink-race-rhel5.patch +ext4-fiemap-2.6-rhel5.patch +ext4-lookup-dotdot-rhel5.patch +ext4-max-dir-size-rhel5.patch +ext4-print-inum-in-htree-warning-rhel5.patch +ext4-xattr-no-update-ctime-rhel5.patch +ext4-prealloc-rhel5.patch +ext4-mballoc-extra-checks-rhel5.patch +ext4-mballoc-handle-dev-paths-rhel5.patch +ext4-big-endian-check-2.6-rhel5.patch +ext4-alloc-policy-2.6-rhel5.patch +ext4-misc-rhel5.patch + -- 1.8.3.1