From: girish Date: Wed, 27 May 2009 02:40:03 +0000 (+0000) Subject: b=19625 X-Git-Tag: v1_9_0_200~74 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;ds=sidebyside;h=3c2439c5c244d0625756e5fdaee2941589920b1b;p=fs%2Flustre-release.git b=19625 i=adilger i=johann ext4 patches for SLES11 --- diff --git a/ldiskfs/kernel_patches/patches/export-ext4-2.6-sles11.patch b/ldiskfs/kernel_patches/patches/export-ext4-2.6-sles11.patch new file mode 100644 index 0000000..3930843 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/export-ext4-2.6-sles11.patch @@ -0,0 +1,35 @@ +Index: linux-2.6.18.i386/fs/ext4/super.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/super.c ++++ linux-2.6.18.i386/fs/ext4/super.c +@@ -185,6 +185,8 @@ void ext4_journal_abort_handle(const cha + jbd2_journal_abort_handle(handle); + } + ++EXPORT_SYMBOL(ext4_journal_abort_handle); ++ + /* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. + * +@@ -2459,6 +2461,8 @@ out_fail: + return ret; + } + ++EXPORT_SYMBOL(ext4_force_commit); ++ + /* + * Setup any per-fs journal parameters now. 
We'll do this both on + * initial mount, once the journal has been initialised but before we've +@@ -3502,6 +3506,12 @@ int ext4_map_inode_page(struct inode *in + unsigned long *blocks, int *created, int create); + EXPORT_SYMBOL(ext4_map_inode_page); + ++EXPORT_SYMBOL(ext4_xattr_get); ++EXPORT_SYMBOL(ext4_xattr_set_handle); ++EXPORT_SYMBOL(ext4_bread); ++EXPORT_SYMBOL(ext4_journal_start_sb); ++EXPORT_SYMBOL(__ext4_journal_stop); ++ + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Fourth Extended Filesystem with extents"); + MODULE_LICENSE("GPL"); diff --git a/ldiskfs/kernel_patches/patches/ext4-alloc-policy-2.6-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-alloc-policy-2.6-sles11.patch new file mode 100644 index 0000000..a39fb88 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-alloc-policy-2.6-sles11.patch @@ -0,0 +1,101 @@ +Index: linux-2.6.27.21-0.1/fs/ext4/ialloc.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ialloc.c ++++ linux-2.6.27.21-0.1/fs/ext4/ialloc.c +@@ -1005,6 +1005,36 @@ fail_drop: + return ERR_PTR(err); + } + ++unsigned long ext4_find_reverse(struct super_block *sb) ++{ ++ struct ext4_group_desc *desc; ++ struct buffer_head *bitmap_bh = NULL; ++ int group; ++ unsigned long ino, offset; ++ ++ for (offset = (EXT4_INODES_PER_GROUP(sb) >> 1); offset >= 0; ++ offset >>= 1) { ++ for (group = EXT4_SB(sb)->s_groups_count - 1; group >= 0; ++ --group) { ++ desc = ext4_get_group_desc(sb, group, NULL); ++ if (ext4_free_inodes_count(sb, desc) == 0) ++ continue; ++ ++ bitmap_bh = ext4_read_inode_bitmap(sb, group); ++ if (!bitmap_bh) ++ continue; ++ ++ ino = ext4_find_next_zero_bit((unsigned long *) ++ bitmap_bh->b_data, ++ EXT4_INODES_PER_GROUP(sb), offset); ++ if (ino < EXT4_INODES_PER_GROUP(sb)) ++ return (group * EXT4_INODES_PER_GROUP(sb) + ++ ino + 1); ++ } ++ } ++ return 0; ++} ++ + /* Verify that we are 
loading a valid orphan from disk */ + struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) + { +Index: linux-2.6.27.21-0.1/fs/ext4/namei.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/namei.c ++++ linux-2.6.27.21-0.1/fs/ext4/namei.c +@@ -151,14 +151,24 @@ struct dx_map_entry + u16 size; + }; + ++/* ++ * dentry_param used by ext4_new_inode_wantedi() ++ */ + #define LVFS_DENTRY_PARAM_MAGIC 20070216UL + struct lvfs_dentry_params + { +- unsigned long p_inum; +- void *p_ptr; +- u32 magic; ++ unsigned long ldp_inum; ++ long ldp_flags; ++ u32 ldp_magic; + }; + ++/* Only use the least 3 bits of ldp_flags for goal policy */ ++typedef enum { ++ DP_GOAL_POLICY = 0, ++ DP_LASTGROUP_REVERSE = 1, ++} dp_policy_t; ++ ++ + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); + static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); + static inline unsigned dx_get_hash(struct dx_entry *entry); +@@ -1770,8 +1780,13 @@ static struct inode * ext4_new_inode_wan + if (dentry->d_fsdata != NULL) { + struct lvfs_dentry_params *param = dentry->d_fsdata; + +- if (param->magic == LVFS_DENTRY_PARAM_MAGIC) +- inum = param->p_inum; ++ if (param->ldp_magic == LVFS_DENTRY_PARAM_MAGIC) { ++ if ((dp_policy_t)(param->ldp_flags & 0x7) == ++ DP_LASTGROUP_REVERSE) ++ inum = ext4_find_reverse(dir->i_sb); ++ else /* DP_GOAL_POLICY */ ++ inum = param->ldp_inum; ++ } + } + return ext4_new_inode(handle, dir, mode, inum); + } +Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4.h ++++ linux-2.6.27.21-0.1/fs/ext4/ext4.h +@@ -1089,6 +1089,7 @@ extern int ext4fs_dirhash(const char *na + /* ialloc.c */ + extern struct inode * ext4_new_inode(handle_t *, struct inode *, int, + unsigned long); ++extern unsigned long ext4_find_reverse(struct super_block *); + extern void ext4_free_inode(handle_t *, struct inode 
*); + extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); + extern unsigned long ext4_count_free_inodes(struct super_block *); diff --git a/ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-sles11.patch new file mode 100644 index 0000000..27bfee8 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-sles11.patch @@ -0,0 +1,56 @@ +Index: linux-2.6.27.21-0.1/fs/ext4/super.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/super.c ++++ linux-2.6.27.21-0.1/fs/ext4/super.c +@@ -74,6 +74,8 @@ static void ext4_write_super_lockfs(stru + + struct proc_dir_entry *proc_root_ext4; + ++static int bigendian_extents; ++ + ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) + { +@@ -1291,7 +1293,7 @@ enum { + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, +- Opt_inode_readahead_blks, ++ Opt_inode_readahead_blks, Opt_bigendian_extents, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + }; + +@@ -1353,6 +1355,7 @@ static const match_table_t tokens = { + {Opt_delalloc, "delalloc"}, + {Opt_nodelalloc, "nodelalloc"}, + {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, ++ {Opt_bigendian_extents, "bigendian_extents"}, + {Opt_err, NULL}, + }; + +@@ -1768,6 +1771,9 @@ set_qf_format: + return 0; + sbi->s_inode_readahead_blks = option; + break; ++ case Opt_bigendian_extents: ++ bigendian_extents = 1; ++ break; + default: + printk(KERN_ERR + "EXT4-fs: Unrecognized mount option \"%s\" " +@@ -2673,6 +2679,15 @@ static int ext4_fill_super(struct super_ + &sbi->s_inode_readahead_blks); + #endif + ++#ifdef __BIG_ENDIAN ++ if (bigendian_extents == 0) { ++ printk(KERN_ERR "EXT4-fs: extents feature is not guaranteed to " ++ "work on big-endian systems. 
Use \"bigendian_extents\" " ++ "mount option to override.\n"); ++ goto failed_mount; ++ } ++#endif ++ + bgl_lock_init(&sbi->s_blockgroup_lock); + + sbi->s_last_alloc_group = -1; diff --git a/ldiskfs/kernel_patches/patches/ext4-ext_generation-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-ext_generation-sles11.patch new file mode 100644 index 0000000..742d0ab --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-ext_generation-sles11.patch @@ -0,0 +1,48 @@ +Index: linux-2.6.27.21-0.1/fs/ext4/ext4_extents.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_extents.h ++++ linux-2.6.27.21-0.1/fs/ext4/ext4_extents.h +@@ -203,6 +203,11 @@ static inline unsigned short ext_depth(s + return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); + } + ++static inline void ext4_ext_tree_changed(struct inode *inode) ++{ ++ EXT4_I(inode)->i_ext_generation++; ++} ++ + static inline void + ext4_ext_invalidate_cache(struct inode *inode) + { +Index: linux-2.6.27.21-0.1/fs/ext4/ext4_i.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_i.h ++++ linux-2.6.27.21-0.1/fs/ext4/ext4_i.h +@@ -114,6 +114,7 @@ struct ext4_inode_info { + struct inode vfs_inode; + struct jbd2_inode jinode; + ++ unsigned long i_ext_generation; + struct ext4_ext_cache i_cached_extent; + /* + * File creation time. 
Its function is same as that of +Index: linux-2.6.27.21-0.1/fs/ext4/extents.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/extents.c ++++ linux-2.6.27.21-0.1/fs/ext4/extents.c +@@ -1618,6 +1618,7 @@ cleanup: + ext4_ext_drop_refs(npath); + kfree(npath); + } ++ ext4_ext_tree_changed(inode); + ext4_ext_invalidate_cache(inode); + return err; + } +@@ -2278,6 +2279,7 @@ static int ext4_ext_remove_space(struct + } + } + out: ++ ext4_ext_tree_changed(inode); + ext4_ext_drop_refs(path); + kfree(path); + ext4_journal_stop(handle); diff --git a/ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-sles11.patch new file mode 100644 index 0000000..149019a --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-sles11.patch @@ -0,0 +1,568 @@ +A large part of this code is from the generic VFS code in fs/ioctl.c in the +upstream kernel. + +Index: linux-2.6.27.21-0.1/fs/ext4/ioctl.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ioctl.c ++++ linux-2.6.27.21-0.1/fs/ext4/ioctl.c +@@ -18,6 +18,162 @@ + #include "ext4_jbd2.h" + #include "ext4.h" + ++#include "fiemap.h" ++ ++/* So that the fiemap access checks can't overflow on 32 bit machines. */ ++#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) ++ ++/** ++ * fiemap_fill_next_extent - Fiemap helper function ++ * @fieinfo: Fiemap context passed into ->fiemap ++ * @logical: Extent logical start offset, in bytes ++ * @phys: Extent physical start offset, in bytes ++ * @len: Extent length, in bytes ++ * @flags: FIEMAP_EXTENT flags that describe this extent ++ * @lun: LUN on which this extent resides ++ * ++ * Called from file system ->fiemap callback. Will populate extent ++ * info as passed in via arguments and copy to user memory. On ++ * success, extent count on fieinfo is incremented. 
++ * ++ * Returns 0 on success, -errno on error, 1 if this was the last ++ * extent that will fit in user array. ++ */ ++#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) ++#define SET_NO_DIRECT_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED \ ++ |FIEMAP_EXTENT_NET) ++#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) ++#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) ++int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, ++ u64 phys, u64 len, u32 flags, dev_t dev) ++{ ++ struct fiemap_extent extent = { 0 }; ++ struct fiemap_extent *dest = fieinfo->fi_extents_start; ++ ++ /* only count the extents */ ++ if (fieinfo->fi_extents_max == 0) { ++ fieinfo->fi_extents_mapped++; ++ return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; ++ } ++ ++ if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) ++ return 1; ++ ++ if (flags & SET_UNKNOWN_FLAGS) ++ flags |= FIEMAP_EXTENT_UNKNOWN; ++ if (flags & SET_NO_DIRECT_FLAGS) ++ flags |= FIEMAP_EXTENT_NO_DIRECT; ++ if (flags & SET_NOT_ALIGNED_FLAGS) ++ flags |= FIEMAP_EXTENT_NOT_ALIGNED; ++ if (flags & SET_NO_UNMOUNTED_IO_FLAGS) ++ flags |= FIEMAP_EXTENT_ENCODED; ++ ++ extent.fe_logical = logical; ++ extent.fe_physical = phys; ++ extent.fe_length = len; ++ extent.fe_flags = flags; ++ extent.fe_device = new_encode_dev(dev); ++ ++ dest += fieinfo->fi_extents_mapped; ++ if (copy_to_user(dest, &extent, sizeof(extent))) ++ return -EFAULT; ++ ++ fieinfo->fi_extents_mapped++; ++ if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) ++ return 1; ++ ++ return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; ++} ++ ++static int fiemap_check_ranges(struct super_block *sb, ++ u64 start, u64 len, u64 *new_len) ++{ ++ *new_len = len; ++ ++ if (len == 0) ++ return -EINVAL; ++ ++ if (start > sb->s_maxbytes) ++ return -EFBIG; ++ ++ /* ++ * Shrink request scope to what the fs can actually handle. 
++ */ ++ if ((len > sb->s_maxbytes) || ++ (sb->s_maxbytes - len) < start) ++ *new_len = sb->s_maxbytes - start; ++ ++ return 0; ++} ++ ++/* ++ * fiemap_check_flags - check validity of requested flags for fiemap ++ * @fieinfo: Fiemap context passed into ->fiemap ++ * @fs_flags: Set of fiemap flags that the file system understands ++ * ++ * Called from file system ->fiemap callback. This will compute the ++ * intersection of valid fiemap flags and those that the fs supports. That ++ * value is then compared against the user supplied flags. In case of bad user ++ * flags, the invalid values will be written into the fieinfo structure, and ++ * -EBADR is returned, which tells ioctl_fiemap() to return those values to ++ * userspace. For this reason, a return code of -EBADR should be preserved. ++ * ++ * Returns 0 on success, -EBADR on bad flags. ++ */ ++int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags) ++{ ++ u32 incompat_flags; ++ ++ incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags); ++ if (incompat_flags) { ++ fieinfo->fi_flags = incompat_flags; ++ return -EBADR; ++ } ++ ++ return 0; ++} ++ ++int ioctl_fiemap(struct inode *inode, struct file *filp, unsigned long arg) ++{ ++ struct fiemap fiemap; ++ u64 len; ++ struct fiemap_extent_info fieinfo = {0, }; ++ struct super_block *sb = inode->i_sb; ++ int error = 0; ++ ++ if (copy_from_user(&fiemap, (struct fiemap __user *) arg, ++ sizeof(struct fiemap))) ++ return -EFAULT; ++ ++ if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) ++ return -EINVAL; ++ ++ error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, ++ &len); ++ if (error) ++ return error; ++ ++ fieinfo.fi_flags = fiemap.fm_flags; ++ fieinfo.fi_extents_max = fiemap.fm_extent_count; ++ fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); ++ ++ if (fiemap.fm_extent_count != 0 && ++ !access_ok(VERIFY_WRITE, (void *)arg, ++ offsetof(typeof(fiemap), fm_extents[fiemap.fm_extent_count]))) ++ return 
-EFAULT; ++ ++ if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) ++ filemap_write_and_wait(inode->i_mapping); ++ ++ error = ext4_fiemap(inode, &fieinfo, fiemap.fm_start, len); ++ fiemap.fm_flags = fieinfo.fi_flags; ++ fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; ++ if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) ++ error = -EFAULT; ++ ++ return error; ++} ++ + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_dentry->d_inode; +@@ -263,6 +419,10 @@ setversion_out: + return err; + } + ++ case EXT4_IOC_FIEMAP: { ++ return ioctl_fiemap(inode, filp, arg); ++ } ++ + default: + return -ENOTTY; + } +Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4.h ++++ linux-2.6.27.21-0.1/fs/ext4/ext4.h +@@ -302,7 +302,8 @@ struct ext4_new_group_data { + #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) + #define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) + #define EXT4_IOC_MIGRATE _IO('f', 9) +- /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ ++#define EXT4_IOC_FIEMAP _IOWR('f', 11, struct fiemap) ++ + + /* + * ioctl commands in 32 bit emulation +@@ -320,6 +321,8 @@ struct ext4_new_group_data { + #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION + #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION + ++/* FIEMAP flags supported by ext4 */ ++#define EXT4_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC) + + /* + * Mount options +@@ -1130,6 +1133,9 @@ extern int ext4_page_mkwrite(struct vm_a + /* ioctl.c */ + extern long ext4_ioctl(struct file *, unsigned int, unsigned long); + extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); ++struct fiemap_extent_info; ++extern int ext4_fiemap(struct inode *, struct fiemap_extent_info *, __u64, ++ __u64); + + /* migrate.c */ + extern int ext4_ext_migrate(struct inode *); +Index: 
linux-2.6.27.21-0.1/fs/ext4/ext4_extents.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_extents.h ++++ linux-2.6.27.21-0.1/fs/ext4/ext4_extents.h +@@ -128,6 +128,22 @@ struct ext4_ext_path { + #define EXT_MAX_BLOCK 0xffffffff + + /* ++ * to be called by ext4_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext4_ext_walk_space(), see below ++ * callback must return valid extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, ++ struct ext4_ext_cache *, ++ struct ext4_extent *, void *); ++ ++#define HAVE_EXT_PREPARE_CB_EXTENT ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++/* + * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an + * initialized extent. This is 2^15 and not (2^16 - 1), since we use the + * MSB of ee_len field in the extent datastructure to signify if this +@@ -219,6 +235,8 @@ extern int ext4_ext_try_to_merge(struct + struct ext4_extent *); + extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); + extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); ++extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, ++ ext_prepare_callback, void *); + extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *); + extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, +Index: linux-2.6.27.21-0.1/fs/ext4/extents.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/extents.c ++++ linux-2.6.27.21-0.1/fs/ext4/extents.c +@@ -42,7 +42,7 @@ + #include + #include "ext4_jbd2.h" + #include "ext4_extents.h" +- ++#include "fiemap.h" + + /* + * ext_pblock: +@@ -1622,6 +1622,113 @@ cleanup: + return err; + } + ++int 
ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, ++ ext4_lblk_t num, ext_prepare_callback func, ++ void *cbdata) ++{ ++ struct ext4_ext_path *path = NULL; ++ struct ext4_ext_cache cbex; ++ struct ext4_extent *ex; ++ ext4_lblk_t next, start = 0, end = 0; ++ ext4_lblk_t last = block + num; ++ int depth, exists, err = 0; ++ ++ BUG_ON(func == NULL); ++ BUG_ON(inode == NULL); ++ ++ while (block < last && block != EXT_MAX_BLOCK) { ++ num = last - block; ++ /* find extent for this block */ ++ path = ext4_ext_find_extent(inode, block, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ break; ++ } ++ ++ depth = ext_depth(inode); ++ BUG_ON(path[depth].p_hdr == NULL); ++ ex = path[depth].p_ext; ++ next = ext4_ext_next_allocated_block(path); ++ ++ exists = 0; ++ if (!ex) { ++ /* there is no extent yet, so try to allocate ++ * all requested space */ ++ start = block; ++ end = block + num; ++ } else if (le32_to_cpu(ex->ee_block) > block) { ++ /* need to allocate space before found extent */ ++ start = block; ++ end = le32_to_cpu(ex->ee_block); ++ if (block + num < end) ++ end = block + num; ++ } else if (block >= le32_to_cpu(ex->ee_block) ++ + ext4_ext_get_actual_len(ex)) { ++ /* need to allocate space after found extent */ ++ start = block; ++ end = block + num; ++ if (end >= next) ++ end = next; ++ } else if (block >= le32_to_cpu(ex->ee_block)) { ++ /* ++ * some part of requested space is covered ++ * by found extent ++ */ ++ start = block; ++ end = le32_to_cpu(ex->ee_block) ++ + ext4_ext_get_actual_len(ex); ++ if (block + num < end) ++ end = block + num; ++ exists = 1; ++ } else { ++ BUG(); ++ } ++ BUG_ON(end <= start); ++ ++ if (!exists) { ++ cbex.ec_block = start; ++ cbex.ec_len = end - start; ++ cbex.ec_start = 0; ++ cbex.ec_type = EXT4_EXT_CACHE_GAP; ++ } else { ++ cbex.ec_block = le32_to_cpu(ex->ee_block); ++ cbex.ec_len = ext4_ext_get_actual_len(ex); ++ cbex.ec_start = ext_pblock(ex); ++ cbex.ec_type = EXT4_EXT_CACHE_EXTENT; ++ } ++ ++ 
BUG_ON(cbex.ec_len == 0); ++ err = func(inode, path, &cbex, ex, cbdata); ++ ext4_ext_drop_refs(path); ++ ++ if (err < 0) ++ break; ++ ++ if (err == EXT_REPEAT) ++ continue; ++ else if (err == EXT_BREAK) { ++ err = 0; ++ break; ++ } ++ ++ if (ext_depth(inode) != depth) { ++ /* depth was changed. we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ block = cbex.ec_block + cbex.ec_len; ++ } ++ ++ if (path) { ++ ext4_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ return err; ++} ++ + static void + ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, + __u32 len, ext4_fsblk_t start, int type) +@@ -2966,3 +3073,100 @@ retry: + mutex_unlock(&inode->i_mutex); + return ret > 0 ? ret2 : ret; + } ++ ++/* ++ * Callback function called for each extent to gather FIEMAP information. ++ */ ++int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, ++ struct ext4_ext_cache *newex, struct ext4_extent *ex, ++ void *data) ++{ ++ struct fiemap_extent_info *fieinfo = data; ++ unsigned long blksize_bits = inode->i_sb->s_blocksize_bits; ++ __u64 logical; ++ __u64 physical; ++ __u64 length; ++ __u32 flags = 0; ++ int error; ++ ++ logical = (__u64)newex->ec_block << blksize_bits; ++ ++ if (newex->ec_type == EXT4_EXT_CACHE_GAP) { ++ pgoff_t offset; ++ struct page *page; ++ struct buffer_head *bh = NULL; ++ ++ offset = logical >> PAGE_SHIFT; ++ page = find_get_page(inode->i_mapping, offset); ++ if (!page || !page_has_buffers(page)) ++ return EXT_CONTINUE; ++ ++ bh = page_buffers(page); ++ ++ if (!bh) ++ return EXT_CONTINUE; ++ ++ if (buffer_delay(bh)) { ++ flags |= FIEMAP_EXTENT_DELALLOC; ++ page_cache_release(page); ++ } else { ++ page_cache_release(page); ++ return EXT_CONTINUE; ++ } ++ } ++ ++ physical = (__u64)newex->ec_start << blksize_bits; ++ length = (__u64)newex->ec_len << blksize_bits; ++ ++ if (ex && ext4_ext_is_uninitialized(ex)) ++ flags |= FIEMAP_EXTENT_UNWRITTEN; ++ ++ /* ++ * If this extent reaches EXT_MAX_BLOCK, it must be last. 
++ * ++ * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK, ++ * this indicates no more allocated blocks. ++ * ++ * XXX this might miss a single-block extent at EXT_MAX_BLOCK ++ */ ++ if (logical + length - 1 == EXT_MAX_BLOCK || ++ ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK) ++ flags |= FIEMAP_EXTENT_LAST; ++ ++ error = fiemap_fill_next_extent(fieinfo, logical, physical, ++ length, flags, inode->i_sb->s_dev); ++ if (error < 0) ++ return error; ++ if (error == 1) ++ return EXT_BREAK; ++ ++ return EXT_CONTINUE; ++} ++ ++int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ++ __u64 start, __u64 len) ++{ ++ ext4_fsblk_t start_blk; ++ ext4_fsblk_t len_blks; ++ int error = 0; ++ ++ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) ++ return -EOPNOTSUPP; ++ ++ if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS_COMPAT)) ++ return -EBADR; ++ ++ start_blk = start >> inode->i_sb->s_blocksize_bits; ++ len_blks = (len + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; ++ ++ /* ++ * Walk the extent tree gathering extent information. ++ * ext4_ext_fiemap_cb will push extents back to user. ++ */ ++ down_write(&EXT4_I(inode)->i_data_sem); ++ error = ext4_ext_walk_space(inode, start_blk, len_blks, ++ ext4_ext_fiemap_cb, fieinfo); ++ up_write(&EXT4_I(inode)->i_data_sem); ++ ++ return error; ++} +Index: linux-2.6.27.21-0.1/fs/ext4/fiemap.h +=================================================================== +--- /dev/null ++++ linux-2.6.27.21-0.1/fs/ext4/fiemap.h +@@ -0,0 +1,85 @@ ++/* ++ * FIEMAP ioctl infrastructure. 
++ * ++ * Copyright 2008 Sun Microsystems, Inc ++ * ++ * Author: Kalpak Shah ++ * Andreas Dilger ++ */ ++ ++#ifndef _LINUX_EXT4_FIEMAP_H ++#define _LINUX_EXT4_FIEMAP_H ++ ++struct fiemap_extent { ++ __u64 fe_logical; /* logical offset in bytes for the start of ++ * the extent from the beginning of the file */ ++ __u64 fe_physical; /* physical offset in bytes for the start ++ * of the extent from the beginning of the disk */ ++ __u64 fe_length; /* length in bytes for this extent */ ++ __u64 fe_reserved64[2]; ++ __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ ++ __u32 fe_device; /* device number for this extent */ ++ __u32 fe_reserved[2]; ++}; ++ ++struct fiemap { ++ __u64 fm_start; /* logical offset (inclusive) at ++ * which to start mapping (in) */ ++ __u64 fm_length; /* logical length of mapping which ++ * userspace wants (in) */ ++ __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ ++ __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ ++ __u32 fm_extent_count; /* size of fm_extents array (in) */ ++ __u32 fm_reserved; ++ struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ ++}; ++ ++/* ++ * FIEMAP helper definition. 
++ */ ++struct fiemap_extent_info { ++ unsigned int fi_flags; /* Flags as passed from user */ ++ unsigned int fi_extents_mapped; /* Number of mapped extents */ ++ unsigned int fi_extents_max; /* Size of fiemap_extent array*/ ++ struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */ ++}; ++ ++int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); ++int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, ++ u64 phys, u64 len, u32 flags, u32 lun); ++ ++#define FIEMAP_MAX_OFFSET (~0ULL) ++ ++#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ ++#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ ++ ++/* ldiskfs only supports FLAG_SYNC flag currently */ ++#define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) ++ ++#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */ ++#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ ++#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. ++ * Sets EXTENT_UNKNOWN. */ ++#define FIEMAP_EXTENT_ENCODED 0x00000008 /* Data can not be read ++ * while fs is unmounted */ ++#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs. ++ * Sets EXTENT_NO_DIRECT. */ ++#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be ++ * block aligned. */ ++#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata. ++ * Sets EXTENT_NOT_ALIGNED.*/ ++#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block. ++ * Sets EXTENT_NOT_ALIGNED.*/ ++#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but ++ * no data (i.e. zero). */ ++#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively ++ * support extents. Result ++ * merged for efficiency. 
*/ ++ ++/* Lustre specific flags - use a high bit, don't conflict with upstream flag */ ++#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */ ++#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. ++ * Sets NO_DIRECT flag */ ++ ++#endif /* _LINUX_EXT4_FIEMAP_H */ ++ diff --git a/ldiskfs/kernel_patches/patches/ext4-filterdata-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-filterdata-sles11.patch new file mode 100644 index 0000000..ea83bcd --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-filterdata-sles11.patch @@ -0,0 +1,25 @@ +Index: linux-2.6.27.21-0.1/fs/ext4/ext4_i.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_i.h ++++ linux-2.6.27.21-0.1/fs/ext4/ext4_i.h +@@ -135,6 +135,8 @@ struct ext4_inode_info { + __u16 i_extra_isize; + + spinlock_t i_block_reservation_lock; ++ ++ void *i_filterdata; + }; + + #endif /* _EXT4_I */ +Index: linux-2.6.27.21-0.1/fs/ext4/super.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/super.c ++++ linux-2.6.27.21-0.1/fs/ext4/super.c +@@ -624,6 +624,7 @@ static struct inode *ext4_alloc_inode(st + memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); + INIT_LIST_HEAD(&ei->i_prealloc_list); + spin_lock_init(&ei->i_prealloc_lock); ++ ei->i_filterdata = NULL; + jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); + ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; diff --git a/ldiskfs/kernel_patches/patches/ext4-ialloc-2.6-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-ialloc-2.6-sles11.patch new file mode 100644 index 0000000..1423331 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-ialloc-2.6-sles11.patch @@ -0,0 +1,129 @@ +Index: linux-2.6.27.21-0.1/fs/ext4/ialloc.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ialloc.c ++++ linux-2.6.27.21-0.1/fs/ext4/ialloc.c +@@ -535,12 
+535,16 @@ fallback: + } + + static int find_group_other(struct super_block *sb, struct inode *parent, +- ext4_group_t *group) ++ ext4_group_t *group, int mode) + { ++ struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; +- ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; ++ ext4_group_t ngroups = sbi->s_groups_count; + struct ext4_group_desc *desc; + ext4_group_t i; ++ int best_group = -1; ++ ext4_fsblk_t avefreeb, freeb; ++ int best_group_freeb = 0; + + /* + * Try to place the inode in its parent directory +@@ -548,8 +552,10 @@ static int find_group_other(struct super + *group = parent_group; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc) && +- ext4_free_blks_count(sb, desc)) ++ (!S_ISREG(mode) || ext4_free_blks_count(sb, desc))) + return 0; ++ avefreeb = ext4_free_blocks_count(sbi->s_es); ++ do_div(avefreeb, ngroups); + + /* + * We're going to place this inode in a different blockgroup from its +@@ -563,33 +569,49 @@ static int find_group_other(struct super + *group = (*group + parent->i_ino) % ngroups; + + /* +- * Use a quadratic hash to find a group with a free inode and some free +- * blocks. ++ * Use a quadratic hash to find a group with a free inode and ++ * average number of free blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + *group += i; + if (*group >= ngroups) + *group -= ngroups; + desc = ext4_get_group_desc(sb, *group, NULL); +- if (desc && ext4_free_inodes_count(sb, desc) && +- ext4_free_blks_count(sb, desc)) ++ if (!desc || ext4_free_inodes_count(sb, desc)) ++ continue; ++ if (!S_ISREG(mode)) ++ return 0; ++ if (ext4_free_blks_count(sb, desc) >= avefreeb) + return 0; + } + + /* +- * That failed: try linear search for a free inode, even if that group +- * has no free blocks. ++ * That failed: start from last group used to allocate inode ++ * try linear search for a free inode and prefereably ++ * free blocks. 
+ */ +- *group = parent_group; ++ *group = sbi->s_last_alloc_group; ++ if (*group == -1) ++ *group = parent_group; ++ + for (i = 0; i < ngroups; i++) { + if (++*group >= ngroups) + *group = 0; + desc = ext4_get_group_desc(sb, *group, NULL); +- if (desc && ext4_free_inodes_count(sb, desc)) +- return 0; ++ if (!desc || ext4_free_inodes_count(sb, desc)) ++ continue; ++ freeb = ext4_free_blks_count(sb, desc); ++ if (freeb > best_group_freeb) { ++ best_group_freeb = freeb; ++ best_group = *group; ++ if (freeb >= avefreeb || !S_ISREG(mode)) ++ break; ++ } + } + +- return -1; ++ sbi->s_last_alloc_group = best_group; ++ *group = best_group; ++ return 0; + } + + /* +@@ -755,7 +777,7 @@ continue_allocation: + else + ret2 = find_group_orlov(sb, dir, &group); + } else +- ret2 = find_group_other(sb, dir, &group); ++ ret2 = find_group_other(sb, dir, &group, mode); + + got_group: + err = -ENOSPC; +Index: linux-2.6.27.21-0.1/fs/ext4/super.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/super.c ++++ linux-2.6.27.21-0.1/fs/ext4/super.c +@@ -2300,6 +2300,7 @@ static int ext4_fill_super(struct super_ + + bgl_lock_init(&sbi->s_blockgroup_lock); + ++ sbi->s_last_alloc_group = -1; + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logical_sb_block, i); + sbi->s_group_desc[i] = sb_bread(sb, block); +Index: linux-2.6.27.21-0.1/fs/ext4/ext4_sb.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_sb.h ++++ linux-2.6.27.21-0.1/fs/ext4/ext4_sb.h +@@ -64,6 +64,8 @@ struct ext4_sb_info { + struct percpu_counter s_dirtyblocks_counter; + struct blockgroup_lock s_blockgroup_lock; + struct proc_dir_entry *s_proc; ++ /* Last group used to allocate inode */ ++ ext4_group_t s_last_alloc_group; + + /* root of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; diff --git a/ldiskfs/kernel_patches/patches/ext4-include-fixes-2.6-sles11.patch 
b/ldiskfs/kernel_patches/patches/ext4-include-fixes-2.6-sles11.patch new file mode 100644 index 0000000..0009eaa --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-include-fixes-2.6-sles11.patch @@ -0,0 +1,20 @@ +Index: linux-2.6.18.i386/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ext4.h ++++ linux-2.6.18.i386/fs/ext4/ext4.h +@@ -541,12 +541,13 @@ do { \ + #define EXT4_MOUNT_IOPEN 0x8000000 /* Allow access via iopen */ + #define EXT4_MOUNT_IOPEN_NOPRIV 0x10000000 /* Make iopen world-readable */ + /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ +-#ifndef _LINUX_EXT2_FS_H ++#ifndef clear_opt + #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt + #define set_opt(o, opt) o |= EXT4_MOUNT_##opt + #define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) +-#else ++#endif ++#ifndef EXT2_MOUNT_NOLOAD + #define EXT2_MOUNT_NOLOAD EXT4_MOUNT_NOLOAD + #define EXT2_MOUNT_ABORT EXT4_MOUNT_ABORT + #define EXT2_MOUNT_DATA_FLAGS EXT4_MOUNT_DATA_FLAGS diff --git a/ldiskfs/kernel_patches/patches/ext4-lookup-dotdot-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-lookup-dotdot-sles11.patch new file mode 100644 index 0000000..af019fa --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-lookup-dotdot-sles11.patch @@ -0,0 +1,63 @@ +Index: linux-2.6.18.i386/fs/ext4/iopen.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/iopen.c ++++ linux-2.6.18.i386/fs/ext4/iopen.c +@@ -91,9 +91,12 @@ static struct dentry *iopen_lookup(struc + assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); + } + +- if (!list_empty(&inode->i_dentry)) { +- alternate = list_entry(inode->i_dentry.next, +- struct dentry, d_alias); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, struct dentry, d_alias); ++ /* ignore dentries created for ".." 
to preserve ++ * proper dcache hierarchy -- bug 10458 */ ++ if (alternate->d_flags & DCACHE_NFSFS_RENAMED) ++ continue; + dget_locked(alternate); + spin_lock(&alternate->d_lock); + alternate->d_flags |= DCACHE_REFERENCED; +Index: linux-2.6.18.i386/fs/ext4/namei.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/namei.c ++++ linux-2.6.18.i386/fs/ext4/namei.c +@@ -1067,6 +1067,38 @@ static struct dentry *ext4_lookup(struct + return ERR_CAST(inode); + } + ++ /* ".." shouldn't go into dcache to preserve dcache hierarchy ++ * otherwise we'll get parent being a child of actual child. ++ * see bug 10458 for details -bzzz */ ++ if (inode && (dentry->d_name.name[0] == '.' && (dentry->d_name.len == 1 || ++ (dentry->d_name.len == 2 && dentry->d_name.name[1] == '.')))) { ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* first, look for an existing dentry - any one is good */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ if (goal == NULL) { ++ /* there is no alias, we need to make current dentry: ++ * a) inaccessible for __d_lookup() ++ * b) inaccessible for iopen */ ++ J_ASSERT(list_empty(&dentry->d_alias)); ++ dentry->d_flags |= DCACHE_NFSFS_RENAMED; ++ /* this is d_instantiate() ... 
*/ ++ list_add(&dentry->d_alias, &inode->i_dentry); ++ dentry->d_inode = inode; ++ } ++ spin_unlock(&dcache_lock); ++ if (goal) ++ iput(inode); ++ return goal; ++ } ++ + return iopen_connect_dentry(dentry, inode, 1); + } + diff --git a/ldiskfs/kernel_patches/patches/ext4-map_inode_page-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-map_inode_page-sles11.patch new file mode 100644 index 0000000..4ed87f0 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-map_inode_page-sles11.patch @@ -0,0 +1,86 @@ +Index: linux-2.6.18.i386/fs/ext4/inode.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/inode.c ++++ linux-2.6.18.i386/fs/ext4/inode.c +@@ -3666,3 +3666,66 @@ out_unlock: + unlock_page(page); + return ret; + } ++ ++int ext4_map_inode_page(struct inode *inode, struct page *page, ++ unsigned long *blocks, int *created, int create) ++{ ++ unsigned int blocksize, blocks_per_page; ++ unsigned long iblock; ++ struct buffer_head dummy; ++ void *handle; ++ int i, rc = 0, failed = 0, needed_blocks; ++ ++ blocksize = inode->i_sb->s_blocksize; ++ blocks_per_page = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; ++ iblock = page->index * blocks_per_page; ++ ++ for (i = 0; i < blocks_per_page; i++, iblock++) { ++ blocks[i] = ext4_bmap(inode->i_mapping, iblock); ++ if (blocks[i] == 0) { ++ failed++; ++ if (created) ++ created[i] = -1; ++ } else if (created) { ++ created[i] = 0; ++ } ++ } ++ ++ if (failed == 0 || create == 0) ++ return 0; ++ ++ needed_blocks = ext4_writepage_trans_blocks(inode); ++ handle = ext4_journal_start(inode, needed_blocks); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ iblock = page->index * blocks_per_page; ++ for (i = 0; i < blocks_per_page; i++, iblock++) { ++ if (blocks[i] != 0) ++ continue; ++ ++ rc = ext4_get_blocks_handle(handle, inode, iblock, 1, &dummy, 1, 1); ++ if (rc < 0) { ++ printk(KERN_INFO "ext4_map_inode_page: error reading " ++ "block %ld\n", iblock); ++ goto out; ++ } 
else { ++ if (rc > 1) ++ WARN_ON(1); ++ rc = 0; ++ } ++ /* Unmap any metadata buffers from the block mapping, to avoid ++ * data corruption due to direct-write from Lustre being ++ * clobbered by a later flush of the blockdev metadata buffer.*/ ++ if (buffer_new(&dummy)) ++ unmap_underlying_metadata(dummy.b_bdev, ++ dummy.b_blocknr); ++ blocks[i] = dummy.b_blocknr; ++ if (created) ++ created[i] = 1; ++ } ++ ++out: ++ ext4_journal_stop(handle); ++ return rc; ++} +Index: linux-2.6.18.i386/fs/ext4/super.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/super.c ++++ linux-2.6.18.i386/fs/ext4/super.c +@@ -3498,6 +3498,10 @@ static void __exit exit_ext4_fs(void) + __free_page(ext4_zero_page); + } + ++int ext4_map_inode_page(struct inode *inode, struct page *page, ++ unsigned long *blocks, int *created, int create); ++EXPORT_SYMBOL(ext4_map_inode_page); ++ + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Fourth Extended Filesystem with extents"); + MODULE_LICENSE("GPL"); diff --git a/ldiskfs/kernel_patches/patches/ext4-max-dir-size-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-max-dir-size-sles11.patch new file mode 100644 index 0000000..0f758d4 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-max-dir-size-sles11.patch @@ -0,0 +1,189 @@ +Index: linux-2.6.27.21-0.1/fs/ext4/ialloc.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ialloc.c ++++ linux-2.6.27.21-0.1/fs/ext4/ialloc.c +@@ -721,12 +721,15 @@ struct inode *ext4_new_inode(handle_t *h + return ERR_PTR(-EPERM); + + sb = dir->i_sb; ++ sbi = EXT4_SB(sb); ++ if (sbi->s_max_dir_size > 0 && i_size_read(dir) >= sbi->s_max_dir_size) ++ return ERR_PTR(-EFBIG); ++ + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT4_I(inode); + +- sbi = EXT4_SB(sb); + es = sbi->s_es; + + if (goal) { +Index: 
linux-2.6.27.21-0.1/fs/ext4/super.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/super.c ++++ linux-2.6.27.21-0.1/fs/ext4/super.c +@@ -41,6 +41,7 @@ + #include + #include + #include ++#include + + #include "ext4.h" + #include "ext4_jbd2.h" +@@ -71,6 +72,8 @@ static void ext4_write_super(struct supe + static void ext4_write_super_lockfs(struct super_block *sb); + + ++struct proc_dir_entry *proc_root_ext4; ++ + ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) + { +@@ -602,6 +605,9 @@ static void ext4_put_super(struct super_ + } + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); ++ ++ remove_proc_entry(EXT4_MAX_DIR_SIZE_NAME, sbi->s_proc); ++ + sb->s_fs_info = NULL; + kfree(sbi); + return; +@@ -2287,6 +2293,46 @@ static unsigned long ext4_get_stripe_siz + return 0; + } + ++static int ext4_max_dir_size_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ struct ext4_sb_info *sbi = data; ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%lu\n", sbi->s_max_dir_size); ++ *start = page; ++ return len; ++} ++ ++static int ext4_max_dir_size_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ struct ext4_sb_info *sbi = data; ++ char str[32]; ++ long value; /* signed so the negative-input check below can fire */ ++ char *end; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n", ++ EXT4_MAX_DIR_SIZE_NAME, (unsigned int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ str[count] = '\0'; /* count < sizeof(str); terminate before parsing */ ++ value = simple_strtol(str, &end, 0); ++ if (value < 0) ++ return -ERANGE; ++ ++ sbi->s_max_dir_size = value; ++ return count; ++} ++ + static int ext4_fill_super(struct super_block *sb, void *data, int silent) + __releases(kernel_lock) + 
__acquires(kernel_lock) +@@ -2311,6 +2357,7 @@ static int ext4_fill_super(struct super_ + int needs_recovery, has_huge_files;
+ int features; + __u64 blocks_count; ++ struct proc_dir_entry *proc; + int err; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); +@@ -2881,6 +2928,22 @@ static int ext4_fill_super(struct super_ + test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": + "writeback"); + ++ sbi->s_max_dir_size = EXT4_DEFAULT_MAX_DIR_SIZE; ++ proc = create_proc_entry(EXT4_MAX_DIR_SIZE_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, sbi->s_proc); ++ if (proc == NULL) { ++ printk(KERN_ERR "EXT4-fs: unable to create %s\n", ++ EXT4_MAX_DIR_SIZE_NAME); ++ remove_proc_entry(EXT4_MAX_DIR_SIZE_NAME, sbi->s_proc); ++ remove_proc_entry(sbi->s_proc->name, proc_root_ext4); ++ sbi->s_proc = NULL; ++ ret = -ENOMEM; ++ goto failed_mount4; ++ } ++ proc->data = sbi; ++ proc->read_proc = ext4_max_dir_size_read; ++ proc->write_proc = ext4_max_dir_size_write; ++ + lock_kernel(); + return 0; + +@@ -3254,7 +3317,6 @@ static void ext4_commit_super(struct sup + } + } + +- + /* + * Have we just finished recovery? If so, and if we are mounting (or + * remounting) the filesystem readonly, then we will end up with a +Index: linux-2.6.27.21-0.1/fs/ext4/ext4_sb.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_sb.h ++++ linux-2.6.27.21-0.1/fs/ext4/ext4_sb.h +@@ -120,6 +120,7 @@ struct ext4_sb_info { + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; ++ unsigned long s_max_dir_size; + + /* history to debug policy */ + struct ext4_mb_history *s_mb_history; +Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4.h ++++ linux-2.6.27.21-0.1/fs/ext4/ext4.h +@@ -1017,6 +1017,14 @@ struct mmp_struct { + */ + #define EXT4_MMP_MIN_CHECK_INTERVAL 5 + ++extern struct proc_dir_entry *proc_root_ext4; ++ ++/* ++ * max directory size tunable ++ */ ++#define EXT4_DEFAULT_MAX_DIR_SIZE 0 ++#define 
EXT4_MAX_DIR_SIZE_NAME "max_dir_size" ++ + /* + * Function prototypes + */ +Index: linux-2.6.27.21-0.1/fs/ext4/mballoc.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/mballoc.c ++++ linux-2.6.27.21-0.1/fs/ext4/mballoc.c +@@ -2943,6 +2943,7 @@ err_out: + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); ++ remove_proc_entry(EXT4_MAX_DIR_SIZE_NAME, sbi->s_proc); + return -ENOMEM; + #else + return 0; +@@ -2963,6 +2964,7 @@ static int ext4_mb_destroy_per_dev_proc( + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); ++ remove_proc_entry(EXT4_MAX_DIR_SIZE_NAME, sbi->s_proc); + #endif + return 0; + } diff --git a/ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-sles11.patch new file mode 100644 index 0000000..be8d9f3 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-sles11.patch @@ -0,0 +1,270 @@ +Index: linux-2.6.27.21-0.1/fs/ext4/mballoc.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/mballoc.c ++++ linux-2.6.27.21-0.1/fs/ext4/mballoc.c +@@ -333,7 +333,7 @@ + static struct kmem_cache *ext4_pspace_cachep; + static struct kmem_cache *ext4_ac_cachep; + static struct kmem_cache *ext4_free_ext_cachep; +-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); + static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); +@@ -672,7 +672,7 @@ static void ext4_mb_mark_free_simple(str + } + } + +-static void ext4_mb_generate_buddy(struct super_block *sb, 
++static int ext4_mb_generate_buddy(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); +@@ -704,14 +704,13 @@ static void ext4_mb_generate_buddy(struc + grp->bb_fragments = fragments; + + if (free != grp->bb_free) { +- ext4_grp_locked_error(sb, group, __func__, +- "EXT4-fs: group %u: %u blocks in bitmap, %u in gd\n", +- group, free, grp->bb_free); +- /* +- * If we intent to continue, we consider group descritor +- * corrupt and update bb_free using bitmap value +- */ +- grp->bb_free = free; ++ struct ext4_group_desc *gdp; ++ gdp = ext4_get_group_desc (sb, group, NULL); ++ ext4_grp_locked_error(sb, group, __func__, ++ "group %u: %u blocks in bitmap, %u in bb, " ++ "%u in gd\n", group, free, grp->bb_free, ++ ext4_free_blks_count(sb, gdp)); ++ return -EIO; + } + + clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); +@@ -721,6 +720,8 @@ static void ext4_mb_generate_buddy(struc + EXT4_SB(sb)->s_mb_buddies_generated++; + EXT4_SB(sb)->s_mb_generation_time += period; + spin_unlock(&EXT4_SB(sb)->s_bal_lock); ++ ++ return 0; + } + + /* The buddy information is attached the buddy cache inode +@@ -850,7 +851,7 @@ static int ext4_mb_init_cache(struct pag + first_block = page->index * blocks_per_page; + /* init the page */ + memset(page_address(page), 0xff, PAGE_CACHE_SIZE); +- for (i = 0; i < blocks_per_page; i++) { ++ for (i = 0; i < blocks_per_page && err == 0; i++) { + int group; + struct ext4_group_info *grinfo; + +@@ -884,7 +885,7 @@ static int ext4_mb_init_cache(struct pag + * incore got set to the group block bitmap below + */ + ext4_lock_group(sb, group); +- ext4_mb_generate_buddy(sb, data, incore, group); ++ err = ext4_mb_generate_buddy(sb, data, incore, group); + ext4_unlock_group(sb, group); + incore = NULL; + } else { +@@ -898,7 +899,7 @@ static int ext4_mb_init_cache(struct pag + memcpy(data, bitmap, blocksize); + + /* mark all preallocated blks used in in-core bitmap 
*/ +- ext4_mb_generate_from_pa(sb, data, group); ++ err = ext4_mb_generate_from_pa(sb, data, group); + ext4_mb_generate_from_freelist(sb, data, group); + ext4_unlock_group(sb, group); + +@@ -908,6 +909,7 @@ static int ext4_mb_init_cache(struct pag + incore = data; + } + } ++ if (likely(err == 0)) + SetPageUptodate(page); + + out: +@@ -2217,7 +2219,10 @@ static int ext4_mb_seq_history_show(stru + hs->result.fe_start, hs->result.fe_len); + seq_printf(seq, "%-5u %-8u %-23s free\n", + hs->pid, hs->ino, buf2); ++ } else { ++ seq_printf(seq, "unknown op %d\n", hs->op); + } ++ + return 0; + } + +@@ -2345,9 +2350,11 @@ static void *ext4_mb_seq_groups_next(str + static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + { + struct super_block *sb = seq->private; ++ struct ext4_group_desc *gdp; + ext4_group_t group = (ext4_group_t) ((unsigned long) v); + int i; + int err; ++ unsigned free = 0; + struct ext4_buddy e4b; + struct sg { + struct ext4_group_info info; +@@ -2356,10 +2363,10 @@ static int ext4_mb_seq_groups_show(struc + + group--; + if (group == 0) +- seq_printf(seq, "#%-5s: %-5s %-5s %-5s " ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s %-5s %-5s" + "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " + "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", +- "group", "free", "frags", "first", ++ "group", "free", "frags", "first", "first", "pa", + "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", + "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); + +@@ -2371,12 +2378,18 @@ static int ext4_mb_seq_groups_show(struc + seq_printf(seq, "#%-5u: I/O error\n", group); + return 0; + } ++ ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ if (gdp != NULL) ++ free = ext4_free_blks_count(sb, gdp); ++ + ext4_lock_group(sb, group); + memcpy(&sg, ext4_get_group_info(sb, group), i); + ext4_unlock_group(sb, group); + ext4_mb_release_desc(&e4b); + +- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, ++ seq_printf(seq, "#%-5u: %-5u %-5u %-5u %-5u [", group, ++ sg.info.bb_free, free, + 
sg.info.bb_fragments, sg.info.bb_first_free); + for (i = 0; i <= 13; i++) + seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? +@@ -2474,6 +2487,7 @@ ext4_mb_store_history(struct ext4_alloca + h.tail = ac->ac_tail; + h.buddy = ac->ac_buddy; + h.merged = 0; ++ h.cr = ac->ac_criteria; + if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) { + if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && + ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) +@@ -3689,22 +3703,66 @@ static void ext4_mb_generate_from_freeli + } + + /* ++ * check free blocks in bitmap match free block in group descriptor ++ * do this before taking preallocated blocks into account to be able ++ * to detect on-disk corruptions ++ */ ++int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap, ++ struct ext4_group_desc *gdp, int group) ++{ ++ unsigned int max = EXT4_BLOCKS_PER_GROUP(sb); ++ unsigned int i, first, free = 0; ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ ++ while (i < max) { ++ first = i; ++ i = mb_find_next_bit(bitmap, max, i); /* same bit helpers as the zero-bit scan */ ++ if (i > max) ++ i = max; ++ free += i - first; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ ++ if (free != ext4_free_blks_count(sb, gdp)) { ++ ext4_error(sb, __FUNCTION__, "on-disk bitmap for group %d " ++ "corrupted: %u blocks free in bitmap, %u - in gd\n", ++ group, free, ext4_free_blks_count(sb, gdp)); ++ return -EIO; ++ } ++ return 0; ++} ++ ++/* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap.
buddy must be generated from this bitmap + * Need to be called with ext4 group lock (ext4_lock_group) + */ +-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_prealloc_space *pa; ++ struct ext4_group_desc *gdp; + struct list_head *cur; + ext4_group_t groupnr; + ext4_grpblk_t start; + int preallocated = 0; + int count = 0; ++ int skip = 0; ++ int err; + int len; + ++ gdp = ext4_get_group_desc (sb, group, NULL); ++ if (gdp == NULL) ++ return -EIO; ++ ++ /* before applying preallocations, check bitmap consistency */ ++ err = ext4_mb_check_ondisk_bitmap(sb, bitmap, gdp, group); ++ if (err) ++ return err; ++ + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. + * we don't need any locking here +@@ -3720,8 +3778,10 @@ static void ext4_mb_generate_from_pa(str + &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); +- if (unlikely(len == 0)) ++ if (unlikely(len == 0)) { ++ skip++; + continue; ++ } + BUG_ON(groupnr != group); + mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), + bitmap, start, len); +@@ -3729,6 +3789,7 @@ static void ext4_mb_generate_from_pa(str + count++; + } + mb_debug("prellocated %u for group %u\n", preallocated, group); ++ return 0; + } + + static void ext4_mb_pa_callback(struct rcu_head *head) +@@ -3978,6 +4039,7 @@ ext4_mb_release_inode_pa(struct ext4_bud + ac->ac_sb = sb; + ac->ac_inode = pa->pa_inode; + ac->ac_op = EXT4_MB_HISTORY_DISCARD; ++ ac->ac_o_ex.fe_len = 1; + } + + while (bit < end) { +@@ -4260,7 +4322,7 @@ repeat: + __release(e4b->alloc_semp); + ext4_error(sb, __func__, "Error in loading buddy " + "information for %u\n", group); +- continue; ++ return; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group); +Index: linux-2.6.27.21-0.1/fs/ext4/mballoc.h 
+=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/mballoc.h ++++ linux-2.6.27.21-0.1/fs/ext4/mballoc.h +@@ -219,7 +219,7 @@ struct ext4_mb_history { + __u16 tail; /* what tail broke some buddy */ + __u16 buddy; /* buddy the tail ^^^ broke */ + __u16 flags; +- __u8 cr:3; /* which phase the result extent was found at */ ++ __u8 cr:8; /* which phase the result extent was found at */ + __u8 op:4; + __u8 merged:1; + }; diff --git a/ldiskfs/kernel_patches/patches/ext4-misc-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-misc-sles11.patch new file mode 100644 index 0000000..333bf01 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-misc-sles11.patch @@ -0,0 +1,297 @@ +Index: linux-2.6.27.21-0.1/fs/ext4/ext4_jbd2.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_jbd2.h ++++ linux-2.6.27.21-0.1/fs/ext4/ext4_jbd2.h +@@ -35,6 +35,9 @@ + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + || test_opt(sb, EXTENTS) ? 27U : 8U) + ++/* Indicate that EXT4_SINGLEDATA_TRANS_BLOCKS takes the sb as argument */ ++#define EXT4_SINGLEDATA_TRANS_BLOCKS_HAS_SB ++ + /* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. 
*/ +Index: linux-2.6.27.21-0.1/fs/ext4/extents.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/extents.c ++++ linux-2.6.27.21-0.1/fs/ext4/extents.c +@@ -48,7 +48,7 @@ + * ext_pblock: + * combine low and high parts of physical block number into ext4_fsblk_t + */ +-static ext4_fsblk_t ext_pblock(struct ext4_extent *ex) ++ext4_fsblk_t ext_pblock(struct ext4_extent *ex) + { + ext4_fsblk_t block; + +@@ -58,6 +58,17 @@ static ext4_fsblk_t ext_pblock(struct ex + } + + /* ++ * ext4_ext_store_pblock: ++ * stores a large physical block number into an extent struct, ++ * breaking it into parts ++ */ ++void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) ++{ ++ ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); ++ ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); ++} ++ ++/* + * idx_pblock: + * combine low and high parts of a leaf physical block number into ext4_fsblk_t + */ +@@ -71,17 +82,6 @@ ext4_fsblk_t idx_pblock(struct ext4_exte + } + + /* +- * ext4_ext_store_pblock: +- * stores a large physical block number into an extent struct, +- * breaking it into parts +- */ +-void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) +-{ +- ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); +- ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); +-} +- +-/* + * ext4_idx_store_pblock: + * stores a large physical block number into an index struct, + * breaking it into parts +@@ -1851,6 +1851,56 @@ static int ext4_ext_rm_idx(handle_t *han + } + + /* ++ * This routine returns max. credits extent tree can consume. ++ * It should be OK for low-performance paths like ->writepage() ++ * To allow many writing process to fit a single transaction, ++ * caller should calculate credits under truncate_mutex and ++ * pass actual path. 
++ */ ++int ext4_ext_calc_credits_for_insert(struct inode *inode, ++ struct ext4_ext_path *path) ++{ ++ int depth, needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ depth = ext_depth(inode); ++ if (le16_to_cpu(path[depth].p_hdr->eh_entries) ++ < le16_to_cpu(path[depth].p_hdr->eh_max)) ++ return 1; ++ } ++ ++ /* ++ * given 32bit logical block (4294967296 blocks), max. tree ++ * can be 4 levels in depth -- 4 * 340^4 == 53453440000. ++ * let's also add one more level for imbalance. ++ */ ++ depth = 5; ++ ++ /* allocation of new data block(s) */ ++ needed = 2; ++ ++ /* ++ * tree can be full, so it'd need to grow in depth: ++ * we need one credit to modify old root, credits for ++ * new root will be added in split accounting ++ */ ++ needed += 1; ++ ++ /* ++ * Index split can happen, we'd need: ++ * allocate intermediate indexes (bitmap + group) ++ * + change two blocks at each level, but root (already included) ++ */ ++ needed += (depth * 2) + (depth * 2); ++ ++ /* any allocation modifies superblock */ ++ needed += 1; ++ ++ return needed; ++} ++ ++/* + * ext4_ext_calc_credits_for_single_extent: + * This routine returns max. credits that needed to insert an extent + * to the extent tree. 
+@@ -3170,3 +3220,14 @@ int ext4_fiemap(struct inode *inode, str + + return error; + } ++ ++EXPORT_SYMBOL(ext4_ext_store_pblock); ++EXPORT_SYMBOL(ext4_ext_search_right); ++EXPORT_SYMBOL(ext4_ext_search_left); ++EXPORT_SYMBOL(ext_pblock); ++EXPORT_SYMBOL(ext4_ext_insert_extent); ++EXPORT_SYMBOL(ext4_mb_new_blocks); ++EXPORT_SYMBOL(ext4_ext_walk_space); ++EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert); ++EXPORT_SYMBOL(ext4_mark_inode_dirty); ++ +Index: linux-2.6.27.21-0.1/fs/ext4/ext4_extents.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_extents.h ++++ linux-2.6.27.21-0.1/fs/ext4/ext4_extents.h +@@ -59,6 +59,11 @@ + */ + #define EXT_STATS_ + ++/* ++ * define EXT4_ALLOC_NEEDED to 0 since block bitmap, group desc. and sb ++ * are now accounted in ext4_ext_calc_credits_for_insert() ++ */ ++#define EXT4_ALLOC_NEEDED 0 + + /* + * ext4_inode has i_block array (60 bytes total). +@@ -124,6 +129,7 @@ struct ext4_ext_path { + #define EXT4_EXT_CACHE_GAP 1 + #define EXT4_EXT_CACHE_EXTENT 2 + ++#define EXT4_EXT_HAS_NO_TREE /* ext4_extents_tree struct is not used*/ + + #define EXT_MAX_BLOCK 0xffffffff + +@@ -223,10 +229,14 @@ static inline int ext4_ext_get_actual_le + (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); + } + ++extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); ++extern void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb); + extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); + extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); + extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); + extern int ext4_extent_tree_init(handle_t *, struct inode *); ++extern int ext4_ext_calc_credits_for_insert(struct inode *, ++ struct ext4_ext_path *); + extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +Index: linux-2.6.27.21-0.1/fs/ext4/mballoc.c 
+=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/mballoc.c ++++ linux-2.6.27.21-0.1/fs/ext4/mballoc.c +@@ -4348,6 +4348,13 @@ repeat: + kmem_cache_free(ext4_ac_cachep, ac); + } + ++/* For backward compatibility, since Lustre uses this symbol */ ++void ext4_mb_discard_inode_preallocations(struct inode *inode) ++{ ++ ext4_discard_preallocations(inode); ++} ++EXPORT_SYMBOL(ext4_mb_discard_inode_preallocations); ++ + /* + * finds all preallocated spaces and return blocks being freed to them + * if preallocated space becomes full (no block is used from the space) +@@ -5170,3 +5177,6 @@ error_return: + kmem_cache_free(ext4_ac_cachep, ac); + return; + } ++ ++EXPORT_SYMBOL(ext4_free_blocks); ++ +Index: linux-2.6.27.21-0.1/fs/ext4/super.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/super.c ++++ linux-2.6.27.21-0.1/fs/ext4/super.c +@@ -91,6 +91,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct su + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 
+ (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); + } ++EXPORT_SYMBOL(ext4_inode_bitmap); + + ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg) +@@ -1295,6 +1296,7 @@ enum { + Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_inode_readahead_blks, Opt_bigendian_extents, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, ++ Opt_mballoc + }; + + static const match_table_t tokens = { +@@ -1356,6 +1358,7 @@ static const match_table_t tokens = { + {Opt_nodelalloc, "nodelalloc"}, + {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, + {Opt_bigendian_extents, "bigendian_extents"}, ++ {Opt_mballoc, "mballoc"}, + {Opt_err, NULL}, + }; + +@@ -1774,6 +1777,8 @@ set_qf_format: + case Opt_bigendian_extents: + bigendian_extents = 1; + break; ++ case Opt_mballoc: ++ break; + default: + printk(KERN_ERR + "EXT4-fs: Unrecognized mount option \"%s\" " +@@ -4095,7 +4100,7 @@ static struct file_system_type ext4dev_f + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, + }; +-MODULE_ALIAS("ext4dev"); ++MODULE_ALIAS("ext4"); + + static int __init init_ext4_fs(void) + { +Index: linux-2.6.27.21-0.1/fs/ext4/ext4_jbd2.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_jbd2.c ++++ linux-2.6.27.21-0.1/fs/ext4/ext4_jbd2.c +@@ -21,6 +21,7 @@ int __ext4_journal_get_write_access(cons + ext4_journal_abort_handle(where, __func__, bh, handle, err); + return err; + } ++EXPORT_SYMBOL(__ext4_journal_get_write_access); + + int __ext4_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh) +@@ -57,3 +58,4 @@ int __ext4_journal_dirty_metadata(const + ext4_journal_abort_handle(where, __func__, bh, handle, err); + return err; + } ++EXPORT_SYMBOL(__ext4_journal_dirty_metadata); +Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4.h ++++ linux-2.6.27.21-0.1/fs/ext4/ext4.h +@@ 
-26,6 +26,9 @@ + * The fourth extended filesystem constants/structures + */ + ++/* Has been moved to linux/magic.h but we need it for Lustre */ ++#define EXT4_SUPER_MAGIC 0xEF53 ++ + /* + * Define EXT4FS_DEBUG to produce debug messages + */ +@@ -1116,6 +1119,8 @@ extern void ext4_mb_update_group_info(st + extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); + extern void ext4_mb_put_buddy_cache_lock(struct super_block *, + ext4_group_t, int); ++extern void ext4_mb_discard_inode_preallocations(struct inode *); ++ + /* inode.c */ + int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr); +Index: linux-2.6.27.21-0.1/fs/ext4/inode.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/inode.c ++++ linux-2.6.27.21-0.1/fs/ext4/inode.c +@@ -4240,6 +4240,7 @@ bad_inode: + iget_failed(inode); + return ERR_PTR(ret); + } ++EXPORT_SYMBOL(ext4_iget); + + static int ext4_inode_blocks_set(handle_t *handle, + struct ext4_inode *raw_inode, diff --git a/ldiskfs/kernel_patches/patches/ext4-mmp-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-mmp-sles11.patch new file mode 100644 index 0000000..827cbae --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-mmp-sles11.patch @@ -0,0 +1,481 @@ +Index: linux-2.6.27.21-0.1/fs/ext4/super.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/super.c ++++ linux-2.6.27.21-0.1/fs/ext4/super.c +@@ -39,6 +39,8 @@ + #include + #include + #include ++#include ++#include + + #include "ext4.h" + #include "ext4_jbd2.h" +@@ -598,6 +600,8 @@ static void ext4_put_super(struct super_ + invalidate_bdev(sbi->journal_bdev); + ext4_blkdev_remove(sbi); + } ++ if (sbi->s_mmp_tsk) ++ kthread_stop(sbi->s_mmp_tsk); + sb->s_fs_info = NULL; + kfree(sbi); + return; +@@ -806,7 +810,6 @@ static int ext4_show_options(struct seq_ + if (!test_opt(sb, DELALLOC)) + seq_puts(seq, 
",nodelalloc"); + +- + if (sbi->s_stripe) + seq_printf(seq, ",stripe=%lu", sbi->s_stripe); + /* +@@ -829,6 +832,330 @@ static int ext4_show_options(struct seq_ + } + + ++ ++/* ++ * Write the MMP block using WRITE_SYNC to try to get the block on-disk ++ * faster. ++ */ ++static int write_mmp_block(struct buffer_head *bh) ++{ ++ mark_buffer_dirty(bh); ++ lock_buffer(bh); ++ bh->b_end_io = end_buffer_write_sync; ++ get_bh(bh); ++ submit_bh(WRITE_SYNC, bh); ++ wait_on_buffer(bh); ++ if (unlikely(!buffer_uptodate(bh))) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * Read the MMP block. It _must_ be read from disk and hence we clear the ++ * uptodate flag on the buffer. ++ */ ++static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, ++ unsigned long mmp_block) ++{ ++ struct mmp_struct *mmp; ++ ++ if (*bh) ++ clear_buffer_uptodate(*bh); ++ ++#if 0 ++ brelse(*bh); ++ ++ *bh = sb_bread(sb, mmp_block); ++#else ++ if (!*bh) ++ *bh = sb_getblk(sb, mmp_block); ++ if (*bh) { ++ get_bh(*bh); ++ lock_buffer(*bh); ++ (*bh)->b_end_io = end_buffer_read_sync; ++ submit_bh(READ_SYNC, *bh); ++ wait_on_buffer(*bh); ++ if (!buffer_uptodate(*bh)) { ++ brelse(*bh); ++ *bh = NULL; ++ } ++ } ++#endif ++ if (!*bh) { ++ ext4_warning(sb, __FUNCTION__, ++ "Error while reading MMP block %lu", mmp_block); ++ return -EIO; ++ } ++ ++ mmp = (struct mmp_struct *)((*bh)->b_data); ++ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++/* ++ * Dump as much information as possible to help the admin. 
++ */ ++static void dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, ++ const char *function, const char *msg) ++{ ++ ext4_warning(sb, function, msg); ++ ext4_warning(sb, function, "MMP failure info: last update time: %llu, " ++ "last update node: %s, last update device: %s\n", ++ le64_to_cpu(mmp->mmp_time), mmp->mmp_nodename, ++ mmp->mmp_bdevname); ++} ++ ++/* ++ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds ++ */ ++static int kmmpd(void *data) ++{ ++ struct super_block *sb = (struct super_block *) data; ++ struct ext4_super_block *es = EXT4_SB(sb)->s_es; ++ struct buffer_head *bh = NULL; ++ struct mmp_struct *mmp; ++ unsigned long mmp_block; ++ u32 seq = 0; ++ unsigned long failed_writes = 0; ++ int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); ++ unsigned mmp_check_interval; ++ unsigned long last_update_time; ++ unsigned long diff; ++ int retval; ++ ++ mmp_block = le64_to_cpu(es->s_mmp_block); ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ ++ mmp = (struct mmp_struct *)(bh->b_data); ++ mmp->mmp_time = cpu_to_le64(get_seconds()); ++ /* ++ * Start with the higher mmp_check_interval and reduce it if ++ * the MMP block is being updated on time. ++ */ ++ mmp_check_interval = max(5 * mmp_update_interval, ++ EXT4_MMP_MIN_CHECK_INTERVAL); ++ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); ++ bdevname(bh->b_bdev, mmp->mmp_bdevname); ++ ++ down_read(&uts_sem); ++ memcpy(mmp->mmp_nodename, init_utsname()->sysname, ++ sizeof(mmp->mmp_nodename)); ++ up_read(&uts_sem); ++ ++ while (!kthread_should_stop()) { ++ if (++seq > EXT4_MMP_SEQ_MAX) ++ seq = 1; ++ ++ mmp->mmp_seq = cpu_to_le32(seq); ++ mmp->mmp_time = cpu_to_le64(get_seconds()); ++ last_update_time = jiffies; ++ ++ retval = write_mmp_block(bh); ++ /* ++ * Don't spew too many error messages. Print one every ++ * (s_mmp_update_interval * 60) seconds. 
++ */ ++ if (retval && (failed_writes % 60) == 0) { ++ ext4_error(sb, __FUNCTION__, ++ "Error writing to MMP block"); ++ failed_writes++; ++ } ++ ++ if (!(le32_to_cpu(es->s_feature_incompat) & ++ EXT4_FEATURE_INCOMPAT_MMP)) { ++ ext4_warning(sb, __FUNCTION__, "kmmpd being stopped " ++ "since MMP feature has been disabled."); ++ EXT4_SB(sb)->s_mmp_tsk = 0; ++ goto failed; ++ } ++ ++ if (sb->s_flags & MS_RDONLY) { ++ ext4_warning(sb, __FUNCTION__, "kmmpd being stopped " ++ "since filesystem has been remounted as " ++ "readonly."); ++ EXT4_SB(sb)->s_mmp_tsk = 0; ++ goto failed; ++ } ++ ++ diff = jiffies - last_update_time; ++ if (diff < mmp_update_interval * HZ) ++ schedule_timeout_interruptible(EXT4_MMP_UPDATE_INTERVAL* ++ HZ - diff); ++ ++ /* ++ * We need to make sure that more than mmp_check_interval ++ * seconds have not passed since writing. If that has happened ++ * we need to check if the MMP block is as we left it. ++ */ ++ diff = jiffies - last_update_time; ++ if (diff > mmp_check_interval * HZ) { ++ struct buffer_head *bh_check = NULL; ++ struct mmp_struct *mmp_check; ++ ++ retval = read_mmp_block(sb, &bh_check, mmp_block); ++ if (retval) { ++ EXT4_SB(sb)->s_mmp_tsk = 0; ++ goto failed; ++ } ++ ++ mmp_check = (struct mmp_struct *)(bh_check->b_data); ++ if (mmp->mmp_time != mmp_check->mmp_time || ++ memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, ++ sizeof(mmp->mmp_nodename))) ++ dump_mmp_msg(sb, mmp_check, __FUNCTION__, ++ "Error while updating MMP info. " ++ "The filesystem seems to have " ++ "been multiply mounted."); ++ ++ put_bh(bh_check); ++ } ++ ++ /* ++ * Adjust the mmp_check_interval depending on how much time ++ * it took for the MMP block to be written. ++ */ ++ mmp_check_interval = max(5 * diff / HZ, ++ (unsigned long) EXT4_MMP_MIN_CHECK_INTERVAL); ++ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); ++ } ++ ++ /* ++ * Unmount seems to be clean. 
++ */ ++ mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); ++ mmp->mmp_time = cpu_to_le64(get_seconds()); ++ ++ retval = write_mmp_block(bh); ++ ++failed: ++ brelse(bh); ++ return retval; ++} ++ ++/* ++ * Get a random new sequence number but make sure it is not greater than ++ * EXT4_MMP_SEQ_MAX. ++ */ ++static unsigned int mmp_new_seq(void) ++{ ++ u32 new_seq; ++ ++ do { ++ get_random_bytes(&new_seq, sizeof(u32)); ++ } while (new_seq > EXT4_MMP_SEQ_MAX); ++ ++ return new_seq; ++} ++ ++/* ++ * Protect the filesystem from being mounted more than once. ++ */ ++static int ext4_multi_mount_protect(struct super_block *sb, ++ unsigned long mmp_block) ++{ ++ struct ext4_super_block *es = EXT4_SB(sb)->s_es; ++ struct buffer_head *bh = NULL; ++ struct mmp_struct *mmp = NULL; ++ u32 seq; ++ unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); ++ int retval; ++ ++ if (mmp_block < le32_to_cpu(es->s_first_data_block) || ++ mmp_block >= ext4_blocks_count(es)) { ++ ext4_warning(sb, __FUNCTION__, ++ "Invalid MMP block in superblock"); ++ goto failed; ++ } ++ ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ ++ mmp = (struct mmp_struct *)(bh->b_data); ++ ++ if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) ++ mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; ++ ++ /* ++ * If check_interval in MMP block is larger, use that instead of ++ * update_interval from the superblock. 
++ */ ++ if (mmp->mmp_check_interval > mmp_check_interval) ++ mmp_check_interval = mmp->mmp_check_interval; ++ ++ seq = le32_to_cpu(mmp->mmp_seq); ++ if (seq == EXT4_MMP_SEQ_CLEAN) ++ goto skip; ++ ++ if (seq == EXT4_MMP_SEQ_FSCK) { ++ dump_mmp_msg(sb, mmp, __FUNCTION__, ++ "fsck is running on the filesystem"); ++ goto failed; ++ } ++ ++ schedule_timeout_uninterruptible(HZ * (2 * mmp_check_interval + 1)); ++ ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ mmp = (struct mmp_struct *)(bh->b_data); ++ if (seq != le32_to_cpu(mmp->mmp_seq)) { ++ dump_mmp_msg(sb, mmp, __FUNCTION__, ++ "Device is already active on another node."); ++ goto failed; ++ } ++ ++skip: ++ /* ++ * write a new random sequence number. ++ */ ++ mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); ++ ++ retval = write_mmp_block(bh); ++ if (retval) ++ goto failed; ++ ++ /* ++ * wait for MMP interval and check mmp_seq. ++ */ ++ schedule_timeout_uninterruptible(HZ * (2 * mmp_check_interval + 1)); ++ ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ mmp = (struct mmp_struct *)(bh->b_data); ++ if (seq != le32_to_cpu(mmp->mmp_seq)) { ++ dump_mmp_msg(sb, mmp, __FUNCTION__, ++ "Device is already active on another node."); ++ goto failed; ++ } ++ ++ /* ++ * Start a kernel thread to update the MMP block periodically. 
++ */ ++ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%02x:%02x", ++ MAJOR(sb->s_dev), ++ MINOR(sb->s_dev)); ++ if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { ++ EXT4_SB(sb)->s_mmp_tsk = 0; ++ ext4_warning(sb, __FUNCTION__, "Unable to create kmmpd thread " ++ "for %s.", sb->s_id); ++ goto failed; ++ } ++ ++ brelse(bh); ++ return 0; ++ ++failed: ++ brelse(bh); ++ return 1; ++} ++ + static struct inode *ext4_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) + { +@@ -2366,6 +2693,11 @@ static int ext4_fill_super(struct super_ + EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_RECOVER)); + ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && ++ !(sb->s_flags & MS_RDONLY)) ++ if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) ++ goto failed_mount3; ++ + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal! +@@ -2566,6 +2898,8 @@ failed_mount3: + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); ++ if (sbi->s_mmp_tsk) ++ kthread_stop(sbi->s_mmp_tsk); + failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); +@@ -3080,7 +3414,7 @@ static int ext4_remount(struct super_blo + unsigned long old_sb_flags; + struct ext4_mount_options old_opts; + ext4_group_t g; +- int err; ++ int err = 0; + #ifdef CONFIG_QUOTA + int i; + #endif +@@ -3205,6 +3539,13 @@ static int ext4_remount(struct super_blo + goto restore_opts; + if (!ext4_setup_super(sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ++ EXT4_FEATURE_INCOMPAT_MMP)) ++ if (ext4_multi_mount_protect(sb, ++ le64_to_cpu(es->s_mmp_block))) { ++ err = -EROFS; ++ goto restore_opts; ++ } + } + } + #ifdef CONFIG_QUOTA +Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4.h 
++++ linux-2.6.27.21-0.1/fs/ext4/ext4.h +@@ -660,7 +660,7 @@ struct ext4_super_block { + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ +- __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ ++ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ +@@ -777,7 +777,8 @@ static inline int ext4_valid_inum(struct + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ +- EXT4_FEATURE_INCOMPAT_FLEX_BG) ++ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ ++ EXT4_FEATURE_INCOMPAT_MMP) + #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ +@@ -981,6 +982,39 @@ do { \ + #endif + + /* ++ * This structure will be used for multiple mount protection. It will be ++ * written into the block number saved in the s_mmp_block field in the ++ * superblock. Programs that check MMP should assume that if ++ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe ++ * to use the filesystem, regardless of how old the timestamp is. ++ */ ++#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ ++#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ ++#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ ++#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ ++ ++struct mmp_struct { ++ __le32 mmp_magic; ++ __le32 mmp_seq; ++ __le64 mmp_time; ++ char mmp_nodename[64]; ++ char mmp_bdevname[32]; ++ __le16 mmp_check_interval; ++ __le16 mmp_pad1; ++ __le32 mmp_pad2[227]; ++}; ++ ++/* ++ * Default interval in seconds to update the MMP sequence number. 
++ */ ++#define EXT4_MMP_UPDATE_INTERVAL 1 ++ ++/* ++ * Minimum interval for MMP checking in seconds. ++ */ ++#define EXT4_MMP_MIN_CHECK_INTERVAL 5 ++ ++/* + * Function prototypes + */ + +Index: linux-2.6.27.21-0.1/fs/ext4/ext4_sb.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_sb.h ++++ linux-2.6.27.21-0.1/fs/ext4/ext4_sb.h +@@ -150,6 +150,8 @@ struct ext4_sb_info { + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; ++ ++ struct task_struct *s_mmp_tsk; /* Kernel thread for multiple mount protection */ + }; + + #endif /* _EXT4_SB */ diff --git a/ldiskfs/kernel_patches/patches/ext4-prealloc-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-prealloc-sles11.patch new file mode 100644 index 0000000..31eff99 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-prealloc-sles11.patch @@ -0,0 +1,401 @@ +Index: linux-2.6.27.21-0.1/fs/ext4/ext4_sb.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_sb.h ++++ linux-2.6.27.21-0.1/fs/ext4/ext4_sb.h +@@ -111,11 +111,14 @@ struct ext4_sb_info { + + /* tunables */ + unsigned long s_stripe; +- unsigned int s_mb_stream_request; ++ unsigned long s_mb_small_req; ++ unsigned long s_mb_large_req; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; ++ unsigned long *s_mb_prealloc_table; ++ unsigned long s_mb_prealloc_table_size; + unsigned int s_mb_group_prealloc; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; +Index: linux-2.6.27.21-0.1/fs/ext4/mballoc.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/mballoc.c ++++ linux-2.6.27.21-0.1/fs/ext4/mballoc.c +@@ -1996,7 +1996,7 @@ ext4_mb_regular_allocator(struct ext4_al + if (size < isize) + size = isize; + +- if (size < sbi->s_mb_stream_request && ++ if 
((ac->ac_g_ex.fe_len < sbi->s_mb_large_req) && + (ac->ac_flags & EXT4_MB_HINT_DATA)) { + /* TBD: may be hot point */ + spin_lock(&sbi->s_md_lock); +@@ -2686,6 +2686,26 @@ err_freesgi: + return -ENOMEM; + } + ++static void ext4_mb_prealloc_table_add(struct ext4_sb_info *sbi, int value) ++{ ++ int i; ++ ++ if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group)) ++ return; ++ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) { ++ if (sbi->s_mb_prealloc_table[i] == 0) { ++ sbi->s_mb_prealloc_table[i] = value; ++ return; ++ } ++ ++ /* they should add values in order */ ++ if (value <= sbi->s_mb_prealloc_table[i]) ++ return; ++ } ++} ++ ++ + int ext4_mb_init(struct super_block *sb, int needs_recovery) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); +@@ -2738,13 +2758,55 @@ int ext4_mb_init(struct super_block *sb, + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; +- sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; +- sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; ++ ++ if (sbi->s_stripe == 0) { ++ sbi->s_mb_prealloc_table_size = 8; ++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long); ++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ memset(sbi->s_mb_prealloc_table, 0, i); ++ ++ ext4_mb_prealloc_table_add(sbi, 4); ++ ext4_mb_prealloc_table_add(sbi, 8); ++ ext4_mb_prealloc_table_add(sbi, 16); ++ ext4_mb_prealloc_table_add(sbi, 32); ++ ext4_mb_prealloc_table_add(sbi, 64); ++ ext4_mb_prealloc_table_add(sbi, 128); ++ ext4_mb_prealloc_table_add(sbi, 256); ++ ext4_mb_prealloc_table_add(sbi, 512); ++ ++ sbi->s_mb_small_req = 256; ++ sbi->s_mb_large_req = 1024; ++ sbi->s_mb_group_prealloc = 512; ++ } else { ++ 
sbi->s_mb_prealloc_table_size = 3; ++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long); ++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ memset(sbi->s_mb_prealloc_table, 0, i); ++ ++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe); ++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 2); ++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 4); ++ ++ sbi->s_mb_small_req = sbi->s_stripe; ++ sbi->s_mb_large_req = sbi->s_stripe * 8; ++ sbi->s_mb_group_prealloc = sbi->s_stripe * 4; ++ } + + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); + if (sbi->s_locality_groups == NULL) { ++ kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + return -ENOMEM; +@@ -2915,9 +2977,89 @@ ext4_mb_free_committed_blocks(struct sup + #define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan" + #define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan" + #define EXT4_MB_ORDER2_REQ "order2_req" +-#define EXT4_MB_STREAM_REQ "stream_req" ++#define EXT4_MB_SMALL_REQ "small_req" ++#define EXT4_MB_LARGE_REQ "large_req" ++#define EXT4_MB_PREALLOC_TABLE "prealloc_table" + #define EXT4_MB_GROUP_PREALLOC "group_prealloc" + ++static int ext4_mb_prealloc_table_proc_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ struct ext4_sb_info *sbi = data; ++ int len = 0; ++ int i; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) ++ len += sprintf(page + len, "%ld ", ++ sbi->s_mb_prealloc_table[i]); ++ len += sprintf(page + len, "\n"); ++ ++ *start = page; ++ return len; ++} ++ ++static int ext4_mb_prealloc_table_proc_write(struct file *file, ++ const char __user *buf, ++ unsigned long cnt, void *data) ++{ ++ struct ext4_sb_info *sbi = data; ++ unsigned long value; ++ unsigned long prev = 0; ++ char str[128]; ++ char *cur; ++ char *end; ++ unsigned long *new_table; 
++ int num = 0; ++ int i = 0; ++ ++ if (cnt >= sizeof(str)) ++ return -EINVAL; ++ if (copy_from_user(str, buf, cnt)) ++ return -EFAULT; ++ ++ num = 0; ++ cur = str; ++ end = str + cnt; ++ while (cur < end) { ++ while ((cur < end) && (*cur == ' ')) cur++; ++ value = simple_strtol(cur, &cur, 0); ++ if (value == 0) ++ break; ++ if (value <= prev) ++ return -EINVAL; ++ prev = value; ++ num++; ++ } ++ ++ new_table = kmalloc(num * sizeof(*new_table), GFP_KERNEL); ++ if (new_table == NULL) ++ return -ENOMEM; ++ kfree(sbi->s_mb_prealloc_table); ++ memset(new_table, 0, num * sizeof(*new_table)); ++ sbi->s_mb_prealloc_table = new_table; ++ sbi->s_mb_prealloc_table_size = num; ++ cur = str; ++ end = str + cnt; ++ while (cur < end && i < num) { ++ while ((cur < end) && (*cur == ' ')) cur++; ++ value = simple_strtol(cur, &cur, 0); ++ ext4_mb_prealloc_table_add(sbi, value); ++ i++; ++ } ++ ++ return cnt; ++} ++ ++static const struct file_operations ext4_mb_prealloc_table_proc_fops = { ++ .owner = THIS_MODULE, ++ .read = ext4_mb_prealloc_table_proc_read, ++ .write = ext4_mb_prealloc_table_proc_write, ++}; ++ + static int ext4_mb_init_per_dev_proc(struct super_block *sb) + { + #ifdef CONFIG_PROC_FS +@@ -2932,13 +3074,17 @@ static int ext4_mb_init_per_dev_proc(str + EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan); + EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan); + EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs); +- EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request); ++ EXT4_PROC_HANDLER(EXT4_MB_SMALL_REQ, mb_small_req); ++ EXT4_PROC_HANDLER(EXT4_MB_LARGE_REQ, mb_large_req); ++ EXT4_PROC_HANDLER(EXT4_MB_PREALLOC_TABLE, mb_prealloc_table); + EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc); + return 0; + + err_out: + remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); +- remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); ++ remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc); ++ remove_proc_entry(EXT4_MB_LARGE_REQ, 
sbi->s_proc); ++ remove_proc_entry(EXT4_MB_SMALL_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); +@@ -2959,7 +3105,9 @@ static int ext4_mb_destroy_per_dev_proc( + return -EINVAL; + + remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); +- remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); ++ remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc); ++ remove_proc_entry(EXT4_MB_LARGE_REQ, sbi->s_proc); ++ remove_proc_entry(EXT4_MB_SMALL_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); +@@ -3162,11 +3310,12 @@ static noinline_for_stack void + ext4_mb_normalize_request(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) + { +- int bsbits, max; ++ int bsbits, i, wind; + ext4_lblk_t end; +- loff_t size, orig_size, start_off; ++ loff_t size, orig_size; + ext4_lblk_t start, orig_start; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); ++ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_prealloc_space *pa; + + /* do normalize only data requests, metadata requests +@@ -3196,49 +3345,35 @@ ext4_mb_normalize_request(struct ext4_al + size = size << bsbits; + if (size < i_size_read(ac->ac_inode)) + size = i_size_read(ac->ac_inode); ++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits; + +- /* max size of free chunks */ +- max = 2 << bsbits; ++ start = wind = 0; + +-#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ +- (req <= (size) || max <= (chunk_size)) ++ /* let's choose preallocation window depending on file size */ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) { ++ if (size <= sbi->s_mb_prealloc_table[i]) { ++ wind = sbi->s_mb_prealloc_table[i]; ++ break; ++ } ++ } ++ size = wind; + +- /* first, try to predict filesize */ +- /* XXX: 
should this table be tunable? */ +- start_off = 0; +- if (size <= 16 * 1024) { +- size = 16 * 1024; +- } else if (size <= 32 * 1024) { +- size = 32 * 1024; +- } else if (size <= 64 * 1024) { +- size = 64 * 1024; +- } else if (size <= 128 * 1024) { +- size = 128 * 1024; +- } else if (size <= 256 * 1024) { +- size = 256 * 1024; +- } else if (size <= 512 * 1024) { +- size = 512 * 1024; +- } else if (size <= 1024 * 1024) { +- size = 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (21 - bsbits)) << 21; +- size = 2 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (22 - bsbits)) << 22; +- size = 4 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, +- (8<<20)>>bsbits, max, 8 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (23 - bsbits)) << 23; +- size = 8 * 1024 * 1024; +- } else { +- start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; +- size = ac->ac_o_ex.fe_len << bsbits; ++ if (wind == 0) { ++ __u64 tstart, tend; ++ /* file is quite large, we now preallocate with ++ * the biggest configured window with regart to ++ * logical offset */ ++ wind = sbi->s_mb_prealloc_table[i - 1]; ++ tstart = ac->ac_o_ex.fe_logical; ++ do_div(tstart, wind); ++ start = tstart * wind; ++ tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1; ++ do_div(tend, wind); ++ tend = tend * wind + wind; ++ size = tend - start; + } +- orig_size = size = size >> bsbits; +- orig_start = start = start_off >> bsbits; ++ orig_size = size; ++ orig_start = start; + + /* don't cover already allocated blocks in selected range */ + if (ar->pleft && start <= ar->lleft) { +@@ -3315,7 +3450,6 @@ ext4_mb_normalize_request(struct ext4_al + } + BUG_ON(start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical); +- BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + + /* now prepare 
goal request */ + +@@ -4236,22 +4370,32 @@ static void ext4_mb_group_or_file(struct + { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int bsbits = ac->ac_sb->s_blocksize_bits; +- loff_t size, isize; ++ loff_t size; + + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; + +- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; +- isize = i_size_read(ac->ac_inode) >> bsbits; +- size = max(size, isize); +- +- /* don't use group allocation for large files */ +- if (size >= sbi->s_mb_stream_request) ++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) + return; + + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + ++ /* request is so large that we don't care about ++ * streaming - it overweights any possible seek */ ++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req) ++ return; ++ ++ size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; ++ size = size << bsbits; ++ if (size < i_size_read(ac->ac_inode)) ++ size = i_size_read(ac->ac_inode); ++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits; ++ ++ /* don't use group allocation for large files */ ++ if (size >= sbi->s_mb_large_req) ++ return; ++ + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. The reason for having +Index: linux-2.6.27.21-0.1/fs/ext4/inode.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/inode.c ++++ linux-2.6.27.21-0.1/fs/ext4/inode.c +@@ -2442,14 +2442,14 @@ static int ext4_da_writepages(struct add + return -EROFS; + + /* +- * Make sure nr_to_write is >= sbi->s_mb_stream_request ++ * Make sure nr_to_write is >= sbi->s_mb_small_req + * This make sure small files blocks are allocated in + * single attempt. This ensure that small files + * get less fragmented. 
+ */ +- if (wbc->nr_to_write < sbi->s_mb_stream_request) { +- nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; +- wbc->nr_to_write = sbi->s_mb_stream_request; ++ if (wbc->nr_to_write < sbi->s_mb_small_req) { ++ nr_to_writebump = sbi->s_mb_small_req - wbc->nr_to_write; ++ wbc->nr_to_write = sbi->s_mb_small_req; + } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; diff --git a/ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-sles11.patch new file mode 100644 index 0000000..1cc10a8 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-sles11.patch @@ -0,0 +1,15 @@ +Index: linux-2.6.18.i386/fs/ext4/namei.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/namei.c ++++ linux-2.6.18.i386/fs/ext4/namei.c +@@ -374,8 +374,8 @@ dx_probe(struct dentry *dentry, struct i + root->info.hash_version != DX_HASH_HALF_MD4 && + root->info.hash_version != DX_HASH_LEGACY) { + ext4_warning(dir->i_sb, __func__, +- "Unrecognised inode hash code %d", +- root->info.hash_version); ++ "Unrecognised inode hash code %d for directory " ++ "#%lu", root->info.hash_version, dir->i_ino); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; diff --git a/ldiskfs/kernel_patches/patches/ext4-remove-cond_resched-calls-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-remove-cond_resched-calls-sles11.patch new file mode 100644 index 0000000..db2e1ba --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-remove-cond_resched-calls-sles11.patch @@ -0,0 +1,29 @@ +Index: linux-2.6.27.21-0.1/fs/ext4/ialloc.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ialloc.c ++++ linux-2.6.27.21-0.1/fs/ext4/ialloc.c +@@ -1120,7 +1120,6 @@ unsigned long ext4_count_free_inodes(str + if (!gdp) + continue; + desc_count += ext4_free_inodes_count(sb, 
gdp); +- cond_resched(); + } + return desc_count; + #endif +Index: linux-2.6.27.21-0.1/fs/ext4/super.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/super.c ++++ linux-2.6.27.21-0.1/fs/ext4/super.c +@@ -3263,11 +3263,9 @@ static int ext4_statfs(struct dentry *de + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ +- for (i = 0; i < ngroups; i++) { ++ for (i = 0; i < ngroups; i++) + overhead += ext4_bg_has_super(sb, i) + + ext4_bg_num_gdb(sb, i); +- cond_resched(); +- } + + /* + * Every block group has an inode bitmap, a block diff --git a/ldiskfs/kernel_patches/patches/ext4-remove-ioctl-filp-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-remove-ioctl-filp-sles11.patch new file mode 100644 index 0000000..8fbc0b7 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-remove-ioctl-filp-sles11.patch @@ -0,0 +1,111 @@ +temp patch until we find workaround. WIll not affect Lustre functionality + +Index: linux-2.6.27.21-0.1/fs/ext4/ioctl.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ioctl.c ++++ linux-2.6.27.21-0.1/fs/ext4/ioctl.c +@@ -200,9 +200,9 @@ long ext4_ioctl(struct file *filp, unsig + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + +- err = mnt_want_write(filp->f_path.mnt); ++/* err = mnt_want_write(filp->f_path.mnt); + if (err) +- return err; ++ return err;*/ + + if (!S_ISDIR(inode->i_mode)) + flags &= ~EXT4_DIRSYNC_FL; +@@ -281,7 +281,7 @@ flags_err: + err = ext4_ext_migrate(inode); + flags_out: + mutex_unlock(&inode->i_mutex); +- mnt_drop_write(filp->f_path.mnt); ++// mnt_drop_write(filp->f_path.mnt); + return err; + } + case EXT4_IOC_GETVERSION: +@@ -297,9 +297,9 @@ flags_out: + if (!is_owner_or_cap(inode)) + return -EPERM; + +- err = mnt_want_write(filp->f_path.mnt); ++/* err = mnt_want_write(filp->f_path.mnt); + if (err) +- return err; ++ return err;*/ + 
if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; +@@ -318,7 +318,7 @@ flags_out: + } + ext4_journal_stop(handle); + setversion_out: +- mnt_drop_write(filp->f_path.mnt); ++// mnt_drop_write(filp->f_path.mnt); + return err; + } + #ifdef CONFIG_JBD2_DEBUG +@@ -356,9 +356,9 @@ setversion_out: + if (get_user(n_blocks_count, (__u32 __user *)arg)) + return -EFAULT; + +- err = mnt_want_write(filp->f_path.mnt); ++/* err = mnt_want_write(filp->f_path.mnt); + if (err) +- return err; ++ return err;*/ + + err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); +@@ -366,7 +366,7 @@ setversion_out: + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (err == 0) + err = err2; +- mnt_drop_write(filp->f_path.mnt); ++// mnt_drop_write(filp->f_path.mnt); + + return err; + } +@@ -382,9 +382,9 @@ setversion_out: + sizeof(input))) + return -EFAULT; + +- err = mnt_want_write(filp->f_path.mnt); ++/* err = mnt_want_write(filp->f_path.mnt); + if (err) +- return err; ++ return err;*/ + + err = ext4_group_add(sb, &input); + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); +@@ -392,7 +392,7 @@ setversion_out: + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (err == 0) + err = err2; +- mnt_drop_write(filp->f_path.mnt); ++// mnt_drop_write(filp->f_path.mnt); + + return err; + } +@@ -403,9 +403,9 @@ setversion_out: + if (!is_owner_or_cap(inode)) + return -EACCES; + +- err = mnt_want_write(filp->f_path.mnt); ++/* err = mnt_want_write(filp->f_path.mnt); + if (err) +- return err; ++ return err;*/ + /* + * inode_mutex prevent write and truncate on the file. + * Read still goes through. 
We take i_data_sem in +@@ -415,7 +415,7 @@ setversion_out: + mutex_lock(&(inode->i_mutex)); + err = ext4_ext_migrate(inode); + mutex_unlock(&(inode->i_mutex)); +- mnt_drop_write(filp->f_path.mnt); ++// mnt_drop_write(filp->f_path.mnt); + return err; + } + diff --git a/ldiskfs/kernel_patches/patches/ext4-unlink-race-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-unlink-race-sles11.patch new file mode 100644 index 0000000..f75ae84 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-unlink-race-sles11.patch @@ -0,0 +1,15 @@ +Index: linux-2.6.18.i386/fs/ext4/namei.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/namei.c ++++ linux-2.6.18.i386/fs/ext4/namei.c +@@ -2299,8 +2299,8 @@ static int ext4_link (struct dentry * ol + * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing + * otherwise has the potential to corrupt the orphan inode list. + */ +- if (inode->i_nlink == 0) +- return -ENOENT; ++ //if (inode->i_nlink == 0) ++ // return -ENOENT; + + retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + diff --git a/ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-sles11.patch new file mode 100644 index 0000000..914ed9c --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-sles11.patch @@ -0,0 +1,169 @@ +Index: linux-2.6.27.21-0.1/fs/ext4/ialloc.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ialloc.c ++++ linux-2.6.27.21-0.1/fs/ext4/ialloc.c +@@ -675,7 +675,8 @@ err_ret: + * For other inodes, search forward from the parent directory's block + * group to find a free inode. 
+ */ +-struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) ++struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, ++ unsigned long goal) + { + struct super_block *sb; + struct buffer_head *inode_bitmap_bh = NULL; +@@ -706,6 +707,43 @@ struct inode *ext4_new_inode(handle_t *h + sbi = EXT4_SB(sb); + es = sbi->s_es; + ++ if (goal) { ++ group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); ++ ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); ++ err = -EIO; ++ ++ gdp = ext4_get_group_desc(sb, group, &group_desc_bh); ++ if (!gdp) ++ goto fail; ++ ++ inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); ++ if (!inode_bitmap_bh) ++ goto fail; ++ ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext4_journal_get_write_access(handle, inode_bitmap_bh); ++ if (err) ++ goto fail; ++ ++ if (ext4_set_bit_atomic(sb_bgl_lock(sbi, group), ++ ino, inode_bitmap_bh->b_data)) { ++ printk(KERN_ERR "goal inode %lu unavailable\n", goal); ++ /* Oh well, we tried. */ ++ goto continue_allocation; ++ } ++ ++ BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); ++ err = ext4_journal_dirty_metadata(handle, inode_bitmap_bh); ++ if (err) ++ goto fail; ++ ++ /* We've shortcircuited the allocation system successfully, ++ * now finish filling in the inode. ++ */ ++ goto got; ++ } ++ ++continue_allocation: + if (sbi->s_log_groups_per_flex) { + ret2 = find_group_flex(sb, dir, &group); + goto got_group; +Index: linux-2.6.27.21-0.1/fs/ext4/namei.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/namei.c ++++ linux-2.6.27.21-0.1/fs/ext4/namei.c +@@ -104,6 +104,7 @@ struct dx_entry + __le32 block; + }; + ++ + /* + * dx_root_info is laid out so that if it should somehow get overlaid by a + * dirent the two low bits of the hash version will be zero. 
Therefore, the +@@ -149,6 +150,14 @@ struct dx_map_entry + u16 size; + }; + ++#define LVFS_DENTRY_PARAM_MAGIC 20070216UL ++struct lvfs_dentry_params ++{ ++ unsigned long p_inum; ++ void *p_ptr; ++ u32 magic; ++}; ++ + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); + static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); + static inline unsigned dx_get_hash(struct dx_entry *entry); +@@ -1716,6 +1725,20 @@ static int ext4_add_nondir(handle_t *han + return err; + } + ++static struct inode * ext4_new_inode_wantedi(handle_t *handle, struct inode *dir, ++ int mode, struct dentry *dentry) ++{ ++ unsigned long inum = 0; ++ ++ if (dentry->d_fsdata != NULL) { ++ struct lvfs_dentry_params *param = dentry->d_fsdata; ++ ++ if (param->magic == LVFS_DENTRY_PARAM_MAGIC) ++ inum = param->p_inum; ++ } ++ return ext4_new_inode(handle, dir, mode, inum); ++} ++ + /* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it +@@ -1741,7 +1764,7 @@ retry: + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext4_new_inode (handle, dir, mode); ++ inode = ext4_new_inode_wantedi(handle, dir, mode, dentry); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext4_file_inode_operations; +@@ -1775,7 +1798,7 @@ retry: + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext4_new_inode(handle, dir, mode); ++ inode = ext4_new_inode_wantedi(handle, dir, mode, dentry); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +@@ -1811,7 +1834,7 @@ retry: + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext4_new_inode(handle, dir, S_IFDIR | mode); ++ inode = ext4_new_inode_wantedi(handle, dir, S_IFDIR | mode, dentry); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +@@ -2211,7 +2234,7 @@ retry: + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + +- inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); ++ inode = 
ext4_new_inode_wantedi(handle, dir, S_IFLNK|S_IRWXUGO, dentry); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4.h ++++ linux-2.6.27.21-0.1/fs/ext4/ext4.h +@@ -1032,7 +1032,8 @@ extern int ext4fs_dirhash(const char *na + dx_hash_info *hinfo); + + /* ialloc.c */ +-extern struct inode * ext4_new_inode(handle_t *, struct inode *, int); ++extern struct inode * ext4_new_inode(handle_t *, struct inode *, int, ++ unsigned long); + extern void ext4_free_inode(handle_t *, struct inode *); + extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); + extern unsigned long ext4_count_free_inodes(struct super_block *); +Index: linux-2.6.27.21-0.1/fs/ext4/migrate.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/migrate.c ++++ linux-2.6.27.21-0.1/fs/ext4/migrate.c +@@ -484,7 +484,7 @@ int ext4_ext_migrate(struct inode *inode + } + tmp_inode = ext4_new_inode(handle, + inode->i_sb->s_root->d_inode, +- S_IFREG); ++ S_IFREG, 0); + if (IS_ERR(tmp_inode)) { + retval = -ENOMEM; + ext4_journal_stop(handle); diff --git a/ldiskfs/kernel_patches/patches/ext4-xattr-no-update-ctime-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-xattr-no-update-ctime-sles11.patch new file mode 100644 index 0000000..66de9df --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-xattr-no-update-ctime-sles11.patch @@ -0,0 +1,32 @@ +Index: linux-2.6.18.i386/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/ext4.h ++++ linux-2.6.18.i386/fs/ext4/ext4.h +@@ -995,6 +995,13 @@ struct mmp_struct { + extern struct proc_dir_entry *proc_root_ext4; + + /* ++ * Indicates that ctime should not be updated in ext4_xattr_set_handle() ++ */ ++#ifndef XATTR_NO_CTIME ++#define XATTR_NO_CTIME 0x80 ++#endif ++ ++/* + * 
Function prototypes + */ + +Index: linux-2.6.18.i386/fs/ext4/xattr.c +=================================================================== +--- linux-2.6.18.i386.orig/fs/ext4/xattr.c ++++ linux-2.6.18.i386/fs/ext4/xattr.c +@@ -1026,7 +1026,8 @@ ext4_xattr_set_handle(handle_t *handle, + } + if (!error) { + ext4_xattr_update_super_block(handle, inode->i_sb); +- inode->i_ctime = ext4_current_time(inode); ++ if (!(flags & XATTR_NO_CTIME)) ++ inode->i_ctime = ext4_current_time(inode); + if (!value) + EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); diff --git a/ldiskfs/kernel_patches/patches/iopen-sles11.patch b/ldiskfs/kernel_patches/patches/iopen-sles11.patch new file mode 100644 index 0000000..9fa64dc --- /dev/null +++ b/ldiskfs/kernel_patches/patches/iopen-sles11.patch @@ -0,0 +1,507 @@ +Index: linux-2.6.27.21-0.1/fs/ext4/iopen.c +=================================================================== +--- /dev/null ++++ linux-2.6.27.21-0.1/fs/ext4/iopen.c +@@ -0,0 +1,295 @@ ++/* ++ * linux/fs/ext4/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. ++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. ++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. 
++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. ++ */ ++ ++#include <linux/sched.h> ++#include <linux/fs.h> ++#include <linux/smp_lock.h> ++#include <linux/dcache.h> ++#include <linux/security.h> ++#include "iopen.h" ++#include "ext4.h" ++#include "ext4_jbd2.h" ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#define IOPEN_NAME_LEN 32 ++ ++/* ++ * Look up an inode by number: the dentry name is the inode number ("." and ".." map to this dir and the root). ++ */ ++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct inode *inode; ++ unsigned long ino; ++ struct list_head *lp; ++ struct dentry *alternate; ++ char buf[IOPEN_NAME_LEN]; ++ ++ if (dentry->d_name.len >= IOPEN_NAME_LEN) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ memcpy(buf, dentry->d_name.name, dentry->d_name.len); ++ buf[dentry->d_name.len] = 0; ++ ++ if (strcmp(buf, ".") == 0) ++ ino = dir->i_ino; ++ else if (strcmp(buf, "..") == 0) ++ ino = EXT4_ROOT_INO; ++ else ++ ino = simple_strtoul(buf, NULL, 0); ++ ++ if ((ino != EXT4_ROOT_INO && ++ ino < EXT4_FIRST_INO(dir->i_sb)) || ++ ino > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)) ++ return ERR_PTR(-ENOENT); ++ ++ inode = ext4_iget(dir->i_sb, ino); ++ if (IS_ERR(inode)) { ++ /* Newer kernels return -ESTALE for inodes that are not in use, ++ * but older kernels return a negative dentry. This can only ++ * happen when doing a lookup in the __iopen__ dir, because the ++ * "entry" will always be found even if inode is unallocated. ++ * Handle this here instead of fixing the callers. 
b=19114 */ ++ if (PTR_ERR(inode) == -ESTALE) ++ return ERR_PTR(-ENOENT); ++ return ERR_CAST(inode); ++ } ++ ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ ++ /* preferably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, struct dentry, d_alias); ++ assert(!(alternate->d_flags & DCACHE_DISCONNECTED)); ++ } ++ ++ if (!list_empty(&inode->i_dentry)) { ++ alternate = list_entry(inode->i_dentry.next, ++ struct dentry, d_alias); ++ dget_locked(alternate); ++ spin_lock(&alternate->d_lock); ++ alternate->d_flags |= DCACHE_REFERENCED; ++ spin_unlock(&alternate->d_lock); ++ iput(inode); ++ spin_unlock(&dcache_lock); ++ return alternate; ++ } ++ dentry->d_flags |= DCACHE_DISCONNECTED; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++ ++ d_rehash_cond(dentry, 0); ++ spin_unlock(&dcache_lock); ++ ++ return NULL; ++} ++ ++/* This function is spliced into ext4_lookup and does the move of a ++ * disconnected dentry (if it exists) to a connected dentry. 
++ */ ++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, ++ int rehash) ++{ ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* verify this dentry is really new */ ++ assert(dentry->d_inode == NULL); ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ if (rehash) ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ assert(list_empty(&dentry->d_subdirs)); ++ ++ spin_lock(&dcache_lock); ++ if (!inode) ++ goto do_rehash; ++ ++ if (!test_opt(inode->i_sb, IOPEN)) ++ goto do_instantiate; ++ ++ /* look for a disconnected alias of this inode to reconnect */ ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ if (tmp->d_flags & DCACHE_DISCONNECTED) { ++ assert(tmp->d_alias.next == &inode->i_dentry); ++ assert(tmp->d_alias.prev == &inode->i_dentry); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ } ++ ++ if (!goal) ++ goto do_instantiate; ++ ++ /* Move the goal to the de hash queue */ ++ goal->d_flags &= ~DCACHE_DISCONNECTED; ++ security_d_instantiate(goal, inode); ++ __d_drop(dentry); ++ d_rehash_cond(dentry, 0); ++ d_move_locked(goal, dentry); ++ spin_unlock(&dcache_lock); ++ iput(inode); ++ ++ return goal; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++do_instantiate: ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++do_rehash: ++ if (rehash) ++ d_rehash_cond(dentry, 0); ++ spin_unlock(&dcache_lock); ++ ++ return NULL; ++} ++ ++/* ++ * Similar to d_instantiate() except that it drops the disconnected ++ * dentry if any. 
++ */ ++void iopen_d_instantiate(struct dentry *dentry, struct inode * inode) ++{ ++ struct dentry *dis_dentry; ++ ++ /* verify this dentry is really new */ ++ assert(dentry->d_inode == NULL); ++ assert(list_empty(&dentry->d_alias)); ++ ++ spin_lock(&dcache_lock); ++ if (!inode || !test_opt(inode->i_sb, IOPEN) || ++ list_empty(&inode->i_dentry)) ++ goto do_instantiate; ++ ++ /* a disconnected dentry has been added behind our back, ++ * we have to drop this dentry, see bug 16362/15713 */ ++ dis_dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias); ++ spin_lock(&dis_dentry->d_lock); ++ assert(dis_dentry->d_alias.next == &inode->i_dentry); ++ assert(dis_dentry->d_alias.prev == &inode->i_dentry); ++ assert(dis_dentry->d_flags & DCACHE_DISCONNECTED); ++ __d_drop(dis_dentry); ++ list_del_init(&dis_dentry->d_alias); ++ spin_unlock(&dis_dentry->d_lock); ++ ++do_instantiate: ++ if (inode) ++ list_add(&dentry->d_alias, &inode->i_dentry); ++ dentry->d_inode = inode; ++ spin_unlock(&dcache_lock); ++ security_d_instantiate(dentry, inode); ++} ++ ++/* ++ * These are the special structures for the iopen pseudo directory. ++ */ ++ ++static struct inode_operations iopen_inode_operations = { ++ .lookup = iopen_lookup, /* BKL held */ ++}; ++ ++static struct file_operations iopen_file_operations = { ++ .read = generic_read_dir, ++}; ++ ++static int match_dentry(struct dentry *dentry, const char *name) ++{ ++ int len; ++ ++ len = strlen(name); ++ if (dentry->d_name.len != len) ++ return 0; ++ if (strncmp(dentry->d_name.name, name, len)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * This function is spliced into ext4_lookup and returns 1 if the file ++ * name is __iopen__ and dentry has been filled in appropriately. 
++ */ ++int ext4_check_for_iopen(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ ++ if (dir->i_ino != EXT4_ROOT_INO || ++ !test_opt(dir->i_sb, IOPEN) || ++ !match_dentry(dentry, "__iopen__")) ++ return 0; ++ ++ inode = ext4_iget(dir->i_sb, EXT4_BAD_INO); ++ if (IS_ERR(inode)) ++ return 0; ++ ++ d_add(dentry, inode); ++ return 1; ++} ++ ++/* ++ * This function is spliced into ext4_iget; it returns 1 if inode ++ * number is the one for /__iopen__, in which case the inode is filled ++ * in appropriately. Otherwise, this function returns 0. ++ */ ++int ext4_iopen_get_inode(struct inode *inode) ++{ ++ if (inode->i_ino != EXT4_BAD_INO) ++ return 0; ++ ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) ++ inode->i_mode |= 0777; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_size = 4096; ++ inode->i_atime = inode->i_ctime = inode->i_mtime = ext4_current_time(inode); ++ EXT4_I(inode)->i_dtime = 0; ++ EXT4_I(inode)->i_file_acl = 0; ++ inode->i_blocks = 0; ++ inode->i_version = 1; ++ inode->i_generation = 0; ++ ++ inode->i_op = &iopen_inode_operations; ++ inode->i_fop = &iopen_file_operations; ++ inode->i_mapping->a_ops = NULL; ++ ++ if (inode->i_state & I_NEW) ++ unlock_new_inode(inode); ++ ++ return 1; ++} +Index: linux-2.6.27.21-0.1/fs/ext4/iopen.h +=================================================================== +--- /dev/null ++++ linux-2.6.27.21-0.1/fs/ext4/iopen.h +@@ -0,0 +1,16 @@ ++/* ++ * iopen.h ++ * ++ * Special support for opening files by inode number. ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. 
++ */ ++ ++extern int ext4_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext4_iopen_get_inode(struct inode *inode); ++extern struct dentry *iopen_connect_dentry(struct dentry *dentry, ++ struct inode *inode, int rehash); ++extern void iopen_d_instantiate(struct dentry *dentry, struct inode * inode); +Index: linux-2.6.27.21-0.1/fs/ext4/inode.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/inode.c ++++ linux-2.6.27.21-0.1/fs/ext4/inode.c +@@ -38,6 +38,7 @@ + #include + #include "ext4_jbd2.h" + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + #include "ext4_extents.h" + +@@ -4115,6 +4116,9 @@ struct inode *ext4_iget(struct super_blo + ei->i_default_acl = EXT4_ACL_NOT_CACHED; + #endif + ++ if (ext4_iopen_get_inode(inode)) ++ return inode; ++ + ret = __ext4_get_inode_loc(inode, &iloc, 0); + if (ret < 0) + goto bad_inode; +Index: linux-2.6.27.21-0.1/fs/ext4/super.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/super.c ++++ linux-2.6.27.21-0.1/fs/ext4/super.c +@@ -955,7 +955,8 @@ enum { + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, +- Opt_inode_readahead_blks ++ Opt_inode_readahead_blks, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + }; + + static const match_table_t tokens = { +@@ -1004,6 +1005,9 @@ static const match_table_t tokens = { + {Opt_noquota, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_barrier, "barrier=%u"}, + {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, +@@ -1347,6 +1351,18 @@ set_qf_format: + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; ++ case Opt_iopen: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ 
break; ++ case Opt_noiopen: ++ clear_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; ++ case Opt_iopen_nopriv: ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ break; + case Opt_ignore: + break; + case Opt_resize: +Index: linux-2.6.27.21-0.1/fs/ext4/namei.c +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/namei.c ++++ linux-2.6.27.21-0.1/fs/ext4/namei.c +@@ -39,6 +39,7 @@ + + #include "namei.h" + #include "xattr.h" ++#include "iopen.h" + #include "acl.h" + + /* +@@ -1054,6 +1055,9 @@ static struct dentry *ext4_lookup(struct + if (dentry->d_name.len > EXT4_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + ++ if (ext4_check_for_iopen(dir, dentry)) ++ return NULL; ++ + bh = ext4_find_entry(dir, &dentry->d_name, &de); + inode = NULL; + if (bh) { +@@ -1068,7 +1072,8 @@ static struct dentry *ext4_lookup(struct + if (IS_ERR(inode)) + return ERR_CAST(inode); + } +- return d_splice_alias(inode, dentry); ++ ++ return iopen_connect_dentry(dentry, inode, 1); + } + + +@@ -1717,7 +1722,7 @@ static int ext4_add_nondir(handle_t *han + int err = ext4_add_entry(handle, dentry, inode); + if (!err) { + ext4_mark_inode_dirty(handle, inode); +- d_instantiate(dentry, inode); ++ iopen_d_instantiate(dentry, inode); + return 0; + } + drop_nlink(inode); +@@ -1876,7 +1881,7 @@ out_clear_inode: + ext4_inc_count(handle, dir); + ext4_update_dx_flag(dir); + ext4_mark_inode_dirty(handle, dir); +- d_instantiate(dentry, inode); ++ iopen_d_instantiate(dentry, inode); + out_stop: + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) +@@ -2142,10 +2147,6 @@ static int ext4_rmdir(struct inode *dir, + inode->i_nlink); + inode->i_version++; + clear_nlink(inode); +- /* There's no need to set i_disksize: the fact that i_nlink is +- * zero will ensure that the right thing happens during any +- * recovery. 
*/ +- inode->i_size = 0; + ext4_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); +@@ -2271,6 +2272,23 @@ out_stop: + return err; + } + ++/* Like ext4_add_nondir() except for call to iopen_connect_dentry */ ++static int ext4_add_link(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ int err = ext4_add_entry(handle, dentry, inode); ++ if (!err) { ++ err = ext4_mark_inode_dirty(handle, inode); ++ if (err == 0) { ++ dput(iopen_connect_dentry(dentry, inode, 0)); ++ return 0; ++ } ++ } ++ ext4_dec_count(handle, inode); ++ iput(inode); ++ return err; ++} ++ + static int ext4_link(struct dentry *old_dentry, + struct inode *dir, struct dentry *dentry) + { +@@ -2301,7 +2319,8 @@ retry: + ext4_inc_count(handle, inode); + atomic_inc(&inode->i_count); + +- err = ext4_add_nondir(handle, dentry, inode); ++ err = ext4_add_link(handle, dentry, inode); ++ ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; +Index: linux-2.6.27.21-0.1/fs/ext4/Makefile +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/Makefile ++++ linux-2.6.27.21-0.1/fs/ext4/Makefile +@@ -4,7 +4,7 @@ + + obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o + +-ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o mballoc.o + +Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4.h ++++ linux-2.6.27.21-0.1/fs/ext4/ext4.h +@@ -540,6 +540,8 @@ do { \ + #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ + #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ + 
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ ++#define EXT4_MOUNT_IOPEN 0x10000000 /* Allow access via iopen */ ++#define EXT4_MOUNT_IOPEN_NOPRIV 0x20000000 /* Make iopen world-readable */ + /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H + #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt