--- /dev/null
+Index: linux-2.6.18.i386/fs/ext4/super.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/super.c
++++ linux-2.6.18.i386/fs/ext4/super.c
+@@ -185,6 +185,8 @@ void ext4_journal_abort_handle(const cha
+ jbd2_journal_abort_handle(handle);
+ }
+
++EXPORT_SYMBOL(ext4_journal_abort_handle);
++
+ /* Deal with the reporting of failure conditions on a filesystem such as
+ * inconsistencies detected or read IO failures.
+ *
+@@ -2459,6 +2461,8 @@ out_fail:
+ return ret;
+ }
+
++EXPORT_SYMBOL(ext4_force_commit);
++
+ /*
+ * Setup any per-fs journal parameters now. We'll do this both on
+ * initial mount, once the journal has been initialised but before we've
+@@ -3502,6 +3506,12 @@ int ext4_map_inode_page(struct inode *in
+ unsigned long *blocks, int *created, int create);
+ EXPORT_SYMBOL(ext4_map_inode_page);
+
++EXPORT_SYMBOL(ext4_xattr_get);
++EXPORT_SYMBOL(ext4_xattr_set_handle);
++EXPORT_SYMBOL(ext4_bread);
++EXPORT_SYMBOL(ext4_journal_start_sb);
++EXPORT_SYMBOL(__ext4_journal_stop);
++
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
+ MODULE_LICENSE("GPL");
--- /dev/null
+Index: linux-2.6.27.21-0.1/fs/ext4/ialloc.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ialloc.c
++++ linux-2.6.27.21-0.1/fs/ext4/ialloc.c
+@@ -1005,6 +1005,55 @@ fail_drop:
+ return ERR_PTR(err);
+ }
+
++/*
++ * Pick a free inode number for the DP_LASTGROUP_REVERSE policy.
++ *
++ * Groups are scanned from the last one down to group 0. Each pass looks
++ * for a clear bit in the group's inode bitmap at or after @offset, which
++ * starts at half of the inodes per group and is halved on every pass;
++ * the final pass uses offset 0 and therefore covers whole bitmaps.
++ *
++ * Returns the 1-based inode number of the first candidate found, or 0 if
++ * no candidate was found. The bitmap is only read here, never modified:
++ * the result is just a goal for the caller to hand to the allocator.
++ */
++unsigned long ext4_find_reverse(struct super_block *sb)
++{
++	struct ext4_group_desc *desc;
++	struct buffer_head *bitmap_bh;
++	unsigned long ino, offset = EXT4_INODES_PER_GROUP(sb);
++	int group;
++
++	/*
++	 * offset is unsigned, so "offset >= 0" can never end the search;
++	 * run the offset == 0 pass once and then stop.
++	 */
++	do {
++		offset >>= 1;
++		for (group = EXT4_SB(sb)->s_groups_count - 1; group >= 0;
++		     --group) {
++			desc = ext4_get_group_desc(sb, group, NULL);
++			if (!desc || ext4_free_inodes_count(sb, desc) == 0)
++				continue;
++
++			bitmap_bh = ext4_read_inode_bitmap(sb, group);
++			if (!bitmap_bh)
++				continue;
++
++			ino = ext4_find_next_zero_bit((unsigned long *)
++					bitmap_bh->b_data,
++					EXT4_INODES_PER_GROUP(sb), offset);
++			/* done with the bitmap: drop the buffer reference */
++			brelse(bitmap_bh);
++			if (ino < EXT4_INODES_PER_GROUP(sb))
++				return (group * EXT4_INODES_PER_GROUP(sb) +
++					ino + 1);
++		}
++	} while (offset > 0);
++
++	return 0;
++}
++
+ /* Verify that we are loading a valid orphan from disk */
+ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
+ {
+Index: linux-2.6.27.21-0.1/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/namei.c
++++ linux-2.6.27.21-0.1/fs/ext4/namei.c
+@@ -151,14 +151,24 @@ struct dx_map_entry
+ u16 size;
+ };
+
++/*
++ * dentry_param used by ext4_new_inode_wantedi()
++ */
+ #define LVFS_DENTRY_PARAM_MAGIC 20070216UL
+ struct lvfs_dentry_params
+ {
+- unsigned long p_inum;
+- void *p_ptr;
+- u32 magic;
++ unsigned long ldp_inum;
++ long ldp_flags;
++ u32 ldp_magic;
+ };
+
++/* Only use the least 3 bits of ldp_flags for goal policy */
++typedef enum {
++ DP_GOAL_POLICY = 0,
++ DP_LASTGROUP_REVERSE = 1,
++} dp_policy_t;
++
++
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
+ static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
+ static inline unsigned dx_get_hash(struct dx_entry *entry);
+@@ -1770,8 +1780,13 @@ static struct inode * ext4_new_inode_wan
+ if (dentry->d_fsdata != NULL) {
+ struct lvfs_dentry_params *param = dentry->d_fsdata;
+
+- if (param->magic == LVFS_DENTRY_PARAM_MAGIC)
+- inum = param->p_inum;
++ if (param->ldp_magic == LVFS_DENTRY_PARAM_MAGIC) {
++ if ((dp_policy_t)(param->ldp_flags & 0x7) ==
++ DP_LASTGROUP_REVERSE)
++ inum = ext4_find_reverse(dir->i_sb);
++ else /* DP_GOAL_POLICY */
++ inum = param->ldp_inum;
++ }
+ }
+ return ext4_new_inode(handle, dir, mode, inum);
+ }
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4.h
+@@ -1089,6 +1089,7 @@ extern int ext4fs_dirhash(const char *na
+ /* ialloc.c */
+ extern struct inode * ext4_new_inode(handle_t *, struct inode *, int,
+ unsigned long);
++extern unsigned long ext4_find_reverse(struct super_block *);
+ extern void ext4_free_inode(handle_t *, struct inode *);
+ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
+ extern unsigned long ext4_count_free_inodes(struct super_block *);
--- /dev/null
+Index: linux-2.6.27.21-0.1/fs/ext4/super.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/super.c
++++ linux-2.6.27.21-0.1/fs/ext4/super.c
+@@ -74,6 +74,8 @@ static void ext4_write_super_lockfs(stru
+
+ struct proc_dir_entry *proc_root_ext4;
+
++static int bigendian_extents;
++
+ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *bg)
+ {
+@@ -1291,7 +1293,7 @@ enum {
+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
+ Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
+ Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+- Opt_inode_readahead_blks,
++ Opt_inode_readahead_blks, Opt_bigendian_extents,
+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+ };
+
+@@ -1353,6 +1355,7 @@ static const match_table_t tokens = {
+ {Opt_delalloc, "delalloc"},
+ {Opt_nodelalloc, "nodelalloc"},
+ {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
++ {Opt_bigendian_extents, "bigendian_extents"},
+ {Opt_err, NULL},
+ };
+
+@@ -1768,6 +1771,9 @@ set_qf_format:
+ return 0;
+ sbi->s_inode_readahead_blks = option;
+ break;
++ case Opt_bigendian_extents:
++ bigendian_extents = 1;
++ break;
+ default:
+ printk(KERN_ERR
+ "EXT4-fs: Unrecognized mount option \"%s\" "
+@@ -2673,6 +2679,15 @@ static int ext4_fill_super(struct super_
+ &sbi->s_inode_readahead_blks);
+ #endif
+
++#ifdef __BIG_ENDIAN
++ if (bigendian_extents == 0) {
++ printk(KERN_ERR "EXT4-fs: extents feature is not guaranteed to "
++ "work on big-endian systems. Use \"bigendian_extents\" "
++ "mount option to override.\n");
++ goto failed_mount;
++ }
++#endif
++
+ bgl_lock_init(&sbi->s_blockgroup_lock);
+
+ sbi->s_last_alloc_group = -1;
--- /dev/null
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4_extents.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_extents.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4_extents.h
+@@ -203,6 +203,11 @@ static inline unsigned short ext_depth(s
+ return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
+ }
+
++static inline void ext4_ext_tree_changed(struct inode *inode)
++{
++ EXT4_I(inode)->i_ext_generation++;
++}
++
+ static inline void
+ ext4_ext_invalidate_cache(struct inode *inode)
+ {
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4_i.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_i.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4_i.h
+@@ -114,6 +114,7 @@ struct ext4_inode_info {
+ struct inode vfs_inode;
+ struct jbd2_inode jinode;
+
++ unsigned long i_ext_generation;
+ struct ext4_ext_cache i_cached_extent;
+ /*
+ * File creation time. Its function is same as that of
+Index: linux-2.6.27.21-0.1/fs/ext4/extents.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/extents.c
++++ linux-2.6.27.21-0.1/fs/ext4/extents.c
+@@ -1618,6 +1618,7 @@ cleanup:
+ ext4_ext_drop_refs(npath);
+ kfree(npath);
+ }
++ ext4_ext_tree_changed(inode);
+ ext4_ext_invalidate_cache(inode);
+ return err;
+ }
+@@ -2278,6 +2279,7 @@ static int ext4_ext_remove_space(struct
+ }
+ }
+ out:
++ ext4_ext_tree_changed(inode);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ ext4_journal_stop(handle);
--- /dev/null
+A large part of this code is from the generic VFS code in fs/ioctl.c in the
+upstream kernel.
+
+Index: linux-2.6.27.21-0.1/fs/ext4/ioctl.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ioctl.c
++++ linux-2.6.27.21-0.1/fs/ext4/ioctl.c
+@@ -18,6 +18,162 @@
+ #include "ext4_jbd2.h"
+ #include "ext4.h"
+
++#include "fiemap.h"
++
++/* So that the fiemap access checks can't overflow on 32 bit machines. */
++#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent))
++
++/**
++ * fiemap_fill_next_extent - Fiemap helper function
++ * @fieinfo: Fiemap context passed into ->fiemap
++ * @logical: Extent logical start offset, in bytes
++ * @phys: Extent physical start offset, in bytes
++ * @len: Extent length, in bytes
++ * @flags: FIEMAP_EXTENT flags that describe this extent
++ * @dev: Device on which this extent resides
++ *
++ * Called from file system ->fiemap callback. Will populate extent
++ * info as passed in via arguments and copy to user memory. On
++ * success, extent count on fieinfo is incremented.
++ *
++ * Returns 0 on success, -errno on error, 1 if this was the last
++ * extent that will fit in user array.
++ */
++#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC)
++#define SET_NO_DIRECT_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED \
++ |FIEMAP_EXTENT_NET)
++#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED)
++#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
++int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
++ u64 phys, u64 len, u32 flags, dev_t dev)
++{
++ struct fiemap_extent extent = { 0 };
++ struct fiemap_extent *dest = fieinfo->fi_extents_start;
++
++ /* only count the extents */
++ if (fieinfo->fi_extents_max == 0) {
++ fieinfo->fi_extents_mapped++;
++ return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
++ }
++
++ if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
++ return 1;
++
++ if (flags & SET_UNKNOWN_FLAGS)
++ flags |= FIEMAP_EXTENT_UNKNOWN;
++ if (flags & SET_NO_DIRECT_FLAGS)
++ flags |= FIEMAP_EXTENT_NO_DIRECT;
++ if (flags & SET_NOT_ALIGNED_FLAGS)
++ flags |= FIEMAP_EXTENT_NOT_ALIGNED;
++ if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
++ flags |= FIEMAP_EXTENT_ENCODED;
++
++ extent.fe_logical = logical;
++ extent.fe_physical = phys;
++ extent.fe_length = len;
++ extent.fe_flags = flags;
++ extent.fe_device = new_encode_dev(dev);
++
++ dest += fieinfo->fi_extents_mapped;
++ if (copy_to_user(dest, &extent, sizeof(extent)))
++ return -EFAULT;
++
++ fieinfo->fi_extents_mapped++;
++ if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max)
++ return 1;
++
++ return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
++}
++
++static int fiemap_check_ranges(struct super_block *sb,
++ u64 start, u64 len, u64 *new_len)
++{
++ *new_len = len;
++
++ if (len == 0)
++ return -EINVAL;
++
++ if (start > sb->s_maxbytes)
++ return -EFBIG;
++
++ /*
++ * Shrink request scope to what the fs can actually handle.
++ */
++ if ((len > sb->s_maxbytes) ||
++ (sb->s_maxbytes - len) < start)
++ *new_len = sb->s_maxbytes - start;
++
++ return 0;
++}
++
++/*
++ * fiemap_check_flags - check validity of requested flags for fiemap
++ * @fieinfo: Fiemap context passed into ->fiemap
++ * @fs_flags: Set of fiemap flags that the file system understands
++ *
++ * Called from file system ->fiemap callback. This will compute the
++ * intersection of valid fiemap flags and those that the fs supports. That
++ * value is then compared against the user supplied flags. In case of bad user
++ * flags, the invalid values will be written into the fieinfo structure, and
++ * -EBADR is returned, which tells ioctl_fiemap() to return those values to
++ * userspace. For this reason, a return code of -EBADR should be preserved.
++ *
++ * Returns 0 on success, -EBADR on bad flags.
++ */
++int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags)
++{
++ u32 incompat_flags;
++
++ incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags);
++ if (incompat_flags) {
++ fieinfo->fi_flags = incompat_flags;
++ return -EBADR;
++ }
++
++ return 0;
++}
++
++int ioctl_fiemap(struct inode *inode, struct file *filp, unsigned long arg)
++{
++ struct fiemap fiemap;
++ u64 len;
++ struct fiemap_extent_info fieinfo = {0, };
++ struct super_block *sb = inode->i_sb;
++ int error = 0;
++
++ if (copy_from_user(&fiemap, (struct fiemap __user *) arg,
++ sizeof(struct fiemap)))
++ return -EFAULT;
++
++ if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
++ return -EINVAL;
++
++ error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
++ &len);
++ if (error)
++ return error;
++
++ fieinfo.fi_flags = fiemap.fm_flags;
++ fieinfo.fi_extents_max = fiemap.fm_extent_count;
++ fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap));
++
++ if (fiemap.fm_extent_count != 0 &&
++ !access_ok(VERIFY_WRITE, (void *)arg,
++ offsetof(typeof(fiemap), fm_extents[fiemap.fm_extent_count])))
++ return -EFAULT;
++
++ if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
++ filemap_write_and_wait(inode->i_mapping);
++
++ error = ext4_fiemap(inode, &fieinfo, fiemap.fm_start, len);
++ fiemap.fm_flags = fieinfo.fi_flags;
++ fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
++ if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap)))
++ error = -EFAULT;
++
++ return error;
++}
++
+ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+ {
+ struct inode *inode = filp->f_dentry->d_inode;
+@@ -263,6 +419,10 @@ setversion_out:
+ return err;
+ }
+
++ case EXT4_IOC_FIEMAP: {
++ return ioctl_fiemap(inode, filp, arg);
++ }
++
+ default:
+ return -ENOTTY;
+ }
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4.h
+@@ -302,7 +302,8 @@ struct ext4_new_group_data {
+ #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
+ #define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
+ #define EXT4_IOC_MIGRATE _IO('f', 9)
+- /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
++#define EXT4_IOC_FIEMAP _IOWR('f', 11, struct fiemap)
++
+
+ /*
+ * ioctl commands in 32 bit emulation
+@@ -320,6 +321,8 @@ struct ext4_new_group_data {
+ #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
+ #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
+
++/* FIEMAP flags supported by ext4 */
++#define EXT4_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC)
+
+ /*
+ * Mount options
+@@ -1130,6 +1133,9 @@ extern int ext4_page_mkwrite(struct vm_a
+ /* ioctl.c */
+ extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
+ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
++struct fiemap_extent_info;
++extern int ext4_fiemap(struct inode *, struct fiemap_extent_info *, __u64,
++ __u64);
+
+ /* migrate.c */
+ extern int ext4_ext_migrate(struct inode *);
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4_extents.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_extents.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4_extents.h
+@@ -128,6 +128,22 @@ struct ext4_ext_path {
+ #define EXT_MAX_BLOCK 0xffffffff
+
+ /*
++ * to be called by ext4_ext_walk_space()
++ * negative retcode - error
++ * positive retcode - signal for ext4_ext_walk_space(), see below
++ * callback must return valid extent (passed or newly created)
++ */
++typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
++ struct ext4_ext_cache *,
++ struct ext4_extent *, void *);
++
++#define HAVE_EXT_PREPARE_CB_EXTENT
++
++#define EXT_CONTINUE 0
++#define EXT_BREAK 1
++#define EXT_REPEAT 2
++
++/*
+ * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
+ * initialized extent. This is 2^15 and not (2^16 - 1), since we use the
+ * MSB of ee_len field in the extent datastructure to signify if this
+@@ -219,6 +235,8 @@ extern int ext4_ext_try_to_merge(struct
+ struct ext4_extent *);
+ extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
+ extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
++extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
++ ext_prepare_callback, void *);
+ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
+ struct ext4_ext_path *);
+ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
+Index: linux-2.6.27.21-0.1/fs/ext4/extents.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/extents.c
++++ linux-2.6.27.21-0.1/fs/ext4/extents.c
+@@ -42,7 +42,7 @@
+ #include <asm/uaccess.h>
+ #include "ext4_jbd2.h"
+ #include "ext4_extents.h"
+-
++#include "fiemap.h"
+
+ /*
+ * ext_pblock:
+@@ -1622,6 +1622,113 @@ cleanup:
+ return err;
+ }
+
++int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
++ ext4_lblk_t num, ext_prepare_callback func,
++ void *cbdata)
++{
++ struct ext4_ext_path *path = NULL;
++ struct ext4_ext_cache cbex;
++ struct ext4_extent *ex;
++ ext4_lblk_t next, start = 0, end = 0;
++ ext4_lblk_t last = block + num;
++ int depth, exists, err = 0;
++
++ BUG_ON(func == NULL);
++ BUG_ON(inode == NULL);
++
++ while (block < last && block != EXT_MAX_BLOCK) {
++ num = last - block;
++ /* find extent for this block */
++ path = ext4_ext_find_extent(inode, block, path);
++ if (IS_ERR(path)) {
++ err = PTR_ERR(path);
++ path = NULL;
++ break;
++ }
++
++ depth = ext_depth(inode);
++ BUG_ON(path[depth].p_hdr == NULL);
++ ex = path[depth].p_ext;
++ next = ext4_ext_next_allocated_block(path);
++
++ exists = 0;
++ if (!ex) {
++ /* there is no extent yet, so try to allocate
++ * all requested space */
++ start = block;
++ end = block + num;
++ } else if (le32_to_cpu(ex->ee_block) > block) {
++ /* need to allocate space before found extent */
++ start = block;
++ end = le32_to_cpu(ex->ee_block);
++ if (block + num < end)
++ end = block + num;
++ } else if (block >= le32_to_cpu(ex->ee_block)
++ + ext4_ext_get_actual_len(ex)) {
++ /* need to allocate space after found extent */
++ start = block;
++ end = block + num;
++ if (end >= next)
++ end = next;
++ } else if (block >= le32_to_cpu(ex->ee_block)) {
++ /*
++ * some part of requested space is covered
++ * by found extent
++ */
++ start = block;
++ end = le32_to_cpu(ex->ee_block)
++ + ext4_ext_get_actual_len(ex);
++ if (block + num < end)
++ end = block + num;
++ exists = 1;
++ } else {
++ BUG();
++ }
++ BUG_ON(end <= start);
++
++ if (!exists) {
++ cbex.ec_block = start;
++ cbex.ec_len = end - start;
++ cbex.ec_start = 0;
++ cbex.ec_type = EXT4_EXT_CACHE_GAP;
++ } else {
++ cbex.ec_block = le32_to_cpu(ex->ee_block);
++ cbex.ec_len = ext4_ext_get_actual_len(ex);
++ cbex.ec_start = ext_pblock(ex);
++ cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
++ }
++
++ BUG_ON(cbex.ec_len == 0);
++ err = func(inode, path, &cbex, ex, cbdata);
++ ext4_ext_drop_refs(path);
++
++ if (err < 0)
++ break;
++
++ if (err == EXT_REPEAT)
++ continue;
++ else if (err == EXT_BREAK) {
++ err = 0;
++ break;
++ }
++
++ if (ext_depth(inode) != depth) {
++ /* depth was changed. we have to realloc path */
++ kfree(path);
++ path = NULL;
++ }
++
++ block = cbex.ec_block + cbex.ec_len;
++ }
++
++ if (path) {
++ ext4_ext_drop_refs(path);
++ kfree(path);
++ }
++
++ return err;
++}
++
+ static void
+ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
+ __u32 len, ext4_fsblk_t start, int type)
+@@ -2966,3 +3073,100 @@ retry:
+ mutex_unlock(&inode->i_mutex);
+ return ret > 0 ? ret2 : ret;
+ }
++
++/*
++ * Callback function called for each extent to gather FIEMAP information.
++ *
++ * A hole is reported (as DELALLOC) only when the first buffer of the cached
++ * page at its start is delayed. Returns EXT_CONTINUE or EXT_BREAK, or the
++ * negative error returned by fiemap_fill_next_extent().
++ */
++int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
++		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
++		       void *data)
++{
++	struct fiemap_extent_info *fieinfo = data;
++	unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
++	__u64 logical;
++	__u64 physical;
++	__u64 length;
++	__u32 flags = 0;
++	int error;
++
++	logical = (__u64)newex->ec_block << blksize_bits;
++
++	if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
++		pgoff_t offset;
++		struct page *page;
++		int delayed = 0;
++
++		offset = logical >> PAGE_SHIFT;
++		page = find_get_page(inode->i_mapping, offset);
++		if (!page)
++			return EXT_CONTINUE;
++
++		/* find_get_page() took a reference: drop it on every path */
++		if (page_has_buffers(page))
++			delayed = buffer_delay(page_buffers(page));
++		page_cache_release(page);
++
++		if (!delayed)
++			return EXT_CONTINUE;
++		flags |= FIEMAP_EXTENT_DELALLOC;
++	}
++
++	physical = (__u64)newex->ec_start << blksize_bits;
++	length = (__u64)newex->ec_len << blksize_bits;
++
++	if (ex && ext4_ext_is_uninitialized(ex))
++		flags |= FIEMAP_EXTENT_UNWRITTEN;
++
++	/*
++	 * If this extent reaches EXT_MAX_BLOCK, it must be last.
++	 *
++	 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
++	 * this indicates no more allocated blocks.
++	 *
++	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
++	 */
++	if (logical + length - 1 == EXT_MAX_BLOCK ||
++	    ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
++		flags |= FIEMAP_EXTENT_LAST;
++
++	error = fiemap_fill_next_extent(fieinfo, logical, physical,
++					length, flags, inode->i_sb->s_dev);
++	if (error < 0)
++		return error;
++	if (error == 1)
++		return EXT_BREAK;
++
++	return EXT_CONTINUE;
++}
++
++int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
++ __u64 start, __u64 len)
++{
++ ext4_fsblk_t start_blk;
++ ext4_fsblk_t len_blks;
++ int error = 0;
++
++ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++ return -EOPNOTSUPP;
++
++ if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS_COMPAT))
++ return -EBADR;
++
++ start_blk = start >> inode->i_sb->s_blocksize_bits;
++ len_blks = (len + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits;
++
++ /*
++ * Walk the extent tree gathering extent information.
++ * ext4_ext_fiemap_cb will push extents back to user.
++ */
++ down_write(&EXT4_I(inode)->i_data_sem);
++ error = ext4_ext_walk_space(inode, start_blk, len_blks,
++ ext4_ext_fiemap_cb, fieinfo);
++ up_write(&EXT4_I(inode)->i_data_sem);
++
++ return error;
++}
+Index: linux-2.6.27.21-0.1/fs/ext4/fiemap.h
+===================================================================
+--- /dev/null
++++ linux-2.6.27.21-0.1/fs/ext4/fiemap.h
+@@ -0,0 +1,85 @@
++/*
++ * FIEMAP ioctl infrastructure.
++ *
++ * Copyright 2008 Sun Microsystems, Inc
++ *
++ * Author: Kalpak Shah <kalpak.shah@sun.com>
++ * Andreas Dilger <adilger@sun.com>
++ */
++
++#ifndef _LINUX_EXT4_FIEMAP_H
++#define _LINUX_EXT4_FIEMAP_H
++
++struct fiemap_extent {
++ __u64 fe_logical; /* logical offset in bytes for the start of
++ * the extent from the beginning of the file */
++ __u64 fe_physical; /* physical offset in bytes for the start
++ * of the extent from the beginning of the disk */
++ __u64 fe_length; /* length in bytes for this extent */
++ __u64 fe_reserved64[2];
++ __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */
++ __u32 fe_device; /* device number for this extent */
++ __u32 fe_reserved[2];
++};
++
++struct fiemap {
++ __u64 fm_start; /* logical offset (inclusive) at
++ * which to start mapping (in) */
++ __u64 fm_length; /* logical length of mapping which
++ * userspace wants (in) */
++ __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */
++ __u32 fm_mapped_extents;/* number of extents that were mapped (out) */
++ __u32 fm_extent_count; /* size of fm_extents array (in) */
++ __u32 fm_reserved;
++ struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
++};
++
++/*
++ * FIEMAP helper definition.
++ */
++struct fiemap_extent_info {
++ unsigned int fi_flags; /* Flags as passed from user */
++ unsigned int fi_extents_mapped; /* Number of mapped extents */
++ unsigned int fi_extents_max; /* Size of fiemap_extent array*/
++ struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */
++};
++
++int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
++int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
++ u64 phys, u64 len, u32 flags, u32 lun);
++
++#define FIEMAP_MAX_OFFSET (~0ULL)
++
++#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */
++#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */
++
++/* ldiskfs only supports FLAG_SYNC flag currently */
++#define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)
++
++#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */
++#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */
++#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending.
++ * Sets EXTENT_UNKNOWN. */
++#define FIEMAP_EXTENT_ENCODED 0x00000008 /* Data can not be read
++ * while fs is unmounted */
++#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs.
++ * Sets EXTENT_NO_DIRECT. */
++#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be
++ * block aligned. */
++#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata.
++ * Sets EXTENT_NOT_ALIGNED.*/
++#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block.
++ * Sets EXTENT_NOT_ALIGNED.*/
++#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but
++ * no data (i.e. zero). */
++#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively
++ * support extents. Result
++ * merged for efficiency. */
++
++/* Lustre specific flags - use a high bit, don't conflict with upstream flag */
++#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */
++#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely.
++ * Sets NO_DIRECT flag */
++
++#endif /* _LINUX_EXT4_FIEMAP_H */
++
--- /dev/null
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4_i.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_i.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4_i.h
+@@ -135,6 +135,8 @@ struct ext4_inode_info {
+ __u16 i_extra_isize;
+
+ spinlock_t i_block_reservation_lock;
++
++ void *i_filterdata;
+ };
+
+ #endif /* _EXT4_I */
+Index: linux-2.6.27.21-0.1/fs/ext4/super.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/super.c
++++ linux-2.6.27.21-0.1/fs/ext4/super.c
+@@ -624,6 +624,7 @@ static struct inode *ext4_alloc_inode(st
+ memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
+ INIT_LIST_HEAD(&ei->i_prealloc_list);
+ spin_lock_init(&ei->i_prealloc_lock);
++ ei->i_filterdata = NULL;
+ jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
+ ei->i_reserved_data_blocks = 0;
+ ei->i_reserved_meta_blocks = 0;
--- /dev/null
+Index: linux-2.6.27.21-0.1/fs/ext4/ialloc.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ialloc.c
++++ linux-2.6.27.21-0.1/fs/ext4/ialloc.c
+@@ -535,12 +535,16 @@ fallback:
+ }
+
+ static int find_group_other(struct super_block *sb, struct inode *parent,
+- ext4_group_t *group)
++ ext4_group_t *group, int mode)
+ {
++ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+- ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
++ ext4_group_t ngroups = sbi->s_groups_count;
+ struct ext4_group_desc *desc;
+ ext4_group_t i;
++ int best_group = -1;
++ ext4_fsblk_t avefreeb, freeb;
++ int best_group_freeb = 0;
+
+ /*
+ * Try to place the inode in its parent directory
+@@ -548,8 +552,10 @@ static int find_group_other(struct super
+ *group = parent_group;
+ desc = ext4_get_group_desc(sb, *group, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc) &&
+- ext4_free_blks_count(sb, desc))
++ (!S_ISREG(mode) || ext4_free_blks_count(sb, desc)))
+ return 0;
++ avefreeb = ext4_free_blocks_count(sbi->s_es);
++ do_div(avefreeb, ngroups);
+
+ /*
+ * We're going to place this inode in a different blockgroup from its
+@@ -563,33 +569,49 @@ static int find_group_other(struct super
+ *group = (*group + parent->i_ino) % ngroups;
+
+ /*
+- * Use a quadratic hash to find a group with a free inode and some free
+- * blocks.
++ * Use a quadratic hash to find a group with a free inode and, for
++ * regular files, at least the average number of free blocks.
+ */
+ for (i = 1; i < ngroups; i <<= 1) {
+ *group += i;
+ if (*group >= ngroups)
+ *group -= ngroups;
+ desc = ext4_get_group_desc(sb, *group, NULL);
+- if (desc && ext4_free_inodes_count(sb, desc) &&
+- ext4_free_blks_count(sb, desc))
++ if (!desc || !ext4_free_inodes_count(sb, desc))
++ continue;
++ if (!S_ISREG(mode))
++ return 0;
++ if (ext4_free_blks_count(sb, desc) >= avefreeb)
+ return 0;
+ }
+
+ /*
+- * That failed: try linear search for a free inode, even if that group
+- * has no free blocks.
++ * That failed: start after the last group used to allocate an inode
++ * and search linearly for a group with a free inode, preferring the
++ * most free blocks (a group with no free blocks is the last resort).
+ */
+- *group = parent_group;
++ *group = sbi->s_last_alloc_group;
++ if (*group == -1)
++ *group = parent_group;
++
+ for (i = 0; i < ngroups; i++) {
+ if (++*group >= ngroups)
+ *group = 0;
+ desc = ext4_get_group_desc(sb, *group, NULL);
+- if (desc && ext4_free_inodes_count(sb, desc))
+- return 0;
++ if (!desc || !ext4_free_inodes_count(sb, desc))
++ continue;
++ freeb = ext4_free_blks_count(sb, desc);
++ if (best_group < 0 || freeb > best_group_freeb) {
++ best_group_freeb = freeb;
++ best_group = *group;
++ if (freeb >= avefreeb || !S_ISREG(mode))
++ break;
++ }
+ }
+
+- return -1;
++ if (best_group >= 0) /* otherwise keep the hint and report failure */
++ sbi->s_last_alloc_group = *group = best_group;
++ return best_group >= 0 ? 0 : -1;
+ }
+
+ /*
+@@ -755,7 +777,7 @@ continue_allocation:
+ else
+ ret2 = find_group_orlov(sb, dir, &group);
+ } else
+- ret2 = find_group_other(sb, dir, &group);
++ ret2 = find_group_other(sb, dir, &group, mode);
+
+ got_group:
+ err = -ENOSPC;
+Index: linux-2.6.27.21-0.1/fs/ext4/super.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/super.c
++++ linux-2.6.27.21-0.1/fs/ext4/super.c
+@@ -2300,6 +2300,7 @@ static int ext4_fill_super(struct super_
+
+ bgl_lock_init(&sbi->s_blockgroup_lock);
+
++ sbi->s_last_alloc_group = -1;
+ for (i = 0; i < db_count; i++) {
+ block = descriptor_loc(sb, logical_sb_block, i);
+ sbi->s_group_desc[i] = sb_bread(sb, block);
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4_sb.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_sb.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4_sb.h
+@@ -64,6 +64,8 @@ struct ext4_sb_info {
+ struct percpu_counter s_dirtyblocks_counter;
+ struct blockgroup_lock s_blockgroup_lock;
+ struct proc_dir_entry *s_proc;
++ /* Last group used to allocate inode */
++ ext4_group_t s_last_alloc_group;
+
+ /* root of the per fs reservation window tree */
+ spinlock_t s_rsv_window_lock;
--- /dev/null
+Index: linux-2.6.18.i386/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ext4.h
++++ linux-2.6.18.i386/fs/ext4/ext4.h
+@@ -541,12 +541,13 @@ do { \
+ #define EXT4_MOUNT_IOPEN 0x8000000 /* Allow access via iopen */
+ #define EXT4_MOUNT_IOPEN_NOPRIV 0x10000000 /* Make iopen world-readable */
+ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
+-#ifndef _LINUX_EXT2_FS_H
++#ifndef clear_opt
+ #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
+ #define set_opt(o, opt) o |= EXT4_MOUNT_##opt
+ #define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \
+ EXT4_MOUNT_##opt)
+-#else
++#endif
++#ifndef EXT2_MOUNT_NOLOAD
+ #define EXT2_MOUNT_NOLOAD EXT4_MOUNT_NOLOAD
+ #define EXT2_MOUNT_ABORT EXT4_MOUNT_ABORT
+ #define EXT2_MOUNT_DATA_FLAGS EXT4_MOUNT_DATA_FLAGS
--- /dev/null
+Index: linux-2.6.18.i386/fs/ext4/iopen.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/iopen.c
++++ linux-2.6.18.i386/fs/ext4/iopen.c
+@@ -91,9 +91,12 @@ static struct dentry *iopen_lookup(struc
+ assert(!(alternate->d_flags & DCACHE_DISCONNECTED));
+ }
+
+- if (!list_empty(&inode->i_dentry)) {
+- alternate = list_entry(inode->i_dentry.next,
+- struct dentry, d_alias);
++ list_for_each(lp, &inode->i_dentry) {
++ alternate = list_entry(lp, struct dentry, d_alias);
++ /* ignore dentries created for ".." to preserve
++ * proper dcache hierarchy -- bug 10458 */
++ if (alternate->d_flags & DCACHE_NFSFS_RENAMED)
++ continue;
+ dget_locked(alternate);
+ spin_lock(&alternate->d_lock);
+ alternate->d_flags |= DCACHE_REFERENCED;
+Index: linux-2.6.18.i386/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/namei.c
++++ linux-2.6.18.i386/fs/ext4/namei.c
+@@ -1067,6 +1067,38 @@ static struct dentry *ext4_lookup(struct
+ return ERR_CAST(inode);
+ }
+
++ /* ".." shouldn't go into dcache to preserve dcache hierarchy
++ * otherwise we'll get parent being a child of actual child.
++ * see bug 10458 for details -bzzz */
++ if (inode && (dentry->d_name.name[0] == '.' && (dentry->d_name.len == 1 ||
++ (dentry->d_name.len == 2 && dentry->d_name.name[1] == '.')))) {
++ struct dentry *tmp, *goal = NULL;
++ struct list_head *lp;
++
++ /* first, look for an existing dentry - any one is good */
++ spin_lock(&dcache_lock);
++ list_for_each(lp, &inode->i_dentry) {
++ tmp = list_entry(lp, struct dentry, d_alias);
++ goal = tmp;
++ dget_locked(goal);
++ break;
++ }
++ if (goal == NULL) {
++ /* there is no alias, we need to make current dentry:
++ * a) inaccessible for __d_lookup()
++ * b) inaccessible for iopen */
++ J_ASSERT(list_empty(&dentry->d_alias));
++ dentry->d_flags |= DCACHE_NFSFS_RENAMED;
++ /* this is d_instantiate() ... */
++ list_add(&dentry->d_alias, &inode->i_dentry);
++ dentry->d_inode = inode;
++ }
++ spin_unlock(&dcache_lock);
++ if (goal)
++ iput(inode);
++ return goal;
++ }
++
+ return iopen_connect_dentry(dentry, inode, 1);
+ }
+
--- /dev/null
+Index: linux-2.6.18.i386/fs/ext4/inode.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/inode.c
++++ linux-2.6.18.i386/fs/ext4/inode.c
+@@ -3666,3 +3666,66 @@ out_unlock:
+ unlock_page(page);
+ return ret;
+ }
++
++/*
++ * Store the disk block behind every block of @page in blocks[].  With
++ * @create set, still-unmapped blocks are allocated under one journal handle.
++ * created[] (may be NULL): -1 left unmapped, 0 already mapped, 1 allocated.
++ * Returns 0 on success or a negative error code.
++ */
++int ext4_map_inode_page(struct inode *inode, struct page *page,
++			unsigned long *blocks, int *created, int create)
++{
++	unsigned int blocks_per_page;
++	unsigned long iblock;
++	struct buffer_head dummy;
++	void *handle;
++	int i, rc = 0, failed = 0, needed_blocks;
++
++	blocks_per_page = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
++	iblock = page->index * blocks_per_page;
++	for (i = 0; i < blocks_per_page; i++, iblock++) {
++		blocks[i] = ext4_bmap(inode->i_mapping, iblock);
++		if (blocks[i] == 0)
++			failed++;
++		if (created)
++			created[i] = blocks[i] ? 0 : -1;
++	}
++
++	if (failed == 0 || create == 0)
++		return 0;
++
++	needed_blocks = ext4_writepage_trans_blocks(inode);
++	handle = ext4_journal_start(inode, needed_blocks);
++	if (IS_ERR(handle))
++		return PTR_ERR(handle);
++
++	iblock = page->index * blocks_per_page;
++	for (i = 0; i < blocks_per_page; i++, iblock++) {
++		if (blocks[i] != 0)
++			continue;
++		/* dummy is on the stack: start from clean state bits */
++		dummy.b_state = 0;
++		rc = ext4_get_blocks_handle(handle, inode, iblock, 1, &dummy, 1, 1);
++		if (rc < 0) {
++			printk(KERN_INFO "ext4_map_inode_page: error reading "
++			       "block %lu\n", iblock);
++			goto out;
++		}
++		WARN_ON(rc > 1);
++		rc = 0;
++		/* Unmap any metadata buffers from the block mapping, to avoid
++		 * data corruption due to direct-write from Lustre being
++		 * clobbered by a later flush of the blockdev metadata buffer.*/
++		if (buffer_new(&dummy))
++			unmap_underlying_metadata(dummy.b_bdev,
++						  dummy.b_blocknr);
++		blocks[i] = dummy.b_blocknr;
++		if (created)
++			created[i] = 1;
++	}
++
++out:
++	ext4_journal_stop(handle);
++	return rc;
++}
+Index: linux-2.6.18.i386/fs/ext4/super.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/super.c
++++ linux-2.6.18.i386/fs/ext4/super.c
+@@ -3498,6 +3498,10 @@ static void __exit exit_ext4_fs(void)
+ __free_page(ext4_zero_page);
+ }
+
++int ext4_map_inode_page(struct inode *inode, struct page *page,
++ unsigned long *blocks, int *created, int create);
++EXPORT_SYMBOL(ext4_map_inode_page);
++
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
+ MODULE_LICENSE("GPL");
--- /dev/null
+Index: linux-2.6.27.21-0.1/fs/ext4/ialloc.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ialloc.c
++++ linux-2.6.27.21-0.1/fs/ext4/ialloc.c
+@@ -721,12 +721,15 @@ struct inode *ext4_new_inode(handle_t *h
+ return ERR_PTR(-EPERM);
+
+ sb = dir->i_sb;
++ sbi = EXT4_SB(sb);
++ if (sbi->s_max_dir_size > 0 && i_size_read(dir) >= sbi->s_max_dir_size)
++ return ERR_PTR(-EFBIG);
++
+ inode = new_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ ei = EXT4_I(inode);
+
+- sbi = EXT4_SB(sb);
+ es = sbi->s_es;
+
+ if (goal) {
+Index: linux-2.6.27.21-0.1/fs/ext4/super.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/super.c
++++ linux-2.6.27.21-0.1/fs/ext4/super.c
+@@ -41,6 +41,7 @@
+ #include <asm/uaccess.h>
+ #include <linux/kthread.h>
+ #include <linux/utsname.h>
++#include <linux/proc_fs.h>
+
+ #include "ext4.h"
+ #include "ext4_jbd2.h"
+@@ -71,6 +72,8 @@ static void ext4_write_super(struct supe
+ static void ext4_write_super_lockfs(struct super_block *sb);
+
+
++struct proc_dir_entry *proc_root_ext4;
++
+ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *bg)
+ {
+@@ -602,6 +605,9 @@ static void ext4_put_super(struct super_
+ }
+ if (sbi->s_mmp_tsk)
+ kthread_stop(sbi->s_mmp_tsk);
++
++ remove_proc_entry(EXT4_MAX_DIR_SIZE_NAME, sbi->s_proc);
++
+ sb->s_fs_info = NULL;
+ kfree(sbi);
+ return;
+@@ -2287,6 +2293,46 @@ static unsigned long ext4_get_stripe_siz
+ return 0;
+ }
+
++static int ext4_max_dir_size_read(char *page, char **start, off_t off,
++				  int count, int *eof, void *data)
++{
++	struct ext4_sb_info *sbi = data;
++
++	/* the whole value is produced at offset 0; always flag EOF */
++	*eof = 1;
++	if (off)
++		return 0;
++
++	/* hand the buffer back via *start and report the formatted length */
++	*start = page;
++	return sprintf(page, "%lu\n", sbi->s_max_dir_size);
++}
++
++/* /proc write handler: parse a non-negative number into s_max_dir_size */
++static int ext4_max_dir_size_write(struct file *file, const char *buffer,
++				   unsigned long count, void *data)
++{
++	struct ext4_sb_info *sbi = data;
++	char str[32];
++	long value;	/* signed, so the range check below can fire */
++
++	if (count >= sizeof(str)) {
++		printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
++		       EXT4_MAX_DIR_SIZE_NAME, (int)sizeof(str));
++		return -EOVERFLOW;
++	}
++
++	if (copy_from_user(str, buffer, count))
++		return -EFAULT;
++	str[count] = '\0';	/* count < sizeof(str), checked above */
++
++	value = simple_strtol(str, NULL, 0);
++	if (value < 0)
++		return -ERANGE;
++	sbi->s_max_dir_size = value;
++	return count;
++}
++
+ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+ __releases(kernel_lock)
+ __acquires(kernel_lock)
+@@ -2311,6 +2357,7 @@ static int ext4_fill_super(struct super_
+ int needs_recovery, has_huge_files;
+ int features;
+ __u64 blocks_count;
++ struct proc_dir_entry *proc;
+ int err;
+
+ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+@@ -2881,6 +2928,22 @@ static int ext4_fill_super(struct super_
+ test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
+ "writeback");
+
++ sbi->s_max_dir_size = EXT4_DEFAULT_MAX_DIR_SIZE;
++ proc = create_proc_entry(EXT4_MAX_DIR_SIZE_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, sbi->s_proc);
++ if (proc == NULL) {
++ printk(KERN_ERR "EXT4-fs: unable to create %s\n",
++ EXT4_MAX_DIR_SIZE_NAME);
++ remove_proc_entry(EXT4_MAX_DIR_SIZE_NAME, sbi->s_proc);
++ remove_proc_entry(sbi->s_proc->name, proc_root_ext4);
++ sbi->s_proc = NULL;
++ ret = -ENOMEM;
++ goto failed_mount4;
++ }
++ proc->data = sbi;
++ proc->read_proc = ext4_max_dir_size_read;
++ proc->write_proc = ext4_max_dir_size_write;
++
+ lock_kernel();
+ return 0;
+
+@@ -3254,7 +3317,6 @@ static void ext4_commit_super(struct sup
+ }
+ }
+
+-
+ /*
+ * Have we just finished recovery? If so, and if we are mounting (or
+ * remounting) the filesystem readonly, then we will end up with a
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4_sb.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_sb.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4_sb.h
+@@ -120,6 +120,7 @@ struct ext4_sb_info {
+ /* where last allocation was done - for stream allocation */
+ unsigned long s_mb_last_group;
+ unsigned long s_mb_last_start;
++ unsigned long s_max_dir_size;
+
+ /* history to debug policy */
+ struct ext4_mb_history *s_mb_history;
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4.h
+@@ -1017,6 +1017,14 @@ struct mmp_struct {
+ */
+ #define EXT4_MMP_MIN_CHECK_INTERVAL 5
+
++extern struct proc_dir_entry *proc_root_ext4;
++
++/*
++ * max directory size tunable
++ */
++#define EXT4_DEFAULT_MAX_DIR_SIZE 0
++#define EXT4_MAX_DIR_SIZE_NAME "max_dir_size"
++
+ /*
+ * Function prototypes
+ */
+Index: linux-2.6.27.21-0.1/fs/ext4/mballoc.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/mballoc.c
++++ linux-2.6.27.21-0.1/fs/ext4/mballoc.c
+@@ -2943,6 +2943,7 @@ err_out:
+ remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
++ remove_proc_entry(EXT4_MAX_DIR_SIZE_NAME, sbi->s_proc);
+ return -ENOMEM;
+ #else
+ return 0;
+@@ -2963,6 +2964,7 @@ static int ext4_mb_destroy_per_dev_proc(
+ remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
++ remove_proc_entry(EXT4_MAX_DIR_SIZE_NAME, sbi->s_proc);
+ #endif
+ return 0;
+ }
--- /dev/null
+Index: linux-2.6.27.21-0.1/fs/ext4/mballoc.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/mballoc.c
++++ linux-2.6.27.21-0.1/fs/ext4/mballoc.c
+@@ -333,7 +333,7 @@
+ static struct kmem_cache *ext4_pspace_cachep;
+ static struct kmem_cache *ext4_ac_cachep;
+ static struct kmem_cache *ext4_free_ext_cachep;
+-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+ ext4_group_t group);
+ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+ ext4_group_t group);
+@@ -672,7 +672,7 @@ static void ext4_mb_mark_free_simple(str
+ }
+ }
+
+-static void ext4_mb_generate_buddy(struct super_block *sb,
++static int ext4_mb_generate_buddy(struct super_block *sb,
+ void *buddy, void *bitmap, ext4_group_t group)
+ {
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+@@ -704,14 +704,13 @@ static void ext4_mb_generate_buddy(struc
+ grp->bb_fragments = fragments;
+
+ if (free != grp->bb_free) {
+- ext4_grp_locked_error(sb, group, __func__,
+- "EXT4-fs: group %u: %u blocks in bitmap, %u in gd\n",
+- group, free, grp->bb_free);
+- /*
+- * If we intent to continue, we consider group descritor
+- * corrupt and update bb_free using bitmap value
+- */
+- grp->bb_free = free;
++ struct ext4_group_desc *gdp;
++ gdp = ext4_get_group_desc (sb, group, NULL);
++ ext4_grp_locked_error(sb, group, __func__,
++ "group %u: %u blocks in bitmap, %u in bb, "
++ "%u in gd\n", group, free, grp->bb_free,
++ ext4_free_blks_count(sb, gdp));
++ return -EIO;
+ }
+
+ clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+@@ -721,6 +720,8 @@ static void ext4_mb_generate_buddy(struc
+ EXT4_SB(sb)->s_mb_buddies_generated++;
+ EXT4_SB(sb)->s_mb_generation_time += period;
+ spin_unlock(&EXT4_SB(sb)->s_bal_lock);
++
++ return 0;
+ }
+
+ /* The buddy information is attached the buddy cache inode
+@@ -850,7 +851,7 @@ static int ext4_mb_init_cache(struct pag
+ first_block = page->index * blocks_per_page;
+ /* init the page */
+ memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
+- for (i = 0; i < blocks_per_page; i++) {
++ for (i = 0; i < blocks_per_page && err == 0; i++) {
+ int group;
+ struct ext4_group_info *grinfo;
+
+@@ -884,7 +885,7 @@ static int ext4_mb_init_cache(struct pag
+ * incore got set to the group block bitmap below
+ */
+ ext4_lock_group(sb, group);
+- ext4_mb_generate_buddy(sb, data, incore, group);
++ err = ext4_mb_generate_buddy(sb, data, incore, group);
+ ext4_unlock_group(sb, group);
+ incore = NULL;
+ } else {
+@@ -898,7 +899,7 @@ static int ext4_mb_init_cache(struct pag
+ memcpy(data, bitmap, blocksize);
+
+ /* mark all preallocated blks used in in-core bitmap */
+- ext4_mb_generate_from_pa(sb, data, group);
++ err = ext4_mb_generate_from_pa(sb, data, group);
+ ext4_mb_generate_from_freelist(sb, data, group);
+ ext4_unlock_group(sb, group);
+
+@@ -908,6 +909,7 @@ static int ext4_mb_init_cache(struct pag
+ incore = data;
+ }
+ }
++ if (likely(err == 0))
+ SetPageUptodate(page);
+
+ out:
+@@ -2217,7 +2219,10 @@ static int ext4_mb_seq_history_show(stru
+ hs->result.fe_start, hs->result.fe_len);
+ seq_printf(seq, "%-5u %-8u %-23s free\n",
+ hs->pid, hs->ino, buf2);
++ } else {
++ seq_printf(seq, "unknown op %d\n", hs->op);
+ }
++
+ return 0;
+ }
+
+@@ -2345,9 +2350,11 @@ static void *ext4_mb_seq_groups_next(str
+ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
+ {
+ struct super_block *sb = seq->private;
++ struct ext4_group_desc *gdp;
+ ext4_group_t group = (ext4_group_t) ((unsigned long) v);
+ int i;
+ int err;
++ unsigned free = 0;
+ struct ext4_buddy e4b;
+ struct sg {
+ struct ext4_group_info info;
+@@ -2356,10 +2363,10 @@ static int ext4_mb_seq_groups_show(struc
+
+ group--;
+ if (group == 0)
+- seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s %-5s %-5s"
+ "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
+ "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
+- "group", "free", "frags", "first",
++ "group", "free", "frags", "first", "first", "pa",
+ "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
+ "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
+
+@@ -2371,12 +2378,18 @@ static int ext4_mb_seq_groups_show(struc
+ seq_printf(seq, "#%-5u: I/O error\n", group);
+ return 0;
+ }
++
++ gdp = ext4_get_group_desc(sb, group, NULL);
++ if (gdp != NULL)
++ free = ext4_free_blks_count(sb, gdp);
++
+ ext4_lock_group(sb, group);
+ memcpy(&sg, ext4_get_group_info(sb, group), i);
+ ext4_unlock_group(sb, group);
+ ext4_mb_release_desc(&e4b);
+
+- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
++ seq_printf(seq, "#%-5u: %-5u %-5u %-5u %-5u [", group,
++ sg.info.bb_free, free,
+ sg.info.bb_fragments, sg.info.bb_first_free);
+ for (i = 0; i <= 13; i++)
+ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
+@@ -2474,6 +2487,7 @@ ext4_mb_store_history(struct ext4_alloca
+ h.tail = ac->ac_tail;
+ h.buddy = ac->ac_buddy;
+ h.merged = 0;
++ h.cr = ac->ac_criteria;
+ if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
+ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
+ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
+@@ -3689,22 +3703,66 @@ static void ext4_mb_generate_from_freeli
+ }
+
+ /*
++ * check free blocks in bitmap match free block in group descriptor
++ * do this before taking preallocated blocks into account to be able
++ * to detect on-disk corruptions
++ */
++int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap,
++				struct ext4_group_desc *gdp, int group)
++{
++	/* sums runs of zero (free) bits; u16 could overflow on big groups */
++	unsigned int max = EXT4_BLOCKS_PER_GROUP(sb);
++	unsigned int i, first, free = 0;
++
++	i = mb_find_next_zero_bit(bitmap, max, 0);
++	while (i < max) {
++		first = i;
++		i = find_next_bit(bitmap, max, i);
++		if (i > max)
++			i = max;
++		free += i - first;
++		if (i < max)
++			i = mb_find_next_zero_bit(bitmap, max, i);
++	}
++
++	if (free != ext4_free_blks_count(sb, gdp)) {
++		ext4_error(sb, __FUNCTION__, "on-disk bitmap for group %d "
++			"corrupted: %u blocks free in bitmap, %u - in gd\n",
++			group, free, ext4_free_blks_count(sb, gdp));
++		return -EIO;
++	}
++	return 0;
++}
++
++/*
+ * the function goes through all preallocation in this group and marks them
+ * used in in-core bitmap. buddy must be generated from this bitmap
+ * Need to be called with ext4 group lock (ext4_lock_group)
+ */
+-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+ ext4_group_t group)
+ {
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+ struct ext4_prealloc_space *pa;
++ struct ext4_group_desc *gdp;
+ struct list_head *cur;
+ ext4_group_t groupnr;
+ ext4_grpblk_t start;
+ int preallocated = 0;
+ int count = 0;
++ int skip = 0;
++ int err;
+ int len;
+
++ gdp = ext4_get_group_desc (sb, group, NULL);
++ if (gdp == NULL)
++ return -EIO;
++
++ /* before applying preallocations, check bitmap consistency */
++ err = ext4_mb_check_ondisk_bitmap(sb, bitmap, gdp, group);
++ if (err)
++ return err;
++
+ /* all form of preallocation discards first load group,
+ * so the only competing code is preallocation use.
+ * we don't need any locking here
+@@ -3720,8 +3778,10 @@ static void ext4_mb_generate_from_pa(str
+ &groupnr, &start);
+ len = pa->pa_len;
+ spin_unlock(&pa->pa_lock);
+- if (unlikely(len == 0))
++ if (unlikely(len == 0)) {
++ skip++;
+ continue;
++ }
+ BUG_ON(groupnr != group);
+ mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
+ bitmap, start, len);
+@@ -3729,6 +3789,7 @@ static void ext4_mb_generate_from_pa(str
+ count++;
+ }
+ mb_debug("prellocated %u for group %u\n", preallocated, group);
++ return 0;
+ }
+
+ static void ext4_mb_pa_callback(struct rcu_head *head)
+@@ -3978,6 +4039,7 @@ ext4_mb_release_inode_pa(struct ext4_bud
+ ac->ac_sb = sb;
+ ac->ac_inode = pa->pa_inode;
+ ac->ac_op = EXT4_MB_HISTORY_DISCARD;
++ ac->ac_o_ex.fe_len = 1;
+ }
+
+ while (bit < end) {
+@@ -4260,7 +4322,7 @@ repeat:
+ __release(e4b->alloc_semp);
+ ext4_error(sb, __func__, "Error in loading buddy "
+ "information for %u\n", group);
+- continue;
++ return;
+ }
+
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+Index: linux-2.6.27.21-0.1/fs/ext4/mballoc.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/mballoc.h
++++ linux-2.6.27.21-0.1/fs/ext4/mballoc.h
+@@ -219,7 +219,7 @@ struct ext4_mb_history {
+ __u16 tail; /* what tail broke some buddy */
+ __u16 buddy; /* buddy the tail ^^^ broke */
+ __u16 flags;
+- __u8 cr:3; /* which phase the result extent was found at */
++ __u8 cr:8; /* which phase the result extent was found at */
+ __u8 op:4;
+ __u8 merged:1;
+ };
--- /dev/null
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4_jbd2.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_jbd2.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4_jbd2.h
+@@ -35,6 +35,9 @@
+ (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
+ || test_opt(sb, EXTENTS) ? 27U : 8U)
+
++/* Indicate that EXT4_SINGLEDATA_TRANS_BLOCKS takes the sb as argument */
++#define EXT4_SINGLEDATA_TRANS_BLOCKS_HAS_SB
++
+ /* Extended attribute operations touch at most two data buffers,
+ * two bitmap buffers, and two group summaries, in addition to the inode
+ * and the superblock, which are already accounted for. */
+Index: linux-2.6.27.21-0.1/fs/ext4/extents.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/extents.c
++++ linux-2.6.27.21-0.1/fs/ext4/extents.c
+@@ -48,7 +48,7 @@
+ * ext_pblock:
+ * combine low and high parts of physical block number into ext4_fsblk_t
+ */
+-static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
++ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
+ {
+ ext4_fsblk_t block;
+
+@@ -58,6 +58,17 @@ static ext4_fsblk_t ext_pblock(struct ex
+ }
+
+ /*
++ * ext4_ext_store_pblock:
++ * stores a large physical block number into an extent struct,
++ * breaking it into parts
++ */
++void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
++{
++ ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
++ ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
++}
++
++/*
+ * idx_pblock:
+ * combine low and high parts of a leaf physical block number into ext4_fsblk_t
+ */
+@@ -71,17 +82,6 @@ ext4_fsblk_t idx_pblock(struct ext4_exte
+ }
+
+ /*
+- * ext4_ext_store_pblock:
+- * stores a large physical block number into an extent struct,
+- * breaking it into parts
+- */
+-void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
+-{
+- ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+- ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
+-}
+-
+-/*
+ * ext4_idx_store_pblock:
+ * stores a large physical block number into an index struct,
+ * breaking it into parts
+@@ -1851,6 +1851,56 @@ static int ext4_ext_rm_idx(handle_t *han
+ }
+
+ /*
++ * This routine returns max. credits extent tree can consume.
++ * It should be OK for low-performance paths like ->writepage()
++ * To allow many writing process to fit a single transaction,
++ * caller should calculate credits under truncate_mutex and
++ * pass actual path.
++ */
++int ext4_ext_calc_credits_for_insert(struct inode *inode,
++				     struct ext4_ext_path *path)
++{
++	/*
++	 * Worst-case tree depth: with 32bit logical blocks (4294967296
++	 * blocks) 4 levels suffice -- 4 * 340^4 == 53453440000 -- and one
++	 * more level is budgeted for imbalance.
++	 */
++	const int worst_depth = 5;
++	int credits;
++
++	if (path != NULL) {
++		int leaf = ext_depth(inode);
++
++		/* leaf still has a free slot: report a single credit */
++		if (le16_to_cpu(path[leaf].p_hdr->eh_entries) <
++		    le16_to_cpu(path[leaf].p_hdr->eh_max))
++			return 1;
++	}
++
++	/* allocation of new data block(s) */
++	credits = 2;
++
++	/*
++	 * tree can be full, so it'd need to grow in depth:
++	 * one credit to modify the old root; credits for the
++	 * new root are part of the split accounting below
++	 */
++	credits += 1;
++
++	/*
++	 * Index split can happen, at every level we'd need to:
++	 *   - allocate an intermediate index (bitmap + group)
++	 *   - change two blocks, root excepted (already included)
++	 */
++	credits += 2 * worst_depth + 2 * worst_depth;
++
++	/* any allocation modifies superblock */
++	credits += 1;
++
++	return credits;
++}
++
++/*
+ * ext4_ext_calc_credits_for_single_extent:
+ * This routine returns max. credits that needed to insert an extent
+ * to the extent tree.
+@@ -3170,3 +3220,14 @@ int ext4_fiemap(struct inode *inode, str
+
+ return error;
+ }
++
++EXPORT_SYMBOL(ext4_ext_store_pblock);
++EXPORT_SYMBOL(ext4_ext_search_right);
++EXPORT_SYMBOL(ext4_ext_search_left);
++EXPORT_SYMBOL(ext_pblock);
++EXPORT_SYMBOL(ext4_ext_insert_extent);
++EXPORT_SYMBOL(ext4_mb_new_blocks);
++EXPORT_SYMBOL(ext4_ext_walk_space);
++EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert);
++EXPORT_SYMBOL(ext4_mark_inode_dirty);
++
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4_extents.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_extents.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4_extents.h
+@@ -59,6 +59,11 @@
+ */
+ #define EXT_STATS_
+
++/*
++ * define EXT4_ALLOC_NEEDED to 0 since block bitmap, group desc. and sb
++ * are now accounted in ext4_ext_calc_credits_for_insert()
++ */
++#define EXT4_ALLOC_NEEDED 0
+
+ /*
+ * ext4_inode has i_block array (60 bytes total).
+@@ -124,6 +129,7 @@ struct ext4_ext_path {
+ #define EXT4_EXT_CACHE_GAP 1
+ #define EXT4_EXT_CACHE_EXTENT 2
+
++#define EXT4_EXT_HAS_NO_TREE /* ext4_extents_tree struct is not used*/
+
+ #define EXT_MAX_BLOCK 0xffffffff
+
+@@ -223,10 +229,14 @@ static inline int ext4_ext_get_actual_le
+ (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
+ }
+
++extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
++extern void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb);
+ extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
+ extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
+ extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
+ extern int ext4_extent_tree_init(handle_t *, struct inode *);
++extern int ext4_ext_calc_credits_for_insert(struct inode *,
++ struct ext4_ext_path *);
+ extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
+ int num,
+ struct ext4_ext_path *path);
+Index: linux-2.6.27.21-0.1/fs/ext4/mballoc.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/mballoc.c
++++ linux-2.6.27.21-0.1/fs/ext4/mballoc.c
+@@ -4348,6 +4348,13 @@ repeat:
+ kmem_cache_free(ext4_ac_cachep, ac);
+ }
+
++/* For backward compatibility, since Lustre uses this symbol */
++void ext4_mb_discard_inode_preallocations(struct inode *inode)
++{
++ ext4_discard_preallocations(inode);
++}
++EXPORT_SYMBOL(ext4_mb_discard_inode_preallocations);
++
+ /*
+ * finds all preallocated spaces and return blocks being freed to them
+ * if preallocated space becomes full (no block is used from the space)
+@@ -5170,3 +5177,6 @@ error_return:
+ kmem_cache_free(ext4_ac_cachep, ac);
+ return;
+ }
++
++EXPORT_SYMBOL(ext4_free_blocks);
++
+Index: linux-2.6.27.21-0.1/fs/ext4/super.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/super.c
++++ linux-2.6.27.21-0.1/fs/ext4/super.c
+@@ -91,6 +91,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct su
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
+ }
++EXPORT_SYMBOL(ext4_inode_bitmap);
+
+ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
+ struct ext4_group_desc *bg)
+@@ -1295,6 +1296,7 @@ enum {
+ Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+ Opt_inode_readahead_blks, Opt_bigendian_extents,
+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
++ Opt_mballoc
+ };
+
+ static const match_table_t tokens = {
+@@ -1356,6 +1358,7 @@ static const match_table_t tokens = {
+ {Opt_nodelalloc, "nodelalloc"},
+ {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
+ {Opt_bigendian_extents, "bigendian_extents"},
++ {Opt_mballoc, "mballoc"},
+ {Opt_err, NULL},
+ };
+
+@@ -1774,6 +1777,8 @@ set_qf_format:
+ case Opt_bigendian_extents:
+ bigendian_extents = 1;
+ break;
++ case Opt_mballoc:
++ break;
+ default:
+ printk(KERN_ERR
+ "EXT4-fs: Unrecognized mount option \"%s\" "
+@@ -4095,7 +4100,7 @@ static struct file_system_type ext4dev_f
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+ };
+-MODULE_ALIAS("ext4dev");
++MODULE_ALIAS("ext4");
+
+ static int __init init_ext4_fs(void)
+ {
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4_jbd2.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_jbd2.c
++++ linux-2.6.27.21-0.1/fs/ext4/ext4_jbd2.c
+@@ -21,6 +21,7 @@ int __ext4_journal_get_write_access(cons
+ ext4_journal_abort_handle(where, __func__, bh, handle, err);
+ return err;
+ }
++EXPORT_SYMBOL(__ext4_journal_get_write_access);
+
+ int __ext4_journal_forget(const char *where, handle_t *handle,
+ struct buffer_head *bh)
+@@ -57,3 +58,4 @@ int __ext4_journal_dirty_metadata(const
+ ext4_journal_abort_handle(where, __func__, bh, handle, err);
+ return err;
+ }
++EXPORT_SYMBOL(__ext4_journal_dirty_metadata);
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4.h
+@@ -26,6 +26,9 @@
+ * The fourth extended filesystem constants/structures
+ */
+
++/* Has been moved to linux/magic.h but we need it for Lustre */
++#define EXT4_SUPER_MAGIC 0xEF53
++
+ /*
+ * Define EXT4FS_DEBUG to produce debug messages
+ */
+@@ -1116,6 +1119,8 @@ extern void ext4_mb_update_group_info(st
+ extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
+ extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
+ ext4_group_t, int);
++extern void ext4_mb_discard_inode_preallocations(struct inode *);
++
+ /* inode.c */
+ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t blocknr);
+Index: linux-2.6.27.21-0.1/fs/ext4/inode.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/inode.c
++++ linux-2.6.27.21-0.1/fs/ext4/inode.c
+@@ -4240,6 +4240,7 @@ bad_inode:
+ iget_failed(inode);
+ return ERR_PTR(ret);
+ }
++EXPORT_SYMBOL(ext4_iget);
+
+ static int ext4_inode_blocks_set(handle_t *handle,
+ struct ext4_inode *raw_inode,
--- /dev/null
+Index: linux-2.6.27.21-0.1/fs/ext4/super.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/super.c
++++ linux-2.6.27.21-0.1/fs/ext4/super.c
+@@ -39,6 +39,8 @@
+ #include <linux/log2.h>
+ #include <linux/crc16.h>
+ #include <asm/uaccess.h>
++#include <linux/kthread.h>
++#include <linux/utsname.h>
+
+ #include "ext4.h"
+ #include "ext4_jbd2.h"
+@@ -598,6 +600,8 @@ static void ext4_put_super(struct super_
+ invalidate_bdev(sbi->journal_bdev);
+ ext4_blkdev_remove(sbi);
+ }
++ if (sbi->s_mmp_tsk)
++ kthread_stop(sbi->s_mmp_tsk);
+ sb->s_fs_info = NULL;
+ kfree(sbi);
+ return;
+@@ -806,7 +810,6 @@ static int ext4_show_options(struct seq_
+ if (!test_opt(sb, DELALLOC))
+ seq_puts(seq, ",nodelalloc");
+
+-
+ if (sbi->s_stripe)
+ seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
+ /*
+@@ -829,6 +832,330 @@ static int ext4_show_options(struct seq_
+ }
+
+
++
++/*
++ * Write the MMP block out synchronously: the buffer is submitted with
++ * WRITE_SYNC to try to get the block on-disk faster, and we wait for
++ * the I/O to complete before returning.
++ * Returns 0 if the buffer is uptodate once the write is done, 1 if not.
++ */
++static int write_mmp_block(struct buffer_head *bh)
++{
++	mark_buffer_dirty(bh);
++	lock_buffer(bh);
++	bh->b_end_io = end_buffer_write_sync;
++	get_bh(bh);
++	submit_bh(WRITE_SYNC, bh);
++	wait_on_buffer(bh);
++
++	return unlikely(!buffer_uptodate(bh)) ? 1 : 0;
++}
++
++/*
++ * Read the MMP block.  It _must_ be read from disk and hence we clear the
++ * uptodate flag on a buffer passed in by the caller before resubmitting it.
++ * With *bh == NULL a buffer is obtained via sb_getblk().  On I/O failure the
++ * buffer is released, *bh is set to NULL and -EIO is returned; a block
++ * without the MMP magic yields -EINVAL (with *bh left in place).
++ */
++static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
++			  unsigned long mmp_block)
++{
++	struct mmp_struct *mmp;
++
++	if (*bh)
++		clear_buffer_uptodate(*bh);
++	else
++		*bh = sb_getblk(sb, mmp_block);
++
++	if (*bh == NULL)
++		goto read_failed;
++
++	/* extra ref for the in-flight read; presumably dropped by b_end_io */
++	get_bh(*bh);
++	lock_buffer(*bh);
++	(*bh)->b_end_io = end_buffer_read_sync;
++	submit_bh(READ_SYNC, *bh);
++	wait_on_buffer(*bh);
++	if (!buffer_uptodate(*bh)) {
++		brelse(*bh);
++		*bh = NULL;
++		goto read_failed;
++	}
++
++	mmp = (struct mmp_struct *)((*bh)->b_data);
++	if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
++		return -EINVAL;
++
++	return 0;
++
++read_failed:
++	ext4_warning(sb, __FUNCTION__,
++		     "Error while reading MMP block %lu", mmp_block);
++	return -EIO;
++}
++
++/*
++ * Help the admin: log @msg as plain text (it is never parsed as a format
++ * string), then the last update time/node/device found in the MMP block. */
++static void dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
++			 const char *function, const char *msg)
++{
++	ext4_warning(sb, function, "%s", msg);
++	ext4_warning(sb, function, "MMP failure info: last update time: %llu, "
++		     "last update node: %s, last update device: %s\n",
++		     (unsigned long long)le64_to_cpu(mmp->mmp_time),
++		     mmp->mmp_nodename, mmp->mmp_bdevname);
++}
++
++/*
++ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
++ */
++static int kmmpd(void *data)
++{
++	struct super_block *sb = (struct super_block *) data;
++	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
++	struct buffer_head *bh = NULL;
++	struct mmp_struct *mmp;
++	unsigned long mmp_block;
++	u32 seq = 0;
++	unsigned long failed_writes = 0;
++	int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
++	unsigned mmp_check_interval;
++	unsigned long last_update_time;
++	unsigned long diff;
++	int retval;
++
++	mmp_block = le64_to_cpu(es->s_mmp_block);
++	retval = read_mmp_block(sb, &bh, mmp_block);
++	if (retval)
++		goto failed;
++
++	mmp = (struct mmp_struct *)(bh->b_data);
++	mmp->mmp_time = cpu_to_le64(get_seconds());
++	/*
++	 * Start with the higher mmp_check_interval and reduce it if
++	 * the MMP block is being updated on time.
++	 */
++	mmp_check_interval = max(5 * mmp_update_interval,
++				 EXT4_MMP_MIN_CHECK_INTERVAL);
++	mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
++	bdevname(bh->b_bdev, mmp->mmp_bdevname);
++
++	down_read(&uts_sem);
++	memcpy(mmp->mmp_nodename, init_utsname()->nodename,
++	       sizeof(mmp->mmp_nodename));
++	up_read(&uts_sem);
++
++	while (!kthread_should_stop()) {
++		if (++seq > EXT4_MMP_SEQ_MAX)
++			seq = 1;
++
++		mmp->mmp_seq = cpu_to_le32(seq);
++		mmp->mmp_time = cpu_to_le64(get_seconds());
++		last_update_time = jiffies;
++
++		retval = write_mmp_block(bh);
++		/*
++		 * Don't spew too many error messages. Print one every
++		 * (s_mmp_update_interval * 60) seconds.  Every failed write
++		 * is counted so the message keeps recurring at that rate.
++		 */
++		if (retval && (failed_writes++ % 60) == 0)
++			ext4_error(sb, __FUNCTION__,
++				   "Error writing to MMP block");
++
++		if (!(le32_to_cpu(es->s_feature_incompat) &
++		    EXT4_FEATURE_INCOMPAT_MMP)) {
++			ext4_warning(sb, __FUNCTION__, "kmmpd being stopped "
++				     "since MMP feature has been disabled.");
++			EXT4_SB(sb)->s_mmp_tsk = NULL;
++			goto failed;
++		}
++
++		if (sb->s_flags & MS_RDONLY) {
++			ext4_warning(sb, __FUNCTION__, "kmmpd being stopped "
++				     "since filesystem has been remounted as "
++				     "readonly.");
++			EXT4_SB(sb)->s_mmp_tsk = NULL;
++			goto failed;
++		}
++
++		diff = jiffies - last_update_time;
++		if (diff < mmp_update_interval * HZ)
++			schedule_timeout_interruptible(mmp_update_interval *
++						       HZ - diff);
++
++		/*
++		 * We need to make sure that more than mmp_check_interval
++		 * seconds have not passed since writing. If that has happened
++		 * we need to check if the MMP block is as we left it.
++		 */
++		diff = jiffies - last_update_time;
++		if (diff > mmp_check_interval * HZ) {
++			struct buffer_head *bh_check = NULL;
++			struct mmp_struct *mmp_check;
++
++			retval = read_mmp_block(sb, &bh_check, mmp_block);
++			if (retval) {
++				brelse(bh_check); /* may still be held on bad magic */
++				EXT4_SB(sb)->s_mmp_tsk = NULL;
++				goto failed;
++			}
++
++			mmp_check = (struct mmp_struct *)(bh_check->b_data);
++			if (mmp->mmp_time != mmp_check->mmp_time ||
++			    memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
++				   sizeof(mmp->mmp_nodename)))
++				dump_mmp_msg(sb, mmp_check, __FUNCTION__,
++					     "Error while updating MMP info. "
++					     "The filesystem seems to have "
++					     "been multiply mounted.");
++
++			put_bh(bh_check);
++		}
++
++		/*
++		 * Adjust the mmp_check_interval depending on how much time
++		 * it took for the MMP block to be written.
++		 */
++		mmp_check_interval = max(5 * diff / HZ,
++				(unsigned long) EXT4_MMP_MIN_CHECK_INTERVAL);
++		mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
++	}
++
++	/*
++	 * Unmount seems to be clean.
++	 */
++	mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
++	mmp->mmp_time = cpu_to_le64(get_seconds());
++
++	retval = write_mmp_block(bh);
++
++failed:
++	brelse(bh);
++	return retval;
++}
++
++/*
++ * Get a random new sequence number but make sure it is not greater than
++ * EXT4_MMP_SEQ_MAX.
++ */
++static unsigned int mmp_new_seq(void)
++{
++	u32 candidate;
++
++	/* keep drawing until the value falls inside the valid range */
++	for (;;) {
++		get_random_bytes(&candidate, sizeof(u32));
++		if (candidate <= EXT4_MMP_SEQ_MAX)
++			return candidate;
++	}
++}
++
++/*
++ * Protect the filesystem from being mounted more than once.
++ */
++static int ext4_multi_mount_protect(struct super_block *sb,
++				    unsigned long mmp_block)
++{
++	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
++	struct buffer_head *bh = NULL;
++	struct mmp_struct *mmp = NULL;
++	u32 seq;
++	unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
++	int retval;
++
++	if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
++	    mmp_block >= ext4_blocks_count(es)) {
++		ext4_warning(sb, __FUNCTION__,
++			     "Invalid MMP block in superblock");
++		goto failed;
++	}
++
++	retval = read_mmp_block(sb, &bh, mmp_block);
++	if (retval)
++		goto failed;
++
++	mmp = (struct mmp_struct *)(bh->b_data);
++
++	if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
++		mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
++
++	/*
++	 * If check_interval in MMP block is larger, use that instead of
++	 * update_interval from the superblock.
++	 */
++	if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
++		mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
++
++	seq = le32_to_cpu(mmp->mmp_seq);
++	if (seq == EXT4_MMP_SEQ_CLEAN)
++		goto skip;
++
++	if (seq == EXT4_MMP_SEQ_FSCK) {
++		dump_mmp_msg(sb, mmp, __FUNCTION__,
++			     "fsck is running on the filesystem");
++		goto failed;
++	}
++
++	schedule_timeout_uninterruptible(HZ * (2 * mmp_check_interval + 1));
++
++	retval = read_mmp_block(sb, &bh, mmp_block);
++	if (retval)
++		goto failed;
++	mmp = (struct mmp_struct *)(bh->b_data);
++	if (seq != le32_to_cpu(mmp->mmp_seq)) {
++		dump_mmp_msg(sb, mmp, __FUNCTION__,
++			     "Device is already active on another node.");
++		goto failed;
++	}
++
++skip:
++	/* Write a new random sequence number; seq keeps the CPU-order copy
++	 * that is later compared against le32_to_cpu(mmp->mmp_seq). */
++	seq = mmp_new_seq();
++	mmp->mmp_seq = cpu_to_le32(seq);
++
++	retval = write_mmp_block(bh);
++	if (retval)
++		goto failed;
++
++	/*
++	 * wait for MMP interval and check mmp_seq.
++	 */
++	schedule_timeout_uninterruptible(HZ * (2 * mmp_check_interval + 1));
++
++	retval = read_mmp_block(sb, &bh, mmp_block);
++	if (retval)
++		goto failed;
++	mmp = (struct mmp_struct *)(bh->b_data);
++	if (seq != le32_to_cpu(mmp->mmp_seq)) {
++		dump_mmp_msg(sb, mmp, __FUNCTION__,
++			     "Device is already active on another node.");
++		goto failed;
++	}
++
++	/*
++	 * Start a kernel thread to update the MMP block periodically.
++	 */
++	EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%02x:%02x",
++					     MAJOR(sb->s_dev),
++					     MINOR(sb->s_dev));
++	if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
++		EXT4_SB(sb)->s_mmp_tsk = NULL;
++		ext4_warning(sb, __FUNCTION__, "Unable to create kmmpd thread "
++			     "for %s.", sb->s_id);
++		goto failed;
++	}
++
++	brelse(bh);
++	return 0;
++
++failed:
++	brelse(bh);
++	return 1;
++}
++
+ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
+ u64 ino, u32 generation)
+ {
+@@ -2366,6 +2693,11 @@ static int ext4_fill_super(struct super_
+ EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_RECOVER));
+
++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
++ !(sb->s_flags & MS_RDONLY))
++ if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
++ goto failed_mount3;
++
+ /*
+ * The first inode we look at is the journal inode. Don't try
+ * root first: it may be modified in the journal!
+@@ -2566,6 +2898,8 @@ failed_mount3:
+ percpu_counter_destroy(&sbi->s_freeinodes_counter);
+ percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
++ if (sbi->s_mmp_tsk)
++ kthread_stop(sbi->s_mmp_tsk);
+ failed_mount2:
+ for (i = 0; i < db_count; i++)
+ brelse(sbi->s_group_desc[i]);
+@@ -3080,7 +3414,7 @@ static int ext4_remount(struct super_blo
+ unsigned long old_sb_flags;
+ struct ext4_mount_options old_opts;
+ ext4_group_t g;
+- int err;
++ int err = 0;
+ #ifdef CONFIG_QUOTA
+ int i;
+ #endif
+@@ -3205,6 +3539,13 @@ static int ext4_remount(struct super_blo
+ goto restore_opts;
+ if (!ext4_setup_super(sb, es, 0))
+ sb->s_flags &= ~MS_RDONLY;
++ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
++ EXT4_FEATURE_INCOMPAT_MMP))
++ if (ext4_multi_mount_protect(sb,
++ le64_to_cpu(es->s_mmp_block))) {
++ err = -EROFS;
++ goto restore_opts;
++ }
+ }
+ }
+ #ifdef CONFIG_QUOTA
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4.h
+@@ -660,7 +660,7 @@ struct ext4_super_block {
+ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */
+ __le32 s_flags; /* Miscellaneous flags */
+ __le16 s_raid_stride; /* RAID stride */
+- __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
++ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */
+ __le64 s_mmp_block; /* Block for multi-mount protection */
+ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+@@ -777,7 +777,8 @@ static inline int ext4_valid_inum(struct
+ EXT4_FEATURE_INCOMPAT_META_BG| \
+ EXT4_FEATURE_INCOMPAT_EXTENTS| \
+ EXT4_FEATURE_INCOMPAT_64BIT| \
+- EXT4_FEATURE_INCOMPAT_FLEX_BG)
++ EXT4_FEATURE_INCOMPAT_FLEX_BG| \
++ EXT4_FEATURE_INCOMPAT_MMP)
+ #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
+@@ -981,6 +982,39 @@ do { \
+ #endif
+
+ /*
++ * This structure will be used for multiple mount protection. It will be
++ * written into the block number saved in the s_mmp_block field in the
++ * superblock. Programs that check MMP should assume that if
++ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
++ * to use the filesystem, regardless of how old the timestamp is.
++ */
++#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */
++#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
++#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */
++#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */
++
++struct mmp_struct {
++ __le32 mmp_magic; /* must equal EXT4_MMP_MAGIC */
++ __le32 mmp_seq; /* sequence number, or an EXT4_MMP_SEQ_* marker */
++ __le64 mmp_time; /* seconds timestamp of the last update */
++ char mmp_nodename[64]; /* reported as "last update node" */
++ char mmp_bdevname[32]; /* reported as "last update device" */
++ __le16 mmp_check_interval; /* check interval in seconds */
++ __le16 mmp_pad1;
++ __le32 mmp_pad2[227]; /* field sizes add up to 1024 bytes */
++};
++
++/*
++ * Default interval in seconds to update the MMP sequence number.
++ */
++#define EXT4_MMP_UPDATE_INTERVAL 1
++
++/*
++ * Minimum interval for MMP checking in seconds.
++ */
++#define EXT4_MMP_MIN_CHECK_INTERVAL 5
++
++/*
+ * Function prototypes
+ */
+
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4_sb.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_sb.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4_sb.h
+@@ -150,6 +150,8 @@ struct ext4_sb_info {
+
+ unsigned int s_log_groups_per_flex;
+ struct flex_groups *s_flex_groups;
++
++ struct task_struct *s_mmp_tsk; /* Kernel thread for multiple mount protection */
+ };
+
+ #endif /* _EXT4_SB */
--- /dev/null
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4_sb.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4_sb.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4_sb.h
+@@ -111,11 +111,14 @@ struct ext4_sb_info {
+
+ /* tunables */
+ unsigned long s_stripe;
+- unsigned int s_mb_stream_request;
++ unsigned long s_mb_small_req;
++ unsigned long s_mb_large_req;
+ unsigned int s_mb_max_to_scan;
+ unsigned int s_mb_min_to_scan;
+ unsigned int s_mb_stats;
+ unsigned int s_mb_order2_reqs;
++ unsigned long *s_mb_prealloc_table;
++ unsigned long s_mb_prealloc_table_size;
+ unsigned int s_mb_group_prealloc;
+ /* where last allocation was done - for stream allocation */
+ unsigned long s_mb_last_group;
+Index: linux-2.6.27.21-0.1/fs/ext4/mballoc.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/mballoc.c
++++ linux-2.6.27.21-0.1/fs/ext4/mballoc.c
+@@ -1996,7 +1996,7 @@ ext4_mb_regular_allocator(struct ext4_al
+ if (size < isize)
+ size = isize;
+
+- if (size < sbi->s_mb_stream_request &&
++ if ((ac->ac_g_ex.fe_len < sbi->s_mb_large_req) &&
+ (ac->ac_flags & EXT4_MB_HINT_DATA)) {
+ /* TBD: may be hot point */
+ spin_lock(&sbi->s_md_lock);
+@@ -2686,6 +2686,26 @@ err_freesgi:
+ return -ENOMEM;
+ }
+
++static void ext4_mb_prealloc_table_add(struct ext4_sb_info *sbi, int value)
++{
++	unsigned long *slot = sbi->s_mb_prealloc_table;
++	int idx;
++
++	if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group))
++		return;
++
++	for (idx = 0; idx < sbi->s_mb_prealloc_table_size; idx++, slot++) {
++		if (*slot == 0) {
++			*slot = value;
++			return;
++		}
++		/* callers are expected to add values in ascending order */
++		if (value <= *slot)
++			return;
++	}
++}
++
++
+ int ext4_mb_init(struct super_block *sb, int needs_recovery)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+@@ -2738,13 +2758,55 @@ int ext4_mb_init(struct super_block *sb,
+ sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
+ sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
+ sbi->s_mb_stats = MB_DEFAULT_STATS;
+- sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
+ sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+ sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
+- sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
++
++ if (sbi->s_stripe == 0) {
++ sbi->s_mb_prealloc_table_size = 8;
++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long);
++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS);
++ if (sbi->s_mb_prealloc_table == NULL) {
++ kfree(sbi->s_mb_offsets);
++ kfree(sbi->s_mb_maxs);
++ return -ENOMEM;
++ }
++ memset(sbi->s_mb_prealloc_table, 0, i);
++
++ ext4_mb_prealloc_table_add(sbi, 4);
++ ext4_mb_prealloc_table_add(sbi, 8);
++ ext4_mb_prealloc_table_add(sbi, 16);
++ ext4_mb_prealloc_table_add(sbi, 32);
++ ext4_mb_prealloc_table_add(sbi, 64);
++ ext4_mb_prealloc_table_add(sbi, 128);
++ ext4_mb_prealloc_table_add(sbi, 256);
++ ext4_mb_prealloc_table_add(sbi, 512);
++
++ sbi->s_mb_small_req = 256;
++ sbi->s_mb_large_req = 1024;
++ sbi->s_mb_group_prealloc = 512;
++ } else {
++ sbi->s_mb_prealloc_table_size = 3;
++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long);
++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS);
++ if (sbi->s_mb_prealloc_table == NULL) {
++ kfree(sbi->s_mb_offsets);
++ kfree(sbi->s_mb_maxs);
++ return -ENOMEM;
++ }
++ memset(sbi->s_mb_prealloc_table, 0, i);
++
++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe);
++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 2);
++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 4);
++
++ sbi->s_mb_small_req = sbi->s_stripe;
++ sbi->s_mb_large_req = sbi->s_stripe * 8;
++ sbi->s_mb_group_prealloc = sbi->s_stripe * 4;
++ }
+
+ sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
+ if (sbi->s_locality_groups == NULL) {
++ kfree(sbi->s_mb_prealloc_table);
+ kfree(sbi->s_mb_offsets);
+ kfree(sbi->s_mb_maxs);
+ return -ENOMEM;
+@@ -2915,9 +2977,89 @@ ext4_mb_free_committed_blocks(struct sup
+ #define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan"
+ #define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan"
+ #define EXT4_MB_ORDER2_REQ "order2_req"
+-#define EXT4_MB_STREAM_REQ "stream_req"
++#define EXT4_MB_SMALL_REQ "small_req"
++#define EXT4_MB_LARGE_REQ "large_req"
++#define EXT4_MB_PREALLOC_TABLE "prealloc_table"
+ #define EXT4_MB_GROUP_PREALLOC "group_prealloc"
+
++static int ext4_mb_prealloc_table_proc_read(char *page, char **start, off_t off,
++					    int count, int *eof, void *data)
++{
++	struct ext4_sb_info *sbi = data;
++	int len = 0, i;
++
++	/* everything is produced in one call; later offsets read as EOF */
++	*eof = 1;
++	if (off != 0)
++		return 0;
++
++	/* table entries are unsigned long, so print them with %lu */
++	for (i = 0; i < sbi->s_mb_prealloc_table_size; i++)
++		len += sprintf(page + len, "%lu ", sbi->s_mb_prealloc_table[i]);
++	len += sprintf(page + len, "\n");
++
++	*start = page;
++	return len;
++}
++
++static int ext4_mb_prealloc_table_proc_write(struct file *file,
++					     const char __user *buf,
++					     unsigned long cnt, void *data)
++{
++	struct ext4_sb_info *sbi = data;
++	unsigned long value;
++	unsigned long prev = 0;
++	char str[128];
++	char *cur, *end;
++	unsigned long *new_table;
++	int num = 0, i = 0;
++
++	if (cnt >= sizeof(str))
++		return -EINVAL;
++	if (copy_from_user(str, buf, cnt))
++		return -EFAULT;
++	/* NUL-terminate so the parser below cannot run past the copied data */
++	str[cnt] = '\0';
++
++	cur = str;
++	end = str + cnt;
++	while (cur < end) {
++		while ((cur < end) && (*cur == ' ')) cur++;
++		value = simple_strtoul(cur, &cur, 0);
++		if (value == 0)
++			break;
++		if (value <= prev)
++			return -EINVAL;
++		prev = value;
++		num++;
++	}
++	/* an empty table would leave the allocator without any window */
++	if (num == 0)
++		return -EINVAL;
++
++	new_table = kzalloc(num * sizeof(*new_table), GFP_KERNEL);
++	if (new_table == NULL)
++		return -ENOMEM;
++	kfree(sbi->s_mb_prealloc_table);
++	sbi->s_mb_prealloc_table = new_table;
++	sbi->s_mb_prealloc_table_size = num;
++	cur = str;
++	while (cur < end && i < num) {
++		while ((cur < end) && (*cur == ' ')) cur++;
++		value = simple_strtoul(cur, &cur, 0);
++		ext4_mb_prealloc_table_add(sbi, value);
++		i++;
++	}
++
++	return cnt;
++}
++
++static const struct file_operations ext4_mb_prealloc_table_proc_fops = {
++ .owner = THIS_MODULE,
++ .read = ext4_mb_prealloc_table_proc_read, /* NOTE(review): takes (page, start, off, count, eof, data) - confirm it fits .read */
++ .write = ext4_mb_prealloc_table_proc_write, /* NOTE(review): takes (file, buf, cnt, data) - confirm it fits .write */
++};
++
+ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
+ {
+ #ifdef CONFIG_PROC_FS
+@@ -2932,13 +3074,17 @@ static int ext4_mb_init_per_dev_proc(str
+ EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
+ EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
+ EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
+- EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
++ EXT4_PROC_HANDLER(EXT4_MB_SMALL_REQ, mb_small_req);
++ EXT4_PROC_HANDLER(EXT4_MB_LARGE_REQ, mb_large_req);
++ EXT4_PROC_HANDLER(EXT4_MB_PREALLOC_TABLE, mb_prealloc_table);
+ EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
+ return 0;
+
+ err_out:
+ remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
+- remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
++ remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc);
++ remove_proc_entry(EXT4_MB_LARGE_REQ, sbi->s_proc);
++ remove_proc_entry(EXT4_MB_SMALL_REQ, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
+@@ -2959,7 +3105,9 @@ static int ext4_mb_destroy_per_dev_proc(
+ return -EINVAL;
+
+ remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
+- remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
++ remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc);
++ remove_proc_entry(EXT4_MB_LARGE_REQ, sbi->s_proc);
++ remove_proc_entry(EXT4_MB_SMALL_REQ, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
+@@ -3162,11 +3310,12 @@ static noinline_for_stack void
+ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
+ struct ext4_allocation_request *ar)
+ {
+- int bsbits, max;
++ int bsbits, i, wind;
+ ext4_lblk_t end;
+- loff_t size, orig_size, start_off;
++ loff_t size, orig_size;
+ ext4_lblk_t start, orig_start;
+ struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
++ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_prealloc_space *pa;
+
+ /* do normalize only data requests, metadata requests
+@@ -3196,49 +3345,35 @@ ext4_mb_normalize_request(struct ext4_al
+ size = size << bsbits;
+ if (size < i_size_read(ac->ac_inode))
+ size = i_size_read(ac->ac_inode);
++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits;
+
+- /* max size of free chunks */
+- max = 2 << bsbits;
++ start = wind = 0;
+
+-#define NRL_CHECK_SIZE(req, size, max, chunk_size) \
+- (req <= (size) || max <= (chunk_size))
++ /* let's choose preallocation window depending on file size */
++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) {
++ if (size <= sbi->s_mb_prealloc_table[i]) {
++ wind = sbi->s_mb_prealloc_table[i];
++ break;
++ }
++ }
++ size = wind;
+
+- /* first, try to predict filesize */
+- /* XXX: should this table be tunable? */
+- start_off = 0;
+- if (size <= 16 * 1024) {
+- size = 16 * 1024;
+- } else if (size <= 32 * 1024) {
+- size = 32 * 1024;
+- } else if (size <= 64 * 1024) {
+- size = 64 * 1024;
+- } else if (size <= 128 * 1024) {
+- size = 128 * 1024;
+- } else if (size <= 256 * 1024) {
+- size = 256 * 1024;
+- } else if (size <= 512 * 1024) {
+- size = 512 * 1024;
+- } else if (size <= 1024 * 1024) {
+- size = 1024 * 1024;
+- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
+- start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+- (21 - bsbits)) << 21;
+- size = 2 * 1024 * 1024;
+- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
+- start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+- (22 - bsbits)) << 22;
+- size = 4 * 1024 * 1024;
+- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
+- (8<<20)>>bsbits, max, 8 * 1024)) {
+- start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+- (23 - bsbits)) << 23;
+- size = 8 * 1024 * 1024;
+- } else {
+- start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
+- size = ac->ac_o_ex.fe_len << bsbits;
++ if (wind == 0) {
++ __u64 tstart, tend;
++ /* file is quite large, we now preallocate with
++ * the biggest configured window with regart to
++ * logical offset */
++ wind = sbi->s_mb_prealloc_table[i - 1];
++ tstart = ac->ac_o_ex.fe_logical;
++ do_div(tstart, wind);
++ start = tstart * wind;
++ tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1;
++ do_div(tend, wind);
++ tend = tend * wind + wind;
++ size = tend - start;
+ }
+- orig_size = size = size >> bsbits;
+- orig_start = start = start_off >> bsbits;
++ orig_size = size;
++ orig_start = start;
+
+ /* don't cover already allocated blocks in selected range */
+ if (ar->pleft && start <= ar->lleft) {
+@@ -3315,7 +3450,6 @@ ext4_mb_normalize_request(struct ext4_al
+ }
+ BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
+ start > ac->ac_o_ex.fe_logical);
+- BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+
+ /* now prepare goal request */
+
+@@ -4236,22 +4370,32 @@ static void ext4_mb_group_or_file(struct
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ int bsbits = ac->ac_sb->s_blocksize_bits;
+- loff_t size, isize;
++ loff_t size;
+
+ if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+ return;
+
+- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+- isize = i_size_read(ac->ac_inode) >> bsbits;
+- size = max(size, isize);
+-
+- /* don't use group allocation for large files */
+- if (size >= sbi->s_mb_stream_request)
++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_small_req)
+ return;
+
+ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ return;
+
++ /* request is so large that we don't care about
++ * streaming - it overweights any possible seek */
++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req)
++ return;
++
++ size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
++ size = size << bsbits;
++ if (size < i_size_read(ac->ac_inode))
++ size = i_size_read(ac->ac_inode);
++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits;
++
++ /* don't use group allocation for large files */
++ if (size >= sbi->s_mb_large_req)
++ return;
++
+ BUG_ON(ac->ac_lg != NULL);
+ /*
+ * locality group prealloc space are per cpu. The reason for having
+Index: linux-2.6.27.21-0.1/fs/ext4/inode.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/inode.c
++++ linux-2.6.27.21-0.1/fs/ext4/inode.c
+@@ -2442,14 +2442,14 @@ static int ext4_da_writepages(struct add
+ return -EROFS;
+
+ /*
+- * Make sure nr_to_write is >= sbi->s_mb_stream_request
++ * Make sure nr_to_write is >= sbi->s_mb_small_req
+ * This make sure small files blocks are allocated in
+ * single attempt. This ensure that small files
+ * get less fragmented.
+ */
+- if (wbc->nr_to_write < sbi->s_mb_stream_request) {
+- nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
+- wbc->nr_to_write = sbi->s_mb_stream_request;
++ if (wbc->nr_to_write < sbi->s_mb_small_req) {
++ nr_to_writebump = sbi->s_mb_small_req - wbc->nr_to_write;
++ wbc->nr_to_write = sbi->s_mb_small_req;
+ }
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
--- /dev/null
+Index: linux-2.6.18.i386/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/namei.c
++++ linux-2.6.18.i386/fs/ext4/namei.c
+@@ -374,8 +374,8 @@ dx_probe(struct dentry *dentry, struct i
+ root->info.hash_version != DX_HASH_HALF_MD4 &&
+ root->info.hash_version != DX_HASH_LEGACY) {
+ ext4_warning(dir->i_sb, __func__,
+- "Unrecognised inode hash code %d",
+- root->info.hash_version);
++ "Unrecognised inode hash code %d for directory "
++ "#%lu", root->info.hash_version, dir->i_ino);
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
--- /dev/null
+Index: linux-2.6.27.21-0.1/fs/ext4/ialloc.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ialloc.c
++++ linux-2.6.27.21-0.1/fs/ext4/ialloc.c
+@@ -1120,7 +1120,6 @@ unsigned long ext4_count_free_inodes(str
+ if (!gdp)
+ continue;
+ desc_count += ext4_free_inodes_count(sb, gdp);
+- cond_resched();
+ }
+ return desc_count;
+ #endif
+Index: linux-2.6.27.21-0.1/fs/ext4/super.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/super.c
++++ linux-2.6.27.21-0.1/fs/ext4/super.c
+@@ -3263,11 +3263,9 @@ static int ext4_statfs(struct dentry *de
+ * block group descriptors. If the sparse superblocks
+ * feature is turned on, then not all groups have this.
+ */
+- for (i = 0; i < ngroups; i++) {
++ for (i = 0; i < ngroups; i++)
+ overhead += ext4_bg_has_super(sb, i) +
+ ext4_bg_num_gdb(sb, i);
+- cond_resched();
+- }
+
+ /*
+ * Every block group has an inode bitmap, a block
--- /dev/null
+Temporary patch until we find a workaround. Will not affect Lustre functionality.
+
+Index: linux-2.6.27.21-0.1/fs/ext4/ioctl.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ioctl.c
++++ linux-2.6.27.21-0.1/fs/ext4/ioctl.c
+@@ -200,9 +200,9 @@ long ext4_ioctl(struct file *filp, unsig
+ if (get_user(flags, (int __user *) arg))
+ return -EFAULT;
+
+- err = mnt_want_write(filp->f_path.mnt);
++/* err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+- return err;
++ return err;*/
+
+ if (!S_ISDIR(inode->i_mode))
+ flags &= ~EXT4_DIRSYNC_FL;
+@@ -281,7 +281,7 @@ flags_err:
+ err = ext4_ext_migrate(inode);
+ flags_out:
+ mutex_unlock(&inode->i_mutex);
+- mnt_drop_write(filp->f_path.mnt);
++// mnt_drop_write(filp->f_path.mnt);
+ return err;
+ }
+ case EXT4_IOC_GETVERSION:
+@@ -297,9 +297,9 @@ flags_out:
+ if (!is_owner_or_cap(inode))
+ return -EPERM;
+
+- err = mnt_want_write(filp->f_path.mnt);
++/* err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+- return err;
++ return err;*/
+ if (get_user(generation, (int __user *) arg)) {
+ err = -EFAULT;
+ goto setversion_out;
+@@ -318,7 +318,7 @@ flags_out:
+ }
+ ext4_journal_stop(handle);
+ setversion_out:
+- mnt_drop_write(filp->f_path.mnt);
++// mnt_drop_write(filp->f_path.mnt);
+ return err;
+ }
+ #ifdef CONFIG_JBD2_DEBUG
+@@ -356,9 +356,9 @@ setversion_out:
+ if (get_user(n_blocks_count, (__u32 __user *)arg))
+ return -EFAULT;
+
+- err = mnt_want_write(filp->f_path.mnt);
++/* err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+- return err;
++ return err;*/
+
+ err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+@@ -366,7 +366,7 @@ setversion_out:
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ if (err == 0)
+ err = err2;
+- mnt_drop_write(filp->f_path.mnt);
++// mnt_drop_write(filp->f_path.mnt);
+
+ return err;
+ }
+@@ -382,9 +382,9 @@ setversion_out:
+ sizeof(input)))
+ return -EFAULT;
+
+- err = mnt_want_write(filp->f_path.mnt);
++/* err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+- return err;
++ return err;*/
+
+ err = ext4_group_add(sb, &input);
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+@@ -392,7 +392,7 @@ setversion_out:
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ if (err == 0)
+ err = err2;
+- mnt_drop_write(filp->f_path.mnt);
++// mnt_drop_write(filp->f_path.mnt);
+
+ return err;
+ }
+@@ -403,9 +403,9 @@ setversion_out:
+ if (!is_owner_or_cap(inode))
+ return -EACCES;
+
+- err = mnt_want_write(filp->f_path.mnt);
++/* err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+- return err;
++ return err;*/
+ /*
+ * inode_mutex prevent write and truncate on the file.
+ * Read still goes through. We take i_data_sem in
+@@ -415,7 +415,7 @@ setversion_out:
+ mutex_lock(&(inode->i_mutex));
+ err = ext4_ext_migrate(inode);
+ mutex_unlock(&(inode->i_mutex));
+- mnt_drop_write(filp->f_path.mnt);
++// mnt_drop_write(filp->f_path.mnt);
+ return err;
+ }
+
--- /dev/null
+Index: linux-2.6.18.i386/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/namei.c
++++ linux-2.6.18.i386/fs/ext4/namei.c
+@@ -2299,8 +2299,8 @@ static int ext4_link (struct dentry * ol
+ * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
+ * otherwise has the potential to corrupt the orphan inode list.
+ */
+- if (inode->i_nlink == 0)
+- return -ENOENT;
++ //if (inode->i_nlink == 0)
++ // return -ENOENT;
+
+ retry:
+ handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
--- /dev/null
+Index: linux-2.6.27.21-0.1/fs/ext4/ialloc.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ialloc.c
++++ linux-2.6.27.21-0.1/fs/ext4/ialloc.c
+@@ -675,7 +675,8 @@ err_ret:
+ * For other inodes, search forward from the parent directory's block
+ * group to find a free inode.
+ */
+-struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
++struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
++ unsigned long goal)
+ {
+ struct super_block *sb;
+ struct buffer_head *inode_bitmap_bh = NULL;
+@@ -706,6 +707,43 @@ struct inode *ext4_new_inode(handle_t *h
+ sbi = EXT4_SB(sb);
+ es = sbi->s_es;
+
++ if (goal) {
++ group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
++ ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
++ err = -EIO;
++
++ gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
++ if (!gdp)
++ goto fail;
++
++ inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
++ if (!inode_bitmap_bh)
++ goto fail;
++
++ BUFFER_TRACE(bh, "get_write_access");
++ err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
++ if (err)
++ goto fail;
++
++ if (ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
++ ino, inode_bitmap_bh->b_data)) {
++ printk(KERN_ERR "goal inode %lu unavailable\n", goal);
++ /* Oh well, we tried. */
++ goto continue_allocation;
++ }
++
++ BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
++ err = ext4_journal_dirty_metadata(handle, inode_bitmap_bh);
++ if (err)
++ goto fail;
++
++ /* We've shortcircuited the allocation system successfully,
++ * now finish filling in the inode.
++ */
++ goto got;
++ }
++
++continue_allocation:
+ if (sbi->s_log_groups_per_flex) {
+ ret2 = find_group_flex(sb, dir, &group);
+ goto got_group;
+Index: linux-2.6.27.21-0.1/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/namei.c
++++ linux-2.6.27.21-0.1/fs/ext4/namei.c
+@@ -104,6 +104,7 @@ struct dx_entry
+ __le32 block;
+ };
+
++
+ /*
+ * dx_root_info is laid out so that if it should somehow get overlaid by a
+ * dirent the two low bits of the hash version will be zero. Therefore, the
+@@ -149,6 +150,14 @@ struct dx_map_entry
+ u16 size;
+ };
+
++#define LVFS_DENTRY_PARAM_MAGIC 20070216UL
++struct lvfs_dentry_params
++{
++ unsigned long p_inum; /* inode number handed to ext4_new_inode() as goal */
++ void *p_ptr; /* not used by the code visible in this patch */
++ u32 magic; /* LVFS_DENTRY_PARAM_MAGIC when the parameters are valid */
++};
++
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
+ static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
+ static inline unsigned dx_get_hash(struct dx_entry *entry);
+@@ -1716,6 +1725,20 @@ static int ext4_add_nondir(handle_t *han
+ return err;
+ }
+
++static struct inode * ext4_new_inode_wantedi(handle_t *handle, struct inode *dir,
++						int mode, struct dentry *dentry)
++{
++	struct lvfs_dentry_params *hint = dentry->d_fsdata;
++	unsigned long goal = 0;
++
++	/* honour the requested inode number only when d_fsdata carries
++	 * the expected magic; otherwise allocate without a goal */
++	if (hint != NULL && hint->magic == LVFS_DENTRY_PARAM_MAGIC)
++		goal = hint->p_inum;
++
++	return ext4_new_inode(handle, dir, mode, goal);
++}
++
+ /*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+@@ -1741,7 +1764,7 @@ retry:
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext4_new_inode (handle, dir, mode);
++ inode = ext4_new_inode_wantedi(handle, dir, mode, dentry);
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ inode->i_op = &ext4_file_inode_operations;
+@@ -1775,7 +1798,7 @@ retry:
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext4_new_inode(handle, dir, mode);
++ inode = ext4_new_inode_wantedi(handle, dir, mode, dentry);
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ init_special_inode(inode, inode->i_mode, rdev);
+@@ -1811,7 +1834,7 @@ retry:
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
++ inode = ext4_new_inode_wantedi(handle, dir, S_IFDIR | mode, dentry);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_stop;
+@@ -2211,7 +2234,7 @@ retry:
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
++ inode = ext4_new_inode_wantedi(handle, dir, S_IFLNK|S_IRWXUGO, dentry);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_stop;
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4.h
+@@ -1032,7 +1032,8 @@ extern int ext4fs_dirhash(const char *na
+ dx_hash_info *hinfo);
+
+ /* ialloc.c */
+-extern struct inode * ext4_new_inode(handle_t *, struct inode *, int);
++extern struct inode * ext4_new_inode(handle_t *, struct inode *, int,
++ unsigned long);
+ extern void ext4_free_inode(handle_t *, struct inode *);
+ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
+ extern unsigned long ext4_count_free_inodes(struct super_block *);
+Index: linux-2.6.27.21-0.1/fs/ext4/migrate.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/migrate.c
++++ linux-2.6.27.21-0.1/fs/ext4/migrate.c
+@@ -484,7 +484,7 @@ int ext4_ext_migrate(struct inode *inode
+ }
+ tmp_inode = ext4_new_inode(handle,
+ inode->i_sb->s_root->d_inode,
+- S_IFREG);
++ S_IFREG, 0);
+ if (IS_ERR(tmp_inode)) {
+ retval = -ENOMEM;
+ ext4_journal_stop(handle);
--- /dev/null
+Index: linux-2.6.18.i386/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ext4.h
++++ linux-2.6.18.i386/fs/ext4/ext4.h
+@@ -995,6 +995,13 @@ struct mmp_struct {
+ extern struct proc_dir_entry *proc_root_ext4;
+
+ /*
++ * Indicates that ctime should not be updated in ext4_xattr_set_handle()
++ */
++#ifndef XATTR_NO_CTIME
++#define XATTR_NO_CTIME 0x80
++#endif
++
++/*
+ * Function prototypes
+ */
+
+Index: linux-2.6.18.i386/fs/ext4/xattr.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/xattr.c
++++ linux-2.6.18.i386/fs/ext4/xattr.c
+@@ -1026,7 +1026,8 @@ ext4_xattr_set_handle(handle_t *handle,
+ }
+ if (!error) {
+ ext4_xattr_update_super_block(handle, inode->i_sb);
+- inode->i_ctime = ext4_current_time(inode);
++ if (!(flags & XATTR_NO_CTIME))
++ inode->i_ctime = ext4_current_time(inode);
+ if (!value)
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+ error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
--- /dev/null
+Index: linux-2.6.27.21-0.1/fs/ext4/iopen.c
+===================================================================
+--- /dev/null
++++ linux-2.6.27.21-0.1/fs/ext4/iopen.c
+@@ -0,0 +1,295 @@
++/*
++ * linux/fs/ext4/iopen.c
++ *
++ * Special support for open by inode number
++ *
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ *
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ *
++ *
++ * Invariants:
++ * - there is only ever a single DCACHE_DISCONNECTED dentry alias
++ *   for an inode at one time.
++ * - there are never both connected and DCACHE_DISCONNECTED dentry
++ *   aliases on an inode at the same time.
++ *
++ * If we have any connected dentry aliases for an inode, use one of those
++ * in iopen_lookup(). Otherwise, we instantiate a single DCACHE_DISCONNECTED
++ * dentry for this inode, which thereafter will be found by the dcache
++ * when looking up this inode number in __iopen__, so we don't return here
++ * until it is gone.
++ *
++ * If we get an inode via a regular name lookup, then we "rename" the
++ * DCACHE_DISCONNECTED dentry to the proper name and parent. This ensures
++ * existing users of the disconnected dentry will continue to use the same
++ * dentry as the connected users, and there will never be both kinds of
++ * dentry aliases at one time.
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/smp_lock.h>
++#include <linux/dcache.h>
++#include <linux/security.h>
++#include "iopen.h"
++#include "ext4.h"
++#include "ext4_jbd2.h"
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++#define IOPEN_NAME_LEN 32
++
++/*
++ * This implements looking up an inode by number.
++ */
++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry,
++ struct nameidata *nd)
++{
++ struct inode *inode;
++ unsigned long ino;
++ struct list_head *lp;
++ struct dentry *alternate;
++ char buf[IOPEN_NAME_LEN];
++
++ if (dentry->d_name.len >= IOPEN_NAME_LEN)
++ return ERR_PTR(-ENAMETOOLONG);
++
++ memcpy(buf, dentry->d_name.name, dentry->d_name.len);
++ buf[dentry->d_name.len] = 0;
++
++ if (strcmp(buf, ".") == 0)
++ ino = dir->i_ino;
++ else if (strcmp(buf, "..") == 0)
++ ino = EXT4_ROOT_INO;
++ else
++ ino = simple_strtoul(buf, 0, 0);
++
++ if ((ino != EXT4_ROOT_INO &&
++ ino < EXT4_FIRST_INO(dir->i_sb)) ||
++ ino > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))
++ return ERR_PTR(-ENOENT);
++
++ inode = ext4_iget(dir->i_sb, ino);
++ if (IS_ERR(inode)) {
++ /* Newer kernels return -ESTALE for inodes that are not in use,
++ * but older kernels return a negative dentry. This can only
++ * happen when doing a lookup in the __iopen__ dir, because the
++ * "entry" will always be found even if inode is unallocated.
++ * Handle this here instead of fixing the callers. b=19114 */
++ if (PTR_ERR(inode) == -ESTALE)
++ return (ERR_PTR(-ENOENT));
++ return ERR_CAST(inode);
++ }
++
++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */
++ assert(d_unhashed(dentry)); /* d_rehash */
++
++	/* preferably return a connected dentry */
++ spin_lock(&dcache_lock);
++ list_for_each(lp, &inode->i_dentry) {
++ alternate = list_entry(lp, struct dentry, d_alias);
++ assert(!(alternate->d_flags & DCACHE_DISCONNECTED));
++ }
++
++ if (!list_empty(&inode->i_dentry)) {
++ alternate = list_entry(inode->i_dentry.next,
++ struct dentry, d_alias);
++ dget_locked(alternate);
++ spin_lock(&alternate->d_lock);
++ alternate->d_flags |= DCACHE_REFERENCED;
++ spin_unlock(&alternate->d_lock);
++ iput(inode);
++ spin_unlock(&dcache_lock);
++ return alternate;
++ }
++ dentry->d_flags |= DCACHE_DISCONNECTED;
++
++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */
++ dentry->d_inode = inode;
++
++ d_rehash_cond(dentry, 0);
++ spin_unlock(&dcache_lock);
++
++ return NULL;
++}
++
++/* This function is spliced into ext4_lookup and does the move of a
++ * disconnected dentry (if it exists) to a connected dentry.
++ */
++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode,
++ int rehash)
++{
++ struct dentry *tmp, *goal = NULL;
++ struct list_head *lp;
++
++ /* verify this dentry is really new */
++ assert(dentry->d_inode == NULL);
++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */
++ if (rehash)
++ assert(d_unhashed(dentry)); /* d_rehash */
++ assert(list_empty(&dentry->d_subdirs));
++
++ spin_lock(&dcache_lock);
++ if (!inode)
++ goto do_rehash;
++
++ if (!test_opt(inode->i_sb, IOPEN))
++ goto do_instantiate;
++
++	/* preferably return a connected dentry */
++ list_for_each(lp, &inode->i_dentry) {
++ tmp = list_entry(lp, struct dentry, d_alias);
++ if (tmp->d_flags & DCACHE_DISCONNECTED) {
++ assert(tmp->d_alias.next == &inode->i_dentry);
++ assert(tmp->d_alias.prev == &inode->i_dentry);
++ goal = tmp;
++ dget_locked(goal);
++ break;
++ }
++ }
++
++ if (!goal)
++ goto do_instantiate;
++
++ /* Move the goal to the de hash queue */
++ goal->d_flags &= ~DCACHE_DISCONNECTED;
++ security_d_instantiate(goal, inode);
++ __d_drop(dentry);
++ d_rehash_cond(dentry, 0);
++ d_move_locked(goal, dentry);
++ spin_unlock(&dcache_lock);
++ iput(inode);
++
++ return goal;
++
++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++do_instantiate:
++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */
++ dentry->d_inode = inode;
++do_rehash:
++ if (rehash)
++ d_rehash_cond(dentry, 0);
++ spin_unlock(&dcache_lock);
++
++ return NULL;
++}
++
++/*
++ * Similar to d_instantiate() except that it drops the disconnected
++ * dentry if any.
++ */
++void iopen_d_instantiate(struct dentry *dentry, struct inode * inode)
++{
++ struct dentry *dis_dentry;
++
++ /* verify this dentry is really new */
++ assert(dentry->d_inode == NULL);
++ assert(list_empty(&dentry->d_alias));
++
++ spin_lock(&dcache_lock);
++ if (!inode || !test_opt(inode->i_sb, IOPEN) ||
++ list_empty(&inode->i_dentry))
++ goto do_instantiate;
++
++	/* a disconnected dentry has been added behind our back,
++	 * we have to drop this dentry, see bug 16362/15713 */
++ dis_dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
++ spin_lock(&dis_dentry->d_lock);
++ assert(dis_dentry->d_alias.next == &inode->i_dentry);
++ assert(dis_dentry->d_alias.prev == &inode->i_dentry);
++ assert(dis_dentry->d_flags & DCACHE_DISCONNECTED);
++ __d_drop(dis_dentry);
++ list_del_init(&dis_dentry->d_alias);
++ spin_unlock(&dis_dentry->d_lock);
++
++do_instantiate:
++ if (inode)
++ list_add(&dentry->d_alias, &inode->i_dentry);
++ dentry->d_inode = inode;
++ spin_unlock(&dcache_lock);
++ security_d_instantiate(dentry, inode);
++}
++
++/*
++ * These are the special structures for the iopen pseudo directory.
++ */
++
++static struct inode_operations iopen_inode_operations = {
++	.lookup = iopen_lookup,		/* resolves "<ino>" names; BKL held */
++};
++
++static struct file_operations iopen_file_operations = {
++	.read = generic_read_dir,	/* only read() is wired up for the pseudo dir */
++};
++
++static int match_dentry(struct dentry *dentry, const char *name)
++{
++ int len;
++
++ len = strlen(name);
++ if (dentry->d_name.len != len)
++ return 0;
++ if (strncmp(dentry->d_name.name, name, len))
++ return 0;
++ return 1;
++}
++
++/*
++ * This function is spliced into ext4_lookup and returns 1 if the file
++ * name is __iopen__ and dentry has been filled in appropriately.
++ */
++int ext4_check_for_iopen(struct inode *dir, struct dentry *dentry)
++{
++ struct inode *inode;
++
++ if (dir->i_ino != EXT4_ROOT_INO ||
++ !test_opt(dir->i_sb, IOPEN) ||
++ !match_dentry(dentry, "__iopen__"))
++ return 0;
++
++ inode = ext4_iget(dir->i_sb, EXT4_BAD_INO);
++ if (IS_ERR(inode))
++ return 0;
++
++ d_add(dentry, inode);
++ return 1;
++}
++
++/*
++ * This function is spliced into read_inode; it returns 1 if inode
++ * number is the one for /__iopen__, in which case the inode is filled
++ * in appropriately. Otherwise, this function returns 0.
++ */
++int ext4_iopen_get_inode(struct inode *inode)
++{
++ if (inode->i_ino != EXT4_BAD_INO)
++ return 0;
++
++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
++ if (test_opt(inode->i_sb, IOPEN_NOPRIV))
++ inode->i_mode |= 0777;
++ inode->i_uid = 0;
++ inode->i_gid = 0;
++ inode->i_nlink = 1;
++ inode->i_size = 4096;
++ inode->i_atime = inode->i_ctime = inode->i_mtime = ext4_current_time(inode);
++ EXT4_I(inode)->i_dtime = 0;
++ EXT4_I(inode)->i_file_acl = 0;
++ inode->i_blocks = 0;
++ inode->i_version = 1;
++ inode->i_generation = 0;
++
++ inode->i_op = &iopen_inode_operations;
++ inode->i_fop = &iopen_file_operations;
++ inode->i_mapping->a_ops = 0;
++
++ if (inode->i_state & I_NEW)
++ unlock_new_inode(inode);
++
++ return 1;
++}
+Index: linux-2.6.27.21-0.1/fs/ext4/iopen.h
+===================================================================
+--- /dev/null
++++ linux-2.6.27.21-0.1/fs/ext4/iopen.h
+@@ -0,0 +1,16 @@
++/*
++ * iopen.h
++ *
++ * Special support for opening files by inode number.
++ *
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ *
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ */
++
++extern int ext4_check_for_iopen(struct inode *dir, struct dentry *dentry);
++extern int ext4_iopen_get_inode(struct inode *inode);
++extern struct dentry *iopen_connect_dentry(struct dentry *dentry,
++ struct inode *inode, int rehash);
++extern void iopen_d_instantiate(struct dentry *dentry, struct inode * inode);
+Index: linux-2.6.27.21-0.1/fs/ext4/inode.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/inode.c
++++ linux-2.6.27.21-0.1/fs/ext4/inode.c
+@@ -38,6 +38,7 @@
+ #include <linux/bio.h>
+ #include "ext4_jbd2.h"
+ #include "xattr.h"
++#include "iopen.h"
+ #include "acl.h"
+ #include "ext4_extents.h"
+
+@@ -4115,6 +4116,9 @@ struct inode *ext4_iget(struct super_blo
+ ei->i_default_acl = EXT4_ACL_NOT_CACHED;
+ #endif
+
++ if (ext4_iopen_get_inode(inode))
++ return inode;
++
+ ret = __ext4_get_inode_loc(inode, &iloc, 0);
+ if (ret < 0)
+ goto bad_inode;
+Index: linux-2.6.27.21-0.1/fs/ext4/super.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/super.c
++++ linux-2.6.27.21-0.1/fs/ext4/super.c
+@@ -955,7 +955,8 @@ enum {
+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
+ Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
+ Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+- Opt_inode_readahead_blks
++ Opt_inode_readahead_blks,
++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+ };
+
+ static const match_table_t tokens = {
+@@ -1004,6 +1005,9 @@ static const match_table_t tokens = {
+ {Opt_noquota, "noquota"},
+ {Opt_quota, "quota"},
+ {Opt_usrquota, "usrquota"},
++ {Opt_iopen, "iopen"},
++ {Opt_noiopen, "noiopen"},
++ {Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_barrier, "barrier=%u"},
+ {Opt_extents, "extents"},
+ {Opt_noextents, "noextents"},
+@@ -1347,6 +1351,18 @@ set_qf_format:
+ else
+ clear_opt(sbi->s_mount_opt, BARRIER);
+ break;
++ case Opt_iopen:
++ set_opt (sbi->s_mount_opt, IOPEN);
++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++ break;
++ case Opt_noiopen:
++ clear_opt (sbi->s_mount_opt, IOPEN);
++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++ break;
++ case Opt_iopen_nopriv:
++ set_opt (sbi->s_mount_opt, IOPEN);
++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++ break;
+ case Opt_ignore:
+ break;
+ case Opt_resize:
+Index: linux-2.6.27.21-0.1/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/namei.c
++++ linux-2.6.27.21-0.1/fs/ext4/namei.c
+@@ -39,6 +39,7 @@
+
+ #include "namei.h"
+ #include "xattr.h"
++#include "iopen.h"
+ #include "acl.h"
+
+ /*
+@@ -1054,6 +1055,9 @@ static struct dentry *ext4_lookup(struct
+ if (dentry->d_name.len > EXT4_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
++ if (ext4_check_for_iopen(dir, dentry))
++ return NULL;
++
+ bh = ext4_find_entry(dir, &dentry->d_name, &de);
+ inode = NULL;
+ if (bh) {
+@@ -1068,7 +1072,8 @@ static struct dentry *ext4_lookup(struct
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+ }
+- return d_splice_alias(inode, dentry);
++
++ return iopen_connect_dentry(dentry, inode, 1);
+ }
+
+
+@@ -1717,7 +1722,7 @@ static int ext4_add_nondir(handle_t *han
+ int err = ext4_add_entry(handle, dentry, inode);
+ if (!err) {
+ ext4_mark_inode_dirty(handle, inode);
+- d_instantiate(dentry, inode);
++ iopen_d_instantiate(dentry, inode);
+ return 0;
+ }
+ drop_nlink(inode);
+@@ -1876,7 +1881,7 @@ out_clear_inode:
+ ext4_inc_count(handle, dir);
+ ext4_update_dx_flag(dir);
+ ext4_mark_inode_dirty(handle, dir);
+- d_instantiate(dentry, inode);
++ iopen_d_instantiate(dentry, inode);
+ out_stop:
+ ext4_journal_stop(handle);
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+@@ -2142,10 +2147,6 @@ static int ext4_rmdir(struct inode *dir,
+ inode->i_nlink);
+ inode->i_version++;
+ clear_nlink(inode);
+- /* There's no need to set i_disksize: the fact that i_nlink is
+- * zero will ensure that the right thing happens during any
+- * recovery. */
+- inode->i_size = 0;
+ ext4_orphan_add(handle, inode);
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+@@ -2271,6 +2272,23 @@ out_stop:
+ return err;
+ }
+
++/* Like ext4_add_nondir() except for call to iopen_connect_dentry */
++static int ext4_add_link(handle_t *handle, struct dentry *dentry,
++ struct inode *inode)
++{
++ int err = ext4_add_entry(handle, dentry, inode);
++ if (!err) {
++ err = ext4_mark_inode_dirty(handle, inode);
++ if (err == 0) {
++ dput(iopen_connect_dentry(dentry, inode, 0));
++ return 0;
++ }
++ }
++ ext4_dec_count(handle, inode);
++ iput(inode);
++ return err;
++}
++
+ static int ext4_link(struct dentry *old_dentry,
+ struct inode *dir, struct dentry *dentry)
+ {
+@@ -2301,7 +2319,8 @@ retry:
+ ext4_inc_count(handle, inode);
+ atomic_inc(&inode->i_count);
+
+- err = ext4_add_nondir(handle, dentry, inode);
++ err = ext4_add_link(handle, dentry, inode);
++ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
+Index: linux-2.6.27.21-0.1/fs/ext4/Makefile
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/Makefile
++++ linux-2.6.27.21-0.1/fs/ext4/Makefile
+@@ -4,7 +4,7 @@
+
+ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
+
+-ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
++ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
+ ext4_jbd2.o migrate.o mballoc.o
+
+Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/ext4/ext4.h
++++ linux-2.6.27.21-0.1/fs/ext4/ext4.h
+@@ -540,6 +540,8 @@ do { \
+ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
+ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
+ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
++#define EXT4_MOUNT_IOPEN 0x10000000 /* Allow access via iopen */
++#define EXT4_MOUNT_IOPEN_NOPRIV 0x20000000 /* Make iopen world-readable */
+ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+ #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt