Whamcloud - gitweb
b=16893
author girish <girish>
Fri, 29 May 2009 12:24:06 +0000 (12:24 +0000)
committer girish <girish>
Fri, 29 May 2009 12:24:06 +0000 (12:24 +0000)
i=adilger
i=johann

ext4 ldiskfs patches for rhel5

23 files changed:
ldiskfs/kernel_patches/patches/export-ext4-2.6-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-alloc-policy-2.6-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-filterdata-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-ialloc-2.6-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-include-fixes-2.6-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-inode-version-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-lookup-dotdot-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-map_inode_page-2.6.18-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-max-dir-size-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-mballoc-handle-dev-paths-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-mmp-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-prealloc-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-remove-cond_resched-calls-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-unlink-race-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ext4-xattr-no-update-ctime-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/iopen-2.6.18-rhel5-ext4.patch [new file with mode: 0644]
ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series [new file with mode: 0644]

diff --git a/ldiskfs/kernel_patches/patches/export-ext4-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/export-ext4-2.6-rhel5.patch
new file mode 100644 (file)
index 0000000..3930843
--- /dev/null
@@ -0,0 +1,35 @@
+Index: linux-2.6.18.i386/fs/ext4/super.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/super.c
++++ linux-2.6.18.i386/fs/ext4/super.c
+@@ -185,6 +185,8 @@ void ext4_journal_abort_handle(const cha
+       jbd2_journal_abort_handle(handle);
+ }
++EXPORT_SYMBOL(ext4_journal_abort_handle);
++
+ /* Deal with the reporting of failure conditions on a filesystem such as
+  * inconsistencies detected or read IO failures.
+  *
+@@ -2459,6 +2461,8 @@ out_fail:
+       return ret;
+ }
++EXPORT_SYMBOL(ext4_force_commit);
++
+ /*
+  * Setup any per-fs journal parameters now.  We'll do this both on
+  * initial mount, once the journal has been initialised but before we've
+@@ -3502,6 +3506,12 @@ int ext4_map_inode_page(struct inode *in
+                       unsigned long *blocks, int *created, int create);
+ EXPORT_SYMBOL(ext4_map_inode_page);
++EXPORT_SYMBOL(ext4_xattr_get);
++EXPORT_SYMBOL(ext4_xattr_set_handle);
++EXPORT_SYMBOL(ext4_bread);
++EXPORT_SYMBOL(ext4_journal_start_sb);
++EXPORT_SYMBOL(__ext4_journal_stop);
++
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
+ MODULE_LICENSE("GPL");
diff --git a/ldiskfs/kernel_patches/patches/ext4-alloc-policy-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-alloc-policy-2.6-rhel5.patch
new file mode 100644 (file)
index 0000000..a1b8375
--- /dev/null
@@ -0,0 +1,101 @@
+Index: linux-2.6.18-128.1.6/fs/ext4/ialloc.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/ialloc.c
++++ linux-2.6.18-128.1.6/fs/ext4/ialloc.c
+@@ -946,6 +946,36 @@ fail_drop:
+       return ERR_PTR(err);
+ }
++/*
++ * Find a free inode by scanning the block groups from last to first.
++ *
++ * Each pass searches every group's inode bitmap for a zero bit at or after
++ * "offset".  The offset starts at half the inodes per group and is halved
++ * after every unsuccessful pass; the final pass uses offset 0 and therefore
++ * covers the whole bitmap.
++ *
++ * Returns the 1-based inode number of the first free bit found, or 0 when
++ * no free inode was found in any group.
++ */
++unsigned long ext4_find_reverse(struct super_block *sb)
++{
++      struct ext4_group_desc *desc;
++      struct buffer_head *bitmap_bh;
++      int group;
++      unsigned long ino, offset;
++
++      offset = EXT4_INODES_PER_GROUP(sb) >> 1;
++      for (;;) {
++              for (group = EXT4_SB(sb)->s_groups_count - 1; group >= 0;
++                   --group) {
++                      desc = ext4_get_group_desc(sb, group, NULL);
++                      /* skip missing descriptors and full groups */
++                      if (!desc || desc->bg_free_inodes_count == 0)
++                              continue;
++
++                      bitmap_bh = ext4_read_inode_bitmap(sb, group);
++                      if (!bitmap_bh)
++                              continue;
++
++                      ino = ext4_find_next_zero_bit((unsigned long *)
++                                      bitmap_bh->b_data,
++                                      EXT4_INODES_PER_GROUP(sb), offset);
++                      /* the bitmap is not needed past this point; release
++                       * the buffer on both the hit and the miss path */
++                      brelse(bitmap_bh);
++                      if (ino < EXT4_INODES_PER_GROUP(sb))
++                              return (group * EXT4_INODES_PER_GROUP(sb) +
++                                     ino + 1);
++              }
++
++              /*
++               * offset is unsigned, so a "offset >= 0" loop condition can
++               * never become false; stop explicitly once the pass that
++               * searched from offset 0 has failed.
++               */
++              if (offset == 0)
++                      break;
++              offset >>= 1;
++      }
++      return 0;
++}
++
+ /* Verify that we are loading a valid orphan from disk */
+ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
+ {
+Index: linux-2.6.18-128.1.6/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/namei.c
++++ linux-2.6.18-128.1.6/fs/ext4/namei.c
+@@ -151,14 +151,24 @@ struct dx_map_entry
+       u16 size;
+ };
++/*
++ * dentry_param used by ext4_new_inode_wantedi()
++ */
+ #define LVFS_DENTRY_PARAM_MAGIC               20070216UL
+ struct lvfs_dentry_params
+ {
+-      unsigned long   p_inum;
+-      void        *p_ptr;
+-      u32          magic;
++      unsigned long   ldp_inum;
++      long            ldp_flags;
++      u32             ldp_magic;
+ };
++/*
++ * Inode goal policy requested through lvfs_dentry_params.ldp_flags.
++ * Only use the least 3 bits of ldp_flags for goal policy
++ */
++typedef enum {
++      DP_GOAL_POLICY       = 0,     /* use ldp_inum as the goal inode number */
++      DP_LASTGROUP_REVERSE = 1,     /* take the goal from ext4_find_reverse() */
++} dp_policy_t;
++
++
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
+ static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
+ static inline unsigned dx_get_hash (struct dx_entry *entry);
+@@ -1762,8 +1772,13 @@ static struct inode * ext4_new_inode_wan
+       if (dentry->d_fsdata != NULL) {
+               struct lvfs_dentry_params *param = dentry->d_fsdata;
+-              if (param->magic == LVFS_DENTRY_PARAM_MAGIC)
+-                      inum = param->p_inum;
++              if (param->ldp_magic == LVFS_DENTRY_PARAM_MAGIC) {
++                      if ((dp_policy_t)(param->ldp_flags & 0x7) ==
++                          DP_LASTGROUP_REVERSE)
++                              inum = ext4_find_reverse(dir->i_sb);
++                        else /* DP_GOAL_POLICY */
++                              inum = param->ldp_inum;
++                }
+       }
+       return ext4_new_inode(handle, dir, mode, inum);
+ }
+Index: linux-2.6.18-128.1.6/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/ext4.h
++++ linux-2.6.18-128.1.6/fs/ext4/ext4.h
+@@ -1071,6 +1071,7 @@ extern int ext4fs_dirhash(const char *na
+ /* ialloc.c */
+ extern struct inode * ext4_new_inode (handle_t *, struct inode *, int,
+                                     unsigned long);
++extern unsigned long ext4_find_reverse(struct super_block *);
+ extern void ext4_free_inode (handle_t *, struct inode *);
+ extern struct inode * ext4_orphan_get (struct super_block *, unsigned long);
+ extern unsigned long ext4_count_free_inodes (struct super_block *);
diff --git a/ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-rhel5.patch
new file mode 100644 (file)
index 0000000..0ec5670
--- /dev/null
@@ -0,0 +1,56 @@
+Index: linux-2.6.18-128.1.6/fs/ext4/super.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/super.c
++++ linux-2.6.18-128.1.6/fs/ext4/super.c
+@@ -70,6 +70,8 @@ struct page *ext4_zero_page;
+ struct proc_dir_entry *proc_root_ext4;
++static int bigendian_extents;
++
+ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
+                              struct ext4_group_desc *bg)
+ {
+@@ -1222,7 +1224,7 @@ enum {
+       Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
+       Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
+       Opt_mballoc, Opt_nomballoc, Opt_stripe,
+-      Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
++      Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_bigendian_extents,
+ };
+ static match_table_t tokens = {
+@@ -1284,6 +1286,7 @@ static match_table_t tokens = {
+       {Opt_nomballoc, "nomballoc"},
+       {Opt_stripe, "stripe=%u"},
+       {Opt_resize, "resize"},
++      {Opt_bigendian_extents, "bigendian_extents"},
+       {Opt_err, NULL},
+ };
+@@ -1682,6 +1685,9 @@ clear_qf_name:
+                               return 0;
+                       sbi->s_stripe = option;
+                       break;
++              case Opt_bigendian_extents:
++                      bigendian_extents = 1;
++                      break;
+               default:
+                       printk(KERN_ERR
+                              "EXT4-fs: Unrecognized mount option \"%s\" "
+@@ -2561,6 +2567,15 @@ static int ext4_fill_super(struct super_
+               goto failed_mount;
+       }
++#ifdef __BIG_ENDIAN
++      if (bigendian_extents == 0) {
++              printk(KERN_ERR "EXT4-fs: extents feature is not guaranteed to "
++                     "work on big-endian systems. Use \"bigendian_extents\" "
++                     "mount option to override.\n");
++              goto failed_mount;
++      }
++#endif
++
+       bgl_lock_init(&sbi->s_blockgroup_lock);
+       sbi->s_last_alloc_group = -1;
diff --git a/ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-rhel5.patch
new file mode 100644 (file)
index 0000000..b2f80d5
--- /dev/null
@@ -0,0 +1,566 @@
+A large part of this code is from the generic VFS code in fs/ioctl.c in the
+upstream kernel.
+
+Index: linux-2.6.18.i386/fs/ext4/ioctl.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ioctl.c
++++ linux-2.6.18.i386/fs/ext4/ioctl.c
+@@ -17,6 +17,162 @@
+ #include "ext4_jbd2.h"
+ #include "ext4.h"
++#include "fiemap.h"
++
++/* So that the fiemap access checks can't overflow on 32 bit machines. */
++#define FIEMAP_MAX_EXTENTS     (UINT_MAX / sizeof(struct fiemap_extent))
++
++/**
++ * fiemap_fill_next_extent - Fiemap helper function
++ * @fieinfo:   Fiemap context passed into ->fiemap
++ * @logical:   Extent logical start offset, in bytes
++ * @phys:      Extent physical start offset, in bytes
++ * @len:       Extent length, in bytes
++ * @flags:     FIEMAP_EXTENT flags that describe this extent
++ * @dev:       Device on which this extent resides; stored in fe_device
++ *             after encoding with new_encode_dev()
++ *
++ * Called from file system ->fiemap callback. Will populate extent
++ * info as passed in via arguments and copy to user memory. On
++ * success, extent count on fieinfo is incremented. When
++ * fieinfo->fi_extents_max is 0 the extent is only counted and nothing
++ * is copied to user memory.
++ *
++ * Returns 0 on success, -EFAULT if the copy to user memory fails, 1 if
++ * this was the last extent that will fit in user array (or the array
++ * was already full) or if @flags has FIEMAP_EXTENT_LAST set.
++ *
++ * The SET_*_FLAGS masks below list the extent flags that imply a summary
++ * flag (UNKNOWN, NO_DIRECT, ENCODED, NOT_ALIGNED); the summary flag is
++ * OR-ed into @flags before the extent is copied out.
++ */
++#define SET_UNKNOWN_FLAGS      (FIEMAP_EXTENT_DELALLOC)
++#define SET_NO_DIRECT_FLAGS    (FIEMAP_EXTENT_DATA_ENCRYPTED  \
++                              |FIEMAP_EXTENT_NET)
++#define SET_NO_UNMOUNTED_IO_FLAGS       (FIEMAP_EXTENT_DATA_ENCRYPTED)
++#define SET_NOT_ALIGNED_FLAGS  (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
++int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
++                          u64 phys, u64 len, u32 flags, dev_t dev)
++{
++      struct fiemap_extent extent = { 0 };
++      struct fiemap_extent *dest = fieinfo->fi_extents_start;
++
++      /* only count the extents */
++      if (fieinfo->fi_extents_max == 0) {
++              fieinfo->fi_extents_mapped++;
++              return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
++      }
++
++      if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
++              return 1;       /* no room left in the user array */
++
++      if (flags & SET_UNKNOWN_FLAGS)
++              flags |= FIEMAP_EXTENT_UNKNOWN;
++      if (flags & SET_NO_DIRECT_FLAGS)
++              flags |= FIEMAP_EXTENT_NO_DIRECT;
++      if (flags & SET_NOT_ALIGNED_FLAGS)
++              flags |= FIEMAP_EXTENT_NOT_ALIGNED;
++      if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
++              flags |= FIEMAP_EXTENT_ENCODED;
++
++      extent.fe_logical = logical;
++      extent.fe_physical = phys;
++      extent.fe_length = len;
++      extent.fe_flags = flags;
++      extent.fe_device = new_encode_dev(dev);
++
++      dest += fieinfo->fi_extents_mapped;     /* next unused slot */
++      if (copy_to_user(dest, &extent, sizeof(extent)))
++              return -EFAULT;
++
++      fieinfo->fi_extents_mapped++;
++      if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max)
++              return 1;       /* that extent filled the last slot */
++
++      return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
++}
++
++static int fiemap_check_ranges(struct super_block *sb,
++                             u64 start, u64 len, u64 *new_len)
++{
++      *new_len = len;         /* default: the request is used unchanged */
++
++      if (len == 0)
++              return -EINVAL; /* empty range */
++
++      if (start > sb->s_maxbytes)
++              return -EFBIG;  /* range starts beyond s_maxbytes */
++
++      /*
++       * Shrink request scope to what the fs can actually handle.
++       *
++       * If the range [start, start + len) would extend past s_maxbytes,
++       * *new_len is reduced so that the range ends at s_maxbytes.
++       * NOTE(review): the test is presumably phrased as a subtraction so
++       * that start + len is never computed and cannot overflow - confirm.
++       */
++      if ((len > sb->s_maxbytes) ||
++          (sb->s_maxbytes - len) < start)
++              *new_len = sb->s_maxbytes - start;
++
++      return 0;
++}
++
++/*
++ * fiemap_check_flags - check validity of requested flags for fiemap
++ * @fieinfo:   Fiemap context passed into ->fiemap
++ * @fs_flags:  Set of fiemap flags that the file system understands
++ *
++ * Called from file system ->fiemap callback. This will compute the
++ * intersection of valid fiemap flags and those that the fs supports. That
++ * value is then compared against the user supplied flags. In case of bad user
++ * flags, the invalid values will be written into fieinfo->fi_flags (replacing
++ * the flags the user passed), and -EBADR is returned, which tells
++ * ioctl_fiemap() to return those values to userspace. For this reason, a
++ * return code of -EBADR should be preserved.
++ *
++ * Returns 0 on success, -EBADR on bad flags.
++ */
++int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags)
++{
++      u32 incompat_flags;
++      /* user flags that are not in both FIEMAP_FLAGS_COMPAT and fs_flags */
++      incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags);
++      if (incompat_flags) {
++              fieinfo->fi_flags = incompat_flags;
++              return -EBADR;
++      }
++
++      return 0;
++}
++
++int ioctl_fiemap(struct inode *inode, struct file *filp, unsigned long arg)
++{
++      struct fiemap fiemap;
++      u64 len;
++      struct fiemap_extent_info fieinfo = {0, };
++      struct super_block *sb = inode->i_sb;
++      int error = 0;
++      /*
++       * Handler for EXT4_IOC_FIEMAP.  @arg is the user-space address of a
++       * struct fiemap followed by fm_extent_count struct fiemap_extent
++       * slots.  The header is copied in, validated, handed to
++       * ext4_fiemap(), and copied back with fm_flags and
++       * fm_mapped_extents updated.  Returns 0 or a negative errno.
++       */
++      if (copy_from_user(&fiemap, (struct fiemap __user *) arg,
++                         sizeof(struct fiemap)))
++               return -EFAULT;
++
++      if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
++              return -EINVAL;
++
++      error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
++                                  &len);
++      if (error)
++              return error;
++
++      fieinfo.fi_flags = fiemap.fm_flags;
++      fieinfo.fi_extents_max = fiemap.fm_extent_count;
++      fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); /* extent array directly follows the header in user memory */
++
++      if (fiemap.fm_extent_count != 0 &&
++          !access_ok(VERIFY_WRITE, (void *)arg,
++                     offsetof(typeof(fiemap), fm_extents[fiemap.fm_extent_count])))
++              return -EFAULT;
++
++      if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
++              filemap_write_and_wait(inode->i_mapping); /* return value is not checked */
++
++      error = ext4_fiemap(inode, &fieinfo, fiemap.fm_start, len);
++      fiemap.fm_flags = fieinfo.fi_flags;     /* may hold the rejected flags after -EBADR */
++      fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
++      if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap)))
++              error = -EFAULT;
++
++      return error;
++}
++
+ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+ {
+       struct inode *inode = filp->f_dentry->d_inode;
+@@ -257,6 +413,10 @@ flags_err:
+       case EXT4_IOC_MIGRATE:
+               return ext4_ext_migrate(inode, filp, cmd, arg);
++      case EXT4_IOC_FIEMAP: {
++              return ioctl_fiemap(inode, filp, arg);
++      }
++
+       default:
+               return -ENOTTY;
+       }
+Index: linux-2.6.18.i386/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ext4.h
++++ linux-2.6.18.i386/fs/ext4/ext4.h
+@@ -300,6 +300,7 @@ struct ext4_new_group_data {
+ #define EXT4_IOC_GETRSVSZ             _IOR('f', 5, long)
+ #define EXT4_IOC_SETRSVSZ             _IOW('f', 6, long)
+ #define EXT4_IOC_MIGRATE              _IO('f', 7)
++#define EXT4_IOC_FIEMAP                       _IOWR('f', 11, struct fiemap)
+ /*
+  * ioctl commands in 32 bit emulation
+@@ -317,6 +318,8 @@ struct ext4_new_group_data {
+ #define EXT4_IOC32_GETVERSION_OLD     FS_IOC32_GETVERSION
+ #define EXT4_IOC32_SETVERSION_OLD     FS_IOC32_SETVERSION
++/* FIEMAP flags supported by ext4 */
++#define EXT4_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC)
+ /*
+  *  Mount options
+@@ -1115,6 +1118,9 @@ extern int ext4_page_mkwrite(struct vm_a
+ /* ioctl.c */
+ extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
+ extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long);
++struct fiemap_extent_info;
++extern int ext4_fiemap(struct inode *, struct fiemap_extent_info *, __u64,
++                     __u64);
+ /* migrate.c */
+ extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int,
+Index: linux-2.6.18.i386/fs/ext4/ext4_extents.h
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ext4_extents.h
++++ linux-2.6.18.i386/fs/ext4/ext4_extents.h
+@@ -128,6 +128,22 @@ struct ext4_ext_path {
+ #define EXT_MAX_BLOCK 0xffffffff
+ /*
++ * to be called by ext4_ext_walk_space()
++ * negative retcode - error
++ * positive retcode - signal for ext4_ext_walk_space(), see below
++ * callback must return valid extent (passed or newly created)
++ */
++typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
++                                  struct ext4_ext_cache *,
++                                  struct ext4_extent *, void *);
++
++#define HAVE_EXT_PREPARE_CB_EXTENT
++
++#define EXT_CONTINUE   0
++#define EXT_BREAK      1
++#define EXT_REPEAT     2
++
++/*
+  * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
+  * initialized extent. This is 2^15 and not (2^16 - 1), since we use the
+  * MSB of ee_len field in the extent datastructure to signify if this
+@@ -223,6 +239,8 @@ extern int ext4_ext_try_to_merge(struct 
+                                struct ext4_extent *);
+ extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
+ extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
++extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
++                             ext_prepare_callback, void *);
+ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
+                                                       struct ext4_ext_path *);
+ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
+Index: linux-2.6.18.i386/fs/ext4/extents.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/extents.c
++++ linux-2.6.18.i386/fs/ext4/extents.c
+@@ -44,7 +44,7 @@
+ #include <asm/uaccess.h>
+ #include "ext4_jbd2.h"
+ #include "ext4_extents.h"
+-
++#include "fiemap.h"
+ /*
+  * ext_pblock:
+@@ -1597,6 +1597,113 @@ cleanup:
+       return err;
+ }
++int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
++                      ext4_lblk_t num, ext_prepare_callback func,
++                      void *cbdata)
++{
++      struct ext4_ext_path *path = NULL;
++      struct ext4_ext_cache cbex;
++      struct ext4_extent *ex;
++      ext4_lblk_t next, start = 0, end = 0;
++      ext4_lblk_t last = block + num;
++      int depth, exists, err = 0;
++      /*
++       * Walk logical blocks [block, block + num) of @inode and call @func
++       * once for every piece of that range, passing @cbdata through.  A
++       * piece is either an existing extent (cbex.ec_type ==
++       * EXT4_EXT_CACHE_EXTENT, described from the extent's own start
++       * block) or a gap with no extent (EXT4_EXT_CACHE_GAP).
++       *
++       * @func's return value steers the walk: a negative value aborts and
++       * is returned, EXT_REPEAT looks the same block up again, EXT_BREAK
++       * stops the walk and returns 0, anything else advances to the block
++       * after the piece just reported.
++       */
++      BUG_ON(func == NULL);
++      BUG_ON(inode == NULL);
++
++      while (block < last && block != EXT_MAX_BLOCK) {
++              num = last - block;
++              /* find extent for this block */
++              path = ext4_ext_find_extent(inode, block, path);
++              if (IS_ERR(path)) {
++                      err = PTR_ERR(path);
++                      path = NULL;
++                      break;
++              }
++
++              depth = ext_depth(inode);
++              BUG_ON(path[depth].p_hdr == NULL);
++              ex = path[depth].p_ext;
++              next = ext4_ext_next_allocated_block(path);
++
++              exists = 0;
++              if (!ex) {
++                      /* there is no extent at all, so the whole
++                       * remaining range is reported as a gap */
++                      start = block;
++                      end = block + num;
++              } else if (le32_to_cpu(ex->ee_block) > block) {
++                      /* gap before the found extent, clipped to the
++                       * requested range */
++                      start = block;
++                      end = le32_to_cpu(ex->ee_block);
++                      if (block + num < end)
++                              end = block + num;
++              } else if (block >= le32_to_cpu(ex->ee_block)
++                                      + ext4_ext_get_actual_len(ex)) {
++                      /* gap after the found extent, ending at the next
++                       * allocated block or the end of the range */
++                      start = block;
++                      end = block + num;
++                      if (end >= next)
++                              end = next;
++              } else if (block >= le32_to_cpu(ex->ee_block)) {
++                      /*
++                       * some part of requested space is covered
++                       * by found extent
++                       */
++                      start = block;
++                      end = le32_to_cpu(ex->ee_block)
++                              + ext4_ext_get_actual_len(ex);
++                      if (block + num < end)
++                              end = block + num;
++                      exists = 1;
++              } else {
++                      BUG();
++              }
++              BUG_ON(end <= start);
++
++              if (!exists) {
++                      cbex.ec_block = start;
++                      cbex.ec_len = end - start;
++                      cbex.ec_start = 0;
++                      cbex.ec_type = EXT4_EXT_CACHE_GAP;
++              } else {
++                      cbex.ec_block = le32_to_cpu(ex->ee_block); /* whole extent, not clipped to start/end */
++                      cbex.ec_len = ext4_ext_get_actual_len(ex);
++                      cbex.ec_start = ext_pblock(ex);
++                      cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
++              }
++
++              BUG_ON(cbex.ec_len == 0);
++              err = func(inode, path, &cbex, ex, cbdata);
++              ext4_ext_drop_refs(path);
++
++              if (err < 0)
++                      break;
++
++              if (err == EXT_REPEAT)
++                      continue;
++              else if (err == EXT_BREAK) {
++                      err = 0;
++                      break;
++              }
++
++              if (ext_depth(inode) != depth) {
++                      /* depth was changed. we have to realloc path */
++                      kfree(path);
++                      path = NULL;
++              }
++
++              block = cbex.ec_block + cbex.ec_len;    /* first block after the reported piece */
++      }
++
++      if (path) {
++              ext4_ext_drop_refs(path);
++              kfree(path);
++      }
++
++      return err;
++}
++
+ static void
+ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
+                       __u32 len, ext4_fsblk_t start, int type)
+@@ -2953,3 +3060,100 @@ retry:
+       return ret > 0 ? ret2 : ret;
+ }
+ #endif
++
++/*
++ * Callback function called for each extent to gather FIEMAP information.
++ *
++ * @newex describes the extent or gap being visited, @ex is the on-disk
++ * extent (may be NULL for a gap) and @data is the struct
++ * fiemap_extent_info being filled.  A gap is reported only when the first
++ * buffer of the page cached at its start is marked delayed-allocation;
++ * other gaps are skipped.
++ *
++ * Returns EXT_CONTINUE to keep walking, EXT_BREAK once
++ * fiemap_fill_next_extent() reports that no further extent should be
++ * added, or the negative errno it returned.
++ */
++int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
++                     struct ext4_ext_cache *newex, struct ext4_extent *ex,
++                     void *data)
++{
++      struct fiemap_extent_info *fieinfo = data;
++      unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
++      __u64   logical;
++      __u64   physical;
++      __u64   length;
++      __u32   flags = 0;
++      int     error;
++
++      logical =  (__u64)newex->ec_block << blksize_bits;
++
++      if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
++              pgoff_t offset;
++              struct page *page;
++              struct buffer_head *bh = NULL;
++              int delayed;
++
++              offset = logical >> PAGE_SHIFT;
++              page = find_get_page(inode->i_mapping, offset);
++              if (!page)
++                      return EXT_CONTINUE;
++
++              if (page_has_buffers(page))
++                      bh = page_buffers(page);
++              delayed = bh && buffer_delay(bh);
++
++              /*
++               * Drop the reference taken by find_get_page() on every
++               * path, including pages that have no buffers attached.
++               */
++              page_cache_release(page);
++
++              if (!delayed)
++                      return EXT_CONTINUE;
++
++              flags |= FIEMAP_EXTENT_DELALLOC;
++      }
++
++      physical = (__u64)newex->ec_start << blksize_bits;
++      length =   (__u64)newex->ec_len << blksize_bits;
++
++      if (ex && ext4_ext_is_uninitialized(ex))
++              flags |= FIEMAP_EXTENT_UNWRITTEN;
++
++      /*
++       * If this extent reaches EXT_MAX_BLOCK, it must be last.
++       *
++       * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
++       * this indicates no more allocated blocks.
++       *
++       * EXT_MAX_BLOCK is a block number, so the end of the extent is
++       * computed in blocks rather than from the byte values above.
++       *
++       * XXX this might miss a single-block extent at EXT_MAX_BLOCK
++       */
++      if (newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK ||
++          ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
++              flags |= FIEMAP_EXTENT_LAST;
++
++      error = fiemap_fill_next_extent(fieinfo, logical, physical,
++                                      length, flags, inode->i_sb->s_dev);
++      if (error < 0)
++              return error;
++      if (error == 1)
++              return EXT_BREAK;
++
++      return EXT_CONTINUE;
++}
++
++/*
++ * ext4_fiemap - report the extent mapping of a byte range of @inode
++ * @inode:    inode to map; must have EXT4_EXTENTS_FL set
++ * @fieinfo:  FIEMAP context that ext4_ext_fiemap_cb() fills
++ * @start:    first byte of the range
++ * @len:      length of the range, in bytes
++ *
++ * Returns 0 on success, -EOPNOTSUPP for inodes without EXT4_EXTENTS_FL,
++ * -EBADR when @fieinfo carries flags outside EXT4_FIEMAP_FLAGS_COMPAT, or
++ * the error returned by ext4_ext_walk_space().
++ */
++int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
++              __u64 start, __u64 len)
++{
++      ext4_fsblk_t start_blk;
++      ext4_fsblk_t len_blks;
++      int error = 0;
++
++      if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++              return -EOPNOTSUPP;
++
++      if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS_COMPAT))
++              return -EBADR;
++
++      /*
++       * Convert the byte range into the run of blocks that covers it.
++       * The count is taken from the block holding the last byte, so a
++       * range whose start is not block aligned still includes its final
++       * block; rounding len up on its own would drop that block.
++       */
++      start_blk = start >> inode->i_sb->s_blocksize_bits;
++      if (len == 0)
++              len_blks = 0;
++      else
++              len_blks = ((start + len - 1) >>
++                          inode->i_sb->s_blocksize_bits) - start_blk + 1;
++
++      /*
++        * Walk the extent tree gathering extent information.
++        * ext4_ext_fiemap_cb will push extents back to user.
++        */
++      down_write(&EXT4_I(inode)->i_data_sem);
++      error = ext4_ext_walk_space(inode, start_blk, len_blks,
++                                ext4_ext_fiemap_cb, fieinfo);
++      up_write(&EXT4_I(inode)->i_data_sem);
++
++      return error;
++}
+Index: linux-2.6.18.i386/fs/ext4/fiemap.h
+===================================================================
+--- /dev/null
++++ linux-2.6.18.i386/fs/ext4/fiemap.h
+@@ -0,0 +1,85 @@
++/*
++ * FIEMAP ioctl infrastructure.
++ *
++ * Copyright 2008 Sun Microsystems, Inc
++ *
++ * Author: Kalpak Shah <kalpak.shah@sun.com>
++ *     Andreas Dilger <adilger@sun.com>
++ */
++
++#ifndef _LINUX_EXT4_FIEMAP_H
++#define _LINUX_EXT4_FIEMAP_H
++
++struct fiemap_extent {
++      __u64 fe_logical;  /* logical offset in bytes for the start of
++                          * the extent from the beginning of the file */
++      __u64 fe_physical; /* physical offset in bytes for the start
++                          * of the extent from the beginning of the disk */
++      __u64 fe_length;   /* length in bytes for this extent */
++      __u64 fe_reserved64[2];
++      __u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
++      __u32 fe_device;   /* device number for this extent */
++      __u32 fe_reserved[2];
++};
++
++struct fiemap {
++      __u64 fm_start;  /* logical offset (inclusive) at
++                               * which to start mapping (in) */
++      __u64 fm_length;        /* logical length of mapping which
++                               * userspace wants (in) */
++      __u32 fm_flags;  /* FIEMAP_FLAG_* flags for request (in/out) */
++      __u32 fm_mapped_extents;/* number of extents that were mapped (out) */
++      __u32 fm_extent_count;  /* size of fm_extents array (in) */
++      __u32 fm_reserved;
++      struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
++};
++
++/*
++ * FIEMAP helper definition.
++ *
++ * Kernel-side context for one FIEMAP request: ioctl_fiemap() fills it
++ * from the user's struct fiemap and fiemap_fill_next_extent() updates it
++ * as extents are reported.
++ */
++struct fiemap_extent_info {
++      unsigned int    fi_flags;               /* Flags as passed from user; replaced by the rejected flags when fiemap_check_flags() fails */
++      unsigned int    fi_extents_mapped;      /* Number of mapped extents */
++      unsigned int    fi_extents_max;         /* Size of fiemap_extent array; 0 means only count the extents */
++      struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array (user-space address, written with copy_to_user()) */
++};
++
++int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
++int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
++                          u64 phys, u64 len, u32 flags, u32 lun);
++
++#define       FIEMAP_MAX_OFFSET       (~0ULL)
++
++#define       FIEMAP_FLAG_SYNC        0x00000001 /* sync file data before map */
++#define       FIEMAP_FLAG_XATTR       0x00000002 /* map extended attribute tree */
++
++/* All request flags known to FIEMAP; ldiskfs itself only supports FLAG_SYNC (see EXT4_FIEMAP_FLAGS_COMPAT) */
++#define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)
++
++#define FIEMAP_EXTENT_LAST            0x00000001 /* Last extent in file. */
++#define FIEMAP_EXTENT_UNKNOWN         0x00000002 /* Data location unknown. */
++#define FIEMAP_EXTENT_DELALLOC                0x00000004 /* Location still pending.
++                                                  * Sets EXTENT_UNKNOWN. */
++#define FIEMAP_EXTENT_ENCODED         0x00000008 /* Data can not be read
++                                                  * while fs is unmounted */
++#define FIEMAP_EXTENT_DATA_ENCRYPTED  0x00000080 /* Data is encrypted by fs.
++                                                  * Sets EXTENT_NO_DIRECT. */
++#define FIEMAP_EXTENT_NOT_ALIGNED     0x00000100 /* Extent offsets may not be
++                                                  * block aligned. */
++#define FIEMAP_EXTENT_DATA_INLINE     0x00000200 /* Data mixed with metadata.
++                                                  * Sets EXTENT_NOT_ALIGNED.*/
++#define FIEMAP_EXTENT_DATA_TAIL               0x00000400 /* Multiple files in block.
++                                                  * Sets EXTENT_NOT_ALIGNED.*/
++#define FIEMAP_EXTENT_UNWRITTEN               0x00000800 /* Space allocated, but
++                                                  * no data (i.e. zero). */
++#define FIEMAP_EXTENT_MERGED          0x00001000 /* File does not natively
++                                                  * support extents. Result
++                                                  * merged for efficiency. */
++
++/* Lustre specific flags - use a high bit, don't conflict with upstream flag */
++#define FIEMAP_EXTENT_NO_DIRECT               0x40000000 /* Data mapping undefined */
++#define FIEMAP_EXTENT_NET             0x80000000 /* Data stored remotely.
++                                                  * Sets NO_DIRECT flag */
++
++#endif /* _LINUX_EXT4_FIEMAP_H */
++
diff --git a/ldiskfs/kernel_patches/patches/ext4-filterdata-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-filterdata-rhel5.patch
new file mode 100644 (file)
index 0000000..25ea28a
--- /dev/null
@@ -0,0 +1,25 @@
+Index: linux-2.6.18.i386/fs/ext4/ext4_i.h
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ext4_i.h
++++ linux-2.6.18.i386/fs/ext4/ext4_i.h
+@@ -162,6 +162,8 @@ struct ext4_inode_info {
+       /* mballoc */
+       struct list_head i_prealloc_list;
+       spinlock_t i_prealloc_lock;
++
++      void *i_filterdata;
+ };
+ #endif        /* _EXT4_I */
+Index: linux-2.6.18.i386/fs/ext4/super.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/super.c
++++ linux-2.6.18.i386/fs/ext4/super.c
+@@ -574,6 +574,7 @@ static struct inode *ext4_alloc_inode(st
+       memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
+       INIT_LIST_HEAD(&ei->i_prealloc_list);
+       spin_lock_init(&ei->i_prealloc_lock);
++      ei->i_filterdata = NULL;
+       return &ei->vfs_inode;
+ }
diff --git a/ldiskfs/kernel_patches/patches/ext4-ialloc-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-ialloc-2.6-rhel5.patch
new file mode 100644 (file)
index 0000000..7361a24
--- /dev/null
@@ -0,0 +1,129 @@
+Index: linux-2.6.18.i386/fs/ext4/ialloc.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ialloc.c
++++ linux-2.6.18.i386/fs/ext4/ialloc.c
+@@ -509,12 +509,16 @@ fallback:
+ }
+ static int find_group_other(struct super_block *sb, struct inode *parent,
+-                              ext4_group_t *group)
++                          ext4_group_t *group, int mode)
+ {
++      struct ext4_sb_info *sbi = EXT4_SB(sb);
+       ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+-      ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
++      ext4_group_t ngroups = sbi->s_groups_count;
+       struct ext4_group_desc *desc;
+       ext4_group_t i;
++      int best_group = -1;
++      ext4_fsblk_t avefreeb, freeb;
++      int best_group_freeb = 0;
+       /*
+        * Try to place the inode in its parent directory
+@@ -522,8 +526,10 @@ static int find_group_other(struct super
+       *group = parent_group;
+       desc = ext4_get_group_desc(sb, *group, NULL);
+       if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
+-                      le16_to_cpu(desc->bg_free_blocks_count))
++          (!S_ISREG(mode) || le16_to_cpu(desc->bg_free_blocks_count)))
+               return 0;
++      avefreeb = ext4_free_blocks_count(sbi->s_es);
++      do_div(avefreeb, ngroups);
+       /*
+        * We're going to place this inode in a different blockgroup from its
+@@ -537,33 +543,49 @@ static int find_group_other(struct super
+       *group = (*group + parent->i_ino) % ngroups;
+       /*
+-       * Use a quadratic hash to find a group with a free inode and some free
+-       * blocks.
++       * Use a quadratic hash to find a group with a free inode and
++       * average number of free blocks.
+        */
+       for (i = 1; i < ngroups; i <<= 1) {
+               *group += i;
+               if (*group >= ngroups)
+                       *group -= ngroups;
+               desc = ext4_get_group_desc(sb, *group, NULL);
+-              if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
+-                              le16_to_cpu(desc->bg_free_blocks_count))
++              if (!desc || !desc->bg_free_inodes_count)
++                      continue;
++              if (!S_ISREG(mode))
++                      return 0;
++              if (le16_to_cpu(desc->bg_free_blocks_count) >= avefreeb)
+                       return 0;
+       }
+       /*
+-       * That failed: try linear search for a free inode, even if that group
+-       * has no free blocks.
++       * That failed: starting from the last group used to allocate an
++       * inode, try a linear search for a free inode and preferably
++       * free blocks.
+        */
+-      *group = parent_group;
++      *group = sbi->s_last_alloc_group;
++      if (*group == -1)
++              *group = parent_group;
++
+       for (i = 0; i < ngroups; i++) {
+               if (++*group >= ngroups)
+                       *group = 0;
+               desc = ext4_get_group_desc(sb, *group, NULL);
+-              if (desc && le16_to_cpu(desc->bg_free_inodes_count))
+-                      return 0;
++              if (!desc || !desc->bg_free_inodes_count)
++                      continue;
++              freeb = le16_to_cpu(desc->bg_free_blocks_count);
++              if (freeb > best_group_freeb) {
++                      best_group_freeb = freeb;
++                      best_group = *group;
++                      if (freeb >= avefreeb || !S_ISREG(mode))
++                              break;
++              }
+       }
+-      return -1;
++      sbi->s_last_alloc_group = best_group;
++      *group = best_group;
++      return 0;
+ }
+ /*
+@@ -656,7 +678,7 @@ continue_allocation:
+               else
+                       ret2 = find_group_orlov(sb, dir, &group);
+       } else
+-              ret2 = find_group_other(sb, dir, &group);
++              ret2 = find_group_other(sb, dir, &group, mode);
+ got_group:
+       err = -ENOSPC;
+Index: linux-2.6.18.i386/fs/ext4/super.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/super.c
++++ linux-2.6.18.i386/fs/ext4/super.c
+@@ -2190,6 +2190,7 @@ static int ext4_fill_super(struct super_
+       bgl_lock_init(&sbi->s_blockgroup_lock);
++      sbi->s_last_alloc_group = -1;
+       for (i = 0; i < db_count; i++) {
+               block = descriptor_loc(sb, logical_sb_block, i);
+               sbi->s_group_desc[i] = sb_bread(sb, block);
+Index: linux-2.6.18.i386/fs/ext4/ext4_sb.h
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ext4_sb.h
++++ linux-2.6.18.i386/fs/ext4/ext4_sb.h
+@@ -60,6 +60,8 @@ struct ext4_sb_info {
+       struct percpu_counter s_freeinodes_counter;
+       struct percpu_counter s_dirs_counter;
+       struct blockgroup_lock s_blockgroup_lock;
++      /* Last group used to allocate inode */
++      int s_last_alloc_group;
+       /* root of the per fs reservation window tree */
+       spinlock_t s_rsv_window_lock;
diff --git a/ldiskfs/kernel_patches/patches/ext4-include-fixes-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-include-fixes-2.6-rhel5.patch
new file mode 100644 (file)
index 0000000..0009eaa
--- /dev/null
@@ -0,0 +1,20 @@
+Index: linux-2.6.18.i386/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ext4.h
++++ linux-2.6.18.i386/fs/ext4/ext4.h
+@@ -541,12 +541,13 @@ do {                                                                            \
+ #define EXT4_MOUNT_IOPEN              0x8000000 /* Allow access via iopen */
+ #define EXT4_MOUNT_IOPEN_NOPRIV               0x10000000 /* Make iopen world-readable */
+ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
+-#ifndef _LINUX_EXT2_FS_H
++#ifndef clear_opt
+ #define clear_opt(o, opt)             o &= ~EXT4_MOUNT_##opt
+ #define set_opt(o, opt)                       o |= EXT4_MOUNT_##opt
+ #define test_opt(sb, opt)             (EXT4_SB(sb)->s_mount_opt & \
+                                        EXT4_MOUNT_##opt)
+-#else
++#endif
++#ifndef EXT2_MOUNT_NOLOAD
+ #define EXT2_MOUNT_NOLOAD             EXT4_MOUNT_NOLOAD
+ #define EXT2_MOUNT_ABORT              EXT4_MOUNT_ABORT
+ #define EXT2_MOUNT_DATA_FLAGS         EXT4_MOUNT_DATA_FLAGS
diff --git a/ldiskfs/kernel_patches/patches/ext4-inode-version-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-inode-version-rhel5.patch
new file mode 100644 (file)
index 0000000..d8a31ad
--- /dev/null
@@ -0,0 +1,105 @@
+Index: linux-2.6.18-128.1.6/fs/ext4/inode.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/inode.c
++++ linux-2.6.18-128.1.6/fs/ext4/inode.c
+@@ -2850,11 +2850,11 @@ struct inode *ext4_iget(struct super_blo
+       EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
+       EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
+-      inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
++      ei->i_fs_version = le32_to_cpu(raw_inode->i_disk_version);
+       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+               if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+-                      inode->i_version |=
+-                      (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
++                      ei->i_fs_version |= (__u64)(le32_to_cpu(raw_inode->i_version_hi))
++                                                                       << 32;
+       }
+       if (S_ISREG(inode->i_mode)) {
+@@ -3043,16 +3043,11 @@ static int ext4_do_update_inode(handle_t
+       } else for (block = 0; block < EXT4_N_BLOCKS; block++)
+               raw_inode->i_block[block] = ei->i_data[block];
+-      raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
++      raw_inode->i_disk_version = cpu_to_le32(ei->i_fs_version);
+       if (ei->i_extra_isize) {
+               if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+-                      /* in RHEL5 i_version is an unsigned long */
+-#if BITS_PER_LONG == 64
+-                      raw_inode->i_version_hi =
+-                      cpu_to_le32(inode->i_version >> 32);
+-#else
+-                      raw_inode->i_version_hi = 0;
+-#endif
++                      raw_inode->i_version_hi = cpu_to_le32(ei->i_fs_version
++                                                            >> 32);
+               raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+       }
+Index: linux-2.6.18-128.1.6/fs/ext4/ext4_i.h
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/ext4_i.h
++++ linux-2.6.18-128.1.6/fs/ext4/ext4_i.h
+@@ -21,6 +21,8 @@
+ #include <linux/seqlock.h>
+ #include <linux/mutex.h>
++#define HAVE_DISK_INODE_VERSION
++
+ /* data type for block offset of block group */
+ typedef int ext4_grpblk_t;
+@@ -164,6 +166,8 @@ struct ext4_inode_info {
+       spinlock_t i_prealloc_lock;
+       void *i_filterdata;
++
++      __u64 i_fs_version;
+ };
+ #endif        /* _EXT4_I */
+Index: linux-2.6.18-128.1.6/fs/ext4/xattr.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/xattr.c
++++ linux-2.6.18-128.1.6/fs/ext4/xattr.c
+@@ -959,13 +959,18 @@ ext4_xattr_set_handle(handle_t *handle, 
+       struct ext4_xattr_block_find bs = {
+               .s = { .not_found = -ENODATA, },
+       };
++      unsigned long no_expand;
+       int error;
+       if (!name)
+               return -EINVAL;
+       if (strlen(name) > 255)
+               return -ERANGE;
++
+       down_write(&EXT4_I(inode)->xattr_sem);
++      no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
++      EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
++
+       error = ext4_get_inode_loc(inode, &is.iloc);
+       if (error)
+               goto cleanup;
+@@ -1042,6 +1047,8 @@ ext4_xattr_set_handle(handle_t *handle, 
+ cleanup:
+       brelse(is.iloc.bh);
+       brelse(bs.bh);
++      if (no_expand == 0)
++              EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+       up_write(&EXT4_I(inode)->xattr_sem);
+       return error;
+ }
+Index: linux-2.6.18-128.1.6/fs/ext4/ialloc.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/ialloc.c
++++ linux-2.6.18-128.1.6/fs/ext4/ialloc.c
+@@ -878,6 +878,7 @@ got:
+       ei->i_dtime = 0;
+       ei->i_block_alloc_info = NULL;
+       ei->i_block_group = group;
++      ei->i_fs_version = 0;
+       ext4_set_inode_flags(inode);
+       if (IS_DIRSYNC(inode))
diff --git a/ldiskfs/kernel_patches/patches/ext4-lookup-dotdot-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-lookup-dotdot-rhel5.patch
new file mode 100644 (file)
index 0000000..af019fa
--- /dev/null
@@ -0,0 +1,63 @@
+Index: linux-2.6.18.i386/fs/ext4/iopen.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/iopen.c
++++ linux-2.6.18.i386/fs/ext4/iopen.c
+@@ -91,9 +91,12 @@ static struct dentry *iopen_lookup(struc
+               assert(!(alternate->d_flags & DCACHE_DISCONNECTED));
+       }
+-      if (!list_empty(&inode->i_dentry)) {
+-              alternate = list_entry(inode->i_dentry.next,
+-                                     struct dentry, d_alias);
++      list_for_each(lp, &inode->i_dentry) {
++              alternate = list_entry(lp, struct dentry, d_alias);
++              /* ignore dentries created for ".." to preserve
++               * proper dcache hierarchy -- bug 10458 */
++              if (alternate->d_flags & DCACHE_NFSFS_RENAMED)
++                      continue;
+               dget_locked(alternate);
+               spin_lock(&alternate->d_lock);
+               alternate->d_flags |= DCACHE_REFERENCED;
+Index: linux-2.6.18.i386/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/namei.c
++++ linux-2.6.18.i386/fs/ext4/namei.c
+@@ -1067,6 +1067,38 @@ static struct dentry *ext4_lookup(struct
+                       return ERR_CAST(inode);
+       }
++      /* ".." shouldn't go into dcache to preserve dcache hierarchy
++       * otherwise we'll get parent being a child of actual child.
++       * see bug 10458 for details -bzzz */
++      if (inode && (dentry->d_name.name[0] == '.' && (dentry->d_name.len == 1 ||
++              (dentry->d_name.len == 2 && dentry->d_name.name[1] == '.')))) {
++              struct dentry *tmp, *goal = NULL;
++              struct list_head *lp;
++
++              /* first, look for an existing dentry - any one is good */
++              spin_lock(&dcache_lock);
++              list_for_each(lp, &inode->i_dentry) {
++                      tmp = list_entry(lp, struct dentry, d_alias);
++                      goal = tmp;
++                      dget_locked(goal);
++                      break;
++              }
++              if (goal == NULL) {
++                      /* there is no alias, we need to make current dentry:
++                       *  a) inaccessible for __d_lookup()
++                       *  b) inaccessible for iopen */
++                      J_ASSERT(list_empty(&dentry->d_alias));
++                      dentry->d_flags |= DCACHE_NFSFS_RENAMED;
++                      /* this is d_instantiate() ... */
++                      list_add(&dentry->d_alias, &inode->i_dentry);
++                      dentry->d_inode = inode;
++              }
++              spin_unlock(&dcache_lock);
++              if (goal)
++                      iput(inode);
++              return goal;
++      }
++
+       return iopen_connect_dentry(dentry, inode, 1);
+ }
diff --git a/ldiskfs/kernel_patches/patches/ext4-map_inode_page-2.6.18-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-map_inode_page-2.6.18-rhel5.patch
new file mode 100644 (file)
index 0000000..4ed87f0
--- /dev/null
@@ -0,0 +1,86 @@
+Index: linux-2.6.18.i386/fs/ext4/inode.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/inode.c
++++ linux-2.6.18.i386/fs/ext4/inode.c
+@@ -3666,3 +3666,66 @@ out_unlock:
+       unlock_page(page);
+       return ret;
+ }
++
++int ext4_map_inode_page(struct inode *inode, struct page *page,
++                      unsigned long *blocks, int *created, int create)
++{
++      unsigned int blocksize, blocks_per_page;
++      unsigned long iblock;
++      struct buffer_head dummy;
++      void *handle;
++      int i, rc = 0, failed = 0, needed_blocks;
++      /* Fill blocks[] with the disk block of every fs block in @page. */
++      blocksize = inode->i_sb->s_blocksize;
++      blocks_per_page = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
++      iblock = page->index * blocks_per_page;
++      /* Pass 1: query the current mapping; 0 is treated as "not mapped". */
++      for (i = 0; i < blocks_per_page; i++, iblock++) {
++              blocks[i] = ext4_bmap(inode->i_mapping, iblock);
++              if (blocks[i] == 0) {
++                      failed++;
++                      if (created)
++                              created[i] = -1;        /* unmapped so far */
++              } else if (created) {
++                      created[i] = 0;         /* was already mapped */
++              }
++      }
++      /* Nothing is missing, or the caller asked for lookup only. */
++      if (failed == 0 || create == 0)
++              return 0;
++      /* Pass 2: map the missing blocks under one journal handle. */
++      needed_blocks = ext4_writepage_trans_blocks(inode);
++      handle = ext4_journal_start(inode, needed_blocks);
++      if (IS_ERR(handle))
++              return PTR_ERR(handle);
++
++      iblock = page->index * blocks_per_page;
++      for (i = 0; i < blocks_per_page; i++, iblock++) {
++              if (blocks[i] != 0)
++                      continue;       /* mapped in pass 1 */
++              /* NOTE(review): dummy is not initialized here - confirm callee sets b_state */
++              rc = ext4_get_blocks_handle(handle, inode, iblock, 1, &dummy, 1, 1);
++              if (rc < 0) {
++                      printk(KERN_INFO "ext4_map_inode_page: error reading "
++                                      "block %ld\n", iblock);
++                      goto out;
++              } else {
++                      if (rc > 1)
++                              WARN_ON(1);
++                      rc = 0;         /* any non-negative result is reported as 0 */
++              }
++              /* Unmap any metadata buffers from the block mapping, to avoid
++               * data corruption due to direct-write from Lustre being
++               * clobbered by a later flush of the blockdev metadata buffer.*/
++              if (buffer_new(&dummy))
++                      unmap_underlying_metadata(dummy.b_bdev,
++                                      dummy.b_blocknr);
++              blocks[i] = dummy.b_blocknr;
++              if (created)
++                      created[i] = 1;         /* mapped by this call */
++      }
++
++out:
++      ext4_journal_stop(handle);
++      return rc;
++}
+Index: linux-2.6.18.i386/fs/ext4/super.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/super.c
++++ linux-2.6.18.i386/fs/ext4/super.c
+@@ -3498,6 +3498,10 @@ static void __exit exit_ext4_fs(void)
+       __free_page(ext4_zero_page);
+ }
++int ext4_map_inode_page(struct inode *inode, struct page *page,
++                      unsigned long *blocks, int *created, int create);
++EXPORT_SYMBOL(ext4_map_inode_page);
++
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
+ MODULE_LICENSE("GPL");
diff --git a/ldiskfs/kernel_patches/patches/ext4-max-dir-size-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-max-dir-size-rhel5.patch
new file mode 100644 (file)
index 0000000..295d0d1
--- /dev/null
@@ -0,0 +1,203 @@
+Index: linux-2.6.18.i386/fs/ext4/ialloc.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ialloc.c
++++ linux-2.6.18.i386/fs/ext4/ialloc.c
+@@ -622,12 +622,15 @@ struct inode *ext4_new_inode(handle_t *h
+               return ERR_PTR(-EPERM);
+       sb = dir->i_sb;
++      sbi = EXT4_SB(sb);
++      if (sbi->s_max_dir_size > 0 && i_size_read(dir) >= sbi->s_max_dir_size)
++              return ERR_PTR(-EFBIG);
++
+       inode = new_inode(sb);
+       if (!inode)
+               return ERR_PTR(-ENOMEM);
+       ei = EXT4_I(inode);
+-      sbi = EXT4_SB(sb);
+       es = sbi->s_es;
+       if (goal) {
+Index: linux-2.6.18.i386/fs/ext4/super.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/super.c
++++ linux-2.6.18.i386/fs/ext4/super.c
+@@ -38,6 +38,7 @@
+ #include <asm/uaccess.h>
+ #include <linux/kthread.h>
+ #include <linux/utsname.h>
++#include <linux/proc_fs.h>
+ #include "ext4.h"
+ #include "ext4_jbd2.h"
+@@ -67,6 +68,8 @@ static void ext4_write_super_lockfs(stru
+ struct page *ext4_zero_page;
++struct proc_dir_entry *proc_root_ext4;
++
+ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
+                              struct ext4_group_desc *bg)
+ {
+@@ -551,6 +554,9 @@ static void ext4_put_super(struct super_
+       }
+       if (sbi->s_mmp_tsk)
+               kthread_stop(sbi->s_mmp_tsk);
++
++      remove_proc_entry(EXT4_MAX_DIR_SIZE_NAME, sbi->s_mb_proc);
++
+       sb->s_fs_info = NULL;
+       kfree(sbi);
+       return;
+@@ -2185,6 +2191,46 @@ static unsigned long ext4_get_stripe_siz
+       return 0;
+ }
++static int ext4_max_dir_size_read(char *page, char **start, off_t off,
++                                  int count, int *eof, void *data)
++{
++      /* data is the ext4_sb_info stored in proc->data at mount time */
++      struct ext4_sb_info *sb_info = data;
++
++      /* the value is emitted in a single read, so always flag EOF */
++      *eof = 1;
++      if (off)
++              return 0;       /* nothing to return past offset 0 */
++
++      *start = page;
++      return sprintf(page, "%lu\n", sb_info->s_max_dir_size);
++}
++
++static int ext4_max_dir_size_write(struct file *file, const char *buffer,
++                                   unsigned long count, void *data)
++{
++      struct ext4_sb_info *sbi = data;
++      char str[32];
++      long value;     /* signed, so the negative check below can fire */
++      char *end;
++      /* proc write handler: parse the user's number into s_max_dir_size */
++      if (count >= sizeof(str)) {
++              printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
++                     EXT4_MAX_DIR_SIZE_NAME, (int)sizeof(str));
++              return -EOVERFLOW;
++      }
++
++      if (copy_from_user(str, buffer, count))
++              return -EFAULT;
++      str[count] = '\0';      /* copy_from_user() does not terminate */
++      value = simple_strtol(str, &end, 0);
++      if (value < 0)
++              return -ERANGE;
++      /* 0 disables the limit (ext4_new_inode only checks values > 0) */
++      sbi->s_max_dir_size = value;
++      return count;
++}
++
+ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+                               __releases(kernel_lock)
+                               __acquires(kernel_lock)
+@@ -2208,6 +2254,7 @@ static int ext4_fill_super(struct super_
+       int needs_recovery;
+       __le32 features;
+       __u64 blocks_count;
++      struct proc_dir_entry *proc;
+       sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+       if (!sbi)
+@@ -2743,6 +2790,22 @@ static int ext4_fill_super(struct super_
+       ext4_ext_init(sb);
+       ext4_mb_init(sb, needs_recovery);
++      sbi->s_max_dir_size = EXT4_DEFAULT_MAX_DIR_SIZE;
++      proc = create_proc_entry(EXT4_MAX_DIR_SIZE_NAME,
++                               S_IFREG | S_IRUGO | S_IWUSR, sbi->s_mb_proc);
++      if (proc == NULL) {
++              printk(KERN_ERR "EXT4-fs: unable to create %s\n",
++                     EXT4_MAX_DIR_SIZE_NAME);
++              remove_proc_entry(EXT4_MAX_DIR_SIZE_NAME, sbi->s_mb_proc);
++              remove_proc_entry(sbi->s_mb_proc->name, proc_root_ext4);
++              sbi->s_mb_proc = NULL;
++              ret = -ENOMEM;
++              goto failed_mount4;
++      }
++      proc->data = sbi;
++      proc->read_proc = ext4_max_dir_size_read;
++      proc->write_proc = ext4_max_dir_size_write;
++
+       lock_kernel();
+       return 0;
+@@ -3082,7 +3145,6 @@ static void ext4_commit_super(struct sup
+               sync_dirty_buffer(sbh);
+ }
+-
+ /*
+  * Have we just finished recovery?  If so, and if we are mounting (or
+  * remounting) the filesystem readonly, then we will end up with a
+Index: linux-2.6.18.i386/fs/ext4/ext4_sb.h
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ext4_sb.h
++++ linux-2.6.18.i386/fs/ext4/ext4_sb.h
+@@ -117,6 +117,7 @@ struct ext4_sb_info {
+       /* where last allocation was done - for stream allocation */
+       unsigned long s_mb_last_group;
+       unsigned long s_mb_last_start;
++      unsigned long s_max_dir_size;
+       /* history to debug policy */
+       struct ext4_mb_history *s_mb_history;
+Index: linux-2.6.18.i386/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ext4.h
++++ linux-2.6.18.i386/fs/ext4/ext4.h
+@@ -992,6 +992,14 @@ struct mmp_struct {
+  */
+ #define EXT4_MMP_MIN_CHECK_INTERVAL   5
++extern struct proc_dir_entry *proc_root_ext4;
++
++/*
++ * max directory size tunable
++ */
++#define EXT4_DEFAULT_MAX_DIR_SIZE     0
++#define EXT4_MAX_DIR_SIZE_NAME                "max_dir_size"
++
+ /*
+  * Function prototypes
+  */
+Index: linux-2.6.18.i386/fs/ext4/mballoc.h
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/mballoc.h
++++ linux-2.6.18.i386/fs/ext4/mballoc.h
+@@ -257,7 +257,6 @@ static void ext4_mb_store_history(struct
+ #define in_range(b, first, len)       ((b) >= (first) && (b) <= (first) + (len) - 1)
+-static struct proc_dir_entry *proc_root_ext4;
+ struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
+ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+Index: linux-2.6.18.i386/fs/ext4/mballoc.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/mballoc.c
++++ linux-2.6.18.i386/fs/ext4/mballoc.c
+@@ -2821,6 +2821,7 @@ err_out:
+       remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
++      remove_proc_entry(EXT4_MAX_DIR_SIZE_NAME, sbi->s_mb_proc);
+       remove_proc_entry(devname, proc_root_ext4);
+       sbi->s_mb_proc = NULL;
+@@ -2842,7 +2843,9 @@ static int ext4_mb_destroy_per_dev_proc(
+       remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
++      remove_proc_entry(EXT4_MAX_DIR_SIZE_NAME, sbi->s_mb_proc);
+       remove_proc_entry(devname, proc_root_ext4);
++      sbi->s_mb_proc = NULL;
+       return 0;
+ }
diff --git a/ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-rhel5.patch
new file mode 100644 (file)
index 0000000..b8cdada
--- /dev/null
@@ -0,0 +1,330 @@
+Index: linux-2.6.18.i686/fs/ext4/mballoc.c
+===================================================================
+--- linux-2.6.18.i686.orig/fs/ext4/mballoc.c
++++ linux-2.6.18.i686/fs/ext4/mballoc.c
+@@ -660,7 +660,7 @@ static void ext4_mb_mark_free_simple(str
+       }
+ }
+-static void ext4_mb_generate_buddy(struct super_block *sb,
++static int ext4_mb_generate_buddy(struct super_block *sb,
+                               void *buddy, void *bitmap, ext4_group_t group)
+ {
+       struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+@@ -692,14 +692,14 @@ static void ext4_mb_generate_buddy(struc
+       grp->bb_fragments = fragments;
+       if (free != grp->bb_free) {
+-              ext4_error(sb, __func__,
+-                      "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n",
+-                      group, free, grp->bb_free);
+-              /*
+-               * If we intent to continue, we consider group descritor
+-               * corrupt and update bb_free using bitmap value
+-               */
+-              grp->bb_free = free;
++              struct ext4_group_desc *gdp;
++              gdp = ext4_get_group_desc (sb, group, NULL);
++              ext4_error(sb, __FUNCTION__,
++                      "group %lu: %u blocks in bitmap, %u in bb, "
++                      "%u in gd, %lu pa's\n", group, free, grp->bb_free,
++                      gdp ? le16_to_cpu(gdp->bg_free_blocks_count) : 0,
++                      grp->bb_prealloc_nr);
++              return -EIO;
+       }
+       clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+@@ -709,6 +709,8 @@ static void ext4_mb_generate_buddy(struc
+       EXT4_SB(sb)->s_mb_buddies_generated++;
+       EXT4_SB(sb)->s_mb_generation_time += period;
+       spin_unlock(&EXT4_SB(sb)->s_bal_lock);
++
++      return 0;
+ }
+ /* The buddy information is attached the buddy cache inode
+@@ -814,7 +816,7 @@ static int ext4_mb_init_cache(struct pag
+       err = 0;
+       first_block = page->index * blocks_per_page;
+-      for (i = 0; i < blocks_per_page; i++) {
++      for (i = 0; i < blocks_per_page && err == 0; i++) {
+               int group;
+               struct ext4_group_info *grinfo;
+@@ -848,7 +850,7 @@ static int ext4_mb_init_cache(struct pag
+                       /*
+                        * incore got set to the group block bitmap below
+                        */
+-                      ext4_mb_generate_buddy(sb, data, incore, group);
++                      err = ext4_mb_generate_buddy(sb, data, incore, group);
+                       incore = NULL;
+               } else {
+                       /* this is block of bitmap */
+@@ -861,7 +863,7 @@ static int ext4_mb_init_cache(struct pag
+                       memcpy(data, bitmap, blocksize);
+                       /* mark all preallocated blks used in in-core bitmap */
+-                      ext4_mb_generate_from_pa(sb, data, group);
++                      err = ext4_mb_generate_from_pa(sb, data, group);
+                       ext4_unlock_group(sb, group);
+                       /* set incore so that the buddy information can be
+@@ -870,6 +872,7 @@ static int ext4_mb_init_cache(struct pag
+                       incore = data;
+               }
+       }
++      if (likely(err == 0))
+       SetPageUptodate(page);
+ out:
+@@ -1964,7 +1967,10 @@ static int ext4_mb_seq_history_show(stru
+                       hs->result.fe_start, hs->result.fe_len);
+               seq_printf(seq, "%-5u %-8u %-23s free\n",
+                               hs->pid, hs->ino, buf2);
++      } else {
++              seq_printf(seq, "unknown op %d\n", hs->op);
+       }
++
+       return 0;
+ }
+@@ -2092,9 +2098,11 @@ static void *ext4_mb_seq_groups_next(str
+ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
+ {
+       struct super_block *sb = seq->private;
++      struct ext4_group_desc *gdp;
+       long group = (long) v;
+       int i;
+       int err;
++      int free = 0;
+       struct ext4_buddy e4b;
+       struct sg {
+               struct ext4_group_info info;
+@@ -2103,10 +2111,10 @@ static int ext4_mb_seq_groups_show(struc
+       group--;
+       if (group == 0)
+-              seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
++              seq_printf(seq, "#%-5s: %-5s %-5s %-5s %-5s %-5s "
+                               "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
+                                 "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
+-                         "group", "free", "frags", "first",
++                         "group", "free", "gfree", "frags", "first", "pa",
+                          "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
+                          "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
+@@ -2117,13 +2125,20 @@ static int ext4_mb_seq_groups_show(struc
+               seq_printf(seq, "#%-5lu: I/O error\n", group);
+               return 0;
+       }
++
++      gdp = ext4_get_group_desc(sb, group, NULL);
++      if (gdp != NULL)
++              free = le16_to_cpu(gdp->bg_free_blocks_count);
++
+       ext4_lock_group(sb, group);
+       memcpy(&sg, ext4_get_group_info(sb, group), i);
+       ext4_unlock_group(sb, group);
+       ext4_mb_release_desc(&e4b);
+-      seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free,
+-                      sg.info.bb_fragments, sg.info.bb_first_free);
++      seq_printf(seq, "#%-5lu: %-5u %-5u %-5u %-5u %-5lu [", group,
++                      sg.info.bb_free, free,
++                      sg.info.bb_fragments, sg.info.bb_first_free,
++                      sg.info.bb_prealloc_nr);
+       for (i = 0; i <= 13; i++)
+               seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
+                               sg.info.bb_counters[i] : 0);
+@@ -2226,6 +2241,7 @@ ext4_mb_store_history(struct ext4_alloca
+       h.tail = ac->ac_tail;
+       h.buddy = ac->ac_buddy;
+       h.merged = 0;
++      h.cr = ac->ac_criteria;
+       if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
+               if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
+                               ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
+@@ -3531,22 +3547,66 @@ ext4_mb_use_preallocated(struct ext4_all
+ }
+ /*
++ * check free blocks in bitmap match free block in group descriptor
++ * do this before taking preallocated blocks into account to be able
++ * to detect on-disk corruptions
++ */
++int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap,
++                              struct ext4_group_desc *gdp, int group)
++{
++      unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
++      unsigned short i, first, free = 0;
++
++      i = mb_find_next_zero_bit(bitmap, max, 0);
++
++      while (i < max) {
++              first = i;
++              i = find_next_bit(bitmap, max, i);
++              if (i > max)
++                      i = max;
++              free += i - first;
++              if (i < max)
++                      i = mb_find_next_zero_bit(bitmap, max, i);
++      }
++
++      if (free != le16_to_cpu(gdp->bg_free_blocks_count)) {
++              ext4_error(sb, __FUNCTION__, "on-disk bitmap for group %d "
++                      "corrupted: %u blocks free in bitmap, %u - in gd\n",
++                      group, free, le16_to_cpu(gdp->bg_free_blocks_count));
++              return -EIO;
++      }
++      return 0;
++}
++
++/*
+  * the function goes through all preallocation in this group and marks them
+  * used in in-core bitmap. buddy must be generated from this bitmap
+  * Need to be called with ext4 group lock (ext4_lock_group)
+  */
+-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+                                       ext4_group_t group)
+ {
+       struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+       struct ext4_prealloc_space *pa;
++      struct ext4_group_desc *gdp;
+       struct list_head *cur;
+       ext4_group_t groupnr;
+       ext4_grpblk_t start;
+       int preallocated = 0;
+       int count = 0;
++      int skip = 0;
++      int err;
+       int len;
++      gdp = ext4_get_group_desc (sb, group, NULL);
++      if (gdp == NULL)
++              return -EIO;
++
++      /* before applying preallocations, check bitmap consistency */
++      err = ext4_mb_check_ondisk_bitmap(sb, bitmap, gdp, group);
++      if (err)
++              return err;
++
+       /* all form of preallocation discards first load group,
+        * so the only competing code is preallocation use.
+        * we don't need any locking here
+@@ -3562,15 +3622,24 @@ static void ext4_mb_generate_from_pa(str
+                                            &groupnr, &start);
+               len = pa->pa_len;
+               spin_unlock(&pa->pa_lock);
+-              if (unlikely(len == 0))
++              if (unlikely(len == 0)) {
++                      skip++;
+                       continue;
++              }
+               BUG_ON(groupnr != group);
+               mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
+                                               bitmap, start, len);
+               preallocated += len;
+               count++;
+       }
++      if (count + skip != grp->bb_prealloc_nr) {
++              ext4_error(sb, __FUNCTION__, "lost preallocations: "
++                         "count %d, bb_prealloc_nr %lu, skip %d\n",
++                         count, grp->bb_prealloc_nr, skip);
++              return -EIO;
++      }
+       mb_debug("prellocated %u for group %lu\n", preallocated, group);
++      return 0;
+ }
+ static void ext4_mb_pa_callback(struct rcu_head *head)
+@@ -3621,6 +3690,7 @@ static void ext4_mb_put_pa(struct ext4_a
+        */
+       ext4_lock_group(sb, grp);
+       list_del(&pa->pa_group_list);
++      ext4_get_group_info(sb, grp)->bb_prealloc_nr--;
+       ext4_unlock_group(sb, grp);
+       spin_lock(pa->pa_obj_lock);
+@@ -3709,6 +3779,7 @@ ext4_mb_new_inode_pa(struct ext4_allocat
+       ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+       list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
++      grp->bb_prealloc_nr++;
+       ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+       spin_lock(pa->pa_obj_lock);
+@@ -3768,6 +3839,7 @@ ext4_mb_new_group_pa(struct ext4_allocat
+       ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+       list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
++      grp->bb_prealloc_nr++;
+       ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+       /*
+@@ -3820,6 +3892,7 @@ ext4_mb_release_inode_pa(struct ext4_bud
+               ac->ac_sb = sb;
+               ac->ac_inode = pa->pa_inode;
+               ac->ac_op = EXT4_MB_HISTORY_DISCARD;
++              ac->ac_o_ex.fe_len = 1;
+       }
+       while (bit < end) {
+@@ -3964,6 +4037,8 @@ repeat:
+               spin_unlock(&pa->pa_lock);
++              BUG_ON(grp->bb_prealloc_nr == 0);
++              grp->bb_prealloc_nr--;
+               list_del(&pa->pa_group_list);
+               list_add(&pa->u.pa_tmp_list, &list);
+       }
+@@ -4099,7 +4174,7 @@ repeat:
+               if (err) {
+                       ext4_error(sb, __func__, "Error in loading buddy "
+                                       "information for %lu\n", group);
+-                      continue;
++                      return;
+               }
+               bitmap_bh = ext4_read_block_bitmap(sb, group);
+@@ -4111,6 +4186,8 @@ repeat:
+               }
+               ext4_lock_group(sb, group);
++              BUG_ON(e4b.bd_info->bb_prealloc_nr == 0);
++              e4b.bd_info->bb_prealloc_nr--;
+               list_del(&pa->pa_group_list);
+               ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+               ext4_unlock_group(sb, group);
+Index: linux-2.6.18.i686/fs/ext4/mballoc.h
+===================================================================
+--- linux-2.6.18.i686.orig/fs/ext4/mballoc.h
++++ linux-2.6.18.i686/fs/ext4/mballoc.h
+@@ -119,6 +119,7 @@ struct ext4_group_info {
+       unsigned short  bb_free;
+       unsigned short  bb_fragments;
+       struct          list_head bb_prealloc_list;
++      unsigned long   bb_prealloc_nr;
+ #ifdef DOUBLE_CHECK
+       void            *bb_bitmap;
+ #endif
+@@ -228,7 +229,7 @@ struct ext4_mb_history {
+       __u16 tail;     /* what tail broke some buddy */
+       __u16 buddy;    /* buddy the tail ^^^ broke */
+       __u16 flags;
+-      __u8 cr:3;      /* which phase the result extent was found at */
++      __u8 cr:8;      /* which phase the result extent was found at */
+       __u8 op:4;
+       __u8 merged:1;
+ };
+@@ -259,7 +260,7 @@ static void ext4_mb_store_history(struct
+ struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
+-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+                                       ext4_group_t group);
+ static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
+ static void ext4_mb_free_committed_blocks(struct super_block *);
diff --git a/ldiskfs/kernel_patches/patches/ext4-mballoc-handle-dev-paths-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-mballoc-handle-dev-paths-rhel5.patch
new file mode 100644 (file)
index 0000000..17a5fbd
--- /dev/null
@@ -0,0 +1,59 @@
+Index: linux-2.6.18-128.1.6/fs/ext4/mballoc.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/mballoc.c        2009-05-29 16:32:19.000000000 +0530
++++ linux-2.6.18-128.1.6/fs/ext4/mballoc.c     2009-05-29 16:34:16.000000000 +0530
+@@ -2949,14 +2949,20 @@
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct proc_dir_entry *proc;
+       struct proc_dir_entry *proc_entry;
+-      char devname[64];
++      char devname[BDEVNAME_SIZE], *p;
+       if (proc_root_ext4 == NULL) {
+               sbi->s_mb_proc = NULL;
+               return -EINVAL;
+       }
+       bdevname(sb->s_bdev, devname);
++      p = devname;
++      while ((p = strchr(p, '/')))
++              *p = '!';
++
+       sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
++      if (!sbi->s_mb_proc)
++              goto err_create_dir;
+       MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
+       MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
+@@ -2980,7 +2986,6 @@
+       return 0;
+ err_out:
+-      printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
+       remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_LARGE_REQ, sbi->s_mb_proc);
+@@ -2993,18 +2998,23 @@
+       remove_proc_entry(devname, proc_root_ext4);
+       sbi->s_mb_proc = NULL;
++err_create_dir:
++      printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
+       return -ENOMEM;
+ }
+ static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+-      char devname[64];
++      char devname[BDEVNAME_SIZE], *p;
+       if (sbi->s_mb_proc == NULL)
+               return -EINVAL;
+       bdevname(sb->s_bdev, devname);
++      p = devname;
++      while ((p = strchr(p, '/')))
++              *p = '!';
+       remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_LARGE_REQ, sbi->s_mb_proc);
diff --git a/ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch
new file mode 100644 (file)
index 0000000..f8e77c7
--- /dev/null
@@ -0,0 +1,271 @@
+Index: linux-2.6.18.i386/fs/ext4/ext4_jbd2.h
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ext4_jbd2.h
++++ linux-2.6.18.i386/fs/ext4/ext4_jbd2.h
+@@ -35,6 +35,9 @@
+       (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)   \
+               || test_opt(sb, EXTENTS) ? 27U : 8U)
++/* Indicate that EXT4_SINGLEDATA_TRANS_BLOCKS takes the sb as argument */
++#define EXT4_SINGLEDATA_TRANS_BLOCKS_HAS_SB
++
+ /* Extended attribute operations touch at most two data buffers,
+  * two bitmap buffers, and two group summaries, in addition to the inode
+  * and the superblock, which are already accounted for. */
+Index: linux-2.6.18.i386/fs/ext4/extents.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/extents.c
++++ linux-2.6.18.i386/fs/ext4/extents.c
+@@ -50,7 +50,7 @@
+  * ext_pblock:
+  * combine low and high parts of physical block number into ext4_fsblk_t
+  */
+-static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
++ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
+ {
+       ext4_fsblk_t block;
+@@ -60,6 +60,17 @@ static ext4_fsblk_t ext_pblock(struct ex
+ }
+ /*
++ * ext4_ext_store_pblock:
++ * stores a large physical block number into an extent struct,
++ * breaking it into parts
++ */
++void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
++{
++      ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
++      ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
++}
++
++/*
+  * idx_pblock:
+  * combine low and high parts of a leaf physical block number into ext4_fsblk_t
+  */
+@@ -73,17 +84,6 @@ ext4_fsblk_t idx_pblock(struct ext4_exte
+ }
+ /*
+- * ext4_ext_store_pblock:
+- * stores a large physical block number into an extent struct,
+- * breaking it into parts
+- */
+-void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
+-{
+-      ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+-      ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
+-}
+-
+-/*
+  * ext4_idx_store_pblock:
+  * stores a large physical block number into an index struct,
+  * breaking it into parts
+@@ -1826,6 +1826,56 @@ static int ext4_ext_rm_idx(handle_t *han
+ }
+ /*
++ * This routine returns max. credits extent tree can consume.
++ * It should be OK for low-performance paths like ->writepage()
++ * To allow many writing process to fit a single transaction,
++ * caller should calculate credits under truncate_mutex and
++ * pass actual path.
++ */
++int ext4_ext_calc_credits_for_insert(struct inode *inode,
++                                  struct ext4_ext_path *path)
++{
++      int depth, needed;
++
++      if (path) {
++              /* probably there is space in leaf? */
++              depth = ext_depth(inode);
++              if (le16_to_cpu(path[depth].p_hdr->eh_entries)
++                              < le16_to_cpu(path[depth].p_hdr->eh_max))
++                      return 1;
++      }
++
++      /*
++       * given 32bit logical block (4294967296 blocks), max. tree
++       * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
++       * let's also add one more level for imbalance.
++       */
++      depth = 5;
++
++      /* allocation of new data block(s) */
++      needed = 2;
++
++      /*
++       * tree can be full, so it'd need to grow in depth:
++       * we need one credit to modify old root, credits for
++       * new root will be added in split accounting
++       */
++      needed += 1;
++
++      /*
++       * Index split can happen, we'd need:
++       *    allocate intermediate indexes (bitmap + group)
++       *  + change two blocks at each level, but root (already included)
++       */
++      needed += (depth * 2) + (depth * 2);
++
++      /* any allocation modifies superblock */
++      needed += 1;
++
++      return needed;
++}
++
++/*
+  * ext4_ext_calc_credits_for_single_extent:
+  * This routine returns max. credits that needed to insert an extent
+  * to the extent tree.
+@@ -3157,3 +3207,14 @@ int ext4_fiemap(struct inode *inode, str
+       return error;
+ }
++
++EXPORT_SYMBOL(ext4_ext_store_pblock);
++EXPORT_SYMBOL(ext4_ext_search_right);
++EXPORT_SYMBOL(ext4_ext_search_left);
++EXPORT_SYMBOL(ext_pblock);
++EXPORT_SYMBOL(ext4_ext_insert_extent);
++EXPORT_SYMBOL(ext4_mb_new_blocks);
++EXPORT_SYMBOL(ext4_ext_walk_space);
++EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert);
++EXPORT_SYMBOL(ext4_mark_inode_dirty);
++
+Index: linux-2.6.18.i386/fs/ext4/ext4_extents.h
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ext4_extents.h
++++ linux-2.6.18.i386/fs/ext4/ext4_extents.h
+@@ -59,6 +59,11 @@
+  */
+ #define EXT_STATS_
++/*
++ * define EXT4_ALLOC_NEEDED to 0 since block bitmap, group desc. and sb
++ * are now accounted in ext4_ext_calc_credits_for_insert()
++ */
++#define EXT4_ALLOC_NEEDED 0
+ /*
+  * ext4_inode has i_block array (60 bytes total).
+@@ -124,6 +129,7 @@ struct ext4_ext_path {
+ #define EXT4_EXT_CACHE_GAP    1
+ #define EXT4_EXT_CACHE_EXTENT 2
++#define EXT4_EXT_HAS_NO_TREE  /* ext4_extents_tree struct is not used*/
+ #define EXT_MAX_BLOCK 0xffffffff
+@@ -228,9 +234,13 @@ static inline int ext4_ext_get_actual_le
+               (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
+ }
++extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
++extern void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb);
+ extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
+ extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
+ extern int ext4_extent_tree_init(handle_t *, struct inode *);
++extern int ext4_ext_calc_credits_for_insert(struct inode *,
++                                          struct ext4_ext_path *);
+ extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
+                                                  int num,
+                                                  struct ext4_ext_path *path);
+Index: linux-2.6.18.i386/fs/ext4/mballoc.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/mballoc.c
++++ linux-2.6.18.i386/fs/ext4/mballoc.c
+@@ -4965,3 +4965,7 @@ error_return:
+               kmem_cache_free(ext4_ac_cachep, ac);
+       return;
+ }
++
++EXPORT_SYMBOL(ext4_free_blocks);
++EXPORT_SYMBOL(ext4_mb_discard_inode_preallocations);
++
+Index: linux-2.6.18.i386/fs/ext4/super.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/super.c
++++ linux-2.6.18.i386/fs/ext4/super.c
+@@ -91,6 +91,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct su
+               (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+               (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
+ }
++EXPORT_SYMBOL(ext4_inode_bitmap);
+ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
+                             struct ext4_group_desc *bg)
+@@ -513,7 +514,8 @@ static void ext4_put_super(struct super_
+       struct ext4_super_block *es = sbi->s_es;
+       int i;
+-      ext4_mb_release(sb);
++      if (test_opt(sb, MBALLOC))
++              ext4_mb_release(sb);
+       ext4_ext_release(sb);
+       ext4_xattr_put_super(sb);
+       jbd2_journal_destroy(sbi->s_journal);
+@@ -2373,16 +2375,6 @@ static int ext4_fill_super(struct super_
+                      "running e2fsck is recommended\n");
+       /*
+-       * Since ext4 is still considered development code, we require
+-       * that the TEST_FILESYS flag in s->flags be set.
+-       */
+-      if (!(le32_to_cpu(es->s_flags) & EXT2_FLAGS_TEST_FILESYS)) {
+-              printk(KERN_WARNING "EXT4-fs: %s: not marked "
+-                     "OK to use with test code.\n", sb->s_id);
+-              goto failed_mount;
+-      }
+-
+-      /*
+        * Check feature flags regardless of the revision level, since we
+        * previously didn't change the revision level when setting the flags,
+        * so there is a chance incompat flags are set on a rev 0 filesystem.
+@@ -3835,9 +3827,9 @@ static int ext4_get_sb(struct file_syste
+       return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+ }
+-static struct file_system_type ext4dev_fs_type = {
++static struct file_system_type ext4_fs_type = {
+       .owner          = THIS_MODULE,
+-      .name           = "ext4dev",
++      .name           = "ext4",
+       .get_sb         = ext4_get_sb,
+       .kill_sb        = kill_block_super,
+ #ifdef HAVE_FALLOCATE
+@@ -3867,7 +3859,7 @@ static int __init init_ext4_fs(void)
+       err = init_inodecache();
+       if (err)
+               goto out1;
+-      err = register_filesystem(&ext4dev_fs_type);
++      err = register_filesystem(&ext4_fs_type);
+       if (err)
+               goto out;
+       return 0;
+@@ -3884,7 +3876,7 @@ out3:
+ static void __exit exit_ext4_fs(void)
+ {
+-      unregister_filesystem(&ext4dev_fs_type);
++      unregister_filesystem(&ext4_fs_type);
+       destroy_inodecache();
+       exit_ext4_xattr();
+       exit_ext4_mballoc();
+Index: linux-2.6.18.i386/fs/ext4/ext4_jbd2.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ext4_jbd2.c
++++ linux-2.6.18.i386/fs/ext4/ext4_jbd2.c
+@@ -21,6 +21,7 @@ int __ext4_journal_get_write_access(cons
+               ext4_journal_abort_handle(where, __func__, bh, handle, err);
+       return err;
+ }
++EXPORT_SYMBOL(__ext4_journal_get_write_access);
+ int __ext4_journal_forget(const char *where, handle_t *handle,
+                               struct buffer_head *bh)
+@@ -57,3 +58,4 @@ int __ext4_journal_dirty_metadata(const 
+               ext4_journal_abort_handle(where, __func__, bh, handle, err);
+       return err;
+ }
++EXPORT_SYMBOL(__ext4_journal_dirty_metadata);
diff --git a/ldiskfs/kernel_patches/patches/ext4-mmp-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-mmp-rhel5.patch
new file mode 100644 (file)
index 0000000..d01d046
--- /dev/null
@@ -0,0 +1,479 @@
+Index: linux-2.6.18-128.1.6/fs/ext4/super.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/super.c
++++ linux-2.6.18-128.1.6/fs/ext4/super.c
+@@ -36,6 +36,8 @@
+ #include <linux/log2.h>
+ #include <linux/crc16.h>
+ #include <asm/uaccess.h>
++#include <linux/kthread.h>
++#include <linux/utsname.h>
+ #include "ext4.h"
+ #include "ext4_jbd2.h"
+@@ -547,6 +549,8 @@ static void ext4_put_super(struct super_
+               invalidate_bdev(sbi->journal_bdev, 0);
+               ext4_blkdev_remove(sbi);
+       }
++      if (sbi->s_mmp_tsk)
++              kthread_stop(sbi->s_mmp_tsk);
+       sb->s_fs_info = NULL;
+       kfree(sbi);
+       return;
+@@ -766,6 +770,328 @@ static int ext4_show_options(struct seq_
+       return 0;
+ }
++/*
++ * Write the MMP block using WRITE_SYNC to try to get the block on-disk
++ * faster.
++ */
++static int write_mmp_block(struct buffer_head *bh)
++{
++      mark_buffer_dirty(bh);
++      lock_buffer(bh);
++      bh->b_end_io = end_buffer_write_sync;
++      get_bh(bh);
++      submit_bh(WRITE_SYNC, bh);
++      wait_on_buffer(bh);
++      if (unlikely(!buffer_uptodate(bh)))
++              return 1;
++
++      return 0;
++}
++
++/*
++ * Read the MMP block. It _must_ be read from disk and hence we clear the
++ * uptodate flag on the buffer.
++ */
++static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
++                        unsigned long mmp_block)
++{
++      struct mmp_struct *mmp;
++
++      if (*bh)
++              clear_buffer_uptodate(*bh);
++
++#if 0
++      brelse(*bh);
++
++      *bh = sb_bread(sb, mmp_block);
++#else
++      if (!*bh)
++              *bh = sb_getblk(sb, mmp_block);
++      if (*bh) {
++              get_bh(*bh);
++              lock_buffer(*bh);
++              (*bh)->b_end_io = end_buffer_read_sync;
++              submit_bh(READ_SYNC, *bh);
++              wait_on_buffer(*bh);
++              if (!buffer_uptodate(*bh)) {
++                      brelse(*bh);
++                      *bh = NULL;
++              }
++      }
++#endif
++      if (!*bh) {
++              ext4_warning(sb, __FUNCTION__,
++                           "Error while reading MMP block %lu", mmp_block);
++              return -EIO;
++      }
++
++      mmp = (struct mmp_struct *)((*bh)->b_data);
++      if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
++              return -EINVAL;
++
++      return 0;
++}
++
++/*
++ * Dump as much information as possible to help the admin.
++ */
++static void dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
++                       const char *function, const char *msg)
++{
++      ext4_warning(sb, function, "%s", msg);
++      ext4_warning(sb, function, "MMP failure info: last update time: %llu, "
++                   "last update node: %s, last update device: %s\n",
++                   le64_to_cpu(mmp->mmp_time), mmp->mmp_nodename,
++                   mmp->mmp_bdevname);
++}
++
++/*
++ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
++ */
++static int kmmpd(void *data)
++{
++      struct super_block *sb = (struct super_block *) data;
++      struct ext4_super_block *es = EXT4_SB(sb)->s_es;
++      struct buffer_head *bh = NULL;
++      struct mmp_struct *mmp;
++      unsigned long mmp_block;
++      u32 seq = 0;
++      unsigned long failed_writes = 0;
++      int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
++      unsigned mmp_check_interval;
++      unsigned long last_update_time;
++      unsigned long diff;
++      int retval;
++
++      mmp_block = le64_to_cpu(es->s_mmp_block);
++      retval = read_mmp_block(sb, &bh, mmp_block);
++      if (retval)
++              goto failed;
++
++      mmp = (struct mmp_struct *)(bh->b_data);
++      mmp->mmp_time = cpu_to_le64(get_seconds());
++      /*
++       * Start with the higher mmp_check_interval and reduce it if
++       * the MMP block is being updated on time.
++       */
++      mmp_check_interval = max(5 * mmp_update_interval,
++                               EXT4_MMP_MIN_CHECK_INTERVAL);
++      mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
++      bdevname(bh->b_bdev, mmp->mmp_bdevname);
++
++      down_read(&uts_sem);
++      memcpy(mmp->mmp_nodename, system_utsname.nodename,
++             sizeof(mmp->mmp_nodename));
++      up_read(&uts_sem);
++
++      while (!kthread_should_stop()) {
++              if (++seq > EXT4_MMP_SEQ_MAX)
++                      seq = 1;
++
++              mmp->mmp_seq = cpu_to_le32(seq);
++              mmp->mmp_time = cpu_to_le64(get_seconds());
++              last_update_time = jiffies;
++
++              retval = write_mmp_block(bh);
++              /*
++               * Don't spew too many error messages. Print one every
++               * (s_mmp_update_interval * 60) seconds.
++               */
++              if (retval) {
++                      if ((failed_writes++ % 60) == 0)
++                              ext4_error(sb, __FUNCTION__,
++                                         "Error writing to MMP block");
++              }
++
++              if (!(le32_to_cpu(es->s_feature_incompat) &
++                  EXT4_FEATURE_INCOMPAT_MMP)) {
++                      ext4_warning(sb, __FUNCTION__, "kmmpd being stopped "
++                                   "since MMP feature has been disabled.");
++                      EXT4_SB(sb)->s_mmp_tsk = 0;
++                      goto failed;
++              }
++
++              if (sb->s_flags & MS_RDONLY) {
++                      ext4_warning(sb, __FUNCTION__, "kmmpd being stopped "
++                                   "since filesystem has been remounted as "
++                                   "readonly.");
++                      EXT4_SB(sb)->s_mmp_tsk = 0;
++                      goto failed;
++              }
++
++              diff = jiffies - last_update_time;
++              if (diff < mmp_update_interval * HZ)
++                      schedule_timeout_interruptible(mmp_update_interval *
++                                                     HZ - diff);
++
++              /*
++               * We need to make sure that more than mmp_check_interval
++               * seconds have not passed since writing. If that has happened
++               * we need to check if the MMP block is as we left it.
++               */
++              diff = jiffies - last_update_time;
++              if (diff > mmp_check_interval * HZ) {
++                      struct buffer_head *bh_check = NULL;
++                      struct mmp_struct *mmp_check;
++
++                      retval = read_mmp_block(sb, &bh_check, mmp_block);
++                      if (retval) {
++                              EXT4_SB(sb)->s_mmp_tsk = 0;
++                              goto failed;
++                      }
++
++                      mmp_check = (struct mmp_struct *)(bh_check->b_data);
++                      if (mmp->mmp_time != mmp_check->mmp_time ||
++                          memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
++                                 sizeof(mmp->mmp_nodename)))
++                              dump_mmp_msg(sb, mmp_check, __FUNCTION__,
++                                           "Error while updating MMP info. "
++                                           "The filesystem seems to have "
++                                           "been multiply mounted.");
++
++                      put_bh(bh_check);
++              }
++
++              /*
++               * Adjust the mmp_check_interval depending on how much time
++               * it took for the MMP block to be written.
++               */
++              mmp_check_interval = max(5 * diff / HZ,
++                               (unsigned long) EXT4_MMP_MIN_CHECK_INTERVAL);
++              mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
++      }
++
++      /*
++       * Unmount seems to be clean.
++       */
++      mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
++      mmp->mmp_time = cpu_to_le64(get_seconds());
++
++      retval = write_mmp_block(bh);
++
++failed:
++      brelse(bh);
++      return retval;
++}
++
++/*
++ * Pick a random MMP sequence number.  Any draw that lands above
++ * EXT4_MMP_SEQ_MAX is discarded and the draw is repeated, so the value
++ * handed back is never greater than EXT4_MMP_SEQ_MAX.
++ */
++static unsigned int mmp_new_seq(void)
++{
++      u32 candidate;
++
++      for (;;) {
++              get_random_bytes(&candidate, sizeof(u32));
++              if (candidate <= EXT4_MMP_SEQ_MAX)
++                      break;
++      }
++
++      return candidate;
++}
++
++/*
++ * Protect the filesystem from being mounted more than once.
++ *
++ * @sb:        superblock of the filesystem being mounted read-write
++ * @mmp_block: number of the MMP block (callers pass es->s_mmp_block)
++ *
++ * Sequence of checks:
++ *  - reject an mmp_block outside [s_first_data_block, blocks_count);
++ *  - read the MMP block; if its sequence is EXT4_MMP_SEQ_FSCK, fail;
++ *  - unless the sequence is EXT4_MMP_SEQ_CLEAN, sleep for
++ *    2 * check_interval + 1 seconds, read the block again and fail if
++ *    the sequence number changed in the meantime;
++ *  - write a fresh random sequence number, sleep again, read the block
++ *    again and fail if the number is no longer ours;
++ *  - start the kmmpd thread and record it in s_mmp_tsk.
++ *
++ * Returns 0 on success and 1 on any failure (callers only test for
++ * non-zero).  The buffer head is released on every path.
++ */
++static int ext4_multi_mount_protect(struct super_block *sb,
++                                  unsigned long mmp_block)
++{
++      struct ext4_super_block *es = EXT4_SB(sb)->s_es;
++      struct buffer_head *bh = NULL;
++      struct mmp_struct *mmp = NULL;
++      u32 seq;
++      unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
++      int retval;
++
++      if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
++          mmp_block >= ext4_blocks_count(es)) {
++              ext4_warning(sb, __FUNCTION__,
++                           "Invalid MMP block in superblock");
++              goto failed;
++      }
++
++      retval = read_mmp_block(sb, &bh, mmp_block);
++      if (retval)
++              goto failed;
++
++      mmp = (struct mmp_struct *)(bh->b_data);
++
++      if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
++              mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
++
++      /*
++       * If check_interval in MMP block is larger, use that instead of
++       * update_interval from the superblock.  The on-disk field is
++       * little-endian and must be converted before it is compared or
++       * used as a number of seconds.
++       */
++      if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
++              mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
++
++      seq = le32_to_cpu(mmp->mmp_seq);
++      if (seq == EXT4_MMP_SEQ_CLEAN)
++              goto skip;
++
++      if (seq == EXT4_MMP_SEQ_FSCK) {
++              dump_mmp_msg(sb, mmp, __FUNCTION__,
++                           "fsck is running on the filesystem");
++              goto failed;
++      }
++
++      /*
++       * The block was not left clean: wait long enough for another
++       * node to have updated it, then see whether the sequence moved.
++       */
++      schedule_timeout_uninterruptible(HZ * (2 * mmp_check_interval + 1));
++
++      retval = read_mmp_block(sb, &bh, mmp_block);
++      if (retval)
++              goto failed;
++      mmp = (struct mmp_struct *)(bh->b_data);
++      if (seq != le32_to_cpu(mmp->mmp_seq)) {
++              dump_mmp_msg(sb, mmp, __FUNCTION__,
++                           "Device is already active on another node.");
++              goto failed;
++      }
++
++skip:
++      /*
++       * write a new random sequence number.  seq stays in CPU byte
++       * order so that the comparison against le32_to_cpu(mmp->mmp_seq)
++       * after the re-read below is also correct on big-endian hosts.
++       */
++      seq = mmp_new_seq();
++      mmp->mmp_seq = cpu_to_le32(seq);
++
++      retval = write_mmp_block(bh);
++      if (retval)
++              goto failed;
++
++      /*
++       * wait for MMP interval and check mmp_seq.
++       */
++      schedule_timeout_uninterruptible(HZ * (2 * mmp_check_interval + 1));
++
++      retval = read_mmp_block(sb, &bh, mmp_block);
++      if (retval)
++              goto failed;
++      mmp = (struct mmp_struct *)(bh->b_data);
++      if (seq != le32_to_cpu(mmp->mmp_seq)) {
++              dump_mmp_msg(sb, mmp, __FUNCTION__,
++                           "Device is already active on another node.");
++              goto failed;
++      }
++
++      /*
++       * Start a kernel thread to update the MMP block periodically.
++       */
++      EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%02x:%02x",
++                                           MAJOR(sb->s_dev),
++                                           MINOR(sb->s_dev));
++      if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
++              /* never leave an ERR_PTR behind for kthread_stop() */
++              EXT4_SB(sb)->s_mmp_tsk = NULL;
++              ext4_warning(sb, __FUNCTION__, "Unable to create kmmpd thread "
++                           "for %s.", sb->s_id);
++              goto failed;
++      }
++
++      brelse(bh);
++      return 0;
++
++failed:
++      brelse(bh);
++      return 1;
++}
+ static struct dentry *ext4_get_dentry(struct super_block *sb, void *vobjp)
+ {
+@@ -775,7 +1101,6 @@ static struct dentry *ext4_get_dentry(st
+       struct inode *inode;
+       struct dentry *result;
+-
+       if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
+               return ERR_PTR(-ESTALE);
+       if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
+@@ -2258,6 +2583,11 @@ static int ext4_fill_super(struct super_
+                         EXT4_HAS_INCOMPAT_FEATURE(sb,
+                                   EXT4_FEATURE_INCOMPAT_RECOVER));
++      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
++          !(sb->s_flags & MS_RDONLY))
++              if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
++                      goto failed_mount3;
++
+       /*
+        * The first inode we look at is the journal inode.  Don't try
+        * root first: it may be modified in the journal!
+@@ -2445,6 +2775,8 @@ failed_mount3:
+       percpu_counter_destroy(&sbi->s_freeblocks_counter);
+       percpu_counter_destroy(&sbi->s_freeinodes_counter);
+       percpu_counter_destroy(&sbi->s_dirs_counter);
++      if (sbi->s_mmp_tsk)
++              kthread_stop(sbi->s_mmp_tsk);
+ failed_mount2:
+       for (i = 0; i < db_count; i++)
+               brelse(sbi->s_group_desc[i]);
+@@ -2918,7 +3250,7 @@ static int ext4_remount(struct super_blo
+       unsigned long old_sb_flags;
+       struct ext4_mount_options old_opts;
+       ext4_group_t g;
+-      int err;
++      int err = 0;
+ #ifdef CONFIG_QUOTA
+       int i;
+ #endif
+@@ -3042,6 +3374,13 @@ static int ext4_remount(struct super_blo
+                               goto restore_opts;
+                       if (!ext4_setup_super(sb, es, 0))
+                               sb->s_flags &= ~MS_RDONLY;
++                      if (EXT4_HAS_INCOMPAT_FEATURE(sb,
++                                                  EXT4_FEATURE_INCOMPAT_MMP))
++                              if (ext4_multi_mount_protect(sb,
++                                              le64_to_cpu(es->s_mmp_block))) {
++                                      err = -EROFS;
++                                      goto restore_opts;
++                              }
+               }
+       }
+ #ifdef CONFIG_QUOTA
+Index: linux-2.6.18-128.1.6/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/ext4.h
++++ linux-2.6.18-128.1.6/fs/ext4/ext4.h
+@@ -658,7 +658,7 @@ struct ext4_super_block {
+       __le16  s_want_extra_isize;     /* New inodes should reserve # bytes */
+       __le32  s_flags;                /* Miscellaneous flags */
+       __le16  s_raid_stride;          /* RAID stride */
+-      __le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
++      __le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
+       __le64  s_mmp_block;            /* Block for multi-mount protection */
+       __le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
+       __u8    s_log_groups_per_flex;  /* FLEX_BG group size */
+@@ -775,7 +775,8 @@ static inline int ext4_valid_inum(struct
+                                        EXT4_FEATURE_INCOMPAT_META_BG| \
+                                        EXT4_FEATURE_INCOMPAT_EXTENTS| \
+                                        EXT4_FEATURE_INCOMPAT_64BIT| \
+-                                       EXT4_FEATURE_INCOMPAT_FLEX_BG)
++                                       EXT4_FEATURE_INCOMPAT_FLEX_BG| \
++                                       EXT4_FEATURE_INCOMPAT_MMP)
+ #define EXT4_FEATURE_RO_COMPAT_SUPP   (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+                                        EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
+@@ -956,6 +957,39 @@ void ext4_get_group_no_and_offset(struct
+                       unsigned long *blockgrpp, ext4_grpblk_t *offsetp);
+ /*
++ * This structure will be used for multiple mount protection. It will be
++ * written into the block number saved in the s_mmp_block field in the
++ * superblock. Programs that check MMP should assume that if
++ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
++ * to use the filesystem, regardless of how old the timestamp is.
++ */
++#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
++#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
++#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
++#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */
++
++struct mmp_struct {
++      __le32  mmp_magic;              /* magic number; presumably EXT4_MMP_MAGIC - TODO confirm */
++      __le32  mmp_seq;                /* sequence number, or EXT4_MMP_SEQ_CLEAN/_FSCK */
++      __le64  mmp_time;               /* get_seconds() value at the last update */
++      char    mmp_nodename[64];       /* utsname nodename of the node that wrote the block */
++      char    mmp_bdevname[32];       /* bdevname() of the device as seen by that node */
++      __le16  mmp_check_interval;     /* check interval in seconds, recomputed by kmmpd */
++      __le16  mmp_pad1;               /* padding */
++      __le32  mmp_pad2[227];          /* padding; the fields add up to 1024 bytes */
++};
++
++/*
++ * Default interval in seconds to update the MMP sequence number.
++ */
++#define EXT4_MMP_UPDATE_INTERVAL   1
++
++/*
++ * Minimum interval for MMP checking in seconds.
++ */
++#define EXT4_MMP_MIN_CHECK_INTERVAL   5
++
++/*
+  * Function prototypes
+  */
+Index: linux-2.6.18-128.1.6/fs/ext4/ext4_sb.h
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/ext4_sb.h
++++ linux-2.6.18-128.1.6/fs/ext4/ext4_sb.h
+@@ -148,6 +148,8 @@ struct ext4_sb_info {
+       unsigned int s_log_groups_per_flex;
+       struct flex_groups *s_flex_groups;
++
++      struct task_struct *s_mmp_tsk;  /* Kernel thread for multiple mount protection */
+ };
+ #endif        /* _EXT4_SB */
diff --git a/ldiskfs/kernel_patches/patches/ext4-prealloc-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-prealloc-rhel5.patch
new file mode 100644 (file)
index 0000000..34d0472
--- /dev/null
@@ -0,0 +1,405 @@
+Index: linux-2.6.18-128.1.6/fs/ext4/ext4_sb.h
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/ext4_sb.h        2009-05-28 17:16:51.000000000 +0530
++++ linux-2.6.18-128.1.6/fs/ext4/ext4_sb.h     2009-05-28 17:16:52.000000000 +0530
+@@ -108,11 +108,14 @@
+       /* tunables */
+       unsigned long s_stripe;
+-      unsigned long s_mb_stream_request;
++      unsigned long s_mb_small_req;
++      unsigned long s_mb_large_req;
+       unsigned long s_mb_max_to_scan;
+       unsigned long s_mb_min_to_scan;
+       unsigned long s_mb_stats;
+       unsigned long s_mb_order2_reqs;
++      unsigned long *s_mb_prealloc_table;
++      unsigned long s_mb_prealloc_table_size;
+       unsigned long s_mb_group_prealloc;
+       /* where last allocation was done - for stream allocation */
+       unsigned long s_mb_last_group;
+Index: linux-2.6.18-128.1.6/fs/ext4/mballoc.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/mballoc.c        2009-05-28 17:16:51.000000000 +0530
++++ linux-2.6.18-128.1.6/fs/ext4/mballoc.c     2009-05-28 17:19:57.000000000 +0530
+@@ -1744,7 +1744,7 @@
+       if (size < isize)
+               size = isize;
+-      if (size < sbi->s_mb_stream_request &&
++      if ((ac->ac_g_ex.fe_len < sbi->s_mb_large_req) &&
+                       (ac->ac_flags & EXT4_MB_HINT_DATA)) {
+               /* TBD: may be hot point */
+               spin_lock(&sbi->s_md_lock);
+@@ -2484,6 +2484,26 @@
+       return -ENOMEM;
+ }
++/*
++ * Append @value to the preallocation table of @sbi.
++ *
++ * The value is stored in the first slot that is still zero.  It is
++ * silently dropped when it exceeds
++ * s_blocks_per_group - 2 - s_itb_per_group, when it is not strictly
++ * larger than an entry already in the table (entries must be added in
++ * increasing order), or when the table has no free slot left.
++ */
++static void ext4_mb_prealloc_table_add(struct ext4_sb_info *sbi, int value)
++{
++      int slot = 0;
++
++      if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group))
++              return;
++
++      while (slot < sbi->s_mb_prealloc_table_size) {
++              /* first unused slot: take it */
++              if (sbi->s_mb_prealloc_table[slot] == 0) {
++                      sbi->s_mb_prealloc_table[slot] = value;
++                      break;
++              }
++
++              /* they should add values in order */
++              if (value <= sbi->s_mb_prealloc_table[slot])
++                      break;
++
++              slot++;
++      }
++}
++
++
+ int ext4_mb_init(struct super_block *sb, int needs_recovery)
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+@@ -2542,15 +2562,59 @@
+       sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
+       sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
+       sbi->s_mb_stats = MB_DEFAULT_STATS;
+-      sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
+       sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+       sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
+-      sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
++
++      if (sbi->s_stripe == 0) {
++              sbi->s_mb_prealloc_table_size = 8;
++              i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long);
++              sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS);
++              if (sbi->s_mb_prealloc_table == NULL) {
++                              clear_opt(sbi->s_mount_opt, MBALLOC);
++                              kfree(sbi->s_mb_offsets);
++                              kfree(sbi->s_mb_maxs);
++                              return -ENOMEM;
++              }
++              memset(sbi->s_mb_prealloc_table, 0, i);
++
++              ext4_mb_prealloc_table_add(sbi, 4);
++              ext4_mb_prealloc_table_add(sbi, 8);
++              ext4_mb_prealloc_table_add(sbi, 16);
++              ext4_mb_prealloc_table_add(sbi, 32);
++              ext4_mb_prealloc_table_add(sbi, 64);
++              ext4_mb_prealloc_table_add(sbi, 128);
++              ext4_mb_prealloc_table_add(sbi, 256);
++              ext4_mb_prealloc_table_add(sbi, 512);
++
++              sbi->s_mb_small_req = 256;
++              sbi->s_mb_large_req = 1024;
++              sbi->s_mb_group_prealloc = 512;
++      } else {
++              sbi->s_mb_prealloc_table_size = 3;
++              i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long);
++              sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS);
++              if (sbi->s_mb_prealloc_table == NULL) {
++                      clear_opt(sbi->s_mount_opt, MBALLOC);
++                      kfree(sbi->s_mb_offsets);
++                      kfree(sbi->s_mb_maxs);
++                      return -ENOMEM;
++              }
++              memset(sbi->s_mb_prealloc_table, 0, i);
++
++              ext4_mb_prealloc_table_add(sbi, sbi->s_stripe);
++              ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 2);
++              ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 4);
++
++              sbi->s_mb_small_req = sbi->s_stripe;
++              sbi->s_mb_large_req = sbi->s_stripe * 8;
++              sbi->s_mb_group_prealloc = sbi->s_stripe * 4;
++      }
+       i = sizeof(struct ext4_locality_group) * num_possible_cpus();
+       sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
+       if (sbi->s_locality_groups == NULL) {
+               clear_opt(sbi->s_mount_opt, MBALLOC);
++              kfree(sbi->s_mb_prealloc_table);
+               kfree(sbi->s_mb_offsets);
+               kfree(sbi->s_mb_maxs);
+               return -ENOMEM;
+@@ -2725,10 +2789,82 @@
+ #define EXT4_MB_MAX_TO_SCAN_NAME      "max_to_scan"
+ #define EXT4_MB_MIN_TO_SCAN_NAME      "min_to_scan"
+ #define EXT4_MB_ORDER2_REQ            "order2_req"
+-#define EXT4_MB_STREAM_REQ            "stream_req"
++#define EXT4_MB_SMALL_REQ             "small_req"
++#define EXT4_MB_LARGE_REQ             "large_req"
++#define EXT4_MB_PREALLOC_TABLE          "prealloc_table"
+ #define EXT4_MB_GROUP_PREALLOC                "group_prealloc"
++/*
++ * read_proc handler for the "prealloc_table" entry.
++ *
++ * Prints every slot of the preallocation table of the ext4_sb_info
++ * passed in @data, each followed by a space, and ends the line with a
++ * newline.  All output is produced by the call with @off == 0; any
++ * other offset returns 0 bytes.  *eof is always set.
++ */
++static int ext4_mb_prealloc_table_proc_read(char *page, char **start, off_t off,
++                                          int count, int *eof, void *data)
++{
++      struct ext4_sb_info *sbi = data;
++      char *out = page;
++      int slot = 0;
++
++      *eof = 1;
++      if (off != 0)
++              return 0;
++
++      while (slot < sbi->s_mb_prealloc_table_size) {
++              out += sprintf(out, "%ld ",
++                             sbi->s_mb_prealloc_table[slot]);
++              slot++;
++      }
++      out += sprintf(out, "\n");
++
++      *start = page;
++      return out - page;
++}
++
++/*
++ * write_proc handler for the "prealloc_table" entry.
++ *
++ * @buf/@cnt: user buffer holding space-separated numbers
++ * @data:     the ext4_sb_info whose table is replaced
++ *
++ * The input is parsed twice: the first pass validates and counts the
++ * values, the second pass stores them in a newly allocated table.
++ * Parsing stops at the first token that evaluates to 0 (this includes
++ * a trailing newline).
++ *
++ * Returns @cnt on success, -EINVAL when the input does not fit the
++ * local buffer, contains no usable value, is not strictly increasing
++ * or holds a value the table would refuse, -EFAULT when the user
++ * buffer cannot be copied and -ENOMEM when the table cannot be
++ * allocated.  On error the current table is left untouched.
++ *
++ * NOTE(review): nothing visible here serialises this against readers
++ * of s_mb_prealloc_table - confirm whether locking is needed.
++ */
++static int ext4_mb_prealloc_table_proc_write(struct file *file,
++                                           const char __user *buf,
++                                           unsigned long cnt, void *data)
++{
++      struct ext4_sb_info *sbi = data;
++      unsigned long max_value;
++      unsigned long value;
++      unsigned long prev = 0;
++      char str[128];
++      char *cur;
++      char *end;
++      unsigned long *new_table;
++      unsigned long *old_table;
++      int num = 0;
++      int i = 0;
++
++      if (cnt >= sizeof(str))
++              return -EINVAL;
++      if (copy_from_user(str, buf, cnt))
++              return -EFAULT;
++      /*
++       * simple_strtol() stops only at a non-digit, so the buffer has
++       * to be terminated; cnt < sizeof(str) guarantees there is room.
++       */
++      str[cnt] = '\0';
++
++      /* same upper bound that ext4_mb_prealloc_table_add() applies */
++      max_value = sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group;
++
++      /* first pass: validate the list and count its entries */
++      cur = str;
++      end = str + cnt;
++      while (cur < end) {
++              while ((cur < end) && (*cur == ' '))
++                      cur++;
++              value = simple_strtol(cur, &cur, 0);
++              if (value == 0)
++                      break;
++              /*
++               * A value the add helper would drop would leave a zero
++               * slot in the table, and the allocator divides by the
++               * table entries; refuse it up front.
++               */
++              if (value <= prev || value > max_value)
++                      return -EINVAL;
++              prev = value;
++              num++;
++      }
++      /* an empty table would make the allocator index slot -1 */
++      if (num == 0)
++              return -EINVAL;
++
++      new_table = kmalloc(num * sizeof(*new_table), GFP_KERNEL);
++      if (new_table == NULL)
++              return -ENOMEM;
++      memset(new_table, 0, num * sizeof(*new_table));
++
++      /* install the new table first, then release the old one */
++      old_table = sbi->s_mb_prealloc_table;
++      sbi->s_mb_prealloc_table = new_table;
++      sbi->s_mb_prealloc_table_size = num;
++      kfree(old_table);
++
++      /* second pass: store the values that were validated above */
++      cur = str;
++      end = str + cnt;
++      while (cur < end && i < num) {
++              while ((cur < end) && (*cur == ' '))
++                      cur++;
++              value = simple_strtol(cur, &cur, 0);
++              ext4_mb_prealloc_table_add(sbi, value);
++              i++;
++      }
++
++      return cnt;
++}
+ #define MB_PROC_FOPS(name)                                    \
+ static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v)    \
+@@ -2774,7 +2910,8 @@
+ MB_PROC_FOPS(max_to_scan);
+ MB_PROC_FOPS(min_to_scan);
+ MB_PROC_FOPS(order2_reqs);
+-MB_PROC_FOPS(stream_request);
++MB_PROC_FOPS(small_req);
++MB_PROC_FOPS(large_req);
+ MB_PROC_FOPS(group_prealloc);
+ #define       MB_PROC_HANDLER(name, var)                                      \
+@@ -2795,6 +2932,7 @@
+       mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct proc_dir_entry *proc;
++      struct proc_dir_entry *proc_entry;
+       char devname[64];
+       if (proc_root_ext4 == NULL) {
+@@ -2808,15 +2946,29 @@
+       MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
+       MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
+       MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
+-      MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
++      MB_PROC_HANDLER(EXT4_MB_SMALL_REQ, small_req);
++      MB_PROC_HANDLER(EXT4_MB_LARGE_REQ, large_req);
+       MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
++      proc_entry = create_proc_entry(EXT4_MB_PREALLOC_TABLE, S_IFREG |
++                                     S_IRUGO | S_IWUSR, sbi->s_mb_proc);
++      if (proc_entry == NULL) {
++              printk(KERN_ERR "EXT4-fs: unable to create %s\n",
++                     EXT4_MB_PREALLOC_TABLE);
++              goto err_out;
++      }
++      proc_entry->data = sbi;
++      proc_entry->read_proc = ext4_mb_prealloc_table_proc_read;
++      proc_entry->write_proc = ext4_mb_prealloc_table_proc_write;
++
+       return 0;
+ err_out:
+       printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
+       remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
+-      remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
++      remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_mb_proc);
++      remove_proc_entry(EXT4_MB_LARGE_REQ, sbi->s_mb_proc);
++      remove_proc_entry(EXT4_MB_SMALL_REQ, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
+@@ -2838,7 +2990,9 @@
+       bdevname(sb->s_bdev, devname);
+       remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
+-      remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
++      remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_mb_proc);
++      remove_proc_entry(EXT4_MB_LARGE_REQ, sbi->s_mb_proc);
++      remove_proc_entry(EXT4_MB_SMALL_REQ, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
+@@ -3032,11 +3186,12 @@
+ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
+                               struct ext4_allocation_request *ar)
+ {
+-      int bsbits, max;
++      int bsbits, i, wind;
+       ext4_lblk_t end;
+-      loff_t size, orig_size, start_off;
++      loff_t size, orig_size;
+       ext4_lblk_t start, orig_start;
+       struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
++      struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+       struct ext4_prealloc_space *pa;
+       /* do normalize only data requests, metadata requests
+@@ -3066,49 +3221,35 @@
+       size = size << bsbits;
+       if (size < i_size_read(ac->ac_inode))
+               size = i_size_read(ac->ac_inode);
++      size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits;
+-      /* max size of free chunks */
+-      max = 2 << bsbits;
++      start = wind = 0;
+-#define NRL_CHECK_SIZE(req, size, max, chunk_size)    \
+-              (req <= (size) || max <= (chunk_size))
++      /* let's choose preallocation window depending on file size */
++      for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) {
++              if (size <= sbi->s_mb_prealloc_table[i]) {
++                      wind = sbi->s_mb_prealloc_table[i];
++                      break;
++              }
++      }
++      size = wind;
+-      /* first, try to predict filesize */
+-      /* XXX: should this table be tunable? */
+-      start_off = 0;
+-      if (size <= 16 * 1024) {
+-              size = 16 * 1024;
+-      } else if (size <= 32 * 1024) {
+-              size = 32 * 1024;
+-      } else if (size <= 64 * 1024) {
+-              size = 64 * 1024;
+-      } else if (size <= 128 * 1024) {
+-              size = 128 * 1024;
+-      } else if (size <= 256 * 1024) {
+-              size = 256 * 1024;
+-      } else if (size <= 512 * 1024) {
+-              size = 512 * 1024;
+-      } else if (size <= 1024 * 1024) {
+-              size = 1024 * 1024;
+-      } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
+-              start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+-                                              (21 - bsbits)) << 21;
+-              size = 2 * 1024 * 1024;
+-      } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
+-              start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+-                                                      (22 - bsbits)) << 22;
+-              size = 4 * 1024 * 1024;
+-      } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
+-                                      (8<<20)>>bsbits, max, 8 * 1024)) {
+-              start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+-                                                      (23 - bsbits)) << 23;
+-              size = 8 * 1024 * 1024;
+-      } else {
+-              start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
+-              size      = ac->ac_o_ex.fe_len << bsbits;
++      if (wind == 0) {
++              __u64 tstart, tend;
++              /* file is quite large, we now preallocate with
++               * the biggest configured window with regard to
++               * logical offset */
++              wind = sbi->s_mb_prealloc_table[i - 1];
++              tstart = ac->ac_o_ex.fe_logical;
++              do_div(tstart, wind);
++              start = tstart * wind;
++              tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1;
++              do_div(tend, wind);
++              tend = tend * wind + wind;
++              size = tend - start;
+       }
+-      orig_size = size = size >> bsbits;
+-      orig_start = start = start_off >> bsbits;
++      orig_size = size;
++      orig_start = start;
+       /* don't cover already allocated blocks in selected range */
+       if (ar->pleft && start <= ar->lleft) {
+@@ -3185,7 +3326,6 @@
+       }
+       BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
+                       start > ac->ac_o_ex.fe_logical);
+-      BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+       /* now prepare goal request */
+@@ -4077,22 +4217,32 @@
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+       int bsbits = ac->ac_sb->s_blocksize_bits;
+-      loff_t size, isize;
++      loff_t size;
+       if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+               return;
+-      size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+-      isize = i_size_read(ac->ac_inode) >> bsbits;
+-      size = max(size, isize);
+-
+-      /* don't use group allocation for large files */
+-      if (size >= sbi->s_mb_stream_request)
++      if (ac->ac_o_ex.fe_len >= sbi->s_mb_small_req)
+               return;
+       if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+               return;
++      /* request is so large that we don't care about
++       * streaming - it overweights any possible seek */
++      if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req)
++              return;
++
++      size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
++      size = size << bsbits;
++      if (size < i_size_read(ac->ac_inode))
++              size = i_size_read(ac->ac_inode);
++      size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits;
++
++      /* don't use group allocation for large files */
++      if (size >= sbi->s_mb_large_req)
++              return;
++
+       BUG_ON(ac->ac_lg != NULL);
+       /*
+        * locality group prealloc space are per cpu. The reason for having
diff --git a/ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-rhel5.patch
new file mode 100644 (file)
index 0000000..1cc10a8
--- /dev/null
@@ -0,0 +1,15 @@
+Index: linux-2.6.18.i386/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/namei.c
++++ linux-2.6.18.i386/fs/ext4/namei.c
+@@ -374,8 +374,8 @@ dx_probe(struct dentry *dentry, struct i
+           root->info.hash_version != DX_HASH_HALF_MD4 &&
+           root->info.hash_version != DX_HASH_LEGACY) {
+               ext4_warning(dir->i_sb, __func__,
+-                           "Unrecognised inode hash code %d",
+-                           root->info.hash_version);
++                           "Unrecognised inode hash code %d for directory "
++                           "#%lu", root->info.hash_version, dir->i_ino);
+               brelse(bh);
+               *err = ERR_BAD_DX_DIR;
+               goto fail;
diff --git a/ldiskfs/kernel_patches/patches/ext4-remove-cond_resched-calls-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-remove-cond_resched-calls-rhel5.patch
new file mode 100644 (file)
index 0000000..bab03d1
--- /dev/null
@@ -0,0 +1,29 @@
+Index: linux-2.6.18.i386/fs/ext4/ialloc.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ialloc.c
++++ linux-2.6.18.i386/fs/ext4/ialloc.c
+@@ -1057,7 +1057,6 @@ unsigned long ext4_count_free_inodes (st
+               if (!gdp)
+                       continue;
+               desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+-              cond_resched();
+       }
+       return desc_count;
+ #endif
+Index: linux-2.6.18.i386/fs/ext4/super.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/super.c
++++ linux-2.6.18.i386/fs/ext4/super.c
+@@ -3100,11 +3100,9 @@ static int ext4_statfs(struct dentry *de
+                * block group descriptors.  If the sparse superblocks
+                * feature is turned on, then not all groups have this.
+                */
+-              for (i = 0; i < ngroups; i++) {
++              for (i = 0; i < ngroups; i++)
+                       overhead += ext4_bg_has_super(sb, i) +
+                               ext4_bg_num_gdb(sb, i);
+-                      cond_resched();
+-              }
+               /*
+                * Every block group has an inode bitmap, a block
diff --git a/ldiskfs/kernel_patches/patches/ext4-unlink-race-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-unlink-race-rhel5.patch
new file mode 100644 (file)
index 0000000..f75ae84
--- /dev/null
@@ -0,0 +1,15 @@
+Index: linux-2.6.18.i386/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/namei.c
++++ linux-2.6.18.i386/fs/ext4/namei.c
+@@ -2299,8 +2299,8 @@ static int ext4_link (struct dentry * ol
+        * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
+        * otherwise has the potential to corrupt the orphan inode list.
+        */
+-      if (inode->i_nlink == 0)
+-              return -ENOENT;
++      //if (inode->i_nlink == 0)
++      //      return -ENOENT;
+ retry:
+       handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
diff --git a/ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-rhel5.patch
new file mode 100644 (file)
index 0000000..e0c6f8d
--- /dev/null
@@ -0,0 +1,169 @@
+Index: linux-2.6.18.i386/fs/ext4/ialloc.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ialloc.c
++++ linux-2.6.18.i386/fs/ext4/ialloc.c
+@@ -576,7 +576,8 @@ static int find_group_other(struct super
+  * For other inodes, search forward from the parent directory's block
+  * group to find a free inode.
+  */
+-struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
++struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode,
++                           unsigned long goal)
+ {
+       struct super_block *sb;
+       struct buffer_head *bitmap_bh = NULL;
+@@ -607,6 +608,43 @@ struct inode *ext4_new_inode(handle_t *h
+       sbi = EXT4_SB(sb);
+       es = sbi->s_es;
++      if (goal) {
++              group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
++              ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
++              err = -EIO;
++
++              gdp = ext4_get_group_desc(sb, group, &bh2);
++              if (!gdp)
++                      goto fail;
++
++              bitmap_bh = ext4_read_inode_bitmap(sb, group);
++              if (!bitmap_bh)
++                      goto fail;
++
++              BUFFER_TRACE(bh, "get_write_access");
++              err = ext4_journal_get_write_access(handle, bitmap_bh);
++              if (err)
++                      goto fail;
++
++              if (ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
++                                      ino, bitmap_bh->b_data)) {
++                      printk(KERN_ERR "goal inode %lu unavailable\n", goal);
++                      /* Oh well, we tried. */
++                      goto continue_allocation;
++              }
++
++              BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
++              err = ext4_journal_dirty_metadata(handle, bitmap_bh);
++              if (err)
++                      goto fail;
++
++              /* We've shortcircuited the allocation system successfully,
++               * now finish filling in the inode.
++               */
++              goto got;
++      }
++
++continue_allocation:
+       if (sbi->s_log_groups_per_flex) {
+               ret2 = find_group_flex(sb, dir, &group);
+               goto got_group;
+Index: linux-2.6.18.i386/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/namei.c
++++ linux-2.6.18.i386/fs/ext4/namei.c
+@@ -104,6 +104,7 @@ struct dx_entry
+       __le32 block;
+ };
++
+ /*
+  * dx_root_info is laid out so that if it should somehow get overlaid by a
+  * dirent the two low bits of the hash version will be zero.  Therefore, the
+@@ -149,6 +150,14 @@ struct dx_map_entry
+       u16 size;
+ };
++#define LVFS_DENTRY_PARAM_MAGIC               20070216UL
++struct lvfs_dentry_params
++{
++      unsigned long   p_inum;
++      void        *p_ptr;
++      u32          magic;
++};
++
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
+ static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
+ static inline unsigned dx_get_hash (struct dx_entry *entry);
+@@ -1708,6 +1717,20 @@ static int ext4_add_nondir(handle_t *han
+       return err;
+ }
++static struct inode * ext4_new_inode_wantedi(handle_t *handle, struct inode *dir,
++                                              int mode, struct dentry *dentry)
++{
++      unsigned long inum = 0;
++
++      if (dentry->d_fsdata != NULL) {
++              struct lvfs_dentry_params *param = dentry->d_fsdata;
++
++              if (param->magic == LVFS_DENTRY_PARAM_MAGIC)
++                      inum = param->p_inum;
++      }
++      return ext4_new_inode(handle, dir, mode, inum);
++}
++
+ /*
+  * By the time this is called, we already have created
+  * the directory cache entry for the new file, but it
+@@ -1733,7 +1756,7 @@ retry:
+       if (IS_DIRSYNC(dir))
+               handle->h_sync = 1;
+-      inode = ext4_new_inode (handle, dir, mode);
++      inode = ext4_new_inode_wantedi (handle, dir, mode, dentry);
+       err = PTR_ERR(inode);
+       if (!IS_ERR(inode)) {
+               inode->i_op = &ext4_file_inode_operations;
+@@ -1767,7 +1790,7 @@ retry:
+       if (IS_DIRSYNC(dir))
+               handle->h_sync = 1;
+-      inode = ext4_new_inode (handle, dir, mode);
++      inode = ext4_new_inode_wantedi (handle, dir, mode, dentry);
+       err = PTR_ERR(inode);
+       if (!IS_ERR(inode)) {
+               init_special_inode(inode, inode->i_mode, rdev);
+@@ -1803,7 +1826,7 @@ retry:
+       if (IS_DIRSYNC(dir))
+               handle->h_sync = 1;
+-      inode = ext4_new_inode (handle, dir, S_IFDIR | mode);
++      inode = ext4_new_inode_wantedi (handle, dir, S_IFDIR | mode, dentry);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_stop;
+@@ -2203,7 +2226,7 @@ retry:
+       if (IS_DIRSYNC(dir))
+               handle->h_sync = 1;
+-      inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
++      inode = ext4_new_inode_wantedi (handle, dir, S_IFLNK|S_IRWXUGO, dentry);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_stop;
+Index: linux-2.6.18.i386/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ext4.h
++++ linux-2.6.18.i386/fs/ext4/ext4.h
+@@ -1013,7 +1013,8 @@ extern int ext4fs_dirhash(const char *na
+                         dx_hash_info *hinfo);
+ /* ialloc.c */
+-extern struct inode * ext4_new_inode (handle_t *, struct inode *, int);
++extern struct inode * ext4_new_inode (handle_t *, struct inode *, int,
++                                    unsigned long);
+ extern void ext4_free_inode (handle_t *, struct inode *);
+ extern struct inode * ext4_orphan_get (struct super_block *, unsigned long);
+ extern unsigned long ext4_count_free_inodes (struct super_block *);
+Index: linux-2.6.18.i386/fs/ext4/migrate.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/migrate.c
++++ linux-2.6.18.i386/fs/ext4/migrate.c
+@@ -485,7 +485,7 @@ int ext4_ext_migrate(struct inode *inode
+       }
+       tmp_inode = ext4_new_inode(handle,
+                               inode->i_sb->s_root->d_inode,
+-                              S_IFREG);
++                              S_IFREG, 0);
+       if (IS_ERR(tmp_inode)) {
+               retval = -ENOMEM;
+               ext4_journal_stop(handle);
diff --git a/ldiskfs/kernel_patches/patches/ext4-xattr-no-update-ctime-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-xattr-no-update-ctime-rhel5.patch
new file mode 100644 (file)
index 0000000..66de9df
--- /dev/null
@@ -0,0 +1,32 @@
+Index: linux-2.6.18.i386/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/ext4.h
++++ linux-2.6.18.i386/fs/ext4/ext4.h
+@@ -995,6 +995,13 @@ struct mmp_struct {
+ extern struct proc_dir_entry *proc_root_ext4;
+ /*
++ * Indicates that ctime should not be updated in ext4_xattr_set_handle()
++ */
++#ifndef XATTR_NO_CTIME
++#define XATTR_NO_CTIME 0x80
++#endif
++
++/*
+  * Function prototypes
+  */
+Index: linux-2.6.18.i386/fs/ext4/xattr.c
+===================================================================
+--- linux-2.6.18.i386.orig/fs/ext4/xattr.c
++++ linux-2.6.18.i386/fs/ext4/xattr.c
+@@ -1026,7 +1026,8 @@ ext4_xattr_set_handle(handle_t *handle, 
+       }
+       if (!error) {
+               ext4_xattr_update_super_block(handle, inode->i_sb);
+-              inode->i_ctime = ext4_current_time(inode);
++              if (!(flags & XATTR_NO_CTIME))
++                      inode->i_ctime = ext4_current_time(inode);
+               if (!value)
+                       EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+               error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
diff --git a/ldiskfs/kernel_patches/patches/iopen-2.6.18-rhel5-ext4.patch b/ldiskfs/kernel_patches/patches/iopen-2.6.18-rhel5-ext4.patch
new file mode 100644 (file)
index 0000000..d7b94fa
--- /dev/null
@@ -0,0 +1,512 @@
+Index: linux-2.6.18-128.1.6/fs/ext4/iopen.c
+===================================================================
+--- /dev/null
++++ linux-2.6.18-128.1.6/fs/ext4/iopen.c
+@@ -0,0 +1,295 @@
++/*
++ * linux/fs/ext4/iopen.c
++ *
++ * Special support for open by inode number
++ *
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ *
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ *
++ *
++ * Invariants:
++ *   - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias
++ *     for an inode at one time.
++ *   - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry
++ *     aliases on an inode at the same time.
++ *
++ * If we have any connected dentry aliases for an inode, use one of those
++ * in iopen_lookup().  Otherwise, we instantiate a single NFSD_DISCONNECTED
++ * dentry for this inode, which thereafter will be found by the dcache
++ * when looking up this inode number in __iopen__, so we don't return here
++ * until it is gone.
++ *
++ * If we get an inode via a regular name lookup, then we "rename" the
++ * NFSD_DISCONNECTED dentry to the proper name and parent.  This ensures
++ * existing users of the disconnected dentry will continue to use the same
++ * dentry as the connected users, and there will never be both kinds of
++ * dentry aliases at one time.
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/smp_lock.h>
++#include <linux/dcache.h>
++#include <linux/security.h>
++#include "iopen.h"
++#include "ext4.h"
++#include "ext4_jbd2.h"
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++#define IOPEN_NAME_LEN        32
++
++/*
++ * This implements looking up an inode by number.
++ */
++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry,
++                                 struct nameidata *nd)
++{
++      struct inode *inode;
++      unsigned long ino;
++      struct list_head *lp;
++      struct dentry *alternate;
++      char buf[IOPEN_NAME_LEN];
++
++      if (dentry->d_name.len >= IOPEN_NAME_LEN)
++              return ERR_PTR(-ENAMETOOLONG);
++
++      memcpy(buf, dentry->d_name.name, dentry->d_name.len);
++      buf[dentry->d_name.len] = 0;
++
++      if (strcmp(buf, ".") == 0)
++              ino = dir->i_ino;
++      else if (strcmp(buf, "..") == 0)
++              ino = EXT4_ROOT_INO;
++      else
++              ino = simple_strtoul(buf, 0, 0);
++
++      if ((ino != EXT4_ROOT_INO &&
++           ino < EXT4_FIRST_INO(dir->i_sb)) ||
++          ino > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))
++              return ERR_PTR(-ENOENT);
++
++      inode = ext4_iget(dir->i_sb, ino);
++      if (IS_ERR(inode)) {
++               /* Newer kernels return -ESTALE for inodes that are not in use,
++                * but older kernels return a negative dentry. This can only
++                * happen when doing a lookup in the __iopen__ dir, because the
++                * "entry" will always be found even if inode is unallocated.
++                * Handle this here instead of fixing the callers. b=19114 */
++              if (PTR_ERR(inode) == -ESTALE)
++                      return (ERR_PTR(-ENOENT));
++              return ERR_CAST(inode);
++      }
++
++      assert(list_empty(&dentry->d_alias));           /* d_instantiate */
++      assert(d_unhashed(dentry));                     /* d_rehash */
++
++      /* preferably return a connected dentry */
++      spin_lock(&dcache_lock);
++      list_for_each(lp, &inode->i_dentry) {
++              alternate = list_entry(lp, struct dentry, d_alias);
++              assert(!(alternate->d_flags & DCACHE_DISCONNECTED));
++      }
++
++      if (!list_empty(&inode->i_dentry)) {
++              alternate = list_entry(inode->i_dentry.next,
++                                     struct dentry, d_alias);
++              dget_locked(alternate);
++              spin_lock(&alternate->d_lock);
++              alternate->d_flags |= DCACHE_REFERENCED;
++              spin_unlock(&alternate->d_lock);
++              iput(inode);
++              spin_unlock(&dcache_lock);
++              return alternate;
++      }
++      dentry->d_flags |= DCACHE_DISCONNECTED;
++
++      /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++      list_add(&dentry->d_alias, &inode->i_dentry);   /* d_instantiate */
++      dentry->d_inode = inode;
++
++      d_rehash_cond(dentry, 0);
++      spin_unlock(&dcache_lock);
++
++      return NULL;
++}
++
++/* This function is spliced into ext4_lookup and does the move of a
++ * disconnected dentry (if it exists) to a connected dentry.
++ */
++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode,
++                                  int rehash)
++{
++      struct dentry *tmp, *goal = NULL;
++      struct list_head *lp;
++
++      /* verify this dentry is really new */
++      assert(dentry->d_inode == NULL);
++      assert(list_empty(&dentry->d_alias));           /* d_instantiate */
++      if (rehash)
++              assert(d_unhashed(dentry));             /* d_rehash */
++      assert(list_empty(&dentry->d_subdirs));
++
++      spin_lock(&dcache_lock);
++      if (!inode)
++              goto do_rehash;
++
++      if (!test_opt(inode->i_sb, IOPEN))
++              goto do_instantiate;
++
++      /* preferably return a connected dentry */
++      list_for_each(lp, &inode->i_dentry) {
++              tmp = list_entry(lp, struct dentry, d_alias);
++              if (tmp->d_flags & DCACHE_DISCONNECTED) {
++                      assert(tmp->d_alias.next == &inode->i_dentry);
++                      assert(tmp->d_alias.prev == &inode->i_dentry);
++                      goal = tmp;
++                      dget_locked(goal);
++                      break;
++              }
++      }
++
++      if (!goal)
++              goto do_instantiate;
++
++      /* Move the goal to the de hash queue */
++      goal->d_flags &= ~DCACHE_DISCONNECTED;
++      security_d_instantiate(goal, inode);
++      __d_drop(dentry);
++      d_rehash_cond(dentry, 0);
++      d_move_locked(goal, dentry);
++      spin_unlock(&dcache_lock);
++      iput(inode);
++
++      return goal;
++
++      /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++do_instantiate:
++      list_add(&dentry->d_alias, &inode->i_dentry);   /* d_instantiate */
++      dentry->d_inode = inode;
++do_rehash:
++      if (rehash)
++              d_rehash_cond(dentry, 0);
++      spin_unlock(&dcache_lock);
++
++      return NULL;
++}
++
++/*
++ * Similar to d_instantiate() except that it drops the disconnected
++ * dentry, if any.
++ */
++void iopen_d_instantiate(struct dentry *dentry, struct inode * inode)
++{
++      struct dentry *dis_dentry;
++
++      /* verify this dentry is really new */
++      assert(dentry->d_inode == NULL);
++      assert(list_empty(&dentry->d_alias));
++
++      spin_lock(&dcache_lock);
++      if (!inode || !test_opt(inode->i_sb, IOPEN) ||
++          list_empty(&inode->i_dentry))
++              goto do_instantiate;
++
++      /* a disconnected dentry has been added behind our back,
++       * we have to drop this dentry, see bug 16362/15713 */
++      dis_dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
++      spin_lock(&dis_dentry->d_lock);
++      assert(dis_dentry->d_alias.next == &inode->i_dentry);
++      assert(dis_dentry->d_alias.prev == &inode->i_dentry);
++      assert(dis_dentry->d_flags & DCACHE_DISCONNECTED);
++      __d_drop(dis_dentry);
++      list_del_init(&dis_dentry->d_alias);
++      spin_unlock(&dis_dentry->d_lock);
++
++do_instantiate:
++      if (inode)
++              list_add(&dentry->d_alias, &inode->i_dentry);
++      dentry->d_inode = inode;
++      spin_unlock(&dcache_lock);
++      security_d_instantiate(dentry, inode);
++}
++
++/*
++ * These are the special structures for the iopen pseudo directory.
++ */
++
++static struct inode_operations iopen_inode_operations = {
++      lookup:         iopen_lookup,           /* BKL held */
++};
++
++static struct file_operations iopen_file_operations = {
++      read:           generic_read_dir,
++};
++
++static int match_dentry(struct dentry *dentry, const char *name)
++{
++      int     len;
++
++      len = strlen(name);
++      if (dentry->d_name.len != len)
++              return 0;
++      if (strncmp(dentry->d_name.name, name, len))
++              return 0;
++      return 1;
++}
++
++/*
++ * This function is spliced into ext4_lookup and returns 1 if the file
++ * name is __iopen__ and dentry has been filled in appropriately.
++ */
++int ext4_check_for_iopen(struct inode *dir, struct dentry *dentry)
++{
++      struct inode *inode;
++
++      if (dir->i_ino != EXT4_ROOT_INO ||
++          !test_opt(dir->i_sb, IOPEN) ||
++          !match_dentry(dentry, "__iopen__"))
++              return 0;
++
++      inode = ext4_iget(dir->i_sb, EXT4_BAD_INO);
++      if (IS_ERR(inode))
++              return 0;
++
++      d_add(dentry, inode);
++      return 1;
++}
++
++/*
++ * This function is spliced into read_inode; it returns 1 if inode
++ * number is the one for /__iopen__, in which case the inode is filled
++ * in appropriately.  Otherwise, this function returns 0.
++ */
++int ext4_iopen_get_inode(struct inode *inode)
++{
++      if (inode->i_ino != EXT4_BAD_INO)
++              return 0;
++
++      inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
++      if (test_opt(inode->i_sb, IOPEN_NOPRIV))
++              inode->i_mode |= 0777;
++      inode->i_uid = 0;
++      inode->i_gid = 0;
++      inode->i_nlink = 1;
++      inode->i_size = 4096;
++      inode->i_atime = inode->i_ctime = inode->i_mtime =  ext4_current_time(inode);
++      EXT4_I(inode)->i_dtime = 0;
++      EXT4_I(inode)->i_file_acl = 0;
++      inode->i_blocks = 0;
++      inode->i_version = 1;
++      inode->i_generation = 0;
++
++      inode->i_op = &iopen_inode_operations;
++      inode->i_fop = &iopen_file_operations;
++      inode->i_mapping->a_ops = 0;
++
++      if (inode->i_state & I_NEW)
++              unlock_new_inode(inode);
++
++      return 1;
++}
+Index: linux-2.6.18-128.1.6/fs/ext4/iopen.h
+===================================================================
+--- /dev/null
++++ linux-2.6.18-128.1.6/fs/ext4/iopen.h
+@@ -0,0 +1,16 @@
++/*
++ * iopen.h
++ *
++ * Special support for opening files by inode number.
++ *
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ *
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ */
++
++extern int ext4_check_for_iopen(struct inode *dir, struct dentry *dentry);
++extern int ext4_iopen_get_inode(struct inode *inode);
++extern struct dentry *iopen_connect_dentry(struct dentry *dentry,
++                                         struct inode *inode, int rehash);
++extern void iopen_d_instantiate(struct dentry *dentry, struct inode * inode);
+Index: linux-2.6.18-128.1.6/fs/ext4/inode.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/inode.c
++++ linux-2.6.18-128.1.6/fs/ext4/inode.c
+@@ -37,6 +37,7 @@
+ #include <linux/bio.h>
+ #include "ext4_jbd2.h"
+ #include "xattr.h"
++#include "iopen.h"
+ #include "acl.h"
+ /*
+@@ -2764,6 +2765,8 @@ struct inode *ext4_iget(struct super_blo
+       ei->i_default_acl = EXT4_ACL_NOT_CACHED;
+ #endif
+       ei->i_block_alloc_info = NULL;
++      if (ext4_iopen_get_inode(inode))
++              return inode;
+       ret = __ext4_get_inode_loc(inode, &iloc, 0);
+       if (ret < 0)
+Index: linux-2.6.18-128.1.6/fs/ext4/super.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/super.c
++++ linux-2.6.18-128.1.6/fs/ext4/super.c
+@@ -888,6 +888,7 @@ enum {
+       Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
+       Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
+       Opt_mballoc, Opt_nomballoc, Opt_stripe,
++      Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+ };
+ static match_table_t tokens = {
+@@ -938,6 +939,9 @@ static match_table_t tokens = {
+       {Opt_noquota, "noquota"},
+       {Opt_quota, "quota"},
+       {Opt_usrquota, "usrquota"},
++      {Opt_iopen, "iopen"},
++      {Opt_noiopen, "noiopen"},
++      {Opt_iopen_nopriv, "iopen_nopriv"},
+       {Opt_barrier, "barrier=%u"},
+       {Opt_extents, "extents"},
+       {Opt_noextents, "noextents"},
+@@ -1270,6 +1274,18 @@ clear_qf_name:
+                       else
+                               clear_opt(sbi->s_mount_opt, BARRIER);
+                       break;
++              case Opt_iopen:
++                      set_opt (sbi->s_mount_opt, IOPEN);
++                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++                      break;
++              case Opt_noiopen:
++                      clear_opt (sbi->s_mount_opt, IOPEN);
++                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++                      break;
++              case Opt_iopen_nopriv:
++                      set_opt (sbi->s_mount_opt, IOPEN);
++                      set_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++                      break;
+               case Opt_ignore:
+                       break;
+               case Opt_resize:
+Index: linux-2.6.18-128.1.6/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/namei.c
++++ linux-2.6.18-128.1.6/fs/ext4/namei.c
+@@ -39,6 +39,7 @@
+ #include "namei.h"
+ #include "xattr.h"
++#include "iopen.h"
+ #include "acl.h"
+ /*
+@@ -1048,6 +1049,9 @@ static struct dentry *ext4_lookup(struct
+       if (dentry->d_name.len > EXT4_NAME_LEN)
+               return ERR_PTR(-ENAMETOOLONG);
++      if (ext4_check_for_iopen(dir, dentry))
++              return NULL;
++
+       bh = ext4_find_entry(dentry, &de);
+       inode = NULL;
+       if (bh) {
+@@ -1062,7 +1066,8 @@ static struct dentry *ext4_lookup(struct
+               if (IS_ERR(inode))
+                       return ERR_CAST(inode);
+       }
+-      return d_splice_alias(inode, dentry);
++
++      return iopen_connect_dentry(dentry, inode, 1);
+ }
+@@ -1709,7 +1714,7 @@ static int ext4_add_nondir(handle_t *han
+       int err = ext4_add_entry(handle, dentry, inode);
+       if (!err) {
+               ext4_mark_inode_dirty(handle, inode);
+-              d_instantiate(dentry, inode);
++              iopen_d_instantiate(dentry, inode);
+               return 0;
+       }
+       drop_nlink(inode);
+@@ -1868,7 +1873,7 @@ out_clear_inode:
+       ext4_inc_count(handle, dir);
+       ext4_update_dx_flag(dir);
+       ext4_mark_inode_dirty(handle, dir);
+-      d_instantiate(dentry, inode);
++      iopen_d_instantiate(dentry, inode);
+ out_stop:
+       ext4_journal_stop(handle);
+       if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+@@ -2134,10 +2139,6 @@ static int ext4_rmdir (struct inode * di
+                             inode->i_nlink);
+       inode->i_version++;
+       clear_nlink(inode);
+-      /* There's no need to set i_disksize: the fact that i_nlink is
+-       * zero will ensure that the right thing happens during any
+-       * recovery. */
+-      inode->i_size = 0;
+       ext4_orphan_add(handle, inode);
+       inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
+       ext4_mark_inode_dirty(handle, inode);
+@@ -2263,6 +2264,23 @@ out_stop:
+       return err;
+ }
++/* Like ext4_add_nondir() except for call to iopen_connect_dentry */
++static int ext4_add_link(handle_t *handle, struct dentry *dentry,
++                       struct inode *inode)
++{
++      int err = ext4_add_entry(handle, dentry, inode);
++      if (!err) {
++              err = ext4_mark_inode_dirty(handle, inode);
++              if (err == 0) {
++                      dput(iopen_connect_dentry(dentry, inode, 0));
++                      return 0;
++              }
++      }
++      ext4_dec_count(handle, inode);
++      iput(inode);
++      return err;
++}
++
+ static int ext4_link (struct dentry * old_dentry,
+               struct inode * dir, struct dentry *dentry)
+ {
+@@ -2293,7 +2311,8 @@ retry:
+       ext4_inc_count(handle, inode);
+       atomic_inc(&inode->i_count);
+-      err = ext4_add_nondir(handle, dentry, inode);
++      err = ext4_add_link(handle, dentry, inode);
++      ext4_orphan_del(handle, inode);
+       ext4_journal_stop(handle);
+       if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+               goto retry;
+Index: linux-2.6.18-128.1.6/fs/ext4/Makefile
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/Makefile
++++ linux-2.6.18-128.1.6/fs/ext4/Makefile
+@@ -4,7 +4,7 @@
+ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
+-ext4dev-y     := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
++ext4dev-y     := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+                  ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
+                  ext4_jbd2.o migrate.o mballoc.o
+Index: linux-2.6.18-128.1.6/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/ext4/ext4.h
++++ linux-2.6.18-128.1.6/fs/ext4/ext4.h
+@@ -18,6 +18,7 @@
+ #include <linux/types.h>
+ #include <linux/blkdev.h>
++#include <linux/jbd2.h>
+ #include "ext4_i.h"
+ #define EXT4_SUPER_MAGIC      0xEF53
+@@ -537,6 +538,8 @@ do {                                                                              \
+ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT       0x1000000 /* Journal Async Commit */
+ #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
+ #define EXT4_MOUNT_MBALLOC            0x4000000 /* Buddy allocation support */
++#define EXT4_MOUNT_IOPEN              0x8000000 /* Allow access via iopen */
++#define EXT4_MOUNT_IOPEN_NOPRIV               0x10000000 /* Make iopen world-readable */
+ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+ #define clear_opt(o, opt)             o &= ~EXT4_MOUNT_##opt
diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series
new file mode 100644 (file)
index 0000000..5e90b31
--- /dev/null
@@ -0,0 +1,23 @@
+ext4-wantedi-2.6-rhel5.patch
+iopen-2.6.18-rhel5-ext4.patch
+ext4-map_inode_page-2.6.18-rhel5.patch
+export-ext4-2.6-rhel5.patch
+ext4-include-fixes-2.6-rhel5.patch
+ext4-ialloc-2.6-rhel5.patch
+ext4-remove-cond_resched-calls-rhel5.patch
+ext4-filterdata-rhel5.patch
+ext4-inode-version-rhel5.patch
+ext4-mmp-rhel5.patch
+ext4-unlink-race-rhel5.patch
+ext4-fiemap-2.6-rhel5.patch
+ext4-lookup-dotdot-rhel5.patch
+ext4-max-dir-size-rhel5.patch
+ext4-print-inum-in-htree-warning-rhel5.patch
+ext4-xattr-no-update-ctime-rhel5.patch
+ext4-prealloc-rhel5.patch
+ext4-mballoc-extra-checks-rhel5.patch
+ext4-mballoc-handle-dev-paths-rhel5.patch
+ext4-big-endian-check-2.6-rhel5.patch
+ext4-alloc-policy-2.6-rhel5.patch
+ext4-misc-rhel5.patch
+