Include client, ldiskfs, kernel patches.
Change-Id: Ice16b8bf40c2e37df9af9f399316917097e8ee8f
Signed-off-by: Bobi Jam <bobijam@whamcloud.com>
Reviewed-on: http://review.whamcloud.com/307
Tested-by: Hudson
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Brian J. Murrell <brian@whamcloud.com>
;;
2.6.22*) LDISKFS_SERIES="2.6.22-vanilla.series";;
2.6.27*) LDISKFS_SERIES="2.6-sles11.series";;
+2.6.32*) LDISKFS_SERIES="2.6-rhel6.series";;
*) AC_MSG_WARN([Unknown kernel version $LINUXRELEASE, fix ldiskfs/configure.ac])
esac
AC_MSG_RESULT([$LDISKFS_SERIES])
--- /dev/null
+Index: linux-2.6.32.i386/fs/ext4/super.c
+===================================================================
+--- linux-2.6.32.i386.orig/fs/ext4/super.c 2010-04-07 14:18:32.000000000 +0530
++++ linux-2.6.32.i386/fs/ext4/super.c 2010-04-07 14:19:47.000000000 +0530
+@@ -291,6 +291,8 @@
+ jbd2_journal_abort_handle(handle);
+ }
+
++EXPORT_SYMBOL(ext4_journal_abort_handle);
++
+ /* Deal with the reporting of failure conditions on a filesystem such as
+ * inconsistencies detected or read IO failures.
+ *
+@@ -3030,6 +3032,8 @@
+ return ret;
+ }
+
++EXPORT_SYMBOL(ext4_force_commit);
++
+ /*
+ * Setup any per-fs journal parameters now. We'll do this both on
+ * initial mount, once the journal has been initialised but before we've
+@@ -4088,6 +4092,12 @@
+ unsigned long *blocks, int *created, int create);
+ EXPORT_SYMBOL(ext4_map_inode_page);
+
++EXPORT_SYMBOL(ext4_xattr_get);
++EXPORT_SYMBOL(ext4_xattr_set_handle);
++EXPORT_SYMBOL(ext4_bread);
++EXPORT_SYMBOL(ext4_journal_start_sb);
++EXPORT_SYMBOL(__ext4_journal_stop);
++
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Fourth Extended Filesystem");
+ MODULE_LICENSE("GPL");
+Index: linux-2.6.32.i386/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.32.i386.orig/fs/ext4/ext4.h 2010-04-07 14:17:04.000000000 +0530
++++ linux-2.6.32.i386/fs/ext4/ext4.h 2010-04-07 14:20:34.000000000 +0530
+@@ -1385,6 +1385,8 @@
+ struct buffer_head *bh,
+ ext4_group_t group,
+ struct ext4_group_desc *desc);
++extern struct buffer_head *ext4_read_inode_bitmap(struct super_block *sb,
++ ext4_group_t block_group);
+ extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+
+ /* mballoc.c */
+Index: linux-2.6.32.i386/fs/ext4/ialloc.c
+===================================================================
+--- linux-2.6.32.i386.orig/fs/ext4/ialloc.c 2009-12-03 09:21:21.000000000 +0530
++++ linux-2.6.32.i386/fs/ext4/ialloc.c 2010-04-07 14:19:47.000000000 +0530
+@@ -98,7 +98,7 @@
+ *
+ * Return buffer_head of bitmap on success or NULL.
+ */
+-static struct buffer_head *
++struct buffer_head *
+ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
+ {
+ struct ext4_group_desc *desc;
+@@ -161,6 +161,7 @@
+ }
+ return bh;
+ }
++EXPORT_SYMBOL(ext4_read_inode_bitmap);
+
+ /*
+ * NOTE! When we get the inode, we're the only people
+Index: linux-2.6.32.i386/fs/ext4/balloc.c
+===================================================================
+--- linux-2.6.32.i386.orig/fs/ext4/balloc.c 2010-03-19 15:43:37.000000000 +0530
++++ linux-2.6.32.i386/fs/ext4/balloc.c 2010-04-07 14:19:47.000000000 +0530
+@@ -235,6 +235,7 @@
+ *bh = sbi->s_group_desc[group_desc];
+ return desc;
+ }
++EXPORT_SYMBOL(ext4_get_group_desc);
+
+ static int ext4_valid_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
--- /dev/null
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c 2011-03-11 15:46:27.000000000 +0800
++++ linux-stage/fs/ext4/super.c 2011-03-11 15:53:05.016701579 +0800
+@@ -1400,9 +1400,47 @@
+ static ssize_t ext4_quota_write(struct super_block *sb, int type,
+ const char *data, size_t len, loff_t off);
+
++static int ext4_dquot_initialize(struct inode *inode, int type)
++{
++ handle_t *handle;
++ int ret, err;
++
++ /* We may create quota structure so we need to reserve enough blocks */
++ handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb));
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++ ret = dquot_initialize(inode, type);
++ err = ext4_journal_stop(handle);
++ if (!ret)
++ ret = err;
++ return ret;
++}
++
++static int ext4_dquot_drop(struct inode *inode)
++{
++ handle_t *handle;
++ int ret, err;
++
++ /* We may delete quota structure so we need to reserve enough blocks */
++ handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb));
++ if (IS_ERR(handle)) {
++ /*
++ * We call dquot_drop() anyway to at least release references
++ * to quota structures so that umount does not hang.
++ */
++ dquot_drop(inode);
++ return PTR_ERR(handle);
++ }
++ ret = dquot_drop(inode);
++ err = ext4_journal_stop(handle);
++ if (!ret)
++ ret = err;
++ return ret;
++}
++
+ static const struct dquot_operations ext4_quota_operations = {
+- .initialize = dquot_initialize,
+- .drop = dquot_drop,
++ .initialize = ext4_dquot_initialize,
++ .drop = ext4_dquot_drop,
+ .alloc_space = dquot_alloc_space,
+ .reserve_space = dquot_reserve_space,
+ .claim_space = dquot_claim_space,
--- /dev/null
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c 2011-03-11 15:27:08.000000000 +0800
++++ linux-stage/fs/ext4/super.c 2011-03-11 15:29:41.023089829 +0800
+@@ -72,6 +72,8 @@
+ static int ext4_freeze(struct super_block *sb);
+
+
++static int bigendian_extents;
++
+ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *bg)
+ {
+@@ -1492,7 +1494,7 @@
+ Opt_block_validity, Opt_noblock_validity,
+ Opt_inode_readahead_blks, Opt_journal_ioprio,
+ Opt_discard, Opt_nodiscard,
+- Opt_mballoc,
++ Opt_mballoc, Opt_bigendian_extents,
+ };
+
+ static const match_table_t tokens = {
+@@ -1559,6 +1561,7 @@
+ {Opt_auto_da_alloc, "auto_da_alloc=%u"},
+ {Opt_auto_da_alloc, "auto_da_alloc"},
+ {Opt_noauto_da_alloc, "noauto_da_alloc"},
++ {Opt_bigendian_extents, "bigendian_extents"},
+ {Opt_mballoc, "mballoc"},
+ {Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
+@@ -1996,6 +1999,9 @@
+ break;
+ case Opt_mballoc:
+ break;
++ case Opt_bigendian_extents:
++ bigendian_extents = 1;
++ break;
+ case Opt_discard:
+ set_opt(sbi->s_mount_opt, DISCARD);
+ break;
+@@ -3073,6 +3079,16 @@
+ goto failed_mount;
+ }
+
++#ifdef __BIG_ENDIAN
++ if (bigendian_extents == 0) {
++ printk(KERN_ERR "EXT4-fs: extents feature is not guaranteed to "
++ "work on big-endian systems. Use \"bigendian_extents\" "
++ "mount option to override.\n");
++ goto failed_mount;
++ }
++#endif
++
++
+ #ifdef CONFIG_PROC_FS
+ if (ext4_proc_root)
+ sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
--- /dev/null
+Index: linux-stage/fs/ext4/ext4.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4.h 2011-03-14 16:16:45.000000000 +0800
++++ linux-stage/fs/ext4/ext4.h 2011-03-14 16:17:08.732676431 +0800
+@@ -758,7 +758,8 @@
+ /*
+ * Mount flags
+ */
+-#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */
++#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Disable mbcache */
++#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */
+ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
+ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
+ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c 2011-03-14 16:16:45.000000000 +0800
++++ linux-stage/fs/ext4/super.c 2011-03-14 16:18:13.831956469 +0800
+@@ -1502,6 +1502,7 @@
+ Opt_inode_readahead_blks, Opt_journal_ioprio,
+ Opt_discard, Opt_nodiscard,
+ Opt_mballoc, Opt_bigendian_extents, Opt_force_over_16tb,
++ Opt_no_mbcache,
+ Opt_extents, Opt_noextents,
+ };
+
+@@ -1574,6 +1575,7 @@
+ {Opt_mballoc, "mballoc"},
+ {Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
++ {Opt_no_mbcache, "no_mbcache"},
+ {Opt_extents, "extents"},
+ {Opt_noextents, "noextents"},
+ {Opt_err, NULL},
+@@ -2049,6 +2051,9 @@
+ }
+ clear_opt(sbi->s_mount_opt, EXTENTS);
+ break;
++ case Opt_no_mbcache:
++ set_opt(sbi->s_mount_opt, NO_MBCACHE);
++ break;
+ default:
+ ext4_msg(sb, KERN_ERR,
+ "Unrecognized mount option \"%s\" "
+Index: linux-stage/fs/ext4/xattr.c
+===================================================================
+--- linux-stage.orig/fs/ext4/xattr.c 2011-03-14 16:16:43.000000000 +0800
++++ linux-stage/fs/ext4/xattr.c 2011-03-14 16:17:08.806677883 +0800
+@@ -86,7 +86,8 @@
+ # define ea_bdebug(f...)
+ #endif
+
+-static void ext4_xattr_cache_insert(struct buffer_head *);
++static void ext4_xattr_cache_insert(struct super_block *,
++ struct buffer_head *);
+ static struct buffer_head *ext4_xattr_cache_find(struct inode *,
+ struct ext4_xattr_header *,
+ struct mb_cache_entry **);
+@@ -234,7 +235,7 @@
+ error = -EIO;
+ goto cleanup;
+ }
+- ext4_xattr_cache_insert(bh);
++ ext4_xattr_cache_insert(inode->i_sb, bh);
+ entry = BFIRST(bh);
+ error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
+ if (error == -EIO)
+@@ -376,7 +377,7 @@
+ error = -EIO;
+ goto cleanup;
+ }
+- ext4_xattr_cache_insert(bh);
++ ext4_xattr_cache_insert(inode->i_sb, bh);
+ error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
+
+ cleanup:
+@@ -473,7 +474,9 @@
+ struct mb_cache_entry *ce = NULL;
+ int error = 0;
+
+- ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
++ if (!test_opt(inode->i_sb, NO_MBCACHE))
++ ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev,
++ bh->b_blocknr);
+ error = ext4_journal_get_write_access(handle, bh);
+ if (error)
+ goto out;
+@@ -700,8 +703,10 @@
+ if (i->value && i->value_len > sb->s_blocksize)
+ return -ENOSPC;
+ if (s->base) {
+- ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
+- bs->bh->b_blocknr);
++ if (!test_opt(inode->i_sb, NO_MBCACHE))
++ ce = mb_cache_entry_get(ext4_xattr_cache,
++ bs->bh->b_bdev,
++ bs->bh->b_blocknr);
+ error = ext4_journal_get_write_access(handle, bs->bh);
+ if (error)
+ goto cleanup;
+@@ -718,7 +723,7 @@
+ if (!IS_LAST_ENTRY(s->first))
+ ext4_xattr_rehash(header(s->base),
+ s->here);
+- ext4_xattr_cache_insert(bs->bh);
++ ext4_xattr_cache_insert(sb, bs->bh);
+ }
+ unlock_buffer(bs->bh);
+ if (error == -EIO)
+@@ -801,7 +806,8 @@
+ if (error)
+ goto cleanup_dquot;
+ }
+- mb_cache_entry_release(ce);
++ if (ce)
++ mb_cache_entry_release(ce);
+ ce = NULL;
+ } else if (bs->bh && s->base == bs->bh->b_data) {
+ /* We were modifying this block in-place. */
+@@ -845,7 +851,7 @@
+ memcpy(new_bh->b_data, s->base, new_bh->b_size);
+ set_buffer_uptodate(new_bh);
+ unlock_buffer(new_bh);
+- ext4_xattr_cache_insert(new_bh);
++ ext4_xattr_cache_insert(sb, new_bh);
+ error = ext4_handle_dirty_metadata(handle,
+ inode, new_bh);
+ if (error)
+@@ -1403,12 +1409,15 @@
+ * Returns 0, or a negative error number on failure.
+ */
+ static void
+-ext4_xattr_cache_insert(struct buffer_head *bh)
++ext4_xattr_cache_insert(struct super_block *sb, struct buffer_head *bh)
+ {
+ __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
+ struct mb_cache_entry *ce;
+ int error;
+
++ if (test_opt(sb, NO_MBCACHE))
++ return;
++
+ ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS);
+ if (!ce) {
+ ea_bdebug(bh, "out of memory");
+@@ -1482,6 +1491,8 @@
+ __u32 hash = le32_to_cpu(header->h_hash);
+ struct mb_cache_entry *ce;
+
++ if (test_opt(inode->i_sb, NO_MBCACHE))
++ return NULL;
+ if (!header->h_hash)
+ return NULL; /* never share */
+ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
--- /dev/null
+Index: linux-stage/fs/ext4/dynlocks.c
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ linux-stage/fs/ext4/dynlocks.c 2011-03-03 15:25:04.025526781 +0800
+@@ -0,0 +1,236 @@
++/*
++ * Dynamic Locks
++ *
++ * struct dynlock is lockspace
++ * one may request lock (exclusive or shared) for some value
++ * in that lockspace
++ *
++ */
++
++#include <linux/dynlocks.h>
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/sched.h>
++
++#define DYNLOCK_HANDLE_MAGIC 0xd19a10c
++#define DYNLOCK_HANDLE_DEAD 0xd1956ee
++#define DYNLOCK_LIST_MAGIC 0x11ee91e6
++
++static struct kmem_cache * dynlock_cachep = NULL;
++
++struct dynlock_handle {
++ unsigned dh_magic;
++ struct list_head dh_list;
++ unsigned long dh_value; /* lock value */
++ int dh_refcount; /* number of users */
++ int dh_readers;
++ int dh_writers;
++ int dh_pid; /* holder of the lock */
++ wait_queue_head_t dh_wait;
++};
++
++int __init dynlock_cache_init(void)
++{
++ int rc = 0;
++
++ printk(KERN_INFO "init dynlocks cache\n");
++ dynlock_cachep = kmem_cache_create("dynlock_cache",
++ sizeof(struct dynlock_handle),
++ 0,
++ SLAB_HWCACHE_ALIGN,
++ NULL);
++ if (dynlock_cachep == NULL) {
++ printk(KERN_ERR "Not able to create dynlock cache");
++ rc = -ENOMEM;
++ }
++ return rc;
++}
++
++void __exit dynlock_cache_exit(void)
++{
++ printk(KERN_INFO "exit dynlocks cache\n");
++ kmem_cache_destroy(dynlock_cachep);
++}
++
++/*
++ * dynlock_init
++ *
++ * initialize lockspace
++ *
++ */
++void dynlock_init(struct dynlock *dl)
++{
++ spin_lock_init(&dl->dl_list_lock);
++ INIT_LIST_HEAD(&dl->dl_list);
++ dl->dl_magic = DYNLOCK_LIST_MAGIC;
++}
++EXPORT_SYMBOL(dynlock_init);
++
++/*
++ * dynlock_lock
++ *
++ * acquires lock (exclusive or shared) in specified lockspace
++ * each lock in lockspace is allocated separately, so user have
++ * to specify GFP flags.
++ * routine returns pointer to lock. this pointer is intended to
++ * be passed to dynlock_unlock
++ *
++ */
++struct dynlock_handle *dynlock_lock(struct dynlock *dl, unsigned long value,
++ enum dynlock_type lt, gfp_t gfp)
++{
++ struct dynlock_handle *nhl = NULL;
++ struct dynlock_handle *hl;
++
++ BUG_ON(dl == NULL);
++ BUG_ON(dl->dl_magic != DYNLOCK_LIST_MAGIC);
++
++repeat:
++ /* find requested lock in lockspace */
++ spin_lock(&dl->dl_list_lock);
++ BUG_ON(dl->dl_list.next == NULL);
++ BUG_ON(dl->dl_list.prev == NULL);
++ list_for_each_entry(hl, &dl->dl_list, dh_list) {
++ BUG_ON(hl->dh_list.next == NULL);
++ BUG_ON(hl->dh_list.prev == NULL);
++ BUG_ON(hl->dh_magic != DYNLOCK_HANDLE_MAGIC);
++ if (hl->dh_value == value) {
++ /* lock is found */
++ if (nhl) {
++ /* someone else just allocated
++ * lock we didn't find and just created
++ * so, we drop our lock
++ */
++ kmem_cache_free(dynlock_cachep, nhl);
++ nhl = NULL;
++ }
++ hl->dh_refcount++;
++ goto found;
++ }
++ }
++ /* lock not found */
++ if (nhl) {
++ /* we already have allocated lock. use it */
++ hl = nhl;
++ nhl = NULL;
++ list_add(&hl->dh_list, &dl->dl_list);
++ goto found;
++ }
++ spin_unlock(&dl->dl_list_lock);
++
++ /* lock not found and we haven't allocated lock yet. allocate it */
++ nhl = kmem_cache_alloc(dynlock_cachep, gfp);
++ if (nhl == NULL)
++ return NULL;
++ nhl->dh_refcount = 1;
++ nhl->dh_value = value;
++ nhl->dh_readers = 0;
++ nhl->dh_writers = 0;
++ nhl->dh_magic = DYNLOCK_HANDLE_MAGIC;
++ init_waitqueue_head(&nhl->dh_wait);
++
++ /* while lock is being allocated, someone else may allocate it
++ * and put onto to list. check this situation
++ */
++ goto repeat;
++
++found:
++ if (lt == DLT_WRITE) {
++ /* exclusive lock: user don't want to share lock at all
++ * NOTE: one process may take the same lock several times
++ * this functionaly is useful for rename operations */
++ while ((hl->dh_writers && hl->dh_pid != current->pid) ||
++ hl->dh_readers) {
++ spin_unlock(&dl->dl_list_lock);
++ wait_event(hl->dh_wait,
++ hl->dh_writers == 0 && hl->dh_readers == 0);
++ spin_lock(&dl->dl_list_lock);
++ }
++ hl->dh_writers++;
++ } else {
++ /* shared lock: user do not want to share lock with writer */
++ while (hl->dh_writers) {
++ spin_unlock(&dl->dl_list_lock);
++ wait_event(hl->dh_wait, hl->dh_writers == 0);
++ spin_lock(&dl->dl_list_lock);
++ }
++ hl->dh_readers++;
++ }
++ hl->dh_pid = current->pid;
++ spin_unlock(&dl->dl_list_lock);
++
++ return hl;
++}
++EXPORT_SYMBOL(dynlock_lock);
++
++
++/*
++ * dynlock_unlock
++ *
++ * user have to specify lockspace (dl) and pointer to lock structure
++ * returned by dynlock_lock()
++ *
++ */
++void dynlock_unlock(struct dynlock *dl, struct dynlock_handle *hl)
++{
++ int wakeup = 0;
++
++ BUG_ON(dl == NULL);
++ BUG_ON(hl == NULL);
++ BUG_ON(dl->dl_magic != DYNLOCK_LIST_MAGIC);
++
++ if (hl->dh_magic != DYNLOCK_HANDLE_MAGIC)
++ printk(KERN_EMERG "wrong lock magic: %#x\n", hl->dh_magic);
++
++ BUG_ON(hl->dh_magic != DYNLOCK_HANDLE_MAGIC);
++ BUG_ON(hl->dh_writers != 0 && current->pid != hl->dh_pid);
++
++ spin_lock(&dl->dl_list_lock);
++ if (hl->dh_writers) {
++ BUG_ON(hl->dh_readers != 0);
++ hl->dh_writers--;
++ if (hl->dh_writers == 0)
++ wakeup = 1;
++ } else if (hl->dh_readers) {
++ hl->dh_readers--;
++ if (hl->dh_readers == 0)
++ wakeup = 1;
++ } else {
++ BUG();
++ }
++ if (wakeup) {
++ hl->dh_pid = 0;
++ wake_up(&hl->dh_wait);
++ }
++ if (--(hl->dh_refcount) == 0) {
++ hl->dh_magic = DYNLOCK_HANDLE_DEAD;
++ list_del(&hl->dh_list);
++ kmem_cache_free(dynlock_cachep, hl);
++ }
++ spin_unlock(&dl->dl_list_lock);
++}
++EXPORT_SYMBOL(dynlock_unlock);
++
++int dynlock_is_locked(struct dynlock *dl, unsigned long value)
++{
++ struct dynlock_handle *hl;
++ int result = 0;
++
++ /* find requested lock in lockspace */
++ spin_lock(&dl->dl_list_lock);
++ BUG_ON(dl->dl_list.next == NULL);
++ BUG_ON(dl->dl_list.prev == NULL);
++ list_for_each_entry(hl, &dl->dl_list, dh_list) {
++ BUG_ON(hl->dh_list.next == NULL);
++ BUG_ON(hl->dh_list.prev == NULL);
++ BUG_ON(hl->dh_magic != DYNLOCK_HANDLE_MAGIC);
++ if (hl->dh_value == value && hl->dh_pid == current->pid) {
++ /* lock is found */
++ result = 1;
++ break;
++ }
++ }
++ spin_unlock(&dl->dl_list_lock);
++ return result;
++}
++EXPORT_SYMBOL(dynlock_is_locked);
+Index: linux-stage/include/linux/dynlocks.h
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ linux-stage/include/linux/dynlocks.h 2011-03-03 15:25:04.055526552 +0800
+@@ -0,0 +1,34 @@
++#ifndef _LINUX_DYNLOCKS_H
++#define _LINUX_DYNLOCKS_H
++
++#include <linux/list.h>
++#include <linux/wait.h>
++
++struct dynlock_handle;
++
++/*
++ * lock's namespace:
++ * - list of locks
++ * - lock to protect this list
++ */
++struct dynlock {
++ unsigned dl_magic;
++ struct list_head dl_list;
++ spinlock_t dl_list_lock;
++};
++
++enum dynlock_type {
++ DLT_WRITE,
++ DLT_READ
++};
++
++int dynlock_cache_init(void);
++void dynlock_cache_exit(void);
++void dynlock_init(struct dynlock *dl);
++struct dynlock_handle *dynlock_lock(struct dynlock *dl, unsigned long value,
++ enum dynlock_type lt, gfp_t gfp);
++void dynlock_unlock(struct dynlock *dl, struct dynlock_handle *lock);
++int dynlock_is_locked(struct dynlock *dl, unsigned long value);
++
++#endif
++
+Index: linux-stage/fs/ext4/Makefile
+===================================================================
+--- linux-stage.orig/fs/ext4/Makefile 2011-03-05 11:50:43.000000000 +0800
++++ linux-stage/fs/ext4/Makefile 2011-03-05 11:52:42.349154982 +0800
+@@ -6,7 +6,8 @@
+
+ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
+- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
++ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
++ dynlocks.o
+
+ ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c 2011-03-05 11:50:43.000000000 +0800
++++ linux-stage/fs/ext4/super.c 2011-03-05 11:57:33.632869451 +0800
+@@ -4457,17 +4457,20 @@
+ return err;
+ ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
+ if (!ext4_kset)
+- goto out4;
++ goto out5;
+ ext4_proc_root = proc_mkdir("fs/ext4", NULL);
+ err = init_ext4_mballoc();
+ if (err)
+- goto out3;
++ goto out4;
+
+ err = init_ext4_xattr();
+ if (err)
+- goto out2;
++ goto out3;
+ err = init_inodecache();
+ if (err)
++ goto out2;
++ err = dynlock_cache_init();
++ if (err)
+ goto out1;
+ err = register_filesystem(&ext4_fs_type);
+ if (err)
+@@ -4477,15 +4480,17 @@
+
+ return 0;
+ out:
+- destroy_inodecache();
++ dynlock_cache_exit();
+ out1:
+- exit_ext4_xattr();
++ destroy_inodecache();
+ out2:
+- exit_ext4_mballoc();
++ exit_ext4_xattr();
+ out3:
++ exit_ext4_mballoc();
++out4:
+ remove_proc_entry("fs/ext4", NULL);
+ kset_unregister(ext4_kset);
+-out4:
++out5:
+ exit_ext4_system_zone();
+ return err;
+ }
+@@ -4493,6 +4498,7 @@
+ static void __exit exit_ext4_fs(void)
+ {
+ unregister_filesystem(&ext4_fs_type);
++ dynlock_cache_exit();
+ destroy_inodecache();
+ exit_ext4_xattr();
+ exit_ext4_mballoc();
--- /dev/null
+Index: linux-stage/fs/ext4/ext4.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4.h 2011-03-14 15:57:13.613674482 +0800
++++ linux-stage/fs/ext4/ext4.h 2011-03-14 15:57:22.031906980 +0800
+@@ -780,6 +780,7 @@
+ #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
+ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
+ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
++#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */
+ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
+ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
+ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
+Index: linux-stage/fs/ext4/ext4_jbd2.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4_jbd2.h 2011-03-14 15:57:12.000000000 +0800
++++ linux-stage/fs/ext4/ext4_jbd2.h 2011-03-14 15:58:55.957499110 +0800
+@@ -33,7 +33,7 @@
+
+ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
+ (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
+- ? 27U : 8U)
++ || test_opt(sb, EXTENTS) ? 27U : 8U)
+
+ #define ext4_journal_dirty_metadata(handle, bh) \
+ ext4_handle_dirty_metadata(handle, NULL, bh)
+Index: linux-stage/fs/ext4/extents.c
+===================================================================
+--- linux-stage.orig/fs/ext4/extents.c 2011-03-14 15:57:12.000000000 +0800
++++ linux-stage/fs/ext4/extents.c 2011-03-14 16:14:14.246265207 +0800
+@@ -2553,7 +2553,7 @@
+ * possible initialization would be here
+ */
+
+- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
++ if (test_opt(sb, EXTENTS)) {
+ #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
+ printk(KERN_INFO "EXT4-fs: file extents enabled");
+ #ifdef AGGRESSIVE_TEST
+@@ -2580,7 +2580,7 @@
+ */
+ void ext4_ext_release(struct super_block *sb)
+ {
+- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
++ if (!test_opt(sb, EXTENTS))
+ return;
+
+ #ifdef EXTENTS_STATS
+Index: linux-stage/fs/ext4/ialloc.c
+===================================================================
+--- linux-stage.orig/fs/ext4/ialloc.c 2011-03-14 15:57:13.000000000 +0800
++++ linux-stage/fs/ext4/ialloc.c 2011-03-14 16:02:03.334308846 +0800
+@@ -1049,7 +1049,7 @@
+ if (err)
+ goto fail_free_drop;
+
+- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
++ if (test_opt(sb, EXTENTS)) {
+ /* set extent flag only for directory, file and normal symlink*/
+ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
+ EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
+Index: linux-stage/fs/ext4/migrate.c
+===================================================================
+--- linux-stage.orig/fs/ext4/migrate.c 2011-03-14 15:36:15.000000000 +0800
++++ linux-stage/fs/ext4/migrate.c 2011-03-14 16:05:39.083369164 +0800
+@@ -459,13 +459,13 @@
+ unsigned long max_entries;
+ __u32 goal;
+
+- /*
+- * If the filesystem does not support extents, or the inode
+- * already is extent-based, error out.
+- */
+- if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+- EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+- (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++ if (!test_opt(inode->i_sb, EXTENTS))
++ /*
++ * if mounted with noextents we don't allow the migrate
++ */
++ return -EINVAL;
++
++ if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ return -EINVAL;
+
+ if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c 2011-03-14 15:57:18.000000000 +0800
++++ linux-stage/fs/ext4/super.c 2011-03-14 16:11:58.234626200 +0800
+@@ -942,6 +942,8 @@
+ seq_puts(seq, ",journal_async_commit");
+ if (test_opt(sb, NOBH))
+ seq_puts(seq, ",nobh");
++ if (!test_opt(sb, EXTENTS))
++ seq_puts(seq, ",noextents");
+ if (test_opt(sb, I_VERSION))
+ seq_puts(seq, ",i_version");
+ if (!test_opt(sb, DELALLOC))
+@@ -1500,6 +1502,7 @@
+ Opt_inode_readahead_blks, Opt_journal_ioprio,
+ Opt_discard, Opt_nodiscard,
+ Opt_mballoc, Opt_bigendian_extents, Opt_force_over_16tb,
++ Opt_extents, Opt_noextents,
+ };
+
+ static const match_table_t tokens = {
+@@ -1571,6 +1574,8 @@
+ {Opt_mballoc, "mballoc"},
+ {Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
++ {Opt_extents, "extents"},
++ {Opt_noextents, "noextents"},
+ {Opt_err, NULL},
+ };
+
+@@ -1613,6 +1618,7 @@
+ int qtype, qfmt;
+ char *qname;
+ #endif
++ ext4_fsblk_t last_block;
+
+ if (!options)
+ return 1;
+@@ -2017,6 +2023,32 @@
+ case Opt_force_over_16tb:
+ force_over_16tb = 1;
+ break;
++ case Opt_extents:
++ if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
++ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
++ ext4_warning(sb, "extents feature not enabled "
++ "on this filesystem, use tune2fs");
++ return 0;
++ }
++ set_opt(sbi->s_mount_opt, EXTENTS);
++ break;
++ case Opt_noextents:
++ /*
++ * When e2fsprogs support resizing an already existing
++ * ext4 file system to greater than 2**32 we need to
++ * add support to block allocator to handle growing
++ * already existing block mapped inode so that blocks
++ * allocated for them fall within 2**32
++ */
++ last_block = ext4_blocks_count(sbi->s_es) - 1;
++ if (last_block > 0xffffffffULL) {
++ printk(KERN_ERR "EXT4-fs: Filesystem too "
++ "large to mount with "
++ "-o noextents options\n");
++ return 0;
++ }
++ clear_opt(sbi->s_mount_opt, EXTENTS);
++ break;
+ default:
+ ext4_msg(sb, KERN_ERR,
+ "Unrecognized mount option \"%s\" "
+@@ -2879,6 +2911,17 @@
+ set_opt(sbi->s_mount_opt, BARRIER);
+
+ /*
++ * turn on extents feature by default in ext4 filesystem
++ * only if feature flag already set by mkfs or tune2fs.
++ * Use -o noextents to turn it off
++ */
++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
++ set_opt(sbi->s_mount_opt, EXTENTS);
++ else
++ ext4_warning(sb, "extents feature not enabled on this filesystem, "
++ "use tune2fs.");
++
++ /*
+ * enable delayed allocation by default
+ * Use -o nodelalloc to turn it off
+ */
--- /dev/null
+This patch adds direct EXT4_IOC_FIEMAP support to ldiskfs, for Lustre to call
+without having to go through do_vfs_ioctl() (which isn't exported, and has a
+number of other ioctls which are not suitable for Lustre). The actual FIEMAP
+support is already in the kernel/ext4 for normal usage.
+
+Index: linux-stage/fs/ext4/ext4.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4.h 2011-03-05 12:34:16.458850451 +0800
++++ linux-stage/fs/ext4/ext4.h 2011-03-05 12:35:25.338882364 +0800
+@@ -405,7 +405,7 @@
+ #define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
+ #define EXT4_IOC_MIGRATE _IO('f', 9)
+ /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
+- /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
++#define EXT4_IOC_FIEMAP _IOWR('f', 11, struct fiemap)
+ #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
+ #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
+
+Index: linux-stage/fs/ext4/ioctl.c
+===================================================================
+--- linux-stage.orig/fs/ext4/ioctl.c 2011-03-05 12:34:11.299779163 +0800
++++ linux-stage/fs/ext4/ioctl.c 2011-03-05 12:34:16.862856069 +0800
+@@ -18,6 +18,71 @@
+ #include "ext4_jbd2.h"
+ #include "ext4.h"
+
++/* So that the fiemap access checks can't overflow on 32 bit machines. */
++#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent))
++
++static int fiemap_check_ranges(struct super_block *sb,
++ u64 start, u64 len, u64 *new_len)
++{
++ *new_len = len;
++
++ if (len == 0)
++ return -EINVAL;
++
++ if (start > sb->s_maxbytes)
++ return -EFBIG;
++
++ /*
++ * Shrink request scope to what the fs can actually handle.
++ */
++ if ((len > sb->s_maxbytes) ||
++ (sb->s_maxbytes - len) < start)
++ *new_len = sb->s_maxbytes - start;
++
++ return 0;
++}
++
++int ioctl_fiemap(struct inode *inode, struct file *filp, unsigned long arg)
++{
++ struct fiemap fiemap;
++ u64 len;
++ struct fiemap_extent_info fieinfo = {0, };
++ struct super_block *sb = inode->i_sb;
++ int error = 0;
++
++ if (copy_from_user(&fiemap, (struct fiemap __user *) arg,
++ sizeof(struct fiemap)))
++ return -EFAULT;
++
++ if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
++ return -EINVAL;
++
++ error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
++ &len);
++ if (error)
++ return error;
++
++ fieinfo.fi_flags = fiemap.fm_flags;
++ fieinfo.fi_extents_max = fiemap.fm_extent_count;
++ fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap));
++
++ if (fiemap.fm_extent_count != 0 &&
++ !access_ok(VERIFY_WRITE, (void *)arg,
++ offsetof(typeof(fiemap), fm_extents[fiemap.fm_extent_count])))
++ return -EFAULT;
++
++ if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
++ filemap_write_and_wait(inode->i_mapping);
++
++ error = ext4_fiemap(inode, &fieinfo, fiemap.fm_start, len);
++ fiemap.fm_flags = fieinfo.fi_flags;
++ fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
++ if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap)))
++ error = -EFAULT;
++
++ return error;
++}
++
+ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+ {
+ struct inode *inode = filp->f_dentry->d_inode;
+@@ -330,6 +395,9 @@
+ mnt_drop_write(filp->f_path.mnt);
+ return err;
+ }
++ case EXT4_IOC_FIEMAP: {
++ return ioctl_fiemap(inode, filp, arg);
++ }
+
+ default:
+ return -ENOTTY;
+Index: linux-stage/fs/ext4/fiemap.h
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ linux-stage/fs/ext4/fiemap.h 2011-03-05 12:36:24.606879702 +0800
+@@ -0,0 +1,2 @@
++
++#include_next <fiemap.h>
--- /dev/null
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c 2011-03-11 15:35:15.680343230 +0800
++++ linux-stage/fs/ext4/super.c 2011-03-11 15:44:45.037632078 +0800
+@@ -55,6 +55,8 @@
+ struct proc_dir_entry *ext4_proc_root;
+ static struct kset *ext4_kset;
+
++static int force_over_16tb;
++
+ static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
+ unsigned long journal_devnum);
+ static int ext4_commit_super(struct super_block *sb, int sync);
+@@ -1494,7 +1496,7 @@
+ Opt_block_validity, Opt_noblock_validity,
+ Opt_inode_readahead_blks, Opt_journal_ioprio,
+ Opt_discard, Opt_nodiscard,
+- Opt_mballoc, Opt_bigendian_extents,
++ Opt_mballoc, Opt_bigendian_extents, Opt_force_over_16tb,
+ };
+
+ static const match_table_t tokens = {
+@@ -1562,6 +1564,7 @@
+ {Opt_auto_da_alloc, "auto_da_alloc"},
+ {Opt_noauto_da_alloc, "noauto_da_alloc"},
+ {Opt_bigendian_extents, "bigendian_extents"},
++	{Opt_force_over_16tb, "force_over_16tb"},
+ {Opt_mballoc, "mballoc"},
+ {Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
+@@ -2008,6 +2011,9 @@
+ break;
+ case Opt_mballoc:
+ break;
++ case Opt_force_over_16tb:
++ force_over_16tb = 1;
++ break;
+ default:
+ ext4_msg(sb, KERN_ERR,
+ "Unrecognized mount option \"%s\" "
+@@ -3031,6 +3037,16 @@
+ goto failed_mount;
+ }
+
++ if (ext4_blocks_count(es) >= (1ULL << 32)) {
++ if (force_over_16tb == 0) {
++ printk(KERN_ERR "EXT4-fs does not support filesystems "
++ "greater than 16TB and can cause data corruption. "
++ "Use \"force_over_16tb\" mount option to override."
++ "\n");
++ goto failed_mount;
++ }
++ }
++
+ if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
+ goto cantfind_ext4;
+
--- /dev/null
+Index: linux-2.6.32-el6-beta/fs/ext4/inode.c
+===================================================================
+--- linux-2.6.32-el6-beta.orig/fs/ext4/inode.c
++++ linux-2.6.32-el6-beta/fs/ext4/inode.c
+@@ -4920,11 +4920,11 @@ struct inode *ext4_iget(struct super_blo
+ EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
+ EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
+
+- inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
++ ei->i_fs_version = le32_to_cpu(raw_inode->i_disk_version);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+- inode->i_version |=
+- (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
++ ei->i_fs_version |= (__u64)(le32_to_cpu(raw_inode->i_version_hi))
++ << 32;
+ }
+
+ ret = 0;
+@@ -5134,11 +5134,11 @@ static int ext4_do_update_inode(handle_t
+ for (block = 0; block < EXT4_N_BLOCKS; block++)
+ raw_inode->i_block[block] = ei->i_data[block];
+
+- raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
++ raw_inode->i_disk_version = cpu_to_le32(ei->i_fs_version);
+ if (ei->i_extra_isize) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+- raw_inode->i_version_hi =
+- cpu_to_le32(inode->i_version >> 32);
++ raw_inode->i_version_hi = cpu_to_le32(ei->i_fs_version
++ >> 32);
+ raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+ }
+
+Index: linux-2.6.32-el6-beta/fs/ext4/ialloc.c
+===================================================================
+--- linux-2.6.32-el6-beta.orig/fs/ext4/ialloc.c
++++ linux-2.6.32-el6-beta/fs/ext4/ialloc.c
+@@ -1018,6 +1018,7 @@ got:
+ ei->i_dtime = 0;
+ ei->i_block_group = group;
+ ei->i_last_alloc_group = ~0;
++ ei->i_fs_version = 0;
+
+ ext4_set_inode_flags(inode);
+ if (IS_DIRSYNC(inode))
+Index: linux-2.6.32-el6-beta/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.32-el6-beta.orig/fs/ext4/ext4.h
++++ linux-2.6.32-el6-beta/fs/ext4/ext4.h
+@@ -714,8 +714,12 @@ struct ext4_inode_info {
+ */
+ tid_t i_sync_tid;
+ tid_t i_datasync_tid;
++
++ __u64 i_fs_version;
+ };
+
++#define HAVE_DISK_INODE_VERSION
++
+ /*
+ * File system states
+ */
--- /dev/null
+Removes the static definition of the dx_root struct so that the "." and ".."
+dirents can carry extra data. This patch does not change any functionality,
+but is required for the ext4_data_in_dirent patch.
+
+Index: linux-2.6.32.i386/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.32.i386.orig/fs/ext4/namei.c 2010-04-16 05:35:06.000000000 +0530
++++ linux-2.6.32.i386/fs/ext4/namei.c 2010-04-16 05:47:41.000000000 +0530
+@@ -115,22 +115,13 @@
+ * hash version mod 4 should never be 0. Sincerely, the paranoia department.
+ */
+
+-struct dx_root
++struct dx_root_info
+ {
+- struct fake_dirent dot;
+- char dot_name[4];
+- struct fake_dirent dotdot;
+- char dotdot_name[4];
+- struct dx_root_info
+- {
+- __le32 reserved_zero;
+- u8 hash_version;
+- u8 info_length; /* 8 */
+- u8 indirect_levels;
+- u8 unused_flags;
+- }
+- info;
+- struct dx_entry entries[0];
++ __le32 reserved_zero;
++ u8 hash_version;
++ u8 info_length; /* 8 */
++ u8 indirect_levels;
++ u8 unused_flags;
+ };
+
+ struct dx_node
+@@ -244,6 +235,16 @@
+ * Future: use high four bits of block for coalesce-on-delete flags
+ * Mask them off for now.
+ */
++struct dx_root_info * dx_get_dx_info(struct ext4_dir_entry_2 *de)
++{
++ /* get dotdot first */
++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1));
++
++ /* dx root info is after dotdot entry */
++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2));
++
++ return (struct dx_root_info *) de;
++}
+
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
+ {
+@@ -398,7 +399,7 @@
+ {
+ unsigned count, indirect;
+ struct dx_entry *at, *entries, *p, *q, *m;
+- struct dx_root *root;
++ struct dx_root_info * info;
+ struct buffer_head *bh;
+ struct dx_frame *frame = frame_in;
+ u32 hash;
+@@ -406,17 +407,18 @@
+ frame->bh = NULL;
+ if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
+ goto fail;
+- root = (struct dx_root *) bh->b_data;
+- if (root->info.hash_version != DX_HASH_TEA &&
+- root->info.hash_version != DX_HASH_HALF_MD4 &&
+- root->info.hash_version != DX_HASH_LEGACY) {
++
++ info = dx_get_dx_info((struct ext4_dir_entry_2*)bh->b_data);
++ if (info->hash_version != DX_HASH_TEA &&
++ info->hash_version != DX_HASH_HALF_MD4 &&
++ info->hash_version != DX_HASH_LEGACY) {
+ ext4_warning(dir->i_sb, "Unrecognised inode hash code %d for directory "
+- "#%lu", root->info.hash_version, dir->i_ino);
++ "#%lu", info->hash_version, dir->i_ino);
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+ }
+- hinfo->hash_version = root->info.hash_version;
++ hinfo->hash_version = info->hash_version;
+ if (hinfo->hash_version <= DX_HASH_TEA)
+ hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
+ hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+@@ -425,27 +427,26 @@
+ ext4fs_dirhash(d_name->name, d_name->len, hinfo);
+ hash = hinfo->hash;
+
+- if (root->info.unused_flags & 1) {
++ if (info->unused_flags & 1) {
+ ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
+- root->info.unused_flags);
++ info->unused_flags);
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+ }
+
+- if ((indirect = root->info.indirect_levels) > 1) {
++ if ((indirect = info->indirect_levels) > 1) {
+ ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
+- root->info.indirect_levels);
++ info->indirect_levels);
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+ }
+
+- entries = (struct dx_entry *) (((char *)&root->info) +
+- root->info.info_length);
++ entries = (struct dx_entry *) (((char *)info) + info->info_length);
+
+ if (dx_get_limit(entries) != dx_root_limit(dir,
+- root->info.info_length)) {
++ info->info_length)) {
+ ext4_warning(dir->i_sb, "dx entry: limit != root limit");
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+@@ -525,10 +526,12 @@ fail:
+
+ static void dx_release (struct dx_frame *frames)
+ {
++ struct dx_root_info *info;
+ if (frames[0].bh == NULL)
+ return;
+
+- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
++ info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data);
++ if (info->indirect_levels)
+ brelse(frames[1].bh);
+ brelse(frames[0].bh);
+ }
+@@ -1447,17 +1450,16 @@
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ struct buffer_head *bh2;
+- struct dx_root *root;
+ struct dx_frame frames[2], *frame;
+ struct dx_entry *entries;
+- struct ext4_dir_entry_2 *de, *de2;
++ struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
+ char *data1, *top;
+ unsigned len;
+ int retval;
+ unsigned blocksize;
+ struct dx_hash_info hinfo;
+ ext4_lblk_t block;
+- struct fake_dirent *fde;
++ struct dx_root_info *dx_info;
+
+ blocksize = dir->i_sb->s_blocksize;
+ dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
+@@ -1467,20 +1469,21 @@
+ brelse(bh);
+ return retval;
+ }
+- root = (struct dx_root *) bh->b_data;
++
++ dot_de = (struct ext4_dir_entry_2 *) bh->b_data;
++ dotdot_de = ext4_next_entry(dot_de, blocksize);
+
+ /* The 0th block becomes the root, move the dirents out */
+- fde = &root->dotdot;
+- de = (struct ext4_dir_entry_2 *)((char *)fde +
+- ext4_rec_len_from_disk(fde->rec_len, blocksize));
+- if ((char *) de >= (((char *) root) + blocksize)) {
++ de = (struct ext4_dir_entry_2 *)((char *)dotdot_de +
++ ext4_rec_len_from_disk(dotdot_de->rec_len, blocksize));
++ if ((char *) de >= (((char *) dot_de) + blocksize)) {
+ ext4_error(dir->i_sb,
+ "invalid rec_len for '..' in inode %lu",
+ dir->i_ino);
+ brelse(bh);
+ return -EIO;
+ }
+- len = ((char *) root) + blocksize - (char *) de;
++ len = ((char *) dot_de) + blocksize - (char *) de;
+
+ /* Allocate new block for the 0th block's dirents */
+ bh2 = ext4_append(handle, dir, &block, &retval);
+@@ -1499,19 +1502,23 @@
+ de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+ blocksize);
+ /* Initialize the root; the dot dirents already exist */
+- de = (struct ext4_dir_entry_2 *) (&root->dotdot);
+- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
+- blocksize);
+- memset (&root->info, 0, sizeof(root->info));
+- root->info.info_length = sizeof(root->info);
+- root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+- entries = root->entries;
++ dotdot_de->rec_len = ext4_rec_len_to_disk(blocksize -
++ le16_to_cpu(dot_de->rec_len), blocksize);
++
++ /* initialize hashing info */
++ dx_info = dx_get_dx_info(dot_de);
++ memset (dx_info, 0, sizeof(*dx_info));
++ dx_info->info_length = sizeof(*dx_info);
++ dx_info->hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
++
++ entries = (void *)dx_info + sizeof(*dx_info);
++
+ dx_set_block(entries, 1);
+ dx_set_count(entries, 1);
+- dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
++ dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info)));
+
+ /* Initialize as for dx_probe */
+- hinfo.hash_version = root->info.hash_version;
++ hinfo.hash_version = dx_info->hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
+ hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+@@ -1759,6 +1766,7 @@
+ goto journal_error;
+ brelse (bh2);
+ } else {
++ struct dx_root_info * info;
+ dxtrace(printk(KERN_DEBUG
+ "Creating second level index...\n"));
+ memcpy((char *) entries2, (char *) entries,
+@@ -1768,7 +1776,9 @@
+ /* Set up root */
+ dx_set_count(entries, 1);
+ dx_set_block(entries + 0, newblock);
+- ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
++ info = dx_get_dx_info((struct ext4_dir_entry_2*)
++ frames[0].bh->b_data);
++ info->indirect_levels = 1;
+
+ /* Add new access path frame */
+ frame = frames + 1;
--- /dev/null
+Index: linux-2.6.32-el6-beta/fs/ext4/inode.c
+===================================================================
+--- linux-2.6.32-el6-beta.orig/fs/ext4/inode.c
++++ linux-2.6.32-el6-beta/fs/ext4/inode.c
+@@ -5834,3 +5834,67 @@ out_unlock:
+ up_read(&inode->i_alloc_sem);
+ return ret;
+ }
++
++int ext4_map_inode_page(struct inode *inode, struct page *page,
++ unsigned long *blocks, int *created, int create)
++{
++ unsigned int blocksize, blocks_per_page;
++ unsigned long iblock;
++ struct buffer_head dummy;
++ void *handle;
++ int i, rc = 0, failed = 0, needed_blocks;
++
++ blocksize = inode->i_sb->s_blocksize;
++ blocks_per_page = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
++ iblock = page->index * blocks_per_page;
++
++ for (i = 0; i < blocks_per_page; i++, iblock++) {
++ blocks[i] = ext4_bmap(inode->i_mapping, iblock);
++ if (blocks[i] == 0) {
++ failed++;
++ if (created)
++ created[i] = -1;
++ } else if (created) {
++ created[i] = 0;
++ }
++ }
++
++ if (failed == 0 || create == 0)
++ return 0;
++
++ needed_blocks = ext4_writepage_trans_blocks(inode);
++ handle = ext4_journal_start(inode, needed_blocks);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ iblock = page->index * blocks_per_page;
++ for (i = 0; i < blocks_per_page; i++, iblock++) {
++ if (blocks[i] != 0)
++ continue;
++
++ rc = ext4_get_blocks(handle, inode, iblock, 1, &dummy, 1);
++ if (rc < 0) {
++ printk(KERN_INFO "ext4_map_inode_page: error reading "
++ "block %ld\n", iblock);
++ goto out;
++ } else {
++ if (rc > 1)
++ WARN_ON(1);
++
++ rc = 0;
++ }
++ /* Unmap any metadata buffers from the block mapping, to avoid
++ * data corruption due to direct-write from Lustre being
++ * clobbered by a later flush of the blockdev metadata buffer.*/
++ if (buffer_new(&dummy))
++ unmap_underlying_metadata(dummy.b_bdev,
++ dummy.b_blocknr);
++ blocks[i] = dummy.b_blocknr;
++ if (created)
++ created[i] = 1;
++ }
++
++out:
++ ext4_journal_stop(handle);
++ return rc;
++}
+Index: linux-2.6.32-el6-beta/fs/ext4/super.c
+===================================================================
+--- linux-2.6.32-el6-beta.orig/fs/ext4/super.c
++++ linux-2.6.32-el6-beta/fs/ext4/super.c
+@@ -4084,6 +4084,10 @@ static void __exit exit_ext4_fs(void)
+ exit_ext4_system_zone();
+ }
+
++int ext4_map_inode_page(struct inode *inode, struct page *page,
++ unsigned long *blocks, int *created, int create);
++EXPORT_SYMBOL(ext4_map_inode_page);
++
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Fourth Extended Filesystem");
+ MODULE_LICENSE("GPL");
--- /dev/null
+Index: linux-2.6.32-el6-beta/fs/ext4/ialloc.c
+===================================================================
+--- linux-2.6.32-el6-beta.orig/fs/ext4/ialloc.c
++++ linux-2.6.32-el6-beta/fs/ext4/ialloc.c
+@@ -825,11 +825,15 @@ struct inode *ext4_new_inode(handle_t *h
+ sb = dir->i_sb;
+ ngroups = ext4_get_groups_count(sb);
+ trace_ext4_request_inode(dir, mode);
++
++ sbi = EXT4_SB(sb);
++ if (sbi->s_max_dir_size > 0 && i_size_read(dir) >= sbi->s_max_dir_size)
++ return ERR_PTR(-EFBIG);
++
+ inode = new_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ ei = EXT4_I(inode);
+- sbi = EXT4_SB(sb);
+
+ if (!goal)
+ goal = sbi->s_inode_goal;
+Index: linux-2.6.32-el6-beta/fs/ext4/super.c
+===================================================================
+--- linux-2.6.32-el6-beta.orig/fs/ext4/super.c
++++ linux-2.6.32-el6-beta/fs/ext4/super.c
+@@ -2601,6 +2601,7 @@ EXT4_RO_ATTR(lifetime_write_kbytes);
+ EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
+ inode_readahead_blks_store, s_inode_readahead_blks);
+ EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
++EXT4_RW_ATTR_SBI_UI(max_dir_size, s_max_dir_size);
+ EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+ EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+@@ -2615,6 +2616,7 @@ static struct attribute *ext4_attrs[] =
+ ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(inode_readahead_blks),
+ ATTR_LIST(inode_goal),
++ ATTR_LIST(max_dir_size),
+ ATTR_LIST(mb_stats),
+ ATTR_LIST(mb_max_to_scan),
+ ATTR_LIST(mb_min_to_scan),
+Index: linux-2.6.32-el6-beta/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.32-el6-beta.orig/fs/ext4/ext4.h
++++ linux-2.6.32-el6-beta/fs/ext4/ext4.h
+@@ -1029,6 +1029,8 @@ struct ext4_sb_info {
+
+ /* Kernel thread for multiple mount protection */
+ struct task_struct *s_mmp_tsk;
++
++ unsigned long s_max_dir_size;
+ };
+
+ static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
+@@ -1353,6 +1355,12 @@ struct mmp_struct {
+ #define EXT4_MMP_MIN_CHECK_INTERVAL 5
+
+ /*
++ * max directory size tunable
++ */
++#define EXT4_DEFAULT_MAX_DIR_SIZE 0
++#define EXT4_MAX_DIR_SIZE_NAME "max_dir_size"
++
++/*
+ * Function prototypes
+ */
+
--- /dev/null
+Index: linux-stage/fs/ext4/ext4.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4.h 2011-03-14 16:18:28.300241437 +0800
++++ linux-stage/fs/ext4/ext4.h 2011-03-14 16:33:17.056087375 +0800
+@@ -1770,6 +1770,7 @@
+ ext4_grpblk_t bb_free; /* total free blocks */
+ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
+ struct list_head bb_prealloc_list;
++ unsigned long bb_prealloc_nr;
+ #ifdef DOUBLE_CHECK
+ void *bb_bitmap;
+ #endif
+Index: linux-stage/fs/ext4/mballoc.c
+===================================================================
+--- linux-stage.orig/fs/ext4/mballoc.c 2011-03-14 16:18:28.336242149 +0800
++++ linux-stage/fs/ext4/mballoc.c 2011-03-14 16:33:27.072292006 +0800
+@@ -337,7 +337,7 @@
+ static struct kmem_cache *ext4_pspace_cachep;
+ static struct kmem_cache *ext4_ac_cachep;
+ static struct kmem_cache *ext4_free_ext_cachep;
+-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+ ext4_group_t group);
+ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+ ext4_group_t group);
+@@ -659,7 +659,7 @@
+ }
+
+ static noinline_for_stack
+-void ext4_mb_generate_buddy(struct super_block *sb,
++int ext4_mb_generate_buddy(struct super_block *sb,
+ void *buddy, void *bitmap, ext4_group_t group)
+ {
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+@@ -691,14 +691,13 @@
+ grp->bb_fragments = fragments;
+
+ if (free != grp->bb_free) {
+- ext4_grp_locked_error(sb, group, __func__,
+- "EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
+- group, free, grp->bb_free);
+- /*
+- * If we intent to continue, we consider group descritor
+- * corrupt and update bb_free using bitmap value
+- */
+- grp->bb_free = free;
++ struct ext4_group_desc *gdp;
++ gdp = ext4_get_group_desc (sb, group, NULL);
++ ext4_error(sb, "group %lu: %u blocks in bitmap, %u in bb, "
++ "%u in gd, %lu pa's\n", (long unsigned int)group,
++ free, grp->bb_free, ext4_free_blks_count(sb, gdp),
++ grp->bb_prealloc_nr);
++ return -EIO;
+ }
+
+ clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+@@ -708,6 +707,8 @@
+ EXT4_SB(sb)->s_mb_buddies_generated++;
+ EXT4_SB(sb)->s_mb_generation_time += period;
+ spin_unlock(&EXT4_SB(sb)->s_bal_lock);
++
++ return 0;
+ }
+
+ /* The buddy information is attached the buddy cache inode
+@@ -839,7 +840,7 @@
+ first_block = page->index * blocks_per_page;
+ /* init the page */
+ memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
+- for (i = 0; i < blocks_per_page; i++) {
++ for (i = 0; i < blocks_per_page && err == 0; i++) {
+ int group;
+ struct ext4_group_info *grinfo;
+
+@@ -874,7 +875,7 @@
+ * incore got set to the group block bitmap below
+ */
+ ext4_lock_group(sb, group);
+- ext4_mb_generate_buddy(sb, data, incore, group);
++ err = ext4_mb_generate_buddy(sb, data, incore, group);
+ ext4_unlock_group(sb, group);
+ incore = NULL;
+ } else {
+@@ -888,7 +889,7 @@
+ memcpy(data, bitmap, blocksize);
+
+ /* mark all preallocated blks used in in-core bitmap */
+- ext4_mb_generate_from_pa(sb, data, group);
++ err = ext4_mb_generate_from_pa(sb, data, group);
+ ext4_mb_generate_from_freelist(sb, data, group);
+ ext4_unlock_group(sb, group);
+
+@@ -898,7 +899,8 @@
+ incore = data;
+ }
+ }
+- SetPageUptodate(page);
++ if (likely(err == 0))
++ SetPageUptodate(page);
+
+ out:
+ if (bh) {
+@@ -2142,9 +2144,11 @@
+ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
+ {
+ struct super_block *sb = seq->private;
++ struct ext4_group_desc *gdp;
+ ext4_group_t group = (ext4_group_t) ((unsigned long) v);
+ int i;
+ int err;
++ int free = 0;
+ struct ext4_buddy e4b;
+ struct sg {
+ struct ext4_group_info info;
+@@ -2153,10 +2157,10 @@
+
+ group--;
+ if (group == 0)
+- seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s %-5s %-5s"
+ "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
+ "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
+- "group", "free", "frags", "first",
++ "group", "free", "free", "frags", "first", "pa",
+ "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
+ "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
+
+@@ -2167,13 +2171,20 @@
+ seq_printf(seq, "#%-5u: I/O error\n", group);
+ return 0;
+ }
++
++ gdp = ext4_get_group_desc(sb, group, NULL);
++ if (gdp != NULL)
++ free = ext4_free_blks_count(sb, gdp);
++
+ ext4_lock_group(sb, group);
+ memcpy(&sg, ext4_get_group_info(sb, group), i);
+ ext4_unlock_group(sb, group);
+ ext4_mb_release_desc(&e4b);
+
+- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
+- sg.info.bb_fragments, sg.info.bb_first_free);
++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u %-5u %-5lu [",
++ (long unsigned int)group, sg.info.bb_free, free,
++ sg.info.bb_fragments, sg.info.bb_first_free,
++ sg.info.bb_prealloc_nr);
+ for (i = 0; i <= 13; i++)
+ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
+ sg.info.bb_counters[i] : 0);
+@@ -3354,23 +3365,68 @@
+ }
+
+ /*
++ * check free blocks in bitmap match free block in group descriptor
++ * do this before taking preallocated blocks into account to be able
++ * to detect on-disk corruptions. The group lock should be hold by the
++ * caller.
++ */
++int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap,
++ struct ext4_group_desc *gdp, int group)
++{
++ unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
++ unsigned short i, first, free = 0;
++
++ i = mb_find_next_zero_bit(bitmap, max, 0);
++
++ while (i < max) {
++ first = i;
++ i = mb_find_next_bit(bitmap, max, i);
++ if (i > max)
++ i = max;
++ free += i - first;
++ if (i < max)
++ i = mb_find_next_zero_bit(bitmap, max, i);
++ }
++
++ if (free != ext4_free_blks_count(sb, gdp)) {
++ ext4_error(sb, "on-disk bitmap for group %d "
++ "corrupted: %u blocks free in bitmap, %u - in gd\n",
++ group, free, ext4_free_blks_count(sb, gdp));
++ return -EIO;
++ }
++ return 0;
++}
++
++/*
+ * the function goes through all preallocation in this group and marks them
+ * used in in-core bitmap. buddy must be generated from this bitmap
+ * Need to be called with ext4 group lock held
+ */
+ static noinline_for_stack
+-void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
++int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+ ext4_group_t group)
+ {
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+ struct ext4_prealloc_space *pa;
++ struct ext4_group_desc *gdp;
+ struct list_head *cur;
+ ext4_group_t groupnr;
+ ext4_grpblk_t start;
+ int preallocated = 0;
+ int count = 0;
++ int skip = 0;
++ int err;
+ int len;
+
++ gdp = ext4_get_group_desc (sb, group, NULL);
++ if (gdp == NULL)
++ return -EIO;
++
++ /* before applying preallocations, check bitmap consistency */
++ err = ext4_mb_check_ondisk_bitmap(sb, bitmap, gdp, group);
++ if (err)
++ return err;
++
+ /* all form of preallocation discards first load group,
+ * so the only competing code is preallocation use.
+ * we don't need any locking here
+@@ -3386,14 +3442,23 @@
+ &groupnr, &start);
+ len = pa->pa_len;
+ spin_unlock(&pa->pa_lock);
+- if (unlikely(len == 0))
++ if (unlikely(len == 0)) {
++ skip++;
+ continue;
++ }
+ BUG_ON(groupnr != group);
+ mb_set_bits(bitmap, start, len);
+ preallocated += len;
+ count++;
+ }
++ if (count + skip != grp->bb_prealloc_nr) {
++ ext4_error(sb, "lost preallocations: "
++ "count %d, bb_prealloc_nr %lu, skip %d\n",
++ count, grp->bb_prealloc_nr, skip);
++ return -EIO;
++ }
+ mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
++ return 0;
+ }
+
+ static void ext4_mb_pa_callback(struct rcu_head *head)
+@@ -3452,6 +3517,7 @@
+ */
+ ext4_lock_group(sb, grp);
+ list_del(&pa->pa_group_list);
++ ext4_get_group_info(sb, grp)->bb_prealloc_nr--;
+ ext4_unlock_group(sb, grp);
+
+ spin_lock(pa->pa_obj_lock);
+@@ -3543,6 +3609,7 @@
+
+ ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+ list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
++ grp->bb_prealloc_nr++;
+ ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+
+ spin_lock(pa->pa_obj_lock);
+@@ -3604,6 +3671,7 @@
+
+ ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+ list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
++ grp->bb_prealloc_nr++;
+ ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+
+ /*
+@@ -3802,6 +3870,8 @@
+
+ spin_unlock(&pa->pa_lock);
+
++ BUG_ON(grp->bb_prealloc_nr == 0);
++ grp->bb_prealloc_nr--;
+ list_del(&pa->pa_group_list);
+ list_add(&pa->u.pa_tmp_list, &list);
+ }
+@@ -3942,7 +4012,7 @@
+ if (err) {
+ ext4_error(sb, "Error loading buddy information for %u",
+ group);
+- continue;
++ return;
+ }
+
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+@@ -3954,6 +4024,8 @@
+ }
+
+ ext4_lock_group(sb, group);
++ BUG_ON(e4b.bd_info->bb_prealloc_nr == 0);
++ e4b.bd_info->bb_prealloc_nr--;
+ list_del(&pa->pa_group_list);
+ ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+ ext4_unlock_group(sb, group);
+@@ -4227,6 +4299,7 @@
+ }
+ ext4_lock_group(sb, group);
+ list_del(&pa->pa_group_list);
++ ext4_get_group_info(sb, group)->bb_prealloc_nr--;
+ ext4_mb_release_group_pa(&e4b, pa, ac);
+ ext4_unlock_group(sb, group);
+
+Index: linux-stage/fs/ext4/mballoc.h
+===================================================================
+--- linux-stage.orig/fs/ext4/mballoc.h 2011-03-14 16:18:26.670209322 +0800
++++ linux-stage/fs/ext4/mballoc.h 2011-03-14 16:32:50.859552482 +0800
+@@ -88,7 +88,7 @@
+ /*
+ * for which requests use 2^N search using buddies
+ */
+-#define MB_DEFAULT_ORDER2_REQS 2
++#define MB_DEFAULT_ORDER2_REQS 8
+
+ /*
+ * default group prealloc size 512 blocks
--- /dev/null
+Index: linux-stage/fs/ext4/mballoc.c
+===================================================================
+--- linux-stage.orig/fs/ext4/mballoc.c 2011-03-14 16:34:39.790758415 +0800
++++ linux-stage/fs/ext4/mballoc.c 2011-03-14 16:38:36.211681104 +0800
+@@ -3593,6 +3593,7 @@
+ INIT_LIST_HEAD(&pa->pa_group_list);
+ pa->pa_deleted = 0;
+ pa->pa_type = MB_INODE_PA;
++ pa->pa_error = 0;
+
+ mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
+ pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+@@ -3654,6 +3655,7 @@
+ INIT_LIST_HEAD(&pa->pa_group_list);
+ pa->pa_deleted = 0;
+ pa->pa_type = MB_GROUP_PA;
++ pa->pa_error = 0;
+
+ mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
+ pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+@@ -3716,7 +3718,9 @@
+ int err = 0;
+ int free = 0;
+
++ assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
+ BUG_ON(pa->pa_deleted == 0);
++ BUG_ON(pa->pa_inode == NULL);
+ ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+ grp_blk_start = pa->pa_pstart - bit;
+ BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+@@ -3752,19 +3756,27 @@
+ mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
+ bit = next + 1;
+ }
+- if (free != pa->pa_free) {
+- printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n",
+- pa, (unsigned long) pa->pa_lstart,
+- (unsigned long) pa->pa_pstart,
+- (unsigned long) pa->pa_len);
++
++ /* "free < pa->pa_free" means we maybe double alloc the same blocks,
++ * otherwise maybe leave some free blocks unavailable, no need to BUG.*/
++ if ((free > pa->pa_free && !pa->pa_error) || (free < pa->pa_free)) {
++ ext4_error(sb, "pa free mismatch: [pa %p] "
++ "[phy %lu] [logic %lu] [len %u] [free %u] "
++ "[error %u] [inode %lu] [freed %u]", pa,
++ (unsigned long)pa->pa_pstart,
++ (unsigned long)pa->pa_lstart,
++ (unsigned)pa->pa_len, (unsigned)pa->pa_free,
++ (unsigned)pa->pa_error, pa->pa_inode->i_ino,
++ free);
+ ext4_grp_locked_error(sb, group,
+- __func__, "free %u, pa_free %u",
+- free, pa->pa_free);
++ __func__, "free %u, pa_free %u",
++ free, pa->pa_free);
+ /*
+ * pa is already deleted so we use the value obtained
+ * from the bitmap and continue.
+ */
+ }
++ /* mismatch was already reported via ext4_error() above; continue */
+ atomic_add(free, &sbi->s_mb_discarded);
+
+ return err;
+@@ -4450,6 +4462,24 @@
+
+ trace_ext4_request_blocks(ar);
+
++
++ if (dev_check_rdonly(sb->s_bdev)) {
++ struct block_device *bdev = sb->s_bdev;
++
++ printk(KERN_WARNING "Alloc from readonly device %s (%#x): "
++ "[inode %lu] [logic %llu] [goal %llu] [ll %llu] "
++ "[pl %llu] [lr %llu] [pr %llu] [len %u] [flags %u]\n",
++ bdev->bd_disk ? bdev->bd_disk->disk_name : "",
++ bdev->bd_dev, ar->inode->i_ino,
++ (unsigned long long)ar->logical,
++ (unsigned long long)ar->goal,
++ (unsigned long long)ar->lleft,
++ (unsigned long long)ar->pleft,
++ (unsigned long long)ar->lright,
++ (unsigned long long)ar->pright,
++ ar->len, ar->flags);
++ }
++
+ /*
+ * For delayed allocation, we could skip the ENOSPC and
+ * EDQUOT check, as blocks and quotas have been already
+@@ -4529,6 +4559,25 @@
+ ac->ac_b_ex.fe_len = 0;
+ ar->len = 0;
+ ext4_mb_show_ac(ac);
++ if (ac->ac_pa) {
++ struct ext4_prealloc_space *pa = ac->ac_pa;
++
++ /* We can not make sure whether the bitmap has
++ * been updated or not when fail case. So can
++ * not revert pa_free back, just mark pa_error*/
++ pa->pa_error++;
++ ext4_error(sb,
++ "Updating bitmap error: [err %d] "
++ "[pa %p] [phy %lu] [logic %lu] "
++ "[len %u] [free %u] [error %u] "
++ "[inode %lu]", *errp, pa,
++ (unsigned long)pa->pa_pstart,
++ (unsigned long)pa->pa_lstart,
++ (unsigned)pa->pa_len,
++ (unsigned)pa->pa_free,
++ (unsigned)pa->pa_error,
++ pa->pa_inode ? pa->pa_inode->i_ino : 0);
++ }
+ } else {
+ block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+ ar->len = ac->ac_b_ex.fe_len;
+@@ -4691,6 +4740,15 @@
+ goto error_return;
+ }
+
++ if (dev_check_rdonly(sb->s_bdev)) {
++ struct block_device *bdev = sb->s_bdev;
++
++ printk(KERN_WARNING "Release to readonly device %s (%#x): "
++ "[inode %lu] [block %llu] [count %lu] [is_meta %d]\n",
++ bdev->bd_disk ? bdev->bd_disk->disk_name : "",
++ bdev->bd_dev, inode->i_ino, block, count, metadata);
++ }
++
+ ext4_debug("freeing block %llu\n", block);
+ trace_ext4_free_blocks(inode, block, count, metadata);
+
+Index: linux-stage/fs/ext4/mballoc.h
+===================================================================
+--- linux-stage.orig/fs/ext4/mballoc.h 2011-03-14 16:32:50.859552482 +0800
++++ linux-stage/fs/ext4/mballoc.h 2011-03-14 16:39:20.928429776 +0800
+@@ -20,6 +20,7 @@
+ #include <linux/version.h>
+ #include <linux/blkdev.h>
+ #include <linux/mutex.h>
++#include <linux/genhd.h>
+ #include "ext4_jbd2.h"
+ #include "ext4.h"
+
+@@ -130,6 +131,7 @@
+ ext4_grpblk_t pa_free; /* how many blocks are free */
+ unsigned short pa_type; /* pa type. inode or group */
+ spinlock_t *pa_obj_lock;
++ unsigned short pa_error;
+ struct inode *pa_inode; /* hack, for history only */
+ };
+
-Index: linux-2.6.18.i386/fs/ext4/ext4_jbd2.h
+Index: linux-stage/fs/ext4/ext4_jbd2.h
===================================================================
---- linux-2.6.18.i386.orig/fs/ext4/ext4_jbd2.h
-+++ linux-2.6.18.i386/fs/ext4/ext4_jbd2.h
-@@ -35,6 +35,11 @@
- (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
- || test_opt(sb, EXTENTS) ? 27U : 8U)
+--- linux-stage.orig/fs/ext4/ext4_jbd2.h 2011-03-14 17:17:57.962614294 +0800
++++ linux-stage/fs/ext4/ext4_jbd2.h 2011-03-14 17:26:00.570661921 +0800
+@@ -35,6 +35,8 @@
+ (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
+ ? 27U : 8U)
-+/* Indicate that EXT4_SINGLEDATA_TRANS_BLOCKS takes the sb as argument */
-+#define EXT4_SINGLEDATA_TRANS_BLOCKS_HAS_SB
-+
+#define ext4_journal_dirty_metadata(handle, bh) \
+ ext4_handle_dirty_metadata(handle, NULL, bh)
/* Extended attribute operations touch at most two data buffers,
* two bitmap buffers, and two group summaries, in addition to the inode
* and the superblock, which are already accounted for. */
-Index: linux-2.6.18.i386/fs/ext4/extents.c
+Index: linux-stage/fs/ext4/extents.c
===================================================================
---- linux-2.6.18.i386.orig/fs/ext4/extents.c
-+++ linux-2.6.18.i386/fs/ext4/extents.c
-@@ -60,6 +60,17 @@ static ext4_fsblk_t ext_pblock(struct ex
+--- linux-stage.orig/fs/ext4/extents.c 2011-03-14 17:17:57.491605523 +0800
++++ linux-stage/fs/ext4/extents.c 2011-03-14 17:25:23.230957562 +0800
+@@ -59,6 +59,17 @@ static ext4_fsblk_t ext_pblock(struct ex
}
/*
* idx_pblock:
* combine low and high parts of a leaf physical block number into ext4_fsblk_t
*/
-@@ -73,17 +84,6 @@ ext4_fsblk_t idx_pblock(struct ext4_exte
+@@ -72,17 +83,6 @@ ext4_fsblk_t idx_pblock(struct ext4_exte
}
/*
* ext4_idx_store_pblock:
* stores a large physical block number into an index struct,
* breaking it into parts
-@@ -1826,6 +1826,56 @@ static int ext4_ext_rm_idx(handle_t *han
+@@ -1980,6 +1980,56 @@ static int ext4_ext_rm_idx(handle_t *han
}
/*
* ext4_ext_calc_credits_for_single_extent:
* This routine returns max. credits that needed to insert an extent
* to the extent tree.
-@@ -3157,4 +3207,14 @@ int ext4_fiemap(struct inode *inode, str
-
+@@ -3731,3 +3781,13 @@ int ext4_fiemap(struct inode *inode, str
return error;
}
-+
+
+EXPORT_SYMBOL(ext4_ext_store_pblock);
+EXPORT_SYMBOL(ext4_ext_search_right);
+EXPORT_SYMBOL(ext4_ext_search_left);
+EXPORT_SYMBOL(ext4_ext_walk_space);
+EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert);
+EXPORT_SYMBOL(ext4_mark_inode_dirty);
-
-Index: linux-2.6.18.i386/fs/ext4/ext4_extents.h
++
+Index: linux-stage/fs/ext4/ext4_extents.h
===================================================================
---- linux-2.6.18.i386.orig/fs/ext4/ext4_extents.h
-+++ linux-2.6.18.i386/fs/ext4/ext4_extents.h
-@@ -59,6 +59,12 @@
+--- linux-stage.orig/fs/ext4/ext4_extents.h 2011-03-14 17:17:57.928613657 +0800
++++ linux-stage/fs/ext4/ext4_extents.h 2011-03-14 17:27:23.673232962 +0800
+@@ -58,6 +58,12 @@
*/
#define EXT_STATS_
/*
* ext4_inode has i_block array (60 bytes total).
-@@ -124,6 +129,8 @@ struct ext4_ext_path {
- #define EXT4_EXT_CACHE_GAP 1
- #define EXT4_EXT_CACHE_EXTENT 2
+@@ -160,6 +166,7 @@ struct ext4_ext_path {
+ #define EXT_INIT_MAX_LEN (1UL << 15)
+ #define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1)
+#define EXT4_EXT_HAS_NO_TREE /* ext4_extents_tree struct is not used*/
-+#define EXT_INSERT_EXTENT_WITH_5ARGS
-
- #define EXT_MAX_BLOCK 0xffffffff
-@@ -228,6 +234,8 @@ static inline int ext4_ext_get_actual_le
+ #define EXT_FIRST_EXTENT(__hdr__) \
+ ((struct ext4_extent *) (((char *) (__hdr__)) + \
+@@ -230,6 +237,8 @@
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
extern int ext4_extent_tree_init(handle_t *, struct inode *);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
int num,
struct ext4_ext_path *path);
-Index: linux-2.6.18.i386/fs/ext4/mballoc.c
+Index: linux-stage/fs/ext4/mballoc.c
===================================================================
---- linux-2.6.18.i386.orig/fs/ext4/mballoc.c
-+++ linux-2.6.18.i386/fs/ext4/mballoc.c
-@@ -4355,6 +4355,13 @@
+--- linux-stage.orig/fs/ext4/mballoc.c 2011-03-14 17:17:59.872649833 +0800
++++ linux-stage/fs/ext4/mballoc.c 2011-03-14 17:25:20.373903681 +0800
+@@ -4302,6 +4302,13 @@
kmem_cache_free(ext4_ac_cachep, ac);
}
/*
* finds all preallocated spaces and return blocks being freed to them
* if preallocated space becomes full (no block is used from the space)
-@@ -4965,3 +4965,6 @@ error_return:
+@@ -5111,3 +5118,6 @@ error_return:
kmem_cache_free(ext4_ac_cachep, ac);
return;
}
+
+EXPORT_SYMBOL(ext4_free_blocks);
+
-Index: linux-2.6.18.i386/fs/ext4/ext4_jbd2.c
+Index: linux-stage/fs/ext4/ext4_jbd2.c
===================================================================
---- linux-2.6.18.i386.orig/fs/ext4/ext4_jbd2.c
-+++ linux-2.6.18.i386/fs/ext4/ext4_jbd2.c
-@@ -21,6 +21,7 @@ int __ext4_journal_get_write_access(cons
- ext4_journal_abort_handle(where, __func__, bh, handle, err);
+--- linux-stage.orig/fs/ext4/ext4_jbd2.c 2011-03-14 17:17:57.463605024 +0800
++++ linux-stage/fs/ext4/ext4_jbd2.c 2011-03-14 17:18:00.157655139 +0800
+@@ -31,6 +31,7 @@ int __ext4_journal_get_write_access(cons
+ }
return err;
}
+EXPORT_SYMBOL(__ext4_journal_get_write_access);
int __ext4_journal_forget(const char *where, handle_t *handle,
struct buffer_head *bh)
-@@ -57,3 +58,4 @@ int __ext4_journal_dirty_metadata(const
- ext4_journal_abort_handle(where, __func__, bh, handle, err);
+@@ -107,3 +108,4 @@ int __ext4_journal_dirty_metadata(const
+ }
return err;
}
+EXPORT_SYMBOL(__ext4_handle_dirty_metadata);
-Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h
+Index: linux-stage/fs/ext4/ext4.h
===================================================================
---- linux-2.6.27.21-0.1.orig/fs/ext4/ext4.h 2009-07-07 14:47:19.000000000 +0530
-+++ linux-2.6.27.21-0.1/fs/ext4/ext4.h 2009-07-07 14:47:22.000000000 +0530
-@@ -1123,6 +1128,8 @@
+--- linux-stage.orig/fs/ext4/ext4.h 2011-03-14 17:17:59.916650654 +0800
++++ linux-stage/fs/ext4/ext4.h 2011-03-14 17:25:30.236089694 +0800
+@@ -1448,6 +1448,8 @@
extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
ext4_group_t, int);
/* inode.c */
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr);
-Index: linux-2.6.27.21-0.1/fs/ext4/inode.c
+Index: linux-stage/fs/ext4/inode.c
===================================================================
---- linux-2.6.27.21-0.1.orig/fs/ext4/inode.c 2009-07-07 14:47:19.000000000 +0530
-+++ linux-2.6.27.21-0.1/fs/ext4/inode.c 2009-07-07 14:47:22.000000000 +0530
-@@ -4240,6 +4240,7 @@
+--- linux-stage.orig/fs/ext4/inode.c 2011-03-14 17:17:59.745647471 +0800
++++ linux-stage/fs/ext4/inode.c 2011-03-14 17:18:00.219656294 +0800
+@@ -4882,6 +4882,7 @@
iget_failed(inode);
return ERR_PTR(ret);
}
static int ext4_inode_blocks_set(handle_t *handle,
struct ext4_inode *raw_inode,
-Index: linux-2.6.27.21-0.1/fs/ext4/super.c
+Index: linux-stage/fs/ext4/super.c
===================================================================
---- linux-2.6.27.21-0.1.orig/fs/ext4/super.c 2009-07-07 14:47:19.000000000 +0530
-+++ linux-2.6.27.21-0.1/fs/ext4/super.c 2009-07-07 14:48:53.000000000 +0530
-@@ -91,6 +91,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct su
+--- linux-stage.orig/fs/ext4/super.c 2011-03-14 17:17:59.659645870 +0800
++++ linux-stage/fs/ext4/super.c 2011-03-14 17:25:31.027104616 +0800
+@@ -90,6 +90,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct su
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
+ (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
}
+EXPORT_SYMBOL(ext4_inode_bitmap);
ext4_fsblk_t ext4_inode_table(struct super_block *sb,
struct ext4_group_desc *bg)
-@@ -113,6 +118,7 @@
- (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
+@@ -114,6 +115,7 @@
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
}
+EXPORT_SYMBOL(ext4_itable_unused_count);
-
- void ext4_block_bitmap_set(struct super_block *sb,
- struct ext4_group_desc *bg, ext4_fsblk_t blk)
-@@ -1286,9 +1287,11 @@
- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
- Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
- Opt_usrquota, Opt_grpquota, Opt_i_version,
+
+ __u32 ext4_used_dirs_count(struct super_block *sb,
+ struct ext4_group_desc *bg)
+@@ -1434,9 +1436,11 @@
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
+ Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
+ Opt_usrquota, Opt_grpquota, Opt_i_version,
+ Opt_mballoc, Opt_extents,
- Opt_stripe, Opt_delalloc, Opt_nodelalloc,
- Opt_block_validity, Opt_noblock_validity,
+ Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+ Opt_block_validity, Opt_noblock_validity,
- Opt_inode_readahead_blks, Opt_journal_ioprio
+ Opt_inode_readahead_blks, Opt_journal_ioprio,
+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
};
static match_table_t tokens = {
-@@ -1346,6 +1348,11 @@
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_i_version, "i_version"},
+@@ -1491,6 +1495,11 @@
+ {Opt_barrier, "barrier"},
+ {Opt_nobarrier, "nobarrier"},
+ {Opt_i_version, "i_version"},
+ {Opt_mballoc, "mballoc"},
+ {Opt_extents, "extents"},
+ {Opt_iopen, "iopen"},
+ {Opt_noiopen, "noiopen"},
+ {Opt_iopen_nopriv, "iopen_nopriv"},
- {Opt_stripe, "stripe=%u"},
- {Opt_resize, "resize"},
- {Opt_delalloc, "delalloc"},
-@@ -1768,6 +1771,12 @@
- case Opt_bigendian_extents:
- bigendian_extents = 1;
+ {Opt_stripe, "stripe=%u"},
+ {Opt_resize, "resize"},
+ {Opt_delalloc, "delalloc"},
+@@ -1930,6 +1939,12 @@
+ else
+ set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
break;
+ case Opt_mballoc:
+ case Opt_extents:
+ case Opt_iopen_nopriv:
+ break;
default:
- printk(KERN_ERR
- "EXT4-fs: Unrecognized mount option \"%s\" "
-@@ -2768,7 +2771,7 @@
- char *buf)
+ ext4_msg(sb, KERN_ERR,
+ "Unrecognized mount option \"%s\" "
+@@ -2480,7 +2495,7 @@
+ char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
- (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+ (unsigned long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
}
static ssize_t session_write_kbytes_show(struct ext4_attr *a,
-@@ -2868,11 +2871,11 @@
- struct super_block *sb = sbi->s_buddy_cache->i_sb;
-
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+@@ -2501,11 +2516,11 @@
+ struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
- sbi->s_kbytes_written +
+ (unsigned long long)(sbi->s_kbytes_written +
- (sb->s_bdev->bd_part ?
- (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
- EXT4_SB(sb)->s_sectors_written_start) >> 1
+ (sb->s_bdev->bd_part ?
+ (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ EXT4_SB(sb)->s_sectors_written_start) >> 1
- : 0));
+ : 0)));
}
static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
-@@ -3868,7 +3871,7 @@
- if (blocks_count && ext4_blocks_count(es) > blocks_count) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
- "exceeds size of device (%llu blocks)",
+@@ -2972,7 +2987,7 @@
+ if (blocks_count && ext4_blocks_count(es) > blocks_count) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
+ "exceeds size of device (%llu blocks)",
- ext4_blocks_count(es), blocks_count);
+ ext4_blocks_count(es), (unsigned long long)blocks_count);
- goto failed_mount;
- }
-
-Index: linux-2.6.27.21-0.1/fs/ext4/fsync.c
+ goto failed_mount;
+ }
+
+Index: linux-stage/fs/ext4/fsync.c
===================================================================
---- linux-2.6.27.21-0.1.orig/fs/ext4/fsync.c 2009-07-07 14:47:19.000000000 +0530
-+++ linux-2.6.27.21-0.1/fs/ext4/fsync.c 2009-07-07 14:48:53.000000000 +0530
-@@ -1768,7 +1771,7 @@
+--- linux-stage.orig/fs/ext4/fsync.c 2011-03-14 17:17:57.533606303 +0800
++++ linux-stage/fs/ext4/fsync.c 2011-03-14 17:18:00.266657168 +0800
+@@ -56,7 +56,7 @@
- trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
- inode->i_sb->s_id, datasync, inode->i_ino,
+ trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
+ inode->i_sb->s_id, datasync, inode->i_ino,
- dentry->d_parent->d_inode->i_ino);
+ 0L);
-
- /*
- * data=writeback:
-Index: linux-2.6.27.21-0.1/fs/ext4/move_extent.c
+
+ ret = flush_aio_dio_completed_IO(inode);
+ if (ret < 0)
+Index: linux-stage/fs/ext4/move_extent.c
===================================================================
---- linux-2.6.27.21-0.1.orig/fs/ext4/move_extent.c 2009-07-07 14:47:19.000000000 +0530
-+++ linux-2.6.27.21-0.1/fs/ext4/move_extent.c 2009-07-07 14:48:53.000000000 +0530
-@@ -1768,7 +1771,8 @@
- ext4_error(orig_inode->i_sb, __func__,
- "We replaced blocks too much! "
- "sum of replaced: %llu requested: %llu",
+--- linux-stage.orig/fs/ext4/move_extent.c 2011-03-14 17:17:57.742610199 +0800
++++ linux-stage/fs/ext4/move_extent.c 2011-03-14 17:18:00.284657501 +0800
+@@ -1388,7 +1388,8 @@
+ ext4_error(orig_inode->i_sb, __func__,
+ "We replaced blocks too much! "
+ "sum of replaced: %llu requested: %llu",
- *moved_len, len);
+ (unsigned long long)(*moved_len),
+ (unsigned long long)(len));
- ret1 = -EIO;
- goto out;
- }
+ ret1 = -EIO;
+ goto out;
+ }
--- /dev/null
+Index: linux-stage/fs/ext4/ext4_jbd2.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4_jbd2.h 2011-03-14 16:33:17.087088010 +0800
++++ linux-stage/fs/ext4/ext4_jbd2.h 2011-03-14 16:42:28.416591789 +0800
+@@ -35,6 +35,8 @@
+ (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
+ ? 27U : 8U)
+
++#define ext4_journal_dirty_metadata(handle, bh) \
++ ext4_handle_dirty_metadata(handle, NULL, bh)
+ /* Extended attribute operations touch at most two data buffers,
+ * two bitmap buffers, and two group summaries, in addition to the inode
+ * and the superblock, which are already accounted for. */
+Index: linux-stage/fs/ext4/ext4_extents.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4_extents.h 2011-03-14 16:33:17.076087785 +0800
++++ linux-stage/fs/ext4/ext4_extents.h 2011-03-14 16:43:08.254267525 +0800
+@@ -58,6 +58,12 @@
+ */
+ #define EXT_STATS_
+
++/*
++ * define EXT4_ALLOC_NEEDED to 0 since block bitmap, group desc. and sb
++ * are now accounted in ext4_ext_calc_credits_for_insert()
++ */
++#define EXT4_ALLOC_NEEDED 0
++#define HAVE_EXT_PREPARE_CB_EXTENT
+
+ /*
+ * ext4_inode has i_block array (60 bytes total).
+@@ -160,6 +166,7 @@
+ #define EXT_INIT_MAX_LEN (1UL << 15)
+ #define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1)
+
++#define EXT4_EXT_HAS_NO_TREE /* ext4_extents_tree struct is not used*/
+
+ #define EXT_FIRST_EXTENT(__hdr__) \
+ ((struct ext4_extent *) (((char *) (__hdr__)) + \
+@@ -239,6 +246,8 @@
+ extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
+ int num,
+ struct ext4_ext_path *path);
++extern int ext4_ext_calc_credits_for_insert(struct inode *,
++ struct ext4_ext_path *);
+ extern int ext4_can_extents_be_merged(struct inode *inode,
+ struct ext4_extent *ex1,
+ struct ext4_extent *ex2);
+Index: linux-stage/fs/ext4/mballoc.c
+===================================================================
+--- linux-stage.orig/fs/ext4/mballoc.c 2011-03-14 16:33:27.072292006 +0800
++++ linux-stage/fs/ext4/mballoc.c 2011-03-14 16:41:02.500138039 +0800
+@@ -4039,6 +4039,7 @@
+ if (ac)
+ kmem_cache_free(ext4_ac_cachep, ac);
+ }
++EXPORT_SYMBOL(ext4_discard_preallocations);
+
+ /*
+ * finds all preallocated spaces and return blocks being freed to them
+@@ -4831,3 +4832,6 @@
+ kmem_cache_free(ext4_ac_cachep, ac);
+ return;
+ }
++
++EXPORT_SYMBOL(ext4_free_blocks);
++
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c 2011-03-14 16:33:17.036086967 +0800
++++ linux-stage/fs/ext4/super.c 2011-03-14 16:41:14.964348396 +0800
+@@ -127,6 +127,7 @@
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
+ }
++EXPORT_SYMBOL(ext4_itable_unused_count);
+
+ void ext4_block_bitmap_set(struct super_block *sb,
+ struct ext4_group_desc *bg, ext4_fsblk_t blk)
+@@ -1491,6 +1492,7 @@
+ Opt_block_validity, Opt_noblock_validity,
+ Opt_inode_readahead_blks, Opt_journal_ioprio,
+ Opt_discard, Opt_nodiscard,
++ Opt_mballoc,
+ };
+
+ static const match_table_t tokens = {
+@@ -1557,6 +1559,7 @@
+ {Opt_auto_da_alloc, "auto_da_alloc=%u"},
+ {Opt_auto_da_alloc, "auto_da_alloc"},
+ {Opt_noauto_da_alloc, "noauto_da_alloc"},
++ {Opt_mballoc, "mballoc"},
+ {Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
+ {Opt_err, NULL},
+@@ -1997,6 +2000,8 @@
+ case Opt_nodiscard:
+ clear_opt(sbi->s_mount_opt, DISCARD);
+ break;
++ case Opt_mballoc:
++ break;
+ default:
+ ext4_msg(sb, KERN_ERR,
+ "Unrecognized mount option \"%s\" "
+Index: linux-stage/fs/ext4/ext4_jbd2.c
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4_jbd2.c 2011-03-14 16:33:17.049087232 +0800
++++ linux-stage/fs/ext4/ext4_jbd2.c 2011-03-14 16:34:39.849759386 +0800
+@@ -31,6 +31,7 @@
+ }
+ return err;
+ }
++EXPORT_SYMBOL(__ext4_journal_get_write_access);
+
+ int __ext4_journal_forget(const char *where, handle_t *handle,
+ struct buffer_head *bh)
+@@ -107,3 +108,4 @@
+ }
+ return err;
+ }
++EXPORT_SYMBOL(__ext4_handle_dirty_metadata);
+Index: linux-stage/fs/ext4/ext4.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4.h 2011-03-14 16:33:17.056087375 +0800
++++ linux-stage/fs/ext4/ext4.h 2011-03-14 16:45:40.754870806 +0800
+@@ -1110,6 +1110,9 @@
+
+ #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
+
++/* Has been moved to linux/magic.h but we need it for Lustre */
++#define EXT4_SUPER_MAGIC 0xEF53
++
+ /*
+ * Codes for operating systems
+ */
+@@ -1528,6 +1531,8 @@
+ extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
+ extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
+ ext4_group_t, int);
++extern void ext4_mb_discard_inode_preallocations(struct inode *);
++
+ /* inode.c */
+ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t blocknr);
+Index: linux-stage/fs/ext4/inode.c
+===================================================================
+--- linux-stage.orig/fs/ext4/inode.c 2011-03-14 16:33:17.063087519 +0800
++++ linux-stage/fs/ext4/inode.c 2011-03-14 16:34:39.913760434 +0800
+@@ -5199,6 +5199,7 @@
+ iget_failed(inode);
+ return ERR_PTR(ret);
+ }
++EXPORT_SYMBOL(ext4_iget);
+
+ static int ext4_inode_blocks_set(handle_t *handle,
+ struct ext4_inode *raw_inode,
+Index: linux-stage/fs/ext4/extents.c
+===================================================================
+--- linux-stage.orig/fs/ext4/extents.c 2011-03-14 16:33:17.070087661 +0800
++++ linux-stage/fs/ext4/extents.c 2011-03-14 16:41:04.894178430 +0800
+@@ -1866,9 +1866,7 @@
+ while (block < last && block != EXT_MAX_BLOCK) {
+ num = last - block;
+ /* find extent for this block */
+- down_read(&EXT4_I(inode)->i_data_sem);
+ path = ext4_ext_find_extent(inode, block, path);
+- up_read(&EXT4_I(inode)->i_data_sem);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ path = NULL;
+@@ -1965,6 +1963,7 @@
+
+ return err;
+ }
++EXPORT_SYMBOL(ext4_ext_walk_space);
+
+ static void
+ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
+@@ -2133,6 +2132,55 @@
+ }
+
+ /*
++ * This routine returns max. credits extent tree can consume.
++ * It should be OK for low-performance paths like ->writepage()
++ * To allow many writing process to fit a single transaction,
++ * caller should calculate credits under truncate_mutex and
++ * pass actual path.
++ */
++int ext4_ext_calc_credits_for_insert(struct inode *inode,
++ struct ext4_ext_path *path)
++{
++ int depth, needed;
++
++ if (path) {
++ /* probably there is space in leaf? */
++ depth = ext_depth(inode);
++ if (le16_to_cpu(path[depth].p_hdr->eh_entries)
++ < le16_to_cpu(path[depth].p_hdr->eh_max))
++ return 1;
++ }
++
++ /*
++ * given 32bit logical block (4294967296 blocks), max. tree
++ * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
++ * let's also add one more level for imbalance.
++ */
++ depth = 5;
++
++ /* allocation of new data block(s) */
++ needed = 2;
++
++ /*
++ * tree can be full, so it'd need to grow in depth:
++ * we need one credit to modify old root, credits for
++ * new root will be added in split accounting
++ */
++ needed += 1;
++ /*
++ * Index split can happen, we'd need:
++ * allocate intermediate indexes (bitmap + group)
++ * + change two blocks at each level, but root (already included)
++ */
++ needed += (depth * 2) + (depth * 2);
++
++ /* any allocation modifies superblock */
++ needed += 1;
++
++ return needed;
++}
++
++/*
+ * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ *
+ * if nrblocks are fit in a single extent (chunk flag is 1), then
+@@ -3934,10 +3982,21 @@
+ * Walk the extent tree gathering extent information.
+ * ext4_ext_fiemap_cb will push extents back to user.
+ */
++ down_read(&EXT4_I(inode)->i_data_sem);
+ error = ext4_ext_walk_space(inode, start_blk, len_blks,
+ ext4_ext_fiemap_cb, fieinfo);
++ up_read(&EXT4_I(inode)->i_data_sem);
+ }
+
+ return error;
+ }
+
++EXPORT_SYMBOL(ext4_ext_store_pblock);
++EXPORT_SYMBOL(ext4_ext_search_right);
++EXPORT_SYMBOL(ext4_ext_search_left);
++EXPORT_SYMBOL(ext_pblock);
++EXPORT_SYMBOL(ext4_ext_insert_extent);
++EXPORT_SYMBOL(ext4_mb_new_blocks);
++EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert);
++EXPORT_SYMBOL(ext4_mark_inode_dirty);
++
--- /dev/null
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c 2011-03-03 15:25:02.376539424 +0800
++++ linux-stage/fs/ext4/super.c 2011-03-05 12:24:02.918774335 +0800
+@@ -40,6 +40,8 @@
+ #include <linux/log2.h>
+ #include <linux/crc16.h>
+ #include <asm/uaccess.h>
++#include <linux/kthread.h>
++#include <linux/utsname.h>
+
+ #include "ext4.h"
+ #include "ext4_jbd2.h"
+@@ -700,6 +702,8 @@
+ invalidate_bdev(sbi->journal_bdev);
+ ext4_blkdev_remove(sbi);
+ }
++ if (sbi->s_mmp_tsk)
++ kthread_stop(sbi->s_mmp_tsk);
+ sb->s_fs_info = NULL;
+ /*
+ * Now that we are completely done shutting down the
+@@ -970,6 +974,344 @@
+ return 0;
+ }
+
++/*
++ * Write the MMP block using WRITE_SYNC to try to get the block on-disk
++ * faster.
++ */
++static int write_mmp_block(struct buffer_head *bh)
++{
++ mark_buffer_dirty(bh);
++ lock_buffer(bh);
++ bh->b_end_io = end_buffer_write_sync;
++ get_bh(bh);
++ submit_bh(WRITE_SYNC, bh);
++ wait_on_buffer(bh);
++ if (unlikely(!buffer_uptodate(bh)))
++ return 1;
++
++ return 0;
++}
++
++/*
++ * Read the MMP block. It _must_ be read from disk and hence we clear the
++ * uptodate flag on the buffer.
++ */
++static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
++ unsigned long mmp_block)
++{
++ struct mmp_struct *mmp;
++
++ if (*bh)
++ clear_buffer_uptodate(*bh);
++
++#if 0
++ brelse(*bh);
++
++ *bh = sb_bread(sb, mmp_block);
++#else
++ if (!*bh)
++ *bh = sb_getblk(sb, mmp_block);
++ if (*bh) {
++ get_bh(*bh);
++ lock_buffer(*bh);
++ (*bh)->b_end_io = end_buffer_read_sync;
++ submit_bh(READ_SYNC, *bh);
++ wait_on_buffer(*bh);
++ if (!buffer_uptodate(*bh)) {
++ brelse(*bh);
++ *bh = NULL;
++ }
++ }
++#endif
++ if (!*bh) {
++ ext4_warning(sb,
++ "Error while reading MMP block %lu", mmp_block);
++ return -EIO;
++ }
++
++ mmp = (struct mmp_struct *)((*bh)->b_data);
++ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
++ return -EINVAL;
++
++ return 0;
++}
++
++/*
++ * Dump as much information as possible to help the admin.
++ */
++static void dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
++ const char *function, const char *msg)
++{
++ __ext4_warning(sb, function, msg);
++ __ext4_warning(sb, function, "MMP failure info: last update time: %llu, "
++ "last update node: %s, last update device: %s\n",
++ (long long unsigned int)le64_to_cpu(mmp->mmp_time),
++ mmp->mmp_nodename, mmp->mmp_bdevname);
++}
++
++/*
++ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
++ */
++static int kmmpd(void *data)
++{
++ struct super_block *sb = (struct super_block *) data;
++ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
++ struct buffer_head *bh = NULL;
++ struct mmp_struct *mmp;
++ unsigned long mmp_block;
++ u32 seq = 0;
++ unsigned long failed_writes = 0;
++ int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
++ unsigned mmp_check_interval;
++ unsigned long last_update_time;
++ unsigned long diff;
++ int retval;
++
++ mmp_block = le64_to_cpu(es->s_mmp_block);
++ retval = read_mmp_block(sb, &bh, mmp_block);
++ if (retval)
++ goto failed;
++
++ mmp = (struct mmp_struct *)(bh->b_data);
++ mmp->mmp_time = cpu_to_le64(get_seconds());
++ /*
++ * Start with the higher mmp_check_interval and reduce it if
++ * the MMP block is being updated on time.
++ */
++ mmp_check_interval = max(5 * mmp_update_interval,
++ EXT4_MMP_MIN_CHECK_INTERVAL);
++ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
++ bdevname(bh->b_bdev, mmp->mmp_bdevname);
++
++ memcpy(mmp->mmp_nodename, init_utsname()->sysname,
++ sizeof(mmp->mmp_nodename));
++
++ while (!kthread_should_stop()) {
++ if (++seq > EXT4_MMP_SEQ_MAX)
++ seq = 1;
++
++ mmp->mmp_seq = cpu_to_le32(seq);
++ mmp->mmp_time = cpu_to_le64(get_seconds());
++ last_update_time = jiffies;
++
++ retval = write_mmp_block(bh);
++ /*
++ * Don't spew too many error messages. Print one every
++ * (s_mmp_update_interval * 60) seconds.
++ */
++ if (retval && (failed_writes % 60) == 0) {
++ ext4_error(sb,
++ "Error writing to MMP block");
++ failed_writes++;
++ }
++
++ if (!(le32_to_cpu(es->s_feature_incompat) &
++ EXT4_FEATURE_INCOMPAT_MMP)) {
++ ext4_warning(sb, "kmmpd being stopped "
++ "since MMP feature has been disabled.");
++ EXT4_SB(sb)->s_mmp_tsk = 0;
++ goto failed;
++ }
++
++ if (sb->s_flags & MS_RDONLY) {
++ ext4_warning(sb, "kmmpd being stopped "
++ "since filesystem has been remounted as "
++ "readonly.");
++ EXT4_SB(sb)->s_mmp_tsk = 0;
++ goto failed;
++ }
++
++ diff = jiffies - last_update_time;
++ if (diff < mmp_update_interval * HZ)
++ schedule_timeout_interruptible(mmp_update_interval *
++ HZ - diff);
++
++ /*
++ * We need to make sure that more than mmp_check_interval
++ * seconds have not passed since writing. If that has happened
++ * we need to check if the MMP block is as we left it.
++ */
++ diff = jiffies - last_update_time;
++ if (diff > mmp_check_interval * HZ) {
++ struct buffer_head *bh_check = NULL;
++ struct mmp_struct *mmp_check;
++
++ retval = read_mmp_block(sb, &bh_check, mmp_block);
++ if (retval) {
++ EXT4_SB(sb)->s_mmp_tsk = 0;
++ goto failed;
++ }
++
++ mmp_check = (struct mmp_struct *)(bh_check->b_data);
++ if (mmp->mmp_time != mmp_check->mmp_time ||
++ memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
++ sizeof(mmp->mmp_nodename)))
++ dump_mmp_msg(sb, mmp_check, __func__,
++ "Error while updating MMP info. "
++ "The filesystem seems to have "
++ "been multiply mounted.");
++
++ put_bh(bh_check);
++ }
++
++ /*
++ * Adjust the mmp_check_interval depending on how much time
++ * it took for the MMP block to be written.
++ */
++ mmp_check_interval = max(5 * diff / HZ,
++ (unsigned long) EXT4_MMP_MIN_CHECK_INTERVAL);
++ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
++ }
++
++ /*
++ * Unmount seems to be clean.
++ */
++ mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
++ mmp->mmp_time = cpu_to_le64(get_seconds());
++
++ retval = write_mmp_block(bh);
++
++failed:
++ brelse(bh);
++ return retval;
++}
++
++/*
++ * Get a random new sequence number but make sure it is not greater than
++ * EXT4_MMP_SEQ_MAX.
++ */
++static unsigned int mmp_new_seq(void)
++{
++ u32 new_seq;
++
++ do {
++ get_random_bytes(&new_seq, sizeof(u32));
++ } while (new_seq > EXT4_MMP_SEQ_MAX);
++
++ return new_seq;
++}
++
++/*
++ * Protect the filesystem from being mounted more than once.
++ */
++static int ext4_multi_mount_protect(struct super_block *sb,
++ unsigned long mmp_block)
++{
++ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
++ struct buffer_head *bh = NULL;
++ struct mmp_struct *mmp = NULL;
++ u32 seq;
++ unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
++ unsigned int wait_time = 0;
++ int retval;
++
++ if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
++ mmp_block >= ext4_blocks_count(es)) {
++ ext4_warning(sb,
++ "Invalid MMP block in superblock");
++ goto failed;
++ }
++
++ retval = read_mmp_block(sb, &bh, mmp_block);
++ if (retval)
++ goto failed;
++
++ mmp = (struct mmp_struct *)(bh->b_data);
++
++ if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
++ mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
++
++ /*
++ * If check_interval in MMP block is larger, use that instead of
++ * update_interval from the superblock.
++ */
++ if (mmp->mmp_check_interval > mmp_check_interval)
++ mmp_check_interval = mmp->mmp_check_interval;
++
++ seq = le32_to_cpu(mmp->mmp_seq);
++ if (seq == EXT4_MMP_SEQ_CLEAN)
++ goto skip;
++
++ if (seq == EXT4_MMP_SEQ_FSCK) {
++ dump_mmp_msg(sb, mmp, __func__,
++ "fsck is running on the filesystem");
++ goto failed;
++ }
++
++ wait_time = min(mmp_check_interval * 2 + 1,
++ mmp_check_interval + 60);
++
++ /* Print MMP interval if more than 20 secs. */
++ if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
++ ext4_warning(sb, "MMP interval %u higher than "
++ "expected, please wait.\n", wait_time * 2);
++
++ if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
++ ext4_warning(sb, "MMP startup interrupted, failing "
++ "mount\n");
++ goto failed;
++ }
++
++ retval = read_mmp_block(sb, &bh, mmp_block);
++ if (retval)
++ goto failed;
++ mmp = (struct mmp_struct *)(bh->b_data);
++ if (seq != le32_to_cpu(mmp->mmp_seq)) {
++ dump_mmp_msg(sb, mmp, __func__,
++ "Device is already active on another node.");
++ goto failed;
++ }
++
++skip:
++ /*
++ * write a new random sequence number.
++ */
++ mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
++
++ retval = write_mmp_block(bh);
++ if (retval)
++ goto failed;
++
++ /*
++ * wait for MMP interval and check mmp_seq.
++ */
++ if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
++ ext4_warning(sb, "MMP startup interrupted, failing "
++ "mount\n");
++ goto failed;
++ }
++
++ retval = read_mmp_block(sb, &bh, mmp_block);
++ if (retval)
++ goto failed;
++ mmp = (struct mmp_struct *)(bh->b_data);
++ if (seq != le32_to_cpu(mmp->mmp_seq)) {
++ dump_mmp_msg(sb, mmp, __func__,
++ "Device is already active on another node.");
++ goto failed;
++ }
++
++ /*
++ * Start a kernel thread to update the MMP block periodically.
++ */
++ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%02x:%02x",
++ MAJOR(sb->s_dev),
++ MINOR(sb->s_dev));
++ if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
++ EXT4_SB(sb)->s_mmp_tsk = 0;
++ ext4_warning(sb, "Unable to create kmmpd thread "
++ "for %s.", sb->s_id);
++ goto failed;
++ }
++
++ brelse(bh);
++ return 0;
++
++failed:
++ brelse(bh);
++ return 1;
++}
++
+ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
+ u64 ino, u32 generation)
+ {
+@@ -2816,6 +3158,11 @@
+ EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_RECOVER));
+
++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
++ !(sb->s_flags & MS_RDONLY))
++ if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
++ goto failed_mount3;
++
+ /*
+ * The first inode we look at is the journal inode. Don't try
+ * root first: it may be modified in the journal!
+@@ -3052,6 +3399,8 @@
+ percpu_counter_destroy(&sbi->s_freeinodes_counter);
+ percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
++ if (sbi->s_mmp_tsk)
++ kthread_stop(sbi->s_mmp_tsk);
+ failed_mount2:
+ for (i = 0; i < db_count; i++)
+ brelse(sbi->s_group_desc[i]);
+@@ -3560,7 +3909,7 @@
+ struct ext4_mount_options old_opts;
+ ext4_group_t g;
+ unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+- int err;
++ int err = 0;
+ #ifdef CONFIG_QUOTA
+ int i;
+ #endif
+@@ -3682,6 +4031,13 @@
+ goto restore_opts;
+ if (!ext4_setup_super(sb, es, 0))
+ sb->s_flags &= ~MS_RDONLY;
++ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
++ EXT4_FEATURE_INCOMPAT_MMP))
++ if (ext4_multi_mount_protect(sb,
++ le64_to_cpu(es->s_mmp_block))) {
++ err = -EROFS;
++ goto restore_opts;
++ }
+ }
+ }
+ ext4_setup_system_zone(sb);
+Index: linux-stage/fs/ext4/ext4.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4.h 2011-03-03 15:25:02.507538421 +0800
++++ linux-stage/fs/ext4/ext4.h 2011-03-05 12:25:16.343986732 +0800
+@@ -894,7 +894,7 @@
+ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */
+ __le32 s_flags; /* Miscellaneous flags */
+ __le16 s_raid_stride; /* RAID stride */
+- __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
++ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */
+ __le64 s_mmp_block; /* Block for multi-mount protection */
+ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+@@ -1041,6 +1041,9 @@
+
+ /* workqueue for dio unwritten */
+ struct workqueue_struct *dio_unwritten_wq;
++
++ /* Kernel thread for multiple mount protection */
++ struct task_struct *s_mmp_tsk;
+ };
+
+ static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
+@@ -1177,7 +1180,8 @@
+ EXT4_FEATURE_INCOMPAT_META_BG| \
+ EXT4_FEATURE_INCOMPAT_EXTENTS| \
+ EXT4_FEATURE_INCOMPAT_64BIT| \
+- EXT4_FEATURE_INCOMPAT_FLEX_BG)
++ EXT4_FEATURE_INCOMPAT_FLEX_BG| \
++ EXT4_FEATURE_INCOMPAT_MMP)
+ #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
+@@ -1384,6 +1388,34 @@
+ extern struct proc_dir_entry *ext4_proc_root;
+
+ /*
++ * This structure will be used for multiple mount protection. It will be
++ * written into the block number saved in the s_mmp_block field in the
++ * superblock. Programs that check MMP should assume that if
++ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
++ * to use the filesystem, regardless of how old the timestamp is.
++ */
++#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */
++#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
++#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */
++#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */
++
++struct mmp_struct {
++ __le32 mmp_magic;
++ __le32 mmp_seq;
++ __le64 mmp_time;
++ char mmp_nodename[64];
++ char mmp_bdevname[32];
++ __le16 mmp_check_interval;
++ __le16 mmp_pad1;
++ __le32 mmp_pad2[227];
++};
++
++/*
++ * Minimum interval for MMP checking in seconds.
++ */
++#define EXT4_MMP_MIN_CHECK_INTERVAL 5
++
++/*
+ * Function prototypes
+ */
+
--- /dev/null
+diff -rupN 2.6.27.21_2/fs/ext4/ext4.h 2.6.27.21_3/fs/ext4/ext4.h
+--- 2.6.27.21_2/fs/ext4/ext4.h 2009-07-17 12:19:59.000000000 +0530
++++ 2.6.27.21_3/fs/ext4/ext4.h 2009-07-17 12:38:59.000000000 +0530
+@@ -1181,6 +1181,9 @@ extern int ext4_orphan_add(handle_t *, s
+ #define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir)
+ extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
+ struct inode *inode);
++extern struct buffer_head *ext4_append(handle_t *handle,
++ struct inode *inode,
++ ext4_lblk_t *block, int *err);
+
+ /* resize.c */
+ extern int ext4_group_add(struct super_block *sb,
+diff -rupN 2.6.27.21_2/fs/ext4/hash.c 2.6.27.21_3/fs/ext4/hash.c
+--- 2.6.27.21_2/fs/ext4/hash.c 2009-07-17 12:12:56.000000000 +0530
++++ 2.6.27.21_3/fs/ext4/hash.c 2009-07-17 12:40:22.000000000 +0530
+@@ -9,6 +9,7 @@
+ * License.
+ */
+
++#include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/jbd2.h>
+ #include <linux/cryptohash.h>
+@@ -206,3 +207,4 @@ int ext4fs_dirhash(const char *name, int
+ hinfo->minor_hash = minor_hash;
+ return 0;
+ }
++EXPORT_SYMBOL(ext4fs_dirhash);
+diff -rupN 2.6.27.21_2/fs/ext4/namei.c 2.6.27.21_3/fs/ext4/namei.c
+--- 2.6.27.21_2/fs/ext4/namei.c 2009-07-17 12:23:51.000000000 +0530
++++ 2.6.27.21_3/fs/ext4/namei.c 2009-07-17 12:37:59.000000000 +0530
+@@ -51,9 +51,9 @@
+ #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
+
+-static struct buffer_head *ext4_append(handle_t *handle,
+- struct inode *inode,
+- ext4_lblk_t *block, int *err)
++struct buffer_head *ext4_append(handle_t *handle,
++ struct inode *inode,
++ ext4_lblk_t *block, int *err)
+ {
+ struct buffer_head *bh;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+@@ -72,6 +72,7 @@ static struct buffer_head *ext4_append(h
+ up(&ei->i_append_sem);
+ return bh;
+ }
++EXPORT_SYMBOL(ext4_append);
+
+ #ifndef assert
+ #define assert(test) J_ASSERT(test)
+diff -rupN 2.6.27.21_2/fs/ext4/super.c 2.6.27.21_3/fs/ext4/super.c
+--- 2.6.27.21_2/fs/ext4/super.c 2009-07-17 12:12:57.000000000 +0530
++++ 2.6.27.21_3/fs/ext4/super.c 2009-07-17 12:40:52.000000000 +0530
+@@ -377,6 +377,7 @@ void __ext4_std_error(struct super_block
+
+ ext4_handle_error(sb);
+ }
++EXPORT_SYMBOL(__ext4_std_error);
+
+ /*
+ * ext4_abort is a much stronger failure handler than ext4_error. The
--- /dev/null
+Index: linux-2.6.32.i386/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.32.i386.orig/fs/ext4/ext4.h 2010-04-16 04:57:39.000000000 +0530
++++ linux-2.6.32.i386/fs/ext4/ext4.h 2010-04-16 05:27:02.000000000 +0530
+@@ -1512,6 +1512,19 @@
+ extern int ext4_orphan_del(handle_t *, struct inode *);
+ extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+ __u32 start_minor_hash, __u32 *next_hash);
++extern struct inode *ext4_create_inode(handle_t *handle,
++ struct inode * dir, int mode);
++extern int ext4_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode);
++extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
++ struct ext4_dir_entry_2 * de_del,
++ struct buffer_head * bh);
++extern struct buffer_head * ext4_find_entry(struct inode *dir,
++ const struct qstr *d_name,
++ struct ext4_dir_entry_2 ** res_dir);
++#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir)
++extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
++ struct inode *inode);
+
+ /* resize.c */
+ extern int ext4_group_add(struct super_block *sb,
+Index: linux-2.6.32.i386/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.32.i386.orig/fs/ext4/namei.c 2010-04-16 04:57:39.000000000 +0530
++++ linux-2.6.32.i386/fs/ext4/namei.c 2010-04-16 05:28:25.000000000 +0530
+@@ -24,6 +24,7 @@
+ * Theodore Ts'o, 2002
+ */
+
++#include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/pagemap.h>
+ #include <linux/jbd2.h>
+@@ -902,9 +903,9 @@
+ * The returned buffer_head has ->b_count elevated. The caller is expected
+ * to brelse() it when appropriate.
+ */
+-static struct buffer_head * ext4_find_entry (struct inode *dir,
+- const struct qstr *d_name,
+- struct ext4_dir_entry_2 ** res_dir)
++struct buffer_head * ext4_find_entry(struct inode *dir,
++ const struct qstr *d_name,
++ struct ext4_dir_entry_2 ** res_dir)
+ {
+ struct super_block *sb;
+ struct buffer_head *bh_use[NAMEI_RA_SIZE];
+@@ -1011,6 +1012,7 @@
+ brelse(bh_use[ra_ptr]);
+ return ret;
+ }
++EXPORT_SYMBOL(ext4_find_entry);
+
+ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir, int *err)
+@@ -1538,8 +1540,8 @@
+ * may not sleep between calling this and putting something into
+ * the entry, as someone else might have used it while you slept.
+ */
+-static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+- struct inode *inode)
++int ext4_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode)
+ {
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct buffer_head *bh;
+@@ -1588,6 +1590,7 @@
+ brelse(bh);
+ return retval;
+ }
++EXPORT_SYMBOL(ext4_add_entry);
+
+ /*
+ * Returns 0 for success, or a negative error value
+@@ -1728,10 +1731,10 @@
+ * ext4_delete_entry deletes a directory entry by merging it with the
+ * previous entry
+ */
+-static int ext4_delete_entry(handle_t *handle,
+- struct inode *dir,
+- struct ext4_dir_entry_2 *de_del,
+- struct buffer_head *bh)
++int ext4_delete_entry(handle_t *handle,
++ struct inode *dir,
++ struct ext4_dir_entry_2 *de_del,
++ struct buffer_head *bh)
+ {
+ struct ext4_dir_entry_2 *de, *pde;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
+@@ -1766,7 +1769,7 @@
+ }
+ return -ENOENT;
+ }
+-
++EXPORT_SYMBOL(ext4_delete_entry);
+ /*
+ * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
+ * since this indicates that nlinks count was previously 1.
+@@ -1831,6 +1834,26 @@
+ return inum;
+ }
+
++struct inode * ext4_create_inode(handle_t *handle, struct inode * dir, int mode)
++{
++ struct inode *inode;
++
++ inode = ext4_new_inode(handle, dir, mode, 0, 0);
++ if (!IS_ERR(inode)) {
++ if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) {
++#ifdef CONFIG_LDISKFS_FS_XATTR
++ inode->i_op = &ext4_special_inode_operations;
++#endif
++ } else {
++ inode->i_op = &ext4_file_inode_operations;
++ inode->i_fop = &ext4_file_operations;
++ ext4_set_aops(inode);
++ }
++ }
++ return inode;
++}
++EXPORT_SYMBOL(ext4_create_inode);
++
+ /*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+@@ -1905,40 +1928,33 @@
+ return err;
+ }
+
+-static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
++/* Initialize @inode as a subdirectory of @dir, and add the
++ * "." and ".." entries into the first directory block. */
++int ext4_add_dot_dotdot(handle_t *handle, struct inode * dir,
++ struct inode *inode)
+ {
+- handle_t *handle;
+- struct inode *inode;
+- struct buffer_head *dir_block;
+- struct ext4_dir_entry_2 *de;
++ struct buffer_head * dir_block;
++ struct ext4_dir_entry_2 * de;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
+- int err, retries = 0;
+-
+- if (EXT4_DIR_LINK_MAX(dir))
+- return -EMLINK;
++ int err = 0;
+
+-retry:
+- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
+- inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
+- &dentry->d_name, 0);
+- err = PTR_ERR(inode);
+- if (IS_ERR(inode))
+- goto out_stop;
+
+ inode->i_op = &ext4_dir_inode_operations;
+ inode->i_fop = &ext4_dir_operations;
+ inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+ dir_block = ext4_bread(handle, inode, 0, 1, &err);
+- if (!dir_block)
+- goto out_clear_inode;
++ if (!dir_block) {
++ clear_nlink(inode);
++ ext4_mark_inode_dirty(handle, inode);
++ iput (inode);
++ goto get_out;
++ }
+ BUFFER_TRACE(dir_block, "get_write_access");
+ ext4_journal_get_write_access(handle, dir_block);
+ de = (struct ext4_dir_entry_2 *) dir_block->b_data;
+@@ -1960,9 +1976,43 @@
+ ext4_handle_dirty_metadata(handle, dir, dir_block);
+ brelse(dir_block);
+ ext4_mark_inode_dirty(handle, inode);
++get_out:
++ return err;
++}
++EXPORT_SYMBOL(ext4_add_dot_dotdot);
++
++
++static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
++{
++ handle_t *handle;
++ struct inode *inode;
++ int err, retries = 0;
++
++ if (EXT4_DIR_LINK_MAX(dir))
++ return -EMLINK;
++
++retry:
++ handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
++ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
++ 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ if (IS_DIRSYNC(dir))
++ handle->h_sync = 1;
++
++ inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
++ &dentry->d_name, ext4_dentry_goal(dir->i_sb, dentry));
++ err = PTR_ERR(inode);
++ if (IS_ERR(inode))
++ goto out_stop;
++
++ err = ext4_add_dot_dotdot(handle, dir, inode);
++ if (err)
++ goto out_stop;
++
+ err = ext4_add_entry(handle, dentry, inode);
+ if (err) {
+-out_clear_inode:
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+ ext4_mark_inode_dirty(handle, inode);
--- /dev/null
+Index: linux-2.6.32.i386/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.32.i386.orig/fs/ext4/ext4.h 2010-04-16 03:39:11.000000000 +0530
++++ linux-2.6.32.i386/fs/ext4/ext4.h 2010-04-16 04:27:41.000000000 +0530
+@@ -29,6 +29,7 @@
+ #ifndef _EXT4_H
+ #define _EXT4_H
+
++#include <linux/dynlocks.h>
+ #include <linux/types.h>
+ #include <linux/blkdev.h>
+ #include <linux/magic.h>
+@@ -621,6 +622,10 @@
+ ext4_fsblk_t i_file_acl;
+ __u32 i_dtime;
+
++ /* following fields for parallel directory operations -bzzz */
++ struct dynlock i_htree_lock;
++ struct semaphore i_append_sem;
++
+ /*
+ * i_block_group is the number of the block group which contains
+ * this file's inode. Constant across the lifetime of the inode,
+Index: linux-2.6.32.i386/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.32.i386.orig/fs/ext4/namei.c 2010-04-15 07:42:15.000000000 +0530
++++ linux-2.6.32.i386/fs/ext4/namei.c 2010-04-16 04:26:03.000000000 +0530
+@@ -54,6 +54,11 @@
+ ext4_lblk_t *block, int *err)
+ {
+ struct buffer_head *bh;
++ struct ext4_inode_info *ei = EXT4_I(inode);
++
++ /* with parallel dir operations all appends
++ * have to be serialized -bzzz */
++ down(&ei->i_append_sem);
+
+ *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+
+@@ -66,7 +71,9 @@
+ brelse(bh);
+ bh = NULL;
+ }
++ ei->i_disksize = inode->i_size;
+ }
++ up(&ei->i_append_sem);
+ return bh;
+ }
+
+Index: linux-2.6.32.i386/fs/ext4/super.c
+===================================================================
+--- linux-2.6.32.i386.orig/fs/ext4/super.c 2010-04-16 03:39:11.000000000 +0530
++++ linux-2.6.32.i386/fs/ext4/super.c 2010-04-16 04:26:03.000000000 +0530
+@@ -700,6 +700,8 @@
+
+ ei->vfs_inode.i_version = 1;
+ ei->vfs_inode.i_data.writeback_index = 0;
++ dynlock_init(&ei->i_htree_lock);
++ sema_init(&ei->i_append_sem, 1);
+ memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
+ INIT_LIST_HEAD(&ei->i_prealloc_list);
+ spin_lock_init(&ei->i_prealloc_lock);
--- /dev/null
+Index: linux-stage/fs/ext4/ext4.h
+===================================================================
+--- linux-stage.orig/fs/ext4/ext4.h 2011-03-11 14:17:02.000000000 +0800
++++ linux-stage/fs/ext4/ext4.h 2011-03-11 14:20:08.269063193 +0800
+@@ -999,11 +999,14 @@
+
+ /* tunables */
+ unsigned long s_stripe;
+- unsigned int s_mb_stream_request;
++ unsigned long s_mb_small_req;
++ unsigned long s_mb_large_req;
+ unsigned int s_mb_max_to_scan;
+ unsigned int s_mb_min_to_scan;
+ unsigned int s_mb_stats;
+ unsigned int s_mb_order2_reqs;
++ unsigned long *s_mb_prealloc_table;
++ unsigned long s_mb_prealloc_table_size;
+ unsigned int s_mb_group_prealloc;
+ unsigned int s_max_writeback_mb_bump;
+ /* where last allocation was done - for stream allocation */
+Index: linux-stage/fs/ext4/mballoc.c
+===================================================================
+--- linux-stage.orig/fs/ext4/mballoc.c 2011-03-11 14:03:32.000000000 +0800
++++ linux-stage/fs/ext4/mballoc.c 2011-03-11 14:44:49.106543493 +0800
+@@ -1823,6 +1823,26 @@
+ }
+ }
+
++static void ext4_mb_prealloc_table_add(struct ext4_sb_info *sbi, int value)
++{
++ int i;
++
++ if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group))
++ return;
++
++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) {
++ if (sbi->s_mb_prealloc_table[i] == 0) {
++ sbi->s_mb_prealloc_table[i] = value;
++ return;
++ }
++
++ /* they should add values in order */
++ if (value <= sbi->s_mb_prealloc_table[i])
++ return;
++ }
++}
++
++
+ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
+ ext4_group_t group, int cr)
+ {
+@@ -2173,6 +2193,80 @@
+ .show = ext4_mb_seq_groups_show,
+ };
+
++#define EXT4_MB_PREALLOC_TABLE "prealloc_table"
++
++static int ext4_mb_prealloc_table_proc_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{
++ struct ext4_sb_info *sbi = data;
++ int len = 0;
++ int i;
++
++ *eof = 1;
++ if (off != 0)
++ return 0;
++
++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++)
++ len += sprintf(page + len, "%ld ",
++ sbi->s_mb_prealloc_table[i]);
++ len += sprintf(page + len, "\n");
++
++ *start = page;
++ return len;
++}
++
++static int ext4_mb_prealloc_table_proc_write(struct file *file,
++ const char __user *buf,
++ unsigned long cnt, void *data)
++{
++ struct ext4_sb_info *sbi = data;
++ unsigned long value;
++ unsigned long prev = 0;
++ char str[128];
++ char *cur;
++ char *end;
++ unsigned long *new_table;
++ int num = 0;
++ int i = 0;
++
++ if (cnt >= sizeof(str))
++ return -EINVAL;
++ if (copy_from_user(str, buf, cnt))
++ return -EFAULT;
++
++ num = 0;
++ cur = str;
++ end = str + cnt;
++ while (cur < end) {
++ while ((cur < end) && (*cur == ' ')) cur++;
++ value = simple_strtol(cur, &cur, 0);
++ if (value == 0)
++ break;
++ if (value <= prev)
++ return -EINVAL;
++ prev = value;
++ num++;
++ }
++
++ new_table = kmalloc(num * sizeof(*new_table), GFP_KERNEL);
++ if (new_table == NULL)
++ return -ENOMEM;
++ kfree(sbi->s_mb_prealloc_table);
++ memset(new_table, 0, num * sizeof(*new_table));
++ sbi->s_mb_prealloc_table = new_table;
++ sbi->s_mb_prealloc_table_size = num;
++ cur = str;
++ end = str + cnt;
++ while (cur < end && i < num) {
++ while ((cur < end) && (*cur == ' ')) cur++;
++ value = simple_strtol(cur, &cur, 0);
++ ext4_mb_prealloc_table_add(sbi, value);
++ i++;
++ }
++
++ return cnt;
++}
++
+ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
+ {
+ struct super_block *sb = PDE(inode)->data;
+@@ -2411,12 +2505,56 @@
+ sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
+ sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
+ sbi->s_mb_stats = MB_DEFAULT_STATS;
+- sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
+ sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+- sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
++
++ if (sbi->s_stripe == 0) {
++ sbi->s_mb_prealloc_table_size = 10;
++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long);
++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS);
++ if (sbi->s_mb_prealloc_table == NULL) {
++ kfree(sbi->s_mb_offsets);
++ kfree(sbi->s_mb_maxs);
++ return -ENOMEM;
++ }
++ memset(sbi->s_mb_prealloc_table, 0, i);
++
++ ext4_mb_prealloc_table_add(sbi, 4);
++ ext4_mb_prealloc_table_add(sbi, 8);
++ ext4_mb_prealloc_table_add(sbi, 16);
++ ext4_mb_prealloc_table_add(sbi, 32);
++ ext4_mb_prealloc_table_add(sbi, 64);
++ ext4_mb_prealloc_table_add(sbi, 128);
++ ext4_mb_prealloc_table_add(sbi, 256);
++ ext4_mb_prealloc_table_add(sbi, 512);
++ ext4_mb_prealloc_table_add(sbi, 1024);
++ ext4_mb_prealloc_table_add(sbi, 2048);
++
++ sbi->s_mb_small_req = 256;
++ sbi->s_mb_large_req = 1024;
++ sbi->s_mb_group_prealloc = 512;
++ } else {
++ sbi->s_mb_prealloc_table_size = 3;
++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long);
++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS);
++ if (sbi->s_mb_prealloc_table == NULL) {
++ kfree(sbi->s_mb_offsets);
++ kfree(sbi->s_mb_maxs);
++ return -ENOMEM;
++ }
++ memset(sbi->s_mb_prealloc_table, 0, i);
++
++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe);
++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 2);
++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 4);
++
++ sbi->s_mb_small_req = sbi->s_stripe;
++ sbi->s_mb_large_req = sbi->s_stripe * 8;
++ sbi->s_mb_group_prealloc = sbi->s_stripe * 4;
++ }
+
+ sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
+ if (sbi->s_locality_groups == NULL) {
++ kfree(sbi->s_mb_prealloc_table);
+ kfree(sbi->s_mb_offsets);
+ kfree(sbi->s_mb_maxs);
+ return -ENOMEM;
+@@ -2430,9 +2568,18 @@
+ spin_lock_init(&lg->lg_prealloc_lock);
+ }
+
+- if (sbi->s_proc)
++ if (sbi->s_proc) {
++ struct proc_dir_entry *p;
+ proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
+ &ext4_mb_seq_groups_fops, sb);
++ p = create_proc_entry(EXT4_MB_PREALLOC_TABLE, S_IFREG |
++ S_IRUGO | S_IWUSR, sbi->s_proc);
++ if (p) {
++ p->data = sbi;
++ p->read_proc = ext4_mb_prealloc_table_proc_read;
++ p->write_proc = ext4_mb_prealloc_table_proc_write;
++ }
++ }
+
+ if (sbi->s_journal)
+ sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+@@ -2512,8 +2659,10 @@
+ }
+
+ free_percpu(sbi->s_locality_groups);
+- if (sbi->s_proc)
++ if (sbi->s_proc) {
+ remove_proc_entry("mb_groups", sbi->s_proc);
++ remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc);
++ }
+
+ return 0;
+ }
+@@ -2807,11 +2956,12 @@
+ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
+ struct ext4_allocation_request *ar)
+ {
+- int bsbits, max;
++ int bsbits, i, wind;
+ ext4_lblk_t end;
+- loff_t size, orig_size, start_off;
++ loff_t size, orig_size;
+ ext4_lblk_t start, orig_start;
+ struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
++ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_prealloc_space *pa;
+
+ /* do normalize only data requests, metadata requests
+@@ -2841,49 +2991,35 @@
+ size = size << bsbits;
+ if (size < i_size_read(ac->ac_inode))
+ size = i_size_read(ac->ac_inode);
++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits;
+
+- /* max size of free chunks */
+- max = 2 << bsbits;
++ start = wind = 0;
+
+-#define NRL_CHECK_SIZE(req, size, max, chunk_size) \
+- (req <= (size) || max <= (chunk_size))
++ /* let's choose preallocation window depending on file size */
++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) {
++ if (size <= sbi->s_mb_prealloc_table[i]) {
++ wind = sbi->s_mb_prealloc_table[i];
++ break;
++ }
++ }
++ size = wind;
+
+- /* first, try to predict filesize */
+- /* XXX: should this table be tunable? */
+- start_off = 0;
+- if (size <= 16 * 1024) {
+- size = 16 * 1024;
+- } else if (size <= 32 * 1024) {
+- size = 32 * 1024;
+- } else if (size <= 64 * 1024) {
+- size = 64 * 1024;
+- } else if (size <= 128 * 1024) {
+- size = 128 * 1024;
+- } else if (size <= 256 * 1024) {
+- size = 256 * 1024;
+- } else if (size <= 512 * 1024) {
+- size = 512 * 1024;
+- } else if (size <= 1024 * 1024) {
+- size = 1024 * 1024;
+- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
+- start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+- (21 - bsbits)) << 21;
+- size = 2 * 1024 * 1024;
+- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
+- start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+- (22 - bsbits)) << 22;
+- size = 4 * 1024 * 1024;
+- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
+- (8<<20)>>bsbits, max, 8 * 1024)) {
+- start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+- (23 - bsbits)) << 23;
+- size = 8 * 1024 * 1024;
+- } else {
+- start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
+- size = ac->ac_o_ex.fe_len << bsbits;
++ if (wind == 0) {
++ __u64 tstart, tend;
++ /* file is quite large, we now preallocate with
++ * the biggest configured window with regard to
++ * logical offset */
++ wind = sbi->s_mb_prealloc_table[i - 1];
++ tstart = ac->ac_o_ex.fe_logical;
++ do_div(tstart, wind);
++ start = tstart * wind;
++ tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1;
++ do_div(tend, wind);
++ tend = tend * wind + wind;
++ size = tend - start;
+ }
+- orig_size = size = size >> bsbits;
+- orig_start = start = start_off >> bsbits;
++ orig_size = size;
++ orig_start = start;
+
+ /* don't cover already allocated blocks in selected range */
+ if (ar->pleft && start <= ar->lleft) {
+@@ -2955,7 +3091,6 @@
+ }
+ BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
+ start > ac->ac_o_ex.fe_logical);
+- BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+
+ /* now prepare goal request */
+
+@@ -3939,11 +4074,19 @@
+
+ /* don't use group allocation for large files */
+ size = max(size, isize);
+- if (size > sbi->s_mb_stream_request) {
++ if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) ||
++ (size >= sbi->s_mb_large_req)) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ return;
+ }
+
++ /*
++ * request is so large that we don't care about
++ * streaming - it overweights any possible seek
++ */
++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req)
++ return;
++
+ BUG_ON(ac->ac_lg != NULL);
+ /*
+ * locality group prealloc space are per cpu. The reason for having
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c 2011-03-11 14:16:56.000000000 +0800
++++ linux-stage/fs/ext4/super.c 2011-03-11 14:19:24.664467626 +0800
+@@ -2632,7 +2632,8 @@
+ EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+-EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
++EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req);
++EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req);
+ EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+ EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+
+@@ -2647,7 +2648,8 @@
+ ATTR_LIST(mb_max_to_scan),
+ ATTR_LIST(mb_min_to_scan),
+ ATTR_LIST(mb_order2_req),
+- ATTR_LIST(mb_stream_req),
++ ATTR_LIST(mb_small_req),
++ ATTR_LIST(mb_large_req),
+ ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(max_writeback_mb_bump),
+ NULL,
--- /dev/null
+Index: linux-stage/fs/ext4/namei.c
+===================================================================
+--- linux-stage.orig/fs/ext4/namei.c
++++ linux-stage/fs/ext4/namei.c
+@@ -371,8 +371,8 @@ dx_probe(const struct qstr *d_name, stru
+ if (root->info.hash_version != DX_HASH_TEA &&
+ root->info.hash_version != DX_HASH_HALF_MD4 &&
+ root->info.hash_version != DX_HASH_LEGACY) {
+- ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
+- root->info.hash_version);
++ ext4_warning(dir->i_sb, "Unrecognised inode hash code %d for directory "
++ "#%lu", root->info.hash_version, dir->i_ino);
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
--- /dev/null
+Index: linux-2.6.32.i386/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.32.i386.orig/fs/ext4/namei.c 2010-04-07 00:16:32.000000000 +0530
++++ linux-2.6.32.i386/fs/ext4/namei.c 2010-04-07 00:17:09.000000000 +0530
+@@ -144,6 +144,17 @@
+ u16 size;
+ };
+
++/*
++ * dentry_param used by ext4_new_inode_wantedi()
++ */
++#define LVFS_DENTRY_PARAM_MAGIC 20070216UL
++struct lvfs_dentry_params
++{
++ unsigned long ldp_inum;
++ unsigned long ldp_flags;
++ u32 ldp_magic;
++};
++
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
+ static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
+ static inline unsigned dx_get_hash(struct dx_entry *entry);
+@@ -1751,6 +1762,19 @@
+ return err;
+ }
+
++static unsigned ext4_dentry_goal(struct super_block *sb, struct dentry *dentry)
++{
++ unsigned inum = EXT4_SB(sb)->s_inode_goal;
++
++ if (dentry->d_fsdata != NULL) {
++ struct lvfs_dentry_params *param = dentry->d_fsdata;
++
++ if (param->ldp_magic == LVFS_DENTRY_PARAM_MAGIC)
++ inum = param->ldp_inum;
++ }
++ return inum;
++}
++
+ /*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
--- /dev/null
+This patch implements a feature which allows ext4 filesystem users
+(e.g. Lustre) to store extra data in an ext4 dirent.
+The data is stored in the ext4 dirent after the file name; this space
+is accounted for in de->rec_len. The flag EXT4_DIRENT_LUFID is added
+to d_type when extra data is present.
+
+dentry->d_fsdata is used to pass the fid to ext4, so no changes to the
+ext4_add_entry() interface are required.
+
+Index: linux-2.6.32.i386/fs/ext4/dir.c
+===================================================================
+--- linux-2.6.32.i386.orig/fs/ext4/dir.c 2009-12-03 09:21:21.000000000 +0530
++++ linux-2.6.32.i386/fs/ext4/dir.c 2010-04-16 06:25:43.000000000 +0530
+@@ -53,11 +53,18 @@
+
+ static unsigned char get_dtype(struct super_block *sb, int filetype)
+ {
++ int fl_index = filetype & EXT4_FT_MASK;
++
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
+- (filetype >= EXT4_FT_MAX))
++ (fl_index >= EXT4_FT_MAX))
+ return DT_UNKNOWN;
+
+- return (ext4_filetype_table[filetype]);
++ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_DIRDATA))
++ return (ext4_filetype_table[fl_index]);
++
++ return (ext4_filetype_table[fl_index]) |
++ (filetype & EXT4_DIRENT_LUFID);
++
+ }
+
+
+@@ -70,11 +77,11 @@
+ const int rlen = ext4_rec_len_from_disk(de->rec_len,
+ dir->i_sb->s_blocksize);
+
+- if (rlen < EXT4_DIR_REC_LEN(1))
++ if (rlen < __EXT4_DIR_REC_LEN(1))
+ error_msg = "rec_len is smaller than minimal";
+ else if (rlen % 4 != 0)
+ error_msg = "rec_len % 4 != 0";
+- else if (rlen < EXT4_DIR_REC_LEN(de->name_len))
++ else if (rlen < EXT4_DIR_REC_LEN(de))
+ error_msg = "rec_len is too small for name_len";
+ else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+ error_msg = "directory entry across blocks";
+@@ -179,7 +186,7 @@
+ * failure will be detected in the
+ * dirent test below. */
+ if (ext4_rec_len_from_disk(de->rec_len,
+- sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
++ sb->s_blocksize) < __EXT4_DIR_REC_LEN(1))
+ break;
+ i += ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize);
+@@ -342,12 +349,17 @@
+ struct fname *fname, *new_fn;
+ struct dir_private_info *info;
+ int len;
++ int extra_data = 1;
+
+ info = (struct dir_private_info *) dir_file->private_data;
+ p = &info->root.rb_node;
+
+ /* Create and allocate the fname structure */
+- len = sizeof(struct fname) + dirent->name_len + 1;
++ if (dirent->file_type & EXT4_DIRENT_LUFID)
++ extra_data = ext4_get_dirent_data_len(dirent);
++
++ len = sizeof(struct fname) + dirent->name_len + extra_data;
++
+ new_fn = kzalloc(len, GFP_KERNEL);
+ if (!new_fn)
+ return -ENOMEM;
+@@ -356,7 +368,7 @@
+ new_fn->inode = le32_to_cpu(dirent->inode);
+ new_fn->name_len = dirent->name_len;
+ new_fn->file_type = dirent->file_type;
+- memcpy(new_fn->name, dirent->name, dirent->name_len);
++ memcpy(new_fn->name, dirent->name, dirent->name_len + extra_data);
+ new_fn->name[dirent->name_len] = 0;
+
+ while (*p) {
+Index: linux-2.6.32.i386/fs/ext4/ext4.h
+===================================================================
+--- linux-2.6.32.i386.orig/fs/ext4/ext4.h 2010-04-16 06:10:06.000000000 +0530
++++ linux-2.6.32.i386/fs/ext4/ext4.h 2010-04-16 06:27:40.000000000 +0530
+@@ -1135,6 +1135,7 @@
+ #define EXT4_FEATURE_INCOMPAT_64BIT 0x0080
+ #define EXT4_FEATURE_INCOMPAT_MMP 0x0100
+ #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
++#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000
+
+ #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+@@ -1143,7 +1144,9 @@
+ EXT4_FEATURE_INCOMPAT_EXTENTS| \
+ EXT4_FEATURE_INCOMPAT_64BIT| \
+ EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+- EXT4_FEATURE_INCOMPAT_MMP)
++ EXT4_FEATURE_INCOMPAT_MMP| \
++ EXT4_FEATURE_INCOMPAT_DIRDATA)
++
+ #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
+@@ -1225,6 +1228,43 @@
+ #define EXT4_FT_SYMLINK 7
+
+ #define EXT4_FT_MAX 8
++#define EXT4_FT_MASK 0xf
++
++#if EXT4_FT_MAX > EXT4_FT_MASK
++#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK"
++#endif
++
++/*
++ * d_type has 4 unused bits, so it can hold four types data. these different
++ * type of data (e.g. lustre data, high 32 bits of 64-bit inode number) can be
++ * stored, in flag order, after file-name in ext4 dirent.
++*/
++/*
++ * this flag is added to d_type if ext4 dirent has extra data after
++ * filename. this data length is variable and length is stored in first byte
++ * of data. data start after filename NUL byte.
++ * This is used by Lustre FS.
++ */
++#define EXT4_DIRENT_LUFID 0x10
++
++#define EXT4_LUFID_MAGIC 0xAD200907UL
++struct ext4_dentry_param {
++ __u32 edp_magic; /* EXT4_LUFID_MAGIC */
++ char edp_len; /* size of edp_data in bytes */
++ char edp_data[0]; /* packed array of data */
++} __attribute__((packed));
++
++static inline unsigned char *ext4_dentry_get_data(struct super_block *sb,
++ struct ext4_dentry_param* p)
++
++{
++ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_DIRDATA))
++ return NULL;
++ if (p && p->edp_magic == EXT4_LUFID_MAGIC)
++ return &p->edp_len;
++ else
++ return NULL;
++}
+
+ /*
+ * EXT4_DIR_PAD defines the directory entries boundaries
+@@ -1233,8 +1273,11 @@
+ */
+ #define EXT4_DIR_PAD 4
+ #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1)
+-#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \
++#define __EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \
+ ~EXT4_DIR_ROUND)
++#define EXT4_DIR_REC_LEN(de) (__EXT4_DIR_REC_LEN(de->name_len +\
++ ext4_get_dirent_data_len(de)))
++
+ #define EXT4_MAX_REC_LEN ((1<<16)-1)
+
+ /*
+@@ -1524,7 +1567,7 @@
+ struct ext4_dir_entry_2 ** res_dir);
+ #define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir)
+ extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
+- struct inode *inode);
++ struct inode *inode, const void *, const void *);
+ extern struct buffer_head *ext4_append(handle_t *handle,
+ struct inode *inode,
+ ext4_lblk_t *block, int *err);
+@@ -1851,6 +1894,28 @@
+ set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
+ }
+
++/*
++ * Compute the total directory entry data length.
++ * This includes the filename and an implicit NUL terminator (always present),
++ * and optional extensions. Each extension has a bit set in the high 4 bits of
++ * de->file_type, and the extension length is the first byte in each entry.
++ */
++static inline int ext4_get_dirent_data_len(struct ext4_dir_entry_2 *de)
++{
++ char *len = de->name + de->name_len + 1 /* NUL terminator */;
++ int dlen = 0;
++ __u8 extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4;
++
++ while (extra_data_flags) {
++ if (extra_data_flags & 1) {
++ dlen += *len + (dlen == 0);
++ len += *len;
++ }
++ extra_data_flags >>= 1;
++ }
++ return dlen;
++}
++
+ #endif /* __KERNEL__ */
+
+ #endif /* _EXT4_H */
+Index: linux-2.6.32.i386/fs/ext4/namei.c
+===================================================================
+--- linux-2.6.32.i386.orig/fs/ext4/namei.c 2010-04-16 05:47:41.000000000 +0530
++++ linux-2.6.32.i386/fs/ext4/namei.c 2010-04-16 06:40:38.000000000 +0530
+@@ -170,7 +170,8 @@
+ static unsigned dx_get_limit(struct dx_entry *entries);
+ static void dx_set_count(struct dx_entry *entries, unsigned value);
+ static void dx_set_limit(struct dx_entry *entries, unsigned value);
+-static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
++static inline unsigned dx_root_limit(__u32 blocksize,
++ struct ext4_dir_entry_2 *dot_de, unsigned infosize);
+ static unsigned dx_node_limit(struct inode *dir);
+ static struct dx_frame *dx_probe(const struct qstr *d_name,
+ struct inode *dir,
+@@ -237,11 +238,12 @@
+ */
+ struct dx_root_info * dx_get_dx_info(struct ext4_dir_entry_2 *de)
+ {
+- /* get dotdot first */
+- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1));
++ BUG_ON(de->name_len != 1);
++ /* get dotdot first */
++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(de));
+
+- /* dx root info is after dotdot entry */
+- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2));
++ /* dx root info is after dotdot entry */
++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(de));
+
+ return (struct dx_root_info *) de;
+ }
+@@ -286,16 +288,23 @@
+ ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+ }
+
+-static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
++static inline unsigned dx_root_limit(__u32 blocksize,
++ struct ext4_dir_entry_2 *dot_de, unsigned infosize)
+ {
+- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
+- EXT4_DIR_REC_LEN(2) - infosize;
++ struct ext4_dir_entry_2 *dotdot_de;
++ unsigned entry_space;
++
++ BUG_ON(dot_de->name_len != 1);
++ dotdot_de = ext4_next_entry(dot_de, blocksize);
++ entry_space = blocksize - EXT4_DIR_REC_LEN(dot_de) -
++ EXT4_DIR_REC_LEN(dotdot_de) - infosize;
++
+ return entry_space / sizeof(struct dx_entry);
+ }
+
+ static inline unsigned dx_node_limit(struct inode *dir)
+ {
+- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
++ unsigned entry_space = dir->i_sb->s_blocksize - __EXT4_DIR_REC_LEN(0);
+ return entry_space / sizeof(struct dx_entry);
+ }
+
+@@ -342,7 +351,7 @@
+ printk(":%x.%u ", h.hash,
+ ((char *) de - base));
+ }
+- space += EXT4_DIR_REC_LEN(de->name_len);
++ space += EXT4_DIR_REC_LEN(de);
+ names++;
+ }
+ de = ext4_next_entry(de, size);
+@@ -447,7 +456,8 @@
+
+ entries = (struct dx_entry *) (((char *)info) + info->info_length);
+
+- if (dx_get_limit(entries) != dx_root_limit(dir,
++ if (dx_get_limit(entries) != dx_root_limit(dir->i_sb->s_blocksize,
++ (struct ext4_dir_entry_2*)bh->b_data,
+ info->info_length)) {
+ ext4_warning(dir->i_sb, __func__,
+ "dx entry: limit != root limit");
+@@ -637,7 +647,7 @@
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ top = (struct ext4_dir_entry_2 *) ((char *) de +
+ dir->i_sb->s_blocksize -
+- EXT4_DIR_REC_LEN(0));
++ __EXT4_DIR_REC_LEN(0));
+ for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
+ if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
+ (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+@@ -1050,7 +1060,7 @@
+ goto errout;
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
+- EXT4_DIR_REC_LEN(0));
++ __EXT4_DIR_REC_LEN(0));
+ for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
+ int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
+ + ((char *) de - bh->b_data);
+@@ -1216,7 +1226,7 @@
+ while (count--) {
+ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
+ (from + (map->offs<<2));
+- rec_len = EXT4_DIR_REC_LEN(de->name_len);
++ rec_len = EXT4_DIR_REC_LEN(de);
+ memcpy (to, de, rec_len);
+ ((struct ext4_dir_entry_2 *) to)->rec_len =
+ ext4_rec_len_to_disk(rec_len, blocksize);
+@@ -1240,7 +1250,7 @@
+ while ((char*)de < base + blocksize) {
+ next = ext4_next_entry(de, blocksize);
+ if (de->inode && de->name_len) {
+- rec_len = EXT4_DIR_REC_LEN(de->name_len);
++ rec_len = EXT4_DIR_REC_LEN(de);
+ if (de > to)
+ memmove(to, de, rec_len);
+ to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
+@@ -1370,10 +1380,16 @@
+ unsigned int offset = 0;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
+ unsigned short reclen;
+- int nlen, rlen, err;
++ int nlen, rlen, err, dlen = 0;
++ unsigned char *data;
+ char *top;
+
+- reclen = EXT4_DIR_REC_LEN(namelen);
++ data = ext4_dentry_get_data(inode->i_sb, (struct ext4_dentry_param *)
++ dentry->d_fsdata);
++ if (data)
++ dlen = (*data) + 1;
++
++ reclen = __EXT4_DIR_REC_LEN(namelen + dlen);
+ if (!de) {
+ de = (struct ext4_dir_entry_2 *)bh->b_data;
+ top = bh->b_data + blocksize - reclen;
+@@ -1383,7 +1399,7 @@
+ return -EIO;
+ if (ext4_match(namelen, name, de))
+ return -EEXIST;
+- nlen = EXT4_DIR_REC_LEN(de->name_len);
++ nlen = EXT4_DIR_REC_LEN(de);
+ rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
+ if ((de->inode? rlen - nlen: rlen) >= reclen)
+ break;
+@@ -1401,7 +1417,7 @@
+ }
+
+ /* By now the buffer is marked for journaling */
+- nlen = EXT4_DIR_REC_LEN(de->name_len);
++ nlen = EXT4_DIR_REC_LEN(de);
+ rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
+ if (de->inode) {
+ struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
+@@ -1417,6 +1433,12 @@
+ de->inode = 0;
+ de->name_len = namelen;
+ memcpy(de->name, name, namelen);
++ if (data) {
++ de->name[namelen] = 0;
++ memcpy(&de->name[namelen + 1], data, *(char *) data);
++ de->file_type |= EXT4_DIRENT_LUFID;
++ }
++
+ /*
+ * XXX shouldn't update any times until successful
+ * completion of syscall, but too many callers depend
+@@ -1515,7 +1537,8 @@
+
+ dx_set_block(entries, 1);
+ dx_set_count(entries, 1);
+- dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info)));
++ dx_set_limit(entries, dx_root_limit(dir->i_sb->s_blocksize,
++ dot_de, sizeof(*dx_info)));
+
+ /* Initialize as for dx_probe */
+ hinfo.hash_version = dx_info->hash_version;
+@@ -1546,6 +1569,8 @@
+ struct buffer_head * dir_block;
+ struct ext4_dir_entry_2 * de;
+ int len, journal = 0, err = 0;
++ int dlen = 0;
++ char *data;
+
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+@@ -1561,19 +1586,24 @@
+ /* the first item must be "." */
+ assert(de->name_len == 1 && de->name[0] == '.');
+ len = le16_to_cpu(de->rec_len);
+- assert(len >= EXT4_DIR_REC_LEN(1));
+- if (len > EXT4_DIR_REC_LEN(1)) {
++ assert(len >= __EXT4_DIR_REC_LEN(1));
++ if (len > __EXT4_DIR_REC_LEN(1)) {
+ BUFFER_TRACE(dir_block, "get_write_access");
+ err = ext4_journal_get_write_access(handle, dir_block);
+ if (err)
+ goto out_journal;
+
+ journal = 1;
+- de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(1));
++ de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de));
+ }
+
+- len -= EXT4_DIR_REC_LEN(1);
+- assert(len == 0 || len >= EXT4_DIR_REC_LEN(2));
++ len -= EXT4_DIR_REC_LEN(de);
++ data = ext4_dentry_get_data(dir->i_sb,
++ (struct ext4_dentry_param *) dentry->d_fsdata);
++ if (data)
++ dlen = *data + 1;
++ assert(len == 0 || len >= __EXT4_DIR_REC_LEN(2 + dlen));
++
+ de = (struct ext4_dir_entry_2 *)
+ ((char *) de + le16_to_cpu(de->rec_len));
+ if (!journal) {
+@@ -1587,10 +1617,15 @@
+ if (len > 0)
+ de->rec_len = cpu_to_le16(len);
+ else
+- assert(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2));
++ assert(le16_to_cpu(de->rec_len) >= __EXT4_DIR_REC_LEN(2));
+ de->name_len = 2;
+ strcpy (de->name, "..");
+ ext4_set_de_type(dir->i_sb, de, S_IFDIR);
++ if (data) {
++ de->name[2] = 0;
++ memcpy(&de->name[2 + 1], data, *(char *) data);
++ de->file_type |= EXT4_DIRENT_LUFID;
++ }
+
+ out_journal:
+ if (journal) {
+@@ -2011,12 +2046,13 @@
+ /* Initialize @inode as a subdirectory of @dir, and add the
+ * "." and ".." entries into the first directory block. */
+ int ext4_add_dot_dotdot(handle_t *handle, struct inode * dir,
+- struct inode *inode)
++ struct inode *inode,
++ const void *data1, const void *data2)
+ {
+ struct buffer_head * dir_block;
+ struct ext4_dir_entry_2 * de;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
+- int err = 0;
++ int err = 0, dot_reclen;
+
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+@@ -2040,17 +2076,32 @@
+ de = (struct ext4_dir_entry_2 *) dir_block->b_data;
+ de->inode = cpu_to_le32(inode->i_ino);
+ de->name_len = 1;
+- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+- blocksize);
+ strcpy(de->name, ".");
+ ext4_set_de_type(dir->i_sb, de, S_IFDIR);
++ /* get packed fid data*/
++ data1 = ext4_dentry_get_data(dir->i_sb,
++ (struct ext4_dentry_param *) data1);
++ if (data1) {
++ de->name[1] = 0;
++ memcpy(&de->name[2], data1, *(char *) data1);
++ de->file_type |= EXT4_DIRENT_LUFID;
++ }
++ de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de));
++ dot_reclen = le16_to_cpu(de->rec_len);
+ de = ext4_next_entry(de, blocksize);
+ de->inode = cpu_to_le32(dir->i_ino);
+- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
++ de->rec_len = ext4_rec_len_to_disk(blocksize - dot_reclen,
+ blocksize);
+ de->name_len = 2;
+ strcpy(de->name, "..");
+ ext4_set_de_type(dir->i_sb, de, S_IFDIR);
++ data2 = ext4_dentry_get_data(dir->i_sb,
++ (struct ext4_dentry_param *) data2);
++ if (data2) {
++ de->name[2] = 0;
++ memcpy(&de->name[3], data2, *(char *) data2);
++ de->file_type |= EXT4_DIRENT_LUFID;
++ }
+ inode->i_nlink = 2;
+ BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
+ ext4_handle_dirty_metadata(handle, dir, dir_block);
+@@ -2087,7 +2138,7 @@
+ if (IS_ERR(inode))
+ goto out_stop;
+
+- err = ext4_add_dot_dotdot(handle, dir, inode);
++ err = ext4_add_dot_dotdot(handle, dir, inode, NULL, NULL);
+ if (err)
+ goto out_stop;
+
+@@ -2123,7 +2174,7 @@
+ int err = 0;
+
+ sb = inode->i_sb;
+- if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
++ if (inode->i_size < __EXT4_DIR_REC_LEN(1) + __EXT4_DIR_REC_LEN(2) ||
+ !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
+ if (err)
+ ext4_error(inode->i_sb, __func__,
--- /dev/null
+ext4-wantedi-2.6-rhel6.patch
+ext4-map_inode_page-2.6-rhel6.patch
+export-ext4-2.6-rhel6.patch
+ext4-remove-cond_resched-calls-rhel5.patch
+ext4-ext_generation-sles11.patch
+ext4-inode-version-rhel6.patch
+ext4-mmp-rhel6.patch
+ext4-lookup-dotdot-rhel5.patch
+ext4-max-dir-size-rhel6.patch
+ext4-print-inum-in-htree-warning-rhel6.patch
+ext4-xattr-no-update-ctime-rhel5.patch
+ext4-prealloc-rhel6.patch
+ext4-mballoc-extra-checks-rhel6.patch
+ext4-misc-rhel6.patch
+ext4-big-endian-check-2.6-rhel6.patch
+ext4-alloc-policy-2.6-rhel5.patch
+ext4-force_over_16tb-rhel6.patch
+ext4-pdir-fix-rhel6.patch
+ext4-osd-iop-common-rhel6.patch
+ext4-osd-iam-exports-rhel6.patch
+ext4-dynlocks-common-rhel6.patch
+ext4-hash-indexed-dir-dotdot-update-rhel5.patch
+ext4-kill-dx_root-rhel6.patch
+ext4-extents-mount-option-rhel6.patch
+ext4-fiemap-2.6-rhel6.patch
+ext4-mballoc-pa_free-mismatch-rhel6.patch
+ext4_data_in_dirent-rhel6.patch
+ext4-disable-mb-cache-rhel6.patch
+ext4-back-dquot-to-rhel6.patch
backfs_headers := $(wildcard @LINUX@/fs/@BACKFS@/*.h)
linux_headers := $(wildcard @LINUX@/include/linux/@BACKFS@*.h)
+trace_headers := $(wildcard @LINUX@/include/trace/events/@BACKFS@*.h)
backfs_sources := $(filter-out %.mod.c,$(wildcard @LINUX@/fs/@BACKFS@/*.c))
series := @top_srcdir@/kernel_patches/series/ldiskfs-$(LDISKFS_SERIES)
patches := @top_srcdir@/kernel_patches/patches
-sources: $(backfs_sources) $(backfs_headers) $(linux_headers) $(series)
- rm -rf linux-stage linux sources $(ldiskfs_SOURCES)
- mkdir -p linux-stage/fs/@BACKFS@ linux-stage/include/linux
+sources: $(backfs_sources) $(backfs_headers) $(linux_headers) $(series) $(trace_headers)
+ rm -rf linux-stage linux sources trace $(ldiskfs_SOURCES)
+ mkdir -p linux-stage/fs/@BACKFS@ linux-stage/include/linux \
+ linux-stage/include/trace/events
cp $(backfs_sources) $(backfs_headers) $(backfs_extra) linux-stage/fs/@BACKFS@
if test -n "$(linux_headers)" ; then \
cp $(linux_headers) linux-stage/include/linux; \
fi
+ if test -n "$(trace_headers)" ; then \
+ cp $(trace_headers) linux-stage/include/trace/events; \
+ fi
if USE_QUILT
ln -s ../$(patches) linux-stage/patches
ln -s ../$(series) linux-stage/series
done
@echo
endif
- mkdir linux
+ mkdir -p linux trace/events
@echo -n "Replacing '@BACKFS@' with 'ldiskfs':"
for i in $(notdir $(backfs_headers) $(backfs_sources)) $(new_sources) ; do \
echo -n " $$i" ; \
linux-stage/include/linux/@BACKFS@$$i \
> linux/ldiskfs$$i ; \
done
+ for i in $(subst @BACKFS@,,$(notdir $(trace_headers))) ; do \
+ echo -n " @BACKFS@$$i"; \
+ sed $(strip $(ldiskfs_sed_flags)) \
+ linux-stage/include/trace/events/@BACKFS@$$i \
+ > trace/events/ldiskfs$$i ; \
+ done
sed $(strip $(ldiskfs_sed_flags)) \
linux-stage/include/linux/dynlocks.h \
> linux/dynlocks.h
@echo "ldiskfs_LDADD: $(ldiskfs_LDADD)"
MOSTLYCLEANFILES := @MOSTLYCLEANFILES@
-CLEANFILES = sources $(notdir $(linux_headers) $(backfs_headers) $(backfs_sources) $(new_sources) $(new_headers))
+CLEANFILES = sources $(notdir $(linux_headers) $(backfs_headers) $(backfs_sources) $(new_sources) $(new_headers) $(trace_headers))
clean: clean-am
- rm -rf linux linux-stage ldiskfs*.h
+ rm -rf linux linux-stage ldiskfs*.h trace
])
#
-# Ensure stack size big than 8k in Lustre server (all kernels)
+# LC_CONFIG_OBD_BUFFER_SIZE
#
-AC_DEFUN([LC_STACK_SIZE],
-[AC_MSG_CHECKING([stack size big than 8k])
-LB_LINUX_TRY_COMPILE([
- #include <linux/thread_info.h>
+# the maximum buffer size of lctl ioctls
+#
+AC_DEFUN([LC_CONFIG_OBD_BUFFER_SIZE],
+[AC_MSG_CHECKING([maximum OBD ioctl size])
+AC_ARG_WITH([obd-buffer-size],
+ AC_HELP_STRING([--with-obd-buffer-size=[size]],
+ [set lctl ioctl maximum bytes (default=8192)]),
+ [
+ OBD_BUFFER_SIZE=$with_obd_buffer_size
+ ],[
+ OBD_BUFFER_SIZE=8192
+ ])
+AC_MSG_RESULT([$OBD_BUFFER_SIZE bytes])
+AC_DEFINE_UNQUOTED(OBD_MAX_IOCTL_BUFFER, $OBD_BUFFER_SIZE, [IOCTL Buffer Size])
+])
+
+#
+# LC_READLINK_SSIZE_T
+#
+AC_DEFUN([LC_READLINK_SSIZE_T],
+[AC_MSG_CHECKING([if readlink returns ssize_t])
+AC_TRY_COMPILE([
+ #include <unistd.h>
],[
- #if THREAD_SIZE < 8192
- #error "stack size < 8192"
- #endif
+ ssize_t readlink(const char *, char *, size_t);
],[
- AC_MSG_RESULT(yes)
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_POSIX_1003_READLINK, 1, [readlink returns ssize_t])
],[
- AC_MSG_ERROR([Lustre requires that Linux is configured with at least a 8KB stack.])
+ AC_MSG_RESULT([no])
+])
+])
+
+#
+# LC_FUNC_RELEASEPAGE_WITH_GFP
+#
+# 2.6.9 ->releasepage() takes a gfp_t arg
+# This kernel defines gfp_t (HAS_GFP_T) but doesn't use it for this function,
+# while others either don't have gfp_t or pass gfp_t as the parameter.
+#
+AC_DEFUN([LC_FUNC_RELEASEPAGE_WITH_GFP],
+[AC_MSG_CHECKING([if releasepage has a gfp_t parameter])
+RELEASEPAGE_WITH_GFP="$(grep -c 'releasepage.*gfp_t' $LINUX/include/linux/fs.h)"
+if test "$RELEASEPAGE_WITH_GFP" != 0 ; then
+ AC_DEFINE(HAVE_RELEASEPAGE_WITH_GFP, 1,
+ [releasepage with gfp_t parameter])
+ AC_MSG_RESULT([yes])
+else
+ AC_MSG_RESULT([no])
+fi
+])
+
+
+
+#
+# only for Lustre-patched kernels
+#
+AC_DEFUN([LC_LUSTRE_VERSION_H],
+[LB_CHECK_FILE([$LINUX/include/linux/lustre_version.h],[
+ rm -f "$LUSTRE/include/linux/lustre_version.h"
+],[
+ touch "$LUSTRE/include/linux/lustre_version.h"
+ if test x$enable_server = xyes ; then
+ AC_MSG_WARN([Unpatched kernel detected.])
+ AC_MSG_WARN([Lustre servers cannot be built with an unpatched kernel;])
+ AC_MSG_WARN([disabling server build])
+ enable_server='no'
+ fi
])
])
])
#
+# Ensure stack size bigger than 8k in Lustre server (all kernels)
+#
+AC_DEFUN([LC_STACK_SIZE],
+[AC_MSG_CHECKING([stack size bigger than 8k])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/thread_info.h>
+],[
+ #if THREAD_SIZE < 8192
+ #error "stack size < 8192"
+ #endif
+],[
+ AC_MSG_RESULT(yes)
+],[
+ AC_MSG_ERROR([Lustre requires that Linux is configured with at least a 8KB stack.])
+])
+])
+
+#
# LC_CONFIG_BACKINGFS
#
# setup, check the backing filesystem
])
#
-# LC_HEADER_LDISKFS_XATTR
-#
-# CHAOS kernel-devel package will not include fs/ldiskfs/xattr.h
+# LC_CONFIG_LIBLUSTRE_RECOVERY
#
-AC_DEFUN([LC_HEADER_LDISKFS_XATTR],
-[AC_MSG_CHECKING([if ldiskfs has xattr.h header])
-tmp_flags="$EXTRA_KCFLAGS"
-EXTRA_KCFLAGS="-I$LINUX/fs -I$LDISKFS_DIR -I$LDISKFS_DIR/ldiskfs"
-LB_LINUX_TRY_COMPILE([
- #include <ldiskfs/xattr.h>
-],[
- ldiskfs_xattr_get(NULL, 0, "", NULL, 0);
- ldiskfs_xattr_set_handle(NULL, NULL, 0, "", NULL, 0, 0);
-
-],[
- AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_LDISKFS_XATTR_H, 1, [ldiskfs/xattr.h found])
-],[
- AC_MSG_RESULT([no])
-])
-EXTRA_KCFLAGS="$tmp_flags"
+AC_DEFUN([LC_CONFIG_LIBLUSTRE_RECOVERY],
+[AC_MSG_CHECKING([whether to enable liblustre recovery support])
+AC_ARG_ENABLE([liblustre-recovery],
+ AC_HELP_STRING([--disable-liblustre-recovery],
+ [disable liblustre recovery support]),
+ [],[enable_liblustre_recovery='yes'])
+AC_MSG_RESULT([$enable_liblustre_recovery])
+if test x$enable_liblustre_recovery != xno ; then
+ AC_DEFINE(ENABLE_LIBLUSTRE_RECOVERY, 1, Liblustre Can Recover)
+fi
])
#
fi
])
-#
-# LC_CONFIG_LIBLUSTRE_RECOVERY
-#
-AC_DEFUN([LC_CONFIG_LIBLUSTRE_RECOVERY],
-[AC_MSG_CHECKING([whether to enable liblustre recovery support])
-AC_ARG_ENABLE([liblustre-recovery],
- AC_HELP_STRING([--disable-liblustre-recovery],
- [disable liblustre recovery support]),
- [],[enable_liblustre_recovery='yes'])
-AC_MSG_RESULT([$enable_liblustre_recovery])
-if test x$enable_liblustre_recovery != xno ; then
- AC_DEFINE(ENABLE_LIBLUSTRE_RECOVERY, 1, Liblustre Can Recover)
+AC_DEFUN([LC_CONFIG_LRU_RESIZE],
+[AC_MSG_CHECKING([whether to enable lru self-adjusting])
+AC_ARG_ENABLE([lru_resize],
+ AC_HELP_STRING([--enable-lru-resize],
+ [enable lru resize support]),
+ [],[enable_lru_resize='yes'])
+AC_MSG_RESULT([$enable_lru_resize])
+if test x$enable_lru_resize != xno; then
+ AC_DEFINE(HAVE_LRU_RESIZE_SUPPORT, 1, [Enable lru resize support])
fi
])
-#
-# LC_CONFIG_OBD_BUFFER_SIZE
-#
-# the maximum buffer size of lctl ioctls
-#
-AC_DEFUN([LC_CONFIG_OBD_BUFFER_SIZE],
-[AC_MSG_CHECKING([maximum OBD ioctl size])
-AC_ARG_WITH([obd-buffer-size],
- AC_HELP_STRING([--with-obd-buffer-size=[size]],
- [set lctl ioctl maximum bytes (default=8192)]),
- [
- OBD_BUFFER_SIZE=$with_obd_buffer_size
- ],[
- OBD_BUFFER_SIZE=8192
- ])
-AC_MSG_RESULT([$OBD_BUFFER_SIZE bytes])
-AC_DEFINE_UNQUOTED(OBD_MAX_IOCTL_BUFFER, $OBD_BUFFER_SIZE, [IOCTL Buffer Size])
+# whether to enable quota support(kernel modules)
+AC_DEFUN([LC_QUOTA_MODULE],
+[if test x$enable_quota != xno; then
+ LB_LINUX_CONFIG([QUOTA],[
+ enable_quota_module='yes'
+ AC_DEFINE(HAVE_QUOTA_SUPPORT, 1, [Enable quota support])
+ ],[
+ enable_quota_module='no'
+ AC_MSG_WARN([quota is not enabled because the kernel lacks quota support])
+ ])
+fi
])
-#
-# LC_STRUCT_STATFS
-#
-# AIX does not have statfs.f_namelen
-#
-AC_DEFUN([LC_STRUCT_STATFS],
-[AC_MSG_CHECKING([if struct statfs has a f_namelen field])
-LB_LINUX_TRY_COMPILE([
- #include <linux/vfs.h>
-],[
- struct statfs sfs;
- sfs.f_namelen = 1;
-],[
- AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_STATFS_NAMELEN, 1, [struct statfs has a namelen field])
+AC_DEFUN([LC_EXPORT_TRUNCATE_COMPLETE],
+[LB_CHECK_SYMBOL_EXPORT([truncate_complete_page],
+[mm/truncate.c],[
+AC_DEFINE(HAVE_TRUNCATE_COMPLETE_PAGE, 1,
+ [kernel export truncate_complete_page])
],[
- AC_MSG_RESULT([no])
])
])
-#
-# LC_READLINK_SSIZE_T
-#
-AC_DEFUN([LC_READLINK_SSIZE_T],
-[AC_MSG_CHECKING([if readlink returns ssize_t])
-AC_TRY_COMPILE([
- #include <unistd.h>
-],[
- ssize_t readlink(const char *, char *, size_t);
-],[
- AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_POSIX_1003_READLINK, 1, [readlink returns ssize_t])
+AC_DEFUN([LC_EXPORT_TRUNCATE_RANGE],
+[LB_CHECK_SYMBOL_EXPORT([truncate_inode_pages_range],
+[mm/truncate.c],[
+AC_DEFINE(HAVE_TRUNCATE_RANGE, 1,
+ [kernel export truncate_inode_pages_range])
],[
- AC_MSG_RESULT([no])
])
])
-#
-# LC_FUNC_MS_FLOCK_LOCK
-#
-# 2.6.5 kernel has MS_FLOCK_LOCK sb flag
-#
-AC_DEFUN([LC_FUNC_MS_FLOCK_LOCK],
-[AC_MSG_CHECKING([if kernel has MS_FLOCK_LOCK sb flag])
-LB_LINUX_TRY_COMPILE([
- #include <linux/fs.h>
-],[
- int flags = MS_FLOCK_LOCK;
-],[
- AC_DEFINE(HAVE_MS_FLOCK_LOCK, 1,
- [kernel has MS_FLOCK_LOCK flag])
- AC_MSG_RESULT([yes])
+AC_DEFUN([LC_EXPORT_D_REHASH_COND],
+[LB_CHECK_SYMBOL_EXPORT([d_rehash_cond],
+[fs/dcache.c],[
+AC_DEFINE(HAVE_D_REHASH_COND, 1,
+ [d_rehash_cond is exported by the kernel])
],[
- AC_MSG_RESULT([no])
])
])
-#
-# LC_FUNC_HAVE_CAN_SLEEP_ARG
-#
-# 2.6.5 kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()
-#
-AC_DEFUN([LC_FUNC_HAVE_CAN_SLEEP_ARG],
-[AC_MSG_CHECKING([if kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()])
-LB_LINUX_TRY_COMPILE([
- #include <linux/fs.h>
-],[
- int cansleep;
- struct file *file;
- struct file_lock *file_lock;
- flock_lock_file_wait(file, file_lock, cansleep);
-],[
- AC_DEFINE(HAVE_CAN_SLEEP_ARG, 1,
- [kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()])
- AC_MSG_RESULT([yes])
+AC_DEFUN([LC_EXPORT___D_REHASH],
+[LB_CHECK_SYMBOL_EXPORT([__d_rehash],
+[fs/dcache.c],[
+AC_DEFINE(HAVE___D_REHASH, 1,
+ [__d_rehash is exported by the kernel])
],[
- AC_MSG_RESULT([no])
])
])
+AC_DEFUN([LC_EXPORT_D_MOVE_LOCKED],
+[LB_CHECK_SYMBOL_EXPORT([d_move_locked],
+[fs/dcache.c],[
+AC_DEFINE(HAVE_D_MOVE_LOCKED, 1,
+ [d_move_locked is exported by the kernel])
+],[
+])
+])
+
+AC_DEFUN([LC_EXPORT___D_MOVE],
+[LB_CHECK_SYMBOL_EXPORT([__d_move],
+[fs/dcache.c],[
+AC_DEFINE(HAVE___D_MOVE, 1,
+ [__d_move is exported by the kernel])
+],[
+])
+])
+
+# The actual symbol exported varies among architectures, so we need
+# to check many symbols (but only in the current architecture.) No
+# matter what symbol is exported, the kernel #defines node_to_cpumask
+# to the appropriate function and that's what we use.
+AC_DEFUN([LC_EXPORT_NODE_TO_CPUMASK],
+ [LB_CHECK_SYMBOL_EXPORT([node_to_cpumask],
+ [arch/$LINUX_ARCH/mm/numa.c],
+ [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1,
+ [node_to_cpumask is exported by
+ the kernel])]) # x86_64
+ LB_CHECK_SYMBOL_EXPORT([node_to_cpu_mask],
+ [arch/$LINUX_ARCH/kernel/smpboot.c],
+ [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1,
+ [node_to_cpumask is exported by
+ the kernel])]) # ia64
+ LB_CHECK_SYMBOL_EXPORT([node_2_cpu_mask],
+ [arch/$LINUX_ARCH/kernel/smpboot.c],
+ [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1,
+ [node_to_cpumask is exported by
+ the kernel])]) # i386
+ ])
+
#
-# LC_FUNC_RELEASEPAGE_WITH_GFP
-#
-# 2.6.9 ->releasepage() takes a gfp_t arg
-# This kernel defines gfp_t (HAS_GFP_T) but doesn't use it for this function,
-# while others either don't have gfp_t or pass gfp_t as the parameter.
-#
-AC_DEFUN([LC_FUNC_RELEASEPAGE_WITH_GFP],
-[AC_MSG_CHECKING([if releasepage has a gfp_t parameter])
-RELEASEPAGE_WITH_GFP="$(grep -c 'releasepage.*gfp_t' $LINUX/include/linux/fs.h)"
-if test "$RELEASEPAGE_WITH_GFP" != 0 ; then
- AC_DEFINE(HAVE_RELEASEPAGE_WITH_GFP, 1,
- [releasepage with gfp_t parameter])
+# LC_HEADER_LDISKFS_XATTR
+#
+# CHAOS kernel-devel package will not include fs/ldiskfs/xattr.h
+#
+AC_DEFUN([LC_HEADER_LDISKFS_XATTR],
+[AC_MSG_CHECKING([if ldiskfs has xattr.h header])
+tmp_flags="$EXTRA_KCFLAGS"
+EXTRA_KCFLAGS="-I$LINUX/fs -I$LDISKFS_DIR -I$LDISKFS_DIR/ldiskfs"
+LB_LINUX_TRY_COMPILE([
+ #include <ldiskfs/xattr.h>
+],[
+ ldiskfs_xattr_get(NULL, 0, "", NULL, 0);
+ ldiskfs_xattr_set_handle(NULL, NULL, 0, "", NULL, 0, 0);
+
+],[
AC_MSG_RESULT([yes])
-else
+ AC_DEFINE(HAVE_LDISKFS_XATTR_H, 1, [ldiskfs/xattr.h found])
+],[
+ AC_MSG_RESULT([no])
+])
+EXTRA_KCFLAGS="$tmp_flags"
+])
+
+#
+# LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP
+#
+# Check for our patched grab_cache_page_nowait_gfp() function
+# after 2.6.29 we can emulate this using add_to_page_cache_lru()
+#
+AC_DEFUN([LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP],
+[LB_CHECK_SYMBOL_EXPORT([grab_cache_page_nowait_gfp],
+[mm/filemap.c],[
+ AC_DEFINE(HAVE_GRAB_CACHE_PAGE_NOWAIT_GFP, 1,
+ [kernel exports grab_cache_page_nowait_gfp])
+ ],
+ [LB_CHECK_SYMBOL_EXPORT([add_to_page_cache_lru],
+ [mm/filemap.c],[
+ AC_DEFINE(HAVE_ADD_TO_PAGE_CACHE_LRU, 1,
+ [kernel exports add_to_page_cache_lru])
+ ],[
+ ])
+ ])
+])
+
+#
+# LC_STRUCT_STATFS
+#
+# AIX does not have statfs.f_namelen
+#
+AC_DEFUN([LC_STRUCT_STATFS],
+[AC_MSG_CHECKING([if struct statfs has a f_namelen field])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/vfs.h>
+],[
+ struct statfs sfs;
+ sfs.f_namelen = 1;
+],[
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_STATFS_NAMELEN, 1, [struct statfs has a namelen field])
+],[
AC_MSG_RESULT([no])
-fi
+])
])
#
[])
])
-
-# added in 2.6.16
-#
-AC_DEFUN([LC_STRUCT_INTENT_FILE],
-[AC_MSG_CHECKING([if struct open_intent has a file field])
-LB_LINUX_TRY_COMPILE([
- #include <linux/fs.h>
- #include <linux/namei.h>
-],[
- struct open_intent intent;
- &intent.file;
-],[
- AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_FILE_IN_STRUCT_INTENT, 1, [struct open_intent has a file field])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
-
#
# After 2.6.16 the xattr_acl API is removed, and posix_acl is used instead
#
])
])
+AC_DEFUN([LC_CONST_ACL_SIZE],
+[AC_MSG_CHECKING([calc acl size])
+tmp_flags="$CFLAGS"
+CFLAGS="$CFLAGS -I$LINUX/include -I$LINUX_OBJ/include -I$LINUX_OBJ/include2 -I$LINUX/arch/`uname -m|sed -e 's/ppc.*/powerpc/' -e 's/x86_64/x86/' -e 's/i.86/x86/'`/include $EXTRA_KCFLAGS"
+AC_TRY_RUN([
+ #define __KERNEL__
+ #include <linux/autoconf.h>
+ #include <linux/types.h>
+ #undef __KERNEL__
+ // block include
+ #define __LINUX_POSIX_ACL_H
+
+ # ifdef CONFIG_FS_POSIX_ACL
+ # ifdef HAVE_XATTR_ACL
+ # include <linux/xattr_acl.h>
+ # endif
+ # ifdef HAVE_LINUX_POSIX_ACL_XATTR_H
+ # include <linux/posix_acl_xattr.h>
+ # endif
+ # endif
+
+ #include <lustre_acl.h>
+
+ #include <stdio.h>
+
+ int main(void)
+ {
+ int size = mds_xattr_acl_size(LUSTRE_POSIX_ACL_MAX_ENTRIES);
+ FILE *f = fopen("acl.size","w+");
+ fprintf(f,"%d", size);
+ fclose(f);
+
+ return 0;
+ }
+],[
+ acl_size=`cat acl.size`
+ AC_MSG_RESULT([ACL size $acl_size])
+ AC_DEFINE_UNQUOTED(XATTR_ACL_SIZE, AS_TR_SH([$acl_size]), [size of xattr acl])
+],[
+ AC_ERROR([ACL size cannot be computed])
+])
+CFLAGS="$tmp_flags"
+])
+
+# added in 2.6.16
#
-# only for Lustre-patched kernels
-#
-AC_DEFUN([LC_LUSTRE_VERSION_H],
-[LB_CHECK_FILE([$LINUX/include/linux/lustre_version.h],[
- rm -f "$LUSTRE/include/linux/lustre_version.h"
+AC_DEFUN([LC_STRUCT_INTENT_FILE],
+[AC_MSG_CHECKING([if struct open_intent has a file field])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ #include <linux/namei.h>
],[
- touch "$LUSTRE/include/linux/lustre_version.h"
- if test x$enable_server = xyes ; then
- AC_MSG_WARN([Unpatched kernel detected.])
- AC_MSG_WARN([Lustre servers cannot be built with an unpatched kernel;])
- AC_MSG_WARN([disabling server build])
- enable_server='no'
- fi
+ struct open_intent intent;
+ &intent.file;
+],[
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_FILE_IN_STRUCT_INTENT, 1, [struct open_intent has a file field])
+],[
+ AC_MSG_RESULT([no])
])
])
])
])
-AC_DEFUN([LC_SUNRPC_CACHE],
-[AC_MSG_CHECKING([if sunrpc struct cache_head uses kref])
-LB_LINUX_TRY_COMPILE([
- #include <linux/sunrpc/cache.h>
-],[
- struct cache_head ch;
- &ch.ref.refcount;
-],[
- AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_SUNRPC_CACHE_V2, 1, [sunrpc cache facility v2])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
-AC_DEFUN([LC_CONFIG_SUNRPC],
-[LB_LINUX_CONFIG_IM([SUNRPC],[],
- [AC_MSG_ERROR([kernel SUNRPC support is required by using GSS.])])
- LC_SUNRPC_CACHE
-])
-
#
# LC_CONFIG_GSS_KEYRING (default enabled, if gss is enabled)
#
fi
])
+AC_DEFUN([LC_SUNRPC_CACHE],
+[AC_MSG_CHECKING([if sunrpc struct cache_head uses kref])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/sunrpc/cache.h>
+],[
+ struct cache_head ch;
+ &ch.ref.refcount;
+],[
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_SUNRPC_CACHE_V2, 1, [sunrpc cache facility v2])
+],[
+ AC_MSG_RESULT([no])
+])
+])
+
+AC_DEFUN([LC_CONFIG_SUNRPC],
+[LB_LINUX_CONFIG_IM([SUNRPC],[],
+ [AC_MSG_ERROR([kernel SUNRPC support is required by using GSS.])])
+ LC_SUNRPC_CACHE
+])
+
#
# LC_CONFIG_GSS (default disabled)
#
fi
])
-# LC_EXPORT_SYNCHRONIZE_RCU
-# after 2.6.12 synchronize_rcu is preferred over synchronize_kernel
-AC_DEFUN([LC_EXPORT_SYNCHRONIZE_RCU],
-[LB_CHECK_SYMBOL_EXPORT([synchronize_rcu],
-[kernel/rcupdate.c],[
- AC_DEFINE(HAVE_SYNCHRONIZE_RCU, 1,
- [in 2.6.12 synchronize_rcu preferred over synchronize_kernel])
-],[
-])
-])
-
-# LC_INODE_I_MUTEX
-# after 2.6.15 inode have i_mutex intead of i_sem
-AC_DEFUN([LC_INODE_I_MUTEX],
-[AC_MSG_CHECKING([if inode has i_mutex ])
+#
+# LC_FUNC_HAVE_CAN_SLEEP_ARG
+#
+# 2.6.5 kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()
+#
+AC_DEFUN([LC_FUNC_HAVE_CAN_SLEEP_ARG],
+[AC_MSG_CHECKING([if kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()])
LB_LINUX_TRY_COMPILE([
- #include <linux/mutex.h>
- #include <linux/fs.h>
- #undef i_mutex
+ #include <linux/fs.h>
],[
- struct inode i;
-
- mutex_unlock(&i.i_mutex);
+ int cansleep;
+ struct file *file;
+ struct file_lock *file_lock;
+ flock_lock_file_wait(file, file_lock, cansleep);
],[
- AC_MSG_RESULT(yes)
- AC_DEFINE(HAVE_INODE_I_MUTEX, 1,
- [after 2.6.15 inode have i_mutex intead of i_sem])
+ AC_DEFINE(HAVE_CAN_SLEEP_ARG, 1,
+ [kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()])
+ AC_MSG_RESULT([yes])
],[
- AC_MSG_RESULT(no)
+ AC_MSG_RESULT([no])
])
])
-# LC_SEQ_LOCK
-# after 2.6.18 seq_file has lock intead of sem
-AC_DEFUN([LC_SEQ_LOCK],
-[AC_MSG_CHECKING([if struct seq_file has lock field])
+#
+# LC_FUNC_F_OP_FLOCK
+#
+# rhel4.2 kernel has f_op->flock field
+#
+AC_DEFUN([LC_FUNC_F_OP_FLOCK],
+[AC_MSG_CHECKING([if struct file_operations has flock field])
LB_LINUX_TRY_COMPILE([
- #include <linux/seq_file.h>
+ #include <linux/fs.h>
],[
- struct seq_file seq;
+ struct file_operations ll_file_operations_flock;
+ ll_file_operations_flock.flock = NULL;
+],[
+ AC_DEFINE(HAVE_F_OP_FLOCK, 1,
+ [struct file_operations has flock field])
+ AC_MSG_RESULT([yes])
+],[
+ AC_MSG_RESULT([no])
+])
+])
+
+AC_DEFUN([LC_QUOTA_READ],
+[AC_MSG_CHECKING([if kernel supports quota_read])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+],[
+ struct super_operations sp;
+ void *i = (void *)sp.quota_read;
+],[
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(KERNEL_SUPPORTS_QUOTA_READ, 1, [quota_read found])
+],[
+ AC_MSG_RESULT([no])
+])
+])
+
+#
+# LC_COOKIE_FOLLOW_LINK
+#
+# kernel 2.6.13+ ->follow_link returns a cookie
+#
+
+AC_DEFUN([LC_COOKIE_FOLLOW_LINK],
+[AC_MSG_CHECKING([if inode_operations->follow_link returns a cookie])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ #include <linux/namei.h>
+],[
+ struct dentry dentry;
+ struct nameidata nd;
+
+ dentry.d_inode->i_op->put_link(&dentry, &nd, NULL);
+],[
+ AC_DEFINE(HAVE_COOKIE_FOLLOW_LINK, 1, [inode_operations->follow_link returns a cookie])
+ AC_MSG_RESULT([yes])
+],[
+ AC_MSG_RESULT([no])
+])
+])
+
+#
+# LC_FUNC_RCU
+#
+# kernels prior than 2.6.0(?) have no RCU supported; in kernel 2.6.5(SUSE),
+# call_rcu takes three parameters.
+#
+AC_DEFUN([LC_FUNC_RCU],
+[AC_MSG_CHECKING([if kernel have RCU supported])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/rcupdate.h>
+],[],[
+ AC_DEFINE(HAVE_RCU, 1, [have RCU defined])
+ AC_MSG_RESULT([yes])
+
+ AC_MSG_CHECKING([if call_rcu takes three parameters])
+ LB_LINUX_TRY_COMPILE([
+ #include <linux/rcupdate.h>
+ ],[
+ struct rcu_head rh;
+ call_rcu(&rh, (void (*)(struct rcu_head *))1, NULL);
+ ],[
+ AC_DEFINE(HAVE_CALL_RCU_PARAM, 1, [call_rcu takes three parameters])
+ AC_MSG_RESULT([yes])
+ ],[
+ AC_MSG_RESULT([no])
+ ])
- mutex_unlock(&seq.lock);
+],[
+ AC_MSG_RESULT([no])
+])
+])
+
+AC_DEFUN([LC_PERCPU_COUNTER],
+[AC_MSG_CHECKING([if have struct percpu_counter defined])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/percpu_counter.h>
+],[],[
+ AC_DEFINE(HAVE_PERCPU_COUNTER, 1, [percpu_counter found])
+ AC_MSG_RESULT([yes])
+
+ AC_MSG_CHECKING([if percpu_counter_inc takes the 2nd argument])
+ LB_LINUX_TRY_COMPILE([
+ #include <linux/percpu_counter.h>
+ ],[
+ struct percpu_counter c;
+ percpu_counter_init(&c, 0);
+ ],[
+ AC_DEFINE(HAVE_PERCPU_2ND_ARG, 1, [percpu_counter_init has two
+ arguments])
+ AC_MSG_RESULT([yes])
+ ],[
+ AC_MSG_RESULT([no])
+ ])
+],[
+ AC_MSG_RESULT([no])
+])
+])
+
+AC_DEFUN([LC_TASK_CLENV_STORE],
+[
+ AC_MSG_CHECKING([if we can store cl_env in task_struct])
+ if test x$have_task_clenv_store != xyes ; then
+ LC_TASK_CLENV_TUX_INFO
+ fi
+])
+
+# ~2.6.11
+
+AC_DEFUN([LC_S_TIME_GRAN],
+[AC_MSG_CHECKING([if super block has s_time_gran member])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+],[
+ struct super_block sb;
+
+ return sb.s_time_gran;
+],[
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_S_TIME_GRAN, 1, [super block has s_time_gran member])
+],[
+ AC_MSG_RESULT([no])
+])
+])
+
+AC_DEFUN([LC_SB_TIME_GRAN],
+[AC_MSG_CHECKING([if kernel has old get_sb_time_gran])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+],[
+ return get_sb_time_gran(NULL);
+],[
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_SB_TIME_GRAN, 1, [kernel has old get_sb_time_gran])
+],[
+ AC_MSG_RESULT([no])
+])
+])
+
+# 2.6.12
+
+# ~2.6.12 merge patch from oracle to convert tree_lock from spinlock to rwlock
+AC_DEFUN([LC_RW_TREE_LOCK],
+[AC_MSG_CHECKING([if kernel has tree_lock as rwlock])
+tmp_flags="$EXTRA_KCFLAGS"
+EXTRA_KCFLAGS="-Werror"
+LB_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+],[
+ struct address_space a;
+
+ write_lock(&a.tree_lock);
+],[
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_RW_TREE_LOCK, 1, [kernel has tree_lock as rw_lock])
+],[
+ AC_MSG_RESULT([no])
+])
+EXTRA_KCFLAGS="$tmp_flags"
+])
+
+# LC_EXPORT_SYNCHRONIZE_RCU
+# after 2.6.12 synchronize_rcu is preferred over synchronize_kernel
+AC_DEFUN([LC_EXPORT_SYNCHRONIZE_RCU],
+[LB_CHECK_SYMBOL_EXPORT([synchronize_rcu],
+[kernel/rcupdate.c],[
+ AC_DEFINE(HAVE_SYNCHRONIZE_RCU, 1,
+ [in 2.6.12 synchronize_rcu preferred over synchronize_kernel])
+],[
+])
+])
+
+# 2.6.15
+
+# LC_INODE_I_MUTEX
+# after 2.6.15 inodes have i_mutex instead of i_sem
+AC_DEFUN([LC_INODE_I_MUTEX],
+[AC_MSG_CHECKING([if inode has i_mutex ])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/mutex.h>
+ #include <linux/fs.h>
+ #undef i_mutex
+],[
+ struct inode i;
+
+ mutex_unlock(&i.i_mutex);
],[
AC_MSG_RESULT(yes)
- AC_DEFINE(HAVE_SEQ_LOCK, 1,
- [after 2.6.18 seq_file has lock intead of sem])
+ AC_DEFINE(HAVE_INODE_I_MUTEX, 1,
+                [after 2.6.15 inodes have i_mutex instead of i_sem])
],[
- AC_MSG_RESULT(NO)
+ AC_MSG_RESULT(no)
+])
+])
+
+# 2.6.16
+
+# LC_SECURITY_PLUG # for SLES10 SP2
+# check security plug in sles10 sp2 kernel
+AC_DEFUN([LC_SECURITY_PLUG],
+[AC_MSG_CHECKING([If kernel has security plug support])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+],[
+ struct dentry *dentry;
+ struct vfsmount *mnt;
+ struct iattr *iattr;
+
+ notify_change(dentry, mnt, iattr);
+],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_SECURITY_PLUG, 1,
+ [SLES10 SP2 use extra parameter in vfs])
+],[
+ AC_MSG_RESULT(no)
+])
+])
+
+# 2.6.17
+
+# inode have i_private field since 2.6.17
+AC_DEFUN([LC_INODE_IPRIVATE],
+[AC_MSG_CHECKING([if inode has a i_private field])
+LB_LINUX_TRY_COMPILE([
+#include <linux/fs.h>
+],[
+ struct inode i;
+ i.i_private = NULL;
+],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_INODE_IPRIVATE, 1,
+ [struct inode has i_private field])
+],[
+ AC_MSG_RESULT(no)
])
])
])
])
-# LC_FLUSH_OWNER_ID
-# starting from 2.6.18 the file_operations .flush
-# method has a new "fl_owner_t id" parameter
-#
-AC_DEFUN([LC_FLUSH_OWNER_ID],
-[AC_MSG_CHECKING([if file_operations .flush has an fl_owner_t id])
+# 2.6.18
+
+# LC_NR_PAGECACHE
+# 2.6.18 doesn't export nr_pagecache
+AC_DEFUN([LC_NR_PAGECACHE],
+[AC_MSG_CHECKING([kernel export nr_pagecache])
LB_LINUX_TRY_COMPILE([
- #include <linux/fs.h>
+ #include <linux/pagemap.h>
],[
- struct file_operations *fops = NULL;
- fl_owner_t id;
- int i;
-
- i = fops->flush(NULL, id);
+ return atomic_read(&nr_pagecache);
],[
- AC_DEFINE(HAVE_FLUSH_OWNER_ID, 1,
- [file_operations .flush method has an fl_owner_t id])
- AC_MSG_RESULT([yes])
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_NR_PAGECACHE, 1,
+ [is kernel export nr_pagecache])
],[
- AC_MSG_RESULT([no])
+ AC_MSG_RESULT(no)
])
])
EXTRA_KCFLAGS="$tmp_flags"
])
-# inode have i_private field since 2.6.17
-AC_DEFUN([LC_INODE_IPRIVATE],
-[AC_MSG_CHECKING([if inode has a i_private field])
+# LC_SEQ_LOCK
+# after 2.6.18 seq_file has lock instead of sem
+AC_DEFUN([LC_SEQ_LOCK],
+[AC_MSG_CHECKING([if struct seq_file has lock field])
LB_LINUX_TRY_COMPILE([
-#include <linux/fs.h>
+ #include <linux/seq_file.h>
],[
- struct inode i;
- i.i_private = NULL;
+ struct seq_file seq;
+
+ mutex_unlock(&seq.lock);
],[
- AC_MSG_RESULT(yes)
- AC_DEFINE(HAVE_INODE_IPRIVATE, 1,
- [struct inode has i_private field])
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_SEQ_LOCK, 1,
+                [after 2.6.18 seq_file has lock instead of sem])
],[
- AC_MSG_RESULT(no)
+        AC_MSG_RESULT(no)
+])
+])
+
+#
+# LC_EXPORT_FILEMAP_FDATAWRITE_RANGE
+#
+# No standard kernels export this
+#
+AC_DEFUN([LC_EXPORT_FILEMAP_FDATAWRITE_RANGE],
+[LB_CHECK_SYMBOL_EXPORT([filemap_fdatawrite_range],
+[mm/filemap.c],[
+AC_DEFINE(HAVE_FILEMAP_FDATAWRITE_RANGE, 1,
+ [filemap_fdatawrite_range is exported by the kernel])
+],[
+])
+])
+
+# LC_FLUSH_OWNER_ID
+# starting from 2.6.18 the file_operations .flush
+# method has a new "fl_owner_t id" parameter
+#
+AC_DEFUN([LC_FLUSH_OWNER_ID],
+[AC_MSG_CHECKING([if file_operations .flush has an fl_owner_t id])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+],[
+ struct file_operations *fops = NULL;
+ fl_owner_t id;
+ int i;
+
+ i = fops->flush(NULL, id);
+],[
+ AC_DEFINE(HAVE_FLUSH_OWNER_ID, 1,
+ [file_operations .flush method has an fl_owner_t id])
+ AC_MSG_RESULT([yes])
+],[
+ AC_MSG_RESULT([no])
+])
+])
+
+#
+# LC_EXPORT_INVALIDATE_MAPPING_PAGES
+#
+# SLES9, RHEL4, RHEL5, vanilla 2.6.24 export invalidate_mapping_pages() but
+# SLES10 2.6.16 does not, for some reason. For filter cache invalidation.
+#
+AC_DEFUN([LC_EXPORT_INVALIDATE_MAPPING_PAGES],
+ [LB_CHECK_SYMBOL_EXPORT([invalidate_mapping_pages], [mm/truncate.c], [
+ AC_DEFINE(HAVE_INVALIDATE_MAPPING_PAGES, 1,
+ [exported invalidate_mapping_pages])],
+ [LB_CHECK_SYMBOL_EXPORT([invalidate_inode_pages], [mm/truncate.c], [
+ AC_DEFINE(HAVE_INVALIDATE_INODE_PAGES, 1,
+ [exported invalidate_inode_pages])], [
+ AC_MSG_ERROR([no way to invalidate pages])
+ ])
+ ],[])
+])
+
+#
+# LC_EXT4_DISCARD_PREALLOCATIONS
+#
+AC_DEFUN([LC_EXT4_DISCARD_PREALLOCATIONS],
+[AC_MSG_CHECKING([if ext4_discard_preallocations defined])
+tmp_flags="$EXTRA_KCFLAGS"
+EXTRA_KCFLAGS="-I$LINUX/fs"
+LB_LINUX_TRY_COMPILE([
+ #include <ext4/ext4.h>
+],[
+ struct inode i;
+ ext4_discard_preallocations(&i);
+],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(LDISKFS_DISCARD_PREALLOCATIONS, 1,
+                [ext4_discard_preallocations defined])
+],[
+ AC_MSG_RESULT(no)
+])
+EXTRA_KCFLAGS="$tmp_flags"
+])
+
+#
+# LC_EXT_INSERT_EXTENT_WITH_5ARGS
+#
+AC_DEFUN([LC_EXT_INSERT_EXTENT_WITH_5ARGS],
+[AC_MSG_CHECKING([ext4_ext_insert_extent needs 5 arguments])
+tmp_flags="$EXTRA_KCFLAGS"
+EXTRA_KCFLAGS="-I$LINUX/fs"
+LB_LINUX_TRY_COMPILE([
+ #include <ext4/ext4_extents.h>
+],[
+ ext4_ext_insert_extent(NULL, NULL, NULL, NULL, 0);
+],[
+ AC_DEFINE([EXT_INSERT_EXTENT_WITH_5ARGS], 1,
+                [ext4_ext_insert_extent needs 5 arguments])
+ AC_MSG_RESULT([yes])
+],[
+ AC_MSG_RESULT([no])
+])
+EXTRA_KCFLAGS="$tmp_flags"
+])
+
+#2.6.18 + RHEL5 (fc6)
+
+# RHEL5 in FS-cache patch rename PG_checked flag into PG_fs_misc
+AC_DEFUN([LC_PG_FS_MISC],
+[AC_MSG_CHECKING([kernel has PG_fs_misc])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/mm.h>
+ #include <linux/page-flags.h>
+],[
+ #ifndef PG_fs_misc
+ #error PG_fs_misc not defined in kernel
+ #endif
+],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_PG_FS_MISC, 1,
+ [is kernel have PG_fs_misc])
+],[
+ AC_MSG_RESULT(no)
+])
+])
+
+# RHEL5 PageChecked and SetPageChecked defined
+AC_DEFUN([LC_PAGE_CHECKED],
+[AC_MSG_CHECKING([kernel has PageChecked and SetPageChecked])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/autoconf.h>
+#ifdef HAVE_LINUX_MMTYPES_H
+ #include <linux/mm_types.h>
+#endif
+ #include <linux/page-flags.h>
+],[
+ struct page *p;
+
+ /* before 2.6.26 this define*/
+ #ifndef PageChecked
+ /* 2.6.26 use function instead of define for it */
+ SetPageChecked(p);
+ PageChecked(p);
+ #endif
+],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_PAGE_CHECKED, 1,
+ [does kernel have PageChecked and SetPageChecked])
+],[
+ AC_MSG_RESULT(no)
+])
])
+
+#
+# LC_LINUX_FIEMAP_H
+#
+# If we have fiemap.h
+# after 2.6.27 use fiemap.h in include/linux
+#
+AC_DEFUN([LC_LINUX_FIEMAP_H],
+[LB_CHECK_FILE([$LINUX/include/linux/fiemap.h],[
+ AC_MSG_CHECKING([if fiemap.h can be compiled])
+ LB_LINUX_TRY_COMPILE([
+ #include <linux/types.h>
+ #include <linux/fiemap.h>
+ ],[],[
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_LINUX_FIEMAP_H, 1, [Kernel has fiemap.h])
+ ],[
+ AC_MSG_RESULT([no])
+ ])
+],
+[])
])
+# 2.6.19
+
# 2.6.19 API changes
# inode don't have i_blksize field
AC_DEFUN([LC_INODE_BLKSIZE],
])
])
-# LC_GENERIC_FILE_READ
+# LC_FILE_READV
# 2.6.19 replaced readv with aio_read
AC_DEFUN([LC_FILE_READV],
[AC_MSG_CHECKING([readv in fops])
])
])
-# LC_NR_PAGECACHE
-# 2.6.18 don't export nr_pagecahe
-AC_DEFUN([LC_NR_PAGECACHE],
-[AC_MSG_CHECKING([kernel export nr_pagecache])
-LB_LINUX_TRY_COMPILE([
- #include <linux/pagemap.h>
-],[
- return atomic_read(&nr_pagecache);
-],[
- AC_MSG_RESULT(yes)
- AC_DEFINE(HAVE_NR_PAGECACHE, 1,
- [is kernel export nr_pagecache])
-],[
- AC_MSG_RESULT(no)
-])
-])
+# 2.6.20
# LC_CANCEL_DIRTY_PAGE
# 2.6.20 introduced cancel_dirty_page instead of clear_page_dirty.
fi
])
+# raid5-zerocopy patch
+
#
# LC_PAGE_CONSTANT
#
])
])
-# RHEL5 in FS-cache patch rename PG_checked flag into PG_fs_misc
-AC_DEFUN([LC_PG_FS_MISC],
-[AC_MSG_CHECKING([kernel has PG_fs_misc])
+# 2.6.22
+
+# 2.6.22 lost second parameter for invalidate_bdev
+AC_DEFUN([LC_INVALIDATE_BDEV_2ARG],
+[AC_MSG_CHECKING([if invalidate_bdev has second argument])
LB_LINUX_TRY_COMPILE([
- #include <linux/mm.h>
- #include <linux/page-flags.h>
+ #include <linux/buffer_head.h>
],[
- #ifndef PG_fs_misc
- #error PG_fs_misc not defined in kernel
- #endif
+ invalidate_bdev(NULL,0);
],[
- AC_MSG_RESULT(yes)
- AC_DEFINE(HAVE_PG_FS_MISC, 1,
- [is kernel have PG_fs_misc])
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_INVALIDATE_BDEV_2ARG, 1,
+ [invalidate_bdev has second argument])
],[
- AC_MSG_RESULT(no)
+ AC_MSG_RESULT([no])
])
])
-# RHEL5 PageChecked and SetPageChecked defined
-AC_DEFUN([LC_PAGE_CHECKED],
-[AC_MSG_CHECKING([kernel has PageChecked and SetPageChecked])
+#
+# check for crypto API
+#
+AC_DEFUN([LC_ASYNC_BLOCK_CIPHER],
+[AC_MSG_CHECKING([if kernel has block cipher support])
LB_LINUX_TRY_COMPILE([
- #include <linux/autoconf.h>
-#ifdef HAVE_LINUX_MMTYPES_H
- #include <linux/mm_types.h>
-#endif
- #include <linux/page-flags.h>
+ #include <linux/err.h>
+ #include <linux/crypto.h>
],[
- struct page *p;
-
- /* before 2.6.26 this define*/
- #ifndef PageChecked
- /* 2.6.26 use function instead of define for it */
- SetPageChecked(p);
- PageChecked(p);
- #endif
+ struct crypto_blkcipher *tfm;
+ tfm = crypto_alloc_blkcipher("aes", 0, 0 );
],[
- AC_MSG_RESULT(yes)
- AC_DEFINE(HAVE_PAGE_CHECKED, 1,
- [does kernel have PageChecked and SetPageChecked])
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_ASYNC_BLOCK_CIPHER, 1, [kernel has block cipher support])
],[
- AC_MSG_RESULT(no)
+ AC_MSG_RESULT([no])
])
])
-AC_DEFUN([LC_EXPORT_TRUNCATE_COMPLETE],
-[LB_CHECK_SYMBOL_EXPORT([truncate_complete_page],
-[mm/truncate.c],[
-AC_DEFINE(HAVE_TRUNCATE_COMPLETE_PAGE, 1,
- [kernel export truncate_complete_page])
+#
+# check for struct hash_desc
+#
+AC_DEFUN([LC_STRUCT_HASH_DESC],
+[AC_MSG_CHECKING([if kernel has struct hash_desc])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/err.h>
+ #include <linux/crypto.h>
],[
-])
-])
-
-AC_DEFUN([LC_EXPORT_TRUNCATE_RANGE],
-[LB_CHECK_SYMBOL_EXPORT([truncate_inode_pages_range],
-[mm/truncate.c],[
-AC_DEFINE(HAVE_TRUNCATE_RANGE, 1,
- [kernel export truncate_inode_pages_range])
+ struct hash_desc foo;
],[
-])
-])
-
-AC_DEFUN([LC_EXPORT_D_REHASH_COND],
-[LB_CHECK_SYMBOL_EXPORT([d_rehash_cond],
-[fs/dcache.c],[
-AC_DEFINE(HAVE_D_REHASH_COND, 1,
- [d_rehash_cond is exported by the kernel])
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_STRUCT_HASH_DESC, 1, [kernel has struct hash_desc])
],[
+ AC_MSG_RESULT([no])
])
])
-AC_DEFUN([LC_EXPORT___D_REHASH],
-[LB_CHECK_SYMBOL_EXPORT([__d_rehash],
-[fs/dcache.c],[
-AC_DEFINE(HAVE___D_REHASH, 1,
- [__d_rehash is exported by the kernel])
+#
+# check for struct blkcipher_desc
+#
+AC_DEFUN([LC_STRUCT_BLKCIPHER_DESC],
+[AC_MSG_CHECKING([if kernel has struct blkcipher_desc])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/err.h>
+ #include <linux/crypto.h>
],[
-])
-])
-
-AC_DEFUN([LC_EXPORT_D_MOVE_LOCKED],
-[LB_CHECK_SYMBOL_EXPORT([d_move_locked],
-[fs/dcache.c],[
-AC_DEFINE(HAVE_D_MOVE_LOCKED, 1,
- [d_move_locked is exported by the kernel])
+ struct blkcipher_desc foo;
],[
-])
-])
-
-AC_DEFUN([LC_EXPORT___D_MOVE],
-[LB_CHECK_SYMBOL_EXPORT([__d_move],
-[fs/dcache.c],[
-AC_DEFINE(HAVE___D_MOVE, 1,
- [__d_move is exported by the kernel])
+ AC_MSG_RESULT([yes])
+ AC_DEFINE(HAVE_STRUCT_BLKCIPHER_DESC, 1, [kernel has struct blkcipher_desc])
],[
+ AC_MSG_RESULT([no])
])
])
#
-# LC_EXPORT_INVALIDATE_MAPPING_PAGES
-#
-# SLES9, RHEL4, RHEL5, vanilla 2.6.24 export invalidate_mapping_pages() but
-# SLES10 2.6.16 does not, for some reason. For filter cache invalidation.
-#
-AC_DEFUN([LC_EXPORT_INVALIDATE_MAPPING_PAGES],
- [LB_CHECK_SYMBOL_EXPORT([invalidate_mapping_pages], [mm/truncate.c], [
- AC_DEFINE(HAVE_INVALIDATE_MAPPING_PAGES, 1,
- [exported invalidate_mapping_pages])],
- [LB_CHECK_SYMBOL_EXPORT([invalidate_inode_pages], [mm/truncate.c], [
- AC_DEFINE(HAVE_INVALIDATE_INODE_PAGES, 1,
- [exported invalidate_inode_pages])], [
- AC_MSG_ERROR([no way to invalidate pages])
- ])
- ],[])
-])
-
-#
-# LC_EXPORT_FILEMAP_FDATASYNC_RANGE
-#
-# No standard kernels export this
+# 2.6.19 check for FS_RENAME_DOES_D_MOVE flag
#
-AC_DEFUN([LC_EXPORT_FILEMAP_FDATAWRITE_RANGE],
-[LB_CHECK_SYMBOL_EXPORT([filemap_fdatawrite_range],
-[mm/filemap.c],[
-AC_DEFINE(HAVE_FILEMAP_FDATAWRITE_RANGE, 1,
- [filemap_fdatawrite_range is exported by the kernel])
-],[
-])
-])
-
-# The actual symbol exported varies among architectures, so we need
-# to check many symbols (but only in the current architecture.) No
-# matter what symbol is exported, the kernel #defines node_to_cpumask
-# to the appropriate function and that's what we use.
-AC_DEFUN([LC_EXPORT_NODE_TO_CPUMASK],
- [LB_CHECK_SYMBOL_EXPORT([node_to_cpumask],
- [arch/$LINUX_ARCH/mm/numa.c],
- [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1,
- [node_to_cpumask is exported by
- the kernel])]) # x86_64
- LB_CHECK_SYMBOL_EXPORT([node_to_cpu_mask],
- [arch/$LINUX_ARCH/kernel/smpboot.c],
- [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1,
- [node_to_cpumask is exported by
- the kernel])]) # ia64
- LB_CHECK_SYMBOL_EXPORT([node_2_cpu_mask],
- [arch/$LINUX_ARCH/kernel/smpboot.c],
- [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1,
- [node_to_cpumask is exported by
- the kernel])]) # i386
- ])
-
-# 2.6.22 lost second parameter for invalidate_bdev
-AC_DEFUN([LC_INVALIDATE_BDEV_2ARG],
-[AC_MSG_CHECKING([if invalidate_bdev has second argument])
+AC_DEFUN([LC_FS_RENAME_DOES_D_MOVE],
+[AC_MSG_CHECKING([if kernel has FS_RENAME_DOES_D_MOVE flag])
LB_LINUX_TRY_COMPILE([
- #include <linux/buffer_head.h>
+ #include <linux/fs.h>
],[
- invalidate_bdev(NULL,0);
+ int v = FS_RENAME_DOES_D_MOVE;
],[
AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_INVALIDATE_BDEV_2ARG, 1,
- [invalidate_bdev has second argument])
+ AC_DEFINE(HAVE_FS_RENAME_DOES_D_MOVE, 1, [kernel has FS_RENAME_DOES_D_MOVE flag])
],[
AC_MSG_RESULT([no])
])
])
-# 2.6.18
-
+# 2.6.23
# 2.6.23 have return type 'void' for unregister_blkdev
AC_DEFUN([LC_UNREGISTER_BLKDEV_RETURN_INT],
])
# 2.6.23 change .sendfile to .splice_read
-# RHEL4 (-92 kernel) have both sendfile and .splice_read API
-AC_DEFUN([LC_KERNEL_SENDFILE],
-[AC_MSG_CHECKING([if kernel has .sendfile])
+AC_DEFUN([LC_KERNEL_SPLICE_READ],
+[AC_MSG_CHECKING([if kernel has .splice_read])
LB_LINUX_TRY_COMPILE([
#include <linux/fs.h>
],[
struct file_operations file;
- file.sendfile = NULL;
+ file.splice_read = NULL;
], [
AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_KERNEL_SENDFILE, 1,
- [kernel has .sendfile])
+ AC_DEFINE(HAVE_KERNEL_SPLICE_READ, 1,
+                [kernel has .splice_read])
],[
AC_MSG_RESULT([no])
])
])
# 2.6.23 change .sendfile to .splice_read
-AC_DEFUN([LC_KERNEL_SPLICE_READ],
-[AC_MSG_CHECKING([if kernel has .splice_read])
+# RHEL4 (-92 kernel) have both sendfile and .splice_read API
+AC_DEFUN([LC_KERNEL_SENDFILE],
+[AC_MSG_CHECKING([if kernel has .sendfile])
LB_LINUX_TRY_COMPILE([
#include <linux/fs.h>
],[
struct file_operations file;
- file.splice_read = NULL;
+ file.sendfile = NULL;
], [
AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_KERNEL_SPLICE_READ, 1,
- [kernel has .slice_read])
+ AC_DEFINE(HAVE_KERNEL_SENDFILE, 1,
+ [kernel has .sendfile])
],[
AC_MSG_RESULT([no])
])
])
])
-#2.6.23 has new shrinker API
+# 2.6.23 has new shrinker API
AC_DEFUN([LC_REGISTER_SHRINKER],
-[AC_MSG_CHECKING([if kernel has register_shrinker])
-LB_LINUX_TRY_COMPILE([
- #include <linux/mm.h>
-],[
- register_shrinker(NULL);
-], [
- AC_MSG_RESULT([yes])
+[LB_CHECK_SYMBOL_EXPORT([register_shrinker],
+[mm/vmscan.c],[
AC_DEFINE(HAVE_REGISTER_SHRINKER, 1,
- [kernel has register_shrinker])
+ [kernel exports register_shrinker])
],[
- AC_MSG_RESULT([no])
])
])
])
])
+# 2.6.23 exports exportfs_decode_fh
+AC_DEFUN([LC_EXPORTFS_DECODE_FH],
+[LB_CHECK_SYMBOL_EXPORT([exportfs_decode_fh],
+[fs/exportfs/expfs.c],[
+ AC_DEFINE(HAVE_EXPORTFS_DECODE_FH, 1,
+                [exportfs_decode_fh has been exported])
+],[
+])
+])
+
+# 2.6.24
+
+# 2.6.24 need linux/mm_types.h included
+AC_DEFUN([LC_HAVE_MMTYPES_H],
+[LB_CHECK_FILE([$LINUX/include/linux/mm_types.h], [
+ AC_DEFINE(HAVE_LINUX_MMTYPES_H, 1,
+ [kernel has include/mm_types.h])
+],[
+ AC_MSG_RESULT([no])
+])
+])
+
# 2.6.24 has bio_endio with 2 args
AC_DEFUN([LC_BIO_ENDIO_2ARG],
[AC_MSG_CHECKING([if kernel has bio_endio with 2 args])
])
])
-# 2.6.24 need linux/mm_types.h included
-AC_DEFUN([LC_HAVE_MMTYPES_H],
-[LB_CHECK_FILE([$LINUX/include/linux/mm_types.h], [
- AC_DEFINE(HAVE_LINUX_MMTYPES_H, 1,
- [kernel has include/mm_types.h])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
# 2.6.24 removes long aged procfs entry -> deleted member
AC_DEFUN([LC_PROCFS_DELETED],
[AC_MSG_CHECKING([if kernel has deleted member in procfs entry struct])
])
])
+# 2.6.24 has bdi_init()/bdi_destroy() functions.
+AC_DEFUN([LC_EXPORT_BDI_INIT],
+[LB_CHECK_SYMBOL_EXPORT([bdi_init],
+[mm/backing-dev.c],[
+ AC_DEFINE(HAVE_BDI_INIT, 1,
+ [bdi_init/bdi_destroy functions are present])
+],[
+])
+])
+
+# 2.6.25
+
# 2.6.25 change define to inline
AC_DEFUN([LC_MAPPING_CAP_WRITEBACK_DIRTY],
[AC_MSG_CHECKING([if kernel have mapping_cap_writeback_dirty])
])
])
-
+# 2.6.26
# 2.6.26 isn't export set_fs_pwd and change paramter in fs struct
AC_DEFUN([LC_FS_STRUCT_USE_PATH],
], [
AC_MSG_RESULT([yes])
AC_DEFINE(HAVE_FS_STRUCT_USE_PATH, 1,
- [fs_struct use path structure])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
-#
-# LC_VFS_INTENT_PATCHES
-#
-# check if the kernel has the VFS intent patches
-AC_DEFUN([LC_VFS_INTENT_PATCHES],
-[AC_MSG_CHECKING([if the kernel has the VFS intent patches])
-LB_LINUX_TRY_COMPILE([
- #include <linux/fs.h>
- #include <linux/namei.h>
-],[
- struct nameidata nd;
- struct lookup_intent *it;
-
- it = &nd.intent;
- intent_init(it, IT_OPEN);
- it->d.lustre.it_disposition = 0;
- it->d.lustre.it_data = NULL;
-],[
- AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_VFS_INTENT_PATCHES, 1, [VFS intent patches are applied])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
-AC_DEFUN([LC_S_TIME_GRAN],
-[AC_MSG_CHECKING([if super block has s_time_gran member])
-LB_LINUX_TRY_COMPILE([
- #include <linux/fs.h>
-],[
- struct super_block sb;
-
- return sb.s_time_gran;
-],[
- AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_S_TIME_GRAN, 1, [super block has s_time_gran member])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
-AC_DEFUN([LC_SB_TIME_GRAN],
-[AC_MSG_CHECKING([if kernel has old get_sb_time_gran])
-LB_LINUX_TRY_COMPILE([
- #include <linux/fs.h>
-],[
- return get_sb_time_gran(NULL);
-],[
- AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_SB_TIME_GRAN, 1, [kernel has old get_sb_time_gran])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
-#
-# LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP
-#
-# Check for our patched grab_cache_page_nowait_gfp() function
-# after 2.6.29 we can emulate this using add_to_page_cache_lru()
-#
-AC_DEFUN([LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP],
-[LB_CHECK_SYMBOL_EXPORT([grab_cache_page_nowait_gfp],
-[mm/filemap.c],[
- AC_DEFINE(HAVE_GRAB_CACHE_PAGE_NOWAIT_GFP, 1,
- [kernel exports grab_cache_page_nowait_gfp])
- ],
- [LB_CHECK_SYMBOL_EXPORT([add_to_page_cache_lru],
- [mm/filemap.c],[
- AC_DEFINE(HAVE_ADD_TO_PAGE_CACHE_LRU, 1,
- [kernel exports add_to_page_cache_lru])
- ],[
- ])
- ])
-])
-
-# ~2.6.12 merge patch from oracle to convert tree_lock from spinlock to rwlock
-AC_DEFUN([LC_RW_TREE_LOCK],
-[AC_MSG_CHECKING([if kernel has tree_lock as rwlock])
-tmp_flags="$EXTRA_KCFLAGS"
-EXTRA_KCFLAGS="-Werror"
-LB_LINUX_TRY_COMPILE([
- #include <linux/fs.h>
-],[
- struct address_space a;
-
- write_lock(&a.tree_lock);
-],[
- AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_RW_TREE_LOCK, 1, [kernel has tree_lock as rw_lock])
-],[
- AC_MSG_RESULT([no])
-])
-EXTRA_KCFLAGS="$tmp_flags"
-])
-
-AC_DEFUN([LC_CONST_ACL_SIZE],
-[AC_MSG_CHECKING([calc acl size])
-tmp_flags="$CFLAGS"
-CFLAGS="$CFLAGS -I$LINUX/include -I$LINUX_OBJ/include -I$LINUX_OBJ/include2 -I$LINUX/arch/`uname -m|sed -e 's/ppc.*/powerpc/' -e 's/x86_64/x86/' -e 's/i.86/x86/'`/include $EXTRA_KCFLAGS"
-AC_TRY_RUN([
-#define __KERNEL__
-#include <linux/autoconf.h>
-#include <linux/types.h>
-#undef __KERNEL__
-// block include
-#define __LINUX_POSIX_ACL_H
-
-# ifdef CONFIG_FS_POSIX_ACL
-# ifdef HAVE_XATTR_ACL
-# include <linux/xattr_acl.h>
-# endif
-# ifdef HAVE_LINUX_POSIX_ACL_XATTR_H
-# include <linux/posix_acl_xattr.h>
-# endif
-# endif
-
-#include <lustre_acl.h>
-
-#include <stdio.h>
-
-int main(void)
-{
- int size = mds_xattr_acl_size(LUSTRE_POSIX_ACL_MAX_ENTRIES);
- FILE *f = fopen("acl.size","w+");
- fprintf(f,"%d", size);
- fclose(f);
-
- return 0;
-}
-
-],[
- acl_size=`cat acl.size`
- AC_MSG_RESULT([ACL size $acl_size])
- AC_DEFINE_UNQUOTED(XATTR_ACL_SIZE, AS_TR_SH([$acl_size]), [size of xattr acl])
-],[
- AC_ERROR([ACL size can't computed])
-])
-CFLAGS="$tmp_flags"
-])
-
-#
-# check for crypto API
-#
-AC_DEFUN([LC_ASYNC_BLOCK_CIPHER],
-[AC_MSG_CHECKING([if kernel has block cipher support])
-LB_LINUX_TRY_COMPILE([
- #include <linux/err.h>
- #include <linux/crypto.h>
-],[
- struct crypto_blkcipher *tfm;
- tfm = crypto_alloc_blkcipher("aes", 0, 0 );
-],[
- AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_ASYNC_BLOCK_CIPHER, 1, [kernel has block cipher support])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
-#
-# check for struct hash_desc
-#
-AC_DEFUN([LC_STRUCT_HASH_DESC],
-[AC_MSG_CHECKING([if kernel has struct hash_desc])
-LB_LINUX_TRY_COMPILE([
- #include <linux/err.h>
- #include <linux/crypto.h>
-],[
- struct hash_desc foo;
-],[
- AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_STRUCT_HASH_DESC, 1, [kernel has struct hash_desc])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
-#
-# check for struct blkcipher_desc
-#
-AC_DEFUN([LC_STRUCT_BLKCIPHER_DESC],
-[AC_MSG_CHECKING([if kernel has struct blkcipher_desc])
-LB_LINUX_TRY_COMPILE([
- #include <linux/err.h>
- #include <linux/crypto.h>
-],[
- struct blkcipher_desc foo;
-],[
- AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_STRUCT_BLKCIPHER_DESC, 1, [kernel has struct blkcipher_desc])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
-#
-# 2.6.19 check for FS_RENAME_DOES_D_MOVE flag
-#
-AC_DEFUN([LC_FS_RENAME_DOES_D_MOVE],
-[AC_MSG_CHECKING([if kernel has FS_RENAME_DOES_D_MOVE flag])
-LB_LINUX_TRY_COMPILE([
- #include <linux/fs.h>
-],[
- int v = FS_RENAME_DOES_D_MOVE;
-],[
- AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_FS_RENAME_DOES_D_MOVE, 1, [kernel has FS_RENAME_DOES_D_MOVE flag])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
-#
-# LC_FUNC_F_OP_FLOCK
-#
-# rhel4.2 kernel has f_op->flock field
-#
-AC_DEFUN([LC_FUNC_F_OP_FLOCK],
-[AC_MSG_CHECKING([if struct file_operations has flock field])
-LB_LINUX_TRY_COMPILE([
- #include <linux/fs.h>
-],[
- struct file_operations ll_file_operations_flock;
- ll_file_operations_flock.flock = NULL;
-],[
- AC_DEFINE(HAVE_F_OP_FLOCK, 1,
- [struct file_operations has flock field])
- AC_MSG_RESULT([yes])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
-# vfs_symlink seems to have started out with 3 args until 2.6.7 where a
-# "mode" argument was added, but then again, in some later version it was
-# removed
-AC_DEFUN([LC_4ARGS_VFS_SYMLINK],
-[AC_MSG_CHECKING([if vfs_symlink wants 4 args])
-LB_LINUX_TRY_COMPILE([
- #include <linux/fs.h>
-],[
- struct inode *dir;
- struct dentry *dentry;
- const char *oldname = NULL;
- int mode = 0;
-
- vfs_symlink(dir, dentry, oldname, mode);
-],[
- AC_MSG_RESULT(yes)
- AC_DEFINE(HAVE_4ARGS_VFS_SYMLINK, 1,
- [vfs_symlink wants 4 args])
+ [fs_struct use path structure])
],[
- AC_MSG_RESULT(no)
+ AC_MSG_RESULT([no])
])
])
-# 2.6.23 has new shrinker API
-AC_DEFUN([LC_REGISTER_SHRINKER],
-[LB_CHECK_SYMBOL_EXPORT([register_shrinker],
-[mm/vmscan.c],[
- AC_DEFINE(HAVE_REGISTER_SHRINKER, 1,
- [kernel exports register_shrinker])
-],[
-])
-])
+# 2.6.27
-#2.6.27
AC_DEFUN([LC_INODE_PERMISION_2ARGS],
[AC_MSG_CHECKING([inode_operations->permission has two args])
LB_LINUX_TRY_COMPILE([
])
])
-# vfs_symlink seems to have started out with 3 args until 2.6.7 where a
-# "mode" argument was added, but then again, in some later version it was
-# removed
-AC_DEFUN([LC_4ARGS_VFS_SYMLINK],
-[AC_MSG_CHECKING([if vfs_symlink wants 4 args])
-LB_LINUX_TRY_COMPILE([
- #include <linux/fs.h>
-],[
- struct inode *dir;
- struct dentry *dentry;
- const char *oldname = NULL;
- int mode = 0;
-
- vfs_symlink(dir, dentry, oldname, mode);
-],[
- AC_MSG_RESULT(yes)
- AC_DEFINE(HAVE_4ARGS_VFS_SYMLINK, 1,
- [vfs_symlink wants 4 args])
-],[
- AC_MSG_RESULT(no)
-])
-])
-
-# 2.6.27 sles11 remove the bi_hw_segments
-AC_DEFUN([LC_BI_HW_SEGMENTS],
-[AC_MSG_CHECKING([struct bio has a bi_hw_segments field])
-LB_LINUX_TRY_COMPILE([
- #include <linux/bio.h>
-],[
- struct bio io;
- io.bi_hw_segments = 0;
-],[
- AC_DEFINE(HAVE_BI_HW_SEGMENTS, 1,
- [struct bio has a bi_hw_segments field])
- AC_MSG_RESULT([yes])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
-#
-# 2.6.27 sles11 move the quotaio_v1{2}.h from include/linux to fs
-# 2.6.32 move the quotaio_v1{2}.h from fs to fs/quota
-AC_DEFUN([LC_HAVE_QUOTAIO_V1_H],
-[LB_CHECK_FILE([$LINUX/include/linux/quotaio_v1.h],[
- AC_DEFINE(HAVE_QUOTAIO_V1_H, 1,
- [kernel has include/linux/quotaio_v1.h])
-],[LB_CHECK_FILE([$LINUX/fs/quota/quotaio_v1.h],[
- AC_DEFINE(HAVE_FS_QUOTA_QUOTAIO_V1_H, 1,
- [kernel has fs/quota/quotaio_v1.h])
-],[
- AC_MSG_RESULT([no])
-])
-])
-])
-
-# sles10 sp2 need 5 parameter for vfs_symlink
-AC_DEFUN([LC_VFS_SYMLINK_5ARGS],
-[AC_MSG_CHECKING([vfs_symlink need 5 parameter])
-LB_LINUX_TRY_COMPILE([
- #include <linux/fs.h>
-],[
- struct inode *dir = NULL;
- struct dentry *dentry = NULL;
- struct vfsmount *mnt = NULL;
- const char * path = NULL;
- vfs_symlink(dir, dentry, mnt, path, 0);
-],[
- AC_DEFINE(HAVE_VFS_SYMLINK_5ARGS, 1,
- [vfs_symlink need 5 parameteres])
- AC_MSG_RESULT([yes])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
# 2.6.27 removed the read_inode from super_operations.
AC_DEFUN([LC_READ_INODE_IN_SBOPS],
[AC_MSG_CHECKING([super_operations has a read_inode field])
])
])
-# 2.6.27 sles11 has sb_any_quota_active
-AC_DEFUN([LC_SB_ANY_QUOTA_ACTIVE],
-[AC_MSG_CHECKING([Kernel has sb_any_quota_active])
-LB_LINUX_TRY_COMPILE([
- #include <linux/quotaops.h>
-],[
- sb_any_quota_active(NULL);
-],[
- AC_DEFINE(HAVE_SB_ANY_QUOTA_ACTIVE, 1,
- [Kernel has a sb_any_quota_active])
- AC_MSG_RESULT([yes])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
-# 2.6.27 sles11 has sb_has_quota_active
-AC_DEFUN([LC_SB_HAS_QUOTA_ACTIVE],
-[AC_MSG_CHECKING([Kernel has sb_has_quota_active])
-LB_LINUX_TRY_COMPILE([
- #include <linux/quotaops.h>
-],[
- sb_has_quota_active(NULL, 0);
-],[
- AC_DEFINE(HAVE_SB_HAS_QUOTA_ACTIVE, 1,
- [Kernel has a sb_has_quota_active])
- AC_MSG_RESULT([yes])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
# 2.6.27 has inode_permission instead of permisson
AC_DEFUN([LC_EXPORT_INODE_PERMISSION],
[LB_CHECK_SYMBOL_EXPORT([inode_permission],
])
])
-#
-# LC_LINUX_FIEMAP_H
-#
-# If we have fiemap.h
-# after 2.6.27 use fiemap.h in include/linux
-#
-AC_DEFUN([LC_LINUX_FIEMAP_H],
-[LB_CHECK_FILE([$LINUX/include/linux/fiemap.h],[
- AC_MSG_CHECKING([if fiemap.h can be compiled])
- LB_LINUX_TRY_COMPILE([
- #include <linux/types.h>
- #include <linux/fiemap.h>
- ],[],[
- AC_MSG_RESULT([yes])
- AC_DEFINE(HAVE_LINUX_FIEMAP_H, 1, [Kernel has fiemap.h])
- ],[
- AC_MSG_RESULT([no])
- ])
-],
-[])
-])
-
# LC_LOCK_MAP_ACQUIRE
# after 2.6.27 lock_map_acquire replaces lock_acquire
AC_DEFUN([LC_LOCK_MAP_ACQUIRE],
])
])
+# 2.6.27.15-2 sles11
+
+# 2.6.27 sles11 remove the bi_hw_segments
+AC_DEFUN([LC_BI_HW_SEGMENTS],
+[AC_MSG_CHECKING([struct bio has a bi_hw_segments field])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/bio.h>
+],[
+ struct bio io;
+ io.bi_hw_segments = 0;
+],[
+ AC_DEFINE(HAVE_BI_HW_SEGMENTS, 1,
+ [struct bio has a bi_hw_segments field])
+ AC_MSG_RESULT([yes])
+],[
+ AC_MSG_RESULT([no])
+])
+])
+
#
-# LC_D_OBTAIN_ALIAS
-# starting from 2.6.28 kernel replaces d_alloc_anon() with
-# d_obtain_alias() for getting anonymous dentries
-#
-AC_DEFUN([LC_D_OBTAIN_ALIAS],
-[AC_MSG_CHECKING([d_obtain_alias exist in kernel])
+# 2.6.27 sles11 move the quotaio_v1{2}.h from include/linux to fs
+# 2.6.32 move the quotaio_v1{2}.h from fs to fs/quota
+AC_DEFUN([LC_HAVE_QUOTAIO_V1_H],
+[LB_CHECK_FILE([$LINUX/include/linux/quotaio_v1.h],[
+ AC_DEFINE(HAVE_QUOTAIO_V1_H, 1,
+ [kernel has include/linux/quotaio_v1.h])
+],[LB_CHECK_FILE([$LINUX/fs/quota/quotaio_v1.h],[
+ AC_DEFINE(HAVE_FS_QUOTA_QUOTAIO_V1_H, 1,
+ [kernel has fs/quota/quotaio_v1.h])
+],[
+ AC_MSG_RESULT([no])
+])
+])
+])
+
+# sles10 sp2 need 5 parameter for vfs_symlink
+AC_DEFUN([LC_VFS_SYMLINK_5ARGS],
+[AC_MSG_CHECKING([if vfs_symlink needs 5 parameters])
LB_LINUX_TRY_COMPILE([
- #include <linux/dcache.h>
+ #include <linux/fs.h>
],[
- d_obtain_alias(NULL);
+ struct inode *dir = NULL;
+ struct dentry *dentry = NULL;
+ struct vfsmount *mnt = NULL;
+ const char * path = NULL;
+ vfs_symlink(dir, dentry, mnt, path, 0);
],[
- AC_DEFINE(HAVE_D_OBTAIN_ALIAS, 1,
- [d_obtain_alias exist in kernel])
+ AC_DEFINE(HAVE_VFS_SYMLINK_5ARGS, 1,
+                [vfs_symlink needs 5 parameters])
+ AC_MSG_RESULT([yes])
+],[
+ AC_MSG_RESULT([no])
+])
+])
+
+# 2.6.27 sles11 has sb_any_quota_active
+AC_DEFUN([LC_SB_ANY_QUOTA_ACTIVE],
+[AC_MSG_CHECKING([Kernel has sb_any_quota_active])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/quotaops.h>
+],[
+ sb_any_quota_active(NULL);
+],[
+ AC_DEFINE(HAVE_SB_ANY_QUOTA_ACTIVE, 1,
+ [Kernel has a sb_any_quota_active])
+ AC_MSG_RESULT([yes])
+],[
+ AC_MSG_RESULT([no])
+])
+])
+
+# 2.6.27 sles11 has sb_has_quota_active
+AC_DEFUN([LC_SB_HAS_QUOTA_ACTIVE],
+[AC_MSG_CHECKING([Kernel has sb_has_quota_active])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/quotaops.h>
+],[
+ sb_has_quota_active(NULL, 0);
+],[
+ AC_DEFINE(HAVE_SB_HAS_QUOTA_ACTIVE, 1,
+ [Kernel has a sb_has_quota_active])
AC_MSG_RESULT([yes])
],[
AC_MSG_RESULT([no])
])
])
+# 2.6.31
+
# 2.6.31 replaces blk_queue_hardsect_size by blk_queue_logical_block_size function
AC_DEFUN([LC_BLK_QUEUE_LOG_BLK_SIZE],
[AC_MSG_CHECKING([if blk_queue_logical_block_size is defined])
])
])
+# 2.6.32
+
# 2.6.32 add a limits member in struct request_queue.
AC_DEFUN([LC_REQUEST_QUEUE_LIMITS],
[AC_MSG_CHECKING([if request_queue has a limits field])
])
])
-# RHEL6(backport from 2.6.34) removes 2 functions blk_queue_max_phys_segments and
-# blk_queue_max_hw_segments add blk_queue_max_segments
-AC_DEFUN([LC_BLK_QUEUE_MAX_SEGMENTS],
-[AC_MSG_CHECKING([if blk_queue_max_segments is defined])
+# 2.6.32 has bdi_register() functions.
+AC_DEFUN([LC_EXPORT_BDI_REGISTER],
+[LB_CHECK_SYMBOL_EXPORT([bdi_register],
+[mm/backing-dev.c],[
+ AC_DEFINE(HAVE_BDI_REGISTER, 1,
+ [bdi_register function is present])
+],[
+])
+])
+
+# 2.6.32 add s_bdi for super block
+AC_DEFUN([LC_SB_BDI],
+[AC_MSG_CHECKING([if super_block has s_bdi field])
LB_LINUX_TRY_COMPILE([
- #include <linux/blkdev.h>
+ #include <linux/fs.h>
],[
- blk_queue_max_segments(NULL, 0);
+ struct super_block sb;
+ sb.s_bdi = NULL;
],[
AC_MSG_RESULT(yes)
- AC_DEFINE(HAVE_BLK_QUEUE_MAX_SEGMENTS, 1,
- [blk_queue_max_segments is defined])
+ AC_DEFINE(HAVE_SB_BDI, 1,
+ [super_block has s_bdi field])
],[
AC_MSG_RESULT(no)
])
])
-# RHEL6(backport from 2.6.34) removes blk_queue_max_sectors and add blk_queue_max_hw_sectors
+# 2.6.32 removes blk_queue_max_sectors and add blk_queue_max_hw_sectors
# check blk_queue_max_sectors and use it until disappear.
AC_DEFUN([LC_BLK_QUEUE_MAX_SECTORS],
[AC_MSG_CHECKING([if blk_queue_max_sectors is defined])
])
])
-# 2.6.32 has new BDI interface.
-AC_DEFUN([LC_NEW_BACKING_DEV_INFO],
-[AC_MSG_CHECKING([if backing_dev_info has a wb_cnt field])
+# 2.6.32 replaces 2 functions blk_queue_max_phys_segments and blk_queue_max_hw_segments by blk_queue_max_segments
+AC_DEFUN([LC_BLK_QUEUE_MAX_SEGMENTS],
+[AC_MSG_CHECKING([if blk_queue_max_segments is defined])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/blkdev.h>
+],[
+ blk_queue_max_segments(NULL, 0);
+],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_BLK_QUEUE_MAX_SEGMENTS, 1,
+ [blk_queue_max_segments is defined])
+],[
+ AC_MSG_RESULT(no)
+])
+])
+
+# 2.6.32-71 adds an argument to shrink callback
+AC_DEFUN([LC_SHRINK_3ARGS],
+[AC_MSG_CHECKING([if shrink has 3 arguments])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/mm.h>
+],[
+ struct shrinker s;
+ return s.shrink(NULL, 0, 0);
+],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_SHRINK_3ARGS, 1,
+ [shrink has 3 arguments])
+],[
+ AC_MSG_RESULT(no)
+])
+])
+
+#
+# LC_EXT4_SINGLEDATA_TRANS_BLOCKS_SB
+#
+AC_DEFUN([LC_EXT4_SINGLEDATA_TRANS_BLOCKS_SB],
+[AC_MSG_CHECKING([if EXT4_SINGLEDATA_TRANS_BLOCKS takes the sb as argument])
+tmp_flags="$EXTRA_KCFLAGS"
+EXTRA_KCFLAGS="-I$LINUX/fs"
LB_LINUX_TRY_COMPILE([
- #include <linux/backing-dev.h>
+ #include <ext4/ext4.h>
+ #include <ext4/ext4_jbd2.h>
],[
- struct backing_dev_info bdi;
- bdi.wb_cnt = 0;
+ struct super_block sb;
+ EXT4_SINGLEDATA_TRANS_BLOCKS(&sb);
],[
AC_MSG_RESULT(yes)
- AC_DEFINE(HAVE_NEW_BACKING_DEV_INFO, 1,
- [backing_dev_info has a wb_cnt field])
+ AC_DEFINE(LDISKFS_SINGLEDATA_TRANS_BLOCKS_HAS_SB, 1,
+ [EXT4_SINGLEDATA_TRANS_BLOCKS takes sb as argument])
],[
AC_MSG_RESULT(no)
])
+EXTRA_KCFLAGS="$tmp_flags"
])
-# 2.6.24 has bdi_init()/bdi_destroy() functions.
-AC_DEFUN([LC_EXPORT_BDI_INIT],
-[LB_CHECK_SYMBOL_EXPORT([bdi_init],
-[mm/backing-dev.c],[
- AC_DEFINE(HAVE_BDI_INIT, 1,
- [bdi_init/bdi_destroy functions are present])
-],[
-])
+#
+# LC_QUOTA64
+# linux kernel have 64-bit limits support
+#
+AC_DEFUN([LC_QUOTA64],[
+ AC_MSG_CHECKING([if kernel has 64-bit quota limits support])
+tmp_flags="$EXTRA_KCFLAGS"
+EXTRA_KCFLAGS="-I$LINUX/fs"
+ LB_LINUX_TRY_COMPILE([
+ #include <linux/kernel.h>
+ #include <linux/fs.h>
+ #ifdef HAVE_QUOTAIO_V1_H
+ # include <linux/quotaio_v2.h>
+ int versions[] = V2_INITQVERSIONS_R1;
+ struct v2_disk_dqblk_r1 dqblk_r1;
+ #else
+ # ifdef HAVE_FS_QUOTA_QUOTAIO_V1_H
+ # include <quota/quotaio_v2.h>
+ # else
+ # include <quotaio_v2.h>
+ # endif
+ struct v2r1_disk_dqblk dqblk_r1;
+ #endif
+ ],[],[
+ AC_DEFINE(HAVE_QUOTA64, 1, [have quota64])
+ AC_MSG_RESULT([yes])
+ ],[
+ LB_CHECK_FILE([$LINUX/include/linux/lustre_version.h],[
+ AC_MSG_ERROR([You have got no 64-bit kernel quota support.])
+ ],[])
+ AC_MSG_RESULT([no])
+ ])
+EXTRA_KCFLAGS=$tmp_flags
])
-# 2.6.23 exports exportfs_decode_fh
-AC_DEFUN([LC_EXPORTFS_DECODE_FH],
-[LB_CHECK_SYMBOL_EXPORT([exportfs_decode_fh],
-[fs/exportfs/expfs.c],[
- AC_DEFINE(HAVE_EXPORTFS_DECODE_FH, 1,
- [exportfs_decode_fh has been export])
+#
+# LC_D_OBTAIN_ALIAS
+# starting from 2.6.28 kernel replaces d_alloc_anon() with
+# d_obtain_alias() for getting anonymous dentries
+#
+AC_DEFUN([LC_D_OBTAIN_ALIAS],
+[AC_MSG_CHECKING([d_obtain_alias exist in kernel])
+LB_LINUX_TRY_COMPILE([
+ #include <linux/dcache.h>
+],[
+ d_obtain_alias(NULL);
+],[
+ AC_DEFINE(HAVE_D_OBTAIN_ALIAS, 1,
+ [d_obtain_alias exist in kernel])
+ AC_MSG_RESULT([yes])
],[
+ AC_MSG_RESULT([no])
])
])
+
#
# LC_PROG_LINUX
#
LC_CAPA_CRYPTO
LC_CONFIG_RMTCLIENT
LC_CONFIG_GSS
- LC_FUNC_MS_FLOCK_LOCK
LC_FUNC_HAVE_CAN_SLEEP_ARG
LC_FUNC_F_OP_FLOCK
LC_QUOTA_READ
LC_FUNC_RCU
LC_PERCPU_COUNTER
LC_TASK_CLENV_STORE
- LC_4ARGS_VFS_SYMLINK
-
- # does the kernel have VFS intent patches?
- LC_VFS_INTENT_PATCHES
# ~2.6.11
LC_S_TIME_GRAN
if test x$enable_server = xyes ; then
LC_EXPORT_INVALIDATE_MAPPING_PAGES
fi
+ LC_EXT4_DISCARD_PREALLOCATIONS
+ LC_EXT_INSERT_EXTENT_WITH_5ARGS
#2.6.18 + RHEL5 (fc6)
LC_PG_FS_MISC
# 2.6.32
LC_REQUEST_QUEUE_LIMITS
- LC_NEW_BACKING_DEV_INFO
+ LC_EXPORT_BDI_REGISTER
+ LC_SB_BDI
LC_BLK_QUEUE_MAX_SECTORS
LC_BLK_QUEUE_MAX_SEGMENTS
+ LC_SHRINK_3ARGS
+ LC_EXT4_SINGLEDATA_TRANS_BLOCKS_SB
#
if test x$enable_server = xyes ; then
LC_CONFIG_LIBLUSTRE_RECOVERY
])
-AC_DEFUN([LC_CONFIG_LRU_RESIZE],
-[AC_MSG_CHECKING([whether to enable lru self-adjusting])
-AC_ARG_ENABLE([lru_resize],
- AC_HELP_STRING([--enable-lru-resize],
- [enable lru resize support]),
- [],[enable_lru_resize='yes'])
-AC_MSG_RESULT([$enable_lru_resize])
-if test x$enable_lru_resize != xno; then
- AC_DEFINE(HAVE_LRU_RESIZE_SUPPORT, 1, [Enable lru resize support])
-fi
-])
-
#
# LC_CONFIG_QUOTA
#
[],[enable_quota='yes'])
])
-# whether to enable quota support(kernel modules)
-AC_DEFUN([LC_QUOTA_MODULE],
-[if test x$enable_quota != xno; then
- LB_LINUX_CONFIG([QUOTA],[
- enable_quota_module='yes'
- AC_DEFINE(HAVE_QUOTA_SUPPORT, 1, [Enable quota support])
- ],[
- enable_quota_module='no'
- AC_MSG_WARN([quota is not enabled because the kernel - lacks quota support])
- ])
-fi
-])
-
AC_DEFUN([LC_QUOTA],
[#check global
LC_CONFIG_QUOTA
[AC_MSG_ERROR([don't find <sys/quota.h> in your system])])
])
-AC_DEFUN([LC_QUOTA_READ],
-[AC_MSG_CHECKING([if kernel supports quota_read])
-LB_LINUX_TRY_COMPILE([
- #include <linux/fs.h>
-],[
- struct super_operations sp;
- void *i = (void *)sp.quota_read;
-],[
- AC_MSG_RESULT([yes])
- AC_DEFINE(KERNEL_SUPPORTS_QUOTA_READ, 1, [quota_read found])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
#
# LC_CONFIG_SPLIT
#
fi
])
-#
-# LC_COOKIE_FOLLOW_LINK
-#
-# kernel 2.6.13+ ->follow_link returns a cookie
-#
-
-AC_DEFUN([LC_COOKIE_FOLLOW_LINK],
-[AC_MSG_CHECKING([if inode_operations->follow_link returns a cookie])
-LB_LINUX_TRY_COMPILE([
- #include <linux/fs.h>
- #include <linux/namei.h>
-],[
- struct dentry dentry;
- struct nameidata nd;
-
- dentry.d_inode->i_op->put_link(&dentry, &nd, NULL);
-],[
- AC_DEFINE(HAVE_COOKIE_FOLLOW_LINK, 1, [inode_operations->follow_link returns a cookie])
- AC_MSG_RESULT([yes])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
-#
-# LC_FUNC_RCU
-#
-# kernels prior than 2.6.0(?) have no RCU supported; in kernel 2.6.5(SUSE),
-# call_rcu takes three parameters.
-#
-AC_DEFUN([LC_FUNC_RCU],
-[AC_MSG_CHECKING([if kernel have RCU supported])
-LB_LINUX_TRY_COMPILE([
- #include <linux/rcupdate.h>
-],[],[
- AC_DEFINE(HAVE_RCU, 1, [have RCU defined])
- AC_MSG_RESULT([yes])
-
- AC_MSG_CHECKING([if call_rcu takes three parameters])
- LB_LINUX_TRY_COMPILE([
- #include <linux/rcupdate.h>
- ],[
- struct rcu_head rh;
- call_rcu(&rh, (void (*)(struct rcu_head *))1, NULL);
- ],[
- AC_DEFINE(HAVE_CALL_RCU_PARAM, 1, [call_rcu takes three parameters])
- AC_MSG_RESULT([yes])
- ],[
- AC_MSG_RESULT([no])
- ])
-
-],[
- AC_MSG_RESULT([no])
-])
-])
-
-#
-# LC_QUOTA64
-# linux kernel have 64-bit limits support
-#
-AC_DEFUN([LC_QUOTA64],[
- AC_MSG_CHECKING([if kernel has 64-bit quota limits support])
-tmp_flags="$EXTRA_KCFLAGS"
-EXTRA_KCFLAGS="-I$LINUX/fs"
- LB_LINUX_TRY_COMPILE([
- #include <linux/kernel.h>
- #include <linux/fs.h>
- #ifdef HAVE_QUOTAIO_V1_H
- # include <linux/quotaio_v2.h>
- int versions[] = V2_INITQVERSIONS_R1;
- struct v2_disk_dqblk_r1 dqblk_r1;
- #else
- # ifdef HAVE_FS_QUOTA_QUOTAIO_V1_H
- # include <quota/quotaio_v2.h>
- # else
- # include <quotaio_v2.h>
- # endif
- struct v2r1_disk_dqblk dqblk_r1;
- #endif
- ],[],[
- AC_DEFINE(HAVE_QUOTA64, 1, [have quota64])
- AC_MSG_RESULT([yes])
- ],[
- LB_CHECK_FILE([$LINUX/include/linux/lustre_version.h],[
- AC_MSG_ERROR([You have got no 64-bit kernel quota support.])
- ],[])
- AC_MSG_RESULT([no])
- ])
-EXTRA_KCFLAGS=$tmp_flags
-])
-
-# LC_SECURITY_PLUG # for SLES10 SP2
-# check security plug in sles10 sp2 kernel
-AC_DEFUN([LC_SECURITY_PLUG],
-[AC_MSG_CHECKING([If kernel has security plug support])
-LB_LINUX_TRY_COMPILE([
- #include <linux/fs.h>
-],[
- struct dentry *dentry;
- struct vfsmount *mnt;
- struct iattr *iattr;
-
- notify_change(dentry, mnt, iattr);
-],[
- AC_MSG_RESULT(yes)
- AC_DEFINE(HAVE_SECURITY_PLUG, 1,
- [SLES10 SP2 use extra parameter in vfs])
-],[
- AC_MSG_RESULT(no)
-])
-])
-
-AC_DEFUN([LC_PERCPU_COUNTER],
-[AC_MSG_CHECKING([if have struct percpu_counter defined])
-LB_LINUX_TRY_COMPILE([
- #include <linux/percpu_counter.h>
-],[],[
- AC_DEFINE(HAVE_PERCPU_COUNTER, 1, [percpu_counter found])
- AC_MSG_RESULT([yes])
-
- AC_MSG_CHECKING([if percpu_counter_inc takes the 2nd argument])
- LB_LINUX_TRY_COMPILE([
- #include <linux/percpu_counter.h>
- ],[
- struct percpu_counter c;
- percpu_counter_init(&c, 0);
- ],[
- AC_DEFINE(HAVE_PERCPU_2ND_ARG, 1, [percpu_counter_init has two
- arguments])
- AC_MSG_RESULT([yes])
- ],[
- AC_MSG_RESULT([no])
- ])
-],[
- AC_MSG_RESULT([no])
-])
-])
-
AC_DEFUN([LC_TASK_CLENV_TUX_INFO],
[AC_MSG_CHECKING([tux_info])
LB_LINUX_TRY_COMPILE([
])
])
-AC_DEFUN([LC_TASK_CLENV_STORE],
-[
- AC_MSG_CHECKING([if we can store cl_env in task_struct])
- if test x$have_task_clenv_store != xyes ; then
- LC_TASK_CLENV_TUX_INFO
- fi
-])
-
#
# LC_LLITE_LLOOP_MODULE
# lloop_llite.ko does not currently work with page sizes
#define cpu_to_node(cpu) 0
#endif
-#ifdef HAVE_REGISTER_SHRINKER
+#ifndef HAVE_REGISTER_SHRINKER
+#define KERN_SHRINKER(name) name(int nr_to_scan, gfp_t gfp_mask)
+#else
+#ifdef HAVE_SHRINK_3ARGS
+typedef int (*cfs_shrinker_t)(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
+#define KERN_SHRINKER(name) name(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+#else
typedef int (*cfs_shrinker_t)(int nr_to_scan, gfp_t gfp_mask);
+#define KERN_SHRINKER(name) name(int nr_to_scan, gfp_t gfp_mask)
+#endif
static inline
struct shrinker *cfs_set_shrinker(int seek, cfs_shrinker_t func)
struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */
struct vfsmount *lsi_srv_mnt; /* the one server mount */
cfs_atomic_t lsi_mounts; /* references to the srv_mnt */
- struct backing_dev_info bdi; /* Each client mountpoint needs own backing_dev_info */
+ struct backing_dev_info lsi_bdi; /* each client mountpoint needs own backing_dev_info */
};
#define LSI_SERVER 0x00000001
#define LSI_UMOUNT_FORCE 0x00000010
#define LSI_UMOUNT_FAILOVER 0x00000020
+#define LSI_BDI_INITIALIZED 0x00000040
#define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info))
#define s2lsi_nocast(sb) ((sb)->s_fs_info)
--- /dev/null
+Index: b/include/linux/blkdev.h
+===================================================================
+--- a/include/linux/blkdev.h
++++ b/include/linux/blkdev.h
+@@ -1026,7 +1026,7 @@ extern int blk_verify_command(unsigned c
+ enum blk_default_limits {
+ BLK_MAX_SEGMENTS = 128,
+ BLK_SAFE_MAX_SECTORS = 255,
+- BLK_DEF_MAX_SECTORS = 1024,
++ BLK_DEF_MAX_SECTORS = 2048,
+ BLK_MAX_SEGMENT_SIZE = 65536,
+ BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL,
+ };
--- /dev/null
+This functionality is mainly used during testing, in order to
+simulate a server crash for ldiskfs by discarding all of the
+writes to the filesystem. For recovery testing we could simulate
+this by using a special loopback or DM device that also discards
+writes to the device.
+
+This functionality is also used by target "failback" in order
+to speed up service shutdown and takeover by the other node
+during controlled operation. However, it would also be possible
+to do this by simply allowing all of the in-flight requests to
+complete and then waiting for the service to stop. This will
+also be needed by the DMU-OSD, because discarding of writes on
+a DMU-based target is not safe as it could trigger a storage
+failure if the data is ever read from disk again and the
+checksum does not match that expected by the block pointer.
+
+Initial efforts to remove this patch are under way in bug 20776.
+Once this work comes to fruition this patch can be dropped.
+
+Index: linux-2.6.32-71.18.1.el6-master/block/blk-core.c
+===================================================================
+--- linux-2.6.32-71.18.1.el6-master.orig/block/blk-core.c 2011-03-05 11:35:40.404043293 +0800
++++ linux-2.6.32-71.18.1.el6-master/block/blk-core.c 2011-03-11 20:21:10.492302510 +0800
+@@ -1405,6 +1405,8 @@
+
+ #endif /* CONFIG_FAIL_MAKE_REQUEST */
+
++int dev_check_rdonly(struct block_device *bdev);
++
+ /*
+ * Check whether this bio extends beyond the end of the device.
+ */
+@@ -1506,6 +1508,12 @@
+ if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+ goto end_io;
+
++ /* This is Lustre's dev_rdonly check */
++ if (bio_rw(bio) == WRITE && dev_check_rdonly(bio->bi_bdev)) {
++ bio_endio(bio, 0);
++ break;
++ }
++
+ if (should_fail_request(bio))
+ goto end_io;
+
+@@ -2578,6 +2586,99 @@
+ }
+ EXPORT_SYMBOL(kblockd_schedule_work);
+
++ /*
++ * Debug code for turning block devices "read-only" (will discard writes
++ * silently). This is for filesystem crash/recovery testing.
++ */
++struct deventry {
++ dev_t dev;
++ struct deventry *next;
++};
++
++static struct deventry *devlist = NULL;
++static spinlock_t devlock = SPIN_LOCK_UNLOCKED;
++
++int dev_check_rdonly(struct block_device *bdev)
++{
++ struct deventry *cur;
++
++ if (!bdev)
++ return 0;
++
++ spin_lock(&devlock);
++ cur = devlist;
++ while(cur) {
++ if (bdev->bd_dev == cur->dev) {
++ spin_unlock(&devlock);
++ return 1;
++ }
++ cur = cur->next;
++ }
++ spin_unlock(&devlock);
++ return 0;
++}
++
++void dev_set_rdonly(struct block_device *bdev)
++{
++ struct deventry *newdev, *cur;
++
++ if (!bdev)
++ return;
++
++ newdev = kmalloc(sizeof(struct deventry), GFP_KERNEL);
++ if (!newdev)
++ return;
++
++ spin_lock(&devlock);
++ cur = devlist;
++ while(cur) {
++ if (bdev->bd_dev == cur->dev) {
++ spin_unlock(&devlock);
++ kfree(newdev);
++ return;
++ }
++ cur = cur->next;
++ }
++ newdev->dev = bdev->bd_dev;
++ newdev->next = devlist;
++ devlist = newdev;
++ spin_unlock(&devlock);
++ printk(KERN_WARNING "Turning device %s (%#x) read-only\n",
++ bdev->bd_disk ? bdev->bd_disk->disk_name : "", bdev->bd_dev);
++}
++
++void dev_clear_rdonly(struct block_device *bdev)
++{
++ struct deventry *cur, *last = NULL;
++
++ if (!bdev)
++ return;
++
++ spin_lock(&devlock);
++ cur = devlist;
++ while(cur) {
++ if (bdev->bd_dev == cur->dev) {
++ if (last)
++ last->next = cur->next;
++ else
++ devlist = cur->next;
++ spin_unlock(&devlock);
++ kfree(cur);
++ printk(KERN_WARNING "Removing read-only on %s (%#x)\n",
++ bdev->bd_disk ? bdev->bd_disk->disk_name :
++ "unknown block",
++ bdev->bd_dev);
++ return;
++ }
++ last = cur;
++ cur = cur->next;
++ }
++ spin_unlock(&devlock);
++}
++
++EXPORT_SYMBOL(dev_set_rdonly);
++EXPORT_SYMBOL(dev_clear_rdonly);
++EXPORT_SYMBOL(dev_check_rdonly);
+ int __init blk_dev_init(void)
+ {
+ BUILD_BUG_ON(__REQ_NR_BITS > 8 *
+Index: linux-2.6.32-71.18.1.el6-master/fs/block_dev.c
+===================================================================
+--- linux-2.6.32-71.18.1.el6-master.orig/fs/block_dev.c 2011-03-05 11:35:40.486042782 +0800
++++ linux-2.6.32-71.18.1.el6-master/fs/block_dev.c 2011-03-05 11:37:35.624324775 +0800
+@@ -1389,6 +1389,7 @@
+ if (bdev != bdev->bd_contains)
+ victim = bdev->bd_contains;
+ bdev->bd_contains = NULL;
++ dev_clear_rdonly(bdev);
+ }
+ unlock_kernel();
+ mutex_unlock(&bdev->bd_mutex);
+Index: linux-2.6.32-71.18.1.el6-master/include/linux/fs.h
+===================================================================
+--- linux-2.6.32-71.18.1.el6-master.orig/include/linux/fs.h 2011-03-05 11:35:40.445043037 +0800
++++ linux-2.6.32-71.18.1.el6-master/include/linux/fs.h 2011-03-05 11:37:35.726324137 +0800
+@@ -2204,6 +2204,10 @@
+ extern void submit_bio(int, struct bio *);
+ extern int bdev_read_only(struct block_device *);
+ #endif
++#define HAVE_CLEAR_RDONLY_ON_PUT
++extern void dev_set_rdonly(struct block_device *bdev);
++extern int dev_check_rdonly(struct block_device *bdev);
++extern void dev_clear_rdonly(struct block_device *bdev);
+ extern int set_blocksize(struct block_device *, int);
+ extern int sb_set_blocksize(struct super_block *, int);
+ extern int sb_min_blocksize(struct super_block *, int);
--- /dev/null
+security_inode_unlink() is used in filter_vfs_unlink()
+to avoid lock ordering problems. I'm not sure if this
+is still needed with ext4, and it definitely looks to
+be gone with DMU changes.
+
+Index: linux+rh+chaos/security/security.c
+===================================================================
+--- linux+rh+chaos.orig/security/security.c
++++ linux+rh+chaos/security/security.c
+@@ -60,6 +60,7 @@ int __init security_init(void)
+
+ return 0;
+ }
++EXPORT_SYMBOL(security_inode_unlink);
+
+ /* Save user chosen LSM */
+ static int __init choose_lsm(char *str)
--- /dev/null
+This allows the jbd transaction commit callbacks to be registered.
+The ext4 jbd2 code has a different commit callback (one per transaction)
+that could be used to provide equivalent functionality. This would
+require modifying the existing ext4 commit callback (used by mballoc
+when freeing data blocks) to be multiplexed so it will store 2 different
+callback functions and 2 different lists of callback data.
+
+Index: linux+rh+chaos/include/linux/jbd2.h
+===================================================================
+--- linux+rh+chaos.orig/include/linux/jbd2.h
++++ linux+rh+chaos/include/linux/jbd2.h
+@@ -415,6 +415,27 @@ struct jbd2_inode {
+ unsigned int i_flags;
+ };
+
++#define HAVE_JOURNAL_CALLBACK_STATUS
++/**
++ * struct journal_callback - Base structure for callback information.
++ * @jcb_list: list information for other callbacks attached to the same handle.
++ * @jcb_func: Function to call with this callback structure.
++ *
++ * This struct is a 'seed' structure for use with your own callback
++ * structs. If you are using callbacks you must allocate one of these
++ * or another struct of your own definition which has this struct
++ * as its first element and pass it to journal_callback_set().
++ *
++ * This is used internally by jbd2 to maintain callback information.
++ *
++ * See journal_callback_set for more information.
++ **/
++struct journal_callback {
++ struct list_head jcb_list; /* t_jcb_lock */
++ void (*jcb_func)(struct journal_callback *jcb, int error);
++ /* user data goes here */
++};
++
+ struct jbd2_revoke_table_s;
+
+ /**
+@@ -423,6 +444,7 @@ struct jbd2_revoke_table_s;
+ * @h_transaction: Which compound transaction is this update a part of?
+ * @h_buffer_credits: Number of remaining buffers we are allowed to dirty.
+ * @h_ref: Reference count on this handle
++ * @h_jcb: List of application registered callbacks for this handle.
+ * @h_err: Field for caller's use to track errors through large fs operations
+ * @h_sync: flag for sync-on-close
+ * @h_jdata: flag to force data journaling
+@@ -448,6 +470,13 @@ struct handle_s
+ /* operations */
+ int h_err;
+
++ /*
++ * List of application registered callbacks for this handle. The
++ * function(s) will be called after the transaction that this handle is
++ * part of has been committed to disk. [t_jcb_lock]
++ */
++ struct list_head h_jcb;
++
+ /* Flags [no locking] */
+ unsigned int h_sync: 1; /* sync-on-close */
+ unsigned int h_jdata: 1; /* force data journaling */
+@@ -503,6 +532,8 @@ struct transaction_chp_stats_s {
+ * j_state_lock
+ * ->j_list_lock (journal_unmap_buffer)
+ *
++ * t_handle_lock
++ * ->t_jcb_lock
+ */
+
+ struct transaction_s
+@@ -659,6 +690,16 @@ struct transaction_s
+ * structures associated with the transaction
+ */
+ struct list_head t_private_list;
++
++ /*
++ * Protects the callback list
++ */
++ spinlock_t t_jcb_lock;
++ /*
++ * List of registered callback functions for this transaction.
++ * Called when the transaction is committed. [t_jcb_lock]
++ */
++ struct list_head t_jcb;
+ };
+
+ struct transaction_run_stats_s {
+@@ -1115,6 +1156,9 @@ extern int jbd2_journal_stop(handle_t *
+ extern int jbd2_journal_flush (journal_t *);
+ extern void jbd2_journal_lock_updates (journal_t *);
+ extern void jbd2_journal_unlock_updates (journal_t *);
++extern void jbd2_journal_callback_set(handle_t *handle,
++ void (*fn)(struct journal_callback *,int),
++ struct journal_callback *jcb);
+
+ extern journal_t * jbd2_journal_init_dev(struct block_device *bdev,
+ struct block_device *fs_dev,
+Index: linux+rh+chaos/fs/jbd2/checkpoint.c
+===================================================================
+--- linux+rh+chaos.orig/fs/jbd2/checkpoint.c
++++ linux+rh+chaos/fs/jbd2/checkpoint.c
+@@ -759,6 +759,7 @@ void __jbd2_journal_drop_transaction(jou
+ J_ASSERT(transaction->t_checkpoint_list == NULL);
+ J_ASSERT(transaction->t_checkpoint_io_list == NULL);
+ J_ASSERT(transaction->t_updates == 0);
++ J_ASSERT(list_empty(&transaction->t_jcb));
+ J_ASSERT(journal->j_committing_transaction != transaction);
+ J_ASSERT(journal->j_running_transaction != transaction);
+
+Index: linux+rh+chaos/fs/jbd2/commit.c
+===================================================================
+--- linux+rh+chaos.orig/fs/jbd2/commit.c
++++ linux+rh+chaos/fs/jbd2/commit.c
+@@ -857,6 +857,30 @@ wait_for_iobuf:
+ transaction can be removed from any checkpoint list it was on
+ before. */
+
++ /*
++ * Call any callbacks that had been registered for handles in this
++ * transaction. It is up to the callback to free any allocated
++ * memory.
++ *
++ * The spinlocking (t_jcb_lock) here is surely unnecessary...
++ */
++ spin_lock(&commit_transaction->t_jcb_lock);
++ if (!list_empty(&commit_transaction->t_jcb)) {
++ struct list_head *p, *n;
++ int error = is_journal_aborted(journal);
++
++ list_for_each_safe(p, n, &commit_transaction->t_jcb) {
++ struct journal_callback *jcb;
++
++ jcb = list_entry(p, struct journal_callback, jcb_list);
++ list_del(p);
++ spin_unlock(&commit_transaction->t_jcb_lock);
++ jcb->jcb_func(jcb, error);
++ spin_lock(&commit_transaction->t_jcb_lock);
++ }
++ }
++ spin_unlock(&commit_transaction->t_jcb_lock);
++
+ jbd_debug(3, "JBD: commit phase 6\n");
+
+ J_ASSERT(list_empty(&commit_transaction->t_inode_list));
+Index: linux+rh+chaos/fs/jbd2/journal.c
+===================================================================
+--- linux+rh+chaos.orig/fs/jbd2/journal.c
++++ linux+rh+chaos/fs/jbd2/journal.c
+@@ -90,6 +90,8 @@ EXPORT_SYMBOL(jbd2_journal_file_inode);
+ EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
+ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
+ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
++EXPORT_SYMBOL(jbd2_journal_callback_set);
++EXPORT_SYMBOL(jbd2_journal_bmap);
+
+ static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
+ static void __journal_abort_soft (journal_t *journal, int errno);
+Index: linux+rh+chaos/fs/jbd2/transaction.c
+===================================================================
+--- linux+rh+chaos.orig/fs/jbd2/transaction.c
++++ linux+rh+chaos/fs/jbd2/transaction.c
+@@ -52,7 +52,9 @@ jbd2_get_transaction(journal_t *journal,
+ transaction->t_start_time = ktime_get();
+ transaction->t_tid = journal->j_transaction_sequence++;
+ transaction->t_expires = jiffies + journal->j_commit_interval;
++ INIT_LIST_HEAD(&transaction->t_jcb);
+ spin_lock_init(&transaction->t_handle_lock);
++ spin_lock_init(&transaction->t_jcb_lock);
+ INIT_LIST_HEAD(&transaction->t_inode_list);
+ INIT_LIST_HEAD(&transaction->t_private_list);
+
+@@ -257,6 +259,7 @@ static handle_t *new_handle(int nblocks)
+ memset(handle, 0, sizeof(*handle));
+ handle->h_buffer_credits = nblocks;
+ handle->h_ref = 1;
++ INIT_LIST_HEAD(&handle->h_jcb);
+
+ lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle",
+ &jbd2_handle_key, 0);
+@@ -1216,6 +1219,36 @@ drop:
+ }
+
+ /**
++ * void jbd2_journal_callback_set() - Register a callback function for this handle.
++ * @handle: handle to attach the callback to.
++ * @func: function to callback.
++ * @jcb: structure with additional information required by func(), and
++ * some space for jbd2 internal information.
++ *
++ * The function will be
++ * called when the transaction that this handle is part of has been
++ * committed to disk with the original callback data struct and the
++ * error status of the journal as parameters. There is no guarantee of
++ * ordering between handles within a single transaction, nor between
++ * callbacks registered on the same handle.
++ *
++ * The caller is responsible for allocating the journal_callback struct.
++ * This is to allow the caller to add as much extra data to the callback
++ * as needed, but reduce the overhead of multiple allocations. The caller
++ * allocated struct must start with a struct journal_callback at offset 0,
++ * and has the caller-specific data afterwards.
++ */
++void jbd2_journal_callback_set(handle_t *handle,
++ void (*func)(struct journal_callback *jcb, int error),
++ struct journal_callback *jcb)
++{
++ spin_lock(&handle->h_transaction->t_jcb_lock);
++ list_add_tail(&jcb->jcb_list, &handle->h_jcb);
++ spin_unlock(&handle->h_transaction->t_jcb_lock);
++ jcb->jcb_func = func;
++}
++
++/**
+ * int jbd2_journal_stop() - complete a transaction
+ * @handle: tranaction to complete.
+ *
+@@ -1321,6 +1354,11 @@ int jbd2_journal_stop(handle_t *handle)
+ wake_up(&journal->j_wait_transaction_locked);
+ }
+
++ /* Move callbacks from the handle to the transaction. */
++ spin_lock(&transaction->t_jcb_lock);
++ list_splice(&handle->h_jcb, &transaction->t_jcb);
++ spin_unlock(&transaction->t_jcb_lock);
++
+ /*
+ * If the handle is marked SYNC, we need to set another commit
+ * going! We also want to force a commit if the current
--- /dev/null
+Increase MAX_SGE for fusion mpt driver.
+
+Index: linux-2.6.32.i386/drivers/message/fusion/Kconfig
+===================================================================
+--- linux-2.6.32.i386.orig/drivers/message/fusion/Kconfig 2009-12-03 09:21:21.000000000 +0530
++++ linux-2.6.32.i386/drivers/message/fusion/Kconfig 2010-03-16 16:45:08.000000000 +0530
+@@ -61,9 +61,9 @@
+ LSISAS1078
+
+ config FUSION_MAX_SGE
+- int "Maximum number of scatter gather entries (16 - 128)"
+- default "128"
+- range 16 128
++ int "Maximum number of scatter gather entries (16 - 256)"
++ default "256"
++ range 16 256
+ help
+ This option allows you to specify the maximum number of scatter-
+ gather entries per I/O. The driver default is 128, which matches
+Index: linux-2.6.32.i386/drivers/message/fusion/mptbase.h
+===================================================================
+--- linux-2.6.32.i386.orig/drivers/message/fusion/mptbase.h 2009-12-03 09:21:21.000000000 +0530
++++ linux-2.6.32.i386/drivers/message/fusion/mptbase.h 2010-03-16 16:46:54.000000000 +0530
+@@ -165,10 +165,10 @@
+ * Set the MAX_SGE value based on user input.
+ */
+ #ifdef CONFIG_FUSION_MAX_SGE
+-#if CONFIG_FUSION_MAX_SGE < 16
++#if CONFIG_FUSION_MAX_SGE < 16
+ #define MPT_SCSI_SG_DEPTH 16
+-#elif CONFIG_FUSION_MAX_SGE > 128
+-#define MPT_SCSI_SG_DEPTH 128
++#elif CONFIG_FUSION_MAX_SGE > 256
++#define MPT_SCSI_SG_DEPTH 256
+ #else
+ #define MPT_SCSI_SG_DEPTH CONFIG_FUSION_MAX_SGE
+ #endif
--- /dev/null
+Force MD devices to pass SYNC reads directly to the disk
+instead of handling from cache. This is needed for MMP
+on MD RAID devices, and in theory could be accepted in
+the upstream kernel. Not needed for DMU.
+
+Index: linux-2.6.32-71.18.1.el6-master/drivers/md/raid5.c
+===================================================================
+--- linux-2.6.32-71.18.1.el6-master.orig/drivers/md/raid5.c 2011-02-28 16:57:31.222666050 +0800
++++ linux-2.6.32-71.18.1.el6-master/drivers/md/raid5.c 2011-02-28 16:58:27.011983275 +0800
+@@ -2098,6 +2098,8 @@
+ bi->bi_next = *bip;
+ *bip = bi;
+ bi->bi_phys_segments++;
++ if (bio_rw_flagged(bi, BIO_RW_SYNCIO) && !forwrite)
++ clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); /* force to read from disk. */
+ spin_unlock_irq(&conf->device_lock);
+ spin_unlock(&sh->lock);
+
+@@ -4031,6 +4033,8 @@
+ wait_event(mddev->thread->wqueue,
+ atomic_read(&conf->preread_active_stripes) == 0);
+ }
++ if (bio_rw_flagged(bi, BIO_RW_SYNCIO))
++ raid5_unplug_device(mddev->queue);
+ return 0;
+ }
+
--- /dev/null
+lustre_version.patch
+mpt-fusion-max-sge-rhel6.patch
+raid5-mmp-unplug-dev-rhel6.patch
+dev_read_only-2.6.32-rhel6.patch
+blkdev_tunables-2.6-rhel6.patch
+export-2.6.32-vanilla.patch
+jbd2-jcberr-2.6-rhel6.patch
return cached;
}
-static int ldlm_pools_srv_shrink(int nr, unsigned int gfp_mask)
+static int KERN_SHRINKER(ldlm_pools_srv_shrink)
{
- return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER, nr, gfp_mask);
+ return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER, nr_to_scan, gfp_mask);
}
-static int ldlm_pools_cli_shrink(int nr, unsigned int gfp_mask)
+static int KERN_SHRINKER(ldlm_pools_cli_shrink)
{
- return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT, nr, gfp_mask);
+ return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT, nr_to_scan, gfp_mask);
}
void ldlm_pools_recalc(ldlm_side_t client)
cfs_sema_init(&lli->lli_readdir_sem, 1);
}
-#ifdef HAVE_NEW_BACKING_DEV_INFO
-static atomic_t ll_bdi_num = ATOMIC_INIT(0);
+static inline int ll_bdi_register(struct backing_dev_info *bdi)
+{
+#ifdef HAVE_BDI_REGISTER
+ static atomic_t ll_bdi_num = ATOMIC_INIT(0);
+
+ bdi->name = "lustre";
+ return bdi_register(bdi, NULL, "lustre-%d",
+ atomic_inc_return(&ll_bdi_num));
+#else
+ return 0;
#endif
+}
int ll_fill_super(struct super_block *sb)
{
if (err)
GOTO(out_free, err);
- err = ll_bdi_init(&lsi->bdi);
+ err = ll_bdi_init(&lsi->lsi_bdi);
+ if (err)
+ GOTO(out_free, err);
+ lsi->lsi_flags |= LSI_BDI_INITIALIZED;
+ lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY;
+ err = ll_bdi_register(&lsi->lsi_bdi);
if (err)
GOTO(out_free, err);
-#ifdef HAVE_NEW_BACKING_DEV_INFO
- lsi->bdi.name = "lustre";
- lsi->bdi.capabilities = BDI_CAP_MAP_COPY;
- err = bdi_register(&lsi->bdi, NULL, "lustre-%d",
- atomic_inc_return(&ll_bdi_num));
- sb->s_bdi = &lsi->bdi;
+#ifdef HAVE_SB_BDI
+ sb->s_bdi = &lsi->lsi_bdi;
#endif
/* Generate a string unique to this super, in case some joker tries
if (profilenm)
class_del_profile(profilenm);
- if (ll_bdi_wb_cnt(lsi->bdi) > 0)
- ll_bdi_destroy(&lsi->bdi);
+ if (lsi->lsi_flags & LSI_BDI_INITIALIZED) {
+ ll_bdi_destroy(&lsi->lsi_bdi);
+ lsi->lsi_flags &= ~LSI_BDI_INITIALIZED;
+ }
ll_free_sbi(sb);
lsi->lsi_llsbi = NULL;
/* OIDEBUG(inode); */
/* initializing backing dev info. */
- inode->i_mapping->backing_dev_info = &(s2lsi(inode->i_sb)->bdi);
+ inode->i_mapping->backing_dev_info = &s2lsi(inode->i_sb)->lsi_bdi;
if (S_ISREG(inode->i_mode)) {
ext3_ext_insert_extent(handle, inode, path, newext)
#endif
+#ifdef EXT3_DISCARD_PREALLOCATIONS
+#define ext3_mb_discard_inode_preallocations(inode) \
+ ext3_discard_preallocations(inode)
+#endif
+
static cfs_mem_cache_t *fcb_cache;
}
#ifdef __KERNEL__
-static int lu_cache_shrink(int nr, unsigned int gfp_mask)
+static int KERN_SHRINKER(lu_cache_shrink)
{
lu_site_stats_t stats;
struct lu_site *s;
struct lu_site *tmp;
int cached = 0;
- int remain = nr;
+ int remain = nr_to_scan;
CFS_LIST_HEAD(splice);
- if (nr != 0) {
+ if (nr_to_scan != 0) {
if (!(gfp_mask & __GFP_FS))
return -1;
- CDEBUG(D_INODE, "Shrink %d objects\n", nr);
+ CDEBUG(D_INODE, "Shrink %d objects\n", nr_to_scan);
}
cfs_down(&lu_sites_guard);
cfs_list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) {
- if (nr != 0) {
+ if (nr_to_scan != 0) {
remain = lu_site_purge(&lu_shrink_env, s, remain);
/*
* Move just shrunk site to the tail of site list to
memset(&stats, 0, sizeof(stats));
lu_site_stats_get(s->ls_obj_hash, &stats, 0);
cached += stats.lss_total - stats.lss_busy;
- if (nr && remain <= 0)
+ if (nr_to_scan && remain <= 0)
break;
}
cfs_list_splice(&splice, lu_sites.prev);
cfs_up(&lu_sites_guard);
cached = (cached / 100) * sysctl_vfs_cache_pressure;
- if (nr == 0)
+ if (nr_to_scan == 0)
CDEBUG(D_INODE, "%d objects cached\n", cached);
return cached;
}
* could be called frequently for query (@nr_to_scan == 0).
* we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool.
*/
-static int enc_pools_shrink(int nr_to_scan, unsigned int gfp_mask)
+static int KERN_SHRINKER(enc_pools_shrink)
{
if (unlikely(nr_to_scan != 0)) {
cfs_spin_lock(&page_pools.epp_lock);