From: Bobi Jam Date: Tue, 15 Mar 2011 01:19:12 +0000 (+0800) Subject: LU-73 RHEL6 support. X-Git-Tag: 2.0.59-llnl3-base~10 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=e550abd05cf7ffddaedbef996be1baaae0912b4a LU-73 RHEL6 support. Include client, ldiskfs, kernel patches. Change-Id: Ice16b8bf40c2e37df9af9f399316917097e8ee8f Signed-off-by: Bobi Jam Reviewed-on: http://review.whamcloud.com/307 Tested-by: Hudson Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin Reviewed-by: Brian J. Murrell --- diff --git a/ldiskfs/configure.ac b/ldiskfs/configure.ac index 959bc11..adb68fa 100644 --- a/ldiskfs/configure.ac +++ b/ldiskfs/configure.ac @@ -136,6 +136,7 @@ case $LINUXRELEASE in ;; 2.6.22*) LDISKFS_SERIES="2.6.22-vanilla.series";; 2.6.27*) LDISKFS_SERIES="2.6-sles11.series";; +2.6.32*) LDISKFS_SERIES="2.6-rhel6.series";; *) AC_MSG_WARN([Unknown kernel version $LINUXRELEASE, fix ldiskfs/configure.ac]) esac AC_MSG_RESULT([$LDISKFS_SERIES]) diff --git a/ldiskfs/kernel_patches/patches/export-ext4-2.6-rhel6.patch b/ldiskfs/kernel_patches/patches/export-ext4-2.6-rhel6.patch new file mode 100644 index 0000000..c3411d2 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/export-ext4-2.6-rhel6.patch @@ -0,0 +1,81 @@ +Index: linux-2.6.32.i386/fs/ext4/super.c +=================================================================== +--- linux-2.6.32.i386.orig/fs/ext4/super.c 2010-04-07 14:18:32.000000000 +0530 ++++ linux-2.6.32.i386/fs/ext4/super.c 2010-04-07 14:19:47.000000000 +0530 +@@ -291,6 +291,8 @@ + jbd2_journal_abort_handle(handle); + } + ++EXPORT_SYMBOL(ext4_journal_abort_handle); ++ + /* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. + * +@@ -3030,6 +3032,8 @@ + return ret; + } + ++EXPORT_SYMBOL(ext4_force_commit); ++ + /* + * Setup any per-fs journal parameters now. We'll do this both on + * initial mount, once the journal has been initialised but before we've +@@ -4088,6 +4092,12 @@ + unsigned long *blocks, int *created, int create); + EXPORT_SYMBOL(ext4_map_inode_page); + ++EXPORT_SYMBOL(ext4_xattr_get); ++EXPORT_SYMBOL(ext4_xattr_set_handle); ++EXPORT_SYMBOL(ext4_bread); ++EXPORT_SYMBOL(ext4_journal_start_sb); ++EXPORT_SYMBOL(__ext4_journal_stop); ++ + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Fourth Extended Filesystem"); + MODULE_LICENSE("GPL"); +Index: linux-2.6.32.i386/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.32.i386.orig/fs/ext4/ext4.h 2010-04-07 14:17:04.000000000 +0530 ++++ linux-2.6.32.i386/fs/ext4/ext4.h 2010-04-07 14:20:34.000000000 +0530 +@@ -1385,6 +1385,8 @@ + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); ++extern struct buffer_head *ext4_read_inode_bitmap(struct super_block *sb, ++ ext4_group_t block_group); + extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); + + /* mballoc.c */ +Index: linux-2.6.32.i386/fs/ext4/ialloc.c +=================================================================== +--- linux-2.6.32.i386.orig/fs/ext4/ialloc.c 2009-12-03 09:21:21.000000000 +0530 ++++ linux-2.6.32.i386/fs/ext4/ialloc.c 2010-04-07 14:19:47.000000000 +0530 +@@ -98,7 +98,7 @@ + * + * Return buffer_head of bitmap on success or NULL. 
+ */ +-static struct buffer_head * ++struct buffer_head * + ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) + { + struct ext4_group_desc *desc; +@@ -161,6 +161,7 @@ + } + return bh; + } ++EXPORT_SYMBOL(ext4_read_inode_bitmap); + + /* + * NOTE! When we get the inode, we're the only people +Index: linux-2.6.32.i386/fs/ext4/balloc.c +=================================================================== +--- linux-2.6.32.i386.orig/fs/ext4/balloc.c 2010-03-19 15:43:37.000000000 +0530 ++++ linux-2.6.32.i386/fs/ext4/balloc.c 2010-04-07 14:19:47.000000000 +0530 +@@ -235,6 +235,7 @@ + *bh = sbi->s_group_desc[group_desc]; + return desc; + } ++EXPORT_SYMBOL(ext4_get_group_desc); + + static int ext4_valid_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, diff --git a/ldiskfs/kernel_patches/patches/ext4-back-dquot-to-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-back-dquot-to-rhel6.patch new file mode 100644 index 0000000..bf8826c --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-back-dquot-to-rhel6.patch @@ -0,0 +1,54 @@ +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c 2011-03-11 15:46:27.000000000 +0800 ++++ linux-stage/fs/ext4/super.c 2011-03-11 15:53:05.016701579 +0800 +@@ -1400,9 +1400,47 @@ + static ssize_t ext4_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); + ++static int ext4_dquot_initialize(struct inode *inode, int type) ++{ ++ handle_t *handle; ++ int ret, err; ++ ++ /* We may create quota structure so we need to reserve enough blocks */ ++ handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ret = dquot_initialize(inode, type); ++ err = ext4_journal_stop(handle); ++ if (!ret) ++ ret = err; ++ return ret; ++} ++ ++static int ext4_dquot_drop(struct inode *inode) ++{ ++ handle_t *handle; ++ int ret, err; ++ ++ /* We may delete quota structure so we need to reserve enough blocks */ ++ handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb)); ++ if (IS_ERR(handle)) { ++ /* ++ * We call dquot_drop() anyway to at least release references ++ * to quota structures so that umount does not hang. 
++ */ ++ dquot_drop(inode); ++ return PTR_ERR(handle); ++ } ++ ret = dquot_drop(inode); ++ err = ext4_journal_stop(handle); ++ if (!ret) ++ ret = err; ++ return ret; ++} ++ + static const struct dquot_operations ext4_quota_operations = { +- .initialize = dquot_initialize, +- .drop = dquot_drop, ++ .initialize = ext4_dquot_initialize, ++ .drop = ext4_dquot_drop, + .alloc_space = dquot_alloc_space, + .reserve_space = dquot_reserve_space, + .claim_space = dquot_claim_space, diff --git a/ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-rhel6.patch new file mode 100644 index 0000000..6a1ef25 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-big-endian-check-2.6-rhel6.patch @@ -0,0 +1,57 @@ +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c 2011-03-11 15:27:08.000000000 +0800 ++++ linux-stage/fs/ext4/super.c 2011-03-11 15:29:41.023089829 +0800 +@@ -72,6 +72,8 @@ + static int ext4_freeze(struct super_block *sb); + + ++static int bigendian_extents; ++ + ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) + { +@@ -1492,7 +1494,7 @@ + Opt_block_validity, Opt_noblock_validity, + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_discard, Opt_nodiscard, +- Opt_mballoc, ++ Opt_mballoc, Opt_bigendian_extents, + }; + + static const match_table_t tokens = { +@@ -1559,6 +1561,7 @@ + {Opt_auto_da_alloc, "auto_da_alloc=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc"}, + {Opt_noauto_da_alloc, "noauto_da_alloc"}, ++ {Opt_bigendian_extents, "bigendian_extents"}, + {Opt_mballoc, "mballoc"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, +@@ -1996,6 +1999,9 @@ + break; + case Opt_mballoc: + break; ++ case Opt_bigendian_extents: ++ bigendian_extents = 1; ++ break; + case Opt_discard: + set_opt(sbi->s_mount_opt, DISCARD); + break; +@@ -3073,6 +3079,16 @@ + goto failed_mount; + } + ++#ifdef __BIG_ENDIAN ++ if (bigendian_extents == 0) { ++ printk(KERN_ERR "EXT4-fs: extents feature is not guaranteed to " ++ "work on big-endian systems. 
Use \"bigendian_extents\" " ++ "mount option to override.\n"); ++ goto failed_mount; ++ } ++#endif ++ ++ + #ifdef CONFIG_PROC_FS + if (ext4_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); diff --git a/ldiskfs/kernel_patches/patches/ext4-disable-mb-cache-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-disable-mb-cache-rhel6.patch new file mode 100644 index 0000000..8c98c62 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-disable-mb-cache-rhel6.patch @@ -0,0 +1,154 @@ +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h 2011-03-14 16:16:45.000000000 +0800 ++++ linux-stage/fs/ext4/ext4.h 2011-03-14 16:17:08.732676431 +0800 +@@ -758,7 +758,8 @@ + /* + * Mount flags + */ +-#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ ++#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Disable mbcache */ ++#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ + #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ + #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ + #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c 2011-03-14 16:16:45.000000000 +0800 ++++ linux-stage/fs/ext4/super.c 2011-03-14 16:18:13.831956469 +0800 +@@ -1502,6 +1502,7 @@ + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_discard, Opt_nodiscard, + Opt_mballoc, Opt_bigendian_extents, Opt_force_over_16tb, ++ Opt_no_mbcache, + Opt_extents, Opt_noextents, + }; + +@@ -1574,6 +1575,7 @@ + {Opt_mballoc, "mballoc"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, ++ {Opt_no_mbcache, "no_mbcache"}, + {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, + {Opt_err, NULL}, +@@ -2049,6 +2051,9 @@ + } + clear_opt(sbi->s_mount_opt, EXTENTS); + break; ++ case Opt_no_mbcache: ++ set_opt(sbi->s_mount_opt, NO_MBCACHE); ++ break; + default: + ext4_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " +Index: linux-stage/fs/ext4/xattr.c +=================================================================== +--- linux-stage.orig/fs/ext4/xattr.c 2011-03-14 16:16:43.000000000 +0800 ++++ linux-stage/fs/ext4/xattr.c 2011-03-14 16:17:08.806677883 +0800 +@@ -86,7 +86,8 @@ + # define ea_bdebug(f...) 
+ #endif + +-static void ext4_xattr_cache_insert(struct buffer_head *); ++static void ext4_xattr_cache_insert(struct super_block *, ++ struct buffer_head *); + static struct buffer_head *ext4_xattr_cache_find(struct inode *, + struct ext4_xattr_header *, + struct mb_cache_entry **); +@@ -234,7 +235,7 @@ + error = -EIO; + goto cleanup; + } +- ext4_xattr_cache_insert(bh); ++ ext4_xattr_cache_insert(inode->i_sb, bh); + entry = BFIRST(bh); + error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); + if (error == -EIO) +@@ -376,7 +377,7 @@ + error = -EIO; + goto cleanup; + } +- ext4_xattr_cache_insert(bh); ++ ext4_xattr_cache_insert(inode->i_sb, bh); + error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size); + + cleanup: +@@ -473,7 +474,9 @@ + struct mb_cache_entry *ce = NULL; + int error = 0; + +- ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); ++ if (!test_opt(inode->i_sb, NO_MBCACHE)) ++ ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, ++ bh->b_blocknr); + error = ext4_journal_get_write_access(handle, bh); + if (error) + goto out; +@@ -700,8 +703,10 @@ + if (i->value && i->value_len > sb->s_blocksize) + return -ENOSPC; + if (s->base) { +- ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, +- bs->bh->b_blocknr); ++ if (!test_opt(inode->i_sb, NO_MBCACHE)) ++ ce = mb_cache_entry_get(ext4_xattr_cache, ++ bs->bh->b_bdev, ++ bs->bh->b_blocknr); + error = ext4_journal_get_write_access(handle, bs->bh); + if (error) + goto cleanup; +@@ -718,7 +723,7 @@ + if (!IS_LAST_ENTRY(s->first)) + ext4_xattr_rehash(header(s->base), + s->here); +- ext4_xattr_cache_insert(bs->bh); ++ ext4_xattr_cache_insert(sb, bs->bh); + } + unlock_buffer(bs->bh); + if (error == -EIO) +@@ -801,7 +806,8 @@ + if (error) + goto cleanup_dquot; + } +- mb_cache_entry_release(ce); ++ if (ce) ++ mb_cache_entry_release(ce); + ce = NULL; + } else if (bs->bh && s->base == bs->bh->b_data) { + /* We were modifying this block in-place. */ +@@ -845,7 +851,7 @@ + memcpy(new_bh->b_data, s->base, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); +- ext4_xattr_cache_insert(new_bh); ++ ext4_xattr_cache_insert(sb, new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, new_bh); + if (error) +@@ -1403,12 +1409,15 @@ + * Returns 0, or a negative error number on failure. 
+ */ + static void +-ext4_xattr_cache_insert(struct buffer_head *bh) ++ext4_xattr_cache_insert(struct super_block *sb, struct buffer_head *bh) + { + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + struct mb_cache_entry *ce; + int error; + ++ if (test_opt(sb, NO_MBCACHE)) ++ return; ++ + ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); + if (!ce) { + ea_bdebug(bh, "out of memory"); +@@ -1482,6 +1491,8 @@ + __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *ce; + ++ if (test_opt(inode->i_sb, NO_MBCACHE)) ++ return NULL; + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); diff --git a/ldiskfs/kernel_patches/patches/ext4-dynlocks-common-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-dynlocks-common-rhel6.patch new file mode 100644 index 0000000..0a66c86 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-dynlocks-common-rhel6.patch @@ -0,0 +1,352 @@ +Index: linux-stage/fs/ext4/dynlocks.c +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-stage/fs/ext4/dynlocks.c 2011-03-03 15:25:04.025526781 +0800 +@@ -0,0 +1,236 @@ ++/* ++ * Dynamic Locks ++ * ++ * struct dynlock is lockspace ++ * one may request lock (exclusive or shared) for some value ++ * in that lockspace ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#define DYNLOCK_HANDLE_MAGIC 0xd19a10c ++#define DYNLOCK_HANDLE_DEAD 0xd1956ee ++#define DYNLOCK_LIST_MAGIC 0x11ee91e6 ++ ++static struct kmem_cache * dynlock_cachep = NULL; ++ ++struct dynlock_handle { ++ unsigned dh_magic; ++ struct list_head dh_list; ++ unsigned long dh_value; /* lock value */ ++ int dh_refcount; /* number of users */ ++ int dh_readers; ++ int dh_writers; ++ int dh_pid; /* holder of the lock */ ++ wait_queue_head_t dh_wait; ++}; ++ ++int __init dynlock_cache_init(void) ++{ ++ int rc = 0; ++ ++ printk(KERN_INFO "init dynlocks cache\n"); ++ dynlock_cachep = kmem_cache_create("dynlock_cache", ++ sizeof(struct dynlock_handle), ++ 0, ++ SLAB_HWCACHE_ALIGN, ++ NULL); ++ if (dynlock_cachep == NULL) { ++ printk(KERN_ERR "Not able to create dynlock cache"); ++ rc = -ENOMEM; ++ } ++ return rc; ++} ++ ++void __exit dynlock_cache_exit(void) ++{ ++ printk(KERN_INFO "exit dynlocks cache\n"); ++ kmem_cache_destroy(dynlock_cachep); ++} ++ ++/* ++ * dynlock_init ++ * ++ * initialize lockspace ++ * ++ */ ++void dynlock_init(struct dynlock *dl) ++{ ++ spin_lock_init(&dl->dl_list_lock); ++ INIT_LIST_HEAD(&dl->dl_list); ++ dl->dl_magic = DYNLOCK_LIST_MAGIC; ++} ++EXPORT_SYMBOL(dynlock_init); ++ ++/* ++ * dynlock_lock ++ * ++ * acquires lock (exclusive or shared) in specified lockspace ++ * each lock in lockspace is allocated separately, so user have ++ * to specify GFP flags. ++ * routine returns pointer to lock. 
this pointer is intended to ++ * be passed to dynlock_unlock ++ * ++ */ ++struct dynlock_handle *dynlock_lock(struct dynlock *dl, unsigned long value, ++ enum dynlock_type lt, gfp_t gfp) ++{ ++ struct dynlock_handle *nhl = NULL; ++ struct dynlock_handle *hl; ++ ++ BUG_ON(dl == NULL); ++ BUG_ON(dl->dl_magic != DYNLOCK_LIST_MAGIC); ++ ++repeat: ++ /* find requested lock in lockspace */ ++ spin_lock(&dl->dl_list_lock); ++ BUG_ON(dl->dl_list.next == NULL); ++ BUG_ON(dl->dl_list.prev == NULL); ++ list_for_each_entry(hl, &dl->dl_list, dh_list) { ++ BUG_ON(hl->dh_list.next == NULL); ++ BUG_ON(hl->dh_list.prev == NULL); ++ BUG_ON(hl->dh_magic != DYNLOCK_HANDLE_MAGIC); ++ if (hl->dh_value == value) { ++ /* lock is found */ ++ if (nhl) { ++ /* someone else just allocated ++ * lock we didn't find and just created ++ * so, we drop our lock ++ */ ++ kmem_cache_free(dynlock_cachep, nhl); ++ nhl = NULL; ++ } ++ hl->dh_refcount++; ++ goto found; ++ } ++ } ++ /* lock not found */ ++ if (nhl) { ++ /* we already have allocated lock. use it */ ++ hl = nhl; ++ nhl = NULL; ++ list_add(&hl->dh_list, &dl->dl_list); ++ goto found; ++ } ++ spin_unlock(&dl->dl_list_lock); ++ ++ /* lock not found and we haven't allocated lock yet. allocate it */ ++ nhl = kmem_cache_alloc(dynlock_cachep, gfp); ++ if (nhl == NULL) ++ return NULL; ++ nhl->dh_refcount = 1; ++ nhl->dh_value = value; ++ nhl->dh_readers = 0; ++ nhl->dh_writers = 0; ++ nhl->dh_magic = DYNLOCK_HANDLE_MAGIC; ++ init_waitqueue_head(&nhl->dh_wait); ++ ++ /* while lock is being allocated, someone else may allocate it ++ * and put onto to list. check this situation ++ */ ++ goto repeat; ++ ++found: ++ if (lt == DLT_WRITE) { ++ /* exclusive lock: user don't want to share lock at all ++ * NOTE: one process may take the same lock several times ++ * this functionaly is useful for rename operations */ ++ while ((hl->dh_writers && hl->dh_pid != current->pid) || ++ hl->dh_readers) { ++ spin_unlock(&dl->dl_list_lock); ++ wait_event(hl->dh_wait, ++ hl->dh_writers == 0 && hl->dh_readers == 0); ++ spin_lock(&dl->dl_list_lock); ++ } ++ hl->dh_writers++; ++ } else { ++ /* shared lock: user do not want to share lock with writer */ ++ while (hl->dh_writers) { ++ spin_unlock(&dl->dl_list_lock); ++ wait_event(hl->dh_wait, hl->dh_writers == 0); ++ spin_lock(&dl->dl_list_lock); ++ } ++ hl->dh_readers++; ++ } ++ hl->dh_pid = current->pid; ++ spin_unlock(&dl->dl_list_lock); ++ ++ return hl; ++} ++EXPORT_SYMBOL(dynlock_lock); ++ ++ ++/* ++ * dynlock_unlock ++ * ++ * user have to specify lockspace (dl) and pointer to lock structure ++ * returned by dynlock_lock() ++ * ++ */ ++void dynlock_unlock(struct dynlock *dl, struct dynlock_handle *hl) ++{ ++ int wakeup = 0; ++ ++ BUG_ON(dl == NULL); ++ BUG_ON(hl == NULL); ++ BUG_ON(dl->dl_magic != DYNLOCK_LIST_MAGIC); ++ ++ if (hl->dh_magic != DYNLOCK_HANDLE_MAGIC) ++ printk(KERN_EMERG "wrong lock magic: %#x\n", hl->dh_magic); ++ ++ BUG_ON(hl->dh_magic != DYNLOCK_HANDLE_MAGIC); ++ BUG_ON(hl->dh_writers != 0 && current->pid != hl->dh_pid); ++ ++ spin_lock(&dl->dl_list_lock); ++ if (hl->dh_writers) { ++ BUG_ON(hl->dh_readers != 0); ++ hl->dh_writers--; ++ if (hl->dh_writers == 0) ++ wakeup = 1; ++ } else if (hl->dh_readers) { ++ hl->dh_readers--; ++ if (hl->dh_readers == 0) ++ wakeup = 1; ++ } else { ++ BUG(); ++ } ++ if (wakeup) { ++ hl->dh_pid = 0; ++ wake_up(&hl->dh_wait); ++ } ++ if (--(hl->dh_refcount) == 0) { ++ hl->dh_magic = DYNLOCK_HANDLE_DEAD; ++ list_del(&hl->dh_list); ++ kmem_cache_free(dynlock_cachep, hl); ++ } ++ 
spin_unlock(&dl->dl_list_lock); ++} ++EXPORT_SYMBOL(dynlock_unlock); ++ ++int dynlock_is_locked(struct dynlock *dl, unsigned long value) ++{ ++ struct dynlock_handle *hl; ++ int result = 0; ++ ++ /* find requested lock in lockspace */ ++ spin_lock(&dl->dl_list_lock); ++ BUG_ON(dl->dl_list.next == NULL); ++ BUG_ON(dl->dl_list.prev == NULL); ++ list_for_each_entry(hl, &dl->dl_list, dh_list) { ++ BUG_ON(hl->dh_list.next == NULL); ++ BUG_ON(hl->dh_list.prev == NULL); ++ BUG_ON(hl->dh_magic != DYNLOCK_HANDLE_MAGIC); ++ if (hl->dh_value == value && hl->dh_pid == current->pid) { ++ /* lock is found */ ++ result = 1; ++ break; ++ } ++ } ++ spin_unlock(&dl->dl_list_lock); ++ return result; ++} ++EXPORT_SYMBOL(dynlock_is_locked); +Index: linux-stage/include/linux/dynlocks.h +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-stage/include/linux/dynlocks.h 2011-03-03 15:25:04.055526552 +0800 +@@ -0,0 +1,34 @@ ++#ifndef _LINUX_DYNLOCKS_H ++#define _LINUX_DYNLOCKS_H ++ ++#include ++#include ++ ++struct dynlock_handle; ++ ++/* ++ * lock's namespace: ++ * - list of locks ++ * - lock to protect this list ++ */ ++struct dynlock { ++ unsigned dl_magic; ++ struct list_head dl_list; ++ spinlock_t dl_list_lock; ++}; ++ ++enum dynlock_type { ++ DLT_WRITE, ++ DLT_READ ++}; ++ ++int dynlock_cache_init(void); ++void dynlock_cache_exit(void); ++void dynlock_init(struct dynlock *dl); ++struct dynlock_handle *dynlock_lock(struct dynlock *dl, unsigned long value, ++ enum dynlock_type lt, gfp_t gfp); ++void dynlock_unlock(struct dynlock *dl, struct dynlock_handle *lock); ++int dynlock_is_locked(struct dynlock *dl, unsigned long value); ++ ++#endif ++ +Index: linux-stage/fs/ext4/Makefile +=================================================================== +--- linux-stage.orig/fs/ext4/Makefile 2011-03-05 11:50:43.000000000 +0800 ++++ linux-stage/fs/ext4/Makefile 2011-03-05 11:52:42.349154982 +0800 +@@ -6,7 +6,8 @@ + + ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ +- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o ++ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ ++ dynlocks.o + + ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c 2011-03-05 11:50:43.000000000 +0800 ++++ linux-stage/fs/ext4/super.c 2011-03-05 11:57:33.632869451 +0800 +@@ -4457,17 +4457,20 @@ + return err; + ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); + if (!ext4_kset) +- goto out4; ++ goto out5; + ext4_proc_root = proc_mkdir("fs/ext4", NULL); + err = init_ext4_mballoc(); + if (err) +- goto out3; ++ goto out4; + + err = init_ext4_xattr(); + if (err) +- goto out2; ++ goto out3; + err = init_inodecache(); + if (err) ++ goto out2; ++ err = dynlock_cache_init(); ++ if (err) + goto out1; + err = register_filesystem(&ext4_fs_type); + if (err) +@@ -4477,15 +4480,17 @@ + + return 0; + out: +- destroy_inodecache(); ++ dynlock_cache_exit(); + out1: +- exit_ext4_xattr(); ++ destroy_inodecache(); + out2: +- exit_ext4_mballoc(); ++ exit_ext4_xattr(); + out3: ++ exit_ext4_mballoc(); ++out4: + remove_proc_entry("fs/ext4", NULL); + kset_unregister(ext4_kset); +-out4: ++out5: + exit_ext4_system_zone(); + return err; + } +@@ -4493,6 +4498,7 @@ + static void 
__exit exit_ext4_fs(void) + { + unregister_filesystem(&ext4_fs_type); ++ dynlock_cache_exit(); + destroy_inodecache(); + exit_ext4_xattr(); + exit_ext4_mballoc(); diff --git a/ldiskfs/kernel_patches/patches/ext4-extents-mount-option-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-extents-mount-option-rhel6.patch new file mode 100644 index 0000000..c4cc531 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-extents-mount-option-rhel6.patch @@ -0,0 +1,174 @@ +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h 2011-03-14 15:57:13.613674482 +0800 ++++ linux-stage/fs/ext4/ext4.h 2011-03-14 15:57:22.031906980 +0800 +@@ -780,6 +780,7 @@ + #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ + #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ + #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ ++#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */ + #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ + #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ + #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ +Index: linux-stage/fs/ext4/ext4_jbd2.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4_jbd2.h 2011-03-14 15:57:12.000000000 +0800 ++++ linux-stage/fs/ext4/ext4_jbd2.h 2011-03-14 15:58:55.957499110 +0800 +@@ -33,7 +33,7 @@ + + #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ +- ? 27U : 8U) ++ || test_opt(sb, EXTENTS) ? 27U : 8U) + + #define ext4_journal_dirty_metadata(handle, bh) \ + ext4_handle_dirty_metadata(handle, NULL, bh) +Index: linux-stage/fs/ext4/extents.c +=================================================================== +--- linux-stage.orig/fs/ext4/extents.c 2011-03-14 15:57:12.000000000 +0800 ++++ linux-stage/fs/ext4/extents.c 2011-03-14 16:14:14.246265207 +0800 +@@ -2553,7 +2553,7 @@ + * possible initialization would be here + */ + +- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { ++ if (test_opt(sb, EXTENTS)) { + #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) + printk(KERN_INFO "EXT4-fs: file extents enabled"); + #ifdef AGGRESSIVE_TEST +@@ -2580,7 +2580,7 @@ + */ + void ext4_ext_release(struct super_block *sb) + { +- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) ++ if (!test_opt(sb, EXTENTS)) + return; + + #ifdef EXTENTS_STATS +Index: linux-stage/fs/ext4/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext4/ialloc.c 2011-03-14 15:57:13.000000000 +0800 ++++ linux-stage/fs/ext4/ialloc.c 2011-03-14 16:02:03.334308846 +0800 +@@ -1049,7 +1049,7 @@ + if (err) + goto fail_free_drop; + +- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { ++ if (test_opt(sb, EXTENTS)) { + /* set extent flag only for directory, file and normal symlink*/ + if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { + EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; +Index: linux-stage/fs/ext4/migrate.c +=================================================================== +--- linux-stage.orig/fs/ext4/migrate.c 2011-03-14 15:36:15.000000000 +0800 ++++ linux-stage/fs/ext4/migrate.c 2011-03-14 16:05:39.083369164 +0800 +@@ -459,13 +459,13 @@ + unsigned long max_entries; + __u32 goal; + +- /* +- * If the filesystem does not support extents, or the inode +- * already is extent-based, 
error out. +- */ +- if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, +- EXT4_FEATURE_INCOMPAT_EXTENTS) || +- (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) ++ if (!test_opt(inode->i_sb, EXTENTS)) ++ /* ++ * if mounted with noextents we don't allow the migrate ++ */ ++ return -EINVAL; ++ ++ if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return -EINVAL; + + if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c 2011-03-14 15:57:18.000000000 +0800 ++++ linux-stage/fs/ext4/super.c 2011-03-14 16:11:58.234626200 +0800 +@@ -942,6 +942,8 @@ + seq_puts(seq, ",journal_async_commit"); + if (test_opt(sb, NOBH)) + seq_puts(seq, ",nobh"); ++ if (!test_opt(sb, EXTENTS)) ++ seq_puts(seq, ",noextents"); + if (test_opt(sb, I_VERSION)) + seq_puts(seq, ",i_version"); + if (!test_opt(sb, DELALLOC)) +@@ -1500,6 +1502,7 @@ + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_discard, Opt_nodiscard, + Opt_mballoc, Opt_bigendian_extents, Opt_force_over_16tb, ++ Opt_extents, Opt_noextents, + }; + + static const match_table_t tokens = { +@@ -1571,6 +1574,8 @@ + {Opt_mballoc, "mballoc"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, ++ {Opt_extents, "extents"}, ++ {Opt_noextents, "noextents"}, + {Opt_err, NULL}, + }; + +@@ -1613,6 +1618,7 @@ + int qtype, qfmt; + char *qname; + #endif ++ ext4_fsblk_t last_block; + + if (!options) + return 1; +@@ -2017,6 +2023,32 @@ + case Opt_force_over_16tb: + force_over_16tb = 1; + break; ++ case Opt_extents: ++ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, ++ EXT4_FEATURE_INCOMPAT_EXTENTS)) { ++ ext4_warning(sb, "extents feature not enabled " ++ "on this filesystem, use tune2fs"); ++ return 0; ++ } ++ set_opt(sbi->s_mount_opt, EXTENTS); ++ break; ++ case Opt_noextents: ++ /* ++ * When e2fsprogs support resizing an already existing ++ * ext4 file system to greater than 2**32 we need to ++ * add support to block allocator to handle growing ++ * already existing block mapped inode so that blocks ++ * allocated for them fall within 2**32 ++ */ ++ last_block = ext4_blocks_count(sbi->s_es) - 1; ++ if (last_block > 0xffffffffULL) { ++ printk(KERN_ERR "EXT4-fs: Filesystem too " ++ "large to mount with " ++ "-o noextents options\n"); ++ return 0; ++ } ++ clear_opt(sbi->s_mount_opt, EXTENTS); ++ break; + default: + ext4_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " +@@ -2879,6 +2911,17 @@ + set_opt(sbi->s_mount_opt, BARRIER); + + /* ++ * turn on extents feature by default in ext4 filesystem ++ * only if feature flag already set by mkfs or tune2fs. ++ * Use -o noextents to turn it off ++ */ ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) ++ set_opt(sbi->s_mount_opt, EXTENTS); ++ else ++ ext4_warning(sb, "extents feature not enabled on this filesystem, " ++ "use tune2fs."); ++ ++ /* + * enable delayed allocation by default + * Use -o nodelalloc to turn it off + */ diff --git a/ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-rhel6.patch new file mode 100644 index 0000000..fb4690b --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-fiemap-2.6-rhel6.patch @@ -0,0 +1,111 @@ +This patch adds direct EXT4_IOC_FIEMAP support to ldiskfs, for Lustre to call +without having to go through do_vfs_ioctl() (which isn't exported, and has a +number of other ioctls which are not suitable for Lustre). The actual FIEMAP +support is already in the kernel/ext4 for normal usage. 
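For illustration only (this sketch is not part of the patch below): an in-kernel caller such as Lustre can reach this ioctl roughly as follows. The helper name and the assumption that the caller already holds an open struct file on the ldiskfs inode are hypothetical; FS_IOC_FIEMAP from <linux/fs.h> carries the same _IOWR('f', 11, struct fiemap) encoding as the EXT4_IOC_FIEMAP value added below; the set_fs(KERNEL_DS) switch is only needed because the handler copies its argument with copy_from_user()/copy_to_user(), and the fiemap_extent array must sit immediately after the struct fiemap, since fi_extents_start is computed as arg + sizeof(fiemap).

#include <linux/fs.h>
#include <linux/fiemap.h>
#include <linux/uaccess.h>

/* Hypothetical sketch: invoke the FIEMAP ioctl directly from kernel code,
 * bypassing do_vfs_ioctl().  'file' is an open struct file on an ldiskfs
 * inode; 'fm' points to a kernel buffer holding a struct fiemap followed
 * immediately by fm->fm_extent_count struct fiemap_extent slots. */
static int demo_ldiskfs_fiemap(struct file *file, struct fiemap *fm)
{
	mm_segment_t old_fs;
	long rc;

	if (!file->f_op || !file->f_op->unlocked_ioctl)
		return -ENOTTY;

	/* let copy_from_user()/copy_to_user() accept kernel pointers */
	old_fs = get_fs();
	set_fs(KERNEL_DS);
	rc = file->f_op->unlocked_ioctl(file, FS_IOC_FIEMAP,
					(unsigned long)fm);
	set_fs(old_fs);

	return (int)rc;
}
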
+ +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h 2011-03-05 12:34:16.458850451 +0800 ++++ linux-stage/fs/ext4/ext4.h 2011-03-05 12:35:25.338882364 +0800 +@@ -405,7 +405,7 @@ + #define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) + #define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ +- /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ ++#define EXT4_IOC_FIEMAP _IOWR('f', 11, struct fiemap) + #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) + #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) + +Index: linux-stage/fs/ext4/ioctl.c +=================================================================== +--- linux-stage.orig/fs/ext4/ioctl.c 2011-03-05 12:34:11.299779163 +0800 ++++ linux-stage/fs/ext4/ioctl.c 2011-03-05 12:34:16.862856069 +0800 +@@ -18,6 +18,71 @@ + #include "ext4_jbd2.h" + #include "ext4.h" + ++/* So that the fiemap access checks can't overflow on 32 bit machines. */ ++#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) ++ ++static int fiemap_check_ranges(struct super_block *sb, ++ u64 start, u64 len, u64 *new_len) ++{ ++ *new_len = len; ++ ++ if (len == 0) ++ return -EINVAL; ++ ++ if (start > sb->s_maxbytes) ++ return -EFBIG; ++ ++ /* ++ * Shrink request scope to what the fs can actually handle. ++ */ ++ if ((len > sb->s_maxbytes) || ++ (sb->s_maxbytes - len) < start) ++ *new_len = sb->s_maxbytes - start; ++ ++ return 0; ++} ++ ++int ioctl_fiemap(struct inode *inode, struct file *filp, unsigned long arg) ++{ ++ struct fiemap fiemap; ++ u64 len; ++ struct fiemap_extent_info fieinfo = {0, }; ++ struct super_block *sb = inode->i_sb; ++ int error = 0; ++ ++ if (copy_from_user(&fiemap, (struct fiemap __user *) arg, ++ sizeof(struct fiemap))) ++ return -EFAULT; ++ ++ if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) ++ return -EINVAL; ++ ++ error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, ++ &len); ++ if (error) ++ return error; ++ ++ fieinfo.fi_flags = fiemap.fm_flags; ++ fieinfo.fi_extents_max = fiemap.fm_extent_count; ++ fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); ++ ++ if (fiemap.fm_extent_count != 0 && ++ !access_ok(VERIFY_WRITE, (void *)arg, ++ offsetof(typeof(fiemap), fm_extents[fiemap.fm_extent_count]))) ++ return -EFAULT; ++ ++ if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) ++ filemap_write_and_wait(inode->i_mapping); ++ ++ error = ext4_fiemap(inode, &fieinfo, fiemap.fm_start, len); ++ fiemap.fm_flags = fieinfo.fi_flags; ++ fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; ++ if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) ++ error = -EFAULT; ++ ++ return error; ++} ++ + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_dentry->d_inode; +@@ -330,6 +395,9 @@ + mnt_drop_write(filp->f_path.mnt); + return err; + } ++ case EXT4_IOC_FIEMAP: { ++ return ioctl_fiemap(inode, filp, arg); ++ } + + default: + return -ENOTTY; +Index: linux-stage/fs/ext4/fiemap.h +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-stage/fs/ext4/fiemap.h 2011-03-05 12:36:24.606879702 +0800 +@@ -0,0 +1,2 @@ ++ ++#include_next diff --git a/ldiskfs/kernel_patches/patches/ext4-force_over_16tb-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-force_over_16tb-rhel6.patch new file mode 100644 index 0000000..8f99774 --- 
/dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-force_over_16tb-rhel6.patch @@ -0,0 +1,67 @@ +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c 2011-03-11 15:35:15.680343230 +0800 ++++ linux-stage/fs/ext4/super.c 2011-03-11 15:44:45.037632078 +0800 +@@ -55,6 +55,8 @@ + struct proc_dir_entry *ext4_proc_root; + static struct kset *ext4_kset; + ++static int force_over_16tb; ++ + static int ext4_load_journal(struct super_block *, struct ext4_super_block *, + unsigned long journal_devnum); + static int ext4_commit_super(struct super_block *sb, int sync); +@@ -1494,7 +1496,7 @@ + Opt_block_validity, Opt_noblock_validity, + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_discard, Opt_nodiscard, +- Opt_mballoc, Opt_bigendian_extents, ++ Opt_mballoc, Opt_bigendian_extents, Opt_force_over_16tb, + }; + + static const match_table_t tokens = { +@@ -1562,6 +1564,7 @@ + {Opt_auto_da_alloc, "auto_da_alloc"}, + {Opt_noauto_da_alloc, "noauto_da_alloc"}, + {Opt_bigendian_extents, "bigendian_extents"}, ++ {Opt_force_over_16tb, "force_over_16th"}, + {Opt_mballoc, "mballoc"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, +@@ -2008,6 +2011,9 @@ + break; + case Opt_mballoc: + break; ++ case Opt_force_over_16tb: ++ force_over_16tb = 1; ++ break; + default: + ext4_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " +@@ -3031,6 +3037,26 @@ + goto failed_mount; + } + ++ if (ext4_blocks_count(es) >= (1ULL << 32)) { ++ if (force_over_16tb == 0) { ++ printk(KERN_ERR "EXT4-fs does not support filesystems " ++ "greater than 16TB and can cause data corruption." ++ "Use \"force_over_16tb\" mount option to override." ++ "\n"); ++ goto failed_mount; ++ } ++ } ++ ++ if (ext4_blocks_count(es) >= (1ULL << 32)) { ++ if (force_over_16tb == 0) { ++ printk(KERN_ERR "EXT4-fs does not support filesystems " ++ "greater than 16TB and can cause data corruption." ++ "Use \"force_over_16tb\" mount option to override." 
++ "\n"); ++ goto failed_mount; ++ } ++ } ++ + if (EXT4_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext4; + diff --git a/ldiskfs/kernel_patches/patches/ext4-inode-version-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-inode-version-rhel6.patch new file mode 100644 index 0000000..a104bed --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-inode-version-rhel6.patch @@ -0,0 +1,63 @@ +Index: linux-2.6.32-el6-beta/fs/ext4/inode.c +=================================================================== +--- linux-2.6.32-el6-beta.orig/fs/ext4/inode.c ++++ linux-2.6.32-el6-beta/fs/ext4/inode.c +@@ -4920,11 +4920,11 @@ struct inode *ext4_iget(struct super_blo + EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); + EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); + +- inode->i_version = le32_to_cpu(raw_inode->i_disk_version); ++ ei->i_fs_version = le32_to_cpu(raw_inode->i_disk_version); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) +- inode->i_version |= +- (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; ++ ei->i_fs_version |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) ++ << 32; + } + + ret = 0; +@@ -5134,11 +5134,11 @@ static int ext4_do_update_inode(handle_t + for (block = 0; block < EXT4_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + +- raw_inode->i_disk_version = cpu_to_le32(inode->i_version); ++ raw_inode->i_disk_version = cpu_to_le32(ei->i_fs_version); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) +- raw_inode->i_version_hi = +- cpu_to_le32(inode->i_version >> 32); ++ raw_inode->i_version_hi = cpu_to_le32(ei->i_fs_version ++ >> 32); + raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + } + +Index: linux-2.6.32-el6-beta/fs/ext4/ialloc.c +=================================================================== +--- linux-2.6.32-el6-beta.orig/fs/ext4/ialloc.c ++++ linux-2.6.32-el6-beta/fs/ext4/ialloc.c +@@ -1018,6 +1018,7 @@ got: + ei->i_dtime = 0; + ei->i_block_group = group; + ei->i_last_alloc_group = ~0; ++ ei->i_fs_version = 0; + + ext4_set_inode_flags(inode); + if (IS_DIRSYNC(inode)) +Index: linux-2.6.32-el6-beta/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.32-el6-beta.orig/fs/ext4/ext4.h ++++ linux-2.6.32-el6-beta/fs/ext4/ext4.h +@@ -714,8 +714,12 @@ struct ext4_inode_info { + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; ++ ++ __u64 i_fs_version; + }; + ++#define HAVE_DISK_INODE_VERSION ++ + /* + * File system states + */ diff --git a/ldiskfs/kernel_patches/patches/ext4-kill-dx_root-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-kill-dx_root-rhel6.patch new file mode 100644 index 0000000..6631dde --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-kill-dx_root-rhel6.patch @@ -0,0 +1,236 @@ +removes static definition of dx_root struct. so that "." and ".." dirent can +have extra data. This patch does not change any functionality but is required for +ext4_data_in_dirent patch. + +Index: linux-2.6.32.i386/fs/ext4/namei.c +=================================================================== +--- linux-2.6.32.i386.orig/fs/ext4/namei.c 2010-04-16 05:35:06.000000000 +0530 ++++ linux-2.6.32.i386/fs/ext4/namei.c 2010-04-16 05:47:41.000000000 +0530 +@@ -115,22 +115,13 @@ + * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
+ */ + +-struct dx_root ++struct dx_root_info + { +- struct fake_dirent dot; +- char dot_name[4]; +- struct fake_dirent dotdot; +- char dotdot_name[4]; +- struct dx_root_info +- { +- __le32 reserved_zero; +- u8 hash_version; +- u8 info_length; /* 8 */ +- u8 indirect_levels; +- u8 unused_flags; +- } +- info; +- struct dx_entry entries[0]; ++ __le32 reserved_zero; ++ u8 hash_version; ++ u8 info_length; /* 8 */ ++ u8 indirect_levels; ++ u8 unused_flags; + }; + + struct dx_node +@@ -244,6 +235,16 @@ + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. + */ ++struct dx_root_info * dx_get_dx_info(struct ext4_dir_entry_2 *de) ++{ ++ /* get dotdot first */ ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1)); ++ ++ /* dx root info is after dotdot entry */ ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2)); ++ ++ return (struct dx_root_info *) de; ++} + + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) + { +@@ -398,7 +399,7 @@ + { + unsigned count, indirect; + struct dx_entry *at, *entries, *p, *q, *m; +- struct dx_root *root; ++ struct dx_root_info * info; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; +@@ -406,17 +407,18 @@ + frame->bh = NULL; + if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) + goto fail; +- root = (struct dx_root *) bh->b_data; +- if (root->info.hash_version != DX_HASH_TEA && +- root->info.hash_version != DX_HASH_HALF_MD4 && +- root->info.hash_version != DX_HASH_LEGACY) { ++ ++ info = dx_get_dx_info((struct ext4_dir_entry_2*)bh->b_data); ++ if (info->hash_version != DX_HASH_TEA && ++ info->hash_version != DX_HASH_HALF_MD4 && ++ info->hash_version != DX_HASH_LEGACY) { + ext4_warning(dir->i_sb, "Unrecognised inode hash code %d for directory " +- "#%lu", root->info.hash_version, dir->i_ino); ++ "#%lu", info->hash_version, dir->i_ino); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } +- hinfo->hash_version = root->info.hash_version; ++ hinfo->hash_version = info->hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; +@@ -425,27 +427,26 @@ + ext4fs_dirhash(d_name->name, d_name->len, hinfo); + hash = hinfo->hash; + +- if (root->info.unused_flags & 1) { ++ if (info->unused_flags & 1) { + ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", +- root->info.unused_flags); ++ info->unused_flags); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + +- if ((indirect = root->info.indirect_levels) > 1) { ++ if ((indirect = info->indirect_levels) > 1) { + ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", +- root->info.indirect_levels); ++ info->indirect_levels); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + +- entries = (struct dx_entry *) (((char *)&root->info) + +- root->info.info_length); ++ entries = (struct dx_entry *) (((char *)info) + info->info_length); + + if (dx_get_limit(entries) != dx_root_limit(dir, +- root->info.info_length)) { ++ info->info_length)) { + ext4_warning(dir->i_sb, "dx entry: limit != root limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; +@@ -525,10 +526,12 @@ fail: + + static void dx_release (struct dx_frame *frames) + { ++ struct dx_root_info *info; + if (frames[0].bh == NULL) + return; + +- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) ++ info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data); ++ if (info->indirect_levels) + brelse(frames[1].bh); + 
brelse(frames[0].bh); + } +@@ -1447,17 +1450,16 @@ + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; +- struct dx_root *root; + struct dx_frame frames[2], *frame; + struct dx_entry *entries; +- struct ext4_dir_entry_2 *de, *de2; ++ struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de; + char *data1, *top; + unsigned len; + int retval; + unsigned blocksize; + struct dx_hash_info hinfo; + ext4_lblk_t block; +- struct fake_dirent *fde; ++ struct dx_root_info *dx_info; + + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); +@@ -1467,20 +1469,21 @@ + brelse(bh); + return retval; + } +- root = (struct dx_root *) bh->b_data; ++ ++ dot_de = (struct ext4_dir_entry_2 *) bh->b_data; ++ dotdot_de = ext4_next_entry(dot_de, blocksize); + + /* The 0th block becomes the root, move the dirents out */ +- fde = &root->dotdot; +- de = (struct ext4_dir_entry_2 *)((char *)fde + +- ext4_rec_len_from_disk(fde->rec_len, blocksize)); +- if ((char *) de >= (((char *) root) + blocksize)) { ++ de = (struct ext4_dir_entry_2 *)((char *)dotdot_de + ++ ext4_rec_len_from_disk(dotdot_de->rec_len, blocksize)); ++ if ((char *) de >= (((char *) dot_de) + blocksize)) { + ext4_error(dir->i_sb, + "invalid rec_len for '..' in inode %lu", + dir->i_ino); + brelse(bh); + return -EIO; + } +- len = ((char *) root) + blocksize - (char *) de; ++ len = ((char *) dot_de) + blocksize - (char *) de; + + /* Allocate new block for the 0th block's dirents */ + bh2 = ext4_append(handle, dir, &block, &retval); +@@ -1499,19 +1502,23 @@ + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + blocksize); + /* Initialize the root; the dot dirents already exist */ +- de = (struct ext4_dir_entry_2 *) (&root->dotdot); +- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), +- blocksize); +- memset (&root->info, 0, sizeof(root->info)); +- root->info.info_length = sizeof(root->info); +- root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; +- entries = root->entries; ++ dotdot_de->rec_len = ext4_rec_len_to_disk(blocksize - ++ le16_to_cpu(dot_de->rec_len), blocksize); ++ ++ /* initialize hashing info */ ++ dx_info = dx_get_dx_info(dot_de); ++ memset (dx_info, 0, sizeof(*dx_info)); ++ dx_info->info_length = sizeof(*dx_info); ++ dx_info->hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; ++ ++ entries = (void *)dx_info + sizeof(*dx_info); ++ + dx_set_block(entries, 1); + dx_set_count(entries, 1); +- dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); ++ dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info))); + + /* Initialize as for dx_probe */ +- hinfo.hash_version = root->info.hash_version; ++ hinfo.hash_version = dx_info->hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; +@@ -1759,6 +1766,7 @@ + goto journal_error; + brelse (bh2); + } else { ++ struct dx_root_info * info; + dxtrace(printk(KERN_DEBUG + "Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, +@@ -1768,7 +1776,9 @@ + /* Set up root */ + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); +- ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; ++ info = dx_get_dx_info((struct ext4_dir_entry_2*) ++ frames[0].bh->b_data); ++ info->indirect_levels = 1; + + /* Add new access path frame */ + frame = frames + 1; diff --git 
a/ldiskfs/kernel_patches/patches/ext4-map_inode_page-2.6-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-map_inode_page-2.6-rhel6.patch new file mode 100644 index 0000000..ced4af6 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-map_inode_page-2.6-rhel6.patch @@ -0,0 +1,87 @@ +Index: linux-2.6.32-el6-beta/fs/ext4/inode.c +=================================================================== +--- linux-2.6.32-el6-beta.orig/fs/ext4/inode.c ++++ linux-2.6.32-el6-beta/fs/ext4/inode.c +@@ -5834,3 +5834,67 @@ out_unlock: + up_read(&inode->i_alloc_sem); + return ret; + } ++ ++int ext4_map_inode_page(struct inode *inode, struct page *page, ++ unsigned long *blocks, int *created, int create) ++{ ++ unsigned int blocksize, blocks_per_page; ++ unsigned long iblock; ++ struct buffer_head dummy; ++ void *handle; ++ int i, rc = 0, failed = 0, needed_blocks; ++ ++ blocksize = inode->i_sb->s_blocksize; ++ blocks_per_page = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; ++ iblock = page->index * blocks_per_page; ++ ++ for (i = 0; i < blocks_per_page; i++, iblock++) { ++ blocks[i] = ext4_bmap(inode->i_mapping, iblock); ++ if (blocks[i] == 0) { ++ failed++; ++ if (created) ++ created[i] = -1; ++ } else if (created) { ++ created[i] = 0; ++ } ++ } ++ ++ if (failed == 0 || create == 0) ++ return 0; ++ ++ needed_blocks = ext4_writepage_trans_blocks(inode); ++ handle = ext4_journal_start(inode, needed_blocks); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ iblock = page->index * blocks_per_page; ++ for (i = 0; i < blocks_per_page; i++, iblock++) { ++ if (blocks[i] != 0) ++ continue; ++ ++ rc = ext4_get_blocks(handle, inode, iblock, 1, &dummy, 1); ++ if (rc < 0) { ++ printk(KERN_INFO "ext4_map_inode_page: error reading " ++ "block %ld\n", iblock); ++ goto out; ++ } else { ++ if (rc > 1) ++ WARN_ON(1); ++ ++ rc = 0; ++ } ++ /* Unmap any metadata buffers from the block mapping, to avoid ++ * data corruption due to direct-write from Lustre being ++ * clobbered by a later flush of the blockdev metadata buffer.*/ ++ if (buffer_new(&dummy)) ++ unmap_underlying_metadata(dummy.b_bdev, ++ dummy.b_blocknr); ++ blocks[i] = dummy.b_blocknr; ++ if (created) ++ created[i] = 1; ++ } ++ ++out: ++ ext4_journal_stop(handle); ++ return rc; ++} +Index: linux-2.6.32-el6-beta/fs/ext4/super.c +=================================================================== +--- linux-2.6.32-el6-beta.orig/fs/ext4/super.c ++++ linux-2.6.32-el6-beta/fs/ext4/super.c +@@ -4084,6 +4084,10 @@ static void __exit exit_ext4_fs(void) + exit_ext4_system_zone(); + } + ++int ext4_map_inode_page(struct inode *inode, struct page *page, ++ unsigned long *blocks, int *created, int create); ++EXPORT_SYMBOL(ext4_map_inode_page); ++ + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Fourth Extended Filesystem"); + MODULE_LICENSE("GPL"); diff --git a/ldiskfs/kernel_patches/patches/ext4-max-dir-size-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-max-dir-size-rhel6.patch new file mode 100644 index 0000000..8352d02 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-max-dir-size-rhel6.patch @@ -0,0 +1,67 @@ +Index: linux-2.6.32-el6-beta/fs/ext4/ialloc.c +=================================================================== +--- linux-2.6.32-el6-beta.orig/fs/ext4/ialloc.c ++++ linux-2.6.32-el6-beta/fs/ext4/ialloc.c +@@ -825,11 +825,15 @@ struct inode *ext4_new_inode(handle_t *h + sb = dir->i_sb; + ngroups = ext4_get_groups_count(sb); + trace_ext4_request_inode(dir, mode); ++ ++ sbi = 
EXT4_SB(sb); ++ if (sbi->s_max_dir_size > 0 && i_size_read(dir) >= sbi->s_max_dir_size) ++ return ERR_PTR(-EFBIG); ++ + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT4_I(inode); +- sbi = EXT4_SB(sb); + + if (!goal) + goal = sbi->s_inode_goal; +Index: linux-2.6.32-el6-beta/fs/ext4/super.c +=================================================================== +--- linux-2.6.32-el6-beta.orig/fs/ext4/super.c ++++ linux-2.6.32-el6-beta/fs/ext4/super.c +@@ -2601,6 +2601,7 @@ EXT4_RO_ATTR(lifetime_write_kbytes); + EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, + inode_readahead_blks_store, s_inode_readahead_blks); + EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); ++EXT4_RW_ATTR_SBI_UI(max_dir_size, s_max_dir_size); + EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); + EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); +@@ -2615,6 +2616,7 @@ static struct attribute *ext4_attrs[] = + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(inode_readahead_blks), + ATTR_LIST(inode_goal), ++ ATTR_LIST(max_dir_size), + ATTR_LIST(mb_stats), + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), +Index: linux-2.6.32-el6-beta/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.32-el6-beta.orig/fs/ext4/ext4.h ++++ linux-2.6.32-el6-beta/fs/ext4/ext4.h +@@ -1029,6 +1029,8 @@ struct ext4_sb_info { + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; ++ ++ unsigned long s_max_dir_size; + }; + + static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +@@ -1353,6 +1355,12 @@ struct mmp_struct { + #define EXT4_MMP_MIN_CHECK_INTERVAL 5 + + /* ++ * max directory size tunable ++ */ ++#define EXT4_DEFAULT_MAX_DIR_SIZE 0 ++#define EXT4_MAX_DIR_SIZE_NAME "max_dir_size" ++ ++/* + * Function prototypes + */ + diff --git a/ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-rhel6.patch new file mode 100644 index 0000000..c0b59f0 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-mballoc-extra-checks-rhel6.patch @@ -0,0 +1,317 @@ +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h 2011-03-14 16:18:28.300241437 +0800 ++++ linux-stage/fs/ext4/ext4.h 2011-03-14 16:33:17.056087375 +0800 +@@ -1770,6 +1770,7 @@ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + struct list_head bb_prealloc_list; ++ unsigned long bb_prealloc_nr; + #ifdef DOUBLE_CHECK + void *bb_bitmap; + #endif +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c 2011-03-14 16:18:28.336242149 +0800 ++++ linux-stage/fs/ext4/mballoc.c 2011-03-14 16:33:27.072292006 +0800 +@@ -337,7 +337,7 @@ + static struct kmem_cache *ext4_pspace_cachep; + static struct kmem_cache *ext4_ac_cachep; + static struct kmem_cache *ext4_free_ext_cachep; +-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); + static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); +@@ -659,7 +659,7 @@ + } + + static noinline_for_stack +-void ext4_mb_generate_buddy(struct super_block *sb, ++int ext4_mb_generate_buddy(struct super_block *sb, + void 
*buddy, void *bitmap, ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); +@@ -691,14 +691,13 @@ + grp->bb_fragments = fragments; + + if (free != grp->bb_free) { +- ext4_grp_locked_error(sb, group, __func__, +- "EXT4-fs: group %u: %u blocks in bitmap, %u in gd", +- group, free, grp->bb_free); +- /* +- * If we intent to continue, we consider group descritor +- * corrupt and update bb_free using bitmap value +- */ +- grp->bb_free = free; ++ struct ext4_group_desc *gdp; ++ gdp = ext4_get_group_desc (sb, group, NULL); ++ ext4_error(sb, "group %lu: %u blocks in bitmap, %u in bb, " ++ "%u in gd, %lu pa's\n", (long unsigned int)group, ++ free, grp->bb_free, ext4_free_blks_count(sb, gdp), ++ grp->bb_prealloc_nr); ++ return -EIO; + } + + clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); +@@ -708,6 +707,8 @@ + EXT4_SB(sb)->s_mb_buddies_generated++; + EXT4_SB(sb)->s_mb_generation_time += period; + spin_unlock(&EXT4_SB(sb)->s_bal_lock); ++ ++ return 0; + } + + /* The buddy information is attached the buddy cache inode +@@ -839,7 +840,7 @@ + first_block = page->index * blocks_per_page; + /* init the page */ + memset(page_address(page), 0xff, PAGE_CACHE_SIZE); +- for (i = 0; i < blocks_per_page; i++) { ++ for (i = 0; i < blocks_per_page && err == 0; i++) { + int group; + struct ext4_group_info *grinfo; + +@@ -874,7 +875,7 @@ + * incore got set to the group block bitmap below + */ + ext4_lock_group(sb, group); +- ext4_mb_generate_buddy(sb, data, incore, group); ++ err = ext4_mb_generate_buddy(sb, data, incore, group); + ext4_unlock_group(sb, group); + incore = NULL; + } else { +@@ -888,7 +889,7 @@ + memcpy(data, bitmap, blocksize); + + /* mark all preallocated blks used in in-core bitmap */ +- ext4_mb_generate_from_pa(sb, data, group); ++ err = ext4_mb_generate_from_pa(sb, data, group); + ext4_mb_generate_from_freelist(sb, data, group); + ext4_unlock_group(sb, group); + +@@ -898,7 +899,8 @@ + incore = data; + } + } +- SetPageUptodate(page); ++ if (likely(err == 0)) ++ SetPageUptodate(page); + + out: + if (bh) { +@@ -2142,9 +2144,11 @@ + static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + { + struct super_block *sb = seq->private; ++ struct ext4_group_desc *gdp; + ext4_group_t group = (ext4_group_t) ((unsigned long) v); + int i; + int err; ++ int free = 0; + struct ext4_buddy e4b; + struct sg { + struct ext4_group_info info; +@@ -2153,10 +2157,10 @@ + + group--; + if (group == 0) +- seq_printf(seq, "#%-5s: %-5s %-5s %-5s " ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s %-5s %-5s" + "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " + "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", +- "group", "free", "frags", "first", ++ "group", "free", "frags", "first", "first", "pa", + "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", + "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); + +@@ -2167,13 +2171,20 @@ + seq_printf(seq, "#%-5u: I/O error\n", group); + return 0; + } ++ ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ if (gdp != NULL) ++ free = ext4_free_blks_count(sb, gdp); ++ + ext4_lock_group(sb, group); + memcpy(&sg, ext4_get_group_info(sb, group), i); + ext4_unlock_group(sb, group); + ext4_mb_release_desc(&e4b); + +- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, +- sg.info.bb_fragments, sg.info.bb_first_free); ++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u %-5u %-5lu [", ++ (long unsigned int)group, sg.info.bb_free, free, ++ sg.info.bb_fragments, sg.info.bb_first_free, ++ sg.info.bb_prealloc_nr); + for (i = 0; i <= 13; i++) + seq_printf(seq, " 
%-5u", i <= sb->s_blocksize_bits + 1 ? + sg.info.bb_counters[i] : 0); +@@ -3354,23 +3365,68 @@ + } + + /* ++ * check free blocks in bitmap match free block in group descriptor ++ * do this before taking preallocated blocks into account to be able ++ * to detect on-disk corruptions. The group lock should be hold by the ++ * caller. ++ */ ++int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap, ++ struct ext4_group_desc *gdp, int group) ++{ ++ unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); ++ unsigned short i, first, free = 0; ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ ++ while (i < max) { ++ first = i; ++ i = mb_find_next_bit(bitmap, max, i); ++ if (i > max) ++ i = max; ++ free += i - first; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ ++ if (free != ext4_free_blks_count(sb, gdp)) { ++ ext4_error(sb, "on-disk bitmap for group %d" ++ "corrupted: %u blocks free in bitmap, %u - in gd\n", ++ group, free, ext4_free_blks_count(sb, gdp)); ++ return -EIO; ++ } ++ return 0; ++} ++ ++/* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap. buddy must be generated from this bitmap + * Need to be called with ext4 group lock held + */ + static noinline_for_stack +-void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_prealloc_space *pa; ++ struct ext4_group_desc *gdp; + struct list_head *cur; + ext4_group_t groupnr; + ext4_grpblk_t start; + int preallocated = 0; + int count = 0; ++ int skip = 0; ++ int err; + int len; + ++ gdp = ext4_get_group_desc (sb, group, NULL); ++ if (gdp == NULL) ++ return -EIO; ++ ++ /* before applying preallocations, check bitmap consistency */ ++ err = ext4_mb_check_ondisk_bitmap(sb, bitmap, gdp, group); ++ if (err) ++ return err; ++ + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. 
+ * we don't need any locking here +@@ -3386,14 +3442,23 @@ + &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); +- if (unlikely(len == 0)) ++ if (unlikely(len == 0)) { ++ skip++; + continue; ++ } + BUG_ON(groupnr != group); + mb_set_bits(bitmap, start, len); + preallocated += len; + count++; + } ++ if (count + skip != grp->bb_prealloc_nr) { ++ ext4_error(sb, "lost preallocations: " ++ "count %d, bb_prealloc_nr %lu, skip %d\n", ++ count, grp->bb_prealloc_nr, skip); ++ return -EIO; ++ } + mb_debug(1, "prellocated %u for group %u\n", preallocated, group); ++ return 0; + } + + static void ext4_mb_pa_callback(struct rcu_head *head) +@@ -3452,6 +3517,7 @@ + */ + ext4_lock_group(sb, grp); + list_del(&pa->pa_group_list); ++ ext4_get_group_info(sb, grp)->bb_prealloc_nr--; + ext4_unlock_group(sb, grp); + + spin_lock(pa->pa_obj_lock); +@@ -3543,6 +3609,7 @@ + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + spin_lock(pa->pa_obj_lock); +@@ -3604,6 +3671,7 @@ + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + /* +@@ -3802,6 +3870,8 @@ + + spin_unlock(&pa->pa_lock); + ++ BUG_ON(grp->bb_prealloc_nr == 0); ++ grp->bb_prealloc_nr--; + list_del(&pa->pa_group_list); + list_add(&pa->u.pa_tmp_list, &list); + } +@@ -3942,7 +4012,7 @@ + if (err) { + ext4_error(sb, "Error loading buddy information for %u", + group); +- continue; ++ return; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group); +@@ -3954,6 +4024,8 @@ + } + + ext4_lock_group(sb, group); ++ BUG_ON(e4b.bd_info->bb_prealloc_nr == 0); ++ e4b.bd_info->bb_prealloc_nr--; + list_del(&pa->pa_group_list); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); + ext4_unlock_group(sb, group); +@@ -4227,6 +4299,7 @@ + } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); ++ ext4_get_group_info(sb, group)->bb_prealloc_nr--; + ext4_mb_release_group_pa(&e4b, pa, ac); + ext4_unlock_group(sb, group); + +Index: linux-stage/fs/ext4/mballoc.h +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.h 2011-03-14 16:18:26.670209322 +0800 ++++ linux-stage/fs/ext4/mballoc.h 2011-03-14 16:32:50.859552482 +0800 +@@ -88,7 +88,7 @@ + /* + * for which requests use 2^N search using buddies + */ +-#define MB_DEFAULT_ORDER2_REQS 2 ++#define MB_DEFAULT_ORDER2_REQS 8 + + /* + * default group prealloc size 512 blocks diff --git a/ldiskfs/kernel_patches/patches/ext4-mballoc-pa_free-mismatch-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-mballoc-pa_free-mismatch-rhel6.patch new file mode 100644 index 0000000..faf7fce --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-mballoc-pa_free-mismatch-rhel6.patch @@ -0,0 +1,152 @@ +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c 2011-03-14 16:34:39.790758415 +0800 ++++ linux-stage/fs/ext4/mballoc.c 2011-03-14 16:38:36.211681104 +0800 +@@ -3593,6 +3593,7 @@ + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_INODE_PA; ++ pa->pa_error = 0; + + mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); +@@ -3654,6 +3655,7 @@ + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_GROUP_PA; ++ pa->pa_error = 0; + + mb_debug(1, 
"new group pa %p: %llu/%u for %u\n", pa, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); +@@ -3716,7 +3718,9 @@ + int err = 0; + int free = 0; + ++ assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + BUG_ON(pa->pa_deleted == 0); ++ BUG_ON(pa->pa_inode == NULL); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + grp_blk_start = pa->pa_pstart - bit; + BUG_ON(group != e4b->bd_group && pa->pa_len != 0); +@@ -3752,19 +3756,27 @@ + mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); + bit = next + 1; + } +- if (free != pa->pa_free) { +- printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", +- pa, (unsigned long) pa->pa_lstart, +- (unsigned long) pa->pa_pstart, +- (unsigned long) pa->pa_len); ++ ++ /* "free < pa->pa_free" means we maybe double alloc the same blocks, ++ * otherwise maybe leave some free blocks unavailable, no need to BUG.*/ ++ if ((free > pa->pa_free && !pa->pa_error) || (free < pa->pa_free)) { ++ ext4_error(sb, "pa free mismatch: [pa %p] " ++ "[phy %lu] [logic %lu] [len %u] [free %u] " ++ "[error %u] [inode %lu] [freed %u]", pa, ++ (unsigned long)pa->pa_pstart, ++ (unsigned long)pa->pa_lstart, ++ (unsigned)pa->pa_len, (unsigned)pa->pa_free, ++ (unsigned)pa->pa_error, pa->pa_inode->i_ino, ++ free); + ext4_grp_locked_error(sb, group, +- __func__, "free %u, pa_free %u", +- free, pa->pa_free); ++ __func__, "free %u, pa_free %u", ++ free, pa->pa_free); + /* + * pa is already deleted so we use the value obtained + * from the bitmap and continue. + */ + } ++ BUG_ON(pa->pa_free != free); + atomic_add(free, &sbi->s_mb_discarded); + + return err; +@@ -4450,6 +4462,24 @@ + + trace_ext4_request_blocks(ar); + ++ ++ if (dev_check_rdonly(sb->s_bdev)) { ++ struct block_device *bdev = sb->s_bdev; ++ ++ printk(KERN_WARNING "Alloc from readonly device %s (%#x): " ++ "[inode %lu] [logic %llu] [goal %llu] [ll %llu] " ++ "[pl %llu] [lr %llu] [pr %llu] [len %u] [flags %u]\n", ++ bdev->bd_disk ? bdev->bd_disk->disk_name : "", ++ bdev->bd_dev, ar->inode->i_ino, ++ (unsigned long long)ar->logical, ++ (unsigned long long)ar->goal, ++ (unsigned long long)ar->lleft, ++ (unsigned long long)ar->pleft, ++ (unsigned long long)ar->lright, ++ (unsigned long long)ar->pright, ++ ar->len, ar->flags); ++ } ++ + /* + * For delayed allocation, we could skip the ENOSPC and + * EDQUOT check, as blocks and quotas have been already +@@ -4529,6 +4559,25 @@ + ac->ac_b_ex.fe_len = 0; + ar->len = 0; + ext4_mb_show_ac(ac); ++ if (ac->ac_pa) { ++ struct ext4_prealloc_space *pa = ac->ac_pa; ++ ++ /* We can not make sure whether the bitmap has ++ * been updated or not when fail case. So can ++ * not revert pa_free back, just mark pa_error*/ ++ pa->pa_error++; ++ ext4_error(sb, ++ "Updating bitmap error: [err %d] " ++ "[pa %p] [phy %lu] [logic %lu] " ++ "[len %u] [free %u] [error %u] " ++ "[inode %lu]", *errp, pa, ++ (unsigned long)pa->pa_pstart, ++ (unsigned long)pa->pa_lstart, ++ (unsigned)pa->pa_len, ++ (unsigned)pa->pa_free, ++ (unsigned)pa->pa_error, ++ pa->pa_inode ? pa->pa_inode->i_ino : 0); ++ } + } else { + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + ar->len = ac->ac_b_ex.fe_len; +@@ -4691,6 +4740,15 @@ + goto error_return; + } + ++ if (dev_check_rdonly(sb->s_bdev)) { ++ struct block_device *bdev = sb->s_bdev; ++ ++ printk(KERN_WARNING "Release to readonly device %s (%#x): " ++ "[inode %lu] [block %llu] [count %lu] [is_meta %d]\n", ++ bdev->bd_disk ? 
bdev->bd_disk->disk_name : "", ++ bdev->bd_dev, inode->i_ino, block, count, metadata); ++ } ++ + ext4_debug("freeing block %llu\n", block); + trace_ext4_free_blocks(inode, block, count, metadata); + +Index: linux-stage/fs/ext4/mballoc.h +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.h 2011-03-14 16:32:50.859552482 +0800 ++++ linux-stage/fs/ext4/mballoc.h 2011-03-14 16:39:20.928429776 +0800 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + #include "ext4_jbd2.h" + #include "ext4.h" + +@@ -130,6 +131,7 @@ + ext4_grpblk_t pa_free; /* how many blocks are free */ + unsigned short pa_type; /* pa type. inode or group */ + spinlock_t *pa_obj_lock; ++ unsigned short pa_error; + struct inode *pa_inode; /* hack, for history only */ + }; + diff --git a/ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch index 18e15c2..43cc3bc 100644 --- a/ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch +++ b/ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch @@ -1,24 +1,21 @@ -Index: linux-2.6.18.i386/fs/ext4/ext4_jbd2.h +Index: linux-stage/fs/ext4/ext4_jbd2.h =================================================================== ---- linux-2.6.18.i386.orig/fs/ext4/ext4_jbd2.h -+++ linux-2.6.18.i386/fs/ext4/ext4_jbd2.h -@@ -35,6 +35,11 @@ - (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ - || test_opt(sb, EXTENTS) ? 27U : 8U) +--- linux-stage.orig/fs/ext4/ext4_jbd2.h 2011-03-14 17:17:57.962614294 +0800 ++++ linux-stage/fs/ext4/ext4_jbd2.h 2011-03-14 17:26:00.570661921 +0800 +@@ -35,6 +35,8 @@ + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + ? 27U : 8U) -+/* Indicate that EXT4_SINGLEDATA_TRANS_BLOCKS takes the sb as argument */ -+#define EXT4_SINGLEDATA_TRANS_BLOCKS_HAS_SB -+ +#define ext4_journal_dirty_metadata(handle, bh) \ + ext4_handle_dirty_metadata(handle, NULL, bh) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode * and the superblock, which are already accounted for. */ -Index: linux-2.6.18.i386/fs/ext4/extents.c +Index: linux-stage/fs/ext4/extents.c =================================================================== ---- linux-2.6.18.i386.orig/fs/ext4/extents.c -+++ linux-2.6.18.i386/fs/ext4/extents.c -@@ -60,6 +60,17 @@ static ext4_fsblk_t ext_pblock(struct ex +--- linux-stage.orig/fs/ext4/extents.c 2011-03-14 17:17:57.491605523 +0800 ++++ linux-stage/fs/ext4/extents.c 2011-03-14 17:25:23.230957562 +0800 +@@ -59,6 +59,17 @@ static ext4_fsblk_t ext_pblock(struct ex } /* @@ -36,7 +33,7 @@ Index: linux-2.6.18.i386/fs/ext4/extents.c * idx_pblock: * combine low and high parts of a leaf physical block number into ext4_fsblk_t */ -@@ -73,17 +84,6 @@ ext4_fsblk_t idx_pblock(struct ext4_exte +@@ -72,17 +83,6 @@ ext4_fsblk_t idx_pblock(struct ext4_exte } /* @@ -54,7 +51,7 @@ Index: linux-2.6.18.i386/fs/ext4/extents.c * ext4_idx_store_pblock: * stores a large physical block number into an index struct, * breaking it into parts -@@ -1826,6 +1826,56 @@ static int ext4_ext_rm_idx(handle_t *han +@@ -1980,6 +1980,56 @@ static int ext4_ext_rm_idx(handle_t *han } /* @@ -111,11 +108,10 @@ Index: linux-2.6.18.i386/fs/ext4/extents.c * ext4_ext_calc_credits_for_single_extent: * This routine returns max. credits that needed to insert an extent * to the extent tree. 
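Stepping back from the raw hunks for a moment: the pa_free-mismatch patch above changes the preallocation release path so that a bitmap count larger than pa_free is tolerated once pa_error has been raised by a failed bitmap update, while any other disagreement is still reported as corruption. The standalone C sketch below mirrors only that reporting condition; struct pa_state and pa_free_mismatch() are illustrative stand-ins, not the kernel types.

#include <stdio.h>
#include <stdbool.h>

/* Stand-in for the two fields the patch consults; not the kernel struct. */
struct pa_state {
	unsigned int pa_free;   /* blocks the PA still believes are free  */
	unsigned int pa_error;  /* bumped when a bitmap update failed     */
};

/*
 * Mirror of the patched condition in the PA release path: report when the
 * bitmap shows fewer free blocks than pa_free (possible double allocation),
 * or more free blocks while no allocation error was ever recorded (blocks
 * silently lost). An overshoot after a recorded error is tolerated.
 */
static bool pa_free_mismatch(const struct pa_state *pa, unsigned int free)
{
	return (free > pa->pa_free && !pa->pa_error) || (free < pa->pa_free);
}

int main(void)
{
	struct pa_state clean  = { .pa_free = 8, .pa_error = 0 };
	struct pa_state failed = { .pa_free = 8, .pa_error = 1 };

	printf("%d\n", pa_free_mismatch(&clean, 8));    /* 0: consistent            */
	printf("%d\n", pa_free_mismatch(&clean, 10));   /* 1: blocks silently lost  */
	printf("%d\n", pa_free_mismatch(&failed, 10));  /* 0: tolerated overshoot   */
	printf("%d\n", pa_free_mismatch(&failed, 6));   /* 1: possible double alloc */
	return 0;
}
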
-@@ -3157,4 +3207,14 @@ int ext4_fiemap(struct inode *inode, str - +@@ -3731,3 +3781,13 @@ int ext4_fiemap(struct inode *inode, str return error; } -+ + +EXPORT_SYMBOL(ext4_ext_store_pblock); +EXPORT_SYMBOL(ext4_ext_search_right); +EXPORT_SYMBOL(ext4_ext_search_left); @@ -125,12 +121,12 @@ Index: linux-2.6.18.i386/fs/ext4/extents.c +EXPORT_SYMBOL(ext4_ext_walk_space); +EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert); +EXPORT_SYMBOL(ext4_mark_inode_dirty); - -Index: linux-2.6.18.i386/fs/ext4/ext4_extents.h ++ +Index: linux-stage/fs/ext4/ext4_extents.h =================================================================== ---- linux-2.6.18.i386.orig/fs/ext4/ext4_extents.h -+++ linux-2.6.18.i386/fs/ext4/ext4_extents.h -@@ -59,6 +59,12 @@ +--- linux-stage.orig/fs/ext4/ext4_extents.h 2011-03-14 17:17:57.928613657 +0800 ++++ linux-stage/fs/ext4/ext4_extents.h 2011-03-14 17:27:23.673232962 +0800 +@@ -58,6 +58,12 @@ */ #define EXT_STATS_ @@ -143,16 +139,15 @@ Index: linux-2.6.18.i386/fs/ext4/ext4_extents.h /* * ext4_inode has i_block array (60 bytes total). -@@ -124,6 +129,8 @@ struct ext4_ext_path { - #define EXT4_EXT_CACHE_GAP 1 - #define EXT4_EXT_CACHE_EXTENT 2 +@@ -160,6 +166,7 @@ struct ext4_ext_path { + #define EXT_INIT_MAX_LEN (1UL << 15) + #define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1) +#define EXT4_EXT_HAS_NO_TREE /* ext4_extents_tree struct is not used*/ -+#define EXT_INSERT_EXTENT_WITH_5ARGS - - #define EXT_MAX_BLOCK 0xffffffff -@@ -228,6 +234,8 @@ static inline int ext4_ext_get_actual_le + #define EXT_FIRST_EXTENT(__hdr__) \ + ((struct ext4_extent *) (((char *) (__hdr__)) + \ +@@ -230,6 +237,8 @@ extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); extern int ext4_extent_tree_init(handle_t *, struct inode *); @@ -161,11 +156,11 @@ Index: linux-2.6.18.i386/fs/ext4/ext4_extents.h extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); -Index: linux-2.6.18.i386/fs/ext4/mballoc.c +Index: linux-stage/fs/ext4/mballoc.c =================================================================== ---- linux-2.6.18.i386.orig/fs/ext4/mballoc.c -+++ linux-2.6.18.i386/fs/ext4/mballoc.c -@@ -4355,6 +4355,13 @@ +--- linux-stage.orig/fs/ext4/mballoc.c 2011-03-14 17:17:59.872649833 +0800 ++++ linux-stage/fs/ext4/mballoc.c 2011-03-14 17:25:20.373903681 +0800 +@@ -4302,6 +4302,13 @@ kmem_cache_free(ext4_ac_cachep, ac); } @@ -179,35 +174,35 @@ Index: linux-2.6.18.i386/fs/ext4/mballoc.c /* * finds all preallocated spaces and return blocks being freed to them * if preallocated space becomes full (no block is used from the space) -@@ -4965,3 +4965,6 @@ error_return: +@@ -5111,3 +5118,6 @@ error_return: kmem_cache_free(ext4_ac_cachep, ac); return; } + +EXPORT_SYMBOL(ext4_free_blocks); + -Index: linux-2.6.18.i386/fs/ext4/ext4_jbd2.c +Index: linux-stage/fs/ext4/ext4_jbd2.c =================================================================== ---- linux-2.6.18.i386.orig/fs/ext4/ext4_jbd2.c -+++ linux-2.6.18.i386/fs/ext4/ext4_jbd2.c -@@ -21,6 +21,7 @@ int __ext4_journal_get_write_access(cons - ext4_journal_abort_handle(where, __func__, bh, handle, err); +--- linux-stage.orig/fs/ext4/ext4_jbd2.c 2011-03-14 17:17:57.463605024 +0800 ++++ linux-stage/fs/ext4/ext4_jbd2.c 2011-03-14 17:18:00.157655139 +0800 +@@ -31,6 +31,7 @@ int __ext4_journal_get_write_access(cons + } return err; } +EXPORT_SYMBOL(__ext4_journal_get_write_access); int __ext4_journal_forget(const char *where, handle_t *handle, struct 
buffer_head *bh) -@@ -57,3 +58,4 @@ int __ext4_journal_dirty_metadata(const - ext4_journal_abort_handle(where, __func__, bh, handle, err); +@@ -107,3 +108,4 @@ int __ext4_journal_dirty_metadata(const + } return err; } +EXPORT_SYMBOL(__ext4_handle_dirty_metadata); -Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h +Index: linux-stage/fs/ext4/ext4.h =================================================================== ---- linux-2.6.27.21-0.1.orig/fs/ext4/ext4.h 2009-07-07 14:47:19.000000000 +0530 -+++ linux-2.6.27.21-0.1/fs/ext4/ext4.h 2009-07-07 14:47:22.000000000 +0530 -@@ -1123,6 +1128,8 @@ +--- linux-stage.orig/fs/ext4/ext4.h 2011-03-14 17:17:59.916650654 +0800 ++++ linux-stage/fs/ext4/ext4.h 2011-03-14 17:25:30.236089694 +0800 +@@ -1448,6 +1448,8 @@ extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); extern void ext4_mb_put_buddy_cache_lock(struct super_block *, ext4_group_t, int); @@ -216,11 +211,11 @@ Index: linux-2.6.27.21-0.1/fs/ext4/ext4.h /* inode.c */ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); -Index: linux-2.6.27.21-0.1/fs/ext4/inode.c +Index: linux-stage/fs/ext4/inode.c =================================================================== ---- linux-2.6.27.21-0.1.orig/fs/ext4/inode.c 2009-07-07 14:47:19.000000000 +0530 -+++ linux-2.6.27.21-0.1/fs/ext4/inode.c 2009-07-07 14:47:22.000000000 +0530 -@@ -4240,6 +4240,7 @@ +--- linux-stage.orig/fs/ext4/inode.c 2011-03-14 17:17:59.745647471 +0800 ++++ linux-stage/fs/ext4/inode.c 2011-03-14 17:18:00.219656294 +0800 +@@ -4882,6 +4882,7 @@ iget_failed(inode); return ERR_PTR(ret); } @@ -228,54 +223,54 @@ Index: linux-2.6.27.21-0.1/fs/ext4/inode.c static int ext4_inode_blocks_set(handle_t *handle, struct ext4_inode *raw_inode, -Index: linux-2.6.27.21-0.1/fs/ext4/super.c +Index: linux-stage/fs/ext4/super.c =================================================================== ---- linux-2.6.27.21-0.1.orig/fs/ext4/super.c 2009-07-07 14:47:19.000000000 +0530 -+++ linux-2.6.27.21-0.1/fs/ext4/super.c 2009-07-07 14:48:53.000000000 +0530 -@@ -91,6 +91,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct su +--- linux-stage.orig/fs/ext4/super.c 2011-03-14 17:17:59.659645870 +0800 ++++ linux-stage/fs/ext4/super.c 2011-03-14 17:25:31.027104616 +0800 +@@ -90,6 +90,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct su (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); } +EXPORT_SYMBOL(ext4_inode_bitmap); ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg) -@@ -113,6 +118,7 @@ - (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); +@@ -114,6 +115,7 @@ + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 
+ (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); } +EXPORT_SYMBOL(ext4_itable_unused_count); - - void ext4_block_bitmap_set(struct super_block *sb, - struct ext4_group_desc *bg, ext4_fsblk_t blk) -@@ -1286,9 +1287,11 @@ - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, - Opt_usrquota, Opt_grpquota, Opt_i_version, + + __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg) +@@ -1434,9 +1436,11 @@ + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, + Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, + Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_mballoc, Opt_extents, - Opt_stripe, Opt_delalloc, Opt_nodelalloc, - Opt_block_validity, Opt_noblock_validity, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_block_validity, Opt_noblock_validity, - Opt_inode_readahead_blks, Opt_journal_ioprio + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, }; static match_table_t tokens = { -@@ -1346,6 +1348,11 @@ - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_i_version, "i_version"}, +@@ -1491,6 +1495,11 @@ + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_i_version, "i_version"}, + {Opt_mballoc, "mballoc"}, + {Opt_extents, "extents"}, + {Opt_iopen, "iopen"}, + {Opt_noiopen, "noiopen"}, + {Opt_iopen_nopriv, "iopen_nopriv"}, - {Opt_stripe, "stripe=%u"}, - {Opt_resize, "resize"}, - {Opt_delalloc, "delalloc"}, -@@ -1768,6 +1771,12 @@ - case Opt_bigendian_extents: - bigendian_extents = 1; + {Opt_stripe, "stripe=%u"}, + {Opt_resize, "resize"}, + {Opt_delalloc, "delalloc"}, +@@ -1930,6 +1939,12 @@ + else + set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); break; + case Opt_mballoc: + case Opt_extents: @@ -284,64 +279,64 @@ Index: linux-2.6.27.21-0.1/fs/ext4/super.c + case Opt_iopen_nopriv: + break; default: - printk(KERN_ERR - "EXT4-fs: Unrecognized mount option \"%s\" " -@@ -2768,7 +2771,7 @@ - char *buf) + ext4_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " +@@ -2480,7 +2495,7 @@ + char *buf) { - return snprintf(buf, PAGE_SIZE, "%llu\n", + return snprintf(buf, PAGE_SIZE, "%llu\n", - (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + (unsigned long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); } static ssize_t session_write_kbytes_show(struct ext4_attr *a, -@@ -2868,11 +2871,11 @@ - struct super_block *sb = sbi->s_buddy_cache->i_sb; - - return snprintf(buf, PAGE_SIZE, "%llu\n", +@@ -2501,11 +2516,11 @@ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + return snprintf(buf, PAGE_SIZE, "%llu\n", - sbi->s_kbytes_written + + (unsigned long long)(sbi->s_kbytes_written + - (sb->s_bdev->bd_part ? - (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1 + (sb->s_bdev->bd_part ? 
+ (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1 - : 0)); + : 0))); } static ssize_t inode_readahead_blks_store(struct ext4_attr *a, -@@ -3868,7 +3871,7 @@ - if (blocks_count && ext4_blocks_count(es) > blocks_count) { - ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " - "exceeds size of device (%llu blocks)", +@@ -2972,7 +2987,7 @@ + if (blocks_count && ext4_blocks_count(es) > blocks_count) { + ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " + "exceeds size of device (%llu blocks)", - ext4_blocks_count(es), blocks_count); + ext4_blocks_count(es), (unsigned long long)blocks_count); - goto failed_mount; - } - -Index: linux-2.6.27.21-0.1/fs/ext4/fsync.c + goto failed_mount; + } + +Index: linux-stage/fs/ext4/fsync.c =================================================================== ---- linux-2.6.27.21-0.1.orig/fs/ext4/fsync.c 2009-07-07 14:47:19.000000000 +0530 -+++ linux-2.6.27.21-0.1/fs/ext4/fsync.c 2009-07-07 14:48:53.000000000 +0530 -@@ -1768,7 +1771,7 @@ +--- linux-stage.orig/fs/ext4/fsync.c 2011-03-14 17:17:57.533606303 +0800 ++++ linux-stage/fs/ext4/fsync.c 2011-03-14 17:18:00.266657168 +0800 +@@ -56,7 +56,7 @@ - trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld", - inode->i_sb->s_id, datasync, inode->i_ino, + trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld", + inode->i_sb->s_id, datasync, inode->i_ino, - dentry->d_parent->d_inode->i_ino); + 0L); - - /* - * data=writeback: -Index: linux-2.6.27.21-0.1/fs/ext4/move_extent.c + + ret = flush_aio_dio_completed_IO(inode); + if (ret < 0) +Index: linux-stage/fs/ext4/move_extent.c =================================================================== ---- linux-2.6.27.21-0.1.orig/fs/ext4/move_extent.c 2009-07-07 14:47:19.000000000 +0530 -+++ linux-2.6.27.21-0.1/fs/ext4/move_extent.c 2009-07-07 14:48:53.000000000 +0530 -@@ -1768,7 +1771,8 @@ - ext4_error(orig_inode->i_sb, __func__, - "We replaced blocks too much! " - "sum of replaced: %llu requested: %llu", +--- linux-stage.orig/fs/ext4/move_extent.c 2011-03-14 17:17:57.742610199 +0800 ++++ linux-stage/fs/ext4/move_extent.c 2011-03-14 17:18:00.284657501 +0800 +@@ -1388,7 +1388,8 @@ + ext4_error(orig_inode->i_sb, __func__, + "We replaced blocks too much! " + "sum of replaced: %llu requested: %llu", - *moved_len, len); + (unsigned long long)(*moved_len), + (unsigned long long)(len)); - ret1 = -EIO; - goto out; - } + ret1 = -EIO; + goto out; + } diff --git a/ldiskfs/kernel_patches/patches/ext4-misc-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-misc-rhel6.patch new file mode 100644 index 0000000..126e659 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-misc-rhel6.patch @@ -0,0 +1,255 @@ +Index: linux-stage/fs/ext4/ext4_jbd2.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4_jbd2.h 2011-03-14 16:33:17.087088010 +0800 ++++ linux-stage/fs/ext4/ext4_jbd2.h 2011-03-14 16:42:28.416591789 +0800 +@@ -35,6 +35,8 @@ + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + ? 27U : 8U) + ++#define ext4_journal_dirty_metadata(handle, bh) \ ++ ext4_handle_dirty_metadata(handle, NULL, bh) + /* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. 
*/ +Index: linux-stage/fs/ext4/ext4_extents.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4_extents.h 2011-03-14 16:33:17.076087785 +0800 ++++ linux-stage/fs/ext4/ext4_extents.h 2011-03-14 16:43:08.254267525 +0800 +@@ -58,6 +58,12 @@ + */ + #define EXT_STATS_ + ++/* ++ * define EXT4_ALLOC_NEEDED to 0 since block bitmap, group desc. and sb ++ * are now accounted in ext4_ext_calc_credits_for_insert() ++ */ ++#define EXT4_ALLOC_NEEDED 0 ++#define HAVE_EXT_PREPARE_CB_EXTENT + + /* + * ext4_inode has i_block array (60 bytes total). +@@ -160,6 +166,7 @@ + #define EXT_INIT_MAX_LEN (1UL << 15) + #define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1) + ++#define EXT4_EXT_HAS_NO_TREE /* ext4_extents_tree struct is not used*/ + + #define EXT_FIRST_EXTENT(__hdr__) \ + ((struct ext4_extent *) (((char *) (__hdr__)) + \ +@@ -239,6 +246,8 @@ + extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); ++extern int ext4_ext_calc_credits_for_insert(struct inode *, ++ struct ext4_ext_path *); + extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c 2011-03-14 16:33:27.072292006 +0800 ++++ linux-stage/fs/ext4/mballoc.c 2011-03-14 16:41:02.500138039 +0800 +@@ -4039,6 +4039,7 @@ + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); + } ++EXPORT_SYMBOL(ext4_discard_preallocations); + + /* + * finds all preallocated spaces and return blocks being freed to them +@@ -4831,3 +4832,6 @@ + kmem_cache_free(ext4_ac_cachep, ac); + return; + } ++ ++EXPORT_SYMBOL(ext4_free_blocks); ++ +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c 2011-03-14 16:33:17.036086967 +0800 ++++ linux-stage/fs/ext4/super.c 2011-03-14 16:41:14.964348396 +0800 +@@ -127,6 +127,7 @@ + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 
+ (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); + } ++EXPORT_SYMBOL(ext4_itable_unused_count); + + void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +@@ -1491,6 +1492,7 @@ + Opt_block_validity, Opt_noblock_validity, + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_discard, Opt_nodiscard, ++ Opt_mballoc, + }; + + static const match_table_t tokens = { +@@ -1557,6 +1559,7 @@ + {Opt_auto_da_alloc, "auto_da_alloc=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc"}, + {Opt_noauto_da_alloc, "noauto_da_alloc"}, ++ {Opt_mballoc, "mballoc"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_err, NULL}, +@@ -1997,6 +2000,8 @@ + case Opt_nodiscard: + clear_opt(sbi->s_mount_opt, DISCARD); + break; ++ case Opt_mballoc: ++ break; + default: + ext4_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " +Index: linux-stage/fs/ext4/ext4_jbd2.c +=================================================================== +--- linux-stage.orig/fs/ext4/ext4_jbd2.c 2011-03-14 16:33:17.049087232 +0800 ++++ linux-stage/fs/ext4/ext4_jbd2.c 2011-03-14 16:34:39.849759386 +0800 +@@ -31,6 +31,7 @@ + } + return err; + } ++EXPORT_SYMBOL(__ext4_journal_get_write_access); + + int __ext4_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh) +@@ -107,3 +108,4 @@ + } + return err; + } ++EXPORT_SYMBOL(__ext4_handle_dirty_metadata); +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h 2011-03-14 16:33:17.056087375 +0800 ++++ linux-stage/fs/ext4/ext4.h 2011-03-14 16:45:40.754870806 +0800 +@@ -1110,6 +1110,9 @@ + + #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + ++/* Has been moved to linux/magic.h but we need it for Lustre */ ++#define EXT4_SUPER_MAGIC 0xEF53 ++ + /* + * Codes for operating systems + */ +@@ -1528,6 +1531,8 @@ + extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); + extern void ext4_mb_put_buddy_cache_lock(struct super_block *, + ext4_group_t, int); ++extern void ext4_mb_discard_inode_preallocations(struct inode *); ++ + /* inode.c */ + int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr); +Index: linux-stage/fs/ext4/inode.c +=================================================================== +--- linux-stage.orig/fs/ext4/inode.c 2011-03-14 16:33:17.063087519 +0800 ++++ linux-stage/fs/ext4/inode.c 2011-03-14 16:34:39.913760434 +0800 +@@ -5199,6 +5199,7 @@ + iget_failed(inode); + return ERR_PTR(ret); + } ++EXPORT_SYMBOL(ext4_iget); + + static int ext4_inode_blocks_set(handle_t *handle, + struct ext4_inode *raw_inode, +Index: linux-stage/fs/ext4/extents.c +=================================================================== +--- linux-stage.orig/fs/ext4/extents.c 2011-03-14 16:33:17.070087661 +0800 ++++ linux-stage/fs/ext4/extents.c 2011-03-14 16:41:04.894178430 +0800 +@@ -1866,9 +1866,7 @@ + while (block < last && block != EXT_MAX_BLOCK) { + num = last - block; + /* find extent for this block */ +- down_read(&EXT4_I(inode)->i_data_sem); + path = ext4_ext_find_extent(inode, block, path); +- up_read(&EXT4_I(inode)->i_data_sem); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; +@@ -1965,6 +1963,7 @@ + + return err; + } ++EXPORT_SYMBOL(ext4_ext_walk_space); + + static void + ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, +@@ -2133,6 +2132,55 @@ + } + + /* ++ * This routine returns max. credits extent tree can consume. 
++ * It should be OK for low-performance paths like ->writepage() ++ * To allow many writing process to fit a single transaction, ++ * caller should calculate credits under truncate_mutex and ++ * pass actual path. ++ */ ++int ext4_ext_calc_credits_for_insert(struct inode *inode, ++ struct ext4_ext_path *path) ++{ ++ int depth, needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ depth = ext_depth(inode); ++ if (le16_to_cpu(path[depth].p_hdr->eh_entries) ++ < le16_to_cpu(path[depth].p_hdr->eh_max)) ++ return 1; ++ } ++ ++ /* ++ * given 32bit logical block (4294967296 blocks), max. tree ++ * can be 4 levels in depth -- 4 * 340^4 == 53453440000. ++ * let's also add one more level for imbalance. ++ */ ++ depth = 5; ++ ++ /* allocation of new data block(s) */ ++ needed = 2; ++ ++ /* ++ * tree can be full, so it'd need to grow in depth: ++ * we need one credit to modify old root, credits for ++ * new root will be added in split accounting ++ */ ++ needed += 1; ++ /* ++ * Index split can happen, we'd need: ++ * allocate intermediate indexes (bitmap + group) ++ * + change two blocks at each level, but root (already included) ++ */ ++ needed += (depth * 2) + (depth * 2); ++ ++ /* any allocation modifies superblock */ ++ needed += 1; ++ ++ return needed; ++} ++ ++/* + * How many index/leaf blocks need to change/allocate to modify nrblocks? + * + * if nrblocks are fit in a single extent (chunk flag is 1), then +@@ -3934,10 +3982,21 @@ + * Walk the extent tree gathering extent information. + * ext4_ext_fiemap_cb will push extents back to user. + */ ++ down_read(&EXT4_I(inode)->i_data_sem); + error = ext4_ext_walk_space(inode, start_blk, len_blks, + ext4_ext_fiemap_cb, fieinfo); ++ up_read(&EXT4_I(inode)->i_data_sem); + } + + return error; + } + ++EXPORT_SYMBOL(ext4_ext_store_pblock); ++EXPORT_SYMBOL(ext4_ext_search_right); ++EXPORT_SYMBOL(ext4_ext_search_left); ++EXPORT_SYMBOL(ext_pblock); ++EXPORT_SYMBOL(ext4_ext_insert_extent); ++EXPORT_SYMBOL(ext4_mb_new_blocks); ++EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert); ++EXPORT_SYMBOL(ext4_mark_inode_dirty); ++ diff --git a/ldiskfs/kernel_patches/patches/ext4-mmp-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-mmp-rhel6.patch new file mode 100644 index 0000000..83777e3 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-mmp-rhel6.patch @@ -0,0 +1,479 @@ +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c 2011-03-03 15:25:02.376539424 +0800 ++++ linux-stage/fs/ext4/super.c 2011-03-05 12:24:02.918774335 +0800 +@@ -40,6 +40,8 @@ + #include + #include + #include ++#include ++#include + + #include "ext4.h" + #include "ext4_jbd2.h" +@@ -700,6 +702,8 @@ + invalidate_bdev(sbi->journal_bdev); + ext4_blkdev_remove(sbi); + } ++ if (sbi->s_mmp_tsk) ++ kthread_stop(sbi->s_mmp_tsk); + sb->s_fs_info = NULL; + /* + * Now that we are completely done shutting down the +@@ -970,6 +974,344 @@ + return 0; + } + ++/* ++ * Write the MMP block using WRITE_SYNC to try to get the block on-disk ++ * faster. ++ */ ++static int write_mmp_block(struct buffer_head *bh) ++{ ++ mark_buffer_dirty(bh); ++ lock_buffer(bh); ++ bh->b_end_io = end_buffer_write_sync; ++ get_bh(bh); ++ submit_bh(WRITE_SYNC, bh); ++ wait_on_buffer(bh); ++ if (unlikely(!buffer_uptodate(bh))) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * Read the MMP block. It _must_ be read from disk and hence we clear the ++ * uptodate flag on the buffer. 
++ */ ++static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, ++ unsigned long mmp_block) ++{ ++ struct mmp_struct *mmp; ++ ++ if (*bh) ++ clear_buffer_uptodate(*bh); ++ ++#if 0 ++ brelse(*bh); ++ ++ *bh = sb_bread(sb, mmp_block); ++#else ++ if (!*bh) ++ *bh = sb_getblk(sb, mmp_block); ++ if (*bh) { ++ get_bh(*bh); ++ lock_buffer(*bh); ++ (*bh)->b_end_io = end_buffer_read_sync; ++ submit_bh(READ_SYNC, *bh); ++ wait_on_buffer(*bh); ++ if (!buffer_uptodate(*bh)) { ++ brelse(*bh); ++ *bh = NULL; ++ } ++ } ++#endif ++ if (!*bh) { ++ ext4_warning(sb, ++ "Error while reading MMP block %lu", mmp_block); ++ return -EIO; ++ } ++ ++ mmp = (struct mmp_struct *)((*bh)->b_data); ++ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++/* ++ * Dump as much information as possible to help the admin. ++ */ ++static void dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, ++ const char *function, const char *msg) ++{ ++ __ext4_warning(sb, function, msg); ++ __ext4_warning(sb, function, "MMP failure info: last update time: %llu, " ++ "last update node: %s, last update device: %s\n", ++ (long long unsigned int)le64_to_cpu(mmp->mmp_time), ++ mmp->mmp_nodename, mmp->mmp_bdevname); ++} ++ ++/* ++ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds ++ */ ++static int kmmpd(void *data) ++{ ++ struct super_block *sb = (struct super_block *) data; ++ struct ext4_super_block *es = EXT4_SB(sb)->s_es; ++ struct buffer_head *bh = NULL; ++ struct mmp_struct *mmp; ++ unsigned long mmp_block; ++ u32 seq = 0; ++ unsigned long failed_writes = 0; ++ int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); ++ unsigned mmp_check_interval; ++ unsigned long last_update_time; ++ unsigned long diff; ++ int retval; ++ ++ mmp_block = le64_to_cpu(es->s_mmp_block); ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ ++ mmp = (struct mmp_struct *)(bh->b_data); ++ mmp->mmp_time = cpu_to_le64(get_seconds()); ++ /* ++ * Start with the higher mmp_check_interval and reduce it if ++ * the MMP block is being updated on time. ++ */ ++ mmp_check_interval = max(5 * mmp_update_interval, ++ EXT4_MMP_MIN_CHECK_INTERVAL); ++ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); ++ bdevname(bh->b_bdev, mmp->mmp_bdevname); ++ ++ memcpy(mmp->mmp_nodename, init_utsname()->sysname, ++ sizeof(mmp->mmp_nodename)); ++ ++ while (!kthread_should_stop()) { ++ if (++seq > EXT4_MMP_SEQ_MAX) ++ seq = 1; ++ ++ mmp->mmp_seq = cpu_to_le32(seq); ++ mmp->mmp_time = cpu_to_le64(get_seconds()); ++ last_update_time = jiffies; ++ ++ retval = write_mmp_block(bh); ++ /* ++ * Don't spew too many error messages. Print one every ++ * (s_mmp_update_interval * 60) seconds. 
++ */ ++ if (retval && (failed_writes % 60) == 0) { ++ ext4_error(sb, ++ "Error writing to MMP block"); ++ failed_writes++; ++ } ++ ++ if (!(le32_to_cpu(es->s_feature_incompat) & ++ EXT4_FEATURE_INCOMPAT_MMP)) { ++ ext4_warning(sb, "kmmpd being stopped " ++ "since MMP feature has been disabled."); ++ EXT4_SB(sb)->s_mmp_tsk = 0; ++ goto failed; ++ } ++ ++ if (sb->s_flags & MS_RDONLY) { ++ ext4_warning(sb, "kmmpd being stopped " ++ "since filesystem has been remounted as " ++ "readonly."); ++ EXT4_SB(sb)->s_mmp_tsk = 0; ++ goto failed; ++ } ++ ++ diff = jiffies - last_update_time; ++ if (diff < mmp_update_interval * HZ) ++ schedule_timeout_interruptible(mmp_update_interval * ++ HZ - diff); ++ ++ /* ++ * We need to make sure that more than mmp_check_interval ++ * seconds have not passed since writing. If that has happened ++ * we need to check if the MMP block is as we left it. ++ */ ++ diff = jiffies - last_update_time; ++ if (diff > mmp_check_interval * HZ) { ++ struct buffer_head *bh_check = NULL; ++ struct mmp_struct *mmp_check; ++ ++ retval = read_mmp_block(sb, &bh_check, mmp_block); ++ if (retval) { ++ EXT4_SB(sb)->s_mmp_tsk = 0; ++ goto failed; ++ } ++ ++ mmp_check = (struct mmp_struct *)(bh_check->b_data); ++ if (mmp->mmp_time != mmp_check->mmp_time || ++ memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, ++ sizeof(mmp->mmp_nodename))) ++ dump_mmp_msg(sb, mmp_check, __func__, ++ "Error while updating MMP info. " ++ "The filesystem seems to have " ++ "been multiply mounted."); ++ ++ put_bh(bh_check); ++ } ++ ++ /* ++ * Adjust the mmp_check_interval depending on how much time ++ * it took for the MMP block to be written. ++ */ ++ mmp_check_interval = max(5 * diff / HZ, ++ (unsigned long) EXT4_MMP_MIN_CHECK_INTERVAL); ++ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); ++ } ++ ++ /* ++ * Unmount seems to be clean. ++ */ ++ mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); ++ mmp->mmp_time = cpu_to_le64(get_seconds()); ++ ++ retval = write_mmp_block(bh); ++ ++failed: ++ brelse(bh); ++ return retval; ++} ++ ++/* ++ * Get a random new sequence number but make sure it is not greater than ++ * EXT4_MMP_SEQ_MAX. ++ */ ++static unsigned int mmp_new_seq(void) ++{ ++ u32 new_seq; ++ ++ do { ++ get_random_bytes(&new_seq, sizeof(u32)); ++ } while (new_seq > EXT4_MMP_SEQ_MAX); ++ ++ return new_seq; ++} ++ ++/* ++ * Protect the filesystem from being mounted more than once. ++ */ ++static int ext4_multi_mount_protect(struct super_block *sb, ++ unsigned long mmp_block) ++{ ++ struct ext4_super_block *es = EXT4_SB(sb)->s_es; ++ struct buffer_head *bh = NULL; ++ struct mmp_struct *mmp = NULL; ++ u32 seq; ++ unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); ++ unsigned int wait_time = 0; ++ int retval; ++ ++ if (mmp_block < le32_to_cpu(es->s_first_data_block) || ++ mmp_block >= ext4_blocks_count(es)) { ++ ext4_warning(sb, ++ "Invalid MMP block in superblock"); ++ goto failed; ++ } ++ ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ ++ mmp = (struct mmp_struct *)(bh->b_data); ++ ++ if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) ++ mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; ++ ++ /* ++ * If check_interval in MMP block is larger, use that instead of ++ * update_interval from the superblock. 
++ */ ++ if (mmp->mmp_check_interval > mmp_check_interval) ++ mmp_check_interval = mmp->mmp_check_interval; ++ ++ seq = le32_to_cpu(mmp->mmp_seq); ++ if (seq == EXT4_MMP_SEQ_CLEAN) ++ goto skip; ++ ++ if (seq == EXT4_MMP_SEQ_FSCK) { ++ dump_mmp_msg(sb, mmp, __func__, ++ "fsck is running on the filesystem"); ++ goto failed; ++ } ++ ++ wait_time = min(mmp_check_interval * 2 + 1, ++ mmp_check_interval + 60); ++ ++ /* Print MMP interval if more than 20 secs. */ ++ if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) ++ ext4_warning(sb, "MMP interval %u higher than " ++ "expected, please wait.\n", wait_time * 2); ++ ++ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { ++ ext4_warning(sb, "MMP startup interrupted, failing " ++ "mount\n"); ++ goto failed; ++ } ++ ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ mmp = (struct mmp_struct *)(bh->b_data); ++ if (seq != le32_to_cpu(mmp->mmp_seq)) { ++ dump_mmp_msg(sb, mmp, __func__, ++ "Device is already active on another node."); ++ goto failed; ++ } ++ ++skip: ++ /* ++ * write a new random sequence number. ++ */ ++ mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); ++ ++ retval = write_mmp_block(bh); ++ if (retval) ++ goto failed; ++ ++ /* ++ * wait for MMP interval and check mmp_seq. ++ */ ++ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { ++ ext4_warning(sb, "MMP startup interrupted, failing " ++ "mount\n"); ++ goto failed; ++ } ++ ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ mmp = (struct mmp_struct *)(bh->b_data); ++ if (seq != le32_to_cpu(mmp->mmp_seq)) { ++ dump_mmp_msg(sb, mmp, __func__, ++ "Device is already active on another node."); ++ goto failed; ++ } ++ ++ /* ++ * Start a kernel thread to update the MMP block periodically. ++ */ ++ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%02x:%02x", ++ MAJOR(sb->s_dev), ++ MINOR(sb->s_dev)); ++ if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { ++ EXT4_SB(sb)->s_mmp_tsk = 0; ++ ext4_warning(sb, "Unable to create kmmpd thread " ++ "for %s.", sb->s_id); ++ goto failed; ++ } ++ ++ brelse(bh); ++ return 0; ++ ++failed: ++ brelse(bh); ++ return 1; ++} ++ + static struct inode *ext4_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) + { +@@ -2816,6 +3158,11 @@ + EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_RECOVER)); + ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && ++ !(sb->s_flags & MS_RDONLY)) ++ if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) ++ goto failed_mount3; ++ + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal! 
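The mount-time side of the MMP patch above reduces to a two-round handshake on the sequence number: observe it, wait one check interval and fail if it moved, then publish a fresh random sequence, wait again and fail if it was overwritten; only then is kmmpd started to keep bumping it. The standalone C sketch below compresses that control flow; read_seq(), write_seq() and wait_interval() are stubs standing in for the buffer-head I/O and schedule_timeout_interruptible(), not kernel interfaces.

#include <stdio.h>
#include <stdlib.h>

#define MMP_SEQ_CLEAN 0xFF4D4D50U      /* clean unmount, as in the patch */
#define MMP_SEQ_FSCK  0xE24D4D50U      /* fsck in progress               */

/* Stub for the on-disk MMP block: only the sequence number is modelled. */
static unsigned int disk_seq = MMP_SEQ_CLEAN;

static unsigned int read_seq(void)            { return disk_seq; }
static void         write_seq(unsigned int s) { disk_seq = s; }
static void         wait_interval(void)       { /* sleep one check interval */ }

/* Returns 0 when the mount may proceed, -1 when another node owns the device. */
static int mmp_try_mount(void)
{
	unsigned int seq = read_seq();

	if (seq == MMP_SEQ_FSCK)
		return -1;                       /* fsck owns the filesystem */

	if (seq != MMP_SEQ_CLEAN) {
		wait_interval();                 /* round 1: watch for a live updater */
		if (read_seq() != seq)
			return -1;               /* someone else bumped the sequence */
	}

	seq = (unsigned int)rand() % 0xE24D4D4FU; /* stay below EXT4_MMP_SEQ_MAX */
	write_seq(seq);

	wait_interval();                         /* round 2: make sure our seq survived */
	if (read_seq() != seq)
		return -1;

	/* here the kernel code starts kmmpd to keep updating the sequence */
	return 0;
}

int main(void)
{
	printf("mount %s\n", mmp_try_mount() == 0 ? "allowed" : "refused");
	return 0;
}

The running half of the scheme is the kmmpd loop shown earlier in the same patch, which repeats the write-then-verify step every s_mmp_update_interval seconds and widens mmp_check_interval if the writes take longer than expected.
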
+@@ -3052,6 +3399,8 @@ + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); ++ if (sbi->s_mmp_tsk) ++ kthread_stop(sbi->s_mmp_tsk); + failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); +@@ -3560,7 +3909,7 @@ + struct ext4_mount_options old_opts; + ext4_group_t g; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; +- int err; ++ int err = 0; + #ifdef CONFIG_QUOTA + int i; + #endif +@@ -3682,6 +4031,13 @@ + goto restore_opts; + if (!ext4_setup_super(sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ++ EXT4_FEATURE_INCOMPAT_MMP)) ++ if (ext4_multi_mount_protect(sb, ++ le64_to_cpu(es->s_mmp_block))) { ++ err = -EROFS; ++ goto restore_opts; ++ } + } + } + ext4_setup_system_zone(sb); +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h 2011-03-03 15:25:02.507538421 +0800 ++++ linux-stage/fs/ext4/ext4.h 2011-03-05 12:25:16.343986732 +0800 +@@ -894,7 +894,7 @@ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ +- __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ ++ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ +@@ -1041,6 +1041,9 @@ + + /* workqueue for dio unwritten */ + struct workqueue_struct *dio_unwritten_wq; ++ ++ /* Kernel thread for multiple mount protection */ ++ struct task_struct *s_mmp_tsk; + }; + + static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +@@ -1177,7 +1180,8 @@ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ +- EXT4_FEATURE_INCOMPAT_FLEX_BG) ++ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ ++ EXT4_FEATURE_INCOMPAT_MMP) + #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ +@@ -1384,6 +1388,34 @@ + extern struct proc_dir_entry *ext4_proc_root; + + /* ++ * This structure will be used for multiple mount protection. It will be ++ * written into the block number saved in the s_mmp_block field in the ++ * superblock. Programs that check MMP should assume that if ++ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe ++ * to use the filesystem, regardless of how old the timestamp is. ++ */ ++#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ ++#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ ++#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ ++#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ ++ ++struct mmp_struct { ++ __le32 mmp_magic; ++ __le32 mmp_seq; ++ __le64 mmp_time; ++ char mmp_nodename[64]; ++ char mmp_bdevname[32]; ++ __le16 mmp_check_interval; ++ __le16 mmp_pad1; ++ __le32 mmp_pad2[227]; ++}; ++ ++/* ++ * Minimum interval for MMP checking in seconds. 
++ */ ++#define EXT4_MMP_MIN_CHECK_INTERVAL 5 ++ ++/* + * Function prototypes + */ + diff --git a/ldiskfs/kernel_patches/patches/ext4-osd-iam-exports-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-osd-iam-exports-rhel6.patch new file mode 100644 index 0000000..3bae32f --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-osd-iam-exports-rhel6.patch @@ -0,0 +1,64 @@ +diff -rupN 2.6.27.21_2/fs/ext4/ext4.h 2.6.27.21_3/fs/ext4/ext4.h +--- 2.6.27.21_2/fs/ext4/ext4.h 2009-07-17 12:19:59.000000000 +0530 ++++ 2.6.27.21_3/fs/ext4/ext4.h 2009-07-17 12:38:59.000000000 +0530 +@@ -1181,6 +1181,9 @@ extern int ext4_orphan_add(handle_t *, s + #define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir) + extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, + struct inode *inode); ++extern struct buffer_head *ext4_append(handle_t *handle, ++ struct inode *inode, ++ ext4_lblk_t *block, int *err); + + /* resize.c */ + extern int ext4_group_add(struct super_block *sb, +diff -rupN 2.6.27.21_2/fs/ext4/hash.c 2.6.27.21_3/fs/ext4/hash.c +--- 2.6.27.21_2/fs/ext4/hash.c 2009-07-17 12:12:56.000000000 +0530 ++++ 2.6.27.21_3/fs/ext4/hash.c 2009-07-17 12:40:22.000000000 +0530 +@@ -9,6 +9,7 @@ + * License. + */ + ++#include + #include + #include + #include +@@ -206,3 +207,4 @@ int ext4fs_dirhash(const char *name, int + hinfo->minor_hash = minor_hash; + return 0; + } ++EXPORT_SYMBOL(ext4fs_dirhash); +diff -rupN 2.6.27.21_2/fs/ext4/namei.c 2.6.27.21_3/fs/ext4/namei.c +--- 2.6.27.21_2/fs/ext4/namei.c 2009-07-17 12:23:51.000000000 +0530 ++++ 2.6.27.21_3/fs/ext4/namei.c 2009-07-17 12:37:59.000000000 +0530 +@@ -51,9 +51,9 @@ + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + +-static struct buffer_head *ext4_append(handle_t *handle, +- struct inode *inode, +- ext4_lblk_t *block, int *err) ++struct buffer_head *ext4_append(handle_t *handle, ++ struct inode *inode, ++ ext4_lblk_t *block, int *err) + { + struct buffer_head *bh; + struct ext4_inode_info *ei = EXT4_I(inode); +@@ -72,6 +72,7 @@ static struct buffer_head *ext4_append(h + up(&ei->i_append_sem); + return bh; + } ++EXPORT_SYMBOL(ext4_append); + + #ifndef assert + #define assert(test) J_ASSERT(test) +diff -rupN 2.6.27.21_2/fs/ext4/super.c 2.6.27.21_3/fs/ext4/super.c +--- 2.6.27.21_2/fs/ext4/super.c 2009-07-17 12:12:57.000000000 +0530 ++++ 2.6.27.21_3/fs/ext4/super.c 2009-07-17 12:40:52.000000000 +0530 +@@ -377,6 +377,7 @@ void __ext4_std_error(struct super_block + + ext4_handle_error(sb); + } ++EXPORT_SYMBOL(__ext4_std_error); + + /* + * ext4_abort is a much stronger failure handler than ext4_error. 
The diff --git a/ldiskfs/kernel_patches/patches/ext4-osd-iop-common-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-osd-iop-common-rhel6.patch new file mode 100644 index 0000000..228c1c4 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-osd-iop-common-rhel6.patch @@ -0,0 +1,226 @@ +Index: linux-2.6.32.i386/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.32.i386.orig/fs/ext4/ext4.h 2010-04-16 04:57:39.000000000 +0530 ++++ linux-2.6.32.i386/fs/ext4/ext4.h 2010-04-16 05:27:02.000000000 +0530 +@@ -1512,6 +1512,19 @@ + extern int ext4_orphan_del(handle_t *, struct inode *); + extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); ++extern struct inode *ext4_create_inode(handle_t *handle, ++ struct inode * dir, int mode); ++extern int ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode); ++extern int ext4_delete_entry(handle_t *handle, struct inode * dir, ++ struct ext4_dir_entry_2 * de_del, ++ struct buffer_head * bh); ++extern struct buffer_head * ext4_find_entry(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 ** res_dir); ++#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir) ++extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, ++ struct inode *inode); + + /* resize.c */ + extern int ext4_group_add(struct super_block *sb, +Index: linux-2.6.32.i386/fs/ext4/namei.c +=================================================================== +--- linux-2.6.32.i386.orig/fs/ext4/namei.c 2010-04-16 04:57:39.000000000 +0530 ++++ linux-2.6.32.i386/fs/ext4/namei.c 2010-04-16 05:28:25.000000000 +0530 +@@ -24,6 +24,7 @@ + * Theodore Ts'o, 2002 + */ + ++#include + #include + #include + #include +@@ -902,9 +903,9 @@ + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ +-static struct buffer_head * ext4_find_entry (struct inode *dir, +- const struct qstr *d_name, +- struct ext4_dir_entry_2 ** res_dir) ++struct buffer_head * ext4_find_entry(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 ** res_dir) + { + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; +@@ -1011,6 +1012,7 @@ + brelse(bh_use[ra_ptr]); + return ret; + } ++EXPORT_SYMBOL(ext4_find_entry); + + static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, int *err) +@@ -1538,8 +1540,8 @@ + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. 
+ */ +-static int ext4_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++int ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) + { + struct inode *dir = dentry->d_parent->d_inode; + struct buffer_head *bh; +@@ -1588,6 +1590,7 @@ + brelse(bh); + return retval; + } ++EXPORT_SYMBOL(ext4_add_entry); + + /* + * Returns 0 for success, or a negative error value +@@ -1728,10 +1731,10 @@ + * ext4_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +-static int ext4_delete_entry(handle_t *handle, +- struct inode *dir, +- struct ext4_dir_entry_2 *de_del, +- struct buffer_head *bh) ++int ext4_delete_entry(handle_t *handle, ++ struct inode *dir, ++ struct ext4_dir_entry_2 *de_del, ++ struct buffer_head *bh) + { + struct ext4_dir_entry_2 *de, *pde; + unsigned int blocksize = dir->i_sb->s_blocksize; +@@ -1766,7 +1769,7 @@ + } + return -ENOENT; + } +- ++EXPORT_SYMBOL(ext4_delete_entry); + /* + * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, + * since this indicates that nlinks count was previously 1. +@@ -1831,6 +1834,26 @@ + return inum; + } + ++struct inode * ext4_create_inode(handle_t *handle, struct inode * dir, int mode) ++{ ++ struct inode *inode; ++ ++ inode = ext4_new_inode(handle, dir, mode, 0, 0); ++ if (!IS_ERR(inode)) { ++ if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) { ++#ifdef CONFIG_LDISKFS_FS_XATTR ++ inode->i_op = &ext4_special_inode_operations; ++#endif ++ } else { ++ inode->i_op = &ext4_file_inode_operations; ++ inode->i_fop = &ext4_file_operations; ++ ext4_set_aops(inode); ++ } ++ } ++ return inode; ++} ++EXPORT_SYMBOL(ext4_create_inode); ++ + /* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it +@@ -1905,40 +1928,33 @@ + return err; + } + +-static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) ++/* Initialize @inode as a subdirectory of @dir, and add the ++ * "." and ".." entries into the first directory block. 
*/ ++int ext4_add_dot_dotdot(handle_t *handle, struct inode * dir, ++ struct inode *inode) + { +- handle_t *handle; +- struct inode *inode; +- struct buffer_head *dir_block; +- struct ext4_dir_entry_2 *de; ++ struct buffer_head * dir_block; ++ struct ext4_dir_entry_2 * de; + unsigned int blocksize = dir->i_sb->s_blocksize; +- int err, retries = 0; +- +- if (EXT4_DIR_LINK_MAX(dir)) +- return -EMLINK; ++ int err = 0; + +-retry: +- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + +- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + +- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + +- inode = ext4_new_inode(handle, dir, S_IFDIR | mode, +- &dentry->d_name, 0); +- err = PTR_ERR(inode); +- if (IS_ERR(inode)) +- goto out_stop; + + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; + inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; + dir_block = ext4_bread(handle, inode, 0, 1, &err); +- if (!dir_block) +- goto out_clear_inode; ++ if (!dir_block) { ++ clear_nlink(inode); ++ ext4_mark_inode_dirty(handle, inode); ++ iput (inode); ++ goto get_out; ++ } + BUFFER_TRACE(dir_block, "get_write_access"); + ext4_journal_get_write_access(handle, dir_block); + de = (struct ext4_dir_entry_2 *) dir_block->b_data; +@@ -1960,9 +1976,43 @@ + ext4_handle_dirty_metadata(handle, dir, dir_block); + brelse(dir_block); + ext4_mark_inode_dirty(handle, inode); ++get_out: ++ return err; ++} ++EXPORT_SYMBOL(ext4_add_dot_dotdot); ++ ++ ++static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) ++{ ++ handle_t *handle; ++ struct inode *inode; ++ int err, retries = 0; ++ ++ if (EXT4_DIR_LINK_MAX(dir)) ++ return -EMLINK; ++ ++retry: ++ handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + ++ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + ++ 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_DIRSYNC(dir)) ++ handle->h_sync = 1; ++ ++ inode = ext4_new_inode(handle, dir, S_IFDIR | mode, ++ &dentry->d_name, ext4_dentry_goal(dir->i_sb, dentry)); ++ err = PTR_ERR(inode); ++ if (IS_ERR(inode)) ++ goto out_stop; ++ ++ err = ext4_add_dot_dotdot(handle, dir, inode); ++ if (err) ++ goto out_stop; ++ + err = ext4_add_entry(handle, dentry, inode); + if (err) { +-out_clear_inode: + clear_nlink(inode); + unlock_new_inode(inode); + ext4_mark_inode_dirty(handle, inode); diff --git a/ldiskfs/kernel_patches/patches/ext4-pdir-fix-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-pdir-fix-rhel6.patch new file mode 100644 index 0000000..fc7c791 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-pdir-fix-rhel6.patch @@ -0,0 +1,62 @@ +Index: linux-2.6.32.i386/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.32.i386.orig/fs/ext4/ext4.h 2010-04-16 03:39:11.000000000 +0530 ++++ linux-2.6.32.i386/fs/ext4/ext4.h 2010-04-16 04:27:41.000000000 +0530 +@@ -29,6 +29,7 @@ + #ifndef _EXT4_H + #define _EXT4_H + ++#include + #include + #include + #include +@@ -621,6 +622,10 @@ + ext4_fsblk_t i_file_acl; + __u32 i_dtime; + ++ /* following fields for parallel directory operations -bzzz */ ++ struct dynlock i_htree_lock; ++ struct semaphore i_append_sem; ++ + /* + * i_block_group is the number of the block group which contains + * this file's inode. 
Constant across the lifetime of the inode, +Index: linux-2.6.32.i386/fs/ext4/namei.c +=================================================================== +--- linux-2.6.32.i386.orig/fs/ext4/namei.c 2010-04-15 07:42:15.000000000 +0530 ++++ linux-2.6.32.i386/fs/ext4/namei.c 2010-04-16 04:26:03.000000000 +0530 +@@ -54,6 +54,11 @@ + ext4_lblk_t *block, int *err) + { + struct buffer_head *bh; ++ struct ext4_inode_info *ei = EXT4_I(inode); ++ ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&ei->i_append_sem); + + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + +@@ -66,7 +71,9 @@ + brelse(bh); + bh = NULL; + } ++ ei->i_disksize = inode->i_size; + } ++ up(&ei->i_append_sem); + return bh; + } + +Index: linux-2.6.32.i386/fs/ext4/super.c +=================================================================== +--- linux-2.6.32.i386.orig/fs/ext4/super.c 2010-04-16 03:39:11.000000000 +0530 ++++ linux-2.6.32.i386/fs/ext4/super.c 2010-04-16 04:26:03.000000000 +0530 +@@ -700,6 +700,8 @@ + + ei->vfs_inode.i_version = 1; + ei->vfs_inode.i_data.writeback_index = 0; ++ dynlock_init(&ei->i_htree_lock); ++ sema_init(&ei->i_append_sem, 1); + memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); + INIT_LIST_HEAD(&ei->i_prealloc_list); + spin_lock_init(&ei->i_prealloc_lock); diff --git a/ldiskfs/kernel_patches/patches/ext4-prealloc-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-prealloc-rhel6.patch new file mode 100644 index 0000000..dd3252d --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-prealloc-rhel6.patch @@ -0,0 +1,366 @@ +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h 2011-03-11 14:17:02.000000000 +0800 ++++ linux-stage/fs/ext4/ext4.h 2011-03-11 14:20:08.269063193 +0800 +@@ -999,11 +999,14 @@ + + /* tunables */ + unsigned long s_stripe; +- unsigned int s_mb_stream_request; ++ unsigned long s_mb_small_req; ++ unsigned long s_mb_large_req; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; ++ unsigned long *s_mb_prealloc_table; ++ unsigned long s_mb_prealloc_table_size; + unsigned int s_mb_group_prealloc; + unsigned int s_max_writeback_mb_bump; + /* where last allocation was done - for stream allocation */ +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c 2011-03-11 14:03:32.000000000 +0800 ++++ linux-stage/fs/ext4/mballoc.c 2011-03-11 14:44:49.106543493 +0800 +@@ -1823,6 +1823,26 @@ + } + } + ++static void ext4_mb_prealloc_table_add(struct ext4_sb_info *sbi, int value) ++{ ++ int i; ++ ++ if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group)) ++ return; ++ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) { ++ if (sbi->s_mb_prealloc_table[i] == 0) { ++ sbi->s_mb_prealloc_table[i] = value; ++ return; ++ } ++ ++ /* they should add values in order */ ++ if (value <= sbi->s_mb_prealloc_table[i]) ++ return; ++ } ++} ++ ++ + static int ext4_mb_good_group(struct ext4_allocation_context *ac, + ext4_group_t group, int cr) + { +@@ -2173,6 +2193,80 @@ + .show = ext4_mb_seq_groups_show, + }; + ++#define EXT4_MB_PREALLOC_TABLE "prealloc_table" ++ ++static int ext4_mb_prealloc_table_proc_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ struct ext4_sb_info *sbi = data; ++ int len = 0; ++ int i; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; 
++ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) ++ len += sprintf(page + len, "%ld ", ++ sbi->s_mb_prealloc_table[i]); ++ len += sprintf(page + len, "\n"); ++ ++ *start = page; ++ return len; ++} ++ ++static int ext4_mb_prealloc_table_proc_write(struct file *file, ++ const char __user *buf, ++ unsigned long cnt, void *data) ++{ ++ struct ext4_sb_info *sbi = data; ++ unsigned long value; ++ unsigned long prev = 0; ++ char str[128]; ++ char *cur; ++ char *end; ++ unsigned long *new_table; ++ int num = 0; ++ int i = 0; ++ ++ if (cnt >= sizeof(str)) ++ return -EINVAL; ++ if (copy_from_user(str, buf, cnt)) ++ return -EFAULT; ++ ++ num = 0; ++ cur = str; ++ end = str + cnt; ++ while (cur < end) { ++ while ((cur < end) && (*cur == ' ')) cur++; ++ value = simple_strtol(cur, &cur, 0); ++ if (value == 0) ++ break; ++ if (value <= prev) ++ return -EINVAL; ++ prev = value; ++ num++; ++ } ++ ++ new_table = kmalloc(num * sizeof(*new_table), GFP_KERNEL); ++ if (new_table == NULL) ++ return -ENOMEM; ++ kfree(sbi->s_mb_prealloc_table); ++ memset(new_table, 0, num * sizeof(*new_table)); ++ sbi->s_mb_prealloc_table = new_table; ++ sbi->s_mb_prealloc_table_size = num; ++ cur = str; ++ end = str + cnt; ++ while (cur < end && i < num) { ++ while ((cur < end) && (*cur == ' ')) cur++; ++ value = simple_strtol(cur, &cur, 0); ++ ext4_mb_prealloc_table_add(sbi, value); ++ i++; ++ } ++ ++ return cnt; ++} ++ + static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) + { + struct super_block *sb = PDE(inode)->data; +@@ -2411,12 +2505,56 @@ + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; +- sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; +- sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; ++ ++ if (sbi->s_stripe == 0) { ++ sbi->s_mb_prealloc_table_size = 10; ++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long); ++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ memset(sbi->s_mb_prealloc_table, 0, i); ++ ++ ext4_mb_prealloc_table_add(sbi, 4); ++ ext4_mb_prealloc_table_add(sbi, 8); ++ ext4_mb_prealloc_table_add(sbi, 16); ++ ext4_mb_prealloc_table_add(sbi, 32); ++ ext4_mb_prealloc_table_add(sbi, 64); ++ ext4_mb_prealloc_table_add(sbi, 128); ++ ext4_mb_prealloc_table_add(sbi, 256); ++ ext4_mb_prealloc_table_add(sbi, 512); ++ ext4_mb_prealloc_table_add(sbi, 1024); ++ ext4_mb_prealloc_table_add(sbi, 2048); ++ ++ sbi->s_mb_small_req = 256; ++ sbi->s_mb_large_req = 1024; ++ sbi->s_mb_group_prealloc = 512; ++ } else { ++ sbi->s_mb_prealloc_table_size = 3; ++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long); ++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ memset(sbi->s_mb_prealloc_table, 0, i); ++ ++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe); ++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 2); ++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 4); ++ ++ sbi->s_mb_small_req = sbi->s_stripe; ++ sbi->s_mb_large_req = sbi->s_stripe * 8; ++ sbi->s_mb_group_prealloc = sbi->s_stripe * 4; ++ } + + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); + if (sbi->s_locality_groups == NULL) { ++ kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + 
kfree(sbi->s_mb_maxs); + return -ENOMEM; +@@ -2430,9 +2568,18 @@ + spin_lock_init(&lg->lg_prealloc_lock); + } + +- if (sbi->s_proc) ++ if (sbi->s_proc) { ++ struct proc_dir_entry *p; + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_fops, sb); ++ p = create_proc_entry(EXT4_MB_PREALLOC_TABLE, S_IFREG | ++ S_IRUGO | S_IWUSR, sbi->s_proc); ++ if (p) { ++ p->data = sbi; ++ p->read_proc = ext4_mb_prealloc_table_proc_read; ++ p->write_proc = ext4_mb_prealloc_table_proc_write; ++ } ++ } + + if (sbi->s_journal) + sbi->s_journal->j_commit_callback = release_blocks_on_commit; +@@ -2512,8 +2659,10 @@ + } + + free_percpu(sbi->s_locality_groups); +- if (sbi->s_proc) ++ if (sbi->s_proc) { + remove_proc_entry("mb_groups", sbi->s_proc); ++ remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc); ++ } + + return 0; + } +@@ -2807,11 +2956,12 @@ + ext4_mb_normalize_request(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) + { +- int bsbits, max; ++ int bsbits, i, wind; + ext4_lblk_t end; +- loff_t size, orig_size, start_off; ++ loff_t size, orig_size; + ext4_lblk_t start, orig_start; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); ++ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_prealloc_space *pa; + + /* do normalize only data requests, metadata requests +@@ -2841,49 +2991,35 @@ + size = size << bsbits; + if (size < i_size_read(ac->ac_inode)) + size = i_size_read(ac->ac_inode); ++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits; + +- /* max size of free chunks */ +- max = 2 << bsbits; ++ start = wind = 0; + +-#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ +- (req <= (size) || max <= (chunk_size)) ++ /* let's choose preallocation window depending on file size */ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) { ++ if (size <= sbi->s_mb_prealloc_table[i]) { ++ wind = sbi->s_mb_prealloc_table[i]; ++ break; ++ } ++ } ++ size = wind; + +- /* first, try to predict filesize */ +- /* XXX: should this table be tunable? 
*/ +- start_off = 0; +- if (size <= 16 * 1024) { +- size = 16 * 1024; +- } else if (size <= 32 * 1024) { +- size = 32 * 1024; +- } else if (size <= 64 * 1024) { +- size = 64 * 1024; +- } else if (size <= 128 * 1024) { +- size = 128 * 1024; +- } else if (size <= 256 * 1024) { +- size = 256 * 1024; +- } else if (size <= 512 * 1024) { +- size = 512 * 1024; +- } else if (size <= 1024 * 1024) { +- size = 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (21 - bsbits)) << 21; +- size = 2 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (22 - bsbits)) << 22; +- size = 4 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, +- (8<<20)>>bsbits, max, 8 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (23 - bsbits)) << 23; +- size = 8 * 1024 * 1024; +- } else { +- start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; +- size = ac->ac_o_ex.fe_len << bsbits; ++ if (wind == 0) { ++ __u64 tstart, tend; ++ /* file is quite large, we now preallocate with ++ * the biggest configured window with regart to ++ * logical offset */ ++ wind = sbi->s_mb_prealloc_table[i - 1]; ++ tstart = ac->ac_o_ex.fe_logical; ++ do_div(tstart, wind); ++ start = tstart * wind; ++ tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1; ++ do_div(tend, wind); ++ tend = tend * wind + wind; ++ size = tend - start; + } +- orig_size = size = size >> bsbits; +- orig_start = start = start_off >> bsbits; ++ orig_size = size; ++ orig_start = start; + + /* don't cover already allocated blocks in selected range */ + if (ar->pleft && start <= ar->lleft) { +@@ -2955,7 +3091,6 @@ + } + BUG_ON(start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical); +- BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + + /* now prepare goal request */ + +@@ -3939,11 +4074,19 @@ + + /* don't use group allocation for large files */ + size = max(size, isize); +- if (size > sbi->s_mb_stream_request) { ++ if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) || ++ (size >= sbi->s_mb_large_req)) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + ++ /* ++ * request is so large that we don't care about ++ * streaming - it overweights any possible seek ++ */ ++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req) ++ return; ++ + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. 
The reason for having +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c 2011-03-11 14:16:56.000000000 +0800 ++++ linux-stage/fs/ext4/super.c 2011-03-11 14:19:24.664467626 +0800 +@@ -2632,7 +2632,8 @@ + EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +-EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); ++EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req); ++EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req); + EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); + EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); + +@@ -2647,7 +2648,8 @@ + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), +- ATTR_LIST(mb_stream_req), ++ ATTR_LIST(mb_small_req), ++ ATTR_LIST(mb_large_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), + NULL, diff --git a/ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-rhel6.patch new file mode 100644 index 0000000..fecb1a7 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-print-inum-in-htree-warning-rhel6.patch @@ -0,0 +1,15 @@ +Index: linux-stage/fs/ext4/namei.c +=================================================================== +--- linux-stage.orig/fs/ext4/namei.c ++++ linux-stage/fs/ext4/namei.c +@@ -371,8 +371,8 @@ dx_probe(const struct qstr *d_name, stru + if (root->info.hash_version != DX_HASH_TEA && + root->info.hash_version != DX_HASH_HALF_MD4 && + root->info.hash_version != DX_HASH_LEGACY) { +- ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", +- root->info.hash_version); ++ ext4_warning(dir->i_sb, "Unrecognised inode hash code %d for directory " ++ "#%lu", root->info.hash_version, dir->i_ino); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; diff --git a/ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-rhel6.patch new file mode 100644 index 0000000..dec376f --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-wantedi-2.6-rhel6.patch @@ -0,0 +1,42 @@ +Index: linux-2.6.32.i386/fs/ext4/namei.c +=================================================================== +--- linux-2.6.32.i386.orig/fs/ext4/namei.c 2010-04-07 00:16:32.000000000 +0530 ++++ linux-2.6.32.i386/fs/ext4/namei.c 2010-04-07 00:17:09.000000000 +0530 +@@ -144,6 +144,17 @@ + u16 size; + }; + ++/* ++ * dentry_param used by ext4_new_inode_wantedi() ++ */ ++#define LVFS_DENTRY_PARAM_MAGIC 20070216UL ++struct lvfs_dentry_params ++{ ++ unsigned long ldp_inum; ++ unsigned long ldp_flags; ++ u32 ldp_magic; ++}; ++ + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); + static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); + static inline unsigned dx_get_hash(struct dx_entry *entry); +@@ -1751,6 +1762,19 @@ + return err; + } + ++static unsigned ext4_dentry_goal(struct super_block *sb, struct dentry *dentry) ++{ ++ unsigned inum = EXT4_SB(sb)->s_inode_goal; ++ ++ if (dentry->d_fsdata != NULL) { ++ struct lvfs_dentry_params *param = dentry->d_fsdata; ++ ++ if (param->ldp_magic == LVFS_DENTRY_PARAM_MAGIC) ++ inum = param->ldp_inum; ++ } ++ return inum; ++} ++ + /* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it diff --git 
a/ldiskfs/kernel_patches/patches/ext4_data_in_dirent-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4_data_in_dirent-rhel6.patch new file mode 100644 index 0000000..9e68778 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4_data_in_dirent-rhel6.patch @@ -0,0 +1,503 @@ +this patch implements feature which allows ext4 fs users (e.g. Lustre) +to store data in ext4 dirent. +data is stored in ext4 dirent after file-name, this space is accounted +in de->rec_len. flag EXT4_DIRENT_LUFID added to d_type if extra data +is present. + +make use of dentry->d_fsdata to pass fid to ext4. so no +changes in ext4_add_entry() interface required. + +Index: linux-2.6.32.i386/fs/ext4/dir.c +=================================================================== +--- linux-2.6.32.i386.orig/fs/ext4/dir.c 2009-12-03 09:21:21.000000000 +0530 ++++ linux-2.6.32.i386/fs/ext4/dir.c 2010-04-16 06:25:43.000000000 +0530 +@@ -53,11 +53,18 @@ + + static unsigned char get_dtype(struct super_block *sb, int filetype) + { ++ int fl_index = filetype & EXT4_FT_MASK; ++ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || +- (filetype >= EXT4_FT_MAX)) ++ (fl_index >= EXT4_FT_MAX)) + return DT_UNKNOWN; + +- return (ext4_filetype_table[filetype]); ++ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_DIRDATA)) ++ return (ext4_filetype_table[fl_index]); ++ ++ return (ext4_filetype_table[fl_index]) | ++ (filetype & EXT4_DIRENT_LUFID); ++ + } + + +@@ -70,11 +77,11 @@ + const int rlen = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); + +- if (rlen < EXT4_DIR_REC_LEN(1)) ++ if (rlen < __EXT4_DIR_REC_LEN(1)) + error_msg = "rec_len is smaller than minimal"; + else if (rlen % 4 != 0) + error_msg = "rec_len % 4 != 0"; +- else if (rlen < EXT4_DIR_REC_LEN(de->name_len)) ++ else if (rlen < EXT4_DIR_REC_LEN(de)) + error_msg = "rec_len is too small for name_len"; + else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + error_msg = "directory entry across blocks"; +@@ -179,7 +186,7 @@ + * failure will be detected in the + * dirent test below. 
*/ + if (ext4_rec_len_from_disk(de->rec_len, +- sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) ++ sb->s_blocksize) < __EXT4_DIR_REC_LEN(1)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); +@@ -342,12 +349,17 @@ + struct fname *fname, *new_fn; + struct dir_private_info *info; + int len; ++ int extra_data = 1; + + info = (struct dir_private_info *) dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ +- len = sizeof(struct fname) + dirent->name_len + 1; ++ if (dirent->file_type & EXT4_DIRENT_LUFID) ++ extra_data = ext4_get_dirent_data_len(dirent); ++ ++ len = sizeof(struct fname) + dirent->name_len + extra_data; ++ + new_fn = kzalloc(len, GFP_KERNEL); + if (!new_fn) + return -ENOMEM; +@@ -356,7 +368,7 @@ + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = dirent->name_len; + new_fn->file_type = dirent->file_type; +- memcpy(new_fn->name, dirent->name, dirent->name_len); ++ memcpy(new_fn->name, dirent->name, dirent->name_len + extra_data); + new_fn->name[dirent->name_len] = 0; + + while (*p) { +Index: linux-2.6.32.i386/fs/ext4/ext4.h +=================================================================== +--- linux-2.6.32.i386.orig/fs/ext4/ext4.h 2010-04-16 06:10:06.000000000 +0530 ++++ linux-2.6.32.i386/fs/ext4/ext4.h 2010-04-16 06:27:40.000000000 +0530 +@@ -1135,6 +1135,7 @@ + #define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 + #define EXT4_FEATURE_INCOMPAT_MMP 0x0100 + #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 ++#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 + + #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ +@@ -1143,7 +1144,9 @@ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ +- EXT4_FEATURE_INCOMPAT_MMP) ++ EXT4_FEATURE_INCOMPAT_MMP| \ ++ EXT4_FEATURE_INCOMPAT_DIRDATA) ++ + #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ +@@ -1225,6 +1228,43 @@ + #define EXT4_FT_SYMLINK 7 + + #define EXT4_FT_MAX 8 ++#define EXT4_FT_MASK 0xf ++ ++#if EXT4_FT_MAX > EXT4_FT_MASK ++#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK" ++#endif ++ ++/* ++ * d_type has 4 unused bits, so it can hold four types data. these different ++ * type of data (e.g. lustre data, high 32 bits of 64-bit inode number) can be ++ * stored, in flag order, after file-name in ext4 dirent. ++*/ ++/* ++ * this flag is added to d_type if ext4 dirent has extra data after ++ * filename. this data length is variable and length is stored in first byte ++ * of data. data start after filename NUL byte. ++ * This is used by Lustre FS. 
++ */ ++#define EXT4_DIRENT_LUFID 0x10 ++ ++#define EXT4_LUFID_MAGIC 0xAD200907UL ++struct ext4_dentry_param { ++ __u32 edp_magic; /* EXT4_LUFID_MAGIC */ ++ char edp_len; /* size of edp_data in bytes */ ++ char edp_data[0]; /* packed array of data */ ++} __attribute__((packed)); ++ ++static inline unsigned char *ext4_dentry_get_data(struct super_block *sb, ++ struct ext4_dentry_param* p) ++ ++{ ++ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_DIRDATA)) ++ return NULL; ++ if (p && p->edp_magic == EXT4_LUFID_MAGIC) ++ return &p->edp_len; ++ else ++ return NULL; ++} + + /* + * EXT4_DIR_PAD defines the directory entries boundaries +@@ -1233,8 +1273,11 @@ + */ + #define EXT4_DIR_PAD 4 + #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +-#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ ++#define __EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) ++#define EXT4_DIR_REC_LEN(de) (__EXT4_DIR_REC_LEN(de->name_len +\ ++ ext4_get_dirent_data_len(de))) ++ + #define EXT4_MAX_REC_LEN ((1<<16)-1) + + /* +@@ -1524,7 +1567,7 @@ + struct ext4_dir_entry_2 ** res_dir); + #define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir) + extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, +- struct inode *inode); ++ struct inode *inode, const void *, const void *); + extern struct buffer_head *ext4_append(handle_t *handle, + struct inode *inode, + ext4_lblk_t *block, int *err); +@@ -1851,6 +1894,28 @@ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); + } + ++/* ++ * Compute the total directory entry data length. ++ * This includes the filename and an implicit NUL terminator (always present), ++ * and optional extensions. Each extension has a bit set in the high 4 bits of ++ * de->file_type, and the extension length is the first byte in each entry. 
++ */ ++static inline int ext4_get_dirent_data_len(struct ext4_dir_entry_2 *de) ++{ ++ char *len = de->name + de->name_len + 1 /* NUL terminator */; ++ int dlen = 0; ++ __u8 extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4; ++ ++ while (extra_data_flags) { ++ if (extra_data_flags & 1) { ++ dlen += *len + (dlen == 0); ++ len += *len; ++ } ++ extra_data_flags >>= 1; ++ } ++ return dlen; ++} ++ + #endif /* __KERNEL__ */ + + #endif /* _EXT4_H */ +Index: linux-2.6.32.i386/fs/ext4/namei.c +=================================================================== +--- linux-2.6.32.i386.orig/fs/ext4/namei.c 2010-04-16 05:47:41.000000000 +0530 ++++ linux-2.6.32.i386/fs/ext4/namei.c 2010-04-16 06:40:38.000000000 +0530 +@@ -170,7 +170,8 @@ + static unsigned dx_get_limit(struct dx_entry *entries); + static void dx_set_count(struct dx_entry *entries, unsigned value); + static void dx_set_limit(struct dx_entry *entries, unsigned value); +-static unsigned dx_root_limit(struct inode *dir, unsigned infosize); ++static inline unsigned dx_root_limit(__u32 blocksize, ++ struct ext4_dir_entry_2 *dot_de, unsigned infosize); + static unsigned dx_node_limit(struct inode *dir); + static struct dx_frame *dx_probe(const struct qstr *d_name, + struct inode *dir, +@@ -237,11 +238,12 @@ + */ + struct dx_root_info * dx_get_dx_info(struct ext4_dir_entry_2 *de) + { +- /* get dotdot first */ +- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1)); ++ BUG_ON(de->name_len != 1); ++ /* get dotdot first */ ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(de)); + +- /* dx root info is after dotdot entry */ +- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2)); ++ /* dx root info is after dotdot entry */ ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(de)); + + return (struct dx_root_info *) de; + } +@@ -286,16 +288,23 @@ + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); + } + +-static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) ++static inline unsigned dx_root_limit(__u32 blocksize, ++ struct ext4_dir_entry_2 *dot_de, unsigned infosize) + { +- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - +- EXT4_DIR_REC_LEN(2) - infosize; ++ struct ext4_dir_entry_2 *dotdot_de; ++ unsigned entry_space; ++ ++ BUG_ON(dot_de->name_len != 1); ++ dotdot_de = ext4_next_entry(dot_de, blocksize); ++ entry_space = blocksize - EXT4_DIR_REC_LEN(dot_de) - ++ EXT4_DIR_REC_LEN(dotdot_de) - infosize; ++ + return entry_space / sizeof(struct dx_entry); + } + + static inline unsigned dx_node_limit(struct inode *dir) + { +- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); ++ unsigned entry_space = dir->i_sb->s_blocksize - __EXT4_DIR_REC_LEN(0); + return entry_space / sizeof(struct dx_entry); + } + +@@ -342,7 +351,7 @@ + printk(":%x.%u ", h.hash, + ((char *) de - base)); + } +- space += EXT4_DIR_REC_LEN(de->name_len); ++ space += EXT4_DIR_REC_LEN(de); + names++; + } + de = ext4_next_entry(de, size); +@@ -447,7 +456,8 @@ + + entries = (struct dx_entry *) (((char *)info) + info->info_length); + +- if (dx_get_limit(entries) != dx_root_limit(dir, ++ if (dx_get_limit(entries) != dx_root_limit(dir->i_sb->s_blocksize, ++ (struct ext4_dir_entry_2*)bh->b_data, + info->info_length)) { + ext4_warning(dir->i_sb, __func__, + "dx entry: limit != root limit"); +@@ -637,7 +647,7 @@ + de = (struct ext4_dir_entry_2 *) bh->b_data; + top = (struct ext4_dir_entry_2 *) ((char *) de + + dir->i_sb->s_blocksize - +- EXT4_DIR_REC_LEN(0)); ++ 
__EXT4_DIR_REC_LEN(0)); + for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { + if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, + (block<i_sb)) +@@ -1050,7 +1060,7 @@ + goto errout; + de = (struct ext4_dir_entry_2 *) bh->b_data; + top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - +- EXT4_DIR_REC_LEN(0)); ++ __EXT4_DIR_REC_LEN(0)); + for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) { + int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) + + ((char *) de - bh->b_data); +@@ -1216,7 +1226,7 @@ + while (count--) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + (from + (map->offs<<2)); +- rec_len = EXT4_DIR_REC_LEN(de->name_len); ++ rec_len = EXT4_DIR_REC_LEN(de); + memcpy (to, de, rec_len); + ((struct ext4_dir_entry_2 *) to)->rec_len = + ext4_rec_len_to_disk(rec_len, blocksize); +@@ -1240,7 +1250,7 @@ + while ((char*)de < base + blocksize) { + next = ext4_next_entry(de, blocksize); + if (de->inode && de->name_len) { +- rec_len = EXT4_DIR_REC_LEN(de->name_len); ++ rec_len = EXT4_DIR_REC_LEN(de); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); +@@ -1370,10 +1380,16 @@ + unsigned int offset = 0; + unsigned int blocksize = dir->i_sb->s_blocksize; + unsigned short reclen; +- int nlen, rlen, err; ++ int nlen, rlen, err, dlen = 0; ++ unsigned char *data; + char *top; + +- reclen = EXT4_DIR_REC_LEN(namelen); ++ data = ext4_dentry_get_data(inode->i_sb, (struct ext4_dentry_param *) ++ dentry->d_fsdata); ++ if (data) ++ dlen = (*data) + 1; ++ ++ reclen = __EXT4_DIR_REC_LEN(namelen + dlen); + if (!de) { + de = (struct ext4_dir_entry_2 *)bh->b_data; + top = bh->b_data + blocksize - reclen; +@@ -1383,7 +1399,7 @@ + return -EIO; + if (ext4_match(namelen, name, de)) + return -EEXIST; +- nlen = EXT4_DIR_REC_LEN(de->name_len); ++ nlen = EXT4_DIR_REC_LEN(de); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + if ((de->inode? rlen - nlen: rlen) >= reclen) + break; +@@ -1401,7 +1417,7 @@ + } + + /* By now the buffer is marked for journaling */ +- nlen = EXT4_DIR_REC_LEN(de->name_len); ++ nlen = EXT4_DIR_REC_LEN(de); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); +@@ -1417,6 +1433,12 @@ + de->inode = 0; + de->name_len = namelen; + memcpy(de->name, name, namelen); ++ if (data) { ++ de->name[namelen] = 0; ++ memcpy(&de->name[namelen + 1], data, *(char *) data); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } ++ + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend +@@ -1515,7 +1537,8 @@ + + dx_set_block(entries, 1); + dx_set_count(entries, 1); +- dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info))); ++ dx_set_limit(entries, dx_root_limit(dir->i_sb->s_blocksize, ++ dot_de, sizeof(*dx_info))); + + /* Initialize as for dx_probe */ + hinfo.hash_version = dx_info->hash_version; +@@ -1546,6 +1569,8 @@ + struct buffer_head * dir_block; + struct ext4_dir_entry_2 * de; + int len, journal = 0, err = 0; ++ int dlen = 0; ++ char *data; + + if (IS_ERR(handle)) + return PTR_ERR(handle); +@@ -1561,19 +1586,24 @@ + /* the first item must be "." 
*/ + assert(de->name_len == 1 && de->name[0] == '.'); + len = le16_to_cpu(de->rec_len); +- assert(len >= EXT4_DIR_REC_LEN(1)); +- if (len > EXT4_DIR_REC_LEN(1)) { ++ assert(len >= __EXT4_DIR_REC_LEN(1)); ++ if (len > __EXT4_DIR_REC_LEN(1)) { + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) + goto out_journal; + + journal = 1; +- de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(1)); ++ de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de)); + } + +- len -= EXT4_DIR_REC_LEN(1); +- assert(len == 0 || len >= EXT4_DIR_REC_LEN(2)); ++ len -= EXT4_DIR_REC_LEN(de); ++ data = ext4_dentry_get_data(dir->i_sb, ++ (struct ext4_dentry_param *) dentry->d_fsdata); ++ if (data) ++ dlen = *data + 1; ++ assert(len == 0 || len >= __EXT4_DIR_REC_LEN(2 + dlen)); ++ + de = (struct ext4_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + if (!journal) { +@@ -1587,10 +1617,15 @@ + if (len > 0) + de->rec_len = cpu_to_le16(len); + else +- assert(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2)); ++ assert(le16_to_cpu(de->rec_len) >= __EXT4_DIR_REC_LEN(2)); + de->name_len = 2; + strcpy (de->name, ".."); + ext4_set_de_type(dir->i_sb, de, S_IFDIR); ++ if (data) { ++ de->name[2] = 0; ++ memcpy(&de->name[2 + 1], data, dlen); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } + + out_journal: + if (journal) { +@@ -2011,12 +2046,13 @@ + /* Initialize @inode as a subdirectory of @dir, and add the + * "." and ".." entries into the first directory block. */ + int ext4_add_dot_dotdot(handle_t *handle, struct inode * dir, +- struct inode *inode) ++ struct inode *inode, ++ const void *data1, const void *data2) + { + struct buffer_head * dir_block; + struct ext4_dir_entry_2 * de; + unsigned int blocksize = dir->i_sb->s_blocksize; +- int err = 0; ++ int err = 0, dot_reclen; + + if (IS_ERR(handle)) + return PTR_ERR(handle); +@@ -2040,17 +2076,32 @@ + de = (struct ext4_dir_entry_2 *) dir_block->b_data; + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; +- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), +- blocksize); + strcpy(de->name, "."); + ext4_set_de_type(dir->i_sb, de, S_IFDIR); ++ /* get packed fid data*/ ++ data1 = ext4_dentry_get_data(dir->i_sb, ++ (struct ext4_dentry_param *) data1); ++ if (data1) { ++ de->name[1] = 0; ++ memcpy(&de->name[2], data1, *(char *) data1); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } ++ de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de)); ++ dot_reclen = cpu_to_le16(de->rec_len); + de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(dir->i_ino); +- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1), ++ de->rec_len = ext4_rec_len_to_disk(blocksize - dot_reclen, + blocksize); + de->name_len = 2; + strcpy(de->name, ".."); + ext4_set_de_type(dir->i_sb, de, S_IFDIR); ++ data2 = ext4_dentry_get_data(dir->i_sb, ++ (struct ext4_dentry_param *) data2); ++ if (data2) { ++ de->name[2] = 0; ++ memcpy(&de->name[3], data2, *(char *) data2); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } + inode->i_nlink = 2; + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, dir, dir_block); +@@ -2087,7 +2138,7 @@ + if (IS_ERR(inode)) + goto out_stop; + +- err = ext4_add_dot_dotdot(handle, dir, inode); ++ err = ext4_add_dot_dotdot(handle, dir, inode, NULL, NULL); + if (err) + goto out_stop; + +@@ -2123,7 +2174,7 @@ + int err = 0; + + sb = inode->i_sb; +- if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || ++ if (inode->i_size < __EXT4_DIR_REC_LEN(1) + 
__EXT4_DIR_REC_LEN(2) || + !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { + if (err) + ext4_error(inode->i_sb, __func__, diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series new file mode 100644 index 0000000..bb9fdbd --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series @@ -0,0 +1,29 @@ +ext4-wantedi-2.6-rhel6.patch +ext4-map_inode_page-2.6-rhel6.patch +export-ext4-2.6-rhel6.patch +ext4-remove-cond_resched-calls-rhel5.patch +ext4-ext_generation-sles11.patch +ext4-inode-version-rhel6.patch +ext4-mmp-rhel6.patch +ext4-lookup-dotdot-rhel5.patch +ext4-max-dir-size-rhel6.patch +ext4-print-inum-in-htree-warning-rhel6.patch +ext4-xattr-no-update-ctime-rhel5.patch +ext4-prealloc-rhel6.patch +ext4-mballoc-extra-checks-rhel6.patch +ext4-misc-rhel6.patch +ext4-big-endian-check-2.6-rhel6.patch +ext4-alloc-policy-2.6-rhel5.patch +ext4-force_over_16tb-rhel6.patch +ext4-pdir-fix-rhel6.patch +ext4-osd-iop-common-rhel6.patch +ext4-osd-iam-exports-rhel6.patch +ext4-dynlocks-common-rhel6.patch +ext4-hash-indexed-dir-dotdot-update-rhel5.patch +ext4-kill-dx_root-rhel6.patch +ext4-extents-mount-option-rhel6.patch +ext4-fiemap-2.6-rhel6.patch +ext4-mballoc-pa_free-mismatch-rhel6.patch +ext4_data_in_dirent-rhel6.patch +ext4-disable-mb-cache-rhel6.patch +ext4-back-dquot-to-rhel6.patch diff --git a/ldiskfs/ldiskfs/Makefile.in b/ldiskfs/ldiskfs/Makefile.in index 71d312d..0e1e6c2 100644 --- a/ldiskfs/ldiskfs/Makefile.in +++ b/ldiskfs/ldiskfs/Makefile.in @@ -7,6 +7,7 @@ backfs_extra := $(wildcard @LINUX@/fs/@BACKFS@/Makefile) backfs_headers := $(wildcard @LINUX@/fs/@BACKFS@/*.h) linux_headers := $(wildcard @LINUX@/include/linux/@BACKFS@*.h) +trace_headers := $(wildcard @LINUX@/include/trace/events/@BACKFS@*.h) backfs_sources := $(filter-out %.mod.c,$(wildcard @LINUX@/fs/@BACKFS@/*.c)) diff --git a/ldiskfs/ldiskfs/autoMakefile.am b/ldiskfs/ldiskfs/autoMakefile.am index 65d04a7..d6460a2 100644 --- a/ldiskfs/ldiskfs/autoMakefile.am +++ b/ldiskfs/ldiskfs/autoMakefile.am @@ -24,13 +24,17 @@ linux/ldiskfs%.h: linux-stage/include/linux/@BACKFS@%.h series := @top_srcdir@/kernel_patches/series/ldiskfs-$(LDISKFS_SERIES) patches := @top_srcdir@/kernel_patches/patches -sources: $(backfs_sources) $(backfs_headers) $(linux_headers) $(series) - rm -rf linux-stage linux sources $(ldiskfs_SOURCES) - mkdir -p linux-stage/fs/@BACKFS@ linux-stage/include/linux +sources: $(backfs_sources) $(backfs_headers) $(linux_headers) $(series) $(trace_headers) + rm -rf linux-stage linux sources trace $(ldiskfs_SOURCES) + mkdir -p linux-stage/fs/@BACKFS@ linux-stage/include/linux \ + linux-stage/include/trace/events cp $(backfs_sources) $(backfs_headers) $(backfs_extra) linux-stage/fs/@BACKFS@ if test -n "$(linux_headers)" ; then \ cp $(linux_headers) linux-stage/include/linux; \ fi + if test -n "$(trace_headers)" ; then \ + cp $(trace_headers) linux-stage/include/trace/events; \ + fi if USE_QUILT ln -s ../$(patches) linux-stage/patches ln -s ../$(series) linux-stage/series @@ -43,7 +47,7 @@ else done @echo endif - mkdir linux + mkdir -p linux trace/events @echo -n "Replacing '@BACKFS@' with 'ldiskfs':" for i in $(notdir $(backfs_headers) $(backfs_sources)) $(new_sources) ; do \ echo -n " $$i" ; \ @@ -62,6 +66,12 @@ endif linux-stage/include/linux/@BACKFS@$$i \ > linux/ldiskfs$$i ; \ done + for i in $(subst @BACKFS@,,$(notdir $(trace_headers))) ; do \ + echo -n " @BACKFS@$$i"; \ + sed $(strip $(ldiskfs_sed_flags)) \ + 
linux-stage/include/trace/events/@BACKFS@$$i \ + > trace/events/ldiskfs$$i ; \ + done sed $(strip $(ldiskfs_sed_flags)) \ linux-stage/include/linux/dynlocks.h \ > linux/dynlocks.h @@ -79,7 +89,7 @@ foo-check: @echo "ldiskfs_LDADD: $(ldiskfs_LDADD)" MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -CLEANFILES = sources $(notdir $(linux_headers) $(backfs_headers) $(backfs_sources) $(new_sources) $(new_headers)) +CLEANFILES = sources $(notdir $(linux_headers) $(backfs_headers) $(backfs_sources) $(new_sources) $(new_headers) $(trace_headers)) clean: clean-am - rm -rf linux linux-stage ldiskfs*.h + rm -rf linux linux-stage ldiskfs*.h trace diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index 1c38ef4..fe3afd5 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -49,20 +49,76 @@ esac ]) # -# Ensure stack size big than 8k in Lustre server (all kernels) +# LC_CONFIG_OBD_BUFFER_SIZE # -AC_DEFUN([LC_STACK_SIZE], -[AC_MSG_CHECKING([stack size big than 8k]) -LB_LINUX_TRY_COMPILE([ - #include +# the maximum buffer size of lctl ioctls +# +AC_DEFUN([LC_CONFIG_OBD_BUFFER_SIZE], +[AC_MSG_CHECKING([maximum OBD ioctl size]) +AC_ARG_WITH([obd-buffer-size], + AC_HELP_STRING([--with-obd-buffer-size=[size]], + [set lctl ioctl maximum bytes (default=8192)]), + [ + OBD_BUFFER_SIZE=$with_obd_buffer_size + ],[ + OBD_BUFFER_SIZE=8192 + ]) +AC_MSG_RESULT([$OBD_BUFFER_SIZE bytes]) +AC_DEFINE_UNQUOTED(OBD_MAX_IOCTL_BUFFER, $OBD_BUFFER_SIZE, [IOCTL Buffer Size]) +]) + +# +# LC_READLINK_SSIZE_T +# +AC_DEFUN([LC_READLINK_SSIZE_T], +[AC_MSG_CHECKING([if readlink returns ssize_t]) +AC_TRY_COMPILE([ + #include ],[ - #if THREAD_SIZE < 8192 - #error "stack size < 8192" - #endif + ssize_t readlink(const char *, char *, size_t); ],[ - AC_MSG_RESULT(yes) + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_POSIX_1003_READLINK, 1, [readlink returns ssize_t]) ],[ - AC_MSG_ERROR([Lustre requires that Linux is configured with at least a 8KB stack.]) + AC_MSG_RESULT([no]) +]) +]) + +# +# LC_FUNC_RELEASEPAGE_WITH_GFP +# +# 2.6.9 ->releasepage() takes a gfp_t arg +# This kernel defines gfp_t (HAS_GFP_T) but doesn't use it for this function, +# while others either don't have gfp_t or pass gfp_t as the parameter. 
+# +AC_DEFUN([LC_FUNC_RELEASEPAGE_WITH_GFP], +[AC_MSG_CHECKING([if releasepage has a gfp_t parameter]) +RELEASEPAGE_WITH_GFP="$(grep -c 'releasepage.*gfp_t' $LINUX/include/linux/fs.h)" +if test "$RELEASEPAGE_WITH_GFP" != 0 ; then + AC_DEFINE(HAVE_RELEASEPAGE_WITH_GFP, 1, + [releasepage with gfp_t parameter]) + AC_MSG_RESULT([yes]) +else + AC_MSG_RESULT([no]) +fi +]) + + + +# +# only for Lustre-patched kernels +# +AC_DEFUN([LC_LUSTRE_VERSION_H], +[LB_CHECK_FILE([$LINUX/include/linux/lustre_version.h],[ + rm -f "$LUSTRE/include/linux/lustre_version.h" +],[ + touch "$LUSTRE/include/linux/lustre_version.h" + if test x$enable_server = xyes ; then + AC_MSG_WARN([Unpatched kernel detected.]) + AC_MSG_WARN([Lustre servers cannot be built with an unpatched kernel;]) + AC_MSG_WARN([disabling server build]) + enable_server='no' + fi ]) ]) @@ -91,6 +147,24 @@ kernel patches from Lustre version 1.4.3 or above.]) ]) # +# Ensure stack size big than 8k in Lustre server (all kernels) +# +AC_DEFUN([LC_STACK_SIZE], +[AC_MSG_CHECKING([stack size big than 8k]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + #if THREAD_SIZE < 8192 + #error "stack size < 8192" + #endif +],[ + AC_MSG_RESULT(yes) +],[ + AC_MSG_ERROR([Lustre requires that Linux is configured with at least a 8KB stack.]) +]) +]) + +# # LC_CONFIG_BACKINGFS # # setup, check the backing filesystem @@ -148,27 +222,18 @@ fi ]) # -# LC_HEADER_LDISKFS_XATTR -# -# CHAOS kernel-devel package will not include fs/ldiskfs/xattr.h +# LC_CONFIG_LIBLUSTRE_RECOVERY # -AC_DEFUN([LC_HEADER_LDISKFS_XATTR], -[AC_MSG_CHECKING([if ldiskfs has xattr.h header]) -tmp_flags="$EXTRA_KCFLAGS" -EXTRA_KCFLAGS="-I$LINUX/fs -I$LDISKFS_DIR -I$LDISKFS_DIR/ldiskfs" -LB_LINUX_TRY_COMPILE([ - #include -],[ - ldiskfs_xattr_get(NULL, 0, "", NULL, 0); - ldiskfs_xattr_set_handle(NULL, NULL, 0, "", NULL, 0, 0); - -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_LDISKFS_XATTR_H, 1, [ldiskfs/xattr.h found]) -],[ - AC_MSG_RESULT([no]) -]) -EXTRA_KCFLAGS="$tmp_flags" +AC_DEFUN([LC_CONFIG_LIBLUSTRE_RECOVERY], +[AC_MSG_CHECKING([whether to enable liblustre recovery support]) +AC_ARG_ENABLE([liblustre-recovery], + AC_HELP_STRING([--disable-liblustre-recovery], + [disable liblustre recovery support]), + [],[enable_liblustre_recovery='yes']) +AC_MSG_RESULT([$enable_liblustre_recovery]) +if test x$enable_liblustre_recovery != xno ; then + AC_DEFINE(ENABLE_LIBLUSTRE_RECOVERY, 1, Liblustre Can Recover) +fi ]) # @@ -188,137 +253,170 @@ if test x$enable_health_write != xno ; then fi ]) -# -# LC_CONFIG_LIBLUSTRE_RECOVERY -# -AC_DEFUN([LC_CONFIG_LIBLUSTRE_RECOVERY], -[AC_MSG_CHECKING([whether to enable liblustre recovery support]) -AC_ARG_ENABLE([liblustre-recovery], - AC_HELP_STRING([--disable-liblustre-recovery], - [disable liblustre recovery support]), - [],[enable_liblustre_recovery='yes']) -AC_MSG_RESULT([$enable_liblustre_recovery]) -if test x$enable_liblustre_recovery != xno ; then - AC_DEFINE(ENABLE_LIBLUSTRE_RECOVERY, 1, Liblustre Can Recover) +AC_DEFUN([LC_CONFIG_LRU_RESIZE], +[AC_MSG_CHECKING([whether to enable lru self-adjusting]) +AC_ARG_ENABLE([lru_resize], + AC_HELP_STRING([--enable-lru-resize], + [enable lru resize support]), + [],[enable_lru_resize='yes']) +AC_MSG_RESULT([$enable_lru_resize]) +if test x$enable_lru_resize != xno; then + AC_DEFINE(HAVE_LRU_RESIZE_SUPPORT, 1, [Enable lru resize support]) fi ]) -# -# LC_CONFIG_OBD_BUFFER_SIZE -# -# the maximum buffer size of lctl ioctls -# -AC_DEFUN([LC_CONFIG_OBD_BUFFER_SIZE], -[AC_MSG_CHECKING([maximum OBD ioctl size]) -AC_ARG_WITH([obd-buffer-size], 
- AC_HELP_STRING([--with-obd-buffer-size=[size]], - [set lctl ioctl maximum bytes (default=8192)]), - [ - OBD_BUFFER_SIZE=$with_obd_buffer_size - ],[ - OBD_BUFFER_SIZE=8192 - ]) -AC_MSG_RESULT([$OBD_BUFFER_SIZE bytes]) -AC_DEFINE_UNQUOTED(OBD_MAX_IOCTL_BUFFER, $OBD_BUFFER_SIZE, [IOCTL Buffer Size]) +# whether to enable quota support(kernel modules) +AC_DEFUN([LC_QUOTA_MODULE], +[if test x$enable_quota != xno; then + LB_LINUX_CONFIG([QUOTA],[ + enable_quota_module='yes' + AC_DEFINE(HAVE_QUOTA_SUPPORT, 1, [Enable quota support]) + ],[ + enable_quota_module='no' + AC_MSG_WARN([quota is not enabled because the kernel - lacks quota support]) + ]) +fi ]) -# -# LC_STRUCT_STATFS -# -# AIX does not have statfs.f_namelen -# -AC_DEFUN([LC_STRUCT_STATFS], -[AC_MSG_CHECKING([if struct statfs has a f_namelen field]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct statfs sfs; - sfs.f_namelen = 1; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_STATFS_NAMELEN, 1, [struct statfs has a namelen field]) +AC_DEFUN([LC_EXPORT_TRUNCATE_COMPLETE], +[LB_CHECK_SYMBOL_EXPORT([truncate_complete_page], +[mm/truncate.c],[ +AC_DEFINE(HAVE_TRUNCATE_COMPLETE_PAGE, 1, + [kernel export truncate_complete_page]) ],[ - AC_MSG_RESULT([no]) ]) ]) -# -# LC_READLINK_SSIZE_T -# -AC_DEFUN([LC_READLINK_SSIZE_T], -[AC_MSG_CHECKING([if readlink returns ssize_t]) -AC_TRY_COMPILE([ - #include -],[ - ssize_t readlink(const char *, char *, size_t); -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_POSIX_1003_READLINK, 1, [readlink returns ssize_t]) +AC_DEFUN([LC_EXPORT_TRUNCATE_RANGE], +[LB_CHECK_SYMBOL_EXPORT([truncate_inode_pages_range], +[mm/truncate.c],[ +AC_DEFINE(HAVE_TRUNCATE_RANGE, 1, + [kernel export truncate_inode_pages_range]) ],[ - AC_MSG_RESULT([no]) ]) ]) -# -# LC_FUNC_MS_FLOCK_LOCK -# -# 2.6.5 kernel has MS_FLOCK_LOCK sb flag -# -AC_DEFUN([LC_FUNC_MS_FLOCK_LOCK], -[AC_MSG_CHECKING([if kernel has MS_FLOCK_LOCK sb flag]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - int flags = MS_FLOCK_LOCK; -],[ - AC_DEFINE(HAVE_MS_FLOCK_LOCK, 1, - [kernel has MS_FLOCK_LOCK flag]) - AC_MSG_RESULT([yes]) +AC_DEFUN([LC_EXPORT_D_REHASH_COND], +[LB_CHECK_SYMBOL_EXPORT([d_rehash_cond], +[fs/dcache.c],[ +AC_DEFINE(HAVE_D_REHASH_COND, 1, + [d_rehash_cond is exported by the kernel]) ],[ - AC_MSG_RESULT([no]) ]) ]) -# -# LC_FUNC_HAVE_CAN_SLEEP_ARG -# -# 2.6.5 kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait() -# -AC_DEFUN([LC_FUNC_HAVE_CAN_SLEEP_ARG], -[AC_MSG_CHECKING([if kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - int cansleep; - struct file *file; - struct file_lock *file_lock; - flock_lock_file_wait(file, file_lock, cansleep); -],[ - AC_DEFINE(HAVE_CAN_SLEEP_ARG, 1, - [kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()]) - AC_MSG_RESULT([yes]) +AC_DEFUN([LC_EXPORT___D_REHASH], +[LB_CHECK_SYMBOL_EXPORT([__d_rehash], +[fs/dcache.c],[ +AC_DEFINE(HAVE___D_REHASH, 1, + [__d_rehash is exported by the kernel]) ],[ - AC_MSG_RESULT([no]) ]) ]) +AC_DEFUN([LC_EXPORT_D_MOVE_LOCKED], +[LB_CHECK_SYMBOL_EXPORT([d_move_locked], +[fs/dcache.c],[ +AC_DEFINE(HAVE_D_MOVE_LOCKED, 1, + [d_move_locked is exported by the kernel]) +],[ +]) +]) + +AC_DEFUN([LC_EXPORT___D_MOVE], +[LB_CHECK_SYMBOL_EXPORT([__d_move], +[fs/dcache.c],[ +AC_DEFINE(HAVE___D_MOVE, 1, + [__d_move is exported by the kernel]) +],[ +]) +]) + +# The actual symbol exported varies among architectures, so we need +# to check many symbols (but only in the current architecture.) 
No +# matter what symbol is exported, the kernel #defines node_to_cpumask +# to the appropriate function and that's what we use. +AC_DEFUN([LC_EXPORT_NODE_TO_CPUMASK], + [LB_CHECK_SYMBOL_EXPORT([node_to_cpumask], + [arch/$LINUX_ARCH/mm/numa.c], + [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1, + [node_to_cpumask is exported by + the kernel])]) # x86_64 + LB_CHECK_SYMBOL_EXPORT([node_to_cpu_mask], + [arch/$LINUX_ARCH/kernel/smpboot.c], + [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1, + [node_to_cpumask is exported by + the kernel])]) # ia64 + LB_CHECK_SYMBOL_EXPORT([node_2_cpu_mask], + [arch/$LINUX_ARCH/kernel/smpboot.c], + [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1, + [node_to_cpumask is exported by + the kernel])]) # i386 + ]) + # -# LC_FUNC_RELEASEPAGE_WITH_GFP -# -# 2.6.9 ->releasepage() takes a gfp_t arg -# This kernel defines gfp_t (HAS_GFP_T) but doesn't use it for this function, -# while others either don't have gfp_t or pass gfp_t as the parameter. -# -AC_DEFUN([LC_FUNC_RELEASEPAGE_WITH_GFP], -[AC_MSG_CHECKING([if releasepage has a gfp_t parameter]) -RELEASEPAGE_WITH_GFP="$(grep -c 'releasepage.*gfp_t' $LINUX/include/linux/fs.h)" -if test "$RELEASEPAGE_WITH_GFP" != 0 ; then - AC_DEFINE(HAVE_RELEASEPAGE_WITH_GFP, 1, - [releasepage with gfp_t parameter]) +# LC_HEADER_LDISKFS_XATTR +# +# CHAOS kernel-devel package will not include fs/ldiskfs/xattr.h +# +AC_DEFUN([LC_HEADER_LDISKFS_XATTR], +[AC_MSG_CHECKING([if ldiskfs has xattr.h header]) +tmp_flags="$EXTRA_KCFLAGS" +EXTRA_KCFLAGS="-I$LINUX/fs -I$LDISKFS_DIR -I$LDISKFS_DIR/ldiskfs" +LB_LINUX_TRY_COMPILE([ + #include +],[ + ldiskfs_xattr_get(NULL, 0, "", NULL, 0); + ldiskfs_xattr_set_handle(NULL, NULL, 0, "", NULL, 0, 0); + +],[ AC_MSG_RESULT([yes]) -else + AC_DEFINE(HAVE_LDISKFS_XATTR_H, 1, [ldiskfs/xattr.h found]) +],[ + AC_MSG_RESULT([no]) +]) +EXTRA_KCFLAGS="$tmp_flags" +]) + +# +# LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP +# +# Check for our patched grab_cache_page_nowait_gfp() function +# after 2.6.29 we can emulate this using add_to_page_cache_lru() +# +AC_DEFUN([LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP], +[LB_CHECK_SYMBOL_EXPORT([grab_cache_page_nowait_gfp], +[mm/filemap.c],[ + AC_DEFINE(HAVE_GRAB_CACHE_PAGE_NOWAIT_GFP, 1, + [kernel exports grab_cache_page_nowait_gfp]) + ], + [LB_CHECK_SYMBOL_EXPORT([add_to_page_cache_lru], + [mm/filemap.c],[ + AC_DEFINE(HAVE_ADD_TO_PAGE_CACHE_LRU, 1, + [kernel exports add_to_page_cache_lru]) + ],[ + ]) + ]) +]) + +# +# LC_STRUCT_STATFS +# +# AIX does not have statfs.f_namelen +# +AC_DEFUN([LC_STRUCT_STATFS], +[AC_MSG_CHECKING([if struct statfs has a f_namelen field]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct statfs sfs; + sfs.f_namelen = 1; +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_STATFS_NAMELEN, 1, [struct statfs has a namelen field]) +],[ AC_MSG_RESULT([no]) -fi +]) ]) # @@ -394,26 +492,6 @@ AC_DEFUN([LC_XATTR_ACL], []) ]) - -# added in 2.6.16 -# -AC_DEFUN([LC_STRUCT_INTENT_FILE], -[AC_MSG_CHECKING([if struct open_intent has a file field]) -LB_LINUX_TRY_COMPILE([ - #include - #include -],[ - struct open_intent intent; - &intent.file; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_FILE_IN_STRUCT_INTENT, 1, [struct open_intent has a file field]) -],[ - AC_MSG_RESULT([no]) -]) -]) - - # # After 2.6.16 the xattr_acl API is removed, and posix_acl is used instead # @@ -436,20 +514,65 @@ $1 ]) ]) +AC_DEFUN([LC_CONST_ACL_SIZE], +[AC_MSG_CHECKING([calc acl size]) +tmp_flags="$CFLAGS" +CFLAGS="$CFLAGS -I$LINUX/include -I$LINUX_OBJ/include -I$LINUX_OBJ/include2 -I$LINUX/arch/`uname -m|sed -e 's/ppc.*/powerpc/' -e 's/x86_64/x86/' -e 
's/i.86/x86/'`/include $EXTRA_KCFLAGS" +AC_TRY_RUN([ + #define __KERNEL__ + #include + #include + #undef __KERNEL__ + // block include + #define __LINUX_POSIX_ACL_H + + # ifdef CONFIG_FS_POSIX_ACL + # ifdef HAVE_XATTR_ACL + # include + # endif + # ifdef HAVE_LINUX_POSIX_ACL_XATTR_H + # include + # endif + # endif + + #include + + #include + + int main(void) + { + int size = mds_xattr_acl_size(LUSTRE_POSIX_ACL_MAX_ENTRIES); + FILE *f = fopen("acl.size","w+"); + fprintf(f,"%d", size); + fclose(f); + + return 0; + } +],[ + acl_size=`cat acl.size` + AC_MSG_RESULT([ACL size $acl_size]) + AC_DEFINE_UNQUOTED(XATTR_ACL_SIZE, AS_TR_SH([$acl_size]), [size of xattr acl]) +],[ + AC_ERROR([ACL size can't computed]) +]) +CFLAGS="$tmp_flags" +]) + +# added in 2.6.16 # -# only for Lustre-patched kernels -# -AC_DEFUN([LC_LUSTRE_VERSION_H], -[LB_CHECK_FILE([$LINUX/include/linux/lustre_version.h],[ - rm -f "$LUSTRE/include/linux/lustre_version.h" +AC_DEFUN([LC_STRUCT_INTENT_FILE], +[AC_MSG_CHECKING([if struct open_intent has a file field]) +LB_LINUX_TRY_COMPILE([ + #include + #include ],[ - touch "$LUSTRE/include/linux/lustre_version.h" - if test x$enable_server = xyes ; then - AC_MSG_WARN([Unpatched kernel detected.]) - AC_MSG_WARN([Lustre servers cannot be built with an unpatched kernel;]) - AC_MSG_WARN([disabling server build]) - enable_server='no' - fi + struct open_intent intent; + &intent.file; +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_FILE_IN_STRUCT_INTENT, 1, [struct open_intent has a file field]) +],[ + AC_MSG_RESULT([no]) ]) ]) @@ -480,27 +603,6 @@ AC_DEFUN([LC_CONFIG_RMTCLIENT], ]) ]) -AC_DEFUN([LC_SUNRPC_CACHE], -[AC_MSG_CHECKING([if sunrpc struct cache_head uses kref]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct cache_head ch; - &ch.ref.refcount; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_SUNRPC_CACHE_V2, 1, [sunrpc cache facility v2]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -AC_DEFUN([LC_CONFIG_SUNRPC], -[LB_LINUX_CONFIG_IM([SUNRPC],[], - [AC_MSG_ERROR([kernel SUNRPC support is required by using GSS.])]) - LC_SUNRPC_CACHE -]) - # # LC_CONFIG_GSS_KEYRING (default enabled, if gss is enabled) # @@ -524,6 +626,27 @@ AC_DEFUN([LC_CONFIG_GSS_KEYRING], fi ]) +AC_DEFUN([LC_SUNRPC_CACHE], +[AC_MSG_CHECKING([if sunrpc struct cache_head uses kref]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct cache_head ch; + &ch.ref.refcount; +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_SUNRPC_CACHE_V2, 1, [sunrpc cache facility v2]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +AC_DEFUN([LC_CONFIG_SUNRPC], +[LB_LINUX_CONFIG_IM([SUNRPC],[], + [AC_MSG_ERROR([kernel SUNRPC support is required by using GSS.])]) + LC_SUNRPC_CACHE +]) + # # LC_CONFIG_GSS (default disabled) # @@ -564,54 +687,282 @@ AC_DEFUN([LC_CONFIG_GSS], fi ]) -# LC_EXPORT_SYNCHRONIZE_RCU -# after 2.6.12 synchronize_rcu is preferred over synchronize_kernel -AC_DEFUN([LC_EXPORT_SYNCHRONIZE_RCU], -[LB_CHECK_SYMBOL_EXPORT([synchronize_rcu], -[kernel/rcupdate.c],[ - AC_DEFINE(HAVE_SYNCHRONIZE_RCU, 1, - [in 2.6.12 synchronize_rcu preferred over synchronize_kernel]) -],[ -]) -]) - -# LC_INODE_I_MUTEX -# after 2.6.15 inode have i_mutex intead of i_sem -AC_DEFUN([LC_INODE_I_MUTEX], -[AC_MSG_CHECKING([if inode has i_mutex ]) +# +# LC_FUNC_HAVE_CAN_SLEEP_ARG +# +# 2.6.5 kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait() +# +AC_DEFUN([LC_FUNC_HAVE_CAN_SLEEP_ARG], +[AC_MSG_CHECKING([if kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()]) LB_LINUX_TRY_COMPILE([ - #include - #include - #undef i_mutex + #include ],[ - struct inode i; 
- - mutex_unlock(&i.i_mutex); + int cansleep; + struct file *file; + struct file_lock *file_lock; + flock_lock_file_wait(file, file_lock, cansleep); ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_INODE_I_MUTEX, 1, - [after 2.6.15 inode have i_mutex intead of i_sem]) + AC_DEFINE(HAVE_CAN_SLEEP_ARG, 1, + [kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()]) + AC_MSG_RESULT([yes]) ],[ - AC_MSG_RESULT(no) + AC_MSG_RESULT([no]) ]) ]) -# LC_SEQ_LOCK -# after 2.6.18 seq_file has lock intead of sem -AC_DEFUN([LC_SEQ_LOCK], -[AC_MSG_CHECKING([if struct seq_file has lock field]) +# +# LC_FUNC_F_OP_FLOCK +# +# rhel4.2 kernel has f_op->flock field +# +AC_DEFUN([LC_FUNC_F_OP_FLOCK], +[AC_MSG_CHECKING([if struct file_operations has flock field]) LB_LINUX_TRY_COMPILE([ - #include + #include ],[ - struct seq_file seq; + struct file_operations ll_file_operations_flock; + ll_file_operations_flock.flock = NULL; +],[ + AC_DEFINE(HAVE_F_OP_FLOCK, 1, + [struct file_operations has flock field]) + AC_MSG_RESULT([yes]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +AC_DEFUN([LC_QUOTA_READ], +[AC_MSG_CHECKING([if kernel supports quota_read]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct super_operations sp; + void *i = (void *)sp.quota_read; +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(KERNEL_SUPPORTS_QUOTA_READ, 1, [quota_read found]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +# +# LC_COOKIE_FOLLOW_LINK +# +# kernel 2.6.13+ ->follow_link returns a cookie +# + +AC_DEFUN([LC_COOKIE_FOLLOW_LINK], +[AC_MSG_CHECKING([if inode_operations->follow_link returns a cookie]) +LB_LINUX_TRY_COMPILE([ + #include + #include +],[ + struct dentry dentry; + struct nameidata nd; + + dentry.d_inode->i_op->put_link(&dentry, &nd, NULL); +],[ + AC_DEFINE(HAVE_COOKIE_FOLLOW_LINK, 1, [inode_operations->follow_link returns a cookie]) + AC_MSG_RESULT([yes]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +# +# LC_FUNC_RCU +# +# kernels prior than 2.6.0(?) have no RCU supported; in kernel 2.6.5(SUSE), +# call_rcu takes three parameters. 
+# +AC_DEFUN([LC_FUNC_RCU], +[AC_MSG_CHECKING([if kernel have RCU supported]) +LB_LINUX_TRY_COMPILE([ + #include +],[],[ + AC_DEFINE(HAVE_RCU, 1, [have RCU defined]) + AC_MSG_RESULT([yes]) + + AC_MSG_CHECKING([if call_rcu takes three parameters]) + LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rcu_head rh; + call_rcu(&rh, (void (*)(struct rcu_head *))1, NULL); + ],[ + AC_DEFINE(HAVE_CALL_RCU_PARAM, 1, [call_rcu takes three parameters]) + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no]) + ]) - mutex_unlock(&seq.lock); +],[ + AC_MSG_RESULT([no]) +]) +]) + +AC_DEFUN([LC_PERCPU_COUNTER], +[AC_MSG_CHECKING([if have struct percpu_counter defined]) +LB_LINUX_TRY_COMPILE([ + #include +],[],[ + AC_DEFINE(HAVE_PERCPU_COUNTER, 1, [percpu_counter found]) + AC_MSG_RESULT([yes]) + + AC_MSG_CHECKING([if percpu_counter_inc takes the 2nd argument]) + LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct percpu_counter c; + percpu_counter_init(&c, 0); + ],[ + AC_DEFINE(HAVE_PERCPU_2ND_ARG, 1, [percpu_counter_init has two + arguments]) + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no]) + ]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +AC_DEFUN([LC_TASK_CLENV_STORE], +[ + AC_MSG_CHECKING([if we can store cl_env in task_struct]) + if test x$have_task_clenv_store != xyes ; then + LC_TASK_CLENV_TUX_INFO + fi +]) + +# ~2.6.11 + +AC_DEFUN([LC_S_TIME_GRAN], +[AC_MSG_CHECKING([if super block has s_time_gran member]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct super_block sb; + + return sb.s_time_gran; +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_S_TIME_GRAN, 1, [super block has s_time_gran member]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +AC_DEFUN([LC_SB_TIME_GRAN], +[AC_MSG_CHECKING([if kernel has old get_sb_time_gran]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + return get_sb_time_gran(NULL); +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_SB_TIME_GRAN, 1, [kernel has old get_sb_time_gran]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +# 2.6.12 + +# ~2.6.12 merge patch from oracle to convert tree_lock from spinlock to rwlock +AC_DEFUN([LC_RW_TREE_LOCK], +[AC_MSG_CHECKING([if kernel has tree_lock as rwlock]) +tmp_flags="$EXTRA_KCFLAGS" +EXTRA_KCFLAGS="-Werror" +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct address_space a; + + write_lock(&a.tree_lock); +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_RW_TREE_LOCK, 1, [kernel has tree_lock as rw_lock]) +],[ + AC_MSG_RESULT([no]) +]) +EXTRA_KCFLAGS="$tmp_flags" +]) + +# LC_EXPORT_SYNCHRONIZE_RCU +# after 2.6.12 synchronize_rcu is preferred over synchronize_kernel +AC_DEFUN([LC_EXPORT_SYNCHRONIZE_RCU], +[LB_CHECK_SYMBOL_EXPORT([synchronize_rcu], +[kernel/rcupdate.c],[ + AC_DEFINE(HAVE_SYNCHRONIZE_RCU, 1, + [in 2.6.12 synchronize_rcu preferred over synchronize_kernel]) +],[ +]) +]) + +# 2.6.15 + +# LC_INODE_I_MUTEX +# after 2.6.15 inode have i_mutex intead of i_sem +AC_DEFUN([LC_INODE_I_MUTEX], +[AC_MSG_CHECKING([if inode has i_mutex ]) +LB_LINUX_TRY_COMPILE([ + #include + #include + #undef i_mutex +],[ + struct inode i; + + mutex_unlock(&i.i_mutex); ],[ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_SEQ_LOCK, 1, - [after 2.6.18 seq_file has lock intead of sem]) + AC_DEFINE(HAVE_INODE_I_MUTEX, 1, + [after 2.6.15 inode have i_mutex intead of i_sem]) ],[ - AC_MSG_RESULT(NO) + AC_MSG_RESULT(no) +]) +]) + +# 2.6.16 + +# LC_SECURITY_PLUG # for SLES10 SP2 +# check security plug in sles10 sp2 kernel +AC_DEFUN([LC_SECURITY_PLUG], +[AC_MSG_CHECKING([If kernel has security plug support]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct dentry *dentry; + struct vfsmount *mnt; + struct iattr *iattr; + + 
notify_change(dentry, mnt, iattr); +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SECURITY_PLUG, 1, + [SLES10 SP2 use extra parameter in vfs]) +],[ + AC_MSG_RESULT(no) +]) +]) + +# 2.6.17 + +# inode have i_private field since 2.6.17 +AC_DEFUN([LC_INODE_IPRIVATE], +[AC_MSG_CHECKING([if inode has a i_private field]) +LB_LINUX_TRY_COMPILE([ +#include +],[ + struct inode i; + i.i_private = NULL; +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_IPRIVATE, 1, + [struct inode has i_private field]) +],[ + AC_MSG_RESULT(no) ]) ]) @@ -636,26 +987,22 @@ LB_LINUX_TRY_COMPILE([ ]) ]) -# LC_FLUSH_OWNER_ID -# starting from 2.6.18 the file_operations .flush -# method has a new "fl_owner_t id" parameter -# -AC_DEFUN([LC_FLUSH_OWNER_ID], -[AC_MSG_CHECKING([if file_operations .flush has an fl_owner_t id]) +# 2.6.18 + +# LC_NR_PAGECACHE +# 2.6.18 don't export nr_pagecahe +AC_DEFUN([LC_NR_PAGECACHE], +[AC_MSG_CHECKING([kernel export nr_pagecache]) LB_LINUX_TRY_COMPILE([ - #include + #include ],[ - struct file_operations *fops = NULL; - fl_owner_t id; - int i; - - i = fops->flush(NULL, id); + return atomic_read(&nr_pagecache); ],[ - AC_DEFINE(HAVE_FLUSH_OWNER_ID, 1, - [file_operations .flush method has an fl_owner_t id]) - AC_MSG_RESULT([yes]) + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_NR_PAGECACHE, 1, + [is kernel export nr_pagecache]) ],[ - AC_MSG_RESULT([no]) + AC_MSG_RESULT(no) ]) ]) @@ -748,23 +1095,195 @@ LB_LINUX_TRY_COMPILE([ EXTRA_KCFLAGS="$tmp_flags" ]) -# inode have i_private field since 2.6.17 -AC_DEFUN([LC_INODE_IPRIVATE], -[AC_MSG_CHECKING([if inode has a i_private field]) +# LC_SEQ_LOCK +# after 2.6.18 seq_file has lock intead of sem +AC_DEFUN([LC_SEQ_LOCK], +[AC_MSG_CHECKING([if struct seq_file has lock field]) LB_LINUX_TRY_COMPILE([ -#include + #include ],[ - struct inode i; - i.i_private = NULL; + struct seq_file seq; + + mutex_unlock(&seq.lock); ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_INODE_IPRIVATE, 1, - [struct inode has i_private field]) + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SEQ_LOCK, 1, + [after 2.6.18 seq_file has lock intead of sem]) ],[ - AC_MSG_RESULT(no) + AC_MSG_RESULT(NO) +]) +]) + +# +# LC_EXPORT_FILEMAP_FDATAWRITE_RANGE +# +# No standard kernels export this +# +AC_DEFUN([LC_EXPORT_FILEMAP_FDATAWRITE_RANGE], +[LB_CHECK_SYMBOL_EXPORT([filemap_fdatawrite_range], +[mm/filemap.c],[ +AC_DEFINE(HAVE_FILEMAP_FDATAWRITE_RANGE, 1, + [filemap_fdatawrite_range is exported by the kernel]) +],[ +]) +]) + +# LC_FLUSH_OWNER_ID +# starting from 2.6.18 the file_operations .flush +# method has a new "fl_owner_t id" parameter +# +AC_DEFUN([LC_FLUSH_OWNER_ID], +[AC_MSG_CHECKING([if file_operations .flush has an fl_owner_t id]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct file_operations *fops = NULL; + fl_owner_t id; + int i; + + i = fops->flush(NULL, id); +],[ + AC_DEFINE(HAVE_FLUSH_OWNER_ID, 1, + [file_operations .flush method has an fl_owner_t id]) + AC_MSG_RESULT([yes]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +# +# LC_EXPORT_INVALIDATE_MAPPING_PAGES +# +# SLES9, RHEL4, RHEL5, vanilla 2.6.24 export invalidate_mapping_pages() but +# SLES10 2.6.16 does not, for some reason. For filter cache invalidation. 
+# +AC_DEFUN([LC_EXPORT_INVALIDATE_MAPPING_PAGES], + [LB_CHECK_SYMBOL_EXPORT([invalidate_mapping_pages], [mm/truncate.c], [ + AC_DEFINE(HAVE_INVALIDATE_MAPPING_PAGES, 1, + [exported invalidate_mapping_pages])], + [LB_CHECK_SYMBOL_EXPORT([invalidate_inode_pages], [mm/truncate.c], [ + AC_DEFINE(HAVE_INVALIDATE_INODE_PAGES, 1, + [exported invalidate_inode_pages])], [ + AC_MSG_ERROR([no way to invalidate pages]) + ]) + ],[]) +]) + +# +# LC_EXT4_DISCARD_PREALLOCATIONS +# +AC_DEFUN([LC_EXT4_DISCARD_PREALLOCATIONS], +[AC_MSG_CHECKING([if ext4_discard_preallocatoins defined]) +tmp_flags="$EXTRA_KCFLAGS" +EXTRA_KCFLAGS="-I$LINUX/fs" +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct inode i; + ext4_discard_preallocations(&i); +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(LDISKFS_DISCARD_PREALLOCATIONS, 1, + [ext4_discard_preacllocations defined]) +],[ + AC_MSG_RESULT(no) +]) +EXTRA_KCFLAGS="$tmp_flags" +]) + +# +# LC_EXT_INSERT_EXTENT_WITH_5ARGS +# +AC_DEFUN([LC_EXT_INSERT_EXTENT_WITH_5ARGS], +[AC_MSG_CHECKING([ext4_ext_insert_extent needs 5 arguments]) +tmp_flags="$EXTRA_KCFLAGS" +EXTRA_KCFLAGS="-I$LINUX/fs" +LB_LINUX_TRY_COMPILE([ + #include +],[ + ext4_ext_insert_extent(NULL, NULL, NULL, NULL, 0); +],[ + AC_DEFINE([EXT_INSERT_EXTENT_WITH_5ARGS], 1, + [ext4_ext_insert_exent needs 5 arguments]) + AC_MSG_RESULT([yes]) +],[ + AC_MSG_RESULT([no]) +]) +EXTRA_KCFLAGS="$tmp_flags" +]) + +#2.6.18 + RHEL5 (fc6) + +# RHEL5 in FS-cache patch rename PG_checked flag into PG_fs_misc +AC_DEFUN([LC_PG_FS_MISC], +[AC_MSG_CHECKING([kernel has PG_fs_misc]) +LB_LINUX_TRY_COMPILE([ + #include + #include +],[ + #ifndef PG_fs_misc + #error PG_fs_misc not defined in kernel + #endif +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_PG_FS_MISC, 1, + [is kernel have PG_fs_misc]) +],[ + AC_MSG_RESULT(no) +]) +]) + +# RHEL5 PageChecked and SetPageChecked defined +AC_DEFUN([LC_PAGE_CHECKED], +[AC_MSG_CHECKING([kernel has PageChecked and SetPageChecked]) +LB_LINUX_TRY_COMPILE([ + #include +#ifdef HAVE_LINUX_MMTYPES_H + #include +#endif + #include +],[ + struct page *p; + + /* before 2.6.26 this define*/ + #ifndef PageChecked + /* 2.6.26 use function instead of define for it */ + SetPageChecked(p); + PageChecked(p); + #endif +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_PAGE_CHECKED, 1, + [does kernel have PageChecked and SetPageChecked]) +],[ + AC_MSG_RESULT(no) +]) ]) + +# +# LC_LINUX_FIEMAP_H +# +# If we have fiemap.h +# after 2.6.27 use fiemap.h in include/linux +# +AC_DEFUN([LC_LINUX_FIEMAP_H], +[LB_CHECK_FILE([$LINUX/include/linux/fiemap.h],[ + AC_MSG_CHECKING([if fiemap.h can be compiled]) + LB_LINUX_TRY_COMPILE([ + #include + #include + ],[],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_LINUX_FIEMAP_H, 1, [Kernel has fiemap.h]) + ],[ + AC_MSG_RESULT([no]) + ]) +], +[]) ]) +# 2.6.19 + # 2.6.19 API changes # inode don't have i_blksize field AC_DEFUN([LC_INODE_BLKSIZE], @@ -829,7 +1348,7 @@ LB_LINUX_TRY_COMPILE([ ]) ]) -# LC_GENERIC_FILE_READ +# LC_FILE_READV # 2.6.19 replaced readv with aio_read AC_DEFUN([LC_FILE_READV], [AC_MSG_CHECKING([readv in fops]) @@ -847,22 +1366,7 @@ LB_LINUX_TRY_COMPILE([ ]) ]) -# LC_NR_PAGECACHE -# 2.6.18 don't export nr_pagecahe -AC_DEFUN([LC_NR_PAGECACHE], -[AC_MSG_CHECKING([kernel export nr_pagecache]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - return atomic_read(&nr_pagecache); -],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_NR_PAGECACHE, 1, - [is kernel export nr_pagecache]) -],[ - AC_MSG_RESULT(no) -]) -]) +# 2.6.20 # LC_CANCEL_DIRTY_PAGE # 2.6.20 introduced cancel_dirty_page instead of clear_page_dirty. 
@@ -889,6 +1393,8 @@ AC_DEFUN([LC_CANCEL_DIRTY_PAGE], fi ]) +# raid5-zerocopy patch + # # LC_PAGE_CONSTANT # @@ -913,178 +1419,97 @@ LB_LINUX_TRY_COMPILE([ ]) ]) -# RHEL5 in FS-cache patch rename PG_checked flag into PG_fs_misc -AC_DEFUN([LC_PG_FS_MISC], -[AC_MSG_CHECKING([kernel has PG_fs_misc]) +# 2.6.22 + +# 2.6.22 lost second parameter for invalidate_bdev +AC_DEFUN([LC_INVALIDATE_BDEV_2ARG], +[AC_MSG_CHECKING([if invalidate_bdev has second argument]) LB_LINUX_TRY_COMPILE([ - #include - #include + #include ],[ - #ifndef PG_fs_misc - #error PG_fs_misc not defined in kernel - #endif + invalidate_bdev(NULL,0); ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_PG_FS_MISC, 1, - [is kernel have PG_fs_misc]) + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_INVALIDATE_BDEV_2ARG, 1, + [invalidate_bdev has second argument]) ],[ - AC_MSG_RESULT(no) + AC_MSG_RESULT([no]) ]) ]) -# RHEL5 PageChecked and SetPageChecked defined -AC_DEFUN([LC_PAGE_CHECKED], -[AC_MSG_CHECKING([kernel has PageChecked and SetPageChecked]) +# +# check for crypto API +# +AC_DEFUN([LC_ASYNC_BLOCK_CIPHER], +[AC_MSG_CHECKING([if kernel has block cipher support]) LB_LINUX_TRY_COMPILE([ - #include -#ifdef HAVE_LINUX_MMTYPES_H - #include -#endif - #include + #include + #include ],[ - struct page *p; - - /* before 2.6.26 this define*/ - #ifndef PageChecked - /* 2.6.26 use function instead of define for it */ - SetPageChecked(p); - PageChecked(p); - #endif + struct crypto_blkcipher *tfm; + tfm = crypto_alloc_blkcipher("aes", 0, 0 ); ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_PAGE_CHECKED, 1, - [does kernel have PageChecked and SetPageChecked]) + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_ASYNC_BLOCK_CIPHER, 1, [kernel has block cipher support]) ],[ - AC_MSG_RESULT(no) + AC_MSG_RESULT([no]) ]) ]) -AC_DEFUN([LC_EXPORT_TRUNCATE_COMPLETE], -[LB_CHECK_SYMBOL_EXPORT([truncate_complete_page], -[mm/truncate.c],[ -AC_DEFINE(HAVE_TRUNCATE_COMPLETE_PAGE, 1, - [kernel export truncate_complete_page]) +# +# check for struct hash_desc +# +AC_DEFUN([LC_STRUCT_HASH_DESC], +[AC_MSG_CHECKING([if kernel has struct hash_desc]) +LB_LINUX_TRY_COMPILE([ + #include + #include ],[ -]) -]) - -AC_DEFUN([LC_EXPORT_TRUNCATE_RANGE], -[LB_CHECK_SYMBOL_EXPORT([truncate_inode_pages_range], -[mm/truncate.c],[ -AC_DEFINE(HAVE_TRUNCATE_RANGE, 1, - [kernel export truncate_inode_pages_range]) + struct hash_desc foo; ],[ -]) -]) - -AC_DEFUN([LC_EXPORT_D_REHASH_COND], -[LB_CHECK_SYMBOL_EXPORT([d_rehash_cond], -[fs/dcache.c],[ -AC_DEFINE(HAVE_D_REHASH_COND, 1, - [d_rehash_cond is exported by the kernel]) + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_STRUCT_HASH_DESC, 1, [kernel has struct hash_desc]) ],[ + AC_MSG_RESULT([no]) ]) ]) -AC_DEFUN([LC_EXPORT___D_REHASH], -[LB_CHECK_SYMBOL_EXPORT([__d_rehash], -[fs/dcache.c],[ -AC_DEFINE(HAVE___D_REHASH, 1, - [__d_rehash is exported by the kernel]) +# +# check for struct blkcipher_desc +# +AC_DEFUN([LC_STRUCT_BLKCIPHER_DESC], +[AC_MSG_CHECKING([if kernel has struct blkcipher_desc]) +LB_LINUX_TRY_COMPILE([ + #include + #include ],[ -]) -]) - -AC_DEFUN([LC_EXPORT_D_MOVE_LOCKED], -[LB_CHECK_SYMBOL_EXPORT([d_move_locked], -[fs/dcache.c],[ -AC_DEFINE(HAVE_D_MOVE_LOCKED, 1, - [d_move_locked is exported by the kernel]) + struct blkcipher_desc foo; ],[ -]) -]) - -AC_DEFUN([LC_EXPORT___D_MOVE], -[LB_CHECK_SYMBOL_EXPORT([__d_move], -[fs/dcache.c],[ -AC_DEFINE(HAVE___D_MOVE, 1, - [__d_move is exported by the kernel]) + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_STRUCT_BLKCIPHER_DESC, 1, [kernel has struct blkcipher_desc]) ],[ + AC_MSG_RESULT([no]) ]) ]) # -# 
LC_EXPORT_INVALIDATE_MAPPING_PAGES -# -# SLES9, RHEL4, RHEL5, vanilla 2.6.24 export invalidate_mapping_pages() but -# SLES10 2.6.16 does not, for some reason. For filter cache invalidation. -# -AC_DEFUN([LC_EXPORT_INVALIDATE_MAPPING_PAGES], - [LB_CHECK_SYMBOL_EXPORT([invalidate_mapping_pages], [mm/truncate.c], [ - AC_DEFINE(HAVE_INVALIDATE_MAPPING_PAGES, 1, - [exported invalidate_mapping_pages])], - [LB_CHECK_SYMBOL_EXPORT([invalidate_inode_pages], [mm/truncate.c], [ - AC_DEFINE(HAVE_INVALIDATE_INODE_PAGES, 1, - [exported invalidate_inode_pages])], [ - AC_MSG_ERROR([no way to invalidate pages]) - ]) - ],[]) -]) - -# -# LC_EXPORT_FILEMAP_FDATASYNC_RANGE -# -# No standard kernels export this +# 2.6.19 check for FS_RENAME_DOES_D_MOVE flag # -AC_DEFUN([LC_EXPORT_FILEMAP_FDATAWRITE_RANGE], -[LB_CHECK_SYMBOL_EXPORT([filemap_fdatawrite_range], -[mm/filemap.c],[ -AC_DEFINE(HAVE_FILEMAP_FDATAWRITE_RANGE, 1, - [filemap_fdatawrite_range is exported by the kernel]) -],[ -]) -]) - -# The actual symbol exported varies among architectures, so we need -# to check many symbols (but only in the current architecture.) No -# matter what symbol is exported, the kernel #defines node_to_cpumask -# to the appropriate function and that's what we use. -AC_DEFUN([LC_EXPORT_NODE_TO_CPUMASK], - [LB_CHECK_SYMBOL_EXPORT([node_to_cpumask], - [arch/$LINUX_ARCH/mm/numa.c], - [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1, - [node_to_cpumask is exported by - the kernel])]) # x86_64 - LB_CHECK_SYMBOL_EXPORT([node_to_cpu_mask], - [arch/$LINUX_ARCH/kernel/smpboot.c], - [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1, - [node_to_cpumask is exported by - the kernel])]) # ia64 - LB_CHECK_SYMBOL_EXPORT([node_2_cpu_mask], - [arch/$LINUX_ARCH/kernel/smpboot.c], - [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1, - [node_to_cpumask is exported by - the kernel])]) # i386 - ]) - -# 2.6.22 lost second parameter for invalidate_bdev -AC_DEFUN([LC_INVALIDATE_BDEV_2ARG], -[AC_MSG_CHECKING([if invalidate_bdev has second argument]) +AC_DEFUN([LC_FS_RENAME_DOES_D_MOVE], +[AC_MSG_CHECKING([if kernel has FS_RENAME_DOES_D_MOVE flag]) LB_LINUX_TRY_COMPILE([ - #include + #include ],[ - invalidate_bdev(NULL,0); + int v = FS_RENAME_DOES_D_MOVE; ],[ AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_INVALIDATE_BDEV_2ARG, 1, - [invalidate_bdev has second argument]) + AC_DEFINE(HAVE_FS_RENAME_DOES_D_MOVE, 1, [kernel has FS_RENAME_DOES_D_MOVE flag]) ],[ AC_MSG_RESULT([no]) ]) ]) -# 2.6.18 - +# 2.6.23 # 2.6.23 have return type 'void' for unregister_blkdev AC_DEFUN([LC_UNREGISTER_BLKDEV_RETURN_INT], @@ -1103,37 +1528,37 @@ LB_LINUX_TRY_COMPILE([ ]) # 2.6.23 change .sendfile to .splice_read -# RHEL4 (-92 kernel) have both sendfile and .splice_read API -AC_DEFUN([LC_KERNEL_SENDFILE], -[AC_MSG_CHECKING([if kernel has .sendfile]) +AC_DEFUN([LC_KERNEL_SPLICE_READ], +[AC_MSG_CHECKING([if kernel has .splice_read]) LB_LINUX_TRY_COMPILE([ #include ],[ struct file_operations file; - file.sendfile = NULL; + file.splice_read = NULL; ], [ AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_KERNEL_SENDFILE, 1, - [kernel has .sendfile]) + AC_DEFINE(HAVE_KERNEL_SPLICE_READ, 1, + [kernel has .slice_read]) ],[ AC_MSG_RESULT([no]) ]) ]) # 2.6.23 change .sendfile to .splice_read -AC_DEFUN([LC_KERNEL_SPLICE_READ], -[AC_MSG_CHECKING([if kernel has .splice_read]) +# RHEL4 (-92 kernel) have both sendfile and .splice_read API +AC_DEFUN([LC_KERNEL_SENDFILE], +[AC_MSG_CHECKING([if kernel has .sendfile]) LB_LINUX_TRY_COMPILE([ #include ],[ struct file_operations file; - file.splice_read = NULL; + file.sendfile = NULL; ], [ AC_MSG_RESULT([yes]) - 
AC_DEFINE(HAVE_KERNEL_SPLICE_READ, 1, - [kernel has .slice_read]) + AC_DEFINE(HAVE_KERNEL_SENDFILE, 1, + [kernel has .sendfile]) ],[ AC_MSG_RESULT([no]) ]) @@ -1167,19 +1592,13 @@ LB_LINUX_TRY_COMPILE([ ]) ]) -#2.6.23 has new shrinker API +# 2.6.23 has new shrinker API AC_DEFUN([LC_REGISTER_SHRINKER], -[AC_MSG_CHECKING([if kernel has register_shrinker]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - register_shrinker(NULL); -], [ - AC_MSG_RESULT([yes]) +[LB_CHECK_SYMBOL_EXPORT([register_shrinker], +[mm/vmscan.c],[ AC_DEFINE(HAVE_REGISTER_SHRINKER, 1, - [kernel has register_shrinker]) + [kernel exports register_shrinker]) ],[ - AC_MSG_RESULT([no]) ]) ]) @@ -1201,6 +1620,28 @@ LB_LINUX_TRY_COMPILE([ ]) ]) +# 2.6.23 exports exportfs_decode_fh +AC_DEFUN([LC_EXPORTFS_DECODE_FH], +[LB_CHECK_SYMBOL_EXPORT([exportfs_decode_fh], +[fs/exportfs/expfs.c],[ + AC_DEFINE(HAVE_EXPORTFS_DECODE_FH, 1, + [exportfs_decode_fh has been export]) +],[ +]) +]) + +# 2.6.24 + +# 2.6.24 need linux/mm_types.h included +AC_DEFUN([LC_HAVE_MMTYPES_H], +[LB_CHECK_FILE([$LINUX/include/linux/mm_types.h], [ + AC_DEFINE(HAVE_LINUX_MMTYPES_H, 1, + [kernel has include/mm_types.h]) +],[ + AC_MSG_RESULT([no]) +]) +]) + # 2.6.24 has bio_endio with 2 args AC_DEFUN([LC_BIO_ENDIO_2ARG], [AC_MSG_CHECKING([if kernel has bio_endio with 2 args]) @@ -1239,16 +1680,6 @@ LB_LINUX_TRY_COMPILE([ ]) ]) -# 2.6.24 need linux/mm_types.h included -AC_DEFUN([LC_HAVE_MMTYPES_H], -[LB_CHECK_FILE([$LINUX/include/linux/mm_types.h], [ - AC_DEFINE(HAVE_LINUX_MMTYPES_H, 1, - [kernel has include/mm_types.h]) -],[ - AC_MSG_RESULT([no]) -]) -]) - # 2.6.24 removes long aged procfs entry -> deleted member AC_DEFUN([LC_PROCFS_DELETED], [AC_MSG_CHECKING([if kernel has deleted member in procfs entry struct]) @@ -1267,6 +1698,18 @@ LB_LINUX_TRY_COMPILE([ ]) ]) +# 2.6.24 has bdi_init()/bdi_destroy() functions. 
+AC_DEFUN([LC_EXPORT_BDI_INIT], +[LB_CHECK_SYMBOL_EXPORT([bdi_init], +[mm/backing-dev.c],[ + AC_DEFINE(HAVE_BDI_INIT, 1, + [bdi_init/bdi_destroy functions are present]) +],[ +]) +]) + +# 2.6.25 + # 2.6.25 change define to inline AC_DEFUN([LC_MAPPING_CAP_WRITEBACK_DIRTY], [AC_MSG_CHECKING([if kernel have mapping_cap_writeback_dirty]) @@ -1285,7 +1728,7 @@ LB_LINUX_TRY_COMPILE([ ]) ]) - +# 2.6.26 # 2.6.26 isn't export set_fs_pwd and change paramter in fs struct AC_DEFUN([LC_FS_STRUCT_USE_PATH], @@ -1302,280 +1745,14 @@ LB_LINUX_TRY_COMPILE([ ], [ AC_MSG_RESULT([yes]) AC_DEFINE(HAVE_FS_STRUCT_USE_PATH, 1, - [fs_struct use path structure]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_VFS_INTENT_PATCHES -# -# check if the kernel has the VFS intent patches -AC_DEFUN([LC_VFS_INTENT_PATCHES], -[AC_MSG_CHECKING([if the kernel has the VFS intent patches]) -LB_LINUX_TRY_COMPILE([ - #include - #include -],[ - struct nameidata nd; - struct lookup_intent *it; - - it = &nd.intent; - intent_init(it, IT_OPEN); - it->d.lustre.it_disposition = 0; - it->d.lustre.it_data = NULL; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_VFS_INTENT_PATCHES, 1, [VFS intent patches are applied]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -AC_DEFUN([LC_S_TIME_GRAN], -[AC_MSG_CHECKING([if super block has s_time_gran member]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct super_block sb; - - return sb.s_time_gran; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_S_TIME_GRAN, 1, [super block has s_time_gran member]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -AC_DEFUN([LC_SB_TIME_GRAN], -[AC_MSG_CHECKING([if kernel has old get_sb_time_gran]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - return get_sb_time_gran(NULL); -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_SB_TIME_GRAN, 1, [kernel has old get_sb_time_gran]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP -# -# Check for our patched grab_cache_page_nowait_gfp() function -# after 2.6.29 we can emulate this using add_to_page_cache_lru() -# -AC_DEFUN([LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP], -[LB_CHECK_SYMBOL_EXPORT([grab_cache_page_nowait_gfp], -[mm/filemap.c],[ - AC_DEFINE(HAVE_GRAB_CACHE_PAGE_NOWAIT_GFP, 1, - [kernel exports grab_cache_page_nowait_gfp]) - ], - [LB_CHECK_SYMBOL_EXPORT([add_to_page_cache_lru], - [mm/filemap.c],[ - AC_DEFINE(HAVE_ADD_TO_PAGE_CACHE_LRU, 1, - [kernel exports add_to_page_cache_lru]) - ],[ - ]) - ]) -]) - -# ~2.6.12 merge patch from oracle to convert tree_lock from spinlock to rwlock -AC_DEFUN([LC_RW_TREE_LOCK], -[AC_MSG_CHECKING([if kernel has tree_lock as rwlock]) -tmp_flags="$EXTRA_KCFLAGS" -EXTRA_KCFLAGS="-Werror" -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct address_space a; - - write_lock(&a.tree_lock); -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_RW_TREE_LOCK, 1, [kernel has tree_lock as rw_lock]) -],[ - AC_MSG_RESULT([no]) -]) -EXTRA_KCFLAGS="$tmp_flags" -]) - -AC_DEFUN([LC_CONST_ACL_SIZE], -[AC_MSG_CHECKING([calc acl size]) -tmp_flags="$CFLAGS" -CFLAGS="$CFLAGS -I$LINUX/include -I$LINUX_OBJ/include -I$LINUX_OBJ/include2 -I$LINUX/arch/`uname -m|sed -e 's/ppc.*/powerpc/' -e 's/x86_64/x86/' -e 's/i.86/x86/'`/include $EXTRA_KCFLAGS" -AC_TRY_RUN([ -#define __KERNEL__ -#include -#include -#undef __KERNEL__ -// block include -#define __LINUX_POSIX_ACL_H - -# ifdef CONFIG_FS_POSIX_ACL -# ifdef HAVE_XATTR_ACL -# include -# endif -# ifdef HAVE_LINUX_POSIX_ACL_XATTR_H -# include -# endif -# endif - -#include - -#include - -int main(void) -{ - int size = mds_xattr_acl_size(LUSTRE_POSIX_ACL_MAX_ENTRIES); - FILE *f = fopen("acl.size","w+"); - 
fprintf(f,"%d", size); - fclose(f); - - return 0; -} - -],[ - acl_size=`cat acl.size` - AC_MSG_RESULT([ACL size $acl_size]) - AC_DEFINE_UNQUOTED(XATTR_ACL_SIZE, AS_TR_SH([$acl_size]), [size of xattr acl]) -],[ - AC_ERROR([ACL size can't computed]) -]) -CFLAGS="$tmp_flags" -]) - -# -# check for crypto API -# -AC_DEFUN([LC_ASYNC_BLOCK_CIPHER], -[AC_MSG_CHECKING([if kernel has block cipher support]) -LB_LINUX_TRY_COMPILE([ - #include - #include -],[ - struct crypto_blkcipher *tfm; - tfm = crypto_alloc_blkcipher("aes", 0, 0 ); -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_ASYNC_BLOCK_CIPHER, 1, [kernel has block cipher support]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# check for struct hash_desc -# -AC_DEFUN([LC_STRUCT_HASH_DESC], -[AC_MSG_CHECKING([if kernel has struct hash_desc]) -LB_LINUX_TRY_COMPILE([ - #include - #include -],[ - struct hash_desc foo; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_STRUCT_HASH_DESC, 1, [kernel has struct hash_desc]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# check for struct blkcipher_desc -# -AC_DEFUN([LC_STRUCT_BLKCIPHER_DESC], -[AC_MSG_CHECKING([if kernel has struct blkcipher_desc]) -LB_LINUX_TRY_COMPILE([ - #include - #include -],[ - struct blkcipher_desc foo; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_STRUCT_BLKCIPHER_DESC, 1, [kernel has struct blkcipher_desc]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# 2.6.19 check for FS_RENAME_DOES_D_MOVE flag -# -AC_DEFUN([LC_FS_RENAME_DOES_D_MOVE], -[AC_MSG_CHECKING([if kernel has FS_RENAME_DOES_D_MOVE flag]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - int v = FS_RENAME_DOES_D_MOVE; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_FS_RENAME_DOES_D_MOVE, 1, [kernel has FS_RENAME_DOES_D_MOVE flag]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_FUNC_F_OP_FLOCK -# -# rhel4.2 kernel has f_op->flock field -# -AC_DEFUN([LC_FUNC_F_OP_FLOCK], -[AC_MSG_CHECKING([if struct file_operations has flock field]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct file_operations ll_file_operations_flock; - ll_file_operations_flock.flock = NULL; -],[ - AC_DEFINE(HAVE_F_OP_FLOCK, 1, - [struct file_operations has flock field]) - AC_MSG_RESULT([yes]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# vfs_symlink seems to have started out with 3 args until 2.6.7 where a -# "mode" argument was added, but then again, in some later version it was -# removed -AC_DEFUN([LC_4ARGS_VFS_SYMLINK], -[AC_MSG_CHECKING([if vfs_symlink wants 4 args]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct inode *dir; - struct dentry *dentry; - const char *oldname = NULL; - int mode = 0; - - vfs_symlink(dir, dentry, oldname, mode); -],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_4ARGS_VFS_SYMLINK, 1, - [vfs_symlink wants 4 args]) + [fs_struct use path structure]) ],[ - AC_MSG_RESULT(no) + AC_MSG_RESULT([no]) ]) ]) -# 2.6.23 has new shrinker API -AC_DEFUN([LC_REGISTER_SHRINKER], -[LB_CHECK_SYMBOL_EXPORT([register_shrinker], -[mm/vmscan.c],[ - AC_DEFINE(HAVE_REGISTER_SHRINKER, 1, - [kernel exports register_shrinker]) -],[ -]) -]) +# 2.6.27 -#2.6.27 AC_DEFUN([LC_INODE_PERMISION_2ARGS], [AC_MSG_CHECKING([inode_operations->permission has two args]) LB_LINUX_TRY_COMPILE([ @@ -1625,82 +1802,6 @@ LB_LINUX_TRY_COMPILE([ ]) ]) -# vfs_symlink seems to have started out with 3 args until 2.6.7 where a -# "mode" argument was added, but then again, in some later version it was -# removed -AC_DEFUN([LC_4ARGS_VFS_SYMLINK], -[AC_MSG_CHECKING([if vfs_symlink wants 4 args]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct inode *dir; - struct dentry *dentry; - const char *oldname = NULL; - int mode = 
0; - - vfs_symlink(dir, dentry, oldname, mode); -],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_4ARGS_VFS_SYMLINK, 1, - [vfs_symlink wants 4 args]) -],[ - AC_MSG_RESULT(no) -]) -]) - -# 2.6.27 sles11 remove the bi_hw_segments -AC_DEFUN([LC_BI_HW_SEGMENTS], -[AC_MSG_CHECKING([struct bio has a bi_hw_segments field]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct bio io; - io.bi_hw_segments = 0; -],[ - AC_DEFINE(HAVE_BI_HW_SEGMENTS, 1, - [struct bio has a bi_hw_segments field]) - AC_MSG_RESULT([yes]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# 2.6.27 sles11 move the quotaio_v1{2}.h from include/linux to fs -# 2.6.32 move the quotaio_v1{2}.h from fs to fs/quota -AC_DEFUN([LC_HAVE_QUOTAIO_V1_H], -[LB_CHECK_FILE([$LINUX/include/linux/quotaio_v1.h],[ - AC_DEFINE(HAVE_QUOTAIO_V1_H, 1, - [kernel has include/linux/quotaio_v1.h]) -],[LB_CHECK_FILE([$LINUX/fs/quota/quotaio_v1.h],[ - AC_DEFINE(HAVE_FS_QUOTA_QUOTAIO_V1_H, 1, - [kernel has fs/quota/quotaio_v1.h]) -],[ - AC_MSG_RESULT([no]) -]) -]) -]) - -# sles10 sp2 need 5 parameter for vfs_symlink -AC_DEFUN([LC_VFS_SYMLINK_5ARGS], -[AC_MSG_CHECKING([vfs_symlink need 5 parameter]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct inode *dir = NULL; - struct dentry *dentry = NULL; - struct vfsmount *mnt = NULL; - const char * path = NULL; - vfs_symlink(dir, dentry, mnt, path, 0); -],[ - AC_DEFINE(HAVE_VFS_SYMLINK_5ARGS, 1, - [vfs_symlink need 5 parameteres]) - AC_MSG_RESULT([yes]) -],[ - AC_MSG_RESULT([no]) -]) -]) - # 2.6.27 removed the read_inode from super_operations. AC_DEFUN([LC_READ_INODE_IN_SBOPS], [AC_MSG_CHECKING([super_operations has a read_inode field]) @@ -1718,38 +1819,6 @@ LB_LINUX_TRY_COMPILE([ ]) ]) -# 2.6.27 sles11 has sb_any_quota_active -AC_DEFUN([LC_SB_ANY_QUOTA_ACTIVE], -[AC_MSG_CHECKING([Kernel has sb_any_quota_active]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - sb_any_quota_active(NULL); -],[ - AC_DEFINE(HAVE_SB_ANY_QUOTA_ACTIVE, 1, - [Kernel has a sb_any_quota_active]) - AC_MSG_RESULT([yes]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# 2.6.27 sles11 has sb_has_quota_active -AC_DEFUN([LC_SB_HAS_QUOTA_ACTIVE], -[AC_MSG_CHECKING([Kernel has sb_has_quota_active]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - sb_has_quota_active(NULL, 0); -],[ - AC_DEFINE(HAVE_SB_HAS_QUOTA_ACTIVE, 1, - [Kernel has a sb_has_quota_active]) - AC_MSG_RESULT([yes]) -],[ - AC_MSG_RESULT([no]) -]) -]) - # 2.6.27 has inode_permission instead of permisson AC_DEFUN([LC_EXPORT_INODE_PERMISSION], [LB_CHECK_SYMBOL_EXPORT([inode_permission], @@ -1809,28 +1878,6 @@ LB_LINUX_TRY_COMPILE([ ]) ]) -# -# LC_LINUX_FIEMAP_H -# -# If we have fiemap.h -# after 2.6.27 use fiemap.h in include/linux -# -AC_DEFUN([LC_LINUX_FIEMAP_H], -[LB_CHECK_FILE([$LINUX/include/linux/fiemap.h],[ - AC_MSG_CHECKING([if fiemap.h can be compiled]) - LB_LINUX_TRY_COMPILE([ - #include - #include - ],[],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_LINUX_FIEMAP_H, 1, [Kernel has fiemap.h]) - ],[ - AC_MSG_RESULT([no]) - ]) -], -[]) -]) - # LC_LOCK_MAP_ACQUIRE # after 2.6.27 lock_map_acquire replaces lock_acquire AC_DEFUN([LC_LOCK_MAP_ACQUIRE], @@ -1848,26 +1895,95 @@ LB_LINUX_TRY_COMPILE([ ]) ]) +# 2.6.27.15-2 sles11 + +# 2.6.27 sles11 remove the bi_hw_segments +AC_DEFUN([LC_BI_HW_SEGMENTS], +[AC_MSG_CHECKING([struct bio has a bi_hw_segments field]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct bio io; + io.bi_hw_segments = 0; +],[ + AC_DEFINE(HAVE_BI_HW_SEGMENTS, 1, + [struct bio has a bi_hw_segments field]) + AC_MSG_RESULT([yes]) +],[ + AC_MSG_RESULT([no]) +]) +]) + # -# LC_D_OBTAIN_ALIAS -# starting from 2.6.28 
kernel replaces d_alloc_anon() with -# d_obtain_alias() for getting anonymous dentries -# -AC_DEFUN([LC_D_OBTAIN_ALIAS], -[AC_MSG_CHECKING([d_obtain_alias exist in kernel]) +# 2.6.27 sles11 move the quotaio_v1{2}.h from include/linux to fs +# 2.6.32 move the quotaio_v1{2}.h from fs to fs/quota +AC_DEFUN([LC_HAVE_QUOTAIO_V1_H], +[LB_CHECK_FILE([$LINUX/include/linux/quotaio_v1.h],[ + AC_DEFINE(HAVE_QUOTAIO_V1_H, 1, + [kernel has include/linux/quotaio_v1.h]) +],[LB_CHECK_FILE([$LINUX/fs/quota/quotaio_v1.h],[ + AC_DEFINE(HAVE_FS_QUOTA_QUOTAIO_V1_H, 1, + [kernel has fs/quota/quotaio_v1.h]) +],[ + AC_MSG_RESULT([no]) +]) +]) +]) + +# sles10 sp2 need 5 parameter for vfs_symlink +AC_DEFUN([LC_VFS_SYMLINK_5ARGS], +[AC_MSG_CHECKING([vfs_symlink need 5 parameter]) LB_LINUX_TRY_COMPILE([ - #include + #include ],[ - d_obtain_alias(NULL); + struct inode *dir = NULL; + struct dentry *dentry = NULL; + struct vfsmount *mnt = NULL; + const char * path = NULL; + vfs_symlink(dir, dentry, mnt, path, 0); ],[ - AC_DEFINE(HAVE_D_OBTAIN_ALIAS, 1, - [d_obtain_alias exist in kernel]) + AC_DEFINE(HAVE_VFS_SYMLINK_5ARGS, 1, + [vfs_symlink need 5 parameteres]) + AC_MSG_RESULT([yes]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +# 2.6.27 sles11 has sb_any_quota_active +AC_DEFUN([LC_SB_ANY_QUOTA_ACTIVE], +[AC_MSG_CHECKING([Kernel has sb_any_quota_active]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + sb_any_quota_active(NULL); +],[ + AC_DEFINE(HAVE_SB_ANY_QUOTA_ACTIVE, 1, + [Kernel has a sb_any_quota_active]) + AC_MSG_RESULT([yes]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +# 2.6.27 sles11 has sb_has_quota_active +AC_DEFUN([LC_SB_HAS_QUOTA_ACTIVE], +[AC_MSG_CHECKING([Kernel has sb_has_quota_active]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + sb_has_quota_active(NULL, 0); +],[ + AC_DEFINE(HAVE_SB_HAS_QUOTA_ACTIVE, 1, + [Kernel has a sb_has_quota_active]) AC_MSG_RESULT([yes]) ],[ AC_MSG_RESULT([no]) ]) ]) +# 2.6.31 + # 2.6.31 replaces blk_queue_hardsect_size by blk_queue_logical_block_size function AC_DEFUN([LC_BLK_QUEUE_LOG_BLK_SIZE], [AC_MSG_CHECKING([if blk_queue_logical_block_size is defined]) @@ -1884,6 +2000,8 @@ LB_LINUX_TRY_COMPILE([ ]) ]) +# 2.6.32 + # 2.6.32 add a limits member in struct request_queue. AC_DEFUN([LC_REQUEST_QUEUE_LIMITS], [AC_MSG_CHECKING([if request_queue has a limits field]) @@ -1901,24 +2019,34 @@ LB_LINUX_TRY_COMPILE([ ]) ]) -# RHEL6(backport from 2.6.34) removes 2 functions blk_queue_max_phys_segments and -# blk_queue_max_hw_segments add blk_queue_max_segments -AC_DEFUN([LC_BLK_QUEUE_MAX_SEGMENTS], -[AC_MSG_CHECKING([if blk_queue_max_segments is defined]) +# 2.6.32 has bdi_register() functions. +AC_DEFUN([LC_EXPORT_BDI_REGISTER], +[LB_CHECK_SYMBOL_EXPORT([bdi_register], +[mm/backing-dev.c],[ + AC_DEFINE(HAVE_BDI_REGISTER, 1, + [bdi_register function is present]) +],[ +]) +]) + +# 2.6.32 add s_bdi for super block +AC_DEFUN([LC_SB_BDI], +[AC_MSG_CHECKING([if super_block has s_bdi field]) LB_LINUX_TRY_COMPILE([ - #include + #include ],[ - blk_queue_max_segments(NULL, 0); + struct super_block sb; + sb.s_bdi = NULL; ],[ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLK_QUEUE_MAX_SEGMENTS, 1, - [blk_queue_max_segments is defined]) + AC_DEFINE(HAVE_SB_BDI, 1, + [super_block has s_bdi field]) ],[ AC_MSG_RESULT(no) ]) ]) -# RHEL6(backport from 2.6.34) removes blk_queue_max_sectors and add blk_queue_max_hw_sectors +# 2.6.32 removes blk_queue_max_sectors and add blk_queue_max_hw_sectors # check blk_queue_max_sectors and use it until disappear. 
AC_DEFUN([LC_BLK_QUEUE_MAX_SECTORS], [AC_MSG_CHECKING([if blk_queue_max_sectors is defined]) @@ -1935,43 +2063,118 @@ LB_LINUX_TRY_COMPILE([ ]) ]) -# 2.6.32 has new BDI interface. -AC_DEFUN([LC_NEW_BACKING_DEV_INFO], -[AC_MSG_CHECKING([if backing_dev_info has a wb_cnt field]) +# 2.6.32 replaces 2 functions blk_queue_max_phys_segments and blk_queue_max_hw_segments by blk_queue_max_segments +AC_DEFUN([LC_BLK_QUEUE_MAX_SEGMENTS], +[AC_MSG_CHECKING([if blk_queue_max_segments is defined]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + blk_queue_max_segments(NULL, 0); +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_MAX_SEGMENTS, 1, + [blk_queue_max_segments is defined]) +],[ + AC_MSG_RESULT(no) +]) +]) + +# 2.6.32-71 adds an argument to shrink callback +AC_DEFUN([LC_SHRINK_3ARGS], +[AC_MSG_CHECKING([if shrink has 3 arguments]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + struct shrinker s; + return s.shrink(NULL, 0, 0); +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SHRINK_3ARGS, 1, + [shrink has 3 arguments]) +],[ + AC_MSG_RESULT(no) +]) +]) + +# +# LC_EXT4_SINGLEDATA_TRANS_BLOCKS_SB +# +AC_DEFUN([LC_EXT4_SINGLEDATA_TRANS_BLOCKS_SB], +[AC_MSG_CHECKING([if EXT4_SINGLEDATA_TRANS_BLOCKS takes the sb as argument]) +tmp_flags="$EXTRA_KCFLAGS" +EXTRA_KCFLAGS="-I$LINUX/fs" LB_LINUX_TRY_COMPILE([ - #include + #include + #include ],[ - struct backing_dev_info bdi; - bdi.wb_cnt = 0; + struct super_block sb; + EXT4_SINGLEDATA_TRANS_BLOCKS(&sb); ],[ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_NEW_BACKING_DEV_INFO, 1, - [backing_dev_info has a wb_cnt field]) + AC_DEFINE(LDISKFS_SINGLEDATA_TRANS_BLOCKS_HAS_SB, 1, + [EXT4_SINGLEDATA_TRANS_BLOCKS takes sb as argument]) ],[ AC_MSG_RESULT(no) ]) +EXTRA_KCFLAGS="$tmp_flags" ]) -# 2.6.24 has bdi_init()/bdi_destroy() functions. -AC_DEFUN([LC_EXPORT_BDI_INIT], -[LB_CHECK_SYMBOL_EXPORT([bdi_init], -[mm/backing-dev.c],[ - AC_DEFINE(HAVE_BDI_INIT, 1, - [bdi_init/bdi_destroy functions are present]) -],[ -]) +# +# LC_QUOTA64 +# linux kernel have 64-bit limits support +# +AC_DEFUN([LC_QUOTA64],[ + AC_MSG_CHECKING([if kernel has 64-bit quota limits support]) +tmp_flags="$EXTRA_KCFLAGS" +EXTRA_KCFLAGS="-I$LINUX/fs" + LB_LINUX_TRY_COMPILE([ + #include + #include + #ifdef HAVE_QUOTAIO_V1_H + # include + int versions[] = V2_INITQVERSIONS_R1; + struct v2_disk_dqblk_r1 dqblk_r1; + #else + # ifdef HAVE_FS_QUOTA_QUOTAIO_V1_H + # include + # else + # include + # endif + struct v2r1_disk_dqblk dqblk_r1; + #endif + ],[],[ + AC_DEFINE(HAVE_QUOTA64, 1, [have quota64]) + AC_MSG_RESULT([yes]) + ],[ + LB_CHECK_FILE([$LINUX/include/linux/lustre_version.h],[ + AC_MSG_ERROR([You have got no 64-bit kernel quota support.]) + ],[]) + AC_MSG_RESULT([no]) + ]) +EXTRA_KCFLAGS=$tmp_flags ]) -# 2.6.23 exports exportfs_decode_fh -AC_DEFUN([LC_EXPORTFS_DECODE_FH], -[LB_CHECK_SYMBOL_EXPORT([exportfs_decode_fh], -[fs/exportfs/expfs.c],[ - AC_DEFINE(HAVE_EXPORTFS_DECODE_FH, 1, - [exportfs_decode_fh has been export]) +# +# LC_D_OBTAIN_ALIAS +# starting from 2.6.28 kernel replaces d_alloc_anon() with +# d_obtain_alias() for getting anonymous dentries +# +AC_DEFUN([LC_D_OBTAIN_ALIAS], +[AC_MSG_CHECKING([d_obtain_alias exist in kernel]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + d_obtain_alias(NULL); +],[ + AC_DEFINE(HAVE_D_OBTAIN_ALIAS, 1, + [d_obtain_alias exist in kernel]) + AC_MSG_RESULT([yes]) ],[ + AC_MSG_RESULT([no]) ]) ]) + # # LC_PROG_LINUX # @@ -2018,7 +2221,6 @@ AC_DEFUN([LC_PROG_LINUX], LC_CAPA_CRYPTO LC_CONFIG_RMTCLIENT LC_CONFIG_GSS - LC_FUNC_MS_FLOCK_LOCK LC_FUNC_HAVE_CAN_SLEEP_ARG LC_FUNC_F_OP_FLOCK 
LC_QUOTA_READ @@ -2026,10 +2228,6 @@ AC_DEFUN([LC_PROG_LINUX], LC_FUNC_RCU LC_PERCPU_COUNTER LC_TASK_CLENV_STORE - LC_4ARGS_VFS_SYMLINK - - # does the kernel have VFS intent patches? - LC_VFS_INTENT_PATCHES # ~2.6.11 LC_S_TIME_GRAN @@ -2061,6 +2259,8 @@ AC_DEFUN([LC_PROG_LINUX], if test x$enable_server = xyes ; then LC_EXPORT_INVALIDATE_MAPPING_PAGES fi + LC_EXT4_DISCARD_PREALLOCATIONS + LC_EXT_INSERT_EXTENT_WITH_5ARGS #2.6.18 + RHEL5 (fc6) LC_PG_FS_MISC @@ -2132,9 +2332,12 @@ AC_DEFUN([LC_PROG_LINUX], # 2.6.32 LC_REQUEST_QUEUE_LIMITS - LC_NEW_BACKING_DEV_INFO + LC_EXPORT_BDI_REGISTER + LC_SB_BDI LC_BLK_QUEUE_MAX_SECTORS LC_BLK_QUEUE_MAX_SEGMENTS + LC_SHRINK_3ARGS + LC_EXT4_SINGLEDATA_TRANS_BLOCKS_SB # if test x$enable_server = xyes ; then @@ -2310,18 +2513,6 @@ LC_CONFIG_PINGER LC_CONFIG_LIBLUSTRE_RECOVERY ]) -AC_DEFUN([LC_CONFIG_LRU_RESIZE], -[AC_MSG_CHECKING([whether to enable lru self-adjusting]) -AC_ARG_ENABLE([lru_resize], - AC_HELP_STRING([--enable-lru-resize], - [enable lru resize support]), - [],[enable_lru_resize='yes']) -AC_MSG_RESULT([$enable_lru_resize]) -if test x$enable_lru_resize != xno; then - AC_DEFINE(HAVE_LRU_RESIZE_SUPPORT, 1, [Enable lru resize support]) -fi -]) - # # LC_CONFIG_QUOTA # @@ -2334,19 +2525,6 @@ AC_DEFUN([LC_CONFIG_QUOTA], [],[enable_quota='yes']) ]) -# whether to enable quota support(kernel modules) -AC_DEFUN([LC_QUOTA_MODULE], -[if test x$enable_quota != xno; then - LB_LINUX_CONFIG([QUOTA],[ - enable_quota_module='yes' - AC_DEFINE(HAVE_QUOTA_SUPPORT, 1, [Enable quota support]) - ],[ - enable_quota_module='no' - AC_MSG_WARN([quota is not enabled because the kernel - lacks quota support]) - ]) -fi -]) - AC_DEFUN([LC_QUOTA], [#check global LC_CONFIG_QUOTA @@ -2356,21 +2534,6 @@ AC_CHECK_HEADER(sys/quota.h, [AC_MSG_ERROR([don't find in your system])]) ]) -AC_DEFUN([LC_QUOTA_READ], -[AC_MSG_CHECKING([if kernel supports quota_read]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct super_operations sp; - void *i = (void *)sp.quota_read; -],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(KERNEL_SUPPORTS_QUOTA_READ, 1, [quota_read found]) -],[ - AC_MSG_RESULT([no]) -]) -]) - # # LC_CONFIG_SPLIT # @@ -2388,144 +2551,6 @@ if test x$enable_split != xno; then fi ]) -# -# LC_COOKIE_FOLLOW_LINK -# -# kernel 2.6.13+ ->follow_link returns a cookie -# - -AC_DEFUN([LC_COOKIE_FOLLOW_LINK], -[AC_MSG_CHECKING([if inode_operations->follow_link returns a cookie]) -LB_LINUX_TRY_COMPILE([ - #include - #include -],[ - struct dentry dentry; - struct nameidata nd; - - dentry.d_inode->i_op->put_link(&dentry, &nd, NULL); -],[ - AC_DEFINE(HAVE_COOKIE_FOLLOW_LINK, 1, [inode_operations->follow_link returns a cookie]) - AC_MSG_RESULT([yes]) -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_FUNC_RCU -# -# kernels prior than 2.6.0(?) have no RCU supported; in kernel 2.6.5(SUSE), -# call_rcu takes three parameters. 
-# -AC_DEFUN([LC_FUNC_RCU], -[AC_MSG_CHECKING([if kernel have RCU supported]) -LB_LINUX_TRY_COMPILE([ - #include -],[],[ - AC_DEFINE(HAVE_RCU, 1, [have RCU defined]) - AC_MSG_RESULT([yes]) - - AC_MSG_CHECKING([if call_rcu takes three parameters]) - LB_LINUX_TRY_COMPILE([ - #include - ],[ - struct rcu_head rh; - call_rcu(&rh, (void (*)(struct rcu_head *))1, NULL); - ],[ - AC_DEFINE(HAVE_CALL_RCU_PARAM, 1, [call_rcu takes three parameters]) - AC_MSG_RESULT([yes]) - ],[ - AC_MSG_RESULT([no]) - ]) - -],[ - AC_MSG_RESULT([no]) -]) -]) - -# -# LC_QUOTA64 -# linux kernel have 64-bit limits support -# -AC_DEFUN([LC_QUOTA64],[ - AC_MSG_CHECKING([if kernel has 64-bit quota limits support]) -tmp_flags="$EXTRA_KCFLAGS" -EXTRA_KCFLAGS="-I$LINUX/fs" - LB_LINUX_TRY_COMPILE([ - #include - #include - #ifdef HAVE_QUOTAIO_V1_H - # include - int versions[] = V2_INITQVERSIONS_R1; - struct v2_disk_dqblk_r1 dqblk_r1; - #else - # ifdef HAVE_FS_QUOTA_QUOTAIO_V1_H - # include - # else - # include - # endif - struct v2r1_disk_dqblk dqblk_r1; - #endif - ],[],[ - AC_DEFINE(HAVE_QUOTA64, 1, [have quota64]) - AC_MSG_RESULT([yes]) - ],[ - LB_CHECK_FILE([$LINUX/include/linux/lustre_version.h],[ - AC_MSG_ERROR([You have got no 64-bit kernel quota support.]) - ],[]) - AC_MSG_RESULT([no]) - ]) -EXTRA_KCFLAGS=$tmp_flags -]) - -# LC_SECURITY_PLUG # for SLES10 SP2 -# check security plug in sles10 sp2 kernel -AC_DEFUN([LC_SECURITY_PLUG], -[AC_MSG_CHECKING([If kernel has security plug support]) -LB_LINUX_TRY_COMPILE([ - #include -],[ - struct dentry *dentry; - struct vfsmount *mnt; - struct iattr *iattr; - - notify_change(dentry, mnt, iattr); -],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_SECURITY_PLUG, 1, - [SLES10 SP2 use extra parameter in vfs]) -],[ - AC_MSG_RESULT(no) -]) -]) - -AC_DEFUN([LC_PERCPU_COUNTER], -[AC_MSG_CHECKING([if have struct percpu_counter defined]) -LB_LINUX_TRY_COMPILE([ - #include -],[],[ - AC_DEFINE(HAVE_PERCPU_COUNTER, 1, [percpu_counter found]) - AC_MSG_RESULT([yes]) - - AC_MSG_CHECKING([if percpu_counter_inc takes the 2nd argument]) - LB_LINUX_TRY_COMPILE([ - #include - ],[ - struct percpu_counter c; - percpu_counter_init(&c, 0); - ],[ - AC_DEFINE(HAVE_PERCPU_2ND_ARG, 1, [percpu_counter_init has two - arguments]) - AC_MSG_RESULT([yes]) - ],[ - AC_MSG_RESULT([no]) - ]) -],[ - AC_MSG_RESULT([no]) -]) -]) - AC_DEFUN([LC_TASK_CLENV_TUX_INFO], [AC_MSG_CHECKING([tux_info]) LB_LINUX_TRY_COMPILE([ @@ -2542,14 +2567,6 @@ LB_LINUX_TRY_COMPILE([ ]) ]) -AC_DEFUN([LC_TASK_CLENV_STORE], -[ - AC_MSG_CHECKING([if we can store cl_env in task_struct]) - if test x$have_task_clenv_store != xyes ; then - LC_TASK_CLENV_TUX_INFO - fi -]) - # # LC_LLITE_LLOOP_MODULE # lloop_llite.ko does not currently work with page sizes diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index 00875ee..cc504e5 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -690,8 +690,16 @@ static inline int ll_crypto_hmac(struct crypto_tfm *tfm, #define cpu_to_node(cpu) 0 #endif -#ifdef HAVE_REGISTER_SHRINKER +#ifndef HAVE_REGISTER_SHRINKER +#define KERN_SHRINKER(name) name(int nr_to_scan, gfp_t gfp_mask) +#else +#ifdef HAVE_SHRINK_3ARGS +typedef int (*cfs_shrinker_t)(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); +#define KERN_SHRINKER(name) name(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) +#else typedef int (*cfs_shrinker_t)(int nr_to_scan, gfp_t gfp_mask); +#define KERN_SHRINKER(name) name(int nr_to_scan, gfp_t gfp_mask) +#endif static 
inline struct shrinker *cfs_set_shrinker(int seek, cfs_shrinker_t func) diff --git a/lustre/include/lustre_disk.h b/lustre/include/lustre_disk.h index 8c3e8c8..6c46056 100644 --- a/lustre/include/lustre_disk.h +++ b/lustre/include/lustre_disk.h @@ -436,12 +436,13 @@ struct lustre_sb_info { struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */ struct vfsmount *lsi_srv_mnt; /* the one server mount */ cfs_atomic_t lsi_mounts; /* references to the srv_mnt */ - struct backing_dev_info bdi; /* Each client mountpoint needs own backing_dev_info */ + struct backing_dev_info lsi_bdi; /* each client mountpoint needs own backing_dev_info */ }; #define LSI_SERVER 0x00000001 #define LSI_UMOUNT_FORCE 0x00000010 #define LSI_UMOUNT_FAILOVER 0x00000020 +#define LSI_BDI_INITIALIZED 0x00000040 #define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info)) #define s2lsi_nocast(sb) ((sb)->s_fs_info) diff --git a/lustre/kernel_patches/patches/blkdev_tunables-2.6-rhel6.patch b/lustre/kernel_patches/patches/blkdev_tunables-2.6-rhel6.patch new file mode 100644 index 0000000..d62c5bc --- /dev/null +++ b/lustre/kernel_patches/patches/blkdev_tunables-2.6-rhel6.patch @@ -0,0 +1,13 @@ +Index: b/include/linux/blkdev.h +=================================================================== +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -1026,7 +1026,7 @@ extern int blk_verify_command(unsigned c + enum blk_default_limits { + BLK_MAX_SEGMENTS = 128, + BLK_SAFE_MAX_SECTORS = 255, +- BLK_DEF_MAX_SECTORS = 1024, ++ BLK_DEF_MAX_SECTORS = 2048, + BLK_MAX_SEGMENT_SIZE = 65536, + BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, + }; diff --git a/lustre/kernel_patches/patches/dev_read_only-2.6.32-rhel6.patch b/lustre/kernel_patches/patches/dev_read_only-2.6.32-rhel6.patch new file mode 100644 index 0000000..30c7575 --- /dev/null +++ b/lustre/kernel_patches/patches/dev_read_only-2.6.32-rhel6.patch @@ -0,0 +1,172 @@ +This functionality is mainly used during testing, in order to +simulate a server crash for ldiskfs by discarding all of the +writes to the filesystem. For recovery testing we could simulate +this by using a special loopback or DM device that also discards +writes to the device. + +This functionality is also used by target "failback" in order +to speed up service shutdown and takeover by the other node +during controlled operation. However, it would also be possible +to do this by simply allowing all of the in-flight requests to +complete and then waiting for the service to stop. This will +also be needed by the DMU-OSD, because discarding of writes on +a DMU-based target is not safe as it could trigger a storage +failure if the data is ever read from disk again and the +checksum does not match that expected by the block pointer. + +Initial efforts to remove this patch are under way in bug 20776. +Once this work comes to fruition this patch can be dropped. + +Index: linux-2.6.32-71.18.1.el6-master/block/blk-core.c +=================================================================== +--- linux-2.6.32-71.18.1.el6-master.orig/block/blk-core.c 2011-03-05 11:35:40.404043293 +0800 ++++ linux-2.6.32-71.18.1.el6-master/block/blk-core.c 2011-03-11 20:21:10.492302510 +0800 +@@ -1405,6 +1405,8 @@ + + #endif /* CONFIG_FAIL_MAKE_REQUEST */ + ++int dev_check_rdonly(struct block_device *bdev); ++ + /* + * Check whether this bio extends beyond the end of the device. 
+ */ +@@ -1506,6 +1508,12 @@ + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) + goto end_io; + ++ /* This is Lustre's dev_rdonly check */ ++ if (bio_rw(bio) == WRITE && dev_check_rdonly(bio->bi_bdev)) { ++ bio_endio(bio, 0); ++ break; ++ } ++ + if (should_fail_request(bio)) + goto end_io; + +@@ -2578,6 +2586,99 @@ + } + EXPORT_SYMBOL(kblockd_schedule_work); + ++ /* ++ * Debug code for turning block devices "read-only" (will discard writes ++ * silently). This is for filesystem crash/recovery testing. ++ */ ++struct deventry { ++ dev_t dev; ++ struct deventry *next; ++}; ++ ++static struct deventry *devlist = NULL; ++static spinlock_t devlock = SPIN_LOCK_UNLOCKED; ++ ++int dev_check_rdonly(struct block_device *bdev) ++{ ++ struct deventry *cur; ++ ++ if (!bdev) ++ return 0; ++ ++ spin_lock(&devlock); ++ cur = devlist; ++ while(cur) { ++ if (bdev->bd_dev == cur->dev) { ++ spin_unlock(&devlock); ++ return 1; ++ } ++ cur = cur->next; ++ } ++ spin_unlock(&devlock); ++ return 0; ++} ++ ++void dev_set_rdonly(struct block_device *bdev) ++{ ++ struct deventry *newdev, *cur; ++ ++ if (!bdev) ++ return; ++ ++ newdev = kmalloc(sizeof(struct deventry), GFP_KERNEL); ++ if (!newdev) ++ return; ++ ++ spin_lock(&devlock); ++ cur = devlist; ++ while(cur) { ++ if (bdev->bd_dev == cur->dev) { ++ spin_unlock(&devlock); ++ kfree(newdev); ++ return; ++ } ++ cur = cur->next; ++ } ++ newdev->dev = bdev->bd_dev; ++ newdev->next = devlist; ++ devlist = newdev; ++ spin_unlock(&devlock); ++ printk(KERN_WARNING "Turning device %s (%#x) read-only\n", ++ bdev->bd_disk ? bdev->bd_disk->disk_name : "", bdev->bd_dev); ++} ++ ++void dev_clear_rdonly(struct block_device *bdev) ++{ ++ struct deventry *cur, *last = NULL; ++ ++ if (!bdev) ++ return; ++ ++ spin_lock(&devlock); ++ cur = devlist; ++ while(cur) { ++ if (bdev->bd_dev == cur->dev) { ++ if (last) ++ last->next = cur->next; ++ else ++ devlist = cur->next; ++ spin_unlock(&devlock); ++ kfree(cur); ++ printk(KERN_WARNING "Removing read-only on %s (%#x)\n", ++ bdev->bd_disk ? 
bdev->bd_disk->disk_name : ++ "unknown block", ++ bdev->bd_dev); ++ return; ++ } ++ last = cur; ++ cur = cur->next; ++ } ++ spin_unlock(&devlock); ++} ++ ++EXPORT_SYMBOL(dev_set_rdonly); ++EXPORT_SYMBOL(dev_clear_rdonly); ++EXPORT_SYMBOL(dev_check_rdonly); + int __init blk_dev_init(void) + { + BUILD_BUG_ON(__REQ_NR_BITS > 8 * +Index: linux-2.6.32-71.18.1.el6-master/fs/block_dev.c +=================================================================== +--- linux-2.6.32-71.18.1.el6-master.orig/fs/block_dev.c 2011-03-05 11:35:40.486042782 +0800 ++++ linux-2.6.32-71.18.1.el6-master/fs/block_dev.c 2011-03-05 11:37:35.624324775 +0800 +@@ -1389,6 +1389,7 @@ + if (bdev != bdev->bd_contains) + victim = bdev->bd_contains; + bdev->bd_contains = NULL; ++ dev_clear_rdonly(bdev); + } + unlock_kernel(); + mutex_unlock(&bdev->bd_mutex); +Index: linux-2.6.32-71.18.1.el6-master/include/linux/fs.h +=================================================================== +--- linux-2.6.32-71.18.1.el6-master.orig/include/linux/fs.h 2011-03-05 11:35:40.445043037 +0800 ++++ linux-2.6.32-71.18.1.el6-master/include/linux/fs.h 2011-03-05 11:37:35.726324137 +0800 +@@ -2204,6 +2204,10 @@ + extern void submit_bio(int, struct bio *); + extern int bdev_read_only(struct block_device *); + #endif ++#define HAVE_CLEAR_RDONLY_ON_PUT ++extern void dev_set_rdonly(struct block_device *bdev); ++extern int dev_check_rdonly(struct block_device *bdev); ++extern void dev_clear_rdonly(struct block_device *bdev); + extern int set_blocksize(struct block_device *, int); + extern int sb_set_blocksize(struct super_block *, int); + extern int sb_min_blocksize(struct super_block *, int); diff --git a/lustre/kernel_patches/patches/export-2.6.32-vanilla.patch b/lustre/kernel_patches/patches/export-2.6.32-vanilla.patch new file mode 100644 index 0000000..0cb7884 --- /dev/null +++ b/lustre/kernel_patches/patches/export-2.6.32-vanilla.patch @@ -0,0 +1,17 @@ +security_inode_unlink() is used in filter_vfs_unlink() +to avoid lock ordering problems. I'm not sure if this +is still needed with ext4, and it definitely looks to +be gone with DMU changes. + +Index: linux+rh+chaos/security/security.c +=================================================================== +--- linux+rh+chaos.orig/security/security.c ++++ linux+rh+chaos/security/security.c +@@ -60,6 +60,7 @@ int __init security_init(void) + + return 0; + } ++EXPORT_SYMBOL(security_inode_unlink); + + /* Save user chosen LSM */ + static int __init choose_lsm(char *str) diff --git a/lustre/kernel_patches/patches/jbd2-jcberr-2.6-rhel6.patch b/lustre/kernel_patches/patches/jbd2-jcberr-2.6-rhel6.patch new file mode 100644 index 0000000..f219771 --- /dev/null +++ b/lustre/kernel_patches/patches/jbd2-jcberr-2.6-rhel6.patch @@ -0,0 +1,228 @@ +This allows the jbd transaction commit callbacks to be registered. +The ext4 jbd2 code has a different commit callback (one per transaction) +that could be used to provide equivalent functionality. This would +require modifying the existing ext4 commit callback (used by mballoc +when freeing data blocks) to be mutiplexed so it will store 2 different +callback functions and 2 different lists of callback data. 
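+
+For illustration only (this sketch is not part of the patch; my_cb and
+my_commit_cb are hypothetical names, and it assumes a running handle_t
+*handle and an inode in scope), a caller would embed struct
+journal_callback at offset 0 of its own structure and register it
+roughly as follows:
+
+	struct my_cb {
+		struct journal_callback jcb;	/* must be the first member */
+		struct inode *inode;		/* caller-private data follows */
+	};
+
+	static void my_commit_cb(struct journal_callback *jcb, int error)
+	{
+		/* jcb is at offset 0, so the cast recovers the caller struct */
+		struct my_cb *cb = (struct my_cb *)jcb;
+
+		/* the transaction this handle was part of is now on disk;
+		 * error carries the journal abort status */
+		iput(cb->inode);
+		kfree(cb);
+	}
+
+	/* while the handle is still open: */
+	struct my_cb *cb = kmalloc(sizeof(*cb), GFP_NOFS);
+
+	if (cb) {
+		cb->inode = igrab(inode);
+		jbd2_journal_callback_set(handle, my_commit_cb, &cb->jcb);
+	}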
+ +Index: linux+rh+chaos/include/linux/jbd2.h +=================================================================== +--- linux+rh+chaos.orig/include/linux/jbd2.h ++++ linux+rh+chaos/include/linux/jbd2.h +@@ -415,6 +415,27 @@ struct jbd2_inode { + unsigned int i_flags; + }; + ++#define HAVE_JOURNAL_CALLBACK_STATUS ++/** ++ * struct journal_callback - Base structure for callback information. ++ * @jcb_list: list information for other callbacks attached to the same handle. ++ * @jcb_func: Function to call with this callback structure. ++ * ++ * This struct is a 'seed' structure for a using with your own callback ++ * structs. If you are using callbacks you must allocate one of these ++ * or another struct of your own definition which has this struct ++ * as it's first element and pass it to journal_callback_set(). ++ * ++ * This is used internally by jbd2 to maintain callback information. ++ * ++ * See journal_callback_set for more information. ++ **/ ++struct journal_callback { ++ struct list_head jcb_list; /* t_jcb_lock */ ++ void (*jcb_func)(struct journal_callback *jcb, int error); ++ /* user data goes here */ ++}; ++ + struct jbd2_revoke_table_s; + + /** +@@ -423,6 +444,7 @@ struct jbd2_revoke_table_s; + * @h_transaction: Which compound transaction is this update a part of? + * @h_buffer_credits: Number of remaining buffers we are allowed to dirty. + * @h_ref: Reference count on this handle ++ * @h_jcb: List of application registered callbacks for this handle. + * @h_err: Field for caller's use to track errors through large fs operations + * @h_sync: flag for sync-on-close + * @h_jdata: flag to force data journaling +@@ -448,6 +470,13 @@ struct handle_s + /* operations */ + int h_err; + ++ /* ++ * List of application registered callbacks for this handle. The ++ * function(s) will be called after the transaction that this handle is ++ * part of has been committed to disk. [t_jcb_lock] ++ */ ++ struct list_head h_jcb; ++ + /* Flags [no locking] */ + unsigned int h_sync: 1; /* sync-on-close */ + unsigned int h_jdata: 1; /* force data journaling */ +@@ -503,6 +532,8 @@ struct transaction_chp_stats_s { + * j_state_lock + * ->j_list_lock (journal_unmap_buffer) + * ++ * t_handle_lock ++ * ->t_jcb_lock + */ + + struct transaction_s +@@ -659,6 +690,16 @@ struct transaction_s + * structures associated with the transaction + */ + struct list_head t_private_list; ++ ++ /* ++ * Protects the callback list ++ */ ++ spinlock_t t_jcb_lock; ++ /* ++ * List of registered callback functions for this transaction. ++ * Called when the transaction is committed. 
[t_jcb_lock] ++ */ ++ struct list_head t_jcb; + }; + + struct transaction_run_stats_s { +@@ -1115,6 +1156,9 @@ extern int jbd2_journal_stop(handle_t * + extern int jbd2_journal_flush (journal_t *); + extern void jbd2_journal_lock_updates (journal_t *); + extern void jbd2_journal_unlock_updates (journal_t *); ++extern void jbd2_journal_callback_set(handle_t *handle, ++ void (*fn)(struct journal_callback *,int), ++ struct journal_callback *jcb); + + extern journal_t * jbd2_journal_init_dev(struct block_device *bdev, + struct block_device *fs_dev, +Index: linux+rh+chaos/fs/jbd2/checkpoint.c +=================================================================== +--- linux+rh+chaos.orig/fs/jbd2/checkpoint.c ++++ linux+rh+chaos/fs/jbd2/checkpoint.c +@@ -759,6 +759,7 @@ void __jbd2_journal_drop_transaction(jou + J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(transaction->t_checkpoint_io_list == NULL); + J_ASSERT(transaction->t_updates == 0); ++ J_ASSERT(list_empty(&transaction->t_jcb)); + J_ASSERT(journal->j_committing_transaction != transaction); + J_ASSERT(journal->j_running_transaction != transaction); + +Index: linux+rh+chaos/fs/jbd2/commit.c +=================================================================== +--- linux+rh+chaos.orig/fs/jbd2/commit.c ++++ linux+rh+chaos/fs/jbd2/commit.c +@@ -857,6 +857,30 @@ wait_for_iobuf: + transaction can be removed from any checkpoint list it was on + before. */ + ++ /* ++ * Call any callbacks that had been registered for handles in this ++ * transaction. It is up to the callback to free any allocated ++ * memory. ++ * ++ * The spinlocking (t_jcb_lock) here is surely unnecessary... ++ */ ++ spin_lock(&commit_transaction->t_jcb_lock); ++ if (!list_empty(&commit_transaction->t_jcb)) { ++ struct list_head *p, *n; ++ int error = is_journal_aborted(journal); ++ ++ list_for_each_safe(p, n, &commit_transaction->t_jcb) { ++ struct journal_callback *jcb; ++ ++ jcb = list_entry(p, struct journal_callback, jcb_list); ++ list_del(p); ++ spin_unlock(&commit_transaction->t_jcb_lock); ++ jcb->jcb_func(jcb, error); ++ spin_lock(&commit_transaction->t_jcb_lock); ++ } ++ } ++ spin_unlock(&commit_transaction->t_jcb_lock); ++ + jbd_debug(3, "JBD: commit phase 6\n"); + + J_ASSERT(list_empty(&commit_transaction->t_inode_list)); +Index: linux+rh+chaos/fs/jbd2/journal.c +=================================================================== +--- linux+rh+chaos.orig/fs/jbd2/journal.c ++++ linux+rh+chaos/fs/jbd2/journal.c +@@ -90,6 +90,8 @@ EXPORT_SYMBOL(jbd2_journal_file_inode); + EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); + EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); + EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); ++EXPORT_SYMBOL(jbd2_journal_callback_set); ++EXPORT_SYMBOL(jbd2_journal_bmap); + + static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); + static void __journal_abort_soft (journal_t *journal, int errno); +Index: linux+rh+chaos/fs/jbd2/transaction.c +=================================================================== +--- linux+rh+chaos.orig/fs/jbd2/transaction.c ++++ linux+rh+chaos/fs/jbd2/transaction.c +@@ -52,7 +52,9 @@ jbd2_get_transaction(journal_t *journal, + transaction->t_start_time = ktime_get(); + transaction->t_tid = journal->j_transaction_sequence++; + transaction->t_expires = jiffies + journal->j_commit_interval; ++ INIT_LIST_HEAD(&transaction->t_jcb); + spin_lock_init(&transaction->t_handle_lock); ++ spin_lock_init(&transaction->t_jcb_lock); + INIT_LIST_HEAD(&transaction->t_inode_list); + 
INIT_LIST_HEAD(&transaction->t_private_list); + +@@ -257,6 +259,7 @@ static handle_t *new_handle(int nblocks) + memset(handle, 0, sizeof(*handle)); + handle->h_buffer_credits = nblocks; + handle->h_ref = 1; ++ INIT_LIST_HEAD(&handle->h_jcb); + + lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle", + &jbd2_handle_key, 0); +@@ -1216,6 +1219,36 @@ drop: + } + + /** ++ * void jbd2_journal_callback_set() - Register a callback function for this handle. ++ * @handle: handle to attach the callback to. ++ * @func: function to callback. ++ * @jcb: structure with additional information required by func() , and ++ * some space for jbd2 internal information. ++ * ++ * The function will be ++ * called when the transaction that this handle is part of has been ++ * committed to disk with the original callback data struct and the ++ * error status of the journal as parameters. There is no guarantee of ++ * ordering between handles within a single transaction, nor between ++ * callbacks registered on the same handle. ++ * ++ * The caller is responsible for allocating the journal_callback struct. ++ * This is to allow the caller to add as much extra data to the callback ++ * as needed, but reduce the overhead of multiple allocations. The caller ++ * allocated struct must start with a struct journal_callback at offset 0, ++ * and has the caller-specific data afterwards. ++ */ ++void jbd2_journal_callback_set(handle_t *handle, ++ void (*func)(struct journal_callback *jcb, int error), ++ struct journal_callback *jcb) ++{ ++ spin_lock(&handle->h_transaction->t_jcb_lock); ++ list_add_tail(&jcb->jcb_list, &handle->h_jcb); ++ spin_unlock(&handle->h_transaction->t_jcb_lock); ++ jcb->jcb_func = func; ++} ++ ++/** + * int jbd2_journal_stop() - complete a transaction + * @handle: tranaction to complete. + * +@@ -1321,6 +1354,11 @@ int jbd2_journal_stop(handle_t *handle) + wake_up(&journal->j_wait_transaction_locked); + } + ++ /* Move callbacks from the handle to the transaction. */ ++ spin_lock(&transaction->t_jcb_lock); ++ list_splice(&handle->h_jcb, &transaction->t_jcb); ++ spin_unlock(&transaction->t_jcb_lock); ++ + /* + * If the handle is marked SYNC, we need to set another commit + * going! We also want to force a commit if the current diff --git a/lustre/kernel_patches/patches/mpt-fusion-max-sge-rhel6.patch b/lustre/kernel_patches/patches/mpt-fusion-max-sge-rhel6.patch new file mode 100644 index 0000000..1fa1d26 --- /dev/null +++ b/lustre/kernel_patches/patches/mpt-fusion-max-sge-rhel6.patch @@ -0,0 +1,37 @@ +Increase MAX_SGE for fusion mpt driver. + +Index: linux-2.6.32.i386/drivers/message/fusion/Kconfig +=================================================================== +--- linux-2.6.32.i386.orig/drivers/message/fusion/Kconfig 2009-12-03 09:21:21.000000000 +0530 ++++ linux-2.6.32.i386/drivers/message/fusion/Kconfig 2010-03-16 16:45:08.000000000 +0530 +@@ -61,9 +61,9 @@ + LSISAS1078 + + config FUSION_MAX_SGE +- int "Maximum number of scatter gather entries (16 - 128)" +- default "128" +- range 16 128 ++ int "Maximum number of scatter gather entries (16 - 256)" ++ default "256" ++ range 16 256 + help + This option allows you to specify the maximum number of scatter- + gather entries per I/O. 
The driver default is 128, which matches +Index: linux-2.6.32.i386/drivers/message/fusion/mptbase.h +=================================================================== +--- linux-2.6.32.i386.orig/drivers/message/fusion/mptbase.h 2009-12-03 09:21:21.000000000 +0530 ++++ linux-2.6.32.i386/drivers/message/fusion/mptbase.h 2010-03-16 16:46:54.000000000 +0530 +@@ -165,10 +165,10 @@ + * Set the MAX_SGE value based on user input. + */ + #ifdef CONFIG_FUSION_MAX_SGE +-#if CONFIG_FUSION_MAX_SGE < 16 ++#if CONFIG_FUSION_MAX_SGE < 16 + #define MPT_SCSI_SG_DEPTH 16 +-#elif CONFIG_FUSION_MAX_SGE > 128 +-#define MPT_SCSI_SG_DEPTH 128 ++#elif CONFIG_FUSION_MAX_SGE > 256 ++#define MPT_SCSI_SG_DEPTH 256 + #else + #define MPT_SCSI_SG_DEPTH CONFIG_FUSION_MAX_SGE + #endif diff --git a/lustre/kernel_patches/patches/raid5-mmp-unplug-dev-rhel6.patch b/lustre/kernel_patches/patches/raid5-mmp-unplug-dev-rhel6.patch new file mode 100644 index 0000000..74bd529 --- /dev/null +++ b/lustre/kernel_patches/patches/raid5-mmp-unplug-dev-rhel6.patch @@ -0,0 +1,27 @@ +Force MD devices to pass SYNC reads directly to the disk +instead of handling from cache. This is needed for MMP +on MD RAID devices, and in theory could be accepted in +the upstream kernel. Not needed for DMU. + +Index: linux-2.6.32-71.18.1.el6-master/drivers/md/raid5.c +=================================================================== +--- linux-2.6.32-71.18.1.el6-master.orig/drivers/md/raid5.c 2011-02-28 16:57:31.222666050 +0800 ++++ linux-2.6.32-71.18.1.el6-master/drivers/md/raid5.c 2011-02-28 16:58:27.011983275 +0800 +@@ -2098,6 +2098,8 @@ + bi->bi_next = *bip; + *bip = bi; + bi->bi_phys_segments++; ++ if (bio_rw_flagged(bi, BIO_RW_SYNCIO) && !forwrite) ++ clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); /* force to read from disk. 
*/ + spin_unlock_irq(&conf->device_lock); + spin_unlock(&sh->lock); + +@@ -4031,6 +4033,8 @@ + wait_event(mddev->thread->wqueue, + atomic_read(&conf->preread_active_stripes) == 0); + } ++ if (bio_rw_flagged(bi, BIO_RW_SYNCIO)) ++ raid5_unplug_device(mddev->queue); + return 0; + } + diff --git a/lustre/kernel_patches/series/2.6-rhel6.series b/lustre/kernel_patches/series/2.6-rhel6.series new file mode 100644 index 0000000..5e014dc --- /dev/null +++ b/lustre/kernel_patches/series/2.6-rhel6.series @@ -0,0 +1,7 @@ +lustre_version.patch +mpt-fusion-max-sge-rhel6.patch +raid5-mmp-unplug-dev-rhel6.patch +dev_read_only-2.6.32-rhel6.patch +blkdev_tunables-2.6-rhel6.patch +export-2.6.32-vanilla.patch +jbd2-jcberr-2.6-rhel6.patch diff --git a/lustre/ldlm/ldlm_pool.c b/lustre/ldlm/ldlm_pool.c index b42ed55..fa55e4205 100644 --- a/lustre/ldlm/ldlm_pool.c +++ b/lustre/ldlm/ldlm_pool.c @@ -1151,14 +1151,14 @@ static int ldlm_pools_shrink(ldlm_side_t client, int nr, return cached; } -static int ldlm_pools_srv_shrink(int nr, unsigned int gfp_mask) +static int KERN_SHRINKER(ldlm_pools_srv_shrink) { - return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER, nr, gfp_mask); + return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER, nr_to_scan, gfp_mask); } -static int ldlm_pools_cli_shrink(int nr, unsigned int gfp_mask) +static int KERN_SHRINKER(ldlm_pools_cli_shrink) { - return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT, nr, gfp_mask); + return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT, nr_to_scan, gfp_mask); } void ldlm_pools_recalc(ldlm_side_t client) diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 69b7e69..b9850ab 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -818,9 +818,18 @@ void ll_lli_init(struct ll_inode_info *lli) cfs_sema_init(&lli->lli_readdir_sem, 1); } -#ifdef HAVE_NEW_BACKING_DEV_INFO -static atomic_t ll_bdi_num = ATOMIC_INIT(0); +static inline int ll_bdi_register(struct backing_dev_info *bdi) +{ +#ifdef HAVE_BDI_REGISTER + static atomic_t ll_bdi_num = ATOMIC_INIT(0); + + bdi->name = "lustre"; + return bdi_register(bdi, NULL, "lustre-%d", + atomic_inc_return(&ll_bdi_num)); +#else + return 0; #endif +} int ll_fill_super(struct super_block *sb) { @@ -849,16 +858,17 @@ int ll_fill_super(struct super_block *sb) if (err) GOTO(out_free, err); - err = ll_bdi_init(&lsi->bdi); + err = ll_bdi_init(&lsi->lsi_bdi); + if (err) + GOTO(out_free, err); + lsi->lsi_flags |= LSI_BDI_INITIALIZED; + lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY; + err = ll_bdi_register(&lsi->lsi_bdi); if (err) GOTO(out_free, err); -#ifdef HAVE_NEW_BACKING_DEV_INFO - lsi->bdi.name = "lustre"; - lsi->bdi.capabilities = BDI_CAP_MAP_COPY; - err = bdi_register(&lsi->bdi, NULL, "lustre-%d", - atomic_inc_return(&ll_bdi_num)); - sb->s_bdi = &lsi->bdi; +#ifdef HAVE_SB_BDI + sb->s_bdi = &lsi->lsi_bdi; #endif /* Generate a string unique to this super, in case some joker tries @@ -965,8 +975,10 @@ void ll_put_super(struct super_block *sb) if (profilenm) class_del_profile(profilenm); - if (ll_bdi_wb_cnt(lsi->bdi) > 0) - ll_bdi_destroy(&lsi->bdi); + if (lsi->lsi_flags & LSI_BDI_INITIALIZED) { + ll_bdi_destroy(&lsi->lsi_bdi); + lsi->lsi_flags &= ~LSI_BDI_INITIALIZED; + } ll_free_sbi(sb); lsi->lsi_llsbi = NULL; @@ -1668,7 +1680,7 @@ void ll_read_inode2(struct inode *inode, void *opaque) /* OIDEBUG(inode); */ /* initializing backing dev info. 
*/ - inode->i_mapping->backing_dev_info = &(s2lsi(inode->i_sb)->bdi); + inode->i_mapping->backing_dev_info = &s2lsi(inode->i_sb)->lsi_bdi; if (S_ISREG(inode->i_mode)) { diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index f1ed390..8b60e31 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -121,6 +121,11 @@ extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, ext3_ext_insert_extent(handle, inode, path, newext) #endif +#ifdef EXT3_DISCARD_PREALLOCATIONS +#define ext3_mb_discard_inode_preallocations(inode) \ + ext3_discard_preallocations(inode) +#endif + static cfs_mem_cache_t *fcb_cache; diff --git a/lustre/obdclass/lu_object.c b/lustre/obdclass/lu_object.c index 8c61aa9..69aa691 100644 --- a/lustre/obdclass/lu_object.c +++ b/lustre/obdclass/lu_object.c @@ -1626,24 +1626,24 @@ static void lu_site_stats_get(cfs_hash_t *hs, } #ifdef __KERNEL__ -static int lu_cache_shrink(int nr, unsigned int gfp_mask) +static int KERN_SHRINKER(lu_cache_shrink) { lu_site_stats_t stats; struct lu_site *s; struct lu_site *tmp; int cached = 0; - int remain = nr; + int remain = nr_to_scan; CFS_LIST_HEAD(splice); - if (nr != 0) { + if (nr_to_scan != 0) { if (!(gfp_mask & __GFP_FS)) return -1; - CDEBUG(D_INODE, "Shrink %d objects\n", nr); + CDEBUG(D_INODE, "Shrink %d objects\n", nr_to_scan); } cfs_down(&lu_sites_guard); cfs_list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) { - if (nr != 0) { + if (nr_to_scan != 0) { remain = lu_site_purge(&lu_shrink_env, s, remain); /* * Move just shrunk site to the tail of site list to @@ -1655,14 +1655,14 @@ static int lu_cache_shrink(int nr, unsigned int gfp_mask) memset(&stats, 0, sizeof(stats)); lu_site_stats_get(s->ls_obj_hash, &stats, 0); cached += stats.lss_total - stats.lss_busy; - if (nr && remain <= 0) + if (nr_to_scan && remain <= 0) break; } cfs_list_splice(&splice, lu_sites.prev); cfs_up(&lu_sites_guard); cached = (cached / 100) * sysctl_vfs_cache_pressure; - if (nr == 0) + if (nr_to_scan == 0) CDEBUG(D_INODE, "%d objects cached\n", cached); return cached; } diff --git a/lustre/ptlrpc/sec_bulk.c b/lustre/ptlrpc/sec_bulk.c index a9a9906..a42dacb 100644 --- a/lustre/ptlrpc/sec_bulk.c +++ b/lustre/ptlrpc/sec_bulk.c @@ -239,7 +239,7 @@ static void enc_pools_release_free_pages(long npages) * could be called frequently for query (@nr_to_scan == 0). * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. */ -static int enc_pools_shrink(int nr_to_scan, unsigned int gfp_mask) +static int KERN_SHRINKER(enc_pools_shrink) { if (unlikely(nr_to_scan != 0)) { cfs_spin_lock(&page_pools.epp_lock);