From: James Simmons Date: Wed, 13 Feb 2013 18:11:43 +0000 (-0500) Subject: LU-1812 ldiskfs: Add kernel patch series for SLES11SP2 X-Git-Tag: 2.3.62~18 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=99bde24123dba63df0d7846007714337534f7b01;hp=9af17f8c36e8bc04e61b7eb2df868d0fe4b6916c LU-1812 ldiskfs: Add kernel patch series for SLES11SP2 Add a kernel patch series for SLES11SP2. Signed-off-by: James Simmons - Signed-off-by: chas williams - CONTRACTOR Signed-off-by: Jeff Mahoney Change-Id: I5e3902eb90e69c9652cdfbfc0d8ea1d99d15b453 Reviewed-on: http://review.whamcloud.com/4972 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Andreas Dilger --- diff --git a/ldiskfs/config/ldiskfs-build.m4 b/ldiskfs/config/ldiskfs-build.m4 index 429bb8c..fd0f6d2 100644 --- a/ldiskfs/config/ldiskfs-build.m4 +++ b/ldiskfs/config/ldiskfs-build.m4 @@ -711,6 +711,11 @@ if $1; then LDISKFS_SERIES="2.6-sles11.series" fi ;; + 3.0.*) + if test x$SUSE_KERNEL = xyes; then + LDISKFS_SERIES="3.0-sles11.series" + fi + ;; *) AC_MSG_WARN([Unknown kernel version $LINUXRELEASE]) LDISKFS_SERIES= diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/export-ext4-3.0.patch b/ldiskfs/kernel_patches/patches/sles11sp2/export-ext4-3.0.patch new file mode 100644 index 0000000..74bf0a4 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/export-ext4-3.0.patch @@ -0,0 +1,200 @@ +--- + fs/ext4/balloc.c | 1 + + fs/ext4/ext4.h | 2 ++ + fs/ext4/ext4_extents.h | 9 +++++++++ + fs/ext4/ext4_jbd2.c | 2 ++ + fs/ext4/extents.c | 23 ++++++++++++++++------- + fs/ext4/ialloc.c | 3 ++- + fs/ext4/inode.c | 1 + + fs/ext4/mballoc.c | 4 ++++ + fs/ext4/super.c | 9 +++++++++ + 9 files changed, 46 insertions(+), 8 deletions(-) + +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -231,6 +231,7 @@ struct ext4_group_desc * ext4_get_group_ + *bh = sbi->s_group_desc[group_desc]; + return desc; + } ++EXPORT_SYMBOL(ext4_get_group_desc); + + static int ext4_valid_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1783,6 +1783,8 @@ extern struct inode * ext4_orphan_get(st + extern unsigned long ext4_count_free_inodes(struct super_block *); + extern unsigned long ext4_count_dirs(struct super_block *); + extern void ext4_check_inodes_bitmap(struct super_block *); ++extern struct buffer_head *ext4_read_inode_bitmap(struct super_block *sb, ++ ext4_group_t block_group); + extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); + extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); +--- a/fs/ext4/ext4_extents.h ++++ b/fs/ext4/ext4_extents.h +@@ -290,5 +290,14 @@ extern struct ext4_ext_path *ext4_ext_fi + struct ext4_ext_path *); + extern void ext4_ext_drop_refs(struct ext4_ext_path *); + extern int ext4_ext_check_inode(struct inode *inode); ++extern int ext4_ext_search_right(struct inode *inode, ++ struct ext4_ext_path *path, ++ ext4_lblk_t *logical, ext4_fsblk_t *phys); ++extern int ext4_ext_search_left(struct inode *inode, ++ struct ext4_ext_path *path, ++ ext4_lblk_t *logical, ext4_fsblk_t *phys); ++extern int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, ++ ext4_lblk_t num, ext_prepare_callback func, ++ void *cbdata); + #endif /* _EXT4_EXTENTS */ + +--- a/fs/ext4/ext4_jbd2.c ++++ b/fs/ext4/ext4_jbd2.c +@@ -19,6 +19,7 @@ int __ext4_journal_get_write_access(cons + } + return err; + } ++EXPORT_SYMBOL(__ext4_journal_get_write_access); + + /* + * The ext4 forget function must perform a revoke 
if we are freeing data +@@ -150,3 +151,4 @@ int __ext4_handle_dirty_super(const char + sb->s_dirt = 1; + return err; + } ++EXPORT_SYMBOL(__ext4_handle_dirty_metadata); +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -1236,9 +1236,9 @@ out: + * returns 0 at @phys + * return value contains 0 (success) or error code + */ +-static int ext4_ext_search_left(struct inode *inode, +- struct ext4_ext_path *path, +- ext4_lblk_t *logical, ext4_fsblk_t *phys) ++int ext4_ext_search_left(struct inode *inode, ++ struct ext4_ext_path *path, ++ ext4_lblk_t *logical, ext4_fsblk_t *phys) + { + struct ext4_extent_idx *ix; + struct ext4_extent *ex; +@@ -1301,9 +1301,9 @@ static int ext4_ext_search_left(struct i + * returns 0 at @phys + * return value contains 0 (success) or error code + */ +-static int ext4_ext_search_right(struct inode *inode, +- struct ext4_ext_path *path, +- ext4_lblk_t *logical, ext4_fsblk_t *phys) ++int ext4_ext_search_right(struct inode *inode, ++ struct ext4_ext_path *path, ++ ext4_lblk_t *logical, ext4_fsblk_t *phys) + { + struct buffer_head *bh = NULL; + struct ext4_extent_header *eh; +@@ -1878,7 +1878,7 @@ cleanup: + return err; + } + +-static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, ++extern int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, + ext4_lblk_t num, ext_prepare_callback func, + void *cbdata) + { +@@ -4397,3 +4397,12 @@ int ext4_fiemap(struct inode *inode, str + + return error; + } ++ ++EXPORT_SYMBOL(ext4_ext_search_right); ++EXPORT_SYMBOL(ext4_ext_search_left); ++EXPORT_SYMBOL(ext4_ext_insert_extent); ++EXPORT_SYMBOL(ext4_mb_new_blocks); ++EXPORT_SYMBOL(ext4_mark_inode_dirty); ++EXPORT_SYMBOL(ext4_ext_walk_space); ++EXPORT_SYMBOL(ext4_ext_find_extent); ++EXPORT_SYMBOL(ext4_ext_drop_refs); +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -98,7 +98,7 @@ static unsigned ext4_init_inode_bitmap(s + * + * Return buffer_head of bitmap on success or NULL. + */ +-static struct buffer_head * ++struct buffer_head * + ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) + { + struct ext4_group_desc *desc; +@@ -163,6 +163,7 @@ ext4_read_inode_bitmap(struct super_bloc + } + return bh; + } ++EXPORT_SYMBOL(ext4_read_inode_bitmap); + + /* + * NOTE! When we get the inode, we're the only people +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -5096,6 +5096,7 @@ bad_inode: + iget_failed(inode); + return ERR_PTR(ret); + } ++EXPORT_SYMBOL(ext4_iget); + + static int ext4_inode_blocks_set(handle_t *handle, + struct ext4_inode *raw_inode, +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -3852,6 +3852,7 @@ repeat: + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } + } ++EXPORT_SYMBOL(ext4_discard_preallocations); + + #ifdef CONFIG_EXT4_DEBUG + static void ext4_mb_show_ac(struct ext4_allocation_context *ac) +@@ -4972,3 +4973,6 @@ int ext4_trim_fs(struct super_block *sb, + + return ret; + } ++ ++EXPORT_SYMBOL(ext4_free_blocks); ++ +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -194,6 +194,7 @@ __u32 ext4_itable_unused_count(struct su + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 
+ (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); + } ++EXPORT_SYMBOL(ext4_itable_unused_count); + + void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +@@ -378,6 +379,7 @@ void ext4_journal_abort_handle(const cha + + jbd2_journal_abort_handle(handle); + } ++EXPORT_SYMBOL(ext4_journal_abort_handle); + + static void __save_error_info(struct super_block *sb, const char *func, + unsigned int line) +@@ -4272,6 +4274,7 @@ int ext4_force_commit(struct super_block + + return ret; + } ++EXPORT_SYMBOL(ext4_force_commit); + + static void ext4_write_super(struct super_block *sb) + { +@@ -5208,6 +5211,12 @@ static void __exit ext4_exit_fs(void) + ext4_exit_pageio(); + } + ++EXPORT_SYMBOL(ext4_xattr_get); ++EXPORT_SYMBOL(ext4_xattr_set_handle); ++EXPORT_SYMBOL(ext4_bread); ++EXPORT_SYMBOL(ext4_journal_start_sb); ++EXPORT_SYMBOL(__ext4_journal_stop); ++ + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Fourth Extended Filesystem"); + MODULE_LICENSE("GPL"); diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-add-missing-kfree-on-error-return-path-in-add_new_gdb.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-add-missing-kfree-on-error-return-path-in-add_new_gdb.patch new file mode 100644 index 0000000..60bc6fe --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-add-missing-kfree-on-error-return-path-in-add_new_gdb.patch @@ -0,0 +1,31 @@ +From c49bafa3842751b8955a962859f42d307673d75d Mon Sep 17 00:00:00 2001 +From: Dan Carpenter +Date: Sat, 30 Jul 2011 12:58:41 -0400 +Subject: ext4: add missing kfree() on error return path in add_new_gdb() +Git-commit: c49bafa3 +Patch-mainline: v3.1-rc1 + +We added some more error handling in b40971426a "ext4: add error +checking to calls to ext4_handle_dirty_metadata()". But we need to +call kfree() as well to avoid a memory leak. 
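The shape of the bug is the classic goto-unwind leak: memory allocated early in add_new_gdb() survives a later failure because the exit label does not free it. A minimal userspace sketch of the pattern — setup(), exit_inode, and the allocation here are illustrative stand-ins, not the kernel code:

#include <stdio.h>
#include <stdlib.h>

static int setup(void)
{
	char *n_group_desc = malloc(128);	/* early allocation */
	FILE *f;

	if (!n_group_desc)
		return -1;

	f = fopen("/nonexistent/path", "r");	/* a later step fails */
	if (!f)
		goto exit_inode;

	fclose(f);
	free(n_group_desc);
	return 0;

exit_inode:
	free(n_group_desc);	/* the one-line fix: without this free(),
				 * the early allocation leaks on this path */
	return -1;
}

int main(void)
{
	return setup() == 0 ? 0 : 1;
}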
+ +Upstream-Signed-off-by: Dan Carpenter +Upstream-Signed-off-by: "Theodore Ts'o" +Signed-off-by: Jeff Mahoney +--- + fs/ext4/resize.c | 1 + + 1 files changed, 1 insertions(+), 0 deletions(-) + +diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c +index 6e3327d..71085df 100644 +--- a/fs/ext4/resize.c ++++ b/fs/ext4/resize.c +@@ -517,6 +517,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, + return err; + + exit_inode: ++ kfree(n_group_desc); + /* ext4_handle_release_buffer(handle, iloc.bh); */ + brelse(iloc.bh); + exit_dindj: + diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-big-endian-check-3.0.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-big-endian-check-3.0.patch new file mode 100644 index 0000000..22c7eda --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-big-endian-check-3.0.patch @@ -0,0 +1,55 @@ +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -139,6 +139,8 @@ void ext4_kvfree(void *ptr) + + } + ++static int bigendian_extents; ++ + ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) + { +@@ -1354,7 +1356,7 @@ enum { + Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_dioread_nolock, Opt_dioread_lock, +- Opt_mballoc, ++ Opt_mballoc, Opt_bigendian_extents, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, + }; + +@@ -1429,6 +1431,7 @@ static const match_table_t tokens = { + {Opt_noauto_da_alloc, "noauto_da_alloc"}, + {Opt_dioread_nolock, "dioread_nolock"}, + {Opt_dioread_lock, "dioread_lock"}, ++ {Opt_bigendian_extents, "bigendian_extents"}, + {Opt_mballoc, "mballoc"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, +@@ -1903,6 +1906,9 @@ set_qf_format: + else + set_opt(sb,NO_AUTO_DA_ALLOC); + break; ++ case Opt_bigendian_extents: ++ bigendian_extents = 1; ++ break; + case Opt_discard: + set_opt(sb, DISCARD); + break; +@@ -3497,6 +3503,16 @@ static int ext4_fill_super(struct super_ + goto failed_mount; + } + ++#ifdef __BIG_ENDIAN ++ if (bigendian_extents == 0) { ++ printk(KERN_ERR "EXT4-fs: extents feature is not guaranteed to " ++ "work on big-endian systems. 
Use \"bigendian_extents\" " ++ "mount option to override.\n"); ++ goto failed_mount; ++ } ++#endif ++ ++ + #ifdef CONFIG_PROC_FS + if (ext4_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-disable-mb-cache.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-disable-mb-cache.patch new file mode 100644 index 0000000..408d2b7 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-disable-mb-cache.patch @@ -0,0 +1,151 @@ +diff -ru linux-stage.orig/fs/ext4/ext4.h linux-stage/fs/ext4/ext4.h +--- linux-stage.orig/fs/ext4/ext4.h 2012-12-31 15:58:19.000000000 -0500 ++++ linux-stage/fs/ext4/ext4.h 2012-12-31 15:58:42.000000000 -0500 +@@ -894,7 +894,8 @@ + /* + * Mount flags + */ +-#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ ++#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Disable mbcache */ ++#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ + #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ + #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ + #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +diff -ru linux-stage.orig/fs/ext4/super.c linux-stage/fs/ext4/super.c +--- linux-stage.orig/fs/ext4/super.c 2012-12-31 15:58:19.000000000 -0500 ++++ linux-stage/fs/ext4/super.c 2012-12-31 15:59:19.000000000 -0500 +@@ -1306,6 +1306,7 @@ + Opt_dioread_nolock, Opt_dioread_lock, + Opt_mballoc, Opt_bigendian_extents, Opt_force_over_128tb, + Opt_extents, Opt_noextents, ++ Opt_no_mbcache, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, + }; + +@@ -1383,6 +1384,7 @@ + {Opt_bigendian_extents, "bigendian_extents"}, + {Opt_force_over_128tb, "force_over_128tb"}, + {Opt_mballoc, "mballoc"}, ++ {Opt_no_mbcache, "no_mbcache"}, + {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, + {Opt_discard, "discard"}, +@@ -1919,6 +1921,9 @@ + } + clear_opt(sb, EXTENTS); + break; ++ case Opt_no_mbcache: ++ set_opt(sb, NO_MBCACHE); ++ break; + default: + ext4_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " +diff -ru linux-stage.orig/fs/ext4/xattr.c linux-stage/fs/ext4/xattr.c +--- linux-stage.orig/fs/ext4/xattr.c 2012-12-31 15:58:19.000000000 -0500 ++++ linux-stage/fs/ext4/xattr.c 2012-12-31 16:46:21.000000000 -0500 +@@ -86,7 +86,8 @@ + # define ea_bdebug(f...) 
+ #endif + +-static void ext4_xattr_cache_insert(struct buffer_head *); ++static void ext4_xattr_cache_insert(struct super_block *, ++ struct buffer_head *); + static struct buffer_head *ext4_xattr_cache_find(struct inode *, + struct ext4_xattr_header *, + struct mb_cache_entry **); +@@ -332,7 +333,7 @@ + error = -EIO; + goto cleanup; + } +- ext4_xattr_cache_insert(bh); ++ ext4_xattr_cache_insert(inode->i_sb, bh); + entry = BFIRST(bh); + error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1, + inode); +@@ -492,7 +493,7 @@ + error = -EIO; + goto cleanup; + } +- ext4_xattr_cache_insert(bh); ++ ext4_xattr_cache_insert(inode->i_sb, bh); + error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); + + cleanup: +@@ -589,7 +590,9 @@ + struct mb_cache_entry *ce = NULL; + int error = 0; + +- ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); ++ if (!test_opt(inode->i_sb, NO_MBCACHE)) ++ ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, ++ bh->b_blocknr); + error = ext4_journal_get_write_access(handle, bh); + if (error) + goto out; +@@ -989,8 +992,10 @@ + #define header(x) ((struct ext4_xattr_header *)(x)) + + if (s->base) { +- ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, +- bs->bh->b_blocknr); ++ if (!test_opt(inode->i_sb, NO_MBCACHE)) ++ ce = mb_cache_entry_get(ext4_xattr_cache, ++ bs->bh->b_bdev, ++ bs->bh->b_blocknr); + error = ext4_journal_get_write_access(handle, bs->bh); + if (error) + goto cleanup; +@@ -1007,7 +1012,7 @@ + if (!IS_LAST_ENTRY(s->first)) + ext4_xattr_rehash(header(s->base), + s->here); +- ext4_xattr_cache_insert(bs->bh); ++ ext4_xattr_cache_insert(sb, bs->bh); + } + unlock_buffer(bs->bh); + if (error == -EIO) +@@ -1090,7 +1095,8 @@ + if (error) + goto cleanup_dquot; + } +- mb_cache_entry_release(ce); ++ if (ce) ++ mb_cache_entry_release(ce); + ce = NULL; + } else if (bs->bh && s->base == bs->bh->b_data) { + /* We were modifying this block in-place. */ +@@ -1141,7 +1147,7 @@ + memcpy(new_bh->b_data, s->base, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); +- ext4_xattr_cache_insert(new_bh); ++ ext4_xattr_cache_insert(sb, new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, new_bh); + if (error) +@@ -1761,12 +1767,15 @@ + * Returns 0, or a negative error number on failure. 
+ */ + static void +-ext4_xattr_cache_insert(struct buffer_head *bh) ++ext4_xattr_cache_insert(struct super_block *sb, struct buffer_head *bh) + { + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + struct mb_cache_entry *ce; + int error; + ++ if (test_opt(sb, NO_MBCACHE)) ++ return; ++ + ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); + if (!ce) { + ea_bdebug(bh, "out of memory"); +@@ -1839,6 +1848,8 @@ + __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *ce; + ++ if (test_opt(inode->i_sb, NO_MBCACHE)) ++ return NULL; + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-dynlocks-common.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-dynlocks-common.patch new file mode 100644 index 0000000..db04c62 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-dynlocks-common.patch @@ -0,0 +1,370 @@ +--- /dev/null ++++ b/fs/ext4/dynlocks.c +@@ -0,0 +1,236 @@ ++/* ++ * Dynamic Locks ++ * ++ * struct dynlock is lockspace ++ * one may request lock (exclusive or shared) for some value ++ * in that lockspace ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#define DYNLOCK_HANDLE_MAGIC 0xd19a10c ++#define DYNLOCK_HANDLE_DEAD 0xd1956ee ++#define DYNLOCK_LIST_MAGIC 0x11ee91e6 ++ ++static struct kmem_cache * dynlock_cachep = NULL; ++ ++struct dynlock_handle { ++ unsigned dh_magic; ++ struct list_head dh_list; ++ unsigned long dh_value; /* lock value */ ++ int dh_refcount; /* number of users */ ++ int dh_readers; ++ int dh_writers; ++ int dh_pid; /* holder of the lock */ ++ wait_queue_head_t dh_wait; ++}; ++ ++int __init dynlock_cache_init(void) ++{ ++ int rc = 0; ++ ++ /* printk(KERN_INFO "init dynlocks cache\n"); */ ++ dynlock_cachep = kmem_cache_create("dynlock_cache", ++ sizeof(struct dynlock_handle), ++ 0, ++ SLAB_HWCACHE_ALIGN, ++ NULL); ++ if (dynlock_cachep == NULL) { ++ printk(KERN_ERR "Not able to create dynlock cache"); ++ rc = -ENOMEM; ++ } ++ return rc; ++} ++ ++void dynlock_cache_exit(void) ++{ ++ /* printk(KERN_INFO "exit dynlocks cache\n"); */ ++ kmem_cache_destroy(dynlock_cachep); ++} ++ ++/* ++ * dynlock_init ++ * ++ * initialize lockspace ++ * ++ */ ++void dynlock_init(struct dynlock *dl) ++{ ++ spin_lock_init(&dl->dl_list_lock); ++ INIT_LIST_HEAD(&dl->dl_list); ++ dl->dl_magic = DYNLOCK_LIST_MAGIC; ++} ++EXPORT_SYMBOL(dynlock_init); ++ ++/* ++ * dynlock_lock ++ * ++ * acquires lock (exclusive or shared) in specified lockspace ++ * each lock in lockspace is allocated separately, so user have ++ * to specify GFP flags. ++ * routine returns pointer to lock. 
this pointer is intended to ++ * be passed to dynlock_unlock ++ * ++ */ ++struct dynlock_handle *dynlock_lock(struct dynlock *dl, unsigned long value, ++ enum dynlock_type lt, gfp_t gfp) ++{ ++ struct dynlock_handle *nhl = NULL; ++ struct dynlock_handle *hl; ++ ++ BUG_ON(dl == NULL); ++ BUG_ON(dl->dl_magic != DYNLOCK_LIST_MAGIC); ++ ++repeat: ++ /* find requested lock in lockspace */ ++ spin_lock(&dl->dl_list_lock); ++ BUG_ON(dl->dl_list.next == NULL); ++ BUG_ON(dl->dl_list.prev == NULL); ++ list_for_each_entry(hl, &dl->dl_list, dh_list) { ++ BUG_ON(hl->dh_list.next == NULL); ++ BUG_ON(hl->dh_list.prev == NULL); ++ BUG_ON(hl->dh_magic != DYNLOCK_HANDLE_MAGIC); ++ if (hl->dh_value == value) { ++ /* lock is found */ ++ if (nhl) { ++ /* someone else just allocated ++ * lock we didn't find and just created ++ * so, we drop our lock ++ */ ++ kmem_cache_free(dynlock_cachep, nhl); ++ nhl = NULL; ++ } ++ hl->dh_refcount++; ++ goto found; ++ } ++ } ++ /* lock not found */ ++ if (nhl) { ++ /* we already have allocated lock. use it */ ++ hl = nhl; ++ nhl = NULL; ++ list_add(&hl->dh_list, &dl->dl_list); ++ goto found; ++ } ++ spin_unlock(&dl->dl_list_lock); ++ ++ /* lock not found and we haven't allocated lock yet. allocate it */ ++ nhl = kmem_cache_alloc(dynlock_cachep, gfp); ++ if (nhl == NULL) ++ return NULL; ++ nhl->dh_refcount = 1; ++ nhl->dh_value = value; ++ nhl->dh_readers = 0; ++ nhl->dh_writers = 0; ++ nhl->dh_magic = DYNLOCK_HANDLE_MAGIC; ++ init_waitqueue_head(&nhl->dh_wait); ++ ++ /* while lock is being allocated, someone else may allocate it ++ * and put onto to list. check this situation ++ */ ++ goto repeat; ++ ++found: ++ if (lt == DLT_WRITE) { ++ /* exclusive lock: user don't want to share lock at all ++ * NOTE: one process may take the same lock several times ++ * this functionaly is useful for rename operations */ ++ while ((hl->dh_writers && hl->dh_pid != current->pid) || ++ hl->dh_readers) { ++ spin_unlock(&dl->dl_list_lock); ++ wait_event(hl->dh_wait, ++ hl->dh_writers == 0 && hl->dh_readers == 0); ++ spin_lock(&dl->dl_list_lock); ++ } ++ hl->dh_writers++; ++ } else { ++ /* shared lock: user do not want to share lock with writer */ ++ while (hl->dh_writers) { ++ spin_unlock(&dl->dl_list_lock); ++ wait_event(hl->dh_wait, hl->dh_writers == 0); ++ spin_lock(&dl->dl_list_lock); ++ } ++ hl->dh_readers++; ++ } ++ hl->dh_pid = current->pid; ++ spin_unlock(&dl->dl_list_lock); ++ ++ return hl; ++} ++EXPORT_SYMBOL(dynlock_lock); ++ ++ ++/* ++ * dynlock_unlock ++ * ++ * user have to specify lockspace (dl) and pointer to lock structure ++ * returned by dynlock_lock() ++ * ++ */ ++void dynlock_unlock(struct dynlock *dl, struct dynlock_handle *hl) ++{ ++ int wakeup = 0; ++ ++ BUG_ON(dl == NULL); ++ BUG_ON(hl == NULL); ++ BUG_ON(dl->dl_magic != DYNLOCK_LIST_MAGIC); ++ ++ if (hl->dh_magic != DYNLOCK_HANDLE_MAGIC) ++ printk(KERN_EMERG "wrong lock magic: %#x\n", hl->dh_magic); ++ ++ BUG_ON(hl->dh_magic != DYNLOCK_HANDLE_MAGIC); ++ BUG_ON(hl->dh_writers != 0 && current->pid != hl->dh_pid); ++ ++ spin_lock(&dl->dl_list_lock); ++ if (hl->dh_writers) { ++ BUG_ON(hl->dh_readers != 0); ++ hl->dh_writers--; ++ if (hl->dh_writers == 0) ++ wakeup = 1; ++ } else if (hl->dh_readers) { ++ hl->dh_readers--; ++ if (hl->dh_readers == 0) ++ wakeup = 1; ++ } else { ++ BUG(); ++ } ++ if (wakeup) { ++ hl->dh_pid = 0; ++ wake_up(&hl->dh_wait); ++ } ++ if (--(hl->dh_refcount) == 0) { ++ hl->dh_magic = DYNLOCK_HANDLE_DEAD; ++ list_del(&hl->dh_list); ++ kmem_cache_free(dynlock_cachep, hl); ++ } ++ 
spin_unlock(&dl->dl_list_lock); ++} ++EXPORT_SYMBOL(dynlock_unlock); ++ ++int dynlock_is_locked(struct dynlock *dl, unsigned long value) ++{ ++ struct dynlock_handle *hl; ++ int result = 0; ++ ++ /* find requested lock in lockspace */ ++ spin_lock(&dl->dl_list_lock); ++ BUG_ON(dl->dl_list.next == NULL); ++ BUG_ON(dl->dl_list.prev == NULL); ++ list_for_each_entry(hl, &dl->dl_list, dh_list) { ++ BUG_ON(hl->dh_list.next == NULL); ++ BUG_ON(hl->dh_list.prev == NULL); ++ BUG_ON(hl->dh_magic != DYNLOCK_HANDLE_MAGIC); ++ if (hl->dh_value == value && hl->dh_pid == current->pid) { ++ /* lock is found */ ++ result = 1; ++ break; ++ } ++ } ++ spin_unlock(&dl->dl_list_lock); ++ return result; ++} ++EXPORT_SYMBOL(dynlock_is_locked); +--- /dev/null ++++ b/include/linux/dynlocks.h +@@ -0,0 +1,34 @@ ++#ifndef _LINUX_DYNLOCKS_H ++#define _LINUX_DYNLOCKS_H ++ ++#include ++#include ++ ++struct dynlock_handle; ++ ++/* ++ * lock's namespace: ++ * - list of locks ++ * - lock to protect this list ++ */ ++struct dynlock { ++ unsigned dl_magic; ++ struct list_head dl_list; ++ spinlock_t dl_list_lock; ++}; ++ ++enum dynlock_type { ++ DLT_WRITE, ++ DLT_READ ++}; ++ ++int dynlock_cache_init(void); ++void dynlock_cache_exit(void); ++void dynlock_init(struct dynlock *dl); ++struct dynlock_handle *dynlock_lock(struct dynlock *dl, unsigned long value, ++ enum dynlock_type lt, gfp_t gfp); ++void dynlock_unlock(struct dynlock *dl, struct dynlock_handle *lock); ++int dynlock_is_locked(struct dynlock *dl, unsigned long value); ++ ++#endif ++ +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -5178,30 +5178,33 @@ static int __init ext4_init_fs(void) + return err; + err = ext4_init_system_zone(); + if (err) +- goto out7; ++ goto out8; + ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); + if (!ext4_kset) +- goto out6; ++ goto out7; + ext4_proc_root = proc_mkdir("fs/ext4", NULL); + if (!ext4_proc_root) +- goto out5; ++ goto out6; + + err = ext4_init_feat_adverts(); + if (err) +- goto out4; ++ goto out5; + + err = ext4_init_mballoc(); + if (err) +- goto out3; ++ goto out4; + + err = ext4_init_xattr(); + if (err) +- goto out2; ++ goto out3; + err = init_inodecache(); + if (err) +- goto out1; ++ goto out2; + register_as_ext3(); + register_as_ext2(); ++ err = dynlock_cache_init(); ++ if (err) ++ goto out1; + err = register_filesystem(&ext4_fs_type); + if (err) + goto out; +@@ -5210,22 +5213,24 @@ static int __init ext4_init_fs(void) + mutex_init(&ext4_li_mtx); + return 0; + out: ++ dynlock_cache_exit(); ++out1: + unregister_as_ext2(); + unregister_as_ext3(); + destroy_inodecache(); +-out1: +- ext4_exit_xattr(); + out2: +- ext4_exit_mballoc(); ++ ext4_exit_xattr(); + out3: +- ext4_exit_feat_adverts(); ++ ext4_exit_mballoc(); + out4: +- remove_proc_entry("fs/ext4", NULL); ++ ext4_exit_feat_adverts(); + out5: +- kset_unregister(ext4_kset); ++ remove_proc_entry("fs/ext4", NULL); + out6: +- ext4_exit_system_zone(); ++ kset_unregister(ext4_kset); + out7: ++ ext4_exit_system_zone(); ++out8: + ext4_exit_pageio(); + return err; + } +@@ -5236,6 +5241,7 @@ static void __exit ext4_exit_fs(void) + unregister_as_ext2(); + unregister_as_ext3(); + unregister_filesystem(&ext4_fs_type); ++ dynlock_cache_exit(); + destroy_inodecache(); + ext4_exit_xattr(); + ext4_exit_mballoc(); +--- a/fs/ext4/Makefile ++++ b/fs/ext4/Makefile +@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o + ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o 
mballoc.o block_validity.o move_extent.o \ +- mmp.o ++ mmp.o dynlocks.o + + ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-ext_generation.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-ext_generation.patch new file mode 100644 index 0000000..10fc3c7 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-ext_generation.patch @@ -0,0 +1,48 @@ +--- + fs/ext4/ext4.h | 1 + + fs/ext4/ext4_extents.h | 5 +++++ + fs/ext4/extents.c | 2 ++ + 3 files changed, 8 insertions(+) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -821,6 +821,7 @@ struct ext4_inode_info { + struct inode vfs_inode; + struct jbd2_inode *jinode; + ++ unsigned long i_ext_generation; + struct ext4_ext_cache i_cached_extent; + /* + * File creation time. Its function is same as that of +--- a/fs/ext4/ext4_extents.h ++++ b/fs/ext4/ext4_extents.h +@@ -193,6 +193,11 @@ static inline unsigned short ext_depth(s + return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); + } + ++static inline void ext4_ext_tree_changed(struct inode *inode) ++{ ++ EXT4_I(inode)->i_ext_generation++; ++} ++ + static inline void + ext4_ext_invalidate_cache(struct inode *inode) + { +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -1874,6 +1874,7 @@ cleanup: + ext4_ext_drop_refs(npath); + kfree(npath); + } ++ ext4_ext_tree_changed(inode); + ext4_ext_invalidate_cache(inode); + return err; + } +@@ -2681,6 +2682,7 @@ again: + } + } + out: ++ ext4_ext_tree_changed(inode); + ext4_ext_drop_refs(path); + kfree(path); + if (err == -EAGAIN) diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-extents-mount-option.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-extents-mount-option.patch new file mode 100644 index 0000000..5174912 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-extents-mount-option.patch @@ -0,0 +1,162 @@ +diff -u -r linux-stage.orig/fs/ext4/ext4.h linux-stage/fs/ext4/ext4.h +--- linux-stage.orig/fs/ext4/ext4.h 2012-12-31 15:07:27.000000000 -0500 ++++ linux-stage/fs/ext4/ext4.h 2012-12-31 15:14:03.000000000 -0500 +@@ -912,6 +912,7 @@ + #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ + #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ + #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ ++#define EXT4_MOUNT_EXTENTS 0x40000 /* Extents support */ + #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ + #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ + #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +diff -u -r linux-stage.orig/fs/ext4/ext4_jbd2.h linux-stage/fs/ext4/ext4_jbd2.h +--- linux-stage.orig/fs/ext4/ext4_jbd2.h 2012-12-31 15:07:27.000000000 -0500 ++++ linux-stage/fs/ext4/ext4_jbd2.h 2012-12-31 15:07:34.000000000 -0500 +@@ -33,7 +33,7 @@ + + #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ +- ? 27U : 8U) ++ || test_opt(sb, EXTENTS) ? 
27U : 8U) + + #define ext4_journal_dirty_metadata(handle, bh) \ + ext4_handle_dirty_metadata(handle, NULL, bh) +diff -u -r linux-stage.orig/fs/ext4/extents.c linux-stage/fs/ext4/extents.c +--- linux-stage.orig/fs/ext4/extents.c 2012-12-31 15:07:27.000000000 -0500 ++++ linux-stage/fs/ext4/extents.c 2012-12-31 15:07:34.000000000 -0500 +@@ -2733,7 +2733,7 @@ + * possible initialization would be here + */ + +- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { ++ if (test_opt(sb, EXTENTS)) { + #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) + printk(KERN_INFO "EXT4-fs: file extents enabled"); + #ifdef AGGRESSIVE_TEST +@@ -2760,7 +2760,7 @@ + */ + void ext4_ext_release(struct super_block *sb) + { +- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) ++ if (!test_opt(sb, EXTENTS)) + return; + + #ifdef EXTENTS_STATS +diff -u -r linux-stage.orig/fs/ext4/ialloc.c linux-stage/fs/ext4/ialloc.c +--- linux-stage.orig/fs/ext4/ialloc.c 2012-12-31 15:07:27.000000000 -0500 ++++ linux-stage/fs/ext4/ialloc.c 2012-12-31 15:07:34.000000000 -0500 +@@ -1057,7 +1057,7 @@ + if (err) + goto fail_free_drop; + +- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { ++ if (test_opt(sb, EXTENTS)) { + /* set extent flag only for directory, file and normal symlink*/ + if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); +diff -u -r linux-stage.orig/fs/ext4/migrate.c linux-stage/fs/ext4/migrate.c +--- linux-stage.orig/fs/ext4/migrate.c 2012-12-31 15:07:27.000000000 -0500 ++++ linux-stage/fs/ext4/migrate.c 2012-12-31 15:07:34.000000000 -0500 +@@ -469,13 +469,10 @@ + unsigned long max_entries; + __u32 goal; + +- /* +- * If the filesystem does not support extents, or the inode +- * already is extent-based, error out. 
+- */ +- if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, +- EXT4_FEATURE_INCOMPAT_EXTENTS) || +- (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) ++ if (!test_opt(inode->i_sb, EXTENTS)) ++ /* ++ * if mounted with noextents we don't allow the migrate ++ */ + return -EINVAL; + + if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) +diff -u -r linux-stage.orig/fs/ext4/super.c linux-stage/fs/ext4/super.c +--- linux-stage.orig/fs/ext4/super.c 2012-12-31 15:07:27.000000000 -0500 ++++ linux-stage/fs/ext4/super.c 2012-12-31 15:15:49.000000000 -0500 +@@ -1078,6 +1078,8 @@ + seq_puts(seq, ",journal_async_commit"); + else if (test_opt(sb, JOURNAL_CHECKSUM)) + seq_puts(seq, ",journal_checksum"); ++ if (!test_opt(sb, EXTENTS)) ++ seq_puts(seq, ",noextents"); + if (test_opt(sb, I_VERSION)) + seq_puts(seq, ",i_version"); + if (!test_opt(sb, DELALLOC) && +@@ -1303,6 +1305,7 @@ + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_dioread_nolock, Opt_dioread_lock, + Opt_mballoc, Opt_bigendian_extents, Opt_force_over_128tb, ++ Opt_extents, Opt_noextents, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, + }; + +@@ -1380,6 +1383,8 @@ + {Opt_bigendian_extents, "bigendian_extents"}, + {Opt_force_over_128tb, "force_over_128tb"}, + {Opt_mballoc, "mballoc"}, ++ {Opt_extents, "extents"}, ++ {Opt_noextents, "noextents"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_init_itable, "init_itable=%u"}, +@@ -1472,6 +1477,7 @@ + return 1; + } + #endif ++ ext4_fsblk_t last_block; + + static int parse_options(char *options, struct super_block *sb, + unsigned long *journal_devnum, +@@ -1887,6 +1893,32 @@ + case Opt_force_over_128tb: + force_over_128tb = 1; + break; ++ case Opt_extents: ++ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, ++ EXT4_FEATURE_INCOMPAT_EXTENTS)) { ++ ext4_warning(sb, "extents feature not enabled " ++ "on this filesystem, use tune2fs"); ++ return 0; ++ } ++ set_opt(sb, EXTENTS); ++ break; ++ case Opt_noextents: ++ /* ++ * When e2fsprogs support resizing an already existing ++ * ext4 file system to greater than 2**32 we need to ++ * add support to block allocator to handle growing ++ * already existing block mapped inode so that blocks ++ * allocated for them fall within 2**32 ++ */ ++ last_block = ext4_blocks_count(sbi->s_es) - 1; ++ if (last_block > 0xffffffffULL) { ++ printk(KERN_ERR "EXT4-fs: Filesystem too " ++ "large to mount with " ++ "-o noextents options\n"); ++ return 0; ++ } ++ clear_opt(sb, EXTENTS); ++ break; + default: + ext4_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " +@@ -3206,6 +3238,14 @@ + set_opt(sb, BARRIER); + + /* ++ * turn on extents feature by default in ext4 filesystem ++ * only if feature flag already set by mkfs or tune2fs. 
++ * Use -o noextents to turn it off ++ */ ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) ++ set_opt(sb, EXTENTS); ++ ++ /* + * enable delayed allocation by default + * Use -o nodelalloc to turn it off + */ diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-force_over_128tb.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-force_over_128tb.patch new file mode 100644 index 0000000..5bfaa38 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-force_over_128tb.patch @@ -0,0 +1,56 @@ +diff -r -u linux-stage.orig/fs/ext4/super.c linux-stage/fs/ext4/super.c +--- linux-stage.orig/fs/ext4/super.c 2012-12-31 12:55:18.000000000 -0500 ++++ linux-stage/fs/ext4/super.c 2012-12-31 12:56:14.000000000 -0500 +@@ -59,6 +59,8 @@ + static struct mutex ext4_li_mtx; + static struct ext4_features *ext4_feat; + ++static int force_over_128tb; ++ + static int ext4_load_journal(struct super_block *, struct ext4_super_block *, + unsigned long journal_devnum); + static int ext4_commit_super(struct super_block *sb, int sync); +@@ -1298,7 +1300,7 @@ + Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_dioread_nolock, Opt_dioread_lock, +- Opt_mballoc, Opt_bigendian_extents, ++ Opt_mballoc, Opt_bigendian_extents, Opt_force_over_128tb, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, + }; + +@@ -1374,6 +1376,7 @@ + {Opt_dioread_nolock, "dioread_nolock"}, + {Opt_dioread_lock, "dioread_lock"}, + {Opt_bigendian_extents, "bigendian_extents"}, ++ {Opt_force_over_128tb, "force_over_128tb"}, + {Opt_mballoc, "mballoc"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, +@@ -1879,6 +1882,9 @@ + break; + case Opt_mballoc: + break; ++ case Opt_force_over_128tb: ++ force_over_128tb = 1; ++ break; + default: + ext4_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " +@@ -3394,6 +3400,16 @@ + goto failed_mount; + } + ++ if (ext4_blocks_count(es) > (8ULL << 32)) { ++ if (force_over_128tb == 0) { ++ printk(KERN_ERR "EXT4-fs does not support filesystems " ++ "greater than 128TB and can cause data corruption." ++ "Use \"force_over_128tb\" mount option to override." ++ "\n"); ++ goto failed_mount; ++ } ++ } ++ + if (EXT4_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext4; + diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-free-resources-in-some-error-path-in-ext4_fill_super.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-free-resources-in-some-error-path-in-ext4_fill_super.patch new file mode 100644 index 0000000..5b66631 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-free-resources-in-some-error-path-in-ext4_fill_super.patch @@ -0,0 +1,71 @@ +From dcf2d804ed6ffe5e942b909ed5e5b74628be6ee4 Mon Sep 17 00:00:00 2001 +From: Tao Ma +Date: Thu, 6 Oct 2011 12:10:11 -0400 +Subject: ext4: Free resources in some error path in ext4_fill_super +Git-commit: dcf2d804 +Patch-mainline: v3.2-rc1 + +Some of the error path in ext4_fill_super don't release the +resouces properly. So this patch just try to release them +in the right way. 
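The mechanics are easier to see in miniature: ext4_fill_super() acquires resources top-down, so each failed_mountN label must release exactly what was acquired before the failing step and then fall through to the earlier labels, unwinding in reverse order. A minimal userspace sketch of that idiom, with illustrative acquire()/release() helpers and label names rather than the ext4 ones:

#include <stdlib.h>

struct res { int dummy; };

static struct res *acquire(void) { return malloc(sizeof(struct res)); }
static void release(struct res *r) { free(r); }

static int fill_super_sketch(void)
{
	struct res *a, *b, *c;

	a = acquire();
	if (!a)
		goto fail;
	b = acquire();
	if (!b)
		goto fail_a;
	c = acquire();
	if (!c)
		goto fail_b;

	/* success; the real function keeps the resources held */
	release(c);
	release(b);
	release(a);
	return 0;

fail_b:
	release(b);	/* each label frees one step, then falls through */
fail_a:
	release(a);
fail:
	return -1;
}

int main(void)
{
	return fill_super_sketch() ? 1 : 0;
}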
+ +Upstream-Signed-off-by: Tao Ma +Upstream-Signed-off-by: "Theodore Ts'o" +Signed-off-by: Jeff Mahoney +--- + fs/ext4/super.c | 19 +++++++++++-------- + 1 file changed, 11 insertions(+), 8 deletions(-) + +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 07f3de3..db2cd3f 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -3785,22 +3785,19 @@ no_journal: + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); +- goto failed_mount4; ++ goto failed_mount5; + } + + err = ext4_register_li_request(sb, first_not_zeroed); + if (err) +- goto failed_mount4; ++ goto failed_mount6; + + sbi->s_kobj.kset = ext4_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, + "%s", sb->s_id); +- if (err) { +- ext4_mb_release(sb); +- ext4_ext_release(sb); +- goto failed_mount4; +- }; ++ if (err) ++ goto failed_mount7; + + EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; + ext4_orphan_cleanup(sb, es); +@@ -3834,13 +3831,19 @@ cantfind_ext4: + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); + goto failed_mount; + ++failed_mount7: ++ ext4_unregister_li_request(sb); ++failed_mount6: ++ ext4_ext_release(sb); ++failed_mount5: ++ ext4_mb_release(sb); ++ ext4_release_system_zone(sb); + failed_mount4: + iput(root); + sb->s_root = NULL; + ext4_msg(sb, KERN_ERR, "mount failed"); + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); + failed_mount_wq: +- ext4_release_system_zone(sb); + if (sbi->s_journal) { + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-handle-cleanup-after-quota-failure.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-handle-cleanup-after-quota-failure.patch new file mode 100644 index 0000000..4860d87 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-handle-cleanup-after-quota-failure.patch @@ -0,0 +1,34 @@ +From: Jeff Mahoney +Subject: ext4: cleanup sbi->s_kobj after quota initialization failure + +If ext4_enable_quotas fails, it jumps to failed_mount7, which doesn't +tear down the kobj. If the user tries to mount the file system again, +they'll get big scary WARN_ONs from sysfs. 
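The subtle step in the teardown below is the completion: kobject_put() only triggers the release callback once the last reference drops, so the error path has to wait on s_kobj_unregister before the structure embedding the kobject can be reused or freed. A self-contained userspace analogue of that handshake (the refcount, release flag, and function names are all illustrative):

#include <pthread.h>
#include <stdio.h>

/* stands in for the embedded kobject plus the s_kobj_unregister completion */
static struct {
	int refcount;
	int release_done;
	pthread_mutex_t lock;
	pthread_cond_t released;
} kobj = { 1, 0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER };

static void kobj_put(void)		/* kobject_put() analogue */
{
	pthread_mutex_lock(&kobj.lock);
	if (--kobj.refcount == 0) {
		/* the release callback would run here, then complete() */
		kobj.release_done = 1;
		pthread_cond_signal(&kobj.released);
	}
	pthread_mutex_unlock(&kobj.lock);
}

static void wait_for_release(void)	/* wait_for_completion() analogue */
{
	pthread_mutex_lock(&kobj.lock);
	while (!kobj.release_done)
		pthread_cond_wait(&kobj.released, &kobj.lock);
	pthread_mutex_unlock(&kobj.lock);
}

int main(void)
{
	kobj_put();		/* drop the last reference */
	wait_for_release();	/* only now is teardown safe to continue */
	printf("released\n");
	return 0;
}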
+ +Signed-off-by: Jeff Mahoney +--- + fs/ext4/super.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -3794,7 +3794,7 @@ no_journal: + !(sb->s_flags & MS_RDONLY)) { + ret = ext4_enable_quotas(sb); + if (ret) +- goto failed_mount7; ++ goto failed_mount8; + } + #endif /* CONFIG_QUOTA */ + +@@ -3813,6 +3813,10 @@ cantfind_ext4: + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); + goto failed_mount; + ++failed_mount8: ++ kobject_del(&sbi->s_kobj); ++ kobject_put(&sbi->s_kobj); ++ wait_for_completion(&sbi->s_kobj_unregister); + failed_mount7: + ext4_unregister_li_request(sb); + failed_mount6: diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-introduce-ext4_kvmalloc-ext4_kzalloc-and-ext4_kvfree.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-introduce-ext4_kvmalloc-ext4_kzalloc-and-ext4_kvfree.patch new file mode 100644 index 0000000..3da7dac --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-introduce-ext4_kvmalloc-ext4_kzalloc-and-ext4_kvfree.patch @@ -0,0 +1,120 @@ +From 9933fc0ac1ac14b795819cd63d05ea92112f690a Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Mon, 1 Aug 2011 08:45:02 -0400 +Subject: ext4: introduce ext4_kvmalloc(), ext4_kzalloc(), and ext4_kvfree() +Git-commit: 9933fc0a +Patch-mainline: v3.1-rc1 + +Introduce new helper functions which try kmalloc, and then fall back +to vmalloc if necessary, and use them for allocating and deallocating +s_flex_groups. + +Upstream-Signed-off-by: "Theodore Ts'o" +Signed-off-by: Jeff Mahoney +--- + fs/ext4/ext4.h | 3 +++ + fs/ext4/super.c | 54 ++++++++++++++++++++++++++++++++++++------------------ + 2 files changed, 39 insertions(+), 18 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index ba2009b..db9fead 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1874,6 +1874,9 @@ extern int ext4_group_extend(struct super_block *sb, + ext4_fsblk_t n_blocks_count); + + /* super.c */ ++extern void *ext4_kvmalloc(size_t size, gfp_t flags); ++extern void *ext4_kvzalloc(size_t size, gfp_t flags); ++extern void ext4_kvfree(void *ptr); + extern void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...) 
+ __attribute__ ((format (printf, 4, 5))); +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index cfe9f39..658f586 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -110,6 +110,35 @@ static struct file_system_type ext3_fs_type = { + #define IS_EXT3_SB(sb) (0) + #endif + ++void *ext4_kvmalloc(size_t size, gfp_t flags) ++{ ++ void *ret; ++ ++ ret = kmalloc(size, flags); ++ if (!ret) ++ ret = __vmalloc(size, flags, PAGE_KERNEL); ++ return ret; ++} ++ ++void *ext4_kvzalloc(size_t size, gfp_t flags) ++{ ++ void *ret; ++ ++ ret = kmalloc(size, flags); ++ if (!ret) ++ ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); ++ return ret; ++} ++ ++void ext4_kvfree(void *ptr) ++{ ++ if (is_vmalloc_addr(ptr)) ++ vfree(ptr); ++ else ++ kfree(ptr); ++ ++} ++ + ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) + { +@@ -791,10 +820,7 @@ static void ext4_put_super(struct super_block *sb) + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); +- if (is_vmalloc_addr(sbi->s_flex_groups)) +- vfree(sbi->s_flex_groups); +- else +- kfree(sbi->s_flex_groups); ++ ext4_kvfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); +@@ -1977,15 +2003,11 @@ static int ext4_fill_flex_info(struct super_block *sb) + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << + EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; + size = flex_group_count * sizeof(struct flex_groups); +- sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); ++ sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL); + if (sbi->s_flex_groups == NULL) { +- sbi->s_flex_groups = vzalloc(size); +- if (sbi->s_flex_groups == NULL) { +- ext4_msg(sb, KERN_ERR, +- "not enough memory for %u flex groups", +- flex_group_count); +- goto failed; +- } ++ ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups", ++ flex_group_count); ++ goto failed; + } + + for (i = 0; i < sbi->s_groups_count; i++) { +@@ -3750,12 +3772,8 @@ failed_mount_wq: + } + failed_mount3: + del_timer(&sbi->s_err_report); +- if (sbi->s_flex_groups) { +- if (is_vmalloc_addr(sbi->s_flex_groups)) +- vfree(sbi->s_flex_groups); +- else +- kfree(sbi->s_flex_groups); +- } ++ if (sbi->s_flex_groups) ++ ext4_kvfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-journal-callback.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-journal-callback.patch new file mode 100644 index 0000000..f55e6ce --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-journal-callback.patch @@ -0,0 +1,455 @@ +From 18aadd47f88464928b5ce57791c2e8f9f2aaece0 Mon Sep 17 00:00:00 2001 +From: Bobi Jam +Date: Mon, 20 Feb 2012 17:53:02 -0500 +Subject: ext4: expand commit callback and +Git-commit: 18aadd47 +Patch-mainline: v3.4-rc1 + +The per-commit callback was used by mballoc code to manage free space +bitmaps after deleted blocks have been released. This patch expands +it to support multiple different callbacks, to allow other things to +be done after the commit has been completed. 
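The usage pattern the patch establishes is visible in the mballoc conversion below: a client embeds struct ext4_journal_cb_entry as the first member of its own structure, registers it with ext4_journal_callback_add(), and after the commit the journal thread invokes the function, which casts the entry back to the containing structure. A minimal userspace model of that scheme, using a singly linked list in place of the kernel's list_head (all names are illustrative, echoing the patch's jce_*/efd_* naming):

#include <stdio.h>
#include <stdlib.h>

struct cb_entry {
	struct cb_entry *next;
	void (*func)(struct cb_entry *jce, int error);
};

struct free_data {		/* client data; the entry MUST come first */
	struct cb_entry jce;
	int start_blk, count;
};

static struct cb_entry *t_private_list;	/* per-transaction callback list */

static void callback_add(struct cb_entry *jce,
			 void (*func)(struct cb_entry *, int))
{
	jce->func = func;
	jce->next = t_private_list;
	t_private_list = jce;
}

static void free_data_callback(struct cb_entry *jce, int error)
{
	/* first-member cast, as ext4_free_data_callback() does */
	struct free_data *e = (struct free_data *)jce;

	printf("commit done (err=%d): release %d blocks at %d\n",
	       error, e->count, e->start_blk);
	free(e);
}

static void commit_callback(int error)	/* runs once the commit finishes */
{
	while (t_private_list) {
		struct cb_entry *jce = t_private_list;

		t_private_list = jce->next;
		jce->func(jce, error);
	}
}

int main(void)
{
	struct free_data *e = malloc(sizeof(*e));

	if (!e)
		return 1;
	e->start_blk = 100;
	e->count = 8;
	callback_add(&e->jce, free_data_callback);
	commit_callback(0);
	return 0;
}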
+ +Upstream-Signed-off-by: Bobi Jam +Upstream-Signed-off-by: Andreas Dilger +Upstream-Signed-off-by: "Theodore Ts'o" +Acked-by: Jeff Mahoney +--- + fs/ext4/ext4_jbd2.h | 72 ++++++++++++++++++++++++ + fs/ext4/mballoc.c | 155 ++++++++++++++++++++++++---------------------------- + fs/ext4/mballoc.h | 18 +++--- + fs/ext4/super.c | 18 ++++++ + 4 files changed, 173 insertions(+), 90 deletions(-) + +--- a/fs/ext4/ext4_jbd2.h ++++ b/fs/ext4/ext4_jbd2.h +@@ -104,6 +104,78 @@ + #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) + #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) + ++/** ++ * struct ext4_journal_cb_entry - Base structure for callback information. ++ * ++ * This struct is a 'seed' structure for a using with your own callback ++ * structs. If you are using callbacks you must allocate one of these ++ * or another struct of your own definition which has this struct ++ * as it's first element and pass it to ext4_journal_callback_add(). ++ */ ++struct ext4_journal_cb_entry { ++ /* list information for other callbacks attached to the same handle */ ++ struct list_head jce_list; ++ ++ /* Function to call with this callback structure */ ++ void (*jce_func)(struct super_block *sb, ++ struct ext4_journal_cb_entry *jce, int error); ++ ++ /* user data goes here */ ++}; ++ ++/** ++ * ext4_journal_callback_add: add a function to call after transaction commit ++ * @handle: active journal transaction handle to register callback on ++ * @func: callback function to call after the transaction has committed: ++ * @sb: superblock of current filesystem for transaction ++ * @jce: returned journal callback data ++ * @rc: journal state at commit (0 = transaction committed properly) ++ * @jce: journal callback data (internal and function private data struct) ++ * ++ * The registered function will be called in the context of the journal thread ++ * after the transaction for which the handle was created has completed. ++ * ++ * No locks are held when the callback function is called, so it is safe to ++ * call blocking functions from within the callback, but the callback should ++ * not block or run for too long, or the filesystem will be blocked waiting for ++ * the next transaction to commit. No journaling functions can be used, or ++ * there is a risk of deadlock. ++ * ++ * There is no guaranteed calling order of multiple registered callbacks on ++ * the same transaction. 
++ */ ++static inline void ext4_journal_callback_add(handle_t *handle, ++ void (*func)(struct super_block *sb, ++ struct ext4_journal_cb_entry *jce, ++ int rc), ++ struct ext4_journal_cb_entry *jce) ++{ ++ struct ext4_sb_info *sbi = ++ EXT4_SB(handle->h_transaction->t_journal->j_private); ++ ++ /* Add the jce to transaction's private list */ ++ jce->jce_func = func; ++ spin_lock(&sbi->s_md_lock); ++ list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); ++ spin_unlock(&sbi->s_md_lock); ++} ++ ++/** ++ * ext4_journal_callback_del: delete a registered callback ++ * @handle: active journal transaction handle on which callback was registered ++ * @jce: registered journal callback entry to unregister ++ */ ++static inline void ext4_journal_callback_del(handle_t *handle, ++ struct ext4_journal_cb_entry *jce) ++{ ++ struct ext4_sb_info *sbi = ++ EXT4_SB(handle->h_transaction->t_journal->j_private); ++ ++ spin_lock(&sbi->s_md_lock); ++ list_del_init(&jce->jce_list); ++ spin_unlock(&sbi->s_md_lock); ++} ++ + int + ext4_mark_iloc_dirty(handle_t *handle, + struct inode *inode, +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -21,6 +21,7 @@ + * mballoc.c contains the multiblocks allocation routines + */ + ++#include "ext4_jbd2.h" + #include "mballoc.h" + #include + #include +@@ -337,7 +338,7 @@ + */ + static struct kmem_cache *ext4_pspace_cachep; + static struct kmem_cache *ext4_ac_cachep; +-static struct kmem_cache *ext4_free_ext_cachep; ++static struct kmem_cache *ext4_free_data_cachep; + + /* We create slab caches for groupinfo data structures based on the + * superblock block size. There will be one per mounted filesystem for +@@ -355,7 +356,8 @@ static void ext4_mb_generate_from_pa(str + ext4_group_t group); + static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); +-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); ++static void ext4_free_data_callback(struct super_block *sb, ++ struct ext4_journal_cb_entry *jce, int rc); + + static inline void *mb_correct_addr_and_bit(int *bit, void *addr) + { +@@ -2492,8 +2494,6 @@ int ext4_mb_init(struct super_block *sb, + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_fops, sb); + +- if (sbi->s_journal) +- sbi->s_journal->j_commit_callback = release_blocks_on_commit; + out: + if (ret) { + kfree(sbi->s_mb_offsets); +@@ -2598,58 +2598,55 @@ static inline int ext4_issue_discard(str + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. 
+ */ +-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) ++static void ext4_free_data_callback(struct super_block *sb, ++ struct ext4_journal_cb_entry *jce, ++ int rc) + { +- struct super_block *sb = journal->j_private; ++ struct ext4_free_data *entry = (struct ext4_free_data *)jce; + struct ext4_buddy e4b; + struct ext4_group_info *db; + int err, count = 0, count2 = 0; +- struct ext4_free_data *entry; +- struct list_head *l, *ltmp; + +- list_for_each_safe(l, ltmp, &txn->t_private_list) { +- entry = list_entry(l, struct ext4_free_data, list); ++ mb_debug(1, "gonna free %u blocks in group %u (0x%p):", ++ entry->efd_count, entry->efd_group, entry); + +- mb_debug(1, "gonna free %u blocks in group %u (0x%p):", +- entry->count, entry->group, entry); ++ if (test_opt(sb, DISCARD)) ++ ext4_issue_discard(sb, entry->efd_group, ++ entry->efd_start_blk, entry->efd_count); ++ ++ err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); ++ /* we expect to find existing buddy because it's pinned */ ++ BUG_ON(err != 0); ++ ++ ++ db = e4b.bd_info; ++ /* there are blocks to put in buddy to make them really free */ ++ count += entry->efd_count; ++ count2++; ++ ext4_lock_group(sb, entry->efd_group); ++ /* Take it out of per group rb tree */ ++ rb_erase(&entry->efd_node, &(db->bb_free_root)); ++ mb_free_blocks(NULL, &e4b, entry->efd_start_blk, entry->efd_count); + +- if (test_opt(sb, DISCARD)) +- ext4_issue_discard(sb, entry->group, +- entry->start_blk, entry->count); +- +- err = ext4_mb_load_buddy(sb, entry->group, &e4b); +- /* we expect to find existing buddy because it's pinned */ +- BUG_ON(err != 0); +- +- db = e4b.bd_info; +- /* there are blocks to put in buddy to make them really free */ +- count += entry->count; +- count2++; +- ext4_lock_group(sb, entry->group); +- /* Take it out of per group rb tree */ +- rb_erase(&entry->node, &(db->bb_free_root)); +- mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); ++ /* ++ * Clear the trimmed flag for the group so that the next ++ * ext4_trim_fs can trim it. ++ * If the volume is mounted with -o discard, online discard ++ * is supported and the free blocks will be trimmed online. ++ */ ++ if (!test_opt(sb, DISCARD)) ++ EXT4_MB_GRP_CLEAR_TRIMMED(db); + +- /* +- * Clear the trimmed flag for the group so that the next +- * ext4_trim_fs can trim it. +- * If the volume is mounted with -o discard, online discard +- * is supported and the free blocks will be trimmed online. 
++ if (!db->bb_free_root.rb_node) { ++ /* No more items in the per group rb tree ++ * balance refcounts from ext4_mb_free_metadata() + */ +- if (!test_opt(sb, DISCARD)) +- EXT4_MB_GRP_CLEAR_TRIMMED(db); +- +- if (!db->bb_free_root.rb_node) { +- /* No more items in the per group rb tree +- * balance refcounts from ext4_mb_free_metadata() +- */ +- page_cache_release(e4b.bd_buddy_page); +- page_cache_release(e4b.bd_bitmap_page); +- } +- ext4_unlock_group(sb, entry->group); +- kmem_cache_free(ext4_free_ext_cachep, entry); +- ext4_mb_unload_buddy(&e4b); ++ page_cache_release(e4b.bd_buddy_page); ++ page_cache_release(e4b.bd_bitmap_page); + } ++ ext4_unlock_group(sb, entry->efd_group); ++ kmem_cache_free(ext4_free_data_cachep, entry); ++ ext4_mb_unload_buddy(&e4b); + + mb_debug(1, "freed %u blocks in %u structures\n", count, count2); + } +@@ -2702,9 +2699,9 @@ int __init ext4_init_mballoc(void) + return -ENOMEM; + } + +- ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, +- SLAB_RECLAIM_ACCOUNT); +- if (ext4_free_ext_cachep == NULL) { ++ ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, ++ SLAB_RECLAIM_ACCOUNT); ++ if (ext4_free_data_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + return -ENOMEM; +@@ -2722,7 +2719,7 @@ void ext4_exit_mballoc(void) + rcu_barrier(); + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); +- kmem_cache_destroy(ext4_free_ext_cachep); ++ kmem_cache_destroy(ext4_free_data_cachep); + ext4_groupinfo_destroy_slabs(); + ext4_remove_debugfs_entry(); + } +@@ -3273,8 +3270,8 @@ static void ext4_mb_generate_from_freeli + n = rb_first(&(grp->bb_free_root)); + + while (n) { +- entry = rb_entry(n, struct ext4_free_data, node); +- mb_set_bits(bitmap, entry->start_blk, entry->count); ++ entry = rb_entry(n, struct ext4_free_data, efd_node); ++ mb_set_bits(bitmap, entry->efd_start_blk, entry->efd_count); + n = rb_next(n); + } + return; +@@ -4369,9 +4366,9 @@ out: + static int can_merge(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) + { +- if ((entry1->t_tid == entry2->t_tid) && +- (entry1->group == entry2->group) && +- ((entry1->start_blk + entry1->count) == entry2->start_blk)) ++ if ((entry1->efd_tid == entry2->efd_tid) && ++ (entry1->efd_group == entry2->efd_group) && ++ ((entry1->efd_start_blk + entry1->efd_count) == entry2->efd_start_blk)) + return 1; + return 0; + } +@@ -4385,7 +4382,6 @@ ext4_mb_free_metadata(handle_t *handle, + struct ext4_free_data *entry; + struct ext4_group_info *db = e4b->bd_info; + struct super_block *sb = e4b->bd_sb; +- struct ext4_sb_info *sbi = EXT4_SB(sb); + struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + +@@ -4393,8 +4389,8 @@ ext4_mb_free_metadata(handle_t *handle, + BUG_ON(e4b->bd_bitmap_page == NULL); + BUG_ON(e4b->bd_buddy_page == NULL); + +- new_node = &new_entry->node; +- block = new_entry->start_blk; ++ new_node = &new_entry->efd_node; ++ block = new_entry->efd_start_blk; + + if (!*n) { + /* first free block exent. 
We need to +@@ -4407,10 +4403,10 @@ ext4_mb_free_metadata(handle_t *handle, + } + while (*n) { + parent = *n; +- entry = rb_entry(parent, struct ext4_free_data, node); +- if (block < entry->start_blk) ++ entry = rb_entry(parent, struct ext4_free_data, efd_node); ++ if (block < entry->efd_start_blk) + n = &(*n)->rb_left; +- else if (block >= (entry->start_blk + entry->count)) ++ else if (block >= (entry->efd_start_blk + entry->efd_count)) + n = &(*n)->rb_right; + else { + ext4_grp_locked_error(sb, group, 0, +@@ -4426,34 +4422,29 @@ ext4_mb_free_metadata(handle_t *handle, + /* Now try to see the extent can be merged to left and right */ + node = rb_prev(new_node); + if (node) { +- entry = rb_entry(node, struct ext4_free_data, node); ++ entry = rb_entry(node, struct ext4_free_data, efd_node); + if (can_merge(entry, new_entry)) { +- new_entry->start_blk = entry->start_blk; +- new_entry->count += entry->count; ++ new_entry->efd_start_blk = entry->efd_start_blk; ++ new_entry->efd_count += entry->efd_count; + rb_erase(node, &(db->bb_free_root)); +- spin_lock(&sbi->s_md_lock); +- list_del(&entry->list); +- spin_unlock(&sbi->s_md_lock); +- kmem_cache_free(ext4_free_ext_cachep, entry); ++ ext4_journal_callback_del(handle, &entry->efd_jce); ++ kmem_cache_free(ext4_free_data_cachep, entry); + } + } + + node = rb_next(new_node); + if (node) { +- entry = rb_entry(node, struct ext4_free_data, node); ++ entry = rb_entry(node, struct ext4_free_data, efd_node); + if (can_merge(new_entry, entry)) { +- new_entry->count += entry->count; ++ new_entry->efd_count += entry->efd_count; + rb_erase(node, &(db->bb_free_root)); +- spin_lock(&sbi->s_md_lock); +- list_del(&entry->list); +- spin_unlock(&sbi->s_md_lock); +- kmem_cache_free(ext4_free_ext_cachep, entry); ++ ext4_journal_callback_del(handle, &entry->efd_jce); ++ kmem_cache_free(ext4_free_data_cachep, entry); + } + } + /* Add the extent to transaction's private list */ +- spin_lock(&sbi->s_md_lock); +- list_add(&new_entry->list, &handle->h_transaction->t_private_list); +- spin_unlock(&sbi->s_md_lock); ++ ext4_journal_callback_add(handle, ext4_free_data_callback, ++ &new_entry->efd_jce); + return 0; + } + +@@ -4596,16 +4587,16 @@ do_more: + * blocks being freed are metadata. 
these blocks shouldn't + * be used until this transaction is committed + */ +- new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); ++ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); + if (!new_entry) { + ext4_mb_unload_buddy(&e4b); + err = -ENOMEM; + goto error_return; + } +- new_entry->start_blk = bit; +- new_entry->group = block_group; +- new_entry->count = count; +- new_entry->t_tid = handle->h_transaction->t_tid; ++ new_entry->efd_start_blk = bit; ++ new_entry->efd_group = block_group; ++ new_entry->efd_count = count; ++ new_entry->efd_tid = handle->h_transaction->t_tid; + + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); +--- a/fs/ext4/mballoc.h ++++ b/fs/ext4/mballoc.h +@@ -96,21 +96,23 @@ extern u8 mb_enable_debug; + + + struct ext4_free_data { +- /* this links the free block information from group_info */ +- struct rb_node node; ++ /* MUST be the first member */ ++ struct ext4_journal_cb_entry efd_jce; ++ ++ /* ext4_free_data private data starts from here */ + +- /* this links the free block information from ext4_sb_info */ +- struct list_head list; ++ /* this links the free block information from group_info */ ++ struct rb_node efd_node; + + /* group which free block extent belongs */ +- ext4_group_t group; ++ ext4_group_t efd_group; + + /* free block extent */ +- ext4_grpblk_t start_blk; +- ext4_grpblk_t count; ++ ext4_grpblk_t efd_start_blk; ++ ext4_grpblk_t efd_count; + + /* transaction which freed this extent */ +- tid_t t_tid; ++ tid_t efd_tid; + }; + + struct ext4_prealloc_space { +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -413,6 +413,22 @@ static void save_error_info(struct super + ext4_commit_super(sb, 1); + } + ++static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) ++{ ++ struct super_block *sb = journal->j_private; ++ struct ext4_sb_info *sbi = EXT4_SB(sb); ++ int error = is_journal_aborted(journal); ++ struct ext4_journal_cb_entry *jce, *tmp; ++ ++ spin_lock(&sbi->s_md_lock); ++ list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { ++ list_del_init(&jce->jce_list); ++ spin_unlock(&sbi->s_md_lock); ++ jce->jce_func(sb, jce, error); ++ spin_lock(&sbi->s_md_lock); ++ } ++ spin_unlock(&sbi->s_md_lock); ++} + + /* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. +@@ -3600,6 +3616,8 @@ static int ext4_fill_super(struct super_ + } + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + ++ sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; ++ + /* + * The journal may have updated the bg summary counts, so we + * need to update the global counters. diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-kill-dx_root.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-kill-dx_root.patch new file mode 100644 index 0000000..61905e6 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-kill-dx_root.patch @@ -0,0 +1,229 @@ +diff -r -u linux-stage.orig/fs/ext4/namei.c linux-stage/fs/ext4/namei.c +--- linux-stage.orig/fs/ext4/namei.c 2012-12-31 15:03:28.000000000 -0500 ++++ linux-stage/fs/ext4/namei.c 2012-12-31 15:06:16.000000000 -0500 +@@ -115,22 +115,13 @@ + * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
+ */ + +-struct dx_root ++struct dx_root_info + { +- struct fake_dirent dot; +- char dot_name[4]; +- struct fake_dirent dotdot; +- char dotdot_name[4]; +- struct dx_root_info +- { +- __le32 reserved_zero; +- u8 hash_version; +- u8 info_length; /* 8 */ +- u8 indirect_levels; +- u8 unused_flags; +- } +- info; +- struct dx_entry entries[0]; ++ __le32 reserved_zero; ++ u8 hash_version; ++ u8 info_length; /* 8 */ ++ u8 indirect_levels; ++ u8 unused_flags; + }; + + struct dx_node +@@ -220,6 +211,16 @@ + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. + */ ++struct dx_root_info * dx_get_dx_info(struct ext4_dir_entry_2 *de) ++{ ++ /* get dotdot first */ ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1)); ++ ++ /* dx root info is after dotdot entry */ ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2)); ++ ++ return (struct dx_root_info *) de; ++} + + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) + { +@@ -374,7 +375,7 @@ + { + unsigned count, indirect; + struct dx_entry *at, *entries, *p, *q, *m; +- struct dx_root *root; ++ struct dx_root_info * info; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; +@@ -382,17 +383,18 @@ + frame->bh = NULL; + if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) + goto fail; +- root = (struct dx_root *) bh->b_data; +- if (root->info.hash_version != DX_HASH_TEA && +- root->info.hash_version != DX_HASH_HALF_MD4 && +- root->info.hash_version != DX_HASH_LEGACY) { ++ ++ info = dx_get_dx_info((struct ext4_dir_entry_2*)bh->b_data); ++ if (info->hash_version != DX_HASH_TEA && ++ info->hash_version != DX_HASH_HALF_MD4 && ++ info->hash_version != DX_HASH_LEGACY) { + ext4_warning(dir->i_sb, "Unrecognised inode hash code %d for directory " +- "#%lu", root->info.hash_version, dir->i_ino); ++ "#%lu", info->hash_version, dir->i_ino); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } +- hinfo->hash_version = root->info.hash_version; ++ hinfo->hash_version = info->hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; +@@ -400,27 +402,26 @@ + ext4fs_dirhash(d_name->name, d_name->len, hinfo); + hash = hinfo->hash; + +- if (root->info.unused_flags & 1) { ++ if (info->unused_flags & 1) { + ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", +- root->info.unused_flags); ++ info->unused_flags); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + +- if ((indirect = root->info.indirect_levels) > 1) { ++ if ((indirect = info->indirect_levels) > 1) { + ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", +- root->info.indirect_levels); ++ info->indirect_levels); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + +- entries = (struct dx_entry *) (((char *)&root->info) + +- root->info.info_length); ++ entries = (struct dx_entry *) (((char *)info) + info->info_length); + + if (dx_get_limit(entries) != dx_root_limit(dir, +- root->info.info_length)) { ++ info->info_length)) { + ext4_warning(dir->i_sb, "dx entry: limit != root limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; +@@ -501,10 +502,12 @@ + + static void dx_release (struct dx_frame *frames) + { ++ struct dx_root_info *info; + if (frames[0].bh == NULL) + return; + +- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) ++ info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data); ++ if (info->indirect_levels) + brelse(frames[1].bh); + 
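
With the fixed dx_root layout gone, dx_get_dx_info() finds the index header purely by stepping over the two dot entries; since EXT4_DIR_REC_LEN() pads a dirent to a 4-byte boundary, the arithmetic works out the same on every block size:

/* directory block 0 after this patch:
 *   offset  0: "."  entry, EXT4_DIR_REC_LEN(1) == 12 bytes
 *   offset 12: ".." entry, EXT4_DIR_REC_LEN(2) == 12 bytes
 *   offset 24: struct dx_root_info, 8 bytes (info_length)
 *   offset 32: struct dx_entry entries[], up to dx_root_limit()
 */
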
brelse(frames[0].bh); + } +@@ -1400,17 +1403,16 @@ + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; +- struct dx_root *root; + struct dx_frame frames[2], *frame; + struct dx_entry *entries; +- struct ext4_dir_entry_2 *de, *de2; ++ struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de; + char *data1, *top; + unsigned len; + int retval; + unsigned blocksize; + struct dx_hash_info hinfo; + ext4_lblk_t block; +- struct fake_dirent *fde; ++ struct dx_root_info *dx_info; + + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); +@@ -1420,18 +1422,19 @@ + brelse(bh); + return retval; + } +- root = (struct dx_root *) bh->b_data; ++ ++ dot_de = (struct ext4_dir_entry_2 *) bh->b_data; ++ dotdot_de = ext4_next_entry(dot_de, blocksize); + + /* The 0th block becomes the root, move the dirents out */ +- fde = &root->dotdot; +- de = (struct ext4_dir_entry_2 *)((char *)fde + +- ext4_rec_len_from_disk(fde->rec_len, blocksize)); +- if ((char *) de >= (((char *) root) + blocksize)) { ++ de = (struct ext4_dir_entry_2 *)((char *)dotdot_de + ++ ext4_rec_len_from_disk(dotdot_de->rec_len, blocksize)); ++ if ((char *) de >= (((char *) dot_de) + blocksize)) { + EXT4_ERROR_INODE(dir, "invalid rec_len for '..'"); + brelse(bh); + return -EIO; + } +- len = ((char *) root) + blocksize - (char *) de; ++ len = ((char *) dot_de) + blocksize - (char *) de; + + /* Allocate new block for the 0th block's dirents */ + bh2 = ext4_append(handle, dir, &block, &retval); +@@ -1450,19 +1453,23 @@ + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + blocksize); + /* Initialize the root; the dot dirents already exist */ +- de = (struct ext4_dir_entry_2 *) (&root->dotdot); +- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), +- blocksize); +- memset (&root->info, 0, sizeof(root->info)); +- root->info.info_length = sizeof(root->info); +- root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; +- entries = root->entries; ++ dotdot_de->rec_len = ext4_rec_len_to_disk(blocksize - ++ le16_to_cpu(dot_de->rec_len), blocksize); ++ ++ /* initialize hashing info */ ++ dx_info = dx_get_dx_info(dot_de); ++ memset (dx_info, 0, sizeof(*dx_info)); ++ dx_info->info_length = sizeof(*dx_info); ++ dx_info->hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; ++ ++ entries = (void *)dx_info + sizeof(*dx_info); ++ + dx_set_block(entries, 1); + dx_set_count(entries, 1); +- dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); ++ dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info))); + + /* Initialize as for dx_probe */ +- hinfo.hash_version = root->info.hash_version; ++ hinfo.hash_version = dx_info->hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; +@@ -1723,6 +1730,7 @@ + goto journal_error; + brelse (bh2); + } else { ++ struct dx_root_info * info; + dxtrace(printk(KERN_DEBUG + "Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, +@@ -1732,7 +1740,9 @@ + /* Set up root */ + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); +- ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; ++ info = dx_get_dx_info((struct ext4_dir_entry_2*) ++ frames[0].bh->b_data); ++ info->indirect_levels = 1; + + /* Add new access path frame */ + frame = frames + 1; diff --git 
a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-large-eas.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-large-eas.patch new file mode 100644 index 0000000..ed3d231 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-large-eas.patch @@ -0,0 +1,748 @@ +diff -ur linux-stage.orig/fs/ext4/ext4.h linux-stage/fs/ext4/ext4.h +--- linux-stage.orig/fs/ext4/ext4.h 2012-12-31 15:56:25.000000000 -0500 ++++ linux-stage/fs/ext4/ext4.h 2012-12-31 15:56:48.000000000 -0500 +@@ -1406,6 +1406,7 @@ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ ++ EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP| \ + EXT4_FEATURE_INCOMPAT_DIRDATA) + +@@ -1774,6 +1775,12 @@ + #endif + + /* ++ * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb ++ * This limit is arbitrary, but is reasonable for the xattr API. ++ */ ++#define EXT4_XATTR_MAX_LARGE_EA_SIZE (1024 * 1024) ++ ++/* + * Function prototypes + */ + +@@ -2005,6 +2005,7 @@ + extern void ext4_get_inode_flags(struct ext4_inode_info *); + extern int ext4_alloc_da_blocks(struct inode *inode); + extern void ext4_set_aops(struct inode *inode); ++extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk); + extern int ext4_writepage_trans_blocks(struct inode *); + extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); + extern int ext4_block_truncate_page(handle_t *handle, +diff -ur linux-stage.orig/fs/ext4/inode.c linux-stage/fs/ext4/inode.c +--- linux-stage.orig/fs/ext4/inode.c 2013-01-03 09:31:07.000000000 -0500 ++++ linux-stage/fs/ext4/inode.c 2013-01-03 09:31:23.000000000 -0500 +@@ -5535,7 +5535,7 @@ + * + * Also account for superblock, inode, quota and xattr blocks + */ +-static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) ++int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) + { + ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); + int gdpblocks; +diff -ur linux-stage.orig/fs/ext4/xattr.c linux-stage/fs/ext4/xattr.c +--- linux-stage.orig/fs/ext4/xattr.c 2012-12-31 15:56:25.000000000 -0500 ++++ linux-stage/fs/ext4/xattr.c 2012-12-31 15:56:48.000000000 -0500 +@@ -168,19 +168,26 @@ + } + + static inline int +-ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size) ++ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size, ++ struct inode *inode) + { + size_t value_size = le32_to_cpu(entry->e_value_size); + +- if (entry->e_value_block != 0 || value_size > size || +- le16_to_cpu(entry->e_value_offs) + value_size > size) ++ if ((entry->e_value_inum == 0) && ++ (le16_to_cpu(entry->e_value_offs) + value_size > size)) ++ return -EIO; ++ if (entry->e_value_inum != 0 && ++ (le32_to_cpu(entry->e_value_inum) < EXT4_FIRST_INO(inode->i_sb) || ++ le32_to_cpu(entry->e_value_inum) > ++ le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_inodes_count))) + return -EIO; + return 0; + } + + static int + ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, +- const char *name, size_t size, int sorted) ++ const char *name, size_t size, int sorted, ++ struct inode *inode) + { + struct ext4_xattr_entry *entry; + size_t name_len; +@@ -200,11 +207,103 @@ + break; + } + *pentry = entry; +- if (!cmp && ext4_xattr_check_entry(entry, size)) ++ if (!cmp && ext4_xattr_check_entry(entry, size, inode)) + return -EIO; + return cmp ? -ENODATA : 0; + } + ++/* ++ * Read the EA value from an inode. 
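
One bound worth keeping in mind for the read path that follows: EXT4_XATTR_MAX_LARGE_EA_SIZE caps a single value at 1 MiB, so the per-block loop below is naturally limited:

/* worst case for the loop below, assuming 4 KiB blocks:
 *   EXT4_XATTR_MAX_LARGE_EA_SIZE / blocksize == 1048576 / 4096 == 256
 * i.e. at most 256 ext4_bread() calls per value. */
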
++ */ ++static int ++ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t *size) ++{ ++ unsigned long block = 0; ++ struct buffer_head *bh = NULL; ++ int err, blocksize; ++ size_t csize, ret_size = 0; ++ ++ if (*size == 0) ++ return 0; ++ ++ blocksize = ea_inode->i_sb->s_blocksize; ++ ++ while (ret_size < *size) { ++ csize = (*size - ret_size) > blocksize ? blocksize : ++ *size - ret_size; ++ bh = ext4_bread(NULL, ea_inode, block, 0, &err); ++ if (!bh) { ++ *size = ret_size; ++ return err; ++ } ++ memcpy(buf, bh->b_data, csize); ++ brelse(bh); ++ ++ buf += csize; ++ block += 1; ++ ret_size += csize; ++ } ++ ++ *size = ret_size; ++ ++ return err; ++} ++ ++struct inode *ext4_xattr_inode_iget(struct inode *parent, int ea_ino, int *err) ++{ ++ struct inode *ea_inode = NULL; ++ ++ ea_inode = ext4_iget(parent->i_sb, ea_ino); ++ if (ea_inode == NULL || is_bad_inode(ea_inode)) { ++ ext4_error(parent->i_sb, "error while reading EA inode %d", ++ ea_ino); ++ *err = -EIO; ++ return NULL; ++ } ++ ++ if (ea_inode->i_xattr_inode_parent != parent->i_ino || ++ ea_inode->i_generation != parent->i_generation) { ++ ext4_error(parent->i_sb, "Backpointer from EA inode %d " ++ "to parent invalid.", ea_ino); ++ *err = -EINVAL; ++ goto error; ++ } ++ ++ if (!(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL)) { ++ ext4_error(parent->i_sb, "EA inode %d does not have " ++ "EXT4_EA_INODE_FL flag set.\n", ea_ino); ++ *err = -EINVAL; ++ goto error; ++ } ++ ++ *err = 0; ++ return ea_inode; ++ ++error: ++ iput(ea_inode); ++ return NULL; ++} ++ ++/* ++ * Read the value from the EA inode. ++ */ ++static int ++ext4_xattr_inode_get(struct inode *inode, int ea_ino, void *buffer, ++ size_t *size) ++{ ++ struct inode *ea_inode = NULL; ++ int err; ++ ++ ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err); ++ if (err) ++ return err; ++ ++ err = ext4_xattr_inode_read(ea_inode, buffer, size); ++ iput(ea_inode); ++ ++ return err; ++} ++ + static int + ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +@@ -235,7 +334,8 @@ + } + ext4_xattr_cache_insert(bh); + entry = BFIRST(bh); +- error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); ++ error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1, ++ inode); + if (error == -EIO) + goto bad_block; + if (error) +@@ -245,8 +345,16 @@ + error = -ERANGE; + if (size > buffer_size) + goto cleanup; +- memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), +- size); ++ if (entry->e_value_inum != 0) { ++ error = ext4_xattr_inode_get(inode, ++ le32_to_cpu(entry->e_value_inum), ++ buffer, &size); ++ if (error) ++ goto cleanup; ++ } else { ++ memcpy(buffer, bh->b_data + ++ le16_to_cpu(entry->e_value_offs), size); ++ } + } + error = size; + +@@ -280,7 +388,7 @@ + if (error) + goto cleanup; + error = ext4_xattr_find_entry(&entry, name_index, name, +- end - (void *)entry, 0); ++ end - (void *)entry, 0, inode); + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); +@@ -288,8 +396,16 @@ + error = -ERANGE; + if (size > buffer_size) + goto cleanup; +- memcpy(buffer, (void *)IFIRST(header) + +- le16_to_cpu(entry->e_value_offs), size); ++ if (entry->e_value_inum != 0) { ++ error = ext4_xattr_inode_get(inode, ++ le32_to_cpu(entry->e_value_inum), ++ buffer, &size); ++ if (error) ++ goto cleanup; ++ } else { ++ memcpy(buffer, (void *)IFIRST(header) + ++ le16_to_cpu(entry->e_value_offs), size); ++ } + } + error = size; + +@@ -514,7 +630,7 @@ + { + for (; !IS_LAST_ENTRY(last); last = 
EXT4_XATTR_NEXT(last)) { + *total += EXT4_XATTR_LEN(last->e_name_len); +- if (!last->e_value_block && last->e_value_size) { ++ if (last->e_value_inum == 0 && last->e_value_size > 0) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < *min_offs) + *min_offs = offs; +@@ -523,11 +639,162 @@ + return (*min_offs - ((void *)last - base) - sizeof(__u32)); + } + ++/* ++ * Write the value of the EA in an inode. ++ */ ++static int ++ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, ++ const void *buf, int bufsize) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext4_map_blocks map; ++ unsigned long block = 0; ++ unsigned blocksize = ea_inode->i_sb->s_blocksize; ++ unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; ++ int csize, wsize = 0; ++ int ret = 0; ++ int retries = 0; ++ ++retry: ++ while (ret >= 0 && ret < max_blocks) { ++ block += ret; ++ max_blocks -= ret; ++ ++ map.m_lblk = block; ++ map.m_len = max_blocks; ++ ret = ext4_map_blocks(handle, ea_inode, &map, ++ EXT4_GET_BLOCKS_CREATE); ++ if (ret <= 0) { ++ ext4_mark_inode_dirty(handle, ea_inode); ++ if (ret == -ENOSPC && ++ ext4_should_retry_alloc(ea_inode->i_sb, &retries)) { ++ ret = 0; ++ goto retry; ++ } ++ break; ++ } ++ } ++ ++ if (ret < 0) ++ return ret; ++ ++ block = 0; ++ while (wsize < bufsize) { ++ if (bh != NULL) ++ brelse(bh); ++ csize = (bufsize - wsize) > blocksize ? blocksize : ++ bufsize - wsize; ++ bh = ext4_getblk(handle, ea_inode, block, 0, &ret); ++ if (!bh) ++ goto out; ++ ret = ext4_journal_get_write_access(handle, bh); ++ if (ret) ++ goto out; ++ ++ memcpy(bh->b_data, buf, csize); ++ set_buffer_uptodate(bh); ++ ext4_journal_dirty_metadata(handle, bh); ++ ++ buf += csize; ++ wsize += csize; ++ block += 1; ++ } ++ ++ i_size_write(ea_inode, wsize); ++ ext4_update_i_disksize(ea_inode, wsize); ++ ++ ext4_mark_inode_dirty(handle, ea_inode); ++ ++out: ++ brelse(bh); ++ ++ return ret; ++} ++ ++/* ++ * Create an inode to store the value of a large EA. ++ */ ++static struct inode * ++ext4_xattr_inode_create(handle_t *handle, struct inode *inode) ++{ ++ struct inode *ea_inode = NULL; ++ ++ /* ++ * Let the next inode be the goal, so we try and allocate the EA inode ++ * in the same group, or nearby one. ++ */ ++ ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, ++ S_IFREG|0600, NULL, inode->i_ino + 1); ++ ++ if (!IS_ERR(ea_inode)) { ++ ea_inode->i_op = &ext4_file_inode_operations; ++ ea_inode->i_fop = &ext4_file_operations; ++ ext4_set_aops(ea_inode); ++ ea_inode->i_generation = inode->i_generation; ++ EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL; ++ ++ /* ++ * A back-pointer from EA inode to parent inode will be useful ++ * for e2fsck. ++ */ ++ ea_inode->i_xattr_inode_parent = inode->i_ino; ++ unlock_new_inode(ea_inode); ++ } ++ ++ return ea_inode; ++} ++ ++/* ++ * Unlink the inode storing the value of the EA. ++ */ ++static int ++ext4_xattr_inode_unlink(struct inode *inode, int ea_ino) ++{ ++ struct inode *ea_inode = NULL; ++ int err; ++ ++ ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err); ++ if (err) ++ return err; ++ ++ ea_inode->i_nlink = 0; ++ iput(ea_inode); ++ ++ return 0; ++} ++ ++/* ++ * Add value of the EA in an inode. 
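
Taken together, the helpers above pin down the invariants that ext4_xattr_inode_iget() re-verifies on every access; summarised for reference (i_xattr_inode_parent is aliased onto i_mtime.tv_sec by the xattr.h hunk later in this patch):

/* set by ext4_xattr_inode_create(), checked by ext4_xattr_inode_iget():
 *   ea_inode->i_xattr_inode_parent == parent->i_ino        (backpointer)
 *   ea_inode->i_generation         == parent->i_generation
 *   EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL           (EA marker)
 * ext4_xattr_inode_unlink() retires the inode by zeroing i_nlink and
 * letting iput() reclaim it. */
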
++ */ ++static int ++ext4_xattr_inode_set(handle_t *handle, struct inode *inode, int *ea_ino, ++ const void *value, size_t value_len) ++{ ++ struct inode *ea_inode = NULL; ++ int err; ++ ++ /* Create an inode for the EA value */ ++ ea_inode = ext4_xattr_inode_create(handle, inode); ++ if (IS_ERR(ea_inode)) ++ return -1; ++ ++ err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); ++ if (err) ++ ea_inode->i_nlink = 0; ++ else ++ *ea_ino = ea_inode->i_ino; ++ ++ iput(ea_inode); ++ ++ return err; ++} ++ + struct ext4_xattr_info { +- int name_index; + const char *name; + const void *value; + size_t value_len; ++ int name_index; ++ int in_inode; + }; + + struct ext4_xattr_search { +@@ -539,15 +803,23 @@ + }; + + static int +-ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) ++ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s, ++ handle_t *handle, struct inode *inode) + { + struct ext4_xattr_entry *last; + size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); ++ int in_inode = i->in_inode; ++ ++ if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, ++ EXT4_FEATURE_INCOMPAT_EA_INODE) && ++ (EXT4_XATTR_SIZE(i->value_len) > ++ EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) ++ in_inode = 1; + + /* Compute min_offs and last. */ + last = s->first; + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { +- if (!last->e_value_block && last->e_value_size) { ++ if (last->e_value_inum == 0 && last->e_value_size > 0) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; +@@ -555,16 +827,21 @@ + } + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) { +- if (!s->here->e_value_block && s->here->e_value_size) { ++ if (!in_inode && s->here->e_value_inum == 0 && ++ s->here->e_value_size > 0) { + size_t size = le32_to_cpu(s->here->e_value_size); + free += EXT4_XATTR_SIZE(size); + } + free += EXT4_XATTR_LEN(name_len); + } + if (i->value) { +- if (free < EXT4_XATTR_SIZE(i->value_len) || +- free < EXT4_XATTR_LEN(name_len) + +- EXT4_XATTR_SIZE(i->value_len)) ++ size_t value_len = EXT4_XATTR_SIZE(i->value_len); ++ ++ if (in_inode) ++ value_len = 0; ++ ++ if (free < value_len || ++ free < EXT4_XATTR_LEN(name_len) + value_len) + return -ENOSPC; + } + +@@ -578,7 +855,8 @@ + s->here->e_name_len = name_len; + memcpy(s->here->e_name, i->name, name_len); + } else { +- if (!s->here->e_value_block && s->here->e_value_size) { ++ if (s->here->e_value_offs > 0 && s->here->e_value_inum == 0 && ++ s->here->e_value_size > 0) { + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(s->here->e_value_offs); + void *val = s->base + offs; +@@ -607,13 +885,17 @@ + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); +- if (!last->e_value_block && +- last->e_value_size && o < offs) ++ if (last->e_value_size > 0 && o < offs) + last->e_value_offs = + cpu_to_le16(o + size); + last = EXT4_XATTR_NEXT(last); + } + } ++ if (s->here->e_value_inum != 0) { ++ ext4_xattr_inode_unlink(inode, ++ le32_to_cpu(s->here->e_value_inum)); ++ s->here->e_value_inum = 0; ++ } + if (!i->value) { + /* Remove the old name. */ + size_t size = EXT4_XATTR_LEN(name_len); +@@ -627,10 +909,17 @@ + if (i->value) { + /* Insert the new value. 
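
A worked instance of the threshold driving the in_inode decision above, using the EXT4_XATTR_MIN_LARGE_EA_SIZE() macro added to xattr.h later in this patch and the standard ext4 on-disk sizes (32-byte header, 16-byte entry, 4-byte rounding):

/* EXT4_XATTR_MIN_LARGE_EA_SIZE(4096)
 *   = 4096 - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4
 *   = 4096 - 20 - 32 - 4
 *   = 4040 bytes
 * a value whose padded size exceeds this is diverted to an EA inode. */
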
*/ + s->here->e_value_size = cpu_to_le32(i->value_len); +- if (i->value_len) { ++ if (in_inode) { ++ int ea_ino = le32_to_cpu(s->here->e_value_inum); ++ ext4_xattr_inode_set(handle, inode, &ea_ino, i->value, ++ i->value_len); ++ s->here->e_value_inum = cpu_to_le32(ea_ino); ++ s->here->e_value_offs = 0; ++ } else if (i->value_len) { + size_t size = EXT4_XATTR_SIZE(i->value_len); + void *val = s->base + min_offs - size; + s->here->e_value_offs = cpu_to_le16(min_offs - size); ++ s->here->e_value_inum = 0; + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); /* Clear the pad bytes. */ + memcpy(val, i->value, i->value_len); +@@ -675,7 +964,7 @@ + bs->s.end = bs->bh->b_data + bs->bh->b_size; + bs->s.here = bs->s.first; + error = ext4_xattr_find_entry(&bs->s.here, i->name_index, +- i->name, bs->bh->b_size, 1); ++ i->name, bs->bh->b_size, 1, inode); + if (error && error != -ENODATA) + goto cleanup; + bs->s.not_found = error; +@@ -699,8 +988,6 @@ + + #define header(x) ((struct ext4_xattr_header *)(x)) + +- if (i->value && i->value_len > sb->s_blocksize) +- return -ENOSPC; + if (s->base) { + ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, + bs->bh->b_blocknr); +@@ -715,7 +1002,7 @@ + ce = NULL; + } + ea_bdebug(bs->bh, "modifying in-place"); +- error = ext4_xattr_set_entry(i, s); ++ error = ext4_xattr_set_entry(i, s, handle, inode); + if (!error) { + if (!IS_LAST_ENTRY(s->first)) + ext4_xattr_rehash(header(s->base), +@@ -767,7 +1054,7 @@ + s->end = s->base + sb->s_blocksize; + } + +- error = ext4_xattr_set_entry(i, s); ++ error = ext4_xattr_set_entry(i, s, handle, inode); + if (error == -EIO) + goto bad_block; + if (error) +@@ -918,7 +1205,7 @@ + /* Find the named attribute. */ + error = ext4_xattr_find_entry(&is->s.here, i->name_index, + i->name, is->s.end - +- (void *)is->s.base, 0); ++ (void *)is->s.base, 0, inode); + if (error && error != -ENODATA) + return error; + is->s.not_found = error; +@@ -937,7 +1224,7 @@ + + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; +- error = ext4_xattr_set_entry(i, s); ++ error = ext4_xattr_set_entry(i, s, handle, inode); + if (error) + return error; + header = IHDR(inode, ext4_raw_inode(&is->iloc)); +@@ -973,7 +1260,7 @@ + .name = name, + .value = value, + .value_len = value_len, +- ++ .in_inode = 0, + }; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, +@@ -1042,6 +1329,15 @@ + goto cleanup; + } + error = ext4_xattr_block_set(handle, inode, &i, &bs); ++ if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, ++ EXT4_FEATURE_INCOMPAT_EA_INODE) && ++ error == -ENOSPC) { ++ /* xattr not fit to block, store at external ++ * inode */ ++ i.in_inode = 1; ++ error = ext4_xattr_ibody_set(handle, inode, ++ &i, &is); ++ } + if (error) + goto cleanup; + if (!is.s.not_found) { +@@ -1089,10 +1385,25 @@ + const void *value, size_t value_len, int flags) + { + handle_t *handle; ++ struct super_block *sb = inode->i_sb; ++ int buffer_credits; + int error, retries = 0; + ++ buffer_credits = EXT4_DATA_TRANS_BLOCKS(sb); ++ if ((value_len >= EXT4_XATTR_MIN_LARGE_EA_SIZE(sb->s_blocksize)) && ++ EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EA_INODE)) { ++ int nrblocks = (value_len + sb->s_blocksize - 1) >> ++ sb->s_blocksize_bits; ++ ++ /* For new inode */ ++ buffer_credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3; ++ ++ /* For data blocks of EA inode */ ++ buffer_credits += ext4_meta_trans_blocks(inode, nrblocks, 0); ++ } ++ + retry: +- handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); ++ handle = 
ext4_journal_start(inode, buffer_credits); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + } else { +@@ -1102,7 +1413,7 @@ + value, value_len, flags); + error2 = ext4_journal_stop(handle); + if (error == -ENOSPC && +- ext4_should_retry_alloc(inode->i_sb, &retries)) ++ ext4_should_retry_alloc(sb, &retries)) + goto retry; + if (error == 0) + error = error2; +@@ -1124,7 +1435,7 @@ + + /* Adjust the value offsets of the entries */ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { +- if (!last->e_value_block && last->e_value_size) { ++ if (last->e_value_inum == 0 && last->e_value_size > 0) { + new_offs = le16_to_cpu(last->e_value_offs) + + value_offs_shift; + BUG_ON(new_offs + le32_to_cpu(last->e_value_size) +@@ -1364,15 +1675,41 @@ + /* + * ext4_xattr_delete_inode() + * +- * Free extended attribute resources associated with this inode. This ++ * Free extended attribute resources associated with this inode. Traverse ++ * all entries and unlink any xattr inodes associated with this inode. This + * is called immediately before an inode is freed. We have exclusive +- * access to the inode. ++ * access to the inode. If an orphan inode is deleted it will also delete any ++ * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget() ++ * to ensure they belong to the parent inode and were not deleted already. + */ + void + ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) + { + struct buffer_head *bh = NULL; ++ struct ext4_xattr_ibody_header *header; ++ struct ext4_inode *raw_inode; ++ struct ext4_iloc iloc; ++ struct ext4_xattr_entry *entry; ++ int error; ++ ++ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) ++ goto delete_external_ea; ++ ++ error = ext4_get_inode_loc(inode, &iloc); ++ if (error) ++ goto cleanup; ++ raw_inode = ext4_raw_inode(&iloc); ++ header = IHDR(inode, raw_inode); ++ entry = IFIRST(header); ++ for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { ++ if (entry->e_value_inum != 0) { ++ ext4_xattr_inode_unlink(inode, ++ le32_to_cpu(entry->e_value_inum)); ++ entry->e_value_inum = 0; ++ } ++ } + ++delete_external_ea: + if (!EXT4_I(inode)->i_file_acl) + goto cleanup; + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); +@@ -1387,6 +1724,16 @@ + EXT4_I(inode)->i_file_acl); + goto cleanup; + } ++ ++ entry = BFIRST(bh); ++ for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { ++ if (entry->e_value_inum != 0) { ++ ext4_xattr_inode_unlink(inode, ++ le32_to_cpu(entry->e_value_inum)); ++ entry->e_value_inum = 0; ++ } ++ } ++ + ext4_xattr_release_block(handle, inode, bh); + EXT4_I(inode)->i_file_acl = 0; + +@@ -1461,10 +1808,9 @@ + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || ++ entry1->e_value_inum != entry2->e_value_inum || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; +- if (entry1->e_value_block != 0 || entry2->e_value_block != 0) +- return -EIO; + if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) +@@ -1548,7 +1894,7 @@ + *name++; + } + +- if (entry->e_value_block == 0 && entry->e_value_size != 0) { ++ if (entry->e_value_inum == 0 && entry->e_value_size != 0) { + __le32 *value = (__le32 *)((char *)header + + le16_to_cpu(entry->e_value_offs)); + for (n = (le32_to_cpu(entry->e_value_size) + +diff -ur linux-stage.orig/fs/ext4/xattr.h linux-stage/fs/ext4/xattr.h +--- 
linux-stage.orig/fs/ext4/xattr.h 2012-12-31 15:56:25.000000000 -0500 ++++ linux-stage/fs/ext4/xattr.h 2012-12-31 15:56:48.000000000 -0500 +@@ -38,7 +38,7 @@ + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ +- __le32 e_value_block; /* disk block attribute is stored on (n/i) */ ++ __le32 e_value_inum; /* inode in which the value is stored */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +@@ -63,6 +63,15 @@ + EXT4_I(inode)->i_extra_isize)) + #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) + ++#define i_xattr_inode_parent i_mtime.tv_sec ++ ++/* ++ * The minimum size of EA value when you start storing it in an external inode ++ * size of block - size of header - size of 1 entry - 4 null bytes ++*/ ++#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \ ++ ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4) ++ + # ifdef CONFIG_EXT4_FS_XATTR + + extern const struct xattr_handler ext4_xattr_user_handler; diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-lookup-dotdot.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-lookup-dotdot.patch new file mode 100644 index 0000000..84ead51 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-lookup-dotdot.patch @@ -0,0 +1,62 @@ +--- + fs/ext4/namei.c | 42 ++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 42 insertions(+) + +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -1031,6 +1031,16 @@ errout: + return NULL; + } + ++static inline int ++is_dot_or_dot_dot(const struct qstr *name) ++{ ++ if (name->name[0] != '.') ++ return 0; ++ if (name->len == 1 || (name->len == 2 && name->name[1] == '.')) ++ return 1; ++ return 0; ++} ++ + static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) + { + struct inode *inode; +@@ -1061,6 +1071,38 @@ static struct dentry *ext4_lookup(struct + } + } + } ++ ++ /* ".." shouldn't go into dcache to preserve dcache hierarchy ++ * otherwise we'll get parent being a child of actual child. ++ * see bug 10458 for details -bzzz */ ++ if (inode && is_dot_or_dot_dot(&dentry->d_name)) { ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* first, look for an existing dentry - any one is good */ ++ spin_lock(&inode->i_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ goal = tmp; ++ dget(goal); ++ break; ++ } ++ if (goal == NULL) { ++ /* there is no alias, we need to make current dentry: ++ * a) inaccessible for __d_lookup() ++ * b) inaccessible for iopen */ ++ J_ASSERT(list_empty(&dentry->d_alias)); ++ dentry->d_flags |= DCACHE_NFSFS_RENAMED; ++ /* this is d_instantiate() ... 
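
For clarity, the predicate introduced at the top of this patch admits exactly the two dot entries; every other name still goes through d_splice_alias():

/* is_dot_or_dot_dot() behaviour (illustrative):
 *   "."  -> 1      ".."  -> 1
 *   ".x" -> 0      "..." -> 0      "a" -> 0
 */
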
*/ ++ list_add(&dentry->d_alias, &inode->i_dentry); ++ dentry->d_inode = inode; ++ } ++ spin_unlock(&inode->i_lock); ++ if (goal) ++ iput(inode); ++ return goal; ++ } ++ + return d_splice_alias(inode, dentry); + } + diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-make-quota-as-first-class-supported-feature.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-make-quota-as-first-class-supported-feature.patch new file mode 100644 index 0000000..5221378 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-make-quota-as-first-class-supported-feature.patch @@ -0,0 +1,322 @@ +From 7c319d328505b7781b65238ae9f53293b5ee0ca8 Mon Sep 17 00:00:00 2001 +From: Aditya Kali +Date: Sun, 22 Jul 2012 20:21:31 -0400 +Subject: ext4: make quota as first class supported feature +Git-commit: 7c319d32, 281b5995 +Patch-mainline: v3.6-rc1 + +This patch adds support for quotas as a first class feature in ext4; +which is to say, the quota files are stored in hidden inodes as file +system metadata, instead of as separate files visible in the file system +directory hierarchy. + +It is based on the proposal at: +https://ext4.wiki.kernel.org/index.php/Design_For_1st_Class_Quota_in_Ext4 + +This patch introduces a new feature - EXT4_FEATURE_RO_COMPAT_QUOTA +which, when turned on, enables quota accounting at mount time +iteself. Also, the quota inodes are stored in two additional superblock +fields. Some changes introduced by this patch that should be pointed +out are: + +1) Two new ext4-superblock fields - s_usr_quota_inum and + s_grp_quota_inum for storing the quota inodes in use. +2) Default quota inodes are: inode#3 for tracking userquota and inode#4 + for tracking group quota. The superblock fields can be set to use + other inodes as well. +3) If the QUOTA feature and corresponding quota inodes are set in + superblock, the quota usage tracking is turned on at mount time. On + 'quotaon' ioctl, the quota limits enforcement is turned + on. 'quotaoff' ioctl turns off only the limits enforcement in this + case. +4) When QUOTA feature is in use, the quota mount options 'quota', + 'usrquota', 'grpquota' are ignored by the kernel. +5) mke2fs or tune2fs can be used to set the QUOTA feature and initialize + quota inodes. The default reserved inodes will not be visible to user + as regular files. +6) The quota-tools will need to be modified to support hidden quota + files on ext4. E2fsprogs will also include support for creating and + fixing quota files. +7) Support is only for the new V2 quota file format. 
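
For context, the default quota inodes named in point 2 are the reserved inode numbers ext4 already defines, reproduced here for reference; the superblock fields added below may point elsewhere. Per point 5, a filesystem is converted with recent e2fsprogs, e.g. tune2fs -O quota <dev>.

#define EXT4_USR_QUOTA_INO	3	/* user quota inode */
#define EXT4_GRP_QUOTA_INO	4	/* group quota inode */
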
+ +Tested-by: Jan Kara +Reviewed-by: Jan Kara +Reviewed-by: Johann Lombardi +Signed-off-by: Aditya Kali +Signed-off-by: "Theodore Ts'o" +Acked-by: Jeff Mahoney +--- + fs/ext4/ext4.h | 10 +++ + fs/ext4/ext4_jbd2.h | 16 ++++-- + fs/ext4/super.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++-- + 3 files changed, 153 insertions(+), 10 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1063,7 +1063,10 @@ struct ext4_super_block { + __u8 s_last_error_func[32]; /* function where the error happened */ + #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; +- __le32 s_reserved[112]; /* Padding to the end of the block */ ++ __le32 s_usr_quota_inum; /* inode for tracking user quota */ ++ __le32 s_grp_quota_inum; /* inode for tracking group quota */ ++ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ ++ __le32 s_reserved[109]; /* Padding to the end of the block */ + }; + + #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) +@@ -1238,6 +1241,8 @@ static inline struct timespec ext4_curre + static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) + { + return ino == EXT4_ROOT_INO || ++ ino == EXT4_USR_QUOTA_INO || ++ ino == EXT4_GRP_QUOTA_INO || + ino == EXT4_JOURNAL_INO || + ino == EXT4_RESIZE_INO || + (ino >= EXT4_FIRST_INO(sb) && +@@ -1398,7 +1403,8 @@ static inline void ext4_clear_state_flag + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ +- EXT4_FEATURE_RO_COMPAT_HUGE_FILE) ++ EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ ++ EXT4_FEATURE_RO_COMPAT_QUOTA) + + /* + * Default values for user and/or group using reserved blocks +--- a/fs/ext4/ext4_jbd2.h ++++ b/fs/ext4/ext4_jbd2.h +@@ -87,14 +87,20 @@ + #ifdef CONFIG_QUOTA + /* Amount of blocks needed for quota update - we know that the structure was + * allocated so we need to update only data block */ +-#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0) ++#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ ++ EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ ++ 1 : 0) + /* Amount of blocks needed for quota insert/delete - we do some block writes + * but inode, sb and group updates are done only once */ +-#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ +- (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) ++#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ ++ EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ ++ (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ ++ +3+DQUOT_INIT_REWRITE) : 0) + +-#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? 
(DQUOT_DEL_ALLOC*\ +- (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) ++#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ ++ EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ ++ (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ ++ +3+DQUOT_DEL_REWRITE) : 0) + #else + #define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 + #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1241,12 +1241,18 @@ static int ext4_mark_dquot_dirty(struct + static int ext4_write_info(struct super_block *sb, int type); + static int ext4_quota_on(struct super_block *sb, int type, int format_id, + struct path *path); ++static int ext4_quota_on_sysfile(struct super_block *sb, int type, ++ int format_id); + static int ext4_quota_off(struct super_block *sb, int type); ++static int ext4_quota_off_sysfile(struct super_block *sb, int type); + static int ext4_quota_on_mount(struct super_block *sb, int type); + static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off); + static ssize_t ext4_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); ++static int ext4_quota_enable(struct super_block *sb, int type, int format_id, ++ unsigned int flags); ++static int ext4_enable_quotas(struct super_block *sb); + + static const struct dquot_operations ext4_quota_operations = { + .get_reserved_space = ext4_get_reserved_space, +@@ -1268,6 +1274,16 @@ static const struct quotactl_ops ext4_qc + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk + }; ++ ++static const struct quotactl_ops ext4_qctl_sysfile_operations = { ++ .quota_on_meta = ext4_quota_on_sysfile, ++ .quota_off = ext4_quota_off_sysfile, ++ .quota_sync = dquot_quota_sync, ++ .get_info = dquot_get_dqinfo, ++ .set_info = dquot_set_dqinfo, ++ .get_dqblk = dquot_get_dqblk, ++ .set_dqblk = dquot_set_dqblk ++}; + #endif + + static const struct super_operations ext4_sops = { +@@ -2689,6 +2705,16 @@ static int ext4_feature_set_ok(struct su + return 0; + } + } ++ ++#ifndef CONFIG_QUOTA ++ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && ++ !readonly) { ++ ext4_msg(sb, KERN_ERR, ++ "Filesystem with quota feature cannot be mounted RDWR " ++ "without CONFIG_QUOTA"); ++ return 0; ++ } ++#endif /* CONFIG_QUOTA */ + return 1; + } + +@@ -3528,6 +3554,11 @@ static int ext4_fill_super(struct super_ + #ifdef CONFIG_QUOTA + sb->s_qcop = &ext4_qctl_operations; + sb->dq_op = &ext4_quota_operations; ++ ++ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { ++ /* Use qctl operations for hidden quota files. */ ++ sb->s_qcop = &ext4_qctl_sysfile_operations; ++ } + #endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + +@@ -3755,6 +3786,16 @@ no_journal: + } else + descr = "out journal"; + ++#ifdef CONFIG_QUOTA ++ /* Enable quota usage during mount. */ ++ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && ++ !(sb->s_flags & MS_RDONLY)) { ++ ret = ext4_enable_quotas(sb); ++ if (ret) ++ goto failed_mount7; ++ } ++#endif /* CONFIG_QUOTA */ ++ + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " + "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + *sbi->s_es->s_mount_opts ? 
"; " : "", orig_data); +@@ -4493,16 +4534,26 @@ static int ext4_remount(struct super_blo + if (sbi->s_journal == NULL) + ext4_commit_super(sb, 1); + ++ unlock_super(sb); + #ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + if (old_opts.s_qf_names[i] && + old_opts.s_qf_names[i] != sbi->s_qf_names[i]) + kfree(old_opts.s_qf_names[i]); ++ if (enable_quota) { ++ if (sb_any_quota_suspended(sb)) ++ dquot_resume(sb, -1); ++ else if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ++ EXT4_FEATURE_RO_COMPAT_QUOTA)) { ++ err = ext4_enable_quotas(sb); ++ if (err) { ++ lock_super(sb); ++ goto restore_opts; ++ } ++ } ++ } + #endif +- unlock_super(sb); +- if (enable_quota) +- dquot_resume(sb, -1); + + ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); + kfree(orig_data); +@@ -4750,6 +4801,74 @@ static int ext4_quota_on(struct super_bl + return dquot_quota_on(sb, type, format_id, path); + } + ++static int ext4_quota_enable(struct super_block *sb, int type, int format_id, ++ unsigned int flags) ++{ ++ int err; ++ struct inode *qf_inode; ++ unsigned long qf_inums[MAXQUOTAS] = { ++ le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), ++ le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) ++ }; ++ ++ BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)); ++ ++ if (!qf_inums[type]) ++ return -EPERM; ++ ++ qf_inode = ext4_iget(sb, qf_inums[type]); ++ if (IS_ERR(qf_inode)) { ++ ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); ++ return PTR_ERR(qf_inode); ++ } ++ ++ err = dquot_enable(qf_inode, type, format_id, flags); ++ iput(qf_inode); ++ ++ return err; ++} ++ ++/* Enable usage tracking for all quota types. */ ++static int ext4_enable_quotas(struct super_block *sb) ++{ ++ int type, err = 0; ++ unsigned long qf_inums[MAXQUOTAS] = { ++ le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), ++ le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) ++ }; ++ ++ sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; ++ for (type = 0; type < MAXQUOTAS; type++) { ++ if (qf_inums[type]) { ++ err = ext4_quota_enable(sb, type, QFMT_VFS_V1, ++ DQUOT_USAGE_ENABLED); ++ if (err) { ++ ext4_warning(sb, ++ "Failed to enable quota (type=%d) " ++ "tracking. Please run e2fsck to fix.", ++ type); ++ return err; ++ } ++ } ++ } ++ return 0; ++} ++ ++/* ++ * quota_on function that is used when QUOTA feature is set. ++ */ ++static int ext4_quota_on_sysfile(struct super_block *sb, int type, ++ int format_id) ++{ ++ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ++ return -EINVAL; ++ ++ /* ++ * USAGE was enabled at mount time. Only need to enable LIMITS now. ++ */ ++ return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED); ++} ++ + static int ext4_quota_off(struct super_block *sb, int type) + { + struct inode *inode = sb_dqopt(sb)->files[type]; +@@ -4776,6 +4895,18 @@ out: + return dquot_quota_off(sb, type); + } + ++/* ++ * quota_off function that is used when QUOTA feature is set. ++ */ ++static int ext4_quota_off_sysfile(struct super_block *sb, int type) ++{ ++ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ++ return -EINVAL; ++ ++ /* Disable only the limits. */ ++ return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); ++} ++ + /* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... 
As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-map_inode_page-3.0.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-map_inode_page-3.0.patch new file mode 100644 index 0000000..af03281 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-map_inode_page-3.0.patch @@ -0,0 +1,91 @@ +--- + fs/ext4/ext4.h | 3 ++ + fs/ext4/inode.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 71 insertions(+) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1838,6 +1838,9 @@ extern int ext4_page_mkwrite(struct vm_a + extern qsize_t *ext4_get_reserved_space(struct inode *inode); + extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); ++extern int ext4_map_inode_page(struct inode *inode, struct page *page, ++ unsigned long *blocks, int *created, ++ int create); + /* ioctl.c */ + extern long ext4_ioctl(struct file *, unsigned int, unsigned long); + extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -5968,3 +5968,71 @@ out_unlock: + up_read(&inode->i_alloc_sem); + return ret; + } ++ ++int ext4_map_inode_page(struct inode *inode, struct page *page, ++ unsigned long *blocks, int *created, int create) ++{ ++ unsigned int blocksize, blocks_per_page; ++ unsigned long iblock; ++ struct ext4_map_blocks map; ++ void *handle; ++ int i, rc = 0, failed = 0, needed_blocks; ++ ++ blocksize = inode->i_sb->s_blocksize; ++ blocks_per_page = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; ++ iblock = page->index * blocks_per_page; ++ ++ for (i = 0; i < blocks_per_page; i++, iblock++) { ++ blocks[i] = ext4_bmap(inode->i_mapping, iblock); ++ if (blocks[i] == 0) { ++ failed++; ++ if (created) ++ created[i] = -1; ++ } else if (created) { ++ created[i] = 0; ++ } ++ } ++ ++ if (failed == 0 || create == 0) ++ return 0; ++ ++ needed_blocks = ext4_writepage_trans_blocks(inode); ++ handle = ext4_journal_start(inode, needed_blocks); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ iblock = page->index * blocks_per_page; ++ for (i = 0; i < blocks_per_page; i++, iblock++) { ++ if (blocks[i] != 0) ++ continue; ++ ++ map.m_lblk = iblock; ++ map.m_len = 1; ++ map.m_flags = 0; ++ rc = ext4_ind_map_blocks(handle, inode, &map, ++ EXT4_GET_BLOCKS_CREATE); ++ if (rc < 0) { ++ printk(KERN_INFO "ext4_map_inode_page: error reading " ++ "block %ld\n", iblock); ++ goto out; ++ } else { ++ if (rc > 1) ++ WARN_ON(1); ++ rc = 0; ++ } ++ /* Unmap any metadata buffers from the block mapping, to avoid ++ * data corruption due to direct-write from Lustre being ++ * clobbered by a later flush of the blockdev metadata buffer.*/ ++ if (map.m_flags & EXT4_MAP_NEW) ++ unmap_underlying_metadata(inode->i_sb->s_bdev, ++ map.m_pblk); ++ blocks[i] = map.m_pblk; ++ if (created) ++ created[i] = 1; ++ } ++ ++out: ++ ext4_journal_stop(handle); ++ return rc; ++} ++EXPORT_SYMBOL(ext4_map_inode_page); diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-mballoc-extra-checks.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-mballoc-extra-checks.patch new file mode 100644 index 0000000..085a73e --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-mballoc-extra-checks.patch @@ -0,0 +1,309 @@ +--- + fs/ext4/ext4.h | 1 + fs/ext4/mballoc.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++-------- + fs/ext4/mballoc.h | 2 - + 3 files changed, 95 insertions(+), 
16 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -2098,6 +2098,7 @@ struct ext4_group_info { + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + struct list_head bb_prealloc_list; ++ unsigned long bb_prealloc_nr; + #ifdef DOUBLE_CHECK + void *bb_bitmap; + #endif +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -352,7 +352,7 @@ static const char *ext4_groupinfo_slab_n + "ext4_groupinfo_64k", "ext4_groupinfo_128k" + }; + +-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); + static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); +@@ -702,7 +702,7 @@ mb_set_largest_free_order(struct super_b + } + + static noinline_for_stack +-void ext4_mb_generate_buddy(struct super_block *sb, ++int ext4_mb_generate_buddy(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); +@@ -734,14 +734,19 @@ void ext4_mb_generate_buddy(struct super + grp->bb_fragments = fragments; + + if (free != grp->bb_free) { ++ struct ext4_group_desc *gdp; ++ gdp = ext4_get_group_desc (sb, group, NULL); + ext4_grp_locked_error(sb, group, 0, 0, +- "%u blocks in bitmap, %u in gd", +- free, grp->bb_free); ++ "%u blocks in bitmap, %u in bb, %u in gd", ++ free, grp->bb_free, ++ ext4_free_blks_count(sb, gdp)); ++ + /* + * If we intent to continue, we consider group descritor + * corrupt and update bb_free using bitmap value + */ + grp->bb_free = free; ++ return -EIO; + } + mb_set_largest_free_order(sb, grp); + +@@ -752,6 +757,8 @@ void ext4_mb_generate_buddy(struct super + EXT4_SB(sb)->s_mb_buddies_generated++; + EXT4_SB(sb)->s_mb_generation_time += period; + spin_unlock(&EXT4_SB(sb)->s_bal_lock); ++ ++ return 0; + } + + /* The buddy information is attached the buddy cache inode +@@ -898,7 +905,7 @@ static int ext4_mb_init_cache(struct pag + + err = 0; + first_block = page->index * blocks_per_page; +- for (i = 0; i < blocks_per_page; i++) { ++ for (i = 0; i < blocks_per_page && err == 0; i++) { + int group; + + group = (first_block + i) >> 1; +@@ -939,7 +946,7 @@ static int ext4_mb_init_cache(struct pag + ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); +- ext4_mb_generate_buddy(sb, data, incore, group); ++ err = ext4_mb_generate_buddy(sb, data, incore, group); + ext4_unlock_group(sb, group); + incore = NULL; + } else { +@@ -954,7 +961,7 @@ static int ext4_mb_init_cache(struct pag + memcpy(data, bitmap, blocksize); + + /* mark all preallocated blks used in in-core bitmap */ +- ext4_mb_generate_from_pa(sb, data, group); ++ err = ext4_mb_generate_from_pa(sb, data, group); + ext4_mb_generate_from_freelist(sb, data, group); + ext4_unlock_group(sb, group); + +@@ -964,7 +971,8 @@ static int ext4_mb_init_cache(struct pag + incore = data; + } + } +- SetPageUptodate(page); ++ if (likely(err == 0)) ++ SetPageUptodate(page); + + out: + if (bh) { +@@ -2143,9 +2151,11 @@ static void *ext4_mb_seq_groups_next(str + static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + { + struct super_block *sb = seq->private; ++ struct ext4_group_desc *gdp; + ext4_group_t group = (ext4_group_t) ((unsigned long) v); + int i; + int err; ++ int free = 0; + struct ext4_buddy e4b; + struct sg { + struct ext4_group_info info; +@@ -2154,10 +2164,10 @@ static int 
ext4_mb_seq_groups_show(struc + + group--; + if (group == 0) +- seq_printf(seq, "#%-5s: %-5s %-5s %-5s " ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s %-5s %-5s" + "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " + "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", +- "group", "free", "frags", "first", ++ "group", "free", "frags", "first", "first", "pa", + "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", + "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); + +@@ -2168,13 +2178,20 @@ static int ext4_mb_seq_groups_show(struc + seq_printf(seq, "#%-5u: I/O error\n", group); + return 0; + } ++ ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ if (gdp != NULL) ++ free = ext4_free_blks_count(sb, gdp); ++ + ext4_lock_group(sb, group); + memcpy(&sg, ext4_get_group_info(sb, group), i); + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + +- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, +- sg.info.bb_fragments, sg.info.bb_first_free); ++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u %-5u %-5lu [", ++ (long unsigned int)group, sg.info.bb_free, free, ++ sg.info.bb_fragments, sg.info.bb_first_free, ++ sg.info.bb_prealloc_nr); + for (i = 0; i <= 13; i++) + seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? + sg.info.bb_counters[i] : 0); +@@ -3411,23 +3428,68 @@ static void ext4_mb_generate_from_freeli + } + + /* ++ * check free blocks in bitmap match free block in group descriptor ++ * do this before taking preallocated blocks into account to be able ++ * to detect on-disk corruptions. The group lock should be hold by the ++ * caller. ++ */ ++int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap, ++ struct ext4_group_desc *gdp, int group) ++{ ++ unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); ++ unsigned short i, first, free = 0; ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ ++ while (i < max) { ++ first = i; ++ i = mb_find_next_bit(bitmap, max, i); ++ if (i > max) ++ i = max; ++ free += i - first; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ ++ if (free != ext4_free_blks_count(sb, gdp)) { ++ ext4_error(sb, "on-disk bitmap for group %d" ++ "corrupted: %u blocks free in bitmap, %u - in gd\n", ++ group, free, ext4_free_blks_count(sb, gdp)); ++ return -EIO; ++ } ++ return 0; ++} ++ ++/* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap. buddy must be generated from this bitmap + * Need to be called with ext4 group lock held + */ + static noinline_for_stack +-void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_prealloc_space *pa; ++ struct ext4_group_desc *gdp; + struct list_head *cur; + ext4_group_t groupnr; + ext4_grpblk_t start; + int preallocated = 0; + int count = 0; ++ int skip = 0; ++ int err; + int len; + ++ gdp = ext4_get_group_desc (sb, group, NULL); ++ if (gdp == NULL) ++ return -EIO; ++ ++ /* before applying preallocations, check bitmap consistency */ ++ err = ext4_mb_check_ondisk_bitmap(sb, bitmap, gdp, group); ++ if (err) ++ return err; ++ + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. 
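
The checks added above all hang off one counter; how it is kept in step with bb_prealloc_list (the remaining hunks of this patch supply the increments and decrements):

/* bb_prealloc_nr bookkeeping:
 *   ++ beside list_add() in ext4_mb_new_inode_pa() and
 *      ext4_mb_new_group_pa();
 *   -- beside list_del() in ext4_mb_put_pa() and the discard paths;
 * ext4_mb_generate_from_pa() then demands
 *   count + skipped(pa_len == 0) == bb_prealloc_nr
 * and returns -EIO instead of silently regenerating the bitmap. */
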
+ * we don't need any locking here +@@ -3443,14 +3505,23 @@ void ext4_mb_generate_from_pa(struct sup + &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); +- if (unlikely(len == 0)) ++ if (unlikely(len == 0)) { ++ skip++; + continue; ++ } + BUG_ON(groupnr != group); + mb_set_bits(bitmap, start, len); + preallocated += len; + count++; + } ++ if (count + skip != grp->bb_prealloc_nr) { ++ ext4_error(sb, "lost preallocations: " ++ "count %d, bb_prealloc_nr %lu, skip %d\n", ++ count, grp->bb_prealloc_nr, skip); ++ return -EIO; ++ } + mb_debug(1, "prellocated %u for group %u\n", preallocated, group); ++ return 0; + } + + static void ext4_mb_pa_callback(struct rcu_head *head) +@@ -3509,6 +3580,7 @@ static void ext4_mb_put_pa(struct ext4_a + */ + ext4_lock_group(sb, grp); + list_del(&pa->pa_group_list); ++ ext4_get_group_info(sb, grp)->bb_prealloc_nr--; + ext4_unlock_group(sb, grp); + + spin_lock(pa->pa_obj_lock); +@@ -3600,6 +3672,7 @@ ext4_mb_new_inode_pa(struct ext4_allocat + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + spin_lock(pa->pa_obj_lock); +@@ -3661,6 +3734,7 @@ ext4_mb_new_group_pa(struct ext4_allocat + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + /* +@@ -3829,6 +3903,8 @@ repeat: + + spin_unlock(&pa->pa_lock); + ++ BUG_ON(grp->bb_prealloc_nr == 0); ++ grp->bb_prealloc_nr--; + list_del(&pa->pa_group_list); + list_add(&pa->u.pa_tmp_list, &list); + } +@@ -3962,7 +4038,7 @@ repeat: + if (err) { + ext4_error(sb, "Error loading buddy information for %u", + group); +- continue; ++ return; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group); +@@ -3974,6 +4050,8 @@ repeat: + } + + ext4_lock_group(sb, group); ++ BUG_ON(e4b.bd_info->bb_prealloc_nr == 0); ++ e4b.bd_info->bb_prealloc_nr--; + list_del(&pa->pa_group_list); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + ext4_unlock_group(sb, group); +--- a/fs/ext4/mballoc.h ++++ b/fs/ext4/mballoc.h +@@ -87,7 +87,7 @@ extern u8 mb_enable_debug; + /* + * for which requests use 2^N search using buddies + */ +-#define MB_DEFAULT_ORDER2_REQS 2 ++#define MB_DEFAULT_ORDER2_REQS 8 + + /* + * default group prealloc size 512 blocks diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-mballoc-pa_free-mismatch.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-mballoc-pa_free-mismatch.patch new file mode 100644 index 0000000..c21573e --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-mballoc-pa_free-mismatch.patch @@ -0,0 +1,109 @@ +diff -r -u linux-stage.orig/fs/ext4/mballoc.c linux-stage/fs/ext4/mballoc.c +--- linux-stage.orig/fs/ext4/mballoc.c 2012-12-31 15:18:15.000000000 -0500 ++++ linux-stage/fs/ext4/mballoc.c 2012-12-31 15:23:38.000000000 -0500 +@@ -3643,6 +3643,7 @@ + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_INODE_PA; ++ pa->pa_error = 0; + + mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); +@@ -3704,6 +3705,7 @@ + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_GROUP_PA; ++ pa->pa_error = 0; + + mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); +@@ -3764,7 +3766,9 @@ + int err = 0; + int free = 0; + ++ assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + BUG_ON(pa->pa_deleted == 
0); ++ BUG_ON(pa->pa_inode == NULL); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + grp_blk_start = pa->pa_pstart - bit; + BUG_ON(group != e4b->bd_group && pa->pa_len != 0); +@@ -3786,19 +3790,27 @@ + mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); + bit = next + 1; + } +- if (free != pa->pa_free) { +- ext4_msg(e4b->bd_sb, KERN_CRIT, +- "pa %p: logic %lu, phys. %lu, len %lu", +- pa, (unsigned long) pa->pa_lstart, +- (unsigned long) pa->pa_pstart, +- (unsigned long) pa->pa_len); +- ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", ++ ++ /* "free < pa->pa_free" means we maybe double alloc the same blocks, ++ * otherwise maybe leave some free blocks unavailable, no need to BUG.*/ ++ if ((free > pa->pa_free && !pa->pa_error) || (free < pa->pa_free)) { ++ ext4_error(sb, "pa free mismatch: [pa %p] " ++ "[phy %lu] [logic %lu] [len %u] [free %u] " ++ "[error %u] [inode %lu] [freed %u]", pa, ++ (unsigned long)pa->pa_pstart, ++ (unsigned long)pa->pa_lstart, ++ (unsigned)pa->pa_len, (unsigned)pa->pa_free, ++ (unsigned)pa->pa_error, pa->pa_inode->i_ino, ++ free); ++ ext4_grp_locked_error(sb, group, 0, 0, ++ "free %u, pa_free %u", + free, pa->pa_free); + /* + * pa is already deleted so we use the value obtained + * from the bitmap and continue. + */ + } ++ BUG_ON(pa->pa_free != free); + atomic_add(free, &sbi->s_mb_discarded); + + return err; +@@ -4542,6 +4555,25 @@ + ac->ac_b_ex.fe_len = 0; + ar->len = 0; + ext4_mb_show_ac(ac); ++ if (ac->ac_pa) { ++ struct ext4_prealloc_space *pa = ac->ac_pa; ++ ++ /* We can not make sure whether the bitmap has ++ * been updated or not when fail case. So can ++ * not revert pa_free back, just mark pa_error*/ ++ pa->pa_error++; ++ ext4_error(sb, ++ "Updating bitmap error: [err %d] " ++ "[pa %p] [phy %lu] [logic %lu] " ++ "[len %u] [free %u] [error %u] " ++ "[inode %lu]", *errp, pa, ++ (unsigned long)pa->pa_pstart, ++ (unsigned long)pa->pa_lstart, ++ (unsigned)pa->pa_len, ++ (unsigned)pa->pa_free, ++ (unsigned)pa->pa_error, ++ pa->pa_inode ? pa->pa_inode->i_ino : 0); ++ } + } + ext4_mb_release_context(ac); + out: +diff -r -u linux-stage.orig/fs/ext4/mballoc.h linux-stage/fs/ext4/mballoc.h +--- linux-stage.orig/fs/ext4/mballoc.h 2012-12-31 15:18:15.000000000 -0500 ++++ linux-stage/fs/ext4/mballoc.h 2012-12-31 15:19:22.000000000 -0500 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #include "ext4_jbd2.h" + #include "ext4.h" + +@@ -129,6 +130,7 @@ + ext4_grpblk_t pa_free; /* how many blocks are free */ + unsigned short pa_type; /* pa type. 
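
The new pa_error counter relaxes the teardown invariant just enough to survive a failed bitmap update; in summary, based on the two hunks above:

/* pa_error == 0: blocks freed from the PA must equal pa_free at
 *                teardown; any mismatch is reported via ext4_error().
 * pa_error  > 0: a bitmap update failed in the allocation path, so
 *                free > pa_free is tolerated; free < pa_free still
 *                means double allocation and is always reported. */
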
inode or group */ + spinlock_t *pa_obj_lock; ++ unsigned short pa_error; + struct inode *pa_inode; /* hack, for history only */ + }; + diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-misc.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-misc.patch new file mode 100644 index 0000000..efa3eae --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-misc.patch @@ -0,0 +1,196 @@ +--- + fs/ext4/ext4.h | 6 +++++ + fs/ext4/ext4_extents.h | 10 +++++++++ + fs/ext4/ext4_jbd2.h | 3 ++ + fs/ext4/extents.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++ + fs/ext4/super.c | 12 +++++++++++ + 5 files changed, 81 insertions(+) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1308,6 +1308,9 @@ static inline void ext4_clear_state_flag + + #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + ++/* Has been moved to linux/magic.h but we need it for Lustre */ ++#define EXT4_SUPER_MAGIC 0xEF53 ++ + /* + * Codes for operating systems + */ +@@ -1826,6 +1829,9 @@ extern void ext4_add_groupblocks(handle_ + ext4_fsblk_t block, unsigned long count); + extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); + ++extern void ext4_mb_discard_inode_preallocations(struct inode *); ++ ++ + /* inode.c */ + struct buffer_head *ext4_getblk(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +--- a/fs/ext4/ext4_extents.h ++++ b/fs/ext4/ext4_extents.h +@@ -58,6 +58,13 @@ + */ + #define EXT_STATS_ + ++/* ++ * define EXT4_ALLOC_NEEDED to 0 since block bitmap, group desc. and sb ++ * are now accounted in ext4_ext_calc_credits_for_insert() ++ */ ++#define EXT4_ALLOC_NEEDED 0 ++#define HAVE_EXT_PREPARE_CB_EXTENT ++#define HAVE_EXT4_EXT_PBLOCK + + /* + * ext4_inode has i_block array (60 bytes total). +@@ -241,6 +248,7 @@ static inline ext4_fsblk_t ext4_ext_pblo + block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; + return block; + } ++#define ext_pblock(ex) ext4_ext_pblock(ex) + + /* + * ext4_idx_pblock: +@@ -287,6 +295,8 @@ extern int ext4_extent_tree_init(handle_ + extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); ++extern int ext4_ext_calc_credits_for_insert(struct inode *, ++ struct ext4_ext_path *); + extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +--- a/fs/ext4/ext4_jbd2.h ++++ b/fs/ext4/ext4_jbd2.h +@@ -35,6 +35,8 @@ + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + ? 27U : 8U) + ++#define ext4_journal_dirty_metadata(handle, bh) \ ++ ext4_handle_dirty_metadata(handle, NULL, bh) + /* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. */ +@@ -175,6 +177,7 @@ static inline void ext4_journal_callback + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); + } ++#define HAVE_EXT4_JOURNAL_CALLBACK_ADD + + int + ext4_mark_iloc_dirty(handle_t *handle, +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -2205,6 +2205,56 @@ int ext4_ext_calc_credits_for_single_ext + } + + /* ++ * This routine returns max. credits extent tree can consume. ++ * It should be OK for low-performance paths like ->writepage() ++ * To allow many writing process to fit a single transaction, ++ * caller should calculate credits under truncate_mutex and ++ * pass actual path. 
++ */ ++int ext4_ext_calc_credits_for_insert(struct inode *inode, ++ struct ext4_ext_path *path) ++{ ++ int depth, needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ depth = ext_depth(inode); ++ if (le16_to_cpu(path[depth].p_hdr->eh_entries) ++ < le16_to_cpu(path[depth].p_hdr->eh_max)) ++ return 1; ++ } ++ ++ /* ++ * given 32bit logical block (4294967296 blocks), max. tree ++ * can be 4 levels in depth -- 4 * 340^4 == 53453440000. ++ * let's also add one more level for imbalance. ++ */ ++ depth = 5; ++ ++ /* allocation of new data block(s) */ ++ needed = 2; ++ ++ /* ++ * tree can be full, so it'd need to grow in depth: ++ * we need one credit to modify old root, credits for ++ * new root will be added in split accounting ++ */ ++ needed += 1; ++ /* ++ * Index split can happen, we'd need: ++ * allocate intermediate indexes (bitmap + group) ++ * + change two blocks at each level, but root (already included) ++ */ ++ needed += (depth * 2) + (depth * 2); ++ ++ /* any allocation modifies superblock */ ++ needed += 1; ++ ++ return needed; ++} ++EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert); ++ ++/* + * How many index/leaf blocks need to change/allocate to modify nrblocks? + * + * if nrblocks are fit in a single extent (chunk flag is 1), then +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1332,12 +1332,14 @@ enum { + Opt_data_err_abort, Opt_data_err_ignore, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, ++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, + Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, + Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_dioread_nolock, Opt_dioread_lock, ++ Opt_mballoc, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, + }; + +@@ -1390,6 +1392,9 @@ static const match_table_t tokens = { + {Opt_noquota, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, ++ {Opt_iopen, "iopen"}, ++ {Opt_noiopen, "noiopen"}, ++ {Opt_iopen_nopriv, "iopen_nopriv"}, + {Opt_barrier, "barrier=%u"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, +@@ -1409,6 +1414,7 @@ static const match_table_t tokens = { + {Opt_noauto_da_alloc, "noauto_da_alloc"}, + {Opt_dioread_nolock, "dioread_nolock"}, + {Opt_dioread_lock, "dioread_lock"}, ++ {Opt_mballoc, "mballoc"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_init_itable, "init_itable=%u"}, +@@ -1793,6 +1799,10 @@ set_qf_format: + else + clear_opt(sb, BARRIER); + break; ++ case Opt_iopen: ++ case Opt_noiopen: ++ case Opt_iopen_nopriv: ++ break; + case Opt_ignore: + break; + case Opt_resize: +@@ -1904,6 +1914,8 @@ set_qf_format: + case Opt_noinit_itable: + clear_opt(sb, INIT_INODE_TABLE); + break; ++ case Opt_mballoc: ++ break; + default: + ext4_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-osd-iop-common.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-osd-iop-common.patch new file mode 100644 index 0000000..3581df0 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-osd-iop-common.patch @@ -0,0 +1,261 @@ +--- + fs/ext4/ext4.h | 13 +++++ + fs/ext4/namei.c | 131 +++++++++++++++++++++++++++++++++++++------------------- + 2 files changed, 101 insertions(+), 43 deletions(-) + +--- a/fs/ext4/ext4.h 
++++ b/fs/ext4/ext4.h +@@ -1895,6 +1895,19 @@ extern int ext4_orphan_add(handle_t *, s + extern int ext4_orphan_del(handle_t *, struct inode *); + extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); ++extern struct inode *ext4_create_inode(handle_t *handle, ++ struct inode * dir, int mode); ++extern int ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode); ++extern int ext4_delete_entry(handle_t *handle, struct inode * dir, ++ struct ext4_dir_entry_2 * de_del, ++ struct buffer_head * bh); ++extern struct buffer_head * ext4_find_entry(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 ** res_dir); ++#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir) ++extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, ++ struct inode *inode); + + /* resize.c */ + extern int ext4_group_add(struct super_block *sb, +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -24,6 +24,7 @@ + * Theodore Ts'o, 2002 + */ + ++#include + #include + #include + #include +@@ -873,9 +874,9 @@ static inline int search_dirblock(struct + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ +-static struct buffer_head * ext4_find_entry (struct inode *dir, +- const struct qstr *d_name, +- struct ext4_dir_entry_2 ** res_dir) ++struct buffer_head * ext4_find_entry(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 ** res_dir) + { + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; +@@ -981,6 +982,7 @@ cleanup_and_exit: + brelse(bh_use[ra_ptr]); + return ret; + } ++EXPORT_SYMBOL(ext4_find_entry); + + static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, int *err) +@@ -1503,8 +1505,8 @@ static int make_indexed_dir(handle_t *ha + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ +-static int ext4_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++int ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) + { + struct inode *dir = dentry->d_parent->d_inode; + struct buffer_head *bh; +@@ -1555,6 +1557,7 @@ static int ext4_add_entry(handle_t *hand + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); + return retval; + } ++EXPORT_SYMBOL(ext4_add_entry); + + /* + * Returns 0 for success, or a negative error value +@@ -1698,10 +1701,10 @@ cleanup: + * ext4_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +-static int ext4_delete_entry(handle_t *handle, +- struct inode *dir, +- struct ext4_dir_entry_2 *de_del, +- struct buffer_head *bh) ++int ext4_delete_entry(handle_t *handle, ++ struct inode *dir, ++ struct ext4_dir_entry_2 *de_del, ++ struct buffer_head *bh) + { + struct ext4_dir_entry_2 *de, *pde; + unsigned int blocksize = dir->i_sb->s_blocksize; +@@ -1744,7 +1747,7 @@ static int ext4_delete_entry(handle_t *h + } + return -ENOENT; + } +- ++EXPORT_SYMBOL(ext4_delete_entry); + /* + * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, + * since this indicates that nlinks count was previously 1. 
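For context, the un-staticized and exported helpers above (ext4_find_entry, ext4_add_entry, ext4_delete_entry) exist so that an out-of-tree consumer such as Lustre's osd-ldiskfs module can drive ext4 directory operations directly. A minimal, hypothetical caller of the lookup export follows; only the ext4_* signatures and the brelse() obligation come from this patch, while osd_lookup_ino and its surroundings are illustrative:

    /* Illustrative sketch only: assumes the ext4 private headers are on
     * the include path, as they are for the ldiskfs build. */
    static int osd_lookup_ino(struct inode *dir, const struct qstr *name,
                              unsigned long *ino)
    {
            struct ext4_dir_entry_2 *de;
            struct buffer_head *bh;

            bh = ext4_find_entry(dir, name, &de);
            if (bh == NULL)
                    return -ENOENT;
            *ino = le32_to_cpu(de->inode);
            brelse(bh);     /* ext4_find_entry() returns bh with b_count elevated */
            return 0;
    }

The same pattern applies to ext4_add_entry()/ext4_delete_entry() under a running handle; the ll_ext4_find_entry() macro above merely adapts a dentry-based caller to the qstr-based export.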
+@@ -1808,6 +1811,28 @@ static unsigned ext4_dentry_goal(struct + return inum; + } + ++struct inode *ext4_create_inode(handle_t *handle, struct inode * dir, int mode) ++{ ++ struct inode *inode; ++ ++ inode = ext4_new_inode(handle, dir, mode, NULL, ++ EXT4_SB(dir->i_sb)->s_inode_goal); ++ if (!IS_ERR(inode)) { ++ if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) { ++#ifdef CONFIG_LDISKFS_FS_XATTR ++ inode->i_op = &ext4_special_inode_operations; ++#endif ++ } else { ++ inode->i_op = &ext4_file_inode_operations; ++ inode->i_fop = &ext4_file_operations; ++ ext4_set_aops(inode); ++ } ++ unlock_new_inode(inode); ++ } ++ return inode; ++} ++EXPORT_SYMBOL(ext4_create_inode); ++ + /* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it +@@ -1886,46 +1911,32 @@ retry: + return err; + } + +-static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) ++/* Initialize @inode as a subdirectory of @dir, and add the ++ * "." and ".." entries into the first directory block. */ ++int ext4_add_dot_dotdot(handle_t *handle, struct inode * dir, ++ struct inode *inode) + { +- handle_t *handle; +- struct inode *inode; +- struct buffer_head *dir_block = NULL; ++ struct buffer_head *dir_block; + struct ext4_dir_entry_2 *de; + unsigned int blocksize = dir->i_sb->s_blocksize; +- int err, retries = 0; ++ int err = 0; + +- if (EXT4_DIR_LINK_MAX(dir)) +- return -EMLINK; +- +- dquot_initialize(dir); +- +-retry: +- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + +- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + +- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + +- inode = ext4_new_inode(handle, dir, S_IFDIR | mode, +- &dentry->d_name, ext4_dentry_goal(dir->i_sb, dentry)); +- err = PTR_ERR(inode); +- if (IS_ERR(inode)) +- goto out_stop; +- + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; + inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; + dir_block = ext4_bread(handle, inode, 0, 1, &err); + if (!dir_block) +- goto out_clear_inode; ++ goto get_out; + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) +- goto out_clear_inode; ++ goto get_out; + de = (struct ext4_dir_entry_2 *) dir_block->b_data; + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; +@@ -1944,18 +1955,47 @@ retry: + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, dir_block); + if (err) +- goto out_clear_inode; ++ goto get_out; + err = ext4_mark_inode_dirty(handle, inode); +- if (!err) +- err = ext4_add_entry(handle, dentry, inode); +- if (err) { +-out_clear_inode: +- clear_nlink(inode); +- unlock_new_inode(inode); +- ext4_mark_inode_dirty(handle, inode); +- iput(inode); ++get_out: ++ brelse(dir_block); ++ return err; ++} ++EXPORT_SYMBOL(ext4_add_dot_dotdot); ++ ++static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) ++{ ++ handle_t *handle; ++ struct inode *inode; ++ int err, retries = 0; ++ ++ if (EXT4_DIR_LINK_MAX(dir)) ++ return -EMLINK; ++ ++ dquot_initialize(dir); ++ ++retry: ++ handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + ++ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + ++ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_DIRSYNC(dir)) ++ ext4_handle_sync(handle); ++ ++ inode = ext4_new_inode(handle, dir, 
S_IFDIR | mode, &dentry->d_name, ++ ext4_dentry_goal(dir->i_sb, dentry)); ++ err = PTR_ERR(inode); ++ if (IS_ERR(inode)) + goto out_stop; +- } ++ ++ err = ext4_add_dot_dotdot(handle, dir, inode); ++ if (err) ++ goto out_clear_inode; ++ err = ext4_add_entry(handle, dentry, inode); ++ if (err) ++ goto out_clear_inode; + ext4_inc_count(handle, dir); + ext4_update_dx_flag(dir); + err = ext4_mark_inode_dirty(handle, dir); +@@ -1964,11 +2004,16 @@ out_clear_inode: + d_instantiate(dentry, inode); + unlock_new_inode(inode); + out_stop: +- brelse(dir_block); + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; ++out_clear_inode: ++ clear_nlink(inode); ++ unlock_new_inode(inode); ++ ext4_mark_inode_dirty(handle, inode); ++ iput(inode); ++ goto out_stop; + } + + /* diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-prealloc.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-prealloc.patch new file mode 100644 index 0000000..6f3585b --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-prealloc.patch @@ -0,0 +1,376 @@ +diff -u -r linux-stage.orig/fs/ext4/ext4.h linux-stage/fs/ext4/ext4.h +--- linux-stage.orig/fs/ext4/ext4.h 2012-12-31 11:12:36.000000000 -0500 ++++ linux-stage/fs/ext4/ext4.h 2012-12-31 11:12:48.000000000 -0500 +@@ -1170,11 +1170,14 @@ + + /* tunables */ + unsigned long s_stripe; +- unsigned int s_mb_stream_request; ++ unsigned long s_mb_small_req; ++ unsigned long s_mb_large_req; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; ++ unsigned long *s_mb_prealloc_table; ++ unsigned long s_mb_prealloc_table_size; + unsigned int s_mb_group_prealloc; + unsigned int s_max_writeback_mb_bump; + /* where last allocation was done - for stream allocation */ +diff -u -r linux-stage.orig/fs/ext4/inode.c linux-stage/fs/ext4/inode.c +--- linux-stage.orig/fs/ext4/inode.c 2012-12-31 11:12:36.000000000 -0500 ++++ linux-stage/fs/ext4/inode.c 2012-12-31 11:12:48.000000000 -0500 +@@ -2937,6 +2937,11 @@ + if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) + return -EROFS; + ++ if (wbc->nr_to_write < sbi->s_mb_small_req) { ++ nr_to_writebump = sbi->s_mb_small_req - wbc->nr_to_write; ++ wbc->nr_to_write = sbi->s_mb_small_req; ++ } ++ + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + +diff -u -r linux-stage.orig/fs/ext4/mballoc.c linux-stage/fs/ext4/mballoc.c +--- linux-stage.orig/fs/ext4/mballoc.c 2012-12-31 11:12:36.000000000 -0500 ++++ linux-stage/fs/ext4/mballoc.c 2012-12-31 11:20:51.000000000 -0500 +@@ -1799,6 +1799,25 @@ + } + } + ++static void ext4_mb_prealloc_table_add(struct ext4_sb_info *sbi, int value) ++{ ++ int i; ++ ++ if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group)) ++ return; ++ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) { ++ if (sbi->s_mb_prealloc_table[i] == 0) { ++ sbi->s_mb_prealloc_table[i] = value; ++ return; ++ } ++ ++ /* they should add values in order */ ++ if (value <= sbi->s_mb_prealloc_table[i]) ++ return; ++ } ++} ++ + /* + * The routine scans the group and measures all found extents. 
+ * In order to optimize scanning, caller must pass number of +@@ -2172,6 +2191,80 @@ + .show = ext4_mb_seq_groups_show, + }; + ++#define EXT4_MB_PREALLOC_TABLE "prealloc_table" ++ ++static int ext4_mb_prealloc_table_proc_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ struct ext4_sb_info *sbi = data; ++ int len = 0; ++ int i; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) ++ len += sprintf(page + len, "%ld ", ++ sbi->s_mb_prealloc_table[i]); ++ len += sprintf(page + len, "\n"); ++ ++ *start = page; ++ return len; ++} ++ ++static int ext4_mb_prealloc_table_proc_write(struct file *file, ++ const char __user *buf, ++ unsigned long cnt, void *data) ++{ ++ struct ext4_sb_info *sbi = data; ++ unsigned long value; ++ unsigned long prev = 0; ++ char str[128]; ++ char *cur; ++ char *end; ++ unsigned long *new_table; ++ int num = 0; ++ int i = 0; ++ ++ if (cnt >= sizeof(str)) ++ return -EINVAL; ++ if (copy_from_user(str, buf, cnt)) ++ return -EFAULT; ++ ++ num = 0; ++ cur = str; ++ end = str + cnt; ++ while (cur < end) { ++ while ((cur < end) && (*cur == ' ')) cur++; ++ value = simple_strtol(cur, &cur, 0); ++ if (value == 0) ++ break; ++ if (value <= prev) ++ return -EINVAL; ++ prev = value; ++ num++; ++ } ++ ++ new_table = kmalloc(num * sizeof(*new_table), GFP_KERNEL); ++ if (new_table == NULL) ++ return -ENOMEM; ++ kfree(sbi->s_mb_prealloc_table); ++ memset(new_table, 0, num * sizeof(*new_table)); ++ sbi->s_mb_prealloc_table = new_table; ++ sbi->s_mb_prealloc_table_size = num; ++ cur = str; ++ end = str + cnt; ++ while (cur < end && i < num) { ++ while ((cur < end) && (*cur == ' ')) cur++; ++ value = simple_strtol(cur, &cur, 0); ++ ext4_mb_prealloc_table_add(sbi, value); ++ i++; ++ } ++ ++ return cnt; ++} ++ + static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) + { + struct super_block *sb = PDE(inode)->data; +@@ -2469,9 +2562,52 @@ + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; +- sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; +- sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; ++ ++ if (sbi->s_stripe == 0) { ++ sbi->s_mb_prealloc_table_size = 10; ++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long); ++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ memset(sbi->s_mb_prealloc_table, 0, i); ++ ++ ext4_mb_prealloc_table_add(sbi, 4); ++ ext4_mb_prealloc_table_add(sbi, 8); ++ ext4_mb_prealloc_table_add(sbi, 16); ++ ext4_mb_prealloc_table_add(sbi, 32); ++ ext4_mb_prealloc_table_add(sbi, 64); ++ ext4_mb_prealloc_table_add(sbi, 128); ++ ext4_mb_prealloc_table_add(sbi, 256); ++ ext4_mb_prealloc_table_add(sbi, 512); ++ ext4_mb_prealloc_table_add(sbi, 1024); ++ ext4_mb_prealloc_table_add(sbi, 2048); ++ ++ sbi->s_mb_small_req = 256; ++ sbi->s_mb_large_req = 1024; ++ sbi->s_mb_group_prealloc = 512; ++ } else { ++ sbi->s_mb_prealloc_table_size = 3; ++ i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long); ++ sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return -ENOMEM; ++ } ++ memset(sbi->s_mb_prealloc_table, 0, i); ++ ++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe); ++ ext4_mb_prealloc_table_add(sbi, 
sbi->s_stripe * 2); ++ ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 4); ++ ++ sbi->s_mb_small_req = sbi->s_stripe; ++ sbi->s_mb_large_req = sbi->s_stripe * 8; ++ sbi->s_mb_group_prealloc = sbi->s_stripe * 4; ++ } + + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); + if (sbi->s_locality_groups == NULL) { +@@ -2487,12 +2623,22 @@ + spin_lock_init(&lg->lg_prealloc_lock); + } + +- if (sbi->s_proc) ++ if (sbi->s_proc) { ++ struct proc_dir_entry *p; + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_fops, sb); ++ p = create_proc_entry(EXT4_MB_PREALLOC_TABLE, S_IFREG | ++ S_IRUGO | S_IWUSR, sbi->s_proc); ++ if (p) { ++ p->data = sbi; ++ p->read_proc = ext4_mb_prealloc_table_proc_read; ++ p->write_proc = ext4_mb_prealloc_table_proc_write; ++ } ++ } + + out: + if (ret) { ++ kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + } +@@ -2528,8 +2674,10 @@ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + +- if (sbi->s_proc) ++ if (sbi->s_proc) { + remove_proc_entry("mb_groups", sbi->s_proc); ++ remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc); ++ } + + if (sbi->s_group_info) { + for (i = 0; i < ngroups; i++) { +@@ -2859,11 +3007,12 @@ + ext4_mb_normalize_request(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) + { +- int bsbits, max; ++ int bsbits, i, wind; + ext4_lblk_t end; +- loff_t size, orig_size, start_off; ++ loff_t size, orig_size; + ext4_lblk_t start; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); ++ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_prealloc_space *pa; + + /* do normalize only data requests, metadata requests +@@ -2894,49 +3043,34 @@ + if (size < i_size_read(ac->ac_inode)) + size = i_size_read(ac->ac_inode); + orig_size = size; ++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits; + +- /* max size of free chunks */ +- max = 2 << bsbits; ++ start = wind = 0; + +-#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ +- (req <= (size) || max <= (chunk_size)) ++ /* let's choose preallocation window depending on file size */ ++ for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) { ++ if (size <= sbi->s_mb_prealloc_table[i]) { ++ wind = sbi->s_mb_prealloc_table[i]; ++ break; ++ } ++ } ++ size = wind; + +- /* first, try to predict filesize */ +- /* XXX: should this table be tunable? 
*/ +- start_off = 0; +- if (size <= 16 * 1024) { +- size = 16 * 1024; +- } else if (size <= 32 * 1024) { +- size = 32 * 1024; +- } else if (size <= 64 * 1024) { +- size = 64 * 1024; +- } else if (size <= 128 * 1024) { +- size = 128 * 1024; +- } else if (size <= 256 * 1024) { +- size = 256 * 1024; +- } else if (size <= 512 * 1024) { +- size = 512 * 1024; +- } else if (size <= 1024 * 1024) { +- size = 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (21 - bsbits)) << 21; +- size = 2 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (22 - bsbits)) << 22; +- size = 4 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, +- (8<<20)>>bsbits, max, 8 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (23 - bsbits)) << 23; +- size = 8 * 1024 * 1024; +- } else { +- start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; +- size = ac->ac_o_ex.fe_len << bsbits; ++ if (wind == 0) { ++ __u64 tstart, tend; ++ /* file is quite large, we now preallocate with ++ * the biggest configured window with regart to ++ * logical offset */ ++ wind = sbi->s_mb_prealloc_table[i - 1]; ++ tstart = ac->ac_o_ex.fe_logical; ++ do_div(tstart, wind); ++ start = tstart * wind; ++ tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1; ++ do_div(tend, wind); ++ tend = tend * wind + wind; ++ size = tend - start; + } +- size = size >> bsbits; +- start = start_off >> bsbits; ++ orig_size = size; + + /* don't cover already allocated blocks in selected range */ + if (ar->pleft && start <= ar->lleft) { +@@ -3008,7 +3143,6 @@ + } + BUG_ON(start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical); +- BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + + /* now prepare goal request */ + +@@ -3940,11 +4074,19 @@ + + /* don't use group allocation for large files */ + size = max(size, isize); +- if (size > sbi->s_mb_stream_request) { ++ if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) || ++ (size >= sbi->s_mb_large_req)) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + ++ /* ++ * request is so large that we don't care about ++ * streaming - it overweights any possible seek ++ */ ++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req) ++ return; ++ + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. 
The reason for having +diff -u -r linux-stage.orig/fs/ext4/super.c linux-stage/fs/ext4/super.c +--- linux-stage.orig/fs/ext4/super.c 2012-12-31 11:12:36.000000000 -0500 ++++ linux-stage/fs/ext4/super.c 2012-12-31 11:12:48.000000000 -0500 +@@ -2531,7 +2531,8 @@ + EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +-EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); ++EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req); ++EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req); + EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); + EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); + +@@ -2548,7 +2549,8 @@ + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), +- ATTR_LIST(mb_stream_req), ++ ATTR_LIST(mb_small_req), ++ ATTR_LIST(mb_large_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), + NULL, diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-read-write.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-read-write.patch new file mode 100644 index 0000000..31f2dd3 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-read-write.patch @@ -0,0 +1,32 @@ +--- + fs/ext4/super.c | 12 ------------ + 1 file changed, 12 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -3216,10 +3216,6 @@ static int ext4_fill_super(struct super_ + goto cantfind_ext4; + sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); + +-#ifndef CONFIG_EXT4_FS_RW +- sb->s_flags |= MS_RDONLY; +- ext4_msg(sb, KERN_INFO, "ext4 is supported in read-only mode only"); +-#endif + /* Set defaults before we parse the mount options */ + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + set_opt(sb, INIT_INODE_TABLE); +@@ -4487,14 +4483,6 @@ static int ext4_remount(struct super_blo + if (sbi->s_journal && sbi->s_journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + +-#ifndef CONFIG_EXT4_FS_RW +- if (!(*flags & MS_RDONLY)) { +- *flags |= MS_RDONLY; +- ext4_msg(sb, KERN_INFO, +- "ext4 is supported in read-only mode only"); +- } +-#endif +- + /* + * Allow the "check" option to be passed as a remount option. + */ diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-speed-up-fitrim-by-recording-flags-in-ext4_group_info.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-speed-up-fitrim-by-recording-flags-in-ext4_group_info.patch new file mode 100644 index 0000000..7e5a6b4 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-speed-up-fitrim-by-recording-flags-in-ext4_group_info.patch @@ -0,0 +1,159 @@ +From 3d56b8d2c74cc3f375ce332b3ac3519e009d79ee Mon Sep 17 00:00:00 2001 +From: Tao Ma +Date: Mon, 11 Jul 2011 00:03:38 -0400 +Subject: ext4: Speed up FITRIM by recording flags in ext4_group_info +Git-commit: 3d56b8d2 +Patch-mainline: v3.1-rc1 + +In ext4, when FITRIM is called every time, we iterate all the +groups and do trim one by one. It is a bit time wasting if the +group has been trimmed and there is no change since the last +trim. + +So this patch adds a new flag in ext4_group_info->bb_state to +indicate that the group has been trimmed, and it will be cleared +if some blocks is freed(in release_blocks_on_commit). Another +trim_minlen is added in ext4_sb_info to record the last minlen +we use to trim the volume, so that if the caller provide a small +one, we will go on the trim regardless of the bb_state. 
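(The "ftrim" binary used in the timings below is not shipped with the patch; a sketch of such a test program, using only the standard FITRIM ioctl ABI — note the kernel hands the number of bytes actually trimmed back in range.len, as the last hunk of this patch shows:)

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>

    int main(int argc, char **argv)
    {
            struct fstrim_range range = {
                    .start  = 0,
                    .len    = UINT64_MAX,
                    .minlen = 1048576,      /* same parameters as the test below */
            };
            int fd;

            if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0) {
                    perror("open");
                    return 1;
            }
            if (ioctl(fd, FITRIM, &range) < 0) {      /* trim free space */
                    perror("FITRIM");
                    return 1;
            }
            printf("trimmed %llu bytes\n", (unsigned long long)range.len);
            return 0;
    }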
+ +A simple test with my intel x25m ssd: +df -h shows: +/dev/sdb1 40G 21G 17G 56% /mnt/ext4 +Block size: 4096 + +run the FITRIM with the following parameter: +range.start = 0; +range.len = UINT64_MAX; +range.minlen = 1048576; + +without the patch: +[root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a +real 0m5.505s +user 0m0.000s +sys 0m1.224s +[root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a +real 0m5.359s +user 0m0.000s +sys 0m1.178s +[root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a +real 0m5.228s +user 0m0.000s +sys 0m1.151s + +with the patch: +[root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a +real 0m5.625s +user 0m0.000s +sys 0m1.269s +[root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a +real 0m0.002s +user 0m0.000s +sys 0m0.001s +[root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a +real 0m0.002s +user 0m0.000s +sys 0m0.001s + +A big improvement for the 2nd and 3rd run. + +Even after I delete some big image files, it is still much +faster than iterating the whole disk. + +[root@boyu-tm test]# time ./ftrim /mnt/ext4/a +real 0m1.217s +user 0m0.000s +sys 0m0.196s + +Upstream-Cc: Lukas Czerner +Upstream-Reviewed-by: Andreas Dilger +Upstream-Signed-off-by: Tao Ma +Upstream-Signed-off-by: "Theodore Ts'o" +Signed-off-by: Jeff Mahoney +--- + fs/ext4/ext4.h | 13 ++++++++++++- + fs/ext4/mballoc.c | 20 ++++++++++++++++++++ + 2 files changed, 32 insertions(+), 1 deletion(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1215,6 +1215,9 @@ struct ext4_sb_info { + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; ++ ++ /* record the last minlen when FITRIM is called. */ ++ atomic_t s_last_trim_minblks; + }; + + static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +@@ -2071,11 +2074,19 @@ struct ext4_group_info { + * 5 free 8-block regions. */ + }; + +-#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 ++#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 ++#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 + + #define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) + ++#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ ++ (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) ++#define EXT4_MB_GRP_SET_TRIMMED(grp) \ ++ (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) ++#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ ++ (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) ++ + #define EXT4_MAX_CONTENTION 8 + #define EXT4_CONTENTION_THRESHOLD 2 + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -2629,6 +2629,15 @@ static void release_blocks_on_commit(jou + rb_erase(&entry->node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + ++ /* ++ * Clear the trimmed flag for the group so that the next ++ * ext4_trim_fs can trim it. ++ * If the volume is mounted with -o discard, online discard ++ * is supported and the free blocks will be trimmed online. ++ */ ++ if (!test_opt(sb, DISCARD)) ++ EXT4_MB_GRP_CLEAR_TRIMMED(db); ++ + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() +@@ -4838,6 +4847,10 @@ ext4_trim_all_free(struct super_block *s + bitmap = e4b.bd_bitmap; + + ext4_lock_group(sb, group); ++ if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && ++ minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) ++ goto out; ++ + start = (e4b.bd_info->bb_first_free > start) ? 
+ e4b.bd_info->bb_first_free : start; + +@@ -4868,6 +4881,10 @@ ext4_trim_all_free(struct super_block *s + if ((e4b.bd_info->bb_free - count) < minblocks) + break; + } ++ ++ if (!ret) ++ EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); ++out: + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + +@@ -4954,5 +4971,8 @@ int ext4_trim_fs(struct super_block *sb, + } + range->len = trimmed * sb->s_blocksize; + ++ if (!ret) ++ atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); ++ + return ret; + } diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-store-tree-generation-at-find.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-store-tree-generation-at-find.patch new file mode 100644 index 0000000..e15fc03 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-store-tree-generation-at-find.patch @@ -0,0 +1,64 @@ +diff -u -r linux-stage.orig/fs/ext4/ext4_extents.h linux-stage/fs/ext4/ext4_extents.h +--- linux-stage.orig/fs/ext4/ext4_extents.h 2013-01-02 10:14:02.000000000 -0500 ++++ linux-stage/fs/ext4/ext4_extents.h 2013-01-02 10:14:14.000000000 -0500 +@@ -113,6 +113,7 @@ + * Truncate uses it to simulate recursive walking. + */ + struct ext4_ext_path { ++ unsigned long p_generation; + ext4_fsblk_t p_block; + __u16 p_depth; + struct ext4_extent *p_ext; +diff -u -r linux-stage.orig/fs/ext4/extents.c linux-stage/fs/ext4/extents.c +--- linux-stage.orig/fs/ext4/extents.c 2013-01-02 10:14:02.000000000 -0500 ++++ linux-stage/fs/ext4/extents.c 2013-01-02 10:16:57.000000000 -0500 +@@ -1882,7 +1882,7 @@ + { + struct ext4_ext_path *path = NULL; + struct ext4_ext_cache cbex; +- struct ext4_extent *ex; ++ struct ext4_extent _ex, *ex; + ext4_lblk_t next, start = 0, end = 0; + ext4_lblk_t last = block + num; + int depth, exists, err = 0; +@@ -1895,21 +1895,29 @@ + /* find extent for this block */ + down_read(&EXT4_I(inode)->i_data_sem); + path = ext4_ext_find_extent(inode, block, path); +- up_read(&EXT4_I(inode)->i_data_sem); + if (IS_ERR(path)) { ++ up_read(&EXT4_I(inode)->i_data_sem); + err = PTR_ERR(path); + path = NULL; + break; + } + ++ path[0].p_generation = EXT4_I(inode)->i_ext_generation; ++ + depth = ext_depth(inode); + if (unlikely(path[depth].p_hdr == NULL)) { ++ up_read(&EXT4_I(inode)->i_data_sem); + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + err = -EIO; + break; + } +- ex = path[depth].p_ext; ++ ex = NULL; ++ if (path[depth].p_ext) { ++ _ex = *path[depth].p_ext; ++ ex = &_ex; ++ } + next = ext4_ext_next_allocated_block(path); ++ up_read(&EXT4_I(inode)->i_data_sem); + + exists = 0; + if (!ex) { +@@ -1961,7 +1969,7 @@ + err = -EIO; + break; + } +- err = func(inode, path, &cbex, ex, cbdata); ++ err = func(inode, path, &cbex, NULL, cbdata); + ext4_ext_drop_refs(path); + + if (err < 0) diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-use-ext4_kvzalloc-ext4_kvmalloc-for-s_group_desc-and-s_group_info.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-use-ext4_kvzalloc-ext4_kvmalloc-for-s_group_desc-and-s_group_info.patch new file mode 100644 index 0000000..6c3a498 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-use-ext4_kvzalloc-ext4_kvmalloc-for-s_group_desc-and-s_group_info.patch @@ -0,0 +1,121 @@ +From f18a5f21c25707b4fe64b326e2b4d150565e7300 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Mon, 1 Aug 2011 08:45:38 -0400 +Subject: ext4: use ext4_kvzalloc()/ext4_kvmalloc() for s_group_desc and s_group_info +Git-commit: f18a5f21 +Patch-mainline: v3.1-rc1 + +Upstream-Signed-off-by: "Theodore Ts'o" +Signed-off-by: Jeff Mahoney 
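The ext4_kvmalloc()/ext4_kvzalloc()/ext4_kvfree() helpers themselves are introduced by a companion upstream commit and do not appear in this diff; a condensed sketch of the pattern they implement, reconstructed from the upstream fs/ext4/super.c (illustrative, not the verbatim implementation):

    #include <linux/slab.h>
    #include <linux/vmalloc.h>

    void *ext4_kvmalloc(size_t size, gfp_t flags)
    {
            void *ret;

            /* try a physically contiguous allocation first, quietly */
            ret = kmalloc(size, flags | __GFP_NOWARN);
            if (!ret)
                    ret = __vmalloc(size, flags, PAGE_KERNEL);  /* fall back */
            return ret;
    }

    void ext4_kvfree(void *ptr)
    {
            /* free with whichever allocator actually satisfied the request */
            if (is_vmalloc_addr(ptr))
                    vfree(ptr);
            else
                    kfree(ptr);
    }

The point of switching s_group_desc and s_group_info over to these helpers is that large group-descriptor arrays no longer fail (or warn loudly) when physically contiguous memory is fragmented, yet small allocations keep the cheap kmalloc path.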
+--- + fs/ext4/mballoc.c | 6 +++--- + fs/ext4/resize.c | 13 +++++++------ + fs/ext4/super.c | 9 +++++---- + 3 files changed, 15 insertions(+), 13 deletions(-) + +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index fa716c9..d5021e8 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -2331,7 +2331,7 @@ static int ext4_mb_init_backend(struct super_block *sb) + /* An 8TB filesystem with 64-bit pointers requires a 4096 byte + * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. + * So a two level scheme suffices for now. */ +- sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); ++ sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); + if (sbi->s_group_info == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); + return -ENOMEM; +@@ -2365,7 +2365,7 @@ err_freebuddy: + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); + err_freesgi: +- kfree(sbi->s_group_info); ++ ext4_kvfree(sbi->s_group_info); + return -ENOMEM; + } + +@@ -2559,7 +2559,7 @@ int ext4_mb_release(struct super_block *sb) + EXT4_DESC_PER_BLOCK_BITS(sb); + for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); +- kfree(sbi->s_group_info); ++ ext4_kvfree(sbi->s_group_info); + } + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); +diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c +index 71085df..707d3f1 100644 +--- a/fs/ext4/resize.c ++++ b/fs/ext4/resize.c +@@ -467,12 +467,13 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, + if (unlikely(err)) + goto exit_dindj; + +- n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), +- GFP_NOFS); ++ n_group_desc = ext4_kvmalloc((gdb_num + 1) * ++ sizeof(struct buffer_head *), ++ GFP_NOFS); + if (!n_group_desc) { + err = -ENOMEM; +- ext4_warning(sb, +- "not enough memory for %lu groups", gdb_num + 1); ++ ext4_warning(sb, "not enough memory for %lu groups", ++ gdb_num + 1); + goto exit_inode; + } + +@@ -507,7 +508,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, + n_group_desc[gdb_num] = *primary; + EXT4_SB(sb)->s_group_desc = n_group_desc; + EXT4_SB(sb)->s_gdb_count++; +- kfree(o_group_desc); ++ ext4_kvfree(o_group_desc); + + le16_add_cpu(&es->s_reserved_gdt_blocks, -1); + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); +@@ -517,7 +518,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, + return err; + + exit_inode: +- kfree(n_group_desc); ++ ext4_kvfree(n_group_desc); + /* ext4_handle_release_buffer(handle, iloc.bh); */ + brelse(iloc.bh); + exit_dindj: +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 658f586..e2d88ba 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -819,7 +819,7 @@ static void ext4_put_super(struct super_block *sb) + + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(sbi->s_group_desc[i]); +- kfree(sbi->s_group_desc); ++ ext4_kvfree(sbi->s_group_desc); + ext4_kvfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); +@@ -3439,8 +3439,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); + db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); +- sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), +- GFP_KERNEL); ++ sbi->s_group_desc = ext4_kvmalloc(db_count * ++ sizeof(struct buffer_head *), ++ GFP_KERNEL); + if (sbi->s_group_desc == NULL) { + ext4_msg(sb, KERN_ERR, "not enough memory"); + 
goto failed_mount; +@@ -3783,7 +3784,7 @@ failed_mount3: + failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); +- kfree(sbi->s_group_desc); ++ ext4_kvfree(sbi->s_group_desc); + failed_mount: + if (sbi->s_proc) { + remove_proc_entry(sb->s_id, ext4_proc_root); + diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-use-ext4_msg-instead-of-printk-in-mballoc.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-use-ext4_msg-instead-of-printk-in-mballoc.patch new file mode 100644 index 0000000..b19e58a --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-use-ext4_msg-instead-of-printk-in-mballoc.patch @@ -0,0 +1,196 @@ +From 9d8b9ec44234b2f6e0225300632d250210c04f11 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Mon, 1 Aug 2011 17:41:35 -0400 +Subject: ext4: use ext4_msg() instead of printk in mballoc +Git-commit: 9d8b9ec4 +Patch-mainline: v3.1-rc1 + +Upstream-Signed-off-by: "Theodore Ts'o" +Signed-off-by: Jeff Mahoney +--- + fs/ext4/mballoc.c | 79 ++++++++++++++++++++++++++++------------------------- + 1 files changed, 42 insertions(+), 37 deletions(-) + +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index d5021e8..70d1b3e 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -493,10 +493,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) + b2 = (unsigned char *) bitmap; + for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { + if (b1[i] != b2[i]) { +- printk(KERN_ERR "corruption in group %u " +- "at byte %u(%u): %x in copy != %x " +- "on disk/prealloc\n", +- e4b->bd_group, i, i * 8, b1[i], b2[i]); ++ ext4_msg(e4b->bd_sb, KERN_ERR, ++ "corruption in group %u " ++ "at byte %u(%u): %x in copy != %x " ++ "on disk/prealloc", ++ e4b->bd_group, i, i * 8, b1[i], b2[i]); + BUG(); + } + } +@@ -2224,8 +2225,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, + EXT4_DESC_PER_BLOCK_BITS(sb); + meta_group_info = kmalloc(metalen, GFP_KERNEL); + if (meta_group_info == NULL) { +- printk(KERN_ERR "EXT4-fs: can't allocate mem for a " +- "buddy group\n"); ++ ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " ++ "for a buddy group"); + goto exit_meta_group_info; + } + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = +@@ -2238,7 +2239,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, + + meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); + if (meta_group_info[i] == NULL) { +- printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); ++ ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); + goto exit_group_info; + } + memset(meta_group_info[i], 0, kmem_cache_size(cachep)); +@@ -2333,12 +2334,12 @@ static int ext4_mb_init_backend(struct super_block *sb) + * So a two level scheme suffices for now. 
*/ + sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); + if (sbi->s_group_info == NULL) { +- printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); ++ ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); + return -ENOMEM; + } + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { +- printk(KERN_ERR "EXT4-fs: can't get new inode\n"); ++ ext4_msg(sb, KERN_ERR, "can't get new inode"); + goto err_freesgi; + } + sbi->s_buddy_cache->i_ino = get_next_ino(); +@@ -2346,8 +2347,7 @@ static int ext4_mb_init_backend(struct super_block *sb) + for (i = 0; i < ngroups; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc == NULL) { +- printk(KERN_ERR +- "EXT4-fs: can't read descriptor %u\n", i); ++ ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); + goto err_freebuddy; + } + if (ext4_mb_add_groupinfo(sb, i, desc) != 0) +@@ -2411,7 +2411,8 @@ static int ext4_groupinfo_create_slab(size_t size) + + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + if (!cachep) { +- printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); ++ printk(KERN_EMERG ++ "EXT4-fs: no memory for groupinfo slab cache\n"); + return -ENOMEM; + } + +@@ -2566,25 +2567,25 @@ int ext4_mb_release(struct super_block *sb) + if (sbi->s_buddy_cache) + iput(sbi->s_buddy_cache); + if (sbi->s_mb_stats) { +- printk(KERN_INFO +- "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", ++ ext4_msg(sb, KERN_INFO, ++ "mballoc: %u blocks %u reqs (%u success)", + atomic_read(&sbi->s_bal_allocated), + atomic_read(&sbi->s_bal_reqs), + atomic_read(&sbi->s_bal_success)); +- printk(KERN_INFO +- "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " +- "%u 2^N hits, %u breaks, %u lost\n", ++ ext4_msg(sb, KERN_INFO, ++ "mballoc: %u extents scanned, %u goal hits, " ++ "%u 2^N hits, %u breaks, %u lost", + atomic_read(&sbi->s_bal_ex_scanned), + atomic_read(&sbi->s_bal_goals), + atomic_read(&sbi->s_bal_2orders), + atomic_read(&sbi->s_bal_breaks), + atomic_read(&sbi->s_mb_lost_chunks)); +- printk(KERN_INFO +- "EXT4-fs: mballoc: %lu generated and it took %Lu\n", ++ ext4_msg(sb, KERN_INFO, ++ "mballoc: %lu generated and it took %Lu", + sbi->s_mb_buddies_generated++, + sbi->s_mb_generation_time); +- printk(KERN_INFO +- "EXT4-fs: mballoc: %u preallocated, %u discarded\n", ++ ext4_msg(sb, KERN_INFO, ++ "mballoc: %u preallocated, %u discarded", + atomic_read(&sbi->s_mb_preallocated), + atomic_read(&sbi->s_mb_discarded)); + } +@@ -3024,9 +3025,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, + + if (start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical) { +- printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", +- (unsigned long) start, (unsigned long) size, +- (unsigned long) ac->ac_o_ex.fe_logical); ++ ext4_msg(ac->ac_sb, KERN_ERR, ++ "start %lu, size %lu, fe_logical %lu", ++ (unsigned long) start, (unsigned long) size, ++ (unsigned long) ac->ac_o_ex.fe_logical); + } + BUG_ON(start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical); +@@ -3607,10 +3609,11 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + bit = next + 1; + } + if (free != pa->pa_free) { +- printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", +- pa, (unsigned long) pa->pa_lstart, +- (unsigned long) pa->pa_pstart, +- (unsigned long) pa->pa_len); ++ ext4_msg(e4b->bd_sb, KERN_CRIT, ++ "pa %p: logic %lu, phys. 
%lu, len %lu", ++ pa, (unsigned long) pa->pa_lstart, ++ (unsigned long) pa->pa_pstart, ++ (unsigned long) pa->pa_len); + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", + free, pa->pa_free); + /* +@@ -3798,7 +3801,8 @@ repeat: + * use preallocation while we're discarding it */ + spin_unlock(&pa->pa_lock); + spin_unlock(&ei->i_prealloc_lock); +- printk(KERN_ERR "uh-oh! used pa while discarding\n"); ++ ext4_msg(sb, KERN_ERR, ++ "uh-oh! used pa while discarding"); + WARN_ON(1); + schedule_timeout_uninterruptible(HZ); + goto repeat; +@@ -3875,12 +3879,13 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) + (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) + return; + +- printk(KERN_ERR "EXT4-fs: Can't allocate:" +- " Allocation context details:\n"); +- printk(KERN_ERR "EXT4-fs: status %d flags %d\n", ++ ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" ++ " Allocation context details:"); ++ ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", + ac->ac_status, ac->ac_flags); +- printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " +- "best %lu/%lu/%lu@%lu cr %d\n", ++ ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " ++ "goal %lu/%lu/%lu@%lu, " ++ "best %lu/%lu/%lu@%lu cr %d", + (unsigned long)ac->ac_o_ex.fe_group, + (unsigned long)ac->ac_o_ex.fe_start, + (unsigned long)ac->ac_o_ex.fe_len, +@@ -3894,9 +3899,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) + (unsigned long)ac->ac_b_ex.fe_len, + (unsigned long)ac->ac_b_ex.fe_logical, + (int)ac->ac_criteria); +- printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, +- ac->ac_found); +- printk(KERN_ERR "EXT4-fs: groups: \n"); ++ ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", ++ ac->ac_ex_scanned, ac->ac_found); ++ ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); + ngroups = ext4_get_groups_count(sb); + for (i = 0; i < ngroups; i++) { + struct ext4_group_info *grp = ext4_get_group_info(sb, i); + diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4_data_in_dirent.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4_data_in_dirent.patch new file mode 100644 index 0000000..973ccad --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4_data_in_dirent.patch @@ -0,0 +1,478 @@ +--- + fs/ext4/dir.c | 26 +++++++++--- + fs/ext4/ext4.h | 70 ++++++++++++++++++++++++++++++++- + fs/ext4/namei.c | 117 ++++++++++++++++++++++++++++++++++++++++---------------- + 3 files changed, 170 insertions(+), 43 deletions(-) + +--- a/fs/ext4/dir.c ++++ b/fs/ext4/dir.c +@@ -53,11 +53,18 @@ const struct file_operations ext4_dir_op + + static unsigned char get_dtype(struct super_block *sb, int filetype) + { ++ int fl_index = filetype & EXT4_FT_MASK; ++ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || +- (filetype >= EXT4_FT_MAX)) ++ (fl_index >= EXT4_FT_MAX)) + return DT_UNKNOWN; + +- return (ext4_filetype_table[filetype]); ++ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_DIRDATA)) ++ return (ext4_filetype_table[fl_index]); ++ ++ return (ext4_filetype_table[fl_index]) | ++ (filetype & EXT4_DIRENT_LUFID); ++ + } + + /* +@@ -75,11 +82,11 @@ int __ext4_check_dir_entry(const char *f + const int rlen = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); + +- if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) ++ if (unlikely(rlen < __EXT4_DIR_REC_LEN(1))) + error_msg = "rec_len is smaller than minimal"; + else if (unlikely(rlen % 4 != 0)) + error_msg = "rec_len % 4 != 0"; +- else if (unlikely(rlen < 
EXT4_DIR_REC_LEN(de->name_len))) ++ else if (unlikely(rlen < EXT4_DIR_REC_LEN(de))) + error_msg = "rec_len is too small for name_len"; + else if (unlikely(((char *) de - bh->b_data) + rlen > + dir->i_sb->s_blocksize)) +@@ -196,7 +203,7 @@ revalidate: + * failure will be detected in the + * dirent test below. */ + if (ext4_rec_len_from_disk(de->rec_len, +- sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) ++ sb->s_blocksize) < __EXT4_DIR_REC_LEN(1)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); +@@ -359,12 +366,17 @@ int ext4_htree_store_dirent(struct file + struct fname *fname, *new_fn; + struct dir_private_info *info; + int len; ++ int extra_data = 1; + + info = dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ +- len = sizeof(struct fname) + dirent->name_len + 1; ++ if (dirent->file_type & EXT4_DIRENT_LUFID) ++ extra_data = ext4_get_dirent_data_len(dirent); ++ ++ len = sizeof(struct fname) + dirent->name_len + extra_data; ++ + new_fn = kzalloc(len, GFP_KERNEL); + if (!new_fn) + return -ENOMEM; +@@ -373,7 +385,7 @@ int ext4_htree_store_dirent(struct file + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = dirent->name_len; + new_fn->file_type = dirent->file_type; +- memcpy(new_fn->name, dirent->name, dirent->name_len); ++ memcpy(new_fn->name, dirent->name, dirent->name_len + extra_data); + new_fn->name[dirent->name_len] = 0; + + while (*p) { +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1414,7 +1414,9 @@ static inline void ext4_clear_state_flag + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ +- EXT4_FEATURE_INCOMPAT_MMP) ++ EXT4_FEATURE_INCOMPAT_MMP| \ ++ EXT4_FEATURE_INCOMPAT_DIRDATA) ++ + #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ +@@ -1501,6 +1503,43 @@ struct ext4_dir_entry_2 { + #define EXT4_FT_SYMLINK 7 + + #define EXT4_FT_MAX 8 ++#define EXT4_FT_MASK 0xf ++ ++#if EXT4_FT_MAX > EXT4_FT_MASK ++#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK" ++#endif ++ ++/* ++ * d_type has 4 unused bits, so it can hold four types data. these different ++ * type of data (e.g. lustre data, high 32 bits of 64-bit inode number) can be ++ * stored, in flag order, after file-name in ext4 dirent. ++*/ ++/* ++ * this flag is added to d_type if ext4 dirent has extra data after ++ * filename. this data length is variable and length is stored in first byte ++ * of data. data start after filename NUL byte. ++ * This is used by Lustre FS. 
++ */ ++#define EXT4_DIRENT_LUFID 0x10 ++ ++#define EXT4_LUFID_MAGIC 0xAD200907UL ++struct ext4_dentry_param { ++ __u32 edp_magic; /* EXT4_LUFID_MAGIC */ ++ char edp_len; /* size of edp_data in bytes */ ++ char edp_data[0]; /* packed array of data */ ++} __attribute__((packed)); ++ ++static inline unsigned char *ext4_dentry_get_data(struct super_block *sb, ++ struct ext4_dentry_param* p) ++ ++{ ++ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_DIRDATA)) ++ return NULL; ++ if (p && p->edp_magic == EXT4_LUFID_MAGIC) ++ return &p->edp_len; ++ else ++ return NULL; ++} + + /* + * EXT4_DIR_PAD defines the directory entries boundaries +@@ -1509,8 +1548,11 @@ struct ext4_dir_entry_2 { + */ + #define EXT4_DIR_PAD 4 + #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +-#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ ++#define __EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) ++#define EXT4_DIR_REC_LEN(de) (__EXT4_DIR_REC_LEN(de->name_len +\ ++ ext4_get_dirent_data_len(de))) ++ + #define EXT4_MAX_REC_LEN ((1<<16)-1) + + /* +@@ -1908,7 +1950,7 @@ extern struct buffer_head * ext4_find_en + struct ext4_dir_entry_2 ** res_dir); + #define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir) + extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, +- struct inode *inode); ++ struct inode *inode, const void *, const void *); + extern struct buffer_head *ext4_append(handle_t *handle, + struct inode *inode, + ext4_lblk_t *block, int *err); +@@ -2308,6 +2350,28 @@ static inline void set_bitmap_uptodate(s + extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; + extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; + ++/* ++ * Compute the total directory entry data length. ++ * This includes the filename and an implicit NUL terminator (always present), ++ * and optional extensions. Each extension has a bit set in the high 4 bits of ++ * de->file_type, and the extension length is the first byte in each entry. 
++ */ ++static inline int ext4_get_dirent_data_len(struct ext4_dir_entry_2 *de) ++{ ++ char *len = de->name + de->name_len + 1 /* NUL terminator */; ++ int dlen = 0; ++ __u8 extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4; ++ ++ while (extra_data_flags) { ++ if (extra_data_flags & 1) { ++ dlen += *len + (dlen == 0); ++ len += *len; ++ } ++ extra_data_flags >>= 1; ++ } ++ return dlen; ++} ++ + #endif /* __KERNEL__ */ + + #endif /* _EXT4_H */ +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -170,7 +170,8 @@ static unsigned dx_get_count(struct dx_e + static unsigned dx_get_limit(struct dx_entry *entries); + static void dx_set_count(struct dx_entry *entries, unsigned value); + static void dx_set_limit(struct dx_entry *entries, unsigned value); +-static unsigned dx_root_limit(struct inode *dir, unsigned infosize); ++static inline unsigned dx_root_limit(__u32 blocksize, ++ struct ext4_dir_entry_2 *dot_de, unsigned infosize); + static unsigned dx_node_limit(struct inode *dir); + static struct dx_frame *dx_probe(const struct qstr *d_name, + struct inode *dir, +@@ -213,11 +214,12 @@ ext4_next_entry(struct ext4_dir_entry_2 + */ + struct dx_root_info * dx_get_dx_info(struct ext4_dir_entry_2 *de) + { +- /* get dotdot first */ +- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1)); ++ BUG_ON(de->name_len != 1); ++ /* get dotdot first */ ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(de)); + +- /* dx root info is after dotdot entry */ +- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2)); ++ /* dx root info is after dotdot entry */ ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(de)); + + return (struct dx_root_info *) de; + } +@@ -262,16 +264,23 @@ static inline void dx_set_limit(struct d + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); + } + +-static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) ++static inline unsigned dx_root_limit(__u32 blocksize, ++ struct ext4_dir_entry_2 *dot_de, unsigned infosize) + { +- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - +- EXT4_DIR_REC_LEN(2) - infosize; ++ struct ext4_dir_entry_2 *dotdot_de; ++ unsigned entry_space; ++ ++ BUG_ON(dot_de->name_len != 1); ++ dotdot_de = ext4_next_entry(dot_de, blocksize); ++ entry_space = blocksize - EXT4_DIR_REC_LEN(dot_de) - ++ EXT4_DIR_REC_LEN(dotdot_de) - infosize; ++ + return entry_space / sizeof(struct dx_entry); + } + + static inline unsigned dx_node_limit(struct inode *dir) + { +- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); ++ unsigned entry_space = dir->i_sb->s_blocksize - __EXT4_DIR_REC_LEN(0); + return entry_space / sizeof(struct dx_entry); + } + +@@ -318,7 +327,7 @@ static struct stats dx_show_leaf(struct + printk(":%x.%u ", h.hash, + ((char *) de - base)); + } +- space += EXT4_DIR_REC_LEN(de->name_len); ++ space += EXT4_DIR_REC_LEN(de); + names++; + } + de = ext4_next_entry(de, size); +@@ -420,7 +429,8 @@ dx_probe(const struct qstr *d_name, stru + + entries = (struct dx_entry *) (((char *)info) + info->info_length); + +- if (dx_get_limit(entries) != dx_root_limit(dir, ++ if (dx_get_limit(entries) != dx_root_limit(dir->i_sb->s_blocksize, ++ (struct ext4_dir_entry_2*)bh->b_data, + info->info_length)) { + ext4_warning(dir->i_sb, "dx entry: limit != root limit"); + brelse(bh); +@@ -609,7 +619,7 @@ static int htree_dirblock_to_tree(struct + de = (struct ext4_dir_entry_2 *) bh->b_data; + top = (struct ext4_dir_entry_2 *) ((char *) de + + dir->i_sb->s_blocksize - +- 
EXT4_DIR_REC_LEN(0)); ++ __EXT4_DIR_REC_LEN(0)); + for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + (block<i_sb)) +@@ -1172,7 +1182,7 @@ dx_move_dirents(char *from, char *to, st + while (count--) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + (from + (map->offs<<2)); +- rec_len = EXT4_DIR_REC_LEN(de->name_len); ++ rec_len = EXT4_DIR_REC_LEN(de); + memcpy (to, de, rec_len); + ((struct ext4_dir_entry_2 *) to)->rec_len = + ext4_rec_len_to_disk(rec_len, blocksize); +@@ -1196,7 +1206,7 @@ static struct ext4_dir_entry_2* dx_pack_ + while ((char*)de < base + blocksize) { + next = ext4_next_entry(de, blocksize); + if (de->inode && de->name_len) { +- rec_len = EXT4_DIR_REC_LEN(de->name_len); ++ rec_len = EXT4_DIR_REC_LEN(de); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); +@@ -1326,10 +1336,16 @@ static int add_dirent_to_buf(handle_t *h + unsigned int offset = 0; + unsigned int blocksize = dir->i_sb->s_blocksize; + unsigned short reclen; +- int nlen, rlen, err; ++ int nlen, rlen, err, dlen = 0; ++ unsigned char *data; + char *top; + +- reclen = EXT4_DIR_REC_LEN(namelen); ++ data = ext4_dentry_get_data(inode->i_sb, (struct ext4_dentry_param *) ++ dentry->d_fsdata); ++ if (data) ++ dlen = (*data) + 1; ++ ++ reclen = __EXT4_DIR_REC_LEN(namelen + dlen); + if (!de) { + de = (struct ext4_dir_entry_2 *)bh->b_data; + top = bh->b_data + blocksize - reclen; +@@ -1338,7 +1354,7 @@ static int add_dirent_to_buf(handle_t *h + return -EIO; + if (ext4_match(namelen, name, de)) + return -EEXIST; +- nlen = EXT4_DIR_REC_LEN(de->name_len); ++ nlen = EXT4_DIR_REC_LEN(de); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + if ((de->inode? rlen - nlen: rlen) >= reclen) + break; +@@ -1356,7 +1372,7 @@ static int add_dirent_to_buf(handle_t *h + } + + /* By now the buffer is marked for journaling */ +- nlen = EXT4_DIR_REC_LEN(de->name_len); ++ nlen = EXT4_DIR_REC_LEN(de); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); +@@ -1372,6 +1388,12 @@ static int add_dirent_to_buf(handle_t *h + de->inode = 0; + de->name_len = namelen; + memcpy(de->name, name, namelen); ++ if (data) { ++ de->name[namelen] = 0; ++ memcpy(&de->name[namelen + 1], data, *(char *) data); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } ++ + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend +@@ -1468,7 +1490,8 @@ static int make_indexed_dir(handle_t *ha + + dx_set_block(entries, 1); + dx_set_count(entries, 1); +- dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info))); ++ dx_set_limit(entries, dx_root_limit(dir->i_sb->s_blocksize, ++ dot_de, sizeof(*dx_info))); + + /* Initialize as for dx_probe */ + hinfo.hash_version = dx_info->hash_version; +@@ -1511,6 +1534,8 @@ static int ext4_update_dotdot(handle_t * + struct buffer_head * dir_block; + struct ext4_dir_entry_2 * de; + int len, journal = 0, err = 0; ++ int dlen = 0; ++ char *data; + + if (IS_ERR(handle)) + return PTR_ERR(handle); +@@ -1526,19 +1551,24 @@ static int ext4_update_dotdot(handle_t * + /* the first item must be "." 
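To make the sizing in add_dirent_to_buf() above concrete, here is a hedged user-space sketch: the blob's first byte counts the whole blob, the name keeps a NUL before the blob, and the record length is padded the way __EXT4_DIR_REC_LEN() pads (8-byte fixed header, 4-byte rounding). The struct, flag value, and helper names are illustrative only, not the kernel ABI:

#include <stdio.h>
#include <string.h>

#define REC_LEN(namelen)  (((namelen) + 8 + 3) & ~3)
#define DIRENT_LUFID      0x10	/* illustrative flag bit */

struct toy_dirent {
	unsigned char	file_type;
	unsigned char	name_len;
	char		name[64];	/* name, NUL, then blob */
};

static int fill_entry(struct toy_dirent *de, const char *name,
		      const unsigned char *blob)	/* blob[0] = its size */
{
	int namelen = strlen(name);
	int dlen = blob ? blob[0] + 1 : 0;	/* +1 for the NUL */

	de->name_len = namelen;
	memcpy(de->name, name, namelen);
	if (blob) {
		de->name[namelen] = 0;
		memcpy(&de->name[namelen + 1], blob, blob[0]);
		de->file_type |= DIRENT_LUFID;
	}
	return REC_LEN(namelen + dlen);	/* record length to reserve */
}

int main(void)
{
	struct toy_dirent de = { 0 };
	unsigned char fid[] = { 4, 0xaa, 0xbb, 0xcc };	/* 4 incl. len byte */

	printf("rec_len = %d\n", fill_entry(&de, "file1", fid));
	return 0;
}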
*/ + assert(de->name_len == 1 && de->name[0] == '.'); + len = le16_to_cpu(de->rec_len); +- assert(len >= EXT4_DIR_REC_LEN(1)); +- if (len > EXT4_DIR_REC_LEN(1)) { ++ assert(len >= __EXT4_DIR_REC_LEN(1)); ++ if (len > __EXT4_DIR_REC_LEN(1)) { + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) + goto out_journal; + + journal = 1; +- de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(1)); ++ de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de)); + } + +- len -= EXT4_DIR_REC_LEN(1); +- assert(len == 0 || len >= EXT4_DIR_REC_LEN(2)); ++ len -= EXT4_DIR_REC_LEN(de); ++ data = ext4_dentry_get_data(dir->i_sb, ++ (struct ext4_dentry_param *) dentry->d_fsdata); ++ if (data) ++ dlen = *data + 1; ++ assert(len == 0 || len >= __EXT4_DIR_REC_LEN(2 + dlen)); ++ + de = (struct ext4_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + if (!journal) { +@@ -1552,10 +1582,15 @@ static int ext4_update_dotdot(handle_t * + if (len > 0) + de->rec_len = cpu_to_le16(len); + else +- assert(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2)); ++ assert(le16_to_cpu(de->rec_len) >= __EXT4_DIR_REC_LEN(2)); + de->name_len = 2; + strcpy (de->name, ".."); +- ext4_set_de_type(dir->i_sb, de, S_IFDIR); ++ if (data != NULL && ext4_get_dirent_data_len(de) >= dlen) { ++ de->name[2] = 0; ++ memcpy(&de->name[2 + 1], data, *data); ++ ext4_set_de_type(dir->i_sb, de, S_IFDIR); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } + + out_journal: + if (journal) { +@@ -1994,12 +2029,13 @@ retry: + /* Initialize @inode as a subdirectory of @dir, and add the + * "." and ".." entries into the first directory block. */ + int ext4_add_dot_dotdot(handle_t *handle, struct inode * dir, +- struct inode *inode) ++ struct inode *inode, ++ const void *data1, const void *data2) + { + struct buffer_head *dir_block; + struct ext4_dir_entry_2 *de; + unsigned int blocksize = dir->i_sb->s_blocksize; +- int err = 0; ++ int err = 0, dot_reclen; + + if (IS_ERR(handle)) + return PTR_ERR(handle); +@@ -2020,17 +2056,32 @@ int ext4_add_dot_dotdot(handle_t *handle + de = (struct ext4_dir_entry_2 *) dir_block->b_data; + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; +- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), +- blocksize); + strcpy(de->name, "."); + ext4_set_de_type(dir->i_sb, de, S_IFDIR); ++ /* get packed fid data */ ++ data1 = ext4_dentry_get_data(dir->i_sb, ++ (struct ext4_dentry_param *) data1); ++ if (data1) { ++ de->name[1] = 0; ++ memcpy(&de->name[2], data1, *(char *) data1); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } ++ de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de)); ++ dot_reclen = cpu_to_le16(de->rec_len); + de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(dir->i_ino); +- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1), ++ de->rec_len = ext4_rec_len_to_disk(blocksize - dot_reclen, + blocksize); + de->name_len = 2; + strcpy(de->name, ".."); + ext4_set_de_type(dir->i_sb, de, S_IFDIR); ++ data2 = ext4_dentry_get_data(dir->i_sb, ++ (struct ext4_dentry_param *) data2); ++ if (data2) { ++ de->name[2] = 0; ++ memcpy(&de->name[3], data2, *(char *) data2); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } + inode->i_nlink = 2; + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, dir_block); +@@ -2070,7 +2121,7 @@ retry: + if (IS_ERR(inode)) + goto out_stop; + +- err = ext4_add_dot_dotdot(handle, dir, inode); ++ err = ext4_add_dot_dotdot(handle, dir, inode, NULL, NULL); + if (err) + goto 
out_clear_inode; + err = ext4_add_entry(handle, dentry, inode); +@@ -2108,7 +2159,7 @@ static int empty_dir(struct inode *inode + int err = 0; + + sb = inode->i_sb; +- if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || ++ if (inode->i_size < __EXT4_DIR_REC_LEN(1) + __EXT4_DIR_REC_LEN(2) || + !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { + if (err) + EXT4_ERROR_INODE(inode, diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4_pdirop.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4_pdirop.patch new file mode 100644 index 0000000..614b4a7 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4_pdirop.patch @@ -0,0 +1,2273 @@ +--- + fs/ext4/Makefile | 2 + fs/ext4/ext4.h | 93 ++++ + fs/ext4/htree_lock.c | 880 +++++++++++++++++++++++++++++++++++++++++++++ + fs/ext4/inode.c | 4 + fs/ext4/namei.c | 585 +++++++++++++++++++++++++---- + include/linux/htree_lock.h | 187 +++++++++ + 6 files changed, 1650 insertions(+), 101 deletions(-) + +--- a/fs/ext4/Makefile ++++ b/fs/ext4/Makefile +@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o + ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ +- mmp.o dynlocks.o ++ htree_lock.o mmp.o dynlocks.o + + ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + #include + #include + #ifdef __KERNEL__ +@@ -1402,6 +1403,7 @@ static inline void ext4_clear_state_flag + #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 + #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ + #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ ++#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 + + #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR + #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ +@@ -1427,7 +1429,8 @@ static inline void ext4_clear_state_flag + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP| \ +- EXT4_FEATURE_INCOMPAT_DIRDATA) ++ EXT4_FEATURE_INCOMPAT_DIRDATA| \ ++ EXT4_FEATURE_INCOMPAT_LARGEDIR) + + #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ +@@ -1690,6 +1693,76 @@ ext4_group_first_block_no(struct super_b + */ + #define ERR_BAD_DX_DIR -75000 + ++/* htree levels for ext4 */ ++#define EXT4_HTREE_LEVEL_COMPAT 2 ++#define EXT4_HTREE_LEVEL 3 ++ ++static inline int ++ext4_dir_htree_level(struct super_block *sb) ++{ ++ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ? 
++ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; ++} ++ ++/* assume name-hash is protected by upper layer */ ++#define EXT4_HTREE_LOCK_HASH 0 ++ ++enum ext4_pdo_lk_types { ++#if EXT4_HTREE_LOCK_HASH ++ EXT4_LK_HASH, ++#endif ++ EXT4_LK_DX, /* index block */ ++ EXT4_LK_DE, /* directory entry block */ ++ EXT4_LK_SPIN, /* spinlock */ ++ EXT4_LK_MAX, ++}; ++ ++/* read-only bit */ ++#define EXT4_LB_RO(b) (1 << (b)) ++/* read + write, high bits for writer */ ++#define EXT4_LB_RW(b) ((1 << (b)) | (1 << (EXT4_LK_MAX + (b)))) ++ ++enum ext4_pdo_lock_bits { ++ /* DX lock bits */ ++ EXT4_LB_DX_RO = EXT4_LB_RO(EXT4_LK_DX), ++ EXT4_LB_DX = EXT4_LB_RW(EXT4_LK_DX), ++ /* DE lock bits */ ++ EXT4_LB_DE_RO = EXT4_LB_RO(EXT4_LK_DE), ++ EXT4_LB_DE = EXT4_LB_RW(EXT4_LK_DE), ++ /* DX spinlock bits */ ++ EXT4_LB_SPIN_RO = EXT4_LB_RO(EXT4_LK_SPIN), ++ EXT4_LB_SPIN = EXT4_LB_RW(EXT4_LK_SPIN), ++ /* accurate searching */ ++ EXT4_LB_EXACT = EXT4_LB_RO(EXT4_LK_MAX << 1), ++}; ++ ++enum ext4_pdo_lock_opc { ++ /* external */ ++ EXT4_HLOCK_READDIR = (EXT4_LB_DE_RO | EXT4_LB_DX_RO), ++ EXT4_HLOCK_LOOKUP = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL = (EXT4_LB_DE | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_ADD = (EXT4_LB_DE | EXT4_LB_SPIN_RO), ++ ++ /* internal */ ++ EXT4_HLOCK_LOOKUP_SAFE = (EXT4_LB_DE_RO | EXT4_LB_DX_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL_SAFE = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT), ++ EXT4_HLOCK_SPLIT = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN), ++}; ++ ++extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits); ++#define ext4_htree_lock_head_free(lhead) htree_lock_head_free(lhead) ++ ++extern struct htree_lock *ext4_htree_lock_alloc(void); ++#define ext4_htree_lock_free(lck) htree_lock_free(lck) ++ ++extern void ext4_htree_lock(struct htree_lock *lck, ++ struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags); ++#define ext4_htree_unlock(lck) htree_unlock(lck) ++ + void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); + +@@ -1964,14 +2037,16 @@ extern int ext4_htree_fill_tree(struct f + extern struct inode *ext4_create_inode(handle_t *handle, + struct inode * dir, int mode); + extern int ext4_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode); ++ struct inode *inode, struct htree_lock *lck); + extern int ext4_delete_entry(handle_t *handle, struct inode * dir, + struct ext4_dir_entry_2 * de_del, + struct buffer_head * bh); + extern struct buffer_head * ext4_find_entry(struct inode *dir, + const struct qstr *d_name, +- struct ext4_dir_entry_2 ** res_dir); +-#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir) ++ struct ext4_dir_entry_2 **res_dir, ++ struct htree_lock *lck); ++#define ll_ext4_find_entry(inode, dentry, res_dir, lck) \ ++ ext4_find_entry(inode, &(dentry)->d_name, res_dir, lck) + extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, + struct inode *inode, const void *, const void *); + extern struct buffer_head *ext4_append(handle_t *handle, +@@ -2104,13 +2179,15 @@ static inline void ext4_r_blocks_count_s + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); + } + +-static inline loff_t ext4_isize(struct ext4_inode *raw_inode) ++static inline loff_t ext4_isize(struct super_block *sb, ++ struct ext4_inode *raw_inode) + { +- if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) || ++ 
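A small sketch of the bit layout behind EXT4_LB_RO()/EXT4_LB_RW() above: one bit per child-lock in the low half marks a reader, the same bit shifted by EXT4_LK_MAX marks a writer, so operation masks like EXT4_HLOCK_ADD are plain ORs. Values below assume the three-lock enum shown (without the optional hash lock):

#include <stdio.h>

enum { LK_DX, LK_DE, LK_SPIN, LK_MAX };

#define LB_RO(b)  (1 << (b))
#define LB_RW(b)  ((1 << (b)) | (1 << (LK_MAX + (b))))

int main(void)
{
	printf("DE read-only : 0x%02x\n", LB_RO(LK_DE));	/* 0x02 */
	printf("DE read+write: 0x%02x\n", LB_RW(LK_DE));	/* 0x12 */
	/* an ADD op wants DE write plus SPIN read, like EXT4_HLOCK_ADD */
	printf("ADD flags    : 0x%02x\n", LB_RW(LK_DE) | LB_RO(LK_SPIN));
	return 0;
}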
S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); +- else +- return (loff_t) le32_to_cpu(raw_inode->i_size_lo); ++ ++ return (loff_t) le32_to_cpu(raw_inode->i_size_lo); + } + + static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +--- /dev/null ++++ b/fs/ext4/htree_lock.c +@@ -0,0 +1,880 @@ ++/* ++ * fs/ext4/htree_lock.c ++ * ++ * Copyright (c) 2011, 2012, Intel Corporation. ++ * ++ * Author: Liang Zhen ++ */ ++#include ++#include ++#include ++#include ++ ++enum { ++ HTREE_LOCK_BIT_EX = (1 << HTREE_LOCK_EX), ++ HTREE_LOCK_BIT_PW = (1 << HTREE_LOCK_PW), ++ HTREE_LOCK_BIT_PR = (1 << HTREE_LOCK_PR), ++ HTREE_LOCK_BIT_CW = (1 << HTREE_LOCK_CW), ++ HTREE_LOCK_BIT_CR = (1 << HTREE_LOCK_CR), ++}; ++ ++enum { ++ HTREE_LOCK_COMPAT_EX = 0, ++ HTREE_LOCK_COMPAT_PW = HTREE_LOCK_COMPAT_EX | HTREE_LOCK_BIT_CR, ++ HTREE_LOCK_COMPAT_PR = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_PR, ++ HTREE_LOCK_COMPAT_CW = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_CW, ++ HTREE_LOCK_COMPAT_CR = HTREE_LOCK_COMPAT_CW | HTREE_LOCK_BIT_PR | ++ HTREE_LOCK_BIT_PW, ++}; ++ ++static int htree_lock_compat[] = { ++ [HTREE_LOCK_EX] HTREE_LOCK_COMPAT_EX, ++ [HTREE_LOCK_PW] HTREE_LOCK_COMPAT_PW, ++ [HTREE_LOCK_PR] HTREE_LOCK_COMPAT_PR, ++ [HTREE_LOCK_CW] HTREE_LOCK_COMPAT_CW, ++ [HTREE_LOCK_CR] HTREE_LOCK_COMPAT_CR, ++}; ++ ++/* max allowed htree-lock depth. ++ * We only need depth=3 for ext4 although user can have higher value. */ ++#define HTREE_LOCK_DEP_MAX 16 ++ ++#ifdef HTREE_LOCK_DEBUG ++ ++static char *hl_name[] = { ++ [HTREE_LOCK_EX] "EX", ++ [HTREE_LOCK_PW] "PW", ++ [HTREE_LOCK_PR] "PR", ++ [HTREE_LOCK_CW] "CW", ++ [HTREE_LOCK_CR] "CR", ++}; ++ ++/* lock stats */ ++struct htree_lock_node_stats { ++ unsigned long long blocked[HTREE_LOCK_MAX]; ++ unsigned long long granted[HTREE_LOCK_MAX]; ++ unsigned long long retried[HTREE_LOCK_MAX]; ++ unsigned long long events; ++}; ++ ++struct htree_lock_stats { ++ struct htree_lock_node_stats nodes[HTREE_LOCK_DEP_MAX]; ++ unsigned long long granted[HTREE_LOCK_MAX]; ++ unsigned long long blocked[HTREE_LOCK_MAX]; ++}; ++ ++static struct htree_lock_stats hl_stats; ++ ++void htree_lock_stat_reset(void) ++{ ++ memset(&hl_stats, 0, sizeof(hl_stats)); ++} ++ ++void htree_lock_stat_print(int depth) ++{ ++ int i; ++ int j; ++ ++ printk(KERN_DEBUG "HTREE LOCK STATS:\n"); ++ for (i = 0; i < HTREE_LOCK_MAX; i++) { ++ printk(KERN_DEBUG "[%s]: G [%10llu], B [%10llu]\n", ++ hl_name[i], hl_stats.granted[i], hl_stats.blocked[i]); ++ } ++ for (i = 0; i < depth; i++) { ++ printk(KERN_DEBUG "HTREE CHILD [%d] STATS:\n", i); ++ for (j = 0; j < HTREE_LOCK_MAX; j++) { ++ printk(KERN_DEBUG ++ "[%s]: G [%10llu], B [%10llu], R [%10llu]\n", ++ hl_name[j], hl_stats.nodes[i].granted[j], ++ hl_stats.nodes[i].blocked[j], ++ hl_stats.nodes[i].retried[j]); ++ } ++ } ++} ++ ++#define lk_grant_inc(m) do { hl_stats.granted[m]++; } while (0) ++#define lk_block_inc(m) do { hl_stats.blocked[m]++; } while (0) ++#define ln_grant_inc(d, m) do { hl_stats.nodes[d].granted[m]++; } while (0) ++#define ln_block_inc(d, m) do { hl_stats.nodes[d].blocked[m]++; } while (0) ++#define ln_retry_inc(d, m) do { hl_stats.nodes[d].retried[m]++; } while (0) ++#define ln_event_inc(d) do { hl_stats.nodes[d].events++; } while (0) ++ ++#else /* !DEBUG */ ++ ++void htree_lock_stat_reset(void) {} ++void htree_lock_stat_print(int depth) {} ++ ++#define lk_grant_inc(m) do {} while (0) ++#define lk_block_inc(m) do {} while (0) ++#define 
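The compatibility table that follows drives every grant decision. As a toy model: a request is grantable when the set of already-granted modes is a subset of the requester's compatibility mask. Note CR is compatible with everything except EX, which is why it can serve the lookup path, while EX is compatible with nothing:

#include <stdio.h>

enum { EX, PW, PR, CW, CR, MODE_MAX };

static const int compat[MODE_MAX] = {
	[EX] = 0,
	[PW] = 1 << CR,
	[PR] = (1 << CR) | (1 << PR),
	[CW] = (1 << CR) | (1 << CW),
	[CR] = (1 << CR) | (1 << CW) | (1 << PR) | (1 << PW),
};

static int grantable(int mode, int granted_mask)
{
	return (compat[mode] & granted_mask) == granted_mask;
}

int main(void)
{
	int granted = 1 << PR;				/* one PR holder */

	printf("PR vs PR: %d\n", grantable(PR, granted));	/* 1 */
	printf("PW vs PR: %d\n", grantable(PW, granted));	/* 0 */
	printf("EX vs --: %d\n", grantable(EX, 0));		/* 1 */
	return 0;
}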
ln_grant_inc(d, m) do {} while (0) ++#define ln_block_inc(d, m) do {} while (0) ++#define ln_retry_inc(d, m) do {} while (0) ++#define ln_event_inc(d) do {} while (0) ++ ++#endif /* DEBUG */ ++ ++EXPORT_SYMBOL(htree_lock_stat_reset); ++EXPORT_SYMBOL(htree_lock_stat_print); ++ ++#define HTREE_DEP_ROOT (-1) ++ ++#define htree_spin_lock(lhead, dep) \ ++ bit_spin_lock((dep) + 1, &(lhead)->lh_lock) ++#define htree_spin_unlock(lhead, dep) \ ++ bit_spin_unlock((dep) + 1, &(lhead)->lh_lock) ++ ++#define htree_key_event_ignore(child, ln) \ ++ (!((child)->lc_events & (1 << (ln)->ln_mode))) ++ ++static int ++htree_key_list_empty(struct htree_lock_node *ln) ++{ ++ return list_empty(&ln->ln_major_list) && list_empty(&ln->ln_minor_list); ++} ++ ++static void ++htree_key_list_del_init(struct htree_lock_node *ln) ++{ ++ struct htree_lock_node *tmp = NULL; ++ ++ if (!list_empty(&ln->ln_minor_list)) { ++ tmp = list_entry(ln->ln_minor_list.next, ++ struct htree_lock_node, ln_minor_list); ++ list_del_init(&ln->ln_minor_list); ++ } ++ ++ if (list_empty(&ln->ln_major_list)) ++ return; ++ ++ if (tmp == NULL) { /* not on minor key list */ ++ list_del_init(&ln->ln_major_list); ++ } else { ++ BUG_ON(!list_empty(&tmp->ln_major_list)); ++ list_replace_init(&ln->ln_major_list, &tmp->ln_major_list); ++ } ++} ++ ++static void ++htree_key_list_replace_init(struct htree_lock_node *old, ++ struct htree_lock_node *new) ++{ ++ if (!list_empty(&old->ln_major_list)) ++ list_replace_init(&old->ln_major_list, &new->ln_major_list); ++ ++ if (!list_empty(&old->ln_minor_list)) ++ list_replace_init(&old->ln_minor_list, &new->ln_minor_list); ++} ++ ++static void ++htree_key_event_enqueue(struct htree_lock_child *child, ++ struct htree_lock_node *ln, int dep, void *event) ++{ ++ struct htree_lock_node *tmp; ++ ++ /* NB: ALWAYS called holding lhead::lh_lock(dep) */ ++ BUG_ON(ln->ln_mode == HTREE_LOCK_NL); ++ if (event == NULL || htree_key_event_ignore(child, ln)) ++ return; ++ ++ /* shouldn't be a very long list */ ++ list_for_each_entry(tmp, &ln->ln_alive_list, ln_alive_list) { ++ if (tmp->ln_mode == HTREE_LOCK_NL) { ++ ln_event_inc(dep); ++ if (child->lc_callback != NULL) ++ child->lc_callback(tmp->ln_ev_target, event); ++ } ++ } ++} ++ ++static int ++htree_node_lock_enqueue(struct htree_lock *newlk, struct htree_lock *curlk, ++ unsigned dep, int wait, void *event) ++{ ++ struct htree_lock_child *child = &newlk->lk_head->lh_children[dep]; ++ struct htree_lock_node *newln = &newlk->lk_nodes[dep]; ++ struct htree_lock_node *curln = &curlk->lk_nodes[dep]; ++ ++ /* NB: ALWAYS called holding lhead::lh_lock(dep) */ ++ /* NB: we only expect PR/PW lock mode at here, only these two modes are ++ * allowed for htree_node_lock(asserted in htree_node_lock_internal), ++ * NL is only used for listener, user can't directly require NL mode */ ++ if ((curln->ln_mode == HTREE_LOCK_NL) || ++ (curln->ln_mode != HTREE_LOCK_PW && ++ newln->ln_mode != HTREE_LOCK_PW)) { ++ /* no conflict, attach it on granted list of @curlk */ ++ if (curln->ln_mode != HTREE_LOCK_NL) { ++ list_add(&newln->ln_granted_list, ++ &curln->ln_granted_list); ++ } else { ++ /* replace key owner */ ++ htree_key_list_replace_init(curln, newln); ++ } ++ ++ list_add(&newln->ln_alive_list, &curln->ln_alive_list); ++ htree_key_event_enqueue(child, newln, dep, event); ++ ln_grant_inc(dep, newln->ln_mode); ++ return 1; /* still hold lh_lock */ ++ } ++ ++ if (!wait) { /* can't grant and don't want to wait */ ++ ln_retry_inc(dep, newln->ln_mode); ++ newln->ln_mode = HTREE_LOCK_INVAL; ++ return 
-1; /* don't wait and just return -1 */ ++ } ++ ++ newlk->lk_task = current; ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ /* conflict, attach it on blocked list of curlk */ ++ list_add_tail(&newln->ln_blocked_list, &curln->ln_blocked_list); ++ list_add(&newln->ln_alive_list, &curln->ln_alive_list); ++ ln_block_inc(dep, newln->ln_mode); ++ ++ htree_spin_unlock(newlk->lk_head, dep); ++ /* wait to be given the lock */ ++ if (newlk->lk_task != NULL) ++ schedule(); ++ /* granted, no doubt, wake up will set me RUNNING */ ++ if (event == NULL || htree_key_event_ignore(child, newln)) ++ return 0; /* granted without lh_lock */ ++ ++ htree_spin_lock(newlk->lk_head, dep); ++ htree_key_event_enqueue(child, newln, dep, event); ++ return 1; /* still hold lh_lock */ ++} ++ ++/* ++ * get PR/PW access to particular tree-node according to @dep and @key, ++ * it will return -1 if @wait is false and can't immediately grant this lock. ++ * All listeners(HTREE_LOCK_NL) on @dep and with the same @key will get ++ * @event if it's not NULL. ++ * NB: ALWAYS called holding lhead::lh_lock ++ */ ++static int ++htree_node_lock_internal(struct htree_lock_head *lhead, struct htree_lock *lck, ++ htree_lock_mode_t mode, u32 key, unsigned dep, ++ int wait, void *event) ++{ ++ LIST_HEAD (list); ++ struct htree_lock *tmp; ++ struct htree_lock *tmp2; ++ u16 major; ++ u16 minor; ++ u8 reverse; ++ u8 ma_bits; ++ u8 mi_bits; ++ ++ BUG_ON(mode != HTREE_LOCK_PW && mode != HTREE_LOCK_PR); ++ BUG_ON(htree_node_is_granted(lck, dep)); ++ ++ key = hash_long(key, lhead->lh_hbits); ++ ++ mi_bits = lhead->lh_hbits >> 1; ++ ma_bits = lhead->lh_hbits - mi_bits; ++ ++ lck->lk_nodes[dep].ln_major_key = major = key & ((1U << ma_bits) - 1); ++ lck->lk_nodes[dep].ln_minor_key = minor = key >> ma_bits; ++ lck->lk_nodes[dep].ln_mode = mode; ++ ++ /* ++ * The major key list is an ordered list, so searches are started ++ * at the end of the list that is numerically closer to major_key, ++ * so at most half of the list will be walked (for well-distributed ++ * keys). The list traversal aborts early if the expected key ++ * location is passed. ++ */ ++ reverse = (major >= (1 << (ma_bits - 1))); ++ ++ if (reverse) { ++ list_for_each_entry_reverse(tmp, ++ &lhead->lh_children[dep].lc_list, ++ lk_nodes[dep].ln_major_list) { ++ if (tmp->lk_nodes[dep].ln_major_key == major) { ++ goto search_minor; ++ ++ } else if (tmp->lk_nodes[dep].ln_major_key < major) { ++ /* attach _after_ @tmp */ ++ list_add(&lck->lk_nodes[dep].ln_major_list, ++ &tmp->lk_nodes[dep].ln_major_list); ++ goto out_grant_major; ++ } ++ } ++ ++ list_add(&lck->lk_nodes[dep].ln_major_list, ++ &lhead->lh_children[dep].lc_list); ++ goto out_grant_major; ++ ++ } else { ++ list_for_each_entry(tmp, &lhead->lh_children[dep].lc_list, ++ lk_nodes[dep].ln_major_list) { ++ if (tmp->lk_nodes[dep].ln_major_key == major) { ++ goto search_minor; ++ ++ } else if (tmp->lk_nodes[dep].ln_major_key > major) { ++ /* insert _before_ @tmp */ ++ list_add_tail(&lck->lk_nodes[dep].ln_major_list, ++ &tmp->lk_nodes[dep].ln_major_list); ++ goto out_grant_major; ++ } ++ } ++ ++ list_add_tail(&lck->lk_nodes[dep].ln_major_list, ++ &lhead->lh_children[dep].lc_list); ++ goto out_grant_major; ++ } ++ ++ search_minor: ++ /* ++ * NB: minor_key list doesn't have a "head", @list is just a ++ * temporary stub for helping list searching, make sure it's removed ++ * after searching. ++ * minor_key list is an ordered list too. 
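A compact model of the key handling in htree_node_lock_internal() above: the hashed key is split into major/minor halves (hbits is the head's hash width), and each ordered list is entered from whichever end is numerically closer to the new key, so at most half the list is walked for well-distributed keys:

#include <stdio.h>

static void split_key(unsigned key, unsigned hbits,
		      unsigned *major, unsigned *minor)
{
	unsigned mi_bits = hbits / 2;
	unsigned ma_bits = hbits - mi_bits;

	*major = key & ((1u << ma_bits) - 1);
	*minor = key >> ma_bits;
}

int main(void)
{
	unsigned major, minor, hbits = 8;

	split_key(0xa7, hbits, &major, &minor);
	printf("major=%u minor=%u\n", major, minor);
	/* search backward when major sits in the upper half of its range */
	printf("walk %s\n", major >= (1u << (hbits - hbits / 2 - 1)) ?
	       "backward" : "forward");
	return 0;
}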
++ */ ++ list_add_tail(&list, &tmp->lk_nodes[dep].ln_minor_list); ++ ++ reverse = (minor >= (1 << (mi_bits - 1))); ++ ++ if (reverse) { ++ list_for_each_entry_reverse(tmp2, &list, ++ lk_nodes[dep].ln_minor_list) { ++ if (tmp2->lk_nodes[dep].ln_minor_key == minor) { ++ goto out_enqueue; ++ ++ } else if (tmp2->lk_nodes[dep].ln_minor_key < minor) { ++ /* attach _after_ @tmp2 */ ++ list_add(&lck->lk_nodes[dep].ln_minor_list, ++ &tmp2->lk_nodes[dep].ln_minor_list); ++ goto out_grant_minor; ++ } ++ } ++ ++ list_add(&lck->lk_nodes[dep].ln_minor_list, &list); ++ ++ } else { ++ list_for_each_entry(tmp2, &list, ++ lk_nodes[dep].ln_minor_list) { ++ if (tmp2->lk_nodes[dep].ln_minor_key == minor) { ++ goto out_enqueue; ++ ++ } else if (tmp2->lk_nodes[dep].ln_minor_key > minor) { ++ /* insert _before_ @tmp2 */ ++ list_add_tail(&lck->lk_nodes[dep].ln_minor_list, ++ &tmp2->lk_nodes[dep].ln_minor_list); ++ goto out_grant_minor; ++ } ++ } ++ ++ list_add_tail(&lck->lk_nodes[dep].ln_minor_list, &list); ++ } ++ ++ out_grant_minor: ++ if (list.next == &lck->lk_nodes[dep].ln_minor_list) { ++ /* new lock @lck is the first one on minor_key list, which ++ * means it has the smallest minor_key and it should ++ * replace @tmp as minor_key owner */ ++ list_replace_init(&tmp->lk_nodes[dep].ln_major_list, ++ &lck->lk_nodes[dep].ln_major_list); ++ } ++ /* remove the temporary head */ ++ list_del(&list); ++ ++ out_grant_major: ++ ln_grant_inc(dep, lck->lk_nodes[dep].ln_mode); ++ return 1; /* granted with holding lh_lock */ ++ ++ out_enqueue: ++ list_del(&list); /* remove temprary head */ ++ return htree_node_lock_enqueue(lck, tmp2, dep, wait, event); ++} ++ ++/* ++ * release the key of @lck at level @dep, and grant any blocked locks. ++ * caller will still listen on @key if @event is not NULL, which means ++ * caller can see a event (by event_cb) while granting any lock with ++ * the same key at level @dep. ++ * NB: ALWAYS called holding lhead::lh_lock ++ * NB: listener will not block anyone because listening mode is HTREE_LOCK_NL ++ */ ++static void ++htree_node_unlock_internal(struct htree_lock_head *lhead, ++ struct htree_lock *curlk, unsigned dep, void *event) ++{ ++ struct htree_lock_node *curln = &curlk->lk_nodes[dep]; ++ struct htree_lock *grtlk = NULL; ++ struct htree_lock_node *grtln; ++ struct htree_lock *poslk; ++ struct htree_lock *tmplk; ++ ++ if (!htree_node_is_granted(curlk, dep)) ++ return; ++ ++ if (!list_empty(&curln->ln_granted_list)) { ++ /* there is another granted lock */ ++ grtlk = list_entry(curln->ln_granted_list.next, ++ struct htree_lock, ++ lk_nodes[dep].ln_granted_list); ++ list_del_init(&curln->ln_granted_list); ++ } ++ ++ if (grtlk == NULL && !list_empty(&curln->ln_blocked_list)) { ++ /* ++ * @curlk is the only granted lock, so we confirmed: ++ * a) curln is key owner (attached on major/minor_list), ++ * so if there is any blocked lock, it should be attached ++ * on curln->ln_blocked_list ++ * b) we always can grant the first blocked lock ++ */ ++ grtlk = list_entry(curln->ln_blocked_list.next, ++ struct htree_lock, ++ lk_nodes[dep].ln_blocked_list); ++ BUG_ON(grtlk->lk_task == NULL); ++ wake_up_process(grtlk->lk_task); ++ } ++ ++ if (event != NULL && ++ lhead->lh_children[dep].lc_events != HTREE_EVENT_DISABLE) { ++ curln->ln_ev_target = event; ++ curln->ln_mode = HTREE_LOCK_NL; /* listen! 
*/ ++ } else { ++ curln->ln_mode = HTREE_LOCK_INVAL; ++ } ++ ++ if (grtlk == NULL) { /* I must be the only one locking this key */ ++ struct htree_lock_node *tmpln; ++ ++ BUG_ON(htree_key_list_empty(curln)); ++ ++ if (curln->ln_mode == HTREE_LOCK_NL) /* listening */ ++ return; ++ ++ /* not listening */ ++ if (list_empty(&curln->ln_alive_list)) { /* no more listener */ ++ htree_key_list_del_init(curln); ++ return; ++ } ++ ++ tmpln = list_entry(curln->ln_alive_list.next, ++ struct htree_lock_node, ln_alive_list); ++ ++ BUG_ON(tmpln->ln_mode != HTREE_LOCK_NL); ++ ++ htree_key_list_replace_init(curln, tmpln); ++ list_del_init(&curln->ln_alive_list); ++ ++ return; ++ } ++ ++ /* have a granted lock */ ++ grtln = &grtlk->lk_nodes[dep]; ++ if (!list_empty(&curln->ln_blocked_list)) { ++ /* only key owner can be on both lists */ ++ BUG_ON(htree_key_list_empty(curln)); ++ ++ if (list_empty(&grtln->ln_blocked_list)) { ++ list_add(&grtln->ln_blocked_list, ++ &curln->ln_blocked_list); ++ } ++ list_del_init(&curln->ln_blocked_list); ++ } ++ /* ++ * NB: this is the tricky part: ++ * We have only two modes for child-lock (PR and PW), also, ++ * only owner of the key (attached on major/minor_list) can be on ++ * both blocked_list and granted_list, so @grtlk must be one ++ * of these two cases: ++ * ++ * a) @grtlk is taken from granted_list, which means we've granted ++ * more than one lock so @grtlk has to be PR, the first blocked ++ * lock must be PW and we can't grant it at all. ++ * So even @grtlk is not owner of the key (empty blocked_list), ++ * we don't care because we can't grant any lock. ++ * b) we just grant a new lock which is taken from head of blocked ++ * list, and it should be the first granted lock, and it should ++ * be the first one linked on blocked_list. ++ * ++ * Either way, we can get correct result by iterating blocked_list ++ * of @grtlk, and don't have to bother on how to find out ++ * owner of current key. 
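The grant loop coming up iterates the blocked list and wakes readers until it meets a writer. A toy rendition of that policy (PR/PW only, exactly the two modes the code asserts for child-locks):

#include <stdio.h>

enum { MODE_PR, MODE_PW };

int main(void)
{
	int blocked[] = { MODE_PR, MODE_PR, MODE_PW, MODE_PR };
	int n = sizeof(blocked) / sizeof(blocked[0]);
	int head = MODE_PR;	/* mode of the lock just granted from queue */
	int i;

	for (i = 0; i < n; i++) {
		if (head == MODE_PW || blocked[i] == MODE_PW)
			break;	/* a writer excludes everything else */
		printf("grant reader %d\n", i);
	}
	printf("still blocked: %d\n", n - i);
	return 0;
}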
++ */ ++ list_for_each_entry_safe(poslk, tmplk, &grtln->ln_blocked_list, ++ lk_nodes[dep].ln_blocked_list) { ++ if (grtlk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW || ++ poslk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW) ++ break; ++ /* grant all readers */ ++ list_del_init(&poslk->lk_nodes[dep].ln_blocked_list); ++ list_add(&poslk->lk_nodes[dep].ln_granted_list, ++ &grtln->ln_granted_list); ++ ++ BUG_ON(poslk->lk_task == NULL); ++ wake_up_process(poslk->lk_task); ++ } ++ ++ /* if @curln is the owner of this key, replace it with @grtln */ ++ if (!htree_key_list_empty(curln)) ++ htree_key_list_replace_init(curln, grtln); ++ ++ if (curln->ln_mode == HTREE_LOCK_INVAL) ++ list_del_init(&curln->ln_alive_list); ++} ++ ++/* ++ * it's just wrapper of htree_node_lock_internal, it returns 1 on granted ++ * and 0 only if @wait is false and can't grant it immediately ++ */ ++int ++htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, ++ u32 key, unsigned dep, int wait, void *event) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ int rc; ++ ++ BUG_ON(dep >= lck->lk_depth); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ ++ htree_spin_lock(lhead, dep); ++ rc = htree_node_lock_internal(lhead, lck, mode, key, dep, wait, event); ++ if (rc != 0) ++ htree_spin_unlock(lhead, dep); ++ return rc >= 0; ++} ++EXPORT_SYMBOL(htree_node_lock_try); ++ ++/* it's wrapper of htree_node_unlock_internal */ ++void ++htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ ++ BUG_ON(dep >= lck->lk_depth); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ ++ htree_spin_lock(lhead, dep); ++ htree_node_unlock_internal(lhead, lck, dep, event); ++ htree_spin_unlock(lhead, dep); ++} ++EXPORT_SYMBOL(htree_node_unlock); ++ ++/* stop listening on child-lock level @dep */ ++void ++htree_node_stop_listen(struct htree_lock *lck, unsigned dep) ++{ ++ struct htree_lock_node *ln = &lck->lk_nodes[dep]; ++ struct htree_lock_node *tmp; ++ ++ BUG_ON(htree_node_is_granted(lck, dep)); ++ BUG_ON(!list_empty(&ln->ln_blocked_list)); ++ BUG_ON(!list_empty(&ln->ln_granted_list)); ++ ++ if (!htree_node_is_listening(lck, dep)) ++ return; ++ ++ htree_spin_lock(lck->lk_head, dep); ++ ln->ln_mode = HTREE_LOCK_INVAL; ++ ln->ln_ev_target = NULL; ++ ++ if (htree_key_list_empty(ln)) { /* not owner */ ++ list_del_init(&ln->ln_alive_list); ++ goto out; ++ } ++ ++ /* I'm the owner... */ ++ if (list_empty(&ln->ln_alive_list)) { /* no more listener */ ++ htree_key_list_del_init(ln); ++ goto out; ++ } ++ ++ tmp = list_entry(ln->ln_alive_list.next, ++ struct htree_lock_node, ln_alive_list); ++ ++ BUG_ON(tmp->ln_mode != HTREE_LOCK_NL); ++ htree_key_list_replace_init(ln, tmp); ++ list_del_init(&ln->ln_alive_list); ++ out: ++ htree_spin_unlock(lck->lk_head, dep); ++} ++EXPORT_SYMBOL(htree_node_stop_listen); ++ ++/* release all child-locks if we have any */ ++static void ++htree_node_release_all(struct htree_lock *lck) ++{ ++ int i; ++ ++ for (i = 0; i < lck->lk_depth; i++) { ++ if (htree_node_is_granted(lck, i)) ++ htree_node_unlock(lck, i, NULL); ++ else if (htree_node_is_listening(lck, i)) ++ htree_node_stop_listen(lck, i); ++ } ++} ++ ++/* ++ * obtain htree lock, it could be blocked inside if there's conflict ++ * with any granted or blocked lock and @wait is true. 
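Before the internal lock routine, a sketch of its admission rule: a request must be compatible with every granted and every blocked mode, so a queued EX cannot be starved by later-arriving readers. The table is cut down to two modes for brevity:

#include <stdio.h>

enum { EX, PR, MODE_MAX };

static const int compat[MODE_MAX] = { [EX] = 0, [PR] = 1 << PR };

static int admit(int mode, int granted, int blocked)
{
	return (compat[mode] & granted) == granted &&
	       (compat[mode] & blocked) == blocked;
}

int main(void)
{
	int granted = 1 << PR;	/* readers hold the lock */
	int blocked = 1 << EX;	/* a writer is queued */

	/* late reader is compatible with holders but must queue behind EX */
	printf("late PR admitted: %d\n", admit(PR, granted, blocked)); /* 0 */
	printf("PR, no waiter   : %d\n", admit(PR, granted, 0));       /* 1 */
	return 0;
}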
++ * NB: ALWAYS called holding lhead::lh_lock ++ */ ++static int ++htree_lock_internal(struct htree_lock *lck, int wait) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ int granted = 0; ++ int blocked = 0; ++ int i; ++ ++ for (i = 0; i < HTREE_LOCK_MAX; i++) { ++ if (lhead->lh_ngranted[i] != 0) ++ granted |= 1 << i; ++ if (lhead->lh_nblocked[i] != 0) ++ blocked |= 1 << i; ++ } ++ if ((htree_lock_compat[lck->lk_mode] & granted) != granted || ++ (htree_lock_compat[lck->lk_mode] & blocked) != blocked) { ++ /* will block current lock even it just conflicts with any ++ * other blocked lock, so lock like EX wouldn't starve */ ++ if (!wait) ++ return -1; ++ lhead->lh_nblocked[lck->lk_mode]++; ++ lk_block_inc(lck->lk_mode); ++ ++ lck->lk_task = current; ++ list_add_tail(&lck->lk_blocked_list, &lhead->lh_blocked_list); ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ htree_spin_unlock(lhead, HTREE_DEP_ROOT); ++ /* wait to be given the lock */ ++ if (lck->lk_task != NULL) ++ schedule(); ++ /* granted, no doubt. wake up will set me RUNNING */ ++ return 0; /* without lh_lock */ ++ } ++ lhead->lh_ngranted[lck->lk_mode]++; ++ lk_grant_inc(lck->lk_mode); ++ return 1; ++} ++ ++/* release htree lock. NB: ALWAYS called holding lhead::lh_lock */ ++static void ++htree_unlock_internal(struct htree_lock *lck) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ struct htree_lock *tmp; ++ struct htree_lock *tmp2; ++ int granted = 0; ++ int i; ++ ++ BUG_ON(lhead->lh_ngranted[lck->lk_mode] == 0); ++ ++ lhead->lh_ngranted[lck->lk_mode]--; ++ lck->lk_mode = HTREE_LOCK_INVAL; ++ ++ for (i = 0; i < HTREE_LOCK_MAX; i++) { ++ if (lhead->lh_ngranted[i] != 0) ++ granted |= 1 << i; ++ } ++ list_for_each_entry_safe(tmp, tmp2, ++ &lhead->lh_blocked_list, lk_blocked_list) { ++ /* conflict with any granted lock? */ ++ if ((htree_lock_compat[tmp->lk_mode] & granted) != granted) ++ break; ++ ++ list_del_init(&tmp->lk_blocked_list); ++ ++ BUG_ON(lhead->lh_nblocked[tmp->lk_mode] == 0); ++ ++ lhead->lh_nblocked[tmp->lk_mode]--; ++ lhead->lh_ngranted[tmp->lk_mode]++; ++ granted |= 1 << tmp->lk_mode; ++ ++ BUG_ON(tmp->lk_task == NULL); ++ wake_up_process(tmp->lk_task); ++ } ++} ++ ++/* it's wrapper of htree_lock_internal and exported interface. ++ * It always return 1 with granted lock if @wait is true, it can return 0 ++ * if @wait is false and locking request can't be granted immediately */ ++int ++htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead, ++ htree_lock_mode_t mode, int wait) ++{ ++ int rc; ++ ++ BUG_ON(lck->lk_depth > lhead->lh_depth); ++ BUG_ON(lck->lk_head != NULL); ++ BUG_ON(lck->lk_task != NULL); ++ ++ lck->lk_head = lhead; ++ lck->lk_mode = mode; ++ ++ htree_spin_lock(lhead, HTREE_DEP_ROOT); ++ rc = htree_lock_internal(lck, wait); ++ if (rc != 0) ++ htree_spin_unlock(lhead, HTREE_DEP_ROOT); ++ return rc >= 0; ++} ++EXPORT_SYMBOL(htree_lock_try); ++ ++/* it's wrapper of htree_unlock_internal and exported interface. 
++ * It will release all htree_node_locks and htree_lock */ ++void ++htree_unlock(struct htree_lock *lck) ++{ ++ BUG_ON(lck->lk_head == NULL); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ ++ htree_node_release_all(lck); ++ ++ htree_spin_lock(lck->lk_head, HTREE_DEP_ROOT); ++ htree_unlock_internal(lck); ++ htree_spin_unlock(lck->lk_head, HTREE_DEP_ROOT); ++ lck->lk_head = NULL; ++ lck->lk_task = NULL; ++} ++EXPORT_SYMBOL(htree_unlock); ++ ++/* change lock mode */ ++void ++htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode) ++{ ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ lck->lk_mode = mode; ++} ++EXPORT_SYMBOL(htree_change_mode); ++ ++/* release htree lock, and lock it again with new mode. ++ * This function will first release all htree_node_locks and htree_lock, ++ * then try to gain htree_lock with new @mode. ++ * It always return 1 with granted lock if @wait is true, it can return 0 ++ * if @wait is false and locking request can't be granted immediately */ ++int ++htree_change_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, int wait) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ int rc; ++ ++ BUG_ON(lhead == NULL); ++ BUG_ON(lck->lk_mode == mode); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL || mode == HTREE_LOCK_INVAL); ++ ++ htree_node_release_all(lck); ++ ++ htree_spin_lock(lhead, HTREE_DEP_ROOT); ++ htree_unlock_internal(lck); ++ lck->lk_mode = mode; ++ rc = htree_lock_internal(lck, wait); ++ if (rc != 0) ++ htree_spin_unlock(lhead, HTREE_DEP_ROOT); ++ return rc >= 0; ++} ++EXPORT_SYMBOL(htree_change_lock_try); ++ ++/* create a htree_lock head with @depth levels (number of child-locks), ++ * it is a per resoruce structure */ ++struct htree_lock_head * ++htree_lock_head_alloc(unsigned depth, unsigned hbits, unsigned priv) ++{ ++ struct htree_lock_head *lhead; ++ int i; ++ ++ if (depth > HTREE_LOCK_DEP_MAX) { ++ printk(KERN_ERR "%d is larger than max htree_lock depth %d\n", ++ depth, HTREE_LOCK_DEP_MAX); ++ return NULL; ++ } ++ ++ lhead = kzalloc(offsetof(struct htree_lock_head, ++ lh_children[depth]) + priv, GFP_NOFS); ++ if (lhead == NULL) ++ return NULL; ++ ++ if (hbits < HTREE_HBITS_MIN) ++ lhead->lh_hbits = HTREE_HBITS_MIN; ++ else if (hbits > HTREE_HBITS_MAX) ++ lhead->lh_hbits = HTREE_HBITS_MAX; ++ ++ lhead->lh_lock = 0; ++ lhead->lh_depth = depth; ++ INIT_LIST_HEAD(&lhead->lh_blocked_list); ++ if (priv > 0) { ++ lhead->lh_private = (void *)lhead + ++ offsetof(struct htree_lock_head, lh_children[depth]); ++ } ++ ++ for (i = 0; i < depth; i++) { ++ INIT_LIST_HEAD(&lhead->lh_children[i].lc_list); ++ lhead->lh_children[i].lc_events = HTREE_EVENT_DISABLE; ++ } ++ return lhead; ++} ++EXPORT_SYMBOL(htree_lock_head_alloc); ++ ++/* free the htree_lock head */ ++void ++htree_lock_head_free(struct htree_lock_head *lhead) ++{ ++ int i; ++ ++ BUG_ON(!list_empty(&lhead->lh_blocked_list)); ++ for (i = 0; i < lhead->lh_depth; i++) ++ BUG_ON(!list_empty(&lhead->lh_children[i].lc_list)); ++ kfree(lhead); ++} ++EXPORT_SYMBOL(htree_lock_head_free); ++ ++/* register event callback for @events of child-lock at level @dep */ ++void ++htree_lock_event_attach(struct htree_lock_head *lhead, unsigned dep, ++ unsigned events, htree_event_cb_t callback) ++{ ++ BUG_ON(lhead->lh_depth <= dep); ++ lhead->lh_children[dep].lc_events = events; ++ lhead->lh_children[dep].lc_callback = callback; ++} ++EXPORT_SYMBOL(htree_lock_event_attach); ++ ++/* allocate a htree_lock, which is per-thread structure, @pbytes is some ++ * extra-bytes as private data for caller */ ++struct 
htree_lock * ++htree_lock_alloc(unsigned depth, unsigned pbytes) ++{ ++ struct htree_lock *lck; ++ int i = offsetof(struct htree_lock, lk_nodes[depth]); ++ ++ if (depth > HTREE_LOCK_DEP_MAX) { ++ printk(KERN_ERR "%d is larger than max htree_lock depth %d\n", ++ depth, HTREE_LOCK_DEP_MAX); ++ return NULL; ++ } ++ lck = kzalloc(i + pbytes, GFP_NOFS); ++ if (lck == NULL) ++ return NULL; ++ ++ if (pbytes != 0) ++ lck->lk_private = (void *)lck + i; ++ lck->lk_mode = HTREE_LOCK_INVAL; ++ lck->lk_depth = depth; ++ INIT_LIST_HEAD(&lck->lk_blocked_list); ++ ++ for (i = 0; i < depth; i++) { ++ struct htree_lock_node *node = &lck->lk_nodes[i]; ++ ++ node->ln_mode = HTREE_LOCK_INVAL; ++ INIT_LIST_HEAD(&node->ln_major_list); ++ INIT_LIST_HEAD(&node->ln_minor_list); ++ INIT_LIST_HEAD(&node->ln_alive_list); ++ INIT_LIST_HEAD(&node->ln_blocked_list); ++ INIT_LIST_HEAD(&node->ln_granted_list); ++ } ++ ++ return lck; ++} ++EXPORT_SYMBOL(htree_lock_alloc); ++ ++/* free htree_lock node */ ++void ++htree_lock_free(struct htree_lock *lck) ++{ ++ BUG_ON(lck->lk_mode != HTREE_LOCK_INVAL); ++ kfree(lck); ++} ++EXPORT_SYMBOL(htree_lock_free); +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -4965,7 +4965,7 @@ struct inode *ext4_iget(struct super_blo + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) + ei->i_file_acl |= + ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; +- inode->i_size = ext4_isize(raw_inode); ++ inode->i_size = ext4_isize(sb, raw_inode); + ei->i_disksize = inode->i_size; + #ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +@@ -5205,7 +5205,7 @@ static int ext4_do_update_inode(handle_t + raw_inode->i_file_acl_high = + cpu_to_le16(ei->i_file_acl >> 32); + raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); +- if (ei->i_disksize != ext4_isize(raw_inode)) { ++ if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { + ext4_isize_set(raw_inode, ei->i_disksize); + need_datasync = 1; + } +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -176,7 +176,7 @@ static struct dx_frame *dx_probe(const s + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, +- int *err); ++ struct htree_lock *lck, int *err); + static void dx_release(struct dx_frame *frames); + static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); +@@ -189,13 +189,13 @@ static void dx_insert_block(struct dx_fr + static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash); ++ __u32 *start_hash, struct htree_lock *lck); + static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, +- int *err); ++ struct htree_lock *lck, int *err); + static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode); ++ struct inode *inode, struct htree_lock *lck); + + /* + * p is at least 6 bytes before the end of page +@@ -225,7 +225,7 @@ struct dx_root_info * dx_get_dx_info(str + + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) + { +- return le32_to_cpu(entry->block) & 0x00ffffff; ++ return le32_to_cpu(entry->block) & 0x0fffffff; + } + + static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) +@@ -368,6 +368,223 @@ struct stats dx_show_entries(struct dx_h + } + #endif /* DX_DEBUG */ + ++/* private data for htree_lock */ ++struct ext4_dir_lock_data { ++ unsigned ld_flags; /* bits-map for lock types */ ++ unsigned ld_count; /* # 
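The dx_get_block() mask change above (0x00ffffff to 0x0fffffff) is plain address-width arithmetic: with 4 KiB blocks the addressable directory size moves from 64 GiB to 1 TiB, while the top four bits of the dx_entry block field stay masked off as reserved:

#include <stdio.h>

int main(void)
{
	unsigned long long blksz = 4096;

	printf("24-bit: %llu GiB\n", ((1ULL << 24) * blksz) >> 30);
	printf("28-bit: %llu GiB\n", ((1ULL << 28) * blksz) >> 30);
	return 0;
}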
entries of the last DX block */ ++ struct dx_entry ld_at_entry; /* copy of leaf dx_entry */ ++ struct dx_entry *ld_at; /* position of leaf dx_entry */ ++}; ++ ++#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) ++ ++/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */ ++#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32) ++ ++static void ext4_htree_event_cb(void *target, void *event) ++{ ++ u64 *block = (u64 *)target; ++ ++ if (*block == dx_get_block((struct dx_entry *)event)) ++ *block = EXT4_HTREE_NODE_CHANGED; ++} ++ ++struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits) ++{ ++ struct htree_lock_head *lhead; ++ ++ lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0); ++ if (lhead != NULL) { ++ htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR, ++ ext4_htree_event_cb); ++ } ++ return lhead; ++} ++EXPORT_SYMBOL(ext4_htree_lock_head_alloc); ++ ++struct htree_lock *ext4_htree_lock_alloc(void) ++{ ++ return htree_lock_alloc(EXT4_LK_MAX, ++ sizeof(struct ext4_dir_lock_data)); ++} ++EXPORT_SYMBOL(ext4_htree_lock_alloc); ++ ++static htree_lock_mode_t ext4_htree_mode(unsigned flags) ++{ ++ switch (flags) { ++ default: /* 0 or unknown flags require EX lock */ ++ return HTREE_LOCK_EX; ++ case EXT4_HLOCK_READDIR: ++ return HTREE_LOCK_PR; ++ case EXT4_HLOCK_LOOKUP: ++ return HTREE_LOCK_CR; ++ case EXT4_HLOCK_DEL: ++ case EXT4_HLOCK_ADD: ++ return HTREE_LOCK_CW; ++ } ++} ++ ++/* return PR for read-only operations, otherwise return EX */ ++static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags) ++{ ++ int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE; ++ ++ /* 0 requires EX lock */ ++ return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR; ++} ++ ++static int ext4_htree_safe_locked(struct htree_lock *lck) ++{ ++ int writer; ++ ++ if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX) ++ return 1; ++ ++ writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) == ++ EXT4_LB_DE; ++ if (writer) /* all readers & writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_EX; ++ ++ /* all writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_PR || ++ lck->lk_mode == HTREE_LOCK_PW || ++ lck->lk_mode == HTREE_LOCK_EX; ++} ++ ++/* relock htree_lock with EX mode if it's change operation, otherwise ++ * relock it with PR mode. It's noop if PDO is disabled. */ ++static void ext4_htree_safe_relock(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck)) { ++ unsigned flags = ext4_htree_lock_data(lck)->ld_flags; ++ ++ htree_change_lock(lck, ext4_htree_safe_mode(flags)); ++ } ++} ++ ++void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags) ++{ ++ htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) : ++ ext4_htree_safe_mode(flags); ++ ++ ext4_htree_lock_data(lck)->ld_flags = flags; ++ htree_lock(lck, lhead, mode); ++ if (!is_dx(dir)) ++ ext4_htree_safe_relock(lck); /* make sure it's safe locked */ ++} ++EXPORT_SYMBOL(ext4_htree_lock); ++ ++static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at, ++ unsigned lmask, int wait, void *ev) ++{ ++ u32 key = (at == NULL) ? 0 : dx_get_block(at); ++ u32 mode; ++ ++ /* NOOP if htree is well protected or caller doesn't require the lock */ ++ if (ext4_htree_safe_locked(lck) || ++ !(ext4_htree_lock_data(lck)->ld_flags & lmask)) ++ return 1; ++ ++ mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ? 
++ HTREE_LOCK_PW : HTREE_LOCK_PR; ++ while (1) { ++ if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev)) ++ return 1; ++ if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */ ++ return 0; ++ cpu_relax(); /* spin until granted */ ++ } ++} ++ ++static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask) ++{ ++ return ext4_htree_safe_locked(lck) || ++ htree_node_is_granted(lck, ffz(~lmask)); ++} ++ ++static void ext4_htree_node_unlock(struct htree_lock *lck, ++ unsigned lmask, void *buf) ++{ ++ /* NB: it's safe to call mutiple times or even it's not locked */ ++ if (!ext4_htree_safe_locked(lck) && ++ htree_node_is_granted(lck, ffz(~lmask))) ++ htree_node_unlock(lck, ffz(~lmask), buf); ++} ++ ++#define ext4_htree_dx_lock(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL) ++#define ext4_htree_dx_lock_try(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL) ++#define ext4_htree_dx_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL) ++#define ext4_htree_dx_locked(lck) \ ++ ext4_htree_node_locked(lck, EXT4_LB_DX) ++ ++static void ext4_htree_dx_need_lock(struct htree_lock *lck) ++{ ++ struct ext4_dir_lock_data *ld; ++ ++ if (ext4_htree_safe_locked(lck)) ++ return; ++ ++ ld = ext4_htree_lock_data(lck); ++ switch (ld->ld_flags) { ++ default: ++ return; ++ case EXT4_HLOCK_LOOKUP: ++ ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE; ++ return; ++ case EXT4_HLOCK_DEL: ++ ld->ld_flags = EXT4_HLOCK_DEL_SAFE; ++ return; ++ case EXT4_HLOCK_ADD: ++ ld->ld_flags = EXT4_HLOCK_SPLIT; ++ return; ++ } ++} ++ ++#define ext4_htree_de_lock(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL) ++#define ext4_htree_de_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL) ++ ++#define ext4_htree_spin_lock(lck, key, event) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event) ++#define ext4_htree_spin_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL) ++#define ext4_htree_spin_unlock_listen(lck, p) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p) ++ ++static void ext4_htree_spin_stop_listen(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck) && ++ htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN))) ++ htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN)); ++} ++ ++enum { ++ DX_HASH_COL_IGNORE, /* ignore collision while probing frames */ ++ DX_HASH_COL_YES, /* there is collision and it does matter */ ++ DX_HASH_COL_NO, /* there is no collision */ ++}; ++ ++static int dx_probe_hash_collision(struct htree_lock *lck, ++ struct dx_entry *entries, ++ struct dx_entry *at, u32 hash) ++{ ++ if (!(ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) { ++ return DX_HASH_COL_IGNORE; /* don't care about collision */ ++ ++ } else if (at == entries + dx_get_count(entries) - 1) { ++ return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */ ++ ++ } else { /* hash collision? */ ++ return ((dx_get_hash(at + 1) & ~1) == hash) ? ++ DX_HASH_COL_YES : DX_HASH_COL_NO; ++ } ++} ++ + /* + * Probe for a directory leaf block to search. 
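dx_probe_hash_collision() above reduces to a three-way decision; here is a toy version, assuming (as the ~1 mask suggests) that the low bit of a stored hash is a continuation marker:

#include <stdio.h>

enum { COL_IGNORE, COL_YES, COL_NO };

static int probe_collision(int exact, int at, int count,
			   const unsigned *hashes, unsigned hash)
{
	if (!exact)
		return COL_IGNORE;	/* caller doesn't care */
	if (at == count - 1)
		return COL_IGNORE;	/* no following leaf in this node */
	return ((hashes[at + 1] & ~1u) == hash) ? COL_YES : COL_NO;
}

int main(void)
{
	unsigned hashes[] = { 0x100, 0x200, 0x301 };

	printf("%d\n", probe_collision(1, 1, 3, hashes, 0x300)); /* COL_YES */
	printf("%d\n", probe_collision(1, 0, 3, hashes, 0x300)); /* COL_NO  */
	printf("%d\n", probe_collision(0, 1, 3, hashes, 0x300)); /* IGNORE  */
	return 0;
}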
+ * +@@ -379,16 +596,17 @@ struct stats dx_show_entries(struct dx_h + */ + static struct dx_frame * + dx_probe(const struct qstr *d_name, struct inode *dir, +- struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) ++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, ++ struct htree_lock *lck, int *err) + { + unsigned count, indirect; +- struct dx_entry *at, *entries, *p, *q, *m; ++ struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL; + struct dx_root_info * info; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; + +- frame->bh = NULL; ++ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); + if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) + goto fail; + +@@ -418,9 +636,16 @@ dx_probe(const struct qstr *d_name, stru + goto fail; + } + +- if ((indirect = info->indirect_levels) > 1) { +- ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", +- info->indirect_levels); ++ indirect = info->indirect_levels; ++ if (indirect >= ext4_dir_htree_level(dir->i_sb)) { ++ ext4_warning(dir->i_sb, ++ "Directory (ino: %lu) htree depth %#06x exceed " ++ "supported value", dir->i_ino, ++ ext4_dir_htree_level(dir->i_sb)); ++ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { ++ ext4_warning(dir->i_sb, "Enable large directory " ++ "feature to access it"); ++ } + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; +@@ -440,8 +665,15 @@ dx_probe(const struct qstr *d_name, stru + dxtrace(printk("Look up %x", hash)); + while (1) + { ++ if (indirect == 0) { /* the last index level */ ++ /* NB: ext4_htree_dx_lock() could be noop if ++ * DX-lock flag is not set for current operation */ ++ ext4_htree_dx_lock(lck, dx); ++ ext4_htree_spin_lock(lck, dx, NULL); ++ } + count = dx_get_count(entries); +- if (!count || count > dx_get_limit(entries)) { ++ if (count == 0 || count > dx_get_limit(entries)) { ++ ext4_htree_spin_unlock(lck); /* release spin */ + ext4_warning(dir->i_sb, + "dx entry: no count or count > limit"); + brelse(bh); +@@ -482,9 +714,73 @@ dx_probe(const struct qstr *d_name, stru + frame->bh = bh; + frame->entries = entries; + frame->at = at; +- if (!indirect--) return frame; ++ ++ if (indirect == 0) { /* the last index level */ ++ struct ext4_dir_lock_data *ld; ++ u64 myblock; ++ ++ /* By default we only lock DE-block, however, we will ++ * also lock the last level DX-block if: ++ * a) there is hash collision ++ * we will set DX-lock flag (a few lines below) ++ * and redo to lock DX-block ++ * see detail in dx_probe_hash_collision() ++ * b) it's a retry from splitting ++ * we need to lock the last level DX-block so nobody ++ * else can split any leaf blocks under the same ++ * DX-block, see detail in ext4_dx_add_entry() ++ */ ++ if (ext4_htree_dx_locked(lck)) { ++ /* DX-block is locked, just lock DE-block ++ * and return */ ++ ext4_htree_spin_unlock(lck); ++ if (!ext4_htree_safe_locked(lck)) ++ ext4_htree_de_lock(lck, frame->at); ++ return frame; ++ } ++ /* it's pdirop and no DX lock */ ++ if (dx_probe_hash_collision(lck, entries, at, hash) == ++ DX_HASH_COL_YES) { ++ /* found hash collision, set DX-lock flag ++ * and retry to abtain DX-lock */ ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_need_lock(lck); ++ continue; ++ } ++ ld = ext4_htree_lock_data(lck); ++ /* because I don't lock DX, so @at can't be trusted ++ * after I release spinlock so I have to save it */ ++ ld->ld_at = at; ++ ld->ld_at_entry = *at; ++ ld->ld_count = dx_get_count(entries); ++ ++ frame->at = &ld->ld_at_entry; ++ myblock = dx_get_block(at); ++ ++ /* NB: ordering locking 
*/ ++ ext4_htree_spin_unlock_listen(lck, &myblock); ++ /* other thread can split this DE-block because: ++ * a) I don't have lock for the DE-block yet ++ * b) I released spinlock on DX-block ++ * if it happened I can detect it by listening ++ * splitting event on this DE-block */ ++ ext4_htree_de_lock(lck, frame->at); ++ ext4_htree_spin_stop_listen(lck); ++ ++ if (myblock == EXT4_HTREE_NODE_CHANGED) { ++ /* someone split this DE-block before ++ * I locked it, I need to retry and lock ++ * valid DE-block */ ++ ext4_htree_de_unlock(lck); ++ continue; ++ } ++ return frame; ++ } ++ dx = at; ++ indirect--; + if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) + goto fail2; ++ + at = entries = ((struct dx_node *) bh->b_data)->entries; + if (dx_get_limit(entries) != dx_node_limit (dir)) { + ext4_warning(dir->i_sb, +@@ -512,13 +808,18 @@ fail: + static void dx_release (struct dx_frame *frames) + { + struct dx_root_info *info; ++ int i; ++ + if (frames[0].bh == NULL) + return; + + info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data); +- if (info->indirect_levels) +- brelse(frames[1].bh); +- brelse(frames[0].bh); ++ for (i = 0; i <= info->indirect_levels; i++) { ++ if (frames[i].bh == NULL) ++ break; ++ brelse(frames[i].bh); ++ frames[i].bh = NULL; ++ } + } + + /* +@@ -541,7 +842,7 @@ static void dx_release (struct dx_frame + static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash) ++ __u32 *start_hash, struct htree_lock *lck) + { + struct dx_frame *p; + struct buffer_head *bh; +@@ -556,12 +857,22 @@ static int ext4_htree_next_block(struct + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ ++ ext4_htree_de_unlock(lck); + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) +- break; ++ if (num_frames > 0 || ext4_htree_dx_locked(lck)) { ++ /* num_frames > 0 : ++ * DX block ++ * ext4_htree_dx_locked: ++ * frame->at is reliable pointer returned by dx_probe, ++ * otherwise dx_probe already knew no collision */ ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ break; ++ } + if (p == frames) + return 0; + num_frames++; ++ if (num_frames == 1) ++ ext4_htree_dx_unlock(lck); + p--; + } + +@@ -584,6 +895,13 @@ static int ext4_htree_next_block(struct + * block so no check is necessary + */ + while (num_frames--) { ++ if (num_frames == 0) { ++ /* it's not always necessary, we just don't want to ++ * detect hash collision again */ ++ ext4_htree_dx_need_lock(lck); ++ ext4_htree_dx_lock(lck, p->at); ++ } ++ + if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), + 0, &err))) + return err; /* Failure */ +@@ -592,6 +910,7 @@ static int ext4_htree_next_block(struct + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } ++ ext4_htree_de_lock(lck, p->at); + return 1; + } + +@@ -661,7 +980,7 @@ int ext4_htree_fill_tree(struct file *di + { + struct dx_hash_info hinfo; + struct ext4_dir_entry_2 *de; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct inode *dir; + ext4_lblk_t block; + int count = 0; +@@ -684,10 +1003,10 @@ int ext4_htree_fill_tree(struct file *di + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- frame = dx_probe(NULL, dir, &hinfo, frames, &err); ++ /* assume it's PR locked */ ++ frame = dx_probe(NULL, dir, &hinfo, frames, NULL, &err); + if (!frame) + return err; +- + /* Add '.' and '..' 
from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; +@@ -714,7 +1033,7 @@ int ext4_htree_fill_tree(struct file *di + count += ret; + hashval = ~0; + ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, +- frame, frames, &hashval); ++ frame, frames, &hashval, NULL); + *next_hash = hashval; + if (ret < 0) { + err = ret; +@@ -814,9 +1133,17 @@ static void dx_insert_block(struct dx_fr + + static void ext4_update_dx_flag(struct inode *inode) + { ++ /* Disable it for ldiskfs, because going from a DX directory to ++ * a non-DX directory while it is in use will completely break ++ * the htree-locking. ++ * If we really want to support this operation in the future, ++ * we need to exclusively lock the directory at here which will ++ * increase complexity of code */ ++#if 0 + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); ++#endif + } + + /* +@@ -888,8 +1215,9 @@ static inline int search_dirblock(struct + * to brelse() it when appropriate. + */ + struct buffer_head * ext4_find_entry(struct inode *dir, +- const struct qstr *d_name, +- struct ext4_dir_entry_2 ** res_dir) ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++ struct htree_lock *lck) + { + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; +@@ -910,7 +1238,7 @@ struct buffer_head * ext4_find_entry(str + if (namelen > EXT4_NAME_LEN) + return NULL; + if (is_dx(dir)) { +- bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); ++ bh = ext4_dx_find_entry(dir, d_name, res_dir, lck, &err); + /* + * On success, or if the error was file not found, + * return. Otherwise, fall back to doing a search the +@@ -920,6 +1248,7 @@ struct buffer_head * ext4_find_entry(str + return bh; + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); ++ ext4_htree_safe_relock(lck); + } + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + start = EXT4_I(dir)->i_dir_start_lookup; +@@ -996,13 +1325,15 @@ cleanup_and_exit: + return ret; + } + +-static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, +- struct ext4_dir_entry_2 **res_dir, int *err) ++static struct buffer_head * ext4_dx_find_entry(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++ struct htree_lock *lck, int *err) + { + struct super_block * sb; + struct dx_hash_info hinfo; + u32 hash; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct buffer_head *bh; + ext4_lblk_t block; + int retval; +@@ -1012,13 +1343,16 @@ static struct buffer_head * ext4_dx_find + sb = dir->i_sb; + /* NFS may look up ".." - look at dx_root directory block */ + if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ +- if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) ++ if (!(frame = dx_probe(d_name, dir, &hinfo, frames, lck, err))) + return NULL; + } else { + frame = frames; + frame->bh = NULL; /* for dx_release() */ + frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ + dx_set_block(frame->at, 0); /* dx_root block is 0 */ ++ /* "." and ".." 
are stored in root DX lock */ ++ ext4_htree_dx_need_lock(lck); ++ ext4_htree_dx_lock(lck, NULL); + } + hash = hinfo.hash; + do { +@@ -1041,7 +1375,7 @@ static struct buffer_head * ext4_dx_find + + /* Check to see if we should continue to search */ + retval = ext4_htree_next_block(dir, hash, frame, +- frames, NULL); ++ frames, NULL, lck); + if (retval < 0) { + ext4_warning(sb, + "error reading index page in directory #%lu", +@@ -1067,7 +1401,7 @@ static struct dentry *ext4_lookup(struct + if (dentry->d_name.len > EXT4_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + +- bh = ext4_find_entry(dir, &dentry->d_name, &de); ++ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + inode = NULL; + if (bh) { + __u32 ino = le32_to_cpu(de->inode); +@@ -1134,7 +1468,7 @@ struct dentry *ext4_get_parent(struct de + struct ext4_dir_entry_2 * de; + struct buffer_head *bh; + +- bh = ext4_find_entry(child->d_inode, &dotdot, &de); ++ bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); + if (!bh) + return ERR_PTR(-ENOENT); + ino = le32_to_cpu(de->inode); +@@ -1222,8 +1556,9 @@ static struct ext4_dir_entry_2* dx_pack_ + * Returns pointer to de in block into which the new entry will be inserted. + */ + static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, +- struct buffer_head **bh,struct dx_frame *frame, +- struct dx_hash_info *hinfo, int *error) ++ struct buffer_head **bh, struct dx_frame *frames, ++ struct dx_frame *frame, struct dx_hash_info *hinfo, ++ struct htree_lock *lck, int *error) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; +@@ -1280,7 +1615,14 @@ static struct ext4_dir_entry_2 *do_split + hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ +- de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); ++ if (hinfo->hash < hash2) { ++ de2 = dx_move_dirents(data1, data2, map + split, ++ count - split, blocksize); ++ } else { ++ /* make sure we will add entry to the same block which ++ * we have already locked */ ++ de2 = dx_move_dirents(data1, data2, map, split, blocksize); ++ } + de = dx_pack_dirents(data1, blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + blocksize); +@@ -1289,13 +1631,21 @@ static struct ext4_dir_entry_2 *do_split + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); + +- /* Which block gets the new entry? */ +- if (hinfo->hash >= hash2) +- { +- swap(*bh, bh2); +- de = de2; ++ ext4_htree_spin_lock(lck, frame > frames ? 
(frame - 1)->at : NULL, ++ frame->at); /* notify block is being split */ ++ if (hinfo->hash < hash2) { ++ dx_insert_block(frame, hash2 + continued, newblock); ++ ++ } else { ++ /* switch block number */ ++ dx_insert_block(frame, hash2 + continued, ++ dx_get_block(frame->at)); ++ dx_set_block(frame->at, newblock); ++ (frame->at)++; + } +- dx_insert_block(frame, hash2 + continued, newblock); ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_unlock(lck); ++ + err = ext4_handle_dirty_metadata(handle, dir, bh2); + if (err) + goto journal_error; +@@ -1406,7 +1756,7 @@ static int add_dirent_to_buf(handle_t *h + if (!IS_NOCMTIME(dir)) + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + ext4_update_dx_flag(dir); +- dir->i_version++; ++ inode_inc_iversion(dir); + ext4_mark_inode_dirty(handle, dir); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, dir, bh); +@@ -1426,7 +1776,7 @@ static int make_indexed_dir(handle_t *ha + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries; + struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de; + char *data1, *top; +@@ -1507,7 +1857,7 @@ static int make_indexed_dir(handle_t *ha + ext4_handle_dirty_metadata(handle, dir, frame->bh); + ext4_handle_dirty_metadata(handle, dir, bh); + +- de = do_split(handle,dir, &bh, frame, &hinfo, &retval); ++ de = do_split(handle,dir, &bh, frames, frame, &hinfo, NULL, &retval); + if (!de) { + /* + * Even if the block split failed, we have to properly write +@@ -1614,7 +1964,7 @@ out: + * the entry, as someone else might have used it while you slept. + */ + int ext4_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++ struct inode *inode, struct htree_lock *lck) + { + struct inode *dir = dentry->d_parent->d_inode; + struct buffer_head *bh; +@@ -1633,9 +1983,10 @@ int ext4_add_entry(handle_t *handle, str + if (dentry->d_name.len == 2 && + memcmp(dentry->d_name.name, "..", 2) == 0) + return ext4_update_dotdot(handle, dentry, inode); +- retval = ext4_dx_add_entry(handle, dentry, inode); ++ retval = ext4_dx_add_entry(handle, dentry, inode, lck); + if (!retval || (retval != ERR_BAD_DX_DIR)) + return retval; ++ ext4_htree_safe_relock(lck); + ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); + dx_fallback++; + ext4_mark_inode_dirty(handle, dir); +@@ -1673,18 +2024,21 @@ int ext4_add_entry(handle_t *handle, str + * Returns 0 for success, or a negative error value + */ + static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++ struct inode *inode, struct htree_lock *lck) + { +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries, *at; + struct dx_hash_info hinfo; + struct buffer_head *bh; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = dir->i_sb; + struct ext4_dir_entry_2 *de; ++ int restart; + int err; + +- frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); ++again: ++ restart = 0; ++ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err); + if (!frame) + return err; + entries = frame->entries; +@@ -1693,33 +2047,53 @@ static int ext4_dx_add_entry(handle_t *h + if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + goto cleanup; + +- BUFFER_TRACE(bh, "get_write_access"); +- err = ext4_journal_get_write_access(handle, bh); +- if (err) +- 
goto journal_error;
+-
+ err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+ if (err != -ENOSPC)
+ goto cleanup;
+
++ err = 0;
+ /* Block full, should compress but for now just split */
+ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
+ dx_get_count(entries), dx_get_limit(entries)));
+ /* Need to split index? */
+ if (dx_get_count(entries) == dx_get_limit(entries)) {
+ ext4_lblk_t newblock;
+- unsigned icount = dx_get_count(entries);
+- int levels = frame - frames;
++ int levels = frame - frames + 1;
++ unsigned icount;
++ int add_level = 1;
+ struct dx_entry *entries2;
+ struct dx_node *node2;
+ struct buffer_head *bh2;
+
+- if (levels && (dx_get_count(frames->entries) ==
+- dx_get_limit(frames->entries))) {
+- ext4_warning(sb, "Directory index full!");
++ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */
++ ext4_htree_safe_relock(lck);
++ restart = 1;
++ goto cleanup;
++ }
++ while (frame > frames) {
++ if (dx_get_count((frame - 1)->entries) <
++ dx_get_limit((frame - 1)->entries)) {
++ add_level = 0;
++ break;
++ }
++ frame--; /* split higher index block */
++ at = frame->at;
++ entries = frame->entries;
++ restart = 1;
++ }
++ if (add_level && levels == ext4_dir_htree_level(sb)) {
++ ext4_warning(sb, "Directory (ino: %lu) index full, "
++ "reached max htree level: %d",
++ dir->i_ino, levels);
++ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
++ ext4_warning(sb, "Large directory feature is "
++ "not enabled on this "
++ "filesystem");
++ }
+ err = -ENOSPC;
+ goto cleanup;
+ }
++ icount = dx_get_count(entries);
+ bh2 = ext4_append (handle, dir, &newblock, &err);
+ if (!(bh2))
+ goto cleanup;
+@@ -1732,7 +2106,7 @@ static int ext4_dx_add_entry(handle_t *h
+ err = ext4_journal_get_write_access(handle, frame->bh);
+ if (err)
+ goto journal_error;
+- if (levels) {
++ if (!add_level) {
+ unsigned icount1 = icount/2, icount2 = icount - icount1;
+ unsigned hash2 = dx_get_hash(entries + icount1);
+ dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+@@ -1740,7 +2114,7 @@ static int ext4_dx_add_entry(handle_t *h
+
+ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+ err = ext4_journal_get_write_access(handle,
+- frames[0].bh);
++ (frame - 1)->bh);
+ if (err)
+ goto journal_error;
+
+@@ -1756,14 +2130,21 @@ static int ext4_dx_add_entry(handle_t *h
+ frame->entries = entries = entries2;
+ swap(frame->bh, bh2);
+ }
+- dx_insert_block(frames + 0, hash2, newblock);
+- dxtrace(dx_show_index("node", frames[1].entries));
++ dx_insert_block((frame - 1), hash2, newblock);
++ dxtrace(dx_show_index("node", frame->entries));
+ dxtrace(dx_show_index("node",
+ ((struct dx_node *) bh2->b_data)->entries));
+ err = ext4_handle_dirty_metadata(handle, dir, bh2);
+ if (err)
+ goto journal_error;
+ brelse (bh2);
++ ext4_handle_dirty_metadata(handle, inode,
++ (frame - 1)->bh);
++ if (restart) {
++ ext4_handle_dirty_metadata(handle, inode,
++ frame->bh);
++ goto cleanup;
++ }
+ } else {
+ struct dx_root_info * info;
+ dxtrace(printk(KERN_DEBUG
+@@ -1777,25 +2158,42 @@ static int ext4_dx_add_entry(handle_t *h
+ dx_set_block(entries + 0, newblock);
+ info = dx_get_dx_info((struct ext4_dir_entry_2*)
+ frames[0].bh->b_data);
+- info->indirect_levels = 1;
+-
+- /* Add new access path frame */
+- frame = frames + 1;
+- frame->at = at = at - entries + entries2;
+- frame->entries = entries = entries2;
+- frame->bh = bh2;
+- err = ext4_journal_get_write_access(handle,
+- frame->bh);
+- if (err)
+- goto journal_error;
++ info->indirect_levels += 1;
++ dxtrace(printk(KERN_DEBUG
++
"Creating %d level index...\n", ++ info->indirect_levels)); ++ ext4_handle_dirty_metadata(handle, inode, frame->bh); ++ ext4_handle_dirty_metadata(handle, inode, bh2); ++ brelse(bh2); ++ restart = 1; ++ goto cleanup; + } +- err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); +- if (err) { +- ext4_std_error(inode->i_sb, err); ++ } else if (!ext4_htree_dx_locked(lck)) { ++ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck); ++ ++ /* not well protected, require DX lock */ ++ ext4_htree_dx_need_lock(lck); ++ at = frame > frames ? (frame - 1)->at : NULL; ++ ++ /* NB: no risk of deadlock because it's just a try. ++ * ++ * NB: we check ld_count for twice, the first time before ++ * having DX lock, the second time after holding DX lock. ++ * ++ * NB: We never free blocks for directory so far, which ++ * means value returned by dx_get_count() should equal to ++ * ld->ld_count if nobody split any DE-block under @at, ++ * and ld->ld_at still points to valid dx_entry. */ ++ if ((ld->ld_count != dx_get_count(entries)) || ++ !ext4_htree_dx_lock_try(lck, at) || ++ (ld->ld_count != dx_get_count(entries))) { ++ restart = 1; + goto cleanup; + } +- } +- de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ /* OK, I've got DX lock and nothing changed */ ++ frame->at = ld->ld_at; ++ } ++ de = do_split(handle, dir, &bh, frames, frame, &hinfo, lck, &err); + if (!de) + goto cleanup; + err = add_dirent_to_buf(handle, dentry, inode, de, bh); +@@ -1804,9 +2202,15 @@ static int ext4_dx_add_entry(handle_t *h + journal_error: + ext4_std_error(dir->i_sb, err); + cleanup: ++ ext4_htree_dx_unlock(lck); ++ ext4_htree_de_unlock(lck); + if (bh) + brelse(bh); + dx_release(frames); ++ /* @restart is true means htree-path has been changed, we need to ++ * repeat dx_probe() to find out valid htree-path */ ++ if (restart && err == 0) ++ goto again; + return err; + } + +@@ -1845,7 +2249,7 @@ int ext4_delete_entry(handle_t *handle, + blocksize); + else + de->inode = 0; +- dir->i_version++; ++ inode_inc_iversion(dir); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, dir, bh); + if (unlikely(err)) { +@@ -1892,7 +2296,7 @@ static void ext4_dec_count(handle_t *han + static int ext4_add_nondir(handle_t *handle, + struct dentry *dentry, struct inode *inode) + { +- int err = ext4_add_entry(handle, dentry, inode); ++ int err = ext4_add_entry(handle, dentry, inode, NULL); + if (!err) { + ext4_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); +@@ -2122,7 +2526,7 @@ retry: + err = ext4_add_dot_dotdot(handle, dir, inode, NULL, NULL); + if (err) + goto out_clear_inode; +- err = ext4_add_entry(handle, dentry, inode); ++ err = ext4_add_entry(handle, dentry, inode, NULL); + if (err) + goto out_clear_inode; + ext4_inc_count(handle, dir); +@@ -2395,7 +2799,7 @@ static int ext4_rmdir(struct inode *dir, + return PTR_ERR(handle); + + retval = -ENOENT; +- bh = ext4_find_entry(dir, &dentry->d_name, &de); ++ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (!bh) + goto end_rmdir; + +@@ -2460,7 +2864,7 @@ static int ext4_unlink(struct inode *dir + ext4_handle_sync(handle); + + retval = -ENOENT; +- bh = ext4_find_entry(dir, &dentry->d_name, &de); ++ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (!bh) + goto end_unlink; + +@@ -2628,7 +3032,7 @@ retry: + ext4_inc_count(handle, inode); + ihold(inode); + +- err = ext4_add_entry(handle, dentry, inode); ++ err = ext4_add_entry(handle, dentry, inode, NULL); + if (!err) { + ext4_mark_inode_dirty(handle, 
inode);
+ d_instantiate(dentry, inode);
+@@ -2676,7 +3080,7 @@ static int ext4_rename(struct inode *old
+ if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+ ext4_handle_sync(handle);
+
+- old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
++ old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
+ /*
+ * Check for inode number is _not_ due to possible IO errors.
+ * We might rmdir the source, keep it as pwd of some process
+@@ -2689,7 +3093,7 @@ static int ext4_rename(struct inode *old
+ goto end_rename;
+
+ new_inode = new_dentry->d_inode;
+- new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
++ new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de, NULL);
+ if (new_bh) {
+ if (!new_inode) {
+ brelse(new_bh);
+@@ -2719,7 +3123,7 @@ static int ext4_rename(struct inode *old
+ goto end_rename;
+ }
+ if (!new_bh) {
+- retval = ext4_add_entry(handle, new_dentry, old_inode);
++ retval = ext4_add_entry(handle, new_dentry, old_inode, NULL);
+ if (retval)
+ goto end_rename;
+ } else {
+@@ -2767,7 +3171,8 @@ static int ext4_rename(struct inode *old
+ struct buffer_head *old_bh2;
+ struct ext4_dir_entry_2 *old_de2;
+
+- old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
++ old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
++ &old_de2, NULL);
+ if (old_bh2) {
+ retval = ext4_delete_entry(handle, old_dir,
+ old_de2, old_bh2);
+--- /dev/null
++++ b/include/linux/htree_lock.h
+@@ -0,0 +1,187 @@
++/*
++ * include/linux/htree_lock.h
++ *
++ * Copyright (c) 2011, 2012, Intel Corporation.
++ *
++ * Author: Liang Zhen
++ */
++
++/*
++ * htree lock
++ *
++ * htree_lock is an advanced lock that supports five lock modes (a concept
++ * borrowed from the DLM) and is a sleeping lock.
++ *
++ * The most common use case is:
++ * - create a htree_lock_head for the data
++ * - each thread (contender) creates its own htree_lock
++ * - a contender calls htree_lock(lock_node, mode) to protect the data and
++ * htree_unlock to release the lock
++ *
++ * There is also a more complex, advanced use case: a user can hold a PW/PR
++ * lock on a particular key, typically while already holding a shared
++ * lock on the htree (CW, CR)
++ *
++ * htree_lock(lock_node, HTREE_LOCK_CR); lock the htree with CR
++ * htree_node_lock(lock_node, HTREE_LOCK_PR, key...); lock @key with PR
++ * ...
++ * htree_node_unlock(lock_node); unlock the key
++ *
++ * Note that we can have N levels of this kind of key; all we need to
++ * do is specify N levels when creating the htree_lock_head, and then we can
++ * lock/unlock a specific level by:
++ * htree_node_lock(lock_node, mode1, key1, level1...);
++ * do something;
++ * htree_node_lock(lock_node, mode1, key2, level2...);
++ * do something;
++ * htree_node_unlock(lock_node, level2);
++ * htree_node_unlock(lock_node, level1);
++ *
++ * NB: with multiple levels, be careful about locking order to avoid deadlock
++ */
++
++#ifndef _LINUX_HTREE_LOCK_H
++#define _LINUX_HTREE_LOCK_H
++
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/sched.h>
++
++/*
++ * Lock Modes
++ * more details can be found here:
++ * http://en.wikipedia.org/wiki/Distributed_lock_manager
++ */
++typedef enum {
++ HTREE_LOCK_EX = 0, /* exclusive lock: incompatible with all others */
++ HTREE_LOCK_PW, /* protected write: allows only CR users */
++ HTREE_LOCK_PR, /* protected read: allows PR, CR users */
++ HTREE_LOCK_CW, /* concurrent write: allows CR, CW users */
++ HTREE_LOCK_CR, /* concurrent read: allows all but EX users */
++ HTREE_LOCK_MAX, /* number of lock modes */
++} htree_lock_mode_t;
++
++#define HTREE_LOCK_NL HTREE_LOCK_MAX
++#define HTREE_LOCK_INVAL 0xdead10c
++
++enum {
++ HTREE_HBITS_MIN = 2,
++ HTREE_HBITS_DEF = 14,
++ HTREE_HBITS_MAX = 32,
++};
++
++enum {
++ HTREE_EVENT_DISABLE = (0),
++ HTREE_EVENT_RD = (1 << HTREE_LOCK_PR),
++ HTREE_EVENT_WR = (1 << HTREE_LOCK_PW),
++ HTREE_EVENT_RDWR = (HTREE_EVENT_RD | HTREE_EVENT_WR),
++};
++
++struct htree_lock;
++
++typedef void (*htree_event_cb_t)(void *target, void *event);
++
++struct htree_lock_child {
++ struct list_head lc_list; /* granted list */
++ htree_event_cb_t lc_callback; /* event callback */
++ unsigned lc_events; /* event types */
++};
++
++struct htree_lock_head {
++ unsigned long lh_lock; /* bits lock */
++ /* blocked lock list (htree_lock) */
++ struct list_head lh_blocked_list;
++ /* # key levels */
++ u16 lh_depth;
++ /* hash bits for key and limit number of locks */
++ u16 lh_hbits;
++ /* counters for blocked locks */
++ u16 lh_nblocked[HTREE_LOCK_MAX];
++ /* counters for granted locks */
++ u16 lh_ngranted[HTREE_LOCK_MAX];
++ /* private data */
++ void *lh_private;
++ /* array of children locks */
++ struct htree_lock_child lh_children[0];
++};
++
++/* htree_lock_node_t is the child lock for a specific key (ln_value) */
++struct htree_lock_node {
++ htree_lock_mode_t ln_mode;
++ /* major hash key */
++ u16 ln_major_key;
++ /* minor hash key */
++ u16 ln_minor_key;
++ struct list_head ln_major_list;
++ struct list_head ln_minor_list;
++ /* alive list, all locks (granted, blocked, listening) are on it */
++ struct list_head ln_alive_list;
++ /* blocked list */
++ struct list_head ln_blocked_list;
++ /* granted list */
++ struct list_head ln_granted_list;
++ void *ln_ev_target;
++};
++
++struct htree_lock {
++ struct task_struct *lk_task;
++ struct htree_lock_head *lk_head;
++ void *lk_private;
++ unsigned lk_depth;
++ htree_lock_mode_t lk_mode;
++ struct list_head lk_blocked_list;
++ struct htree_lock_node lk_nodes[0];
++};
++
++/* create a lock head, which stands for a resource */
++struct htree_lock_head *htree_lock_head_alloc(unsigned depth,
++ unsigned hbits, unsigned priv);
++/* free a lock head */
++void htree_lock_head_free(struct htree_lock_head *lhead);
++/* register event callback for child lock at level @depth */
++void htree_lock_event_attach(struct htree_lock_head *lhead, unsigned depth,
++ unsigned events,
htree_event_cb_t callback); ++/* create a lock handle, which stands for a thread */ ++struct htree_lock *htree_lock_alloc(unsigned depth, unsigned pbytes); ++/* free a lock handle */ ++void htree_lock_free(struct htree_lock *lck); ++/* lock htree, when @wait is true, 0 is returned if the lock can't ++ * be granted immediately */ ++int htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead, ++ htree_lock_mode_t mode, int wait); ++/* unlock htree */ ++void htree_unlock(struct htree_lock *lck); ++/* unlock and relock htree with @new_mode */ ++int htree_change_lock_try(struct htree_lock *lck, ++ htree_lock_mode_t new_mode, int wait); ++void htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode); ++/* require child lock (key) of htree at level @dep, @event will be sent to all ++ * listeners on this @key while lock being granted */ ++int htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, ++ u32 key, unsigned dep, int wait, void *event); ++/* release child lock at level @dep, this lock will listen on it's key ++ * if @event isn't NULL, event_cb will be called against @lck while granting ++ * any other lock at level @dep with the same key */ ++void htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event); ++/* stop listening on child lock at level @dep */ ++void htree_node_stop_listen(struct htree_lock *lck, unsigned dep); ++/* for debug */ ++void htree_lock_stat_print(int depth); ++void htree_lock_stat_reset(void); ++ ++#define htree_lock(lck, lh, mode) htree_lock_try(lck, lh, mode, 1) ++#define htree_change_lock(lck, mode) htree_change_lock_try(lck, mode, 1) ++ ++#define htree_lock_mode(lck) ((lck)->lk_mode) ++ ++#define htree_node_lock(lck, mode, key, dep) \ ++ htree_node_lock_try(lck, mode, key, dep, 1, NULL) ++/* this is only safe in thread context of lock owner */ ++#define htree_node_is_granted(lck, dep) \ ++ ((lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_INVAL && \ ++ (lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_NL) ++/* this is only safe in thread context of lock owner */ ++#define htree_node_is_listening(lck, dep) \ ++ ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL) ++ ++#endif diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/restore-path-in-walk_extent_callback.patch b/ldiskfs/kernel_patches/patches/sles11sp2/restore-path-in-walk_extent_callback.patch new file mode 100644 index 0000000..3fa6dd0 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/restore-path-in-walk_extent_callback.patch @@ -0,0 +1,58 @@ +From: Jeff Mahoney +Subject: ext4: restore path parameter to ext_prepare_callback + +This patch partially reverts commit c03f8aa9 +(ext4: use FIEMAP_EXTENT_LAST flag for last extent in fiemap) + +The bug that commit fixed is still eliminated but we restore the +struct ext4_ext_path *path parameter to the callback for Lustre. + +next is calculated in ext4_ext_walk_space and can also be calculated in the +callback. 
+ +Signed-off-by: Jeff Mahoney +-- + + fs/ext4/ext4_extents.h | 2 +- + fs/ext4/extents.c | 5 +++-- + 2 files changed, 4 insertions(+), 3 deletions(-) + +--- a/fs/ext4/ext4_extents.h ++++ b/fs/ext4/ext4_extents.h +@@ -125,7 +125,7 @@ struct ext4_ext_path { + * positive retcode - signal for ext4_ext_walk_space(), see below + * callback must return valid extent (passed or newly created) + */ +-typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t, ++typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, + struct ext4_ext_cache *, + struct ext4_extent *, void *); + +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -1964,7 +1964,7 @@ extern int ext4_ext_walk_space(struct in + err = -EIO; + break; + } +- err = func(inode, next, &cbex, ex, cbdata); ++ err = func(inode, path, &cbex, ex, cbdata); + ext4_ext_drop_refs(path); + + if (err < 0) +@@ -3954,7 +3954,7 @@ int ext4_convert_unwritten_extents(struc + /* + * Callback function called for each extent to gather FIEMAP information. + */ +-static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, ++static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, + struct ext4_ext_cache *newex, struct ext4_extent *ex, + void *data) + { +@@ -3965,6 +3965,7 @@ static int ext4_ext_fiemap_cb(struct ino + int ret = 0; + struct fiemap_extent_info *fieinfo = data; + unsigned char blksize_bits; ++ ext4_lblk_t next = ext4_ext_next_allocated_block(path); + + blksize_bits = inode->i_sb->s_blocksize_bits; + logical = (__u64)newex->ec_block << blksize_bits; diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/revert-ext4-avoid-uninitialized-memory-references-in-ext3_htree_next_block.patch b/ldiskfs/kernel_patches/patches/sles11sp2/revert-ext4-avoid-uninitialized-memory-references-in-ext3_htree_next_block.patch new file mode 100644 index 0000000..ce75411 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/sles11sp2/revert-ext4-avoid-uninitialized-memory-references-in-ext3_htree_next_block.patch @@ -0,0 +1,107 @@ +From: Jeff Mahoney +Subject: revert: ext4: avoid uninitialized memory references in ext3_htree_next_block() + +The data in dirent code depends on being able to store data in the +.. entry. We need to revert this commit because it will skip the dx +lookup on . and .. + + Original commit message: + ---->8---- + From: Theodore Ts'o + Date: Wed, 27 Oct 2010 21:30:08 -0400 + Subject: ext4: avoid uninitialized memory references in ext3_htree_next_block() + Git-commit: 8941ec8b + Patch-mainline: v2.6.37-rc1 + + If the first block of htree directory is missing '.' or '..' but is + otherwise a valid directory, and we do a lookup for '.' or '..', it's + possible to dereference an uninitialized memory pointer in + ext4_htree_next_block(). + + We avoid this by moving the special case from ext4_dx_find_entry() to + ext4_find_entry(); this also means we can optimize ext4_find_entry() + slightly when NFS looks up "..". + + Thanks to Brad Spengler for pointing a Clang warning that led me to + look more closely at this code. The warning was harmless, but it was + useful in pointing out code that was too ugly to live. This warning was + also reported by Roman Borisov. 
+ + Signed-off-by: "Theodore Ts'o" + Cc: Brad Spengler + ----8<---- + +Signed-off-by: Jeff Mahoney +-- + + fs/ext4/namei.c | 32 +++++++++++++++++--------------- + 1 file changed, 17 insertions(+), 15 deletions(-) + +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -857,7 +857,6 @@ static struct buffer_head * ext4_find_en + struct buffer_head *bh_use[NAMEI_RA_SIZE]; + struct buffer_head *bh, *ret = NULL; + ext4_lblk_t start, block, b; +- const u8 *name = d_name->name; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead +@@ -872,16 +871,6 @@ static struct buffer_head * ext4_find_en + namelen = d_name->len; + if (namelen > EXT4_NAME_LEN) + return NULL; +- if ((namelen <= 2) && (name[0] == '.') && +- (name[1] == '.' || name[1] == '\0')) { +- /* +- * "." or ".." will only be in the first block +- * NFS may look up ".."; "." should be handled by the VFS +- */ +- block = start = 0; +- nblocks = 1; +- goto restart; +- } + if (is_dx(dir)) { + bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); + /* +@@ -972,15 +961,28 @@ cleanup_and_exit: + static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, int *err) + { +- struct super_block * sb = dir->i_sb; ++ struct super_block * sb; + struct dx_hash_info hinfo; ++ u32 hash; + struct dx_frame frames[2], *frame; + struct buffer_head *bh; + ext4_lblk_t block; + int retval; ++ int namelen = d_name->len; ++ const u8 *name = d_name->name; + +- if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) +- return NULL; ++ sb = dir->i_sb; ++ /* NFS may look up ".." - look at dx_root directory block */ ++ if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ ++ if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) ++ return NULL; ++ } else { ++ frame = frames; ++ frame->bh = NULL; /* for dx_release() */ ++ frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ ++ dx_set_block(frame->at, 0); /* dx_root block is 0 */ ++ } ++ hash = hinfo.hash; + do { + block = dx_get_block(frame->at); + if (!(bh = ext4_bread(NULL, dir, block, 0, err))) +@@ -1000,7 +1002,7 @@ static struct buffer_head * ext4_dx_find + } + + /* Check to see if we should continue to search */ +- retval = ext4_htree_next_block(dir, hinfo.hash, frame, ++ retval = ext4_htree_next_block(dir, hash, frame, + frames, NULL); + if (retval < 0) { + ext4_warning(sb, diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11.series b/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11.series new file mode 100644 index 0000000..007ae2a --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11.series @@ -0,0 +1,46 @@ +sles11sp2/ext4-introduce-ext4_kvmalloc-ext4_kzalloc-and-ext4_kvfree.patch +sles11sp2/ext4-speed-up-fitrim-by-recording-flags-in-ext4_group_info.patch +sles11sp2/ext4-add-missing-kfree-on-error-return-path-in-add_new_gdb.patch +sles11sp2/ext4-use-ext4_kvzalloc-ext4_kvmalloc-for-s_group_desc-and-s_group_info.patch +sles11sp2/ext4-use-ext4_msg-instead-of-printk-in-mballoc.patch +sles11sp2/revert-ext4-avoid-uninitialized-memory-references-in-ext3_htree_next_block.patch +sles11sp2/ext4-journal-callback.patch +sles11sp2/ext4-free-resources-in-some-error-path-in-ext4_fill_super.patch +sles11sp2/ext4-make-quota-as-first-class-supported-feature.patch + +sles11sp2/ext4-handle-cleanup-after-quota-failure.patch +rhel6.3/ext4-wantedi-2.6.patch +sles11sp2/ext4-map_inode_page-3.0.patch +sles11sp2/export-ext4-3.0.patch 
+rhel6.3/ext4-remove-cond_resched-calls.patch +rhel6.3/ext4-nlink-2.6.patch +sles11sp2/ext4-ext_generation.patch +rhel6.3/ext4-inode-version.patch +sles11sp2/ext4-lookup-dotdot.patch +rhel6.3/ext4-max-dir-size.patch +rhel6.3/ext4-print-inum-in-htree-warning.patch +rhel6.3/ext4-xattr-no-update-ctime.patch +sles11sp2/ext4-prealloc.patch +sles11sp2/ext4-mballoc-extra-checks.patch +sles11sp2/restore-path-in-walk_extent_callback.patch +sles11sp2/ext4-read-write.patch +sles11sp2/ext4-misc.patch +sles11sp2/ext4-big-endian-check-3.0.patch +rhel6.3/ext4-alloc-policy-2.6.patch +sles11sp2/ext4-force_over_128tb.patch +rhel6.3/ext4-pdir-fix.patch +sles11sp2/ext4-osd-iop-common.patch +sles11sp2/ext4-dynlocks-common.patch +rhel6.3/ext4-osd-iam-exports.patch +rhel6.3/ext4-hash-indexed-dir-dotdot-update.patch +sles11sp2/ext4-kill-dx_root.patch +sles11sp2/ext4-extents-mount-option.patch +rhel6.3/ext4-fiemap-2.6.patch +sles11sp2/ext4-mballoc-pa_free-mismatch.patch +sles11sp2/ext4_data_in_dirent.patch +sles11sp2/ext4-large-eas.patch +sles11sp2/ext4-disable-mb-cache.patch +rhel6.3/ext4-nocmtime-2.6.patch +rhel6.3/ext4-export-64bit-name-hash.patch +sles11sp2/ext4-store-tree-generation-at-find.patch +sles11sp2/ext4_pdirop.patch
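
Editorial usage note (not part of the patch series): the following is a
minimal sketch of how the htree_lock primitives declared in the new
include/linux/htree_lock.h might be driven by a caller. It is illustrative
only: the names dir_lhead, dir_lock_init() and dir_lookup_locked() are
hypothetical, error handling is elided, and the depth/hbits arguments are
arbitrary. Only functions and macros declared in the header above are used.

	#include <linux/errno.h>
	#include <linux/htree_lock.h>

	static struct htree_lock_head *dir_lhead;	/* hypothetical resource */

	static int dir_lock_init(void)
	{
		/* one key level, default hash size, no private data */
		dir_lhead = htree_lock_head_alloc(1, HTREE_HBITS_DEF, 0);
		return dir_lhead != NULL ? 0 : -ENOMEM;
	}

	static void dir_lookup_locked(u32 hash_key)
	{
		/* a per-thread lock handle with one node level */
		struct htree_lock *lck = htree_lock_alloc(1, 0);

		if (lck == NULL)
			return;
		/* shared (CR) lock on the whole tree ... */
		htree_lock(lck, dir_lhead, HTREE_LOCK_CR);
		/* ... plus a PR lock on one hash key at level 0 */
		htree_node_lock(lck, HTREE_LOCK_PR, hash_key, 0);

		/* the lookup protected by both locks would run here */

		htree_node_unlock(lck, 0, NULL);	/* drop the key lock */
		htree_unlock(lck);			/* drop the tree lock */
		htree_lock_free(lck);
	}

This mirrors the "PR on a key while CR on the tree" pattern described in the
header comment; ext4_pdirop.patch itself drives the same primitives through
the ext4_htree_* wrappers visible in the namei.c hunks above.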