From: Shaun Tancheff Date: Sat, 7 Mar 2020 16:26:35 +0000 (-0600) Subject: LU-11310 ldiskfs: Support for SUSE 15 GA and SP1 X-Git-Tag: 2.13.53~54 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=862e9bf632dc44f1102bfc2aef10504e506f1225 LU-11310 ldiskfs: Support for SUSE 15 GA and SP1 Add support for ldiskfs SUSE 15 SP1 and late GA: GA 150 series kernels and SP1 195, and 197 series Test-Parameters: trivial Cray-bug-id: LUS-7572 Signed-off-by: Shaun Tancheff Change-Id: I2525acc475503166fb71bed59663edeee603d38c Reviewed-on: https://review.whamcloud.com/36094 Reviewed-by: Andreas Dilger Tested-by: jenkins Tested-by: Maloo Reviewed-by: Petros Koutoupis --- diff --git a/config/lustre-build-ldiskfs.m4 b/config/lustre-build-ldiskfs.m4 index 63b4e7b..1f4a1e7 100644 --- a/config/lustre-build-ldiskfs.m4 +++ b/config/lustre-build-ldiskfs.m4 @@ -31,6 +31,7 @@ AS_IF([test x$RHEL_KERNEL = xyes], [ 6[0-3]) LDISKFS_SERIES="2.6-rhel6.series" ;; esac ], [test x$SUSE_KERNEL = xyes], [ + AS_VERSION_COMPARE([$LINUXRELEASE],[4.12.14],[ AS_VERSION_COMPARE([$LINUXRELEASE],[4.4.82],[ AS_VERSION_COMPARE([$LINUXRELEASE],[4.4.0],[ AS_VERSION_COMPARE([$LINUXRELEASE],[3.12.0],[ @@ -58,7 +59,9 @@ AS_IF([test x$RHEL_KERNEL = xyes], [ ])],[LDISKFS_SERIES="4.4-sles12sp2.series"], [LDISKFS_SERIES="4.4-sles12sp2.series"] )], [LDISKFS_SERIES="4.4-sles12sp3.series"], - [LDISKFS_SERIES="4.4-sles12sp3.series"]) + [LDISKFS_SERIES="4.4-sles12sp3.series"] + )], [LDISKFS_SERIES="4.12-sles15.series"], + [LDISKFS_SERIES="4.12-sles15.series"]) ], [test x$UBUNTU_KERNEL = xyes], [ AS_VERSION_COMPARE([$LINUXRELEASE],[5.3.0],[ AS_VERSION_COMPARE([$LINUXRELEASE],[5.0.0],[ @@ -110,7 +113,7 @@ AS_IF([test -z "$LDISKFS_SERIES"], []) AS_IF([test -z "$LDISKFS_SERIES"], [AC_MSG_RESULT([failed to identify series])], - [AC_MSG_RESULT([$LDISKFS_SERIES])]) + [AC_MSG_RESULT([$LDISKFS_SERIES for $LINUXRELEASE])]) AC_SUBST(LDISKFS_SERIES) ]) # LDISKFS_LINUX_SERIES diff --git 
a/ldiskfs/kernel_patches/patches/suse15/ext4-corrupted-inode-block-bitmaps-handling-patches.patch b/ldiskfs/kernel_patches/patches/suse15/ext4-corrupted-inode-block-bitmaps-handling-patches.patch new file mode 100644 index 0000000..d6a5a7f --- /dev/null +++ b/ldiskfs/kernel_patches/patches/suse15/ext4-corrupted-inode-block-bitmaps-handling-patches.patch @@ -0,0 +1,225 @@ +Subject: [PATCH] ext4: add new ext4_mark_group_bitmap_corrupted() helper + +Since there are many places to set inode/block bitmap +corrupt bit, add a new helper for it, which will make +codes more clear. + +Signed-off-by: Wang Shilong +Signed-off-by: Theodore Ts'o +Reviewed-by: Andreas Dilger +--- + fs/ext4/balloc.c | 29 +++++++---------------------- + fs/ext4/ext4.h | 7 +++++++ + fs/ext4/ialloc.c | 20 ++++---------------- + fs/ext4/mballoc.c | 9 ++------- + fs/ext4/super.c | 30 ++++++++++++++++++++++++++++++ + 5 files changed, 50 insertions(+), 45 deletions(-) + +diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c +index 2455fe1..5ced5e5 100644 +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -183,25 +183,15 @@ static int ext4_init_block_bitmap(struct super_block *sb, + unsigned int bit, bit_max; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start, tmp; +- struct ext4_group_info *grp; + + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks used to prevent allocation + * essentially implementing a per-group read-only flag. 
*/ + if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { +- grp = ext4_get_group_info(sb, block_group); +- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) +- percpu_counter_sub(&sbi->s_freeclusters_counter, +- grp->bb_free); +- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); +- if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { +- int count; +- count = ext4_free_inodes_count(sb, gdp); +- percpu_counter_sub(&sbi->s_freeinodes_counter, +- count); +- } +- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); ++ ext4_mark_group_bitmap_corrupted(sb, block_group, ++ EXT4_GROUP_INFO_BBITMAP_CORRUPT | ++ EXT4_GROUP_INFO_IBITMAP_CORRUPT); + return -EFSBADCRC; + } + memset(bh->b_data, 0, sb->s_blocksize); +@@ -370,7 +360,6 @@ static int ext4_validate_block_bitmap(struct super_block *sb, + { + ext4_fsblk_t blk; + struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); +- struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (buffer_verified(bh)) + return 0; +@@ -384,10 +373,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, + desc, bh))) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); +- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) +- percpu_counter_sub(&sbi->s_freeclusters_counter, +- grp->bb_free); +- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); ++ ext4_mark_group_bitmap_corrupted(sb, block_group, ++ EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSBADCRC; + } + blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); +@@ -395,10 +382,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: block %llu: invalid block bitmap", + block_group, blk); +- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) +- percpu_counter_sub(&sbi->s_freeclusters_counter, +- grp->bb_free); +- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); ++ ext4_mark_group_bitmap_corrupted(sb, block_group, ++ 
EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSCORRUPTED; + } + set_buffer_verified(bh); +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 357d71f..3362231 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -2759,6 +2759,9 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb, + ext4_group_t ngroup); + extern const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); ++extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ++ ext4_group_t block_group, ++ unsigned int flags); + + extern __printf(4, 5) + void __ext4_error(struct super_block *, const char *, unsigned int, +@@ -3087,6 +3090,10 @@ struct ext4_group_info { + #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 + #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 + #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 ++#define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ ++ (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) ++#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ ++ (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) + + #define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index 0f284fb..d4cd50b 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -82,7 +82,6 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, + { + ext4_fsblk_t blk; + struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); +- struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (buffer_verified(bh)) + return 0; +@@ -98,14 +97,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, + ext4_unlock_group(sb, block_group); + ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " + "inode_bitmap = %llu", block_group, blk); +- grp = ext4_get_group_info(sb, block_group); +- if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { +- int count; +- count = ext4_free_inodes_count(sb, desc); +- percpu_counter_sub(&sbi->s_freeinodes_counter, +- count); +- } +- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); ++ 
ext4_mark_group_bitmap_corrupted(sb, block_group, ++ EXT4_GROUP_INFO_IBITMAP_CORRUPT); + return -EFSBADCRC; + } + set_buffer_verified(bh); +@@ -349,13 +342,8 @@ out: + fatal = err; + } else { + ext4_error(sb, "bit already cleared for inode %lu", ino); +- if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { +- int count; +- count = ext4_free_inodes_count(sb, gdp); +- percpu_counter_sub(&sbi->s_freeinodes_counter, +- count); +- } +- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); ++ ext4_mark_group_bitmap_corrupted(sb, block_group, ++ EXT4_GROUP_INFO_IBITMAP_CORRUPT); + } + + error_return: +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index 0dd98ad..2e094aa 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -1454,7 +1454,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, + right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); + + if (unlikely(block != -1)) { +- struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t blocknr; + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); +@@ -1465,12 +1464,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, + "freeing already freed block " + "(bit %u); block bitmap corrupt.", + block); +- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) +- percpu_counter_sub(&sbi->s_freeclusters_counter, +- e4b->bd_info->bb_free); +- /* Mark the block group as corrupt. 
*/ +- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, +- &e4b->bd_info->bb_state); ++ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, ++ EXT4_GROUP_INFO_BBITMAP_CORRUPT); + mb_regenerate_buddy(e4b); + goto done; + } +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index ffbe9f0..432c4a1 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -770,6 +770,36 @@ __acquires(bitlock) + return; + } + ++void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ++ ext4_group_t group, ++ unsigned int flags) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(sb); ++ struct ext4_group_info *grp = ext4_get_group_info(sb, group); ++ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); ++ ++ if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) && ++ !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) { ++ percpu_counter_sub(&sbi->s_freeclusters_counter, ++ grp->bb_free); ++ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, ++ &grp->bb_state); ++ } ++ ++ if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) && ++ !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { ++ if (gdp) { ++ int count; ++ ++ count = ext4_free_inodes_count(sb, gdp); ++ percpu_counter_sub(&sbi->s_freeinodes_counter, ++ count); ++ } ++ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, ++ &grp->bb_state); ++ } ++} ++ + void ext4_update_dynamic_rev(struct super_block *sb) + { + struct ext4_super_block *es = EXT4_SB(sb)->s_es; +-- +2.20.1 + diff --git a/ldiskfs/kernel_patches/patches/suse15/ext4-disable-mb-cache.patch b/ldiskfs/kernel_patches/patches/suse15/ext4-disable-mb-cache.patch new file mode 100644 index 0000000..95c21d2 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/suse15/ext4-disable-mb-cache.patch @@ -0,0 +1,154 @@ +mbcache provides absolutely no value for Lustre xattrs (because +they are unique and cannot be shared between files) and as we can +see it has a noticeable overhead in some cases. 
In the past there +was a CONFIG_MBCACHE option that would allow it to be disabled, +but this was removed in newer kernels, so we will need to patch +ldiskfs to fix this. + +Index: b/fs/ext4/ext4.h +=================================================================== +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1119,6 +1119,7 @@ struct ext4_inode_info { + /* + * Mount flags set via mount options or defaults + */ ++#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Disable mbcache */ + #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ + #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ + #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +Index: b/fs/ext4/super.c +=================================================================== +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1335,6 +1335,7 @@ enum { + Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_dioread_nolock, Opt_dioread_lock, ++ Opt_no_mbcache, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, + Opt_max_dir_size_kb, Opt_nojournal_checksum, + }; +@@ -1416,6 +1417,7 @@ static const match_table_t tokens = { + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_init_itable, "init_itable=%u"}, ++ {Opt_no_mbcache, "no_mbcache"}, + {Opt_init_itable, "init_itable"}, + {Opt_noinit_itable, "noinit_itable"}, + {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, +@@ -1580,6 +1582,7 @@ static const struct mount_opts { + {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, + {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, + {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, ++ {Opt_no_mbcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, + {Opt_commit, 0, MOPT_GTE0}, + {Opt_max_batch_time, 0, MOPT_GTE0}, + {Opt_min_batch_time, 0, MOPT_GTE0}, +Index: b/fs/ext4/xattr.c +=================================================================== +--- a/fs/ext4/xattr.c ++++ 
b/fs/ext4/xattr.c +@@ -73,7 +73,7 @@ + # define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) + #endif + +-static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); ++static void _ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); + static struct buffer_head *ext4_xattr_cache_find(struct inode *, + struct ext4_xattr_header *, + struct mb_cache_entry **); +@@ -413,7 +413,8 @@ ext4_xattr_block_get(struct inode *inode + error = ext4_xattr_check_block(inode, bh); + if (error) + goto cleanup; +- ext4_xattr_cache_insert(ext4_mb_cache, bh); ++ if (!test_opt(inode->i_sb, NO_MBCACHE)) ++ _ext4_xattr_cache_insert(ext4_mb_cache, bh); + entry = BFIRST(bh); + end = bh->b_data + bh->b_size; + error = xattr_find_entry(inode, &entry, end, name_index, name, 1); +@@ -579,7 +580,8 @@ ext4_xattr_block_list(struct dentry *den + error = ext4_xattr_check_block(inode, bh); + if (error) + goto cleanup; +- ext4_xattr_cache_insert(ext4_mb_cache, bh); ++ if (!test_opt(inode->i_sb, NO_MBCACHE)) ++ _ext4_xattr_cache_insert(ext4_mb_cache, bh); + error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); + + cleanup: +@@ -694,7 +696,9 @@ ext4_xattr_release_block(handle_t *handl + * This must happen under buffer lock for + * ext4_xattr_block_set() to reliably detect freed block + */ +- mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr); ++ if (!test_opt(inode->i_sb, NO_MBCACHE)) ++ mb_cache_entry_delete_block(ext4_mb_cache, ++ hash, bh->b_blocknr); + get_bh(bh); + unlock_buffer(bh); + ext4_free_blocks(handle, inode, bh, 0, 1, +@@ -704,9 +708,10 @@ ext4_xattr_release_block(handle_t *handl + ref--; + BHDR(bh)->h_refcount = cpu_to_le32(ref); + if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { +- struct mb_cache_entry *ce; ++ struct mb_cache_entry *ce = NULL; + +- ce = mb_cache_entry_get(ext4_mb_cache, hash, ++ if (!test_opt(inode->i_sb, NO_MBCACHE)) ++ ce = mb_cache_entry_get(ext4_mb_cache, hash, + bh->b_blocknr); + if (ce) { + 
ce->e_reusable = 1; +@@ -1147,7 +1152,8 @@ ext4_xattr_block_set(handle_t *handle, s + * ext4_xattr_block_set() to reliably detect modified + * block + */ +- mb_cache_entry_delete_block(ext4_mb_cache, hash, ++ if (!test_opt(inode->i_sb, NO_MBCACHE)) ++ mb_cache_entry_delete_block(ext4_mb_cache, hash, + bs->bh->b_blocknr); + ea_bdebug(bs->bh, "modifying in-place"); + error = ext4_xattr_set_entry(i, s, handle, inode); +@@ -1155,8 +1161,9 @@ ext4_xattr_block_set(handle_t *handle, s + if (!IS_LAST_ENTRY(s->first)) + ext4_xattr_rehash(header(s->base), + s->here); +- ext4_xattr_cache_insert(ext4_mb_cache, +- bs->bh); ++ if (!test_opt(inode->i_sb, NO_MBCACHE)) ++ _ext4_xattr_cache_insert(ext4_mb_cache, ++ bs->bh); + } + ext4_xattr_block_csum_set(inode, bs->bh); + unlock_buffer(bs->bh); +@@ -1324,7 +1331,8 @@ getblk_failed: + ext4_xattr_block_csum_set(inode, new_bh); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); +- ext4_xattr_cache_insert(ext4_mb_cache, new_bh); ++ if (!test_opt(inode->i_sb, NO_MBCACHE)) ++ _ext4_xattr_cache_insert(ext4_mb_cache, new_bh); + error = ext4_handle_dirty_metadata(handle, inode, + new_bh); + if (error) +@@ -2127,7 +2135,7 @@ ext4_xattr_inode_array_free(struct inode + * Returns 0, or a negative error number on failure. 
+ */ + static void +-ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) ++_ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) + { + struct ext4_xattr_header *header = BHDR(bh); + __u32 hash = le32_to_cpu(header->h_hash); +@@ -2199,6 +2207,8 @@ ext4_xattr_cache_find(struct inode *inod + struct mb_cache_entry *ce; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + ++ if (test_opt(inode->i_sb, NO_MBCACHE)) ++ return NULL; + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); diff --git a/ldiskfs/kernel_patches/patches/suse15/ext4-dont-check-before-replay.patch b/ldiskfs/kernel_patches/patches/suse15/ext4-dont-check-before-replay.patch new file mode 100644 index 0000000..f7de42a --- /dev/null +++ b/ldiskfs/kernel_patches/patches/suse15/ext4-dont-check-before-replay.patch @@ -0,0 +1,49 @@ +Subject: [PATCH] ext4: don't verify group descriptors until after journal + replay + +When ldiskfs runs in failover mode with read-only disk, it may +lose part of allocation updates and fail while mounting the +filesystem due to group descriptor checks before journal replay. +Don't panic with on-disk checks in read-only mode. 
+ +Seagate-bug-id: MRP-797 +Signed-off-by: Alexey Lyashkov +Signed-off-by: Lokesh Nagappa Jaliminche +Lustre-change: https://review.whamcloud.com/21141 +--- + fs/ext4/super.c | 12 +++++++----- + 1 file changed, 7 insertions(+), 5 deletions(-) + +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 6ea4936..134308f 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -4145,11 +4145,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) + } + } + sbi->s_gdb_count = db_count; +- if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { +- ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); +- ret = -EFSCORRUPTED; +- goto failed_mount2; +- } + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + +@@ -4289,6 +4284,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) + sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; + + no_journal: ++ ++ if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { ++ ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); ++ ret = -EFSCORRUPTED; ++ goto failed_mount_wq; ++ } ++ + sbi->s_mb_cache = ext4_xattr_create_cache(); + if (!sbi->s_mb_cache) { + ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); +-- +2.20.1 + diff --git a/ldiskfs/kernel_patches/patches/suse15/ext4-inode-version.patch b/ldiskfs/kernel_patches/patches/suse15/ext4-inode-version.patch new file mode 100644 index 0000000..38bacb5 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/suse15/ext4-inode-version.patch @@ -0,0 +1,59 @@ +Subject: [PATCH] ext4: cache on-disk inode version in ext4_inode_info + +For use in lustre ldiskfs transaction handling +--- + fs/ext4/ext4.h | 2 ++ + fs/ext4/ialloc.c | 1 + + fs/ext4/inode.c | 4 ++-- + 3 files changed, 5 insertions(+), 2 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 0e2a7f5..271f96d 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1099,6 +1099,8 
@@ struct ext4_inode_info { + struct dquot *i_dquot[MAXQUOTAS]; + #endif + ++ __u64 i_fs_version; ++ + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ + __u32 i_csum_seed; + +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index fe1fee9..4da98da 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -1063,6 +1063,7 @@ got: + ei->i_dtime = 0; + ei->i_block_group = group; + ei->i_last_alloc_group = ~0; ++ ei->i_fs_version = 0; + + ext4_set_inode_flags(inode); + if (IS_DIRSYNC(inode)) +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 6b15a3a..9ab0039 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -4886,7 +4886,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) + ivers |= + (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + } +- inode_set_iversion_queried(inode, ivers); ++ ei->i_fs_version = ivers; + } + + ret = 0; +@@ -5170,7 +5170,7 @@ static int ext4_do_update_inode(handle_t *handle, + } + + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { +- u64 ivers = inode_peek_iversion(inode); ++ u64 ivers = cpu_to_le32(ei->i_fs_version); + + raw_inode->i_disk_version = cpu_to_le32(ivers); + if (ei->i_extra_isize) { +-- +2.20.1 + diff --git a/ldiskfs/kernel_patches/patches/suse15/ext4-large-dir.patch b/ldiskfs/kernel_patches/patches/suse15/ext4-large-dir.patch new file mode 100644 index 0000000..12c1e13 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/suse15/ext4-large-dir.patch @@ -0,0 +1,377 @@ +Subject: [PATCH] ext4: add largedir feature + +This INCOMPAT_LARGEDIR feature allows larger directories to be created +in ldiskfs, both with directory sizes over 2GB and a maximum htree +depth of 3 instead of the current limit of 2. These features are needed +in order to exceed the current limit of approximately 10M entries in a +single directory. + +This patch was originally written by Yang Sheng to support the Lustre server. 
+ +[ Bumped the credits needed to update an indexed directory -- tytso ] + +Signed-off-by: Liang Zhen +Signed-off-by: Yang Sheng +Signed-off-by: Artem Blagodarenko +Signed-off-by: Theodore Ts'o +Reviewed-by: Andreas Dilger +--- + fs/ext4/ext4.h | 23 +++++++-- + fs/ext4/ext4_jbd2.h | 9 +++- + fs/ext4/inode.c | 4 +- + fs/ext4/namei.c | 120 ++++++++++++++++++++++++++++++-------------- + 4 files changed, 111 insertions(+), 45 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 0999eff..ca73d33 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1815,7 +1815,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) + EXT4_FEATURE_INCOMPAT_DIRDATA| \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ +- EXT4_FEATURE_INCOMPAT_CSUM_SEED) ++ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ ++ EXT4_FEATURE_INCOMPAT_LARGEDIR) + #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ +@@ -2200,6 +2201,16 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) + */ + #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) + ++/* htree levels for ext4 */ ++#define EXT4_HTREE_LEVEL_COMPAT 2 ++#define EXT4_HTREE_LEVEL 3 ++ ++static inline int ext4_dir_htree_level(struct super_block *sb) ++{ ++ return ext4_has_feature_largedir(sb) ? ++ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; ++} ++ + /* + * Timeout and state flag for lazy initialization inode thread. 
+ */ +@@ -2848,13 +2859,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); + } + +-static inline loff_t ext4_isize(struct ext4_inode *raw_inode) ++static inline loff_t ext4_isize(struct super_block *sb, ++ struct ext4_inode *raw_inode) + { +- if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) ++ if (ext4_has_feature_largedir(sb) || ++ S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); +- else +- return (loff_t) le32_to_cpu(raw_inode->i_size_lo); ++ ++ return (loff_t) le32_to_cpu(raw_inode->i_size_lo); + } + + static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h +index 4b7cc1a..a0ea2d6 100644 +--- a/fs/ext4/ext4_jbd2.h ++++ b/fs/ext4/ext4_jbd2.h +@@ -77,7 +77,14 @@ + + #define EXT4_RESERVE_TRANS_BLOCKS 12U + +-#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 ++/* ++ * Number of credits needed if we need to insert an entry into a ++ * directory. For each new index block, we need 4 blocks (old index ++ * block, new index block, bitmap block, bg summary). For normal ++ * htree directories there are 2 levels; if the largedir feature ++ * enabled it's 3 levels. 
++ */ ++#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U + + #ifdef CONFIG_QUOTA + /* Amount of blocks needed for quota update - we know that the structure was +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 462988c..19f38c4 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -4817,7 +4817,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) + if (ext4_has_feature_64bit(sb)) + ei->i_file_acl |= + ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; +- inode->i_size = ext4_isize(raw_inode); ++ inode->i_size = ext4_isize(sb, raw_inode); + if ((size = i_size_read(inode)) < 0) { + EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); + ret = -EFSCORRUPTED; +@@ -5145,7 +5145,7 @@ static int ext4_do_update_inode(handle_t *handle, + raw_inode->i_file_acl_high = + cpu_to_le16(ei->i_file_acl >> 32); + raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); +- if (ei->i_disksize != ext4_isize(raw_inode)) { ++ if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { + ext4_isize_set(raw_inode, ei->i_disksize); + need_datasync = 1; + } +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index c585762..e7fb642 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -520,7 +520,7 @@ struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de) + + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) + { +- return le32_to_cpu(entry->block) & 0x00ffffff; ++ return le32_to_cpu(entry->block) & 0x0fffffff; + } + + static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) +@@ -752,6 +752,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); + u32 hash; + ++ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); + frame->bh = ext4_read_dirblock(dir, 0, INDEX); + if (IS_ERR(frame->bh)) + return (struct dx_frame *) frame->bh; +@@ -781,9 +782,15 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + } + + indirect = info->indirect_levels; +- if (indirect 
> 1) { +- ext4_warning_inode(dir, "Unimplemented hash depth: %#06x", +- info->indirect_levels); ++ if (indirect >= ext4_dir_htree_level(dir->i_sb)) { ++ ext4_warning(dir->i_sb, ++ "Directory (ino: %lu) htree depth %#06x exceed" ++ "supported value", dir->i_ino, ++ ext4_dir_htree_level(dir->i_sb)); ++ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { ++ ext4_warning(dir->i_sb, "Enable large directory " ++ "feature to access it"); ++ } + goto fail; + } + +@@ -874,12 +881,20 @@ fail: + + static void dx_release(struct dx_frame *frames) + { ++ struct dx_root_info *info; ++ int i; ++ + if (frames[0].bh == NULL) + return; + +- if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) +- brelse(frames[1].bh); +- brelse(frames[0].bh); ++ for (i = 0, info = dx_get_dx_info((struct ext4_dir_entry_2 *)frames[0].bh->b_data); ++ i <= info->indirect_levels; ++ i++) { ++ if (frames[i].bh == NULL) ++ break; ++ brelse(frames[i].bh); ++ frames[i].bh = NULL; ++ } + } + + /* +@@ -1065,7 +1080,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + { + struct dx_hash_info hinfo; + struct ext4_dir_entry_2 *de; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct inode *dir; + ext4_lblk_t block; + int count = 0; +@@ -1505,7 +1520,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_dir_entry_2 **res_dir) + { + struct super_block * sb = dir->i_sb; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct buffer_head *bh; + ext4_lblk_t block; + int retval; +@@ -1985,7 +2000,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + struct inode *inode, struct buffer_head *bh) + { + struct buffer_head *bh2; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries; + struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de; + struct ext4_dir_entry_tail *t; +@@ 
-2295,13 +2310,16 @@ out: + static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + struct inode *dir, struct inode *inode) + { +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries, *at; + struct buffer_head *bh; + struct super_block *sb = dir->i_sb; + struct ext4_dir_entry_2 *de; ++ int restart; + int err; + ++again: ++ restart = 0; + frame = dx_probe(fname, dir, NULL, frames); + if (IS_ERR(frame)) + return PTR_ERR(frame); +@@ -2323,24 +2341,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + if (err != -ENOSPC) + goto cleanup; + ++ err = 0; + /* Block full, should compress but for now just split */ + dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + /* Need to split index? */ + if (dx_get_count(entries) == dx_get_limit(entries)) { + ext4_lblk_t newblock; +- unsigned icount = dx_get_count(entries); +- int levels = frame - frames; ++ int levels = frame - frames + 1; ++ unsigned int icount; ++ int add_level = 1; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + +- if (levels && (dx_get_count(frames->entries) == +- dx_get_limit(frames->entries))) { +- ext4_warning_inode(dir, "Directory index full!"); ++ while (frame > frames) { ++ if (dx_get_count((frame - 1)->entries) < ++ dx_get_limit((frame - 1)->entries)) { ++ add_level = 0; ++ break; ++ } ++ frame--; /* split higher index block */ ++ at = frame->at; ++ entries = frame->entries; ++ restart = 1; ++ } ++ if (add_level && levels == ext4_dir_htree_level(sb)) { ++ ext4_warning(sb, "Directory (ino: %lu) index full, " ++ "reach max htree level :%d", ++ dir->i_ino, levels); ++ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { ++ ext4_warning(sb, "Large directory feature is " ++ "not enabled on this " ++ "filesystem"); ++ } + err = -ENOSPC; + goto cleanup; + } ++ icount = dx_get_count(entries); + bh2 = 
ext4_append(handle, dir, &newblock); + if (IS_ERR(bh2)) { + err = PTR_ERR(bh2); +@@ -2355,7 +2393,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + err = ext4_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; +- if (levels) { ++ if (!add_level) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", +@@ -2363,7 +2401,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + err = ext4_journal_get_write_access(handle, +- frames[0].bh); ++ (frame - 1)->bh); + if (err) + goto journal_error; + +@@ -2379,18 +2417,26 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + frame->entries = entries = entries2; + swap(frame->bh, bh2); + } +- dx_insert_block(frames + 0, hash2, newblock); +- dxtrace(dx_show_index("node", frames[1].entries)); ++ dx_insert_block((frame - 1), hash2, newblock); ++ dxtrace(dx_show_index("node", frame->entries)); + dxtrace(dx_show_index("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext4_handle_dirty_dx_node(handle, dir, bh2); + if (err) + goto journal_error; + brelse (bh2); ++ err = ext4_handle_dirty_dx_node(handle, dir, ++ (frame - 1)->bh); ++ if (err) ++ goto journal_error; ++ if (restart) { ++ err = ext4_handle_dirty_dx_node(handle, dir, ++ frame->bh); ++ goto journal_error; ++ } + } else { + struct dx_root_info *info; +- dxtrace(printk(KERN_DEBUG +- "Creating second level index...\n")); ++ + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); +@@ -2400,22 +2446,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + dx_set_block(entries + 0, newblock); + info = dx_get_dx_info((struct ext4_dir_entry_2 *) + frames[0].bh->b_data); +- 
info->indirect_levels = 1; +- +- /* Add new access path frame */ +- frame = frames + 1; +- frame->at = at = at - entries + entries2; +- frame->entries = entries = entries2; +- frame->bh = bh2; +- err = ext4_journal_get_write_access(handle, +- frame->bh); ++ info->indirect_levels += 1; ++ dxtrace(printk(KERN_DEBUG ++ "Creating %d level index...\n", ++ info->indirect_levels)); ++ err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); + if (err) + goto journal_error; +- } +- err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); +- if (err) { +- ext4_std_error(inode->i_sb, err); +- goto cleanup; ++ err = ext4_handle_dirty_dx_node(handle, dir, bh2); ++ brelse(bh2); ++ restart = 1; ++ goto journal_error; + } + } + de = do_split(handle, dir, &bh, frame, &fname->hinfo); +@@ -2427,10 +2468,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + goto cleanup; + + journal_error: +- ext4_std_error(dir->i_sb, err); ++ ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ + cleanup: + brelse(bh); + dx_release(frames); ++ /* @restart is true means htree-path has been changed, we need to ++ * repeat dx_probe() to find out valid htree-path ++ */ ++ if (restart && err == 0) ++ goto again; + return err; + } + +-- +2.20.1 + diff --git a/ldiskfs/kernel_patches/patches/suse15/ext4-large-eas.patch b/ldiskfs/kernel_patches/patches/suse15/ext4-large-eas.patch new file mode 100644 index 0000000..3dba254 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/suse15/ext4-large-eas.patch @@ -0,0 +1,1116 @@ +Subject: [PATCH] ext4: xattr-in-inode support + +Large xattr support is implemented for EXT4_FEATURE_INCOMPAT_EA_INODE. + +If the size of an xattr value is larger than will fit in a single +external block, then the xattr value will be saved into the body +of an external xattr inode. + +This also helps support a larger number of xattrs, since only the headers +will be stored in the in-inode space or the single external block. 
+ +The inode is referenced from the xattr header via "e_value_inum", +which was formerly "e_value_block", but that field was never used. +The e_value_size still contains the xattr size so that listing +xattrs does not need to look up the inode if the data is not accessed. + +struct ext4_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ + __le32 e_value_inum; /* inode in which value is stored */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +}; + +The xattr inode is marked with the EXT4_EA_INODE_FL flag and also +holds a back-reference to the owning inode in its i_mtime field, +allowing the ext4/e2fsck to verify the correct inode is accessed. + +[ Applied fix by Dan Carpenter to avoid freeing an ERR_PTR. ] + +Lustre-Jira: https://jira.hpdd.intel.com/browse/LU-80 +Lustre-bugzilla: https://bugzilla.lustre.org/show_bug.cgi?id=4424 +Signed-off-by: Kalpak Shah +Signed-off-by: James Simmons +Signed-off-by: Andreas Dilger +Signed-off-by: Tahsin Erdogan +Signed-off-by: Theodore Ts'o +Signed-off-by: Dan Carpenter +--- + fs/ext4/ext4.h | 12 + + fs/ext4/ialloc.c | 1 - + fs/ext4/inline.c | 2 +- + fs/ext4/inode.c | 49 ++++- + fs/ext4/xattr.c | 561 +++++++++++++++++++++++++++++++++++++++++++---- + fs/ext4/xattr.h | 33 ++- + 6 files changed, 603 insertions(+), 55 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index ca73d33..6796a8b 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1811,6 +1811,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ ++ EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_DIRDATA| \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ +@@ -2304,6 +2305,12 @@ struct mmpd_data { + */ + #define 
EXT4_MMP_MAX_CHECK_INTERVAL 300UL + ++/* ++ * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb ++ * This limit is arbitrary, but is reasonable for the xattr API. ++ */ ++#define EXT4_XATTR_MAX_LARGE_EA_SIZE (1024 * 1024) ++ + /* + * Function prototypes + */ +@@ -2316,6 +2323,10 @@ struct mmpd_data { + # define ATTRIB_NORET __attribute__((noreturn)) + # define NORET_AND noreturn, + ++struct ext4_xattr_ino_array { ++ unsigned int xia_count; /* # of used item in the array */ ++ unsigned int xia_inodes[0]; ++}; + /* bitmap.c */ + extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); + void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, +@@ -2572,6 +2583,7 @@ extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); + extern void ext4_set_inode_flags(struct inode *); + extern int ext4_alloc_da_blocks(struct inode *inode); + extern void ext4_set_aops(struct inode *inode); ++extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk); + extern int ext4_writepage_trans_blocks(struct inode *); + extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); + extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index 7f9b8a5..0f284fb 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -273,7 +273,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) + * as writing the quota to disk may need the lock as well. + */ + dquot_initialize(inode); +- ext4_xattr_delete_inode(handle, inode); + dquot_free_inode(inode); + dquot_drop(inode); + +diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c +index b27a736..69e46f9 100644 +--- a/fs/ext4/inline.c ++++ b/fs/ext4/inline.c +@@ -62,7 +62,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode, + + /* Compute min_offs. 
*/ + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { +- if (!entry->e_value_block && entry->e_value_size) { ++ if (!entry->e_value_inum && entry->e_value_size) { + size_t offs = le16_to_cpu(entry->e_value_offs); + if (offs < min_offs) + min_offs = offs; +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 19f38c4..c701c45 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -140,8 +140,6 @@ static void ext4_invalidatepage(struct page *page, unsigned int offset, + unsigned int length); + static int __ext4_journalled_writepage(struct page *page, unsigned int len); + static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); +-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, +- int pextents); + + /* + * Test whether an inode is a fast symlink. +@@ -190,6 +188,8 @@ void ext4_evict_inode(struct inode *inode) + { + handle_t *handle; + int err; ++ int extra_credits = 3; ++ struct ext4_xattr_ino_array *lea_ino_array = NULL; + + trace_ext4_evict_inode(inode); + +@@ -240,8 +240,8 @@ void ext4_evict_inode(struct inode *inode) + * protection against it + */ + sb_start_intwrite(inode->i_sb); +- handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, +- ext4_blocks_for_truncate(inode)+3); ++ ++ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits); + if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); + /* +@@ -253,9 +253,36 @@ void ext4_evict_inode(struct inode *inode) + sb_end_intwrite(inode->i_sb); + goto no_delete; + } +- + if (IS_SYNC(inode)) + ext4_handle_sync(handle); ++ ++ /* ++ * Delete xattr inode before deleting the main inode. 
++ */ ++ err = ext4_xattr_delete_inode(handle, inode, &lea_ino_array); ++ if (err) { ++ ext4_warning(inode->i_sb, ++ "couldn't delete inode's xattr (err %d)", err); ++ goto stop_handle; ++ } ++ ++ if (!IS_NOQUOTA(inode)) ++ extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb); ++ ++ if (!ext4_handle_has_enough_credits(handle, ++ ext4_blocks_for_truncate(inode) + extra_credits)) { ++ err = ext4_journal_extend(handle, ++ ext4_blocks_for_truncate(inode) + extra_credits); ++ if (err > 0) ++ err = ext4_journal_restart(handle, ++ ext4_blocks_for_truncate(inode) + extra_credits); ++ if (err != 0) { ++ ext4_warning(inode->i_sb, ++ "couldn't extend journal (err %d)", err); ++ goto stop_handle; ++ } ++ } ++ + inode->i_size = 0; + err = ext4_mark_inode_dirty(handle, inode); + if (err) { +@@ -279,10 +306,10 @@ void ext4_evict_inode(struct inode *inode) + * enough credits left in the handle to remove the inode from + * the orphan list and set the dtime field. + */ +- if (!ext4_handle_has_enough_credits(handle, 3)) { +- err = ext4_journal_extend(handle, 3); ++ if (!ext4_handle_has_enough_credits(handle, extra_credits)) { ++ err = ext4_journal_extend(handle, extra_credits); + if (err > 0) +- err = ext4_journal_restart(handle, 3); ++ err = ext4_journal_restart(handle, extra_credits); + if (err != 0) { + ext4_warning(inode->i_sb, + "couldn't extend journal (err %d)", err); +@@ -317,8 +344,12 @@ void ext4_evict_inode(struct inode *inode) + ext4_clear_inode(inode); + else + ext4_free_inode(handle, inode); ++ + ext4_journal_stop(handle); + sb_end_intwrite(inode->i_sb); ++ ++ if (lea_ino_array != NULL) ++ ext4_xattr_inode_array_free(inode, lea_ino_array); + return; + no_delete: + ext4_clear_inode(inode); /* We must guarantee clearing of inode... 
*/ +@@ -5624,7 +5655,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks, + * + * Also account for superblock, inode, quota and xattr blocks + */ +-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, ++int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents) + { + ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index 45accaa..8f16071 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -180,9 +180,8 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, + + /* Check the values */ + while (!IS_LAST_ENTRY(entry)) { +- if (entry->e_value_block != 0) +- return -EFSCORRUPTED; +- if (entry->e_value_size != 0) { ++ if (entry->e_value_size != 0 && ++ entry->e_value_inum == 0) { + u16 offs = le16_to_cpu(entry->e_value_offs); + u32 size = le32_to_cpu(entry->e_value_size); + void *value; +@@ -287,6 +286,99 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, + return cmp ? -ENODATA : 0; + } + ++/* ++ * Read the EA value from an inode. ++ */ ++static int ++ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t *size) ++{ ++ unsigned long block = 0; ++ struct buffer_head *bh = NULL; ++ int blocksize; ++ size_t csize, ret_size = 0; ++ ++ if (*size == 0) ++ return 0; ++ ++ blocksize = ea_inode->i_sb->s_blocksize; ++ ++ while (ret_size < *size) { ++ csize = (*size - ret_size) > blocksize ? 
blocksize : ++ *size - ret_size; ++ bh = ext4_bread(NULL, ea_inode, block, 0); ++ if (IS_ERR(bh)) { ++ *size = ret_size; ++ return PTR_ERR(bh); ++ } ++ memcpy(buf, bh->b_data, csize); ++ brelse(bh); ++ ++ buf += csize; ++ block += 1; ++ ret_size += csize; ++ } ++ ++ *size = ret_size; ++ ++ return 0; ++} ++ ++struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, int *err) ++{ ++ struct inode *ea_inode = NULL; ++ ++ ea_inode = ext4_iget(parent->i_sb, ea_ino); ++ if (IS_ERR(ea_inode) || is_bad_inode(ea_inode)) { ++ int rc = IS_ERR(ea_inode) ? PTR_ERR(ea_inode) : 0; ++ ext4_error(parent->i_sb, "error while reading EA inode %lu " ++ "/ %d %d", ea_ino, rc, is_bad_inode(ea_inode)); ++ *err = rc != 0 ? rc : -EIO; ++ return NULL; ++ } ++ ++ if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != parent->i_ino || ++ ea_inode->i_generation != parent->i_generation) { ++ ext4_error(parent->i_sb, "Backpointer from EA inode %lu " ++ "to parent invalid.", ea_ino); ++ *err = -EINVAL; ++ goto error; ++ } ++ ++ if (!(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL)) { ++ ext4_error(parent->i_sb, "EA inode %lu does not have " ++ "EXT4_EA_INODE_FL flag set.\n", ea_ino); ++ *err = -EINVAL; ++ goto error; ++ } ++ ++ *err = 0; ++ return ea_inode; ++ ++error: ++ iput(ea_inode); ++ return NULL; ++} ++ ++/* ++ * Read the value from the EA inode. 
++ */ ++static int ++ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer, ++ size_t *size) ++{ ++ struct inode *ea_inode = NULL; ++ int err; ++ ++ ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err); ++ if (err) ++ return err; ++ ++ err = ext4_xattr_inode_read(ea_inode, buffer, size); ++ iput(ea_inode); ++ ++ return err; ++} ++ + static int + ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +@@ -325,8 +417,16 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, + error = -ERANGE; + if (size > buffer_size) + goto cleanup; +- memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), +- size); ++ if (entry->e_value_inum) { ++ error = ext4_xattr_inode_get(inode, ++ le32_to_cpu(entry->e_value_inum), ++ buffer, &size); ++ if (error) ++ goto cleanup; ++ } else { ++ memcpy(buffer, bh->b_data + ++ le16_to_cpu(entry->e_value_offs), size); ++ } + } + error = size; + +@@ -367,8 +467,16 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, + error = -ERANGE; + if (size > buffer_size) + goto cleanup; +- memcpy(buffer, (void *)IFIRST(header) + +- le16_to_cpu(entry->e_value_offs), size); ++ if (entry->e_value_inum) { ++ error = ext4_xattr_inode_get(inode, ++ le32_to_cpu(entry->e_value_inum), ++ buffer, &size); ++ if (error) ++ goto cleanup; ++ } else { ++ memcpy(buffer, (void *)IFIRST(header) + ++ le16_to_cpu(entry->e_value_offs), size); ++ } + } + error = size; + +@@ -634,7 +742,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, + size_t *min_offs, void *base, int *total) + { + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { +- if (last->e_value_size) { ++ if (!last->e_value_inum && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < *min_offs) + *min_offs = offs; +@@ -645,11 +753,166 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, + return (*min_offs 
- ((void *)last - base) - sizeof(__u32)); + } + +-static int +-ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) ++/* ++ * Write the value of the EA in an inode. ++ */ ++static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, ++ const void *buf, int bufsize) ++{ ++ struct buffer_head *bh = NULL; ++ unsigned long block = 0; ++ unsigned blocksize = ea_inode->i_sb->s_blocksize; ++ unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; ++ int csize, wsize = 0; ++ int ret = 0; ++ int retries = 0; ++ ++retry: ++ while (ret >= 0 && ret < max_blocks) { ++ struct ext4_map_blocks map; ++ map.m_lblk = block += ret; ++ map.m_len = max_blocks -= ret; ++ ++ ret = ext4_map_blocks(handle, ea_inode, &map, ++ EXT4_GET_BLOCKS_CREATE); ++ if (ret <= 0) { ++ ext4_mark_inode_dirty(handle, ea_inode); ++ if (ret == -ENOSPC && ++ ext4_should_retry_alloc(ea_inode->i_sb, &retries)) { ++ ret = 0; ++ goto retry; ++ } ++ break; ++ } ++ } ++ ++ if (ret < 0) ++ return ret; ++ ++ block = 0; ++ while (wsize < bufsize) { ++ if (bh != NULL) ++ brelse(bh); ++ csize = (bufsize - wsize) > blocksize ? blocksize : ++ bufsize - wsize; ++ bh = ext4_getblk(handle, ea_inode, block, 0); ++ if (IS_ERR(bh)) ++ return PTR_ERR(bh); ++ ret = ext4_journal_get_write_access(handle, bh); ++ if (ret) ++ goto out; ++ ++ memcpy(bh->b_data, buf, csize); ++ set_buffer_uptodate(bh); ++ ext4_handle_dirty_metadata(handle, ea_inode, bh); ++ ++ buf += csize; ++ wsize += csize; ++ block += 1; ++ } ++ ++ inode_lock(ea_inode); ++ i_size_write(ea_inode, wsize); ++ ext4_update_i_disksize(ea_inode, wsize); ++ inode_unlock(ea_inode); ++ ++ ext4_mark_inode_dirty(handle, ea_inode); ++ ++out: ++ brelse(bh); ++ ++ return ret; ++} ++ ++/* ++ * Create an inode to store the value of a large EA. 
++ */ ++static struct inode *ext4_xattr_inode_create(handle_t *handle, ++ struct inode *inode) ++{ ++ struct inode *ea_inode = NULL; ++ ++ /* ++ * Let the next inode be the goal, so we try and allocate the EA inode ++ * in the same group, or nearby one. ++ */ ++ ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, ++ S_IFREG | 0600, NULL, inode->i_ino + 1, NULL); ++ if (!IS_ERR(ea_inode)) { ++ ea_inode->i_op = &ext4_file_inode_operations; ++ ea_inode->i_fop = &ext4_file_operations; ++ ext4_set_aops(ea_inode); ++ ea_inode->i_generation = inode->i_generation; ++ EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL; ++ ++ /* ++ * A back-pointer from EA inode to parent inode will be useful ++ * for e2fsck. ++ */ ++ EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino); ++ unlock_new_inode(ea_inode); ++ } ++ ++ return ea_inode; ++} ++ ++/* ++ * Unlink the inode storing the value of the EA. ++ */ ++int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino) ++{ ++ struct inode *ea_inode = NULL; ++ int err; ++ ++ ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err); ++ if (err) ++ return err; ++ ++ clear_nlink(ea_inode); ++ iput(ea_inode); ++ ++ return 0; ++} ++ ++/* ++ * Add value of the EA in an inode. 
++ */ ++static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode, ++ unsigned long *ea_ino, const void *value, ++ size_t value_len) ++{ ++ struct inode *ea_inode; ++ int err; ++ ++ /* Create an inode for the EA value */ ++ ea_inode = ext4_xattr_inode_create(handle, inode); ++ if (IS_ERR(ea_inode)) ++ return PTR_ERR(ea_inode); ++ ++ err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); ++ if (err) ++ clear_nlink(ea_inode); ++ else ++ *ea_ino = ea_inode->i_ino; ++ ++ iput(ea_inode); ++ ++ return err; ++} ++ ++static int ext4_xattr_set_entry(struct ext4_xattr_info *i, ++ struct ext4_xattr_search *s, ++ handle_t *handle, struct inode *inode) + { + struct ext4_xattr_entry *last, *next; + size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); ++ int in_inode = i->in_inode; ++ int rc = 0; ++ ++ if (ext4_has_feature_ea_inode(inode->i_sb) && ++ (EXT4_XATTR_SIZE(i->value_len) > ++ EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) ++ in_inode = 1; + + /* Compute min_offs and last. 
*/ + last = s->first; +@@ -657,7 +920,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) + next = EXT4_XATTR_NEXT(last); + if ((void *)next >= s->end) + return -EFSCORRUPTED; +- if (last->e_value_size) { ++ if (!last->e_value_inum && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; +@@ -665,15 +928,20 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) + } + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) { +- if (s->here->e_value_size) { ++ if (!in_inode && ++ !s->here->e_value_inum && s->here->e_value_size) { + size_t size = le32_to_cpu(s->here->e_value_size); + free += EXT4_XATTR_SIZE(size); + } + free += EXT4_XATTR_LEN(name_len); + } + if (i->value) { +- if (free < EXT4_XATTR_LEN(name_len) + +- EXT4_XATTR_SIZE(i->value_len)) ++ size_t value_len = EXT4_XATTR_SIZE(i->value_len); ++ ++ if (in_inode) ++ value_len = 0; ++ ++ if (free < EXT4_XATTR_LEN(name_len) + value_len) + return -ENOSPC; + } + +@@ -687,7 +955,8 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) + s->here->e_name_len = name_len; + memcpy(s->here->e_name, i->name, name_len); + } else { +- if (s->here->e_value_size) { ++ if (!s->here->e_value_inum && s->here->e_value_size && ++ s->here->e_value_offs > 0) { + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(s->here->e_value_offs); + void *val = s->base + offs; +@@ -721,12 +990,18 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); +- if (last->e_value_size && o < offs) ++ if (!last->e_value_inum && ++ last->e_value_size && o < offs) + last->e_value_offs = + cpu_to_le16(o + size); + last = EXT4_XATTR_NEXT(last); + } + } ++ if (s->here->e_value_inum) { ++ ext4_xattr_inode_unlink(inode, ++ le32_to_cpu(s->here->e_value_inum)); ++ 
s->here->e_value_inum = 0; ++ } + if (!i->value) { + /* Remove the old name. */ + size_t size = EXT4_XATTR_LEN(name_len); +@@ -739,11 +1014,20 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) + + if (i->value) { + /* Insert the new value. */ +- s->here->e_value_size = cpu_to_le32(i->value_len); +- if (i->value_len) { ++ if (in_inode) { ++ unsigned long ea_ino = ++ le32_to_cpu(s->here->e_value_inum); ++ rc = ext4_xattr_inode_set(handle, inode, &ea_ino, ++ i->value, i->value_len); ++ if (rc) ++ goto out; ++ s->here->e_value_inum = cpu_to_le32(ea_ino); ++ s->here->e_value_offs = 0; ++ } else if (i->value_len) { + size_t size = EXT4_XATTR_SIZE(i->value_len); + void *val = s->base + min_offs - size; + s->here->e_value_offs = cpu_to_le16(min_offs - size); ++ s->here->e_value_inum = 0; + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, size); + } else { +@@ -753,8 +1037,11 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) + memcpy(val, i->value, i->value_len); + } + } ++ s->here->e_value_size = cpu_to_le32(i->value_len); + } +- return 0; ++ ++out: ++ return rc; + } + + struct ext4_xattr_block_find { +@@ -815,8 +1102,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + + #define header(x) ((struct ext4_xattr_header *)(x)) + +- if (i->value && i->value_len > sb->s_blocksize) +- return -ENOSPC; + if (s->base) { + BUFFER_TRACE(bs->bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, bs->bh); +@@ -835,7 +1120,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + mb_cache_entry_delete_block(ext4_mb_cache, hash, + bs->bh->b_blocknr); + ea_bdebug(bs->bh, "modifying in-place"); +- error = ext4_xattr_set_entry(i, s); ++ error = ext4_xattr_set_entry(i, s, handle, inode); + if (!error) { + if (!IS_LAST_ENTRY(s->first)) + ext4_xattr_rehash(header(s->base), +@@ -884,7 +1169,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + s->end = s->base + 
sb->s_blocksize; + } + +- error = ext4_xattr_set_entry(i, s); ++ error = ext4_xattr_set_entry(i, s, handle, inode); + if (error == -EFSCORRUPTED) + goto bad_block; + if (error) +@@ -1084,7 +1369,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, + + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; +- error = ext4_xattr_set_entry(i, s); ++ error = ext4_xattr_set_entry(i, s, handle, inode); + if (error) + return error; + header = IHDR(inode, ext4_raw_inode(&is->iloc)); +@@ -1098,7 +1383,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, + return 0; + } + +-static int ext4_xattr_ibody_set(struct inode *inode, ++static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) + { +@@ -1108,7 +1393,7 @@ static int ext4_xattr_ibody_set(struct inode *inode, + + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; +- error = ext4_xattr_set_entry(i, s); ++ error = ext4_xattr_set_entry(i, s, handle, inode); + if (error) + return error; + header = IHDR(inode, ext4_raw_inode(&is->iloc)); +@@ -1155,7 +1440,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + .name = name, + .value = value, + .value_len = value_len, +- ++ .in_inode = 0, + }; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, +@@ -1204,7 +1489,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + } + if (!value) { + if (!is.s.not_found) +- error = ext4_xattr_ibody_set(inode, &i, &is); ++ error = ext4_xattr_ibody_set(handle, inode, &i, &is); + else if (!bs.s.not_found) + error = ext4_xattr_block_set(handle, inode, &i, &bs); + } else { +@@ -1215,7 +1500,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) + goto cleanup; + +- error = ext4_xattr_ibody_set(inode, &i, &is); ++ error = ext4_xattr_ibody_set(handle, inode, 
&i, &is); + if (!error && !bs.s.not_found) { + i.value = NULL; + error = ext4_xattr_block_set(handle, inode, &i, &bs); +@@ -1228,11 +1513,20 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + goto cleanup; + } + error = ext4_xattr_block_set(handle, inode, &i, &bs); ++ if (ext4_has_feature_ea_inode(inode->i_sb) && ++ error == -ENOSPC) { ++ /* xattr not fit to block, store at external ++ * inode */ ++ i.in_inode = 1; ++ error = ext4_xattr_ibody_set(handle, inode, ++ &i, &is); ++ } + if (error) + goto cleanup; + if (!is.s.not_found) { + i.value = NULL; +- error = ext4_xattr_ibody_set(inode, &i, &is); ++ error = ext4_xattr_ibody_set(handle, inode, &i, ++ &is); + } + } + } +@@ -1271,12 +1565,26 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len, int flags) + { + handle_t *handle; ++ struct super_block *sb = inode->i_sb; + int error, retries = 0; + int credits = ext4_jbd2_credits_xattr(inode); + + error = dquot_initialize(inode); + if (error) + return error; ++ ++ if ((value_len >= EXT4_XATTR_MIN_LARGE_EA_SIZE(sb->s_blocksize)) && ++ ext4_has_feature_ea_inode(sb)) { ++ int nrblocks = (value_len + sb->s_blocksize - 1) >> ++ sb->s_blocksize_bits; ++ ++ /* For new inode */ ++ credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3; ++ ++ /* For data blocks of EA inode */ ++ credits += ext4_meta_trans_blocks(inode, nrblocks, 0); ++ } ++ + retry: + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); + if (IS_ERR(handle)) { +@@ -1288,7 +1596,7 @@ retry: + value, value_len, flags); + error2 = ext4_journal_stop(handle); + if (error == -ENOSPC && +- ext4_should_retry_alloc(inode->i_sb, &retries)) ++ ext4_should_retry_alloc(sb, &retries)) + goto retry; + if (error == 0) + error = error2; +@@ -1313,7 +1621,7 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, + + /* Adjust the value offsets of the entries */ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { +- 
if (last->e_value_size) { ++ if (!last->e_value_inum && last->e_value_size) { + new_offs = le16_to_cpu(last->e_value_offs) + + value_offs_shift; + last->e_value_offs = cpu_to_le16(new_offs); +@@ -1374,7 +1682,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, + goto out; + + /* Remove the chosen entry from the inode */ +- error = ext4_xattr_ibody_set(inode, &i, is); ++ error = ext4_xattr_ibody_set(handle, inode, &i, is); + if (error) + goto out; + +@@ -1578,21 +1886,135 @@ cleanup: + } + + ++#define EIA_INCR 16 /* must be 2^n */ ++#define EIA_MASK (EIA_INCR - 1) ++/* Add the large xattr @ino into @lea_ino_array for later deletion. ++ * If @lea_ino_array is new or full it will be grown and the old ++ * contents copied over. ++ */ ++static int ++ext4_expand_ino_array(struct ext4_xattr_ino_array **lea_ino_array, __u32 ino) ++{ ++ if (*lea_ino_array == NULL) { ++ /* ++ * Start with 15 inodes, so it fits into a power-of-two size. ++ * If *lea_ino_array is NULL, this is essentially offsetof() ++ */ ++ (*lea_ino_array) = ++ kmalloc(offsetof(struct ext4_xattr_ino_array, ++ xia_inodes[EIA_MASK]), ++ GFP_NOFS); ++ if (*lea_ino_array == NULL) ++ return -ENOMEM; ++ (*lea_ino_array)->xia_count = 0; ++ } else if (((*lea_ino_array)->xia_count & EIA_MASK) == EIA_MASK) { ++ /* expand the array once all 15 + n * 16 slots are full */ ++ struct ext4_xattr_ino_array *new_array = NULL; ++ int count = (*lea_ino_array)->xia_count; ++ ++ /* if new_array is NULL, this is essentially offsetof() */ ++ new_array = kmalloc( ++ offsetof(struct ext4_xattr_ino_array, ++ xia_inodes[count + EIA_INCR]), ++ GFP_NOFS); ++ if (new_array == NULL) ++ return -ENOMEM; ++ memcpy(new_array, *lea_ino_array, ++ offsetof(struct ext4_xattr_ino_array, ++ xia_inodes[count])); ++ kfree(*lea_ino_array); ++ *lea_ino_array = new_array; ++ } ++ (*lea_ino_array)->xia_inodes[(*lea_ino_array)->xia_count++] = ino; ++ return 0; ++} ++ ++/** ++ * Add xattr inode to orphan list ++ */ ++static int 
++ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode, ++ int credits, struct ext4_xattr_ino_array *lea_ino_array) ++{ ++ struct inode *ea_inode = NULL; ++ int idx = 0, error = 0; ++ ++ if (lea_ino_array == NULL) ++ return 0; ++ ++ for (; idx < lea_ino_array->xia_count; ++idx) { ++ if (!ext4_handle_has_enough_credits(handle, credits)) { ++ error = ext4_journal_extend(handle, credits); ++ if (error > 0) ++ error = ext4_journal_restart(handle, credits); ++ ++ if (error != 0) { ++ ext4_warning(inode->i_sb, ++ "couldn't extend journal " ++ "(err %d)", error); ++ return error; ++ } ++ } ++ ea_inode = ext4_xattr_inode_iget(inode, ++ lea_ino_array->xia_inodes[idx], &error); ++ if (error) ++ continue; ++ ext4_orphan_add(handle, ea_inode); ++ /* the inode's i_count will be released by caller */ ++ } ++ ++ return 0; ++} + + /* + * ext4_xattr_delete_inode() + * +- * Free extended attribute resources associated with this inode. This ++ * Free extended attribute resources associated with this inode. Traverse ++ * all entries and unlink any xattr inodes associated with this inode. This + * is called immediately before an inode is freed. We have exclusive +- * access to the inode. ++ * access to the inode. If an orphan inode is deleted it will also delete any ++ * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget() ++ * to ensure they belong to the parent inode and were not deleted already. 
+ */ +-void +-ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) ++int ++ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, ++ struct ext4_xattr_ino_array **lea_ino_array) + { + struct buffer_head *bh = NULL; ++ struct ext4_xattr_ibody_header *header; ++ struct ext4_inode *raw_inode; ++ struct ext4_iloc iloc; ++ struct ext4_xattr_entry *entry; ++ int credits = 3, error = 0; + +- if (!EXT4_I(inode)->i_file_acl) ++ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) ++ goto delete_external_ea; ++ ++ error = ext4_get_inode_loc(inode, &iloc); ++ if (error) ++ goto cleanup; ++ raw_inode = ext4_raw_inode(&iloc); ++ header = IHDR(inode, raw_inode); ++ for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); ++ entry = EXT4_XATTR_NEXT(entry)) { ++ if (!entry->e_value_inum) ++ continue; ++ if (ext4_expand_ino_array(lea_ino_array, ++ entry->e_value_inum) != 0) { ++ brelse(iloc.bh); ++ goto cleanup; ++ } ++ entry->e_value_inum = 0; ++ } ++ brelse(iloc.bh); ++ ++delete_external_ea: ++ if (!EXT4_I(inode)->i_file_acl) { ++ /* add xattr inode to orphan list */ ++ ext4_xattr_inode_orphan_add(handle, inode, credits, ++ *lea_ino_array); + goto cleanup; ++ } + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) { + EXT4_ERROR_INODE(inode, "block %llu read error", +@@ -1605,11 +2027,69 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) + EXT4_I(inode)->i_file_acl); + goto cleanup; + } ++ ++ for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT4_XATTR_NEXT(entry)) { ++ if (!entry->e_value_inum) ++ continue; ++ if (ext4_expand_ino_array(lea_ino_array, ++ entry->e_value_inum) != 0) ++ goto cleanup; ++ entry->e_value_inum = 0; ++ } ++ ++ /* add xattr inode to orphan list */ ++ error = ext4_xattr_inode_orphan_add(handle, inode, credits, ++ *lea_ino_array); ++ if (error != 0) ++ goto cleanup; ++ ++ if (!IS_NOQUOTA(inode)) ++ credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb); ++ ++ if (!ext4_handle_has_enough_credits(handle, credits)) 
{ ++ error = ext4_journal_extend(handle, credits); ++ if (error > 0) ++ error = ext4_journal_restart(handle, credits); ++ if (error != 0) { ++ ext4_warning(inode->i_sb, ++ "couldn't extend journal (err %d)", error); ++ goto cleanup; ++ } ++ } ++ + ext4_xattr_release_block(handle, inode, bh); + EXT4_I(inode)->i_file_acl = 0; + + cleanup: + brelse(bh); ++ ++ return error; ++} ++ ++void ++ext4_xattr_inode_array_free(struct inode *inode, ++ struct ext4_xattr_ino_array *lea_ino_array) ++{ ++ struct inode *ea_inode = NULL; ++ int idx = 0; ++ int err; ++ ++ if (lea_ino_array == NULL) ++ return; ++ ++ for (; idx < lea_ino_array->xia_count; ++idx) { ++ ea_inode = ext4_xattr_inode_iget(inode, ++ lea_ino_array->xia_inodes[idx], &err); ++ if (err) ++ continue; ++ /* for inode's i_count get from ext4_xattr_delete_inode */ ++ if (!list_empty(&EXT4_I(ea_inode)->i_orphan)) ++ iput(ea_inode); ++ clear_nlink(ea_inode); ++ iput(ea_inode); ++ } ++ kfree(lea_ino_array); + } + + /* +@@ -1661,10 +2141,9 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || ++ entry1->e_value_inum != entry2->e_value_inum || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; +- if (entry1->e_value_block != 0 || entry2->e_value_block != 0) +- return -EFSCORRUPTED; + if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) +@@ -1736,7 +2215,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, + *name++; + } + +- if (entry->e_value_size != 0) { ++ if (!entry->e_value_inum && entry->e_value_size) { + __le32 *value = (__le32 *)((char *)header + + le16_to_cpu(entry->e_value_offs)); + for (n = (le32_to_cpu(entry->e_value_size) + +diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h +index 099c8b6..6e10ff9 100644 +--- 
a/fs/ext4/xattr.h ++++ b/fs/ext4/xattr.h +@@ -44,7 +44,7 @@ struct ext4_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ +- __le32 e_value_block; /* disk block attribute is stored on (n/i) */ ++ __le32 e_value_inum; /* inode in which the value is stored */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +@@ -69,6 +69,26 @@ struct ext4_xattr_entry { + EXT4_I(inode)->i_extra_isize)) + #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) + ++/* ++ * Link EA inode back to parent one using i_mtime field. ++ * Extra integer type conversion added to ignore higher ++ * bits in i_mtime.tv_sec which might be set by ext4_get() ++ */ ++#define EXT4_XATTR_INODE_SET_PARENT(inode, inum) \ ++do { \ ++ (inode)->i_mtime.tv_sec = inum; \ ++} while(0) ++ ++#define EXT4_XATTR_INODE_GET_PARENT(inode) \ ++((__u32)(inode)->i_mtime.tv_sec) ++ ++/* ++ * The minimum size of EA value when you start storing it in an external inode ++ * size of block - size of header - size of 1 entry - 4 null bytes ++*/ ++#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \ ++ ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4) ++ + #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) + #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) + #define BFIRST(bh) ENTRY(BHDR(bh)+1) +@@ -77,10 +97,11 @@ struct ext4_xattr_entry { + #define EXT4_ZERO_XATTR_VALUE ((void *)-1) + + struct ext4_xattr_info { +- int name_index; + const char *name; + const void *value; + size_t value_len; ++ int name_index; ++ int in_inode; + }; + + struct ext4_xattr_search { +@@ -140,7 +161,13 @@ extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); + extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); + extern int ext4_xattr_set_handle(handle_t *, struct 
inode *, int, const char *, const void *, size_t, int); + +-extern void ext4_xattr_delete_inode(handle_t *, struct inode *); ++extern struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, ++ int *err); ++extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino); ++extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, ++ struct ext4_xattr_ino_array **array); ++extern void ext4_xattr_inode_array_free(struct inode *inode, ++ struct ext4_xattr_ino_array *array); + + extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle); +-- +2.20.1 + diff --git a/ldiskfs/kernel_patches/patches/suse15/ext4-max-dir-size.patch b/ldiskfs/kernel_patches/patches/suse15/ext4-max-dir-size.patch new file mode 100644 index 0000000..cb18479 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/suse15/ext4-max-dir-size.patch @@ -0,0 +1,24 @@ +Add a sysfs interface for max_dir_size (also exposed as max_dir_size_kb). + +Index: b/fs/ext4/sysfs.c +=================================================================== +--- a/fs/ext4/sysfs.c ++++ b/fs/ext4/sysfs.c +@@ -170,6 +170,8 @@ EXT4_ATTR_FUNC(reserved_clusters, 0644); + EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, + ext4_sb_info, s_inode_readahead_blks); + EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); ++EXT4_RW_ATTR_SBI_UI(max_dir_size, s_max_dir_size_kb); ++EXT4_RW_ATTR_SBI_UI(max_dir_size_kb, s_max_dir_size_kb); + EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); + EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); +@@ -199,6 +201,8 @@ static struct attribute *ext4_attrs[] = + ATTR_LIST(reserved_clusters), + ATTR_LIST(inode_readahead_blks), + ATTR_LIST(inode_goal), ++ ATTR_LIST(max_dir_size), ++ ATTR_LIST(max_dir_size_kb), + ATTR_LIST(mb_stats), + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.12-sles15.series
b/ldiskfs/kernel_patches/series/ldiskfs-4.12-sles15.series new file mode 100644 index 0000000..95acb3e --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.12-sles15.series @@ -0,0 +1,27 @@ +suse15/ext4-inode-version.patch +sles12sp2/ext4-lookup-dotdot.patch +sles12sp2/ext4-print-inum-in-htree-warning.patch +sles12sp2/ext4-prealloc.patch +sles12sp2/ext4-osd-iop-common.patch +sles12sp2/ext4-misc.patch +sles12sp3/ext4-mballoc-extra-checks.patch +ubuntu18/ext4-hash-indexed-dir-dotdot-update.patch +sles12sp2/ext4-kill-dx-root.patch +rhel7/ext4-mballoc-pa-free-mismatch.patch +linux-5.4/ext4-data-in-dirent.patch +suse15/ext4-large-dir.patch +suse15/ext4-large-eas.patch +suse15/ext4-disable-mb-cache.patch +ubuntu18/ext4-nocmtime.patch +rhel8/ext4-pdirop.patch +suse15/ext4-max-dir-size.patch +suse15/ext4-corrupted-inode-block-bitmaps-handling-patches.patch +ubuntu18/ext4-give-warning-with-dir-htree-growing.patch +ubuntu18/ext4-jcb-optimization.patch +ubuntu18/ext4-attach-jinode-in-writepages.patch +suse15/ext4-dont-check-before-replay.patch +rhel7.2/ext4-dont-check-in-ro.patch +rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7/ext4-optimize-ext4_find_delalloc_range-in-nodelalloc.patch +rhel7/ext4-export-orphan-add.patch +sles12sp2/ext4-export-mb-stream-allocator-variables.patch