From 69509eb9881425cd9a789905d583ca3f86cfdaf1 Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Thu, 18 Jan 2024 11:30:34 +0700 Subject: [PATCH] LU-17131 ldiskfs: Add Ubuntu 20.04.5 release 5.15 kernel Add support for Ubuntu 20.04.5 5.15 kernel similar to el9.2 with updated patches: ext4-corrupted-inode-block-bitmaps-handling-patches.patch ext4-data-in-dirent.patch ext4-dont-check-before-replay.patch ext4-inode-version.patch ext4-mballoc-extra-checks.patch ext4-prealloc.patch ext4-filename-encode.patch Tested with tag Ubuntu-hwe-5.15-5.15.0-91.101_20.04.1 Test-Parameters: trivial Signed-off-by: Shaun Tancheff Change-Id: Ic1b4b0f25a9ac984186cf4f37b5a73d93af93ebd Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/52414 Tested-by: jenkins Tested-by: Maloo Tested-by: James Simmons Reviewed-by: Andreas Dilger Reviewed-by: Artem Blagodarenko Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- config/lustre-build-ldiskfs.m4 | 5 +- ...pted-inode-block-bitmaps-handling-patches.patch | 251 ++++++ .../ubuntu20.04.5/ext4-data-in-dirent.patch | 863 +++++++++++++++++++++ .../ext4-dont-check-before-replay.patch | 47 ++ .../ubuntu20.04.5/ext4-filename-encode.patch | 440 +++++++++++ .../patches/ubuntu20.04.5/ext4-inode-version.patch | 49 ++ .../ubuntu20.04.5/ext4-mballoc-extra-checks.patch | 318 ++++++++ .../patches/ubuntu20.04.5/ext4-prealloc.patch | 403 ++++++++++ .../series/ldiskfs-5.15.0-83-ubuntu20.series | 38 + 9 files changed, 2413 insertions(+), 1 deletion(-) create mode 100644 ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-corrupted-inode-block-bitmaps-handling-patches.patch create mode 100644 ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-data-in-dirent.patch create mode 100644 ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-dont-check-before-replay.patch create mode 100644 ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-filename-encode.patch create mode 100644 ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-inode-version.patch create mode 100644 ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-mballoc-extra-checks.patch create mode 100644 ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-prealloc.patch create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-5.15.0-83-ubuntu20.series diff --git a/config/lustre-build-ldiskfs.m4 b/config/lustre-build-ldiskfs.m4 index ddc7044..7d40823 100644 --- a/config/lustre-build-ldiskfs.m4 +++ b/config/lustre-build-ldiskfs.m4 @@ -92,6 +92,7 @@ AS_IF([test x$RHEL_KERNEL = xyes], [ ]) ], [test x$UBUNTU_KERNEL = xyes], [ BASEVER=$(echo $LINUXRELEASE | cut -d'-' -f1) + AS_VERSION_COMPARE([$BASEVER],[5.15.0],[ AS_VERSION_COMPARE([$BASEVER],[5.11.0],[ AS_VERSION_COMPARE([$BASEVER],[5.8.0],[ AS_VERSION_COMPARE([$BASEVER],[5.4.0],[ @@ -154,7 +155,9 @@ AS_IF([test x$RHEL_KERNEL = xyes], [ ], [LDISKFS_SERIES="5.8.0-ml.series"])], [LDISKFS_SERIES="5.11.0-40-ubuntu20.series"], - [LDISKFS_SERIES="5.11.0-40-ubuntu20.series"]) + [LDISKFS_SERIES="5.11.0-40-ubuntu20.series"])], + [LDISKFS_SERIES="5.15.0-83-ubuntu20.series"], + [LDISKFS_SERIES="5.15.0-83-ubuntu20.series"]) ], [test x$OPENEULER_KERNEL = xyes], [ case $OPENEULER_VERSION_NO in 2203.0) LDISKFS_SERIES="5.10.0-oe2203.series" ;; diff --git a/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-corrupted-inode-block-bitmaps-handling-patches.patch b/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-corrupted-inode-block-bitmaps-handling-patches.patch new file mode 100644 index 0000000..39af563 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-corrupted-inode-block-bitmaps-handling-patches.patch @@ -0,0 +1,251 @@ +Subject: [PATCH] ext4-corrupted-inode-block-bitmaps-handling-patches + +Since we could skip corrupt block groups, this patch +use ext4_warning() intead of ext4_error() to make FS not +remount RO in default + +--- +diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c +index fadcb94e..f151ba9c 100644 +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -419,7 +419,7 @@ static int ext4_validate_block_bitmap(struct super_block *sb, + desc, bh) || + ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { + ext4_unlock_group(sb, block_group); +- ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); ++ ext4_warning(sb, "bg %u: bad block bitmap checksum", block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSBADCRC; +@@ -427,8 +427,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, + blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); +- ext4_error(sb, "bg %u: block %llu: invalid block bitmap", +- block_group, blk); ++ ext4_warning(sb, "bg %u: block %llu: invalid block bitmap", ++ block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSCORRUPTED; +@@ -520,8 +520,8 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + if (err) { +- ext4_error(sb, "Failed to init block bitmap for group " +- "%u: %d", block_group, err); ++ ext4_warning(sb, "Failed to init block bitmap for group " ++ "%u: %d", block_group, err); + goto out; + } + goto verify; +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index d66fab40..fe275613 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -102,8 +102,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, + EXT4_INODES_PER_GROUP(sb) / 8) || + ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { + ext4_unlock_group(sb, block_group); +- ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " +- "inode_bitmap = %llu", block_group, blk); ++ ext4_warning(sb, "Corrupt inode bitmap - block_group = %u, " ++ "inode_bitmap = %llu", block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + return -EFSBADCRC; +@@ -353,7 +353,7 @@ out: + if (!fatal) + fatal = err; + } else { +- ext4_error(sb, "bit already cleared for inode %lu", ino); ++ ext4_warning(sb, "bit already cleared for inode %lu", ino); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + } +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index 8fdcfc78..6b4b5fb8 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -1144,10 +1144,14 @@ int ext4_mb_generate_buddy(struct super_block *sb, + grp->bb_fragments = fragments; + + if (free != grp->bb_free) { +- ext4_grp_locked_error(sb, group, 0, 0, +- "block bitmap and bg descriptor " +- "inconsistent: %u vs %u free clusters", +- free, grp->bb_free); ++ struct ext4_group_desc *gdp; ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ ext4_warning(sb, "group %lu: block bitmap and bg descriptor " ++ "inconsistent: %u vs %u free clusters " ++ "%u in gd, %lu pa's", ++ (long unsigned int)group, free, grp->bb_free, ++ ext4_free_group_clusters(sb, gdp), ++ grp->bb_prealloc_nr); + /* + * If we intend to continue, we consider group descriptor + * corrupt and update bb_free using bitmap value +@@ -1500,7 +1504,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, + int block; + int pnum; + int poff; +- struct page *page; ++ struct page *page = NULL; + int ret; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); +@@ -1528,7 +1532,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, + */ + ret = ext4_mb_init_group(sb, group, gfp); + if (ret) +- return ret; ++ goto err; + } + + /* +@@ -1628,6 +1632,7 @@ err: + put_page(e4b->bd_buddy_page); + e4b->bd_buddy = NULL; + e4b->bd_bitmap = NULL; ++ ext4_warning(sb, "Error loading buddy information for %u", group); + return ret; + } + +@@ -5045,16 +5050,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + /* "free < pa->pa_free" means we maybe double alloc the same blocks, + * otherwise maybe leave some free blocks unavailable, no need to BUG.*/ + if ((free > pa->pa_free && !pa->pa_error) || (free < pa->pa_free)) { +- ext4_error(sb, "pa free mismatch: [pa %p] " +- "[phy %lu] [logic %lu] [len %u] [free %u] " +- "[error %u] [inode %d] [freed %u]", pa, +- (unsigned long)pa->pa_pstart, +- (unsigned long)pa->pa_lstart, +- pa->pa_len, (unsigned)pa->pa_free, +- (unsigned)pa->pa_error, pa->pa_inode->i_ino, +- free); + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", +- free, pa->pa_free); ++ free, pa->pa_free); + /* + * pa is already deleted so we use the value obtained + * from the bitmap and continue. +@@ -5120,16 +5117,11 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); +- ext4_error_err(sb, -err, +- "Error %d reading block bitmap for %u", +- err, group); + goto out_dbg; + } + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { +- ext4_warning(sb, "Error %d loading buddy information for %u", +- err, group); + put_bh(bitmap_bh); + goto out_dbg; + } +@@ -5286,17 +5278,12 @@ repeat: + + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); +- if (err) { +- ext4_error_err(sb, -err, "Error %d loading buddy information for %u", +- err, group); ++ if (err) + return; +- } + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); +- ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", +- err, group); + ext4_mb_unload_buddy(&e4b); + continue; + } +@@ -5596,11 +5583,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, + group = ext4_get_group_number(sb, pa->pa_pstart); + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); +- if (err) { +- ext4_error_err(sb, -err, "Error %d loading buddy information for %u", +- err, group); ++ if (err) + continue; +- } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_get_group_info(sb, group)->bb_prealloc_nr--; +@@ -5937,7 +5921,7 @@ errout: + * been updated or not when fail case. So can + * not revert pa_free back, just mark pa_error*/ + pa->pa_error++; +- ext4_error(sb, ++ ext4_warning(sb, + "Updating bitmap error: [err %d] " + "[pa %p] [phy %lu] [logic %lu] " + "[len %u] [free %u] [error %u] " +@@ -5948,6 +5932,7 @@ errout: + (unsigned)pa->pa_free, + (unsigned)pa->pa_error, + pa->pa_inode ? pa->pa_inode->i_ino : 0); ++ ext4_mark_group_bitmap_corrupted(sb, 0, 0); + } + } + ext4_mb_release_context(ac); +@@ -6280,7 +6265,7 @@ do_more: + err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, + GFP_NOFS|__GFP_NOFAIL); + if (err) +- goto error_return; ++ goto error_brelse; + + /* + * We need to make sure we don't reuse the freed block until after the +@@ -6373,8 +6358,9 @@ do_more: + goto do_more; + } + error_return: +- brelse(bitmap_bh); + ext4_std_error(sb, err); ++error_brelse: ++ brelse(bitmap_bh); + return; + } + +@@ -6573,7 +6559,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) +- goto error_return; ++ goto error_brelse; + + /* + * need to update group_info->bb_free and bitmap +@@ -6612,8 +6598,9 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + err = ret; + + error_return: +- brelse(bitmap_bh); + ext4_std_error(sb, err); ++error_brelse: ++ brelse(bitmap_bh); + return err; + } + +@@ -6733,8 +6720,6 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, + + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { +- ext4_warning(sb, "Error %d loading buddy information for %u", +- ret, group); + return ret; + } + +-- +2.25.1 + diff --git a/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-data-in-dirent.patch b/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-data-in-dirent.patch new file mode 100644 index 0000000..e6c8aec --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-data-in-dirent.patch @@ -0,0 +1,863 @@ +diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c +index 74b172a4..c6afabcb 100644 +--- a/fs/ext4/dir.c ++++ b/fs/ext4/dir.c +@@ -466,12 +466,17 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + struct fname *fname, *new_fn; + struct dir_private_info *info; + int len; ++ int extra_data = 0; + + info = dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ +- len = sizeof(struct fname) + ent_name->len + 1; ++ if (dirent->file_type & EXT4_DIRENT_LUFID) ++ extra_data = ext4_get_dirent_data_len(dirent); ++ ++ len = sizeof(struct fname) + ent_name->len + extra_data + 1; ++ + new_fn = kzalloc(len, GFP_KERNEL); + if (!new_fn) + return -ENOMEM; +@@ -480,7 +485,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = ent_name->len; + new_fn->file_type = dirent->file_type; +- memcpy(new_fn->name, ent_name->name, ent_name->len); ++ memcpy(new_fn->name, ent_name->name, ent_name->len + extra_data); + + while (*p) { + parent = *p; +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 29db9e17..cf1c0732 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1172,6 +1172,7 @@ struct ext4_inode_info { + __u32 i_csum_seed; + + kprojid_t i_projid; ++ void *i_dirdata; + }; + + /* +@@ -1193,6 +1194,7 @@ struct ext4_inode_info { + * Mount flags set via mount options or defaults + */ + #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ ++#define EXT4_MOUNT_DIRDATA 0x00002 /* Data in directory entries */ + #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ + #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ + #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +@@ -2158,6 +2160,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ ++ EXT4_FEATURE_INCOMPAT_DIRDATA| \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD | \ +@@ -2369,6 +2372,42 @@ struct ext4_dir_entry_tail { + #define EXT4_FT_SYMLINK 7 + + #define EXT4_FT_MAX 8 ++#define EXT4_FT_MASK 0xf ++ ++#if EXT4_FT_MAX > EXT4_FT_MASK ++#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK" ++#endif ++ ++/* ++ * d_type has 4 unused bits, so it can hold four types data. these different ++ * type of data (e.g. lustre data, high 32 bits of 64-bit inode number) can be ++ * stored, in flag order, after file-name in ext4 dirent. ++*/ ++/* ++ * this flag is added to d_type if ext4 dirent has extra data after ++ * filename. this data length is variable and length is stored in first byte ++ * of data. data start after filename NUL byte. ++ * This is used by Lustre FS. ++ */ ++#define EXT4_DIRENT_LUFID 0x10 ++ ++#define EXT4_LUFID_MAGIC 0xAD200907UL ++struct ext4_dentry_param { ++ __u32 edp_magic; /* EXT4_LUFID_MAGIC */ ++ char edp_len; /* size of edp_data in bytes */ ++ char edp_data[0]; /* packed array of data */ ++} __packed; ++ ++static inline unsigned char *ext4_dentry_get_data(struct super_block *sb, ++ struct ext4_dentry_param *p) ++{ ++ if (!ext4_has_feature_dirdata(sb)) ++ return NULL; ++ if (p && p->edp_magic == EXT4_LUFID_MAGIC) ++ return &p->edp_len; ++ else ++ return NULL; ++} + + #define EXT4_FT_DIR_CSUM 0xDE + +@@ -2380,6 +2419,17 @@ struct ext4_dir_entry_tail { + #define EXT4_DIR_PAD 4 + #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) + #define EXT4_MAX_REC_LEN ((1<<16)-1) ++#define EXT4_DIR_REC_LEN_(name_len, i_dir) \ ++ ext4_dir_rec_len((name_len), (i_dir)) ++#define EXT4_DIR_ENTRY_LEN_(de, i_dir) \ ++ (EXT4_DIR_REC_LEN_((de)->name_len + ext4_get_dirent_data_len(de), \ ++ (i_dir))) ++/* ldiskfs */ ++#define EXT4_DIR_REC_LEN(name_len, i_dir) EXT4_DIR_REC_LEN_((name_len), (i_dir)) ++#define EXT4_DIR_ENTRY_LEN(de, i_dir) EXT4_DIR_ENTRY_LEN_((de), (i_dir)) ++/* lustre osd_handler compat -- ifdef LDISKFS_DIR_REC_LEN_WITH_DIR */ ++#define EXT4_DIR_REC_LEN_WITH_DIR 1 ++#define __EXT4_DIR_REC_LEN(name_len) EXT4_DIR_REC_LEN_((name_len), NULL) + + /* + * The rec_len is dependent on the type of directory. Directories that are +@@ -2387,10 +2437,10 @@ struct ext4_dir_entry_tail { + * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should + * pass NULL for dir, as those entries do not use the extra fields. + */ +-static inline unsigned int ext4_dir_rec_len(__u8 name_len, ++static inline unsigned int ext4_dir_rec_len(__u32 name_len, + const struct inode *dir) + { +- int rec_len = (name_len + 8 + EXT4_DIR_ROUND); ++ __u32 rec_len = (name_len + 8 + EXT4_DIR_ROUND); + + if (dir && ext4_hash_in_dirent(dir)) + rec_len += sizeof(struct ext4_dir_entry_hash); +@@ -2866,11 +2916,13 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **dest_de); ++ struct ext4_dir_entry_2 **dest_de, ++ int *dlen); + void ext4_insert_dentry(struct inode *dir, struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, +- struct ext4_filename *fname); ++ struct ext4_filename *fname, ++ void *data); + static inline void ext4_update_dx_flag(struct inode *inode) + { + if (!ext4_has_feature_dir_index(inode->i_sb) && +@@ -2886,10 +2938,17 @@ static const unsigned char ext4_filetype_table[] = { + + static inline unsigned char get_dtype(struct super_block *sb, int filetype) + { +- if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) ++ int fl_index = filetype & EXT4_FT_MASK; ++ ++ if (!ext4_has_feature_filetype(sb) || fl_index >= EXT4_FT_MAX) + return DT_UNKNOWN; + +- return ext4_filetype_table[filetype]; ++ if (!test_opt(sb, DIRDATA)) ++ return ext4_filetype_table[fl_index]; ++ ++ return (ext4_filetype_table[fl_index]) | ++ (filetype & EXT4_DIRENT_LUFID); ++ + } + extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); +@@ -3093,9 +3152,13 @@ extern int ext4_ind_migrate(struct inode *inode); + + /* namei.c */ + extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, +- struct inode *inode); ++ struct inode *inode, ++ const void *data1, const void *data2); + extern int ext4_dirblock_csum_verify(struct inode *inode, + struct buffer_head *bh); ++extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, ++ struct inode *inode, ++ const void *data1, const void *data2); + extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + extern struct inode *ext4_create_inode(handle_t *handle, +@@ -3908,6 +3971,36 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) + return buffer_uptodate(bh); + } + ++/* ++ * Compute the total directory entry data length. ++ * This includes the filename and an implicit NUL terminator (always present), ++ * and optional extensions. Each extension has a bit set in the high 4 bits of ++ * de->file_type, and the extension length is the first byte in each entry. ++ */ ++static inline int ext4_get_dirent_data_len(struct ext4_dir_entry_2 *de) ++{ ++ char *len = de->name + de->name_len + 1 /* NUL terminator */; ++ int dlen = 0; ++ __u8 extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4; ++ struct ext4_dir_entry_tail *t = (struct ext4_dir_entry_tail *)de; ++ ++ if (!t->det_reserved_zero1 && ++ le16_to_cpu(t->det_rec_len) == ++ sizeof(struct ext4_dir_entry_tail) && ++ !t->det_reserved_zero2 && ++ t->det_reserved_ft == EXT4_FT_DIR_CSUM) ++ return 0; ++ ++ while (extra_data_flags) { ++ if (extra_data_flags & 1) { ++ dlen += *len + (dlen == 0); ++ len += *len; ++ } ++ extra_data_flags >>= 1; ++ } ++ return dlen; ++} ++ + #endif /* __KERNEL__ */ + + #define EFSBADCRC EBADMSG /* Bad CRC detected */ +diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c +index 2660c34c..2a048e9c 100644 +--- a/fs/ext4/fast_commit.c ++++ b/fs/ext4/fast_commit.c +@@ -1601,7 +1601,7 @@ static int ext4_fc_replay_create(struct super_block *sb, + ext4_debug("Dir %d not found.", darg.ino); + goto out; + } +- ret = ext4_init_new_dir(NULL, dir, inode); ++ ret = ext4_init_new_dir(NULL, dir, inode, NULL, NULL); + iput(dir); + if (ret) { + ret = 0; +diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c +index 6fe665de..e20e7894 100644 +--- a/fs/ext4/inline.c ++++ b/fs/ext4/inline.c +@@ -1048,7 +1048,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, + struct ext4_dir_entry_2 *de; + + err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, +- inline_size, fname, &de); ++ inline_size, fname, &de, NULL); + if (err) + return err; + +@@ -1057,7 +1057,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, + EXT4_JTR_NONE); + if (err) + return err; +- ext4_insert_dentry(dir, inode, de, inline_size, fname); ++ ext4_insert_dentry(dir, inode, de, inline_size, fname, NULL); + + ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); + +@@ -1416,7 +1416,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, + fake.name_len = 1; + strcpy(fake.name, "."); + fake.rec_len = ext4_rec_len_to_disk( +- ext4_dir_rec_len(fake.name_len, NULL), ++ EXT4_DIR_ENTRY_LEN(&fake, NULL), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; +@@ -1426,7 +1426,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, + fake.name_len = 2; + strcpy(fake.name, ".."); + fake.rec_len = ext4_rec_len_to_disk( +- ext4_dir_rec_len(fake.name_len, NULL), ++ EXT4_DIR_ENTRY_LEN(&fake, NULL), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index 25e84f0e..5c068e3a 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -290,13 +290,14 @@ static unsigned dx_get_count(struct dx_entry *entries); + static unsigned dx_get_limit(struct dx_entry *entries); + static void dx_set_count(struct dx_entry *entries, unsigned value); + static void dx_set_limit(struct dx_entry *entries, unsigned value); +-static unsigned dx_root_limit(struct inode *dir, unsigned infosize); ++static inline unsigned dx_root_limit(struct inode *dir, ++ struct ext4_dir_entry_2 *dot_de, unsigned infosize); + static unsigned dx_node_limit(struct inode *dir); + static struct dx_frame *dx_probe(struct ext4_filename *fname, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame); +-static void dx_release(struct dx_frame *frames); ++static void dx_release(struct dx_frame *frames, struct inode *dir); + static int dx_make_map(struct inode *dir, struct buffer_head *bh, + struct dx_hash_info *hinfo, + struct dx_map_entry *map_tail); +@@ -436,22 +437,23 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, + { + struct ext4_dir_entry *dp; + struct dx_root_info *root; +- int count_offset; ++ int count_offset, dot_rec_len, dotdot_rec_len; + + if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb)) + count_offset = 8; +- else if (le16_to_cpu(dirent->rec_len) == 12) { +- dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); ++ else { ++ dot_rec_len = le16_to_cpu(dirent->rec_len); ++ dp = (struct ext4_dir_entry *)(((void *)dirent) + dot_rec_len); + if (le16_to_cpu(dp->rec_len) != +- EXT4_BLOCK_SIZE(inode->i_sb) - 12) ++ EXT4_BLOCK_SIZE(inode->i_sb) - dot_rec_len) + return NULL; +- root = (struct dx_root_info *)(((void *)dp + 12)); ++ dotdot_rec_len = EXT4_DIR_ENTRY_LEN((struct ext4_dir_entry_2 *)dp, NULL); ++ root = (struct dx_root_info *)(((void *)dp + dotdot_rec_len)); + if (root->reserved_zero || + root->info_length != sizeof(struct dx_root_info)) + return NULL; +- count_offset = 32; +- } else +- return NULL; ++ count_offset = 8 + dot_rec_len + dotdot_rec_len; ++ } + + if (offset) + *offset = count_offset; +@@ -554,13 +556,14 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. + */ +-struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de) ++struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de, struct inode *i_dir) + { ++ BUG_ON(de->name_len != 1); + /* get dotdot first */ +- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1)); ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_ENTRY_LEN(de, i_dir)); + + /* dx root info is after dotdot entry */ +- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2)); ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_ENTRY_LEN(de, i_dir)); + + return (struct dx_root_info *)de; + } +@@ -605,11 +608,16 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value) + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); + } + +-static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) ++static inline unsigned dx_root_limit(struct inode *dir, ++ struct ext4_dir_entry_2 *dot_de, unsigned infosize) + { +- unsigned int entry_space = dir->i_sb->s_blocksize - +- ext4_dir_rec_len(1, NULL) - +- ext4_dir_rec_len(2, NULL) - infosize; ++ struct ext4_dir_entry_2 *dotdot_de; ++ unsigned entry_space; ++ ++ BUG_ON(dot_de->name_len != 1); ++ dotdot_de = ext4_next_entry(dot_de, dir->i_sb->s_blocksize); ++ entry_space = dir->i_sb->s_blocksize - EXT4_DIR_ENTRY_LEN(dot_de, NULL) - ++ EXT4_DIR_ENTRY_LEN(dotdot_de, NULL) - infosize; + + if (ext4_has_metadata_csum(dir->i_sb)) + entry_space -= sizeof(struct dx_tail); +@@ -729,7 +737,7 @@ static struct stats dx_show_leaf(struct inode *dir, + (unsigned) ((char *) de - base)); + #endif + } +- space += ext4_dir_rec_len(de->name_len, dir); ++ space += EXT4_DIR_ENTRY_LEN(de, dir); + names++; + } + de = ext4_next_entry(de, size); +@@ -823,7 +831,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + if (IS_ERR(frame->bh)) + return (struct dx_frame *) frame->bh; + +- info = dx_get_dx_info((struct ext4_dir_entry_2 *)frame->bh->b_data); ++ info = dx_get_dx_info((struct ext4_dir_entry_2 *)frame->bh->b_data, dir); + if (info->hash_version != DX_HASH_TEA && + info->hash_version != DX_HASH_HALF_MD4 && + info->hash_version != DX_HASH_LEGACY && +@@ -885,11 +893,14 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + + entries = (struct dx_entry *)(((char *)info) + info->info_length); + +- if (dx_get_limit(entries) != dx_root_limit(dir, +- info->info_length)) { ++ if (dx_get_limit(entries) != ++ dx_root_limit(dir, (struct ext4_dir_entry_2 *)frame->bh->b_data, ++ info->info_length)) { + ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", + dx_get_limit(entries), +- dx_root_limit(dir, info->info_length)); ++ dx_root_limit(dir, ++ (struct ext4_dir_entry_2 *)frame->bh->b_data, ++ info->info_length)); + goto fail; + } + +@@ -966,7 +977,7 @@ fail: + return ret_err; + } + +-static void dx_release(struct dx_frame *frames) ++static void dx_release(struct dx_frame *frames, struct inode *dir) + { + struct dx_root_info *info; + int i; +@@ -975,7 +986,7 @@ static void dx_release(struct dx_frame *frames) + if (frames[0].bh == NULL) + return; + +- info = dx_get_dx_info((struct ext4_dir_entry_2 *)frames[0].bh->b_data); ++ info = dx_get_dx_info((struct ext4_dir_entry_2 *)frames[0].bh->b_data, dir); + /* save local copy, "info" may be freed after brelse() */ + indirect_levels = info->indirect_levels; + for (i = 0; i <= indirect_levels; i++) { +@@ -1281,12 +1292,12 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + (count && ((hashval & 1) == 0))) + break; + } +- dx_release(frames); ++ dx_release(frames, dir); + dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " + "next hash: %x\n", count, *next_hash)); + return count; + errout: +- dx_release(frames); ++ dx_release(frames, dir); + return (err); + } + +@@ -1821,7 +1832,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + errout: + dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name)); + success: +- dx_release(frames); ++ dx_release(frames, dir); + return bh; + } + +@@ -1945,7 +1956,7 @@ dx_move_dirents(struct inode *dir, char *from, char *to, + while (count--) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + (from + (map->offs<<2)); +- rec_len = ext4_dir_rec_len(de->name_len, dir); ++ rec_len = EXT4_DIR_ENTRY_LEN(de, dir); + + memcpy (to, de, rec_len); + ((struct ext4_dir_entry_2 *) to)->rec_len = +@@ -1978,7 +1989,7 @@ static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, + while ((char*)de < base + blocksize) { + next = ext4_next_entry(de, blocksize); + if (de->inode && de->name_len) { +- rec_len = ext4_dir_rec_len(de->name_len, dir); ++ rec_len = EXT4_DIR_ENTRY_LEN(de, dir); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); +@@ -2121,14 +2132,21 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **dest_de) ++ struct ext4_dir_entry_2 **dest_de, ++ int *dlen) + { + struct ext4_dir_entry_2 *de; +- unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir); ++ unsigned short reclen; + int nlen, rlen; + unsigned int offset = 0; + char *top; + ++ if (dlen) { ++ reclen = ext4_dir_rec_len(fname_len(fname) + *dlen, dir); ++ *dlen = 0; ++ } else { ++ reclen = ext4_dir_rec_len(fname_len(fname), dir); ++ } + de = (struct ext4_dir_entry_2 *)buf; + top = buf + buf_size - reclen; + while ((char *) de <= top) { +@@ -2137,10 +2155,31 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, + return -EFSCORRUPTED; + if (ext4_match(dir, fname, de)) + return -EEXIST; +- nlen = ext4_dir_rec_len(de->name_len, dir); ++ nlen = EXT4_DIR_ENTRY_LEN(de, dir); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if ((de->inode ? rlen - nlen : rlen) >= reclen) + break; ++ ++ /* Then for dotdot entries, check for the smaller space ++ * required for just the entry, no FID ++ */ ++ if (fname_len(fname) == 2 && memcmp(fname_name(fname), "..", 2) == 0) { ++ if ((de->inode ? rlen - nlen : rlen) >= ++ ext4_dir_rec_len(fname_len(fname), dir)) { ++ /* set dlen = 1 to indicate not ++ * enough space store fid ++ */ ++ if (dlen) ++ *dlen = 1; ++ break; ++ } ++ /* The new ".." entry must be written over the ++ * previous ".." entry, which is the first ++ * entry traversed by this scan. If it doesn't ++ * fit, something is badly wrong, so -EIO. ++ */ ++ return -EIO; ++ } + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } +@@ -2155,12 +2194,13 @@ void ext4_insert_dentry(struct inode *dir, + struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, +- struct ext4_filename *fname) ++ struct ext4_filename *fname, ++ void *data) + { + + int nlen, rlen; + +- nlen = ext4_dir_rec_len(de->name_len, dir); ++ nlen = EXT4_DIR_ENTRY_LEN(de, dir); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = +@@ -2181,6 +2221,12 @@ void ext4_insert_dentry(struct inode *dir, + EXT4_DIRENT_HASHES(de)->minor_hash = + cpu_to_le32(hinfo->minor_hash); + } ++ if (data) { ++ de->name[fname_len(fname)] = 0; ++ memcpy(&de->name[fname_len(fname) + 1], data, *(char *)data); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } ++ + } + + /* +@@ -2198,14 +2244,19 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, + { + unsigned int blocksize = dir->i_sb->s_blocksize; + int csum_size = 0; +- int err, err2; ++ int err, err2, dlen = 0; ++ unsigned char *data; + ++ data = ext4_dentry_get_data(inode->i_sb, (struct ext4_dentry_param *) ++ EXT4_I(inode)->i_dirdata); + if (ext4_has_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + if (!de) { ++ if (data) ++ dlen = (*data) + 1; + err = ext4_find_dest_de(dir, inode, bh, bh->b_data, +- blocksize - csum_size, fname, &de); ++ blocksize - csum_size, fname, &de, &dlen); + if (err) + return err; + } +@@ -2218,7 +2269,10 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, + } + + /* By now the buffer is marked for journaling */ +- ext4_insert_dentry(dir, inode, de, blocksize, fname); ++ /* If writing the short form of "dotdot", don't add the data section */ ++ if (dlen == 1) ++ data = NULL; ++ ext4_insert_dentry(dir, inode, de, blocksize, fname, data); + + /* + * XXX shouldn't update any times until successful +@@ -2324,7 +2378,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + blocksize); + + /* initialize hashing info */ +- dx_info = dx_get_dx_info(dot_de); ++ dx_info = dx_get_dx_info(dot_de, dir); + memset(dx_info, 0, sizeof(*dx_info)); + dx_info->info_length = sizeof(*dx_info); + if (ext4_hash_in_dirent(dir)) +@@ -2335,7 +2389,8 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + entries = (void *)dx_info + sizeof(*dx_info); + dx_set_block(entries, 1); + dx_set_count(entries, 1); +- dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info))); ++ dx_set_limit(entries, dx_root_limit(dir, ++ dot_de, sizeof(*dx_info))); + + /* Initialize as for dx_probe */ + fname->hinfo.hash_version = dx_info->hash_version; +@@ -2381,7 +2436,7 @@ out_frames: + */ + if (retval) + ext4_mark_inode_dirty(handle, dir); +- dx_release(frames); ++ dx_release(frames, dir); + brelse(bh2); + return retval; + } +@@ -2394,6 +2449,8 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, + struct buffer_head *dir_block; + struct ext4_dir_entry_2 *de; + int len, journal = 0, err = 0; ++ int dlen = 0; ++ char *data; + + if (IS_ERR(handle)) + return PTR_ERR(handle); +@@ -2409,21 +2466,26 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, + + de = (struct ext4_dir_entry_2 *)dir_block->b_data; + /* the first item must be "." */ +- assert(de->name_len == 1 && de->name[0] == '.'); ++ ASSERT(de->name_len == 1 && de->name[0] == '.'); + len = le16_to_cpu(de->rec_len); +- assert(len >= EXT4_DIR_REC_LEN(1)); +- if (len > EXT4_DIR_REC_LEN(1)) { ++ ASSERT(len >= EXT4_DIR_REC_LEN(1, dir)); ++ if (len > EXT4_DIR_REC_LEN(1, dir)) { + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir->i_sb, dir_block, EXT4_JTR_NONE); + if (err) + goto out_journal; + + journal = 1; +- de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(1)); ++ de->rec_len = cpu_to_le16(EXT4_DIR_ENTRY_LEN(de, dir)); + } + +- len -= EXT4_DIR_REC_LEN(1); +- assert(len == 0 || len >= EXT4_DIR_REC_LEN(2)); ++ len -= EXT4_DIR_ENTRY_LEN(de, NULL); ++ data = ext4_dentry_get_data(dir->i_sb, ++ (struct ext4_dentry_param *)dentry->d_fsdata); ++ if (data) ++ dlen = *data + 1; ++ ASSERT(len == 0 || len >= EXT4_DIR_REC_LEN(2 + dlen, dir)); ++ + de = (struct ext4_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + if (!journal) { +@@ -2437,10 +2499,15 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, + if (len > 0) + de->rec_len = cpu_to_le16(len); + else +- assert(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2)); ++ ASSERT(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2, dir)); + de->name_len = 2; + strcpy(de->name, ".."); +- ext4_set_de_type(dir->i_sb, de, S_IFDIR); ++ if (data != NULL && ext4_get_dirent_data_len(de) >= dlen) { ++ de->name[2] = 0; ++ memcpy(&de->name[2 + 1], data, *data); ++ ext4_set_de_type(dir->i_sb, de, S_IFDIR); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } + + out_journal: + if (journal) { +@@ -2478,6 +2545,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + ext4_lblk_t block, blocks; + int csum_size = 0; + ++ EXT4_I(inode)->i_dirdata = dentry->d_fsdata; + if (ext4_has_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + +@@ -2720,7 +2788,7 @@ again: + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + info = dx_get_dx_info((struct ext4_dir_entry_2 *) +- frames[0].bh->b_data); ++ frames[0].bh->b_data, dir); + info->indirect_levels = 1; + dxtrace(printk(KERN_DEBUG + "Creating %d level index...\n", +@@ -2746,7 +2814,7 @@ journal_error: + ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ + cleanup: + brelse(bh); +- dx_release(frames); ++ dx_release(frames, dir); + /* @restart is true means htree-path has been changed, we need to + * repeat dx_probe() to find out valid htree-path + */ +@@ -3049,38 +3117,73 @@ err_unlock_inode: + return err; + } + ++struct tp_block { ++ struct inode *inode; ++ void *data1; ++ void *data2; ++}; ++ + struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len) + { ++ void *data1 = NULL, *data2 = NULL; ++ int dot_reclen = 0; ++ ++ if (dotdot_real_len == 10) { ++ struct tp_block *tpb = (struct tp_block *)inode; ++ data1 = tpb->data1; ++ data2 = tpb->data2; ++ inode = tpb->inode; ++ dotdot_real_len = 0; ++ } + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; +- de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), +- blocksize); + strcpy(de->name, "."); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + ++ /* get packed fid data*/ ++ data1 = ext4_dentry_get_data(inode->i_sb, ++ (struct ext4_dentry_param *) data1); ++ if (data1) { ++ de->name[1] = 0; ++ memcpy(&de->name[2], data1, *(char *) data1); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } ++ de->rec_len = cpu_to_le16(EXT4_DIR_ENTRY_LEN(de, NULL)); ++ ++ dot_reclen = cpu_to_le16(de->rec_len); + de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(parent_ino); + de->name_len = 2; ++ ++ strcpy(de->name, ".."); ++ ext4_set_de_type(inode->i_sb, de, S_IFDIR); ++ data2 = ext4_dentry_get_data(inode->i_sb, ++ (struct ext4_dentry_param *) data2); ++ if (data2) { ++ de->name[2] = 0; ++ memcpy(&de->name[3], data2, *(char *) data2); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } ++ + if (!dotdot_real_len) + de->rec_len = ext4_rec_len_to_disk(blocksize - +- (csum_size + ext4_dir_rec_len(1, NULL)), +- blocksize); ++ (csum_size + dot_reclen), blocksize); + else + de->rec_len = ext4_rec_len_to_disk( +- ext4_dir_rec_len(de->name_len, NULL), ++ EXT4_DIR_ENTRY_LEN(de, NULL), + blocksize); +- strcpy(de->name, ".."); +- ext4_set_de_type(inode->i_sb, de, S_IFDIR); + + return ext4_next_entry(de, blocksize); + } + + int ext4_init_new_dir(handle_t *handle, struct inode *dir, +- struct inode *inode) ++ struct inode *inode, ++ const void *data1, const void *data2) + { ++ struct tp_block param; + struct buffer_head *dir_block = NULL; + struct ext4_dir_entry_2 *de; + ext4_lblk_t block = 0; +@@ -3104,7 +3207,11 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, + if (IS_ERR(dir_block)) + return PTR_ERR(dir_block); + de = (struct ext4_dir_entry_2 *)dir_block->b_data; +- ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); ++ param.inode = inode; ++ param.data1 = (void *)data1; ++ param.data2 = (void *)data2; ++ ext4_init_dot_dotdot((struct inode *)(¶m), de, blocksize, ++ csum_size, dir->i_ino, 10); + set_nlink(inode, 2); + if (csum_size) + ext4_initialize_dirent_tail(dir_block, blocksize); +@@ -3119,6 +3226,29 @@ out: + return err; + } + ++/* Initialize @inode as a subdirectory of @dir, and add the ++ * "." and ".." entries into the first directory block. */ ++int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, ++ struct inode *inode, ++ const void *data1, const void *data2) ++{ ++ int rc; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_DIRSYNC(dir)) ++ ext4_handle_sync(handle); ++ ++ inode->i_op = &ext4_dir_inode_operations; ++ inode->i_fop = &ext4_dir_operations; ++ rc = ext4_init_new_dir(handle, dir, inode, data1, data2); ++ if (!rc) ++ rc = ext4_mark_inode_dirty(handle, inode); ++ return rc; ++} ++EXPORT_SYMBOL(ext4_add_dot_dotdot); ++ + static int ext4_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) + { +@@ -3146,7 +3276,7 @@ retry: + + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; +- err = ext4_init_new_dir(handle, dir, inode); ++ err = ext4_init_new_dir(handle, dir, inode, NULL, NULL); + if (err) + goto out_clear_inode; + err = ext4_mark_inode_dirty(handle, inode); +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 8a58ee51..cb81baa6 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1671,7 +1671,7 @@ enum { + Opt_inlinecrypt, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, +- Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, ++ Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_dirdata, + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, + Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, +@@ -1755,6 +1755,7 @@ static const match_table_t tokens = { + {Opt_nolazytime, "nolazytime"}, + {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"}, + {Opt_nodelalloc, "nodelalloc"}, ++ {Opt_dirdata, "dirdata"}, + {Opt_removed, "mblk_io_submit"}, + {Opt_removed, "nomblk_io_submit"}, + {Opt_block_validity, "block_validity"}, +@@ -1998,6 +1999,7 @@ static const struct mount_opts { + MOPT_CLEAR | MOPT_Q}, + {Opt_usrjquota, 0, MOPT_Q | MOPT_STRING}, + {Opt_grpjquota, 0, MOPT_Q | MOPT_STRING}, ++ {Opt_dirdata, EXT4_MOUNT_DIRDATA, MOPT_SET}, + {Opt_offusrjquota, 0, MOPT_Q}, + {Opt_offgrpjquota, 0, MOPT_Q}, + {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, +-- +2.25.1 + diff --git a/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-dont-check-before-replay.patch b/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-dont-check-before-replay.patch new file mode 100644 index 0000000..78540fd --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-dont-check-before-replay.patch @@ -0,0 +1,47 @@ +Subject: [PATCH] ext4: don't verify group descriptors until after journal + replay + +When ldiskfs runs in failover mode with read-only disk, it may +lose part of allocation updates and fail while mounting the +filesystem due to group descriptor checks before journal replay. +Don't panic with on-disk checks in read-only mode. + +Seagate-bug-id: MRP-797 +Signed-off-by: Alexey Lyashkov +Signed-off-by: Lokesh Nagappa Jaliminche +Lustre-change: https://review.whamcloud.com/21141 +--- + +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 7b1e07fe..2fa1eedb 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -4587,11 +4587,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) + rcu_read_unlock(); + } + sbi->s_gdb_count = db_count; +- if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { +- ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); +- ret = -EFSCORRUPTED; +- goto failed_mount2; +- } + + timer_setup(&sbi->s_err_report, print_daily_error_info, 0); + spin_lock_init(&sbi->s_error_lock); +@@ -4774,6 +4769,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) + ext4_journal_finish_inode_data_buffers; + + no_journal: ++ ++ if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { ++ ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); ++ ret = -EFSCORRUPTED; ++ goto failed_mount_wq; ++ } ++ + if (!test_opt(sb, NO_MBCACHE)) { + sbi->s_ea_block_cache = ext4_xattr_create_cache(); + if (!sbi->s_ea_block_cache) { +-- +2.25.1 + diff --git a/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-filename-encode.patch b/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-filename-encode.patch new file mode 100644 index 0000000..34618af --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-filename-encode.patch @@ -0,0 +1,440 @@ +Subject: [PATCH] ubuntu20.04.5/ext4-filename-encode + +--- + fs/ext4/critical_encode.h | 170 ++++++++++++++++++++++++++++++++++++++ + fs/ext4/dir.c | 33 +++++--- + fs/ext4/ialloc.c | 1 + + fs/ext4/namei.c | 56 +++++++++---- + 4 files changed, 235 insertions(+), 25 deletions(-) + create mode 100644 fs/ext4/critical_encode.h + +diff --git a/fs/ext4/critical_encode.h b/fs/ext4/critical_encode.h +new file mode 100644 +index 00000000..f75aedab +--- /dev/null ++++ b/fs/ext4/critical_encode.h +@@ -0,0 +1,170 @@ ++/* ++ * critical_encode.h ++ * ++ * Copyright (c) 2022 Whamcloud ++ */ ++ ++#ifndef _CRITICAL_ENCODE_H ++#define _CRITICAL_ENCODE_H ++ ++#include ++ ++/* Encoding/decoding routines inspired from yEnc principles. ++ * We just take care of a few critical characters: ++ * NULL, LF, CR, /, DEL and =. ++ * If such a char is found, it is replaced with '=' followed by ++ * the char value + 64. ++ * All other chars are left untouched. ++ * Efficiency of this encoding depends on the occurences of the ++ * critical chars, but statistically on binary data it can be much higher ++ * than base64 for instance. ++ */ ++static inline int critical_encode(const u8 *src, int len, char *dst) ++{ ++ u8 *p = (u8 *)src, *q = dst; ++ ++ while (p - src < len) { ++ /* escape NULL, LF, CR, /, DEL and = */ ++ if (unlikely(*p == 0x0 || *p == 0xA || *p == 0xD || ++ *p == '/' || *p == 0x7F || *p == '=')) { ++ *(q++) = '='; ++ *(q++) = *(p++) + 64; ++ } else { ++ *(q++) = *(p++); ++ } ++ } ++ ++ return (char *)q - dst; ++} ++ ++/* returns the number of chars encoding would produce */ ++static inline int critical_chars(const u8 *src, int len) ++{ ++ u8 *p = (u8 *)src; ++ int newlen = len; ++ ++ while (p - src < len) { ++ /* NULL, LF, CR, /, DEL and = cost an additional '=' */ ++ if (unlikely(*p == 0x0 || *p == 0xA || *p == 0xD || ++ *p == '/' || *p == 0x7F || *p == '=')) ++ newlen++; ++ p++; ++ } ++ ++ return newlen; ++} ++ ++/* decoding routine - returns the number of chars in output */ ++static inline int critical_decode(const u8 *src, int len, char *dst) ++{ ++ u8 *p = (u8 *)src, *q = dst; ++ ++ while (p - src < len) { ++ if (unlikely(*p == '=')) { ++ *(q++) = *(++p) - 64; ++ p++; ++ } else { ++ *(q++) = *(p++); ++ } ++ } ++ ++ return (char *)q - dst; ++} ++ ++#define fscrypt_get_encryption_info(inode) \ ++ (unlikely(!IS_LUSTRE_MOUNT(inode->i_sb)) ? 0 : -EOPNOTSUPP) ++ ++static inline int ext4_has_permitted_context(struct inode *parent, ++ struct inode *child) ++{ ++ if (unlikely(!IS_LUSTRE_MOUNT(parent->i_sb))) ++ return 1; ++ return fscrypt_has_permitted_context(parent, child); ++} ++ ++struct ext4_filename; ++ ++static inline int ext4_prepare_readdir(struct inode *dir) ++{ ++ if (unlikely(!IS_LUSTRE_MOUNT(dir->i_sb))) ++ return 0; ++ return fscrypt_prepare_readdir(dir); ++} ++ ++static inline int ext4_fname_alloc_buffer(const struct inode *inode, ++ u32 max_encrypted_len, ++ struct fscrypt_str *crypto_str) ++{ ++ crypto_str->name = kmalloc(max_encrypted_len + 1, GFP_NOFS); ++ if (!crypto_str->name) ++ return -ENOMEM; ++ crypto_str->len = max_encrypted_len; ++ return 0; ++} ++ ++static inline void ext4_fname_free_buffer(struct fscrypt_str *crypto_str) ++{ ++ if (!crypto_str) ++ return; ++ kfree(crypto_str->name); ++ crypto_str->name = NULL; ++} ++ ++static inline int ext4_fname_disk_to_usr(struct inode *inode, ++ u32 hash, u32 minor_hash, ++ const struct fscrypt_str *iname, ++ struct fscrypt_str *oname) ++{ ++ int presented_len; ++ ++ presented_len = critical_encode(iname->name, iname->len, oname->name); ++ if (presented_len > NAME_MAX) { ++ /* truncate at NAME_MAX, ++ * or NAME_MAX-1 if name ends with '=' to avoid decoding issue ++ */ ++ presented_len = NAME_MAX; ++ if (oname->name[presented_len - 1] == '=') ++ presented_len--; ++ oname->len = presented_len; ++ } ++ oname->name[presented_len] = '\0'; ++ ++ return 0; ++} ++ ++static inline int ext4_setup_filename(struct inode *dir, ++ const struct qstr *iname, ++ int lookup, ++ struct ext4_filename *fname) ++{ ++ fname->usr_fname = iname; ++ ++ if (lookup && IS_ENCRYPTED(dir) && ++ unlikely(!IS_LUSTRE_MOUNT(dir->i_sb) && ++ strnchr(iname->name, iname->len, '='))) { ++ /* Only proceed to critical decode if ++ * iname contains escape char '='. ++ */ ++ int len = iname->len; ++ char *buf; ++ ++ buf = kmalloc(len, GFP_NOFS); ++ if (!buf) ++ return -ENOMEM; ++ ++ len = critical_decode(iname->name, len, buf); ++ fname->disk_name.name = (unsigned char *)buf; ++ fname->disk_name.len = len; ++ return 0; ++ } ++ ++ fname->disk_name.name = (unsigned char *) iname->name; ++ fname->disk_name.len = iname->len; ++ ++#ifdef CONFIG_UNICODE ++ ext4_fname_setup_ci_filename(dir, iname, fname); ++#endif ++ return 0; ++} ++ ++#endif /* _CRITICAL_ENCODE_H */ +diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c +index c6afabcb..77086225 100644 +--- a/fs/ext4/dir.c ++++ b/fs/ext4/dir.c +@@ -29,6 +29,7 @@ + #include + #include "ext4.h" + #include "xattr.h" ++#include "critical_encode.h" + + static int ext4_dx_readdir(struct file *, struct dir_context *); + +@@ -134,7 +135,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) + struct buffer_head *bh = NULL; + struct fscrypt_str fstr = FSTR_INIT(NULL, 0); + +- err = fscrypt_prepare_readdir(inode); ++ err = ext4_prepare_readdir(inode); + if (err) + return err; + +@@ -161,7 +162,8 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) + return err; + } + +- if (IS_ENCRYPTED(inode)) { ++ /* disable decryption of filename, present only escaped name */ ++ if (0 && IS_ENCRYPTED(inode)) { + err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr); + if (err < 0) + return err; +@@ -275,24 +277,33 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) + get_dtype(sb, de->file_type))) + goto done; + } else { +- int save_len = fstr.len; + struct fscrypt_str de_name = + FSTR_INIT(de->name, + de->name_len); ++ int presented_len; + + /* Directory is encrypted */ +- err = fscrypt_fname_disk_to_usr(inode, +- EXT4_DIRENT_HASH(de), +- EXT4_DIRENT_MINOR_HASH(de), +- &de_name, &fstr); +- de_name = fstr; +- fstr.len = save_len; ++ presented_len = critical_chars(de->name, ++ de->name_len); ++ err = ext4_fname_alloc_buffer(inode, ++ presented_len, ++ &fstr); + if (err) + goto errout; +- if (!dir_emit(ctx, ++ ++ err = ext4_fname_disk_to_usr(inode, ++ 0, 0, &de_name, &fstr); ++ de_name = fstr; ++ if (err) { ++ ext4_fname_free_buffer(&fstr); ++ goto errout; ++ } ++ err = dir_emit(ctx, + de_name.name, de_name.len, + le32_to_cpu(de->inode), +- get_dtype(sb, de->file_type))) ++ get_dtype(sb, de->file_type)); ++ ext4_fname_free_buffer(&fstr); ++ if (!err) + goto done; + } + } +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index e5d999fa..e381783f 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -30,6 +30,7 @@ + #include "ext4_jbd2.h" + #include "xattr.h" + #include "acl.h" ++#include "critical_encode.h" + + #include + +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index 960c3e7b..f8f69b5e 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -41,6 +41,7 @@ + + #include "xattr.h" + #include "acl.h" ++#include "critical_encode.h" + + #include + /* +@@ -1439,7 +1440,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, + ext4_dir_rec_len(0, + csum ? NULL : dir)); + /* Check if the directory is encrypted */ +- if (IS_ENCRYPTED(dir)) { ++ if (0 && IS_ENCRYPTED(dir)) { + err = fscrypt_prepare_readdir(dir); + if (err < 0) { + brelse(bh); +@@ -1490,22 +1491,31 @@ static int htree_dirblock_to_tree(struct file *dir_file, + hinfo->hash, hinfo->minor_hash, de, + &tmp_str); + } else { +- int save_len = fname_crypto_str.len; + struct fscrypt_str de_name = FSTR_INIT(de->name, + de->name_len); ++ int presented_len; + + /* Directory is encrypted */ +- err = fscrypt_fname_disk_to_usr(dir, hinfo->hash, ++ presented_len = critical_chars(de->name, de->name_len); ++ err = ext4_fname_alloc_buffer(dir, presented_len, ++ &fname_crypto_str); ++ if (err) { ++ count = err; ++ goto errout; ++ } ++ ++ err = ext4_fname_disk_to_usr(dir, hinfo->hash, + hinfo->minor_hash, &de_name, + &fname_crypto_str); + if (err) { ++ ext4_fname_free_buffer(&fname_crypto_str); + count = err; + goto errout; + } + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de, + &fname_crypto_str); +- fname_crypto_str.len = save_len; ++ ext4_fname_free_buffer(&fname_crypto_str); + } + if (err != 0) { + count = err; +@@ -1833,7 +1843,7 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, + */ + static bool ext4_match(struct inode *parent, + const struct ext4_filename *fname, +- struct ext4_dir_entry_2 *de) ++ struct ext4_dir_entry_2 *de, int denamelen) + { + struct fscrypt_name f; + +@@ -1868,7 +1878,7 @@ static bool ext4_match(struct inode *parent, + } + #endif + +- return fscrypt_match_name(&f, de->name, de->name_len); ++ return fscrypt_match_name(&f, de->name, denamelen); + } + + /* +@@ -1879,16 +1889,30 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, + unsigned int offset, struct ext4_dir_entry_2 **res_dir) + { + struct ext4_dir_entry_2 * de; ++ bool probablytrunc; + char * dlimit; +- int de_len; ++ int de_len, denamelen; + + de = (struct ext4_dir_entry_2 *)search_buf; + dlimit = search_buf + buf_size; ++ /* fname is probably truncated if it is the decoded representation of ++ * an encrypted filename not aligned on a 32-byte boundary ++ */ ++ probablytrunc = !IS_LUSTRE_MOUNT(dir->i_sb) && IS_ENCRYPTED(dir) && ++ fname->disk_name.len & 31; + while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ ++ denamelen = de->name_len; ++ if (unlikely(probablytrunc) && ++ de->name_len > fname->disk_name.len) ++ /* Adjust name len to look for a partial match. ++ * Since it is binary encrypted names, there ++ * should not be any collision between names. ++ */ ++ denamelen = fname->disk_name.len; + if (de->name + de->name_len <= dlimit && +- ext4_match(dir, fname, de)) { ++ ext4_match(dir, fname, de, denamelen)) { + /* found a match - just to be sure, do + * a full check */ + if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf, +@@ -2089,7 +2113,7 @@ struct buffer_head *ext4_find_entry_locked(struct inode *dir, + struct ext4_filename fname; + struct buffer_head *bh; + +- err = ext4_fname_setup_filename(dir, d_name, 1, &fname); ++ err = ext4_setup_filename(dir, d_name, 1, &fname); + if (err == -ENOENT) + return NULL; + if (err) +@@ -2097,7 +2121,9 @@ struct buffer_head *ext4_find_entry_locked(struct inode *dir, + + bh = __ext4_find_entry(dir, &fname, res_dir, inlined, lck); + +- ext4_fname_free_filename(&fname); ++ if (fname.disk_name.name != d_name->name) ++ kfree(fname.disk_name.name); ++ + return bh; + } + +@@ -2111,7 +2137,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, + struct ext4_filename fname; + struct buffer_head *bh; + +- err = ext4_fname_prepare_lookup(dir, dentry, &fname); ++ err = ext4_setup_filename(dir, &dentry->d_name, 1, &fname); + generic_set_encrypted_ci_d_ops(dentry); + if (err == -ENOENT) + return NULL; +@@ -2120,7 +2146,9 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, + + bh = __ext4_find_entry(dir, &fname, res_dir, NULL, NULL); + +- ext4_fname_free_filename(&fname); ++ if (fname.disk_name.name != dentry->d_name.name) ++ kfree(fname.disk_name.name); ++ + return bh; + } + +@@ -2212,7 +2240,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi + } + if (!IS_ERR(inode) && IS_ENCRYPTED(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && +- !fscrypt_has_permitted_context(dir, inode)) { ++ !ext4_has_permitted_context(dir, inode)) { + ext4_warning(inode->i_sb, + "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); +@@ -2511,7 +2539,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, + if (ext4_check_dir_entry(dir, NULL, de, bh, + buf, buf_size, offset)) + return -EFSCORRUPTED; +- if (ext4_match(dir, fname, de)) ++ if (ext4_match(dir, fname, de, de->name_len)) + return -EEXIST; + nlen = EXT4_DIR_ENTRY_LEN(de, dir); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); +-- +2.34.1 + diff --git a/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-inode-version.patch b/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-inode-version.patch new file mode 100644 index 0000000..8a082c9 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-inode-version.patch @@ -0,0 +1,49 @@ +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index bbe69446..bdd72d46 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1166,6 +1166,8 @@ struct ext4_inode_info { + struct dquot *i_dquot[MAXQUOTAS]; + #endif + ++ __u64 i_fs_version; ++ + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ + __u32 i_csum_seed; + +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index 745d781d..29532f05 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -1268,6 +1268,7 @@ got: + ei->i_dtime = 0; + ei->i_block_group = group; + ei->i_last_alloc_group = ~0; ++ ei->i_fs_version = 0; + + ext4_set_inode_flags(inode, true); + if (IS_DIRSYNC(inode)) +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 7415f7f7..8fa8757e 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -4582,14 +4582,14 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + inode_set_iversion_raw(inode, val); + else +- inode_set_iversion_queried(inode, val); ++ EXT4_I(inode)->i_fs_version = val; + } + static inline u64 ext4_inode_peek_iversion(const struct inode *inode) + { + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + return inode_peek_iversion_raw(inode); + else +- return inode_peek_iversion(inode); ++ return EXT4_I(inode)->i_fs_version; + } + + static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) +-- +2.25.1 + diff --git a/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-mballoc-extra-checks.patch b/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-mballoc-extra-checks.patch new file mode 100644 index 0000000..ada37bb --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-mballoc-extra-checks.patch @@ -0,0 +1,318 @@ +Subject: [PATCH] ubuntu20.04.5/ext4-mballoc-extra-checks + +--- + fs/ext4/ext4.h | 1 + + fs/ext4/mballoc.c | 107 ++++++++++++++++++++++++++++++++++++++++------ + fs/ext4/mballoc.h | 2 +- + 3 files changed, 96 insertions(+), 14 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 3dedf743..29db9e17 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -3464,6 +3464,7 @@ struct ext4_group_info { + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + ext4_group_t bb_group; /* Group number */ + struct list_head bb_prealloc_list; ++ unsigned long bb_prealloc_nr; + #ifdef DOUBLE_CHECK + void *bb_bitmap; + #endif +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index 2db47b90..6fc1631a 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -399,7 +399,7 @@ static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { + "ext4_groupinfo_64k", "ext4_groupinfo_128k" + }; + +-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); + static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); +@@ -1111,7 +1111,7 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) + } + + static noinline_for_stack +-void ext4_mb_generate_buddy(struct super_block *sb, ++int ext4_mb_generate_buddy(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group, + struct ext4_group_info *grp) + { +@@ -1155,6 +1155,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, + grp->bb_free = free; + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ++ return -EIO; + } + mb_set_largest_free_order(sb, grp); + +@@ -1164,6 +1165,8 @@ void ext4_mb_generate_buddy(struct super_block *sb, + atomic_inc(&sbi->s_mb_buddies_generated); + atomic64_add(period, &sbi->s_mb_generation_time); + mb_update_avg_fragment_size(sb, grp); ++ ++ return 0; + } + + /* The buddy information is attached the buddy cache inode +@@ -1268,7 +1271,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) + } + + first_block = page->index * blocks_per_page; +- for (i = 0; i < blocks_per_page; i++) { ++ for (i = 0; i < blocks_per_page && err == 0; i++) { + group = (first_block + i) >> 1; + if (group >= ngroups) + break; +@@ -1316,7 +1319,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) + ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); +- ext4_mb_generate_buddy(sb, data, incore, group, grinfo); ++ err = ext4_mb_generate_buddy(sb, data, incore, group, grinfo); + ext4_unlock_group(sb, group); + incore = NULL; + } else { +@@ -1331,7 +1334,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) + memcpy(data, bitmap, blocksize); + + /* mark all preallocated blks used in in-core bitmap */ +- ext4_mb_generate_from_pa(sb, data, group); ++ err = ext4_mb_generate_from_pa(sb, data, group); + ext4_mb_generate_from_freelist(sb, data, group); + ext4_unlock_group(sb, group); + +@@ -1341,7 +1344,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) + incore = data; + } + } +- SetPageUptodate(page); ++ if (likely(err == 0)) ++ SetPageUptodate(page); + + out: + if (bh) { +@@ -2877,9 +2881,11 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) + static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + { + struct super_block *sb = PDE_DATA(file_inode(seq->file)); ++ struct ext4_group_desc *gdp; + ext4_group_t group = (ext4_group_t) ((unsigned long) v); + int i; + int err, buddy_loaded = 0; ++ int free = 0; + struct ext4_buddy e4b; + struct ext4_group_info *grinfo; + unsigned char blocksize_bits = min_t(unsigned char, +@@ -2892,7 +2898,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + + group--; + if (group == 0) +- seq_puts(seq, "#group: free frags first [" ++ seq_puts(seq, "#group: bfree gfree frags first pa [" + " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " + " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); + +@@ -2912,13 +2918,19 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + buddy_loaded = 1; + } + ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ if (gdp != NULL) ++ free = ext4_free_group_clusters(sb, gdp); ++ + memcpy(&sg, grinfo, i); + + if (buddy_loaded) + ext4_mb_unload_buddy(&e4b); + +- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, +- sg.info.bb_fragments, sg.info.bb_first_free); ++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u %-5u %-5lu [", ++ (long unsigned int)group, sg.info.bb_free, free, ++ sg.info.bb_fragments, sg.info.bb_first_free, ++ sg.info.bb_prealloc_nr); + for (i = 0; i <= 13; i++) + seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? + sg.info.bb_counters[i] : 0); +@@ -4615,25 +4627,76 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + return; + } + ++/* ++ * check free blocks in bitmap match free block in group descriptor ++ * do this before taking preallocated blocks into account to be able ++ * to detect on-disk corruptions. The group lock should be hold by the ++ * caller. ++ */ ++int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap, ++ struct ext4_group_desc *gdp, int group) ++{ ++ unsigned short max = EXT4_CLUSTERS_PER_GROUP(sb); ++ unsigned short i, first, free = 0; ++ unsigned short free_in_gdp = ext4_free_group_clusters(sb, gdp); ++ ++ if (free_in_gdp == 0 && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) ++ return 0; ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ ++ while (i < max) { ++ first = i; ++ i = mb_find_next_bit(bitmap, max, i); ++ if (i > max) ++ i = max; ++ free += i - first; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ ++ if (free != free_in_gdp) { ++ ext4_warning(sb, "on-disk bitmap for group %d" ++ "corrupted: %u blocks free in bitmap, %u - in gd\n", ++ group, free, free_in_gdp); ++ ext4_mark_group_bitmap_corrupted(sb, group, ++ EXT4_GROUP_INFO_BBITMAP_CORRUPT); ++ return -EIO; ++ } ++ return 0; ++} ++ + /* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap. buddy must be generated from this bitmap + * Need to be called with ext4 group lock held + */ + static noinline_for_stack +-void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_prealloc_space *pa; ++ struct ext4_group_desc *gdp; + struct list_head *cur; + ext4_group_t groupnr; + ext4_grpblk_t start; + int preallocated = 0; ++ int skip = 0, count = 0; ++ int err; + int len; + + if (!grp) +- return; ++ return 0; ++ ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ if (gdp == NULL) ++ return -EIO; ++ ++ /* before applying preallocations, check bitmap consistency */ ++ err = ext4_mb_check_ondisk_bitmap(sb, bitmap, gdp, group); ++ if (err) ++ return err; + + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. +@@ -4650,13 +4713,23 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); +- if (unlikely(len == 0)) ++ if (unlikely(len == 0)) { ++ skip++; + continue; ++ } + BUG_ON(groupnr != group); + ext4_set_bits(bitmap, start, len); + preallocated += len; ++ count++; ++ } ++ if (count + skip != grp->bb_prealloc_nr) { ++ ext4_error(sb, "lost preallocations: " ++ "count %d, bb_prealloc_nr %lu, skip %d\n", ++ count, grp->bb_prealloc_nr, skip); ++ return -EIO; + } + mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); ++ return 0; + } + + static void ext4_mb_mark_pa_deleted(struct super_block *sb, +@@ -4740,6 +4813,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, + */ + ext4_lock_group(sb, grp); + list_del(&pa->pa_group_list); ++ ext4_get_group_info(sb, grp)->bb_prealloc_nr--; + ext4_unlock_group(sb, grp); + + spin_lock(pa->pa_obj_lock); +@@ -4845,6 +4919,7 @@ adjust_bex: + pa->pa_inode = ac->ac_inode; + + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + + spin_lock(pa->pa_obj_lock); + list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); +@@ -4902,6 +4977,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) + pa->pa_inode = NULL; + + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + + /* + * We will later add the new pa to the right bucket +@@ -5072,6 +5148,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, + + spin_unlock(&pa->pa_lock); + ++ BUG_ON(grp->bb_prealloc_nr == 0); ++ grp->bb_prealloc_nr--; + list_del(&pa->pa_group_list); + list_add(&pa->u.pa_tmp_list, &list); + } +@@ -5199,7 +5277,7 @@ repeat: + if (err) { + ext4_error_err(sb, -err, "Error %d loading buddy information for %u", + err, group); +- continue; ++ return; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group); +@@ -5212,6 +5290,8 @@ repeat: + } + + ext4_lock_group(sb, group); ++ BUG_ON(e4b.bd_info->bb_prealloc_nr == 0); ++ e4b.bd_info->bb_prealloc_nr--; + list_del(&pa->pa_group_list); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + ext4_unlock_group(sb, group); +@@ -5511,6 +5591,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, + } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); ++ ext4_get_group_info(sb, group)->bb_prealloc_nr--; + ext4_mb_release_group_pa(&e4b, pa); + ext4_unlock_group(sb, group); + +diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h +index 39da92ce..b6c4a30c 100644 +--- a/fs/ext4/mballoc.h ++++ b/fs/ext4/mballoc.h +@@ -66,7 +66,7 @@ + /* + * for which requests use 2^N search using buddies + */ +-#define MB_DEFAULT_ORDER2_REQS 2 ++#define MB_DEFAULT_ORDER2_REQS 8 + + /* + * default group prealloc size 512 blocks +-- +2.25.1 + diff --git a/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-prealloc.patch b/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-prealloc.patch new file mode 100644 index 0000000..d3334b1 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu20.04.5/ext4-prealloc.patch @@ -0,0 +1,403 @@ +Subject: [PATCH] ext4-encdata +--- + fs/ext4/ext4.h | 7 +- + fs/ext4/inode.c | 3 + + fs/ext4/mballoc.c | 220 +++++++++++++++++++++++++++++++++++----------- + fs/ext4/sysfs.c | 8 +- + 4 files changed, 182 insertions(+), 56 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index bdd72d46..7168e4e4 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1290,6 +1290,8 @@ extern void ext4_set_bits(void *bm, int cur, int len); + #define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ + #define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + ++#define EXT4_MAX_PREALLOC_TABLE 64 ++ + /* + * Behaviour when detecting errors + */ +@@ -1594,11 +1596,13 @@ struct ext4_sb_info { + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_max_linear_groups; +- unsigned int s_mb_stream_request; ++ unsigned long s_mb_small_req; ++ unsigned long s_mb_large_req; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; ++ unsigned long *s_mb_prealloc_table; + unsigned int s_mb_group_prealloc; + unsigned int s_mb_max_inode_prealloc; + unsigned int s_max_dir_size_kb; +@@ -2939,6 +2943,7 @@ int ext4_fc_record_regions(struct super_block *sb, int ino, + int len, int replay); + + /* mballoc.c */ ++extern const struct proc_ops ext4_seq_prealloc_table_fops; + extern const struct seq_operations ext4_mb_seq_groups_ops; + extern const struct seq_operations ext4_mb_seq_structs_summary_ops; + extern long ext4_mb_stats; +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 8fa8757e..bc7bcbc0 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -2743,6 +2743,9 @@ static int ext4_writepages(struct address_space *mapping, + PAGE_SIZE >> inode->i_blkbits); + } + ++ if (wbc->nr_to_write < sbi->s_mb_small_req) ++ wbc->nr_to_write = sbi->s_mb_small_req; ++ + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index e8f5f05b..e1e3da73 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -3091,6 +3091,99 @@ const struct seq_operations ext4_mb_seq_structs_summary_ops = { + .show = ext4_mb_seq_structs_summary_show, + }; + ++static int ext4_mb_check_and_update_prealloc(struct ext4_sb_info *sbi, ++ char *str, size_t cnt, ++ int update) ++{ ++ unsigned long value; ++ unsigned long prev = 0; ++ char *cur; ++ char *next; ++ char *end; ++ int num = 0; ++ ++ cur = str; ++ end = str + cnt; ++ while (cur < end) { ++ while ((cur < end) && (*cur == ' ')) cur++; ++ value = simple_strtol(cur, &next, 0); ++ if (value == 0) ++ break; ++ if (cur == next) ++ return -EINVAL; ++ ++ cur = next; ++ ++ if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group)) ++ return -EINVAL; ++ ++ /* they should add values in order */ ++ if (value <= prev) ++ return -EINVAL; ++ ++ if (update) ++ sbi->s_mb_prealloc_table[num] = value; ++ ++ prev = value; ++ num++; ++ } ++ ++ if (num > EXT4_MAX_PREALLOC_TABLE - 1) ++ return -EOVERFLOW; ++ ++ if (update) ++ sbi->s_mb_prealloc_table[num] = 0; ++ ++ return 0; ++} ++ ++static ssize_t ext4_mb_prealloc_table_proc_write(struct file *file, ++ const char __user *buf, ++ size_t cnt, loff_t *pos) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(pde_data(file_inode(file))); ++ char str[128]; ++ int rc; ++ ++ if (cnt >= sizeof(str)) ++ return -EINVAL; ++ if (copy_from_user(str, buf, cnt)) ++ return -EFAULT; ++ ++ rc = ext4_mb_check_and_update_prealloc(sbi, str, cnt, 0); ++ if (rc) ++ return rc; ++ ++ rc = ext4_mb_check_and_update_prealloc(sbi, str, cnt, 1); ++ return rc ? rc : cnt; ++} ++ ++static int mb_prealloc_table_seq_show(struct seq_file *m, void *v) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(m->private); ++ int i; ++ ++ for (i = 0; i < EXT4_MAX_PREALLOC_TABLE && ++ sbi->s_mb_prealloc_table[i] != 0; i++) ++ seq_printf(m, "%ld ", sbi->s_mb_prealloc_table[i]); ++ seq_printf(m, "\n"); ++ ++ return 0; ++} ++ ++static int mb_prealloc_table_seq_open(struct inode *inode, struct file *file) ++{ ++ return single_open(file, mb_prealloc_table_seq_show, pde_data(inode)); ++} ++ ++const struct proc_ops ext4_seq_prealloc_table_fops = { ++ .proc_open = mb_prealloc_table_seq_open, ++ .proc_read = seq_read, ++ .proc_lseek = seq_lseek, ++ .proc_release = single_release, ++ .proc_write = ext4_mb_prealloc_table_proc_write, ++}; ++ + static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) + { + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; +@@ -3407,7 +3500,7 @@ static void ext4_discard_work(struct work_struct *work) + int ext4_mb_init(struct super_block *sb) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); +- unsigned i, j; ++ unsigned i, j, k, l; + unsigned offset, offset_incr; + unsigned max; + int ret; +@@ -3479,7 +3572,6 @@ int ext4_mb_init(struct super_block *sb) + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; +- sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC; + /* +@@ -3504,9 +3596,29 @@ int ext4_mb_init(struct super_block *sb) + * RAID stripe size so that preallocations don't fragment + * the stripes. + */ +- if (sbi->s_stripe > 1) { +- sbi->s_mb_group_prealloc = roundup( +- sbi->s_mb_group_prealloc, sbi->s_stripe); ++ ++ /* Allocate table once */ ++ sbi->s_mb_prealloc_table = kzalloc( ++ EXT4_MAX_PREALLOC_TABLE * sizeof(unsigned long), GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ if (sbi->s_stripe == 0) { ++ for (k = 0, l = 4; k <= 9; ++k, l *= 2) ++ sbi->s_mb_prealloc_table[k] = l; ++ ++ sbi->s_mb_small_req = 256; ++ sbi->s_mb_large_req = 1024; ++ sbi->s_mb_group_prealloc = 512; ++ } else { ++ for (k = 0, l = sbi->s_stripe; k <= 2; ++k, l *= 2) ++ sbi->s_mb_prealloc_table[k] = l; ++ ++ sbi->s_mb_small_req = sbi->s_stripe; ++ sbi->s_mb_large_req = sbi->s_stripe * 8; ++ sbi->s_mb_group_prealloc = sbi->s_stripe * 4; + } + + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); +@@ -3540,6 +3652,7 @@ out_free_locality_groups: + out: + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); ++ kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; + kfree(sbi->s_mb_maxs); +@@ -3810,7 +3923,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + int err, len; + + BUG_ON(ac->ac_status != AC_STATUS_FOUND); +- BUG_ON(ac->ac_b_ex.fe_len <= 0); + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); +@@ -4046,13 +4158,14 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, + { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_super_block *es = sbi->s_es; +- int bsbits, max; ++ int bsbits, i, wind; + ext4_lblk_t end; +- loff_t size, start_off; ++ loff_t size; + loff_t orig_size __maybe_unused; + ext4_lblk_t start; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_prealloc_space *pa; ++ unsigned long value, last_non_zero; + + /* do normalize only data requests, metadata requests + do not need preallocation */ +@@ -4081,51 +4194,46 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, + size = size << bsbits; + if (size < i_size_read(ac->ac_inode)) + size = i_size_read(ac->ac_inode); +- orig_size = size; ++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits; ++ ++ start = wind = 0; ++ value = last_non_zero = 0; + +- /* max size of free chunks */ +- max = 2 << bsbits; +- +-#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ +- (req <= (size) || max <= (chunk_size)) +- +- /* first, try to predict filesize */ +- /* XXX: should this table be tunable? */ +- start_off = 0; +- if (size <= 16 * 1024) { +- size = 16 * 1024; +- } else if (size <= 32 * 1024) { +- size = 32 * 1024; +- } else if (size <= 64 * 1024) { +- size = 64 * 1024; +- } else if (size <= 128 * 1024) { +- size = 128 * 1024; +- } else if (size <= 256 * 1024) { +- size = 256 * 1024; +- } else if (size <= 512 * 1024) { +- size = 512 * 1024; +- } else if (size <= 1024 * 1024) { +- size = 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (21 - bsbits)) << 21; +- size = 2 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (22 - bsbits)) << 22; +- size = 4 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, +- (8<<20)>>bsbits, max, 8 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (23 - bsbits)) << 23; +- size = 8 * 1024 * 1024; ++ /* let's choose preallocation window depending on file size */ ++ for (i = 0; i < EXT4_MAX_PREALLOC_TABLE; i++) { ++ value = sbi->s_mb_prealloc_table[i]; ++ if (value == 0) ++ break; ++ else ++ last_non_zero = value; ++ ++ if (size <= value) { ++ wind = value; ++ break; ++ } ++ } ++ ++ if (wind == 0) { ++ if (last_non_zero != 0) { ++ __u64 tstart, tend; ++ /* file is quite large, we now preallocate with ++ * the biggest configured window with regart to ++ * logical offset */ ++ wind = last_non_zero; ++ tstart = ac->ac_o_ex.fe_logical; ++ do_div(tstart, wind); ++ start = tstart * wind; ++ tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1; ++ do_div(tend, wind); ++ tend = tend * wind + wind; ++ size = tend - start; ++ } + } else { +- start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; +- size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb), +- ac->ac_o_ex.fe_len) << bsbits; ++ size = wind; + } +- size = size >> bsbits; +- start = start_off >> bsbits; ++ ++ ++ orig_size = size; + + /* + * For tiny groups (smaller than 8MB) the chosen allocation +@@ -4216,7 +4324,6 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, + (unsigned long) ac->ac_o_ex.fe_logical); + BUG(); + } +- BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + + /* now prepare goal request */ + +@@ -5249,8 +5356,8 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) + inode_pa_eligible = false; + + size = max(size, isize); +- /* Don't use group allocation for large files */ +- if (size > sbi->s_mb_stream_request) ++ if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) || ++ (size >= sbi->s_mb_large_req)) + group_pa_eligible = false; + + if (!group_pa_eligible) { +@@ -5261,6 +5368,13 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) + return; + } + ++ /* ++ * request is so large that we don't care about ++ * streaming - it overweights any possible seek ++ */ ++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req) ++ return; ++ + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. The reason for having +diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c +index aa07b78b..eef2fadb 100644 +--- a/fs/ext4/sysfs.c ++++ b/fs/ext4/sysfs.c +@@ -212,7 +212,8 @@ EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); + EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +-EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); ++EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req); ++EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req); + EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); + EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc); + EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); +@@ -261,7 +262,8 @@ static struct attribute *ext4_attrs[] = { + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), +- ATTR_LIST(mb_stream_req), ++ ATTR_LIST(mb_small_req), ++ ATTR_LIST(mb_large_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(mb_max_inode_prealloc), + ATTR_LIST(mb_max_linear_groups), +@@ -546,6 +548,8 @@ int ext4_register_sysfs(struct super_block *sb) + ext4_fc_info_show, sb); + proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_ops, sb); ++ proc_create_data("prealloc_table", S_IRUGO, sbi->s_proc, ++ &ext4_seq_prealloc_table_fops, sb); + proc_create_single_data("mb_stats", 0444, sbi->s_proc, + ext4_seq_mb_stats_show, sb); + proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc, +-- +2.25.1 + diff --git a/ldiskfs/kernel_patches/series/ldiskfs-5.15.0-83-ubuntu20.series b/ldiskfs/kernel_patches/series/ldiskfs-5.15.0-83-ubuntu20.series new file mode 100644 index 0000000..f95d1d4 --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-5.15.0-83-ubuntu20.series @@ -0,0 +1,38 @@ +ubuntu20.04.5/ext4-inode-version.patch +linux-5.4/ext4-lookup-dotdot.patch +linux-5.14/ext4-print-inum-in-htree-warning.patch +ubuntu20.04.5/ext4-prealloc.patch +linux-5.16/ext4-osd-iop-common.patch +linux-5.16/ext4-misc.patch +ubuntu20.04.5/ext4-mballoc-extra-checks.patch +sles15sp4/ext4-hash-indexed-dir-dotdot-update.patch +linux-5.14/ext4-kill-dx-root.patch +linux-5.18/ext4-mballoc-pa-free-mismatch.patch +ubuntu20.04.5/ext4-data-in-dirent.patch +rhel8/ext4-nocmtime.patch +base/ext4-htree-lock.patch +rhel9.2/ext4-pdirop.patch +linux-5.8/ext4-max-dir-size.patch +ubuntu20.04.5/ext4-corrupted-inode-block-bitmaps-handling-patches.patch +linux-5.10/ext4-give-warning-with-dir-htree-growing.patch +ubuntu18/ext4-jcb-optimization.patch +linux-5.10/ext4-attach-jinode-in-writepages.patch +ubuntu20.04.5/ext4-dont-check-before-replay.patch +rhel7.6/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7.6/ext4-export-orphan-add.patch +linux-5.18/ext4-export-mb-stream-allocator-variables.patch +ubuntu19/ext4-iget-with-flags.patch +linux-5.14/export-ext4fs-dirhash-helper.patch +linux-5.8/ext4-no-max-dir-size-limit-for-iam-objects.patch +rhel9/ext4-dquot-commit-speedup.patch +linux-5.14/ext4-ialloc-uid-gid-and-pass-owner-down.patch +linux-5.14/ext4-projid-xattrs.patch +rhel9.1/ext4-delayed-iput.patch +rhel8/ext4-ext-merge.patch +linux-5.14/ext4-xattr-disable-credits-check.patch +rhel9.2/ext4-fiemap-kernel-data.patch +rhel8/ext4-old_ea_inodes_handling_fix.patch +ubuntu20.04.5/ext4-filename-encode.patch +rhel9.1/ext4-enc-flag.patch +rhel9.2/ext4-encdata.patch +rhel9/ext4-add-periodic-superblock-update.patch -- 1.8.3.1